532
532
static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
533
const vector signed int vsint32_8 = vec_splat_s32(8);
534
const vector unsigned int vuint32_4 = vec_splat_u32(4);
535
const vector signed char neg1 = vec_splat_s8(-1);
537
const vector unsigned char permA1 = (vector unsigned char)
538
{0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x1F, 0x1F,
539
0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F};
540
const vector unsigned char permA2 = (vector unsigned char)
541
{0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11,
542
0x12, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F};
543
const vector unsigned char permA1inc = (vector unsigned char)
544
{0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
545
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
546
const vector unsigned char permA2inc = (vector unsigned char)
547
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
548
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
549
const vector unsigned char magic = (vector unsigned char)
550
{0x01, 0x02, 0x01, 0x02, 0x04, 0x02, 0x01, 0x02,
551
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
552
const vector unsigned char extractPerm = (vector unsigned char)
553
{0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01,
554
0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01};
555
const vector unsigned char extractPermInc = (vector unsigned char)
556
{0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
557
0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01};
558
const vector unsigned char identity = vec_lvsl(0,(unsigned char *)0);
559
const vector unsigned char tenRight = (vector unsigned char)
560
{0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
561
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
562
const vector unsigned char eightLeft = (vector unsigned char)
563
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
564
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08};
534
567
this code makes no assumption on src or stride.
535
568
One could remove the recomputation of the perm
575
611
const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm);
577
613
#define EXTRACT(op) do { \
578
const vector unsigned char s##op##_1 = vec_##op(trunc_src12, trunc_src34); \
579
const vector unsigned char s##op##_2 = vec_##op(trunc_src56, trunc_src78); \
580
const vector unsigned char s##op##_6 = vec_##op(s##op##_1, s##op##_2); \
581
const vector unsigned char s##op##_8h = vec_mergeh(s##op##_6, s##op##_6); \
582
const vector unsigned char s##op##_8l = vec_mergel(s##op##_6, s##op##_6); \
583
const vector unsigned char s##op##_9 = vec_##op(s##op##_8h, s##op##_8l); \
584
const vector unsigned char s##op##_9h = vec_mergeh(s##op##_9, s##op##_9); \
585
const vector unsigned char s##op##_9l = vec_mergel(s##op##_9, s##op##_9); \
586
const vector unsigned char s##op##_10 = vec_##op(s##op##_9h, s##op##_9l); \
587
const vector unsigned char s##op##_10h = vec_mergeh(s##op##_10, s##op##_10); \
588
const vector unsigned char s##op##_10l = vec_mergel(s##op##_10, s##op##_10); \
589
const vector unsigned char s##op##_11 = vec_##op(s##op##_10h, s##op##_10l); \
590
const vector unsigned char s##op##_11h = vec_mergeh(s##op##_11, s##op##_11); \
591
const vector unsigned char s##op##_11l = vec_mergel(s##op##_11, s##op##_11); \
592
v_##op = vec_##op(s##op##_11h, s##op##_11l); } while (0)
614
const vector unsigned char s_1 = vec_##op(trunc_src12, trunc_src34); \
615
const vector unsigned char s_2 = vec_##op(trunc_src56, trunc_src78); \
616
const vector unsigned char s_6 = vec_##op(s_1, s_2); \
617
const vector unsigned char s_8h = vec_mergeh(s_6, s_6); \
618
const vector unsigned char s_8l = vec_mergel(s_6, s_6); \
619
const vector unsigned char s_9 = vec_##op(s_8h, s_8l); \
620
const vector unsigned char s_9h = vec_mergeh(s_9, s_9); \
621
const vector unsigned char s_9l = vec_mergel(s_9, s_9); \
622
const vector unsigned char s_10 = vec_##op(s_9h, s_9l); \
623
const vector unsigned char s_10h = vec_mergeh(s_10, s_10); \
624
const vector unsigned char s_10l = vec_mergel(s_10, s_10); \
625
const vector unsigned char s_11 = vec_##op(s_10h, s_10l); \
626
const vector unsigned char s_11h = vec_mergeh(s_11, s_11); \
627
const vector unsigned char s_11l = vec_mergel(s_11, s_11); \
628
v_##op = vec_##op(s_11h, s_11l); \
594
631
vector unsigned char v_min;
595
632
vector unsigned char v_max;
615
651
const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
616
652
const vector unsigned int vuint32_1 = vec_splat_u32(1);
654
vector signed int sumA2;
655
vector signed int sumB2;
656
vector signed int sum0, sum1, sum2, sum3, sum4;
657
vector signed int sum5, sum6, sum7, sum8, sum9;
618
659
#define COMPARE(i) \
619
vector signed int sum##i; \
621
const vector unsigned char cmp##i = \
661
const vector unsigned char cmp = \
622
662
(vector unsigned char)vec_cmpgt(src##i, v_avg); \
623
const vector unsigned short cmpHi##i = \
624
(vector unsigned short)vec_mergeh(cmp##i, cmp##i); \
625
const vector unsigned short cmpLi##i = \
626
(vector unsigned short)vec_mergel(cmp##i, cmp##i); \
627
const vector signed short cmpHf##i = \
628
(vector signed short)vec_and(cmpHi##i, mask1); \
629
const vector signed short cmpLf##i = \
630
(vector signed short)vec_and(cmpLi##i, mask2); \
631
const vector signed int sump##i = vec_sum4s(cmpHf##i, zero); \
632
const vector signed int sumq##i = vec_sum4s(cmpLf##i, sump##i); \
633
sum##i = vec_sums(sumq##i, zero); } while (0)
663
const vector unsigned short cmpHi = \
664
(vector unsigned short)vec_mergeh(cmp, cmp); \
665
const vector unsigned short cmpLi = \
666
(vector unsigned short)vec_mergel(cmp, cmp); \
667
const vector signed short cmpHf = \
668
(vector signed short)vec_and(cmpHi, mask1); \
669
const vector signed short cmpLf = \
670
(vector signed short)vec_and(cmpLi, mask2); \
671
const vector signed int sump = vec_sum4s(cmpHf, zero); \
672
const vector signed int sumq = vec_sum4s(cmpLf, sump); \
673
sum##i = vec_sums(sumq, zero); \
699
738
/* I'm not sure the following is actually faster
700
739
than straight, unvectorized C code :-( */
702
DECLARE_ALIGNED(16, int, tQP2)[4];
703
tQP2[0]= c->QP/2 + 1;
704
vector signed int vQP2 = vec_ld(0, tQP2);
705
vQP2 = vec_splat(vQP2, 0);
706
const vector signed int vsint32_8 = vec_splat_s32(8);
707
const vector unsigned int vuint32_4 = vec_splat_u32(4);
709
const vector unsigned char permA1 = (vector unsigned char)
710
{0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x1F, 0x1F,
711
0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F};
712
const vector unsigned char permA2 = (vector unsigned char)
713
{0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11,
714
0x12, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F};
715
const vector unsigned char permA1inc = (vector unsigned char)
716
{0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
717
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
718
const vector unsigned char permA2inc = (vector unsigned char)
719
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
720
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
721
const vector unsigned char magic = (vector unsigned char)
722
{0x01, 0x02, 0x01, 0x02, 0x04, 0x02, 0x01, 0x02,
723
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
724
const vector unsigned char extractPerm = (vector unsigned char)
725
{0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01,
726
0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01};
727
const vector unsigned char extractPermInc = (vector unsigned char)
728
{0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
729
0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01};
730
const vector unsigned char identity = vec_lvsl(0,(unsigned char *)0);
731
const vector unsigned char tenRight = (vector unsigned char)
732
{0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
733
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
734
const vector unsigned char eightLeft = (vector unsigned char)
735
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
736
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08};
740
vector unsigned char tenRightM##i = tenRight; \
741
vector unsigned char permA1M##i = permA1; \
742
vector unsigned char permA2M##i = permA2; \
743
vector unsigned char extractPermM##i = extractPerm
742
vector unsigned char tenRightM = tenRight; \
743
vector unsigned char permA1M = permA1; \
744
vector unsigned char permA2M = permA2; \
745
vector unsigned char extractPermM = extractPerm
745
747
#define F2(i, j, k, l) \
746
748
if (S[i] & (1 << (l+1))) { \
747
const vector unsigned char a_##j##_A##l = \
748
vec_perm(src##i, src##j, permA1M##i); \
749
const vector unsigned char a_##j##_B##l = \
750
vec_perm(a_##j##_A##l, src##k, permA2M##i); \
751
const vector signed int a_##j##_sump##l = \
752
(vector signed int)vec_msum(a_##j##_B##l, magic, \
753
(vector unsigned int)zero); \
754
vector signed int F_##j##_##l = \
755
vec_sr(vec_sums(a_##j##_sump##l, vsint32_8), vuint32_4); \
756
F_##j##_##l = vec_splat(F_##j##_##l, 3); \
757
const vector signed int p_##j##_##l = \
758
(vector signed int)vec_perm(src##j, \
759
(vector unsigned char)zero, \
761
const vector signed int sum_##j##_##l = vec_add( p_##j##_##l, vQP2);\
762
const vector signed int diff_##j##_##l = vec_sub( p_##j##_##l, vQP2);\
763
vector signed int newpm_##j##_##l; \
764
if (vec_all_lt(sum_##j##_##l, F_##j##_##l)) \
765
newpm_##j##_##l = sum_##j##_##l; \
766
else if (vec_all_gt(diff_##j##_##l, F_##j##_##l)) \
767
newpm_##j##_##l = diff_##j##_##l; \
768
else newpm_##j##_##l = F_##j##_##l; \
769
const vector unsigned char newpm2_##j##_##l = \
770
vec_splat((vector unsigned char)newpm_##j##_##l, 15); \
771
const vector unsigned char mask##j##l = vec_add(identity, \
773
src##j = vec_perm(src##j, newpm2_##j##_##l, mask##j##l); \
749
const vector unsigned char a_A = vec_perm(src##i, src##j, permA1M); \
750
const vector unsigned char a_B = vec_perm(a_A, src##k, permA2M); \
751
const vector signed int a_sump = \
752
(vector signed int)vec_msum(a_B, magic, (vector unsigned int)zero);\
753
vector signed int F = vec_sr(vec_sums(a_sump, vsint32_8), vuint32_4); \
754
const vector signed int p = \
755
(vector signed int)vec_perm(src##j, (vector unsigned char)zero, \
757
const vector signed int sum = vec_add(p, vQP2); \
758
const vector signed int diff = vec_sub(p, vQP2); \
759
vector signed int newpm; \
760
vector unsigned char newpm2, mask; \
761
F = vec_splat(F, 3); \
762
if (vec_all_lt(sum, F)) \
764
else if (vec_all_gt(diff, F)) \
767
newpm2 = vec_splat((vector unsigned char)newpm, 15); \
768
mask = vec_add(identity, tenRightM); \
769
src##j = vec_perm(src##j, newpm2, mask); \
775
permA1M##i = vec_add(permA1M##i, permA1inc); \
776
permA2M##i = vec_add(permA2M##i, permA2inc); \
777
tenRightM##i = vec_sro(tenRightM##i, eightLeft); \
778
extractPermM##i = vec_add(extractPermM##i, extractPermInc)
771
permA1M = vec_add(permA1M, permA1inc); \
772
permA2M = vec_add(permA2M, permA2inc); \
773
tenRightM = vec_sro(tenRightM, eightLeft); \
774
extractPermM = vec_add(extractPermM, extractPermInc)
780
#define ITER(i, j, k) \
776
#define ITER(i, j, k) do { \
782
778
F2(i, j, k, 0); \
783
779
F2(i, j, k, 1); \
784
780
F2(i, j, k, 2); \
982
const vector signed char neg1 = vec_splat_s8(-1);
983
const vector unsigned char permHH = (const vector unsigned char){0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
984
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F};
986
/*
 * PACK_AND_STORE(src, i): write row i of the blurred result back to memory
 * at byte offset i * stride from the pointer named by `src`.
 *
 * Steps, as written below:
 *   - perms: vec_lvsr permute vector for the address src + i * stride.
 *   - vf:    v_tempBlurredAss<i> packed (unsigned, saturating) with zero.
 *   - vg:    permHH merge of vf with v_<src>A<i>.
 *   - mask:  zero / neg1 bytes permuted by perms, used as the select mask.
 *   - vg2:   vg permuted with itself by perms.
 *   - svA / svB: vg2 blended with v_<src>A1<i> and v_<src>A2<i> through mask,
 *     then stored at offsets i * stride and i * stride + 16.
 *
 * `src` is used both as the pointer expression and as a token pasted into
 * identifier names, so it must be a plain identifier. The macro has no
 * enclosing block: every temporary is declared in the caller's scope and is
 * made unique only by the src/i suffix pasted onto its name.
 *
 * Expects stride, zero, neg1, permHH, v_tempBlurredAss<i>, v_<src>A<i>,
 * v_<src>A1<i> and v_<src>A2<i> to be in scope at the expansion site.
 */
#define PACK_AND_STORE(src, i) \
987
const vector unsigned char perms##src##i = \
988
vec_lvsr(i * stride, src); \
989
const vector unsigned char vf##src##i = \
990
vec_packsu(v_tempBlurredAss##i, (vector signed short)zero); \
991
const vector unsigned char vg##src##i = \
992
vec_perm(vf##src##i, v_##src##A##i, permHH); \
993
const vector unsigned char mask##src##i = \
994
vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##src##i); \
995
const vector unsigned char vg2##src##i = \
996
vec_perm(vg##src##i, vg##src##i, perms##src##i); \
997
const vector unsigned char svA##src##i = \
998
vec_sel(v_##src##A1##i, vg2##src##i, mask##src##i); \
999
const vector unsigned char svB##src##i = \
1000
vec_sel(vg2##src##i, v_##src##A2##i, mask##src##i); \
1001
vec_st(svA##src##i, i * stride, src); \
1002
vec_st(svB##src##i, i * stride + 16, src)
982
/*
 * PACK_AND_STORE(src, i): write row i of the blurred result back to memory
 * at byte offset i * stride from the pointer named by `src`.
 *
 * The row's signed-short result v_tempBlurredAss<i> is packed to unsigned
 * bytes (saturating), merged with v_<src>A<i> through permHH, permuted by the
 * vec_lvsr vector for the destination address, and blended with
 * v_<src>A1<i> / v_<src>A2<i> through a zero/neg1 mask before both 16-byte
 * stores.
 *
 * The row index must be pasted from the macro parameter
 * (v_tempBlurredAss##i). Pasting a literal digit would make every invocation
 * store the same row's data regardless of i.
 *
 * `src` is used both as the pointer expression and as a token pasted into
 * identifier names, so it must be a plain identifier. Temporaries are local
 * to the block opened by `do {`, so they need no per-row name suffix.
 *
 * Expects stride, zero, neg1, permHH, v_tempBlurredAss<i>, v_<src>A<i>,
 * v_<src>A1<i> and v_<src>A2<i> to be in scope at the expansion site.
 */
#define PACK_AND_STORE(src, i) do { \
    const vector unsigned char perms = vec_lvsr(i * stride, src); \
    const vector unsigned char vf = \
        vec_packsu(v_tempBlurredAss##i, (vector signed short)zero); \
    const vector unsigned char vg = vec_perm(vf, v_##src##A##i, permHH); \
    const vector unsigned char mask = \
        vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms); \
    const vector unsigned char vg2 = vec_perm(vg, vg, perms); \
    const vector unsigned char svA = vec_sel(v_##src##A1##i, vg2, mask); \
    const vector unsigned char svB = vec_sel(vg2, v_##src##A2##i, mask); \
    vec_st(svA, i * stride, src); \
    vec_st(svB, i * stride + 16, src); \
1004
996
PACK_AND_STORE(src, 0);
1005
997
PACK_AND_STORE(src, 1);
1187
1180
temp7 = vec_mergel(tempD, tempL);
1190
const vector signed char neg1 = vec_splat_s8(-1);
1191
/*
 * STORE_DOUBLE_LINE(i, j): write the 16-byte vectors temp<i> and temp<j> to
 * dst at byte offsets i * stride and j * stride.
 *
 * For each of the two rows the macro:
 *   - loads the two existing vectors at offset and offset + 16 (dstA, dstB);
 *   - takes the vec_lvsr permute vector for that address (align);
 *   - builds a select mask by permuting zero / neg1 with align;
 *   - permutes temp<row> with itself by align (dstR);
 *   - blends dstR with the loaded data through the mask (dstAF, dstBF), so
 *     bytes not selected from dstR keep what was loaded;
 *   - stores both blended vectors back to the offsets they were loaded from.
 *
 * The macro has no enclosing block: all temporaries are declared in the
 * caller's scope with the row index pasted onto their names, so a given
 * index can be used only once per scope.
 *
 * Expects dst, stride, zero, neg1, temp<i> and temp<j> to be in scope.
 */
#define STORE_DOUBLE_LINE(i, j) \
1192
vector unsigned char dstA##i = vec_ld(i * stride, dst); \
1193
vector unsigned char dstB##i = vec_ld(i * stride + 16, dst); \
1194
vector unsigned char dstA##j = vec_ld(j * stride, dst); \
1195
vector unsigned char dstB##j = vec_ld(j * stride+ 16, dst); \
1196
vector unsigned char align##i = vec_lvsr(i * stride, dst); \
1197
vector unsigned char align##j = vec_lvsr(j * stride, dst); \
1198
vector unsigned char mask##i = vec_perm(zero, (vector unsigned char)neg1, align##i); \
1199
vector unsigned char mask##j = vec_perm(zero, (vector unsigned char)neg1, align##j); \
1200
vector unsigned char dstR##i = vec_perm(temp##i, temp##i, align##i);\
1201
vector unsigned char dstR##j = vec_perm(temp##j, temp##j, align##j);\
1202
vector unsigned char dstAF##i = vec_sel(dstA##i, dstR##i, mask##i); \
1203
vector unsigned char dstBF##i = vec_sel(dstR##i, dstB##i, mask##i); \
1204
vector unsigned char dstAF##j = vec_sel(dstA##j, dstR##j, mask##j); \
1205
vector unsigned char dstBF##j = vec_sel(dstR##j, dstB##j, mask##j); \
1206
vec_st(dstAF##i, i * stride, dst); \
1207
vec_st(dstBF##i, i * stride + 16, dst); \
1208
vec_st(dstAF##j, j * stride, dst); \
1209
vec_st(dstBF##j, j * stride + 16, dst)
1183
/*
 * STORE_DOUBLE_LINE(i, j): write the 16-byte vectors temp<i> and temp<j> to
 * dst at byte offsets i * stride and j * stride.
 *
 * For each of the two rows the macro loads the two existing vectors at
 * offset and offset + 16, takes the vec_lvsr permute vector for that
 * address, derives a select mask from zero / neg1, permutes temp<row> with
 * itself by the same vector, blends it with the loaded data through the mask
 * and stores both blended vectors back to the offsets they came from.
 *
 * Only temp##i / temp##j are token-pasted; the temporaries (dstAi, aligni,
 * maski, ...) are fixed names scoped to the block opened by `do {`.
 *
 * Expects dst, stride, zero, neg1, temp<i> and temp<j> to be in scope.
 */
#define STORE_DOUBLE_LINE(i, j) do { \
1184
vector unsigned char dstAi = vec_ld(i * stride, dst); \
1185
vector unsigned char dstBi = vec_ld(i * stride + 16, dst); \
1186
vector unsigned char dstAj = vec_ld(j * stride, dst); \
1187
vector unsigned char dstBj = vec_ld(j * stride+ 16, dst); \
1188
vector unsigned char aligni = vec_lvsr(i * stride, dst); \
1189
vector unsigned char alignj = vec_lvsr(j * stride, dst); \
1190
vector unsigned char maski = \
1191
vec_perm(zero, (vector unsigned char)neg1, aligni); \
1192
vector unsigned char maskj = \
1193
vec_perm(zero, (vector unsigned char)neg1, alignj); \
1194
vector unsigned char dstRi = vec_perm(temp##i, temp##i, aligni); \
1195
vector unsigned char dstRj = vec_perm(temp##j, temp##j, alignj); \
1196
vector unsigned char dstAFi = vec_sel(dstAi, dstRi, maski); \
1197
vector unsigned char dstBFi = vec_sel(dstRi, dstBi, maski); \
1198
vector unsigned char dstAFj = vec_sel(dstAj, dstRj, maskj); \
1199
vector unsigned char dstBFj = vec_sel(dstRj, dstBj, maskj); \
1200
vec_st(dstAFi, i * stride, dst); \
1201
vec_st(dstBFi, i * stride + 16, dst); \
1202
vec_st(dstAFj, j * stride, dst); \
1203
vec_st(dstBFj, j * stride + 16, dst); \
1211
1206
STORE_DOUBLE_LINE(0,1);
1212
1207
STORE_DOUBLE_LINE(2,3);