122
123
#define LOAD_TOP_RIGHT_EDGE\
123
const int av_unused t4= topright[0];\
124
const int av_unused t5= topright[1];\
125
const int av_unused t6= topright[2];\
126
const int av_unused t7= topright[3];\
124
const unsigned av_unused t4 = topright[0];\
125
const unsigned av_unused t5 = topright[1];\
126
const unsigned av_unused t6 = topright[2];\
127
const unsigned av_unused t7 = topright[3];\
128
129
#define LOAD_DOWN_LEFT_EDGE\
129
const int av_unused l4= src[-1+4*stride];\
130
const int av_unused l5= src[-1+5*stride];\
131
const int av_unused l6= src[-1+6*stride];\
132
const int av_unused l7= src[-1+7*stride];\
130
const unsigned av_unused l4 = src[-1+4*stride];\
131
const unsigned av_unused l5 = src[-1+5*stride];\
132
const unsigned av_unused l6 = src[-1+6*stride];\
133
const unsigned av_unused l7 = src[-1+7*stride];\
134
135
#define LOAD_LEFT_EDGE\
135
const int av_unused l0= src[-1+0*stride];\
136
const int av_unused l1= src[-1+1*stride];\
137
const int av_unused l2= src[-1+2*stride];\
138
const int av_unused l3= src[-1+3*stride];\
136
const unsigned av_unused l0 = src[-1+0*stride];\
137
const unsigned av_unused l1 = src[-1+1*stride];\
138
const unsigned av_unused l2 = src[-1+2*stride];\
139
const unsigned av_unused l3 = src[-1+3*stride];\
140
141
#define LOAD_TOP_EDGE\
141
const int av_unused t0= src[ 0-1*stride];\
142
const int av_unused t1= src[ 1-1*stride];\
143
const int av_unused t2= src[ 2-1*stride];\
144
const int av_unused t3= src[ 3-1*stride];\
142
const unsigned av_unused t0 = src[ 0-1*stride];\
143
const unsigned av_unused t1 = src[ 1-1*stride];\
144
const unsigned av_unused t2 = src[ 2-1*stride];\
145
const unsigned av_unused t3 = src[ 3-1*stride];\
146
147
static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright, int _stride){
147
148
pixel *src = (pixel*)_src;
480
PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1);
481
PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0);
482
PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1);
505
PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1)
506
PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0)
507
PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1)
509
static void FUNCC(pred8x16_128_dc)(uint8_t *_src, int stride){
510
FUNCC(pred8x8_128_dc)(_src, stride);
511
FUNCC(pred8x8_128_dc)(_src+8*stride, stride);
484
514
static void FUNCC(pred8x8_left_dc)(uint8_t *_src, int stride){
562
//the following 4 functions should not be optimized!
618
static void FUNCC(pred8x16_dc)(uint8_t *_src, int stride){
620
int dc0, dc1, dc2, dc3, dc4;
621
pixel4 dc0splat, dc1splat, dc2splat, dc3splat, dc4splat, dc5splat, dc6splat, dc7splat;
622
pixel *src = (pixel*)_src;
623
stride >>= sizeof(pixel)-1;
625
dc0=dc1=dc2=dc3=dc4=0;
627
dc0+= src[-1+i*stride] + src[i-stride];
628
dc1+= src[4+i-stride];
629
dc2+= src[-1+(i+4)*stride];
630
dc3+= src[-1+(i+8)*stride];
631
dc4+= src[-1+(i+12)*stride];
633
dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
634
dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
635
dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
636
dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
637
dc4splat = PIXEL_SPLAT_X4((dc3 + 2)>>2);
638
dc5splat = PIXEL_SPLAT_X4((dc1 + dc3 + 4)>>3);
639
dc6splat = PIXEL_SPLAT_X4((dc4 + 2)>>2);
640
dc7splat = PIXEL_SPLAT_X4((dc1 + dc4 + 4)>>3);
643
AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
644
AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
647
AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
648
AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat);
651
AV_WN4PA(((pixel4*)(src+i*stride))+0, dc4splat);
652
AV_WN4PA(((pixel4*)(src+i*stride))+1, dc5splat);
654
for(i=12; i<16; i++){
655
AV_WN4PA(((pixel4*)(src+i*stride))+0, dc6splat);
656
AV_WN4PA(((pixel4*)(src+i*stride))+1, dc7splat);
563
660
static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){
564
661
FUNCC(pred8x8_top_dc)(src, stride);
565
662
FUNCC(pred4x4_dc)(src, NULL, stride);
665
static void FUNC(pred8x16_mad_cow_dc_l0t)(uint8_t *src, int stride){
666
FUNCC(pred8x16_top_dc)(src, stride);
667
FUNCC(pred4x4_dc)(src, NULL, stride);
568
670
static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){
569
671
FUNCC(pred8x8_dc)(src, stride);
570
672
FUNCC(pred4x4_top_dc)(src, NULL, stride);
675
static void FUNC(pred8x16_mad_cow_dc_0lt)(uint8_t *src, int stride){
676
FUNCC(pred8x16_dc)(src, stride);
677
FUNCC(pred4x4_top_dc)(src, NULL, stride);
573
680
static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){
574
681
FUNCC(pred8x8_left_dc)(src, stride);
575
682
FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride);
576
683
FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
686
static void FUNC(pred8x16_mad_cow_dc_l00)(uint8_t *src, int stride){
687
FUNCC(pred8x16_left_dc)(src, stride);
688
FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride);
689
FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
579
692
static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){
580
693
FUNCC(pred8x8_left_dc)(src, stride);
581
694
FUNCC(pred4x4_128_dc)(src , NULL, stride);
582
695
FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
698
static void FUNC(pred8x16_mad_cow_dc_0l0)(uint8_t *src, int stride){
699
FUNCC(pred8x16_left_dc)(src, stride);
700
FUNCC(pred4x4_128_dc)(src , NULL, stride);
701
FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
585
704
static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){
739
static void FUNCC(pred8x16_plane)(uint8_t *_src, int _stride){
743
pixel *src = (pixel*)_src;
744
int stride = _stride>>(sizeof(pixel)-1);
745
const pixel * const src0 = src +3-stride;
746
const pixel * src1 = src +8*stride-1;
747
const pixel * src2 = src1-2*stride; // == src+6*stride-1;
748
int H = src0[1] - src0[-1];
749
int V = src1[0] - src2[ 0];
751
for (k = 2; k <= 4; ++k) {
752
src1 += stride; src2 -= stride;
753
H += k*(src0[k] - src0[-k]);
754
V += k*(src1[0] - src2[ 0]);
756
for (; k <= 8; ++k) {
757
src1 += stride; src2 -= stride;
758
V += k*(src1[0] - src2[0]);
764
a = 16*(src1[0] + src2[8] + 1) - 7*V - 3*H;
765
for(j=16; j>0; --j) {
768
src[0] = CLIP((b ) >> 5);
769
src[1] = CLIP((b+ H) >> 5);
770
src[2] = CLIP((b+2*H) >> 5);
771
src[3] = CLIP((b+3*H) >> 5);
772
src[4] = CLIP((b+4*H) >> 5);
773
src[5] = CLIP((b+5*H) >> 5);
774
src[6] = CLIP((b+6*H) >> 5);
775
src[7] = CLIP((b+7*H) >> 5);
620
780
#define SRC(x,y) src[(x)+(y)*stride]
622
782
const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
987
1147
FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1150
static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1153
FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1155
FUNCC(pred4x4_vertical_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
990
1158
static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
992
1160
for(i=0; i<4; i++)
993
1161
FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1164
static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1167
FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1169
FUNCC(pred4x4_horizontal_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);