      "r" ((long) stride ),
      "r" ((long)(3*stride))

#ifdef CONFIG_ENCODERS
static int pix_norm1_mmx(uint8_t *pix, int line_size) {

      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                           pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "paddd %%mm4,%%mm7\n"

      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */

      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm3 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */

      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm3 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)

static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
      "pxor %%xmm0,%%xmm0\n"      /* xmm0 = 0 */
      "pxor %%xmm7,%%xmm7\n"      /* xmm7 holds the sum */

      "movdqu (%0),%%xmm1\n"      /* xmm1 = pix1[0][0-15] */
      "movdqu (%1),%%xmm2\n"      /* xmm2 = pix2[0][0-15] */
      "movdqu (%0,%4),%%xmm3\n"   /* xmm3 = pix1[1][0-15] */
      "movdqu (%1,%4),%%xmm4\n"   /* xmm4 = pix2[1][0-15] */

      /* todo: xmm1-xmm2, xmm3-xmm4 */
      /* algo: subtract xmm1 from xmm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movdqa %%xmm1,%%xmm5\n"
      "movdqa %%xmm3,%%xmm6\n"
      "psubusb %%xmm2,%%xmm1\n"
      "psubusb %%xmm4,%%xmm3\n"
      "psubusb %%xmm5,%%xmm2\n"
      "psubusb %%xmm6,%%xmm4\n"

      "por %%xmm1,%%xmm2\n"
      "por %%xmm3,%%xmm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movdqa %%xmm2,%%xmm1\n"
      "movdqa %%xmm4,%%xmm3\n"

      "punpckhbw %%xmm0,%%xmm2\n"
      "punpckhbw %%xmm0,%%xmm4\n"
      "punpcklbw %%xmm0,%%xmm1\n" /* xmm1 now spread over (xmm1,xmm2) */
      "punpcklbw %%xmm0,%%xmm3\n" /* xmm3 now spread over (xmm3,xmm4) */

      "pmaddwd %%xmm2,%%xmm2\n"
      "pmaddwd %%xmm4,%%xmm4\n"
      "pmaddwd %%xmm1,%%xmm1\n"
      "pmaddwd %%xmm3,%%xmm3\n"

      "lea (%0,%4,2), %0\n"       /* pix1 += 2*line_size */
      "lea (%1,%4,2), %1\n"       /* pix2 += 2*line_size */

      "paddd %%xmm2,%%xmm1\n"
      "paddd %%xmm4,%%xmm3\n"
      "paddd %%xmm1,%%xmm7\n"
      "paddd %%xmm3,%%xmm7\n"

      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $8, %%xmm7\n"       /* shift hi qword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $4, %%xmm7\n"       /* shift hi dword to lo */
      "paddd %%xmm1,%%xmm7\n"
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
      : "r" ((long)line_size));

static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {

      "movq %%mm0, %%mm1\n"

      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "movq %%mm4, %%mm1\n"

      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "movq %%mm0, %%mm1\n"

      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "movq %%mm4, %%mm1\n"

      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {

    uint8_t *pix = pix1;

      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)

    return tmp + hf_noise8_mmx(pix+8, line_size, h);

static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
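
/* NSSE ("noise preserving SSE") biases the plain sum of squared errors by
 * how much high-frequency energy coding has added or removed:
 *     score = sse + nsse_weight * |hf_noise(pix1) - hf_noise(pix2)|
 * with a default weight of 8 when no encoder context supplies
 * avctx->nsse_weight, as the fallbacks above show. */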

static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)

    return tmp & 0xFFFF;
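
/* vsad_intra is the "vertical SAD" of a single block: the sum of absolute
 * differences between vertically adjacent pixels, a cheap activity
 * measure. Plain-C sketch (illustrative, hypothetical name): */
static int vsad_intra16_c_ref(uint8_t *pix, int line_size, int h)
{
    int x, y, score = 0;
    for (y = 1; y < h; y++) {
        pix += line_size;
        for (x = 0; x < 16; x++)
            score += FFABS(pix[x] - pix[x - line_size]);
    }
    return score;
}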

static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)

static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"

      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)

    return tmp & 0x7FFF;
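
/* vsad16 compares the vertical gradients of two pictures rather than the
 * pictures themselves. The pcmpeqw/psllw/packsswb prologue materializes
 * 0x80 in every byte of mm7; "psubb; pxor mm7" then maps the signed
 * per-pixel difference d to the biased unsigned byte d+128, which the
 * unsigned-saturation trick inside SUM can process. Plain-C sketch
 * (illustrative, hypothetical name): */
static int vsad16_c_ref(uint8_t *s1, uint8_t *s2, int line_size, int h)
{
    int x, y, score = 0;
    for (y = 1; y < h; y++) {
        s1 += line_size;
        s2 += line_size;
        for (x = 0; x < 16; x++)
            score += FFABS((s1[x] - s2[x]) -
                           (s1[x - line_size] - s2[x - line_size]));
    }
    return score;
}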

static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"

      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){

      "movq (%2, %0), %%mm0 \n\t"
      "movq (%1, %0), %%mm1 \n\t"
      "psubb %%mm0, %%mm1 \n\t"
      "movq %%mm1, (%3, %0) \n\t"
      "movq 8(%2, %0), %%mm0 \n\t"
      "movq 8(%1, %0), %%mm1 \n\t"
      "psubb %%mm0, %%mm1 \n\t"
      "movq %%mm1, 8(%3, %0) \n\t"

      : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)

    dst[i+0] = src1[i+0]-src2[i+0];

static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){

      "movq -1(%1, %0), %%mm0 \n\t" // LT
      "movq (%1, %0), %%mm1 \n\t"   // T
      "movq -1(%2, %0), %%mm2 \n\t" // L
      "movq (%2, %0), %%mm3 \n\t"   // X
      "movq %%mm2, %%mm4 \n\t"      // L
      "psubb %%mm0, %%mm2 \n\t"
      "paddb %%mm1, %%mm2 \n\t"     // L + T - LT
      "movq %%mm4, %%mm5 \n\t"      // L
      "pmaxub %%mm1, %%mm4 \n\t"    // max(T, L)
      "pminub %%mm5, %%mm1 \n\t"    // min(T, L)
      "pminub %%mm2, %%mm4 \n\t"
      "pmaxub %%mm1, %%mm4 \n\t"
      "psubb %%mm4, %%mm3 \n\t"     // dst - pred
      "movq %%mm3, (%3, %0) \n\t"

      : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
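
/* The pmaxub/pminub ladder clamps L+T-LT into [min(L,T), max(L,T)], which
 * is exactly the median of (L, T, L+T-LT) -- the HuffYUV/LOCO-I "median"
 * predictor. Per pixel, the routine does the following (illustrative
 * plain-C sketch with a hypothetical name; mid_pred as used above): */
static void sub_hfyu_median_prediction_c_ref(uint8_t *dst, uint8_t *src1,
                                             uint8_t *src2, int w,
                                             int *left, int *left_top)
{
    int i, l = *left, lt = *left_top;
    for (i = 0; i < w; i++) {
        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
        lt = src1[i];               /* current top becomes next top-left */
        l  = src2[i];               /* current pixel becomes next left */
        dst[i] = l - pred;
    }
    *left     = l;
    *left_top = lt;
}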

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 " \n\t"\
    "paddw " #b2 ", " #a2 " \n\t"\
    "paddw " #b1 ", " #b1 " \n\t"\
    "paddw " #b2 ", " #b2 " \n\t"\
    "psubw " #a1 ", " #b1 " \n\t"\
    "psubw " #a2 ", " #b2 " \n\t"

#define HADAMARD48\
    LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
    LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
    LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
    LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
    LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
    LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)

#define MMABS(a,z)\
    "pxor " #z ", " #z " \n\t"\
    "pcmpgtw " #a ", " #z " \n\t"\
    "pxor " #z ", " #a " \n\t"\
    "psubw " #z ", " #a " \n\t"

#define MMABS_SUM(a,z, sum)\
    "pxor " #z ", " #z " \n\t"\
    "pcmpgtw " #a ", " #z " \n\t"\
    "pxor " #z ", " #a " \n\t"\
    "psubw " #z ", " #a " \n\t"\
    "paddusw " #a ", " #sum " \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z " \n\t"\
    "psubw " #a ", " #z " \n\t"\
    "pmaxsw " #z ", " #a " \n\t"

#define MMABS_SUM_MMX2(a,z, sum)\
    "pxor " #z ", " #z " \n\t"\
    "psubw " #a ", " #z " \n\t"\
    "pmaxsw " #z ", " #a " \n\t"\
    "paddusw " #a ", " #sum " \n\t"

#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */

#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1), " #a " \n\t"\
    "movq "#o"+16(%1), " #b " \n\t"\
    "movq "#o"+32(%1), " #c " \n\t"\
    "movq "#o"+48(%1), " #d " \n\t"

#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1) \n\t"\
    "movq "#b", "#o"+16(%1) \n\t"\
    "movq "#c", "#o"+32(%1) \n\t"\
    "movq "#d", "#o"+48(%1) \n\t"

static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
    DECLARE_ALIGNED_8(uint64_t, temp[16]);

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

      LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
      LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

      HADAMARD48

      "movq %%mm7, 112(%1) \n\t"

      TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
      STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

      "movq 112(%1), %%mm7 \n\t"
      TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
      STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

      LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
      LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

      HADAMARD48

      "movq %%mm7, 120(%1) \n\t"

      TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
      STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

      "movq 120(%1), %%mm7 \n\t"
      TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
      "movq %%mm7, %%mm5 \n\t" //FIXME remove
      "movq %%mm6, %%mm7 \n\t"
      "movq %%mm0, %%mm6 \n\t"
//    STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

      LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//    LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

      HADAMARD48

      "movq %%mm7, 64(%1) \n\t"
      MMABS(%%mm0, %%mm7)
      MMABS_SUM(%%mm1, %%mm7, %%mm0)
      MMABS_SUM(%%mm2, %%mm7, %%mm0)
      MMABS_SUM(%%mm3, %%mm7, %%mm0)
      MMABS_SUM(%%mm4, %%mm7, %%mm0)
      MMABS_SUM(%%mm5, %%mm7, %%mm0)
      MMABS_SUM(%%mm6, %%mm7, %%mm0)
      "movq 64(%1), %%mm1 \n\t"
      MMABS_SUM(%%mm1, %%mm7, %%mm0)
      "movq %%mm0, 64(%1) \n\t"

      LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
      LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

      HADAMARD48

      "movq %%mm7, (%1) \n\t"
      MMABS(%%mm0, %%mm7)
      MMABS_SUM(%%mm1, %%mm7, %%mm0)
      MMABS_SUM(%%mm2, %%mm7, %%mm0)
      MMABS_SUM(%%mm3, %%mm7, %%mm0)
      MMABS_SUM(%%mm4, %%mm7, %%mm0)
      MMABS_SUM(%%mm5, %%mm7, %%mm0)
      MMABS_SUM(%%mm6, %%mm7, %%mm0)
      "movq (%1), %%mm1 \n\t"
      MMABS_SUM(%%mm1, %%mm7, %%mm0)
      "movq 64(%1), %%mm1 \n\t"
      MMABS_SUM(%%mm1, %%mm7, %%mm0)

      "movq %%mm0, %%mm1 \n\t"
      "psrlq $32, %%mm0 \n\t"
      "paddusw %%mm1, %%mm0 \n\t"
      "movq %%mm0, %%mm1 \n\t"
      "psrlq $16, %%mm0 \n\t"
      "paddusw %%mm1, %%mm0 \n\t"
      "movd %%mm0, %0 \n\t"

static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
    DECLARE_ALIGNED_8(uint64_t, temp[16]);

    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);

      LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
      LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)

      HADAMARD48

      "movq %%mm7, 112(%1) \n\t"

      TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
      STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)

      "movq 112(%1), %%mm7 \n\t"
      TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
      STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)

      LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
      LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

      HADAMARD48

      "movq %%mm7, 120(%1) \n\t"

      TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
      STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)

      "movq 120(%1), %%mm7 \n\t"
      TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
      "movq %%mm7, %%mm5 \n\t" //FIXME remove
      "movq %%mm6, %%mm7 \n\t"
      "movq %%mm0, %%mm6 \n\t"
//    STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove

      LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
//    LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)

      HADAMARD48

      "movq %%mm7, 64(%1) \n\t"
      MMABS_MMX2(%%mm0, %%mm7)
      MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
      MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
      MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
      MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
      MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
      MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
      "movq 64(%1), %%mm1 \n\t"
      MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
      "movq %%mm0, 64(%1) \n\t"

      LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
      LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)

      HADAMARD48

      "movq %%mm7, (%1) \n\t"
      MMABS_MMX2(%%mm0, %%mm7)
      MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
      MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
      MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
      MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
      MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
      MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
      "movq (%1), %%mm1 \n\t"
      MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
      "movq 64(%1), %%mm1 \n\t"
      MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)

      "pshufw $0x0E, %%mm0, %%mm1 \n\t"
      "paddusw %%mm1, %%mm0 \n\t"
      "pshufw $0x01, %%mm0, %%mm1 \n\t"
      "paddusw %%mm1, %%mm0 \n\t"
      "movd %%mm0, %0 \n\t"

WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
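
/* WARPER8_16_SQ (the codebase's own spelling) is the 8->16 wrapper macro
 * from dsputil: it evaluates the wrapped 8x8 comparison on the four 8x8
 * quadrants of a 16x16 block and sums the results, which is valid here
 * because the Hadamard difference is accumulated over disjoint 8x8 blocks
 * anyway. */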

#endif //CONFIG_ENCODERS

#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)

/* draw the edges of width 'w' of an image of size width x height;
   this MMX version can only handle w==8 || w==16 */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;

    last_line = buf + (height - 1) * wrap;

      "movd (%0), %%mm0 \n\t"
      "punpcklbw %%mm0, %%mm0 \n\t"
      "punpcklwd %%mm0, %%mm0 \n\t"
      "punpckldq %%mm0, %%mm0 \n\t"
      "movq %%mm0, -8(%0) \n\t"
      "movq -8(%0, %2), %%mm1 \n\t"
      "punpckhbw %%mm1, %%mm1 \n\t"
      "punpckhwd %%mm1, %%mm1 \n\t"
      "punpckhdq %%mm1, %%mm1 \n\t"
      "movq %%mm1, (%0, %2) \n\t"

      : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)

      "movd (%0), %%mm0 \n\t"
      "punpcklbw %%mm0, %%mm0 \n\t"
      "punpcklwd %%mm0, %%mm0 \n\t"
      "punpckldq %%mm0, %%mm0 \n\t"
      "movq %%mm0, -8(%0) \n\t"
      "movq %%mm0, -16(%0) \n\t"
      "movq -8(%0, %2), %%mm1 \n\t"
      "punpckhbw %%mm1, %%mm1 \n\t"
      "punpckhwd %%mm1, %%mm1 \n\t"
      "punpckhdq %%mm1, %%mm1 \n\t"
      "movq %%mm1, (%0, %2) \n\t"
      "movq %%mm1, 8(%0, %2) \n\t"

      : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)

    /* top and bottom (and hopefully also the corners) */
    ptr= buf - (i + 1) * wrap - w;

      "movq (%1, %0), %%mm0 \n\t"
      "movq %%mm0, (%0) \n\t"
      "movq %%mm0, (%0, %2) \n\t"
      "movq %%mm0, (%0, %2, 2) \n\t"
      "movq %%mm0, (%0, %3) \n\t"

      : "r" ((long)buf - (long)ptr - w), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (ptr+width+2*w)

    ptr= last_line + (i + 1) * wrap - w;

      "movq (%1, %0), %%mm0 \n\t"
      "movq %%mm0, (%0) \n\t"
      "movq %%mm0, (%0, %2) \n\t"
      "movq %%mm0, (%0, %2, 2) \n\t"
      "movq %%mm0, (%0, %3) \n\t"

      : "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w)

#define PAETH(cpu, abs3)\
void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
    "pxor %%mm7, %%mm7 \n"\
    "movd (%1,%0), %%mm0 \n"\
    "movd (%2,%0), %%mm1 \n"\
    "punpcklbw %%mm7, %%mm0 \n"\
    "punpcklbw %%mm7, %%mm1 \n"\
    "movq %%mm1, %%mm2 \n"\
    "movd (%2,%0), %%mm1 \n"\
    "movq %%mm2, %%mm3 \n"\
    "punpcklbw %%mm7, %%mm1 \n"\
    "movq %%mm2, %%mm4 \n"\
    "psubw %%mm1, %%mm3 \n"\
    "psubw %%mm0, %%mm4 \n"\
    "movq %%mm3, %%mm5 \n"\
    "paddw %%mm4, %%mm5 \n"\
    abs3\
    "movq %%mm4, %%mm6 \n"\
    "pminsw %%mm5, %%mm6 \n"\
    "pcmpgtw %%mm6, %%mm3 \n"\
    "pcmpgtw %%mm5, %%mm4 \n"\
    "movq %%mm4, %%mm6 \n"\
    "pand %%mm3, %%mm4 \n"\
    "pandn %%mm3, %%mm6 \n"\
    "pandn %%mm0, %%mm3 \n"\
    "movd (%3,%0), %%mm0 \n"\
    "pand %%mm1, %%mm6 \n"\
    "pand %%mm4, %%mm2 \n"\
    "punpcklbw %%mm7, %%mm0 \n"\
    "paddw %%mm6, %%mm0 \n"\
    "paddw %%mm2, %%mm3 \n"\
    "paddw %%mm3, %%mm0 \n"\
    "pand %%mm5, %%mm0 \n"\
    "movq %%mm0, %%mm3 \n"\
    "packuswb %%mm3, %%mm3 \n"\
    "movd %%mm3, (%1,%0) \n"\
    :"r"(dst), "r"(top), "r"(src), "r"((long)bpp), "g"(end),\

#define ABS3_MMX2\
    "psubw %%mm5, %%mm7 \n"\
    "pmaxsw %%mm7, %%mm5 \n"\
    "pxor %%mm6, %%mm6 \n"\
    "pxor %%mm7, %%mm7 \n"\
    "psubw %%mm3, %%mm6 \n"\
    "psubw %%mm4, %%mm7 \n"\
    "pmaxsw %%mm6, %%mm3 \n"\
    "pmaxsw %%mm7, %%mm4 \n"\
    "pxor %%mm7, %%mm7 \n"

#define ABS3_SSSE3\
    "pabsw %%mm3, %%mm3 \n"\
    "pabsw %%mm4, %%mm4 \n"\
    "pabsw %%mm5, %%mm5 \n"

PAETH(mmx2, ABS3_MMX2)

PAETH(ssse3, ABS3_SSSE3)
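
/* The Paeth predictor picks whichever of left, top and top-left is closest
 * to p = left + top - topleft; the compare/mask network above makes that
 * selection in parallel, with the abs3 argument supplying the three
 * absolute values in either MMX2 (psubw/pmaxsw) or SSSE3 (pabsw) flavour.
 * One sample in plain C (illustrative, hypothetical name): */
static int paeth_ref(int a, int b, int c)   /* a = left, b = top, c = top-left */
{
    const int p  = a + b - c;
    const int pa = FFABS(p - a);
    const int pb = FFABS(p - b);
    const int pc = FFABS(p - c);
    if (pa <= pb && pa <= pc) return a;
    if (pb <= pc)             return b;
    return c;
}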

#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 " \n\t" /* x1 */\

    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;

    SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
    SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)

#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

    dspfunc(put_2tap_qpel, 0, 16);
    dspfunc(put_2tap_qpel, 1, 8);
    dspfunc(avg_2tap_qpel, 0, 16);
    dspfunc(avg_2tap_qpel, 1, 8);

    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;

    SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
    SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
    SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
    SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
    SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
    SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);

    SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
    SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
    SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
    SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
    SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
    SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);

    SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
    SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
    SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
    SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);

    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;

    if (ENABLE_CAVS_DECODER)
        ff_cavsdsp_init_3dnow(c, avctx);

#define H264_QPEL_FUNCS(x, y, CPU)\
    c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
    c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
    c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
    c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;

    if((mm_flags & MM_SSE2) && !(mm_flags & MM_3DNOW)){
        // these functions are slower than mmx on AMD, but faster on Intel
/* FIXME works in most codecs, but crashes svq1 due to unaligned chroma
        c->put_pixels_tab[0][0] = put_pixels16_sse2;
        c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
*/
        H264_QPEL_FUNCS(0, 0, sse2);

    if(mm_flags & MM_SSE2){
        H264_QPEL_FUNCS(0, 1, sse2);
        H264_QPEL_FUNCS(0, 2, sse2);
        H264_QPEL_FUNCS(0, 3, sse2);
        H264_QPEL_FUNCS(1, 1, sse2);
        H264_QPEL_FUNCS(1, 2, sse2);
        H264_QPEL_FUNCS(1, 3, sse2);
        H264_QPEL_FUNCS(2, 1, sse2);
        H264_QPEL_FUNCS(2, 2, sse2);
        H264_QPEL_FUNCS(2, 3, sse2);
        H264_QPEL_FUNCS(3, 1, sse2);
        H264_QPEL_FUNCS(3, 2, sse2);
        H264_QPEL_FUNCS(3, 3, sse2);

    if(mm_flags & MM_SSSE3){
        H264_QPEL_FUNCS(1, 0, ssse3);
        H264_QPEL_FUNCS(1, 1, ssse3);
        H264_QPEL_FUNCS(1, 2, ssse3);
        H264_QPEL_FUNCS(1, 3, ssse3);
        H264_QPEL_FUNCS(2, 0, ssse3);
        H264_QPEL_FUNCS(2, 1, ssse3);
        H264_QPEL_FUNCS(2, 2, ssse3);
        H264_QPEL_FUNCS(2, 3, ssse3);
        H264_QPEL_FUNCS(3, 0, ssse3);
        H264_QPEL_FUNCS(3, 1, ssse3);
        H264_QPEL_FUNCS(3, 2, ssse3);
        H264_QPEL_FUNCS(3, 3, ssse3);
        c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_nornd;
        c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd;
        c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd;
        c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
        c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
        c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;

#ifdef CONFIG_SNOW_DECODER
    if(mm_flags & MM_SSE2 & 0){
        c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
        c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
        c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;

    if(mm_flags & MM_MMXEXT){
        c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
        c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
        c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;