25
25
#include "libavutil/cpu.h"
26
#include "libavutil/x86_cpu.h"
26
#include "libavutil/x86/asm.h"
27
#include "libavutil/x86/cpu.h"
27
28
#include "libavcodec/dsputil.h"
28
29
#include "libavcodec/mpegvideo.h"
29
30
#include "libavcodec/mathops.h"
30
31
#include "dsputil_mmx.h"
33
36
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
326
int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h);
328
329
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
330
331
__asm__ volatile (
649
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
650
static int vsad_intra16_mmxext(void *v, uint8_t *pix, uint8_t *dummy,
651
int line_size, int h)
652
655
assert( (((int)pix) & 7) == 0);
767
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
770
static int vsad16_mmxext(void *v, uint8_t *pix1, uint8_t *pix2,
771
int line_size, int h)
770
775
assert( (((int)pix1) & 7) == 0);
843
848
dst[i+0] = src1[i+0]-src2[i+0];
846
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
851
static void sub_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *src1,
852
const uint8_t *src2, int w,
853
int *left, int *left_top)
850
858
__asm__ volatile(
859
"movq (%1, %0), %%mm0 \n\t" // LT
860
"psllq $8, %%mm0 \n\t"
852
"movq -1(%1, %0), %%mm0 \n\t" // LT
853
862
"movq (%1, %0), %%mm1 \n\t" // T
854
863
"movq -1(%2, %0), %%mm2 \n\t" // L
855
864
"movq (%2, %0), %%mm3 \n\t" // X
864
873
"psubb %%mm4, %%mm3 \n\t" // dst - pred
865
874
"movq %%mm3, (%3, %0) \n\t"
866
875
"add $8, %0 \n\t"
876
"movq -1(%1, %0), %%mm0 \n\t" // LT
867
877
"cmp %4, %0 \n\t"
885
895
"pxor " #z ", " #a " \n\t"\
886
896
"psubw " #z ", " #a " \n\t"
888
#define MMABS_MMX2(a,z)\
898
#define MMABS_MMXEXT(a, z) \
889
899
"pxor " #z ", " #z " \n\t"\
890
900
"psubw " #a ", " #z " \n\t"\
891
901
"pmaxsw " #z ", " #a " \n\t"
909
919
"paddusw "#t", "#a" \n\t"\
910
920
"movd "#a", "#dst" \n\t"\
912
#define HSUM_MMX2(a, t, dst)\
922
#define HSUM_MMXEXT(a, t, dst) \
913
923
"pshufw $0x0E, "#a", "#t" \n\t"\
914
924
"paddusw "#t", "#a" \n\t"\
915
925
"pshufw $0x01, "#a", "#t" \n\t"\
925
935
"paddusw "#t", "#a" \n\t"\
926
936
"movd "#a", "#dst" \n\t"\
928
#define hadamard_func(cpu) \
929
int ff_hadamard8_diff_##cpu (void *s, uint8_t *src1, uint8_t *src2, \
930
int stride, int h); \
931
int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
939
938
#define DCT_SAD4(m,mm,o)\
940
939
"mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
941
940
"mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
985
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
986
#define MMABS(a,z) MMABS_MMX2(a,z)
984
#define HSUM(a,t,dst) HSUM_MMXEXT(a,t,dst)
985
#define MMABS(a,z) MMABS_MMXEXT(a,z)
1066
1065
"pmulhrw " #s ", "#x " \n\t"\
1067
1066
"pmulhrw " #s ", "#y " \n\t"
1069
#include "dsputil_mmx_qns_template.c"
1068
#include "dsputil_qns_template.c"
1073
1072
#undef SCALE_OFFSET
1075
#if HAVE_SSSE3_INLINE
1078
1077
#define DEF(x) x ## _ssse3
1079
1078
#define SET_RND(x)
1085
1084
"pmulhrsw " #s ", "#x " \n\t"\
1086
1085
"pmulhrsw " #s ", "#y " \n\t"
1088
#include "dsputil_mmx_qns_template.c"
1087
#include "dsputil_qns_template.c"
1092
1091
#undef SCALE_OFFSET
1098
void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
1094
#endif /* HAVE_SSSE3_INLINE */
1096
#endif /* HAVE_INLINE_ASM */
1098
int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h);
1100
#define hadamard_func(cpu) \
1101
int ff_hadamard8_diff_##cpu (void *s, uint8_t *src1, uint8_t *src2, \
1102
int stride, int h); \
1103
int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
1107
hadamard_func(mmxext)
1109
hadamard_func(ssse3)
1111
void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
1100
1113
int mm_flags = av_get_cpu_flags();
1101
1116
int bit_depth = avctx->bits_per_raw_sample;
1103
1118
if (mm_flags & AV_CPU_FLAG_MMX) {
1106
1121
(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)) {
1107
1122
if(mm_flags & AV_CPU_FLAG_SSE2){
1108
1123
c->fdct = ff_fdct_sse2;
1109
}else if(mm_flags & AV_CPU_FLAG_MMX2){
1110
c->fdct = ff_fdct_mmx2;
1124
} else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
1125
c->fdct = ff_fdct_mmxext;
1112
1127
c->fdct = ff_fdct_mmx;
1121
1136
c->diff_bytes= diff_bytes_mmx;
1122
1137
c->sum_abs_dctelem= sum_abs_dctelem_mmx;
1125
c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx;
1126
c->hadamard8_diff[1]= ff_hadamard8_diff_mmx;
1129
1139
c->pix_norm1 = pix_norm1_mmx;
1130
c->sse[0] = (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE2) ? ff_sse16_sse2 : sse16_mmx;
1131
c->sse[1] = sse8_mmx;
1140
c->sse[0] = sse16_mmx;
1141
c->sse[1] = sse8_mmx;
1132
1142
c->vsad[4]= vsad_intra16_mmx;
1134
1144
c->nsse[0] = nsse16_mmx;
1145
1155
c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
1148
if (mm_flags & AV_CPU_FLAG_MMX2) {
1149
c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
1151
c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx2;
1152
c->hadamard8_diff[1]= ff_hadamard8_diff_mmx2;
1154
c->vsad[4]= vsad_intra16_mmx2;
1157
if (mm_flags & AV_CPU_FLAG_MMXEXT) {
1158
c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
1159
c->vsad[4] = vsad_intra16_mmxext;
1156
1161
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1157
c->vsad[0] = vsad16_mmx2;
1162
c->vsad[0] = vsad16_mmxext;
1160
c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
1165
c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_mmxext;
1163
1168
if(mm_flags & AV_CPU_FLAG_SSE2){
1164
1169
if (bit_depth <= 8)
1165
1170
c->get_pixels = get_pixels_sse2;
1166
1171
c->sum_abs_dctelem= sum_abs_dctelem_sse2;
1167
#if HAVE_YASM && HAVE_ALIGNED_STACK
1168
c->hadamard8_diff[0]= ff_hadamard8_diff16_sse2;
1169
c->hadamard8_diff[1]= ff_hadamard8_diff_sse2;
1174
#if HAVE_SSSE3_INLINE
1174
1175
if(mm_flags & AV_CPU_FLAG_SSSE3){
1175
1176
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1176
1177
c->try_8x8basis= try_8x8basis_ssse3;
1178
1179
c->add_8x8basis= add_8x8basis_ssse3;
1179
1180
c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
1180
#if HAVE_YASM && HAVE_ALIGNED_STACK
1181
c->hadamard8_diff[0]= ff_hadamard8_diff16_ssse3;
1182
c->hadamard8_diff[1]= ff_hadamard8_diff_ssse3;
1191
1188
c->add_8x8basis= add_8x8basis_3dnow;
1195
dsputil_init_pix_mmx(c, avctx);
1191
#endif /* HAVE_INLINE_ASM */
1193
if (EXTERNAL_MMX(mm_flags)) {
1194
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
1195
c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
1197
if (EXTERNAL_MMXEXT(mm_flags)) {
1198
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
1199
c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
1202
if (EXTERNAL_SSE2(mm_flags)) {
1203
c->sse[0] = ff_sse16_sse2;
1205
#if HAVE_ALIGNED_STACK
1206
c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
1207
c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
1211
if (EXTERNAL_SSSE3(mm_flags) && HAVE_ALIGNED_STACK) {
1212
c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
1213
c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
1217
ff_dsputil_init_pix_mmx(c, avctx);