95
77
SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
96
78
SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
98
static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg dst_stride, x86_reg src_stride){
99
__asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
100
"movd (%1), %%mm0 \n\t"
102
"movd (%1), %%mm1 \n\t"
103
"movd (%1,%3,1), %%mm2 \n\t"
104
"movd (%1,%3,2), %%mm3 \n\t"
105
"punpcklbw %%mm1, %%mm0 \n\t"
106
"punpcklbw %%mm3, %%mm2 \n\t"
107
"movq %%mm0, %%mm1 \n\t"
108
"punpcklwd %%mm2, %%mm0 \n\t"
109
"punpckhwd %%mm2, %%mm1 \n\t"
110
"movd %%mm0, (%0) \n\t"
112
"punpckhdq %%mm0, %%mm0 \n\t"
113
"movd %%mm0, (%0) \n\t"
114
"movd %%mm1, (%0,%2,1) \n\t"
115
"punpckhdq %%mm1, %%mm1 \n\t"
116
"movd %%mm1, (%0,%2,2) \n\t"
// Transpose-interleave two 4x4 byte blocks held in the low halves of the
// MMX registers a..d and e..h; results land in a, d, t, c (t is a spare
// register, and the original contents of b are clobbered).
// e,f,g,h can be memory
#define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\
"punpcklbw " #e ", " #a " \n\t" /* a0 e0 a1 e1 a2 e2 a3 e3 */\
"punpcklbw " #f ", " #b " \n\t" /* b0 f0 b1 f1 b2 f2 b3 f3 */\
"punpcklbw " #g ", " #c " \n\t" /* c0 g0 c1 g1 c2 g2 c3 g3 */\
"punpcklbw " #h ", " #d " \n\t" /* d0 h0 d1 h1 d2 h2 d3 h3 */\
SBUTTERFLY(a, b, t, bw, q) /* a= a0 b0 e0 f0 a1 b1 e1 f1 */\
                           /* t= a2 b2 e2 f2 a3 b3 e3 f3 */\
SBUTTERFLY(c, d, b, bw, q) /* c= c0 d0 g0 h0 c1 d1 g1 h1 */\
                           /* b= c2 d2 g2 h2 c3 d3 g3 h3 */\
SBUTTERFLY(a, c, d, wd, q) /* a= a0 b0 c0 d0 e0 f0 g0 h0 */\
                           /* d= a1 b1 c1 d1 e1 f1 g1 h1 */\
SBUTTERFLY(t, b, c, wd, q) /* t= a2 b2 c2 d2 e2 f2 g2 h2 */\
                           /* c= a3 b3 c3 d3 e3 f3 g3 h3 */
// 8x8 word transpose of xmm registers a..h using SBUTTERFLY.
// NOTE(review): the two variants below were defined twice without guards in
// the mangled source (a macro redefinition); the ARCH_X86_64 conditional is
// restored here — confirm against the upstream revision. The 64-bit variant
// uses the extra register xmm8 as scratch; the 32-bit variant spills through
// the 32-byte memory operand t (and 16"#t" for its upper half) instead.
#if ARCH_X86_64
// permutes 01234567 -> 05736421
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
SBUTTERFLY(c,d,b,wd,dqa)\
SBUTTERFLY(e,f,d,wd,dqa)\
SBUTTERFLY(g,h,f,wd,dqa)\
SBUTTERFLY(a,c,h,dq,dqa)\
SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
SBUTTERFLY(e,g,b,dq,dqa)\
SBUTTERFLY(d,f,g,dq,dqa)\
SBUTTERFLY(a,e,f,qdq,dqa)\
SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
SBUTTERFLY(h,b,d,qdq,dqa)\
SBUTTERFLY(c,g,b,qdq,dqa)\
"movdqa %%xmm8, "#g" \n\t"
#else
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
"movdqa "#h", "#t" \n\t"\
SBUTTERFLY(a,b,h,wd,dqa)\
"movdqa "#h", 16"#t" \n\t"\
"movdqa "#t", "#h" \n\t"\
SBUTTERFLY(c,d,b,wd,dqa)\
SBUTTERFLY(e,f,d,wd,dqa)\
SBUTTERFLY(g,h,f,wd,dqa)\
SBUTTERFLY(a,c,h,dq,dqa)\
"movdqa "#h", "#t" \n\t"\
"movdqa 16"#t", "#h" \n\t"\
SBUTTERFLY(h,b,c,dq,dqa)\
SBUTTERFLY(e,g,b,dq,dqa)\
SBUTTERFLY(d,f,g,dq,dqa)\
SBUTTERFLY(a,e,f,qdq,dqa)\
SBUTTERFLY(h,d,e,qdq,dqa)\
"movdqa "#h", 16"#t" \n\t"\
"movdqa "#t", "#h" \n\t"\
SBUTTERFLY(h,b,d,qdq,dqa)\
SBUTTERFLY(c,g,b,qdq,dqa)\
"movdqa 16"#t", "#g" \n\t"
#endif
// Load the constant 0x0001000100010001 (a 1 in each 16-bit word) into the
// named MMX register: pcmpeqd sets all bits, psrlw leaves only bit 0 of
// each word.
#define MOVQ_WONE(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)
187
void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx);
188
void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx);
85
void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx);
86
void ff_dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx);
190
88
void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
191
89
void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
192
90
void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
/* CAVS quarter-pel copy/average entry points (full-pel mc00 case).
 * The _mmx2-suffixed declarations from the older revision were the same
 * functions before the mmx2 -> mmxext rename and have been dropped. */
void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride);
void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride);
void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride);
void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride);
199
97
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd);
200
void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd);
202
void ff_mmx_idct(DCTELEM *block);
203
void ff_mmxext_idct(DCTELEM *block);
98
void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src, int stride, int rnd);
/* RV40 quarter-pel copy/average entry points (mc33 = 3/4,3/4 fractional
 * position case). */
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
void ff_put_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
206
105
void ff_deinterlace_line_mmx(uint8_t *dst,
207
106
const uint8_t *lum_m4, const uint8_t *lum_m3,