19
19
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
20
;******************************************************************************
23
%include "x86util.asm"
22
%include "libavutil/x86/x86util.asm"
27
34
;---------------------------------------------------------------------------------
28
35
; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
29
36
;---------------------------------------------------------------------------------
30
%macro INT32_TO_FLOAT_FMUL_SCALAR 2
32
cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
37
%macro INT32_TO_FLOAT_FMUL_SCALAR 1
39
cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
34
cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
41
cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
68
%define SPLATD SPLATD_SSE
70
INT32_TO_FLOAT_FMUL_SCALAR sse, 5
72
%define SPLATD SPLATD_SSE2
73
INT32_TO_FLOAT_FMUL_SCALAR sse2, 3
75
INT32_TO_FLOAT_FMUL_SCALAR 5
77
INT32_TO_FLOAT_FMUL_SCALAR 3
77
80
;------------------------------------------------------------------------------
78
81
; void ff_float_to_int16(int16_t *dst, const float *src, long len);
79
82
;------------------------------------------------------------------------------
80
%macro FLOAT_TO_INT16 2
81
cglobal float_to_int16_%1, 3,3,%2, dst, src, len
83
%macro FLOAT_TO_INT16 1
84
cglobal float_to_int16, 3, 3, %1, dst, src, len
83
86
lea srcq, [srcq+2*lenq]
88
91
cvtps2dq m0, [srcq+2*lenq ]
89
92
cvtps2dq m1, [srcq+2*lenq+16]
91
94
mova [dstq+lenq], m0
93
cvtps2pi m0, [srcq+2*lenq ]
94
cvtps2pi m1, [srcq+2*lenq+ 8]
95
cvtps2pi m2, [srcq+2*lenq+16]
96
cvtps2pi m3, [srcq+2*lenq+24]
96
CVTPS2PI m0, [srcq+2*lenq ]
97
CVTPS2PI m1, [srcq+2*lenq+ 8]
98
CVTPS2PI m2, [srcq+2*lenq+16]
99
CVTPS2PI m3, [srcq+2*lenq+24]
99
102
mova [dstq+lenq ], m0
111
FLOAT_TO_INT16 sse2, 2
113
FLOAT_TO_INT16 sse, 0
114
%define cvtps2pi pf2id
115
FLOAT_TO_INT16 3dnow, 0
120
;------------------------------------------------------------------------------
121
; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
122
;------------------------------------------------------------------------------
123
%macro FLOAT_TO_INT16_STEP 1
124
cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
126
lea srcq, [srcq+2*lenq]
127
lea step3q, [stepq*3]
131
cvtps2dq m0, [srcq+2*lenq ]
132
cvtps2dq m1, [srcq+2*lenq+16]
139
mov [dstq+stepq*4], v2w
142
mov [dstq+stepq*2], v1w
143
mov [dstq+step3q*2], v2w
144
lea dstq, [dstq+stepq*8]
149
mov [dstq+stepq*4], v2w
152
mov [dstq+stepq*2], v1w
153
mov [dstq+step3q*2], v2w
154
lea dstq, [dstq+stepq*8]
156
CVTPS2PI m0, [srcq+2*lenq ]
157
CVTPS2PI m1, [srcq+2*lenq+ 8]
158
CVTPS2PI m2, [srcq+2*lenq+16]
159
CVTPS2PI m3, [srcq+2*lenq+24]
166
mov [dstq+stepq*4], v2w
169
mov [dstq+stepq*2], v1w
170
mov [dstq+step3q*2], v2w
171
lea dstq, [dstq+stepq*8]
176
mov [dstq+stepq*4], v2w
179
mov [dstq+stepq*2], v1w
180
mov [dstq+step3q*2], v2w
181
lea dstq, [dstq+stepq*8]
192
FLOAT_TO_INT16_STEP 2
194
FLOAT_TO_INT16_STEP 0
196
FLOAT_TO_INT16_STEP 0
119
198
;-------------------------------------------------------------------------------
120
199
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
121
200
;-------------------------------------------------------------------------------
122
%macro FLOAT_TO_INT16_INTERLEAVE2 1
123
cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
201
%macro FLOAT_TO_INT16_INTERLEAVE2 0
202
cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
124
203
lea lenq, [4*r2q]
125
204
mov src1q, [src0q+gprsize]
126
205
mov src0q, [src0q]
138
217
mova [dstq+lenq], m0
140
cvtps2pi m0, [src0q+lenq ]
141
cvtps2pi m1, [src0q+lenq+8]
142
cvtps2pi m2, [src1q+lenq ]
143
cvtps2pi m3, [src1q+lenq+8]
219
CVTPS2PI m0, [src0q+lenq ]
220
CVTPS2PI m1, [src0q+lenq+8]
221
CVTPS2PI m2, [src1q+lenq ]
222
CVTPS2PI m3, [src1q+lenq+8]
161
%define cvtps2pi pf2id
162
FLOAT_TO_INT16_INTERLEAVE2 3dnow
164
%define movdqa movaps
165
FLOAT_TO_INT16_INTERLEAVE2 sse
168
FLOAT_TO_INT16_INTERLEAVE2 sse2
180
%macro FLOAT_TO_INT16_INTERLEAVE6 1
240
FLOAT_TO_INT16_INTERLEAVE2
242
FLOAT_TO_INT16_INTERLEAVE2
244
FLOAT_TO_INT16_INTERLEAVE2
246
%macro FLOAT_TO_INT16_INTERLEAVE6 0
181
247
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
182
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
248
cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len
187
252
%define lend dword r2m
202
cvtps2pi mm1, [srcq+src1q]
203
cvtps2pi mm2, [srcq+src2q]
204
cvtps2pi mm3, [srcq+src3q]
205
cvtps2pi mm4, [srcq+src4q]
206
cvtps2pi mm5, [srcq+src5q]
267
CVTPS2PI mm1, [srcq+src1q]
268
CVTPS2PI mm2, [srcq+src2q]
269
CVTPS2PI mm3, [srcq+src3q]
270
CVTPS2PI mm4, [srcq+src4q]
271
CVTPS2PI mm5, [srcq+src5q]
207
272
packssdw mm0, mm3
208
273
packssdw mm1, mm4
209
274
packssdw mm2, mm5
211
276
punpcklwd mm0, mm1
212
277
punpckhwd mm1, mm2
213
278
punpcklwd mm2, mm3
215
280
punpckldq mm0, mm2
216
281
punpckhdq mm2, mm1
217
282
punpckldq mm1, mm3
227
292
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
229
%define pswapd PSWAPD_SSE
230
FLOAT_TO_INT16_INTERLEAVE6 sse
231
%define cvtps2pi pf2id
232
%define pswapd PSWAPD_3DN1
233
FLOAT_TO_INT16_INTERLEAVE6 3dnow
235
FLOAT_TO_INT16_INTERLEAVE6 3dn2
295
FLOAT_TO_INT16_INTERLEAVE6
297
FLOAT_TO_INT16_INTERLEAVE6
299
FLOAT_TO_INT16_INTERLEAVE6
238
301
;-----------------------------------------------------------------------------
239
302
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
240
303
;-----------------------------------------------------------------------------
242
%macro FLOAT_INTERLEAVE6 2
243
cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5
305
%macro FLOAT_INTERLEAVE6 1
306
cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
248
310
%define lend dword r2m
308
370
add dstq, mmsize*6
309
371
sub lend, mmsize/4
318
FLOAT_INTERLEAVE6 mmx, 0
320
FLOAT_INTERLEAVE6 sse, 7
322
384
;-----------------------------------------------------------------------------
323
385
; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
324
386
;-----------------------------------------------------------------------------
326
%macro FLOAT_INTERLEAVE2 2
327
cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1
388
%macro FLOAT_INTERLEAVE2 1
389
cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
328
390
mov src1q, [srcq+gprsize]
329
391
mov srcq, [srcq ]
333
MOVPS m1, [srcq+src1q ]
334
MOVPS m3, [srcq +mmsize]
335
MOVPS m4, [srcq+src1q+mmsize]
395
mova m1, [srcq+src1q ]
396
mova m3, [srcq +mmsize]
397
mova m4, [srcq+src1q+mmsize]
346
MOVPS [dstq+1*mmsize], m2
347
MOVPS [dstq+2*mmsize], m3
348
MOVPS [dstq+3*mmsize], m1
408
mova [dstq+1*mmsize], m2
409
mova [dstq+2*mmsize], m3
410
mova [dstq+3*mmsize], m1
350
412
add srcq, mmsize*2
351
413
add dstq, mmsize*4
352
414
sub lend, mmsize/2
362
423
%define PUNPCKLDQ punpckldq
363
424
%define PUNPCKHDQ punpckhdq
364
FLOAT_INTERLEAVE2 mmx, 0
367
427
%define PUNPCKLDQ unpcklps
368
428
%define PUNPCKHDQ unpckhps
369
FLOAT_INTERLEAVE2 sse, 5