24
24
#include "libavcodec/dsputil.h"
25
25
#include "libavcodec/mpegaudiodsp.h"
27
/*
 * Single-block ff_imdct36_float_* entry points, one per instruction-set
 * variant. Only the prototypes appear here; the definitions are elsewhere.
 * Each one is invoked by the matching imdct36_blocks_<cpu>() wrapper as
 * (out, buf, in, win), where win points at one row of ff_mdct_win_float.
 * NOTE(review): presumably implemented in external assembly, given the
 * HAVE_YASM guard further down in this file -- confirm.
 */
void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win);
28
void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win);
29
void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win);
30
void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
31
void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win);
32
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
34
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
37
/*
 * Window coefficients rearranged for the four-block IMDCT routines, declared
 * through DECLARE_ALIGNED with an alignment argument of 16.
 * Layout is [w][block_type][4 * i + lane] with i in 0..39 and lane in 0..3.
 * ff_mpadsp_init_mmx() fills it from ff_mdct_win_float: lanes 0 and 2 come
 * from row block_type, lanes 1 and 3 from row block_type + 4; in table w == 1
 * lanes 0 and 1 come from rows 0 and 4 instead.
 * imdct36_blocks_*() picks w as (switch_point && j < 4).
 */
DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
27
39
/*
 * Multiply-accumulate: adds ra * rb to the lvalue rt and yields the new
 * value of rt. The expansion and every argument are parenthesized so the
 * macro keeps its meaning when used inside a larger expression.
 */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
28
40
/*
 * Multiply-subtract: subtracts ra * rb from the lvalue rt and yields the new
 * value of rt. The expansion and every argument are parenthesized so the
 * macro keeps its meaning when used inside a larger expression.
 */
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))
163
/*
 * DECL_IMDCT_BLOCKS(CPU1, CPU2) defines the static function
 *   imdct36_blocks_<CPU1>(out, buf, in, count, switch_point, block_type).
 *
 * The generated function has two loops over j:
 *  - while j < align_end (count with its two low bits cleared), j advances by
 *    four and ff_four_imdct36_float_<CPU2>() is called with a window taken
 *    from mdct_win_sse and a 1024-float scratch buffer (tmpbuf). The first
 *    index into mdct_win_sse is 1 only when switch_point is non-zero and
 *    j < 4, i.e. for the first group of four.
 *  - the remaining j < count are handled one at a time by
 *    ff_imdct36_float_<CPU1>() with a row of ff_mdct_win_float. The row is
 *    0 for j < 2 when switch_point is non-zero, otherwise block_type, and
 *    4 is added for odd j: (4 & -(j & 1)) is 4 when j is odd and 0 otherwise.
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \
164
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \
165
int count, int switch_point, int block_type) \
167
int align_end = count - (count & 3); \
169
for (j = 0; j < align_end; j+= 4) { \
170
LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \
171
float *win = mdct_win_sse[switch_point && j < 4][block_type]; \
172
/* apply window & overlap with previous buffer */ \
174
/* select window */ \
175
ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \
180
for (; j < count; j++) { \
181
/* apply window & overlap with previous buffer */ \
183
/* select window */ \
184
int win_idx = (switch_point && j < 2) ? 0 : block_type; \
185
float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \
187
ff_imdct36_float_ ## CPU1(out, buf, in, win); \
195
/*
 * Instantiate imdct36_blocks_{sse,sse2,sse3,ssse3,avx}. The first argument
 * selects the single-block routine and names the generated function; the
 * second selects the four-block routine. All SSE-family variants share
 * ff_four_imdct36_float_sse, and the AVX variant uses ff_four_imdct36_float_avx.
 */
DECL_IMDCT_BLOCKS(sse,sse)
196
DECL_IMDCT_BLOCKS(sse2,sse)
197
DECL_IMDCT_BLOCKS(sse3,sse)
198
DECL_IMDCT_BLOCKS(ssse3,sse)
199
DECL_IMDCT_BLOCKS(avx,avx)
150
201
/*
 * Fill in the x86-specific entries of an MPADSPContext.
 *
 * Builds the interleaved window table mdct_win_sse from ff_mdct_win_float,
 * then sets s->apply_window_float and s->imdct36_blocks_float according to
 * the flags returned by av_get_cpu_flags().
 *
 * s: context whose function pointers are assigned.
 */
void ff_mpadsp_init_mmx(MPADSPContext *s)
152
203
int mm_flags = av_get_cpu_flags();
206
/* For each of the four window rows j, interleave 40 coefficients into
 * groups of four floats: table [0] alternates rows j and j + 4; table [1]
 * uses rows 0 and 4 for the first two floats and rows j and j + 4 for the
 * last two. */
for (j = 0; j < 4; j++) {
207
for (i = 0; i < 40; i ++) {
208
mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
209
mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
210
mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
211
mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
212
mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
213
mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
214
mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
215
mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
154
219
/* apply_window_mp3 is installed only when the SSE2 flag is reported. */
if (mm_flags & AV_CPU_FLAG_SSE2) {
155
220
s->apply_window_float = apply_window_mp3;
223
/* Choose the imdct36_blocks variant: flags are tested in the order
 * AVX, SSSE3, SSE3, SSE2, SSE and the first one that is set wins.
 * The AVX branch additionally requires the build-time HAVE_AVX. */
if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
224
s->imdct36_blocks_float = imdct36_blocks_avx;
226
} else if (mm_flags & AV_CPU_FLAG_SSSE3) {
227
s->imdct36_blocks_float = imdct36_blocks_ssse3;
228
} else if (mm_flags & AV_CPU_FLAG_SSE3) {
229
s->imdct36_blocks_float = imdct36_blocks_sse3;
230
} else if (mm_flags & AV_CPU_FLAG_SSE2) {
231
s->imdct36_blocks_float = imdct36_blocks_sse2;
232
} else if (mm_flags & AV_CPU_FLAG_SSE) {
233
s->imdct36_blocks_float = imdct36_blocks_sse;
234
#endif /* HAVE_SSE */
236
#endif /* HAVE_YASM */