3
* Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
5
* This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
6
* See http://libmpeg2.sourceforge.net/ for updates.
8
* mpeg2dec is free software; you can redistribute it and/or modify
9
* it under the terms of the GNU General Public License as published by
10
* the Free Software Foundation; either version 2 of the License, or
11
* (at your option) any later version.
13
* mpeg2dec is distributed in the hope that it will be useful,
14
* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
* GNU General Public License for more details.
18
* You should have received a copy of the GNU General Public License
19
* along with mpeg2dec; if not, write to the Free Software
20
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
#include "libavutil/common.h"
24
#include "libavcodec/dsputil.h"
28
/* Request `align`-byte alignment for an object (GCC-style attribute syntax). */
#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
33
/*
 * Convert a real-valued bias to fixed point: add 0.5, scale by 2^ROW_SHIFT
 * and truncate to int.  ROW_SHIFT is defined outside this excerpt.
 * NOTE(review): the name matches C99 round() from <math.h>; presumably that
 * header is not included in this translation unit - confirm.
 */
#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
34
/* Brace initializer holding the same round(bias) value twice. */
#define rounder(bias) {round (bias), round (bias)}
38
/*
 * C row IDCT - it is just here to document the MMXEXT and MMX versions.
 *
 * Forms the even part a0..a3 from row[0], row[2], row[4], row[6] (with
 * *rounder added to each) and the odd part b0..b3 from row[1], row[3],
 * row[5], row[7]; then writes (a+b) >> ROW_SHIFT to row[0..3] and
 * (a-b) >> ROW_SHIFT to row[4..7] in mirrored order.
 *
 * NOTE(review): in this excerpt no assignment to C1..C7 is visible, `offset`
 * and `table` are never used, and the function braces are absent; the bare
 * numeric lines look like line-number residue.  Confirm against upstream
 * before relying on this block.
 */
39
static inline void idct_row (int16_t * row, int offset,
40
int16_t * table, int32_t * rounder)
42
int C1, C2, C3, C4, C5, C6, C7;
43
int a0, a1, a2, a3, b0, b1, b2, b3;
55
/* even part: inputs 0,2,4,6, rounding bias folded in here */
a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
56
a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
57
a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
58
a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;
60
/* odd part: inputs 1,3,5,7 */
b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
61
b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
62
b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
63
b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
65
/* butterfly: sums go to outputs 0..3, differences to 7..4 */
row[0] = (a0 + b0) >> ROW_SHIFT;
66
row[1] = (a1 + b1) >> ROW_SHIFT;
67
row[2] = (a2 + b2) >> ROW_SHIFT;
68
row[3] = (a3 + b3) >> ROW_SHIFT;
69
row[4] = (a3 - b3) >> ROW_SHIFT;
70
row[5] = (a2 - b2) >> ROW_SHIFT;
71
row[6] = (a1 - b1) >> ROW_SHIFT;
72
row[7] = (a0 - b0) >> ROW_SHIFT;
79
/*
 * Initializer for one MMXEXT row-pass coefficient table built from the
 * constants c1..c7.  Only the first four entries (c4, c2, -c4, -c2) are
 * visible here; the definition continues on lines not shown in this excerpt.
 */
#define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \
88
/*
 * MMXEXT row pass, first stage: loads the eight 16-bit inputs found at
 * row+offset into mm2 and mm5, copies them to mm0 and mm6, loads the first
 * two table quadwords into mm3 and mm4, and issues the first pmaddwd and
 * pshufw.  The register contents noted on each line are what mmxext_row
 * picks up afterwards.
 *
 * row    - base of the coefficient block
 * offset - element offset (from row) of the row to load
 * table  - coefficient table for this row; table[0..7] is read here
 */
static inline void mmxext_row_head (int16_t * const row, const int offset,
89
const int16_t * const table)
91
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */
93
movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */
94
movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */
96
movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */
97
movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */
99
movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */
100
pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
102
pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */
105
/*
 * MMXEXT row pass, main arithmetic.  Takes no row pointer: it continues from
 * the register state left by mmxext_row_head or mmxext_row_mid, multiplies
 * against table[8..31], and adds *rounder to both the a1/a0 and a3/a2 pairs.
 *
 * On exit (per the register comments below): mm1 = y1 y0 and mm3 = y6 y7 are
 * already shifted by ROW_SHIFT; mm0 = a3+b3 a2+b2 and mm4 = a3-b3 a2-b2 are
 * still unshifted and are finished by mmxext_row_tail or mmxext_row_mid.
 *
 * table   - coefficient table for the current row
 * rounder - pair of identical 32-bit rounding biases
 */
static inline void mmxext_row (const int16_t * const table,
106
const int32_t * const rounder)
108
movq_m2r (*(table+8), mm1); /* mm1 = -C5 -C1 C3 C1 */
109
pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */
111
pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */
112
pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = x3 x1 x7 x5 */
114
movq_m2r (*(table+12), mm7); /* mm7 = -C7 C3 C7 C5 */
115
pmaddwd_r2r (mm5, mm1); /* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */
117
paddd_m2r (*rounder, mm3); /* mm3 += rounder */
118
pmaddwd_r2r (mm6, mm7); /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */
120
pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */
121
paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */
123
pmaddwd_m2r (*(table+24), mm5); /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */
124
movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */
126
pmaddwd_m2r (*(table+28), mm6); /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */
127
paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */
129
paddd_m2r (*rounder, mm0); /* mm0 += rounder */
130
psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */
132
psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */
133
paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */
135
paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */
136
psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */
138
paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */
139
movq_r2r (mm0, mm4); /* mm4 = a3 a2 + rounder */
141
paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */
142
psubd_r2r (mm5, mm4); /* mm4 = a3-b3 a2-b2 + rounder */
145
/*
 * MMXEXT row pass, final stage: shifts the two pending result pairs (mm0,
 * mm4) by ROW_SHIFT, packs the four 32-bit pairs down to 16 bits with signed
 * saturation, swaps the upper half into y7 y6 y5 y4 order with pshufw, and
 * writes the eight results to row+store.
 *
 * row   - base of the coefficient block
 * store - element offset (from row) where y0..y7 are written
 */
static inline void mmxext_row_tail (int16_t * const row, const int store)
147
psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */
149
psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */
151
packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */
153
packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */
155
movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */
156
pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */
160
movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */
163
static inline void mmxext_row_mid (int16_t * const row, const int store,
165
const int16_t * const table)
167
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */
168
psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */
170
movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */
171
psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */
173
packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */
174
movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */
176
packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */
177
movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */
179
movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */
180
pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */
182
movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */
183
movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */
185
pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
187
movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */
188
pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */
194
/*
 * Initializer for one plain-MMX row-pass coefficient table built from the
 * constants c1..c7.  Only the first four entries (c4, c2, c4, c6) are
 * visible here; the definition continues on lines not shown in this excerpt.
 */
#define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \
203
/*
 * Plain-MMX row pass, first stage: loads the eight 16-bit inputs found at
 * row+offset into mm2 and mm5, copies them to mm0 and mm6, duplicates the
 * low half of mm0 and the high half of mm2 with punpck{l,h}dq, loads
 * table[0..11] into mm3, mm4 and mm1, and issues the first pmaddwd.
 * The resulting register state is what mmx_row continues from.
 *
 * row    - base of the coefficient block
 * offset - element offset (from row) of the row to load
 * table  - coefficient table for this row
 */
static inline void mmx_row_head (int16_t * const row, const int offset,
204
const int16_t * const table)
206
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */
208
movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */
209
movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */
211
movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */
212
movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */
214
punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */
216
movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */
217
pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
219
movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */
220
punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */
223
/*
 * Plain-MMX row pass, main arithmetic.  Takes no row pointer: it continues
 * from the register state left by mmx_row_head or mmx_row_mid, multiplies
 * against table[12..31], and adds *rounder to both the a1/a0 and a3/a2 pairs.
 *
 * On exit (per the register comments below): mm1 = y1 y0 and mm3 = y6 y7 are
 * already shifted by ROW_SHIFT; mm0 = a3+b3 a2+b2 and mm7 = a3-b3 a2-b2 are
 * still unshifted and are finished by mmx_row_tail or mmx_row_mid.
 *
 * table   - coefficient table for the current row
 * rounder - pair of identical 32-bit rounding biases
 */
static inline void mmx_row (const int16_t * const table,
224
const int32_t * const rounder)
226
pmaddwd_r2r (mm2, mm4); /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */
227
punpckldq_r2r (mm5, mm5); /* mm5 = x3 x1 x3 x1 */
229
pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */
230
punpckhdq_r2r (mm6, mm6); /* mm6 = x7 x5 x7 x5 */
232
movq_m2r (*(table+12), mm7); /* mm7 = -C5 -C1 C7 C5 */
233
pmaddwd_r2r (mm5, mm1); /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */
235
paddd_m2r (*rounder, mm3); /* mm3 += rounder */
236
pmaddwd_r2r (mm6, mm7); /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */
238
pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */
239
paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */
241
pmaddwd_m2r (*(table+24), mm5); /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */
242
movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */
244
pmaddwd_m2r (*(table+28), mm6); /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */
245
paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */
247
paddd_m2r (*rounder, mm0); /* mm0 += rounder */
248
psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */
250
psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */
251
paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */
253
paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */
254
psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */
256
paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */
257
movq_r2r (mm0, mm7); /* mm7 = a3 a2 + rounder */
259
paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */
260
psubd_r2r (mm5, mm7); /* mm7 = a3-b3 a2-b2 + rounder */
263
/*
 * Plain-MMX row pass, final stage: shifts the two pending result pairs (mm0,
 * mm7) by ROW_SHIFT, packs the four 32-bit pairs down to 16 bits with signed
 * saturation, and writes the eight results to row+store.  The upper half
 * comes out of the pack as y6 y7 y4 y5, so adjacent words are swapped with a
 * shift-left / shift-right / or sequence (the MMXEXT variant uses pshufw for
 * the same step).
 *
 * row   - base of the coefficient block
 * store - element offset (from row) where y0..y7 are written
 */
static inline void mmx_row_tail (int16_t * const row, const int store)
265
psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */
267
psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */
269
packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */
271
packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */
273
movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */
274
movq_r2r (mm7, mm4); /* mm4 = y6 y7 y4 y5 */
276
pslld_i2r (16, mm7); /* mm7 = y7 0 y5 0 */
278
psrld_i2r (16, mm4); /* mm4 = 0 y6 0 y4 */
280
por_r2r (mm4, mm7); /* mm7 = y7 y6 y5 y4 */
284
movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */
287
/*
 * Plain-MMX row pass, pipelined middle stage: interleaves the instructions
 * of mmx_row_tail (finish and store the row just computed by mmx_row) with
 * those of mmx_row_head (load the next row and start its first multiply).
 *
 * row    - base of the coefficient block
 * store  - element offset (from row) where the finished y0..y7 are written
 * offset - element offset (from row) of the next row to load
 * table  - coefficient table for the next row; table[0..11] is read here
 *
 * The instruction order is hand-scheduled; do not reorder.
 */
static inline void mmx_row_mid (int16_t * const row, const int store,
288
const int offset, const int16_t * const table)
290
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */
291
psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */
293
movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */
294
psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */
296
packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */
297
movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */
299
packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */
300
movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */
302
movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */
303
movq_r2r (mm7, mm1); /* mm1 = y6 y7 y4 y5 */
305
punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */
306
psrld_i2r (16, mm7); /* mm7 = 0 y6 0 y4 */
308
movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */
309
pslld_i2r (16, mm1); /* mm1 = y7 0 y5 0 */
311
movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */
312
por_r2r (mm1, mm7); /* mm7 = y7 y6 y5 y4 */
314
movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */
315
punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */
317
movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */
318
pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
323
/*
 * C column IDCT - it is just here to document the MMXEXT and MMX versions.
 *
 * All intermediates are int16_t; F() models a 16x16 multiply keeping the
 * high 16 bits and S() clamps to the int16_t range, so the arithmetic below
 * mirrors what the MMX column code does.
 *
 * NOTE(review): this excerpt does not show the loads of x0..x7, the
 * definitions of u04/v04, a0..a3, b0..b3 or the initial u12/v12, the stores
 * of y0..y7, or the definitions of T1/T2/T3/C4/COL_SHIFT; the function
 * braces are absent and the bare numeric lines look like line-number
 * residue.  A second idct_col follows in the file, so this one is presumably
 * compiled out by a preprocessor guard that is not visible - confirm.
 */
324
static inline void idct_col (int16_t * col, int offset)
326
/* multiplication - as implemented on mmx */
327
#define F(c,x) (((c) * (x)) >> 16)
329
/* saturation - it helps us handle torture test cases */
330
#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))
332
int16_t x0, x1, x2, x3, x4, x5, x6, x7;
333
int16_t y0, y1, y2, y3, y4, y5, y6, y7;
334
int16_t a0, a1, a2, a3, b0, b1, b2, b3;
335
int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;
350
/* rotations of the input pairs (2,6), (1,7) and (3,5), each saturated */
u26 = S (F (T2, x6) + x2);
351
v26 = S (F (T2, x2) - x6);
358
u17 = S (F (T1, x7) + x1);
359
v17 = S (F (T1, x1) - x7);
360
u35 = S (F (T3, x5) + x3);
361
v35 = S (F (T3, x3) - x5);
367
/* scale by C4; the factor 2 compensates for F() keeping only the high half */
u12 = S (2 * F (C4, u12));
368
v12 = S (2 * F (C4, v12));
372
/* final butterfly: saturate the sum/difference, then shift by COL_SHIFT */
y0 = S (a0 + b0) >> COL_SHIFT;
373
y1 = S (a1 + b1) >> COL_SHIFT;
374
y2 = S (a2 + b2) >> COL_SHIFT;
375
y3 = S (a3 + b3) >> COL_SHIFT;
377
y4 = S (a3 - b3) >> COL_SHIFT;
378
y5 = S (a2 - b2) >> COL_SHIFT;
379
y6 = S (a1 - b1) >> COL_SHIFT;
380
y7 = S (a0 - b0) >> COL_SHIFT;
394
/*
 * MMX column IDCT.
 *
 * Transforms the columns starting at col+offset in place.  Row k of the
 * block is addressed as col+offset+k*8 and each access is a quadword, so one
 * call covers four adjacent columns; declare_idct calls it with offset 0 and
 * offset 4.
 *
 * Rows 3 and 5 of the block are used as scratch for b3 and b0 after x3 and
 * x5 have been loaded, and are overwritten with y3 and y5 at the end.
 *
 * NOTE(review): the definitions of T1, T2, T3, C4 and COL_SHIFT are not
 * visible in this excerpt, the function braces are absent, and the bare
 * numeric lines look like line-number residue - confirm against upstream.
 * The instruction order is hand-scheduled; do not reorder.
 */
395
static inline void idct_col (int16_t * const col, const int offset)
402
/* each constant replicated into all four 16-bit lanes */
static const short t1_vector[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
403
static const short t2_vector[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
404
static const short t3_vector[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
405
static const short c4_vector[] ATTR_ALIGN(8) = {C4,C4,C4,C4};
407
/* column code adapted from Peter Gubanov */
408
/* http://www.elecard.com/peter/idct.shtml */
410
movq_m2r (*t1_vector, mm0); /* mm0 = T1 */
412
movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */
413
movq_r2r (mm0, mm2); /* mm2 = T1 */
415
movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */
416
pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */
418
movq_m2r (*t3_vector, mm5); /* mm5 = T3 */
419
pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */
421
movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */
422
movq_r2r (mm5, mm7); /* mm7 = T3-1 */
424
movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */
425
psubsw_r2r (mm4, mm0); /* mm0 = v17 */
427
movq_m2r (*t2_vector, mm4); /* mm4 = T2 */
428
pmulhw_r2r (mm3, mm5); /* mm5 = (T3-1)*x3 */
430
paddsw_r2r (mm2, mm1); /* mm1 = u17 */
431
pmulhw_r2r (mm6, mm7); /* mm7 = (T3-1)*x5 */
435
movq_r2r (mm4, mm2); /* mm2 = T2 */
436
paddsw_r2r (mm3, mm5); /* mm5 = T3*x3 */
438
pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */
439
paddsw_r2r (mm6, mm7); /* mm7 = T3*x5 */
441
psubsw_r2r (mm6, mm5); /* mm5 = v35 */
442
paddsw_r2r (mm3, mm7); /* mm7 = u35 */
444
movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */
445
movq_r2r (mm0, mm6); /* mm6 = v17 */
447
pmulhw_r2r (mm3, mm2); /* mm2 = T2*x6 */
448
psubsw_r2r (mm5, mm0); /* mm0 = b3 */
450
psubsw_r2r (mm3, mm4); /* mm4 = v26 */
451
paddsw_r2r (mm6, mm5); /* mm5 = v12 */
453
movq_r2m (mm0, *(col+offset+3*8)); /* save b3 in scratch0 */
454
movq_r2r (mm1, mm6); /* mm6 = u17 */
456
paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */
457
paddsw_r2r (mm7, mm6); /* mm6 = b0 */
459
psubsw_r2r (mm7, mm1); /* mm1 = u12 */
460
movq_r2r (mm1, mm7); /* mm7 = u12 */
462
movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */
463
paddsw_r2r (mm5, mm1); /* mm1 = u12+v12 */
465
movq_m2r (*c4_vector, mm0); /* mm0 = C4/2 */
466
psubsw_r2r (mm5, mm7); /* mm7 = u12-v12 */
468
movq_r2m (mm6, *(col+offset+5*8)); /* save b0 in scratch1 */
469
pmulhw_r2r (mm0, mm1); /* mm1 = b1/2 */
471
movq_r2r (mm4, mm6); /* mm6 = v26 */
472
pmulhw_r2r (mm0, mm7); /* mm7 = b2/2 */
474
movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */
475
movq_r2r (mm3, mm0); /* mm0 = x0 */
477
psubsw_r2r (mm5, mm3); /* mm3 = v04 */
478
paddsw_r2r (mm5, mm0); /* mm0 = u04 */
480
paddsw_r2r (mm3, mm4); /* mm4 = a1 */
481
movq_r2r (mm0, mm5); /* mm5 = u04 */
483
psubsw_r2r (mm6, mm3); /* mm3 = a2 */
484
paddsw_r2r (mm2, mm5); /* mm5 = a0 */
486
paddsw_r2r (mm1, mm1); /* mm1 = b1 */
487
psubsw_r2r (mm2, mm0); /* mm0 = a3 */
489
paddsw_r2r (mm7, mm7); /* mm7 = b2 */
490
movq_r2r (mm3, mm2); /* mm2 = a2 */
492
movq_r2r (mm4, mm6); /* mm6 = a1 */
493
paddsw_r2r (mm7, mm3); /* mm3 = a2+b2 */
495
psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */
496
paddsw_r2r (mm1, mm4); /* mm4 = a1+b1 */
498
psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */
499
psubsw_r2r (mm1, mm6); /* mm6 = a1-b1 */
501
movq_m2r (*(col+offset+5*8), mm1); /* mm1 = b0 */
502
psubsw_r2r (mm7, mm2); /* mm2 = a2-b2 */
504
psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */
505
movq_r2r (mm5, mm7); /* mm7 = a0 */
507
movq_r2m (mm4, *(col+offset+1*8)); /* save y1 */
508
psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */
510
movq_r2m (mm3, *(col+offset+2*8)); /* save y2 */
511
paddsw_r2r (mm1, mm5); /* mm5 = a0+b0 */
513
movq_m2r (*(col+offset+3*8), mm4); /* mm4 = b3 */
514
psubsw_r2r (mm1, mm7); /* mm7 = a0-b0 */
516
psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */
517
movq_r2r (mm0, mm3); /* mm3 = a3 */
519
movq_r2m (mm2, *(col+offset+5*8)); /* save y5 */
520
psubsw_r2r (mm4, mm3); /* mm3 = a3-b3 */
522
psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */
523
paddsw_r2r (mm0, mm4); /* mm4 = a3+b3 */
525
movq_r2m (mm5, *(col+offset+0*8)); /* save y0 */
526
psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */
528
movq_r2m (mm6, *(col+offset+6*8)); /* save y6 */
529
psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */
531
movq_r2m (mm7, *(col+offset+7*8)); /* save y7 */
533
movq_r2m (mm3, *(col+offset+4*8)); /* save y4 */
535
movq_r2m (mm4, *(col+offset+3*8)); /* save y3 */
544
/*
 * Per-row rounding biases for the row pass, one array per input row index.
 * Each array holds the same fixed-point value twice (see rounder()), so a
 * single paddd in the row code adds it to both 32-bit lanes.
 *
 * round() itself adds 0.5, so rounder0's "- 0.5" leaves exactly
 * 1 << (COL_SHIFT - 1) scaled by 2^ROW_SHIFT, and rounder4 (bias 0) is a
 * plain half-unit rounding term.  The formulas in the trailing comments are
 * the original author's derivations of the remaining constants.
 */
static const int32_t rounder0[] ATTR_ALIGN(8) =
545
rounder ((1 << (COL_SHIFT - 1)) - 0.5);
546
static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);
547
static const int32_t rounder1[] ATTR_ALIGN(8) =
548
rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */
549
static const int32_t rounder7[] ATTR_ALIGN(8) =
550
rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */
551
static const int32_t rounder2[] ATTR_ALIGN(8) =
552
rounder (0.60355339059); /* C2 * (C6+C2)/2 */
553
static const int32_t rounder6[] ATTR_ALIGN(8) =
554
rounder (-0.25); /* C2 * (C6-C2)/2 */
555
static const int32_t rounder3[] ATTR_ALIGN(8) =
556
rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */
557
static const int32_t rounder5[] ATTR_ALIGN(8) =
558
rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */
563
/*
 * Generates the definition of an 8x8 IDCT entry point named `idct` that
 * transforms `block` in place.
 *
 *   idct          - name of the function to define
 *   table         - macro that builds a row coefficient table from c1..c7
 *   idct_row_head - helper that loads the first row
 *   idct_row      - helper that does the row arithmetic
 *   idct_row_tail - helper that stores the last row
 *   idct_row_mid  - helper that stores one row while loading the next
 *
 * The row pass visits rows in the order 0, 4, 1, 7, 2, 6, 3, 5; rows 0/4,
 * 1/7, 2/6 and 3/5 share table04, table17, table26 and table35, and each row
 * uses the rounderN matching its index.  Each idct_row_mid call stores the
 * previous row (second argument) and loads the next (third argument).  The
 * column pass then runs idct_col on offsets 0 and 4.
 *
 * NOTE(review): the braces of the generated function body are not visible in
 * this excerpt and the bare numeric lines look like line-number residue -
 * confirm against upstream.
 */
#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
564
void idct (int16_t * const block) \
566
static const int16_t table04[] ATTR_ALIGN(16) = \
567
table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \
568
static const int16_t table17[] ATTR_ALIGN(16) = \
569
table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \
570
static const int16_t table26[] ATTR_ALIGN(16) = \
571
table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \
572
static const int16_t table35[] ATTR_ALIGN(16) = \
573
table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \
575
idct_row_head (block, 0*8, table04); \
576
idct_row (table04, rounder0); \
577
idct_row_mid (block, 0*8, 4*8, table04); \
578
idct_row (table04, rounder4); \
579
idct_row_mid (block, 4*8, 1*8, table17); \
580
idct_row (table17, rounder1); \
581
idct_row_mid (block, 1*8, 7*8, table17); \
582
idct_row (table17, rounder7); \
583
idct_row_mid (block, 7*8, 2*8, table26); \
584
idct_row (table26, rounder2); \
585
idct_row_mid (block, 2*8, 6*8, table26); \
586
idct_row (table26, rounder6); \
587
idct_row_mid (block, 6*8, 3*8, table35); \
588
idct_row (table35, rounder3); \
589
idct_row_mid (block, 3*8, 5*8, table35); \
590
idct_row (table35, rounder5); \
591
idct_row_tail (block, 5*8); \
593
idct_col (block, 0); \
594
idct_col (block, 4); \
597
/*
 * Prototypes of the two exported entry points, followed by their definitions
 * generated with declare_idct: the MMXEXT variant is built from the
 * mmxext_* helpers and table layout, the plain-MMX variant from the mmx_*
 * ones.
 * NOTE(review): DCTELEM is declared outside this excerpt, while declare_idct
 * defines the functions with an `int16_t * const` parameter; presumably
 * DCTELEM is int16_t - confirm.
 */
void ff_mmx_idct(DCTELEM *block);
598
void ff_mmxext_idct(DCTELEM *block);
600
declare_idct (ff_mmxext_idct, mmxext_table,
601
mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)
603
declare_idct (ff_mmx_idct, mmx_table,
604
mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)