24
47
//////////////////////////////////////////////////////////////////////
26
// Accuracy setting, in bits, for the forward DCT. SHIFT_FRW_COL and
// SHIFT_FRW_ROW are both derived from this value.
#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
27
// Shift count named for the column stage; identical to BITS_FRW_ACC.
// NOTE(review): the instruction that consumes it is not visible in this excerpt.
#define SHIFT_FRW_COL BITS_FRW_ACC
28
// Arithmetic right-shift count (psrad) applied to the 32-bit row-stage sums
// before they are packed back to 16 bits; evaluates to 17 when BITS_FRW_ACC is 3.
#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
29
//#define RND_FRW_ROW (262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1)
30
// Rounding bias for the row stage: half of (1 << SHIFT_FRW_ROW), i.e. 65536
// when SHIFT_FRW_ROW is 17. The fdct_r_row constants are initialised from it
// and added to the sums ahead of the psrad by SHIFT_FRW_ROW.
// The commented-out legacy formula kept nearby, 262144 * (BITS_FRW_ACC - 1),
// evaluates to 524288 for BITS_FRW_ACC == 3, so it is not equivalent to this
// definition under the current SHIFT_FRW_ROW.
#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
31
//#define RND_FRW_COL (2 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_COL-1)
32
// Rounding bias matching SHIFT_FRW_COL: half of (1 << SHIFT_FRW_COL), i.e. 4
// when SHIFT_FRW_COL is 3.
// NOTE(review): no use of this macro is visible in this excerpt.
#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
49
// Base accuracy parameter (in bits) of the forward DCT; the column and row
// shift counts are computed from it.
#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
50
// Column-named shift count, defined as exactly BITS_FRW_ACC.
// NOTE(review): its use site is outside this excerpt.
#define SHIFT_FRW_COL BITS_FRW_ACC
51
// Shift count used by the row stage (psrad immediate / "i" asm operand) to
// scale the 32-bit sums down before packing; 17 for BITS_FRW_ACC == 3.
#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
52
// Half of the row-stage divisor (1 << (SHIFT_FRW_ROW - 1), 65536 for a shift
// of 17). Added to the sums before the right shift so the result is rounded
// rather than truncated; the fdct_r_row tables hold replicated copies of it.
#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
53
//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
34
55
//concatenated table, for forward DCT transformation
35
56
static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = {
36
13036, 13036, 13036, 13036, // tan(1*pi/16) * (1<<16) + 0.5
37
27146, 27146, 27146, 27146, // tan(2*pi/16) * (1<<16) + 0.5
38
-21746, -21746, -21746, -21746, // (tan(3*pi/16) - 1) * (1<<16), rounded to nearest
40
static const int16_t cos_4_16[4] ATTR_ALIGN(8) = {
41
-19195, -19195, -19195, -19195, //(cos(pi/4) - 1) * (1<<16), rounded to nearest
57
13036, 13036, 13036, 13036, // tan(1*pi/16) * (1<<16) + 0.5
58
27146, 27146, 27146, 27146, // tan(2*pi/16) * (1<<16) + 0.5
59
-21746, -21746, -21746, -21746, // (tan(3*pi/16) - 1) * (1<<16), rounded to nearest
44
62
static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = {
45
23170, 23170, 23170, 23170, //cos(pi/4) * (1<<15) + 0.5
63
23170, 23170, 23170, 23170, //cos(pi/4) * (1<<15) + 0.5
48
// 64-bit constant whose four 16-bit words each hold 1 (0x0001 repeated),
// tagged ATTR_ALIGN(8).
// NOTE(review): presumably a per-word +1 correction operand for the column
// stage; the instruction that reads it is not visible in this excerpt.
static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
49
// Row-stage rounding bias (RND_FRW_ROW) replicated into two 32-bit lanes so a
// single 64-bit MMX memory operand (movq / paddd on *fdct_r_row) covers both.
// int32_t rather than long: on LP64 targets long is 64 bits wide, which would
// leave the second 32-bit lane of that operand as zero instead of the bias.
static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
66
// Fixed-width 64-bit constant holding the value 1 in each of its four 16-bit
// words, tagged ATTR_ALIGN(8).
// NOTE(review): presumably added per 16-bit word as a correction term in the
// column stage; confirm against the (not visible) use site.
static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
68
// RND_FRW_ROW replicated as two 32-bit values (8 bytes in total), so one
// 64-bit MMX memory operand such as movq_m2r(*fdct_r_row, ...) or
// paddd_m2r(*fdct_r_row, ...) supplies the rounding bias to both dword lanes.
static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
72
const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16);
73
} fdct_r_row_sse2 ATTR_ALIGN(16)=
75
RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
77
//static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
51
79
static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff table
53
16384, 16384, 21407, -8867, // w09 w01 w08 w00
54
16384, 16384, 8867, -21407, // w13 w05 w12 w04
55
16384, -16384, 8867, 21407, // w11 w03 w10 w02
56
-16384, 16384, -21407, -8867, // w15 w07 w14 w06
57
22725, 12873, 19266, -22725, // w22 w20 w18 w16
58
19266, 4520, -4520, -12873, // w23 w21 w19 w17
59
12873, 4520, 4520, 19266, // w30 w28 w26 w24
60
-22725, 19266, -12873, -22725, // w31 w29 w27 w25
63
22725, 22725, 29692, -12299, // w09 w01 w08 w00
64
22725, 22725, 12299, -29692, // w13 w05 w12 w04
65
22725, -22725, 12299, 29692, // w11 w03 w10 w02
66
-22725, 22725, -29692, -12299, // w15 w07 w14 w06
67
31521, 17855, 26722, -31521, // w22 w20 w18 w16
68
26722, 6270, -6270, -17855, // w23 w21 w19 w17
69
17855, 6270, 6270, 26722, // w30 w28 w26 w24
70
-31521, 26722, -17855, -31521, // w31 w29 w27 w25
73
21407, 21407, 27969, -11585, // w09 w01 w08 w00
74
21407, 21407, 11585, -27969, // w13 w05 w12 w04
75
21407, -21407, 11585, 27969, // w11 w03 w10 w02
76
-21407, 21407, -27969, -11585, // w15 w07 w14 w06
77
29692, 16819, 25172, -29692, // w22 w20 w18 w16
78
25172, 5906, -5906, -16819, // w23 w21 w19 w17
79
16819, 5906, 5906, 25172, // w30 w28 w26 w24
80
-29692, 25172, -16819, -29692, // w31 w29 w27 w25
83
19266, 19266, 25172, -10426, // w09 w01 w08 w00
84
19266, 19266, 10426, -25172, // w13 w05 w12 w04
85
19266, -19266, 10426, 25172, // w11 w03 w10 w02
86
-19266, 19266, -25172, -10426, // w15 w07 w14 w06,
87
26722, 15137, 22654, -26722, // w22 w20 w18 w16
88
22654, 5315, -5315, -15137, // w23 w21 w19 w17
89
15137, 5315, 5315, 22654, // w30 w28 w26 w24
90
-26722, 22654, -15137, -26722, // w31 w29 w27 w25,
93
16384, 16384, 21407, -8867, // w09 w01 w08 w00
94
16384, 16384, 8867, -21407, // w13 w05 w12 w04
95
16384, -16384, 8867, 21407, // w11 w03 w10 w02
96
-16384, 16384, -21407, -8867, // w15 w07 w14 w06
97
22725, 12873, 19266, -22725, // w22 w20 w18 w16
98
19266, 4520, -4520, -12873, // w23 w21 w19 w17
99
12873, 4520, 4520, 19266, // w30 w28 w26 w24
100
-22725, 19266, -12873, -22725, // w31 w29 w27 w25
103
19266, 19266, 25172, -10426, // w09 w01 w08 w00
104
19266, 19266, 10426, -25172, // w13 w05 w12 w04
105
19266, -19266, 10426, 25172, // w11 w03 w10 w02
106
-19266, 19266, -25172, -10426, // w15 w07 w14 w06
107
26722, 15137, 22654, -26722, // w22 w20 w18 w16
108
22654, 5315, -5315, -15137, // w23 w21 w19 w17
109
15137, 5315, 5315, 22654, // w30 w28 w26 w24
110
-26722, 22654, -15137, -26722, // w31 w29 w27 w25
113
21407, 21407, 27969, -11585, // w09 w01 w08 w00
114
21407, 21407, 11585, -27969, // w13 w05 w12 w04
115
21407, -21407, 11585, 27969, // w11 w03 w10 w02
116
-21407, 21407, -27969, -11585, // w15 w07 w14 w06,
117
29692, 16819, 25172, -29692, // w22 w20 w18 w16
118
25172, 5906, -5906, -16819, // w23 w21 w19 w17
119
16819, 5906, 5906, 25172, // w30 w28 w26 w24
120
-29692, 25172, -16819, -29692, // w31 w29 w27 w25,
123
22725, 22725, 29692, -12299, // w09 w01 w08 w00
124
22725, 22725, 12299, -29692, // w13 w05 w12 w04
125
22725, -22725, 12299, 29692, // w11 w03 w10 w02
126
-22725, 22725, -29692, -12299, // w15 w07 w14 w06,
127
31521, 17855, 26722, -31521, // w22 w20 w18 w16
128
26722, 6270, -6270, -17855, // w23 w21 w19 w17
129
17855, 6270, 6270, 26722, // w30 w28 w26 w24
130
-31521, 26722, -17855, -31521 // w31 w29 w27 w25
80
16384, 16384, 22725, 19266,
81
16384, 16384, 12873, 4520,
82
21407, 8867, 19266, -4520,
83
-8867, -21407, -22725, -12873,
84
16384, -16384, 12873, -22725,
85
-16384, 16384, 4520, 19266,
86
8867, -21407, 4520, -12873,
87
21407, -8867, 19266, -22725,
89
22725, 22725, 31521, 26722,
90
22725, 22725, 17855, 6270,
91
29692, 12299, 26722, -6270,
92
-12299, -29692, -31521, -17855,
93
22725, -22725, 17855, -31521,
94
-22725, 22725, 6270, 26722,
95
12299, -29692, 6270, -17855,
96
29692, -12299, 26722, -31521,
98
21407, 21407, 29692, 25172,
99
21407, 21407, 16819, 5906,
100
27969, 11585, 25172, -5906,
101
-11585, -27969, -29692, -16819,
102
21407, -21407, 16819, -29692,
103
-21407, 21407, 5906, 25172,
104
11585, -27969, 5906, -16819,
105
27969, -11585, 25172, -29692,
107
19266, 19266, 26722, 22654,
108
19266, 19266, 15137, 5315,
109
25172, 10426, 22654, -5315,
110
-10426, -25172, -26722, -15137,
111
19266, -19266, 15137, -26722,
112
-19266, 19266, 5315, 22654,
113
10426, -25172, 5315, -15137,
114
25172, -10426, 22654, -26722,
116
16384, 16384, 22725, 19266,
117
16384, 16384, 12873, 4520,
118
21407, 8867, 19266, -4520,
119
-8867, -21407, -22725, -12873,
120
16384, -16384, 12873, -22725,
121
-16384, 16384, 4520, 19266,
122
8867, -21407, 4520, -12873,
123
21407, -8867, 19266, -22725,
125
19266, 19266, 26722, 22654,
126
19266, 19266, 15137, 5315,
127
25172, 10426, 22654, -5315,
128
-10426, -25172, -26722, -15137,
129
19266, -19266, 15137, -26722,
130
-19266, 19266, 5315, 22654,
131
10426, -25172, 5315, -15137,
132
25172, -10426, 22654, -26722,
134
21407, 21407, 29692, 25172,
135
21407, 21407, 16819, 5906,
136
27969, 11585, 25172, -5906,
137
-11585, -27969, -29692, -16819,
138
21407, -21407, 16819, -29692,
139
-21407, 21407, 5906, 25172,
140
11585, -27969, 5906, -16819,
141
27969, -11585, 25172, -29692,
143
22725, 22725, 31521, 26722,
144
22725, 22725, 17855, 6270,
145
29692, 12299, 26722, -6270,
146
-12299, -29692, -31521, -17855,
147
22725, -22725, 17855, -31521,
148
-22725, 22725, 6270, 26722,
149
12299, -29692, 6270, -17855,
150
29692, -12299, 26722, -31521,
134
static inline void fdct_col(const int16_t *in, int16_t *out, int offset)
155
const int16_t tab_frw_01234567_sse2[256] ATTR_ALIGN(16);
156
} tab_frw_01234567_sse2 ATTR_ALIGN(16) =
158
//static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = { // forward_dct coeff table
159
// Expands to 32 comma-separated coefficient expressions (four groups of
// eight) written in terms of C1..C7, with a trailing comma so expansions can
// be placed back to back inside an initializer list.
// NOTE(review): C1..C7 must be defined at each point of expansion; their
// definitions are not visible in this excerpt.
#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
160
C4, C4, C5, C7, C2, C6, C3, -C7, \
161
-C4, C4, C7, C3, C6, -C2, C7, -C5, \
162
C4, -C4, C5, -C1, C2, -C6, C3, -C1,
163
// c1..c7 * cos(pi/4) * 2^15
287
static av_always_inline void fdct_col(const int16_t *in, int16_t *out, int offset)
136
289
movq_m2r(*(in + offset + 1 * 8), mm0);
137
290
movq_m2r(*(in + offset + 6 * 8), mm1);
210
363
movq_r2m(mm3, *(out + offset + 7 * 8));
213
static inline void fdct_row(const int16_t *in, int16_t *out, const int16_t *table)
215
movd_m2r(*(in + 6), mm5);
216
punpcklwd_m2r(*(in + 4), mm5);
218
psrlq_i2r(0x20, mm5);
367
static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
370
// Full load sequence for one row (asm text fragment):
//   xmm2 <- 8 bytes at byte offset i of operand %0, xmm0 <- 8 bytes at i+8;
//   xmm3 / xmm7 <- 16-byte entries at byte offsets t+32 / t+48 of operand %1;
//   xmm4 / xmm5 <- 16-byte entries at byte offsets t / t+16 of operand %1.
// i and t are stringized into the instruction text, so they must be literal
// byte offsets. movdqa requires its memory operand to be 16-byte aligned.
#define FDCT_ROW_SSE2_H1(i,t) \
371
"movq " #i "(%0), %%xmm2 \n\t" \
372
"movq " #i "+8(%0), %%xmm0 \n\t" \
373
"movdqa " #t "+32(%1), %%xmm3 \n\t" \
374
"movdqa " #t "+48(%1), %%xmm7 \n\t" \
375
"movdqa " #t "(%1), %%xmm4 \n\t" \
376
"movdqa " #t "+16(%1), %%xmm5 \n\t"
378
// Reduced variant of FDCT_ROW_SSE2_H1: reloads the two 8-byte input halves
// (xmm2, xmm0) and the table entries at t+32 / t+48 (xmm3, xmm7) only.
// xmm4 / xmm5 are not reloaded: FDCT_ROW_SSE2 reads them only as pmaddwd
// sources, whereas xmm3 / xmm7 are overwritten as pmaddwd destinations.
// Every H2 invocation visible in this excerpt follows an H1 with the same t.
#define FDCT_ROW_SSE2_H2(i,t) \
379
"movq " #i "(%0), %%xmm2 \n\t" \
380
"movq " #i "+8(%0), %%xmm0 \n\t" \
381
"movdqa " #t "+32(%1), %%xmm3 \n\t" \
382
"movdqa " #t "+48(%1), %%xmm7 \n\t"
384
// Row computation on the registers prepared by FDCT_ROW_SSE2_H1/_H2; stores
// one 16-byte result at byte offset i of operand %4 (movdqa, so that address
// must be 16-byte aligned).
//   1. pshuflw $27 reverses the four low words of xmm0; paddsw / psubsw then
//      leave saturated sums in xmm1 and saturated differences in xmm2.
//   2. punpckldq interleaves the low dwords of xmm1 and xmm2 into xmm1, and
//      pshufd $78 places a copy with its 64-bit halves swapped in xmm2.
//   3. Four pmaddwd multiply those two vectors by the table registers
//      (destinations xmm3, xmm7; sources xmm5, xmm4); paddd sums the products
//      pairwise into xmm3 and xmm1.
//   4. xmm6 is added to both sums (it is loaded from operand %2 by a movdqa
//      elsewhere in this asm statement), they are shifted right by the
//      immediate operand %3 (psrad), packed to 16 bits with signed saturation
//      (packssdw) and written out.
// Clobbers xmm0-xmm3 and xmm7; leaves xmm4, xmm5 and xmm6 unchanged.
#define FDCT_ROW_SSE2(i) \
385
"movq %%xmm2, %%xmm1 \n\t" \
386
"pshuflw $27, %%xmm0, %%xmm0 \n\t" \
387
"paddsw %%xmm0, %%xmm1 \n\t" \
388
"psubsw %%xmm0, %%xmm2 \n\t" \
389
"punpckldq %%xmm2, %%xmm1 \n\t" \
390
"pshufd $78, %%xmm1, %%xmm2 \n\t" \
391
"pmaddwd %%xmm2, %%xmm3 \n\t" \
392
"pmaddwd %%xmm1, %%xmm7 \n\t" \
393
"pmaddwd %%xmm5, %%xmm2 \n\t" \
394
"pmaddwd %%xmm4, %%xmm1 \n\t" \
395
"paddd %%xmm7, %%xmm3 \n\t" \
396
"paddd %%xmm2, %%xmm1 \n\t" \
397
"paddd %%xmm6, %%xmm3 \n\t" \
398
"paddd %%xmm6, %%xmm1 \n\t" \
399
"psrad %3, %%xmm3 \n\t" \
400
"psrad %3, %%xmm1 \n\t" \
401
"packssdw %%xmm3, %%xmm1 \n\t" \
402
"movdqa %%xmm1, " #i "(%4) \n\t"
404
"movdqa (%2), %%xmm6 \n\t"
405
FDCT_ROW_SSE2_H1(0,0)
407
FDCT_ROW_SSE2_H2(64,0)
410
FDCT_ROW_SSE2_H1(16,64)
412
FDCT_ROW_SSE2_H2(112,64)
415
FDCT_ROW_SSE2_H1(32,128)
417
FDCT_ROW_SSE2_H2(96,128)
420
FDCT_ROW_SSE2_H1(48,192)
422
FDCT_ROW_SSE2_H2(80,192)
425
: "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
429
static av_always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
431
pshufw_m2r(*(in + 4), mm5, 0x1B);
219
432
movq_m2r(*(in + 0), mm0);
220
punpcklwd_r2r(mm2, mm5);
221
433
movq_r2r(mm0, mm1);
222
434
paddsw_r2r(mm5, mm0);
223
435
psubsw_r2r(mm5, mm1);
224
436
movq_r2r(mm0, mm2);
225
punpcklwd_r2r(mm1, mm0);
226
punpckhwd_r2r(mm1, mm2);
229
movq_m2r(*(table + 0), mm3);
230
punpcklwd_r2r(mm1, mm0);
437
punpckldq_r2r(mm1, mm0);
438
punpckhdq_r2r(mm1, mm2);
439
movq_m2r(*(table + 0), mm1);
440
movq_m2r(*(table + 4), mm3);
441
movq_m2r(*(table + 8), mm4);
442
movq_m2r(*(table + 12), mm5);
443
movq_m2r(*(table + 16), mm6);
444
movq_m2r(*(table + 20), mm7);
445
pmaddwd_r2r(mm0, mm1);
446
pmaddwd_r2r(mm2, mm3);
447
pmaddwd_r2r(mm0, mm4);
448
pmaddwd_r2r(mm2, mm5);
449
pmaddwd_r2r(mm0, mm6);
450
pmaddwd_r2r(mm2, mm7);
451
pmaddwd_m2r(*(table + 24), mm0);
452
pmaddwd_m2r(*(table + 28), mm2);
457
movq_m2r(*fdct_r_row, mm0);
462
psrad_i2r(SHIFT_FRW_ROW, mm3);
463
psrad_i2r(SHIFT_FRW_ROW, mm5);
464
psrad_i2r(SHIFT_FRW_ROW, mm7);
465
psrad_i2r(SHIFT_FRW_ROW, mm2);
466
packssdw_r2r(mm5, mm3);
467
packssdw_r2r(mm2, mm7);
468
movq_r2m(mm3, *(out + 0));
469
movq_r2m(mm7, *(out + 4));
472
static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
474
//FIXME reorder (I don't have an old MMX-only CPU here to benchmark ...)
475
movd_m2r(*(in + 6), mm1);
476
punpcklwd_m2r(*(in + 4), mm1);
478
psrlq_i2r(0x20, mm1);
479
movq_m2r(*(in + 0), mm0);
480
punpcklwd_r2r(mm2, mm1);
231
481
movq_r2r(mm0, mm5);
232
punpckldq_r2r(mm0, mm0);
233
movq_m2r(*(table + 4), mm4);
234
punpckhwd_r2r(mm1, mm2);
235
pmaddwd_r2r(mm0, mm3);
237
movq_m2r(*(table + 16), mm1);
238
punpckldq_r2r(mm2, mm2);
239
pmaddwd_r2r(mm2, mm4);
240
punpckhdq_r2r(mm5, mm5);
241
pmaddwd_m2r(*(table + 8), mm0);
242
punpckhdq_r2r(mm6, mm6);
482
paddsw_r2r(mm1, mm0);
483
psubsw_r2r(mm1, mm5);
485
punpckldq_r2r(mm5, mm0);
486
punpckhdq_r2r(mm5, mm2);
487
movq_m2r(*(table + 0), mm1);
488
movq_m2r(*(table + 4), mm3);
489
movq_m2r(*(table + 8), mm4);
490
movq_m2r(*(table + 12), mm5);
491
movq_m2r(*(table + 16), mm6);
243
492
movq_m2r(*(table + 20), mm7);
244
pmaddwd_r2r(mm5, mm1);
245
paddd_m2r(*fdct_r_row, mm3);
246
pmaddwd_r2r(mm6, mm7);
247
pmaddwd_m2r(*(table + 12), mm2);
249
pmaddwd_m2r(*(table + 24), mm5);
250
pmaddwd_m2r(*(table + 28), mm6);
252
paddd_m2r(*fdct_r_row, mm0);
493
pmaddwd_r2r(mm0, mm1);
494
pmaddwd_r2r(mm2, mm3);
495
pmaddwd_r2r(mm0, mm4);
496
pmaddwd_r2r(mm2, mm5);
497
pmaddwd_r2r(mm0, mm6);
498
pmaddwd_r2r(mm2, mm7);
499
pmaddwd_m2r(*(table + 24), mm0);
500
pmaddwd_m2r(*(table + 28), mm2);
505
movq_m2r(*fdct_r_row, mm0);
253
510
psrad_i2r(SHIFT_FRW_ROW, mm3);
254
paddd_m2r(*fdct_r_row, mm1);
256
paddd_m2r(*fdct_r_row, mm5);
257
psrad_i2r(SHIFT_FRW_ROW, mm1);
259
psrad_i2r(SHIFT_FRW_ROW, mm0);
260
511
psrad_i2r(SHIFT_FRW_ROW, mm5);
261
packssdw_r2r(mm0, mm3);
262
packssdw_r2r(mm5, mm1);
264
punpcklwd_r2r(mm1, mm3);
265
punpckhwd_r2r(mm1, mm6);
512
psrad_i2r(SHIFT_FRW_ROW, mm7);
513
psrad_i2r(SHIFT_FRW_ROW, mm2);
514
packssdw_r2r(mm5, mm3);
515
packssdw_r2r(mm2, mm7);
266
516
movq_r2m(mm3, *(out + 0));
267
movq_r2m(mm6, *(out + 4));
517
movq_r2m(mm7, *(out + 4));
270
520
void ff_fdct_mmx(int16_t *block)
272
522
int64_t align_tmp[16] ATTR_ALIGN(8);
273
int16_t * const block_tmp= (int16_t*)align_tmp;
274
int16_t *block1, *out;
275
const int16_t *table;
279
fdct_col(block, block1, 0);
280
fdct_col(block, block1, 4);
283
table = tab_frw_01234567;
286
fdct_row(block1, out, table);
523
int16_t * block1= (int16_t*)align_tmp;
524
const int16_t *table= tab_frw_01234567;
527
fdct_col(block, block1, 0);
528
fdct_col(block, block1, 4);
531
fdct_row_mmx(block1, block, table);
538
void ff_fdct_mmx2(int16_t *block)
540
int64_t align_tmp[16] ATTR_ALIGN(8);
541
int16_t *block1= (int16_t*)align_tmp;
542
const int16_t *table= tab_frw_01234567;
545
fdct_col(block, block1, 0);
546
fdct_col(block, block1, 4);
549
fdct_row_mmx2(block1, block, table);
556
void ff_fdct_sse2(int16_t *block)
558
int64_t align_tmp[16] ATTR_ALIGN(16);
559
int16_t * const block1= (int16_t*)align_tmp;
561
fdct_col(block, block1, 0);
562
fdct_col(block, block1, 4);
564
fdct_row_sse2(block1, block);