2
* Copyright (C) 2004 the ffmpeg project
4
* This library is free software; you can redistribute it and/or
5
* modify it under the terms of the GNU Lesser General Public
6
* License as published by the Free Software Foundation; either
7
* version 2 of the License, or (at your option) any later version.
9
* This library is distributed in the hope that it will be useful,
10
* but WITHOUT ANY WARRANTY; without even the implied warranty of
11
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12
* Lesser General Public License for more details.
14
* You should have received a copy of the GNU Lesser General Public
15
* License along with this library; if not, write to the Free Software
16
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
* MMX-optimized functions cribbed from the original VP3 source code.
24
#include "../dsputil.h"
27
#define IdctAdjustBeforeShift 8
29
/* (12 * 4) 2-byte memory locations ( = 96 bytes total)
30
* idct_constants[0..15] = Mask table (M(I))
31
* idct_constants[16..43] = Cosine table (C(I))
32
* idct_constants[44..47] = 8
34
static uint16_t idct_constants[(4 + 7 + 1) * 4];
35
static uint16_t idct_cosine_table[7] = {
36
64277, 60547, 54491, 46341, 36410, 25080, 12785
48
/* from original comments: The Macro does IDct on 4 1-D Dcts */
49
#define BeginIDCT() { \
50
movq_m2r(*I(3), r2); \
51
movq_m2r(*C(3), r6); \
53
movq_m2r(*J(5), r7); \
54
pmulhw_r2r(r6, r4); /* r4 = c3*i3 - i3 */ \
55
movq_m2r(*C(5), r1); \
56
pmulhw_r2r(r7, r6); /* r6 = c3*i5 - i5 */ \
58
pmulhw_r2r(r2, r1); /* r1 = c5*i3 - i3 */ \
59
movq_m2r(*I(1), r3); \
60
pmulhw_r2r(r7, r5); /* r5 = c5*i5 - i5 */ \
61
movq_m2r(*C(1), r0); /* (all registers are in use) */ \
62
paddw_r2r(r2, r4); /* r4 = c3*i3 */ \
63
paddw_r2r(r7, r6); /* r6 = c3*i5 */ \
64
paddw_r2r(r1, r2); /* r2 = c5*i3 */ \
65
movq_m2r(*J(7), r1); \
66
paddw_r2r(r5, r7); /* r7 = c5*i5 */ \
67
movq_r2r(r0, r5); /* r5 = c1 */ \
68
pmulhw_r2r(r3, r0); /* r0 = c1*i1 - i1 */ \
69
paddsw_r2r(r7, r4); /* r4 = C = c3*i3 + c5*i5 */ \
70
pmulhw_r2r(r1, r5); /* r5 = c1*i7 - i7 */ \
71
movq_m2r(*C(7), r7); \
72
psubsw_r2r(r2, r6); /* r6 = D = c3*i5 - c5*i3 */ \
73
paddw_r2r(r3, r0); /* r0 = c1*i1 */ \
74
pmulhw_r2r(r7, r3); /* r3 = c7*i1 */ \
75
movq_m2r(*I(2), r2); \
76
pmulhw_r2r(r1, r7); /* r7 = c7*i7 */ \
77
paddw_r2r(r1, r5); /* r5 = c1*i7 */ \
78
movq_r2r(r2, r1); /* r1 = i2 */ \
79
pmulhw_m2r(*C(2), r2); /* r2 = c2*i2 - i2 */ \
80
psubsw_r2r(r5, r3); /* r3 = B = c7*i1 - c1*i7 */ \
81
movq_m2r(*J(6), r5); \
82
paddsw_r2r(r7, r0); /* r0 = A = c1*i1 + c7*i7 */ \
83
movq_r2r(r5, r7); /* r7 = i6 */ \
84
psubsw_r2r(r4, r0); /* r0 = A - C */ \
85
pmulhw_m2r(*C(2), r5); /* r5 = c2*i6 - i6 */ \
86
paddw_r2r(r1, r2); /* r2 = c2*i2 */ \
87
pmulhw_m2r(*C(6), r1); /* r1 = c6*i2 */ \
88
paddsw_r2r(r4, r4); /* r4 = C + C */ \
89
paddsw_r2r(r0, r4); /* r4 = C. = A + C */ \
90
psubsw_r2r(r6, r3); /* r3 = B - D */ \
91
paddw_r2r(r7, r5); /* r5 = c2*i6 */ \
92
paddsw_r2r(r6, r6); /* r6 = D + D */ \
93
pmulhw_m2r(*C(6), r7); /* r7 = c6*i6 */ \
94
paddsw_r2r(r3, r6); /* r6 = D. = B + D */ \
95
movq_r2m(r4, *I(1)); /* save C. at I(1) */ \
96
psubsw_r2r(r5, r1); /* r1 = H = c6*i2 - c2*i6 */ \
97
movq_m2r(*C(4), r4); \
98
movq_r2r(r3, r5); /* r5 = B - D */ \
99
pmulhw_r2r(r4, r3); /* r3 = (c4 - 1) * (B - D) */ \
100
paddsw_r2r(r2, r7); /* r7 = G = c6*i6 + c2*i2 */ \
101
movq_r2m(r6, *I(2)); /* save D. at I(2) */ \
102
movq_r2r(r0, r2); /* r2 = A - C */ \
103
movq_m2r(*I(0), r6); \
104
pmulhw_r2r(r4, r0); /* r0 = (c4 - 1) * (A - C) */ \
105
paddw_r2r(r3, r5); /* r5 = B. = c4 * (B - D) */ \
106
movq_m2r(*J(4), r3); \
107
psubsw_r2r(r1, r5); /* r5 = B.. = B. - H */ \
108
paddw_r2r(r0, r2); /* r0 = A. = c4 * (A - C) */ \
109
psubsw_r2r(r3, r6); /* r6 = i0 - i4 */ \
111
pmulhw_r2r(r4, r6); /* r6 = (c4 - 1) * (i0 - i4) */ \
112
paddsw_r2r(r3, r3); /* r3 = i4 + i4 */ \
113
paddsw_r2r(r1, r1); /* r1 = H + H */ \
114
paddsw_r2r(r0, r3); /* r3 = i0 + i4 */ \
115
paddsw_r2r(r5, r1); /* r1 = H. = B + H */ \
116
pmulhw_r2r(r3, r4); /* r4 = (c4 - 1) * (i0 + i4) */ \
117
paddsw_r2r(r0, r6); /* r6 = F = c4 * (i0 - i4) */ \
118
psubsw_r2r(r2, r6); /* r6 = F. = F - A. */ \
119
paddsw_r2r(r2, r2); /* r2 = A. + A. */ \
120
movq_m2r(*I(1), r0); /* r0 = C. */ \
121
paddsw_r2r(r6, r2); /* r2 = A.. = F + A. */ \
122
paddw_r2r(r3, r4); /* r4 = E = c4 * (i0 + i4) */ \
123
psubsw_r2r(r1, r2); /* r2 = R2 = A.. - H. */ \
126
/* RowIDCT gets ready to transpose */
127
#define RowIDCT() { \
131
movq_m2r(*I(2), r3); /* r3 = D. */ \
132
psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
133
paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
134
paddsw_r2r(r7, r7); /* r7 = G + G */ \
135
paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
136
paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
137
psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
138
paddsw_r2r(r3, r3); \
139
psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
140
paddsw_r2r(r5, r5); \
141
paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
142
paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
143
psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
144
paddsw_r2r(r0, r0); \
145
movq_r2m(r1, *I(1)); /* save R1 */ \
146
paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
149
/* Column IDCT normalizes and stores final results */
150
#define ColumnIDCT() { \
154
paddsw_m2r(*Eight, r2); /* adjust R2 (and R1) for shift */ \
155
paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
156
paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
157
psraw_i2r(4, r2); /* r2 = NR2 */ \
158
psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
159
psraw_i2r(4, r1); /* r1 = NR1 */ \
160
movq_m2r(*I(2), r3); /* r3 = D. */ \
161
paddsw_r2r(r7, r7); /* r7 = G + G */ \
162
movq_r2m(r2, *I(2)); /* store NR2 at I2 */ \
163
paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
164
movq_r2m(r1, *I(1)); /* store NR1 at I1 */ \
165
psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
166
paddsw_m2r(*Eight, r4); /* adjust R4 (and R3) for shift */ \
167
paddsw_r2r(r3, r3); /* r3 = D. + D. */ \
168
paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
169
psraw_i2r(4, r4); /* r4 = NR4 */ \
170
psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
171
psraw_i2r(4, r3); /* r3 = NR3 */ \
172
paddsw_m2r(*Eight, r6); /* adjust R6 (and R5) for shift */ \
173
paddsw_r2r(r5, r5); /* r5 = B.. + B.. */ \
174
paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
175
psraw_i2r(4, r6); /* r6 = NR6 */ \
176
movq_r2m(r4, *J(4)); /* store NR4 at J4 */ \
177
psraw_i2r(4, r5); /* r5 = NR5 */ \
178
movq_r2m(r3, *I(3)); /* store NR3 at I3 */ \
179
psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
180
paddsw_m2r(*Eight, r7); /* adjust R7 (and R0) for shift */ \
181
paddsw_r2r(r0, r0); /* r0 = C. + C. */ \
182
paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
183
psraw_i2r(4, r7); /* r7 = NR7 */ \
184
movq_r2m(r6, *J(6)); /* store NR6 at J6 */ \
185
psraw_i2r(4, r0); /* r0 = NR0 */ \
186
movq_r2m(r5, *J(5)); /* store NR5 at J5 */ \
187
movq_r2m(r7, *J(7)); /* store NR7 at J7 */ \
188
movq_r2m(r0, *I(0)); /* store NR0 at I0 */ \
191
/* Following macro does two 4x4 transposes in place.
193
At entry (we assume):
217
I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
218
J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
220
Since r1 is free at entry, we calculate the Js first. */
222
#define Transpose() { \
223
movq_r2r(r4, r1); /* r1 = e3 e2 e1 e0 */ \
224
punpcklwd_r2r(r5, r4); /* r4 = f1 e1 f0 e0 */ \
225
movq_r2m(r0, *I(0)); /* save a3 a2 a1 a0 */ \
226
punpckhwd_r2r(r5, r1); /* r1 = f3 e3 f2 e2 */ \
227
movq_r2r(r6, r0); /* r0 = g3 g2 g1 g0 */ \
228
punpcklwd_r2r(r7, r6); /* r6 = h1 g1 h0 g0 */ \
229
movq_r2r(r4, r5); /* r5 = f1 e1 f0 e0 */ \
230
punpckldq_r2r(r6, r4); /* r4 = h0 g0 f0 e0 = R4 */ \
231
punpckhdq_r2r(r6, r5); /* r5 = h1 g1 f1 e1 = R5 */ \
232
movq_r2r(r1, r6); /* r6 = f3 e3 f2 e2 */ \
233
movq_r2m(r4, *J(4)); \
234
punpckhwd_r2r(r7, r0); /* r0 = h3 g3 h2 g2 */ \
235
movq_r2m(r5, *J(5)); \
236
punpckhdq_r2r(r0, r6); /* r6 = h3 g3 f3 e3 = R7 */ \
237
movq_m2r(*I(0), r4); /* r4 = a3 a2 a1 a0 */ \
238
punpckldq_r2r(r0, r1); /* r1 = h2 g2 f2 e2 = R6 */ \
239
movq_m2r(*I(1), r5); /* r5 = b3 b2 b1 b0 */ \
240
movq_r2r(r4, r0); /* r0 = a3 a2 a1 a0 */ \
241
movq_r2m(r6, *J(7)); \
242
punpcklwd_r2r(r5, r0); /* r0 = b1 a1 b0 a0 */ \
243
movq_r2m(r1, *J(6)); \
244
punpckhwd_r2r(r5, r4); /* r4 = b3 a3 b2 a2 */ \
245
movq_r2r(r2, r5); /* r5 = c3 c2 c1 c0 */ \
246
punpcklwd_r2r(r3, r2); /* r2 = d1 c1 d0 c0 */ \
247
movq_r2r(r0, r1); /* r1 = b1 a1 b0 a0 */ \
248
punpckldq_r2r(r2, r0); /* r0 = d0 c0 b0 a0 = R0 */ \
249
punpckhdq_r2r(r2, r1); /* r1 = d1 c1 b1 a1 = R1 */ \
250
movq_r2r(r4, r2); /* r2 = b3 a3 b2 a2 */ \
251
movq_r2m(r0, *I(0)); \
252
punpckhwd_r2r(r3, r5); /* r5 = d3 c3 d2 c2 */ \
253
movq_r2m(r1, *I(1)); \
254
punpckhdq_r2r(r5, r4); /* r4 = d3 c3 b3 a3 = R3 */ \
255
punpckldq_r2r(r5, r2); /* r2 = d2 c2 b2 a2 = R2 */ \
256
movq_r2m(r4, *I(3)); \
257
movq_r2m(r2, *I(2)); \
260
void vp3_dsp_init_mmx(void)
266
idct_constants[--j] = 0;
269
idct_constants[0] = idct_constants[5] =
270
idct_constants[10] = idct_constants[15] = 65535;
274
p = idct_constants + ((j + 3) << 2);
275
p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1];
278
idct_constants[44] = idct_constants[45] =
279
idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
282
void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix,
283
int coeff_count, int16_t *output_data)
285
/* eax = quantized input
286
* ebx = dequantizer matrix
287
* ecx = IDCT constants
288
* M(I) = ecx + MaskOffset(0) + I * 8
289
* C(I) = ecx + CosineOffset(32) + (I-1) * 8
294
#define M(x) (idct_constants + x * 4)
295
#define C(x) (idct_constants + 16 + (x - 1) * 4)
296
#define Eight (idct_constants + 44)
298
unsigned char *input_bytes = (unsigned char *)input_data;
299
unsigned char *dequant_matrix_bytes = (unsigned char *)dequant_matrix;
300
unsigned char *output_data_bytes = (unsigned char *)output_data;
302
movq_m2r(*(input_bytes), r0);
303
pmullw_m2r(*(dequant_matrix_bytes), r0); /* r0 = 03 02 01 00 */
304
movq_m2r(*(input_bytes+16), r1);
305
pmullw_m2r(*(dequant_matrix_bytes+16), r1); /* r1 = 13 12 11 10 */
306
movq_m2r(*M(0), r2); /* r2 = __ __ __ FF */
307
movq_r2r(r0, r3); /* r3 = 03 02 01 00 */
308
movq_m2r(*(input_bytes+8), r4);
309
psrlq_i2r(16, r0); /* r0 = __ 03 02 01 */
310
pmullw_m2r(*(dequant_matrix_bytes+8), r4); /* r4 = 07 06 05 04 */
311
pand_r2r(r2, r3); /* r3 = __ __ __ 00 */
312
movq_r2r(r0, r5); /* r5 = __ 03 02 01 */
313
movq_r2r(r1, r6); /* r6 = 13 12 11 10 */
314
pand_r2r(r2, r5); /* r5 = __ __ __ 01 */
315
psllq_i2r(32, r6); /* r6 = 11 10 __ __ */
316
movq_m2r(*M(3), r7); /* r7 = FF __ __ __ */
317
pxor_r2r(r5, r0); /* r0 = __ 03 02 __ */
318
pand_r2r(r6, r7); /* r7 = 11 __ __ __ */
319
por_r2r(r3, r0); /* r0 = __ 03 02 00 */
320
pxor_r2r(r7, r6); /* r6 = __ 10 __ __ */
321
por_r2r(r7, r0); /* r0 = 11 03 02 00 = R0 */
322
movq_m2r(*M(3), r7); /* r7 = FF __ __ __ */
323
movq_r2r(r4, r3); /* r3 = 07 06 05 04 */
324
movq_r2m(r0, *(output_data_bytes)); /* write R0 = r0 */
325
pand_r2r(r2, r3); /* r3 = __ __ __ 04 */
326
movq_m2r(*(input_bytes+32), r0);
327
psllq_i2r(16, r3); /* r3 = __ __ 04 __ */
328
pmullw_m2r(*(dequant_matrix_bytes+32), r0); /* r0 = 23 22 21 20 */
329
pand_r2r(r1, r7); /* r7 = 13 __ __ __ */
330
por_r2r(r3, r5); /* r5 = __ __ 04 01 */
331
por_r2r(r6, r7); /* r7 = 13 10 __ __ */
332
movq_m2r(*(input_bytes+24), r3);
333
por_r2r(r5, r7); /* r7 = 13 10 04 01 = R1 */
334
pmullw_m2r(*(dequant_matrix_bytes+24), r3); /* r3 = 17 16 15 14 */
335
psrlq_i2r(16, r4); /* r4 = __ 07 06 05 */
336
movq_r2m(r7, *(output_data_bytes+16)); /* write R1 = r7 */
337
movq_r2r(r4, r5); /* r5 = __ 07 06 05 */
338
movq_r2r(r0, r7); /* r7 = 23 22 21 20 */
339
psrlq_i2r(16, r4); /* r4 = __ __ 07 06 */
340
psrlq_i2r(48, r7); /* r7 = __ __ __ 23 */
341
movq_r2r(r2, r6); /* r6 = __ __ __ FF */
342
pand_r2r(r2, r5); /* r5 = __ __ __ 05 */
343
pand_r2r(r4, r6); /* r6 = __ __ __ 06 */
344
movq_r2m(r7, *(output_data_bytes+80)); /* partial R9 = __ __ __ 23 */
345
pxor_r2r(r6, r4); /* r4 = __ __ 07 __ */
346
psrlq_i2r(32, r1); /* r1 = __ __ 13 12 */
347
por_r2r(r5, r4); /* r4 = __ __ 07 05 */
348
movq_m2r(*M(3), r7); /* r7 = FF __ __ __ */
349
pand_r2r(r2, r1); /* r1 = __ __ __ 12 */
350
movq_m2r(*(input_bytes+48), r5);
351
psllq_i2r(16, r0); /* r0 = 22 21 20 __ */
352
pmullw_m2r(*(dequant_matrix_bytes+48), r5); /* r5 = 33 32 31 30 */
353
pand_r2r(r0, r7); /* r7 = 22 __ __ __ */
354
movq_r2m(r1, *(output_data_bytes+64)); /* partial R8 = __ __ __ 12 */
355
por_r2r(r4, r7); /* r7 = 22 __ 07 05 */
356
movq_r2r(r3, r4); /* r4 = 17 16 15 14 */
357
pand_r2r(r2, r3); /* r3 = __ __ __ 14 */
358
movq_m2r(*M(2), r1); /* r1 = __ FF __ __ */
359
psllq_i2r(32, r3); /* r3 = __ 14 __ __ */
360
por_r2r(r3, r7); /* r7 = 22 14 07 05 = R2 */
361
movq_r2r(r5, r3); /* r3 = 33 32 31 30 */
362
psllq_i2r(48, r3); /* r3 = 30 __ __ __ */
363
pand_r2r(r0, r1); /* r1 = __ 21 __ __ */
364
movq_r2m(r7, *(output_data_bytes+32)); /* write R2 = r7 */
365
por_r2r(r3, r6); /* r6 = 30 __ __ 06 */
366
movq_m2r(*M(1), r7); /* r7 = __ __ FF __ */
367
por_r2r(r1, r6); /* r6 = 30 21 __ 06 */
368
movq_m2r(*(input_bytes+56), r1);
369
pand_r2r(r4, r7); /* r7 = __ __ 15 __ */
370
pmullw_m2r(*(dequant_matrix_bytes+56), r1); /* r1 = 37 36 35 34 */
371
por_r2r(r6, r7); /* r7 = 30 21 15 06 = R3 */
372
pand_m2r(*M(1), r0); /* r0 = __ __ 20 __ */
373
psrlq_i2r(32, r4); /* r4 = __ __ 17 16 */
374
movq_r2m(r7, *(output_data_bytes+48)); /* write R3 = r7 */
375
movq_r2r(r4, r6); /* r6 = __ __ 17 16 */
376
movq_m2r(*M(3), r7); /* r7 = FF __ __ __ */
377
pand_r2r(r2, r4); /* r4 = __ __ __ 16 */
378
movq_m2r(*M(1), r3); /* r3 = __ __ FF __ */
379
pand_r2r(r1, r7); /* r7 = 37 __ __ __ */
380
pand_r2r(r5, r3); /* r3 = __ __ 31 __ */
381
por_r2r(r4, r0); /* r0 = __ __ 20 16 */
382
psllq_i2r(16, r3); /* r3 = __ 31 __ __ */
383
por_r2r(r0, r7); /* r7 = 37 __ 20 16 */
384
movq_m2r(*M(2), r4); /* r4 = __ FF __ __ */
385
por_r2r(r3, r7); /* r7 = 37 31 20 16 = R4 */
386
movq_m2r(*(input_bytes+80), r0);
387
movq_r2r(r4, r3); /* r3 = __ __ FF __ */
388
pmullw_m2r(*(dequant_matrix_bytes+80), r0); /* r0 = 53 52 51 50 */
389
pand_r2r(r5, r4); /* r4 = __ 32 __ __ */
390
movq_r2m(r7, *(output_data_bytes+8)); /* write R4 = r7 */
391
por_r2r(r4, r6); /* r6 = __ 32 17 16 */
392
movq_r2r(r3, r4); /* r4 = __ FF __ __ */
393
psrlq_i2r(16, r6); /* r6 = __ __ 32 17 */
394
movq_r2r(r0, r7); /* r7 = 53 52 51 50 */
395
pand_r2r(r1, r4); /* r4 = __ 36 __ __ */
396
psllq_i2r(48, r7); /* r7 = 50 __ __ __ */
397
por_r2r(r4, r6); /* r6 = __ 36 32 17 */
398
movq_m2r(*(input_bytes+88), r4);
399
por_r2r(r6, r7); /* r7 = 50 36 32 17 = R5 */
400
pmullw_m2r(*(dequant_matrix_bytes+88), r4); /* r4 = 57 56 55 54 */
401
psrlq_i2r(16, r3); /* r3 = __ __ FF __ */
402
movq_r2m(r7, *(output_data_bytes+24)); /* write R5 = r7 */
403
pand_r2r(r1, r3); /* r3 = __ __ 35 __ */
404
psrlq_i2r(48, r5); /* r5 = __ __ __ 33 */
405
pand_r2r(r2, r1); /* r1 = __ __ __ 34 */
406
movq_m2r(*(input_bytes+104), r6);
407
por_r2r(r3, r5); /* r5 = __ __ 35 33 */
408
pmullw_m2r(*(dequant_matrix_bytes+104), r6); /* r6 = 67 66 65 64 */
409
psrlq_i2r(16, r0); /* r0 = __ 53 52 51 */
410
movq_r2r(r4, r7); /* r7 = 57 56 55 54 */
411
movq_r2r(r2, r3); /* r3 = __ __ __ FF */
412
psllq_i2r(48, r7); /* r7 = 54 __ __ __ */
413
pand_r2r(r0, r3); /* r3 = __ __ __ 51 */
414
pxor_r2r(r3, r0); /* r0 = __ 53 52 __ */
415
psllq_i2r(32, r3); /* r3 = __ 51 __ __ */
416
por_r2r(r5, r7); /* r7 = 54 __ 35 33 */
417
movq_r2r(r6, r5); /* r5 = 67 66 65 64 */
418
pand_m2r(*M(1), r6); /* r6 = __ __ 65 __ */
419
por_r2r(r3, r7); /* r7 = 54 51 35 33 = R6 */
420
psllq_i2r(32, r6); /* r6 = 65 __ __ __ */
421
por_r2r(r1, r0); /* r0 = __ 53 52 34 */
422
movq_r2m(r7, *(output_data_bytes+40)); /* write R6 = r7 */
423
por_r2r(r6, r0); /* r0 = 65 53 52 34 = R7 */
424
movq_m2r(*(input_bytes+120), r7);
425
movq_r2r(r5, r6); /* r6 = 67 66 65 64 */
426
pmullw_m2r(*(dequant_matrix_bytes+120), r7); /* r7 = 77 76 75 74 */
427
psrlq_i2r(32, r5); /* r5 = __ __ 67 66 */
428
pand_r2r(r2, r6); /* r6 = __ __ __ 64 */
429
movq_r2r(r5, r1); /* r1 = __ __ 67 66 */
430
movq_r2m(r0, *(output_data_bytes+56)); /* write R7 = r0 */
431
pand_r2r(r2, r1); /* r1 = __ __ __ 66 */
432
movq_m2r(*(input_bytes+112), r0);
433
movq_r2r(r7, r3); /* r3 = 77 76 75 74 */
434
pmullw_m2r(*(dequant_matrix_bytes+112), r0); /* r0 = 73 72 71 70 */
435
psllq_i2r(16, r3); /* r3 = 76 75 74 __ */
436
pand_m2r(*M(3), r7); /* r7 = 77 __ __ __ */
437
pxor_r2r(r1, r5); /* r5 = __ __ 67 __ */
438
por_r2r(r5, r6); /* r6 = __ __ 67 64 */
439
movq_r2r(r3, r5); /* r5 = 76 75 74 __ */
440
pand_m2r(*M(3), r5); /* r5 = 76 __ __ __ */
441
por_r2r(r1, r7); /* r7 = 77 __ __ 66 */
442
movq_m2r(*(input_bytes+96), r1);
443
pxor_r2r(r5, r3); /* r3 = __ 75 74 __ */
444
pmullw_m2r(*(dequant_matrix_bytes+96), r1); /* r1 = 63 62 61 60 */
445
por_r2r(r3, r7); /* r7 = 77 75 74 66 = R15 */
446
por_r2r(r5, r6); /* r6 = 76 __ 67 64 */
447
movq_r2r(r0, r5); /* r5 = 73 72 71 70 */
448
movq_r2m(r7, *(output_data_bytes+120)); /* store R15 = r7 */
449
psrlq_i2r(16, r5); /* r5 = __ 73 72 71 */
450
pand_m2r(*M(2), r5); /* r5 = __ 73 __ __ */
451
movq_r2r(r0, r7); /* r7 = 73 72 71 70 */
452
por_r2r(r5, r6); /* r6 = 76 73 67 64 = R14 */
453
pand_r2r(r2, r0); /* r0 = __ __ __ 70 */
454
pxor_r2r(r0, r7); /* r7 = 73 72 71 __ */
455
psllq_i2r(32, r0); /* r0 = __ 70 __ __ */
456
movq_r2m(r6, *(output_data_bytes+104)); /* write R14 = r6 */
457
psrlq_i2r(16, r4); /* r4 = __ 57 56 55 */
458
movq_m2r(*(input_bytes+72), r5);
459
psllq_i2r(16, r7); /* r7 = 72 71 __ __ */
460
pmullw_m2r(*(dequant_matrix_bytes+72), r5); /* r5 = 47 46 45 44 */
461
movq_r2r(r7, r6); /* r6 = 72 71 __ __ */
462
movq_m2r(*M(2), r3); /* r3 = __ FF __ __ */
463
psllq_i2r(16, r6); /* r6 = 71 __ __ __ */
464
pand_m2r(*M(3), r7); /* r7 = 72 __ __ __ */
465
pand_r2r(r1, r3); /* r3 = __ 62 __ __ */
466
por_r2r(r0, r7); /* r7 = 72 70 __ __ */
467
movq_r2r(r1, r0); /* r0 = 63 62 61 60 */
468
pand_m2r(*M(3), r1); /* r1 = 63 __ __ __ */
469
por_r2r(r3, r6); /* r6 = 71 62 __ __ */
470
movq_r2r(r4, r3); /* r3 = __ 57 56 55 */
471
psrlq_i2r(32, r1); /* r1 = __ __ 63 __ */
472
pand_r2r(r2, r3); /* r3 = __ __ __ 55 */
473
por_r2r(r1, r7); /* r7 = 72 70 63 __ */
474
por_r2r(r3, r7); /* r7 = 72 70 63 55 = R13 */
475
movq_r2r(r4, r3); /* r3 = __ 57 56 55 */
476
pand_m2r(*M(1), r3); /* r3 = __ __ 56 __ */
477
movq_r2r(r5, r1); /* r1 = 47 46 45 44 */
478
movq_r2m(r7, *(output_data_bytes+88)); /* write R13 = r7 */
479
psrlq_i2r(48, r5); /* r5 = __ __ __ 47 */
480
movq_m2r(*(input_bytes+64), r7);
481
por_r2r(r3, r6); /* r6 = 71 62 56 __ */
482
pmullw_m2r(*(dequant_matrix_bytes+64), r7); /* r7 = 43 42 41 40 */
483
por_r2r(r5, r6); /* r6 = 71 62 56 47 = R12 */
484
pand_m2r(*M(2), r4); /* r4 = __ 57 __ __ */
485
psllq_i2r(32, r0); /* r0 = 61 60 __ __ */
486
movq_r2m(r6, *(output_data_bytes+72)); /* write R12 = r6 */
487
movq_r2r(r0, r6); /* r6 = 61 60 __ __ */
488
pand_m2r(*M(3), r0); /* r0 = 61 __ __ __ */
489
psllq_i2r(16, r6); /* r6 = 60 __ __ __ */
490
movq_m2r(*(input_bytes+40), r5);
491
movq_r2r(r1, r3); /* r3 = 47 46 45 44 */
492
pmullw_m2r(*(dequant_matrix_bytes+40), r5); /* r5 = 27 26 25 24 */
493
psrlq_i2r(16, r1); /* r1 = __ 47 46 45 */
494
pand_m2r(*M(1), r1); /* r1 = __ __ 46 __ */
495
por_r2r(r4, r0); /* r0 = 61 57 __ __ */
496
pand_r2r(r7, r2); /* r2 = __ __ __ 40 */
497
por_r2r(r1, r0); /* r0 = 61 57 46 __ */
498
por_r2r(r2, r0); /* r0 = 61 57 46 40 = R11 */
499
psllq_i2r(16, r3); /* r3 = 46 45 44 __ */
500
movq_r2r(r3, r4); /* r4 = 46 45 44 __ */
501
movq_r2r(r5, r2); /* r2 = 27 26 25 24 */
502
movq_r2m(r0, *(output_data_bytes+112)); /* write R11 = r0 */
503
psrlq_i2r(48, r2); /* r2 = __ __ __ 27 */
504
pand_m2r(*M(2), r4); /* r4 = __ 45 __ __ */
505
por_r2r(r2, r6); /* r6 = 60 __ __ 27 */
506
movq_m2r(*M(1), r2); /* r2 = __ __ FF __ */
507
por_r2r(r4, r6); /* r6 = 60 45 __ 27 */
508
pand_r2r(r7, r2); /* r2 = __ __ 41 __ */
509
psllq_i2r(32, r3); /* r3 = 44 __ __ __ */
510
por_m2r(*(output_data_bytes+80), r3); /* r3 = 44 __ __ 23 */
511
por_r2r(r2, r6); /* r6 = 60 45 41 27 = R10 */
512
movq_m2r(*M(3), r2); /* r2 = FF __ __ __ */
513
psllq_i2r(16, r5); /* r5 = 26 25 24 __ */
514
movq_r2m(r6, *(output_data_bytes+96)); /* store R10 = r6 */
515
pand_r2r(r5, r2); /* r2 = 26 __ __ __ */
516
movq_m2r(*M(2), r6); /* r6 = __ FF __ __ */
517
pxor_r2r(r2, r5); /* r5 = __ 25 24 __ */
518
pand_r2r(r7, r6); /* r6 = __ 42 __ __ */
519
psrlq_i2r(32, r2); /* r2 = __ __ 26 __ */
520
pand_m2r(*M(3), r7); /* r7 = 43 __ __ __ */
521
por_r2r(r2, r3); /* r3 = 44 __ 26 23 */
522
por_m2r(*(output_data_bytes+64), r7); /* r7 = 43 __ __ 12 */
523
por_r2r(r3, r6); /* r6 = 44 42 26 23 = R9 */
524
por_r2r(r5, r7); /* r7 = 43 25 24 12 = R8 */
525
movq_r2m(r6, *(output_data_bytes+80)); /* store R9 = r6 */
526
movq_r2m(r7, *(output_data_bytes+64)); /* store R8 = r7 */
531
/* at this point, function has completed dequantization + dezigzag +
532
* partial transposition; now do the idct itself */
534
#define I(K) (output_data + K * 8)
535
#define J(K) (output_data + ((K - 4) * 8) + 4)
542
#define I(K) (output_data + (K * 8) + 32)
543
#define J(K) (output_data + ((K - 4) * 8) + 36)
550
#define I(K) (output_data + K * 8)
551
#define J(K) (output_data + K * 8)
557
#define I(K) (output_data + (K * 8) + 4)
558
#define J(K) (output_data + (K * 8) + 4)