/*
 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
20
#include "../dsputil.h"
32
/* 14-bit fixed-point IDCT cosine constants:
 * Ci = round(cos(i*M_PI/16) * sqrt(2) * (1<<14)), read by pmaddwd pairs. */
#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#if 0
#define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#else
/* NOTE: rounded DOWN (16383 instead of 16384) — see the "- 0.5" comment;
 * the exact value apparently risks overflow in the 16-bit multiplies. */
#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
#endif
#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5

/* Row pass shifts results down by 11 bits (matches the literal 11 passed as
 * the shift argument to every row-IDCT macro invocation below); restored
 * here because coeffs[] uses 1<<(ROW_SHIFT-1). */
#define ROW_SHIFT 11
/* Column pass shifts down by 20 bits (matches the literal 20 in the
 * column-IDCT macro invocations). */
#define COL_SHIFT 20 // 6
48
/* 8-byte-aligned constant for movq: per 32-bit lane, keeps the high 16-bit
   word and clears the low one. Used via "pand" in the DC_COND_*/Z_COND
   macros to drop the DC word before OR-ing everything together, so a row
   whose only nonzero coefficient is the DC term takes the fast path. */
static const uint64_t attribute_used __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
49
/* 0x40000: bias constant added ("paddd") before the "psrad $13" in the
   DC-only shortcut of DC_COND_IDCT below; 8-byte aligned for the asm. */
static const uint64_t attribute_used __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
51
/* Packed 16-bit constant pool referenced from the inline asm as offsets off
   %2 (rounders at 0/8, cosine pairs from offset 16 up to 104).
   NOTE(review): this excerpt shows only the first initializer rows — the
   remaining entries and the closing "};" are not visible in this chunk. */
static const int16_t __attribute__((aligned(8))) coeffs[]= {
52
1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
53
// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
54
// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
55
1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
56
// the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
80
static void unused_var_killer(){
81
int a= wm1010 + d40000;
85
/*
 * Scalar reference implementation of one 8-point IDCT column pass.
 * col:   in/out — the column, strided 8 (one int16_t per row of the block).
 * input: row-pass output in the MMX code's interleaved layout, from which
 *        the column is gathered before the butterfly (see loads below).
 */
static void inline idctCol (int16_t * col, int16_t *input)
{
/* Shadow any file-level C0..C7 macros: this pass uses function-local
   constants (with the overflow-safe C4 = 16383). */
#undef C0
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#ifndef COL_SHIFT
#define COL_SHIFT 20 /* fallback; normally defined at the top of this file */
#endif
        int a0, a1, a2, a3, b0, b1, b2, b3;
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
        const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5

        /* DC-only shortcut: if every AC coefficient of the column is zero,
           each output sample is just the DC value scaled by 8.
           NOTE(review): the test reads col[] before the input[] gather, so
           the caller's col[] must already hold valid data — confirm against
           the full file's calling convention. */
        if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
                col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
                        col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
                return;
        }

        /* Gather the column's 8 coefficients from input[]; the indices are
           interleaved because the MMX row pass stores its results permuted. */
        col[8*0] = input[8*0 + 0];
        col[8*1] = input[8*2 + 0];
        col[8*2] = input[8*0 + 1];
        col[8*3] = input[8*2 + 1];
        col[8*4] = input[8*4 + 0];
        col[8*5] = input[8*6 + 0];
        col[8*6] = input[8*4 + 1];
        col[8*7] = input[8*6 + 1];

        /* Even half (coefficients 0,2,4,6) with the rounding bias so the
           final >> COL_SHIFT rounds to nearest. */
        a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
        a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
        a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
        a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));

        /* Odd half (coefficients 1,3,5,7). */
        b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
        b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
        b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
        b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];

        /* Butterfly: sums into samples 0..3, mirrored differences into 4..7. */
        col[8*0] = (a0 + b0) >> COL_SHIFT;
        col[8*1] = (a1 + b1) >> COL_SHIFT;
        col[8*2] = (a2 + b2) >> COL_SHIFT;
        col[8*3] = (a3 + b3) >> COL_SHIFT;
        col[8*4] = (a3 - b3) >> COL_SHIFT;
        col[8*5] = (a2 - b2) >> COL_SHIFT;
        col[8*6] = (a1 - b1) >> COL_SHIFT;
        col[8*7] = (a0 - b0) >> COL_SHIFT;
}
140
/*
 * Scalar reference implementation of one 8-point IDCT row pass (the MMX
 * macros below perform the equivalent computation).
 * NOTE(review): this fragment is incomplete in the excerpt — the body
 * braces, the declaration/loading of "row", the else-branch, and the final
 * copy into output[] are not visible; "row" is presumably a local filled
 * from input[] with results copied to output[] — confirm in the full file.
 */
static void inline idctRow (int16_t * output, int16_t * input)
144
int a0, a1, a2, a3, b0, b1, b2, b3;
145
/* Local 14-bit fixed-point cosine constants (C4 = overflow-safe 16383). */
const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
146
const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
147
const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
148
const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
149
const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
150
const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151
const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152
const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
163
/* DC-only shortcut: all AC coefficients zero -> every sample is DC<<3. */
if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
164
row[0] = row[1] = row[2] = row[3] = row[4] =
165
row[5] = row[6] = row[7] = row[0]<<3;
177
/* Even half (coefficients 0,2,4,6) plus rounding bias for >> ROW_SHIFT. */
a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
178
a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
179
a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
180
a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
182
/* Odd half (coefficients 1,3,5,7). */
b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
183
b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
184
b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
185
b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
187
/* Butterfly: sums into samples 0..3, mirrored differences into 4..7. */
row[0] = (a0 + b0) >> ROW_SHIFT;
188
row[1] = (a1 + b1) >> ROW_SHIFT;
189
row[2] = (a2 + b2) >> ROW_SHIFT;
190
row[3] = (a3 + b3) >> ROW_SHIFT;
191
row[4] = (a3 - b3) >> ROW_SHIFT;
192
row[5] = (a2 - b2) >> ROW_SHIFT;
193
row[6] = (a1 - b1) >> ROW_SHIFT;
194
row[7] = (a0 - b0) >> ROW_SHIFT;
207
static inline void idct(int16_t *block)
209
int64_t __attribute__((aligned(8))) align_tmp[16];
210
int16_t * const temp= (int16_t*)align_tmp;
213
#if 0 //Alternative, simpler variant
215
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
216
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
217
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
218
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
219
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
220
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
221
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
222
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
223
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
224
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
225
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
226
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
227
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
228
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
229
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
230
#rounder ", %%mm4 \n\t"\
231
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
232
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
233
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
234
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
235
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
236
#rounder ", %%mm0 \n\t"\
237
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
238
"paddd %%mm0, %%mm0 \n\t" \
239
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
240
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
241
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
242
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
243
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
244
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
245
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
246
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
247
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
248
"psrad $" #shift ", %%mm7 \n\t"\
249
"psrad $" #shift ", %%mm4 \n\t"\
250
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
251
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
252
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
253
"psrad $" #shift ", %%mm1 \n\t"\
254
"psrad $" #shift ", %%mm2 \n\t"\
255
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
256
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
257
"movq %%mm7, " #dst " \n\t"\
258
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
259
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
260
"movq %%mm2, 24+" #dst " \n\t"\
261
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
262
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
263
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
264
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
265
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
266
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
267
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
268
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
269
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
270
"psrad $" #shift ", %%mm2 \n\t"\
271
"psrad $" #shift ", %%mm0 \n\t"\
272
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
273
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
274
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
275
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
276
"psrad $" #shift ", %%mm6 \n\t"\
277
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
278
"movq %%mm2, 8+" #dst " \n\t"\
279
"psrad $" #shift ", %%mm4 \n\t"\
280
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
281
"movq %%mm4, 16+" #dst " \n\t"\
283
#define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
284
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
285
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
286
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
287
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
288
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
289
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
290
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
291
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
292
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
293
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
294
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
295
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
296
#rounder ", %%mm4 \n\t"\
297
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
298
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
299
#rounder ", %%mm0 \n\t"\
300
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
301
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
302
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
303
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
304
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
305
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
306
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
307
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
308
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
309
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
310
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
311
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
312
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
313
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
314
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
315
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
316
"psrad $" #shift ", %%mm7 \n\t"\
317
"psrad $" #shift ", %%mm4 \n\t"\
318
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
319
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
320
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
321
"psrad $" #shift ", %%mm0 \n\t"\
322
"psrad $" #shift ", %%mm2 \n\t"\
323
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
324
"movd %%mm7, " #dst " \n\t"\
325
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
326
"movd %%mm0, 16+" #dst " \n\t"\
327
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
328
"movd %%mm2, 96+" #dst " \n\t"\
329
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
330
"movd %%mm4, 112+" #dst " \n\t"\
331
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
332
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
333
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
334
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
335
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
336
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
337
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
338
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
339
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
340
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
341
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
342
"psrad $" #shift ", %%mm2 \n\t"\
343
"psrad $" #shift ", %%mm5 \n\t"\
344
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
345
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
346
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
347
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
348
"psrad $" #shift ", %%mm6 \n\t"\
349
"psrad $" #shift ", %%mm4 \n\t"\
350
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
351
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
352
"movd %%mm2, 32+" #dst " \n\t"\
353
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
354
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
355
"movd %%mm6, 48+" #dst " \n\t"\
356
"movd %%mm4, 64+" #dst " \n\t"\
357
"movd %%mm5, 80+" #dst " \n\t"\
360
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
361
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
362
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
363
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
364
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
365
"movq "MANGLE(wm1010)", %%mm4 \n\t"\
366
"pand %%mm0, %%mm4 \n\t"\
367
"por %%mm1, %%mm4 \n\t"\
368
"por %%mm2, %%mm4 \n\t"\
369
"por %%mm3, %%mm4 \n\t"\
370
"packssdw %%mm4,%%mm4 \n\t"\
371
"movd %%mm4, %%eax \n\t"\
372
"orl %%eax, %%eax \n\t"\
374
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
375
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
376
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
377
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
378
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
379
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
380
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
381
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
382
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
383
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
384
#rounder ", %%mm4 \n\t"\
385
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
386
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
387
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
388
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
389
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
390
#rounder ", %%mm0 \n\t"\
391
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
392
"paddd %%mm0, %%mm0 \n\t" \
393
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
394
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
395
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
396
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
397
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
398
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
399
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
400
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
401
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
402
"psrad $" #shift ", %%mm7 \n\t"\
403
"psrad $" #shift ", %%mm4 \n\t"\
404
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
405
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
406
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
407
"psrad $" #shift ", %%mm1 \n\t"\
408
"psrad $" #shift ", %%mm2 \n\t"\
409
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
410
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
411
"movq %%mm7, " #dst " \n\t"\
412
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
413
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
414
"movq %%mm2, 24+" #dst " \n\t"\
415
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
416
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
417
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
418
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
419
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
420
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
421
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
422
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
423
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
424
"psrad $" #shift ", %%mm2 \n\t"\
425
"psrad $" #shift ", %%mm0 \n\t"\
426
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
427
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
428
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
429
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
430
"psrad $" #shift ", %%mm6 \n\t"\
431
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
432
"movq %%mm2, 8+" #dst " \n\t"\
433
"psrad $" #shift ", %%mm4 \n\t"\
434
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
435
"movq %%mm4, 16+" #dst " \n\t"\
438
"pslld $16, %%mm0 \n\t"\
439
"#paddd "MANGLE(d40000)", %%mm0 \n\t"\
440
"psrad $13, %%mm0 \n\t"\
441
"packssdw %%mm0, %%mm0 \n\t"\
442
"movq %%mm0, " #dst " \n\t"\
443
"movq %%mm0, 8+" #dst " \n\t"\
444
"movq %%mm0, 16+" #dst " \n\t"\
445
"movq %%mm0, 24+" #dst " \n\t"\
449
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
450
ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
451
/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
452
ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
453
ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
455
DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
456
DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
457
DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
460
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
461
COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
462
COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
463
COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
464
COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
468
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
469
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
470
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
471
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
472
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
473
"movq "MANGLE(wm1010)", %%mm4 \n\t"\
474
"pand %%mm0, %%mm4 \n\t"\
475
"por %%mm1, %%mm4 \n\t"\
476
"por %%mm2, %%mm4 \n\t"\
477
"por %%mm3, %%mm4 \n\t"\
478
"packssdw %%mm4,%%mm4 \n\t"\
479
"movd %%mm4, %%eax \n\t"\
480
"orl %%eax, %%eax \n\t"\
482
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
483
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
484
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
485
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
486
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
487
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
488
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
489
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
490
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
491
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
492
#rounder ", %%mm4 \n\t"\
493
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
494
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
495
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
496
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
497
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
498
#rounder ", %%mm0 \n\t"\
499
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
500
"paddd %%mm0, %%mm0 \n\t" \
501
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
502
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
503
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
504
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
505
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
506
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
507
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
508
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
509
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
510
"psrad $" #shift ", %%mm7 \n\t"\
511
"psrad $" #shift ", %%mm4 \n\t"\
512
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
513
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
514
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
515
"psrad $" #shift ", %%mm1 \n\t"\
516
"psrad $" #shift ", %%mm2 \n\t"\
517
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
518
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
519
"movq %%mm7, " #dst " \n\t"\
520
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
521
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
522
"movq %%mm2, 24+" #dst " \n\t"\
523
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
524
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
525
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
526
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
527
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
528
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
529
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
530
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
531
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
532
"psrad $" #shift ", %%mm2 \n\t"\
533
"psrad $" #shift ", %%mm0 \n\t"\
534
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
535
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
536
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
537
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
538
"psrad $" #shift ", %%mm6 \n\t"\
539
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
540
"movq %%mm2, 8+" #dst " \n\t"\
541
"psrad $" #shift ", %%mm4 \n\t"\
542
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
543
"movq %%mm4, 16+" #dst " \n\t"\
546
"pslld $16, %%mm0 \n\t"\
547
"paddd "MANGLE(d40000)", %%mm0 \n\t"\
548
"psrad $13, %%mm0 \n\t"\
549
"packssdw %%mm0, %%mm0 \n\t"\
550
"movq %%mm0, " #dst " \n\t"\
551
"movq %%mm0, 8+" #dst " \n\t"\
552
"movq %%mm0, 16+" #dst " \n\t"\
553
"movq %%mm0, 24+" #dst " \n\t"\
556
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
557
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
558
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
559
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
560
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
561
"movq %%mm0, %%mm4 \n\t"\
562
"por %%mm1, %%mm4 \n\t"\
563
"por %%mm2, %%mm4 \n\t"\
564
"por %%mm3, %%mm4 \n\t"\
565
"packssdw %%mm4,%%mm4 \n\t"\
566
"movd %%mm4, %%eax \n\t"\
567
"orl %%eax, %%eax \n\t"\
569
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
570
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
571
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
572
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
573
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
574
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
575
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
576
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
577
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
578
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
579
#rounder ", %%mm4 \n\t"\
580
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
581
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
582
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
583
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
584
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
585
#rounder ", %%mm0 \n\t"\
586
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
587
"paddd %%mm0, %%mm0 \n\t" \
588
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
589
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
590
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
591
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
592
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
593
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
594
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
595
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
596
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
597
"psrad $" #shift ", %%mm7 \n\t"\
598
"psrad $" #shift ", %%mm4 \n\t"\
599
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
600
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
601
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
602
"psrad $" #shift ", %%mm1 \n\t"\
603
"psrad $" #shift ", %%mm2 \n\t"\
604
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
605
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
606
"movq %%mm7, " #dst " \n\t"\
607
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
608
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
609
"movq %%mm2, 24+" #dst " \n\t"\
610
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
611
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
612
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
613
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
614
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
615
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
616
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
617
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
618
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
619
"psrad $" #shift ", %%mm2 \n\t"\
620
"psrad $" #shift ", %%mm0 \n\t"\
621
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
622
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
623
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
624
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
625
"psrad $" #shift ", %%mm6 \n\t"\
626
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
627
"movq %%mm2, 8+" #dst " \n\t"\
628
"psrad $" #shift ", %%mm4 \n\t"\
629
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
630
"movq %%mm4, 16+" #dst " \n\t"\
632
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
633
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
634
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
635
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
636
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
637
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
638
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
639
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
640
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
641
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
642
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
643
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
644
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
645
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
646
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
647
#rounder ", %%mm4 \n\t"\
648
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
649
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
650
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
651
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
652
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
653
#rounder ", %%mm0 \n\t"\
654
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
655
"paddd %%mm0, %%mm0 \n\t" \
656
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
657
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
658
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
659
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
660
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
661
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
662
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
663
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
664
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
665
"psrad $" #shift ", %%mm7 \n\t"\
666
"psrad $" #shift ", %%mm4 \n\t"\
667
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
668
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
669
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
670
"psrad $" #shift ", %%mm1 \n\t"\
671
"psrad $" #shift ", %%mm2 \n\t"\
672
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
673
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
674
"movq %%mm7, " #dst " \n\t"\
675
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
676
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
677
"movq %%mm2, 24+" #dst " \n\t"\
678
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
679
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
680
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
681
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
682
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
683
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
684
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
685
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
686
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
687
"psrad $" #shift ", %%mm2 \n\t"\
688
"psrad $" #shift ", %%mm0 \n\t"\
689
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
690
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
691
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
692
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
693
"psrad $" #shift ", %%mm6 \n\t"\
694
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
695
"movq %%mm2, 8+" #dst " \n\t"\
696
"psrad $" #shift ", %%mm4 \n\t"\
697
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
698
"movq %%mm4, 16+" #dst " \n\t"\
700
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
701
DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
702
Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
703
Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
704
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
707
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
708
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
709
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
710
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
711
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
712
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
713
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
714
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
715
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
716
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
717
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
718
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
719
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
720
#rounder ", %%mm4 \n\t"\
721
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
722
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
723
#rounder ", %%mm0 \n\t"\
724
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
725
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
726
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
727
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
728
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
729
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
730
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
731
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
732
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
733
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
734
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
735
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
736
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
737
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
738
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
739
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
740
"psrad $" #shift ", %%mm7 \n\t"\
741
"psrad $" #shift ", %%mm4 \n\t"\
742
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
743
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
744
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
745
"psrad $" #shift ", %%mm0 \n\t"\
746
"psrad $" #shift ", %%mm2 \n\t"\
747
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
748
"movd %%mm7, " #dst " \n\t"\
749
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
750
"movd %%mm0, 16+" #dst " \n\t"\
751
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
752
"movd %%mm2, 96+" #dst " \n\t"\
753
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
754
"movd %%mm4, 112+" #dst " \n\t"\
755
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
756
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
757
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
758
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
759
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
760
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
761
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
762
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
763
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
764
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
765
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
766
"psrad $" #shift ", %%mm2 \n\t"\
767
"psrad $" #shift ", %%mm5 \n\t"\
768
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
769
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
770
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
771
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
772
"psrad $" #shift ", %%mm6 \n\t"\
773
"psrad $" #shift ", %%mm4 \n\t"\
774
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
775
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
776
"movd %%mm2, 32+" #dst " \n\t"\
777
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
778
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
779
"movd %%mm6, 48+" #dst " \n\t"\
780
"movd %%mm4, 64+" #dst " \n\t"\
781
"movd %%mm5, 80+" #dst " \n\t"
784
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
785
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
786
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
787
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
788
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
793
Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
794
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
797
/*
 * Column-pass IDCT over one 4-column strip (MMX, two columns per quadword).
 * Operand naming per the register comments below: src0 holds rows 0/4,
 * src4 rows 2/6, src1 rows 1/3, src5 rows 5/7.  NOTE(review): this variant
 * never reads src1 — presumably the caller has established rows 1/3 are
 * zero (via the Z_COND_IDCT checks); confirm against the caller.
 * Fix: removed interleaved bare line-number junk that broke the
 * backslash-continued macro body; asm text is unchanged.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
#rounder ", %%mm0 \n\t"\
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
"movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
"paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
"paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
"movd %%mm1, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"movd %%mm2, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"movd %%mm4, 112+" #dst " \n\t"\
"movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
"paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movd %%mm2, 32+" #dst " \n\t"\
"packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm1, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
861
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
862
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
863
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
864
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
865
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
870
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
873
/*
 * Column-pass IDCT variant: only src0 (rows 0/4) and src5 (rows 5/7) are
 * read; src4 and src1 are ignored — presumably the caller proved rows
 * 1/2/3/6 are zero (TODO confirm against the Z_COND_IDCT dispatch).
 * The even-part terms A0..A3 therefore collapse to the C4 products only.
 * Fix: removed interleaved bare line-number junk that broke the
 * backslash-continued macro body; asm text is unchanged.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
#rounder ", %%mm0 \n\t"\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
"movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
"paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
"paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
"movd %%mm1, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"movd %%mm2, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"movd %%mm4, 112+" #dst " \n\t"\
"movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
"paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movd %%mm2, 32+" #dst " \n\t"\
"packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm1, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
929
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
930
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
931
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
932
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
933
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
938
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
941
/*
 * Column-pass IDCT variant: reads src0 (rows 0/4), src1 (rows 1/3) and
 * src5 (rows 5/7); src4 is ignored — presumably rows 2/6 are known zero
 * (TODO confirm against the caller's Z_COND_IDCT checks), so the even
 * part uses only the C4 products.
 * Fix: removed interleaved bare line-number junk that broke the
 * backslash-continued macro body; asm text is unchanged.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
#rounder ", %%mm0 \n\t"\
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"movd %%mm7, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"movd %%mm2, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"movd %%mm4, 112+" #dst " \n\t"\
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movd %%mm2, 32+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
1008
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1009
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1010
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1011
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1012
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1018
/*
 * Column-pass IDCT variant: only src0 (rows 0/4) and src1 (rows 1/3) are
 * read; src4 and src5 are ignored — presumably rows 2/5/6/7 are known
 * zero (TODO confirm against the caller), so B0==B1 uses only the
 * C3/C1 products of rows 1/3 and the odd part skips rows 5/7 entirely.
 * Fix: removed interleaved bare line-number junk that broke the
 * backslash-continued macro body; asm text is unchanged.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
#rounder ", %%mm0 \n\t"\
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 64(%2), %%mm3 \n\t"\
"pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
"paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"movd %%mm7, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
"movd %%mm1, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"movd %%mm4, 112+" #dst " \n\t"\
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
"pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
"pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
"movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
"paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
"paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movd %%mm1, 32+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
1074
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1075
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1076
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1077
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1078
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1084
/*
 * Column-pass IDCT variant: even rows only — reads src0/src4 (rows 0/4
 * and 2/6) plus the adjacent quadwords 8+src0/8+src4, processing both
 * 4-column halves in one expansion; src1/src5 are ignored, so the odd
 * part B is treated as zero (presumably established by the caller —
 * TODO confirm).  With B==0 the output is mirror-symmetric, hence rows
 * are stored twice (e.g. 16+dst and 96+dst get the same A1 values).
 * Fix: removed interleaved bare line-number junk that broke the
 * backslash-continued macro body; asm text is unchanged.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
#rounder ", %%mm0 \n\t"\
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
"movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
#rounder ", %%mm1 \n\t"\
"paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
"paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
#rounder ", %%mm2 \n\t"\
"psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
"paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
"paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
"psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
"psrad $" #shift ", %%mm4 \n\t"\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm3 \n\t"\
"packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
"movq %%mm4, " #dst " \n\t"\
"psrad $" #shift ", %%mm0 \n\t"\
"packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
"movq %%mm0, 16+" #dst " \n\t"\
"movq %%mm0, 96+" #dst " \n\t"\
"movq %%mm4, 112+" #dst " \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movq %%mm5, 32+" #dst " \n\t"\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movq %%mm6, 48+" #dst " \n\t"\
"movq %%mm6, 64+" #dst " \n\t"\
"movq %%mm5, 80+" #dst " \n\t"
1142
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1143
IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1144
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1145
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1146
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1153
/*
 * Column-pass IDCT variant: reads src0 (rows 0/4), src4 (rows 2/6) and
 * src1 (rows 1/3); src5 is ignored — presumably rows 5/7 are known zero
 * (TODO confirm against the caller's Z_COND_IDCT checks), so the odd
 * part B uses only the row-1/3 coefficient products.
 * Fix: removed interleaved bare line-number junk that broke the
 * backslash-continued macro body; asm text is unchanged.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
#rounder ", %%mm0 \n\t"\
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
"movq 64(%2), %%mm1 \n\t"\
"pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm3 \n\t"\
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"movd %%mm7, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
"movd %%mm3, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"movd %%mm4, 112+" #dst " \n\t"\
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
"pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
"pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
"movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
"paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm3 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
"paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
"movd %%mm3, 32+" #dst " \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movd %%mm6, 48+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
1218
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1219
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1220
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1221
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1222
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1229
/*
 * Column-pass IDCT variant: reads only src0 and 8+src0 (rows 0/4 of both
 * 4-column halves); src4/src1/src5 are ignored — presumably all other
 * rows are known zero (TODO confirm against the caller).  A0 and A1 are
 * then replicated to all eight output rows via the movq stores below.
 * NOTE(review): the load of 32(%2) into %%mm7 appears unused afterwards.
 * Fix: removed interleaved bare line-number junk that broke the
 * backslash-continued macro body; asm text is unchanged.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
#rounder ", %%mm4 \n\t"\
#rounder ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"psrad $" #shift ", %%mm0 \n\t"\
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
#rounder ", %%mm1 \n\t"\
#rounder ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
"movq %%mm4, " #dst " \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
"movq %%mm0, 16+" #dst " \n\t"\
"movq %%mm0, 96+" #dst " \n\t"\
"movq %%mm4, 112+" #dst " \n\t"\
"movq %%mm0, 32+" #dst " \n\t"\
"movq %%mm4, 48+" #dst " \n\t"\
"movq %%mm4, 64+" #dst " \n\t"\
"movq %%mm0, 80+" #dst " \n\t"
1260
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1261
IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1262
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1263
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1264
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1271
00 40 04 44 20 60 24 64
1272
10 30 14 34 50 70 54 74
1273
01 41 03 43 21 61 23 63
1274
11 31 13 33 51 71 53 73
1275
02 42 06 46 22 62 26 66
1276
12 32 16 36 52 72 56 76
1277
05 45 07 47 25 65 27 67
1278
15 35 17 37 55 75 57 77
1281
00 04 10 14 20 24 30 34
1282
40 44 50 54 60 64 70 74
1283
01 03 11 13 21 23 31 33
1284
41 43 51 53 61 63 71 73
1285
02 06 12 16 22 26 32 36
1286
42 46 52 56 62 66 72 76
1287
05 07 15 17 25 27 35 37
1288
45 47 55 57 65 67 75 77
1292
:: "r" (block), "r" (temp), "r" (coeffs)
1297
void ff_simple_idct_a64_mmx(int16_t *block)
1302
//FIXME merge add/put into the idct
1304
void ff_simple_idct_put_a64_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1307
put_pixels_clamped_a64_mmx(block, dest, line_size);
1309
void ff_simple_idct_add_a64_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1312
add_pixels_clamped_a64_mmx(block, dest, line_size);