160
163
row[5] = input[12];
161
164
row[7] = input[13];
163
if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
164
row[0] = row[1] = row[2] = row[3] = row[4] =
165
row[5] = row[6] = row[7] = row[0]<<3;
177
a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
178
a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
179
a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
180
a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
182
b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
183
b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
184
b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
185
b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
187
row[0] = (a0 + b0) >> ROW_SHIFT;
188
row[1] = (a1 + b1) >> ROW_SHIFT;
189
row[2] = (a2 + b2) >> ROW_SHIFT;
190
row[3] = (a3 + b3) >> ROW_SHIFT;
191
row[4] = (a3 - b3) >> ROW_SHIFT;
192
row[5] = (a2 - b2) >> ROW_SHIFT;
193
row[6] = (a1 - b1) >> ROW_SHIFT;
194
row[7] = (a0 - b0) >> ROW_SHIFT;
166
if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
167
row[0] = row[1] = row[2] = row[3] = row[4] =
168
row[5] = row[6] = row[7] = row[0]<<3;
180
a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
181
a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
182
a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
183
a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
185
b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
186
b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
187
b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
188
b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
190
row[0] = (a0 + b0) >> ROW_SHIFT;
191
row[1] = (a1 + b1) >> ROW_SHIFT;
192
row[2] = (a2 + b2) >> ROW_SHIFT;
193
row[3] = (a3 + b3) >> ROW_SHIFT;
194
row[4] = (a3 - b3) >> ROW_SHIFT;
195
row[5] = (a2 - b2) >> ROW_SHIFT;
196
row[6] = (a1 - b1) >> ROW_SHIFT;
197
row[7] = (a0 - b0) >> ROW_SHIFT;
207
210
static inline void idct(int16_t *block)
209
int64_t __attribute__((aligned(8))) align_tmp[16];
210
int16_t * const temp= (int16_t*)align_tmp;
212
int64_t __attribute__((aligned(8))) align_tmp[16];
213
int16_t * const temp= (int16_t*)align_tmp;
213
216
#if 0 //Alternative, simpler variant
215
218
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
216
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
217
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
218
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
219
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
220
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
221
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
222
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
223
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
224
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
225
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
226
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
227
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
228
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
229
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
230
#rounder ", %%mm4 \n\t"\
231
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
232
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
233
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
234
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
235
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
236
#rounder ", %%mm0 \n\t"\
237
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
238
"paddd %%mm0, %%mm0 \n\t" \
239
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
240
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
241
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
242
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
243
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
244
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
245
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
246
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
247
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
248
"psrad $" #shift ", %%mm7 \n\t"\
249
"psrad $" #shift ", %%mm4 \n\t"\
250
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
251
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
252
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
253
"psrad $" #shift ", %%mm1 \n\t"\
254
"psrad $" #shift ", %%mm2 \n\t"\
255
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
256
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
257
"movq %%mm7, " #dst " \n\t"\
258
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
259
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
260
"movq %%mm2, 24+" #dst " \n\t"\
261
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
262
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
263
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
264
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
265
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
266
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
267
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
268
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
269
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
270
"psrad $" #shift ", %%mm2 \n\t"\
271
"psrad $" #shift ", %%mm0 \n\t"\
272
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
273
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
274
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
275
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
276
"psrad $" #shift ", %%mm6 \n\t"\
277
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
278
"movq %%mm2, 8+" #dst " \n\t"\
279
"psrad $" #shift ", %%mm4 \n\t"\
280
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
281
"movq %%mm4, 16+" #dst " \n\t"\
283
#define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
284
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
285
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
286
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
287
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
288
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
289
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
290
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
291
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
292
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
293
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
294
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
295
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
296
#rounder ", %%mm4 \n\t"\
297
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
298
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
299
#rounder ", %%mm0 \n\t"\
300
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
301
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
302
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
303
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
304
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
305
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
306
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
307
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
308
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
309
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
310
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
311
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
312
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
313
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
314
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
315
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
316
"psrad $" #shift ", %%mm7 \n\t"\
317
"psrad $" #shift ", %%mm4 \n\t"\
318
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
319
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
320
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
321
"psrad $" #shift ", %%mm0 \n\t"\
322
"psrad $" #shift ", %%mm2 \n\t"\
323
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
324
"movd %%mm7, " #dst " \n\t"\
325
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
326
"movd %%mm0, 16+" #dst " \n\t"\
327
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
328
"movd %%mm2, 96+" #dst " \n\t"\
329
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
330
"movd %%mm4, 112+" #dst " \n\t"\
331
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
332
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
333
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
334
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
335
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
336
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
337
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
338
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
339
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
340
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
341
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
342
"psrad $" #shift ", %%mm2 \n\t"\
343
"psrad $" #shift ", %%mm5 \n\t"\
344
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
345
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
346
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
347
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
348
"psrad $" #shift ", %%mm6 \n\t"\
349
"psrad $" #shift ", %%mm4 \n\t"\
350
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
351
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
352
"movd %%mm2, 32+" #dst " \n\t"\
353
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
354
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
355
"movd %%mm6, 48+" #dst " \n\t"\
356
"movd %%mm4, 64+" #dst " \n\t"\
357
"movd %%mm5, 80+" #dst " \n\t"\
219
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
220
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
221
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
222
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
223
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
224
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
225
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
226
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
227
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
228
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
229
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
230
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
231
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
232
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
233
#rounder ", %%mm4 \n\t"\
234
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
235
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
236
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
237
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
238
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
239
#rounder ", %%mm0 \n\t"\
240
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
241
"paddd %%mm0, %%mm0 \n\t" \
242
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
243
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
244
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
245
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
246
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
247
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
248
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
249
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
250
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
251
"psrad $" #shift ", %%mm7 \n\t"\
252
"psrad $" #shift ", %%mm4 \n\t"\
253
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
254
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
255
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
256
"psrad $" #shift ", %%mm1 \n\t"\
257
"psrad $" #shift ", %%mm2 \n\t"\
258
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
259
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
260
"movq %%mm7, " #dst " \n\t"\
261
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
262
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
263
"movq %%mm2, 24+" #dst " \n\t"\
264
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
265
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
266
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
267
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
268
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
269
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
270
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
271
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
272
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
273
"psrad $" #shift ", %%mm2 \n\t"\
274
"psrad $" #shift ", %%mm0 \n\t"\
275
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
276
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
277
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
278
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
279
"psrad $" #shift ", %%mm6 \n\t"\
280
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
281
"movq %%mm2, 8+" #dst " \n\t"\
282
"psrad $" #shift ", %%mm4 \n\t"\
283
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
284
"movq %%mm4, 16+" #dst " \n\t"\
286
#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
287
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
288
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
289
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
290
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
291
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
292
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
293
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
294
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
295
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
296
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
297
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
298
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
299
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
300
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
301
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
302
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
303
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
304
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
305
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
306
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
307
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
308
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
309
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
310
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
311
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
312
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
313
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
314
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
315
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
316
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
317
"psrad $" #shift ", %%mm7 \n\t"\
318
"psrad $" #shift ", %%mm4 \n\t"\
319
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
320
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
321
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
322
"psrad $" #shift ", %%mm0 \n\t"\
323
"psrad $" #shift ", %%mm2 \n\t"\
324
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
325
"movd %%mm7, " #dst " \n\t"\
326
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
327
"movd %%mm0, 16+" #dst " \n\t"\
328
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
329
"movd %%mm2, 96+" #dst " \n\t"\
330
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
331
"movd %%mm4, 112+" #dst " \n\t"\
332
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
333
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
334
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
335
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
336
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
337
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
338
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
339
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
340
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
341
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
342
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
343
"psrad $" #shift ", %%mm2 \n\t"\
344
"psrad $" #shift ", %%mm5 \n\t"\
345
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
346
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
347
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
348
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
349
"psrad $" #shift ", %%mm6 \n\t"\
350
"psrad $" #shift ", %%mm4 \n\t"\
351
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
352
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
353
"movd %%mm2, 32+" #dst " \n\t"\
354
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
355
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
356
"movd %%mm6, 48+" #dst " \n\t"\
357
"movd %%mm4, 64+" #dst " \n\t"\
358
"movd %%mm5, 80+" #dst " \n\t"\
360
361
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
361
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
362
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
363
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
364
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
365
"movq "MANGLE(wm1010)", %%mm4 \n\t"\
366
"pand %%mm0, %%mm4 \n\t"\
367
"por %%mm1, %%mm4 \n\t"\
368
"por %%mm2, %%mm4 \n\t"\
369
"por %%mm3, %%mm4 \n\t"\
370
"packssdw %%mm4,%%mm4 \n\t"\
371
"movd %%mm4, %%eax \n\t"\
372
"orl %%eax, %%eax \n\t"\
374
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
375
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
376
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
377
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
378
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
379
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
380
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
381
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
382
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
383
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
384
#rounder ", %%mm4 \n\t"\
385
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
386
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
387
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
388
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
389
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
390
#rounder ", %%mm0 \n\t"\
391
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
392
"paddd %%mm0, %%mm0 \n\t" \
393
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
394
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
395
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
396
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
397
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
398
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
399
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
400
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
401
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
402
"psrad $" #shift ", %%mm7 \n\t"\
403
"psrad $" #shift ", %%mm4 \n\t"\
404
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
405
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
406
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
407
"psrad $" #shift ", %%mm1 \n\t"\
408
"psrad $" #shift ", %%mm2 \n\t"\
409
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
410
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
411
"movq %%mm7, " #dst " \n\t"\
412
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
413
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
414
"movq %%mm2, 24+" #dst " \n\t"\
415
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
416
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
417
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
418
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
419
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
420
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
421
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
422
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
423
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
424
"psrad $" #shift ", %%mm2 \n\t"\
425
"psrad $" #shift ", %%mm0 \n\t"\
426
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
427
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
428
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
429
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
430
"psrad $" #shift ", %%mm6 \n\t"\
431
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
432
"movq %%mm2, 8+" #dst " \n\t"\
433
"psrad $" #shift ", %%mm4 \n\t"\
434
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
435
"movq %%mm4, 16+" #dst " \n\t"\
438
"pslld $16, %%mm0 \n\t"\
439
"#paddd "MANGLE(d40000)", %%mm0 \n\t"\
440
"psrad $13, %%mm0 \n\t"\
441
"packssdw %%mm0, %%mm0 \n\t"\
442
"movq %%mm0, " #dst " \n\t"\
443
"movq %%mm0, 8+" #dst " \n\t"\
444
"movq %%mm0, 16+" #dst " \n\t"\
445
"movq %%mm0, 24+" #dst " \n\t"\
362
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
363
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
364
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
365
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
366
"movq "MANGLE(wm1010)", %%mm4 \n\t"\
367
"pand %%mm0, %%mm4 \n\t"\
368
"por %%mm1, %%mm4 \n\t"\
369
"por %%mm2, %%mm4 \n\t"\
370
"por %%mm3, %%mm4 \n\t"\
371
"packssdw %%mm4,%%mm4 \n\t"\
372
"movd %%mm4, %%eax \n\t"\
373
"orl %%eax, %%eax \n\t"\
375
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
376
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
377
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
378
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
379
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
380
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
381
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
382
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
383
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
384
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
385
#rounder ", %%mm4 \n\t"\
386
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
387
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
388
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
389
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
390
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
391
#rounder ", %%mm0 \n\t"\
392
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
393
"paddd %%mm0, %%mm0 \n\t" \
394
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
395
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
396
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
397
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
398
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
399
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
400
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
401
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
402
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
403
"psrad $" #shift ", %%mm7 \n\t"\
404
"psrad $" #shift ", %%mm4 \n\t"\
405
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
406
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
407
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
408
"psrad $" #shift ", %%mm1 \n\t"\
409
"psrad $" #shift ", %%mm2 \n\t"\
410
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
411
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
412
"movq %%mm7, " #dst " \n\t"\
413
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
414
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
415
"movq %%mm2, 24+" #dst " \n\t"\
416
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
417
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
418
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
419
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
420
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
421
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
422
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
423
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
424
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
425
"psrad $" #shift ", %%mm2 \n\t"\
426
"psrad $" #shift ", %%mm0 \n\t"\
427
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
428
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
429
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
430
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
431
"psrad $" #shift ", %%mm6 \n\t"\
432
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
433
"movq %%mm2, 8+" #dst " \n\t"\
434
"psrad $" #shift ", %%mm4 \n\t"\
435
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
436
"movq %%mm4, 16+" #dst " \n\t"\
439
"pslld $16, %%mm0 \n\t"\
440
"#paddd "MANGLE(d40000)", %%mm0 \n\t"\
441
"psrad $13, %%mm0 \n\t"\
442
"packssdw %%mm0, %%mm0 \n\t"\
443
"movq %%mm0, " #dst " \n\t"\
444
"movq %%mm0, 8+" #dst " \n\t"\
445
"movq %%mm0, 16+" #dst " \n\t"\
446
"movq %%mm0, 24+" #dst " \n\t"\
449
450
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
457
458
DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
460
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
461
COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
462
COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
463
COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
464
COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
461
//IDCT( src0, src4, src1, src5, dst, shift)
462
COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
463
COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
464
COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
465
COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
468
469
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
469
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
470
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
471
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
472
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
473
"movq "MANGLE(wm1010)", %%mm4 \n\t"\
474
"pand %%mm0, %%mm4 \n\t"\
475
"por %%mm1, %%mm4 \n\t"\
476
"por %%mm2, %%mm4 \n\t"\
477
"por %%mm3, %%mm4 \n\t"\
478
"packssdw %%mm4,%%mm4 \n\t"\
479
"movd %%mm4, %%eax \n\t"\
480
"orl %%eax, %%eax \n\t"\
482
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
483
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
484
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
485
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
486
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
487
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
488
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
489
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
490
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
491
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
492
#rounder ", %%mm4 \n\t"\
493
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
494
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
495
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
496
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
497
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
498
#rounder ", %%mm0 \n\t"\
499
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
500
"paddd %%mm0, %%mm0 \n\t" \
501
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
502
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
503
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
504
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
505
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
506
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
507
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
508
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
509
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
510
"psrad $" #shift ", %%mm7 \n\t"\
511
"psrad $" #shift ", %%mm4 \n\t"\
512
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
513
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
514
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
515
"psrad $" #shift ", %%mm1 \n\t"\
516
"psrad $" #shift ", %%mm2 \n\t"\
517
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
518
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
519
"movq %%mm7, " #dst " \n\t"\
520
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
521
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
522
"movq %%mm2, 24+" #dst " \n\t"\
523
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
524
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
525
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
526
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
527
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
528
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
529
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
530
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
531
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
532
"psrad $" #shift ", %%mm2 \n\t"\
533
"psrad $" #shift ", %%mm0 \n\t"\
534
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
535
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
536
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
537
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
538
"psrad $" #shift ", %%mm6 \n\t"\
539
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
540
"movq %%mm2, 8+" #dst " \n\t"\
541
"psrad $" #shift ", %%mm4 \n\t"\
542
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
543
"movq %%mm4, 16+" #dst " \n\t"\
546
"pslld $16, %%mm0 \n\t"\
547
"paddd "MANGLE(d40000)", %%mm0 \n\t"\
548
"psrad $13, %%mm0 \n\t"\
549
"packssdw %%mm0, %%mm0 \n\t"\
550
"movq %%mm0, " #dst " \n\t"\
551
"movq %%mm0, 8+" #dst " \n\t"\
552
"movq %%mm0, 16+" #dst " \n\t"\
553
"movq %%mm0, 24+" #dst " \n\t"\
470
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
471
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
472
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
473
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
474
"movq "MANGLE(wm1010)", %%mm4 \n\t"\
475
"pand %%mm0, %%mm4 \n\t"\
476
"por %%mm1, %%mm4 \n\t"\
477
"por %%mm2, %%mm4 \n\t"\
478
"por %%mm3, %%mm4 \n\t"\
479
"packssdw %%mm4,%%mm4 \n\t"\
480
"movd %%mm4, %%eax \n\t"\
481
"orl %%eax, %%eax \n\t"\
483
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
484
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
485
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
486
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
487
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
488
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
489
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
490
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
491
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
492
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
493
#rounder ", %%mm4 \n\t"\
494
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
495
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
496
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
497
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
498
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
499
#rounder ", %%mm0 \n\t"\
500
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
501
"paddd %%mm0, %%mm0 \n\t" \
502
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
503
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
504
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
505
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
506
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
507
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
508
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
509
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
510
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
511
"psrad $" #shift ", %%mm7 \n\t"\
512
"psrad $" #shift ", %%mm4 \n\t"\
513
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
514
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
515
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
516
"psrad $" #shift ", %%mm1 \n\t"\
517
"psrad $" #shift ", %%mm2 \n\t"\
518
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
519
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
520
"movq %%mm7, " #dst " \n\t"\
521
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
522
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
523
"movq %%mm2, 24+" #dst " \n\t"\
524
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
525
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
526
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
527
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
528
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
529
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
530
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
531
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
532
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
533
"psrad $" #shift ", %%mm2 \n\t"\
534
"psrad $" #shift ", %%mm0 \n\t"\
535
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
536
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
537
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
538
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
539
"psrad $" #shift ", %%mm6 \n\t"\
540
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
541
"movq %%mm2, 8+" #dst " \n\t"\
542
"psrad $" #shift ", %%mm4 \n\t"\
543
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
544
"movq %%mm4, 16+" #dst " \n\t"\
547
"pslld $16, %%mm0 \n\t"\
548
"paddd "MANGLE(d40000)", %%mm0 \n\t"\
549
"psrad $13, %%mm0 \n\t"\
550
"packssdw %%mm0, %%mm0 \n\t"\
551
"movq %%mm0, " #dst " \n\t"\
552
"movq %%mm0, 8+" #dst " \n\t"\
553
"movq %%mm0, 16+" #dst " \n\t"\
554
"movq %%mm0, 24+" #dst " \n\t"\
556
557
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
557
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
558
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
559
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
560
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
561
"movq %%mm0, %%mm4 \n\t"\
562
"por %%mm1, %%mm4 \n\t"\
563
"por %%mm2, %%mm4 \n\t"\
564
"por %%mm3, %%mm4 \n\t"\
565
"packssdw %%mm4,%%mm4 \n\t"\
566
"movd %%mm4, %%eax \n\t"\
567
"orl %%eax, %%eax \n\t"\
569
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
570
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
571
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
572
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
573
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
574
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
575
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
576
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
577
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
578
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
579
#rounder ", %%mm4 \n\t"\
580
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
581
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
582
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
583
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
584
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
585
#rounder ", %%mm0 \n\t"\
586
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
587
"paddd %%mm0, %%mm0 \n\t" \
588
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
589
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
590
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
591
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
592
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
593
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
594
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
595
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
596
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
597
"psrad $" #shift ", %%mm7 \n\t"\
598
"psrad $" #shift ", %%mm4 \n\t"\
599
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
600
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
601
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
602
"psrad $" #shift ", %%mm1 \n\t"\
603
"psrad $" #shift ", %%mm2 \n\t"\
604
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
605
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
606
"movq %%mm7, " #dst " \n\t"\
607
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
608
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
609
"movq %%mm2, 24+" #dst " \n\t"\
610
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
611
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
612
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
613
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
614
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
615
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
616
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
617
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
618
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
619
"psrad $" #shift ", %%mm2 \n\t"\
620
"psrad $" #shift ", %%mm0 \n\t"\
621
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
622
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
623
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
624
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
625
"psrad $" #shift ", %%mm6 \n\t"\
626
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
627
"movq %%mm2, 8+" #dst " \n\t"\
628
"psrad $" #shift ", %%mm4 \n\t"\
629
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
630
"movq %%mm4, 16+" #dst " \n\t"\
558
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
559
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
560
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
561
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
562
"movq %%mm0, %%mm4 \n\t"\
563
"por %%mm1, %%mm4 \n\t"\
564
"por %%mm2, %%mm4 \n\t"\
565
"por %%mm3, %%mm4 \n\t"\
566
"packssdw %%mm4,%%mm4 \n\t"\
567
"movd %%mm4, %%eax \n\t"\
568
"orl %%eax, %%eax \n\t"\
570
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
571
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
572
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
573
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
574
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
575
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
576
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
577
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
578
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
579
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
580
#rounder ", %%mm4 \n\t"\
581
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
582
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
583
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
584
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
585
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
586
#rounder ", %%mm0 \n\t"\
587
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
588
"paddd %%mm0, %%mm0 \n\t" \
589
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
590
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
591
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
592
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
593
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
594
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
595
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
596
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
597
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
598
"psrad $" #shift ", %%mm7 \n\t"\
599
"psrad $" #shift ", %%mm4 \n\t"\
600
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
601
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
602
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
603
"psrad $" #shift ", %%mm1 \n\t"\
604
"psrad $" #shift ", %%mm2 \n\t"\
605
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
606
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
607
"movq %%mm7, " #dst " \n\t"\
608
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
609
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
610
"movq %%mm2, 24+" #dst " \n\t"\
611
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
612
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
613
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
614
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
615
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
616
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
617
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
618
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
619
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
620
"psrad $" #shift ", %%mm2 \n\t"\
621
"psrad $" #shift ", %%mm0 \n\t"\
622
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
623
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
624
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
625
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
626
"psrad $" #shift ", %%mm6 \n\t"\
627
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
628
"movq %%mm2, 8+" #dst " \n\t"\
629
"psrad $" #shift ", %%mm4 \n\t"\
630
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
631
"movq %%mm4, 16+" #dst " \n\t"\
632
633
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
633
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
634
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
635
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
636
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
637
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
638
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
639
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
640
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
641
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
642
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
643
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
644
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
645
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
646
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
647
#rounder ", %%mm4 \n\t"\
648
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
649
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
650
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
651
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
652
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
653
#rounder ", %%mm0 \n\t"\
654
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
655
"paddd %%mm0, %%mm0 \n\t" \
656
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
657
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
658
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
659
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
660
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
661
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
662
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
663
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
664
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
665
"psrad $" #shift ", %%mm7 \n\t"\
666
"psrad $" #shift ", %%mm4 \n\t"\
667
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
668
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
669
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
670
"psrad $" #shift ", %%mm1 \n\t"\
671
"psrad $" #shift ", %%mm2 \n\t"\
672
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
673
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
674
"movq %%mm7, " #dst " \n\t"\
675
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
676
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
677
"movq %%mm2, 24+" #dst " \n\t"\
678
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
679
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
680
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
681
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
682
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
683
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
684
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
685
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
686
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
687
"psrad $" #shift ", %%mm2 \n\t"\
688
"psrad $" #shift ", %%mm0 \n\t"\
689
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
690
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
691
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
692
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
693
"psrad $" #shift ", %%mm6 \n\t"\
694
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
695
"movq %%mm2, 8+" #dst " \n\t"\
696
"psrad $" #shift ", %%mm4 \n\t"\
697
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
698
"movq %%mm4, 16+" #dst " \n\t"\
634
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
635
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
636
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
637
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
638
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
639
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
640
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
641
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
642
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
643
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
644
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
645
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
646
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
647
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
648
#rounder ", %%mm4 \n\t"\
649
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
650
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
651
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
652
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
653
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
654
#rounder ", %%mm0 \n\t"\
655
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
656
"paddd %%mm0, %%mm0 \n\t" \
657
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
658
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
659
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
660
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
661
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
662
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
663
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
664
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
665
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
666
"psrad $" #shift ", %%mm7 \n\t"\
667
"psrad $" #shift ", %%mm4 \n\t"\
668
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
669
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
670
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
671
"psrad $" #shift ", %%mm1 \n\t"\
672
"psrad $" #shift ", %%mm2 \n\t"\
673
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
674
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
675
"movq %%mm7, " #dst " \n\t"\
676
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
677
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
678
"movq %%mm2, 24+" #dst " \n\t"\
679
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
680
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
681
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
682
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
683
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
684
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
685
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
686
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
687
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
688
"psrad $" #shift ", %%mm2 \n\t"\
689
"psrad $" #shift ", %%mm0 \n\t"\
690
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
691
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
692
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
693
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
694
"psrad $" #shift ", %%mm6 \n\t"\
695
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
696
"movq %%mm2, 8+" #dst " \n\t"\
697
"psrad $" #shift ", %%mm4 \n\t"\
698
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
699
"movq %%mm4, 16+" #dst " \n\t"\
700
701
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
701
702
DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
704
705
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
707
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
708
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
709
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
710
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
711
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
712
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
713
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
714
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
715
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
716
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
717
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
718
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
719
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
720
#rounder ", %%mm4 \n\t"\
721
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
722
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
723
#rounder ", %%mm0 \n\t"\
724
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
725
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
726
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
727
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
728
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
729
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
730
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
731
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
732
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
733
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
734
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
735
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
736
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
737
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
738
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
739
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
740
"psrad $" #shift ", %%mm7 \n\t"\
741
"psrad $" #shift ", %%mm4 \n\t"\
742
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
743
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
744
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
745
"psrad $" #shift ", %%mm0 \n\t"\
746
"psrad $" #shift ", %%mm2 \n\t"\
747
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
748
"movd %%mm7, " #dst " \n\t"\
749
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
750
"movd %%mm0, 16+" #dst " \n\t"\
751
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
752
"movd %%mm2, 96+" #dst " \n\t"\
753
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
754
"movd %%mm4, 112+" #dst " \n\t"\
755
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
756
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
757
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
758
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
759
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
760
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
761
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
762
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
763
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
764
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
765
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
766
"psrad $" #shift ", %%mm2 \n\t"\
767
"psrad $" #shift ", %%mm5 \n\t"\
768
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
769
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
770
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
771
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
772
"psrad $" #shift ", %%mm6 \n\t"\
773
"psrad $" #shift ", %%mm4 \n\t"\
774
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
775
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
776
"movd %%mm2, 32+" #dst " \n\t"\
777
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
778
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
779
"movd %%mm6, 48+" #dst " \n\t"\
780
"movd %%mm4, 64+" #dst " \n\t"\
781
"movd %%mm5, 80+" #dst " \n\t"
784
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
785
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
786
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
787
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
788
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
708
#define IDCT(src0, src4, src1, src5, dst, shift) \
709
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
710
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
711
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
712
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
713
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
714
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
715
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
716
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
717
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
718
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
719
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
720
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
721
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
722
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
723
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
724
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
725
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
726
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
727
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
728
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
729
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
730
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
731
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
732
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
733
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
734
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
735
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
736
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
737
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
738
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
739
"psrad $" #shift ", %%mm7 \n\t"\
740
"psrad $" #shift ", %%mm4 \n\t"\
741
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
742
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
743
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
744
"psrad $" #shift ", %%mm0 \n\t"\
745
"psrad $" #shift ", %%mm2 \n\t"\
746
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
747
"movd %%mm7, " #dst " \n\t"\
748
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
749
"movd %%mm0, 16+" #dst " \n\t"\
750
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
751
"movd %%mm2, 96+" #dst " \n\t"\
752
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
753
"movd %%mm4, 112+" #dst " \n\t"\
754
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
755
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
756
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
757
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
758
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
759
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
760
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
761
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
762
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
763
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
764
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
765
"psrad $" #shift ", %%mm2 \n\t"\
766
"psrad $" #shift ", %%mm5 \n\t"\
767
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
768
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
769
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
770
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
771
"psrad $" #shift ", %%mm6 \n\t"\
772
"psrad $" #shift ", %%mm4 \n\t"\
773
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
774
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
775
"movd %%mm2, 32+" #dst " \n\t"\
776
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
777
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
778
"movd %%mm6, 48+" #dst " \n\t"\
779
"movd %%mm4, 64+" #dst " \n\t"\
780
"movd %%mm5, 80+" #dst " \n\t"
783
//IDCT( src0, src4, src1, src5, dst, shift)
784
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
785
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
786
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
787
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
793
792
Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
794
793
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
797
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
798
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
799
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
800
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
801
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
802
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
803
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
804
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
805
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
806
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
807
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
808
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
809
#rounder ", %%mm4 \n\t"\
810
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
811
#rounder ", %%mm0 \n\t"\
812
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
813
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
814
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
815
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
816
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
817
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
818
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
819
"movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
820
"pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
821
"paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
822
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
823
"psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
824
"psrad $" #shift ", %%mm1 \n\t"\
825
"psrad $" #shift ", %%mm4 \n\t"\
826
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
827
"paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
828
"psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
829
"psrad $" #shift ", %%mm0 \n\t"\
830
"psrad $" #shift ", %%mm2 \n\t"\
831
"packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
832
"movd %%mm1, " #dst " \n\t"\
833
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
834
"movd %%mm0, 16+" #dst " \n\t"\
835
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
836
"movd %%mm2, 96+" #dst " \n\t"\
837
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
838
"movd %%mm4, 112+" #dst " \n\t"\
839
"movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
840
"pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
841
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
842
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
843
"paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
844
"psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
845
"psrad $" #shift ", %%mm2 \n\t"\
846
"psrad $" #shift ", %%mm5 \n\t"\
847
"movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
848
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
849
"psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
850
"psrad $" #shift ", %%mm6 \n\t"\
851
"psrad $" #shift ", %%mm1 \n\t"\
852
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
853
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
854
"movd %%mm2, 32+" #dst " \n\t"\
855
"packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
856
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
857
"movd %%mm6, 48+" #dst " \n\t"\
858
"movd %%mm1, 64+" #dst " \n\t"\
859
"movd %%mm5, 80+" #dst " \n\t"
861
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
862
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
863
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
864
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
865
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
796
#define IDCT(src0, src4, src1, src5, dst, shift) \
797
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
798
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
799
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
800
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
801
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
802
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
803
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
804
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
805
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
806
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
807
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
808
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
809
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
810
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
811
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
812
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
813
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
814
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
815
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
816
"movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
817
"pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
818
"paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
819
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
820
"psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
821
"psrad $" #shift ", %%mm1 \n\t"\
822
"psrad $" #shift ", %%mm4 \n\t"\
823
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
824
"paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
825
"psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
826
"psrad $" #shift ", %%mm0 \n\t"\
827
"psrad $" #shift ", %%mm2 \n\t"\
828
"packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
829
"movd %%mm1, " #dst " \n\t"\
830
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
831
"movd %%mm0, 16+" #dst " \n\t"\
832
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
833
"movd %%mm2, 96+" #dst " \n\t"\
834
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
835
"movd %%mm4, 112+" #dst " \n\t"\
836
"movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
837
"pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
838
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
839
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
840
"paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
841
"psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
842
"psrad $" #shift ", %%mm2 \n\t"\
843
"psrad $" #shift ", %%mm5 \n\t"\
844
"movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
845
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
846
"psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
847
"psrad $" #shift ", %%mm6 \n\t"\
848
"psrad $" #shift ", %%mm1 \n\t"\
849
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
850
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
851
"movd %%mm2, 32+" #dst " \n\t"\
852
"packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
853
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
854
"movd %%mm6, 48+" #dst " \n\t"\
855
"movd %%mm1, 64+" #dst " \n\t"\
856
"movd %%mm5, 80+" #dst " \n\t"
858
//IDCT( src0, src4, src1, src5, dst, shift)
859
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
860
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
861
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
862
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
870
867
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
873
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
874
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
875
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
876
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
877
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
878
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
879
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
880
#rounder ", %%mm4 \n\t"\
881
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
882
#rounder ", %%mm0 \n\t"\
883
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
884
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
885
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
886
"movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
887
"pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
888
"paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
889
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
890
"psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
891
"psrad $" #shift ", %%mm1 \n\t"\
892
"psrad $" #shift ", %%mm4 \n\t"\
893
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
894
"paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
895
"psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
896
"psrad $" #shift ", %%mm0 \n\t"\
897
"psrad $" #shift ", %%mm2 \n\t"\
898
"packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
899
"movd %%mm1, " #dst " \n\t"\
900
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
901
"movd %%mm0, 16+" #dst " \n\t"\
902
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
903
"movd %%mm2, 96+" #dst " \n\t"\
904
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
905
"movd %%mm4, 112+" #dst " \n\t"\
906
"movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
907
"pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
908
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
909
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
910
"paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
911
"psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
912
"psrad $" #shift ", %%mm2 \n\t"\
913
"psrad $" #shift ", %%mm5 \n\t"\
914
"movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
915
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
916
"psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
917
"psrad $" #shift ", %%mm6 \n\t"\
918
"psrad $" #shift ", %%mm1 \n\t"\
919
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
920
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
921
"movd %%mm2, 32+" #dst " \n\t"\
922
"packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
923
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
924
"movd %%mm6, 48+" #dst " \n\t"\
925
"movd %%mm1, 64+" #dst " \n\t"\
926
"movd %%mm5, 80+" #dst " \n\t"
929
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
930
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
931
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
932
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
933
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
870
#define IDCT(src0, src4, src1, src5, dst, shift) \
871
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
872
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
873
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
874
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
875
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
876
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
877
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
878
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
879
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
880
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
881
"movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
882
"pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
883
"paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
884
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
885
"psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
886
"psrad $" #shift ", %%mm1 \n\t"\
887
"psrad $" #shift ", %%mm4 \n\t"\
888
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
889
"paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
890
"psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
891
"psrad $" #shift ", %%mm0 \n\t"\
892
"psrad $" #shift ", %%mm2 \n\t"\
893
"packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
894
"movd %%mm1, " #dst " \n\t"\
895
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
896
"movd %%mm0, 16+" #dst " \n\t"\
897
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
898
"movd %%mm2, 96+" #dst " \n\t"\
899
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
900
"movd %%mm4, 112+" #dst " \n\t"\
901
"movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
902
"pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
903
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
904
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
905
"paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
906
"psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
907
"psrad $" #shift ", %%mm2 \n\t"\
908
"psrad $" #shift ", %%mm5 \n\t"\
909
"movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
910
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
911
"psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
912
"psrad $" #shift ", %%mm6 \n\t"\
913
"psrad $" #shift ", %%mm1 \n\t"\
914
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
915
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
916
"movd %%mm2, 32+" #dst " \n\t"\
917
"packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
918
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
919
"movd %%mm6, 48+" #dst " \n\t"\
920
"movd %%mm1, 64+" #dst " \n\t"\
921
"movd %%mm5, 80+" #dst " \n\t"
924
/* Expand IDCT four times with shift 20: every expansion moves each source
 * operand forward by 8 bytes from (%1) and the destination by 4 bytes from (%0). */
//IDCT( src0, src4, src1, src5, dst, shift)
925
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
926
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
927
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
928
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
938
933
/* NOTE(review): Z_COND_IDCT is defined outside this part of the file. The
 * operands are four (%0)-relative addresses starting at offset 96, the address
 * 96(%1), the instruction text "paddd (%2)", the constant 11 and the local
 * label 3f; presumably these are sources, destination, rounder, shift and a
 * skip target - confirm against the macro definition. */
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
941
/*
 * IDCT variant that reads src0 (R4/R0), src1 (R3/R1) and src5 (R7/R5).
 * src4 is accepted but never referenced, so the even part consists of the C4
 * products only: mm4 serves as A0 and (copied to mm6) as A3, mm0 serves as A1
 * and (copied to mm5) as A2.
 * `rounder` is stringified and emitted as an instruction on mm4 and on mm0;
 * `shift` is the arithmetic right shift applied before packing.
 * Coefficients are read from offsets 16..104 of (%2). Eight 32-bit stores
 * (two saturated 16-bit values each) go to dst, 16+dst, ... 112+dst.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
942
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
943
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
944
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
945
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
946
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
947
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
948
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
949
#rounder ", %%mm4 \n\t" /* rounder applied to the A0/A3 base */\
950
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
951
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
952
#rounder ", %%mm0 \n\t" /* rounder applied to the A1/A2 base */\
953
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
954
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
955
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
956
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
957
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
958
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
959
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
960
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
961
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
962
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
963
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
964
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
965
"psrad $" #shift ", %%mm7 \n\t"\
966
"psrad $" #shift ", %%mm4 \n\t"\
967
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
968
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
969
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
970
"psrad $" #shift ", %%mm0 \n\t"\
971
"psrad $" #shift ", %%mm2 \n\t"\
972
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
973
"movd %%mm7, " #dst " \n\t"\
974
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
975
"movd %%mm0, 16+" #dst " \n\t"\
976
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
977
"movd %%mm2, 96+" #dst " \n\t"\
978
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
979
"movd %%mm4, 112+" #dst " \n\t"\
980
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
981
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
982
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
983
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
984
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
985
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
986
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
987
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
988
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
989
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
990
"psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
991
"psrad $" #shift ", %%mm2 \n\t"\
992
"psrad $" #shift ", %%mm5 \n\t"\
993
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
994
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
995
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
996
"psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
997
"psrad $" #shift ", %%mm6 \n\t"\
998
"psrad $" #shift ", %%mm4 \n\t"\
999
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
1000
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1001
"movd %%mm2, 32+" #dst " \n\t"\
1002
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1003
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1004
"movd %%mm6, 48+" #dst " \n\t"\
1005
"movd %%mm4, 64+" #dst " \n\t"\
1006
"movd %%mm5, 80+" #dst " \n\t"
1008
/* Expand IDCT four times with shift 20: every expansion moves each source
 * operand forward by 8 bytes from (%1) and the destination by 4 bytes from (%0).
 * NOTE(review): the rounder argument is the text "/nop"; presumably the
 * assembler treats the resulting "/nop, %mmN" line as a comment so that no
 * rounding instruction is emitted - confirm. */
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1009
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1010
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1011
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1012
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1018
/*
 * IDCT variant that reads only src0 (R4/R0) and src1 (R3/R1); src4 and src5
 * are accepted but never referenced. The even part is the pair of C4 products
 * (mm4 used as A0 and A3, mm0 used as A1 and A2) and each B term is a single
 * pmaddwd of src1 with one coefficient row.
 * `rounder` is stringified and emitted as an instruction on mm4 and on mm0;
 * `shift` is the arithmetic right shift applied before packing.
 * Eight 32-bit stores (two saturated 16-bit values each) go to dst, 16+dst,
 * ... 112+dst.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1019
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1020
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1021
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1022
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1023
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1024
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1025
#rounder ", %%mm4 \n\t" /* rounder applied to the A0/A3 base */\
1026
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1027
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1028
#rounder ", %%mm0 \n\t" /* rounder applied to the A1/A2 base */\
1029
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1030
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1031
"movq 64(%2), %%mm3 \n\t" /* -C7 C3 -C7 C3 */\
1032
"pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1033
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1034
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1035
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1036
"psrad $" #shift ", %%mm7 \n\t"\
1037
"psrad $" #shift ", %%mm4 \n\t"\
1038
"movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
1039
"paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1040
"psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
1041
"psrad $" #shift ", %%mm0 \n\t"\
1042
"psrad $" #shift ", %%mm1 \n\t"\
1043
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1044
"movd %%mm7, " #dst " \n\t"\
1045
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1046
"movd %%mm0, 16+" #dst " \n\t"\
1047
"packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
1048
"movd %%mm1, 96+" #dst " \n\t"\
1049
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1050
"movd %%mm4, 112+" #dst " \n\t"\
1051
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1052
"pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1053
"pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1054
"movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
1055
"paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
1056
"psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1057
"psrad $" #shift ", %%mm1 \n\t"\
1058
"psrad $" #shift ", %%mm5 \n\t"\
1059
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1060
"paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1061
"psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1062
"psrad $" #shift ", %%mm6 \n\t"\
1063
"psrad $" #shift ", %%mm4 \n\t"\
1064
"packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
1065
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1066
"movd %%mm1, 32+" #dst " \n\t"\
1067
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1068
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1069
"movd %%mm6, 48+" #dst " \n\t"\
1070
"movd %%mm4, 64+" #dst " \n\t"\
1071
"movd %%mm5, 80+" #dst " \n\t"
1074
/* Expand IDCT four times with shift 20: every expansion moves each source
 * operand forward by 8 bytes from (%1) and the destination by 4 bytes from (%0).
 * NOTE(review): the rounder argument is the text "/nop"; presumably the
 * assembler treats the resulting "/nop, %mmN" line as a comment so that no
 * rounding instruction is emitted - confirm. */
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1075
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1076
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1077
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1078
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1084
/*
 * IDCT variant that reads only the even inputs src0 (R4/R0) and src4 (R6/R2);
 * src1 and src5 are accepted but never referenced, so no B terms exist.
 * One expansion processes two operand pairs: src0/src4 (A0..A3 in mm4, mm0,
 * mm5, mm6) and 8+src0/8+src4 (A0..A3 in mm7, mm3, mm2, mm1). The two halves
 * are packed together and written with 64-bit stores.
 * Because only A terms are present, each packed result is stored twice:
 * A0 at dst and 112+dst, A1 at 16+dst and 96+dst, A2 at 32+dst and 80+dst,
 * A3 at 48+dst and 64+dst.
 * `rounder` is stringified and emitted as an instruction on the C4 products of
 * both halves; `shift` is the arithmetic right shift applied before packing.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1085
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1086
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1087
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1088
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1089
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1090
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1091
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1092
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1093
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1094
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1095
#rounder ", %%mm4 \n\t"\
1096
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1097
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1098
#rounder ", %%mm0 \n\t"\
1099
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1100
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1101
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1102
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1103
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
1104
"movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
1105
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1106
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1107
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1108
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1109
"movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
1110
"pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1111
"pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1112
#rounder ", %%mm1 \n\t"\
1113
"paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
1114
"paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
1115
#rounder ", %%mm2 \n\t"\
1116
"psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
1117
"paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
1118
"paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
1119
"psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
1120
"psrad $" #shift ", %%mm4 \n\t"\
1121
"psrad $" #shift ", %%mm7 \n\t"\
1122
"psrad $" #shift ", %%mm3 \n\t"\
1123
"packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
1124
"movq %%mm4, " #dst " \n\t"\
1125
"psrad $" #shift ", %%mm0 \n\t"\
1126
"packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
1127
"movq %%mm0, 16+" #dst " \n\t"\
1128
"movq %%mm0, 96+" #dst " \n\t"\
1129
"movq %%mm4, 112+" #dst " \n\t"\
1130
"psrad $" #shift ", %%mm5 \n\t"\
1131
"psrad $" #shift ", %%mm6 \n\t"\
1132
"psrad $" #shift ", %%mm2 \n\t"\
1133
"packssdw %%mm2, %%mm5 \n\t" /* A2 a2 */\
1134
"movq %%mm5, 32+" #dst " \n\t"\
1135
"psrad $" #shift ", %%mm1 \n\t"\
1136
"packssdw %%mm1, %%mm6 \n\t" /* A3 a3 */\
1137
"movq %%mm6, 48+" #dst " \n\t"\
1138
"movq %%mm6, 64+" #dst " \n\t"\
1139
"movq %%mm5, 80+" #dst " \n\t"
1142
/* Only the expansions at source offsets 0 and 16 are active; the 8 and 24
 * ones are commented out. The IDCT variant defined just above also reads
 * 8+src0 and 8+src4 and writes with 64-bit stores, so each active expansion
 * covers 16 source bytes and 8 destination bytes.
 * NOTE(review): the rounder argument is the text "/nop"; presumably the
 * assembler treats the resulting "/nop, %mmN" line as a comment - confirm. */
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1143
IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1144
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1145
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1146
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1153
/*
 * IDCT variant that reads src0 (R4/R0), src4 (R6/R2) and src1 (R3/R1); src5 is
 * accepted but never referenced. The even part A0..A3 is built from the C4
 * products plus/minus the C2/C6 products, and each B term is a single pmaddwd
 * of src1 with one coefficient row.
 * `rounder` is stringified and emitted as an instruction on mm4 and on mm0;
 * `shift` is the arithmetic right shift applied before packing.
 * Eight 32-bit stores (two saturated 16-bit values each) go to dst, 16+dst,
 * ... 112+dst.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1154
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1155
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1156
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1157
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1158
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1159
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1160
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1161
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1162
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1163
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1164
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1165
#rounder ", %%mm4 \n\t"\
1166
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1167
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1168
#rounder ", %%mm0 \n\t"\
1169
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1170
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1171
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1172
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1173
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1174
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1175
"movq 64(%2), %%mm1 \n\t" /* -C7 C3 -C7 C3 */\
1176
"pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1177
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1178
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1179
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1180
"psrad $" #shift ", %%mm7 \n\t"\
1181
"psrad $" #shift ", %%mm4 \n\t"\
1182
"movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
1183
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1184
"psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1185
"psrad $" #shift ", %%mm0 \n\t"\
1186
"psrad $" #shift ", %%mm3 \n\t"\
1187
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1188
"movd %%mm7, " #dst " \n\t"\
1189
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1190
"movd %%mm0, 16+" #dst " \n\t"\
1191
"packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1192
"movd %%mm3, 96+" #dst " \n\t"\
1193
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1194
"movd %%mm4, 112+" #dst " \n\t"\
1195
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1196
"pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1197
"pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1198
"movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
1199
"paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1200
"psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1201
"psrad $" #shift ", %%mm3 \n\t"\
1202
"psrad $" #shift ", %%mm5 \n\t"\
1203
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1204
"paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1205
"psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1206
"psrad $" #shift ", %%mm6 \n\t"\
1207
"packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1208
"movd %%mm3, 32+" #dst " \n\t"\
1209
"psrad $" #shift ", %%mm4 \n\t"\
1210
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1211
"movd %%mm6, 48+" #dst " \n\t"\
1212
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1213
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1214
"movd %%mm4, 64+" #dst " \n\t"\
1215
"movd %%mm5, 80+" #dst " \n\t"
1218
/* Expand IDCT four times with shift 20: every expansion moves each source
 * operand forward by 8 bytes from (%1) and the destination by 4 bytes from (%0).
 * NOTE(review): the rounder argument is the text "/nop"; presumably the
 * assembler treats the resulting "/nop, %mmN" line as a comment so that no
 * rounding instruction is emitted - confirm. */
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1219
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1220
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1221
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1222
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1229
/*
 * IDCT variant that reads only src0 (R4/R0) and 8+src0; src4, src1 and src5
 * are accepted but never referenced. Only the two C4 products are computed:
 * C4R4+C4R0 (mm4 / mm1) and -C4R4+C4R0 (mm0 / mm2). After `rounder` and the
 * arithmetic right shift by `shift`, the two halves are packed and written
 * with 64-bit stores: the first product to dst, 48+dst, 64+dst and 112+dst,
 * the second to 16+dst, 32+dst, 80+dst and 96+dst.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1230
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1231
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1232
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1233
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1234
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1235
#rounder ", %%mm4 \n\t"\
1236
#rounder ", %%mm0 \n\t"\
1237
"psrad $" #shift ", %%mm4 \n\t"\
1238
"psrad $" #shift ", %%mm0 \n\t"\
1239
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
1240
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1241
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1242
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1243
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1244
"movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 (mm7 is not read again in this macro) */\
1245
#rounder ", %%mm1 \n\t"\
1246
#rounder ", %%mm2 \n\t"\
1247
"psrad $" #shift ", %%mm1 \n\t"\
1248
"packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
1249
"movq %%mm4, " #dst " \n\t"\
1250
"psrad $" #shift ", %%mm2 \n\t"\
1251
"packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
1252
"movq %%mm0, 16+" #dst " \n\t"\
1253
"movq %%mm0, 96+" #dst " \n\t"\
1254
"movq %%mm4, 112+" #dst " \n\t"\
1255
"movq %%mm0, 32+" #dst " \n\t"\
1256
"movq %%mm4, 48+" #dst " \n\t"\
1257
"movq %%mm4, 64+" #dst " \n\t"\
1258
"movq %%mm0, 80+" #dst " \n\t"
1260
/* Only the expansions at source offsets 0 and 16 are active; the 8 and 24
 * ones are commented out. The IDCT variant defined just above also reads
 * 8+src0 and writes with 64-bit stores, so each active expansion covers
 * 16 source bytes and 8 destination bytes.
 * NOTE(review): the rounder argument is the text "/nop"; presumably the
 * assembler treats the resulting "/nop, %mmN" line as a comment - confirm. */
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1261
IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1262
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1263
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1264
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
936
/*
 * IDCT variant (no rounder parameter) that reads src0 (R4/R0), src1 (R3/R1)
 * and src5 (R7/R5). src4 is accepted but never referenced, so the even part
 * consists of the C4 products only: mm4 serves as A0 and (copied to mm6) as
 * A3, mm0 serves as A1 and (copied to mm5) as A2.
 * `shift` is the arithmetic right shift applied before packing; no rounding
 * term is added inside this macro.
 * Coefficients are read from offsets 16..104 of (%2). Eight 32-bit stores
 * (two saturated 16-bit values each) go to dst, 16+dst, ... 112+dst.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
937
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
938
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
939
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
940
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
941
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
942
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
943
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
944
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
945
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
946
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
947
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
948
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
949
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
950
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
951
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
952
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
953
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
954
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
955
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
956
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
957
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
958
"psrad $" #shift ", %%mm7 \n\t"\
959
"psrad $" #shift ", %%mm4 \n\t"\
960
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
961
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
962
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
963
"psrad $" #shift ", %%mm0 \n\t"\
964
"psrad $" #shift ", %%mm2 \n\t"\
965
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
966
"movd %%mm7, " #dst " \n\t"\
967
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
968
"movd %%mm0, 16+" #dst " \n\t"\
969
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
970
"movd %%mm2, 96+" #dst " \n\t"\
971
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
972
"movd %%mm4, 112+" #dst " \n\t"\
973
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
974
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
975
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
976
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
977
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
978
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
979
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
980
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
981
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
982
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
983
"psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
984
"psrad $" #shift ", %%mm2 \n\t"\
985
"psrad $" #shift ", %%mm5 \n\t"\
986
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
987
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
988
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
989
"psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
990
"psrad $" #shift ", %%mm6 \n\t"\
991
"psrad $" #shift ", %%mm4 \n\t"\
992
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
993
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
994
"movd %%mm2, 32+" #dst " \n\t"\
995
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
996
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
997
"movd %%mm6, 48+" #dst " \n\t"\
998
"movd %%mm4, 64+" #dst " \n\t"\
999
"movd %%mm5, 80+" #dst " \n\t"
1001
/* Expand IDCT four times with shift 20: every expansion moves each source
 * operand forward by 8 bytes from (%1) and the destination by 4 bytes from (%0). */
//IDCT( src0, src4, src1, src5, dst, shift)
1002
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1003
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1004
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1005
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1011
/*
 * IDCT variant (no rounder parameter) that reads only src0 (R4/R0) and src1
 * (R3/R1); src4 and src5 are accepted but never referenced. The even part is
 * the pair of C4 products (mm4 used as A0 and A3, mm0 used as A1 and A2) and
 * each B term is a single pmaddwd of src1 with one coefficient row.
 * `shift` is the arithmetic right shift applied before packing; no rounding
 * term is added inside this macro.
 * Eight 32-bit stores (two saturated 16-bit values each) go to dst, 16+dst,
 * ... 112+dst.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
1012
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1013
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1014
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1015
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1016
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1017
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1018
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1019
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1020
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1021
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1022
"movq 64(%2), %%mm3 \n\t" /* -C7 C3 -C7 C3 */\
1023
"pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1024
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1025
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1026
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1027
"psrad $" #shift ", %%mm7 \n\t"\
1028
"psrad $" #shift ", %%mm4 \n\t"\
1029
"movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
1030
"paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1031
"psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
1032
"psrad $" #shift ", %%mm0 \n\t"\
1033
"psrad $" #shift ", %%mm1 \n\t"\
1034
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1035
"movd %%mm7, " #dst " \n\t"\
1036
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1037
"movd %%mm0, 16+" #dst " \n\t"\
1038
"packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
1039
"movd %%mm1, 96+" #dst " \n\t"\
1040
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1041
"movd %%mm4, 112+" #dst " \n\t"\
1042
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1043
"pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1044
"pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1045
"movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
1046
"paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
1047
"psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1048
"psrad $" #shift ", %%mm1 \n\t"\
1049
"psrad $" #shift ", %%mm5 \n\t"\
1050
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1051
"paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1052
"psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1053
"psrad $" #shift ", %%mm6 \n\t"\
1054
"psrad $" #shift ", %%mm4 \n\t"\
1055
"packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
1056
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1057
"movd %%mm1, 32+" #dst " \n\t"\
1058
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1059
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1060
"movd %%mm6, 48+" #dst " \n\t"\
1061
"movd %%mm4, 64+" #dst " \n\t"\
1062
"movd %%mm5, 80+" #dst " \n\t"
1065
/* Expand IDCT four times with shift 20: every expansion moves each source
 * operand forward by 8 bytes from (%1) and the destination by 4 bytes from (%0). */
//IDCT( src0, src4, src1, src5, dst, shift)
1066
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1067
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1068
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1069
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1075
/*
 * IDCT variant (no rounder parameter) that reads only the even inputs src0
 * (R4/R0) and src4 (R6/R2); src1 and src5 are accepted but never referenced,
 * so no B terms exist.
 * One expansion processes two operand pairs: src0/src4 (A0..A3 in mm4, mm0,
 * mm5, mm6) and 8+src0/8+src4 (A0..A3 in mm7, mm3, mm2, mm1). The two halves
 * are packed together and written with 64-bit stores.
 * Because only A terms are present, each packed result is stored twice:
 * A0 at dst and 112+dst, A1 at 16+dst and 96+dst, A2 at 32+dst and 80+dst,
 * A3 at 48+dst and 64+dst.
 * `shift` is the arithmetic right shift applied before packing.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
1076
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1077
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1078
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1079
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1080
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1081
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1082
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1083
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1084
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1085
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1086
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1087
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1088
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1089
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1090
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1091
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1092
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
1093
"movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
1094
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1095
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1096
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1097
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1098
"movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
1099
"pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1100
"pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1101
"paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
1102
"paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
1103
"psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
1104
"paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
1105
"paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
1106
"psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
1107
"psrad $" #shift ", %%mm4 \n\t"\
1108
"psrad $" #shift ", %%mm7 \n\t"\
1109
"psrad $" #shift ", %%mm3 \n\t"\
1110
"packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
1111
"movq %%mm4, " #dst " \n\t"\
1112
"psrad $" #shift ", %%mm0 \n\t"\
1113
"packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
1114
"movq %%mm0, 16+" #dst " \n\t"\
1115
"movq %%mm0, 96+" #dst " \n\t"\
1116
"movq %%mm4, 112+" #dst " \n\t"\
1117
"psrad $" #shift ", %%mm5 \n\t"\
1118
"psrad $" #shift ", %%mm6 \n\t"\
1119
"psrad $" #shift ", %%mm2 \n\t"\
1120
"packssdw %%mm2, %%mm5 \n\t" /* A2 a2 */\
1121
"movq %%mm5, 32+" #dst " \n\t"\
1122
"psrad $" #shift ", %%mm1 \n\t"\
1123
"packssdw %%mm1, %%mm6 \n\t" /* A3 a3 */\
1124
"movq %%mm6, 48+" #dst " \n\t"\
1125
"movq %%mm6, 64+" #dst " \n\t"\
1126
"movq %%mm5, 80+" #dst " \n\t"
1129
/* Only the expansions at source offsets 0 and 16 are active; the 8 and 24
 * ones are commented out. The IDCT variant defined just above also reads
 * 8+src0 and 8+src4 and writes with 64-bit stores, so each active expansion
 * covers 16 source bytes and 8 destination bytes. */
//IDCT( src0, src4, src1, src5, dst, shift)
1130
IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1131
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1132
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1133
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1140
/*
 * IDCT variant (no rounder parameter) that reads src0 (R4/R0), src4 (R6/R2)
 * and src1 (R3/R1); src5 is accepted but never referenced. The even part
 * A0..A3 is built from the C4 products plus/minus the C2/C6 products, and each
 * B term is a single pmaddwd of src1 with one coefficient row.
 * `shift` is the arithmetic right shift applied before packing; no rounding
 * term is added inside this macro.
 * Eight 32-bit stores (two saturated 16-bit values each) go to dst, 16+dst,
 * ... 112+dst.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
1141
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1142
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1143
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1144
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1145
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1146
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1147
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1148
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1149
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1150
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1151
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1152
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1153
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1154
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1155
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1156
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1157
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1158
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1159
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1160
"movq 64(%2), %%mm1 \n\t" /* -C7 C3 -C7 C3 */\
1161
"pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1162
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1163
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1164
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1165
"psrad $" #shift ", %%mm7 \n\t"\
1166
"psrad $" #shift ", %%mm4 \n\t"\
1167
"movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
1168
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1169
"psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1170
"psrad $" #shift ", %%mm0 \n\t"\
1171
"psrad $" #shift ", %%mm3 \n\t"\
1172
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1173
"movd %%mm7, " #dst " \n\t"\
1174
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1175
"movd %%mm0, 16+" #dst " \n\t"\
1176
"packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1177
"movd %%mm3, 96+" #dst " \n\t"\
1178
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1179
"movd %%mm4, 112+" #dst " \n\t"\
1180
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1181
"pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1182
"pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1183
"movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
1184
"paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1185
"psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1186
"psrad $" #shift ", %%mm3 \n\t"\
1187
"psrad $" #shift ", %%mm5 \n\t"\
1188
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1189
"paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1190
"psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1191
"psrad $" #shift ", %%mm6 \n\t"\
1192
"packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1193
"movd %%mm3, 32+" #dst " \n\t"\
1194
"psrad $" #shift ", %%mm4 \n\t"\
1195
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1196
"movd %%mm6, 48+" #dst " \n\t"\
1197
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1198
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1199
"movd %%mm4, 64+" #dst " \n\t"\
1200
"movd %%mm5, 80+" #dst " \n\t"
1203
/* Expand IDCT four times with shift 20: every expansion moves each source
 * operand forward by 8 bytes from (%1) and the destination by 4 bytes from (%0). */
//IDCT( src0, src4, src1, src5, dst, shift)
1204
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1205
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1206
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1207
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1214
/*
 * IDCT variant (no rounder parameter) that reads only src0 (R4/R0) and
 * 8+src0; src4, src1 and src5 are accepted but never referenced. Only the two
 * C4 products are computed: C4R4+C4R0 (mm4 / mm1) and -C4R4+C4R0 (mm0 / mm2).
 * After the arithmetic right shift by `shift`, the two halves are packed and
 * written with 64-bit stores: the first product to dst, 48+dst, 64+dst and
 * 112+dst, the second to 16+dst, 32+dst, 80+dst and 96+dst.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
1215
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1216
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1217
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1218
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1219
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1220
"psrad $" #shift ", %%mm4 \n\t"\
1221
"psrad $" #shift ", %%mm0 \n\t"\
1222
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
1223
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1224
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1225
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1226
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1227
"movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 (mm7 is not read again in this macro) */\
1228
"psrad $" #shift ", %%mm1 \n\t"\
1229
"packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
1230
"movq %%mm4, " #dst " \n\t"\
1231
"psrad $" #shift ", %%mm2 \n\t"\
1232
"packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
1233
"movq %%mm0, 16+" #dst " \n\t"\
1234
"movq %%mm0, 96+" #dst " \n\t"\
1235
"movq %%mm4, 112+" #dst " \n\t"\
1236
"movq %%mm0, 32+" #dst " \n\t"\
1237
"movq %%mm4, 48+" #dst " \n\t"\
1238
"movq %%mm4, 64+" #dst " \n\t"\
1239
"movq %%mm0, 80+" #dst " \n\t"
1241
/* Only the expansions at source offsets 0 and 16 are active; the 8 and 24
 * ones are commented out. The IDCT variant defined just above also reads
 * 8+src0 and writes with 64-bit stores, so each active expansion covers
 * 16 source bytes and 8 destination bytes. */
//IDCT( src0, src4, src1, src5, dst, shift)
1242
IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1243
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1244
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1245
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)