4
* Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
6
* This file is part of Libav.
8
* Libav is free software; you can redistribute it and/or
9
* modify it under the terms of the GNU Lesser General Public
10
* License as published by the Free Software Foundation; either
11
* version 2.1 of the License, or (at your option) any later version.
13
* Libav is distributed in the hope that it will be useful,
14
* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
* Lesser General Public License for more details.
18
* You should have received a copy of the GNU Lesser General Public
19
* License along with Libav; if not, write to the Free Software
20
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
#include "libavcodec/dsputil.h"
23
#include "libavcodec/simple_idct.h"
24
#include "dsputil_mmx.h"
36
/*
 * Fixed-point cosine constants: Ck = cos(k*M_PI/16) * sqrt(2) * (1<<14),
 * converted to an integer by adding 0.5 and truncating. C4 is the one
 * exception: 0.5 is subtracted instead, giving 16383 rather than the exact
 * value 16384.
 */
#define C0 23170 // cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
37
#define C1 22725 // cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
38
#define C2 21407 // cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
39
#define C3 19266 // cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
40
#define C4 16383 // cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5 (note the minus: one below the exact 16384)
41
#define C5 12873 // cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
42
#define C6 8867 // cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
43
#define C7 4520 // cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46
// Shift amount associated with the column pass (presumably the final right
// shift -- the COL_IDCT/IDCT invocations in idct() pass the literal 20 to
// "psrad" rather than using this macro). NOTE(review): the meaning of the
// trailing "6" is not evident from this file; confirm before relying on it.
#define COL_SHIFT 20 // 6
48
// Word mask that keeps the upper 16-bit word of each 32-bit half of a
// quadword. DC_COND_IDCT and DC_COND_ROW_IDCT AND it with the first source
// quadword and then OR in the other three source quadwords, so the result is
// zero only when every coefficient except the two masked-out words is zero.
DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
53
// 1<<18 in the low doubleword only. DC_COND_IDCT adds it to mm0 between
// "pslld $16" and "psrad $13"; in DC_COND_ROW_IDCT the same paddd is disabled
// with a leading '#' inside the asm string.
DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
51
DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
52
1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
53
// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
54
// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
55
1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
56
// the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
79
static inline void idct(int16_t *block)
81
DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
82
int16_t * const temp= (int16_t*)align_tmp;
85
#if 0 //Alternative, simpler variant
87
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
88
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
89
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
90
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
91
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
92
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
93
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
94
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
95
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
96
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
97
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
98
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
99
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
100
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
101
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
102
#rounder ", %%mm4 \n\t"\
103
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
104
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
105
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
106
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
107
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
108
#rounder ", %%mm0 \n\t"\
109
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
110
"paddd %%mm0, %%mm0 \n\t" \
111
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
112
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
113
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
114
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
115
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
116
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
117
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
118
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
119
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
120
"psrad $" #shift ", %%mm7 \n\t"\
121
"psrad $" #shift ", %%mm4 \n\t"\
122
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
123
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
124
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
125
"psrad $" #shift ", %%mm1 \n\t"\
126
"psrad $" #shift ", %%mm2 \n\t"\
127
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
128
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
129
"movq %%mm7, " #dst " \n\t"\
130
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
131
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
132
"movq %%mm2, 24+" #dst " \n\t"\
133
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
134
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
135
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
136
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
137
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
138
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
139
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
140
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
141
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
142
"psrad $" #shift ", %%mm2 \n\t"\
143
"psrad $" #shift ", %%mm0 \n\t"\
144
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
145
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
146
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
147
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
148
"psrad $" #shift ", %%mm6 \n\t"\
149
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
150
"movq %%mm2, 8+" #dst " \n\t"\
151
"psrad $" #shift ", %%mm4 \n\t"\
152
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
153
"movq %%mm4, 16+" #dst " \n\t"\
155
#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
156
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
157
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
158
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
159
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
160
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
161
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
162
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
163
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
164
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
165
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
166
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
167
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
168
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
169
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
170
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
171
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
172
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
173
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
174
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
175
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
176
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
177
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
178
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
179
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
180
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
181
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
182
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
183
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
184
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
185
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
186
"psrad $" #shift ", %%mm7 \n\t"\
187
"psrad $" #shift ", %%mm4 \n\t"\
188
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
189
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
190
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
191
"psrad $" #shift ", %%mm0 \n\t"\
192
"psrad $" #shift ", %%mm2 \n\t"\
193
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
194
"movd %%mm7, " #dst " \n\t"\
195
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
196
"movd %%mm0, 16+" #dst " \n\t"\
197
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
198
"movd %%mm2, 96+" #dst " \n\t"\
199
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
200
"movd %%mm4, 112+" #dst " \n\t"\
201
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
202
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
203
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
204
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
205
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
206
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
207
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
208
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
209
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
210
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
211
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
212
"psrad $" #shift ", %%mm2 \n\t"\
213
"psrad $" #shift ", %%mm5 \n\t"\
214
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
215
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
216
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
217
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
218
"psrad $" #shift ", %%mm6 \n\t"\
219
"psrad $" #shift ", %%mm4 \n\t"\
220
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
221
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
222
"movd %%mm2, 32+" #dst " \n\t"\
223
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
224
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
225
"movd %%mm6, 48+" #dst " \n\t"\
226
"movd %%mm4, 64+" #dst " \n\t"\
227
"movd %%mm5, 80+" #dst " \n\t"\
230
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
231
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
232
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
233
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
234
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
235
"movq "MANGLE(wm1010)", %%mm4 \n\t"\
236
"pand %%mm0, %%mm4 \n\t"\
237
"por %%mm1, %%mm4 \n\t"\
238
"por %%mm2, %%mm4 \n\t"\
239
"por %%mm3, %%mm4 \n\t"\
240
"packssdw %%mm4,%%mm4 \n\t"\
241
"movd %%mm4, %%eax \n\t"\
242
"orl %%eax, %%eax \n\t"\
244
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
245
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
246
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
247
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
248
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
249
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
250
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
251
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
252
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
253
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
254
#rounder ", %%mm4 \n\t"\
255
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
256
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
257
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
258
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
259
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
260
#rounder ", %%mm0 \n\t"\
261
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
262
"paddd %%mm0, %%mm0 \n\t" \
263
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
264
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
265
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
266
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
267
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
268
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
269
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
270
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
271
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
272
"psrad $" #shift ", %%mm7 \n\t"\
273
"psrad $" #shift ", %%mm4 \n\t"\
274
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
275
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
276
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
277
"psrad $" #shift ", %%mm1 \n\t"\
278
"psrad $" #shift ", %%mm2 \n\t"\
279
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
280
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
281
"movq %%mm7, " #dst " \n\t"\
282
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
283
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
284
"movq %%mm2, 24+" #dst " \n\t"\
285
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
286
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
287
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
288
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
289
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
290
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
291
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
292
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
293
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
294
"psrad $" #shift ", %%mm2 \n\t"\
295
"psrad $" #shift ", %%mm0 \n\t"\
296
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
297
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
298
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
299
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
300
"psrad $" #shift ", %%mm6 \n\t"\
301
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
302
"movq %%mm2, 8+" #dst " \n\t"\
303
"psrad $" #shift ", %%mm4 \n\t"\
304
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
305
"movq %%mm4, 16+" #dst " \n\t"\
308
"pslld $16, %%mm0 \n\t"\
309
"#paddd "MANGLE(d40000)", %%mm0 \n\t"\
310
"psrad $13, %%mm0 \n\t"\
311
"packssdw %%mm0, %%mm0 \n\t"\
312
"movq %%mm0, " #dst " \n\t"\
313
"movq %%mm0, 8+" #dst " \n\t"\
314
"movq %%mm0, 16+" #dst " \n\t"\
315
"movq %%mm0, 24+" #dst " \n\t"\
319
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
320
ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
321
/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
322
ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
323
ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
325
DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
326
DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
327
DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
330
//IDCT( src0, src4, src1, src5, dst, shift)
331
COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
332
COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
333
COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
334
COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
338
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
339
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
340
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
341
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
342
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
343
"movq "MANGLE(wm1010)", %%mm4 \n\t"\
344
"pand %%mm0, %%mm4 \n\t"\
345
"por %%mm1, %%mm4 \n\t"\
346
"por %%mm2, %%mm4 \n\t"\
347
"por %%mm3, %%mm4 \n\t"\
348
"packssdw %%mm4,%%mm4 \n\t"\
349
"movd %%mm4, %%eax \n\t"\
350
"orl %%eax, %%eax \n\t"\
352
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
353
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
354
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
355
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
356
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
357
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
358
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
359
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
360
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
361
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
362
#rounder ", %%mm4 \n\t"\
363
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
364
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
365
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
366
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
367
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
368
#rounder ", %%mm0 \n\t"\
369
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
370
"paddd %%mm0, %%mm0 \n\t" \
371
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
372
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
373
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
374
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
375
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
376
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
377
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
378
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
379
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
380
"psrad $" #shift ", %%mm7 \n\t"\
381
"psrad $" #shift ", %%mm4 \n\t"\
382
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
383
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
384
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
385
"psrad $" #shift ", %%mm1 \n\t"\
386
"psrad $" #shift ", %%mm2 \n\t"\
387
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
388
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
389
"movq %%mm7, " #dst " \n\t"\
390
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
391
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
392
"movq %%mm2, 24+" #dst " \n\t"\
393
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
394
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
395
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
396
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
397
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
398
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
399
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
400
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
401
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
402
"psrad $" #shift ", %%mm2 \n\t"\
403
"psrad $" #shift ", %%mm0 \n\t"\
404
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
405
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
406
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
407
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
408
"psrad $" #shift ", %%mm6 \n\t"\
409
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
410
"movq %%mm2, 8+" #dst " \n\t"\
411
"psrad $" #shift ", %%mm4 \n\t"\
412
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
413
"movq %%mm4, 16+" #dst " \n\t"\
416
"pslld $16, %%mm0 \n\t"\
417
"paddd "MANGLE(d40000)", %%mm0 \n\t"\
418
"psrad $13, %%mm0 \n\t"\
419
"packssdw %%mm0, %%mm0 \n\t"\
420
"movq %%mm0, " #dst " \n\t"\
421
"movq %%mm0, 8+" #dst " \n\t"\
422
"movq %%mm0, 16+" #dst " \n\t"\
423
"movq %%mm0, 24+" #dst " \n\t"\
426
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
427
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
428
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
429
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
430
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
431
"movq %%mm0, %%mm4 \n\t"\
432
"por %%mm1, %%mm4 \n\t"\
433
"por %%mm2, %%mm4 \n\t"\
434
"por %%mm3, %%mm4 \n\t"\
435
"packssdw %%mm4,%%mm4 \n\t"\
436
"movd %%mm4, %%eax \n\t"\
437
"orl %%eax, %%eax \n\t"\
439
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
440
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
441
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
442
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
443
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
444
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
445
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
446
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
447
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
448
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
449
#rounder ", %%mm4 \n\t"\
450
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
451
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
452
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
453
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
454
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
455
#rounder ", %%mm0 \n\t"\
456
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
457
"paddd %%mm0, %%mm0 \n\t" \
458
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
459
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
460
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
461
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
462
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
463
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
464
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
465
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
466
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
467
"psrad $" #shift ", %%mm7 \n\t"\
468
"psrad $" #shift ", %%mm4 \n\t"\
469
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
470
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
471
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
472
"psrad $" #shift ", %%mm1 \n\t"\
473
"psrad $" #shift ", %%mm2 \n\t"\
474
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
475
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
476
"movq %%mm7, " #dst " \n\t"\
477
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
478
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
479
"movq %%mm2, 24+" #dst " \n\t"\
480
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
481
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
482
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
483
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
484
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
485
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
486
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
487
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
488
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
489
"psrad $" #shift ", %%mm2 \n\t"\
490
"psrad $" #shift ", %%mm0 \n\t"\
491
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
492
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
493
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
494
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
495
"psrad $" #shift ", %%mm6 \n\t"\
496
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
497
"movq %%mm2, 8+" #dst " \n\t"\
498
"psrad $" #shift ", %%mm4 \n\t"\
499
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
500
"movq %%mm4, 16+" #dst " \n\t"\
502
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
503
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
504
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
505
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
506
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
507
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
508
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
509
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
510
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
511
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
512
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
513
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
514
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
515
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
516
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
517
#rounder ", %%mm4 \n\t"\
518
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
519
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
520
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
521
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
522
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
523
#rounder ", %%mm0 \n\t"\
524
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
525
"paddd %%mm0, %%mm0 \n\t" \
526
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
527
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
528
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
529
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
530
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
531
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
532
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
533
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
534
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
535
"psrad $" #shift ", %%mm7 \n\t"\
536
"psrad $" #shift ", %%mm4 \n\t"\
537
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
538
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
539
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
540
"psrad $" #shift ", %%mm1 \n\t"\
541
"psrad $" #shift ", %%mm2 \n\t"\
542
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
543
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
544
"movq %%mm7, " #dst " \n\t"\
545
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
546
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
547
"movq %%mm2, 24+" #dst " \n\t"\
548
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
549
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
550
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
551
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
552
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
553
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
554
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
555
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
556
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
557
"psrad $" #shift ", %%mm2 \n\t"\
558
"psrad $" #shift ", %%mm0 \n\t"\
559
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
560
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
561
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
562
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
563
"psrad $" #shift ", %%mm6 \n\t"\
564
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
565
"movq %%mm2, 8+" #dst " \n\t"\
566
"psrad $" #shift ", %%mm4 \n\t"\
567
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
568
"movq %%mm4, 16+" #dst " \n\t"\
570
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
571
DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
572
Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
573
Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
574
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
577
#define IDCT(src0, src4, src1, src5, dst, shift) \
578
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
579
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
580
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
581
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
582
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
583
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
584
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
585
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
586
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
587
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
588
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
589
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
590
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
591
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
592
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
593
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
594
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
595
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
596
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
597
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
598
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
599
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
600
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
601
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
602
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
603
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
604
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
605
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
606
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
607
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
608
"psrad $" #shift ", %%mm7 \n\t"\
609
"psrad $" #shift ", %%mm4 \n\t"\
610
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
611
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
612
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
613
"psrad $" #shift ", %%mm0 \n\t"\
614
"psrad $" #shift ", %%mm2 \n\t"\
615
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
616
"movd %%mm7, " #dst " \n\t"\
617
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
618
"movd %%mm0, 16+" #dst " \n\t"\
619
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
620
"movd %%mm2, 96+" #dst " \n\t"\
621
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
622
"movd %%mm4, 112+" #dst " \n\t"\
623
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
624
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
625
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
626
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
627
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
628
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
629
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
630
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
631
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
632
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
633
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
634
"psrad $" #shift ", %%mm2 \n\t"\
635
"psrad $" #shift ", %%mm5 \n\t"\
636
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
637
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
638
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
639
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
640
"psrad $" #shift ", %%mm6 \n\t"\
641
"psrad $" #shift ", %%mm4 \n\t"\
642
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
643
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
644
"movd %%mm2, 32+" #dst " \n\t"\
645
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
646
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
647
"movd %%mm6, 48+" #dst " \n\t"\
648
"movd %%mm4, 64+" #dst " \n\t"\
649
"movd %%mm5, 80+" #dst " \n\t"
652
//IDCT( src0, src4, src1, src5, dst, shift)
653
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
654
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
655
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
656
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
661
Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
662
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
665
#define IDCT(src0, src4, src1, src5, dst, shift) \
666
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
667
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
668
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
669
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
670
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
671
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
672
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
673
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
674
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
675
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
676
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
677
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
678
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
679
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
680
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
681
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
682
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
683
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
684
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
685
"movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
686
"pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
687
"paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
688
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
689
"psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
690
"psrad $" #shift ", %%mm1 \n\t"\
691
"psrad $" #shift ", %%mm4 \n\t"\
692
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
693
"paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
694
"psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
695
"psrad $" #shift ", %%mm0 \n\t"\
696
"psrad $" #shift ", %%mm2 \n\t"\
697
"packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
698
"movd %%mm1, " #dst " \n\t"\
699
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
700
"movd %%mm0, 16+" #dst " \n\t"\
701
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
702
"movd %%mm2, 96+" #dst " \n\t"\
703
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
704
"movd %%mm4, 112+" #dst " \n\t"\
705
"movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
706
"pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
707
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
708
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
709
"paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
710
"psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
711
"psrad $" #shift ", %%mm2 \n\t"\
712
"psrad $" #shift ", %%mm5 \n\t"\
713
"movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
714
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
715
"psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
716
"psrad $" #shift ", %%mm6 \n\t"\
717
"psrad $" #shift ", %%mm1 \n\t"\
718
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
719
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
720
"movd %%mm2, 32+" #dst " \n\t"\
721
"packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
722
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
723
"movd %%mm6, 48+" #dst " \n\t"\
724
"movd %%mm1, 64+" #dst " \n\t"\
725
"movd %%mm5, 80+" #dst " \n\t"
727
//IDCT( src0, src4, src1, src5, dst, shift)
728
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
729
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
730
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
731
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
736
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
739
/*
 * IDCT variant that reads only src0 and src5.  src4 and src1 are part of the
 * parameter list but are never referenced, so no C2/C6 products and no R1/R3
 * products enter any of the sums below.
 * NOTE(review): presumably selected when those inputs are known to be zero --
 * confirm against the branch that jumps to this variant.
 * Every result is shifted right by `shift`, saturated to 16 bits by packssdw
 * and written with a 32-bit movd to dst, 16+dst, ..., 112+dst.
 * %2 is the coeffs operand of the enclosing asm statement.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
740
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
741
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
742
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
743
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
744
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
745
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
746
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 (kept as A3) */\
747
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 (kept as A2) */\
748
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
749
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
750
"movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
751
"pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
752
"paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
753
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
754
"psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
755
"psrad $" #shift ", %%mm1 \n\t"\
756
"psrad $" #shift ", %%mm4 \n\t"\
757
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
758
"paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
759
"psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
760
"psrad $" #shift ", %%mm0 \n\t"\
761
"psrad $" #shift ", %%mm2 \n\t"\
762
"packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
763
"movd %%mm1, " #dst " \n\t"\
764
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
765
"movd %%mm0, 16+" #dst " \n\t"\
766
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
767
"movd %%mm2, 96+" #dst " \n\t"\
768
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
769
"movd %%mm4, 112+" #dst " \n\t"\
770
"movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
771
"pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
772
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
773
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
774
"paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
775
"psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\
776
"psrad $" #shift ", %%mm2 \n\t"\
777
"psrad $" #shift ", %%mm5 \n\t"\
778
"movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
779
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
780
"psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\
781
"psrad $" #shift ", %%mm6 \n\t"\
782
"psrad $" #shift ", %%mm1 \n\t"\
783
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
784
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
785
"movd %%mm2, 32+" #dst " \n\t"\
786
"packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
787
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
788
"movd %%mm6, 48+" #dst " \n\t"\
789
"movd %%mm1, 64+" #dst " \n\t"\
790
"movd %%mm5, 80+" #dst " \n\t"
793
//IDCT( src0, src4, src1, src5, dst, shift)
794
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
795
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
796
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
797
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
802
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
805
/*
 * IDCT variant that reads src0, src1 and src5.  src4 is part of the parameter
 * list but is never referenced, so the even part consists of the C4 products
 * only (no C2/C6 terms), while the odd part B0..B3 combines products of both
 * src1 (R3 R1) and src5 (R7 R5).
 * NOTE(review): presumably selected when src4 is known to be zero -- confirm
 * against the branch that jumps to this variant.
 * Every result is shifted right by `shift`, saturated to 16 bits by packssdw
 * and written with a 32-bit movd to dst, 16+dst, ..., 112+dst.
 * %2 is the coeffs operand of the enclosing asm statement.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
806
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
807
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
808
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
809
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
810
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
811
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
812
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
813
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 (kept as A3) */\
814
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
815
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
816
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 (kept as A2) */\
817
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
818
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
819
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
820
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
821
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
822
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
823
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
824
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
825
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
826
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
827
"psrad $" #shift ", %%mm7 \n\t"\
828
"psrad $" #shift ", %%mm4 \n\t"\
829
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
830
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
831
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
832
"psrad $" #shift ", %%mm0 \n\t"\
833
"psrad $" #shift ", %%mm2 \n\t"\
834
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
835
"movd %%mm7, " #dst " \n\t"\
836
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
837
"movd %%mm0, 16+" #dst " \n\t"\
838
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
839
"movd %%mm2, 96+" #dst " \n\t"\
840
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
841
"movd %%mm4, 112+" #dst " \n\t"\
842
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 (reloaded: mm2 was overwritten above) */\
843
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
844
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
845
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
846
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
847
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
848
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
849
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
850
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
851
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
852
"psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
853
"psrad $" #shift ", %%mm2 \n\t"\
854
"psrad $" #shift ", %%mm5 \n\t"\
855
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
856
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
857
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
858
"psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
859
"psrad $" #shift ", %%mm6 \n\t"\
860
"psrad $" #shift ", %%mm4 \n\t"\
861
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
862
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
863
"movd %%mm2, 32+" #dst " \n\t"\
864
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
865
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
866
"movd %%mm6, 48+" #dst " \n\t"\
867
"movd %%mm4, 64+" #dst " \n\t"\
868
"movd %%mm5, 80+" #dst " \n\t"
870
//IDCT( src0, src4, src1, src5, dst, shift)
871
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
872
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
873
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
874
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
880
/*
 * IDCT variant that reads only src0 and src1.  src4 and src5 are part of the
 * parameter list but are never referenced: the even part consists of the C4
 * products only and each odd term B0..B3 is a single product of src1 (R3 R1).
 * NOTE(review): presumably selected when src4 and src5 are known to be zero --
 * confirm against the branch that jumps to this variant.
 * Every result is shifted right by `shift`, saturated to 16 bits by packssdw
 * and written with a 32-bit movd to dst, 16+dst, ..., 112+dst.
 * %2 is the coeffs operand of the enclosing asm statement.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
881
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
882
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
883
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
884
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
885
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
886
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
887
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 (kept as A3) */\
888
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
889
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
890
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 (kept as A2) */\
891
"movq 64(%2), %%mm3 \n\t" /* -C7 C3 -C7 C3, going by the product below */\
892
"pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
893
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
894
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
895
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
896
"psrad $" #shift ", %%mm7 \n\t"\
897
"psrad $" #shift ", %%mm4 \n\t"\
898
"movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
899
"paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
900
"psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
901
"psrad $" #shift ", %%mm0 \n\t"\
902
"psrad $" #shift ", %%mm1 \n\t"\
903
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
904
"movd %%mm7, " #dst " \n\t"\
905
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
906
"movd %%mm0, 16+" #dst " \n\t"\
907
"packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
908
"movd %%mm1, 96+" #dst " \n\t"\
909
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
910
"movd %%mm4, 112+" #dst " \n\t"\
911
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
912
"pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
913
"pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
914
"movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
915
"paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
916
"psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
917
"psrad $" #shift ", %%mm1 \n\t"\
918
"psrad $" #shift ", %%mm5 \n\t"\
919
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
920
"paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
921
"psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
922
"psrad $" #shift ", %%mm6 \n\t"\
923
"psrad $" #shift ", %%mm4 \n\t"\
924
"packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
925
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
926
"movd %%mm1, 32+" #dst " \n\t"\
927
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
928
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
929
"movd %%mm6, 48+" #dst " \n\t"\
930
"movd %%mm4, 64+" #dst " \n\t"\
931
"movd %%mm5, 80+" #dst " \n\t"
934
//IDCT( src0, src4, src1, src5, dst, shift)
935
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
936
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
937
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
938
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
944
/*
 * IDCT variant that reads only src0 and src4, each at its base address and
 * again 8 bytes further on, so one expansion covers two quadwords of input
 * and writes full 64-bit movq stores (this matches the invocation list that
 * follows, where the 8(%1) and 24(%1) lines are commented out).
 * src1 and src5 are part of the parameter list but are never referenced, so
 * there are no odd (B) terms: the outputs are just A0..A3 shifted right by
 * `shift` and saturated by packssdw, and each one is stored twice
 * (dst/112+dst, 16+dst/96+dst, 32+dst/80+dst, 48+dst/64+dst).
 * NOTE(review): presumably selected when src1 and src5 are known to be zero --
 * confirm against the branch that jumps to this variant.
 * %2 is the coeffs operand of the enclosing asm statement.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
945
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
946
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
947
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
948
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
949
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
950
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
951
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
952
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
953
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
954
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
955
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
956
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
957
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
958
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
959
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
960
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
961
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 (second quadword) */\
962
"movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 (second quadword) */\
963
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
964
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
965
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
966
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
967
"movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
968
"pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
969
"pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
970
"paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
971
"paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
972
"psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
973
"paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
974
"paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
975
"psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
976
"psrad $" #shift ", %%mm4 \n\t"\
977
"psrad $" #shift ", %%mm7 \n\t"\
978
"psrad $" #shift ", %%mm3 \n\t"\
979
"packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
980
"movq %%mm4, " #dst " \n\t"\
981
"psrad $" #shift ", %%mm0 \n\t"\
982
"packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
983
"movq %%mm0, 16+" #dst " \n\t"\
984
"movq %%mm0, 96+" #dst " \n\t"\
985
"movq %%mm4, 112+" #dst " \n\t"\
986
"psrad $" #shift ", %%mm5 \n\t"\
987
"psrad $" #shift ", %%mm6 \n\t"\
988
"psrad $" #shift ", %%mm2 \n\t"\
989
"packssdw %%mm2, %%mm5 \n\t" /* A2 a2 */\
990
"movq %%mm5, 32+" #dst " \n\t"\
991
"psrad $" #shift ", %%mm1 \n\t"\
992
"packssdw %%mm1, %%mm6 \n\t" /* A3 a3 */\
993
"movq %%mm6, 48+" #dst " \n\t"\
994
"movq %%mm6, 64+" #dst " \n\t"\
995
"movq %%mm5, 80+" #dst " \n\t"
998
//IDCT( src0, src4, src1, src5, dst, shift)
999
IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1000
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1001
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1002
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1006
"# .p2align 4 \n\t"\
1009
/*
 * IDCT variant that reads src0, src4 and src1.  src5 is part of the parameter
 * list but is never referenced: the even part A0..A3 is built from the C4
 * products of src0 and the C2/C6 products of src4, and each odd term B0..B3
 * is a single product of src1 (R3 R1).
 * NOTE(review): presumably selected when src5 is known to be zero -- confirm
 * against the branch that jumps to this variant.
 * Every result is shifted right by `shift`, saturated to 16 bits by packssdw
 * and written with a 32-bit movd to dst, 16+dst, ..., 112+dst.
 * %2 is the coeffs operand of the enclosing asm statement.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
1010
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1011
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1012
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1013
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1014
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1015
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1016
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1017
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1018
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1019
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1020
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1021
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1022
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1023
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1024
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1025
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1026
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1027
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1028
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1029
"movq 64(%2), %%mm1 \n\t" /* -C7 C3 -C7 C3, going by the product below */\
1030
"pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1031
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1032
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1033
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1034
"psrad $" #shift ", %%mm7 \n\t"\
1035
"psrad $" #shift ", %%mm4 \n\t"\
1036
"movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
1037
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1038
"psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1039
"psrad $" #shift ", %%mm0 \n\t"\
1040
"psrad $" #shift ", %%mm3 \n\t"\
1041
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1042
"movd %%mm7, " #dst " \n\t"\
1043
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1044
"movd %%mm0, 16+" #dst " \n\t"\
1045
"packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1046
"movd %%mm3, 96+" #dst " \n\t"\
1047
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1048
"movd %%mm4, 112+" #dst " \n\t"\
1049
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1050
"pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1051
"pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1052
"movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
1053
"paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1054
"psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1055
"psrad $" #shift ", %%mm3 \n\t"\
1056
"psrad $" #shift ", %%mm5 \n\t"\
1057
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1058
"paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1059
"psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1060
"psrad $" #shift ", %%mm6 \n\t"\
1061
"packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1062
"movd %%mm3, 32+" #dst " \n\t"\
1063
"psrad $" #shift ", %%mm4 \n\t"\
1064
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1065
"movd %%mm6, 48+" #dst " \n\t"\
1066
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1067
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1068
"movd %%mm4, 64+" #dst " \n\t"\
1069
"movd %%mm5, 80+" #dst " \n\t"
1072
//IDCT( src0, src4, src1, src5, dst, shift)
1073
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1074
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1075
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1076
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1083
/*
 * IDCT variant that reads only src0, at its base address and again 8 bytes
 * further on, so one expansion covers two quadwords of input and writes full
 * 64-bit movq stores (this matches the invocation list that follows, where
 * the 8(%1) and 24(%1) lines are commented out).
 * src4, src1 and src5 are part of the parameter list but are never
 * referenced.  Only the two C4 products are computed, shifted right by
 * `shift` and saturated by packssdw:
 *   mm4 (C4R4+C4R0)  -> dst, 48+dst, 64+dst, 112+dst
 *   mm0 (-C4R4+C4R0) -> 16+dst, 32+dst, 80+dst, 96+dst
 * NOTE(review): presumably selected when every other input is known to be
 * zero -- confirm against the branch that jumps to this variant.
 * %2 is the coeffs operand of the enclosing asm statement.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
1084
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1085
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1086
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1087
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1088
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1089
"psrad $" #shift ", %%mm4 \n\t"\
1090
"psrad $" #shift ", %%mm0 \n\t"\
1091
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 (second quadword) */\
1092
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1093
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1094
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1095
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1096
"movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2; mm7 is not read again in this macro */\
1097
"psrad $" #shift ", %%mm1 \n\t"\
1098
"packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
1099
"movq %%mm4, " #dst " \n\t"\
1100
"psrad $" #shift ", %%mm2 \n\t"\
1101
"packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
1102
"movq %%mm0, 16+" #dst " \n\t"\
1103
"movq %%mm0, 96+" #dst " \n\t"\
1104
"movq %%mm4, 112+" #dst " \n\t"\
1105
"movq %%mm0, 32+" #dst " \n\t"\
1106
"movq %%mm4, 48+" #dst " \n\t"\
1107
"movq %%mm4, 64+" #dst " \n\t"\
1108
"movq %%mm0, 80+" #dst " \n\t"
1110
//IDCT( src0, src4, src1, src5, dst, shift)
1111
IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1112
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1113
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1114
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1121
00 40 04 44 20 60 24 64
1122
10 30 14 34 50 70 54 74
1123
01 41 03 43 21 61 23 63
1124
11 31 13 33 51 71 53 73
1125
02 42 06 46 22 62 26 66
1126
12 32 16 36 52 72 56 76
1127
05 45 07 47 25 65 27 67
1128
15 35 17 37 55 75 57 77
1131
00 04 10 14 20 24 30 34
1132
40 44 50 54 60 64 70 74
1133
01 03 11 13 21 23 31 33
1134
41 43 51 53 61 63 71 73
1135
02 06 12 16 22 26 32 36
1136
42 46 52 56 62 66 72 76
1137
05 07 15 17 25 27 35 37
1138
45 47 55 57 65 67 75 77
1142
:: "r" (block), "r" (temp), "r" (coeffs)
1147
/*
 * Non-static MMX simple-IDCT entry point; takes one block of int16_t
 * coefficients.
 * NOTE(review): presumably forwards to the static idct() defined earlier in
 * this file and transforms `block` in place -- confirm.
 */
void ff_simple_idct_mmx(int16_t *block)
1152
//FIXME merge add/put into the idct
1154
/*
 * "put" flavour of the MMX simple IDCT: hands `block`, `dest` and `line_size`
 * to ff_put_pixels_clamped_mmx().
 * NOTE(review): the FIXME preceding this function implies the IDCT itself is
 * run on `block` as a separate step before the store -- confirm.
 */
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1157
ff_put_pixels_clamped_mmx(block, dest, line_size);
1159
/*
 * "add" flavour of the MMX simple IDCT: hands `block`, `dest` and `line_size`
 * to ff_add_pixels_clamped_mmx().
 * NOTE(review): the FIXME earlier in this file implies the IDCT itself is run
 * on `block` as a separate step before the accumulate -- confirm.
 */
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1162
ff_add_pixels_clamped_mmx(block, dest, line_size);