~ubuntu-branches/ubuntu/feisty/avidemux/feisty

« back to all changes in this revision

Viewing changes to adm_lavcodec/x86_64/simple_idct_a64.c

  • Committer: Bazaar Package Importer
  • Author(s): Christian Marillat
  • Date: 2005-05-25 13:02:29 UTC
  • mfrom: (1.1.2 upstream)
  • Revision ID: james.westby@ubuntu.com-20050525130229-jw94cav0yhmg7vjw
Tags: 1:2.0.40-0.0
New upstream release.

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
/*
2
 
 * Simple IDCT MMX
3
 
 *
4
 
 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5
 
 *
6
 
 * This library is free software; you can redistribute it and/or
7
 
 * modify it under the terms of the GNU Lesser General Public
8
 
 * License as published by the Free Software Foundation; either
9
 
 * version 2 of the License, or (at your option) any later version.
10
 
 *
11
 
 * This library is distributed in the hope that it will be useful,
12
 
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 
 * Lesser General Public License for more details.
15
 
 *
16
 
 * You should have received a copy of the GNU Lesser General Public
17
 
 * License along with this library; if not, write to the Free Software
18
 
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19
 
 */
20
 
#include "../dsputil.h"
21
 
 
22
 
/*
23
 
23170.475006
24
 
22725.260826
25
 
21406.727617
26
 
19265.545870
27
 
16384.000000
28
 
12872.826198
29
 
8866.956905
30
 
4520.335430
31
 
*/
32
 
/*
 * Fixed-point cosine factors for the IDCT.
 * Ci follows the formula in the trailing comments,
 * cos(i*M_PI/16)*sqrt(2)*(1<<14), converted to an integer.
 * The "#if 0 / #else" below selects 16383 for C4 instead of the rounded
 * value 16384; the reason is not stated in this part of the file.
 */
#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
33
 
#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
34
 
#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
35
 
#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
36
 
#if 0
37
 
#define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
38
 
#else
39
 
#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
40
 
#endif
41
 
#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
42
 
#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
43
 
#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
44
 
 
45
 
/*
 * Right-shift counts applied to the results of the row pass (ROW_SHIFT)
 * and of the column pass (COL_SHIFT); 1<<(shift-1) is used as the
 * rounding term in the C reference code further down.
 */
#define ROW_SHIFT 11
46
 
#define COL_SHIFT 20 // 6
47
 
 
48
 
/*
 * Mask that keeps the upper 16-bit word of each 32-bit half of a quadword.
 * The DC_COND_* asm macros AND it with the first source quadword and then
 * OR in the other three sources to decide whether to take the shortcut path.
 */
static const uint64_t attribute_used __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
49
 
/*
 * Constant 0x40000. In the visible part of the file it is referenced only
 * from a "#paddd" line inside the asm text and from the disabled helper
 * unused_var_killer().
 */
static const uint64_t attribute_used __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
50
 
 
51
 
/*
 * Constant table for the MMX IDCT. Every row below is four int16_t values,
 * i.e. one 8-byte quadword, so row k starts at byte offset 8*k.
 *
 * The first two rows hold the row-pass rounding term 1<<(ROW_SHIFT-1);
 * the remaining rows hold pairs of cosine factors.
 * The inline assembly in idct() reads quadwords at fixed offsets from
 * operand %2 (for example 16(%2) is annotated as the C4 row), so the order
 * and size of the entries must not change.
 * NOTE(review): %2 is presumably this table; the asm operand list is not
 * visible in this part of the file -- confirm.
 */
static const int16_t __attribute__((aligned(8))) coeffs[]= {
52
 
        1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
53
 
//      1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
54
 
//      0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
55
 
        1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
56
 
        // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
57
 
//      0, 0, 0, 0,
58
 
//      0, 0, 0, 0,
59
 
 
60
 
 C4,  C4,  C4,  C4,
61
 
 C4, -C4,  C4, -C4,
62
 
 
63
 
 C2,  C6,  C2,  C6,
64
 
 C6, -C2,  C6, -C2,
65
 
 
66
 
 C1,  C3,  C1,  C3,
67
 
 C5,  C7,  C5,  C7,
68
 
 
69
 
 C3, -C7,  C3, -C7,
70
 
-C1, -C5, -C1, -C5,
71
 
 
72
 
 C5, -C1,  C5, -C1,
73
 
 C7,  C3,  C7,  C3,
74
 
 
75
 
 C7, -C5,  C7, -C5,
76
 
 C3, -C1,  C3, -C1
77
 
};
78
 
 
79
 
#if 0
80
 
static void unused_var_killer(){
81
 
        int a= wm1010 + d40000;
82
 
        temp[0]=a;
83
 
}
84
 
 
85
 
static void inline idctCol (int16_t * col, int16_t *input)
86
 
{
87
 
#undef C0
88
 
#undef C1
89
 
#undef C2
90
 
#undef C3
91
 
#undef C4
92
 
#undef C5
93
 
#undef C6
94
 
#undef C7
95
 
        int a0, a1, a2, a3, b0, b1, b2, b3;
96
 
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
97
 
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
98
 
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
99
 
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
100
 
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
101
 
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
102
 
        const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103
 
        const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104
 
/*
105
 
        if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
106
 
                col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
107
 
                        col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
108
 
                return;
109
 
        }*/
110
 
 
111
 
col[8*0] = input[8*0 + 0];
112
 
col[8*1] = input[8*2 + 0];
113
 
col[8*2] = input[8*0 + 1];
114
 
col[8*3] = input[8*2 + 1];
115
 
col[8*4] = input[8*4 + 0];
116
 
col[8*5] = input[8*6 + 0];
117
 
col[8*6] = input[8*4 + 1];
118
 
col[8*7] = input[8*6 + 1];
119
 
 
120
 
        a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
121
 
        a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
122
 
        a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
123
 
        a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
124
 
 
125
 
        b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
126
 
        b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
127
 
        b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
128
 
        b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
129
 
 
130
 
        col[8*0] = (a0 + b0) >> COL_SHIFT;
131
 
        col[8*1] = (a1 + b1) >> COL_SHIFT;
132
 
        col[8*2] = (a2 + b2) >> COL_SHIFT;
133
 
        col[8*3] = (a3 + b3) >> COL_SHIFT;
134
 
        col[8*4] = (a3 - b3) >> COL_SHIFT;
135
 
        col[8*5] = (a2 - b2) >> COL_SHIFT;
136
 
        col[8*6] = (a1 - b1) >> COL_SHIFT;
137
 
        col[8*7] = (a0 - b0) >> COL_SHIFT;
138
 
}
139
 
 
140
 
static void inline idctRow (int16_t * output, int16_t * input)
141
 
{
142
 
        int16_t row[8];
143
 
 
144
 
        int a0, a1, a2, a3, b0, b1, b2, b3;
145
 
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
146
 
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
147
 
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
148
 
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
149
 
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
150
 
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151
 
        const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152
 
        const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153
 
 
154
 
row[0] = input[0];
155
 
row[2] = input[1];
156
 
row[4] = input[4];
157
 
row[6] = input[5];
158
 
row[1] = input[8];
159
 
row[3] = input[9];
160
 
row[5] = input[12];
161
 
row[7] = input[13];
162
 
 
163
 
        if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
164
 
                row[0] = row[1] = row[2] = row[3] = row[4] =
165
 
                        row[5] = row[6] = row[7] = row[0]<<3;
166
 
        output[0] = row[0];
167
 
        output[2] = row[1];
168
 
        output[4] = row[2];
169
 
        output[6] = row[3];
170
 
        output[8] = row[4];
171
 
        output[10] = row[5];
172
 
        output[12] = row[6];
173
 
        output[14] = row[7];
174
 
                return;
175
 
        }
176
 
 
177
 
        a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
178
 
        a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
179
 
        a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
180
 
        a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
181
 
 
182
 
        b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
183
 
        b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
184
 
        b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
185
 
        b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
186
 
 
187
 
        row[0] = (a0 + b0) >> ROW_SHIFT;
188
 
        row[1] = (a1 + b1) >> ROW_SHIFT;
189
 
        row[2] = (a2 + b2) >> ROW_SHIFT;
190
 
        row[3] = (a3 + b3) >> ROW_SHIFT;
191
 
        row[4] = (a3 - b3) >> ROW_SHIFT;
192
 
        row[5] = (a2 - b2) >> ROW_SHIFT;
193
 
        row[6] = (a1 - b1) >> ROW_SHIFT;
194
 
        row[7] = (a0 - b0) >> ROW_SHIFT;
195
 
 
196
 
        output[0] = row[0];
197
 
        output[2] = row[1];
198
 
        output[4] = row[2];
199
 
        output[6] = row[3];
200
 
        output[8] = row[4];
201
 
        output[10] = row[5];
202
 
        output[12] = row[6];
203
 
        output[14] = row[7];
204
 
}
205
 
#endif
206
 
 
207
 
static inline void idct(int16_t *block)
208
 
{
209
 
        int64_t __attribute__((aligned(8))) align_tmp[16];
210
 
        int16_t * const temp= (int16_t*)align_tmp;
211
 
 
212
 
        asm volatile(
213
 
#if 0 //Alternative, simpler variant
214
 
 
215
 
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
216
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
217
 
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
218
 
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
219
 
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
220
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
221
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
222
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
223
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
224
 
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
225
 
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
226
 
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
227
 
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
228
 
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
229
 
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
230
 
        #rounder ", %%mm4                       \n\t"\
231
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
232
 
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
233
 
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
234
 
        "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
235
 
        "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
236
 
        #rounder ", %%mm0                       \n\t"\
237
 
        "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
238
 
        "paddd %%mm0, %%mm0                     \n\t" \
239
 
        "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
240
 
        "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
241
 
        "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
242
 
        "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
243
 
        "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
244
 
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
245
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
246
 
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
247
 
        "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
248
 
        "psrad $" #shift ", %%mm7               \n\t"\
249
 
        "psrad $" #shift ", %%mm4               \n\t"\
250
 
        "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
251
 
        "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
252
 
        "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
253
 
        "psrad $" #shift ", %%mm1               \n\t"\
254
 
        "psrad $" #shift ", %%mm2               \n\t"\
255
 
        "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
256
 
        "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
257
 
        "movq %%mm7, " #dst "                   \n\t"\
258
 
        "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
259
 
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
260
 
        "movq %%mm2, 24+" #dst "                \n\t"\
261
 
        "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
262
 
        "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
263
 
        "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
264
 
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
265
 
        "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
266
 
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
267
 
        "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
268
 
        "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
269
 
        "psubd %%mm4, %%mm0                     \n\t" /* a2-B2          a2-b2 */\
270
 
        "psrad $" #shift ", %%mm2               \n\t"\
271
 
        "psrad $" #shift ", %%mm0               \n\t"\
272
 
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
273
 
        "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
274
 
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
275
 
        "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
276
 
        "psrad $" #shift ", %%mm6               \n\t"\
277
 
        "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
278
 
        "movq %%mm2, 8+" #dst "                 \n\t"\
279
 
        "psrad $" #shift ", %%mm4               \n\t"\
280
 
        "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
281
 
        "movq %%mm4, 16+" #dst "                \n\t"\
282
 
 
283
 
#define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
284
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
285
 
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
286
 
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
287
 
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
288
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
289
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
290
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
291
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
292
 
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
293
 
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
294
 
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
295
 
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
296
 
        #rounder ", %%mm4                       \n\t"\
297
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
298
 
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
299
 
        #rounder ", %%mm0                       \n\t"\
300
 
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
301
 
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
302
 
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
303
 
        "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
304
 
        "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
305
 
        "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
306
 
        "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
307
 
        "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
308
 
        "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
309
 
        "paddd %%mm1, %%mm7                     \n\t" /* B0             b0 */\
310
 
        "movq 72(%2), %%mm1                     \n\t" /* -C5    -C1     -C5     -C1 */\
311
 
        "pmaddwd %%mm3, %%mm1                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
312
 
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
313
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
314
 
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
315
 
        "paddd %%mm2, %%mm1                     \n\t" /* B1             b1 */\
316
 
        "psrad $" #shift ", %%mm7               \n\t"\
317
 
        "psrad $" #shift ", %%mm4               \n\t"\
318
 
        "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
319
 
        "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
320
 
        "psubd %%mm1, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
321
 
        "psrad $" #shift ", %%mm0               \n\t"\
322
 
        "psrad $" #shift ", %%mm2               \n\t"\
323
 
        "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
324
 
        "movd %%mm7, " #dst "                   \n\t"\
325
 
        "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
326
 
        "movd %%mm0, 16+" #dst "                \n\t"\
327
 
        "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
328
 
        "movd %%mm2, 96+" #dst "                \n\t"\
329
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
330
 
        "movd %%mm4, 112+" #dst "               \n\t"\
331
 
        "movq " #src1 ", %%mm0                  \n\t" /* R3     R1      r3      r1 */\
332
 
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
333
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
334
 
        "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
335
 
        "pmaddwd 96(%2), %%mm0                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
336
 
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
337
 
        "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
338
 
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
339
 
        "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
340
 
        "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
341
 
        "psubd %%mm4, %%mm5                     \n\t" /* a2-B2          a2-b2 */\
342
 
        "psrad $" #shift ", %%mm2               \n\t"\
343
 
        "psrad $" #shift ", %%mm5               \n\t"\
344
 
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
345
 
        "paddd %%mm0, %%mm3                     \n\t" /* B3             b3 */\
346
 
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
347
 
        "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
348
 
        "psrad $" #shift ", %%mm6               \n\t"\
349
 
        "psrad $" #shift ", %%mm4               \n\t"\
350
 
        "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
351
 
        "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
352
 
        "movd %%mm2, 32+" #dst "                \n\t"\
353
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
354
 
        "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
355
 
        "movd %%mm6, 48+" #dst "                \n\t"\
356
 
        "movd %%mm4, 64+" #dst "                \n\t"\
357
 
        "movd %%mm5, 80+" #dst "                \n\t"\
358
 
 
359
 
        
360
 
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
361
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
362
 
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
363
 
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
364
 
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
365
 
        "movq "MANGLE(wm1010)", %%mm4           \n\t"\
366
 
        "pand %%mm0, %%mm4                      \n\t"\
367
 
        "por %%mm1, %%mm4                       \n\t"\
368
 
        "por %%mm2, %%mm4                       \n\t"\
369
 
        "por %%mm3, %%mm4                       \n\t"\
370
 
        "packssdw %%mm4,%%mm4                   \n\t"\
371
 
        "movd %%mm4, %%eax                      \n\t"\
372
 
        "orl %%eax, %%eax                       \n\t"\
373
 
        "jz 1f                                  \n\t"\
374
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
375
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
376
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
377
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
378
 
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
379
 
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
380
 
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
381
 
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
382
 
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
383
 
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
384
 
        #rounder ", %%mm4                       \n\t"\
385
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
386
 
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
387
 
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
388
 
        "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
389
 
        "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
390
 
        #rounder ", %%mm0                       \n\t"\
391
 
        "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
392
 
        "paddd %%mm0, %%mm0                     \n\t" \
393
 
        "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
394
 
        "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
395
 
        "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
396
 
        "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
397
 
        "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
398
 
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
399
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
400
 
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
401
 
        "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
402
 
        "psrad $" #shift ", %%mm7               \n\t"\
403
 
        "psrad $" #shift ", %%mm4               \n\t"\
404
 
        "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
405
 
        "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
406
 
        "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
407
 
        "psrad $" #shift ", %%mm1               \n\t"\
408
 
        "psrad $" #shift ", %%mm2               \n\t"\
409
 
        "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
410
 
        "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
411
 
        "movq %%mm7, " #dst "                   \n\t"\
412
 
        "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
413
 
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
414
 
        "movq %%mm2, 24+" #dst "                \n\t"\
415
 
        "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
416
 
        "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
417
 
        "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
418
 
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
419
 
        "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
420
 
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
421
 
        "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
422
 
        "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
423
 
        "psubd %%mm4, %%mm0                     \n\t" /* a2-B2          a2-b2 */\
424
 
        "psrad $" #shift ", %%mm2               \n\t"\
425
 
        "psrad $" #shift ", %%mm0               \n\t"\
426
 
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
427
 
        "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
428
 
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
429
 
        "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
430
 
        "psrad $" #shift ", %%mm6               \n\t"\
431
 
        "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
432
 
        "movq %%mm2, 8+" #dst "                 \n\t"\
433
 
        "psrad $" #shift ", %%mm4               \n\t"\
434
 
        "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
435
 
        "movq %%mm4, 16+" #dst "                \n\t"\
436
 
        "jmp 2f                                 \n\t"\
437
 
        "1:                                     \n\t"\
438
 
        "pslld $16, %%mm0                       \n\t"\
439
 
        "#paddd "MANGLE(d40000)", %%mm0         \n\t"\
440
 
        "psrad $13, %%mm0                       \n\t"\
441
 
        "packssdw %%mm0, %%mm0                  \n\t"\
442
 
        "movq %%mm0, " #dst "                   \n\t"\
443
 
        "movq %%mm0, 8+" #dst "                 \n\t"\
444
 
        "movq %%mm0, 16+" #dst "                \n\t"\
445
 
        "movq %%mm0, 24+" #dst "                \n\t"\
446
 
        "2:                                     \n\t"
447
 
 
448
 
 
449
 
//IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
450
 
ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
451
 
/*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
452
 
ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
453
 
ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
454
 
 
455
 
DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
456
 
DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
457
 
DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
458
 
 
459
 
 
460
 
//IDCT(      src0,   src4,   src1,    src5,    dst, rounder, shift)
461
 
COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
462
 
COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
463
 
COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
464
 
COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
465
 
 
466
 
#else
467
 
 
468
 
/*
 * DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Expands to a run of adjacent string literals forming one fragment of an
 * inline-asm template (operands are referenced as %0, %1, %2 by the callers'
 * arguments and by the fixed N(%2) table loads below).
 *
 *   src0/src4/src1/src5  memory operands of the four input quadwords
 *   dst                  memory operand of the first output quadword; the
 *                        fragment also stores to 8+dst, 16+dst and 24+dst
 *   rounder              instruction text (mnemonic + source) that is pasted
 *                        in front of ", %%mm4" and ", %%mm0"
 *   shift                immediate used for every psrad of the full path
 *
 * Flow: (src0 & wm1010) | src4 | src1 | src5 is packed and moved to %eax.
 * If that is zero the code branches to local label 1, which derives a single
 * quadword from mm0 alone (shift left 16, add d40000, arithmetic shift right
 * 13, pack) and stores it to all four output quadwords.  Otherwise the full
 * computation runs using the coefficient quadwords at 16(%2)..104(%2) and
 * then jumps to local label 2 past the shortcut.
 * NOTE(review): wm1010 is defined elsewhere; presumably it masks off the
 * words that are allowed to be non-zero on the shortcut path -- confirm.
 *
 * Clobbers mm0-mm7 and %eax, and defines numeric local labels 1 and 2.
 *
 * The fragment must stay one unbroken chain of backslash-continued lines;
 * any foreign line in between ends the macro early.
 */
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
        "movq "MANGLE(wm1010)", %%mm4           \n\t"\
        "pand %%mm0, %%mm4                      \n\t" /* masked part of src0 */\
        "por %%mm1, %%mm4                       \n\t"\
        "por %%mm2, %%mm4                       \n\t"\
        "por %%mm3, %%mm4                       \n\t"\
        "packssdw %%mm4,%%mm4                   \n\t" /* fold both dwords into the low 32 bits */\
        "movd %%mm4, %%eax                      \n\t"\
        "orl %%eax, %%eax                       \n\t" /* set ZF from the folded value */\
        "jz 1f                                  \n\t" /* nothing set: take the shortcut */\
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
        #rounder ", %%mm4                       \n\t"\
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
        "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
        "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
        #rounder ", %%mm0                       \n\t"\
        "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
        "paddd %%mm0, %%mm0                     \n\t" /* doubled so the next psubd yields mm0-mm1 */\
        "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
        "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
        "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
        "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
        "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
        "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
        "psrad $" #shift ", %%mm7               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
        "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
        "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
        "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
        "psrad $" #shift ", %%mm1               \n\t"\
        "psrad $" #shift ", %%mm2               \n\t"\
        "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
        "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
        "movq %%mm7, " #dst "                   \n\t"\
        "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
        "movq %%mm2, 24+" #dst "                \n\t"\
        "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
        "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
        "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
        "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
        "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
        "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
        "psubd %%mm4, %%mm0                     \n\t" /* A2-B2          a2-b2 */\
        "psrad $" #shift ", %%mm2               \n\t"\
        "psrad $" #shift ", %%mm0               \n\t"\
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
        "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
        "psubd %%mm3, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
        "psrad $" #shift ", %%mm6               \n\t"\
        "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
        "movq %%mm2, 8+" #dst "                 \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
        "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
        "movq %%mm4, 16+" #dst "                \n\t"\
        "jmp 2f                                 \n\t" /* skip the shortcut */\
        "1:                                     \n\t" /* shortcut: output built from mm0 only */\
        "pslld $16, %%mm0                       \n\t"\
        "paddd "MANGLE(d40000)", %%mm0          \n\t"\
        "psrad $13, %%mm0                       \n\t"\
        "packssdw %%mm0, %%mm0                  \n\t"\
        "movq %%mm0, " #dst "                   \n\t"\
        "movq %%mm0, 8+" #dst "                 \n\t"\
        "movq %%mm0, 16+" #dst "                \n\t"\
        "movq %%mm0, 24+" #dst "                \n\t"\
        "2:                                     \n\t"
555
 
 
556
 
/*
 * Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt)
 *
 * Expands to a run of adjacent string literals forming one fragment of an
 * inline-asm template.
 *
 *   src0/src4/src1/src5  memory operands of the four input quadwords
 *   dst                  memory operand of the first output quadword; the
 *                        fragment also stores to 8+dst, 16+dst and 24+dst
 *   rounder              instruction text pasted in front of ", %%mm4" and
 *                        ", %%mm0"
 *   shift                immediate used for every psrad
 *   bt                   branch target (e.g. 4f) taken when all four input
 *                        quadwords OR together to zero; in that case nothing
 *                        is stored by this fragment
 *
 * When the inputs are not all zero the same computation as the full path of
 * DC_COND_IDCT is performed, using the coefficient quadwords at
 * 16(%2)..104(%2).  No labels are defined here.
 *
 * Clobbers mm0-mm7 and %eax.
 *
 * The last line deliberately carries no trailing backslash so that the line
 * following the macro can never be spliced into it.
 */
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
        "movq %%mm0, %%mm4                      \n\t"\
        "por %%mm1, %%mm4                       \n\t"\
        "por %%mm2, %%mm4                       \n\t"\
        "por %%mm3, %%mm4                       \n\t"\
        "packssdw %%mm4,%%mm4                   \n\t" /* fold both dwords into the low 32 bits */\
        "movd %%mm4, %%eax                      \n\t"\
        "orl %%eax, %%eax                       \n\t" /* set ZF from the folded value */\
        "jz " #bt "                             \n\t" /* whole group is zero */\
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
        #rounder ", %%mm4                       \n\t"\
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
        "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
        "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
        #rounder ", %%mm0                       \n\t"\
        "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
        "paddd %%mm0, %%mm0                     \n\t" /* doubled so the next psubd yields mm0-mm1 */\
        "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
        "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
        "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
        "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
        "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
        "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
        "psrad $" #shift ", %%mm7               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
        "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
        "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
        "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
        "psrad $" #shift ", %%mm1               \n\t"\
        "psrad $" #shift ", %%mm2               \n\t"\
        "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
        "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
        "movq %%mm7, " #dst "                   \n\t"\
        "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
        "movq %%mm2, 24+" #dst "                \n\t"\
        "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
        "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
        "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
        "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
        "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
        "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
        "psubd %%mm4, %%mm0                     \n\t" /* A2-B2          a2-b2 */\
        "psrad $" #shift ", %%mm2               \n\t"\
        "psrad $" #shift ", %%mm0               \n\t"\
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
        "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
        "psubd %%mm3, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
        "psrad $" #shift ", %%mm6               \n\t"\
        "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
        "movq %%mm2, 8+" #dst "                 \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
        "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
        "movq %%mm4, 16+" #dst "                \n\t"
631
 
 
632
 
/*
 * ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Unconditional variant of the row fragment: no zero test, no branches and
 * no labels.  Expands to a run of adjacent string literals forming one
 * fragment of an inline-asm template.
 *
 *   src0/src4/src1/src5  memory operands of the four input quadwords
 *   dst                  memory operand of the first output quadword; the
 *                        fragment also stores to 8+dst, 16+dst and 24+dst
 *   rounder              instruction text pasted in front of ", %%mm4" and
 *                        ", %%mm0"
 *   shift                immediate used for every psrad
 *
 * Coefficients are read from the quadwords at 16(%2)..104(%2).
 * Clobbers mm0-mm7.
 *
 * The last line deliberately carries no trailing backslash so that the line
 * following the macro can never be spliced into it.
 */
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
        #rounder ", %%mm4                       \n\t"\
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
        "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
        "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
        #rounder ", %%mm0                       \n\t"\
        "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
        "paddd %%mm0, %%mm0                     \n\t" /* doubled so the next psubd yields mm0-mm1 */\
        "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
        "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
        "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
        "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
        "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
        "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
        "psrad $" #shift ", %%mm7               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
        "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
        "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
        "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
        "psrad $" #shift ", %%mm1               \n\t"\
        "psrad $" #shift ", %%mm2               \n\t"\
        "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
        "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
        "movq %%mm7, " #dst "                   \n\t"\
        "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
        "movq %%mm2, 24+" #dst "                \n\t"\
        "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
        "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
        "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
        "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
        "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
        "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
        "psubd %%mm4, %%mm0                     \n\t" /* A2-B2          a2-b2 */\
        "psrad $" #shift ", %%mm2               \n\t"\
        "psrad $" #shift ", %%mm0               \n\t"\
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
        "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
        "psubd %%mm3, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
        "psrad $" #shift ", %%mm6               \n\t"\
        "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
        "movq %%mm2, 8+" #dst "                 \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
        "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
        "movq %%mm4, 16+" #dst "                \n\t"
699
 
 
700
 
/*
 * Row stage.  The group at offset 0 of operand %0 goes through DC_COND_IDCT
 * with rounder "paddd 8(%2)".  The groups at offsets 32, 64 and 96 go through
 * Z_COND_IDCT with rounder "paddd (%2)"; each of those branches to the label
 * given as its last argument (4f, 2f and 1f respectively) when its four input
 * quadwords are all zero.  Results go to the same offsets of operand %1 and
 * the shift is 11 throughout.
 */
//IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
701
 
DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
702
 
Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
703
 
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
704
 
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
705
 
 
706
 
/*
 * IDCT(src0, src4, src1, src5, dst, rounder, shift) -- variant that uses all
 * four inputs.
 *
 * Expands to a run of adjacent string literals forming one fragment of an
 * inline-asm template.
 *
 *   src0/src4/src1/src5  memory operands of the four input quadwords
 *   dst                  memory operand of the first output; eight 32-bit
 *                        stores (movd) go to dst, 16+dst, 32+dst, 48+dst,
 *                        64+dst, 80+dst, 96+dst and 112+dst
 *   rounder              text pasted in front of ", %%mm4" and ", %%mm0"
 *   shift                immediate used for every psrad
 *
 * Coefficients are read from the quadwords at 16(%2)..104(%2).
 * Clobbers mm0-mm7; defines no labels and contains no branches.
 *
 * Any previous IDCT definition is dropped first so this variant can be
 * selected for the code that follows.
 */
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
        #rounder ", %%mm4                       \n\t"\
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
        #rounder ", %%mm0                       \n\t"\
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
        "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
        "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
        "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
        "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
        "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
        "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
        "paddd %%mm1, %%mm7                     \n\t" /* B0             b0 */\
        "movq 72(%2), %%mm1                     \n\t" /* -C5    -C1     -C5     -C1 */\
        "pmaddwd %%mm3, %%mm1                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
        "paddd %%mm2, %%mm1                     \n\t" /* B1             b1 */\
        "psrad $" #shift ", %%mm7               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
        "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
        "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
        "psubd %%mm1, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
        "psrad $" #shift ", %%mm0               \n\t"\
        "psrad $" #shift ", %%mm2               \n\t"\
        "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
        "movd %%mm7, " #dst "                   \n\t"\
        "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
        "movd %%mm0, 16+" #dst "                \n\t"\
        "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
        "movd %%mm2, 96+" #dst "                \n\t"\
        "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
        "movd %%mm4, 112+" #dst "               \n\t"\
        "movq " #src1 ", %%mm0                  \n\t" /* R3     R1      r3      r1 */\
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
        "pmaddwd %%mm0, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
        "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
        "pmaddwd 96(%2), %%mm0                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
        "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
        "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
        "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
        "psubd %%mm4, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
        "psrad $" #shift ", %%mm2               \n\t"\
        "psrad $" #shift ", %%mm5               \n\t"\
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
        "paddd %%mm0, %%mm3                     \n\t" /* B3             b3 */\
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
        "psubd %%mm3, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
        "psrad $" #shift ", %%mm6               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
        "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
        "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
        "movd %%mm2, 32+" #dst "                \n\t"\
        "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
        "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
        "movd %%mm6, 48+" #dst "                \n\t"\
        "movd %%mm4, 64+" #dst "                \n\t"\
        "movd %%mm5, 80+" #dst "                \n\t"
782
 
 
783
 
 
784
 
/*
 * Column stage for the fall-through path, using the IDCT definition
 * immediately preceding these lines.  There are four invocations, one per
 * 8-byte column group of operand %1 (sources at +0/+64/+32/+96 in
 * src0, src4, src1, src5 order).  Each stores to operand %0 at 4-byte steps
 * with shift 20 and rounder "/nop".  Control then jumps forward to label 9.
 * NOTE(review): "/nop" presumably turns the stringified rounder lines into
 * assembler comments -- confirm for the assembler in use.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
785
 
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
786
 
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
787
 
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
788
 
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
789
 
        "jmp 9f                                 \n\t"
790
 
 
791
 
/*
 * Label 4 is the target of the "jz 4f" issued by the Z_COND_IDCT for the
 * group at 32(%0).  The groups at 64 and 96 are still tested here and branch
 * to 6f / 5f respectively when they are all zero.
 */
        "#.balign 16                            \n\t"\
792
 
        "4:                                     \n\t"
793
 
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
794
 
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
795
 
 
796
 
/*
 * IDCT(src0, src4, src1, src5, dst, rounder, shift) -- variant that never
 * reads src1.
 *
 * Expands to a run of adjacent string literals forming one fragment of an
 * inline-asm template.
 *
 *   src0/src4/src5  memory operands of the three input quadwords loaded
 *   src1            kept so the signature matches the other IDCT variants;
 *                   it is not referenced in the expansion
 *   dst             memory operand of the first output; eight 32-bit stores
 *                   (movd) go to dst, 16+dst, 32+dst, 48+dst, 64+dst,
 *                   80+dst, 96+dst and 112+dst
 *   rounder         text pasted in front of ", %%mm4" and ", %%mm0"
 *   shift           immediate used for every psrad
 *
 * Only the coefficient quadwords at 16, 24, 32, 40, 56, 72, 88 and 104(%2)
 * are used, so the B terms are formed from src5 alone.
 * Clobbers mm0-mm7; defines no labels and contains no branches.
 *
 * Any previous IDCT definition is dropped first so this variant can be
 * selected for the code that follows.
 */
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
        #rounder ", %%mm4                       \n\t"\
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
        #rounder ", %%mm0                       \n\t"\
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
        "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
        "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
        "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
        "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
        "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
        "movq 72(%2), %%mm7                     \n\t" /* -C5    -C1     -C5     -C1 */\
        "pmaddwd %%mm3, %%mm7                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
        "paddd %%mm4, %%mm1                     \n\t" /* A0+B0          a0+b0 */\
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
        "psubd %%mm1, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
        "psrad $" #shift ", %%mm1               \n\t"\
        "psrad $" #shift ", %%mm4               \n\t"\
        "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
        "paddd %%mm7, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
        "psubd %%mm7, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
        "psrad $" #shift ", %%mm0               \n\t"\
        "psrad $" #shift ", %%mm2               \n\t"\
        "packssdw %%mm1, %%mm1                  \n\t" /* A0+B0  a0+b0 */\
        "movd %%mm1, " #dst "                   \n\t"\
        "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
        "movd %%mm0, 16+" #dst "                \n\t"\
        "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
        "movd %%mm2, 96+" #dst "                \n\t"\
        "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
        "movd %%mm4, 112+" #dst "               \n\t"\
        "movq 88(%2), %%mm1                     \n\t" /* C3     C7      C3      C7 */\
        "pmaddwd %%mm3, %%mm1                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
        "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
        "paddd %%mm1, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
        "psubd %%mm1, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
        "psrad $" #shift ", %%mm2               \n\t"\
        "psrad $" #shift ", %%mm5               \n\t"\
        "movq %%mm6, %%mm1                      \n\t" /* A3             a3 */\
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
        "psubd %%mm3, %%mm1                     \n\t" /* A3-B3          a3-b3 */\
        "psrad $" #shift ", %%mm6               \n\t"\
        "psrad $" #shift ", %%mm1               \n\t"\
        "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
        "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
        "movd %%mm2, 32+" #dst "                \n\t"\
        "packssdw %%mm1, %%mm1                  \n\t" /* A3-B3  a3-b3 */\
        "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
        "movd %%mm6, 48+" #dst "                \n\t"\
        "movd %%mm1, 64+" #dst "                \n\t"\
        "movd %%mm5, 80+" #dst "                \n\t"
860
 
 
861
 
/*
 * Column stage for the path entered at label 4, using the IDCT definition
 * immediately preceding these lines (the one that never reads its src1
 * argument).  There are four invocations, one per 8-byte column group of
 * operand %1.  Each stores to operand %0 at 4-byte steps with shift 20 and
 * rounder "/nop".  Control then jumps forward to label 9.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
862
 
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
863
 
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
864
 
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
865
 
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
866
 
        "jmp 9f                                 \n\t"
867
 
 
868
 
/*
 * Label 6 is the target of the "jz 6f" issued by the Z_COND_IDCT for the
 * group at 64(%0) on the label-4 path.  Only the group at 96 is left to
 * test; it branches to 7f when all zero.
 */
        "#.balign 16                            \n\t"\
869
 
        "6:                                     \n\t"
870
 
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
871
 
 
872
 
#undef IDCT
873
 
/*
 * IDCT variant defined for the code path that follows label 6.
 *
 * Only src0 and src5 are loaded by this expansion; the src4 and src1
 * arguments are never referenced (presumably they are kept so that every
 * IDCT variant shares one argument list - confirm).
 *
 * Constants are read from fixed offsets of operand %2: 16 and 24 for the
 * C4 pairs applied to src0, and 56, 72, 88 and 104 for the pairs applied
 * to src5. rounder is pasted as the leading part of two instructions whose
 * destination operands are mm4 and mm0, and shift is the immediate used by
 * every psrad. Each result is packed with packssdw and written with movd
 * (32 bits) to dst + 0, 16, 32, 48, 64, 80, 96 and 112.
 *
 * NOTE(review): the invocations of this macro pass /nop as rounder, which
 * presumably turns the two rounder lines into assembler comments - confirm.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
874
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
875
 
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
876
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
877
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
878
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
879
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
880
 
        #rounder ", %%mm4                       \n\t"\
881
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
882
 
        #rounder ", %%mm0                       \n\t"\
883
 
        "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
884
 
        "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
885
 
        "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
886
 
        "movq 72(%2), %%mm7                     \n\t" /* -C5    -C1     -C5     -C1 */\
887
 
        "pmaddwd %%mm3, %%mm7                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
888
 
        "paddd %%mm4, %%mm1                     \n\t" /* A0+B0          a0+b0 */\
889
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
890
 
        "psubd %%mm1, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
891
 
        "psrad $" #shift ", %%mm1               \n\t"\
892
 
        "psrad $" #shift ", %%mm4               \n\t"\
893
 
        "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
894
 
        "paddd %%mm7, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
895
 
        "psubd %%mm7, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
896
 
        "psrad $" #shift ", %%mm0               \n\t"\
897
 
        "psrad $" #shift ", %%mm2               \n\t"\
898
 
        "packssdw %%mm1, %%mm1                  \n\t" /* A0+B0  a0+b0 */\
899
 
        "movd %%mm1, " #dst "                   \n\t"\
900
 
        "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
901
 
        "movd %%mm0, 16+" #dst "                \n\t"\
902
 
        "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
903
 
        "movd %%mm2, 96+" #dst "                \n\t"\
904
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
905
 
        "movd %%mm4, 112+" #dst "               \n\t"\
906
 
        "movq 88(%2), %%mm1                     \n\t" /* C3     C7      C3      C7 */\
907
 
        "pmaddwd %%mm3, %%mm1                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
908
 
        "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
909
 
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
910
 
        "paddd %%mm1, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
911
 
        "psubd %%mm1, %%mm5                     \n\t" /* a2-B2          a2-b2 */\
912
 
        "psrad $" #shift ", %%mm2               \n\t"\
913
 
        "psrad $" #shift ", %%mm5               \n\t"\
914
 
        "movq %%mm6, %%mm1                      \n\t" /* A3             a3 */\
915
 
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
916
 
        "psubd %%mm3, %%mm1                     \n\t" /* a3-B3          a3-b3 */\
917
 
        "psrad $" #shift ", %%mm6               \n\t"\
918
 
        "psrad $" #shift ", %%mm1               \n\t"\
919
 
        "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
920
 
        "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
921
 
        "movd %%mm2, 32+" #dst "                \n\t"\
922
 
        "packssdw %%mm1, %%mm1                  \n\t" /* A3-B3  a3-b3 */\
923
 
        "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
924
 
        "movd %%mm6, 48+" #dst "                \n\t"\
925
 
        "movd %%mm1, 64+" #dst "                \n\t"\
926
 
        "movd %%mm5, 80+" #dst "                \n\t"   
927
 
 
928
 
 
929
 
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
930
 
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
931
 
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
932
 
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
933
 
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
934
 
        "jmp 9f                                 \n\t"
935
 
 
936
 
        "#.balign 16                            \n\t"\
937
 
        "2:                                     \n\t"
938
 
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
939
 
 
940
 
#undef IDCT
941
 
/*
 * IDCT variant defined for the code path that follows label 2.
 *
 * Loads src0, src1 and src5; the src4 argument is never referenced.
 * src1 is loaded into mm2 at the start and loaded again (into mm0) for the
 * second half, after mm2 has been reused as a temporary.
 *
 * Constants come from operand %2: offsets 16 and 24 are multiplied with
 * src0, offsets 48, 64, 80 and 96 with src1, and offsets 56, 72, 88 and
 * 104 with src5. The src1 and src5 products are summed into the B terms
 * before being added to / subtracted from the A terms. rounder is pasted
 * as the leading part of two instructions whose destination operands are
 * mm4 and mm0; shift is the psrad immediate. Results are packed with
 * packssdw and written with movd (32 bits) to dst + 0, 16, 32, 48, 64, 80,
 * 96 and 112.
 *
 * NOTE(review): the invocations of this macro pass /nop as rounder, which
 * presumably turns the two rounder lines into assembler comments - confirm.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
942
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
943
 
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
944
 
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
945
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
946
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
947
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
948
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
949
 
        #rounder ", %%mm4                       \n\t"\
950
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
951
 
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
952
 
        #rounder ", %%mm0                       \n\t"\
953
 
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
954
 
        "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
955
 
        "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
956
 
        "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
957
 
        "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
958
 
        "paddd %%mm1, %%mm7                     \n\t" /* B0             b0 */\
959
 
        "movq 72(%2), %%mm1                     \n\t" /* -C5    -C1     -C5     -C1 */\
960
 
        "pmaddwd %%mm3, %%mm1                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
961
 
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
962
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
963
 
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
964
 
        "paddd %%mm2, %%mm1                     \n\t" /* B1             b1 */\
965
 
        "psrad $" #shift ", %%mm7               \n\t"\
966
 
        "psrad $" #shift ", %%mm4               \n\t"\
967
 
        "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
968
 
        "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
969
 
        "psubd %%mm1, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
970
 
        "psrad $" #shift ", %%mm0               \n\t"\
971
 
        "psrad $" #shift ", %%mm2               \n\t"\
972
 
        "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
973
 
        "movd %%mm7, " #dst "                   \n\t"\
974
 
        "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
975
 
        "movd %%mm0, 16+" #dst "                \n\t"\
976
 
        "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
977
 
        "movd %%mm2, 96+" #dst "                \n\t"\
978
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
979
 
        "movd %%mm4, 112+" #dst "               \n\t"\
980
 
        "movq " #src1 ", %%mm0                  \n\t" /* R3     R1      r3      r1 */\
981
 
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
982
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
983
 
        "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
984
 
        "pmaddwd 96(%2), %%mm0                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
985
 
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
986
 
        "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
987
 
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
988
 
        "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
989
 
        "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
990
 
        "psubd %%mm4, %%mm5                     \n\t" /* a2-B2          a2-b2 */\
991
 
        "psrad $" #shift ", %%mm2               \n\t"\
992
 
        "psrad $" #shift ", %%mm5               \n\t"\
993
 
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
994
 
        "paddd %%mm0, %%mm3                     \n\t" /* B3             b3 */\
995
 
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
996
 
        "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
997
 
        "psrad $" #shift ", %%mm6               \n\t"\
998
 
        "psrad $" #shift ", %%mm4               \n\t"\
999
 
        "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
1000
 
        "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
1001
 
        "movd %%mm2, 32+" #dst "                \n\t"\
1002
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
1003
 
        "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
1004
 
        "movd %%mm6, 48+" #dst "                \n\t"\
1005
 
        "movd %%mm4, 64+" #dst "                \n\t"\
1006
 
        "movd %%mm5, 80+" #dst "                \n\t"
1007
 
 
1008
 
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1009
 
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1010
 
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1011
 
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1012
 
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1013
 
        "jmp 9f                                 \n\t"
1014
 
 
1015
 
        "#.balign 16                            \n\t"\
1016
 
        "3:                                     \n\t"
1017
 
#undef IDCT
1018
 
/*
 * IDCT variant defined for the code path that follows label 3.
 *
 * Loads only src0 and src1; the src4 and src5 arguments are never
 * referenced. src1 stays in mm2 for the whole expansion and is multiplied
 * in turn with the constants at offsets 48, 64, 80 and 96 of operand %2;
 * src0 is multiplied with the constants at offsets 16 and 24.
 *
 * rounder is pasted as the leading part of two instructions whose
 * destination operands are mm4 and mm0; shift is the psrad immediate.
 * Results are packed with packssdw and written with movd (32 bits) to
 * dst + 0, 16, 32, 48, 64, 80, 96 and 112.
 *
 * NOTE(review): the invocations of this macro pass /nop as rounder, which
 * presumably turns the two rounder lines into assembler comments - confirm.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1019
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
1020
 
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
1021
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
1022
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1023
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
1024
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1025
 
        #rounder ", %%mm4                       \n\t"\
1026
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1027
 
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
1028
 
        #rounder ", %%mm0                       \n\t"\
1029
 
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1030
 
        "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1031
 
        "movq 64(%2), %%mm3                     \n\t"\
1032
 
        "pmaddwd %%mm2, %%mm3                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1033
 
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
1034
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
1035
 
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
1036
 
        "psrad $" #shift ", %%mm7               \n\t"\
1037
 
        "psrad $" #shift ", %%mm4               \n\t"\
1038
 
        "movq %%mm0, %%mm1                      \n\t" /* A1             a1 */\
1039
 
        "paddd %%mm3, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
1040
 
        "psubd %%mm3, %%mm1                     \n\t" /* A1-B1          a1-b1 */\
1041
 
        "psrad $" #shift ", %%mm0               \n\t"\
1042
 
        "psrad $" #shift ", %%mm1               \n\t"\
1043
 
        "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
1044
 
        "movd %%mm7, " #dst "                   \n\t"\
1045
 
        "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
1046
 
        "movd %%mm0, 16+" #dst "                \n\t"\
1047
 
        "packssdw %%mm1, %%mm1                  \n\t" /* A1-B1  a1-b1 */\
1048
 
        "movd %%mm1, 96+" #dst "                \n\t"\
1049
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
1050
 
        "movd %%mm4, 112+" #dst "               \n\t"\
1051
 
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
1052
 
        "pmaddwd %%mm2, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1053
 
        "pmaddwd 96(%2), %%mm2                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1054
 
        "movq %%mm5, %%mm1                      \n\t" /* A2             a2 */\
1055
 
        "paddd %%mm4, %%mm1                     \n\t" /* A2+B2          a2+b2 */\
1056
 
        "psubd %%mm4, %%mm5                     \n\t" /* a2-B2          a2-b2 */\
1057
 
        "psrad $" #shift ", %%mm1               \n\t"\
1058
 
        "psrad $" #shift ", %%mm5               \n\t"\
1059
 
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
1060
 
        "paddd %%mm2, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
1061
 
        "psubd %%mm2, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
1062
 
        "psrad $" #shift ", %%mm6               \n\t"\
1063
 
        "psrad $" #shift ", %%mm4               \n\t"\
1064
 
        "packssdw %%mm1, %%mm1                  \n\t" /* A2+B2  a2+b2 */\
1065
 
        "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
1066
 
        "movd %%mm1, 32+" #dst "                \n\t"\
1067
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
1068
 
        "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
1069
 
        "movd %%mm6, 48+" #dst "                \n\t"\
1070
 
        "movd %%mm4, 64+" #dst "                \n\t"\
1071
 
        "movd %%mm5, 80+" #dst "                \n\t"
1072
 
 
1073
 
 
1074
 
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1075
 
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1076
 
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1077
 
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1078
 
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1079
 
        "jmp 9f                                 \n\t"
1080
 
 
1081
 
        "#.balign 16                            \n\t"\
1082
 
        "5:                                     \n\t"
1083
 
#undef IDCT
1084
 
/*
 * IDCT variant defined for the code path that follows label 5.
 *
 * Loads src0 and src4 and also the quadwords 8 bytes after each of them
 * (8+src0, 8+src4); the src1 and src5 arguments are never referenced.
 * Because one expansion consumes two adjacent quadwords per input, only
 * the invocations with offsets 0 and 16 are active and the ones with
 * offsets 8 and 24 are commented out.
 *
 * Constants come from offsets 16, 24, 32 and 40 of operand %2. rounder is
 * pasted as the leading part of four instructions whose destination
 * operands are mm4, mm0, mm1 and mm2; shift is the psrad immediate.
 *
 * The two halves are combined with packssdw and written with movq
 * (64 bits). Each packed value is stored twice: mm4 to dst and 112+dst,
 * mm0 to 16+dst and 96+dst, mm5 to 32+dst and 80+dst, mm6 to 48+dst and
 * 64+dst.
 *
 * NOTE(review): the invocations of this macro pass /nop as rounder, which
 * presumably turns the rounder lines into assembler comments - confirm.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1085
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
1086
 
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
1087
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
1088
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1089
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
1090
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1091
 
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
1092
 
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1093
 
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
1094
 
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1095
 
        #rounder ", %%mm4                       \n\t"\
1096
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1097
 
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
1098
 
        #rounder ", %%mm0                       \n\t"\
1099
 
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
1100
 
        "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1101
 
        "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
1102
 
        "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
1103
 
        "movq 8+" #src0 ", %%mm2                \n\t" /* R4     R0      r4      r0 */\
1104
 
        "movq 8+" #src4 ", %%mm3                \n\t" /* R6     R2      r6      r2 */\
1105
 
        "movq 16(%2), %%mm1                     \n\t" /* C4     C4      C4      C4 */\
1106
 
        "pmaddwd %%mm2, %%mm1                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1107
 
        "movq 24(%2), %%mm7                     \n\t" /* -C4    C4      -C4     C4 */\
1108
 
        "pmaddwd %%mm7, %%mm2                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1109
 
        "movq 32(%2), %%mm7                     \n\t" /* C6     C2      C6      C2 */\
1110
 
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1111
 
        "pmaddwd 40(%2), %%mm3                  \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1112
 
        #rounder ", %%mm1                       \n\t"\
1113
 
        "paddd %%mm1, %%mm7                     \n\t" /* A0             a0 */\
1114
 
        "paddd %%mm1, %%mm1                     \n\t" /* 2C0            2c0 */\
1115
 
        #rounder ", %%mm2                       \n\t"\
1116
 
        "psubd %%mm7, %%mm1                     \n\t" /* A3             a3 */\
1117
 
        "paddd %%mm2, %%mm3                     \n\t" /* A1             a1 */\
1118
 
        "paddd %%mm2, %%mm2                     \n\t" /* 2C1            2c1 */\
1119
 
        "psubd %%mm3, %%mm2                     \n\t" /* A2             a2 */\
1120
 
        "psrad $" #shift ", %%mm4               \n\t"\
1121
 
        "psrad $" #shift ", %%mm7               \n\t"\
1122
 
        "psrad $" #shift ", %%mm3               \n\t"\
1123
 
        "packssdw %%mm7, %%mm4                  \n\t" /* A0     a0 */\
1124
 
        "movq %%mm4, " #dst "                   \n\t"\
1125
 
        "psrad $" #shift ", %%mm0               \n\t"\
1126
 
        "packssdw %%mm3, %%mm0                  \n\t" /* A1     a1 */\
1127
 
        "movq %%mm0, 16+" #dst "                \n\t"\
1128
 
        "movq %%mm0, 96+" #dst "                \n\t"\
1129
 
        "movq %%mm4, 112+" #dst "               \n\t"\
1130
 
        "psrad $" #shift ", %%mm5               \n\t"\
1131
 
        "psrad $" #shift ", %%mm6               \n\t"\
1132
 
        "psrad $" #shift ", %%mm2               \n\t"\
1133
 
        "packssdw %%mm2, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
1134
 
        "movq %%mm5, 32+" #dst "                \n\t"\
1135
 
        "psrad $" #shift ", %%mm1               \n\t"\
1136
 
        "packssdw %%mm1, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
1137
 
        "movq %%mm6, 48+" #dst "                \n\t"\
1138
 
        "movq %%mm6, 64+" #dst "                \n\t"\
1139
 
        "movq %%mm5, 80+" #dst "                \n\t"   
1140
 
        
1141
 
 
1142
 
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1143
 
IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1144
 
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1145
 
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1146
 
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1147
 
        "jmp 9f                                 \n\t"
1148
 
 
1149
 
 
1150
 
        "#.balign 16                            \n\t"\
1151
 
        "1:                                     \n\t"
1152
 
#undef IDCT
1153
 
/*
 * IDCT variant defined for the code path that follows label 1.
 *
 * Loads src0, src4 and src1; the src5 argument is never referenced.
 * src0 and src4 are multiplied with the constants at offsets 16, 24, 32
 * and 40 of operand %2 to form the A terms; src1 stays in mm2 and is
 * multiplied with the constants at offsets 48, 64, 80 and 96 to form the
 * B terms.
 *
 * rounder is pasted as the leading part of two instructions whose
 * destination operands are mm4 and mm0; shift is the psrad immediate.
 * Results are packed with packssdw and written with movd (32 bits) to
 * dst + 0, 16, 32, 48, 64, 80, 96 and 112.
 *
 * NOTE(review): the invocations of this macro pass /nop as rounder, which
 * presumably turns the two rounder lines into assembler comments - confirm.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1154
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
1155
 
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
1156
 
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
1157
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
1158
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1159
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
1160
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1161
 
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
1162
 
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1163
 
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
1164
 
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1165
 
        #rounder ", %%mm4                       \n\t"\
1166
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1167
 
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
1168
 
        #rounder ", %%mm0                       \n\t"\
1169
 
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1170
 
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
1171
 
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
1172
 
        "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1173
 
        "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
1174
 
        "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
1175
 
        "movq 64(%2), %%mm1                     \n\t"\
1176
 
        "pmaddwd %%mm2, %%mm1                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1177
 
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
1178
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
1179
 
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
1180
 
        "psrad $" #shift ", %%mm7               \n\t"\
1181
 
        "psrad $" #shift ", %%mm4               \n\t"\
1182
 
        "movq %%mm0, %%mm3                      \n\t" /* A1             a1 */\
1183
 
        "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
1184
 
        "psubd %%mm1, %%mm3                     \n\t" /* A1-B1          a1-b1 */\
1185
 
        "psrad $" #shift ", %%mm0               \n\t"\
1186
 
        "psrad $" #shift ", %%mm3               \n\t"\
1187
 
        "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
1188
 
        "movd %%mm7, " #dst "                   \n\t"\
1189
 
        "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
1190
 
        "movd %%mm0, 16+" #dst "                \n\t"\
1191
 
        "packssdw %%mm3, %%mm3                  \n\t" /* A1-B1  a1-b1 */\
1192
 
        "movd %%mm3, 96+" #dst "                \n\t"\
1193
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
1194
 
        "movd %%mm4, 112+" #dst "               \n\t"\
1195
 
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
1196
 
        "pmaddwd %%mm2, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1197
 
        "pmaddwd 96(%2), %%mm2                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1198
 
        "movq %%mm5, %%mm3                      \n\t" /* A2             a2 */\
1199
 
        "paddd %%mm4, %%mm3                     \n\t" /* A2+B2          a2+b2 */\
1200
 
        "psubd %%mm4, %%mm5                     \n\t" /* a2-B2          a2-b2 */\
1201
 
        "psrad $" #shift ", %%mm3               \n\t"\
1202
 
        "psrad $" #shift ", %%mm5               \n\t"\
1203
 
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
1204
 
        "paddd %%mm2, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
1205
 
        "psubd %%mm2, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
1206
 
        "psrad $" #shift ", %%mm6               \n\t"\
1207
 
        "packssdw %%mm3, %%mm3                  \n\t" /* A2+B2  a2+b2 */\
1208
 
        "movd %%mm3, 32+" #dst "                \n\t"\
1209
 
        "psrad $" #shift ", %%mm4               \n\t"\
1210
 
        "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
1211
 
        "movd %%mm6, 48+" #dst "                \n\t"\
1212
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
1213
 
        "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
1214
 
        "movd %%mm4, 64+" #dst "                \n\t"\
1215
 
        "movd %%mm5, 80+" #dst "                \n\t"
1216
 
        
1217
 
 
1218
 
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1219
 
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1220
 
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1221
 
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1222
 
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1223
 
        "jmp 9f                                 \n\t"
1224
 
 
1225
 
 
1226
 
        "#.balign 16                            \n\t"
1227
 
        "7:                                     \n\t"
1228
 
#undef IDCT
1229
 
/*
 * IDCT variant defined for the code path that follows label 7.
 *
 * Loads only src0 and the quadword 8 bytes after it (8+src0); the src4,
 * src1 and src5 arguments are never referenced. Because one expansion
 * consumes two adjacent quadwords, only the invocations with offsets 0
 * and 16 are active and the ones with offsets 8 and 24 are commented out.
 *
 * Both quadwords are multiplied with the constants at offsets 16 and 24
 * of operand %2. rounder is pasted as the leading part of four
 * instructions whose destination operands are mm4, mm0, mm1 and mm2;
 * shift is the psrad immediate. The two halves are combined with packssdw
 * and written with movq (64 bits): mm4 goes to dst + 0, 48, 64 and 112,
 * and mm0 goes to dst + 16, 32, 80 and 96.
 *
 * NOTE(review): mm7 is loaded from 32(%2) but is not read again inside
 * this macro, so that load looks unused - confirm before removing.
 * NOTE(review): the invocations of this macro pass /nop as rounder, which
 * presumably turns the rounder lines into assembler comments - confirm.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1230
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
1231
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
1232
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1233
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
1234
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1235
 
        #rounder ", %%mm4                       \n\t"\
1236
 
        #rounder ", %%mm0                       \n\t"\
1237
 
        "psrad $" #shift ", %%mm4               \n\t"\
1238
 
        "psrad $" #shift ", %%mm0               \n\t"\
1239
 
        "movq 8+" #src0 ", %%mm2                \n\t" /* R4     R0      r4      r0 */\
1240
 
        "movq 16(%2), %%mm1                     \n\t" /* C4     C4      C4      C4 */\
1241
 
        "pmaddwd %%mm2, %%mm1                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1242
 
        "movq 24(%2), %%mm7                     \n\t" /* -C4    C4      -C4     C4 */\
1243
 
        "pmaddwd %%mm7, %%mm2                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1244
 
        "movq 32(%2), %%mm7                     \n\t" /* C6     C2      C6      C2 */\
1245
 
        #rounder ", %%mm1                       \n\t"\
1246
 
        #rounder ", %%mm2                       \n\t"\
1247
 
        "psrad $" #shift ", %%mm1               \n\t"\
1248
 
        "packssdw %%mm1, %%mm4                  \n\t" /* A0     a0 */\
1249
 
        "movq %%mm4, " #dst "                   \n\t"\
1250
 
        "psrad $" #shift ", %%mm2               \n\t"\
1251
 
        "packssdw %%mm2, %%mm0                  \n\t" /* A1     a1 */\
1252
 
        "movq %%mm0, 16+" #dst "                \n\t"\
1253
 
        "movq %%mm0, 96+" #dst "                \n\t"\
1254
 
        "movq %%mm4, 112+" #dst "               \n\t"\
1255
 
        "movq %%mm0, 32+" #dst "                \n\t"\
1256
 
        "movq %%mm4, 48+" #dst "                \n\t"\
1257
 
        "movq %%mm4, 64+" #dst "                \n\t"\
1258
 
        "movq %%mm0, 80+" #dst "                \n\t"   
1259
 
 
1260
 
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1261
 
IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1262
 
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1263
 
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1264
 
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1265
 
 
1266
 
 
1267
 
#endif
1268
 
 
1269
 
/*
1270
 
Input
1271
 
 00 40 04 44 20 60 24 64
1272
 
 10 30 14 34 50 70 54 74
1273
 
 01 41 03 43 21 61 23 63
1274
 
 11 31 13 33 51 71 53 73
1275
 
 02 42 06 46 22 62 26 66
1276
 
 12 32 16 36 52 72 56 76
1277
 
 05 45 07 47 25 65 27 67
1278
 
 15 35 17 37 55 75 57 77
1279
 
  
1280
 
Temp
1281
 
 00 04 10 14 20 24 30 34
1282
 
 40 44 50 54 60 64 70 74
1283
 
 01 03 11 13 21 23 31 33
1284
 
 41 43 51 53 61 63 71 73
1285
 
 02 06 12 16 22 26 32 36
1286
 
 42 46 52 56 62 66 72 76
1287
 
 05 07 15 17 25 27 35 37
1288
 
 45 47 55 57 65 67 75 77
1289
 
*/
1290
 
 
1291
 
"9: \n\t"
1292
 
                :: "r" (block), "r" (temp), "r" (coeffs)
1293
 
                : "%eax"
1294
 
        );
1295
 
}
1296
 
 
1297
 
/**
 * Public entry point that runs the MMX idct routine of this file on block.
 *
 * @param block coefficient buffer handed straight to idct(); the asm in
 *              idct() is given block as an operand, so the buffer is
 *              presumably transformed in place - confirm against idct().
 */
void ff_simple_idct_a64_mmx(int16_t *block)
1298
 
{
1299
 
    idct(block);
1300
 
}
1301
 
 
1302
 
//FIXME merge add/put into the idct
1303
 
 
1304
 
/**
 * Runs idct() on block and then passes the result to
 * put_pixels_clamped_a64_mmx() together with dest and line_size.
 *
 * @param dest      destination pixel buffer forwarded to the put helper
 * @param line_size forwarded unchanged to the put helper; presumably the
 *                  distance between destination rows in bytes - confirm
 * @param block     coefficient buffer; processed by idct() before the
 *                  put helper reads it
 */
void ff_simple_idct_put_a64_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1305
 
{
1306
 
    idct(block);
1307
 
    put_pixels_clamped_a64_mmx(block, dest, line_size);
1308
 
}
1309
 
/**
 * Runs idct() on block and then passes the result to
 * add_pixels_clamped_a64_mmx() together with dest and line_size.
 *
 * @param dest      destination pixel buffer forwarded to the add helper
 * @param line_size forwarded unchanged to the add helper; presumably the
 *                  distance between destination rows in bytes - confirm
 * @param block     coefficient buffer; processed by idct() before the
 *                  add helper reads it
 */
void ff_simple_idct_add_a64_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1310
 
{
1311
 
    idct(block);
1312
 
    add_pixels_clamped_a64_mmx(block, dest, line_size);
1313
 
}