~ubuntu-branches/ubuntu/jaunty/xvidcap/jaunty-proposed

« back to all changes in this revision

Viewing changes to ffmpeg/libavcodec/i386/simple_idct_mmx.c

  • Committer: Bazaar Package Importer
  • Author(s): John Dong
  • Date: 2008-02-25 15:47:12 UTC
  • mfrom: (1.1.1 upstream)
  • Revision ID: james.westby@ubuntu.com-20080225154712-qvr11ekcea4c9ry8
Tags: 1.1.6-0.1ubuntu1
* Merge from debian-multimedia (LP: #120003), Ubuntu Changes:
 - For ffmpeg-related build-deps, remove cvs from package names.
 - Standards-Version 3.7.3
 - Maintainer Spec

Show diffs side-by-side

added added

removed removed

Lines of Context:
3
3
 *
4
4
 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5
5
 *
6
 
 * This library is free software; you can redistribute it and/or
 
6
 * This file is part of FFmpeg.
 
7
 *
 
8
 * FFmpeg is free software; you can redistribute it and/or
7
9
 * modify it under the terms of the GNU Lesser General Public
8
10
 * License as published by the Free Software Foundation; either
9
 
 * version 2 of the License, or (at your option) any later version.
 
11
 * version 2.1 of the License, or (at your option) any later version.
10
12
 *
11
 
 * This library is distributed in the hope that it will be useful,
 
13
 * FFmpeg is distributed in the hope that it will be useful,
12
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
16
 * Lesser General Public License for more details.
15
17
 *
16
18
 * You should have received a copy of the GNU Lesser General Public
17
 
 * License along with this library; if not, write to the Free Software
18
 
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
19
 * License along with FFmpeg; if not, write to the Free Software
 
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
21
 */
20
22
#include "../dsputil.h"
 
23
#include "../simple_idct.h"
21
24
 
22
25
/*
23
26
23170.475006
39
42
#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
40
43
#endif
41
44
#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
42
 
#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
43
 
#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
45
#define C6 8867  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
46
#define C7 4520  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
44
47
 
45
48
#define ROW_SHIFT 11
46
49
#define COL_SHIFT 20 // 6
47
50
 
48
 
static const uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
49
 
static const uint64_t __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
 
51
static const uint64_t attribute_used __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
 
52
static const uint64_t attribute_used __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
50
53
 
51
54
static const int16_t __attribute__((aligned(8))) coeffs[]= {
52
 
        1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
53
 
//      1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
54
 
//      0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
55
 
        1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
56
 
        // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
57
 
//      0, 0, 0, 0,
58
 
//      0, 0, 0, 0,
 
55
        1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
 
56
//        1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
 
57
//        0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
 
58
        1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
 
59
        // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
 
60
//        0, 0, 0, 0,
 
61
//        0, 0, 0, 0,
59
62
 
60
63
 C4,  C4,  C4,  C4,
61
64
 C4, -C4,  C4, -C4,
62
 
 
 
65
 
63
66
 C2,  C6,  C2,  C6,
64
67
 C6, -C2,  C6, -C2,
65
 
 
 
68
 
66
69
 C1,  C3,  C1,  C3,
67
70
 C5,  C7,  C5,  C7,
68
 
 
 
71
 
69
72
 C3, -C7,  C3, -C7,
70
73
-C1, -C5, -C1, -C5,
71
 
 
 
74
 
72
75
 C5, -C1,  C5, -C1,
73
76
 C7,  C3,  C7,  C3,
74
 
 
 
77
 
75
78
 C7, -C5,  C7, -C5,
76
79
 C3, -C1,  C3, -C1
77
80
};
78
81
 
79
82
#if 0
80
83
static void unused_var_killer(){
81
 
        int a= wm1010 + d40000;
82
 
        temp[0]=a;
 
84
        int a= wm1010 + d40000;
 
85
        temp[0]=a;
83
86
}
84
87
 
85
88
static void inline idctCol (int16_t * col, int16_t *input)
92
95
#undef C5
93
96
#undef C6
94
97
#undef C7
95
 
        int a0, a1, a2, a3, b0, b1, b2, b3;
96
 
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
97
 
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
98
 
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
99
 
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
100
 
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
101
 
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
102
 
        const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103
 
        const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
98
        int a0, a1, a2, a3, b0, b1, b2, b3;
 
99
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
100
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
101
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
102
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
103
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
104
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
105
        const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
106
        const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104
107
/*
105
 
        if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
106
 
                col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
107
 
                        col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
108
 
                return;
109
 
        }*/
 
108
        if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
 
109
                col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
 
110
                        col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
 
111
                return;
 
112
        }*/
110
113
 
111
114
col[8*0] = input[8*0 + 0];
112
115
col[8*1] = input[8*2 + 0];
117
120
col[8*6] = input[8*4 + 1];
118
121
col[8*7] = input[8*6 + 1];
119
122
 
120
 
        a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
121
 
        a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
122
 
        a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
123
 
        a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
124
 
 
125
 
        b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
126
 
        b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
127
 
        b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
128
 
        b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
129
 
 
130
 
        col[8*0] = (a0 + b0) >> COL_SHIFT;
131
 
        col[8*1] = (a1 + b1) >> COL_SHIFT;
132
 
        col[8*2] = (a2 + b2) >> COL_SHIFT;
133
 
        col[8*3] = (a3 + b3) >> COL_SHIFT;
134
 
        col[8*4] = (a3 - b3) >> COL_SHIFT;
135
 
        col[8*5] = (a2 - b2) >> COL_SHIFT;
136
 
        col[8*6] = (a1 - b1) >> COL_SHIFT;
137
 
        col[8*7] = (a0 - b0) >> COL_SHIFT;
 
123
        a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
 
124
        a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
 
125
        a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
 
126
        a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
 
127
 
 
128
        b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
 
129
        b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
 
130
        b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
 
131
        b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
 
132
 
 
133
        col[8*0] = (a0 + b0) >> COL_SHIFT;
 
134
        col[8*1] = (a1 + b1) >> COL_SHIFT;
 
135
        col[8*2] = (a2 + b2) >> COL_SHIFT;
 
136
        col[8*3] = (a3 + b3) >> COL_SHIFT;
 
137
        col[8*4] = (a3 - b3) >> COL_SHIFT;
 
138
        col[8*5] = (a2 - b2) >> COL_SHIFT;
 
139
        col[8*6] = (a1 - b1) >> COL_SHIFT;
 
140
        col[8*7] = (a0 - b0) >> COL_SHIFT;
138
141
}
139
142
 
140
143
static void inline idctRow (int16_t * output, int16_t * input)
141
144
{
142
 
        int16_t row[8];
 
145
        int16_t row[8];
143
146
 
144
 
        int a0, a1, a2, a3, b0, b1, b2, b3;
145
 
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
146
 
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
147
 
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
148
 
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
149
 
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
150
 
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151
 
        const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152
 
        const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
147
        int a0, a1, a2, a3, b0, b1, b2, b3;
 
148
        const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
149
        const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
150
        const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
151
        const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
152
        const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
153
        const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
154
        const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
155
        const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153
156
 
154
157
row[0] = input[0];
155
158
row[2] = input[1];
160
163
row[5] = input[12];
161
164
row[7] = input[13];
162
165
 
163
 
        if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
164
 
                row[0] = row[1] = row[2] = row[3] = row[4] =
165
 
                        row[5] = row[6] = row[7] = row[0]<<3;
166
 
        output[0] = row[0];
167
 
        output[2] = row[1];
168
 
        output[4] = row[2];
169
 
        output[6] = row[3];
170
 
        output[8] = row[4];
171
 
        output[10] = row[5];
172
 
        output[12] = row[6];
173
 
        output[14] = row[7];
174
 
                return;
175
 
        }
176
 
 
177
 
        a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
178
 
        a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
179
 
        a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
180
 
        a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
181
 
 
182
 
        b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
183
 
        b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
184
 
        b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
185
 
        b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
186
 
 
187
 
        row[0] = (a0 + b0) >> ROW_SHIFT;
188
 
        row[1] = (a1 + b1) >> ROW_SHIFT;
189
 
        row[2] = (a2 + b2) >> ROW_SHIFT;
190
 
        row[3] = (a3 + b3) >> ROW_SHIFT;
191
 
        row[4] = (a3 - b3) >> ROW_SHIFT;
192
 
        row[5] = (a2 - b2) >> ROW_SHIFT;
193
 
        row[6] = (a1 - b1) >> ROW_SHIFT;
194
 
        row[7] = (a0 - b0) >> ROW_SHIFT;
195
 
 
196
 
        output[0] = row[0];
197
 
        output[2] = row[1];
198
 
        output[4] = row[2];
199
 
        output[6] = row[3];
200
 
        output[8] = row[4];
201
 
        output[10] = row[5];
202
 
        output[12] = row[6];
203
 
        output[14] = row[7];
 
166
        if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
 
167
                row[0] = row[1] = row[2] = row[3] = row[4] =
 
168
                        row[5] = row[6] = row[7] = row[0]<<3;
 
169
        output[0]  = row[0];
 
170
        output[2]  = row[1];
 
171
        output[4]  = row[2];
 
172
        output[6]  = row[3];
 
173
        output[8]  = row[4];
 
174
        output[10] = row[5];
 
175
        output[12] = row[6];
 
176
        output[14] = row[7];
 
177
                return;
 
178
        }
 
179
 
 
180
        a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
 
181
        a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
 
182
        a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
 
183
        a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
 
184
 
 
185
        b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
 
186
        b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
 
187
        b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
 
188
        b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
 
189
 
 
190
        row[0] = (a0 + b0) >> ROW_SHIFT;
 
191
        row[1] = (a1 + b1) >> ROW_SHIFT;
 
192
        row[2] = (a2 + b2) >> ROW_SHIFT;
 
193
        row[3] = (a3 + b3) >> ROW_SHIFT;
 
194
        row[4] = (a3 - b3) >> ROW_SHIFT;
 
195
        row[5] = (a2 - b2) >> ROW_SHIFT;
 
196
        row[6] = (a1 - b1) >> ROW_SHIFT;
 
197
        row[7] = (a0 - b0) >> ROW_SHIFT;
 
198
 
 
199
        output[0]  = row[0];
 
200
        output[2]  = row[1];
 
201
        output[4]  = row[2];
 
202
        output[6]  = row[3];
 
203
        output[8]  = row[4];
 
204
        output[10] = row[5];
 
205
        output[12] = row[6];
 
206
        output[14] = row[7];
204
207
}
205
208
#endif
206
209
 
207
210
static inline void idct(int16_t *block)
208
211
{
209
 
        int64_t __attribute__((aligned(8))) align_tmp[16];
210
 
        int16_t * const temp= (int16_t*)align_tmp;
 
212
        int64_t __attribute__((aligned(8))) align_tmp[16];
 
213
        int16_t * const temp= (int16_t*)align_tmp;
211
214
 
212
 
        asm volatile(
 
215
        asm volatile(
213
216
#if 0 //Alternative, simpler variant
214
217
 
215
218
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
216
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
217
 
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
218
 
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
219
 
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
220
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
221
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
222
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
223
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
224
 
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
225
 
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
226
 
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
227
 
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
228
 
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
229
 
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
230
 
        #rounder ", %%mm4                       \n\t"\
231
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
232
 
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
233
 
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
234
 
        "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
235
 
        "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
236
 
        #rounder ", %%mm0                       \n\t"\
237
 
        "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
238
 
        "paddd %%mm0, %%mm0                     \n\t" \
239
 
        "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
240
 
        "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
241
 
        "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
242
 
        "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
243
 
        "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
244
 
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
245
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
246
 
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
247
 
        "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
248
 
        "psrad $" #shift ", %%mm7               \n\t"\
249
 
        "psrad $" #shift ", %%mm4               \n\t"\
250
 
        "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
251
 
        "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
252
 
        "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
253
 
        "psrad $" #shift ", %%mm1               \n\t"\
254
 
        "psrad $" #shift ", %%mm2               \n\t"\
255
 
        "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
256
 
        "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
257
 
        "movq %%mm7, " #dst "                   \n\t"\
258
 
        "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
259
 
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
260
 
        "movq %%mm2, 24+" #dst "                \n\t"\
261
 
        "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
262
 
        "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
263
 
        "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
264
 
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
265
 
        "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
266
 
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
267
 
        "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
268
 
        "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
269
 
        "psubd %%mm4, %%mm0                     \n\t" /* a2-B2          a2-b2 */\
270
 
        "psrad $" #shift ", %%mm2               \n\t"\
271
 
        "psrad $" #shift ", %%mm0               \n\t"\
272
 
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
273
 
        "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
274
 
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
275
 
        "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
276
 
        "psrad $" #shift ", %%mm6               \n\t"\
277
 
        "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
278
 
        "movq %%mm2, 8+" #dst "                 \n\t"\
279
 
        "psrad $" #shift ", %%mm4               \n\t"\
280
 
        "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
281
 
        "movq %%mm4, 16+" #dst "                \n\t"\
282
 
 
283
 
#define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
284
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
285
 
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
286
 
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
287
 
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
288
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
289
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
290
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
291
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
292
 
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
293
 
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
294
 
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
295
 
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
296
 
        #rounder ", %%mm4                       \n\t"\
297
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
298
 
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
299
 
        #rounder ", %%mm0                       \n\t"\
300
 
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
301
 
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
302
 
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
303
 
        "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
304
 
        "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
305
 
        "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
306
 
        "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
307
 
        "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
308
 
        "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
309
 
        "paddd %%mm1, %%mm7                     \n\t" /* B0             b0 */\
310
 
        "movq 72(%2), %%mm1                     \n\t" /* -C5    -C1     -C5     -C1 */\
311
 
        "pmaddwd %%mm3, %%mm1                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
312
 
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
313
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
314
 
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
315
 
        "paddd %%mm2, %%mm1                     \n\t" /* B1             b1 */\
316
 
        "psrad $" #shift ", %%mm7               \n\t"\
317
 
        "psrad $" #shift ", %%mm4               \n\t"\
318
 
        "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
319
 
        "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
320
 
        "psubd %%mm1, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
321
 
        "psrad $" #shift ", %%mm0               \n\t"\
322
 
        "psrad $" #shift ", %%mm2               \n\t"\
323
 
        "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
324
 
        "movd %%mm7, " #dst "                   \n\t"\
325
 
        "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
326
 
        "movd %%mm0, 16+" #dst "                \n\t"\
327
 
        "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
328
 
        "movd %%mm2, 96+" #dst "                \n\t"\
329
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
330
 
        "movd %%mm4, 112+" #dst "               \n\t"\
331
 
        "movq " #src1 ", %%mm0                  \n\t" /* R3     R1      r3      r1 */\
332
 
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
333
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
334
 
        "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
335
 
        "pmaddwd 96(%2), %%mm0                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
336
 
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
337
 
        "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
338
 
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
339
 
        "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
340
 
        "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
341
 
        "psubd %%mm4, %%mm5                     \n\t" /* a2-B2          a2-b2 */\
342
 
        "psrad $" #shift ", %%mm2               \n\t"\
343
 
        "psrad $" #shift ", %%mm5               \n\t"\
344
 
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
345
 
        "paddd %%mm0, %%mm3                     \n\t" /* B3             b3 */\
346
 
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
347
 
        "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
348
 
        "psrad $" #shift ", %%mm6               \n\t"\
349
 
        "psrad $" #shift ", %%mm4               \n\t"\
350
 
        "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
351
 
        "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
352
 
        "movd %%mm2, 32+" #dst "                \n\t"\
353
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
354
 
        "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
355
 
        "movd %%mm6, 48+" #dst "                \n\t"\
356
 
        "movd %%mm4, 64+" #dst "                \n\t"\
357
 
        "movd %%mm5, 80+" #dst "                \n\t"\
358
 
 
359
 
        
 
219
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
 
220
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
 
221
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
 
222
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
 
223
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
 
224
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
225
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
 
226
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
227
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
 
228
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
 
229
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
 
230
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
 
231
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
 
232
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
 
233
        #rounder ", %%mm4               \n\t"\
 
234
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
235
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
 
236
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
 
237
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
 
238
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
 
239
        #rounder ", %%mm0               \n\t"\
 
240
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
 
241
        "paddd %%mm0, %%mm0             \n\t" \
 
242
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
 
243
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
 
244
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
 
245
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
 
246
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
 
247
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
 
248
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
 
249
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
 
250
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
 
251
        "psrad $" #shift ", %%mm7       \n\t"\
 
252
        "psrad $" #shift ", %%mm4       \n\t"\
 
253
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
 
254
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
 
255
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
 
256
        "psrad $" #shift ", %%mm1       \n\t"\
 
257
        "psrad $" #shift ", %%mm2       \n\t"\
 
258
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
 
259
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
 
260
        "movq %%mm7, " #dst "           \n\t"\
 
261
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
 
262
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
 
263
        "movq %%mm2, 24+" #dst "        \n\t"\
 
264
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
 
265
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
 
266
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
 
267
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
 
268
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
 
269
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
 
270
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
 
271
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
 
272
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
 
273
        "psrad $" #shift ", %%mm2       \n\t"\
 
274
        "psrad $" #shift ", %%mm0       \n\t"\
 
275
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
 
276
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
 
277
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
 
278
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
 
279
        "psrad $" #shift ", %%mm6       \n\t"\
 
280
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
 
281
        "movq %%mm2, 8+" #dst "         \n\t"\
 
282
        "psrad $" #shift ", %%mm4       \n\t"\
 
283
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
 
284
        "movq %%mm4, 16+" #dst "        \n\t"\
 
285
 
 
286
/*
 * COL_IDCT(src0, src4, src1, src5, dst, shift)
 *
 * Expands to inline-asm text for one IDCT pass that has no rounder argument.
 * Every macro argument is stringized with '#', so each must be written as
 * assembler operand text (e.g. "8(%1)"), and shift as an immediate count.
 *
 *   src0, src4, src1, src5  64-bit memory operands loaded into mm0..mm3; the
 *                           lane comments name their words R4/R0, R6/R2,
 *                           R3/R1 and R7/R5 (upper and lower case denote the
 *                           two lanes processed in parallel).
 *   dst                     base memory operand for the results.
 *   shift                   arithmetic right shift (psrad) applied to every
 *                           32-bit sum before it is packed.
 *
 * Coefficient quadwords are read at offsets 16..104 from asm operand %2.
 * The even part produces A0..A3 and the odd part B0..B3; each An+Bn and
 * An-Bn is shifted, packed to 16 bits with signed saturation (packssdw) and
 * stored as 32 bits with movd:
 *   dst+0  A0+B0   dst+16 A1+B1   dst+32 A2+B2   dst+48  A3+B3
 *   dst+64 A3-B3   dst+80 A2-B2   dst+96 A1-B1   dst+112 A0-B0
 * All of mm0..mm7 are overwritten. src1 is loaded a second time for the
 * B2/B3 terms because mm2 has been reused by then.
 */
#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
 
287
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
 
288
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
 
289
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
 
290
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
 
291
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
 
292
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
293
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
 
294
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
295
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
 
296
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
 
297
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
 
298
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
 
299
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
300
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
 
301
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
 
302
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
 
303
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
 
304
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
305
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
 
306
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
 
307
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
 
308
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
 
309
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
 
310
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
 
311
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
 
312
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
 
313
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
 
314
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
 
315
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
 
316
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
 
317
        "psrad $" #shift ", %%mm7       \n\t"\
 
318
        "psrad $" #shift ", %%mm4       \n\t"\
 
319
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
 
320
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
 
321
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
 
322
        "psrad $" #shift ", %%mm0       \n\t"\
 
323
        "psrad $" #shift ", %%mm2       \n\t"\
 
324
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
 
325
        "movd %%mm7, " #dst "           \n\t"\
 
326
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
 
327
        "movd %%mm0, 16+" #dst "        \n\t"\
 
328
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
 
329
        "movd %%mm2, 96+" #dst "        \n\t"\
 
330
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
 
331
        "movd %%mm4, 112+" #dst "       \n\t"\
 
332
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
 
333
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
 
334
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
 
335
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
 
336
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
 
337
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
 
338
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
 
339
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
 
340
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
 
341
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
 
342
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
 
343
        "psrad $" #shift ", %%mm2       \n\t"\
 
344
        "psrad $" #shift ", %%mm5       \n\t"\
 
345
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
 
346
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
 
347
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
 
348
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
 
349
        "psrad $" #shift ", %%mm6       \n\t"\
 
350
        "psrad $" #shift ", %%mm4       \n\t"\
 
351
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
 
352
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
 
353
        "movd %%mm2, 32+" #dst "        \n\t"\
 
354
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
 
355
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
 
356
        "movd %%mm6, 48+" #dst "        \n\t"\
 
357
        "movd %%mm4, 64+" #dst "        \n\t"\
 
358
        "movd %%mm5, 80+" #dst "        \n\t"\
 
359
 
 
360
 
360
361
/*
 * DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Expands to inline-asm text for one IDCT pass with a shortcut path.
 * All arguments are stringized with '#': the src and dst arguments must be
 * assembler memory-operand text, shift an immediate count, and rounder an
 * instruction plus its source operand (its destination ", %%mm4" or
 * ", %%mm0" is appended by the macro).
 *
 * Test: src0 is ANDed with the constant wm1010 and ORed with src4, src1 and
 * src5; the result is packed, moved to eax, and "jz 1f" is taken when it is
 * zero. NOTE(review): wm1010 is defined outside this view - presumably it
 * masks out the DC words so that only the remaining coefficients are
 * tested; confirm against its definition.
 *
 * Full path: coefficient quadwords are read at offsets 16..104 from asm
 * operand %2, the rounder is applied to both even-part accumulators, and
 * every An+Bn / An-Bn is shifted right by shift, packed with signed
 * saturation and written with movq to dst, 8+dst, 16+dst and 24+dst.
 * A2 is obtained without a spare register: mm0 is doubled and A1
 * (mm1 = mm1 + mm0) is subtracted from it.
 *
 * Shortcut path (label 1): mm0 is shifted left by 16, shifted right
 * arithmetically by 13, packed, and the same quadword is stored to all four
 * output slots. The "paddd d40000" line in this path starts with '#' inside
 * the string - presumably so the assembler treats it as a comment; confirm.
 *
 * Overwrites mm0..mm7 and eax, and uses the local labels 1 and 2.
 */
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
361
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
362
 
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
363
 
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
364
 
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
365
 
        "movq "MANGLE(wm1010)", %%mm4           \n\t"\
366
 
        "pand %%mm0, %%mm4                      \n\t"\
367
 
        "por %%mm1, %%mm4                       \n\t"\
368
 
        "por %%mm2, %%mm4                       \n\t"\
369
 
        "por %%mm3, %%mm4                       \n\t"\
370
 
        "packssdw %%mm4,%%mm4                   \n\t"\
371
 
        "movd %%mm4, %%eax                      \n\t"\
372
 
        "orl %%eax, %%eax                       \n\t"\
373
 
        "jz 1f                                  \n\t"\
374
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
375
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
376
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
377
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
378
 
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
379
 
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
380
 
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
381
 
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
382
 
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
383
 
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
384
 
        #rounder ", %%mm4                       \n\t"\
385
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
386
 
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
387
 
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
388
 
        "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
389
 
        "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
390
 
        #rounder ", %%mm0                       \n\t"\
391
 
        "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
392
 
        "paddd %%mm0, %%mm0                     \n\t" \
393
 
        "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
394
 
        "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
395
 
        "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
396
 
        "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
397
 
        "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
398
 
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
399
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
400
 
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
401
 
        "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
402
 
        "psrad $" #shift ", %%mm7               \n\t"\
403
 
        "psrad $" #shift ", %%mm4               \n\t"\
404
 
        "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
405
 
        "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
406
 
        "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
407
 
        "psrad $" #shift ", %%mm1               \n\t"\
408
 
        "psrad $" #shift ", %%mm2               \n\t"\
409
 
        "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
410
 
        "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
411
 
        "movq %%mm7, " #dst "                   \n\t"\
412
 
        "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
413
 
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
414
 
        "movq %%mm2, 24+" #dst "                \n\t"\
415
 
        "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
416
 
        "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
417
 
        "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
418
 
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
419
 
        "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
420
 
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
421
 
        "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
422
 
        "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
423
 
        "psubd %%mm4, %%mm0                     \n\t" /* A2-B2          a2-b2 */\
424
 
        "psrad $" #shift ", %%mm2               \n\t"\
425
 
        "psrad $" #shift ", %%mm0               \n\t"\
426
 
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
427
 
        "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
428
 
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
429
 
        "psubd %%mm3, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
430
 
        "psrad $" #shift ", %%mm6               \n\t"\
431
 
        "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
432
 
        "movq %%mm2, 8+" #dst "                 \n\t"\
433
 
        "psrad $" #shift ", %%mm4               \n\t"\
434
 
        "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
435
 
        "movq %%mm4, 16+" #dst "                \n\t"\
436
 
        "jmp 2f                                 \n\t"\
437
 
        "1:                                     \n\t"\
438
 
        "pslld $16, %%mm0                       \n\t"\
439
 
        "#paddd "MANGLE(d40000)", %%mm0         \n\t"\
440
 
        "psrad $13, %%mm0                       \n\t"\
441
 
        "packssdw %%mm0, %%mm0                  \n\t"\
442
 
        "movq %%mm0, " #dst "                   \n\t"\
443
 
        "movq %%mm0, 8+" #dst "                 \n\t"\
444
 
        "movq %%mm0, 16+" #dst "                \n\t"\
445
 
        "movq %%mm0, 24+" #dst "                \n\t"\
446
 
        "2:                                     \n\t"
 
362
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
 
363
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
 
364
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
 
365
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
 
366
        "movq "MANGLE(wm1010)", %%mm4   \n\t"\
 
367
        "pand %%mm0, %%mm4              \n\t"\
 
368
        "por %%mm1, %%mm4               \n\t"\
 
369
        "por %%mm2, %%mm4               \n\t"\
 
370
        "por %%mm3, %%mm4               \n\t"\
 
371
        "packssdw %%mm4,%%mm4           \n\t"\
 
372
        "movd %%mm4, %%eax              \n\t"\
 
373
        "orl %%eax, %%eax               \n\t"\
 
374
        "jz 1f                          \n\t"\
 
375
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
 
376
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
377
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
 
378
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
379
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
 
380
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
 
381
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
 
382
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
 
383
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
 
384
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
 
385
        #rounder ", %%mm4               \n\t"\
 
386
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
387
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
 
388
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
 
389
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
 
390
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
 
391
        #rounder ", %%mm0               \n\t"\
 
392
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
 
393
        "paddd %%mm0, %%mm0             \n\t" \
 
394
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
 
395
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
 
396
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
 
397
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
 
398
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
 
399
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
 
400
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
 
401
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
 
402
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
 
403
        "psrad $" #shift ", %%mm7       \n\t"\
 
404
        "psrad $" #shift ", %%mm4       \n\t"\
 
405
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
 
406
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
 
407
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
 
408
        "psrad $" #shift ", %%mm1       \n\t"\
 
409
        "psrad $" #shift ", %%mm2       \n\t"\
 
410
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
 
411
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
 
412
        "movq %%mm7, " #dst "           \n\t"\
 
413
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
 
414
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
 
415
        "movq %%mm2, 24+" #dst "        \n\t"\
 
416
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
 
417
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
 
418
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
 
419
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
 
420
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
 
421
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
 
422
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
 
423
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
 
424
        "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
 
425
        "psrad $" #shift ", %%mm2       \n\t"\
 
426
        "psrad $" #shift ", %%mm0       \n\t"\
 
427
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
 
428
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
 
429
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
 
430
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
 
431
        "psrad $" #shift ", %%mm6       \n\t"\
 
432
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
 
433
        "movq %%mm2, 8+" #dst "         \n\t"\
 
434
        "psrad $" #shift ", %%mm4       \n\t"\
 
435
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
 
436
        "movq %%mm4, 16+" #dst "        \n\t"\
 
437
        "jmp 2f                         \n\t"\
 
438
        "1:                             \n\t"\
 
439
        "pslld $16, %%mm0               \n\t"\
 
440
        "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
 
441
        "psrad $13, %%mm0               \n\t"\
 
442
        "packssdw %%mm0, %%mm0          \n\t"\
 
443
        "movq %%mm0, " #dst "           \n\t"\
 
444
        "movq %%mm0, 8+" #dst "         \n\t"\
 
445
        "movq %%mm0, 16+" #dst "        \n\t"\
 
446
        "movq %%mm0, 24+" #dst "        \n\t"\
 
447
        "2:                             \n\t"
447
448
 
448
449
 
449
450
//IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
457
458
DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
458
459
 
459
460
 
460
 
//IDCT(      src0,   src4,   src1,    src5,    dst, rounder, shift)
461
 
COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
462
 
COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
463
 
COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
464
 
COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 
461
//IDCT(      src0,   src4,   src1,    src5,    dst, shift)
 
462
COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
 
463
COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
 
464
COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
 
465
COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
465
466
 
466
467
#else
467
468
 
468
469
/*
 * DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Expands to inline-asm text for one IDCT pass with a shortcut path.
 * All arguments are stringized with '#': the src and dst arguments must be
 * assembler memory-operand text, shift an immediate count, and rounder an
 * instruction plus its source operand (its destination ", %%mm4" or
 * ", %%mm0" is appended by the macro).
 *
 * Test: src0 is ANDed with the constant wm1010 and ORed with src4, src1 and
 * src5; the result is packed, moved to eax, and "jz 1f" is taken when it is
 * zero. NOTE(review): wm1010 is defined outside this view - presumably it
 * masks out the DC words so that only the remaining coefficients are
 * tested; confirm against its definition.
 *
 * Full path: coefficient quadwords are read at offsets 16..104 from asm
 * operand %2, the rounder is applied to both even-part accumulators, and
 * every An+Bn / An-Bn is shifted right by shift, packed with signed
 * saturation and written with movq to dst, 8+dst, 16+dst and 24+dst.
 * A2 is obtained without a spare register: mm0 is doubled and A1
 * (mm1 = mm1 + mm0) is subtracted from it.
 *
 * Shortcut path (label 1): mm0 is shifted left by 16, the constant d40000
 * is added, the sum is shifted right arithmetically by 13 and packed, and
 * the same quadword is stored to all four output slots.
 *
 * Overwrites mm0..mm7 and eax, and uses the local labels 1 and 2.
 */
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
469
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
470
 
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
471
 
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
472
 
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
473
 
        "movq "MANGLE(wm1010)", %%mm4           \n\t"\
474
 
        "pand %%mm0, %%mm4                      \n\t"\
475
 
        "por %%mm1, %%mm4                       \n\t"\
476
 
        "por %%mm2, %%mm4                       \n\t"\
477
 
        "por %%mm3, %%mm4                       \n\t"\
478
 
        "packssdw %%mm4,%%mm4                   \n\t"\
479
 
        "movd %%mm4, %%eax                      \n\t"\
480
 
        "orl %%eax, %%eax                       \n\t"\
481
 
        "jz 1f                                  \n\t"\
482
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
483
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
484
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
485
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
486
 
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
487
 
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
488
 
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
489
 
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
490
 
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
491
 
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
492
 
        #rounder ", %%mm4                       \n\t"\
493
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
494
 
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
495
 
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
496
 
        "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
497
 
        "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
498
 
        #rounder ", %%mm0                       \n\t"\
499
 
        "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
500
 
        "paddd %%mm0, %%mm0                     \n\t" \
501
 
        "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
502
 
        "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
503
 
        "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
504
 
        "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
505
 
        "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
506
 
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
507
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
508
 
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
509
 
        "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
510
 
        "psrad $" #shift ", %%mm7               \n\t"\
511
 
        "psrad $" #shift ", %%mm4               \n\t"\
512
 
        "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
513
 
        "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
514
 
        "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
515
 
        "psrad $" #shift ", %%mm1               \n\t"\
516
 
        "psrad $" #shift ", %%mm2               \n\t"\
517
 
        "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
518
 
        "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
519
 
        "movq %%mm7, " #dst "                   \n\t"\
520
 
        "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
521
 
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
522
 
        "movq %%mm2, 24+" #dst "                \n\t"\
523
 
        "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
524
 
        "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
525
 
        "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
526
 
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
527
 
        "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
528
 
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
529
 
        "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
530
 
        "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
531
 
        "psubd %%mm4, %%mm0                     \n\t" /* A2-B2          a2-b2 */\
532
 
        "psrad $" #shift ", %%mm2               \n\t"\
533
 
        "psrad $" #shift ", %%mm0               \n\t"\
534
 
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
535
 
        "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
536
 
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
537
 
        "psubd %%mm3, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
538
 
        "psrad $" #shift ", %%mm6               \n\t"\
539
 
        "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
540
 
        "movq %%mm2, 8+" #dst "                 \n\t"\
541
 
        "psrad $" #shift ", %%mm4               \n\t"\
542
 
        "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
543
 
        "movq %%mm4, 16+" #dst "                \n\t"\
544
 
        "jmp 2f                                 \n\t"\
545
 
        "1:                                     \n\t"\
546
 
        "pslld $16, %%mm0                       \n\t"\
547
 
        "paddd "MANGLE(d40000)", %%mm0          \n\t"\
548
 
        "psrad $13, %%mm0                       \n\t"\
549
 
        "packssdw %%mm0, %%mm0                  \n\t"\
550
 
        "movq %%mm0, " #dst "                   \n\t"\
551
 
        "movq %%mm0, 8+" #dst "                 \n\t"\
552
 
        "movq %%mm0, 16+" #dst "                \n\t"\
553
 
        "movq %%mm0, 24+" #dst "                \n\t"\
554
 
        "2:                                     \n\t"
 
470
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
 
471
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
 
472
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
 
473
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
 
474
        "movq "MANGLE(wm1010)", %%mm4   \n\t"\
 
475
        "pand %%mm0, %%mm4              \n\t"\
 
476
        "por %%mm1, %%mm4               \n\t"\
 
477
        "por %%mm2, %%mm4               \n\t"\
 
478
        "por %%mm3, %%mm4               \n\t"\
 
479
        "packssdw %%mm4,%%mm4           \n\t"\
 
480
        "movd %%mm4, %%eax              \n\t"\
 
481
        "orl %%eax, %%eax               \n\t"\
 
482
        "jz 1f                          \n\t"\
 
483
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
 
484
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
485
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
 
486
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
487
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
 
488
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
 
489
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
 
490
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
 
491
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
 
492
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
 
493
        #rounder ", %%mm4               \n\t"\
 
494
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
495
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
 
496
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
 
497
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
 
498
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
 
499
        #rounder ", %%mm0               \n\t"\
 
500
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
 
501
        "paddd %%mm0, %%mm0             \n\t" \
 
502
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
 
503
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
 
504
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
 
505
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
 
506
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
 
507
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
 
508
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
 
509
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
 
510
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
 
511
        "psrad $" #shift ", %%mm7       \n\t"\
 
512
        "psrad $" #shift ", %%mm4       \n\t"\
 
513
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
 
514
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
 
515
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
 
516
        "psrad $" #shift ", %%mm1       \n\t"\
 
517
        "psrad $" #shift ", %%mm2       \n\t"\
 
518
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
 
519
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
 
520
        "movq %%mm7, " #dst "           \n\t"\
 
521
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
 
522
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
 
523
        "movq %%mm2, 24+" #dst "        \n\t"\
 
524
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
 
525
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
 
526
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
 
527
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
 
528
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
 
529
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
 
530
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
 
531
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
 
532
        "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
 
533
        "psrad $" #shift ", %%mm2       \n\t"\
 
534
        "psrad $" #shift ", %%mm0       \n\t"\
 
535
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
 
536
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
 
537
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
 
538
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
 
539
        "psrad $" #shift ", %%mm6       \n\t"\
 
540
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
 
541
        "movq %%mm2, 8+" #dst "         \n\t"\
 
542
        "psrad $" #shift ", %%mm4       \n\t"\
 
543
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
 
544
        "movq %%mm4, 16+" #dst "        \n\t"\
 
545
        "jmp 2f                         \n\t"\
 
546
        "1:                             \n\t"\
 
547
        "pslld $16, %%mm0               \n\t"\
 
548
        "paddd "MANGLE(d40000)", %%mm0  \n\t"\
 
549
        "psrad $13, %%mm0               \n\t"\
 
550
        "packssdw %%mm0, %%mm0          \n\t"\
 
551
        "movq %%mm0, " #dst "           \n\t"\
 
552
        "movq %%mm0, 8+" #dst "         \n\t"\
 
553
        "movq %%mm0, 16+" #dst "        \n\t"\
 
554
        "movq %%mm0, 24+" #dst "        \n\t"\
 
555
        "2:                             \n\t"
555
556
 
556
557
/*
 * Z_COND_IDCT: row transform that is skipped when its input is all zero.
 *
 * Loads the quadwords src0, src4, src1 and src5 into mm0..mm3, ORs them
 * together, moves the packed result to eax and jumps to the label `bt`
 * if it is zero.  Otherwise the inputs are multiplied (pmaddwd) by the
 * constants read from 16(%2)..104(%2) (named C1..C7 in the per-line
 * comments), `rounder` is applied to mm4 and mm0, the 32-bit sums are
 * shifted right arithmetically by `shift`, packed to 16 bits with signed
 * saturation and stored as:
 *      dst     : A1+B1  A0+B0
 *    8+dst     : A3+B3  A2+B2
 *   16+dst     : A2-B2  A3-B3
 *   24+dst     : A0-B0  A1-B1
 *
 * `rounder` is an instruction plus its source operand (e.g. paddd 8(%2));
 * the macro appends the destination register itself.  All arguments are
 * stringified into the asm template.  Writes mm0-mm7 and eax.
 */
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
557
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
558
 
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
559
 
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
560
 
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
561
 
        "movq %%mm0, %%mm4                      \n\t"\
562
 
        "por %%mm1, %%mm4                       \n\t"\
563
 
        "por %%mm2, %%mm4                       \n\t"\
564
 
        "por %%mm3, %%mm4                       \n\t"\
565
 
        "packssdw %%mm4,%%mm4                   \n\t"\
566
 
        "movd %%mm4, %%eax                      \n\t"\
567
 
        "orl %%eax, %%eax                       \n\t"\
568
 
        "jz " #bt "                             \n\t"\
569
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
570
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
571
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
572
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
573
 
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
574
 
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
575
 
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
576
 
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
577
 
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
578
 
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
579
 
        #rounder ", %%mm4                       \n\t"\
580
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
581
 
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
582
 
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
583
 
        "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
584
 
        "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
585
 
        #rounder ", %%mm0                       \n\t"\
586
 
        "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
587
 
        "paddd %%mm0, %%mm0                     \n\t" \
588
 
        "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
589
 
        "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
590
 
        "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
591
 
        "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
592
 
        "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
593
 
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
594
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
595
 
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
596
 
        "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
597
 
        "psrad $" #shift ", %%mm7               \n\t"\
598
 
        "psrad $" #shift ", %%mm4               \n\t"\
599
 
        "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
600
 
        "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
601
 
        "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
602
 
        "psrad $" #shift ", %%mm1               \n\t"\
603
 
        "psrad $" #shift ", %%mm2               \n\t"\
604
 
        "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
605
 
        "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
606
 
        "movq %%mm7, " #dst "                   \n\t"\
607
 
        "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
608
 
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
609
 
        "movq %%mm2, 24+" #dst "                \n\t"\
610
 
        "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
611
 
        "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
612
 
        "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
613
 
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
614
 
        "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
615
 
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
616
 
        "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
617
 
        "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
618
 
        "psubd %%mm4, %%mm0                     \n\t" /* a2-B2          a2-b2 */\
619
 
        "psrad $" #shift ", %%mm2               \n\t"\
620
 
        "psrad $" #shift ", %%mm0               \n\t"\
621
 
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
622
 
        "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
623
 
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
624
 
        "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
625
 
        "psrad $" #shift ", %%mm6               \n\t"\
626
 
        "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
627
 
        "movq %%mm2, 8+" #dst "                 \n\t"\
628
 
        "psrad $" #shift ", %%mm4               \n\t"\
629
 
        "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
630
 
        "movq %%mm4, 16+" #dst "                \n\t"\
 
558
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
 
559
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
 
560
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
 
561
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
 
562
        "movq %%mm0, %%mm4              \n\t"\
 
563
        "por %%mm1, %%mm4               \n\t"\
 
564
        "por %%mm2, %%mm4               \n\t"\
 
565
        "por %%mm3, %%mm4               \n\t"\
 
566
        "packssdw %%mm4,%%mm4           \n\t"\
 
567
        "movd %%mm4, %%eax              \n\t"\
 
568
        "orl %%eax, %%eax               \n\t"\
 
569
        "jz " #bt "                     \n\t"\
 
570
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
 
571
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
572
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
 
573
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
574
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
 
575
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
 
576
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
 
577
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
 
578
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
 
579
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
 
580
        #rounder ", %%mm4               \n\t"\
 
581
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
582
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
 
583
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
 
584
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
 
585
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
 
586
        #rounder ", %%mm0               \n\t"\
 
587
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
 
588
        "paddd %%mm0, %%mm0             \n\t" \
 
589
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
 
590
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
 
591
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
 
592
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
 
593
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
 
594
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
 
595
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
 
596
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
 
597
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
 
598
        "psrad $" #shift ", %%mm7       \n\t"\
 
599
        "psrad $" #shift ", %%mm4       \n\t"\
 
600
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
 
601
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
 
602
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
 
603
        "psrad $" #shift ", %%mm1       \n\t"\
 
604
        "psrad $" #shift ", %%mm2       \n\t"\
 
605
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
 
606
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
 
607
        "movq %%mm7, " #dst "           \n\t"\
 
608
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
 
609
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
 
610
        "movq %%mm2, 24+" #dst "        \n\t"\
 
611
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
 
612
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
 
613
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
 
614
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
 
615
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
 
616
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
 
617
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
 
618
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
 
619
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
 
620
        "psrad $" #shift ", %%mm2       \n\t"\
 
621
        "psrad $" #shift ", %%mm0       \n\t"\
 
622
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
 
623
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
 
624
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
 
625
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
 
626
        "psrad $" #shift ", %%mm6       \n\t"\
 
627
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
 
628
        "movq %%mm2, 8+" #dst "         \n\t"\
 
629
        "psrad $" #shift ", %%mm4       \n\t"\
 
630
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
 
631
        "movq %%mm4, 16+" #dst "        \n\t"\
631
632
 
632
633
/*
 * ROW_IDCT: unconditional row transform.
 *
 * Loads the quadwords src0, src4, src1 and src5 into mm0..mm3 and
 * multiplies them (pmaddwd) by the constants read from 16(%2)..104(%2)
 * (named C1..C7 in the per-line comments).  `rounder` is applied to mm4
 * and mm0, the 32-bit sums are shifted right arithmetically by `shift`,
 * packed to 16 bits with signed saturation and stored as:
 *      dst     : A1+B1  A0+B0
 *    8+dst     : A3+B3  A2+B2
 *   16+dst     : A2-B2  A3-B3
 *   24+dst     : A0-B0  A1-B1
 *
 * Unlike Z_COND_IDCT there is no zero test and no branch.  `rounder` is an
 * instruction plus its source operand; the macro appends the destination
 * register.  All arguments are stringified into the asm template.
 * Writes mm0-mm7.
 */
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
633
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
634
 
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
635
 
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
636
 
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
637
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
638
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
639
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
640
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
641
 
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
642
 
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
643
 
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
644
 
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
645
 
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
646
 
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
647
 
        #rounder ", %%mm4                       \n\t"\
648
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
649
 
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
650
 
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
651
 
        "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
652
 
        "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
653
 
        #rounder ", %%mm0                       \n\t"\
654
 
        "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
655
 
        "paddd %%mm0, %%mm0                     \n\t" \
656
 
        "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
657
 
        "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
658
 
        "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
659
 
        "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
660
 
        "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
661
 
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
662
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
663
 
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
664
 
        "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
665
 
        "psrad $" #shift ", %%mm7               \n\t"\
666
 
        "psrad $" #shift ", %%mm4               \n\t"\
667
 
        "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
668
 
        "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
669
 
        "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
670
 
        "psrad $" #shift ", %%mm1               \n\t"\
671
 
        "psrad $" #shift ", %%mm2               \n\t"\
672
 
        "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
673
 
        "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
674
 
        "movq %%mm7, " #dst "                   \n\t"\
675
 
        "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
676
 
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
677
 
        "movq %%mm2, 24+" #dst "                \n\t"\
678
 
        "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
679
 
        "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
680
 
        "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
681
 
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
682
 
        "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
683
 
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
684
 
        "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
685
 
        "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
686
 
        "psubd %%mm4, %%mm0                     \n\t" /* a2-B2          a2-b2 */\
687
 
        "psrad $" #shift ", %%mm2               \n\t"\
688
 
        "psrad $" #shift ", %%mm0               \n\t"\
689
 
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
690
 
        "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
691
 
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
692
 
        "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
693
 
        "psrad $" #shift ", %%mm6               \n\t"\
694
 
        "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
695
 
        "movq %%mm2, 8+" #dst "                 \n\t"\
696
 
        "psrad $" #shift ", %%mm4               \n\t"\
697
 
        "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
698
 
        "movq %%mm4, 16+" #dst "                \n\t"\
 
634
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
 
635
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
 
636
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
 
637
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
 
638
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
 
639
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
640
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
 
641
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
642
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
 
643
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
 
644
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
 
645
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
 
646
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
 
647
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
 
648
        #rounder ", %%mm4               \n\t"\
 
649
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
650
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
 
651
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
 
652
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
 
653
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
 
654
        #rounder ", %%mm0               \n\t"\
 
655
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
 
656
        "paddd %%mm0, %%mm0             \n\t" \
 
657
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
 
658
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
 
659
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
 
660
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
 
661
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
 
662
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
 
663
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
 
664
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
 
665
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
 
666
        "psrad $" #shift ", %%mm7       \n\t"\
 
667
        "psrad $" #shift ", %%mm4       \n\t"\
 
668
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
 
669
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
 
670
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
 
671
        "psrad $" #shift ", %%mm1       \n\t"\
 
672
        "psrad $" #shift ", %%mm2       \n\t"\
 
673
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
 
674
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
 
675
        "movq %%mm7, " #dst "           \n\t"\
 
676
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
 
677
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
 
678
        "movq %%mm2, 24+" #dst "        \n\t"\
 
679
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
 
680
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
 
681
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
 
682
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
 
683
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
 
684
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
 
685
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
 
686
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
 
687
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
 
688
        "psrad $" #shift ", %%mm2       \n\t"\
 
689
        "psrad $" #shift ", %%mm0       \n\t"\
 
690
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
 
691
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
 
692
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
 
693
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
 
694
        "psrad $" #shift ", %%mm6       \n\t"\
 
695
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
 
696
        "movq %%mm2, 8+" #dst "         \n\t"\
 
697
        "psrad $" #shift ", %%mm4       \n\t"\
 
698
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
 
699
        "movq %%mm4, 16+" #dst "        \n\t"\
699
700
 
700
701
//IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift[, bt])
701
702
DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
704
705
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
705
706
 
706
707
#undef IDCT
707
 
/*
 * IDCT (redefined after the #undef above): transform whose results are
 * written as eight 32-bit stores, 16 bytes apart.
 *
 * Loads the quadwords src0, src4, src1 and src5 into mm0..mm3 and
 * multiplies them (pmaddwd) by the constants read from 16(%2)..104(%2)
 * (named C1..C7 in the per-line comments).  `rounder` is applied to mm4
 * and mm0, the 32-bit sums are shifted right arithmetically by `shift`,
 * and each register is packed onto itself (packssdw, signed saturation)
 * before its low 32 bits are stored with movd:
 *       dst : A0+B0       64+dst : A3-B3
 *    16+dst : A1+B1       80+dst : A2-B2
 *    32+dst : A2+B2       96+dst : A1-B1
 *    48+dst : A3+B3      112+dst : A0-B0
 *
 * `rounder` is an instruction plus its source operand; the macro appends
 * the destination register.  All arguments are stringified into the asm
 * template.  Writes mm0-mm7.
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
708
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
709
 
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
710
 
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
711
 
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
712
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
713
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
714
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
715
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
716
 
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
717
 
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
718
 
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
719
 
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
720
 
        #rounder ", %%mm4                       \n\t"\
721
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
722
 
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
723
 
        #rounder ", %%mm0                       \n\t"\
724
 
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
725
 
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
726
 
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
727
 
        "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
728
 
        "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
729
 
        "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
730
 
        "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
731
 
        "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
732
 
        "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
733
 
        "paddd %%mm1, %%mm7                     \n\t" /* B0             b0 */\
734
 
        "movq 72(%2), %%mm1                     \n\t" /* -C5    -C1     -C5     -C1 */\
735
 
        "pmaddwd %%mm3, %%mm1                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
736
 
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
737
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
738
 
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
739
 
        "paddd %%mm2, %%mm1                     \n\t" /* B1             b1 */\
740
 
        "psrad $" #shift ", %%mm7               \n\t"\
741
 
        "psrad $" #shift ", %%mm4               \n\t"\
742
 
        "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
743
 
        "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
744
 
        "psubd %%mm1, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
745
 
        "psrad $" #shift ", %%mm0               \n\t"\
746
 
        "psrad $" #shift ", %%mm2               \n\t"\
747
 
        "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
748
 
        "movd %%mm7, " #dst "                   \n\t"\
749
 
        "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
750
 
        "movd %%mm0, 16+" #dst "                \n\t"\
751
 
        "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
752
 
        "movd %%mm2, 96+" #dst "                \n\t"\
753
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
754
 
        "movd %%mm4, 112+" #dst "               \n\t"\
755
 
        "movq " #src1 ", %%mm0                  \n\t" /* R3     R1      r3      r1 */\
756
 
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
757
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
758
 
        "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
759
 
        "pmaddwd 96(%2), %%mm0                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
760
 
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
761
 
        "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
762
 
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
763
 
        "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
764
 
        "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
765
 
        "psubd %%mm4, %%mm5                     \n\t" /* a2-B2          a2-b2 */\
766
 
        "psrad $" #shift ", %%mm2               \n\t"\
767
 
        "psrad $" #shift ", %%mm5               \n\t"\
768
 
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
769
 
        "paddd %%mm0, %%mm3                     \n\t" /* B3             b3 */\
770
 
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
771
 
        "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
772
 
        "psrad $" #shift ", %%mm6               \n\t"\
773
 
        "psrad $" #shift ", %%mm4               \n\t"\
774
 
        "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
775
 
        "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
776
 
        "movd %%mm2, 32+" #dst "                \n\t"\
777
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
778
 
        "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
779
 
        "movd %%mm6, 48+" #dst "                \n\t"\
780
 
        "movd %%mm4, 64+" #dst "                \n\t"\
781
 
        "movd %%mm5, 80+" #dst "                \n\t"
782
 
 
783
 
 
784
 
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
// NOTE(review): rounder is passed as /nop, so the two #rounder lines in IDCT
// expand to "/nop, %%mm4" and "/nop, %%mm0"; presumably the assembler treats
// these lines as comments so no rounding instruction is emitted -- confirm.
785
 
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
786
 
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
787
 
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
788
 
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
789
 
        "jmp 9f                                 \n\t"
790
 
 
791
 
        "#.balign 16                            \n\t"\
792
 
        "4:                                     \n\t"
 
708
/*
 * IDCT(src0, src4, src1, src5, dst, shift) -- variant that reads all four
 * source operands (src1 is loaded twice: into mm2 and later into mm0).
 * Expands to a sequence of MMX instruction strings that:
 *   - multiply the sources with pmaddwd against the constants at
 *     16(%2) .. 104(%2);
 *   - build the even terms A0..A3 from src0/src4 and the odd terms B0..B3
 *     from src1/src5, as named in the per-instruction comments;
 *   - shift An+Bn and An-Bn right by `shift` (psrad), narrow them with
 *     packssdw and store one dword each with movd:
 *       sums        -> dst, 16+dst, 32+dst, 48+dst
 *       differences -> 112+dst, 96+dst, 80+dst, 64+dst
 * Every register mm0..mm7 is overwritten. This variant adds no rounding term.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
 
709
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
 
710
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
 
711
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
 
712
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
 
713
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
 
714
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
715
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
 
716
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
717
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
 
718
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
 
719
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
 
720
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
 
721
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
722
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
 
723
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
 
724
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
 
725
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
 
726
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
727
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
 
728
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
 
729
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
 
730
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
 
731
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
 
732
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
 
733
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
 
734
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
 
735
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
 
736
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
 
737
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
 
738
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
 
739
        "psrad $" #shift ", %%mm7       \n\t"\
 
740
        "psrad $" #shift ", %%mm4       \n\t"\
 
741
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
 
742
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
 
743
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
 
744
        "psrad $" #shift ", %%mm0       \n\t"\
 
745
        "psrad $" #shift ", %%mm2       \n\t"\
 
746
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
 
747
        "movd %%mm7, " #dst "           \n\t"\
 
748
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
 
749
        "movd %%mm0, 16+" #dst "        \n\t"\
 
750
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
 
751
        "movd %%mm2, 96+" #dst "        \n\t"\
 
752
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
 
753
        "movd %%mm4, 112+" #dst "       \n\t"\
 
754
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
 
755
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
 
756
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
 
757
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
 
758
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
 
759
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
 
760
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
 
761
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
 
762
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
 
763
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
 
764
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
 
765
        "psrad $" #shift ", %%mm2       \n\t"\
 
766
        "psrad $" #shift ", %%mm5       \n\t"\
 
767
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
 
768
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
 
769
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
 
770
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
 
771
        "psrad $" #shift ", %%mm6       \n\t"\
 
772
        "psrad $" #shift ", %%mm4       \n\t"\
 
773
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
 
774
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
 
775
        "movd %%mm2, 32+" #dst "        \n\t"\
 
776
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
 
777
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
 
778
        "movd %%mm6, 48+" #dst "        \n\t"\
 
779
        "movd %%mm4, 64+" #dst "        \n\t"\
 
780
        "movd %%mm5, 80+" #dst "        \n\t"
 
781
 
 
782
 
 
783
/* Expand IDCT four times with shift 20: each call moves the source operands
 * 8 bytes further along %1 and the destination 4 bytes further along %0. */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
 
784
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
 
785
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
 
786
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
 
787
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
 
788
        /* Jump forward to local label 9, which is outside this excerpt. */
        "jmp 9f                         \n\t"
 
789
 
 
790
        /* The "#" literal is concatenated in front of ASMALIGN(4).
         * NOTE(review): presumably that leaves the alignment directive
         * commented out in the emitted assembly -- confirm. */
        "#" ASMALIGN(4)                      \
 
791
        "4:                             \n\t"
793
792
/* Z_COND_IDCT is defined outside this excerpt.
 * NOTE(review): its last argument (6f / 5f) looks like a forward local label
 * used as a branch target -- confirm against the macro definition. */
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
794
793
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
795
794
 
796
795
/* Drop the current IDCT definition so the name can be redefined below. */
#undef IDCT
797
 
/*
 * IDCT(src0, src4, src1, src5, dst, rounder, shift) -- variant that never
 * reads src1: only src0, src4 and src5 are loaded, so the odd terms B0..B3
 * are the pmaddwd products of src5 alone.
 * `rounder` is stringified and followed by ", %%mm4" and ", %%mm0", forming
 * one instruction on each of the two C4 products. The calls in this excerpt
 * pass /nop.
 * NOTE(review): presumably /nop makes the assembler ignore those two
 * lines -- confirm.
 * Results (An+Bn)>>shift and (An-Bn)>>shift are narrowed with packssdw and
 * stored with movd:
 *   sums        -> dst, 16+dst, 32+dst, 48+dst
 *   differences -> 112+dst, 96+dst, 80+dst, 64+dst
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
798
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
799
 
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
800
 
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
801
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
802
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
803
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
804
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
805
 
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
806
 
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
807
 
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
808
 
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
809
 
        #rounder ", %%mm4                       \n\t"\
810
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
811
 
        #rounder ", %%mm0                       \n\t"\
812
 
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
813
 
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
814
 
        "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
815
 
        "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
816
 
        "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
817
 
        "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
818
 
        "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
819
 
        "movq 72(%2), %%mm7                     \n\t" /* -C5    -C1     -C5     -C1 */\
820
 
        "pmaddwd %%mm3, %%mm7                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
821
 
        "paddd %%mm4, %%mm1                     \n\t" /* A0+B0          a0+b0 */\
822
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
823
 
        "psubd %%mm1, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
824
 
        "psrad $" #shift ", %%mm1               \n\t"\
825
 
        "psrad $" #shift ", %%mm4               \n\t"\
826
 
        "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
827
 
        "paddd %%mm7, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
828
 
        "psubd %%mm7, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
829
 
        "psrad $" #shift ", %%mm0               \n\t"\
830
 
        "psrad $" #shift ", %%mm2               \n\t"\
831
 
        "packssdw %%mm1, %%mm1                  \n\t" /* A0+B0  a0+b0 */\
832
 
        "movd %%mm1, " #dst "                   \n\t"\
833
 
        "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
834
 
        "movd %%mm0, 16+" #dst "                \n\t"\
835
 
        "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
836
 
        "movd %%mm2, 96+" #dst "                \n\t"\
837
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
838
 
        "movd %%mm4, 112+" #dst "               \n\t"\
839
 
        "movq 88(%2), %%mm1                     \n\t" /* C3     C7      C3      C7 */\
840
 
        "pmaddwd %%mm3, %%mm1                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
841
 
        "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
842
 
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
843
 
        "paddd %%mm1, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
844
 
        "psubd %%mm1, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
845
 
        "psrad $" #shift ", %%mm2               \n\t"\
846
 
        "psrad $" #shift ", %%mm5               \n\t"\
847
 
        "movq %%mm6, %%mm1                      \n\t" /* A3             a3 */\
848
 
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
849
 
        "psubd %%mm3, %%mm1                     \n\t" /* A3-B3          a3-b3 */\
850
 
        "psrad $" #shift ", %%mm6               \n\t"\
851
 
        "psrad $" #shift ", %%mm1               \n\t"\
852
 
        "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
853
 
        "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
854
 
        "movd %%mm2, 32+" #dst "                \n\t"\
855
 
        "packssdw %%mm1, %%mm1                  \n\t" /* A3-B3  a3-b3 */\
856
 
        "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
857
 
        "movd %%mm6, 48+" #dst "                \n\t"\
858
 
        "movd %%mm1, 64+" #dst "                \n\t"\
859
 
        "movd %%mm5, 80+" #dst "                \n\t"   
860
 
 
861
 
/* Expand IDCT four times with shift 20: each call moves the source operands
 * 8 bytes further along %1 and the destination 4 bytes further along %0.
 * The rounder argument is the token /nop.
 * NOTE(review): presumably chosen so that no rounding instruction is
 * assembled -- confirm against the assembler's handling of "/nop, %mmN". */
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
862
 
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
863
 
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
864
 
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
865
 
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
866
 
        /* Jump forward to local label 9, which is outside this excerpt. */
        "jmp 9f                                 \n\t"
867
 
 
868
 
        /* The directive text begins with '#'.
         * NOTE(review): presumably that leaves the alignment request commented
         * out in the emitted assembly -- confirm. */
        "#.balign 16                            \n\t"\
869
 
        "6:                                     \n\t"
 
796
/*
 * IDCT(src0, src4, src1, src5, dst, shift) -- variant that never reads src1:
 * only src0, src4 and src5 are loaded, so the odd terms B0..B3 are the
 * pmaddwd products of src5 alone. The even terms A0..A3 are built from
 * src0/src4 as in the per-instruction comments.
 * Results (An+Bn)>>shift and (An-Bn)>>shift are narrowed with packssdw and
 * stored with movd:
 *   sums        -> dst, 16+dst, 32+dst, 48+dst
 *   differences -> 112+dst, 96+dst, 80+dst, 64+dst
 * This variant adds no rounding term.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
 
797
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
 
798
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
 
799
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
 
800
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
 
801
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
802
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
 
803
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
804
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
 
805
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
 
806
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
 
807
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
 
808
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
809
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
 
810
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
 
811
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
812
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
 
813
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
 
814
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
 
815
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
 
816
        "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
 
817
        "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
 
818
        "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
 
819
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
 
820
        "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
 
821
        "psrad $" #shift ", %%mm1       \n\t"\
 
822
        "psrad $" #shift ", %%mm4       \n\t"\
 
823
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
 
824
        "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
 
825
        "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
 
826
        "psrad $" #shift ", %%mm0       \n\t"\
 
827
        "psrad $" #shift ", %%mm2       \n\t"\
 
828
        "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
 
829
        "movd %%mm1, " #dst "           \n\t"\
 
830
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
 
831
        "movd %%mm0, 16+" #dst "        \n\t"\
 
832
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
 
833
        "movd %%mm2, 96+" #dst "        \n\t"\
 
834
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
 
835
        "movd %%mm4, 112+" #dst "       \n\t"\
 
836
        "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
 
837
        "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
 
838
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
 
839
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
 
840
        "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
 
841
        "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
 
842
        "psrad $" #shift ", %%mm2       \n\t"\
 
843
        "psrad $" #shift ", %%mm5       \n\t"\
 
844
        "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
 
845
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
 
846
        "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
 
847
        "psrad $" #shift ", %%mm6       \n\t"\
 
848
        "psrad $" #shift ", %%mm1       \n\t"\
 
849
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
 
850
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
 
851
        "movd %%mm2, 32+" #dst "        \n\t"\
 
852
        "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
 
853
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
 
854
        "movd %%mm6, 48+" #dst "        \n\t"\
 
855
        "movd %%mm1, 64+" #dst "        \n\t"\
 
856
        "movd %%mm5, 80+" #dst "        \n\t"
 
857
 
 
858
/* Expand IDCT four times with shift 20: each call moves the source operands
 * 8 bytes further along %1 and the destination 4 bytes further along %0. */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
 
859
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
 
860
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
 
861
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
 
862
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
 
863
        /* Jump forward to local label 9, which is outside this excerpt. */
        "jmp 9f                         \n\t"
 
864
 
 
865
        /* The "#" literal is concatenated in front of ASMALIGN(4).
         * NOTE(review): presumably that leaves the alignment directive
         * commented out in the emitted assembly -- confirm. */
        "#" ASMALIGN(4)                      \
 
866
        "6:                             \n\t"
870
867
/* Z_COND_IDCT is defined outside this excerpt.
 * NOTE(review): its last argument (7f) looks like a forward local label used
 * as a branch target -- confirm against the macro definition. */
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
871
868
 
872
869
/* Drop the current IDCT definition so the name can be redefined below. */
#undef IDCT
873
 
/*
 * IDCT(src0, src4, src1, src5, dst, rounder, shift) -- variant that reads
 * only src0 and src5; src4 and src1 are never referenced.
 * With no src4 contribution, A0 and A3 are both the C4R4+C4R0 product
 * (mm4, copied to mm6) and A1 and A2 are both the -C4R4+C4R0 product
 * (mm0, copied to mm5). The odd terms B0..B3 are the pmaddwd products of
 * src5 alone.
 * `rounder` is stringified and followed by ", %%mm4" and ", %%mm0", forming
 * one instruction on each of the two C4 products. The calls in this excerpt
 * pass /nop.
 * NOTE(review): presumably /nop makes the assembler ignore those two
 * lines -- confirm.
 * Results (An+Bn)>>shift and (An-Bn)>>shift are narrowed with packssdw and
 * stored with movd:
 *   sums        -> dst, 16+dst, 32+dst, 48+dst
 *   differences -> 112+dst, 96+dst, 80+dst, 64+dst
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
874
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
875
 
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
876
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
877
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
878
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
879
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
880
 
        #rounder ", %%mm4                       \n\t"\
881
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
882
 
        #rounder ", %%mm0                       \n\t"\
883
 
        "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
884
 
        "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
885
 
        "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
886
 
        "movq 72(%2), %%mm7                     \n\t" /* -C5    -C1     -C5     -C1 */\
887
 
        "pmaddwd %%mm3, %%mm7                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
888
 
        "paddd %%mm4, %%mm1                     \n\t" /* A0+B0          a0+b0 */\
889
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
890
 
        "psubd %%mm1, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
891
 
        "psrad $" #shift ", %%mm1               \n\t"\
892
 
        "psrad $" #shift ", %%mm4               \n\t"\
893
 
        "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
894
 
        "paddd %%mm7, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
895
 
        "psubd %%mm7, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
896
 
        "psrad $" #shift ", %%mm0               \n\t"\
897
 
        "psrad $" #shift ", %%mm2               \n\t"\
898
 
        "packssdw %%mm1, %%mm1                  \n\t" /* A0+B0  a0+b0 */\
899
 
        "movd %%mm1, " #dst "                   \n\t"\
900
 
        "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
901
 
        "movd %%mm0, 16+" #dst "                \n\t"\
902
 
        "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
903
 
        "movd %%mm2, 96+" #dst "                \n\t"\
904
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
905
 
        "movd %%mm4, 112+" #dst "               \n\t"\
906
 
        "movq 88(%2), %%mm1                     \n\t" /* C3     C7      C3      C7 */\
907
 
        "pmaddwd %%mm3, %%mm1                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
908
 
        "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
909
 
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
910
 
        "paddd %%mm1, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
911
 
        "psubd %%mm1, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
912
 
        "psrad $" #shift ", %%mm2               \n\t"\
913
 
        "psrad $" #shift ", %%mm5               \n\t"\
914
 
        "movq %%mm6, %%mm1                      \n\t" /* A3             a3 */\
915
 
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
916
 
        "psubd %%mm3, %%mm1                     \n\t" /* A3-B3          a3-b3 */\
917
 
        "psrad $" #shift ", %%mm6               \n\t"\
918
 
        "psrad $" #shift ", %%mm1               \n\t"\
919
 
        "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
920
 
        "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
921
 
        "movd %%mm2, 32+" #dst "                \n\t"\
922
 
        "packssdw %%mm1, %%mm1                  \n\t" /* A3-B3  a3-b3 */\
923
 
        "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
924
 
        "movd %%mm6, 48+" #dst "                \n\t"\
925
 
        "movd %%mm1, 64+" #dst "                \n\t"\
926
 
        "movd %%mm5, 80+" #dst "                \n\t"   
927
 
 
928
 
 
929
 
/* Expand IDCT four times with shift 20: each call moves the source operands
 * 8 bytes further along %1 and the destination 4 bytes further along %0.
 * The rounder argument is the token /nop.
 * NOTE(review): presumably chosen so that no rounding instruction is
 * assembled -- confirm against the assembler's handling of "/nop, %mmN". */
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
930
 
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
931
 
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
932
 
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
933
 
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
934
 
        /* Jump forward to local label 9, which is outside this excerpt. */
        "jmp 9f                                 \n\t"
935
 
 
936
 
        /* The directive text begins with '#'.
         * NOTE(review): presumably that leaves the alignment request commented
         * out in the emitted assembly -- confirm. */
        "#.balign 16                            \n\t"\
937
 
        "2:                                     \n\t"
 
870
/*
 * IDCT(src0, src4, src1, src5, dst, shift) -- variant that reads only src0
 * and src5; src4 and src1 are never referenced.
 * With no src4 contribution, A0 and A3 are both the C4R4+C4R0 product
 * (mm4, copied to mm6) and A1 and A2 are both the -C4R4+C4R0 product
 * (mm0, copied to mm5). The odd terms B0..B3 are the pmaddwd products of
 * src5 alone.
 * Results (An+Bn)>>shift and (An-Bn)>>shift are narrowed with packssdw and
 * stored with movd:
 *   sums        -> dst, 16+dst, 32+dst, 48+dst
 *   differences -> 112+dst, 96+dst, 80+dst, 64+dst
 * This variant adds no rounding term.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
 
871
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
 
872
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
 
873
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
 
874
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
875
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
 
876
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
877
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
878
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
879
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
 
880
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
 
881
        "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
 
882
        "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
 
883
        "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
 
884
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
 
885
        "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
 
886
        "psrad $" #shift ", %%mm1       \n\t"\
 
887
        "psrad $" #shift ", %%mm4       \n\t"\
 
888
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
 
889
        "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
 
890
        "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
 
891
        "psrad $" #shift ", %%mm0       \n\t"\
 
892
        "psrad $" #shift ", %%mm2       \n\t"\
 
893
        "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
 
894
        "movd %%mm1, " #dst "           \n\t"\
 
895
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
 
896
        "movd %%mm0, 16+" #dst "        \n\t"\
 
897
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
 
898
        "movd %%mm2, 96+" #dst "        \n\t"\
 
899
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
 
900
        "movd %%mm4, 112+" #dst "       \n\t"\
 
901
        "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
 
902
        "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
 
903
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
 
904
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
 
905
        "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
 
906
        "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
 
907
        "psrad $" #shift ", %%mm2       \n\t"\
 
908
        "psrad $" #shift ", %%mm5       \n\t"\
 
909
        "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
 
910
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
 
911
        "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
 
912
        "psrad $" #shift ", %%mm6       \n\t"\
 
913
        "psrad $" #shift ", %%mm1       \n\t"\
 
914
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
 
915
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
 
916
        "movd %%mm2, 32+" #dst "        \n\t"\
 
917
        "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
 
918
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
 
919
        "movd %%mm6, 48+" #dst "        \n\t"\
 
920
        "movd %%mm1, 64+" #dst "        \n\t"\
 
921
        "movd %%mm5, 80+" #dst "        \n\t"
 
922
 
 
923
 
 
924
/* Expand IDCT four times with shift 20: each call moves the source operands
 * 8 bytes further along %1 and the destination 4 bytes further along %0. */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
 
925
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
 
926
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
 
927
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
 
928
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
 
929
        /* Jump forward to local label 9, which is outside this excerpt. */
        "jmp 9f                         \n\t"
 
930
 
 
931
        /* The "#" literal is concatenated in front of ASMALIGN(4).
         * NOTE(review): presumably that leaves the alignment directive
         * commented out in the emitted assembly -- confirm. */
        "#" ASMALIGN(4)                      \
 
932
        "2:                             \n\t"
938
933
/* Z_COND_IDCT is defined outside this excerpt.
 * NOTE(review): its last argument (3f) looks like a forward local label used
 * as a branch target -- confirm against the macro definition. */
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
939
934
 
940
935
/* Drop the current IDCT definition so the name can be redefined below. */
#undef IDCT
941
 
/*
 * IDCT(src0, src4, src1, src5, dst, rounder, shift) -- variant that never
 * reads src4: src0, src1 and src5 are loaded (src1 twice: into mm2 and
 * later into mm0).
 * With no src4 contribution, A0 and A3 are both the C4R4+C4R0 product
 * (mm4, copied to mm6) and A1 and A2 are both the -C4R4+C4R0 product
 * (mm0, copied to mm5). The odd terms B0..B3 combine pmaddwd products of
 * src1 and src5.
 * `rounder` is stringified and followed by ", %%mm4" and ", %%mm0", forming
 * one instruction on each of the two C4 products. The calls in this excerpt
 * pass /nop.
 * NOTE(review): presumably /nop makes the assembler ignore those two
 * lines -- confirm.
 * Results (An+Bn)>>shift and (An-Bn)>>shift are narrowed with packssdw and
 * stored with movd:
 *   sums        -> dst, 16+dst, 32+dst, 48+dst
 *   differences -> 112+dst, 96+dst, 80+dst, 64+dst
 */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
942
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
943
 
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
944
 
        "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
945
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
946
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
947
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
948
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
949
 
        #rounder ", %%mm4                       \n\t"\
950
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
951
 
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
952
 
        #rounder ", %%mm0                       \n\t"\
953
 
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
954
 
        "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
955
 
        "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
956
 
        "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
957
 
        "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
958
 
        "paddd %%mm1, %%mm7                     \n\t" /* B0             b0 */\
959
 
        "movq 72(%2), %%mm1                     \n\t" /* -C5    -C1     -C5     -C1 */\
960
 
        "pmaddwd %%mm3, %%mm1                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
961
 
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
962
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
963
 
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
964
 
        "paddd %%mm2, %%mm1                     \n\t" /* B1             b1 */\
965
 
        "psrad $" #shift ", %%mm7               \n\t"\
966
 
        "psrad $" #shift ", %%mm4               \n\t"\
967
 
        "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
968
 
        "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
969
 
        "psubd %%mm1, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
970
 
        "psrad $" #shift ", %%mm0               \n\t"\
971
 
        "psrad $" #shift ", %%mm2               \n\t"\
972
 
        "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
973
 
        "movd %%mm7, " #dst "                   \n\t"\
974
 
        "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
975
 
        "movd %%mm0, 16+" #dst "                \n\t"\
976
 
        "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
977
 
        "movd %%mm2, 96+" #dst "                \n\t"\
978
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
979
 
        "movd %%mm4, 112+" #dst "               \n\t"\
980
 
        "movq " #src1 ", %%mm0                  \n\t" /* R3     R1      r3      r1 */\
981
 
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
982
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
983
 
        "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
984
 
        "pmaddwd 96(%2), %%mm0                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
985
 
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
986
 
        "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
987
 
        "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
988
 
        "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
989
 
        "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
990
 
        "psubd %%mm4, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
991
 
        "psrad $" #shift ", %%mm2               \n\t"\
992
 
        "psrad $" #shift ", %%mm5               \n\t"\
993
 
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
994
 
        "paddd %%mm0, %%mm3                     \n\t" /* B3             b3 */\
995
 
        "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
996
 
        "psubd %%mm3, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
997
 
        "psrad $" #shift ", %%mm6               \n\t"\
998
 
        "psrad $" #shift ", %%mm4               \n\t"\
999
 
        "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
1000
 
        "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
1001
 
        "movd %%mm2, 32+" #dst "                \n\t"\
1002
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
1003
 
        "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
1004
 
        "movd %%mm6, 48+" #dst "                \n\t"\
1005
 
        "movd %%mm4, 64+" #dst "                \n\t"\
1006
 
        "movd %%mm5, 80+" #dst "                \n\t"
1007
 
 
1008
 
/* Expand IDCT four times with shift 20: each call moves the source operands
 * 8 bytes further along %1 and the destination 4 bytes further along %0.
 * The rounder argument is the token /nop.
 * NOTE(review): presumably chosen so that no rounding instruction is
 * assembled -- confirm against the assembler's handling of "/nop, %mmN". */
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1009
 
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1010
 
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1011
 
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1012
 
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1013
 
        /* Jump forward to local label 9, which is outside this excerpt. */
        "jmp 9f                                 \n\t"
1014
 
 
1015
 
        /* The directive text begins with '#'.
         * NOTE(review): presumably that leaves the alignment request commented
         * out in the emitted assembly -- confirm. */
        "#.balign 16                            \n\t"\
1016
 
        "3:                                     \n\t"
1017
 
/*
 * IDCT variant placed after local label 3.
 * Only src0 and src1 are read; src4 and src5 are accepted but never
 * referenced. The even part therefore consists of the two C4 products
 * alone (mm4/mm6 hold A0 = A3, mm0/mm5 hold A1 = A2) and the odd part of
 * the four products with src1 (B0..B3).
 * Each sum and difference is shifted right by shift, packed with signed
 * saturation (packssdw) and stored with movd at dst + 0, 16, 32, 48, 64,
 * 80, 96 and 112. Coefficients are read from the table addressed by %2.
 * rounder is stringized and pasted in front of an operand, so the caller
 * supplies the start of that assembler line.
 */
#undef IDCT
1018
 
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1019
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
1020
 
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
1021
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
1022
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1023
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
1024
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1025
 
        #rounder ", %%mm4                       \n\t" /* line starts with the rounder text */\
1026
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1027
 
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
1028
 
        #rounder ", %%mm0                       \n\t" /* line starts with the rounder text */\
1029
 
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1030
 
        "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1031
 
        "movq 64(%2), %%mm3                     \n\t" /* -C7    C3      -C7     C3 */\
1032
 
        "pmaddwd %%mm2, %%mm3                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1033
 
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
1034
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
1035
 
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
1036
 
        "psrad $" #shift ", %%mm7               \n\t"\
1037
 
        "psrad $" #shift ", %%mm4               \n\t"\
1038
 
        "movq %%mm0, %%mm1                      \n\t" /* A1             a1 */\
1039
 
        "paddd %%mm3, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
1040
 
        "psubd %%mm3, %%mm1                     \n\t" /* A1-B1          a1-b1 */\
1041
 
        "psrad $" #shift ", %%mm0               \n\t"\
1042
 
        "psrad $" #shift ", %%mm1               \n\t"\
1043
 
        "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
1044
 
        "movd %%mm7, " #dst "                   \n\t"\
1045
 
        "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
1046
 
        "movd %%mm0, 16+" #dst "                \n\t"\
1047
 
        "packssdw %%mm1, %%mm1                  \n\t" /* A1-B1  a1-b1 */\
1048
 
        "movd %%mm1, 96+" #dst "                \n\t"\
1049
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
1050
 
        "movd %%mm4, 112+" #dst "               \n\t"\
1051
 
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
1052
 
        "pmaddwd %%mm2, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1053
 
        "pmaddwd 96(%2), %%mm2                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1054
 
        "movq %%mm5, %%mm1                      \n\t" /* A2             a2 */\
1055
 
        "paddd %%mm4, %%mm1                     \n\t" /* A2+B2          a2+b2 */\
1056
 
        "psubd %%mm4, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
1057
 
        "psrad $" #shift ", %%mm1               \n\t"\
1058
 
        "psrad $" #shift ", %%mm5               \n\t"\
1059
 
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
1060
 
        "paddd %%mm2, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
1061
 
        "psubd %%mm2, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
1062
 
        "psrad $" #shift ", %%mm6               \n\t"\
1063
 
        "psrad $" #shift ", %%mm4               \n\t"\
1064
 
        "packssdw %%mm1, %%mm1                  \n\t" /* A2+B2  a2+b2 */\
1065
 
        "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
1066
 
        "movd %%mm1, 32+" #dst "                \n\t"\
1067
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
1068
 
        "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
1069
 
        "movd %%mm6, 48+" #dst "                \n\t"\
1070
 
        "movd %%mm4, 64+" #dst "                \n\t"\
1071
 
        "movd %%mm5, 80+" #dst "                \n\t"
1072
 
 
1073
 
 
1074
 
/* Four expansions of the IDCT macro currently in effect, stepping the
 * source offsets by 8 bytes and the destination offset by 4 bytes; the
 * shift argument is 20 every time.
 * NOTE(review): the rounder argument /nop presumably yields an assembler
 * comment line instead of an instruction - confirm for the assembler used. */
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1075
 
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1076
 
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1077
 
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1078
 
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1079
 
        /* Skip the remaining variants by jumping forward to local label 9. */
        "jmp 9f                                 \n\t"
1080
 
 
1081
 
        /* NOTE(review): the leading # presumably turns the alignment
         * directive into an assembler comment - confirm. */
        "#.balign 16                            \n\t"\
1082
 
        /* Local label 5: start of the code emitted by the next IDCT variant. */
        "5:                                     \n\t"
1083
 
/*
 * IDCT variant placed after local label 5.
 * Only src0 and src4 are read, each at its given offset and again at
 * offset + 8; src1 and src5 are accepted but never referenced. No odd (B)
 * terms are computed: the macro forms A0..A3 for both 8-byte halves,
 * shifts them right by shift, packs the two halves together with signed
 * saturation (packssdw) and stores 8 bytes per movq.
 * Because there is no odd part, each value is stored twice:
 * A0 at dst + 0 and 112, A1 at dst + 16 and 96, A2 at dst + 32 and 80,
 * A3 at dst + 48 and 64.
 * rounder is stringized and pasted in front of an operand, so the caller
 * supplies the start of that assembler line.
 */
#undef IDCT
1084
 
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1085
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
1086
 
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
1087
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
1088
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1089
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
1090
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1091
 
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
1092
 
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1093
 
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
1094
 
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1095
 
        #rounder ", %%mm4                       \n\t" /* line starts with the rounder text */\
1096
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1097
 
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
1098
 
        #rounder ", %%mm0                       \n\t" /* line starts with the rounder text */\
1099
 
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
1100
 
        "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1101
 
        "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
1102
 
        "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
1103
 
        "movq 8+" #src0 ", %%mm2                \n\t" /* R4     R0      r4      r0 */\
1104
 
        "movq 8+" #src4 ", %%mm3                \n\t" /* R6     R2      r6      r2 */\
1105
 
        "movq 16(%2), %%mm1                     \n\t" /* C4     C4      C4      C4 */\
1106
 
        "pmaddwd %%mm2, %%mm1                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1107
 
        "movq 24(%2), %%mm7                     \n\t" /* -C4    C4      -C4     C4 */\
1108
 
        "pmaddwd %%mm7, %%mm2                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1109
 
        "movq 32(%2), %%mm7                     \n\t" /* C6     C2      C6      C2 */\
1110
 
        "pmaddwd %%mm3, %%mm7                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1111
 
        "pmaddwd 40(%2), %%mm3                  \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1112
 
        #rounder ", %%mm1                       \n\t" /* line starts with the rounder text */\
1113
 
        "paddd %%mm1, %%mm7                     \n\t" /* A0             a0 */\
1114
 
        "paddd %%mm1, %%mm1                     \n\t" /* 2C0            2c0 */\
1115
 
        #rounder ", %%mm2                       \n\t" /* line starts with the rounder text */\
1116
 
        "psubd %%mm7, %%mm1                     \n\t" /* A3             a3 */\
1117
 
        "paddd %%mm2, %%mm3                     \n\t" /* A1             a1 */\
1118
 
        "paddd %%mm2, %%mm2                     \n\t" /* 2C1            2c1 */\
1119
 
        "psubd %%mm3, %%mm2                     \n\t" /* A2             a2 */\
1120
 
        "psrad $" #shift ", %%mm4               \n\t"\
1121
 
        "psrad $" #shift ", %%mm7               \n\t"\
1122
 
        "psrad $" #shift ", %%mm3               \n\t"\
1123
 
        "packssdw %%mm7, %%mm4                  \n\t" /* A0     a0 */\
1124
 
        "movq %%mm4, " #dst "                   \n\t"\
1125
 
        "psrad $" #shift ", %%mm0               \n\t"\
1126
 
        "packssdw %%mm3, %%mm0                  \n\t" /* A1     a1 */\
1127
 
        "movq %%mm0, 16+" #dst "                \n\t"\
1128
 
        "movq %%mm0, 96+" #dst "                \n\t"\
1129
 
        "movq %%mm4, 112+" #dst "               \n\t"\
1130
 
        "psrad $" #shift ", %%mm5               \n\t"\
1131
 
        "psrad $" #shift ", %%mm6               \n\t"\
1132
 
        "psrad $" #shift ", %%mm2               \n\t"\
1133
 
        "packssdw %%mm2, %%mm5                  \n\t" /* A2     a2 (no B terms in this variant) */\
1134
 
        "movq %%mm5, 32+" #dst "                \n\t"\
1135
 
        "psrad $" #shift ", %%mm1               \n\t"\
1136
 
        "packssdw %%mm1, %%mm6                  \n\t" /* A3     a3 (no B terms in this variant) */\
1137
 
        "movq %%mm6, 48+" #dst "                \n\t"\
1138
 
        "movq %%mm6, 64+" #dst "                \n\t"\
1139
 
        "movq %%mm5, 80+" #dst "                \n\t"   
1140
 
        
1141
 
 
1142
 
/* Only two expansions are active here. The IDCT macro in effect also reads
 * its src0 and src4 operands at offset + 8 and stores 8 bytes per movq, so
 * the expansion at source offset 0 covers offset 8 as well and the one at
 * 16 covers 24; the two remaining invocations stay commented out.
 * NOTE(review): the rounder argument /nop presumably yields an assembler
 * comment line instead of an instruction - confirm for the assembler used. */
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1143
 
IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1144
 
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1145
 
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1146
 
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1147
 
        /* Skip the remaining variants by jumping forward to local label 9. */
        "jmp 9f                                 \n\t"
1148
 
 
1149
 
 
1150
 
        /* NOTE(review): the leading # presumably turns the alignment
         * directive into an assembler comment - confirm. */
        "#.balign 16                            \n\t"\
1151
 
        /* Local label 1: start of the code emitted by the next IDCT variant. */
        "1:                                     \n\t"
1152
 
/*
 * IDCT variant placed after local label 1.
 * Reads src0, src4 and src1; src5 is accepted but never referenced.
 * The even part A0..A3 is built from the C4 products of src0 combined with
 * the C2/C6 products of src4; the odd part B0..B3 consists of the four
 * products with src1 alone.
 * Each sum and difference is shifted right by shift, packed with signed
 * saturation (packssdw) and stored with movd at dst + 0, 16, 32, 48, 64,
 * 80, 96 and 112. Coefficients are read from the table addressed by %2.
 * rounder is stringized and pasted in front of an operand, so the caller
 * supplies the start of that assembler line.
 */
#undef IDCT
1153
 
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1154
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
1155
 
        "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
1156
 
        "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
1157
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
1158
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1159
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
1160
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1161
 
        "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
1162
 
        "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1163
 
        "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
1164
 
        "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1165
 
        #rounder ", %%mm4                       \n\t" /* line starts with the rounder text */\
1166
 
        "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1167
 
        "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
1168
 
        #rounder ", %%mm0                       \n\t" /* line starts with the rounder text */\
1169
 
        "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1170
 
        "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
1171
 
        "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
1172
 
        "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1173
 
        "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
1174
 
        "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
1175
 
        "movq 64(%2), %%mm1                     \n\t" /* -C7    C3      -C7     C3 */\
1176
 
        "pmaddwd %%mm2, %%mm1                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1177
 
        "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
1178
 
        "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
1179
 
        "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
1180
 
        "psrad $" #shift ", %%mm7               \n\t"\
1181
 
        "psrad $" #shift ", %%mm4               \n\t"\
1182
 
        "movq %%mm0, %%mm3                      \n\t" /* A1             a1 */\
1183
 
        "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
1184
 
        "psubd %%mm1, %%mm3                     \n\t" /* A1-B1          a1-b1 */\
1185
 
        "psrad $" #shift ", %%mm0               \n\t"\
1186
 
        "psrad $" #shift ", %%mm3               \n\t"\
1187
 
        "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
1188
 
        "movd %%mm7, " #dst "                   \n\t"\
1189
 
        "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
1190
 
        "movd %%mm0, 16+" #dst "                \n\t"\
1191
 
        "packssdw %%mm3, %%mm3                  \n\t" /* A1-B1  a1-b1 */\
1192
 
        "movd %%mm3, 96+" #dst "                \n\t"\
1193
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
1194
 
        "movd %%mm4, 112+" #dst "               \n\t"\
1195
 
        "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
1196
 
        "pmaddwd %%mm2, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1197
 
        "pmaddwd 96(%2), %%mm2                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1198
 
        "movq %%mm5, %%mm3                      \n\t" /* A2             a2 */\
1199
 
        "paddd %%mm4, %%mm3                     \n\t" /* A2+B2          a2+b2 */\
1200
 
        "psubd %%mm4, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
1201
 
        "psrad $" #shift ", %%mm3               \n\t"\
1202
 
        "psrad $" #shift ", %%mm5               \n\t"\
1203
 
        "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
1204
 
        "paddd %%mm2, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
1205
 
        "psubd %%mm2, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
1206
 
        "psrad $" #shift ", %%mm6               \n\t"\
1207
 
        "packssdw %%mm3, %%mm3                  \n\t" /* A2+B2  a2+b2 */\
1208
 
        "movd %%mm3, 32+" #dst "                \n\t"\
1209
 
        "psrad $" #shift ", %%mm4               \n\t"\
1210
 
        "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
1211
 
        "movd %%mm6, 48+" #dst "                \n\t"\
1212
 
        "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
1213
 
        "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
1214
 
        "movd %%mm4, 64+" #dst "                \n\t"\
1215
 
        "movd %%mm5, 80+" #dst "                \n\t"
1216
 
        
1217
 
 
1218
 
/* Four expansions of the IDCT macro currently in effect, stepping the
 * source offsets by 8 bytes and the destination offset by 4 bytes; the
 * shift argument is 20 every time.
 * NOTE(review): the rounder argument /nop presumably yields an assembler
 * comment line instead of an instruction - confirm for the assembler used. */
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1219
 
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1220
 
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1221
 
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1222
 
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1223
 
        /* Skip the remaining variant by jumping forward to local label 9. */
        "jmp 9f                                 \n\t"
1224
 
 
1225
 
 
1226
 
        /* NOTE(review): the leading # presumably turns the alignment
         * directive into an assembler comment - confirm. */
        "#.balign 16                            \n\t"
1227
 
        /* Local label 7: start of the code emitted by the next IDCT variant. */
        "7:                                     \n\t"
1228
 
/*
 * IDCT variant placed after local label 7.
 * Only src0 is read, at its given offset and again at offset + 8; src4,
 * src1 and src5 are accepted but never referenced. Just the two C4
 * products are formed for each 8-byte half, shifted right by shift and
 * packed together with signed saturation (packssdw).
 * The C4R4+C4R0 result (mm4) is stored with movq at dst + 0, 48, 64 and
 * 112; the -C4R4+C4R0 result (mm0) at dst + 16, 32, 80 and 96.
 * rounder is stringized and pasted in front of an operand, so the caller
 * supplies the start of that assembler line.
 */
#undef IDCT
1229
 
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1230
 
        "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
1231
 
        "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
1232
 
        "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1233
 
        "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
1234
 
        "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1235
 
        #rounder ", %%mm4                       \n\t" /* line starts with the rounder text */\
1236
 
        #rounder ", %%mm0                       \n\t" /* line starts with the rounder text */\
1237
 
        "psrad $" #shift ", %%mm4               \n\t"\
1238
 
        "psrad $" #shift ", %%mm0               \n\t"\
1239
 
        "movq 8+" #src0 ", %%mm2                \n\t" /* R4     R0      r4      r0 */\
1240
 
        "movq 16(%2), %%mm1                     \n\t" /* C4     C4      C4      C4 */\
1241
 
        "pmaddwd %%mm2, %%mm1                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1242
 
        "movq 24(%2), %%mm7                     \n\t" /* -C4    C4      -C4     C4 */\
1243
 
        "pmaddwd %%mm7, %%mm2                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1244
 
        /* NOTE(review): mm7 is not read again inside this macro after the next load */\
        "movq 32(%2), %%mm7                     \n\t" /* C6     C2      C6      C2 */\
1245
 
        #rounder ", %%mm1                       \n\t" /* line starts with the rounder text */\
1246
 
        #rounder ", %%mm2                       \n\t" /* line starts with the rounder text */\
1247
 
        "psrad $" #shift ", %%mm1               \n\t"\
1248
 
        "packssdw %%mm1, %%mm4                  \n\t" /* A0     a0 */\
1249
 
        "movq %%mm4, " #dst "                   \n\t"\
1250
 
        "psrad $" #shift ", %%mm2               \n\t"\
1251
 
        "packssdw %%mm2, %%mm0                  \n\t" /* A1     a1 */\
1252
 
        "movq %%mm0, 16+" #dst "                \n\t"\
1253
 
        "movq %%mm0, 96+" #dst "                \n\t"\
1254
 
        "movq %%mm4, 112+" #dst "               \n\t"\
1255
 
        "movq %%mm0, 32+" #dst "                \n\t"\
1256
 
        "movq %%mm4, 48+" #dst "                \n\t"\
1257
 
        "movq %%mm4, 64+" #dst "                \n\t"\
1258
 
        "movq %%mm0, 80+" #dst "                \n\t"   
1259
 
 
1260
 
/* Only two expansions are active here. The IDCT macro in effect also reads
 * its src0 operand at offset + 8 and stores 8 bytes per movq, so the
 * expansion at source offset 0 covers offset 8 as well and the one at 16
 * covers 24; the two remaining invocations stay commented out.
 * NOTE(review): the rounder argument /nop presumably yields an assembler
 * comment line instead of an instruction - confirm for the assembler used. */
//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1261
 
IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1262
 
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1263
 
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1264
 
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 
936
/*
 * IDCT variant that reads src0, src1 and src5; src4 is accepted but never
 * referenced. The even part consists of the two C4 products of src0 alone
 * (mm4/mm6 hold A0 = A3, mm0/mm5 hold A1 = A2); the odd part B0..B3
 * combines the products with src1 and with src5.
 * Each sum and difference is shifted right by shift, packed with signed
 * saturation (packssdw) and stored with movd at dst + 0, 16, 32, 48, 64,
 * 80, 96 and 112. Coefficients are read from the table addressed by %2.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
 
937
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
 
938
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
 
939
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
 
940
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
 
941
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
942
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
 
943
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
944
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
945
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
 
946
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
 
947
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
948
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
 
949
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
 
950
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
 
951
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
 
952
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
 
953
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
 
954
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
 
955
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
 
956
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
 
957
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
 
958
        "psrad $" #shift ", %%mm7       \n\t"\
 
959
        "psrad $" #shift ", %%mm4       \n\t"\
 
960
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
 
961
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
 
962
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
 
963
        "psrad $" #shift ", %%mm0       \n\t"\
 
964
        "psrad $" #shift ", %%mm2       \n\t"\
 
965
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
 
966
        "movd %%mm7, " #dst "           \n\t"\
 
967
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
 
968
        "movd %%mm0, 16+" #dst "        \n\t"\
 
969
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
 
970
        "movd %%mm2, 96+" #dst "        \n\t"\
 
971
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
 
972
        "movd %%mm4, 112+" #dst "       \n\t"\
 
973
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 (reloaded: mm2 was overwritten above) */\
 
974
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
 
975
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
 
976
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
 
977
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
 
978
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
 
979
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
 
980
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
 
981
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
 
982
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
 
983
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
 
984
        "psrad $" #shift ", %%mm2       \n\t"\
 
985
        "psrad $" #shift ", %%mm5       \n\t"\
 
986
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
 
987
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
 
988
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
 
989
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
 
990
        "psrad $" #shift ", %%mm6       \n\t"\
 
991
        "psrad $" #shift ", %%mm4       \n\t"\
 
992
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
 
993
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
 
994
        "movd %%mm2, 32+" #dst "        \n\t"\
 
995
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
 
996
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
 
997
        "movd %%mm6, 48+" #dst "        \n\t"\
 
998
        "movd %%mm4, 64+" #dst "        \n\t"\
 
999
        "movd %%mm5, 80+" #dst "        \n\t"
 
1000
 
 
1001
/* Four expansions of the IDCT macro currently in effect. Each expansion
 * moves the source offsets forward by 8 bytes and the destination offset
 * forward by 4 bytes; the shift argument is 20 every time. */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
 
1002
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
 
1003
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
 
1004
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
 
1005
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
 
1006
        /* Skip the remaining variants by jumping forward to local label 9. */
        "jmp 9f                         \n\t"
 
1007
 
 
1008
        /* NOTE(review): ASMALIGN is defined outside this excerpt; the # string
         * in front presumably comments out whatever it expands to - confirm. */
        "#" ASMALIGN(4)                      \
 
1009
        /* Local label 3: start of the code emitted by the next IDCT variant. */
        "3:                             \n\t"
 
1010
/*
 * IDCT variant placed after local label 3.
 * Only src0 and src1 are read; src4 and src5 are accepted but never
 * referenced. The even part therefore consists of the two C4 products
 * alone (mm4/mm6 hold A0 = A3, mm0/mm5 hold A1 = A2) and the odd part of
 * the four products with src1 (B0..B3).
 * Each sum and difference is shifted right by shift, packed with signed
 * saturation (packssdw) and stored with movd at dst + 0, 16, 32, 48, 64,
 * 80, 96 and 112. Coefficients are read from the table addressed by %2.
 */
#undef IDCT
 
1011
#define IDCT(src0, src4, src1, src5, dst, shift) \
 
1012
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
 
1013
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
 
1014
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
 
1015
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
1016
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
 
1017
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
1018
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
1019
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
 
1020
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
 
1021
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
1022
        "movq 64(%2), %%mm3             \n\t" /* -C7    C3      -C7     C3 */\
 
1023
        "pmaddwd %%mm2, %%mm3           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
 
1024
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
 
1025
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
 
1026
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
 
1027
        "psrad $" #shift ", %%mm7       \n\t"\
 
1028
        "psrad $" #shift ", %%mm4       \n\t"\
 
1029
        "movq %%mm0, %%mm1              \n\t" /* A1             a1 */\
 
1030
        "paddd %%mm3, %%mm0             \n\t" /* A1+B1          a1+b1 */\
 
1031
        "psubd %%mm3, %%mm1             \n\t" /* A1-B1          a1-b1 */\
 
1032
        "psrad $" #shift ", %%mm0       \n\t"\
 
1033
        "psrad $" #shift ", %%mm1       \n\t"\
 
1034
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
 
1035
        "movd %%mm7, " #dst "           \n\t"\
 
1036
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
 
1037
        "movd %%mm0, 16+" #dst "        \n\t"\
 
1038
        "packssdw %%mm1, %%mm1          \n\t" /* A1-B1  a1-b1 */\
 
1039
        "movd %%mm1, 96+" #dst "        \n\t"\
 
1040
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
 
1041
        "movd %%mm4, 112+" #dst "       \n\t"\
 
1042
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
 
1043
        "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
 
1044
        "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
 
1045
        "movq %%mm5, %%mm1              \n\t" /* A2             a2 */\
 
1046
        "paddd %%mm4, %%mm1             \n\t" /* A2+B2          a2+b2 */\
 
1047
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
 
1048
        "psrad $" #shift ", %%mm1       \n\t"\
 
1049
        "psrad $" #shift ", %%mm5       \n\t"\
 
1050
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
 
1051
        "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
 
1052
        "psubd %%mm2, %%mm4             \n\t" /* A3-B3          a3-b3 */\
 
1053
        "psrad $" #shift ", %%mm6       \n\t"\
 
1054
        "psrad $" #shift ", %%mm4       \n\t"\
 
1055
        "packssdw %%mm1, %%mm1          \n\t" /* A2+B2  a2+b2 */\
 
1056
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
 
1057
        "movd %%mm1, 32+" #dst "        \n\t"\
 
1058
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
 
1059
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
 
1060
        "movd %%mm6, 48+" #dst "        \n\t"\
 
1061
        "movd %%mm4, 64+" #dst "        \n\t"\
 
1062
        "movd %%mm5, 80+" #dst "        \n\t"
 
1063
 
 
1064
 
 
1065
/* Four expansions of the IDCT macro currently in effect, stepping the
 * source offsets by 8 bytes and the destination offset by 4 bytes; the
 * shift argument is 20 every time. */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
 
1066
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
 
1067
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
 
1068
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
 
1069
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
 
1070
        /* Skip the remaining variants by jumping forward to local label 9. */
        "jmp 9f                         \n\t"
 
1071
 
 
1072
        /* NOTE(review): ASMALIGN is defined outside this excerpt; the # string
         * in front presumably comments out whatever it expands to - confirm. */
        "#" ASMALIGN(4)                      \
 
1073
        /* Local label 5: start of the code emitted by the next IDCT variant. */
        "5:                             \n\t"
 
1074
/*
 * IDCT variant placed after local label 5.
 * Only src0 and src4 are read, each at its given offset and again at
 * offset + 8; src1 and src5 are accepted but never referenced. No odd (B)
 * terms are computed: the macro forms A0..A3 for both 8-byte halves,
 * shifts them right by shift, packs the two halves together with signed
 * saturation (packssdw) and stores 8 bytes per movq.
 * Because there is no odd part, each value is stored twice:
 * A0 at dst + 0 and 112, A1 at dst + 16 and 96, A2 at dst + 32 and 80,
 * A3 at dst + 48 and 64.
 */
#undef IDCT
 
1075
#define IDCT(src0, src4, src1, src5, dst, shift) \
 
1076
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
 
1077
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
 
1078
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
 
1079
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
1080
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
 
1081
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
1082
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
 
1083
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
 
1084
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
 
1085
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
 
1086
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
1087
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
 
1088
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
 
1089
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
1090
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
 
1091
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
 
1092
        "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
 
1093
        "movq 8+" #src4 ", %%mm3        \n\t" /* R6     R2      r6      r2 */\
 
1094
        "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
 
1095
        "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
1096
        "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
 
1097
        "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
1098
        "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
 
1099
        "pmaddwd %%mm3, %%mm7           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
 
1100
        "pmaddwd 40(%2), %%mm3          \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
 
1101
        "paddd %%mm1, %%mm7             \n\t" /* A0             a0 */\
 
1102
        "paddd %%mm1, %%mm1             \n\t" /* 2C0            2c0 */\
 
1103
        "psubd %%mm7, %%mm1             \n\t" /* A3             a3 */\
 
1104
        "paddd %%mm2, %%mm3             \n\t" /* A1             a1 */\
 
1105
        "paddd %%mm2, %%mm2             \n\t" /* 2C1            2c1 */\
 
1106
        "psubd %%mm3, %%mm2             \n\t" /* A2             a2 */\
 
1107
        "psrad $" #shift ", %%mm4       \n\t"\
 
1108
        "psrad $" #shift ", %%mm7       \n\t"\
 
1109
        "psrad $" #shift ", %%mm3       \n\t"\
 
1110
        "packssdw %%mm7, %%mm4          \n\t" /* A0     a0 */\
 
1111
        "movq %%mm4, " #dst "           \n\t"\
 
1112
        "psrad $" #shift ", %%mm0       \n\t"\
 
1113
        "packssdw %%mm3, %%mm0          \n\t" /* A1     a1 */\
 
1114
        "movq %%mm0, 16+" #dst "        \n\t"\
 
1115
        "movq %%mm0, 96+" #dst "        \n\t"\
 
1116
        "movq %%mm4, 112+" #dst "       \n\t"\
 
1117
        "psrad $" #shift ", %%mm5       \n\t"\
 
1118
        "psrad $" #shift ", %%mm6       \n\t"\
 
1119
        "psrad $" #shift ", %%mm2       \n\t"\
 
1120
        "packssdw %%mm2, %%mm5          \n\t" /* A2     a2 (no B terms in this variant) */\
 
1121
        "movq %%mm5, 32+" #dst "        \n\t"\
 
1122
        "psrad $" #shift ", %%mm1       \n\t"\
 
1123
        "packssdw %%mm1, %%mm6          \n\t" /* A3     a3 (no B terms in this variant) */\
 
1124
        "movq %%mm6, 48+" #dst "        \n\t"\
 
1125
        "movq %%mm6, 64+" #dst "        \n\t"\
 
1126
        "movq %%mm5, 80+" #dst "        \n\t"
 
1127
 
 
1128
 
 
1129
/* Only two expansions are active here. The IDCT macro in effect also reads
 * its src0 and src4 operands at offset + 8 and stores 8 bytes per movq, so
 * the expansion at source offset 0 covers offset 8 as well and the one at
 * 16 covers 24; the two remaining invocations stay commented out. */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
 
1130
IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
 
1131
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
 
1132
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
 
1133
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
 
1134
        /* Skip the remaining variants by jumping forward to local label 9. */
        "jmp 9f                         \n\t"
 
1135
 
 
1136
 
 
1137
        /* NOTE(review): ASMALIGN is defined outside this excerpt; the # string
         * in front presumably comments out whatever it expands to - confirm. */
        "#" ASMALIGN(4)                      \
 
1138
        /* Local label 1: start of the code emitted by the next IDCT variant. */
        "1:                             \n\t"
 
1139
#undef IDCT
 
1140
/*
 * IDCT pass used by the code path that follows local label "1".
 *
 * Reads three 8-byte inputs (src0, src4, src1) and the coefficient table
 * addressed by asm operand %2.  The src5 argument is accepted so that all
 * IDCT variants share one signature, but it is never referenced here.
 * NOTE(review): presumably this path is only taken when the src5 inputs are
 * zero -- confirm against the dispatch code outside this view.
 *
 * Even part A0..A3 is built from src0 (C4 terms) and src4 (C2/C6 terms);
 * odd part B0..B3 is built from src1 only.  Every result is arithmetically
 * shifted right by `shift`, packed to signed 16-bit with saturation
 * (packssdw) and its low 32 bits are stored with movd:
 *
 *     dst +   0 : A0+B0      dst + 112 : A0-B0
 *     dst +  16 : A1+B1      dst +  96 : A1-B1
 *     dst +  32 : A2+B2      dst +  80 : A2-B2
 *     dst +  48 : A3+B3      dst +  64 : A3-B3
 *
 * All of mm0..mm7 are overwritten.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
 
1141
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
 
1142
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
 
1143
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
 
1144
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
 
1145
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
1146
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
 
1147
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
1148
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
 
1149
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
 
1150
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
 
1151
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
 
1152
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
1153
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
 
1154
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
 
1155
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
 
1156
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
 
1157
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
1158
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
 
1159
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
 
1160
        "movq 64(%2), %%mm1             \n\t" /* presumably -C7 C3 -C7 C3, per the product below */\
 
1161
        "pmaddwd %%mm2, %%mm1           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
 
1162
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
 
1163
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
 
1164
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
 
1165
        "psrad $" #shift ", %%mm7       \n\t"\
 
1166
        "psrad $" #shift ", %%mm4       \n\t"\
 
1167
        "movq %%mm0, %%mm3              \n\t" /* A1             a1 */\
 
1168
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
 
1169
        "psubd %%mm1, %%mm3             \n\t" /* A1-B1          a1-b1 */\
 
1170
        "psrad $" #shift ", %%mm0       \n\t"\
 
1171
        "psrad $" #shift ", %%mm3       \n\t"\
 
1172
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
 
1173
        "movd %%mm7, " #dst "           \n\t"\
 
1174
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
 
1175
        "movd %%mm0, 16+" #dst "        \n\t"\
 
1176
        "packssdw %%mm3, %%mm3          \n\t" /* A1-B1  a1-b1 */\
 
1177
        "movd %%mm3, 96+" #dst "        \n\t"\
 
1178
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
 
1179
        "movd %%mm4, 112+" #dst "       \n\t"\
 
1180
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
 
1181
        "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
 
1182
        "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
 
1183
        "movq %%mm5, %%mm3              \n\t" /* A2             a2 */\
 
1184
        "paddd %%mm4, %%mm3             \n\t" /* A2+B2          a2+b2 */\
 
1185
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
 
1186
        "psrad $" #shift ", %%mm3       \n\t"\
 
1187
        "psrad $" #shift ", %%mm5       \n\t"\
 
1188
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
 
1189
        "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
 
1190
        "psubd %%mm2, %%mm4             \n\t" /* A3-B3          a3-b3 */\
 
1191
        "psrad $" #shift ", %%mm6       \n\t"\
 
1192
        "packssdw %%mm3, %%mm3          \n\t" /* A2+B2  a2+b2 */\
 
1193
        "movd %%mm3, 32+" #dst "        \n\t"\
 
1194
        "psrad $" #shift ", %%mm4       \n\t"\
 
1195
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
 
1196
        "movd %%mm6, 48+" #dst "        \n\t"\
 
1197
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
 
1198
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
 
1199
        "movd %%mm4, 64+" #dst "        \n\t"\
 
1200
        "movd %%mm5, 80+" #dst "        \n\t"
 
1201
 
 
1202
 
 
1203
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
 
1204
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
 
1205
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
 
1206
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
 
1207
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
 
1208
        "jmp 9f                         \n\t"
 
1209
 
 
1210
 
 
1211
        "#" ASMALIGN(4)
 
1212
        "7:                             \n\t"
 
1213
#undef IDCT
 
1214
/*
 * IDCT pass used by the code path that follows local label "7".
 *
 * Only src0 and the 8 bytes directly after it (8+src0) are read; the src4,
 * src1 and src5 arguments are accepted so that all IDCT variants share one
 * signature, but they are never referenced here.
 * NOTE(review): presumably this path is only taken when those inputs are
 * zero -- confirm against the dispatch code outside this view.
 *
 * Two values are computed per input, using only the C4 coefficients at
 * 16(%2) and 24(%2), shifted right arithmetically by `shift` and packed to
 * signed 16-bit with saturation:
 *
 *     mm4 = ( C4R4+C4R0) >> shift   -> dst + 0, 48, 64, 112
 *     mm0 = (-C4R4+C4R0) >> shift   -> dst + 16, 32, 80, 96
 *
 * Each store is a full 8-byte movq.  Registers mm0, mm1, mm2, mm4, mm5 and
 * mm7 are overwritten.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
 
1215
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
 
1216
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
 
1217
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
1218
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
 
1219
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
1220
        "psrad $" #shift ", %%mm4       \n\t"\
 
1221
        "psrad $" #shift ", %%mm0       \n\t"\
 
1222
        "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0  (the 8 bytes after src0) */\
 
1223
        "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
 
1224
        "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
 
1225
        "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
 
1226
        "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
 
1227
        "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2  (mm7 is not read again in this macro) */\
 
1228
        "psrad $" #shift ", %%mm1       \n\t"\
 
1229
        "packssdw %%mm1, %%mm4          \n\t" /* A0     a0 */\
 
1230
        "movq %%mm4, " #dst "           \n\t"\
 
1231
        "psrad $" #shift ", %%mm2       \n\t"\
 
1232
        "packssdw %%mm2, %%mm0          \n\t" /* A1     a1 */\
 
1233
        "movq %%mm0, 16+" #dst "        \n\t"\
 
1234
        "movq %%mm0, 96+" #dst "        \n\t"\
 
1235
        "movq %%mm4, 112+" #dst "       \n\t"\
 
1236
        "movq %%mm0, 32+" #dst "        \n\t" /* same value as stored at 16+dst */\
 
1237
        "movq %%mm4, 48+" #dst "        \n\t" /* same value as stored at dst */\
 
1238
        "movq %%mm4, 64+" #dst "        \n\t"\
 
1239
        "movq %%mm0, 80+" #dst "        \n\t"
 
1240
 
 
1241
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
 
1242
IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
 
1243
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
 
1244
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
 
1245
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1265
1246
 
1266
1247
 
1267
1248
#endif
1276
1257
 12 32 16 36 52 72 56 76
1277
1258
 05 45 07 47 25 65 27 67
1278
1259
 15 35 17 37 55 75 57 77
1279
 
  
 
1260
 
1280
1261
Temp
1281
1262
 00 04 10 14 20 24 30 34
1282
1263
 40 44 50 54 60 64 70 74
1289
1270
*/
1290
1271
 
1291
1272
"9: \n\t"
1292
 
                :: "r" (block), "r" (temp), "r" (coeffs)
1293
 
                : "%eax"
1294
 
        );
 
1273
                :: "r" (block), "r" (temp), "r" (coeffs)
 
1274
                : "%eax"
 
1275
        );
1295
1276
}
1296
1277
 
1297
1278
void ff_simple_idct_mmx(int16_t *block)