/*
 * VC-1 and WMV3 decoder - DSP functions AltiVec-optimized
 * Copyright (c) 2006 Konstantin Shishkov
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"

// main steps of 8x8 transform
#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
do { \
    /* even part */ \
    t0 = vec_sl(vec_add(s0, s4), vec_2); \
    t0 = vec_add(vec_sl(t0, vec_1), t0); \
    t0 = vec_add(t0, vec_rnd); \
    t1 = vec_sl(vec_sub(s0, s4), vec_2); \
    t1 = vec_add(vec_sl(t1, vec_1), t1); \
    t1 = vec_add(t1, vec_rnd); \
    t2 = vec_add(vec_sl(s6, vec_2), vec_sl(s6, vec_1)); \
    t2 = vec_add(t2, vec_sl(s2, vec_4)); \
    t3 = vec_add(vec_sl(s2, vec_2), vec_sl(s2, vec_1)); \
    t3 = vec_sub(t3, vec_sl(s6, vec_4)); \
    t4 = vec_add(t0, t2); \
    t5 = vec_add(t1, t3); \
    t6 = vec_sub(t1, t3); \
    t7 = vec_sub(t0, t2); \
\
    /* odd part */ \
    t0 = vec_sl(vec_add(s1, s3), vec_4); \
    t0 = vec_add(t0, vec_sl(s5, vec_3)); \
    t0 = vec_add(t0, vec_sl(s7, vec_2)); \
    t0 = vec_add(t0, vec_sub(s5, s3)); \
\
    t1 = vec_sl(vec_sub(s1, s5), vec_4); \
    t1 = vec_sub(t1, vec_sl(s7, vec_3)); \
    t1 = vec_sub(t1, vec_sl(s3, vec_2)); \
    t1 = vec_sub(t1, vec_add(s1, s7)); \
\
    t2 = vec_sl(vec_sub(s7, s3), vec_4); \
    t2 = vec_add(t2, vec_sl(s1, vec_3)); \
    t2 = vec_add(t2, vec_sl(s5, vec_2)); \
    t2 = vec_add(t2, vec_sub(s1, s7)); \
\
    t3 = vec_sl(vec_sub(s5, s7), vec_4); \
    t3 = vec_sub(t3, vec_sl(s3, vec_3)); \
    t3 = vec_add(t3, vec_sl(s1, vec_2)); \
    t3 = vec_sub(t3, vec_add(s3, s5)); \
\
    /* combine even and odd parts */ \
    s0 = vec_add(t4, t0); \
    s1 = vec_add(t5, t1); \
    s2 = vec_add(t6, t2); \
    s3 = vec_add(t7, t3); \
    s4 = vec_sub(t7, t3); \
    s5 = vec_sub(t6, t2); \
    s6 = vec_sub(t5, t1); \
    s7 = vec_sub(t4, t0); \
} while (0)
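
/* For reference, the shift/add chains in STEP8 are strength-reduced
 * multiplies by the VC-1 8-point transform coefficients. A scalar sketch
 * of what one lane computes (our reading of the code, not part of the
 * original file):
 *
 *     even half:  t0 = 12*(s0 + s4) + rnd;
 *                 t1 = 12*(s0 - s4) + rnd;
 *                 t2 = 16*s2 +  6*s6;
 *                 t3 =  6*s2 - 16*s6;
 *     odd half:   t0 = 16*s1 + 15*s3 +  9*s5 +  4*s7;
 *                 t1 = 15*s1 -  4*s3 - 16*s5 -  9*s7;
 *                 t2 =  9*s1 - 16*s3 +  4*s5 + 15*s7;
 *                 t3 =  4*s1 -  9*s3 + 15*s5 - 16*s7;
 *
 * e.g. the odd-half t0 above is built as
 * ((s1+s3) << 4) + (s5 << 3) + (s7 << 2) + (s5 - s3).
 */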

#define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7) \
do { \
    s0 = vec_sra(s0, vec_3); \
    s1 = vec_sra(s1, vec_3); \
    s2 = vec_sra(s2, vec_3); \
    s3 = vec_sra(s3, vec_3); \
    s4 = vec_sra(s4, vec_3); \
    s5 = vec_sra(s5, vec_3); \
    s6 = vec_sra(s6, vec_3); \
    s7 = vec_sra(s7, vec_3); \
} while (0)
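
/* The row pass folds its rounding constant in through STEP8's vec_rnd
 * argument (vec_4s in the functions below), so SHIFT_HOR8 completes the
 * per-lane computation (sum + 4) >> 3 required after the first pass. */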

#define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7) \
do { \
    s0 = vec_sra(s0, vec_7); \
    s1 = vec_sra(s1, vec_7); \
    s2 = vec_sra(s2, vec_7); \
    s3 = vec_sra(s3, vec_7); \
    s4 = vec_sra(vec_add(s4, vec_1s), vec_7); \
    s5 = vec_sra(vec_add(s5, vec_1s), vec_7); \
    s6 = vec_sra(vec_add(s6, vec_1s), vec_7); \
    s7 = vec_sra(vec_add(s7, vec_1s), vec_7); \
} while (0)
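
/* Second-pass counterpart: with vec_64 as the rounding constant this
 * yields (sum + 64) >> 7 per lane. The extra vec_1s added to s4..s7
 * matches the asymmetric rounding of the lower four output rows, i.e.
 * effectively (sum + 64 + 1) >> 7 for those rows (our reading of the
 * VC-1 rounding rules). */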

/* main steps of 4x4 transform */
#define STEP4(s0, s1, s2, s3, vec_rnd) \
do { \
    t1 = vec_add(vec_sl(s0, vec_4), s0); \
    t1 = vec_add(t1, vec_rnd); \
    t2 = vec_add(vec_sl(s2, vec_4), s2); \
    t0 = vec_add(t1, t2); \
    t1 = vec_sub(t1, t2); \
    t3 = vec_sl(vec_sub(s3, s1), vec_1); \
    t3 = vec_add(t3, vec_sl(t3, vec_2)); \
    t2 = vec_add(t3, vec_sl(s1, vec_5)); \
    t3 = vec_add(t3, vec_sl(s3, vec_3)); \
    t3 = vec_add(t3, vec_sl(s3, vec_2)); \
    s0 = vec_add(t0, t2); \
    s1 = vec_sub(t1, t3); \
    s2 = vec_add(t1, t3); \
    s3 = vec_sub(t0, t2); \
} while (0)
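
/* Scalar sketch of STEP4, again strength-reduced multiplies, here by the
 * VC-1 4-point coefficients {17, 22, 10} (our reading, not original text):
 *
 *     t0 = 17*(s0 + s2) + rnd;
 *     t1 = 17*(s0 - s2) + rnd;
 *     t2 = 22*s1 + 10*s3;        // 10*(s3 - s1) + (s1 << 5)
 *     t3 = 22*s3 - 10*s1;        // 10*(s3 - s1) + (s3 << 3) + (s3 << 2)
 *
 * with outputs s0 = t0 + t2, s1 = t1 - t3, s2 = t1 + t3, s3 = t0 - t2.
 */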

#define SHIFT_HOR4(s0, s1, s2, s3) \
    s0 = vec_sra(s0, vec_3); \
    s1 = vec_sra(s1, vec_3); \
    s2 = vec_sra(s2, vec_3); \
    s3 = vec_sra(s3, vec_3);

#define SHIFT_VERT4(s0, s1, s2, s3) \
    s0 = vec_sra(s0, vec_7); \
    s1 = vec_sra(s1, vec_7); \
    s2 = vec_sra(s2, vec_7); \
    s3 = vec_sra(s3, vec_7);
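
/* The 4-point shifts mirror the 8-point ones: (sum + 4) >> 3 after the
 * row pass and (sum + 64) >> 7 after the column pass, with the rounding
 * constants again supplied through STEP4's vec_rnd argument. */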

/** Do inverse transform on 8x8 block
 */
static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
{
    vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
    vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
    vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
    vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
    const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
    const vector unsigned int vec_7 = vec_splat_u32(7);
    const vector unsigned int vec_5 = vec_splat_u32(5);
    const vector unsigned int vec_4 = vec_splat_u32(4);
    const vector signed int vec_4s = vec_splat_s32(4);
    const vector unsigned int vec_3 = vec_splat_u32(3);
    const vector unsigned int vec_2 = vec_splat_u32(2);
    const vector signed int vec_1s = vec_splat_s32(1);
    const vector unsigned int vec_1 = vec_splat_u32(1);
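
    /* vec_splat_s32() only takes a 5-bit immediate (-16..15), so the
     * second-pass rounding constant 64 is synthesized above as 4 << 4. */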

    src0 = vec_ld(  0, block);
    src1 = vec_ld( 16, block);
    src2 = vec_ld( 32, block);
    src3 = vec_ld( 48, block);
    src4 = vec_ld( 64, block);
    src5 = vec_ld( 80, block);
    src6 = vec_ld( 96, block);
    src7 = vec_ld(112, block);
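
    /* block[] holds 16-bit DCTELEMs, so each 16-byte vec_ld above picks up
     * one complete 8-coefficient row; hence the offsets step by 16. */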

    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);

    /* horizontal (row) pass on 32-bit intermediates; each STEP8 call
     * transforms four of the eight rows */
    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
    SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
    SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);
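
    /* vec_unpackh()/vec_unpackl() widened the two halves of each row to
     * 32 bits for STEP8; vec_pack(s8, s0) narrows them back with the high
     * half first, restoring the original element order before the second
     * transpose. */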

    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);

    /* vertical (column) pass */
    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_64);
    SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64);
    SHIFT_VERT8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);

    vec_st(src0,   0, block);
    vec_st(src1,  16, block);
    vec_st(src2,  32, block);
    vec_st(src3,  48, block);
    vec_st(src4,  64, block);
    vec_st(src5,  80, block);
    vec_st(src6,  96, block);
    vec_st(src7, 112, block);
}

/** Do inverse transform on 8x4 part of block
 */
static void vc1_inv_trans_8x4_altivec(DCTELEM block[64], int n)
{
    vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
    vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
    vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
    vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
    const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
    const vector unsigned int vec_7 = vec_splat_u32(7);
    const vector unsigned int vec_5 = vec_splat_u32(5);
    const vector unsigned int vec_4 = vec_splat_u32(4);
    const vector signed int vec_4s = vec_splat_s32(4);
    const vector unsigned int vec_3 = vec_splat_u32(3);
    const vector unsigned int vec_2 = vec_splat_u32(2);
    const vector unsigned int vec_1 = vec_splat_u32(1);

    src0 = vec_ld(  0, block);
    src1 = vec_ld( 16, block);
    src2 = vec_ld( 32, block);
    src3 = vec_ld( 48, block);
    src4 = vec_ld( 64, block);
    src5 = vec_ld( 80, block);
    src6 = vec_ld( 96, block);
    src7 = vec_ld(112, block);

    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);

    /* horizontal (row) pass: the full 8-point transform on all eight rows */
    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
    SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
    SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);
    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);

    if (!n) { // upper half of block
        s0 = vec_unpackh(src0);
        s1 = vec_unpackh(src1);
        s2 = vec_unpackh(src2);
        s3 = vec_unpackh(src3);
        s8 = vec_unpackl(src0);
        s9 = vec_unpackl(src1);
        sA = vec_unpackl(src2);
        sB = vec_unpackl(src3);
        STEP4(s0, s1, s2, s3, vec_64);
        SHIFT_VERT4(s0, s1, s2, s3);
        STEP4(s8, s9, sA, sB, vec_64);
        SHIFT_VERT4(s8, s9, sA, sB);
        src0 = vec_pack(s0, s8);
        src1 = vec_pack(s1, s9);
        src2 = vec_pack(s2, sA);
        src3 = vec_pack(s3, sB);

        vec_st(src0,  0, block);
        vec_st(src1, 16, block);
        vec_st(src2, 32, block);
        vec_st(src3, 48, block);
    } else { // lower half of block
        s0 = vec_unpackh(src4);
        s1 = vec_unpackh(src5);
        s2 = vec_unpackh(src6);
        s3 = vec_unpackh(src7);
        s8 = vec_unpackl(src4);
        s9 = vec_unpackl(src5);
        sA = vec_unpackl(src6);
        sB = vec_unpackl(src7);
        STEP4(s0, s1, s2, s3, vec_64);
        SHIFT_VERT4(s0, s1, s2, s3);
        STEP4(s8, s9, sA, sB, vec_64);
        SHIFT_VERT4(s8, s9, sA, sB);
        src4 = vec_pack(s0, s8);
        src5 = vec_pack(s1, s9);
        src6 = vec_pack(s2, sA);
        src7 = vec_pack(s3, sB);

        vec_st(src4,  64, block);
        vec_st(src5,  80, block);
        vec_st(src6,  96, block);
        vec_st(src7, 112, block);
    }
}
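
/* Note the asymmetry of the 8x4 case: the 8-point transform runs
 * horizontally over all eight rows, but only the four rows selected by n
 * (0 = upper half, 1 = lower half) receive the 4-point vertical pass and
 * are stored back. */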

void vc1dsp_init_altivec(DSPContext* dsp, AVCodecContext *avctx) {
    dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec;
    dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec;
}
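
/* Entry point for the AltiVec-optimized VC-1 DSP functions. Callers (the
 * PPC dsputil initialization, in our understanding) are expected to invoke
 * this only after AltiVec support has been detected at runtime, since the
 * functions above use AltiVec instructions unconditionally. */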