4
* Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
6
* This library is free software; you can redistribute it and/or
7
* modify it under the terms of the GNU Lesser General Public
8
* License as published by the Free Software Foundation; either
9
* version 2 of the License, or (at your option) any later version.
11
* This library is distributed in the hope that it will be useful,
12
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
* Lesser General Public License for more details.
16
* You should have received a copy of the GNU Lesser General Public
17
* License along with this library; if not, write to the Free Software
18
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27
based upon some commented-out C code from mpeg2dec (idct_mmx.c
28
written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
32
#include "simple_idct.h"
35
/*
 * Fixed-point weights W1..W7 used by the row and column transforms.
 * Two alternative sets are listed: the first scaled by 2048, the second by
 * (1 << 14).
 * NOTE(review): the preprocessor conditionals that choose between the two
 * sets are not visible in this excerpt - confirm which set is active.
 */
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
36
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
37
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
38
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
39
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
40
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
41
#define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
45
/* Second set: in the formula below, i is the index of the weight (Wi). */
#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46
#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
47
#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
48
/* NOTE(review): the formula yields 16384 for i = 4, but the value used is
   16383 - presumably deliberate; confirm before "correcting" it. */
#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
49
#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
50
#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
51
#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
53
/* Right shift applied to the column-pass sums before they are stored. */
#define COL_SHIFT 20 // 6
56
/*
 * Multiply helpers used by the transforms:
 *   MUL16(rt, ra, rb)  sets rt to ra * rb
 *   MAC16(rt, ra, rb)  adds ra * rb to rt
 * When ARCH_POWERPC_405 is defined they are implemented with the maclhw /
 * mullhw instructions through inline asm; the plain C versions follow.
 * NOTE(review): the directive separating the two variants is not visible in
 * this excerpt.
 */
#if defined(ARCH_POWERPC_405)
58
/* signed 16x16 -> 32 multiply add accumulate */
59
#define MAC16(rt, ra, rb) \
60
asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb));
62
/* signed 16x16 -> 32 multiply */
63
#define MUL16(rt, ra, rb) \
64
asm ("mullhw %0, %1, %2" : "=r" (rt) : "r" (ra), "r" (rb));
68
/* signed 16x16 -> 32 multiply add accumulate */
69
/* Plain C form: expands to an assignment expression on rt. */
#define MAC16(rt, ra, rb) rt += (ra) * (rb)
71
/* signed 16x16 -> 32 multiply */
72
#define MUL16(rt, ra, rb) rt = (ra) * (rb)
76
/*
 * In-place 1-D inverse DCT of one row of 8 coefficients.
 *
 * row: the 8 coefficients of the row; overwritten with the transformed values.
 *
 * When every coefficient except row[0] is zero, a shortcut fills the row from
 * row[0] << 3 instead of running the full transform.
 *
 * NOTE(review): the embedded line numbering has gaps throughout this function
 * (braces, preprocessor branches, the declaration of temp and some statements
 * are not visible), so the comments below only describe what is shown.
 */
static inline void idctRowCondDC (DCTELEM * row)
78
int a0, a1, a2, a3, b0, b1, b2, b3;
86
/* ROW0_MASK selects 16 bits of the first 64-bit word of the row: the top 16
   bits on big-endian builds, the low 16 bits otherwise. Presumably these are
   the bits holding row[0] when DCTELEM is 16 bits wide - confirm. */
#ifdef WORDS_BIGENDIAN
87
#define ROW0_MASK 0xffff000000000000LL
89
#define ROW0_MASK 0xffffLL
91
/* 64-bit variant of the shortcut test: with 2-byte elements, the row is read
   as two 64-bit words and everything outside ROW0_MASK must be zero. */
if(sizeof(DCTELEM)==2){
92
if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) |
93
((uint64_t *)row)[1]) == 0) {
94
temp = (row[0] << 3) & 0xffff;
97
/* NOTE(review): the numbering jumps from 94 to 97; statements between the
   computation of temp and these stores may be missing from this view. */
((uint64_t *)row)[0] = temp;
98
((uint64_t *)row)[1] = temp;
102
/* Element-wise form of the same shortcut, for other element sizes. */
if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) {
103
row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3;
108
/* 32-bit variant of the shortcut test: the row is read as 32-bit words.
   NOTE(review): the end of the condition is not visible in this excerpt. */
if(sizeof(DCTELEM)==2){
109
if (!(((uint32_t*)row)[1] |
110
((uint32_t*)row)[2] |
111
((uint32_t*)row)[3] |
113
temp = (row[0] << 3) & 0xffff;
115
((uint32_t*)row)[0]=((uint32_t*)row)[1] =
116
((uint32_t*)row)[2]=((uint32_t*)row)[3] = temp;
120
if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) {
121
row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3;
127
/* Full transform. a0 starts as the row[0] term plus the rounding constant
   for the final >> ROW_SHIFT. */
a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
132
/* no need to optimize : gcc does it */
138
/* b0..b3: contributions of the odd coefficients row[1] and row[3]. */
MUL16(b0, W1, row[1]);
139
MAC16(b0, W3, row[3]);
140
MUL16(b1, W3, row[1]);
141
MAC16(b1, -W7, row[3]);
142
MUL16(b2, W5, row[1]);
143
MAC16(b2, -W1, row[3]);
144
MUL16(b3, W7, row[1]);
145
MAC16(b3, -W5, row[3]);
148
/* temp gathers the second half of the row (one 64-bit load, or the OR of two
   32-bit loads). NOTE(review): the test that uses temp is not visible here. */
temp = ((uint64_t*)row)[1];
150
temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
153
/* Add the row[4] and row[6] terms to a0..a3. */
a0 += W4*row[4] + W6*row[6];
154
a1 += - W4*row[4] - W2*row[6];
155
a2 += - W4*row[4] + W2*row[6];
156
a3 += W4*row[4] - W6*row[6];
158
/* Add the row[5] and row[7] terms to b0..b3. */
MAC16(b0, W5, row[5]);
159
MAC16(b0, W7, row[7]);
161
MAC16(b1, -W1, row[5]);
162
MAC16(b1, -W5, row[7]);
164
MAC16(b2, W7, row[5]);
165
MAC16(b2, W3, row[7]);
167
MAC16(b3, W3, row[5]);
168
MAC16(b3, -W1, row[7]);
171
/* Final butterfly: row[k] takes the sum and row[7-k] the difference, both
   scaled down by ROW_SHIFT. */
row[0] = (a0 + b0) >> ROW_SHIFT;
172
row[7] = (a0 - b0) >> ROW_SHIFT;
173
row[1] = (a1 + b1) >> ROW_SHIFT;
174
row[6] = (a1 - b1) >> ROW_SHIFT;
175
row[2] = (a2 + b2) >> ROW_SHIFT;
176
row[5] = (a2 - b2) >> ROW_SHIFT;
177
row[3] = (a3 + b3) >> ROW_SHIFT;
178
row[4] = (a3 - b3) >> ROW_SHIFT;
181
/*
 * 1-D inverse DCT of one column; the results are looked up in the cm table
 * and written to dest (the previous contents of dest are not read).
 *
 * dest:      output bytes for this column
 * line_size: NOTE(review): not used in the visible statements - presumably
 *            the distance between successive output lines; confirm.
 * The column coefficients are read as col[8*k], i.e. 8 elements apart.
 *
 * NOTE(review): the rest of the parameter list, the braces and several
 * statements are not visible in this excerpt (the embedded numbering skips).
 */
static inline void idctSparseColPut (uint8_t *dest, int line_size,
184
int a0, a1, a2, a3, b0, b1, b2, b3;
185
/* cm is offset by MAX_NEG_CROP into cropTbl - presumably so that negative
   indices remain inside the table; confirm against cropTbl's definition. */
uint8_t *cm = cropTbl + MAX_NEG_CROP;
187
/* XXX: I did that only to give same values as previous code */
188
/* The rounding term for the final >> COL_SHIFT is folded into the col[0]
   input before it is multiplied by W4. */
a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
198
/* b0..b3 accumulate the odd coefficients col[8*1], col[8*3], col[8*5] and
   col[8*7]. */
MUL16(b0, W1, col[8*1]);
199
MUL16(b1, W3, col[8*1]);
200
MUL16(b2, W5, col[8*1]);
201
MUL16(b3, W7, col[8*1]);
203
MAC16(b0, + W3, col[8*3]);
204
MAC16(b1, - W7, col[8*3]);
205
MAC16(b2, - W1, col[8*3]);
206
MAC16(b3, - W5, col[8*3]);
216
MAC16(b0, + W5, col[8*5]);
217
MAC16(b1, - W1, col[8*5]);
218
MAC16(b2, + W7, col[8*5]);
219
MAC16(b3, + W3, col[8*5]);
230
MAC16(b0, + W7, col[8*7]);
231
MAC16(b1, - W5, col[8*7]);
232
MAC16(b2, + W3, col[8*7]);
233
MAC16(b3, - W1, col[8*7]);
236
/* Eight outputs: sums first, then differences in reverse order.
   NOTE(review): every store targets dest[0] and the numbering skips one line
   between stores, so a pointer advance is presumably missing from this view. */
dest[0] = cm[(a0 + b0) >> COL_SHIFT];
238
dest[0] = cm[(a1 + b1) >> COL_SHIFT];
240
dest[0] = cm[(a2 + b2) >> COL_SHIFT];
242
dest[0] = cm[(a3 + b3) >> COL_SHIFT];
244
dest[0] = cm[(a3 - b3) >> COL_SHIFT];
246
dest[0] = cm[(a2 - b2) >> COL_SHIFT];
248
dest[0] = cm[(a1 - b1) >> COL_SHIFT];
250
dest[0] = cm[(a0 - b0) >> COL_SHIFT];
253
/*
 * 1-D inverse DCT of one column; each result is added to the byte already in
 * dest, and the sum is looked up in the cm table before being written back.
 *
 * dest:      bytes to update for this column
 * line_size: NOTE(review): not used in the visible statements - presumably
 *            the distance between successive output lines; confirm.
 * The column coefficients are read as col[8*k], i.e. 8 elements apart.
 *
 * NOTE(review): the rest of the parameter list, the braces and several
 * statements are not visible in this excerpt (the embedded numbering skips).
 */
static inline void idctSparseColAdd (uint8_t *dest, int line_size,
256
int a0, a1, a2, a3, b0, b1, b2, b3;
257
/* cm is offset by MAX_NEG_CROP into cropTbl - presumably so that negative
   indices remain inside the table; confirm against cropTbl's definition. */
uint8_t *cm = cropTbl + MAX_NEG_CROP;
259
/* XXX: I did that only to give same values as previous code */
260
/* The rounding term for the final >> COL_SHIFT is folded into the col[0]
   input before it is multiplied by W4. */
a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
270
/* b0..b3 accumulate the odd coefficients col[8*1], col[8*3], col[8*5] and
   col[8*7]. */
MUL16(b0, W1, col[8*1]);
271
MUL16(b1, W3, col[8*1]);
272
MUL16(b2, W5, col[8*1]);
273
MUL16(b3, W7, col[8*1]);
275
MAC16(b0, + W3, col[8*3]);
276
MAC16(b1, - W7, col[8*3]);
277
MAC16(b2, - W1, col[8*3]);
278
MAC16(b3, - W5, col[8*3]);
288
MAC16(b0, + W5, col[8*5]);
289
MAC16(b1, - W1, col[8*5]);
290
MAC16(b2, + W7, col[8*5]);
291
MAC16(b3, + W3, col[8*5]);
302
MAC16(b0, + W7, col[8*7]);
303
MAC16(b1, - W5, col[8*7]);
304
MAC16(b2, + W3, col[8*7]);
305
MAC16(b3, - W1, col[8*7]);
308
/* Eight read-modify-write updates: sums first, then differences in reverse
   order. NOTE(review): every update targets dest[0] and the numbering skips
   one line between them, so a pointer advance is presumably missing here. */
dest[0] = cm[dest[0] + ((a0 + b0) >> COL_SHIFT)];
310
dest[0] = cm[dest[0] + ((a1 + b1) >> COL_SHIFT)];
312
dest[0] = cm[dest[0] + ((a2 + b2) >> COL_SHIFT)];
314
dest[0] = cm[dest[0] + ((a3 + b3) >> COL_SHIFT)];
316
dest[0] = cm[dest[0] + ((a3 - b3) >> COL_SHIFT)];
318
dest[0] = cm[dest[0] + ((a2 - b2) >> COL_SHIFT)];
320
dest[0] = cm[dest[0] + ((a1 - b1) >> COL_SHIFT)];
322
dest[0] = cm[dest[0] + ((a0 - b0) >> COL_SHIFT)];
325
/*
 * In-place 1-D inverse DCT of one column.
 *
 * col: first element of the column; the column is addressed with a stride of
 *      8 elements (col[8*k] on input, col[0], col[8], ... col[56] on output).
 *
 * NOTE(review): braces and several statements are not visible in this excerpt
 * (the embedded numbering skips).
 */
static inline void idctSparseCol (DCTELEM * col)
327
int a0, a1, a2, a3, b0, b1, b2, b3;
329
/* XXX: I did that only to give same values as previous code */
330
/* The rounding term for the final >> COL_SHIFT is folded into the col[0]
   input before it is multiplied by W4. */
a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
340
/* b0..b3 accumulate the odd coefficients col[8*1], col[8*3], col[8*5] and
   col[8*7]. */
MUL16(b0, W1, col[8*1]);
341
MUL16(b1, W3, col[8*1]);
342
MUL16(b2, W5, col[8*1]);
343
MUL16(b3, W7, col[8*1]);
345
MAC16(b0, + W3, col[8*3]);
346
MAC16(b1, - W7, col[8*3]);
347
MAC16(b2, - W1, col[8*3]);
348
MAC16(b3, - W5, col[8*3]);
358
MAC16(b0, + W5, col[8*5]);
359
MAC16(b1, - W1, col[8*5]);
360
MAC16(b2, + W7, col[8*5]);
361
MAC16(b3, + W3, col[8*5]);
372
MAC16(b0, + W7, col[8*7]);
373
MAC16(b1, - W5, col[8*7]);
374
MAC16(b2, + W3, col[8*7]);
375
MAC16(b3, - W1, col[8*7]);
378
/* Write the eight results back into the column: sums go to the first four
   positions, differences to the last four in mirrored order. */
col[0 ] = ((a0 + b0) >> COL_SHIFT);
379
col[8 ] = ((a1 + b1) >> COL_SHIFT);
380
col[16] = ((a2 + b2) >> COL_SHIFT);
381
col[24] = ((a3 + b3) >> COL_SHIFT);
382
col[32] = ((a3 - b3) >> COL_SHIFT);
383
col[40] = ((a2 - b2) >> COL_SHIFT);
384
col[48] = ((a1 - b1) >> COL_SHIFT);
385
col[56] = ((a0 - b0) >> COL_SHIFT);
388
/*
 * Inverse DCT of a coefficient block, storing the result into dest.
 *
 * dest:      output bytes
 * line_size: passed through unchanged to idctSparseColPut
 * block:     coefficients; modified in place by the row pass
 *
 * NOTE(review): the declaration of i and the loops around the two calls are
 * not visible in this excerpt.
 */
void simple_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
392
/* Row pass: row i starts at block + i*8. */
idctRowCondDC(block + i*8);
395
/* Column pass: column i starts at block + i and is written to dest + i. */
idctSparseColPut(dest + i, line_size, block + i);
398
/*
 * Inverse DCT of a coefficient block, adding the result to what dest already
 * holds.
 *
 * dest:      bytes to update
 * line_size: passed through unchanged to idctSparseColAdd
 * block:     coefficients; modified in place by the row pass
 *
 * NOTE(review): the declaration of i and the loops around the two calls are
 * not visible in this excerpt.
 */
void simple_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
402
/* Row pass: row i starts at block + i*8. */
idctRowCondDC(block + i*8);
405
/* Column pass: column i starts at block + i and updates dest + i. */
idctSparseColAdd(dest + i, line_size, block + i);
408
/*
 * In-place inverse DCT of a coefficient block: a row pass followed by a
 * column pass, both writing back into block.
 *
 * NOTE(review): the declaration of i and the loops around the two calls are
 * not visible in this excerpt.
 */
void simple_idct(DCTELEM *block)
412
/* Row pass: row i starts at block + i*8. */
idctRowCondDC(block + i*8);
415
/* Column pass: column i starts at block + i. */
idctSparseCol(block + i);
421
/* C_FIX(x) converts a real constant to fixed point with CN_SHIFT fractional
   bits, rounding by adding 0.5 before the truncation to int.
   NOTE(review): the definition of CN_SHIFT is not visible in this excerpt. */
#define C_FIX(x) ((int)((x) * (1 << CN_SHIFT) + 0.5))
422
/* Fixed-point multipliers used for the a1/a3 terms of the 4-point column
   transform. */
#define C1 C_FIX(0.6532814824)
423
#define C2 C_FIX(0.2705980501)
425
/* the row idct output is multiplied by 16 * sqrt(2.0), the col idct4 is
426
normalized, and the butterfly must be multiplied by 0.5 * sqrt(2.0) */
427
/* Final right shift of the 4-point column transform (4 + 1 + 12 = 17). */
#define C_SHIFT (4+1+12)
429
/*
 * 4-point column transform; the four results are looked up in the cm table
 * and written to dest.
 *
 * dest:      output bytes
 * line_size: NOTE(review): not used in the visible statements - presumably
 *            the distance between successive output lines; confirm.
 * col:       input column (read-only)
 *
 * NOTE(review): the statements that load a0..a3 from col and the braces are
 * not visible in this excerpt (the embedded numbering skips).
 */
static inline void idct4col(uint8_t *dest, int line_size, const DCTELEM *col)
431
int c0, c1, c2, c3, a0, a1, a2, a3;
432
/* cm is offset by MAX_NEG_CROP into cropTbl - presumably so that negative
   indices remain inside the table; confirm against cropTbl's definition. */
const uint8_t *cm = cropTbl + MAX_NEG_CROP;
438
/* c0/c2: sum and difference of a0 and a2, scaled by a shift, plus the
   rounding constant for the final >> C_SHIFT. */
c0 = ((a0 + a2) << (CN_SHIFT - 1)) + (1 << (C_SHIFT - 1));
439
c2 = ((a0 - a2) << (CN_SHIFT - 1)) + (1 << (C_SHIFT - 1));
440
/* c1/c3: combination of a1 and a3 weighted by C1 and C2. */
c1 = a1 * C1 + a3 * C2;
441
c3 = a1 * C2 - a3 * C1;
442
/* NOTE(review): every store targets dest[0] and the numbering skips one line
   between stores, so a pointer advance is presumably missing from this view. */
dest[0] = cm[(c0 + c1) >> C_SHIFT];
444
dest[0] = cm[(c2 + c3) >> C_SHIFT];
446
dest[0] = cm[(c2 - c3) >> C_SHIFT];
448
dest[0] = cm[(c0 - c1) >> C_SHIFT];
457
ptr[8 + k] = a0 - a1;\
460
/* only used by DV codec. The input must be interlaced. 128 is added
461
to the pixels before clamping to avoid systematic error
462
(1024*sqrt(2)) offset would be needed otherwise. */
463
/* XXX: I think a 1.0/sqrt(2) normalization should be needed to
464
compensate for the extra butterfly stage - I don't have the full DV specification. */
466
// Inverse transform storing into dest: an 8-point row pass on each line of
// block, followed by two 4-point column passes per column index i.
//
// dest:      output bytes
// line_size: distance between two successive output lines, as implied by the
//            dest + line_size offset of the second column pass
// block:     coefficients; modified in place by the row pass
//
// NOTE(review): the declaration of i, the loops and the statements between
// the signature and the row pass are not visible in this excerpt.
void simple_idct248_put(uint8_t *dest, int line_size, DCTELEM *block)
485
/* IDCT8 on each line */
487
idctRowCondDC(block + i*8);
490
/* IDCT4 and store */
492
// First pass starts at dest + i, the second one line lower (dest + line_size
// + i) with input offset by 8 elements; both use a stride of 2 * line_size.
idct4col(dest + i, 2 * line_size, block + i);
493
idct4col(dest + line_size + i, 2 * line_size, block + 8 + i);
497
/* 8x4 & 4x8 WMV2 IDCT */
504
/* Fixed-point constants for the 4-point column transform that adds to the
   destination. This C_FIX folds an extra factor of 1.414213562 (sqrt(2)) into
   every constant and rounds by adding 0.5 before the truncation to int.
   NOTE(review): C_FIX, C1, C2 and C_SHIFT are also defined earlier in this
   file; the directives that remove the earlier definitions are not visible in
   this excerpt, nor is the definition of CN_SHIFT. */
#define C_FIX(x) ((int)((x) * 1.414213562 * (1 << CN_SHIFT) + 0.5))
505
#define C1 C_FIX(0.6532814824)
506
#define C2 C_FIX(0.2705980501)
507
/* C3 multiplies the (a0 + a2) and (a0 - a2) terms. */
#define C3 C_FIX(0.5)
508
/* Final right shift of the column results (4 + 1 + 12 = 17). */
#define C_SHIFT (4+1+12)
509
/*
 * 4-point column transform; each result is added to the byte already in dest
 * and the sum is looked up in the cm table before being written back.
 *
 * dest:      bytes to update
 * line_size: NOTE(review): not used in the visible statements - presumably
 *            the distance between successive output lines; confirm.
 * col:       input column (read-only)
 *
 * NOTE(review): the statements that load a0..a3 from col and the braces are
 * not visible in this excerpt (the embedded numbering skips).
 */
static inline void idct4col_add(uint8_t *dest, int line_size, const DCTELEM *col)
511
int c0, c1, c2, c3, a0, a1, a2, a3;
512
/* cm is offset by MAX_NEG_CROP into cropTbl - presumably so that negative
   indices remain inside the table; confirm against cropTbl's definition. */
const uint8_t *cm = cropTbl + MAX_NEG_CROP;
518
/* c0/c2: sum and difference of a0 and a2 scaled by C3, plus the rounding
   constant for the final >> C_SHIFT. */
c0 = (a0 + a2)*C3 + (1 << (C_SHIFT - 1));
519
c2 = (a0 - a2)*C3 + (1 << (C_SHIFT - 1));
520
/* c1/c3: combination of a1 and a3 weighted by C1 and C2. */
c1 = a1 * C1 + a3 * C2;
521
c3 = a1 * C2 - a3 * C1;
522
/* NOTE(review): every update targets dest[0] and the numbering skips one line
   between them, so a pointer advance is presumably missing from this view. */
dest[0] = cm[dest[0] + ((c0 + c1) >> C_SHIFT)];
524
dest[0] = cm[dest[0] + ((c2 + c3) >> C_SHIFT)];
526
dest[0] = cm[dest[0] + ((c2 - c3) >> C_SHIFT)];
528
dest[0] = cm[dest[0] + ((c0 - c1) >> C_SHIFT)];
532
/* Fixed-point constants for the 4-point row transform. R_FIX scales by
   1.414213562 (sqrt(2)) and by 2^RN_SHIFT, rounding by adding 0.5 before the
   truncation to int.
   NOTE(review): the definitions of RN_SHIFT and R_SHIFT are not visible in
   this excerpt. */
#define R_FIX(x) ((int)((x) * 1.414213562 * (1 << RN_SHIFT) + 0.5))
533
#define R1 R_FIX(0.6532814824)
534
#define R2 R_FIX(0.2705980501)
535
/* R3 multiplies the (a0 + a2) and (a0 - a2) terms. */
#define R3 R_FIX(0.5)
537
/*
 * In-place 4-point row transform: writes its four results to row[0]..row[3].
 *
 * NOTE(review): the statements that load a0..a3 from row and the braces are
 * not visible in this excerpt (the embedded numbering skips).
 */
static inline void idct4row(DCTELEM *row)
539
int c0, c1, c2, c3, a0, a1, a2, a3;
540
//const uint8_t *cm = cropTbl + MAX_NEG_CROP;
546
/* c0/c2: sum and difference of a0 and a2 scaled by R3, plus the rounding
   constant for the final >> R_SHIFT. */
c0 = (a0 + a2)*R3 + (1 << (R_SHIFT - 1));
547
c2 = (a0 - a2)*R3 + (1 << (R_SHIFT - 1));
548
/* c1/c3: combination of a1 and a3 weighted by R1 and R2. */
c1 = a1 * R1 + a3 * R2;
549
c3 = a1 * R2 - a3 * R1;
550
/* Results are stored without any table lookup. */
row[0]= (c0 + c1) >> R_SHIFT;
551
row[1]= (c2 + c3) >> R_SHIFT;
552
row[2]= (c2 - c3) >> R_SHIFT;
553
row[3]= (c0 - c1) >> R_SHIFT;
556
/*
 * Inverse transform that adds its result to dest: an 8-point row pass on each
 * line of block, then a 4-point column pass.
 *
 * dest:      bytes to update
 * line_size: passed through unchanged to idct4col_add
 * block:     coefficients; modified in place by the row pass
 *
 * NOTE(review): the declaration of i and the loops around the two calls are
 * not visible in this excerpt.
 */
void simple_idct84_add(uint8_t *dest, int line_size, DCTELEM *block)
560
/* IDCT8 on each line */
562
idctRowCondDC(block + i*8);
565
/* IDCT4 and store */
567
idct4col_add(dest + i, line_size, block + i);
571
/*
 * Inverse transform that adds its result to dest: a 4-point row pass on each
 * line of block, then an 8-point column pass.
 *
 * dest:      bytes to update
 * line_size: passed through unchanged to idctSparseColAdd
 * block:     coefficients; modified in place by the row pass
 *
 * NOTE(review): the declaration of i and the loops around the two calls are
 * not visible in this excerpt.
 */
void simple_idct48_add(uint8_t *dest, int line_size, DCTELEM *block)
575
/* IDCT4 on each line */
577
/* Lines are 8 elements apart even though only 4 values per line are
   transformed. */
idct4row(block + i*8);
580
/* IDCT8 and store */
582
idctSparseColAdd(dest + i, line_size, block + i);