2
2
* Copyright (c) 2002 Dieter Shirley
4
* This library is free software; you can redistribute it and/or
4
* dct_unquantize_h263_altivec:
5
* Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
7
* This file is part of FFmpeg.
9
* FFmpeg is free software; you can redistribute it and/or
5
10
* modify it under the terms of the GNU Lesser General Public
6
11
* License as published by the Free Software Foundation; either
7
* version 2 of the License, or (at your option) any later version.
12
* version 2.1 of the License, or (at your option) any later version.
9
* This library is distributed in the hope that it will be useful,
14
* FFmpeg is distributed in the hope that it will be useful,
10
15
* but WITHOUT ANY WARRANTY; without even the implied warranty of
11
16
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12
17
* Lesser General Public License for more details.
14
19
* You should have received a copy of the GNU Lesser General Public
15
* License along with this library; if not, write to the Free Software
16
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20
* License along with FFmpeg; if not, write to the Free Software
21
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
24
#include <stdlib.h>
47
52
d = vec_mergel(_trans_acl, _trans_bdl); \
50
#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
52
__typeof__(a) _A1, _B1, _C1, _D1, _E1, _F1, _G1, _H1; \
53
__typeof__(a) _A2, _B2, _C2, _D2, _E2, _F2, _G2, _H2; \
55
_A1 = vec_mergeh (a, e); \
56
_B1 = vec_mergel (a, e); \
57
_C1 = vec_mergeh (b, f); \
58
_D1 = vec_mergel (b, f); \
59
_E1 = vec_mergeh (c, g); \
60
_F1 = vec_mergel (c, g); \
61
_G1 = vec_mergeh (d, h); \
62
_H1 = vec_mergel (d, h); \
64
_A2 = vec_mergeh (_A1, _E1); \
65
_B2 = vec_mergel (_A1, _E1); \
66
_C2 = vec_mergeh (_B1, _F1); \
67
_D2 = vec_mergel (_B1, _F1); \
68
_E2 = vec_mergeh (_C1, _G1); \
69
_F2 = vec_mergel (_C1, _G1); \
70
_G2 = vec_mergeh (_D1, _H1); \
71
_H2 = vec_mergel (_D1, _H1); \
73
a = vec_mergeh (_A2, _E2); \
74
b = vec_mergel (_A2, _E2); \
75
c = vec_mergeh (_B2, _F2); \
76
d = vec_mergel (_B2, _F2); \
77
e = vec_mergeh (_C2, _G2); \
78
f = vec_mergel (_C2, _G2); \
79
g = vec_mergeh (_D2, _H2); \
80
h = vec_mergel (_D2, _H2); \
84
56
// Loads a four-byte value (int or float) from the target address
85
57
// into every element in the target vector. Only works if the
100
72
// slower, for dumb non-apple GCC
101
73
#define FOUROF(a) {a,a,a,a}
103
int dct_quantize_altivec(MpegEncContext* s,
75
int dct_quantize_altivec(MpegEncContext* s,
104
76
DCTELEM* data, int n,
105
77
int qscale, int* overflow)
108
80
vector float row0, row1, row2, row3, row4, row5, row6, row7;
109
81
vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7;
110
const vector float zero = (const vector float)FOUROF(0.);
82
const_vector float zero = (const_vector float)FOUROF(0.);
83
// used after quantise step
112
86
// Load the data into the row/alt vectors
149
123
// The following block could exist as a separate altivec dct
150
// function. However, if we put it inline, the DCT data can remain
151
// in the vector local variables, as floats, which we'll use during the
124
// function. However, if we put it inline, the DCT data can remain
125
// in the vector local variables, as floats, which we'll use during the
154
128
const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f);
155
129
const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f);
201
175
z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero);
203
177
// dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
204
// CONST_BITS-PASS1_BITS);
178
// CONST_BITS-PASS1_BITS);
205
179
row2 = vec_madd(tmp13, vec_0_765366865, z1);
207
181
// dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
208
// CONST_BITS-PASS1_BITS);
182
// CONST_BITS-PASS1_BITS);
209
183
row6 = vec_madd(tmp12, vec_1_847759065, z1);
211
185
z1 = vec_add(tmp4, tmp7); // z1 = tmp4 + tmp7;
414
385
data7 = vec_max(vec_min(data7, max_q), min_q);
417
389
vector bool char zero_01, zero_23, zero_45, zero_67;
418
390
vector signed char scanIndices_01, scanIndices_23, scanIndices_45, scanIndices_67;
419
391
vector signed char negOne = vec_splat_s8(-1);
420
392
vector signed char* scanPtr =
421
393
(vector signed char*)(s->intra_scantable.inverse);
394
signed char lastNonZeroChar;
423
396
// Determine the largest non-zero index.
424
zero_01 = vec_pack(vec_cmpeq(data0, (vector short)zero),
425
vec_cmpeq(data1, (vector short)zero));
426
zero_23 = vec_pack(vec_cmpeq(data2, (vector short)zero),
427
vec_cmpeq(data3, (vector short)zero));
428
zero_45 = vec_pack(vec_cmpeq(data4, (vector short)zero),
429
vec_cmpeq(data5, (vector short)zero));
430
zero_67 = vec_pack(vec_cmpeq(data6, (vector short)zero),
431
vec_cmpeq(data7, (vector short)zero));
397
zero_01 = vec_pack(vec_cmpeq(data0, (vector signed short)zero),
398
vec_cmpeq(data1, (vector signed short)zero));
399
zero_23 = vec_pack(vec_cmpeq(data2, (vector signed short)zero),
400
vec_cmpeq(data3, (vector signed short)zero));
401
zero_45 = vec_pack(vec_cmpeq(data4, (vector signed short)zero),
402
vec_cmpeq(data5, (vector signed short)zero));
403
zero_67 = vec_pack(vec_cmpeq(data6, (vector signed short)zero),
404
vec_cmpeq(data7, (vector signed short)zero));
433
406
// 64 biggest values
434
407
scanIndices_01 = vec_sel(scanPtr[0], negOne, zero_01);
462
435
scanIndices_01 = vec_splat(scanIndices_01, 0);
464
signed char lastNonZeroChar;
466
438
vec_ste(scanIndices_01, 0, &lastNonZeroChar);
468
440
lastNonZero = lastNonZeroChar;
470
442
// While the data is still in vectors we check for the transpose IDCT permute
471
443
// and handle it using the vector unit if we can. This is the permute used
472
444
// by the altivec idct, so it is common when using the altivec dct.
519
492
AltiVec version of dct_unquantize_h263
520
493
this code assumes `block' is 16-byte aligned
522
void dct_unquantize_h263_altivec(MpegEncContext *s,
495
void dct_unquantize_h263_altivec(MpegEncContext *s,
523
496
DCTELEM *block, int n, int qscale)
525
498
POWERPC_PERF_DECLARE(altivec_dct_unquantize_h263_num, 1);
526
499
int i, level, qmul, qadd;
529
502
assert(s->block_last_index[n]>=0);
531
504
POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1);
533
506
qadd = (qscale - 1) | 1;
534
507
qmul = qscale << 1;
536
509
if (s->mb_intra) {
537
510
if (!s->h263_aic) {
539
512
block[0] = block[0] * s->y_dc_scale;
541
514
block[0] = block[0] * s->c_dc_scale;
545
nCoeffs= 63; //does not always use zigzag table
518
nCoeffs= 63; //does not always use zigzag table
548
521
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
551
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
552
for(;i<=nCoeffs;i++) {
556
level = level * qmul - qadd;
558
level = level * qmul + qadd;
563
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
565
register const vector short vczero = (const vector short)vec_splat_s16(0);
525
register const_vector signed short vczero = (const_vector signed short)vec_splat_s16(0);
566
526
short __attribute__ ((aligned(16))) qmul8[] =
568
528
qmul, qmul, qmul, qmul,
578
538
-qadd, -qadd, -qadd, -qadd,
579
539
-qadd, -qadd, -qadd, -qadd
581
register vector short blockv, qmulv, qaddv, nqaddv, temp1;
541
register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
582
542
register vector bool short blockv_null, blockv_neg;
583
543
register short backup_0 = block[0];
584
544
register int j = 0;
586
546
qmulv = vec_ld(0, qmul8);
587
547
qaddv = vec_ld(0, qadd8);
588
548
nqaddv = vec_ld(0, nqadd8);