2
marc.hoffman@analog.com March 8, 2004
4
Altivec Acceleration for Color Space Conversion revision 0.2
6
convert I420 YV12 to RGB in various formats,
7
it rejects images that are not in 420 formats
8
it rejects images that don't have widths of multiples of 16
9
it rejects images that don't have heights of multiples of 2
10
reject defers to C simulation codes.
12
lots of optimizations to be done here
14
1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15
so we currently use max min to clip
17
2. the inefficient use of chroma loading needs a bit of brushing up
19
3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
22
MODIFIED to calculate coeffs from currently selected color space.
23
MODIFIED core to be a macro which you spec the output format.
24
ADDED UYVY conversion which is never called due to something in SWSCALE.
25
CORRECTED algorithm selection to be strict on input formats.
26
ADDED runtime detection of altivec.
28
ADDED altivec_yuv2packedX vertical scl + RGB converter
33
The C version uses 25% of the processor or ~250Mips for D1 video rawvideo used as test
34
The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
38
so we have roughly 10 clocks per pixel; this is too high, something has to be wrong.
40
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
42
OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43
guaranteed to have the input video frame it was just decompressed so
44
it probably resides in L1 caches. However we are creating the
45
output video stream this needs to use the DSTST instruction to
46
optimize for the cache. We couple this with the fact that we are
47
not going to be visiting the input buffer again so we mark it Least
48
Recently Used. This shaves 25% of the processor cycles off.
50
Now MEMCPY is the largest mips consumer in the system, probably due
51
to the inefficient X11 stuff.
53
GL libraries seem to be very slow on this machine 1.33Ghz PB running
54
Jaguar, this is not the case for my 1Ghz PB. I thought it might be
55
a versioning issue; however, I have libGL.1.2.dylib for both
56
machines. ((We need to figure this out now))
58
GL2 libraries work now with patch for RGB32
60
NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
62
Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
73
#include "swscale_internal.h"
75
#include "img_format.h" //FIXME try to reduce dependency of such stuff
77
#undef PROFILE_THE_BEAST
80
typedef unsigned char ubyte;
81
typedef signed char sbyte;
84
/* RGB interleaver, 16 planar pels 8-bit samples per channel in
85
homogeneous vector registers x0,x1,x2 are interleaved with the
88
o0 = vec_mergeh (x0,x1);
89
o1 = vec_perm (o0, x2, perm_rgb_0);
90
o2 = vec_perm (o0, x2, perm_rgb_1);
91
o3 = vec_mergel (x0,x1);
92
o4 = vec_perm (o3,o2,perm_rgb_2);
93
o5 = vec_perm (o3,o2,perm_rgb_3);
95
perm_rgb_0: o0(RG).h v1(B) --> o1*
101
perm_rgb_1: o0(RG).h v1(B) --> o2
107
perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
113
perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
121
const vector unsigned char
122
perm_rgb_0 = (vector unsigned char)(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
123
0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
124
perm_rgb_1 = (vector unsigned char)(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
125
0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
126
perm_rgb_2 = (vector unsigned char)(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
127
0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
128
perm_rgb_3 = (vector unsigned char)(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
129
0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
131
#define vec_merge3(x2,x1,x0,y0,y1,y2) \
133
typeof(x0) o0,o2,o3; \
134
o0 = vec_mergeh (x0,x1); \
135
y0 = vec_perm (o0, x2, perm_rgb_0);\
136
o2 = vec_perm (o0, x2, perm_rgb_1);\
137
o3 = vec_mergel (x0,x1); \
138
y1 = vec_perm (o3,o2,perm_rgb_2); \
139
y2 = vec_perm (o3,o2,perm_rgb_3); \
142
#define vec_mstrgb24(x0,x1,x2,ptr) \
144
typeof(x0) _0,_1,_2; \
145
vec_merge3 (x0,x1,x2,_0,_1,_2); \
146
vec_st (_0, 0, ptr++); \
147
vec_st (_1, 0, ptr++); \
148
vec_st (_2, 0, ptr++); \
151
#define vec_mstbgr24(x0,x1,x2,ptr) \
153
typeof(x0) _0,_1,_2; \
154
vec_merge3 (x2,x1,x0,_0,_1,_2); \
155
vec_st (_0, 0, ptr++); \
156
vec_st (_1, 0, ptr++); \
157
vec_st (_2, 0, ptr++); \
160
/* pack the pixels in rgb0 format
164
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
167
_0 = vec_mergeh (x0,x1); \
168
_1 = vec_mergeh (x2,x3); \
169
_2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
170
_3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
171
vec_st (_2, 0*16, (T *)ptr); \
172
vec_st (_3, 1*16, (T *)ptr); \
173
_0 = vec_mergel (x0,x1); \
174
_1 = vec_mergel (x2,x3); \
175
_2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
176
_3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
177
vec_st (_2, 2*16, (T *)ptr); \
178
vec_st (_3, 3*16, (T *)ptr); \
185
| 1 -0.3441 -0.7142 |x| Cb|
192
typical yuv conversion works on Y: 0-255; this version has been optimized for jpeg decode.
200
(vector signed short) \
201
vec_perm(x,(typeof(x))(0),\
202
(vector unsigned char)(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
203
0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
205
(vector signed short) \
206
vec_perm(x,(typeof(x))(0),\
207
(vector unsigned char)(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
208
0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
210
/* Clamp every element of x to the range [0, 255] with an explicit
   vec_min / vec_max pair.  The result has the same vector type as x. */
#define vec_clip(x) \
  (vec_max (vec_min ((x), (typeof(x))(255)), (typeof(x))(0)))

/* Reference variant: clamp both inputs with vec_clip, then narrow them
   into one byte vector with the non-saturating vec_pack. */
#define vec_packclp_a(x,y) \
  ((vector unsigned char)vec_pack (vec_clip (x), vec_clip (y)))

/* Fast variant: only the lower bound is clamped with vec_max; the upper
   bound is left to the saturating vec_packs on unsigned shorts, so no
   vec_min is needed.  x and y are vectors of signed shorts.
   Arguments and the whole expansion are parenthesized so the macro can be
   used with arbitrary expressions and inside larger expressions. */
#define vec_packclp(x,y) \
  ((vector unsigned char)vec_packs \
     ((vector unsigned short)vec_max ((x),(vector signed short) (0)), \
      (vector unsigned short)vec_max ((y),(vector signed short) (0))))
221
//#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),a,a,a,ptr)
224
/*
 * Convert one vector of 16-bit Y/U/V samples into R, G and B.
 *
 * c        supplies the coefficient vectors CY, OY, CBU, CRV, CGU, CGV
 *          and the chroma pre-shift CSHIFT.
 * Y, U, V  input samples; 128 is subtracted from U and V here.
 * R, G, B  receive the 16-bit results.  They are not reduced to 8 bits
 *          here; the callers in this file follow up with vec_packclp.
 */
static inline void cvtyuvtoRGB (SwsContext *c,
                                vector signed short Y, vector signed short U, vector signed short V,
                                vector signed short *R, vector signed short *G, vector signed short *B)
{
  /* luma = (CY*Y + 0x4000)>>15 + OY */
  const vector signed short luma = vec_mradds (Y, c->CY, c->OY);

  /* re-centre chroma around zero */
  const vector signed short cb = vec_sub (U, (vector signed short)(128));
  const vector signed short cr = vec_sub (V, (vector signed short)(128));

  vector signed short green_acc;

  /* B = (CBU*(cb<<CSHIFT) + 0x4000)>>15 + luma */
  *B = vec_mradds (vec_sl (cb, c->CSHIFT), c->CBU, luma);

  /* R = (CRV*(cr<<CSHIFT) + 0x4000)>>15 + luma */
  *R = vec_mradds (vec_sl (cr, c->CSHIFT), c->CRV, luma);

  /* G = luma + the CGU*cb term + the CGV*cr term, accumulated in two steps */
  green_acc = vec_mradds (cb, c->CGU, luma);
  *G = vec_mradds (cr, c->CGV, green_acc);
}
250
------------------------------------------------------------------------------
252
------------------------------------------------------------------------------
256
#define DEFCSP420_CVT(name,out_pixels) \
257
static int altivec_##name (SwsContext *c, \
258
unsigned char **in, int *instrides, \
259
int srcSliceY, int srcSliceH, \
260
unsigned char **oplanes, int *outstrides) \
265
int instrides_scl[3]; \
266
vector unsigned char y0,y1; \
268
vector signed char u,v; \
270
vector signed short Y0,Y1,Y2,Y3; \
271
vector signed short U,V; \
272
vector signed short vx,ux,uvx; \
273
vector signed short vx0,ux0,uvx0; \
274
vector signed short vx1,ux1,uvx1; \
275
vector signed short R0,G0,B0; \
276
vector signed short R1,G1,B1; \
277
vector unsigned char R,G,B; \
279
vector unsigned char *uivP, *vivP; \
280
vector unsigned char align_perm; \
282
vector signed short \
290
vector unsigned short lCSHIFT = c->CSHIFT; \
292
ubyte *y1i = in[0]; \
293
ubyte *y2i = in[0]+w; \
297
vector unsigned char *oute \
298
= (vector unsigned char *) \
299
(oplanes[0]+srcSliceY*outstrides[0]); \
300
vector unsigned char *outo \
301
= (vector unsigned char *) \
302
(oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \
305
instrides_scl[0] = instrides[0]; \
306
instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
307
instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
310
for (i=0;i<h/2;i++) { \
311
vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \
312
vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \
314
for (j=0;j<w/16;j++) { \
316
y0 = vec_ldl (0,y1i); \
317
y1 = vec_ldl (0,y2i); \
318
uivP = (vector unsigned char *)ui; \
319
vivP = (vector unsigned char *)vi; \
321
align_perm = vec_lvsl (0, ui); \
322
u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); \
324
align_perm = vec_lvsl (0, vi); \
325
v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); \
327
u = (vector signed char)vec_sub (u, (vector signed char)(128)); \
328
v = (vector signed char)vec_sub (v, (vector signed char)(128)); \
329
U = vec_unpackh (u); \
330
V = vec_unpackh (v); \
338
Y0 = vec_mradds (Y0, lCY, lOY); \
339
Y1 = vec_mradds (Y1, lCY, lOY); \
340
Y2 = vec_mradds (Y2, lCY, lOY); \
341
Y3 = vec_mradds (Y3, lCY, lOY); \
343
/* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
344
ux = vec_sl (U, lCSHIFT); \
345
ux = vec_mradds (ux, lCBU, (vector signed short)(0)); \
346
ux0 = vec_mergeh (ux,ux); \
347
ux1 = vec_mergel (ux,ux); \
349
/* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
350
vx = vec_sl (V, lCSHIFT); \
351
vx = vec_mradds (vx, lCRV, (vector signed short)(0)); \
352
vx0 = vec_mergeh (vx,vx); \
353
vx1 = vec_mergel (vx,vx); \
355
/* uvx = ((CGU*u) + (CGV*v))>>15 */ \
356
uvx = vec_mradds (U, lCGU, (vector signed short)(0)); \
357
uvx = vec_mradds (V, lCGV, uvx); \
358
uvx0 = vec_mergeh (uvx,uvx); \
359
uvx1 = vec_mergel (uvx,uvx); \
361
R0 = vec_add (Y0,vx0); \
362
G0 = vec_add (Y0,uvx0); \
363
B0 = vec_add (Y0,ux0); \
364
R1 = vec_add (Y1,vx1); \
365
G1 = vec_add (Y1,uvx1); \
366
B1 = vec_add (Y1,ux1); \
368
R = vec_packclp (R0,R1); \
369
G = vec_packclp (G0,G1); \
370
B = vec_packclp (B0,B1); \
372
out_pixels(R,G,B,oute); \
374
R0 = vec_add (Y2,vx0); \
375
G0 = vec_add (Y2,uvx0); \
376
B0 = vec_add (Y2,ux0); \
377
R1 = vec_add (Y3,vx1); \
378
G1 = vec_add (Y3,uvx1); \
379
B1 = vec_add (Y3,ux1); \
380
R = vec_packclp (R0,R1); \
381
G = vec_packclp (G0,G1); \
382
B = vec_packclp (B0,B1); \
385
out_pixels(R,G,B,outo); \
394
outo += (outstrides[0])>>4; \
395
oute += (outstrides[0])>>4; \
397
ui += instrides_scl[1]; \
398
vi += instrides_scl[2]; \
399
y1i += instrides_scl[0]; \
400
y2i += instrides_scl[0]; \
406
#define out_abgr(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),c,b,a,ptr)
407
#define out_bgra(a,b,c,ptr) vec_mstrgb32(typeof(a),c,b,a,((typeof (a))(0)),ptr)
408
#define out_rgba(a,b,c,ptr) vec_mstrgb32(typeof(a),a,b,c,((typeof (a))(0)),ptr)
409
#define out_argb(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),a,b,c,ptr)
410
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
411
#define out_bgr24(a,b,c,ptr) vec_mstrgb24(c,b,a,ptr)
413
DEFCSP420_CVT (yuv2_abgr32, out_abgr)
414
DEFCSP420_CVT (yuv2_bgra32, out_argb)
415
DEFCSP420_CVT (yuv2_rgba32, out_rgba)
416
DEFCSP420_CVT (yuv2_argb32, out_argb)
417
DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
418
DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
421
// uyvy|uyvy|uyvy|uyvy
422
// 0123 4567 89ab cdef
424
const vector unsigned char
425
demux_u = (vector unsigned char)(0x10,0x00,0x10,0x00,
428
0x10,0x0c,0x10,0x0c),
429
demux_v = (vector unsigned char)(0x10,0x02,0x10,0x02,
432
0x10,0x0E,0x10,0x0E),
433
demux_y = (vector unsigned char)(0x10,0x01,0x10,0x03,
436
0x10,0x0D,0x10,0x0F);
439
this is so I can play live CCIR raw video
441
static int altivec_uyvy_rgb32 (SwsContext *c,
442
unsigned char **in, int *instrides,
443
int srcSliceY, int srcSliceH,
444
unsigned char **oplanes, int *outstrides)
449
vector unsigned char uyvy;
450
vector signed short Y,U,V;
451
vector signed short vx,ux,uvx;
452
vector signed short R0,G0,B0,R1,G1,B1;
453
vector unsigned char R,G,B;
454
vector unsigned char *out;
458
out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
461
for (j=0;j<w/16;j++) {
462
uyvy = vec_ld (0, img);
463
U = (vector signed short)
464
vec_perm (uyvy, (vector unsigned char)(0), demux_u);
466
V = (vector signed short)
467
vec_perm (uyvy, (vector unsigned char)(0), demux_v);
469
Y = (vector signed short)
470
vec_perm (uyvy, (vector unsigned char)(0), demux_y);
472
cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
474
uyvy = vec_ld (16, img);
475
U = (vector signed short)
476
vec_perm (uyvy, (vector unsigned char)(0), demux_u);
478
V = (vector signed short)
479
vec_perm (uyvy, (vector unsigned char)(0), demux_v);
481
Y = (vector signed short)
482
vec_perm (uyvy, (vector unsigned char)(0), demux_y);
484
cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
486
R = vec_packclp (R0,R1);
487
G = vec_packclp (G0,G1);
488
B = vec_packclp (B0,B1);
490
// vec_mstbgr24 (R,G,B, out);
491
out_rgba (R,G,B,out);
501
/* Ok currently the acceleration routine only supports
502
inputs of widths a multiple of 16
503
and heights a multiple of 2
505
So we just fall back to the C codes for this.
507
SwsFunc yuv2rgb_init_altivec (SwsContext *c)
509
if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
513
and this seems not to matter too much I tried a bunch of
514
videos with abnormal widths and mplayer crashes elsewhere.
515
mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
516
boom with X11 bad match.
519
if ((c->srcW & 0xf) != 0) return NULL;
521
switch (c->srcFormat) {
532
if ((c->srcH & 0x1) != 0)
535
switch(c->dstFormat){
537
MSG_WARN("ALTIVEC: Color Space RGB24\n");
538
return altivec_yuv2_rgb24;
540
MSG_WARN("ALTIVEC: Color Space BGR24\n");
541
return altivec_yuv2_bgr24;
543
MSG_WARN("ALTIVEC: Color Space ARGB32\n");
544
return altivec_yuv2_argb32;
546
MSG_WARN("ALTIVEC: Color Space BGRA32\n");
547
// return profile_altivec_bgra32;
549
return altivec_yuv2_bgra32;
550
default: return NULL;
555
switch(c->dstFormat){
557
MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
558
return altivec_uyvy_rgb32;
559
default: return NULL;
567
void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4])
569
vector signed short CY, CRV, CBU, CGU, CGV, OY, Y0;
570
int64_t crv __attribute__ ((aligned(16))) = inv_table[0];
571
int64_t cbu __attribute__ ((aligned(16))) = inv_table[1];
572
int64_t cgu __attribute__ ((aligned(16))) = inv_table[2];
573
int64_t cgv __attribute__ ((aligned(16))) = inv_table[3];
574
int64_t cy = (1<<16)-1;
576
short tmp __attribute__ ((aligned(16)));
578
if ((c->flags & SWS_CPU_CAPS_ALTIVEC) == 0)
581
cy = (cy *c->contrast )>>17;
582
crv= (crv*c->contrast * c->saturation)>>32;
583
cbu= (cbu*c->contrast * c->saturation)>>32;
584
cgu= (cgu*c->contrast * c->saturation)>>32;
585
cgv= (cgv*c->contrast * c->saturation)>>32;
587
oy -= 256*c->brightness;
590
CY = vec_lde (0, &tmp);
591
CY = vec_splat (CY, 0);
594
OY = vec_lde (0, &tmp);
595
OY = vec_splat (OY, 0);
598
CRV = vec_lde (0, &tmp);
599
CRV = vec_splat (CRV, 0);
601
CBU = vec_lde (0, &tmp);
602
CBU = vec_splat (CBU, 0);
605
CGU = vec_lde (0, &tmp);
606
CGU = vec_splat (CGU, 0);
608
CGV = vec_lde (0, &tmp);
609
CGV = vec_splat (CGV, 0);
611
c->CSHIFT = (vector unsigned short)(2);
620
printf ("cy: %hvx\n", CY);
621
printf ("oy: %hvx\n", OY);
622
printf ("crv: %hvx\n", CRV);
623
printf ("cbu: %hvx\n", CBU);
624
printf ("cgv: %hvx\n", CGV);
625
printf ("cgu: %hvx\n", CGU);
633
altivec_yuv2packedX (SwsContext *c,
634
int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
635
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
636
uint8_t *dest, int dstW, int dstY)
639
short tmp __attribute__((aligned (16)));
642
vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
643
vector signed short R0,G0,B0,R1,G1,B1;
645
vector unsigned char R,G,B,pels[3];
646
vector unsigned char *out,*nout;
647
vector signed short RND = (vector signed short)(1<<3);
648
vector unsigned short SCL = (vector unsigned short)(4);
649
unsigned long scratch[16] __attribute__ ((aligned (16)));
651
vector signed short *vYCoeffsBank, *vCCoeffsBank;
653
vector signed short *YCoeffs, *CCoeffs;
655
vYCoeffsBank = malloc (sizeof (vector signed short)*lumFilterSize*dstW);
656
vCCoeffsBank = malloc (sizeof (vector signed short)*chrFilterSize*dstW);
658
for (i=0;i<lumFilterSize*dstW;i++) {
659
tmp = c->vLumFilter[i];
660
p = &vYCoeffsBank[i];
665
for (i=0;i<chrFilterSize*dstW;i++) {
666
tmp = c->vChrFilter[i];
667
p = &vCCoeffsBank[i];
672
YCoeffs = vYCoeffsBank+dstY*lumFilterSize;
673
CCoeffs = vCCoeffsBank+dstY*chrFilterSize;
675
out = (vector unsigned char *)dest;
677
for(i=0; i<dstW; i+=16){
680
/* extract 16 coeffs from lumSrc */
681
for(j=0; j<lumFilterSize; j++) {
682
X0 = vec_ld (0, &lumSrc[j][i]);
683
X1 = vec_ld (16, &lumSrc[j][i]);
684
Y0 = vec_mradds (X0, YCoeffs[j], Y0);
685
Y1 = vec_mradds (X1, YCoeffs[j], Y1);
690
/* extract 8 coeffs from U,V */
691
for(j=0; j<chrFilterSize; j++) {
692
X = vec_ld (0, &chrSrc[j][i/2]);
693
U = vec_mradds (X, CCoeffs[j], U);
694
X = vec_ld (0, &chrSrc[j][i/2+2048]);
695
V = vec_mradds (X, CCoeffs[j], V);
698
/* scale and clip signals */
699
Y0 = vec_sra (Y0, SCL);
700
Y1 = vec_sra (Y1, SCL);
701
U = vec_sra (U, SCL);
702
V = vec_sra (V, SCL);
710
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
711
U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
713
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
714
U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
715
V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
718
U0 = vec_mergeh (U,U);
719
V0 = vec_mergeh (V,V);
721
U1 = vec_mergel (U,U);
722
V1 = vec_mergel (V,V);
724
cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
725
cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
727
R = vec_packclp (R0,R1);
728
G = vec_packclp (G0,G1);
729
B = vec_packclp (B0,B1);
731
out_rgba (R,G,B,out);
739
/* extract 16 coeffs from lumSrc */
740
for(j=0; j<lumFilterSize; j++) {
741
X0 = vec_ld (0, &lumSrc[j][i]);
742
X1 = vec_ld (16, &lumSrc[j][i]);
743
Y0 = vec_mradds (X0, YCoeffs[j], Y0);
744
Y1 = vec_mradds (X1, YCoeffs[j], Y1);
749
/* extract 8 coeffs from U,V */
750
for(j=0; j<chrFilterSize; j++) {
751
X = vec_ld (0, &chrSrc[j][i/2]);
752
U = vec_mradds (X, CCoeffs[j], U);
753
X = vec_ld (0, &chrSrc[j][i/2+2048]);
754
V = vec_mradds (X, CCoeffs[j], V);
757
/* scale and clip signals */
758
Y0 = vec_sra (Y0, SCL);
759
Y1 = vec_sra (Y1, SCL);
760
U = vec_sra (U, SCL);
761
V = vec_sra (V, SCL);
769
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
770
U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
772
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
773
U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
774
V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
777
U0 = vec_mergeh (U,U);
778
V0 = vec_mergeh (V,V);
780
U1 = vec_mergel (U,U);
781
V1 = vec_mergel (V,V);
783
cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
784
cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
786
R = vec_packclp (R0,R1);
787
G = vec_packclp (G0,G1);
788
B = vec_packclp (B0,B1);
790
nout = (vector unsigned char *)scratch;
791
out_rgba (R,G,B,nout);
793
memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
796
if (vYCoeffsBank) free (vYCoeffsBank);
797
if (vCCoeffsBank) free (vCCoeffsBank);