2
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
5
* Blackfin video color space converter operations
6
* convert I420 YV12 to RGB in various formats
8
* This file is part of FFmpeg.
10
* FFmpeg is free software; you can redistribute it and/or
11
* modify it under the terms of the GNU Lesser General Public
12
* License as published by the Free Software Foundation; either
13
* version 2.1 of the License, or (at your option) any later version.
15
* FFmpeg is distributed in the hope that it will be useful,
16
* but WITHOUT ANY WARRANTY; without even the implied warranty of
17
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18
* Lesser General Public License for more details.
20
* You should have received a copy of the GNU Lesser General Public
21
* License along with FFmpeg; if not, write to the Free Software
22
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
28
and converts it to RGB565 (R: 5 bits, G: 6 bits, B: 5 bits), packed into shorts.
31
The following calculation is used for the conversion:
33
r = clipz((y-oy)*cy + crv*(v-128))
34
g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
35
b = clipz((y-oy)*cy + cbu*(u-128))
37
y, u and v are prescaled by a factor of 4, i.e. left-shifted by 2, to gain precision.
40
New factorization to eliminate the truncation error which was
41
occurring due to the byteop3p.
44
1) Use byteop16m to subtract quad bytes. It operates on U8 values,
45
so the offsets need to be renormalized to 8 bits.
47
2) Scale operands up by a factor of 4 not 8 because Blackfin
48
multiplies include a shift.
50
3) Compute into the accumulators cy*yx0, cy*yx1.
52
4) Compute each of the linear equations:
53
r = clipz((y - oy) * cy + crv * (v - 128))
55
g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
57
b = clipz((y - oy) * cy + cbu * (u - 128))
59
Reuse of the accumulators requires that we actually multiply
60
twice: once with addition and a second time with subtraction.
62
Because of this we need to compute the equations in the order R, B,
63
then G, saving the writes for B in the case of 24/32-bit color formats.
66
API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67
int dW, uint32_t *coeffs);
74
Where coeffs have the following layout in memory.
76
uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
78
coeffs is a pointer to oy.
80
The {rgb} masks are only utilized by the 565 packing algorithm. Note that data
81
replication is used to simplify the internal algorithms for the dual-MAC
82
architecture of Blackfin.
84
All routines are exported with _ff_bfin_ as a symbol prefix.
86
Rough performance gain compared against -O3:
88
2779809/1484290 187.28%
90
which translates to ~33 c/pel to ~57 c/pel for the reference vs. 17.5
91
c/pel for the optimized implementations. It is not clear why there is such a
92
huge variation in the reference code on Blackfin; presumably it has
93
to do with the memory system.
97
// Build-configuration switch: true only for FDPIC builds with CONFIG_SRAM set.
// NOTE(review): presumably selects the section that MEM expands to -- confirm.
#if defined(__FDPIC__) && CONFIG_SRAM
104
// DEFUN(fname, where, interface): symbol boilerplate for an exported routine.
//   fname     - routine name; the emitted symbol is _ff_bfin_<fname>, declared
//               global and typed as a function (STT_FUNC).
//   where     - NOTE(review): presumably the section to place the code in -- confirm.
//   interface - the C prototype of the routine.
//               NOTE(review): appears to be documentation only -- confirm.
#define DEFUN(fname,where,interface) \
106
.global _ff_bfin_ ## fname; \
107
.type _ff_bfin_ ## fname, STT_FUNC; \
111
// DEFUN_END(fname): record the size of _ff_bfin_<fname> as the distance from
// its label to the current location.
#define DEFUN_END(fname) \
112
.size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
117
// Size in bytes of the coefficient table: 11 words of 32 bits.
// NOTE(review): presumably the circular-buffer length for i1 -- confirm.
#define COEFF_LEN 11*4
118
// Four words, in bytes. Loaded into m0 and applied as the post-modify of the
// first gmask load so that the following coefficient load fetches cy again.
#define COEFF_REL_CY_OFF 4*4
124
// yuv2rgb565_line -- convert one line of planar YUV to packed 16-bit RGB
// (R:5 G:6 B:5). Exported as _ff_bfin_yuv2rgb565_line.
//
// Each loop pass consumes 4 luma bytes (32-bit load via i0), 2 U bytes and
// 2 V bytes (16-bit loads via i2 / i3) and stores two 32-bit words via p1,
// i.e. four 16-bit pixels. Coefficients are streamed through i1; every load
// is tagged with the table entry it fetches.
// NOTE(review): the setup of i0-i3, p0 and p1 from the arguments is assumed
// to happen before the loop -- confirm against the complete routine.
DEFUN(yuv2rgb565_line,MEM,
125
(uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
139
// m0 = 4 words: post-modify applied after the first gmask load so that the
// following coefficient load fetches cy again.
m0 = COEFF_REL_CY_OFF;
143
r1.l = w[i2++]; // 2u (preload for the first pass)
144
r1.h = w[i3++]; // 2v (preload for the first pass)
147
// Hardware loop from .L0565 to .L1565, executed p0 times.
lsetup (.L0565, .L1565) lc0 = p0;
150
uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
151
r0 -- used to load 4ys
152
r1 -- used to load 2us,2vs
161
rrrrrrrr gggggggg bbbbbbbb
168
// Subtract the offsets from the packed samples, widening bytes to 16 bits,
// then prescale every 16-bit lane by 4 (vector shift left by 2).
(r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
169
(r7,r6) = byteop16m (r1:0, r3:2) (r);
170
r5 = r5 << 2 (v); // y1,y0
171
r4 = r4 << 2 (v); // y3,y2
172
r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
173
r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
175
// ---- pixels 0 and 1: luma in r5, chroma in the low halves of r6/r7 ----
// Seed both accumulators with the luma term: a1 = cy*y1, a0 = cy*y0.
a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
177
/* R = Y+ crv*(Cr-128) */
178
// Add the chroma term and extract R for both pixels into r2 ...
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
179
// ... then subtract the same product so a1/a0 hold the luma term again.
a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
180
// byteop3p implements the clip of the equations.
// NOTE(review): presumably adds the zero word held in r0 -- confirm.
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
184
/* B = Y+ cbu*(Cb-128) */
185
// Same add / extract / subtract-back pattern as for R.
r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
186
a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
187
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
192
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
193
// G is the last component of this pixel pair, so both chroma terms are
// accumulated and the accumulators are not restored afterwards.
a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
194
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
195
// The gmask load post-modifies i1 by m0 instead of one word.
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask
199
// Store the pixel pair held in r3 and fetch cy for the second pair.
[p1++]=r3 || r1=[i1++]; // cy
203
// ---- pixels 2 and 3: luma in r4, chroma in the high halves of r6/r7 ----
a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
205
/* R = Y+ crv*(Cr-128) */
206
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
207
a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
208
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
212
/* B = Y+ cbu*(Cb-128) */
213
r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
214
a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
215
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
220
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
221
a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
222
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
223
// The tail of the pass also prefetches the samples for the next pass.
r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++]; // 4Y (32-bit load, next pass)
224
// Green is shifted left by 3 in each 16-bit lane for the 5:6:5 layout.
r2 = r2 << 3 (v) || r1.l = w[i2++]; // 2u
227
[p1++]=r3 || r1.h = w[i3++]; // 2v
228
// Loop end: reload oy for the next pass.
.L1565: r2=[i1++]; // oy
235
DEFUN_END(yuv2rgb565_line)
237
// yuv2rgb555_line -- 5:5:5 counterpart of the 565 line converter. Exported as
// _ff_bfin_yuv2rgb555_line.
//
// Each loop pass consumes 4 luma bytes (32-bit load via i0), 2 U bytes and
// 2 V bytes (16-bit loads via i2 / i3) and stores two 32-bit words via p1.
// Coefficients are streamed through i1; every load is tagged with the table
// entry it fetches.
// NOTE(review): the setup of i0-i3, p0 and p1 from the arguments is assumed
// to happen before the loop -- confirm against the complete routine.
DEFUN(yuv2rgb555_line,MEM,
238
(uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
252
// m0 = 4 words: post-modify applied after the first gmask load so that the
// following coefficient load fetches cy again.
m0 = COEFF_REL_CY_OFF;
256
r1.l = w[i2++]; // 2u (preload for the first pass)
257
r1.h = w[i3++]; // 2v (preload for the first pass)
260
// Hardware loop from .L0555 to .L1555, executed p0 times.
lsetup (.L0555, .L1555) lc0 = p0;
263
uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
264
r0 -- used to load 4ys
265
r1 -- used to load 2us,2vs
274
rrrrrrrr gggggggg bbbbbbbb
282
// Subtract the offsets from the packed samples, widening bytes to 16 bits,
// then prescale every 16-bit lane by 4 (vector shift left by 2).
(r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
283
(r7,r6) = byteop16m (r1:0, r3:2) (r);
284
r5 = r5 << 2 (v); // y1,y0
285
r4 = r4 << 2 (v); // y3,y2
286
r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
287
r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
289
// ---- pixels 0 and 1: luma in r5, chroma in the low halves of r6/r7 ----
// Seed both accumulators with the luma term: a1 = cy*y1, a0 = cy*y0.
a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
291
/* R = Y+ crv*(Cr-128) */
292
// Add the chroma term and extract R for both pixels into r2 ...
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
293
// ... then subtract the same product so a1/a0 hold the luma term again.
a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
294
// byteop3p implements the clip of the equations.
// NOTE(review): presumably adds the zero word held in r0 -- confirm.
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
298
/* B = Y+ cbu*(Cb-128) */
299
// Same add / extract / subtract-back pattern as for R.
r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
300
a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
301
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
306
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
307
// G is the last component of this pixel pair, so both chroma terms are
// accumulated and the accumulators are not restored afterwards.
a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
308
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
309
// The gmask load post-modifies i1 by m0 instead of one word.
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask
313
// Store the pixel pair held in r3 and fetch cy for the second pair.
[p1++]=r3 || r1=[i1++]; // cy
317
// ---- pixels 2 and 3: luma in r4, chroma in the high halves of r6/r7 ----
a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
319
/* R = Y+ crv*(Cr-128) */
320
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
321
a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
322
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
326
/* B = Y+ cbu*(Cb-128) */
327
r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
328
a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
329
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
334
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
335
a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
336
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
337
// The tail of the pass also prefetches the samples for the next pass.
r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // 4Y
338
// Green is shifted left by 2 in each 16-bit lane here (the 565 routine uses 3).
r2 = r2 << 2 (v) || r1.l=w[i2++]; // 2u
341
[p1++]=r3 || r1.h=w[i3++]; // 2v
343
// Loop end: reload oy for the next pass.
.L1555: r2=[i1++]; // oy
350
DEFUN_END(yuv2rgb555_line)
352
// yuv2rgb24_line -- convert one line of planar YUV to byte-per-component RGB.
// Exported as _ff_bfin_yuv2rgb24_line.
//
// The arithmetic is the same R, B, G sequence as in the 16-bit routines, but
// the clipped components are written with byte stores through p1 and p2
// instead of being merged into 16-bit pixels. B is computed into r3 and held
// until G has been stored, so the bytes leave in R, G, B order.
// NOTE(review): p1 and p2 presumably address alternating pixels of the output
// line; their setup is assumed to precede the loop -- confirm.
DEFUN(yuv2rgb24_line,MEM,
353
(uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
365
// The coefficient pointer is a stack-passed argument.
r0 = [fp+ARG_COEFF]; // coeff buffer
369
// m0 = 4 words: post-modify applied after the first gmask load so that the
// following coefficient load fetches cy again.
m0 = COEFF_REL_CY_OFF;
373
r1.l = w[i2++]; // 2u (preload for the first pass)
374
r1.h = w[i3++]; // 2v (preload for the first pass)
377
// Hardware loop from .L0888 to .L1888, executed p0 times.
lsetup (.L0888, .L1888) lc0 = p0;
380
uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
381
r0 -- used to load 4ys
382
r1 -- used to load 2us,2vs
390
// Subtract the offsets from the packed samples, widening bytes to 16 bits,
// then prescale every 16-bit lane by 4 (vector shift left by 2).
(r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
391
(r7,r6) = byteop16m (r1:0, r3:2) (r);
392
r5 = r5 << 2 (v); // y1,y0
393
r4 = r4 << 2 (v); // y3,y2
394
r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
395
r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
398
// ---- pixels 0 and 1: luma in r5, chroma in the low halves of r6/r7 ----
// Seed both accumulators with the luma term: a1 = cy*y1, a0 = cy*y0.
a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
400
/* R = Y+ crv*(Cr-128) */
401
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
402
// Subtract the product again so a1/a0 hold the luma term for B and G.
a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
403
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
404
// Byte-store R through p1 while shifting the other half of r2 down.
// NOTE(review): relies on the parallel store seeing r2 before the shift -- confirm.
r2=r2>>16 || B[p1++]=r2;
407
/* B = Y+ cbu*(Cb-128) */
408
r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
409
a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
410
// B is clipped into r3 and kept there until G has been written.
r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
412
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
413
a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
414
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
415
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask, oy,cy,zero
417
r2=r2>>16 || B[p1++]=r2; // G byte via p1
420
r3=r3>>16 || B[p1++]=r3; // B byte via p1
421
B[p2++]=r3 || r1=[i1++]; // cy; shifted-down B byte goes out via p2
426
// ---- pixels 2 and 3: luma in r4, chroma in the high halves of r6/r7 ----
a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
428
/* R = Y+ crv*(Cr-128) */
429
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
430
a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
431
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
432
r2=r2>>16 || B[p1++]=r2;
435
/* B = Y+ cbu*(Cb-128) */
436
r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
437
a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
438
r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
440
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
441
a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
442
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
443
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask
444
// The remaining stores are paired with the sample prefetch for the next pass.
r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // 4y
445
B[p2++]=r2 || r1.l = w[i2++]; // 2u
446
r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
447
B[p2++]=r3 || r2=[i1++]; // oy
457
DEFUN_END(yuv2rgb24_line)
463
// Frame-pointer-relative byte offsets of the stack-passed arguments of
// uyvytoyv12 / yuyvtoyv12, read as [fp + ARG_x]. The slots are 4 bytes apart
// and follow the prototype order height, lumStride, chromStride, srcStride.
#define ARG_height 28
464
#define ARG_lumStride 32
465
#define ARG_chromStride 36
466
#define ARG_srcStride 40
468
// uyvytoyv12 -- split packed UYVY into planar Y, U and V.
// Exported as _ff_bfin_uyvytoyv12.
//
// Two source lines (top via i0, bottom via i1) are handled per outer pass.
// Luma of both lines is written through p0 / p1. The chroma of the two lines
// is combined with byteop1p and written once, as U halfwords through i2 and
// V halfwords through i3.
// src, ydst and udst arrive in r0, r1 and r2; the remaining arguments are read
// from the stack frame through the ARG_* offsets.
DEFUN(uyvytoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
469
long width, long height,
470
long lumStride, long chromStride, long srcStride)):
472
// Save r4-r7 and p4-p5; they are restored from the stack at the end.
[--sp] = (r7:4,p5:4);
474
p0 = r1; // Y top even
477
r2 = [fp + ARG_vdst];
480
r1 = [fp + ARG_srcStride];
482
i0 = r0; // uyvy_T even
483
i1 = r2; // uyvy_B odd
485
p2 = [fp + ARG_lumStride];
486
p1 = p0 + p2; // Y bot odd
488
// Loop counts; the H/2 and W/4 notes on the lsetup lines give the units.
// NOTE(review): the scaling of p5 / p4 is assumed to happen before the
// loops -- confirm.
p5 = [fp + ARG_width];
489
p4 = [fp + ARG_height];
496
r1 = r1 - r2; // srcStride + (srcStride - 2*width)
497
r1 += -8; // i0,i1 is pre read need to correct
500
r2 = [fp + ARG_chromStride];
505
/* I0,I1 - src input line pointers
506
* p0,p1 - luma output line pointers
511
// Outer hardware loop (lc1): one pass per pair of source lines.
lsetup (0f, 1f) lc1 = p4; // H/2
512
// Prime the pipeline: two words from each line, chroma combined up front.
0: r0 = [i0++] || r2 = [i1++];
513
r1 = [i0++] || r3 = [i1++];
514
r4 = byteop1p(r1:0, r3:2);
515
r5 = byteop1p(r1:0, r3:2) (r);
516
// Inner hardware loop (lc0): one pass per 8 source bytes of each line.
lsetup (2f, 3f) lc0 = p5; // W/4
521
// Gather the four luma bytes of each line into one word and store them.
// NOTE(review): assumes the luma bytes were moved to the low byte of each
// halfword beforehand -- confirm.
r0 = bytepack(r0, r1);
522
r2 = bytepack(r2, r3) || [p0++] = r0; // yyyy
523
// Regroup the combined chroma so that r6.l ends up with two U bytes and
// r6.h with two V bytes, while the next source words are fetched.
r6 = pack(r5.l, r4.l) || [p1++] = r2; // yyyy
524
r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++];
525
r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++];
526
// Combine the chroma of the freshly loaded words for the next pass while
// this pass writes its U and V halfwords.
r4 = byteop1p(r1:0, r3:2) || w[i2++] = r6.l; // uu
527
3: r5 = byteop1p(r1:0, r3:2) (r) || w[i3++] = r6.h; // vv
536
// Restore the registers saved on entry.
(r7:4,p5:4) = [sp++];
539
DEFUN_END(uyvytoyv12)
541
// yuyvtoyv12 -- split packed YUYV into planar Y, U and V.
// Exported as _ff_bfin_yuyvtoyv12.
//
// Same structure as uyvytoyv12, but luma is the first byte of every source
// halfword: it is gathered with bytepack first, and the chroma bytes are then
// shifted down before the two lines are combined with byteop1p.
// src, ydst and udst arrive in r0, r1 and r2; the remaining arguments are read
// from the stack frame through the ARG_* offsets.
DEFUN(yuyvtoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
542
long width, long height,
543
long lumStride, long chromStride, long srcStride)):
545
// Save r4-r7 and p4-p5; they are restored from the stack at the end.
[--sp] = (r7:4,p5:4);
547
p0 = r1; // Y top even
550
r2 = [fp + ARG_vdst];
553
r1 = [fp + ARG_srcStride];
556
i0 = r0; // yuyv_T even
557
i1 = r2; // yuyv_B odd
559
p2 = [fp + ARG_lumStride];
560
p1 = p0 + p2; // Y bot odd
562
// Loop counts; the H/2 and W/4 notes on the lsetup lines give the units.
// NOTE(review): the scaling of p5 / p4 is assumed to happen before the
// loops -- confirm.
p5 = [fp + ARG_width];
563
p4 = [fp + ARG_height];
570
r1 = r1 - r2; // srcStride + (srcStride - 2*width)
571
r1 += -8; // i0,i1 is pre read need to correct
574
r2 = [fp + ARG_chromStride];
579
/* I0,I1 - src input line pointers
580
* p0,p1 - luma output line pointers
585
// Outer hardware loop (lc1): one pass per pair of source lines.
lsetup (0f, 1f) lc1 = p4; // H/2
586
// Prime the pipeline: two words from each line, luma gathered up front.
0: r0 = [i0++] || r2 = [i1++];
587
r1 = [i0++] || r3 = [i1++];
588
r4 = bytepack(r0, r1);
589
r5 = bytepack(r2, r3);
590
// Inner hardware loop (lc0): one pass per 8 source bytes of each line.
lsetup (2f, 3f) lc0 = p5; // W/4
591
// Store the gathered luma while the chroma byte of every halfword is
// shifted down into the low byte.
2: r0 = r0 >> 8(v) || [p0++] = r4; // yyyy-even
592
r1 = r1 >> 8(v) || [p1++] = r5; // yyyy-odd
595
// Combine the chroma of the top and bottom lines.
r4 = byteop1p(r1:0, r3:2);
596
r5 = byteop1p(r1:0, r3:2) (r);
597
// Regroup so that r6.l ends up with two U bytes and r6.h with two V bytes,
// while the next source words are fetched.
r6 = pack(r5.l, r4.l);
598
r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++];
599
r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++];
600
// Gather the luma of the freshly loaded words for the next pass while this
// pass writes its U and V halfwords.
r4 = bytepack(r0, r1) || w[i2++] = r6.l; // uu
601
3: r5 = bytepack(r2, r3) || w[i3++] = r6.h; // vv
610
// Restore the registers saved on entry.
(r7:4,p5:4) = [sp++];
613
DEFUN_END(yuyvtoyv12)