1
/*M///////////////////////////////////////////////////////////////////////////////////////
3
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5
// By downloading, copying, installing or using the software you agree to this license.
6
// If you do not agree to this license, do not download, install,
7
// copy or use the software.
11
// For Open Source Computer Vision Library
13
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
14
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15
// Third party copyrights are property of their respective owners.
18
// Jia Haipeng, jiahaipeng95@gmail.com
19
// Peng Xiao, pengxiao@multicorewareinc.com
21
// Redistribution and use in source and binary forms, with or without modification,
22
// are permitted provided that the following conditions are met:
24
// * Redistribution's of source code must retain the above copyright notice,
25
// this list of conditions and the following disclaimer.
27
// * Redistribution's in binary form must reproduce the above copyright notice,
28
// this list of conditions and the following disclaimer in the documentation
29
// and/or other materials provided with the distribution.
31
// * The name of the copyright holders may not be used to endorse or promote products
32
// derived from this software without specific prior written permission.
34
// This software is provided by the copyright holders and contributors as is and
35
// any express or implied warranties, including, but not limited to, the implied
36
// warranties of merchantability and fitness for a particular purpose are disclaimed.
37
// In no event shall the Intel Corporation or contributors be liable for any direct,
38
// indirect, incidental, special, exemplary, or consequential damages
39
// (including, but not limited to, procurement of substitute goods or services;
40
// loss of use, data, or profits; or business interruption) however caused
41
// and on any theory of liability, whether in contract, strict liability,
42
// or tort (including negligence or otherwise) arising in any way out of
43
// the use of this software, even if advised of the possibility of such damage.
47
/**************************************PUBLICFUNC*************************************/
50
#define DATA_TYPE uchar
53
#define COEFF_TYPE int
54
#define SAT_CAST(num) convert_uchar_sat(num)
57
#define DATA_TYPE ushort
59
#define HALF_MAX 32768
60
#define COEFF_TYPE int
61
#define SAT_CAST(num) convert_ushort_sat(num)
64
#define DATA_TYPE float
67
#define COEFF_TYPE float
68
#define SAT_CAST(num) (num)
71
#error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
74
// Fixed-point "descale": divide x by 2^n rounding half up (adds 2^(n-1) before the shift).
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
87
// Bytes per source pixel: element size times source channel count (scn is a build-time define — TODO confirm it is passed via -D).
#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
88
// Bytes per destination pixel: element size times destination channel count (dcn is a build-time define — TODO confirm it is passed via -D).
#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
107
// The only kernel that uses bidx == 3 doesn't use these macros.
108
// But we still need to make the compiler happy.
123
// Default horizontal pixels processed per work-item when not overridden by build options.
#define PIX_PER_WI_X 1
126
// Raw token-pasting helper (no argument expansion at this level).
#define __CAT(x, y) x##y
127
// Token-pasting with one extra expansion pass so macro arguments are expanded before pasting.
#define CAT(x, y) __CAT(x, y)
129
// 4-component OpenCL vector type of the current element type (e.g. uchar4, ushort4, float4).
#define DATA_TYPE_4 CAT(DATA_TYPE, 4)
131
///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////
133
__kernel void RGB2Gray(__global const uchar * srcptr, int src_step, int src_offset,
134
__global uchar * dstptr, int dst_step, int dst_offset,
137
int x = get_global_id(0);
138
int y = get_global_id(1) * PIX_PER_WI_Y;
142
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
143
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
146
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
150
__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
151
__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
152
DATA_TYPE_4 src_pix = vload4(0, src);
154
dst[0] = fma(src_pix.B_COMP, 0.114f, fma(src_pix.G_COMP, 0.587f, src_pix.R_COMP * 0.299f));
156
dst[0] = (DATA_TYPE)CV_DESCALE(mad24(src_pix.B_COMP, B2Y, mad24(src_pix.G_COMP, G2Y, mul24(src_pix.R_COMP, R2Y))), yuv_shift);
159
src_index += src_step;
160
dst_index += dst_step;
166
__kernel void Gray2RGB(__global const uchar * srcptr, int src_step, int src_offset,
167
__global uchar * dstptr, int dst_step, int dst_offset,
170
int x = get_global_id(0);
171
int y = get_global_id(1) * PIX_PER_WI_Y;
175
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
176
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
179
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
183
__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
184
__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
185
DATA_TYPE val = src[0];
186
#if dcn == 3 || defined DEPTH_5
187
dst[0] = dst[1] = dst[2] = val;
192
*(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(val, val, val, MAX_NUM);
195
dst_index += dst_step;
196
src_index += src_step;
202
///////////////////////////////////// RGB <-> YUV //////////////////////////////////////
204
// RGB->YUV float coefficients: {B,G,R luma weights, U scale (0.492), V scale (0.877)} — BT.601-style.
__constant float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
205
// Fixed-point counterparts of c_RGB2YUVCoeffs_f, scaled by 2^yuv_shift (B2Y/G2Y/R2Y come from build options).
__constant int c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 };
207
__kernel void RGB2YUV(__global const uchar* srcptr, int src_step, int src_offset,
208
__global uchar* dstptr, int dst_step, int dt_offset,
211
int x = get_global_id(0);
212
int y = get_global_id(1) * PIX_PER_WI_Y;
216
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
217
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));
220
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
224
__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
225
__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
226
DATA_TYPE_4 src_pix = vload4(0, src);
227
DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
230
__constant float * coeffs = c_RGB2YUVCoeffs_f;
231
const DATA_TYPE Y = fma(b, coeffs[0], fma(g, coeffs[1], r * coeffs[2]));
232
const DATA_TYPE U = fma(b - Y, coeffs[3], HALF_MAX);
233
const DATA_TYPE V = fma(r - Y, coeffs[4], HALF_MAX);
235
__constant int * coeffs = c_RGB2YUVCoeffs_i;
236
const int delta = HALF_MAX * (1 << yuv_shift);
237
const int Y = CV_DESCALE(mad24(b, coeffs[0], mad24(g, coeffs[1], mul24(r, coeffs[2]))), yuv_shift);
238
const int U = CV_DESCALE(mad24(b - Y, coeffs[3], delta), yuv_shift);
239
const int V = CV_DESCALE(mad24(r - Y, coeffs[4], delta), yuv_shift);
242
dst[0] = SAT_CAST( Y );
243
dst[1] = SAT_CAST( U );
244
dst[2] = SAT_CAST( V );
247
dst_index += dst_step;
248
src_index += src_step;
254
// YUV->RGB float coefficients: {U->B, U->G, V->G, V->R} — inverse of the RGB->YUV transform above.
__constant float c_YUV2RGBCoeffs_f[4] = { 2.032f, -0.395f, -0.581f, 1.140f };
255
// Fixed-point counterparts of c_YUV2RGBCoeffs_f, scaled by 2^yuv_shift.
__constant int c_YUV2RGBCoeffs_i[4] = { 33292, -6472, -9519, 18678 };
257
__kernel void YUV2RGB(__global const uchar* srcptr, int src_step, int src_offset,
258
__global uchar* dstptr, int dst_step, int dt_offset,
261
int x = get_global_id(0);
262
int y = get_global_id(1) * PIX_PER_WI_Y;
266
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
267
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));
270
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
274
__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
275
__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
276
DATA_TYPE_4 src_pix = vload4(0, src);
277
DATA_TYPE Y = src_pix.x, U = src_pix.y, V = src_pix.z;
280
__constant float * coeffs = c_YUV2RGBCoeffs_f;
281
float r = fma(V - HALF_MAX, coeffs[3], Y);
282
float g = fma(V - HALF_MAX, coeffs[2], fma(U - HALF_MAX, coeffs[1], Y));
283
float b = fma(U - HALF_MAX, coeffs[0], Y);
285
__constant int * coeffs = c_YUV2RGBCoeffs_i;
286
const int r = Y + CV_DESCALE(mul24(V - HALF_MAX, coeffs[3]), yuv_shift);
287
const int g = Y + CV_DESCALE(mad24(V - HALF_MAX, coeffs[2], mul24(U - HALF_MAX, coeffs[1])), yuv_shift);
288
const int b = Y + CV_DESCALE(mul24(U - HALF_MAX, coeffs[0]), yuv_shift);
291
dst[bidx] = SAT_CAST( b );
292
dst[1] = SAT_CAST( g );
293
dst[bidx^2] = SAT_CAST( r );
298
dst_index += dst_step;
299
src_index += src_step;
304
// Video-range (luma 16..235) YUV 4:2:0 -> RGB coefficients:
// {Y gain, U->B, U->G, V->G, V->R}. Used by the NVx/YV12/IYUV/422 kernels below.
__constant float c_YUV2RGBCoeffs_420[5] = { 1.163999557f, 2.017999649f, -0.390999794f,
305
-0.812999725f, 1.5959997177f };
307
__kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_offset,
308
__global uchar* dstptr, int dst_step, int dt_offset,
311
int x = get_global_id(0);
312
int y = get_global_id(1) * PIX_PER_WI_Y;
317
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
321
__global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);
322
__global const uchar* usrc = srcptr + mad24(rows + y, src_step, (x << 1) + src_offset);
323
__global uchar* dst1 = dstptr + mad24(y << 1, dst_step, mad24(x, dcn<<1, dt_offset));
324
__global uchar* dst2 = dst1 + dst_step;
328
float Y3 = ysrc[src_step];
329
float Y4 = ysrc[src_step + 1];
331
float U = ((float)usrc[uidx]) - HALF_MAX;
332
float V = ((float)usrc[1-uidx]) - HALF_MAX;
334
__constant float* coeffs = c_YUV2RGBCoeffs_420;
335
float ruv = fma(coeffs[4], V, 0.5f);
336
float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
337
float buv = fma(coeffs[1], U, 0.5f);
339
Y1 = max(0.f, Y1 - 16.f) * coeffs[0];
340
dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);
341
dst1[1] = convert_uchar_sat(Y1 + guv);
342
dst1[bidx] = convert_uchar_sat(Y1 + buv);
347
Y2 = max(0.f, Y2 - 16.f) * coeffs[0];
348
dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);
349
dst1[dcn + 1] = convert_uchar_sat(Y2 + guv);
350
dst1[dcn + bidx] = convert_uchar_sat(Y2 + buv);
355
Y3 = max(0.f, Y3 - 16.f) * coeffs[0];
356
dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);
357
dst2[1] = convert_uchar_sat(Y3 + guv);
358
dst2[bidx] = convert_uchar_sat(Y3 + buv);
363
Y4 = max(0.f, Y4 - 16.f) * coeffs[0];
364
dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);
365
dst2[dcn + 1] = convert_uchar_sat(Y4 + guv);
366
dst2[dcn + bidx] = convert_uchar_sat(Y4 + buv);
376
__kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,
377
__global uchar* dstptr, int dst_step, int dt_offset,
380
int x = get_global_id(0);
381
int y = get_global_id(1) * PIX_PER_WI_Y;
386
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
390
__global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);
391
__global uchar* dst1 = dstptr + mad24(y << 1, dst_step, x * (dcn<<1) + dt_offset);
392
__global uchar* dst2 = dst1 + dst_step;
396
float Y3 = ysrc[src_step];
397
float Y4 = ysrc[src_step + 1];
400
__global const uchar* uvsrc = srcptr + mad24(rows, src_step, src_offset);
401
int u_ind = mad24(y, cols >> 1, x);
402
float uv[2] = { ((float)uvsrc[u_ind]) - HALF_MAX, ((float)uvsrc[u_ind + ((rows * cols) >> 2)]) - HALF_MAX };
404
int vsteps[2] = { cols >> 1, src_step - (cols >> 1)};
405
__global const uchar* usrc = srcptr + mad24(rows + (y>>1), src_step, src_offset + (y%2)*(cols >> 1) + x);
406
__global const uchar* vsrc = usrc + mad24(rows >> 2, src_step, rows % 4 ? vsteps[y%2] : 0);
407
float uv[2] = { ((float)usrc[0]) - HALF_MAX, ((float)vsrc[0]) - HALF_MAX };
410
float V = uv[1-uidx];
412
__constant float* coeffs = c_YUV2RGBCoeffs_420;
413
float ruv = fma(coeffs[4], V, 0.5f);
414
float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
415
float buv = fma(coeffs[1], U, 0.5f);
417
Y1 = max(0.f, Y1 - 16.f) * coeffs[0];
418
dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);
419
dst1[1] = convert_uchar_sat(Y1 + guv);
420
dst1[bidx] = convert_uchar_sat(Y1 + buv);
425
Y2 = max(0.f, Y2 - 16.f) * coeffs[0];
426
dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);
427
dst1[dcn + 1] = convert_uchar_sat(Y2 + guv);
428
dst1[dcn + bidx] = convert_uchar_sat(Y2 + buv);
433
Y3 = max(0.f, Y3 - 16.f) * coeffs[0];
434
dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);
435
dst2[1] = convert_uchar_sat(Y3 + guv);
436
dst2[bidx] = convert_uchar_sat(Y3 + buv);
441
Y4 = max(0.f, Y4 - 16.f) * coeffs[0];
442
dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);
443
dst2[dcn + 1] = convert_uchar_sat(Y4 + guv);
444
dst2[dcn + bidx] = convert_uchar_sat(Y4 + buv);
454
// RGB -> video-range YUV 4:2:0 coefficients:
// {R,G,B -> Y; R,G,B -> U (first three of the -0.147.. run); remaining -> V}.
// NOTE(review): exact per-channel grouping inferred from usage in RGB2YUV_YV12_IYUV — confirm against upstream.
__constant float c_RGB2YUVCoeffs_420[8] = { 0.256999969f, 0.50399971f, 0.09799957f, -0.1479988098f, -0.2909994125f,
455
0.438999176f, -0.3679990768f, -0.0709991455f };
457
__kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,
458
__global uchar* dstptr, int dst_step, int dst_offset,
461
int x = get_global_id(0) * PIX_PER_WI_X;
462
int y = get_global_id(1) * PIX_PER_WI_Y;
466
int src_index = mad24(y << 1, src_step, mad24(x << 1, scn, src_offset));
467
int ydst_index = mad24(y << 1, dst_step, (x << 1) + dst_offset);
468
int y_rows = rows / 3 * 2;
469
int vsteps[2] = { cols >> 1, dst_step - (cols >> 1)};
470
__constant float* coeffs = c_RGB2YUVCoeffs_420;
473
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
477
__global const uchar* src1 = srcptr + src_index;
478
__global const uchar* src2 = src1 + src_step;
479
__global uchar* ydst1 = dstptr + ydst_index;
480
__global uchar* ydst2 = ydst1 + dst_step;
482
__global uchar* udst = dstptr + mad24(y_rows + (y>>1), dst_step, dst_offset + (y%2)*(cols >> 1) + x);
483
__global uchar* vdst = udst + mad24(y_rows >> 2, dst_step, y_rows % 4 ? vsteps[y%2] : 0);
485
#if PIX_PER_WI_X == 2
486
int s11 = *((__global const int*) src1);
487
int s12 = *((__global const int*) src1 + 1);
488
int s13 = *((__global const int*) src1 + 2);
490
int s14 = *((__global const int*) src1 + 3);
492
int s21 = *((__global const int*) src2);
493
int s22 = *((__global const int*) src2 + 1);
494
int s23 = *((__global const int*) src2 + 2);
496
int s24 = *((__global const int*) src2 + 3);
498
float src_pix1[scn * 4], src_pix2[scn * 4];
500
*((float4*) src_pix1) = convert_float4(as_uchar4(s11));
501
*((float4*) src_pix1 + 1) = convert_float4(as_uchar4(s12));
502
*((float4*) src_pix1 + 2) = convert_float4(as_uchar4(s13));
504
*((float4*) src_pix1 + 3) = convert_float4(as_uchar4(s14));
506
*((float4*) src_pix2) = convert_float4(as_uchar4(s21));
507
*((float4*) src_pix2 + 1) = convert_float4(as_uchar4(s22));
508
*((float4*) src_pix2 + 2) = convert_float4(as_uchar4(s23));
510
*((float4*) src_pix2 + 3) = convert_float4(as_uchar4(s24));
513
y1.x = convert_uchar_sat(fma(coeffs[0], src_pix1[ 2-bidx], fma(coeffs[1], src_pix1[ 1], fma(coeffs[2], src_pix1[ bidx], 16.5f))));
514
y1.y = convert_uchar_sat(fma(coeffs[0], src_pix1[ scn+2-bidx], fma(coeffs[1], src_pix1[ scn+1], fma(coeffs[2], src_pix1[ scn+bidx], 16.5f))));
515
y1.z = convert_uchar_sat(fma(coeffs[0], src_pix1[2*scn+2-bidx], fma(coeffs[1], src_pix1[2*scn+1], fma(coeffs[2], src_pix1[2*scn+bidx], 16.5f))));
516
y1.w = convert_uchar_sat(fma(coeffs[0], src_pix1[3*scn+2-bidx], fma(coeffs[1], src_pix1[3*scn+1], fma(coeffs[2], src_pix1[3*scn+bidx], 16.5f))));
517
y2.x = convert_uchar_sat(fma(coeffs[0], src_pix2[ 2-bidx], fma(coeffs[1], src_pix2[ 1], fma(coeffs[2], src_pix2[ bidx], 16.5f))));
518
y2.y = convert_uchar_sat(fma(coeffs[0], src_pix2[ scn+2-bidx], fma(coeffs[1], src_pix2[ scn+1], fma(coeffs[2], src_pix2[ scn+bidx], 16.5f))));
519
y2.z = convert_uchar_sat(fma(coeffs[0], src_pix2[2*scn+2-bidx], fma(coeffs[1], src_pix2[2*scn+1], fma(coeffs[2], src_pix2[2*scn+bidx], 16.5f))));
520
y2.w = convert_uchar_sat(fma(coeffs[0], src_pix2[3*scn+2-bidx], fma(coeffs[1], src_pix2[3*scn+1], fma(coeffs[2], src_pix2[3*scn+bidx], 16.5f))));
522
*((__global int*) ydst1) = as_int(y1);
523
*((__global int*) ydst2) = as_int(y2);
525
float uv[4] = { fma(coeffs[3], src_pix1[ 2-bidx], fma(coeffs[4], src_pix1[ 1], fma(coeffs[5], src_pix1[ bidx], 128.5f))),
526
fma(coeffs[5], src_pix1[ 2-bidx], fma(coeffs[6], src_pix1[ 1], fma(coeffs[7], src_pix1[ bidx], 128.5f))),
527
fma(coeffs[3], src_pix1[2*scn+2-bidx], fma(coeffs[4], src_pix1[2*scn+1], fma(coeffs[5], src_pix1[2*scn+bidx], 128.5f))),
528
fma(coeffs[5], src_pix1[2*scn+2-bidx], fma(coeffs[6], src_pix1[2*scn+1], fma(coeffs[7], src_pix1[2*scn+bidx], 128.5f))) };
530
udst[0] = convert_uchar_sat(uv[uidx] );
531
vdst[0] = convert_uchar_sat(uv[1 - uidx]);
532
udst[1] = convert_uchar_sat(uv[2 + uidx]);
533
vdst[1] = convert_uchar_sat(uv[3 - uidx]);
535
float4 src_pix1 = convert_float4(vload4(0, src1));
536
float4 src_pix2 = convert_float4(vload4(0, src1+scn));
537
float4 src_pix3 = convert_float4(vload4(0, src2));
538
float4 src_pix4 = convert_float4(vload4(0, src2+scn));
540
ydst1[0] = convert_uchar_sat(fma(coeffs[0], src_pix1.R_COMP, fma(coeffs[1], src_pix1.G_COMP, fma(coeffs[2], src_pix1.B_COMP, 16.5f))));
541
ydst1[1] = convert_uchar_sat(fma(coeffs[0], src_pix2.R_COMP, fma(coeffs[1], src_pix2.G_COMP, fma(coeffs[2], src_pix2.B_COMP, 16.5f))));
542
ydst2[0] = convert_uchar_sat(fma(coeffs[0], src_pix3.R_COMP, fma(coeffs[1], src_pix3.G_COMP, fma(coeffs[2], src_pix3.B_COMP, 16.5f))));
543
ydst2[1] = convert_uchar_sat(fma(coeffs[0], src_pix4.R_COMP, fma(coeffs[1], src_pix4.G_COMP, fma(coeffs[2], src_pix4.B_COMP, 16.5f))));
545
float uv[2] = { fma(coeffs[3], src_pix1.R_COMP, fma(coeffs[4], src_pix1.G_COMP, fma(coeffs[5], src_pix1.B_COMP, 128.5f))),
546
fma(coeffs[5], src_pix1.R_COMP, fma(coeffs[6], src_pix1.G_COMP, fma(coeffs[7], src_pix1.B_COMP, 128.5f))) };
548
udst[0] = convert_uchar_sat(uv[uidx] );
549
vdst[0] = convert_uchar_sat(uv[1-uidx]);
552
src_index += 2*src_step;
553
ydst_index += 2*dst_step;
559
__kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_offset,
560
__global uchar* dstptr, int dst_step, int dst_offset,
563
int x = get_global_id(0);
564
int y = get_global_id(1) * PIX_PER_WI_Y;
568
__global const uchar* src = srcptr + mad24(y, src_step, (x << 2) + src_offset);
569
__global uchar* dst = dstptr + mad24(y, dst_step, mad24(x << 1, dcn, dst_offset));
572
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
576
__constant float* coeffs = c_YUV2RGBCoeffs_420;
578
#ifndef USE_OPTIMIZED_LOAD
579
float U = ((float) src[uidx]) - HALF_MAX;
580
float V = ((float) src[(2 + uidx) % 4]) - HALF_MAX;
581
float y00 = max(0.f, ((float) src[yidx]) - 16.f) * coeffs[0];
582
float y01 = max(0.f, ((float) src[yidx + 2]) - 16.f) * coeffs[0];
584
int load_src = *((__global int*) src);
585
float vec_src[4] = { load_src & 0xff, (load_src >> 8) & 0xff, (load_src >> 16) & 0xff, (load_src >> 24) & 0xff};
586
float U = vec_src[uidx] - HALF_MAX;
587
float V = vec_src[(2 + uidx) % 4] - HALF_MAX;
588
float y00 = max(0.f, vec_src[yidx] - 16.f) * coeffs[0];
589
float y01 = max(0.f, vec_src[yidx + 2] - 16.f) * coeffs[0];
592
float ruv = fma(coeffs[4], V, 0.5f);
593
float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
594
float buv = fma(coeffs[1], U, 0.5f);
596
dst[2 - bidx] = convert_uchar_sat(y00 + ruv);
597
dst[1] = convert_uchar_sat(y00 + guv);
598
dst[bidx] = convert_uchar_sat(y00 + buv);
603
dst[dcn + 2 - bidx] = convert_uchar_sat(y01 + ruv);
604
dst[dcn + 1] = convert_uchar_sat(y01 + guv);
605
dst[dcn + bidx] = convert_uchar_sat(y01 + buv);
617
///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
619
// RGB->YCrCb float coefficients: {R,G,B luma weights, Cr scale (0.713), Cb scale (0.564)}.
__constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
620
// Fixed-point counterparts of c_RGB2YCrCbCoeffs_f, scaled by 2^yuv_shift (R2Y/G2Y/B2Y from build options).
__constant int c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241};
622
__kernel void RGB2YCrCb(__global const uchar* srcptr, int src_step, int src_offset,
623
__global uchar* dstptr, int dst_step, int dt_offset,
626
int x = get_global_id(0);
627
int y = get_global_id(1) * PIX_PER_WI_Y;
631
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
632
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));
635
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
639
__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
640
__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
641
DATA_TYPE_4 src_pix = vload4(0, src);
642
DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
645
__constant float * coeffs = c_RGB2YCrCbCoeffs_f;
646
DATA_TYPE Y = fma(b, coeffs[2], fma(g, coeffs[1], r * coeffs[0]));
647
DATA_TYPE Cr = fma(r - Y, coeffs[3], HALF_MAX);
648
DATA_TYPE Cb = fma(b - Y, coeffs[4], HALF_MAX);
650
__constant int * coeffs = c_RGB2YCrCbCoeffs_i;
651
int delta = HALF_MAX * (1 << yuv_shift);
652
int Y = CV_DESCALE(mad24(b, coeffs[2], mad24(g, coeffs[1], mul24(r, coeffs[0]))), yuv_shift);
653
int Cr = CV_DESCALE(mad24(r - Y, coeffs[3], delta), yuv_shift);
654
int Cb = CV_DESCALE(mad24(b - Y, coeffs[4], delta), yuv_shift);
657
dst[0] = SAT_CAST( Y );
658
dst[1] = SAT_CAST( Cr );
659
dst[2] = SAT_CAST( Cb );
662
dst_index += dst_step;
663
src_index += src_step;
669
// YCrCb->RGB float coefficients: {Cr->R, Cr->G, Cb->G, Cb->B} — inverse of the RGB->YCrCb transform above.
__constant float c_YCrCb2RGBCoeffs_f[4] = { 1.403f, -0.714f, -0.344f, 1.773f };
670
// Fixed-point counterparts of c_YCrCb2RGBCoeffs_f, scaled by 2^yuv_shift.
__constant int c_YCrCb2RGBCoeffs_i[4] = { 22987, -11698, -5636, 29049 };
672
__kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset,
673
__global uchar* dst, int dst_step, int dst_offset,
676
int x = get_global_id(0);
677
int y = get_global_id(1) * PIX_PER_WI_Y;
681
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
682
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
685
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
689
__global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_index);
690
__global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_index);
692
DATA_TYPE_4 src_pix = vload4(0, srcptr);
693
DATA_TYPE yp = src_pix.x, cr = src_pix.y, cb = src_pix.z;
696
__constant float * coeff = c_YCrCb2RGBCoeffs_f;
697
float r = fma(coeff[0], cr - HALF_MAX, yp);
698
float g = fma(coeff[1], cr - HALF_MAX, fma(coeff[2], cb - HALF_MAX, yp));
699
float b = fma(coeff[3], cb - HALF_MAX, yp);
701
__constant int * coeff = c_YCrCb2RGBCoeffs_i;
702
int r = yp + CV_DESCALE(coeff[0] * (cr - HALF_MAX), yuv_shift);
703
int g = yp + CV_DESCALE(mad24(coeff[1], cr - HALF_MAX, coeff[2] * (cb - HALF_MAX)), yuv_shift);
704
int b = yp + CV_DESCALE(coeff[3] * (cb - HALF_MAX), yuv_shift);
707
dstptr[(bidx^2)] = SAT_CAST(r);
708
dstptr[1] = SAT_CAST(g);
709
dstptr[bidx] = SAT_CAST(b);
715
dst_index += dst_step;
716
src_index += src_step;
722
///////////////////////////////////// RGB <-> XYZ //////////////////////////////////////
724
__kernel void RGB2XYZ(__global const uchar * srcptr, int src_step, int src_offset,
725
__global uchar * dstptr, int dst_step, int dst_offset,
726
int rows, int cols, __constant COEFF_TYPE * coeffs)
728
int dx = get_global_id(0);
729
int dy = get_global_id(1) * PIX_PER_WI_Y;
733
int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
734
int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));
737
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
741
__global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
742
__global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
744
DATA_TYPE_4 src_pix = vload4(0, src);
745
DATA_TYPE r = src_pix.x, g = src_pix.y, b = src_pix.z;
748
float x = fma(r, coeffs[0], fma(g, coeffs[1], b * coeffs[2]));
749
float y = fma(r, coeffs[3], fma(g, coeffs[4], b * coeffs[5]));
750
float z = fma(r, coeffs[6], fma(g, coeffs[7], b * coeffs[8]));
752
int x = CV_DESCALE(mad24(r, coeffs[0], mad24(g, coeffs[1], b * coeffs[2])), xyz_shift);
753
int y = CV_DESCALE(mad24(r, coeffs[3], mad24(g, coeffs[4], b * coeffs[5])), xyz_shift);
754
int z = CV_DESCALE(mad24(r, coeffs[6], mad24(g, coeffs[7], b * coeffs[8])), xyz_shift);
756
dst[0] = SAT_CAST(x);
757
dst[1] = SAT_CAST(y);
758
dst[2] = SAT_CAST(z);
761
dst_index += dst_step;
762
src_index += src_step;
768
__kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offset,
769
__global uchar * dstptr, int dst_step, int dst_offset,
770
int rows, int cols, __constant COEFF_TYPE * coeffs)
772
int dx = get_global_id(0);
773
int dy = get_global_id(1) * PIX_PER_WI_Y;
777
int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
778
int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));
781
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
785
__global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
786
__global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
788
DATA_TYPE_4 src_pix = vload4(0, src);
789
DATA_TYPE x = src_pix.x, y = src_pix.y, z = src_pix.z;
792
float b = fma(x, coeffs[0], fma(y, coeffs[1], z * coeffs[2]));
793
float g = fma(x, coeffs[3], fma(y, coeffs[4], z * coeffs[5]));
794
float r = fma(x, coeffs[6], fma(y, coeffs[7], z * coeffs[8]));
796
int b = CV_DESCALE(mad24(x, coeffs[0], mad24(y, coeffs[1], z * coeffs[2])), xyz_shift);
797
int g = CV_DESCALE(mad24(x, coeffs[3], mad24(y, coeffs[4], z * coeffs[5])), xyz_shift);
798
int r = CV_DESCALE(mad24(x, coeffs[6], mad24(y, coeffs[7], z * coeffs[8])), xyz_shift);
801
DATA_TYPE dst0 = SAT_CAST(b);
802
DATA_TYPE dst1 = SAT_CAST(g);
803
DATA_TYPE dst2 = SAT_CAST(r);
804
#if dcn == 3 || defined DEPTH_5
812
*(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(dst0, dst1, dst2, MAX_NUM);
816
dst_index += dst_step;
817
src_index += src_step;
823
///////////////////////////////////// RGB[A] <-> BGR[A] //////////////////////////////////////
825
__kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,
826
__global uchar* dstptr, int dst_step, int dst_offset,
829
int x = get_global_id(0);
830
int y = get_global_id(1) * PIX_PER_WI_Y;
834
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
835
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
838
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
842
__global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
843
__global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
844
DATA_TYPE_4 src_pix = vload4(0, src);
865
dst_index += dst_step;
866
src_index += src_step;
872
///////////////////////////////////// RGB5x5 <-> RGB //////////////////////////////////////
874
__kernel void RGB5x52RGB(__global const uchar* src, int src_step, int src_offset,
875
__global uchar* dst, int dst_step, int dst_offset,
878
int x = get_global_id(0);
879
int y = get_global_id(1) * PIX_PER_WI_Y;
883
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
884
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
887
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
891
ushort t = *((__global const ushort*)(src + src_index));
894
dst[dst_index + bidx] = (uchar)(t << 3);
895
dst[dst_index + 1] = (uchar)((t >> 3) & ~3);
896
dst[dst_index + (bidx^2)] = (uchar)((t >> 8) & ~7);
898
dst[dst_index + bidx] = (uchar)(t << 3);
899
dst[dst_index + 1] = (uchar)((t >> 2) & ~7);
900
dst[dst_index + (bidx^2)] = (uchar)((t >> 7) & ~7);
905
dst[dst_index + 3] = 255;
907
dst[dst_index + 3] = t & 0x8000 ? 255 : 0;
912
dst_index += dst_step;
913
src_index += src_step;
919
__kernel void RGB2RGB5x5(__global const uchar* src, int src_step, int src_offset,
920
__global uchar* dst, int dst_step, int dst_offset,
923
int x = get_global_id(0);
924
int y = get_global_id(1) * PIX_PER_WI_Y;
928
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
929
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
932
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
936
uchar4 src_pix = vload4(0, src + src_index);
939
*((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8));
941
*((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7));
943
*((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|
944
((src_pix.R_COMP&~7) << 7)|(src_pix.w ? 0x8000 : 0));
948
dst_index += dst_step;
949
src_index += src_step;
955
///////////////////////////////////// RGB5x5 <-> Gray //////////////////////////////////////
957
__kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offset,
958
__global uchar* dst, int dst_step, int dst_offset,
961
int x = get_global_id(0);
962
int y = get_global_id(1) * PIX_PER_WI_Y;
966
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
967
int dst_index = mad24(y, dst_step, dst_offset + x);
970
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
974
int t = *((__global const ushort*)(src + src_index));
977
dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 3) & 0xfc, G2Y, ((t >> 8) & 0xf8) * R2Y)), yuv_shift);
979
dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 2) & 0xf8, G2Y, ((t >> 7) & 0xf8) * R2Y)), yuv_shift);
982
dst_index += dst_step;
983
src_index += src_step;
989
__kernel void Gray2BGR5x5(__global const uchar* src, int src_step, int src_offset,
990
__global uchar* dst, int dst_step, int dst_offset,
993
int x = get_global_id(0);
994
int y = get_global_id(1) * PIX_PER_WI_Y;
998
int src_index = mad24(y, src_step, src_offset + x);
999
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1002
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1006
int t = src[src_index];
1009
*((__global ushort*)(dst + dst_index)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
1012
*((__global ushort*)(dst + dst_index)) = (ushort)(t|(t << 5)|(t << 10));
1015
dst_index += dst_step;
1016
src_index += src_step;
1022
//////////////////////////////////// RGB <-> HSV //////////////////////////////////////
1024
__constant int sector_data[][3] = { { 1, 3, 0 },
1033
__kernel void RGB2HSV(__global const uchar* src, int src_step, int src_offset,
1034
__global uchar* dst, int dst_step, int dst_offset,
1036
__constant int * sdiv_table, __constant int * hdiv_table)
1038
int x = get_global_id(0);
1039
int y = get_global_id(1) * PIX_PER_WI_Y;
1043
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1044
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1047
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1051
uchar4 src_pix = vload4(0, src + src_index);
1053
int b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
1060
vmin = min(vmin, g);
1061
vmin = min(vmin, r);
1064
vr = v == r ? -1 : 0;
1065
vg = v == g ? -1 : 0;
1067
s = mad24(diff, sdiv_table[v], (1 << (hsv_shift-1))) >> hsv_shift;
1068
h = (vr & (g - b)) +
1069
(~vr & ((vg & mad24(diff, 2, b - r)) + ((~vg) & mad24(4, diff, r - g))));
1070
h = mad24(h, hdiv_table[diff], (1 << (hsv_shift-1))) >> hsv_shift;
1071
h += h < 0 ? hrange : 0;
1073
dst[dst_index] = convert_uchar_sat_rte(h);
1074
dst[dst_index + 1] = (uchar)s;
1075
dst[dst_index + 2] = (uchar)v;
1078
dst_index += dst_step;
1079
src_index += src_step;
1085
__kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset,
1086
__global uchar* dst, int dst_step, int dst_offset,
1089
int x = get_global_id(0);
1090
int y = get_global_id(1) * PIX_PER_WI_Y;
1094
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1095
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1098
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1102
uchar4 src_pix = vload4(0, src + src_index);
1104
float h = src_pix.x, s = src_pix.y*(1/255.f), v = src_pix.z*(1/255.f);
1113
do h += 6; while( h < 0 );
1115
do h -= 6; while( h >= 6 );
1116
sector = convert_int_sat_rtn(h);
1118
if( (unsigned)sector >= 6u )
1125
tab[1] = v*(1.f - s);
1126
tab[2] = v*(1.f - s*h);
1127
tab[3] = v*(1.f - s*(1.f - h));
1129
b = tab[sector_data[sector][0]];
1130
g = tab[sector_data[sector][1]];
1131
r = tab[sector_data[sector][2]];
1136
dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
1137
dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
1138
dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
1140
dst[dst_index + 3] = MAX_NUM;
1144
dst_index += dst_step;
1145
src_index += src_step;
1151
#elif defined DEPTH_5
1153
// RGB/BGR -> HSV conversion, float (DEPTH_5) path.
// NOTE(review): garbled extraction — interleaved bare numbers are original
// line numbers; missing lines include braces, `v`/`vmin`/`diff`/`h`/`s`
// declarations and the vmax computation. Code left byte-identical.
__kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset,
1154
__global uchar* dstptr, int dst_step, int dst_offset,
1157
int x = get_global_id(0);
1158
int y = get_global_id(1) * PIX_PER_WI_Y;
1162
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1163
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1166
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1170
__global const float * src = (__global const float *)(srcptr + src_index);
1171
__global float * dst = (__global float *)(dstptr + dst_index);
1172
float4 src_pix = vload4(0, src);
1174
// B_COMP/G_COMP/R_COMP are macros selecting components per bidx ordering.
float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
1182
if( vmin > g ) vmin = g;
1183
if( vmin > b ) vmin = b;
1186
// FLT_EPSILON guards the divisions when the pixel is black / grey.
s = diff/(float)(fabs(v) + FLT_EPSILON);
1187
// diff is reused as the 60-degree-per-sector hue scale factor.
diff = (float)(60.f/(diff + FLT_EPSILON));
1191
h = fma(b - r, diff, 120.f);
1193
h = fma(r - g, diff, 240.f);
1203
dst_index += dst_step;
1204
src_index += src_step;
1210
// HSV -> RGB/BGR conversion, float (DEPTH_5) path. Mirrors the 8-bit kernel
// above but reads/writes float channels directly (no 1/255 scaling of H here;
// the missing lines presumably rescale hue by hscale before the fold).
// NOTE(review): garbled extraction — bare numbers are original line numbers;
// braces and declarations of `sector`, `tab[4]`, `b/g/r` are missing.
__kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset,
1211
__global uchar* dstptr, int dst_step, int dst_offset,
1214
int x = get_global_id(0);
1215
int y = get_global_id(1) * PIX_PER_WI_Y;
1219
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1220
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1223
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1228
__global const float * src = (__global const float *)(srcptr + src_index);
1229
__global float * dst = (__global float *)(dstptr + dst_index);
1230
float4 src_pix = vload4(0, src);
1232
float h = src_pix.x, s = src_pix.y, v = src_pix.z;
1241
// Fold hue into [0, 6).
do h += 6; while (h < 0);
1243
do h -= 6; while (h >= 6);
1244
sector = convert_int_sat_rtn(h);
1246
if ((unsigned)sector >= 6u)
1253
tab[1] = v*(1.f - s);
1254
tab[2] = v*(1.f - s*h);
1255
tab[3] = v*(1.f - s*(1.f - h));
1257
b = tab[sector_data[sector][0]];
1258
g = tab[sector_data[sector][1]];
1259
r = tab[sector_data[sector][2]];
1272
dst_index += dst_step;
1273
src_index += src_step;
1281
///////////////////////////////////// RGB <-> HLS //////////////////////////////////////
1285
// RGB/BGR -> HLS conversion, 8-bit (DEPTH_0) path.
// NOTE(review): garbled extraction — bare numbers are original line numbers;
// missing lines include braces, the vmax/vmin initialization (both presumably
// start at r), the `diff` computation, and the `vmax == r` hue branch.
__kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset,
1286
__global uchar* dst, int dst_step, int dst_offset,
1289
int x = get_global_id(0);
1290
int y = get_global_id(1) * PIX_PER_WI_Y;
1294
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1295
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1298
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1302
uchar4 src_pix = vload4(0, src + src_index);
1304
// Normalize 8-bit channels to [0,1].
float b = src_pix.B_COMP*(1/255.f), g = src_pix.G_COMP*(1/255.f), r = src_pix.R_COMP*(1/255.f);
1305
float h = 0.f, s = 0.f, l;
1306
float vmin, vmax, diff;
1309
if (vmax < g) vmax = g;
1310
if (vmax < b) vmax = b;
1311
if (vmin > g) vmin = g;
1312
if (vmin > b) vmin = b;
1315
// Lightness is the midpoint of max and min channel.
l = (vmax + vmin)*0.5f;
1317
if (diff > FLT_EPSILON)
1319
// Standard HLS saturation: denominator depends on which side of L=0.5.
s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
1324
else if( vmax == g )
1325
h = fma(b - r, diff, 120.f);
1327
h = fma(r - g, diff, 240.f);
1333
// hscale maps degrees to the 8-bit hue range.
dst[dst_index] = convert_uchar_sat_rte(h*hscale);
1334
dst[dst_index + 1] = convert_uchar_sat_rte(l*255.f);
1335
dst[dst_index + 2] = convert_uchar_sat_rte(s*255.f);
1338
dst_index += dst_step;
1339
src_index += src_step;
1345
// HLS -> RGB/BGR conversion, 8-bit (DEPTH_0) path.
// NOTE(review): garbled extraction — bare numbers are original line numbers;
// missing lines include braces, declarations of `tab[4]`, `b/g/r`, and the
// tab[0]/tab[1] assignments (presumably p2 and p1 respectively).
__kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset,
1346
__global uchar* dst, int dst_step, int dst_offset,
1349
int x = get_global_id(0);
1350
int y = get_global_id(1) * PIX_PER_WI_Y;
1354
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1355
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1358
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1362
uchar4 src_pix = vload4(0, src + src_index);
1364
float h = src_pix.x, l = src_pix.y*(1.f/255.f), s = src_pix.z*(1.f/255.f);
1371
// p2/p1: the classic HLS chroma endpoints.
float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
1372
float p1 = 2*l - p2;
1376
// Fold hue into [0, 6).
do h += 6; while( h < 0 );
1378
do h -= 6; while( h >= 6 );
1380
int sector = convert_int_sat_rtn(h);
1385
tab[2] = fma(p2 - p1, 1-h, p1);
1386
tab[3] = fma(p2 - p1, h, p1);
1388
b = tab[sector_data[sector][0]];
1389
g = tab[sector_data[sector][1]];
1390
r = tab[sector_data[sector][2]];
1395
dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
1396
dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
1397
dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
1399
dst[dst_index + 3] = MAX_NUM;
1403
dst_index += dst_step;
1404
src_index += src_step;
1410
#elif defined DEPTH_5
1412
// RGB/BGR -> HLS conversion, float (DEPTH_5) path. Same math as the 8-bit
// kernel but without the /255 normalization; hue stays in degrees.
// NOTE(review): garbled extraction — bare numbers are original line numbers;
// the vmax/vmin initialization, `diff` computation, the `vmax == r` branch,
// and the final stores are missing. Code left byte-identical.
__kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset,
1413
__global uchar* dstptr, int dst_step, int dst_offset,
1416
int x = get_global_id(0);
1417
int y = get_global_id(1) * PIX_PER_WI_Y;
1421
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1422
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1425
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1429
__global const float * src = (__global const float *)(srcptr + src_index);
1430
__global float * dst = (__global float *)(dstptr + dst_index);
1431
float4 src_pix = vload4(0, src);
1433
float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
1434
float h = 0.f, s = 0.f, l;
1435
float vmin, vmax, diff;
1438
if (vmax < g) vmax = g;
1439
if (vmax < b) vmax = b;
1440
if (vmin > g) vmin = g;
1441
if (vmin > b) vmin = b;
1444
l = (vmax + vmin)*0.5f;
1446
if (diff > FLT_EPSILON)
1448
s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
1453
else if( vmax == g )
1454
h = fma(b - r, diff, 120.f);
1456
h = fma(r - g, diff, 240.f);
1458
// Keep hue in [0, 360).
if( h < 0.f ) h += 360.f;
1466
dst_index += dst_step;
1467
src_index += src_step;
1473
// HLS -> RGB/BGR conversion, float (DEPTH_5) path.
// NOTE(review): garbled extraction — bare numbers are original line numbers;
// braces, `sector`/`tab`/`b/g/r` declarations, tab[0]/tab[1] assignments,
// and the output stores are missing. Code left byte-identical.
__kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset,
1474
__global uchar* dstptr, int dst_step, int dst_offset,
1477
int x = get_global_id(0);
1478
int y = get_global_id(1) * PIX_PER_WI_Y;
1482
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1483
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1486
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1490
__global const float * src = (__global const float *)(srcptr + src_index);
1491
__global float * dst = (__global float *)(dstptr + dst_index);
1492
float4 src_pix = vload4(0, src);
1494
float h = src_pix.x, l = src_pix.y, s = src_pix.z;
1502
float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
1503
float p1 = 2*l - p2;
1507
// Fold hue into [0, 6).
do h += 6; while( h < 0 );
1509
do h -= 6; while( h >= 6 );
1511
sector = convert_int_sat_rtn(h);
1516
tab[2] = fma(p2 - p1, 1-h, p1);
1517
tab[3] = fma(p2 - p1, h, p1);
1519
b = tab[sector_data[sector][0]];
1520
g = tab[sector_data[sector][1]];
1521
r = tab[sector_data[sector][2]];
1534
dst_index += dst_step;
1535
src_index += src_step;
1543
/////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
1547
// RGBA -> premultiplied RGBA: each color channel becomes
// round(c * alpha / MAX_NUM); alpha is passed through unchanged.
// NOTE(review): garbled extraction — bare numbers are original line numbers;
// braces and the rows/cols guard are missing. Code left byte-identical.
__kernel void RGBA2mRGBA(__global const uchar* src, int src_step, int src_offset,
1548
__global uchar* dst, int dst_step, int dst_offset,
1551
int x = get_global_id(0);
1552
int y = get_global_id(1) * PIX_PER_WI_Y;
1556
// x << 2: four bytes per RGBA pixel.
int src_index = mad24(y, src_step, src_offset + (x << 2));
1557
int dst_index = mad24(y, dst_step, dst_offset + (x << 2));
1560
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1564
uchar4 src_pix = *(__global const uchar4 *)(src + src_index);
1566
*(__global uchar4 *)(dst + dst_index) =
1567
// HALF_MAX added for round-to-nearest before the integer divide.
(uchar4)(mad24(src_pix.x, src_pix.w, HALF_MAX) / MAX_NUM,
1568
mad24(src_pix.y, src_pix.w, HALF_MAX) / MAX_NUM,
1569
mad24(src_pix.z, src_pix.w, HALF_MAX) / MAX_NUM, src_pix.w);
1572
dst_index += dst_step;
1573
src_index += src_step;
1579
// Premultiplied RGBA -> straight RGBA: c = round(c * MAX_NUM / alpha).
// NOTE(review): garbled extraction — bare numbers are original line numbers;
// the `if (v3 == 0)`/`else` structure around the two stores is missing, which
// is presumably why an all-zero pixel is written in one branch.
__kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset,
1580
__global uchar* dst, int dst_step, int dst_offset,
1583
int x = get_global_id(0);
1584
int y = get_global_id(1) * PIX_PER_WI_Y;
1588
int src_index = mad24(y, src_step, mad24(x, 4, src_offset));
1589
int dst_index = mad24(y, dst_step, mad24(x, 4, dst_offset));
1592
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1596
uchar4 src_pix = *(__global const uchar4 *)(src + src_index);
1597
// v3_half provides round-to-nearest in the division below.
uchar v3 = src_pix.w, v3_half = v3 / 2;
1600
// Zero-alpha case: emit a fully transparent black pixel.
*(__global uchar4 *)(dst + dst_index) = (uchar4)(0, 0, 0, 0);
1602
*(__global uchar4 *)(dst + dst_index) =
1603
(uchar4)(mad24(src_pix.x, MAX_NUM, v3_half) / v3,
1604
mad24(src_pix.y, MAX_NUM, v3_half) / v3,
1605
mad24(src_pix.z, MAX_NUM, v3_half) / v3, v3);
1608
dst_index += dst_step;
1609
src_index += src_step;
1617
/////////////////////////////////// [l|s]RGB <-> Lab ///////////////////////////
1619
// Fixed-point parameters for the integer Lab conversion path.
// NOTE(review): bare numeric lines are original-file line numbers
// left over from a garbled extraction.
#define lab_shift xyz_shift
1620
// Extra bits carried through the gamma table.
#define gamma_shift 3
1621
#define lab_shift2 (lab_shift + gamma_shift)
1622
// Resolution of the spline gamma table used by the float paths.
#define GAMMA_TAB_SIZE 1024
1623
#define GammaTabScale (float)GAMMA_TAB_SIZE
1625
// Cubic-spline lookup: picks the spline segment for x and evaluates its
// cubic polynomial via Horner/fma.
// NOTE(review): garbled extraction — the lines between the clamp and the
// return (presumably `x -= ix; tab += ix*4;`) are missing, so `tab[0..3]`
// here refer to the segment's four coefficients after that adjustment.
inline float splineInterpolate(float x, __global const float * tab, int n)
1627
int ix = clamp(convert_int_sat_rtn(x), 0, n-1);
1630
return fma(fma(fma(tab[3], x, tab[2]), x, tab[1]), x, tab[0]);
1635
// BGR -> Lab conversion, 8-bit fixed-point (DEPTH_0) path: gamma-correct via
// lookup table, project to XYZ with integer coeffs, cube-root via table,
// then scale/offset into 8-bit L, a, b.
// NOTE(review): garbled extraction — bare numbers are original line numbers;
// braces and the rows/cols guard are missing. Code left byte-identical.
__kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset,
1636
__global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
1637
__global const ushort * gammaTab, __global ushort * LabCbrtTab_b,
1638
__constant int * coeffs, int Lscale, int Lshift)
1640
int x = get_global_id(0);
1641
int y = get_global_id(1) * PIX_PER_WI_Y;
1645
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1646
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1649
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1653
__global const uchar* src_ptr = src + src_index;
1654
__global uchar* dst_ptr = dst + dst_index;
1655
uchar4 src_pix = vload4(0, src_ptr);
1657
// 3x3 XYZ matrix in fixed point.
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
1658
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
1659
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
1661
// Gamma-decode each channel through the precomputed table.
int R = gammaTab[src_pix.x], G = gammaTab[src_pix.y], B = gammaTab[src_pix.z];
1662
// f(X), f(Y), f(Z): table-based cube-root of the XYZ projection.
int fX = LabCbrtTab_b[CV_DESCALE(mad24(R, C0, mad24(G, C1, B*C2)), lab_shift)];
1663
int fY = LabCbrtTab_b[CV_DESCALE(mad24(R, C3, mad24(G, C4, B*C5)), lab_shift)];
1664
int fZ = LabCbrtTab_b[CV_DESCALE(mad24(R, C6, mad24(G, C7, B*C8)), lab_shift)];
1666
int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
1667
// 128 offset recenters a/b into the unsigned 8-bit range.
int a = CV_DESCALE( mad24(500, fX - fY, 128*(1 << lab_shift2)), lab_shift2 );
1668
int b = CV_DESCALE( mad24(200, fY - fZ, 128*(1 << lab_shift2)), lab_shift2 );
1670
dst_ptr[0] = SAT_CAST(L);
1671
dst_ptr[1] = SAT_CAST(a);
1672
dst_ptr[2] = SAT_CAST(b);
1675
dst_index += dst_step;
1676
src_index += src_step;
1682
#elif defined DEPTH_5
1684
// BGR -> Lab conversion, float (DEPTH_5) path: optional sRGB gamma via spline
// table, XYZ projection, then the CIE f() piecewise transform.
// NOTE(review): garbled extraction — bare numbers are original line numbers;
// braces, the #ifdef SRGB guard around the spline calls, and the final stores
// are missing. Code left byte-identical.
__kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offset,
1685
__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
1687
__global const float * gammaTab,
1689
// _1_3 is presumably 1/3 for the cube root; _a the 16/116 offset — confirm.
__constant float * coeffs, float _1_3, float _a)
1691
int x = get_global_id(0);
1692
int y = get_global_id(1) * PIX_PER_WI_Y;
1696
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1697
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1700
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1704
__global const float * src = (__global const float *)(srcptr + src_index);
1705
__global float * dst = (__global float *)(dstptr + dst_index);
1706
float4 src_pix = vload4(0, src);
1708
float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
1709
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
1710
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
1712
// Clamp inputs to the valid [0,1] range before gamma/table lookups.
float R = clamp(src_pix.x, 0.0f, 1.0f);
1713
float G = clamp(src_pix.y, 0.0f, 1.0f);
1714
float B = clamp(src_pix.z, 0.0f, 1.0f);
1717
R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1718
G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1719
B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1722
float X = fma(R, C0, fma(G, C1, B*C2));
1723
float Y = fma(R, C3, fma(G, C4, B*C5));
1724
float Z = fma(R, C6, fma(G, C7, B*C8));
1726
// CIE f(t): cube root above the 0.008856 threshold, linear below.
float FX = X > 0.008856f ? rootn(X, 3) : fma(7.787f, X, _a);
1727
float FY = Y > 0.008856f ? rootn(Y, 3) : fma(7.787f, Y, _a);
1728
float FZ = Z > 0.008856f ? rootn(Z, 3) : fma(7.787f, Z, _a);
1730
float L = Y > 0.008856f ? fma(116.f, FY, -16.f) : (903.3f * Y);
1731
float a = 500.f * (FX - FY);
1732
float b = 200.f * (FY - FZ);
1739
dst_index += dst_step;
1740
src_index += src_step;
1748
// Shared Lab -> BGR core for both depth paths: inverts the CIE f() transform
// for L, a, b; projects XYZ back to RGB; re-applies gamma via spline table.
// NOTE(review): garbled extraction — bare numbers are original line numbers;
// the `float y, fy;` declarations, the L-threshold branch structure around
// lines using lThresh, and several braces are missing. Code left byte-identical.
inline void Lab2BGR_f(const float * srcbuf, float * dstbuf,
1750
__global const float * gammaTab,
1752
__constant float * coeffs, float lThresh, float fThresh)
1754
float li = srcbuf[0], ai = srcbuf[1], bi = srcbuf[2];
1756
float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
1757
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
1758
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
1764
// Low-L branch: linear inverse of f().
fy = fma(7.787f, y, 16.0f / 116.0f);
1768
fy = (li + 16.0f) / 116.0f;
1772
// fx and fz recovered from a and b relative to fy.
float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f };
1775
for (int j = 0; j < 2; j++)
1776
if (fxz[j] <= fThresh)
1777
fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f;
1779
fxz[j] = fxz[j] * fxz[j] * fxz[j];
1781
float x = fxz[0], z = fxz[1];
1782
float ro = clamp(fma(C0, x, fma(C1, y, C2 * z)), 0.0f, 1.0f);
1783
float go = clamp(fma(C3, x, fma(C4, y, C5 * z)), 0.0f, 1.0f);
1784
float bo = clamp(fma(C6, x, fma(C7, y, C8 * z)), 0.0f, 1.0f);
1787
ro = splineInterpolate(ro * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1788
go = splineInterpolate(go * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1789
bo = splineInterpolate(bo * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1792
dstbuf[0] = ro, dstbuf[1] = go, dstbuf[2] = bo;
1797
// Lab -> BGR conversion, 8-bit (DEPTH_0) path: decode L/a/b from their 8-bit
// encodings, run the shared float core, re-encode to 8-bit.
// NOTE(review): garbled extraction — bare numbers are original line numbers;
// the #if dcn==3/#else structure that selects between the 3-channel stores
// and the uchar4 store is missing, so both appear sequentially here.
__kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset,
1798
__global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
1800
__global const float * gammaTab,
1802
__constant float * coeffs, float lThresh, float fThresh)
1804
int x = get_global_id(0);
1805
int y = get_global_id(1) * PIX_PER_WI_Y;
1809
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1810
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1813
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1817
__global const uchar* src_ptr = src + src_index;
1818
__global uchar * dst_ptr = dst + dst_index;
1819
uchar4 src_pix = vload4(0, src_ptr);
1821
float srcbuf[3], dstbuf[3];
1822
// L stored as 0..255 mapping to 0..100; a, b offset by 128.
srcbuf[0] = src_pix.x*(100.f/255.f);
1823
srcbuf[1] = convert_float(src_pix.y - 128);
1824
srcbuf[2] = convert_float(src_pix.z - 128);
1826
Lab2BGR_f(&srcbuf[0], &dstbuf[0],
1830
coeffs, lThresh, fThresh);
1833
dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f);
1834
dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f);
1835
dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f);
1837
// 4-channel variant: alpha filled with MAX_NUM.
*(__global uchar4 *)dst_ptr = (uchar4)(SAT_CAST(dstbuf[0] * 255.0f),
1838
SAT_CAST(dstbuf[1] * 255.0f), SAT_CAST(dstbuf[2] * 255.0f), MAX_NUM);
1841
dst_index += dst_step;
1842
src_index += src_step;
1848
#elif defined DEPTH_5
1850
// Lab -> BGR conversion, float (DEPTH_5) path: thin wrapper around the
// shared Lab2BGR_f core operating directly on float pixels.
// NOTE(review): garbled extraction — bare numbers are original line numbers;
// braces and the rows/cols guard are missing. Code left byte-identical.
__kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offset,
1851
__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
1853
__global const float * gammaTab,
1855
__constant float * coeffs, float lThresh, float fThresh)
1857
int x = get_global_id(0);
1858
int y = get_global_id(1) * PIX_PER_WI_Y;
1862
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1863
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1866
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1870
__global const float * src = (__global const float *)(srcptr + src_index);
1871
__global float * dst = (__global float *)(dstptr + dst_index);
1872
float4 src_pix = vload4(0, src);
1874
float srcbuf[3], dstbuf[3];
1875
srcbuf[0] = src_pix.x, srcbuf[1] = src_pix.y, srcbuf[2] = src_pix.z;
1877
Lab2BGR_f(&srcbuf[0], &dstbuf[0],
1881
coeffs, lThresh, fThresh);
1883
dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];
1888
dst_index += dst_step;
1889
src_index += src_step;
1897
/////////////////////////////////// [l|s]RGB <-> Luv ///////////////////////////
1899
// Cube-root table parameters for the Luv conversion path.
// NOTE(review): bare numeric lines are original-file line numbers
// left over from a garbled extraction.
#define LAB_CBRT_TAB_SIZE 1024
1900
// Size of the 8-bit (byte-indexed) cube-root table variant.
#define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))
1902
// Table spans inputs in [0, 1.5].
__constant float LabCbrtTabScale = LAB_CBRT_TAB_SIZE/1.5f;
1906
// BGR -> Luv conversion, float (DEPTH_5) path: gamma decode, XYZ projection,
// L from the cube-root spline table, then u and v from the chromaticity.
// NOTE(review): garbled extraction — bare numbers are original line numbers;
// braces, the #ifdef SRGB guard, and the final dst stores are missing.
__kernel void BGR2Luv(__global const uchar * srcptr, int src_step, int src_offset,
1907
__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
1909
__global const float * gammaTab,
1911
// _un/_vn are presumably the white-point u'/v' terms — confirm against host.
__global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
1913
int x = get_global_id(0);
1914
int y = get_global_id(1) * PIX_PER_WI_Y;
1918
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1919
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1922
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1925
__global const float * src = (__global const float *)(srcptr + src_index);
1926
__global float * dst = (__global float *)(dstptr + dst_index);
1928
float R = src[0], G = src[1], B = src[2];
1931
R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1932
G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1933
B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1935
float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
1936
float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
1937
float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));
1939
// L = 116 * cbrt(Y) - 16, via the precomputed spline table.
float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
1940
L = fma(116.f, L, -16.f);
1942
// d = 52 / (X + 15Y + 3Z), clamped away from zero with FLT_EPSILON.
float d = 52.0f / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
1943
float u = L*fma(X, d, -_un);
1944
float v = L*fma(2.25f, Y*d, -_vn);
1951
dst_index += dst_step;
1952
src_index += src_step;
1957
#elif defined DEPTH_0
1959
// BGR -> Luv conversion, 8-bit (DEPTH_0) path: normalize to [0,1], run the
// float pipeline, then pack L, u, v into 8-bit with fixed scale/offset.
// NOTE(review): garbled extraction — bare numbers are original line numbers;
// braces, the `L = 116*L - 16` step after the spline lookup, and the
// per-row pointer advances are missing. Code left byte-identical.
__kernel void BGR2Luv(__global const uchar * src, int src_step, int src_offset,
1960
__global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
1962
__global const float * gammaTab,
1964
__global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
1966
int x = get_global_id(0);
1967
int y = get_global_id(1) * PIX_PER_WI_Y;
1971
src += mad24(y, src_step, mad24(x, scnbytes, src_offset));
1972
dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1975
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1978
float scale = 1.0f / 255.0f;
1979
float R = src[0]*scale, G = src[1]*scale, B = src[2]*scale;
1982
R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1983
G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1984
B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1986
float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
1987
float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
1988
float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));
1990
float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
1993
// 4*13 == 52, matching the float path's d factor.
float d = (4*13) / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
1994
float u = L*(X*d - _un);
1995
float v = L*fma(2.25f, Y*d, -_vn);
1997
// Pack: L in [0,100] -> [0,255]; u, v affinely mapped into 8-bit range.
dst[0] = SAT_CAST(L * 2.55f);
1998
dst[1] = SAT_CAST(fma(u, 0.72033898305084743f, 96.525423728813564f));
1999
dst[2] = SAT_CAST(fma(v, 0.9732824427480916f, 136.259541984732824f));
2012
// Luv -> BGR conversion, float (DEPTH_5) path: invert the Luv equations to
// XYZ, project to RGB, clamp, and re-apply gamma via the spline table.
// NOTE(review): garbled extraction — bare numbers are original line numbers;
// braces, the `iv` computation (inverse of the v' denominator), the Y cube
// step after the (L+16)/116 line, and the final stores are missing.
__kernel void Luv2BGR(__global const uchar * srcptr, int src_step, int src_offset,
2013
__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
2015
__global const float * gammaTab,
2017
__constant float * coeffs, float _un, float _vn)
2019
int x = get_global_id(0);
2020
int y = get_global_id(1) * PIX_PER_WI_Y;
2024
int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
2025
int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
2028
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
2031
__global const float * src = (__global const float *)(srcptr + src_index);
2032
__global float * dst = (__global float *)(dstptr + dst_index);
2034
float L = src[0], u = src[1], v = src[2], d, X, Y, Z;
2035
Y = (L + 16.f) * (1.f/116.f);
2041
X = 2.25f * u * Y * iv;
2042
Z = (12 - fma(3.0f, u, 20.0f * v)) * Y * 0.25f * iv;
2044
float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
2045
float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
2046
float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));
2048
// Clamp before the gamma table lookup to stay within table bounds.
R = clamp(R, 0.f, 1.f);
2049
G = clamp(G, 0.f, 1.f);
2050
B = clamp(B, 0.f, 1.f);
2053
R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
2054
G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
2055
B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
2065
dst_index += dst_step;
2066
src_index += src_step;
2071
#elif defined DEPTH_0
2073
__kernel void Luv2BGR(__global const uchar * src, int src_step, int src_offset,
2074
__global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
2076
__global const float * gammaTab,
2078
__constant float * coeffs, float _un, float _vn)
2080
int x = get_global_id(0);
2081
int y = get_global_id(1) * PIX_PER_WI_Y;
2085
src += mad24(y, src_step, mad24(x, scnbytes, src_offset));
2086
dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
2089
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
2093
float L = src[0]*(100.f/255.f);
2094
float u = fma(convert_float(src[1]), 1.388235294117647f, -134.f);
2095
float v = fma(convert_float(src[2]), 1.027450980392157f, - 140.f);
2096
Y = (L + 16.f) * (1.f/116.f);
2102
X = 2.25f * u * Y * iv ;
2103
Z = (12 - fma(3.0f, u, 20.0f * v)) * Y * 0.25f * iv;
2105
float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
2106
float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
2107
float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));
2109
R = clamp(R, 0.f, 1.f);
2110
G = clamp(G, 0.f, 1.f);
2111
B = clamp(B, 0.f, 1.f);
2114
R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
2115
G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
2116
B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
2119
uchar dst0 = SAT_CAST(R * 255.0f);
2120
uchar dst1 = SAT_CAST(G * 255.0f);
2121
uchar dst2 = SAT_CAST(B * 255.0f);
2124
*(__global uchar4 *)dst = (uchar4)(dst0, dst1, dst2, MAX_NUM);