1
/*M///////////////////////////////////////////////////////////////////////////////////////
3
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5
// By downloading, copying, installing or using the software you agree to this license.
6
// If you do not agree to this license, do not download, install,
7
// copy or use the software.
11
// For Open Source Computer Vision Library
13
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
14
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15
// Third party copyrights are property of their respective owners.
18
// Wu Zailong, bullet@yeah.net
20
// Redistribution and use in source and binary forms, with or without modification,
21
// are permitted provided that the following conditions are met:
23
// * Redistribution's of source code must retain the above copyright notice,
24
// this list of conditions and the following disclaimer.
26
// * Redistribution's in binary form must reproduce the above copyright notice,
27
// this list of conditions and the following disclaimer in the documentation
28
// and/or other materials provided with the distribution.
30
// * The name of the copyright holders may not be used to endorse or promote products
31
// derived from this software without specific prior written permission.
33
// This software is provided by the copyright holders and contributors as is and
34
// any express or implied warranties, including, but not limited to, the implied
35
// warranties of merchantability and fitness for a particular purpose are disclaimed.
36
// In no event shall the Intel Corporation or contributors be liable for any direct,
37
// indirect, incidental, special, exemplary, or consequential damages
38
// (including, but not limited to, procurement of substitute goods or services;
39
// loss of use, data, or profits; or business interruption) however caused
40
// and on any theory of liability, whether in contract, strict liability,
41
// or tort (including negligence or otherwise) arising in any way out of
42
// the use of this software, even if advised of the possibility of such damage.
//M*/
48
// Enable a double-precision extension: the first branch turns on the
// AMD-specific cl_amd_fp64 extension, the #elif branch turns on cl_khr_fp64
// when that macro is defined.
// NOTE(review): the opening conditional(s) and the closing #endif of this
// chain are not visible in this view.
#pragma OPENCL EXTENSION cl_amd_fp64:enable
49
#elif defined (cl_khr_fp64)
50
#pragma OPENCL EXTENSION cl_khr_fp64:enable
57
// Pixel access helpers. Two alternative sets of definitions follow.
// NOTE(review): the preprocessor directives that choose between the two sets
// are not visible in this view.
//
// First set: a pixel is a single value of type T.
//   loadpix(addr)       - read one T from global memory at addr
//   storepix(val, addr) - write val as one T to global memory at addr
//   TSIZE               - size of one pixel in bytes, sizeof(T)
//   convertScalar(a)    - passes the border scalar through unchanged
#define loadpix(addr) *(__global const T*)(addr)
58
#define storepix(val, addr) *(__global T*)(addr) = val
59
#define TSIZE ((int)sizeof(T))
60
#define convertScalar(a) (a)
62
// Second set: a pixel is three consecutive T1 elements accessed with
// vload3 / vstore3, so TSIZE is 3 * sizeof(T1) and convertScalar builds a T
// from the x, y and z components of its argument.
#define loadpix(addr) vload3(0, (__global const T1*)(addr))
63
#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr))
64
#define TSIZE ((int)sizeof(T1)*3)
65
#define convertScalar(a) (T)(a.x, a.y, a.z)
71
// Interpolation table dimensions: INTER_TAB_SIZE is 2^INTER_BITS and
// INTER_TAB_SIZE2 is its square.
// NOTE(review): the enclosing declaration and the definition of INTER_BITS
// are not visible in this view.
INTER_TAB_SIZE = 1 << INTER_BITS,
72
INTER_TAB_SIZE2 = INTER_TAB_SIZE * INTER_TAB_SIZE
79
// EXTRAPOLATE(v2, v): produce the value v for an out-of-image coordinate v2
// (an int2 used through .x / .y). One definition is selected per border mode.
// Every variant relies on names from the expansion site (scalar, srcptr,
// src_step, src_offset, src_cols, src_rows).
// NOTE(review): several lines of the multi-line macro bodies are not visible
// in this view; the descriptions below cover only the visible statements.
//
// BORDER_CONSTANT: v becomes the constant border value `scalar`.
#ifdef BORDER_CONSTANT
80
#define EXTRAPOLATE(v2, v) v = scalar;
81
// BORDER_REPLICATE: clamp v2 into [0, src_cols - 1] x [0, src_rows - 1] and
// load the pixel at the clamped position.
#elif defined BORDER_REPLICATE
82
#define EXTRAPOLATE(v2, v) \
84
v2 = max(min(v2, (int2)(src_cols - 1, src_rows - 1)), (int2)(0)); \
85
v = convertToWT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \
87
// BORDER_WRAP: shift each coordinate by a whole multiple of the image extent
// (src_cols for x, src_rows for y), then load the pixel. The guards around the
// subtraction and the bodies of the `>=` checks are not visible here.
#elif defined BORDER_WRAP
88
#define EXTRAPOLATE(v2, v) \
91
v2.x -= ((v2.x - src_cols + 1) / src_cols) * src_cols; \
92
if (v2.x >= src_cols) \
96
v2.y -= ((v2.y - src_rows + 1) / src_rows) * src_rows; \
97
if( v2.y >= src_rows ) \
99
v = convertToWT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \
101
// BORDER_REFLECT / BORDER_REFLECT_101: mirror each coordinate back into the
// image. DELTA declares `delta`, which is 0 in the BORDER_REFLECT branch and
// 1 in the other definition (the directive separating the two DELTA
// definitions is not visible here).
#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
102
// Low side:  v2 = -v2 - 1 + delta.
// High side: v2 = extent - 1 - (v2 - extent) - delta.
// The `while` conditions repeat the mirroring until the coordinate lies in
// [0, extent); presumably each closes a do-loop whose opening is not visible.
#ifdef BORDER_REFLECT
103
#define DELTA int delta = 0
105
#define DELTA int delta = 1
107
#define EXTRAPOLATE(v2, v) \
116
v2.x = -v2.x - 1 + delta; \
118
v2.x = src_cols - 1 - (v2.x - src_cols) - delta; \
120
while (v2.x >= src_cols || v2.x < 0); \
128
v2.y = -v2.y - 1 + delta; \
130
v2.y = src_rows - 1 - (v2.y - src_rows) - delta; \
132
while (v2.y >= src_rows || v2.y < 0); \
133
v = convertToWT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \
136
#error No extrapolation method
139
// True when the integer coordinate (gx, gy) lies outside the source image,
// i.e. outside [0, src_cols) x [0, src_rows). Relies on src_cols and src_rows
// being in scope at the expansion site. Each argument is parenthesised so a
// compound expression (for example `x & mask`) is compared as a whole instead
// of being split by operator precedence. Arguments are evaluated twice, so
// they must be free of side effects.
#define NEED_EXTRAPOLATION(gx, gy) ((gx) >= src_cols || (gy) >= src_rows || (gx) < 0 || (gy) < 0)
143
// Remap kernel that copies one source pixel per destination pixel, with the
// source coordinate taken from two separate float maps: map1 supplies x and
// map2 supplies y.
//
// Parameters visible here:
//   srcptr, src_step, src_offset, src_rows, src_cols - source image bytes,
//       row stride in bytes, byte offset of the first pixel, and dimensions
//   dstptr, dst_step, dst_offset, dst_rows, dst_cols - same for the destination
//   map1ptr / map2ptr (+ step, offset) - float maps holding x and y coordinates
// NOTE(review): the end of the parameter list (nVal and rowsPerWI are used
// below, so presumably declared there), the braces and any destination
// bounds check are not visible in this view.
__kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
144
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
145
__global const uchar * map1ptr, int map1_step, int map1_offset,
146
__global const uchar * map2ptr, int map2_step, int map2_offset,
149
// Each work-item handles column x and rowsPerWI consecutive rows starting at y.
int x = get_global_id(0);
150
int y = get_global_id(1) * rowsPerWI;
154
// Border value converted to the pixel type.
T scalar = convertScalar(nVal);
156
// Byte offsets of the entries for (x, y) in both maps and in the destination.
int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(float), map1_offset));
157
int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(float), map2_offset));
158
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
161
// Walk down the rows owned by this work-item, advancing every byte offset
// by its row stride.
for (int i = 0; i < rowsPerWI; ++i, ++y,
162
map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
165
__global const float * map1 = (__global const float *)(map1ptr + map1_index);
166
__global const float * map2 = (__global const float *)(map2ptr + map2_index);
167
__global T * dst = (__global T *)(dstptr + dst_index);
169
// Round the float coordinates to the nearest integer (ties to even) with
// saturation to the int range.
int gx = convert_int_sat_rte(map1[0]);
170
int gy = convert_int_sat_rte(map2[0]);
172
// Coordinates outside the source image take the border path.
// NOTE(review): most of the border branch is not visible in this view.
if (NEED_EXTRAPOLATION(gx, gy))
174
// Only the non-constant border modes need the coordinate packed as an int2.
#ifndef BORDER_CONSTANT
175
int2 gxy = (int2)(gx, gy);
183
// Byte offset of source pixel (gx, gy), then copy it to the destination.
int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));
184
storepix(loadpix((__global const T*)(srcptr + src_index)), dst);
190
// Remap kernel that copies one source pixel per destination pixel, with the
// source coordinate read from a single map of float2 entries (x, y).
//
// Parameters visible here:
//   srcptr, src_step, src_offset, src_rows, src_cols - source image bytes,
//       row stride in bytes, byte offset of the first pixel, and dimensions
//   dstptr, dst_step, dst_offset, dst_rows, dst_cols - same for the destination
//   mapptr, map_step, map_offset - float2 coordinate map
// NOTE(review): the end of the parameter list (nVal and rowsPerWI are used
// below), the braces and any destination bounds check are not visible in
// this view.
__kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
191
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
192
__global const uchar * mapptr, int map_step, int map_offset,
195
// Each work-item handles column x and rowsPerWI consecutive rows starting at y.
int x = get_global_id(0);
196
int y = get_global_id(1) * rowsPerWI;
200
// Border value converted to the pixel type.
T scalar = convertScalar(nVal);
201
// Byte offsets of the entries for (x, y) in the destination and in the map.
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
202
int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset));
205
// Walk down the rows owned by this work-item.
for (int i = 0; i < rowsPerWI; ++i, ++y,
206
map_index += map_step, dst_index += dst_step)
209
__global const float2 * map = (__global const float2 *)(mapptr + map_index);
210
__global T * dst = (__global T *)(dstptr + dst_index);
212
// Round both coordinates to the nearest integer (ties to even), saturating.
int2 gxy = convert_int2_sat_rte(map[0]);
213
int gx = gxy.x, gy = gxy.y;
215
// Coordinates outside the source image take the border path.
// NOTE(review): the body of the border branch is not visible in this view.
if (NEED_EXTRAPOLATION(gx, gy))
223
// Byte offset of source pixel (gx, gy), then copy it to the destination.
int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));
224
storepix(loadpix((__global const T *)(srcptr + src_index)), dst);
230
// Remap kernel that copies one source pixel per destination pixel, with the
// source coordinate read from a single map of short2 entries (x, y). The
// coordinates are already integers, so they are only widened to int.
//
// Parameters visible here:
//   srcptr, src_step, src_offset, src_rows, src_cols - source image bytes,
//       row stride in bytes, byte offset of the first pixel, and dimensions
//   dstptr, dst_step, dst_offset, dst_rows, dst_cols - same for the destination
//   mapptr, map_step, map_offset - short2 coordinate map
// NOTE(review): the end of the parameter list (nVal and rowsPerWI are used
// below), the braces and any destination bounds check are not visible in
// this view.
__kernel void remap_16SC2(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
231
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
232
__global const uchar * mapptr, int map_step, int map_offset,
235
// Each work-item handles column x and rowsPerWI consecutive rows starting at y.
int x = get_global_id(0);
236
int y = get_global_id(1) * rowsPerWI;
240
// Border value converted to the pixel type.
T scalar = convertScalar(nVal);
241
// Byte offsets of the entries for (x, y) in the destination and in the map.
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
242
int map_index = mad24(y, map_step, mad24(x, (int)sizeof(short2), map_offset));
245
// Walk down the rows owned by this work-item.
for (int i = 0; i < rowsPerWI; ++i, ++y,
246
map_index += map_step, dst_index += dst_step)
249
__global const short2 * map = (__global const short2 *)(mapptr + map_index);
250
__global T * dst = (__global T *)(dstptr + dst_index);
252
// Widen the short coordinates to int.
int2 gxy = convert_int2(map[0]);
253
int gx = gxy.x, gy = gxy.y;
255
// Coordinates outside the source image take the border path.
// NOTE(review): the body of the border branch is not visible in this view.
if (NEED_EXTRAPOLATION(gx, gy))
263
// Byte offset of source pixel (gx, gy), then copy it to the destination.
int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));
264
storepix(loadpix((__global const T *)(srcptr + src_index)), dst);
270
// Remap kernel that copies one source pixel per destination pixel using a
// two-part map: map1 holds short2 integer coordinates and map2 holds a ushort
// whose low bits encode a fractional position (x part in the low INTER_BITS
// bits, y part in the bits above them).
//
// Parameters visible here:
//   srcptr, src_step, src_offset, src_rows, src_cols - source image bytes,
//       row stride in bytes, byte offset of the first pixel, and dimensions
//   dstptr, dst_step, dst_offset, dst_rows, dst_cols - same for the destination
//   map1ptr (+ step, offset) - short2 integer coordinates
//   map2ptr (+ step, offset) - ushort fractional index
// NOTE(review): the end of the parameter list (nVal and rowsPerWI are used
// below), the braces and any destination bounds check are not visible in
// this view.
__kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
271
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
272
__global const uchar * map1ptr, int map1_step, int map1_offset,
273
__global const uchar * map2ptr, int map2_step, int map2_offset,
276
// Each work-item handles column x and rowsPerWI consecutive rows starting at y.
int x = get_global_id(0);
277
int y = get_global_id(1) * rowsPerWI;
281
// Border value converted to the pixel type.
T scalar = convertScalar(nVal);
282
// Byte offsets of the entries for (x, y) in the destination and in both maps.
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
283
int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(short2), map1_offset));
284
int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(ushort), map2_offset));
287
// Walk down the rows owned by this work-item.
for (int i = 0; i < rowsPerWI; ++i, ++y,
288
map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
291
__global const short2 * map1 = (__global const short2 *)(map1ptr + map1_index);
292
__global const ushort * map2 = (__global const ushort *)(map2ptr + map2_index);
293
__global T * dst = (__global T *)(dstptr + dst_index);
295
// Keep only the bits that index the INTER_TAB_SIZE x INTER_TAB_SIZE table.
int map2Value = convert_int(map2[0]) & (INTER_TAB_SIZE2 - 1);
296
// dx / dy are 1 when the corresponding fractional part is below half of
// INTER_TAB_SIZE and 0 otherwise; they are added to the integer coordinate.
// NOTE(review): adding 1 when the fraction is *below* one half looks inverted
// relative to ordinary round-to-nearest; confirm against the reference
// implementation before changing.
int dx = (map2Value & (INTER_TAB_SIZE - 1)) < (INTER_TAB_SIZE >> 1) ? 1 : 0;
297
int dy = (map2Value >> INTER_BITS) < (INTER_TAB_SIZE >> 1) ? 1 : 0;
298
int2 gxy = convert_int2(map1[0]) + (int2)(dx, dy);
299
int gx = gxy.x, gy = gxy.y;
301
// Coordinates outside the source image take the border path.
// NOTE(review): the body of the border branch is not visible in this view.
if (NEED_EXTRAPOLATION(gx, gy))
309
// Byte offset of source pixel (gx, gy), then copy it to the destination.
int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));
310
storepix(loadpix((__global const T *)(srcptr + src_index)), dst);
316
// Everything from here on is compiled when INTER_LINEAR is defined.
#elif defined INTER_LINEAR
318
// Linear interpolation weights: 32 pairs (1 - k/32, k/32) for k = 0..31,
// stored consecutively, so pair k starts at coeffs[2 * k]. Each pair sums to 1.
__constant float coeffs[64] =
319
{ 1.000000f, 0.000000f, 0.968750f, 0.031250f, 0.937500f, 0.062500f, 0.906250f, 0.093750f, 0.875000f, 0.125000f, 0.843750f, 0.156250f,
320
0.812500f, 0.187500f, 0.781250f, 0.218750f, 0.750000f, 0.250000f, 0.718750f, 0.281250f, 0.687500f, 0.312500f, 0.656250f, 0.343750f,
321
0.625000f, 0.375000f, 0.593750f, 0.406250f, 0.562500f, 0.437500f, 0.531250f, 0.468750f, 0.500000f, 0.500000f, 0.468750f, 0.531250f,
322
0.437500f, 0.562500f, 0.406250f, 0.593750f, 0.375000f, 0.625000f, 0.343750f, 0.656250f, 0.312500f, 0.687500f, 0.281250f, 0.718750f,
323
0.250000f, 0.750000f, 0.218750f, 0.781250f, 0.187500f, 0.812500f, 0.156250f, 0.843750f, 0.125000f, 0.875000f, 0.093750f, 0.906250f,
324
0.062500f, 0.937500f, 0.031250f, 0.968750f };
326
// Bilinear remap kernel using a two-part map: map1 holds the short2 integer
// coordinate of the top-left neighbour and map2 holds a ushort encoding the
// fractional position (x in the low INTER_BITS bits, y in the bits above).
// The destination pixel is a weighted blend of the four neighbouring source
// pixels, computed in the wider type WT.
//
// Parameters visible here:
//   srcptr, src_step, src_offset, src_rows, src_cols - source image bytes,
//       row stride in bytes, byte offset of the first pixel, and dimensions
//   dstptr, dst_step, dst_offset, dst_rows, dst_cols - same for the destination
//   map1ptr (+ step, offset) - short2 integer coordinates
//   map2ptr (+ step, offset) - ushort fractional index
// NOTE(review): the end of the parameter list (nVal and rowsPerWI are used
// below), the braces and any destination bounds check are not visible in
// this view.
__kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
327
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
328
__global const uchar * map1ptr, int map1_step, int map1_offset,
329
__global const uchar * map2ptr, int map2_step, int map2_offset,
332
// Each work-item handles column x and rowsPerWI consecutive rows starting at y.
int x = get_global_id(0);
333
int y = get_global_id(1) * rowsPerWI;
337
// Border value converted to the working type used for the blend.
WT scalar = convertToWT(convertScalar(nVal));
338
// Byte offsets of the entries for (x, y) in the destination and in both maps.
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
339
int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(short2), map1_offset));
340
int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(ushort), map2_offset));
343
// Walk down the rows owned by this work-item.
for (int i = 0; i < rowsPerWI; ++i, ++y,
344
map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
347
__global const short2 * map1 = (__global const short2 *)(map1ptr + map1_index);
348
__global const ushort * map2 = (__global const ushort *)(map2ptr + map2_index);
349
__global T * dst = (__global T *)(dstptr + dst_index);
351
// The four neighbours: A = (x, y), B = (x + 1, y), C = (x, y + 1),
// D = (x + 1, y + 1).
int2 map_dataA = convert_int2(map1[0]);
352
int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
353
int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
354
int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
356
// Fractional offsets u.x, u.y in [0, 1): table indices divided by
// INTER_TAB_SIZE.
ushort map2Value = (ushort)(map2[0] & (INTER_TAB_SIZE2 - 1));
357
WT2 u = (WT2)(map2Value & (INTER_TAB_SIZE - 1), map2Value >> INTER_BITS) / (WT2)(INTER_TAB_SIZE);
359
// All four samples start as the border value.
WT a = scalar, b = scalar, c = scalar, d = scalar;
361
// For each neighbour: read the pixel directly when it is inside the image;
// the EXTRAPOLATE line supplies the border value otherwise.
// NOTE(review): the `else` joining each pair is not visible in this view.
if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
362
a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
364
EXTRAPOLATE(map_dataA, a);
366
if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
367
b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
369
EXTRAPOLATE(map_dataB, b);
371
if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
372
c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
374
EXTRAPOLATE(map_dataC, c);
376
if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
377
d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
379
EXTRAPOLATE(map_dataD, d);
381
// Bilinear blend of the samples weighted by the fractional offsets.
// NOTE(review): the expression ends with a trailing `+`; the final term
// (presumably the one for d) is not visible in this view.
WT dst_data = a * (1 - u.x) * (1 - u.y) +
382
b * (u.x) * (1 - u.y) +
383
c * (1 - u.x) * (u.y) +
385
storepix(convertToT(dst_data), dst);
390
// Bilinear remap kernel driven by two separate float maps: map1 supplies the
// source x coordinate and map2 the source y coordinate. Two code paths exist:
// a table-driven path compiled when BORDER_CONSTANT is defined, and a generic
// four-neighbour path for the remaining code.
//
// Parameters visible here:
//   srcptr, src_step, src_offset, src_rows, src_cols - source image bytes,
//       row stride in bytes, byte offset of the first pixel, and dimensions
//   dstptr, dst_step, dst_offset, dst_rows, dst_cols - same for the destination
//   map1ptr / map2ptr (+ step, offset) - float maps holding x and y coordinates
// NOTE(review): the end of the parameter list (nVal and rowsPerWI are used
// below), the braces, the #else / #endif directives and any destination
// bounds check are not visible in this view.
__kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
391
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
392
__global const uchar * map1ptr, int map1_step, int map1_offset,
393
__global const uchar * map2ptr, int map2_step, int map2_offset,
396
// Each work-item handles column x and rowsPerWI consecutive rows starting at y.
int x = get_global_id(0);
397
int y = get_global_id(1) * rowsPerWI;
401
// Border value converted to the working type used for the blend.
WT scalar = convertToWT(convertScalar(nVal));
402
// Byte offsets of the entries for (x, y) in the destination and in both maps.
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
403
int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(float), map1_offset));
404
int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(float), map2_offset));
407
// Walk down the rows owned by this work-item.
for (int i = 0; i < rowsPerWI; ++i, ++y,
408
map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
411
__global const float * map1 = (__global const float *)(map1ptr + map1_index);
412
__global const float * map2 = (__global const float *)(map2ptr + map2_index);
413
__global T * dst = (__global T *)(dstptr + dst_index);
415
// ---- Constant-border path: weights come from the coeffs table. ----
#if defined BORDER_CONSTANT
416
float xf = map1[0], yf = map2[0];
417
// Integer part of the coordinate: scale by INTER_TAB_SIZE, add 0.5, truncate
// toward zero, then drop the INTER_BITS fractional bits.
int sx = convert_int_sat_rtz(mad(xf, INTER_TAB_SIZE, 0.5f)) >> INTER_BITS;
418
int sy = convert_int_sat_rtz(mad(yf, INTER_TAB_SIZE, 0.5f)) >> INTER_BITS;
420
// Fractional table index (low INTER_BITS bits of the rounded fixed-point
// value), doubled to address the matching weight pair in coeffs.
__constant float * coeffs_x = coeffs + ((convert_int_rte(xf * INTER_TAB_SIZE) & (INTER_TAB_SIZE - 1)) << 1);
421
__constant float * coeffs_y = coeffs + ((convert_int_rte(yf * INTER_TAB_SIZE) & (INTER_TAB_SIZE - 1)) << 1);
423
// sum accumulates the vertical blend; xsum holds one row's horizontal blend.
// NOTE(review): no per-row reset of xsum is visible in this view, although the
// fma-based branches below accumulate into it.
WT sum = (WT)(0), xsum;
424
int src_index = mad24(sy, src_step, mad24(sx, TSIZE, src_offset));
427
// Two source rows contribute: sy and sy + 1.
for (int yp = 0; yp < 2; ++yp, src_index += src_step)
429
// Row inside the image: blend its two horizontal neighbours.
if (sy + yp >= 0 && sy + yp < src_rows)
432
// Both columns are inside the image, so no per-pixel column check is needed.
if (sx >= 0 && sx + 2 < src_cols)
434
// In this configuration two uchar values are loaded at once with vload2 and
// weighted with a dot product; the loop after it is the general form.
#if depth == 0 && cn == 1
435
uchar2 value = vload2(0, srcptr + src_index);
436
xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));
439
for (int xp = 0; xp < 2; ++xp)
440
xsum = fma(convertToWT(loadpix(srcptr + mad24(xp, TSIZE, src_index))), coeffs_x[xp], xsum);
446
// Near the left / right edge: check each column and substitute the border
// value for columns outside the image.
for (int xp = 0; xp < 2; ++xp)
447
xsum = fma(sx + xp >= 0 && sx + xp < src_cols ?
448
convertToWT(loadpix(srcptr + mad24(xp, TSIZE, src_index))) : scalar, coeffs_x[xp], xsum);
450
// Fold this row's horizontal blend into the vertical blend.
sum = fma(xsum, coeffs_y[yp], sum);
453
// Row outside the image: the whole row contributes the border value.
sum = fma(scalar, coeffs_y[yp], sum);
456
storepix(convertToT(sum), dst);
458
// ---- Generic path: explicit four-neighbour blend with EXTRAPOLATE. ----
float2 map_data = (float2)(map1[0], map2[0]);
460
// Top-left neighbour A is the coordinate rounded toward negative infinity;
// B, C and D are its right, lower and lower-right neighbours.
int2 map_dataA = convert_int2_sat_rtn(map_data);
461
int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
462
int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
463
int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
465
// Fractional offsets, quantised to multiples of 1 / INTER_TAB_SIZE.
float2 _u = map_data - convert_float2(map_dataA);
466
WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
// NOTE(review): scalar is also declared near the top of this kernel with the
// same initialiser; this declaration repeats it.
467
WT scalar = convertToWT(convertScalar(nVal));
468
// All four samples start as the border value.
WT a = scalar, b = scalar, c = scalar, d = scalar;
470
// For each neighbour: read the pixel directly when it is inside the image;
// the EXTRAPOLATE line supplies the border value otherwise.
// NOTE(review): the `else` joining each pair is not visible in this view.
if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
471
a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
473
EXTRAPOLATE(map_dataA, a);
475
if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
476
b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
478
EXTRAPOLATE(map_dataB, b);
480
if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
481
c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
483
EXTRAPOLATE(map_dataC, c);
485
if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
486
d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
488
EXTRAPOLATE(map_dataD, d);
490
// Bilinear blend of the samples weighted by the fractional offsets.
// NOTE(review): the expression ends with a trailing `+`; the final term
// (presumably the one for d) is not visible in this view.
WT dst_data = a * (1 - u.x) * (1 - u.y) +
491
b * (u.x) * (1 - u.y) +
492
c * (1 - u.x) * (u.y) +
494
storepix(convertToT(dst_data), dst);
500
// Bilinear remap kernel whose source coordinate comes from a single map of
// float2 entries (x, y). The destination pixel is a weighted blend of the four
// source pixels around that coordinate, computed in the wider type WT.
//
// Parameters visible here:
//   srcptr, src_step, src_offset, src_rows, src_cols - source image bytes,
//       row stride in bytes, byte offset of the first pixel, and dimensions
//   dstptr, dst_step, dst_offset, dst_rows, dst_cols - same for the destination
//   mapptr, map_step, map_offset - float2 coordinate map
// NOTE(review): the end of the parameter list (nVal and rowsPerWI are used
// below), the braces and any destination bounds check are not visible in
// this view.
__kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
501
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
502
__global const uchar * mapptr, int map_step, int map_offset,
505
// Each work-item handles column x and rowsPerWI consecutive rows starting at y.
int x = get_global_id(0);
506
int y = get_global_id(1) * rowsPerWI;
510
// Border value converted to the working type used for the blend.
WT scalar = convertToWT(convertScalar(nVal));
511
// Byte offsets of the entries for (x, y) in the destination and in the map.
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
512
int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset));
515
// Walk down the rows owned by this work-item.
for (int i = 0; i < rowsPerWI; ++i, ++y,
516
map_index += map_step, dst_index += dst_step)
519
__global const float2 * map = (__global const float2 *)(mapptr + map_index);
520
__global T * dst = (__global T *)(dstptr + dst_index);
522
// Top-left neighbour A is the coordinate rounded toward negative infinity;
// B, C and D are its right, lower and lower-right neighbours.
float2 map_data = map[0];
523
int2 map_dataA = convert_int2_sat_rtn(map_data);
524
int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
525
int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
526
int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
528
// Fractional offsets, quantised to multiples of 1 / INTER_TAB_SIZE.
float2 _u = map_data - convert_float2(map_dataA);
529
WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
530
// All four samples start as the border value.
WT a = scalar, b = scalar, c = scalar, d = scalar;
532
// For each neighbour: read the pixel directly when it is inside the image;
// the EXTRAPOLATE line supplies the border value otherwise.
// NOTE(review): the `else` joining each pair is not visible in this view.
if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
533
a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
535
EXTRAPOLATE(map_dataA, a);
537
if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
538
b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
540
EXTRAPOLATE(map_dataB, b);
542
if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
543
c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
545
EXTRAPOLATE(map_dataC, c);
547
if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
548
d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
550
EXTRAPOLATE(map_dataD, d);
552
// Bilinear blend of the samples weighted by the fractional offsets.
// NOTE(review): the expression ends with a trailing `+`; the final term
// (presumably the one for d) is not visible in this view.
WT dst_data = a * (1 - u.x) * (1 - u.y) +
553
b * (u.x) * (1 - u.y) +
554
c * (1 - u.x) * (u.y) +
556
storepix(convertToT(dst_data), dst);