1
/*M///////////////////////////////////////////////////////////////////////////////////////
3
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5
// By downloading, copying, installing or using the software you agree to this license.
6
// If you do not agree to this license, do not download, install,
7
// copy or use the software.
11
// For Open Source Computer Vision Library
13
// Copyright (C) 2014, Intel Corporation, all rights reserved.
14
// Third party copyrights are property of their respective owners.
16
// Redistribution and use in source and binary forms, with or without modification,
17
// are permitted provided that the following conditions are met:
19
// * Redistribution's of source code must retain the above copyright notice,
20
// this list of conditions and the following disclaimer.
22
// * Redistribution's in binary form must reproduce the above copyright notice,
23
// this list of conditions and the following disclaimer in the documentation
24
// and/or other materials provided with the distribution.
26
// * The name of the copyright holders may not be used to endorse or promote products
27
// derived from this software without specific prior written permission.
29
// This software is provided by the copyright holders and contributors "as is" and
30
// any express or implied warranties, including, but not limited to, the implied
31
// warranties of merchantability and fitness for a particular purpose are disclaimed.
32
// In no event shall the Intel Corporation or contributors be liable for any direct,
33
// indirect, incidental, special, exemplary, or consequential damages
34
// (including, but not limited to, procurement of substitute goods or services;
35
// loss of use, data, or profits; or business interruption) however caused
36
// and on any theory of liability, whether in contract, strict liability,
37
// or tort (including negligence or otherwise) arising in any way out of
38
// the use of this software, even if advised of the possibility of such damage.
42
///////////////////////////////////////////////////////////////////////////////////////////////////
43
/////////////////////////////////Macro for border type////////////////////////////////////////////
44
/////////////////////////////////////////////////////////////////////////////////////////////////
46
// EXTRAPOLATE(x, maxV): rewrites the coordinate x in place according to the
// border mode selected at compile time. maxV is the image extent along that
// axis (the kernel passes height for rows and width for columns).
#ifdef BORDER_CONSTANT
47
// CCCCCC|abcdefgh|CCCCCCC
// Expands to nothing: x is left untouched, and out-of-range reads are replaced
// by a constant value in the BORDER_CONSTANT variant of ELEM.
48
#define EXTRAPOLATE(x, maxV)
49
#elif defined BORDER_REPLICATE
50
// aaaaaa|abcdefgh|hhhhhhh
// Clamps x to [0, maxV-1], i.e. repeats the nearest edge pixel.
51
#define EXTRAPOLATE(x, maxV) \
53
(x) = clamp((x), 0, (maxV)-1); \
55
#elif defined BORDER_WRAP
56
// cdefgh|abcdefgh|abcdefg
// Wraps x around modulo maxV; adding maxV first keeps the dividend
// non-negative only as long as x >= -maxV.
57
#define EXTRAPOLATE(x, maxV) \
59
(x) = ( (x) + (maxV) ) % (maxV); \
61
#elif defined BORDER_REFLECT
62
// fedcba|abcdefgh|hgfedcb
// Mirrors x with the edge pixel duplicated: x < 0 maps to -x-1,
// x >= maxV maps to 2*maxV-1-x (a single reflection).
63
#define EXTRAPOLATE(x, maxV) \
65
(x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \
67
#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101
68
// gfedcb|abcdefgh|gfedcba
// Mirrors x without duplicating the edge pixel: x < 0 maps to -x,
// x >= maxV maps to 2*(maxV-1)-x (a single reflection).
69
#define EXTRAPOLATE(x, maxV) \
71
(x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \
74
#error No extrapolation method
78
// Pixel access helpers, first variant: a pixel is one srcT / dstT value that is
// read or written through a typed __global pointer; SRCSIZE / DSTSIZE are the
// pixel sizes in bytes.
#define loadpix(addr) *(__global const srcT *)(addr)
79
#define storepix(val, addr) *(__global dstT *)(addr) = val
80
#define SRCSIZE (int)sizeof(srcT)
81
#define DSTSIZE (int)sizeof(dstT)
83
// Pixel access helpers, second variant: a pixel is three srcT1 / dstT1
// components moved with vload3 / vstore3, so the pixel size is 3 * sizeof(component).
// NOTE(review): the conditional that selects between the two variants is not
// visible in this excerpt - presumably chosen by channel count; confirm.
#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
84
#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
85
#define SRCSIZE (int)sizeof(srcT1)*3
86
#define DSTSIZE (int)sizeof(dstT1)*3
89
// SRC(_x,_y): loads the source pixel at column _x, row _y, i.e. from byte
// address Src + _y * src_step + SRCSIZE * _x, and converts it with convertToWT.
// Relies on Src and src_step being in scope at the expansion site.
#define SRC(_x,_y) convertToWT(loadpix(Src + mad24(_y, src_step, SRCSIZE * _x)))
91
#ifdef BORDER_CONSTANT
92
// CCCCCC|abcdefgh|CCCCCCC
// ELEM yields const_v when (_x,_y) lies outside [0,r_edge) x [0,t_edge),
// otherwise the source pixel. The bounds tests are combined with bitwise |,
// so all four comparisons are always evaluated (no short-circuit).
93
#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y))
95
// Variant without a bounds check: reads the source pixel directly; the edge
// and const_v arguments are ignored.
// NOTE(review): the directive selecting this variant is not visible in this
// excerpt - presumably the non-BORDER_CONSTANT branch, where EXTRAPOLATE has
// already mapped the coordinates into range; confirm.
#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))
100
// Horizontal (X) and vertical (Y) filter coefficients, kept in __constant memory.
101
// KERNEL_MATRIX_X / KERNEL_MATRIX_Y should be defined by the host at compile time
// to avoid overhead; the kernel indexes them with i = 0 .. 2*RADIUSX (resp. 2*RADIUSY).
103
__constant WT1 mat_kernelX[] = { KERNEL_MATRIX_X };
104
__constant WT1 mat_kernelY[] = { KERNEL_MATRIX_Y };
106
__kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width,
107
__global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta)
109
// RADIUSX, RADIUSY are the filter radii (each 1-D filter has 2*RADIUS+1 taps)
110
// BLK_X, BLK_Y are local workgroup sizes
111
// all these should be defined on host during compile time
112
// first lsmem array for source pixels used in first pass,
113
// second lsmemDy for storing first pass results
114
__local WT lsmem[BLK_Y + 2 * RADIUSY][BLK_X + 2 * RADIUSX];
115
__local WT lsmemDy[BLK_Y][BLK_X + 2 * RADIUSX];
117
// get local and global ids - used as image and local memory array indexes
118
int lix = get_local_id(0);
119
int liy = get_local_id(1);
121
int x = get_global_id(0);
123
// calculate pixel position in source image taking image offset into account
124
int srcX = x + srcOffsetX - RADIUSX;
126
// extrapolate coordinates, if needed
127
// and read my own source pixel into local memory
128
// with account for extra border pixels, which will be read by starting workitems
132
int yb = clocY + srcOffsetY - RADIUSY;
133
EXTRAPOLATE(yb, (height));
140
EXTRAPOLATE(xb,(width));
141
lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 );
146
while(clocX < BLK_X+(RADIUSX*2));
150
while (clocY < BLK_Y+(RADIUSY*2));
151
barrier(CLK_LOCAL_MEM_FENCE);
153
for (int y = 0; y < dst_rows; y+=BLK_Y)
155
// do vertical filter pass
156
// and store intermediate results to second local memory array
162
for (i=0; i<=2*RADIUSY; i++)
163
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
164
sum = mad24(lsmem[liy + i][clocX], mat_kernelY[i], sum);
166
sum = mad(lsmem[liy + i][clocX], mat_kernelY[i], sum);
168
lsmemDy[liy][clocX] = sum;
171
while(clocX < BLK_X+(RADIUSX*2));
172
barrier(CLK_LOCAL_MEM_FENCE);
174
// process this pixel only if it lies inside the destination image; work-items that exist only because of global size rounding skip it
176
if ((x < dst_cols) && (y + liy < dst_rows))
178
// do second horizontal filter pass
179
// and calculate final result
181
for (i=0; i<=2*RADIUSX; i++)
182
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
183
sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
185
sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
188
#ifdef INTEGER_ARITHMETIC
190
sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);
192
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
195
// store result into destination image
196
storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset)));
199
for (int i = liy * BLK_X + lix; i < (RADIUSY*2) * (BLK_X+(RADIUSX*2)); i += BLK_X * BLK_Y)
201
int clocX = i % (BLK_X+(RADIUSX*2));
202
int clocY = i / (BLK_X+(RADIUSX*2));
203
lsmem[clocY][clocX] = lsmem[clocY + BLK_Y][clocX];
205
barrier(CLK_LOCAL_MEM_FENCE);
207
int yb = y + liy + BLK_Y + srcOffsetY + RADIUSY;
208
EXTRAPOLATE(yb, (height));
211
int cSrcX = x + srcOffsetX - RADIUSX;
215
EXTRAPOLATE(xb,(width));
216
lsmem[liy + 2*RADIUSY][clocX] = ELEM(xb, yb, (width), (height), 0 );
221
while(clocX < BLK_X+(RADIUSX*2));
222
barrier(CLK_LOCAL_MEM_FENCE);