2
// For Open Source Computer Vision Library
4
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
5
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
6
// Third party copyrights are property of their respective owners.
9
// Rock Li, Rock.li@amd.com
11
// Redistribution and use in source and binary forms, with or without modification,
12
// are permitted provided that the following conditions are met:
14
// * Redistribution's of source code must retain the above copyright notice,
15
// this list of conditions and the following disclaimer.
17
// * Redistribution's in binary form must reproduce the above copyright notice,
18
// this list of conditions and the following disclaimer in the documentation
19
// and/or other materials provided with the distribution.
21
// * The name of the copyright holders may not be used to endorse or promote products
22
// derived from this software without specific prior written permission.
24
// This software is provided by the copyright holders and contributors as is and
25
// any express or implied warranties, including, but not limited to, the implied
26
// warranties of merchantability and fitness for a particular purpose are disclaimed.
27
// In no event shall the Intel Corporation or contributors be liable for any direct,
28
// indirect, incidental, special, exemplary, or consequential damages
29
// (including, but not limited to, procurement of substitute goods or services;
30
// loss of use, data, or profits; or business interruption) however caused
31
// and on any theory of liability, whether in contract, strict liability,
32
// or tort (including negligence or otherwise) arising in any way out of
33
// the use of this software, even if advised of the possibility of such damage.
36
// Pixel access helpers.
// Two alternative definitions of loadpix/storepix appear below. The
// preprocessor conditionals that choose between them are not visible in this
// view - NOTE(review): confirm which condition selects each pair.
//
// Form 1: reinterpret addr as a pointer to uchar_t and dereference it.
// uchar_t is not defined in this view (presumably supplied via build
// options - TODO confirm).
#define loadpix(addr) *(__global const uchar_t *)(addr)
37
// NOTE(review): the expansion is an unparenthesized assignment expression, so
// it is only safe when used as a complete statement.
#define storepix(val, addr) *(__global uchar_t *)(addr) = val
40
// Form 2: read/write three uchar components starting at addr using
// vload3/vstore3 with element offset 0.
#define loadpix(addr) vload3(0, (__global const uchar *)(addr))
41
#define storepix(val, addr) vstore3(val, 0, (__global uchar *)(addr))
48
// SUM(a): adds the components of vector a (2-, 3- and 4-component variants).
// The conditionals selecting a variant are not visible in this view.
// NOTE(review): the expansions are not parenthesized and mention `a` once per
// component, so SUM is only safe as a whole expression with a side-effect-free
// argument (e.g. `2 * SUM(v)` would expand to `2 * v.x + v.y`).
#define SUM(a) a.x + a.y
50
#define SUM(a) a.x + a.y + a.z
52
#define SUM(a) a.x + a.y + a.z + a.w
54
// Compile-time failure for unsupported channel counts; the guarding condition
// is not visible here, the message states the limit.
#error "cn should be <= 4"
57
//Read pixels as integers
58
// Intel Device - Read Pixels as floats
59
// Bilateral filter kernel: each work-item handles the pixel at
// (get_global_id(0), get_global_id(1)) of the destination.
//
// Parameters (as used by the visible code):
//   src, src_step, src_offset - source buffer as uchar; src_step multiplies the
//                               row index and src_offset is added to the index,
//                               all in uchar (byte) units.
//   dst, dst_step, dst_offset - destination buffer, addressed the same way.
//   dst_rows, dst_cols        - bounds used to reject out-of-range work-items.
//   space_weight              - per-neighbour factor multiplied into the weight.
//   space_ofs                 - per-neighbour offset added to the centre index.
//
// radius, maxk, gauss_color_coeff, TSIZE and the float_t/int_t/uchar_t types
// and convert_* helpers are not defined in this view (presumably compile-time
// build options - TODO confirm).
//
// NOTE(review): this view is missing lines from the original file: braces, the
// conditionals that select between the float and int code paths, and the
// declaration/accumulation of wsum are not visible.
__kernel void bilateral(__global const uchar * src, int src_step, int src_offset,
60
__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
61
__constant float * space_weight, __constant int * space_ofs)
63
int x = get_global_id(0);
64
int y = get_global_id(1);
66
// Skip work-items that fall outside the destination image.
if (y < dst_rows && x < dst_cols)
68
// Centre pixel index: (y + radius) * src_step + (x + radius) * TSIZE + src_offset.
// The +radius shift presumably accounts for a border of `radius` pixels
// around the source - confirm against the caller.
int src_index = mad24(y + radius, src_step, mad24(x + radius, TSIZE, src_offset));
69
// Output pixel index: y * dst_step + x * TSIZE + dst_offset.
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
71
// Accumulator for the weighted sum of neighbour values.
float_t sum = (float_t)(0.0f);
74
// Centre pixel value; two alternatives (float and int) are visible, the
// selecting conditional is not.
float_t val0 = convert_float_t(loadpix(src + src_index));
76
int_t val0 = convert_int_t(loadpix(src + src_index));
79
// Visit each of the maxk neighbours listed in space_ofs.
for (int k = 0; k < maxk; k++ )
82
// Float path: neighbour value and summed absolute difference to the centre.
float_t val = convert_float_t(loadpix(src + src_index + space_ofs[k]));
83
float diff = SUM(fabs(val - val0));
85
// Int path: same computation with integer arithmetic.
int_t val = convert_int_t(loadpix(src + src_index + space_ofs[k]));
86
int diff = SUM(abs(val - val0));
88
// Weight = spatial factor * exp(diff^2 * gauss_color_coeff).
float w = space_weight[k] * native_exp((float)(diff * diff * gauss_color_coeff));
89
sum += convert_float_t(val) * (float_t)(w);
92
// Normalise by wsum (its accumulation is not visible in this view) and
// write the result as uchar_t.
storepix(convert_uchar_t(sum / (float_t)(wsum)), dst + dst_index);
98
// For single-channel, x4-sized images.
// Each work-item handles four horizontally adjacent one-byte pixels: indices
// advance by x * 4 and pixels are read/written with vload4/vstore4 on uchar
// pointers.
//
// Parameters are used the same way as the index arithmetic below shows:
// src/dst are uchar buffers, *_step multiplies the row, *_offset is added to
// the index, dst_rows/dst_cols bound the work-items, space_weight[k] scales
// the k-th weight and space_ofs[k] is the k-th neighbour offset.
// radius, maxk and gauss_color_coeff are not defined in this view (presumably
// compile-time build options - TODO confirm).
//
// NOTE(review): this view is missing lines from the original file: braces and
// the statements that accumulate into sum/wsum inside the loop are not visible.
99
__kernel void bilateral_float4(__global const uchar * src, int src_step, int src_offset,
100
__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
101
__constant float * space_weight, __constant int * space_ofs)
103
int x = get_global_id(0);
104
int y = get_global_id(1);
105
// dst_cols / 4 is an integer division: when dst_cols is not a multiple of 4
// the trailing columns are not covered by any work-item that passes this test.
// Presumably the host only uses this kernel when dst_cols % 4 == 0 - confirm.
if (y < dst_rows && x < dst_cols / 4 )
107
// Index of the first of the four centre pixels, shifted by radius in both
// row and column.
int src_index = ((y + radius) * src_step) + x * 4 + (radius + src_offset);
108
// Index of the first of the four output pixels.
int dst_index = (y * dst_step) + x * 4 + dst_offset ;
109
// Per-lane weighted sum and sum of weights.
float4 sum = 0.f, wsum = 0.f;
110
// Four centre pixel values, widened to float.
float4 val0 = convert_float4(vload4(0, src + src_index));
112
for (int k = 0; k < maxk; k++ )
114
// Four neighbour values at the k-th offset.
float4 val = convert_float4(vload4(0, src + src_index + space_ofs[k]));
115
// Per-lane weight = spatial factor * exp((val - val0)^2 * gauss_color_coeff).
float4 w = space_weight[k] * native_exp((val - val0) * (val - val0) * gauss_color_coeff);
119
// Normalise, then add 0.5 so the round-toward-zero conversion below rounds
// to nearest for non-negative values.
sum = sum / wsum + .5f;
120
vstore4(convert_uchar4_rtz(sum), 0, dst + dst_index);
b'\\ No newline at end of file'