/*
    This file is part of darktable,
    copyright (c) 2011 johannes hanika.

    darktable is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    darktable is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with darktable. If not, see <http://www.gnu.org/licenses/>.
*/
19
// Image sampler: unnormalized (pixel) coordinates, out-of-range coordinates
// clamped to the edge, nearest-neighbour filtering.
const sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
20
// Image sampler: unnormalized (pixel) coordinates, out-of-range coordinates
// clamped to the edge, linear filtering.
// NOTE(review): not referenced anywhere in the visible part of this file.
const sampler_t samplerf = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
23
// Weight function: maps a value f to 1 / (1 + |f| * spread).
//
// The result is 1.0 for f == 0 and decreases monotonically towards 0 as |f|
// grows; it depends only on the magnitude of f, not on its sign.
float gh(const float f)
{
  // make spread bigger: less smoothing
  const float spread = 100.f;
  return 1.0f/(1.0f + fabs(f)*spread);
}
31
// nlmeans: per-pixel kernel that reads from `in` and writes one float4 to
// `out` at the work-item's global (x, y) position.
//
//   in, out : source and destination images.
//   P       : half-size of the inner window; pi/pj run over [-P, P].
//   K       : half-size of the outer window of shift vectors; ki/kj run over [-K, K].
//   nL, nC  : scale factors packed into norm2 as (nL, nC, nC, 1) and multiplied
//             onto pixel differences.
//             (presumably luma / chroma normalization -- TODO confirm with caller)
//
// NOTE(review): in this view the return-type/qualifier line, all braces and
// several original lines are missing, and two alternative implementations
// (a local-memory variant and a brute-force variant) follow each other
// directly. As shown, `tmp` is declared more than once, `acc` is declared
// const but assigned to, and `dist` is accumulated in the second variant
// without a visible declaration. The variants were probably separated by
// preprocessor guards that are not visible here -- confirm against the
// original file before changing any code.
nlmeans (read_only image2d_t in, write_only image2d_t out, const int P, const int K, const float nL, const float nC)
33
// Global work-item coordinates, used as pixel coordinates.
const int x = get_global_id(0);
34
const int y = get_global_id(1);
35
// Per-channel scale applied to pixel differences before they are summed.
const float4 norm2 = (float4)(nL, nC, nC, 1.0f);
38
// ---- variant 1: accumulate into a tile of local memory ----
// this is 20s (compared to 29s brute force below) but still unusable:
39
// load a block of shared memory, initialize to zero
40
// Fixed-size tile of 32*32 entries, indexed below with a row stride of
// get_local_size(0).
// NOTE(review): assumes the work-group has at most 1024 items -- TODO confirm
// on the host side.
local float4 block[32*32];//get_local_size(0)*get_local_size(1)];
41
// Each work-item clears its own tile entry.
block[get_local_id(0) + get_local_id(1) * get_local_size(0)] = (float4)0.0f;
42
// Make the zeroed tile visible to the whole work-group before accumulating.
barrier(CLK_LOCAL_MEM_FENCE);
44
// coalesced mem accesses:
45
// Pixel at this work-item's own position.
const float4 p1 = read_imagef(in, sampleri, (int2)(x, y));
47
// for each shift vector
48
for(int kj=-K;kj<=K;kj++)
50
for(int ki=-K;ki<=K;ki++)
52
// Pixel displaced by the current shift vector (ki, kj).
const float4 p2 = read_imagef(in, sampleri, (int2)(x+ki, y+kj));
53
// Scaled difference between the own pixel and the shifted pixel.
const float4 tmp = (p1 - p2)*norm2;
54
// Sum of the first three components of the scaled difference.
// NOTE(review): `dist` is not used by any line visible in this variant;
// the line that consumed it appears to be missing from this view.
const float dist = tmp.x + tmp.y + tmp.z;
55
// Inner window: offsets (pi, pj) in [-P, P] x [-P, P].
for(int pj=-P;pj<=P;pj++)
57
for(int pi=-P;pi<=P;pi++)
59
// Redeclares p2 for the inner scope: pixel at shift + window offset.
float4 p2 = read_imagef(in, sampleri, (int2)(x+pi+ki, y+pj+kj));
61
// Tile cell displaced by (pi, pj) from this work-item's own cell.
const int i = get_local_id(0) + pi, j = get_local_id(1) + pj;
62
// Only accumulate when the displaced cell lies inside the work-group's tile.
if(i >= 0 && i < get_local_size(0) && j >= 0 && j < get_local_size(1))
64
// NOTE(review): different work-items can target the same tile cell here and
// the += updates are neither atomic nor separated by a barrier -- looks like
// a race on local memory; confirm.
// TODO: for non-linear gh(), this produces results different than the CPU
65
block[i + get_local_size(0) * j].x += gh(p2.x);
66
block[i + get_local_size(0) * j].y += gh(p2.y);
67
block[i + get_local_size(0) * j].z += gh(p2.z);
68
block[i + get_local_size(0) * j].w += gh(p2.w);
74
// write back normalized shm
75
// Wait until every work-item in the group has finished accumulating.
barrier(CLK_LOCAL_MEM_FENCE);
76
// Read back this work-item's own tile entry.
// NOTE(review): no normalization step is visible between this read and the
// write below (original lines appear to be missing from this view).
const float4 tmp = block[get_local_id(0) + get_local_id(1) * get_local_size(0)];
80
write_imagef (out, (int2)(x, y), tmp);
85
// ---- variant 2: brute force ----
// NOTE(review): declared const although its components are assigned below.
const float4 acc = (float4)(0.0f);
86
// brute force (superslow baseline)!
87
// for each shift vector
88
for(int kj=-K;kj<=K;kj++)
90
for(int ki=-K;ki<=K;ki++)
93
// Inner window: accumulate the scaled differences between the window around
// (x, y) and the window displaced by (ki, kj).
for(int pj=-P;pj<=P;pj++)
95
for(int pi=-P;pi<=P;pi++)
97
float4 p1 = read_imagef(in, sampleri, (int2)(x+pi, y+pj));
98
float4 p2 = read_imagef(in, sampleri, (int2)(x+pi+ki, y+pj+kj));
99
float4 tmp = (p1 - p2)*norm2;
100
// NOTE(review): no declaration or reset of `dist` is visible for this
// variant; it is presumably on a line missing from this view.
dist += tmp.x + tmp.y + tmp.z;
103
// Pixel at the shifted position, scaled by dist and added to the accumulator.
float4 pin = read_imagef(in, sampleri, (int2)(x+ki, y+kj));
105
acc.x += dist * pin.x;
106
acc.y += dist * pin.y;
107
acc.z += dist * pin.z;
114
// NOTE(review): acc.w is never updated and no normalization of acc is visible
// in this view before the result is written.
write_imagef (out, (int2)(x, y), acc);