144
161
// nothing to do from this distance:
145
memcpy (o, i, sizeof(float)*4*roi_out->width*roi_out->height);
162
memcpy (ovoid, ivoid, sizeof(float)*4*roi_out->width*roi_out->height);
149
166
// adjust to Lab, make L more important
150
float max_L = 100.0f, max_C = 256.0f;
151
float nL = 1.0f/(d->luma*max_L), nC = 1.0f/(d->chroma*max_C);
167
// float max_L = 100.0f, max_C = 256.0f;
168
// float nL = 1.0f/(d->luma*max_L), nC = 1.0f/(d->chroma*max_C);
169
float max_L = 120.0f, max_C = 512.0f;
170
float nL = 1.0f/max_L, nC = 1.0f/max_C;
152
171
const float norm2[4] = { nL*nL, nC*nC, nC*nC, 1.0f };
154
#define SLIDING_WINDOW // brings down time from 15 secs to 3 secs on a core2 duo
155
#ifdef SLIDING_WINDOW
156
173
float *Sa = dt_alloc_align(64, sizeof(float)*roi_out->width*dt_get_num_threads());
158
float *S = dt_alloc_align(64, sizeof(float)*roi_out->width*roi_out->height);
160
174
// we want to sum up weights in col[3], so need to init to 0:
161
memset(o, 0x0, sizeof(float)*roi_out->width*roi_out->height*4);
175
memset(ovoid, 0x0, sizeof(float)*roi_out->width*roi_out->height*4);
163
177
// for each shift vector
164
178
for(int kj=-K;kj<=K;kj++)
166
180
for(int ki=-K;ki<=K;ki++)
168
#ifdef SLIDING_WINDOW
169
182
int inited_slide = 0;
170
183
// don't construct summed area tables but use sliding window! (applies to cpu version res < 1k only, or else we will add up errors)
171
184
// do this in parallel with a little threading overhead. could parallelize the outer loops with a bit more memory
173
# pragma omp parallel for schedule(static) default(none) firstprivate(inited_slide) shared(kj, ki, roi_out, roi_in, i, o, Sa)
186
# pragma omp parallel for schedule(static) default(none) firstprivate(inited_slide) shared(kj, ki, roi_out, roi_in, ivoid, ovoid, Sa)
175
188
for(int j=0; j<roi_out->height; j++)
177
190
if(j+kj < 0 || j+kj >= roi_out->height) continue;
178
191
float *S = Sa + dt_get_thread_num() * roi_out->width;
179
const float *ins = ((float *)i) + 4*(roi_in->width *(j+kj) + ki);
180
float *out = ((float *)o) + 4*roi_out->width*j;
192
const float *ins = ((float *)ivoid) + 4*(roi_in->width *(j+kj) + ki);
193
float *out = ((float *)ovoid) + 4*roi_out->width*j;
182
195
const int Pm = MIN(MIN(P, j+kj), j);
183
196
const int PM = MIN(MIN(P, roi_out->height-1-j-kj), roi_out->height-1-j);
228
238
if(inited_slide && j+P+1+MAX(0,kj) < roi_out->height)
230
240
// sliding window in j direction:
232
const float *inp = ((float *)i) + 4* roi_in->width *(j+P+1);
233
const float *inps = ((float *)i) + 4*(roi_in->width *(j+P+1+kj) + ki);
234
const float *inm = ((float *)i) + 4* roi_in->width *(j-P);
235
const float *inms = ((float *)i) + 4*(roi_in->width *(j-P+kj) + ki);
236
for(int i=0; i<roi_out->width; i++)
238
if(i+ki >= 0 && i+ki < roi_out->width) for(int k=0;k<3;k++)
239
s[0] += ((inp[4*i + k] - inps[4*i + k])*(inp[4*i + k] - inps[4*i + k])
240
- (inm[4*i + k] - inms[4*i + k])*(inm[4*i + k] - inms[4*i + k])) * norm2[k];
243
const float *inp = ((float *)ivoid) + 4*i + 4* roi_in->width *(j+P+1);
244
const float *inps = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j+P+1+kj) + ki);
245
const float *inm = ((float *)ivoid) + 4*i + 4* roi_in->width *(j-P);
246
const float *inms = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j-P+kj) + ki);
247
const int last = roi_out->width + MIN(0, -ki);
248
for(; ((unsigned long)s & 0xf) != 0 && i<last; i++, inp+=4, inps+=4, inm+=4, inms+=4, s++)
252
stmp += ((inp[k] - inps[k])*(inp[k] - inps[k])
253
- (inm[k] - inms[k])*(inm[k] - inms[k])) * norm2[k];
256
/* Process most of the line 4 pixels at a time */
257
for(; i<last-4; i+=4, inp+=16, inps+=16, inm+=16, inms+=16, s+=4)
259
__m128 sv = _mm_load_ps(s);
260
const __m128 inp1 = _mm_load_ps(inp) - _mm_load_ps(inps);
261
const __m128 inp2 = _mm_load_ps(inp+4) - _mm_load_ps(inps+4);
262
const __m128 inp3 = _mm_load_ps(inp+8) - _mm_load_ps(inps+8);
263
const __m128 inp4 = _mm_load_ps(inp+12) - _mm_load_ps(inps+12);
265
const __m128 inp12lo = _mm_unpacklo_ps(inp1,inp2);
266
const __m128 inp34lo = _mm_unpacklo_ps(inp3,inp4);
267
const __m128 inp12hi = _mm_unpackhi_ps(inp1,inp2);
268
const __m128 inp34hi = _mm_unpackhi_ps(inp3,inp4);
270
const __m128 inpv0 = _mm_movelh_ps(inp12lo,inp34lo);
271
sv += inpv0*inpv0 * _mm_set1_ps(norm2[0]);
273
const __m128 inpv1 = _mm_movehl_ps(inp34lo,inp12lo);
274
sv += inpv1*inpv1 * _mm_set1_ps(norm2[1]);
276
const __m128 inpv2 = _mm_movelh_ps(inp12hi,inp34hi);
277
sv += inpv2*inpv2 * _mm_set1_ps(norm2[2]);
279
const __m128 inm1 = _mm_load_ps(inm) - _mm_load_ps(inms);
280
const __m128 inm2 = _mm_load_ps(inm+4) - _mm_load_ps(inms+4);
281
const __m128 inm3 = _mm_load_ps(inm+8) - _mm_load_ps(inms+8);
282
const __m128 inm4 = _mm_load_ps(inm+12) - _mm_load_ps(inms+12);
284
const __m128 inm12lo = _mm_unpacklo_ps(inm1,inm2);
285
const __m128 inm34lo = _mm_unpacklo_ps(inm3,inm4);
286
const __m128 inm12hi = _mm_unpackhi_ps(inm1,inm2);
287
const __m128 inm34hi = _mm_unpackhi_ps(inm3,inm4);
289
const __m128 inmv0 = _mm_movelh_ps(inm12lo,inm34lo);
290
sv -= inmv0*inmv0 * _mm_set1_ps(norm2[0]);
292
const __m128 inmv1 = _mm_movehl_ps(inm34lo,inm12lo);
293
sv -= inmv1*inmv1 * _mm_set1_ps(norm2[1]);
295
const __m128 inmv2 = _mm_movelh_ps(inm12hi,inm34hi);
296
sv -= inmv2*inmv2 * _mm_set1_ps(norm2[2]);
300
for(; i<last; i++, inp+=4, inps+=4, inm+=4, inms+=4, s++)
304
stmp += ((inp[k] - inps[k])*(inp[k] - inps[k])
305
- (inm[k] - inms[k])*(inm[k] - inms[k])) * norm2[k];
244
309
else inited_slide = 0;
247
// construct summed area table of weights:
249
#pragma omp parallel for default(none) schedule(static) shared(i,S,roi_in,roi_out,kj,ki)
251
for(int j=0; j<roi_out->height; j++)
253
const float *in = ((float *)i) + 4* roi_in->width * j;
254
const float *ins = ((float *)i) + 4*(roi_in->width *(j+kj) + ki);
255
float *out = ((float *)S) + roi_out->width*j;
256
if(j+kj < 0 || j+kj >= roi_out->height) memset(out, 0x0, sizeof(float)*roi_out->width);
257
else for(int i=0; i<roi_out->width; i++)
259
if(i+ki < 0 || i+ki >= roi_out->width) out[0] = 0.0f;
262
out[0] = (in[0] - ins[0])*(in[0] - ins[0]) * norm2[0];
263
out[0] += (in[1] - ins[1])*(in[1] - ins[1]) * norm2[1];
264
out[0] += (in[2] - ins[2])*(in[2] - ins[2]) * norm2[2];
274
#pragma omp parallel for default(none) schedule(static) shared(S,roi_out)
276
for(int j=0; j<roi_out->height; j++)
279
while(stride < roi_out->width)
281
float *out = ((float *)S) + roi_out->width*j;
282
for(int i=0;i<roi_out->width-stride;i++)
284
out[0] += out[stride];
292
#pragma omp parallel for default(none) schedule(static) shared(S,roi_out)
294
for(int i=0; i<roi_out->width; i++)
297
while(stride < roi_out->height)
300
for(int j=0;j<roi_out->height-stride;j++)
302
out[0] += out[roi_out->width*stride];
303
out += roi_out->width;
308
// now the denoising loop:
310
#pragma omp parallel for default(none) schedule(static) shared(i,o,S,kj,ki,roi_in,roi_out)
312
for(int j=0; j<roi_out->height; j++)
314
if(j+kj < 0 || j+kj >= roi_out->height) continue;
315
const float *in = ((float *)i) + 4*(roi_in->width *(j+kj) + ki);
316
float *out = ((float *)o) + 4*roi_out->width*j;
317
const float *s = S + roi_out->width*j;
318
const int offy = MIN(j, MIN(roi_out->height - j - 1, P)) * roi_out->width;
319
for(int i=0; i<roi_out->width; i++)
321
if(i+ki >= 0 && i+ki < roi_out->width)
323
const int offx = MIN(i, MIN(roi_out->width - i - 1, P));
324
const float m1 = s[offx - offy], m2 = s[- offx + offy], p1 = s[offx + offy], p2 = s[- offx - offy];
325
const float w = gh(p1 + p2 - m1 - m2);
326
for(int k=0;k<3;k++) out[k] += in[k] * w;
313
// normalize and apply chroma/luma blending
314
// bias a bit towards higher values for low input values:
315
const __m128 weight = _mm_set_ps(1.0f, powf(d->chroma, 0.6), powf(d->chroma, 0.6), powf(d->luma, 0.6));
316
const __m128 invert = _mm_sub_ps(_mm_set1_ps(1.0f), weight);
339
#pragma omp parallel for default(none) schedule(static) shared(o,roi_out)
318
#pragma omp parallel for default(none) schedule(static) shared(ovoid,ivoid,roi_out,d)
341
320
for(int j=0; j<roi_out->height; j++)
343
float *out = ((float *)o) + 4*roi_out->width*j;
322
float *out = ((float *)ovoid) + 4*roi_out->width*j;
323
float *in = ((float *)ivoid) + 4*roi_out->width*j;
344
324
for(int i=0; i<roi_out->width; i++)
346
for(int k=0;k<3;k++) out[k] *= 1.0f/out[3];
326
_mm_store_ps(out, _mm_add_ps(
327
_mm_mul_ps(_mm_load_ps(in), invert),
328
_mm_mul_ps(_mm_load_ps(out), _mm_div_ps(weight, _mm_set1_ps(out[3])))));
350
#ifdef SLIDING_WINDOW
351
333
// free shared tmp memory:
354
// free the summed area table:
359
337
/** this will be called to init new defaults if a new image is loaded from film strip mode. */
467
445
// TODO: adjust defaults:
468
446
g->luma = DTGTK_SLIDER(dtgtk_slider_new_with_range(DARKTABLE_SLIDER_BAR, 0.0f, 100.0f, 1., 50.f, 0));
469
447
g->chroma = DTGTK_SLIDER(dtgtk_slider_new_with_range(DARKTABLE_SLIDER_BAR, 0.0f, 100.0f, 1., 50.f, 0));
470
dtgtk_slider_set_default_value(g->luma, 50.f);
471
dtgtk_slider_set_default_value(g->chroma, 50.f);
448
dtgtk_slider_set_default_value(g->luma, 10.f);
449
dtgtk_slider_set_default_value(g->chroma, 30.f);
472
450
gtk_box_pack_start(GTK_BOX(self->widget), GTK_WIDGET(g->luma), TRUE, TRUE, 0);
473
451
gtk_box_pack_start(GTK_BOX(self->widget), GTK_WIDGET(g->chroma), TRUE, TRUE, 0);
474
452
dtgtk_slider_set_label(g->luma, _("luma"));
475
453
dtgtk_slider_set_unit (g->luma, "%");
476
454
dtgtk_slider_set_label(g->chroma, _("chroma"));
477
455
dtgtk_slider_set_unit (g->chroma, "%");
478
dtgtk_slider_set_accel(g->luma,darktable.control->accels_darkroom,"<Darktable>/darkroom/plugins/nlmeans/luma");
479
dtgtk_slider_set_accel(g->chroma,darktable.control->accels_darkroom,"<Darktable>/darkroom/plugins/nlmeans/chroma");
480
456
g_object_set (GTK_OBJECT(g->luma), "tooltip-text", _("how much to smooth brightness"), (char *)NULL);
481
457
g_object_set (GTK_OBJECT(g->chroma), "tooltip-text", _("how much to smooth colors"), (char *)NULL);
482
458
g_signal_connect (G_OBJECT (g->luma), "value-changed", G_CALLBACK (luma_callback), self);