~ubuntu-branches/ubuntu/saucy/darktable/saucy

void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)

134

151

{

135

152

// this is called for preview and full pipe separately, each with its own pixelpipe piece.

136

153

// get our data struct:

142

159

if(P <= 1)

143

160

{

144

161

// nothing to do from this distance:

145

memcpy (o, i, sizeof(float)*4*roi_out->width*roi_out->height);

162

memcpy (ovoid, ivoid, sizeof(float)*4*roi_out->width*roi_out->height);

146

163

return;

147

164

}

148

165

149

166

// adjust to Lab, make L more important

150

float max_L = 100.0f, max_C = 256.0f;

151

float nL = 1.0f/(d->luma*max_L), nC = 1.0f/(d->chroma*max_C);

167

// float max_L = 100.0f, max_C = 256.0f;

168

// float nL = 1.0f/(d->luma*max_L), nC = 1.0f/(d->chroma*max_C);

169

float max_L = 120.0f, max_C = 512.0f;

170

float nL = 1.0f/max_L, nC = 1.0f/max_C;

152

171

const float norm2[4] = { nL*nL, nC*nC, nC*nC, 1.0f };

153

172

154

#define SLIDING_WINDOW // brings down time from 15 secs to 3 secs on a core2 duo

155

#ifdef SLIDING_WINDOW

156

173

float *Sa = dt_alloc_align(64, sizeof(float)*roi_out->width*dt_get_num_threads());

157

#else

158

float *S = dt_alloc_align(64, sizeof(float)*roi_out->width*roi_out->height);

159

#endif

160

174

// we want to sum up weights in col[3], so need to init to 0:

161

memset(o, 0x0, sizeof(float)*roi_out->width*roi_out->height*4);

175

memset(ovoid, 0x0, sizeof(float)*roi_out->width*roi_out->height*4);

162

176

163

177

// for each shift vector

164

178

for(int kj=-K;kj<=K;kj++)

165

179

{

166

180

for(int ki=-K;ki<=K;ki++)

167

181

{

168

#ifdef SLIDING_WINDOW

169

182

int inited_slide = 0;

170

183

// don't construct summed area tables but use sliding window! (applies to cpu version res < 1k only, or else we will add up errors)

171

184

// do this in parallel with a little threading overhead. could parallelize the outer loops with a bit more memory

172

185

#ifdef _OPENMP

173

# pragma omp parallel for schedule(static) default(none) firstprivate(inited_slide) shared(kj, ki, roi_out, roi_in, i, o, Sa)

186

# pragma omp parallel for schedule(static) default(none) firstprivate(inited_slide) shared(kj, ki, roi_out, roi_in, ivoid, ovoid, Sa)

174

187

#endif

175

188

for(int j=0; j<roi_out->height; j++)

176

189

{

177

190

if(j+kj < 0 || j+kj >= roi_out->height) continue;

178

191

float *S = Sa + dt_get_thread_num() * roi_out->width;

179

const float *ins = ((float *)i) + 4*(roi_in->width *(j+kj) + ki);

180

float *out = ((float *)o) + 4*roi_out->width*j;

192

const float *ins = ((float *)ivoid) + 4*(roi_in->width *(j+kj) + ki);

193

float *out = ((float *)ovoid) + 4*roi_out->width*j;

181

194

182

195

const int Pm = MIN(MIN(P, j+kj), j);

183

196

const int PM = MIN(MIN(P, roi_out->height-1-j-kj), roi_out->height-1-j);

189

202

memset(S, 0x0, sizeof(float)*roi_out->width);

190

203

for(int jj=-Pm;jj<=PM;jj++)

191

204

{

192

float *s = S;

193

const float *inp = ((float *)i) + 4* roi_in->width *(j+jj);

194

const float *inps = ((float *)i) + 4*(roi_in->width *(j+jj+kj) + ki);

195

for(int i=0; i<roi_out->width; i++)

205

int i = MAX(0, -ki);

206

float *s = S + i;

207

const float *inp = ((float *)ivoid) + 4*i + 4* roi_in->width *(j+jj);

208

const float *inps = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j+jj+kj) + ki);

209

const int last = roi_out->width + MIN(0, -ki);

210

for(; i<last; i++, inp+=4, inps+=4, s++)

196

211

{

197

if(i+ki >= 0 && i+ki < roi_out->width)

198

{

199

for(int k=0;k<3;k++)

200

s[0] += (inp[4*i + k] - inps[4*i + k])*(inp[4*i + k] - inps[4*i + k]) * norm2[k];

201

}

202

s++;

212

for(int k=0;k<3;k++)

213

s[0] += (inp[k] - inps[k])*(inp[k] - inps[k]) * norm2[k];

203

214

}

204

215

}

205

216

// only reuse this if we had a full stripe

217

228

slide += s[P] - s[-P-1];

218

229

if(i+ki >= 0 && i+ki < roi_out->width)

219

230

{

220

const float w = gh(slide);

221

for(int k=0;k<3;k++) out[k] += ins[k] * w;

222

out[3] += w;

231

const __m128 iv = { ins[0], ins[1], ins[2], 1.0f };

232

_mm_store_ps(out, _mm_load_ps(out) + iv * _mm_set1_ps(gh(slide)));

223

233

}

224

234

s ++;

225

235

ins += 4;

228

238

if(inited_slide && j+P+1+MAX(0,kj) < roi_out->height)

229

239

{

230

240

// sliding window in j direction:

231

float *s = S;

232

const float *inp = ((float *)i) + 4* roi_in->width *(j+P+1);

233

const float *inps = ((float *)i) + 4*(roi_in->width *(j+P+1+kj) + ki);

234

const float *inm = ((float *)i) + 4* roi_in->width *(j-P);

235

const float *inms = ((float *)i) + 4*(roi_in->width *(j-P+kj) + ki);

236

for(int i=0; i<roi_out->width; i++)

237

{

238

if(i+ki >= 0 && i+ki < roi_out->width) for(int k=0;k<3;k++)

239

s[0] += ((inp[4*i + k] - inps[4*i + k])*(inp[4*i + k] - inps[4*i + k])

240

- (inm[4*i + k] - inms[4*i + k])*(inm[4*i + k] - inms[4*i + k])) * norm2[k];

241

s++;

241

int i = MAX(0, -ki);

242

float *s = S + i;

243

const float *inp = ((float *)ivoid) + 4*i + 4* roi_in->width *(j+P+1);

244

const float *inps = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j+P+1+kj) + ki);

245

const float *inm = ((float *)ivoid) + 4*i + 4* roi_in->width *(j-P);

246

const float *inms = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j-P+kj) + ki);

247

const int last = roi_out->width + MIN(0, -ki);

248

for(; ((unsigned long)s & 0xf) != 0 && i<last; i++, inp+=4, inps+=4, inm+=4, inms+=4, s++)

249

{

250

float stmp = s[0];

251

for(int k=0;k<3;k++)

252

stmp += ((inp[k] - inps[k])*(inp[k] - inps[k])

253

- (inm[k] - inms[k])*(inm[k] - inms[k])) * norm2[k];

254

s[0] = stmp;

255

}

256

/* Process most of the line 4 pixels at a time */

257

for(; i<last-4; i+=4, inp+=16, inps+=16, inm+=16, inms+=16, s+=4)

258

{

259

__m128 sv = _mm_load_ps(s);

260

const __m128 inp1 = _mm_load_ps(inp) - _mm_load_ps(inps);

261

const __m128 inp2 = _mm_load_ps(inp+4) - _mm_load_ps(inps+4);

262

const __m128 inp3 = _mm_load_ps(inp+8) - _mm_load_ps(inps+8);

263

const __m128 inp4 = _mm_load_ps(inp+12) - _mm_load_ps(inps+12);

264

265

const __m128 inp12lo = _mm_unpacklo_ps(inp1,inp2);

266

const __m128 inp34lo = _mm_unpacklo_ps(inp3,inp4);

267

const __m128 inp12hi = _mm_unpackhi_ps(inp1,inp2);

268

const __m128 inp34hi = _mm_unpackhi_ps(inp3,inp4);

269

270

const __m128 inpv0 = _mm_movelh_ps(inp12lo,inp34lo);

271

sv += inpv0*inpv0 * _mm_set1_ps(norm2[0]);

272

273

const __m128 inpv1 = _mm_movehl_ps(inp34lo,inp12lo);

274

sv += inpv1*inpv1 * _mm_set1_ps(norm2[1]);

275

276

const __m128 inpv2 = _mm_movelh_ps(inp12hi,inp34hi);

277

sv += inpv2*inpv2 * _mm_set1_ps(norm2[2]);

278

279

const __m128 inm1 = _mm_load_ps(inm) - _mm_load_ps(inms);

280

const __m128 inm2 = _mm_load_ps(inm+4) - _mm_load_ps(inms+4);

281

const __m128 inm3 = _mm_load_ps(inm+8) - _mm_load_ps(inms+8);

282

const __m128 inm4 = _mm_load_ps(inm+12) - _mm_load_ps(inms+12);

283

284

const __m128 inm12lo = _mm_unpacklo_ps(inm1,inm2);

285

const __m128 inm34lo = _mm_unpacklo_ps(inm3,inm4);

286

const __m128 inm12hi = _mm_unpackhi_ps(inm1,inm2);

287

const __m128 inm34hi = _mm_unpackhi_ps(inm3,inm4);

288

289

const __m128 inmv0 = _mm_movelh_ps(inm12lo,inm34lo);

290

sv -= inmv0*inmv0 * _mm_set1_ps(norm2[0]);

291

292

const __m128 inmv1 = _mm_movehl_ps(inm34lo,inm12lo);

293

sv -= inmv1*inmv1 * _mm_set1_ps(norm2[1]);

294

295

const __m128 inmv2 = _mm_movelh_ps(inm12hi,inm34hi);

296

sv -= inmv2*inmv2 * _mm_set1_ps(norm2[2]);

297

298

_mm_store_ps(s, sv);

299

}

300

for(; i<last; i++, inp+=4, inps+=4, inm+=4, inms+=4, s++)

301

{

302

float stmp = s[0];

303

for(int k=0;k<3;k++)

304

stmp += ((inp[k] - inps[k])*(inp[k] - inps[k])

305

- (inm[k] - inms[k])*(inm[k] - inms[k])) * norm2[k];

306

s[0] = stmp;

242

307

}

243

308

}

244

309

else inited_slide = 0;

245

310

}

246

#else

247

// construct summed area table of weights:

248

#ifdef _OPENMP

249

#pragma omp parallel for default(none) schedule(static) shared(i,S,roi_in,roi_out,kj,ki)

250

#endif

251

for(int j=0; j<roi_out->height; j++)

252

{

253

const float *in = ((float *)i) + 4* roi_in->width * j;

254

const float *ins = ((float *)i) + 4*(roi_in->width *(j+kj) + ki);

255

float *out = ((float *)S) + roi_out->width*j;

256

if(j+kj < 0 || j+kj >= roi_out->height) memset(out, 0x0, sizeof(float)*roi_out->width);

257

else for(int i=0; i<roi_out->width; i++)

258

{

259

if(i+ki < 0 || i+ki >= roi_out->width) out[0] = 0.0f;

260

else

261

{

262

out[0] = (in[0] - ins[0])*(in[0] - ins[0]) * norm2[0];

263

out[0] += (in[1] - ins[1])*(in[1] - ins[1]) * norm2[1];

264

out[0] += (in[2] - ins[2])*(in[2] - ins[2]) * norm2[2];

265

}

266

in += 4;

267

ins += 4;

268

out ++;

269

}

270

}

271

// now sum up:

272

// horizontal phase:

273

#ifdef _OPENMP

274

#pragma omp parallel for default(none) schedule(static) shared(S,roi_out)

275

#endif

276

for(int j=0; j<roi_out->height; j++)

277

{

278

int stride = 1;

279

while(stride < roi_out->width)

280

{

281

float *out = ((float *)S) + roi_out->width*j;

282

for(int i=0;i<roi_out->width-stride;i++)

283

{

284

out[0] += out[stride];

285

out ++;

286

}

287

stride <<= 1;

288

}

289

}

290

// vertical phase:

291

#ifdef _OPENMP

292

#pragma omp parallel for default(none) schedule(static) shared(S,roi_out)

293

#endif

294

for(int i=0; i<roi_out->width; i++)

295

{

296

int stride = 1;

297

while(stride < roi_out->height)

298

{

299

float *out = S + i;

300

for(int j=0;j<roi_out->height-stride;j++)

301

{

302

out[0] += out[roi_out->width*stride];

303

out += roi_out->width;

304

}

305

stride <<= 1;

306

}

307

}

308

// now the denoising loop:

309

#ifdef _OPENMP

310

#pragma omp parallel for default(none) schedule(static) shared(i,o,S,kj,ki,roi_in,roi_out)

311

#endif

312

for(int j=0; j<roi_out->height; j++)

313

{

314

if(j+kj < 0 || j+kj >= roi_out->height) continue;

315

const float *in = ((float *)i) + 4*(roi_in->width *(j+kj) + ki);

316

float *out = ((float *)o) + 4*roi_out->width*j;

317

const float *s = S + roi_out->width*j;

318

const int offy = MIN(j, MIN(roi_out->height - j - 1, P)) * roi_out->width;

319

for(int i=0; i<roi_out->width; i++)

320

{

321

if(i+ki >= 0 && i+ki < roi_out->width)

322

{

323

const int offx = MIN(i, MIN(roi_out->width - i - 1, P));

324

const float m1 = s[offx - offy], m2 = s[- offx + offy], p1 = s[offx + offy], p2 = s[- offx - offy];

325

const float w = gh(p1 + p2 - m1 - m2);

326

for(int k=0;k<3;k++) out[k] += in[k] * w;

327

out[3] += w;

328

}

329

s ++;

330

in += 4;

331

out += 4;

332

}

333

}

334

#endif

335

311

}

336

312

}

337

// normalize:

313

// normalize and apply chroma/luma blending

314

// bias a bit towards higher values for low input values:

315

const __m128 weight = _mm_set_ps(1.0f, powf(d->chroma, 0.6), powf(d->chroma, 0.6), powf(d->luma, 0.6));

316

const __m128 invert = _mm_sub_ps(_mm_set1_ps(1.0f), weight);

338

317

#ifdef _OPENMP

339

#pragma omp parallel for default(none) schedule(static) shared(o,roi_out)

318

#pragma omp parallel for default(none) schedule(static) shared(ovoid,ivoid,roi_out,d)

340

319

#endif

341

320

for(int j=0; j<roi_out->height; j++)

342

321

{

343

float *out = ((float *)o) + 4*roi_out->width*j;

322

float *out = ((float *)ovoid) + 4*roi_out->width*j;

323

float *in = ((float *)ivoid) + 4*roi_out->width*j;

344

324

for(int i=0; i<roi_out->width; i++)

345

325

{

346

for(int k=0;k<3;k++) out[k] *= 1.0f/out[3];

326

_mm_store_ps(out, _mm_add_ps(

327

_mm_mul_ps(_mm_load_ps(in), invert),

328

_mm_mul_ps(_mm_load_ps(out), _mm_div_ps(weight, _mm_set1_ps(out[3])))));

347

329

out += 4;

330

in += 4;

348

331

}

349

332

}

350

#ifdef SLIDING_WINDOW

351

333

// free shared tmp memory:

352

334

free(Sa);

353

#else

354

// free the summed area table:

355

free(S);

356

#endif

357

335

}

358

336

359

337

/** this will be called to init new defaults if a new image is loaded from film strip mode. */

364

342

// init defaults:

365

343

dt_iop_nlmeans_params_t tmp = (dt_iop_nlmeans_params_t)

366

344

{

367

0.5f, 0.5f

345

0.1f, 0.3f

368

346

};

369

347

memcpy(module->params, &tmp, sizeof(dt_iop_nlmeans_params_t));

370

348

memcpy(module->default_params, &tmp, sizeof(dt_iop_nlmeans_params_t));

467

445

// TODO: adjust defaults:

468

446

g->luma = DTGTK_SLIDER(dtgtk_slider_new_with_range(DARKTABLE_SLIDER_BAR, 0.0f, 100.0f, 1., 50.f, 0));

469

447

g->chroma = DTGTK_SLIDER(dtgtk_slider_new_with_range(DARKTABLE_SLIDER_BAR, 0.0f, 100.0f, 1., 50.f, 0));

470

dtgtk_slider_set_default_value(g->luma, 50.f);

471

dtgtk_slider_set_default_value(g->chroma, 50.f);

448

dtgtk_slider_set_default_value(g->luma, 10.f);

449

dtgtk_slider_set_default_value(g->chroma, 30.f);

472

450

gtk_box_pack_start(GTK_BOX(self->widget), GTK_WIDGET(g->luma), TRUE, TRUE, 0);

473

451

gtk_box_pack_start(GTK_BOX(self->widget), GTK_WIDGET(g->chroma), TRUE, TRUE, 0);

474

452

dtgtk_slider_set_label(g->luma, _("luma"));

475

453

dtgtk_slider_set_unit (g->luma, "%");

476

454

dtgtk_slider_set_label(g->chroma, _("chroma"));

477

455

dtgtk_slider_set_unit (g->chroma, "%");

478

dtgtk_slider_set_accel(g->luma,darktable.control->accels_darkroom,"<Darktable>/darkroom/plugins/nlmeans/luma");

479

dtgtk_slider_set_accel(g->chroma,darktable.control->accels_darkroom,"<Darktable>/darkroom/plugins/nlmeans/chroma");

480

456

g_object_set (GTK_OBJECT(g->luma), "tooltip-text", _("how much to smooth brightness"), (char *)NULL);

481

457

g_object_set (GTK_OBJECT(g->chroma), "tooltip-text", _("how much to smooth colors"), (char *)NULL);

482

458

g_signal_connect (G_OBJECT (g->luma), "value-changed", G_CALLBACK (luma_callback), self);

Older »