1
/*M///////////////////////////////////////////////////////////////////////////////////////
3
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5
// By downloading, copying, installing or using the software you agree to this license.
6
// If you do not agree to this license, do not download, install,
7
// copy or use the software.
11
// For Open Source Computer Vision Library
13
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16
// Third party copyrights are property of their respective owners.
18
// Redistribution and use in source and binary forms, with or without modification,
19
// are permitted provided that the following conditions are met:
21
// * Redistribution's of source code must retain the above copyright notice,
22
// this list of conditions and the following disclaimer.
24
// * Redistribution's in binary form must reproduce the above copyright notice,
25
// this list of conditions and the following disclaimer in the documentation
26
// and/or other materials provided with the distribution.
28
// * The name of the copyright holders may not be used to endorse or promote products
29
// derived from this software without specific prior written permission.
31
// This software is provided by the copyright holders and contributors "as is" and
32
// any express or implied warranties, including, but not limited to, the implied
33
// warranties of merchantability and fitness for a particular purpose are disclaimed.
34
// In no event shall the Intel Corporation or contributors be liable for any direct,
35
// indirect, incidental, special, exemplary, or consequential damages
36
// (including, but not limited to, procurement of substitute goods or services;
37
// loss of use, data, or profits; or business interruption) however caused
38
// and on any theory of liability, whether in contract, strict liability,
39
// or tort (including negligence or otherwise) arising in any way out of
40
// the use of this software, even if advised of the possibility of such damage.
45
This is a variation of
46
"Stereo Processing by Semiglobal Matching and Mutual Information"
47
by Heiko Hirschmuller.
49
We match blocks rather than individual pixels, thus the algorithm is called
50
SGBM (Semi-global block matching)
53
#include "precomp.hpp"
55
#include "opencv2/core/hal/intrin.hpp"
60
typedef uchar PixType;
61
typedef short CostType;
62
typedef short DispType;
64
enum { NR = 16, NR2 = NR/2 };
67
struct StereoSGBMParams
71
minDisparity = numDisparities = 0;
77
speckleWindowSize = 0;
79
mode = StereoSGBM::MODE_SGBM;
82
StereoSGBMParams( int _minDisparity, int _numDisparities, int _SADWindowSize,
83
int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
84
int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
87
minDisparity = _minDisparity;
88
numDisparities = _numDisparities;
89
SADWindowSize = _SADWindowSize;
92
disp12MaxDiff = _disp12MaxDiff;
93
preFilterCap = _preFilterCap;
94
uniquenessRatio = _uniquenessRatio;
95
speckleWindowSize = _speckleWindowSize;
96
speckleRange = _speckleRange;
107
int speckleWindowSize;
114
For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
115
and for each disparity minD<=d<maxD the function
116
computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between
117
row1[x] and row2[x-d]. The subpixel algorithm from
118
"Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi
119
is used, hence the suffix BT.
121
the temporary buffer should contain width2*2 elements
123
static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
124
int minD, int maxD, CostType* cost,
125
PixType* buffer, const PixType* tab,
128
int x, c, width = img1.cols, cn = img1.channels();
129
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
130
int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);
131
int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
132
const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
133
PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
137
for( c = 0; c < cn*2; c++ )
139
prow1[width*c] = prow1[width*c + width-1] =
140
prow2[width*c] = prow2[width*c + width-1] = tab[0];
143
int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
144
int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;
148
for( x = 1; x < width-1; x++ )
150
prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
151
prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];
153
prow1[x+width] = row1[x];
154
prow2[width-1-x+width] = row2[x];
159
for( x = 1; x < width-1; x++ )
161
prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];
162
prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];
163
prow1[x+width*2] = tab[(row1[x*3+5] - row1[x*3-1])*2 + row1[x*3+n1+5] - row1[x*3+n1-1] + row1[x*3+s1+5] - row1[x*3+s1-1]];
165
prow2[width-1-x] = tab[(row2[x*3+3] - row2[x*3-3])*2 + row2[x*3+n2+3] - row2[x*3+n2-3] + row2[x*3+s2+3] - row2[x*3+s2-3]];
166
prow2[width-1-x+width] = tab[(row2[x*3+4] - row2[x*3-2])*2 + row2[x*3+n2+4] - row2[x*3+n2-2] + row2[x*3+s2+4] - row2[x*3+s2-2]];
167
prow2[width-1-x+width*2] = tab[(row2[x*3+5] - row2[x*3-1])*2 + row2[x*3+n2+5] - row2[x*3+n2-1] + row2[x*3+s2+5] - row2[x*3+s2-1]];
169
prow1[x+width*3] = row1[x*3];
170
prow1[x+width*4] = row1[x*3+1];
171
prow1[x+width*5] = row1[x*3+2];
173
prow2[width-1-x+width*3] = row2[x*3];
174
prow2[width-1-x+width*4] = row2[x*3+1];
175
prow2[width-1-x+width*5] = row2[x*3+2];
179
memset( cost, 0, width1*D*sizeof(cost[0]) );
182
cost -= minX1*D + minD; // simplify the cost indices inside the loop
185
for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
187
int diff_scale = c < cn ? 0 : 2;
190
// v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
191
// v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
192
for( x = minX2; x < maxX2; x++ )
195
int vl = x > 0 ? (v + prow2[x-1])/2 : v;
196
int vr = x < width-1 ? (v + prow2[x+1])/2 : v;
197
int v0 = std::min(vl, vr); v0 = std::min(v0, v);
198
int v1 = std::max(vl, vr); v1 = std::max(v1, v);
199
buffer[x] = (PixType)v0;
200
buffer[x + width2] = (PixType)v1;
203
for( x = minX1; x < maxX1; x++ )
206
int ul = x > 0 ? (u + prow1[x-1])/2 : u;
207
int ur = x < width-1 ? (u + prow1[x+1])/2 : u;
208
int u0 = std::min(ul, ur); u0 = std::min(u0, u);
209
int u1 = std::max(ul, ur); u1 = std::max(u1, u);
212
v_uint8x16 _u = v_setall_u8((uchar)u), _u0 = v_setall_u8((uchar)u0);
213
v_uint8x16 _u1 = v_setall_u8((uchar)u1);
215
for( int d = minD; d < maxD; d += 16 )
217
v_uint8x16 _v = v_load(prow2 + width-x-1 + d);
218
v_uint8x16 _v0 = v_load(buffer + width-x-1 + d);
219
v_uint8x16 _v1 = v_load(buffer + width-x-1 + d + width2);
220
v_uint8x16 c0 = v_max(_u - _v1, _v0 - _u);
221
v_uint8x16 c1 = v_max(_v - _u1, _u0 - _v);
222
v_uint8x16 diff = v_min(c0, c1);
224
v_int16x8 _c0 = v_load_aligned(cost + x*D + d);
225
v_int16x8 _c1 = v_load_aligned(cost + x*D + d + 8);
227
v_uint16x8 diff1,diff2;
228
v_expand(diff,diff1,diff2);
229
v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));
230
v_store_aligned(cost + x*D + d + 8, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));
233
for( int d = minD; d < maxD; d++ )
235
int v = prow2[width-x-1 + d];
236
int v0 = buffer[width-x-1 + d];
237
int v1 = buffer[width-x-1 + d + width2];
238
int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
239
int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);
241
cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale));
247
for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
249
for( x = minX1; x < maxX1; x++ )
255
__m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128();
257
for( int d = minD; d < maxD; d += 16 )
259
__m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-1-x + d));
260
__m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u,_v), _mm_subs_epu8(_v,_u));
261
__m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
262
__m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
264
_mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
265
_mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
271
for( int d = minD; d < maxD; d++ )
273
int v = prow2[width-1-x + d];
274
cost[x*D + d] = (CostType)(cost[x*D + d] + (CostType)std::abs(u - v));
284
computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
285
that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
287
disp2full is the reverse disparity map, that is:
288
disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
290
note that disp1buf will have the same size as the roi and
291
disp2full will have the same size as img1 (or img2).
292
On exit disp2buf is not the final disparity, it is an intermediate result that becomes
293
final after all the tiles are processed.
295
the disparity in disp1buf is written with sub-pixel accuracy
296
(4 fractional bits, see StereoSGBM::DISP_SCALE),
297
using quadratic interpolation, while the disparity in disp2buf
298
is written as is, without interpolation.
300
disp2cost also has the same size as img1 (or img2).
301
It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
303
static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
304
Mat& disp1, const StereoSGBMParams& params,
308
static const uchar LSBTab[] =
310
0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
311
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
312
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
313
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
314
7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
315
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
316
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
317
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
320
volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
323
const int ALIGN = 16;
324
const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
325
const int DISP_SCALE = (1 << DISP_SHIFT);
326
const CostType MAX_COST = SHRT_MAX;
328
int minD = params.minDisparity, maxD = minD + params.numDisparities;
330
SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
331
int ftzero = std::max(params.preFilterCap, 15) | 1;
332
int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
333
int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
334
int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
335
int k, width = disp1.cols, height = disp1.rows;
336
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
337
int D = maxD - minD, width1 = maxX1 - minX1;
338
int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
339
int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
340
bool fullDP = params.mode == StereoSGBM::MODE_HH;
341
int npasses = fullDP ? 2 : 1;
342
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
343
PixType clipTab[TAB_SIZE];
345
for( k = 0; k < TAB_SIZE; k++ )
346
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
350
disp1 = Scalar::all(INVALID_DISP_SCALED);
354
CV_Assert( D % 16 == 0 );
356
// NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
357
// if you change NR, please, modify the loop as well.
358
int D2 = D+16, NRD2 = NR2*D2;
360
// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
361
// for 8-way dynamic programming we need the current row and
362
// the previous row, i.e. 2 rows in total
364
const int LrBorder = NLR - 1;
366
// for each possible stereo match (img1(x,y) <=> img2(x-d,y))
367
// we keep pixel difference cost (C) and the summary cost over NR directions (S).
368
// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
369
size_t costBufSize = width1*D;
370
size_t CSBufSize = costBufSize*(fullDP ? height : 1);
371
size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
372
int hsumBufNRows = SH2*2 + 2;
373
size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
374
costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
375
CSBufSize*2*sizeof(CostType) + // C, S
376
width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
377
width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
379
if( buffer.empty() || !buffer.isContinuous() ||
380
buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
381
buffer.create(1, (int)totalBufSize, CV_8U);
383
// summary cost over different (nDirs) directions
384
CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
385
CostType* Sbuf = Cbuf + CSBufSize;
386
CostType* hsumBuf = Sbuf + CSBufSize;
387
CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
389
CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
390
DispType* disp2ptr = (DispType*)(disp2cost + width);
391
PixType* tempBuf = (PixType*)(disp2ptr + width);
393
// add P2 to every C(x,y). it saves a few operations in the inner loops
394
for( k = 0; k < width1*D; k++ )
395
Cbuf[k] = (CostType)P2;
397
for( int pass = 1; pass <= npasses; pass++ )
399
int x1, y1, x2, y2, dx, dy;
403
y1 = 0; y2 = height; dy = 1;
404
x1 = 0; x2 = width1; dx = 1;
408
y1 = height-1; y2 = -1; dy = -1;
409
x1 = width1-1; x2 = -1; dx = -1;
412
CostType *Lr[NLR]={0}, *minLr[NLR]={0};
414
for( k = 0; k < NLR; k++ )
416
// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
417
// and will occasionally use negative indices with the arrays
418
// we need to shift Lr[k] pointers by 1, to give the space for d=-1.
419
// however, then the alignment will be imperfect, i.e. bad for SSE,
420
// thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
421
Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;
422
memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) );
423
minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder;
424
memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) );
427
for( int y = y1; y != y2; y += dy )
430
DispType* disp1ptr = disp1.ptr<DispType>(y);
431
CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);
432
CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);
434
if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
436
int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
438
for( k = dy1; k <= dy2; k++ )
440
CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
444
calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
446
memset(hsumAdd, 0, D*sizeof(CostType));
447
for( x = 0; x <= SW2*D; x += D )
449
int scale = x == 0 ? SW2 + 1 : 1;
450
for( d = 0; d < D; d++ )
451
hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
456
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
457
const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
459
for( x = D; x < width1*D; x += D )
461
const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
462
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
467
for( d = 0; d < D; d += 8 )
469
__m128i hv = _mm_load_si128((const __m128i*)(hsumAdd + x - D + d));
470
__m128i Cx = _mm_load_si128((__m128i*)(Cprev + x + d));
471
hv = _mm_adds_epi16(_mm_subs_epi16(hv,
472
_mm_load_si128((const __m128i*)(pixSub + d))),
473
_mm_load_si128((const __m128i*)(pixAdd + d)));
474
Cx = _mm_adds_epi16(_mm_subs_epi16(Cx,
475
_mm_load_si128((const __m128i*)(hsumSub + x + d))),
477
_mm_store_si128((__m128i*)(hsumAdd + x + d), hv);
478
_mm_store_si128((__m128i*)(C + x + d), Cx);
484
for( d = 0; d < D; d++ )
486
int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
487
C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
494
for( x = D; x < width1*D; x += D )
496
const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
497
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
499
for( d = 0; d < D; d++ )
500
hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
507
int scale = k == 0 ? SH2 + 1 : 1;
508
for( x = 0; x < width1*D; x++ )
509
C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
513
// also, clear the S buffer
514
for( k = 0; k < width1*D; k++ )
518
// clear the left and the right borders
519
memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
520
memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
521
memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
522
memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
525
[formula 13 in the paper]
526
compute L_r(p, d) = C(p, d) +
530
min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
531
where p = (x,y), r is one of the directions.
532
we process all the directions at once:
542
for( x = x1; x != x2; x += dx )
544
int xm = x*NR2, xd = xm*D2;
546
int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
547
int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;
549
CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;
550
CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
551
CostType* Lr_p2 = Lr[1] + xd + D2*2;
552
CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3;
554
Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
555
Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;
557
CostType* Lr_p = Lr[0] + xd;
558
const CostType* Cp = C + x*D;
559
CostType* Sp = S + x*D;
564
__m128i _P1 = _mm_set1_epi16((short)P1);
566
__m128i _delta0 = _mm_set1_epi16((short)delta0);
567
__m128i _delta1 = _mm_set1_epi16((short)delta1);
568
__m128i _delta2 = _mm_set1_epi16((short)delta2);
569
__m128i _delta3 = _mm_set1_epi16((short)delta3);
570
__m128i _minL0 = _mm_set1_epi16((short)MAX_COST);
572
for( d = 0; d < D; d += 8 )
574
__m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d));
575
__m128i L0, L1, L2, L3;
577
L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
578
L1 = _mm_load_si128((const __m128i*)(Lr_p1 + d));
579
L2 = _mm_load_si128((const __m128i*)(Lr_p2 + d));
580
L3 = _mm_load_si128((const __m128i*)(Lr_p3 + d));
582
L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
583
L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
585
L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d - 1)), _P1));
586
L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d + 1)), _P1));
588
L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d - 1)), _P1));
589
L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d + 1)), _P1));
591
L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d - 1)), _P1));
592
L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d + 1)), _P1));
594
L0 = _mm_min_epi16(L0, _delta0);
595
L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
597
L1 = _mm_min_epi16(L1, _delta1);
598
L1 = _mm_adds_epi16(_mm_subs_epi16(L1, _delta1), Cpd);
600
L2 = _mm_min_epi16(L2, _delta2);
601
L2 = _mm_adds_epi16(_mm_subs_epi16(L2, _delta2), Cpd);
603
L3 = _mm_min_epi16(L3, _delta3);
604
L3 = _mm_adds_epi16(_mm_subs_epi16(L3, _delta3), Cpd);
606
_mm_store_si128( (__m128i*)(Lr_p + d), L0);
607
_mm_store_si128( (__m128i*)(Lr_p + d + D2), L1);
608
_mm_store_si128( (__m128i*)(Lr_p + d + D2*2), L2);
609
_mm_store_si128( (__m128i*)(Lr_p + d + D2*3), L3);
611
__m128i t0 = _mm_min_epi16(_mm_unpacklo_epi16(L0, L2), _mm_unpackhi_epi16(L0, L2));
612
__m128i t1 = _mm_min_epi16(_mm_unpacklo_epi16(L1, L3), _mm_unpackhi_epi16(L1, L3));
613
t0 = _mm_min_epi16(_mm_unpacklo_epi16(t0, t1), _mm_unpackhi_epi16(t0, t1));
614
_minL0 = _mm_min_epi16(_minL0, t0);
616
__m128i Sval = _mm_load_si128((const __m128i*)(Sp + d));
618
L0 = _mm_adds_epi16(L0, L1);
619
L2 = _mm_adds_epi16(L2, L3);
620
Sval = _mm_adds_epi16(Sval, L0);
621
Sval = _mm_adds_epi16(Sval, L2);
623
_mm_store_si128((__m128i*)(Sp + d), Sval);
626
_minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
627
_mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
632
int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;
634
for( d = 0; d < D; d++ )
636
int Cpd = Cp[d], L0, L1, L2, L3;
638
L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
639
L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d-1] + P1, std::min(Lr_p1[d+1] + P1, delta1))) - delta1;
640
L2 = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d-1] + P1, std::min(Lr_p2[d+1] + P1, delta2))) - delta2;
641
L3 = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d-1] + P1, std::min(Lr_p3[d+1] + P1, delta3))) - delta3;
643
Lr_p[d] = (CostType)L0;
644
minL0 = std::min(minL0, L0);
646
Lr_p[d + D2] = (CostType)L1;
647
minL1 = std::min(minL1, L1);
649
Lr_p[d + D2*2] = (CostType)L2;
650
minL2 = std::min(minL2, L2);
652
Lr_p[d + D2*3] = (CostType)L3;
653
minL3 = std::min(minL3, L3);
655
Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3);
657
minLr[0][xm] = (CostType)minL0;
658
minLr[0][xm+1] = (CostType)minL1;
659
minLr[0][xm+2] = (CostType)minL2;
660
minLr[0][xm+3] = (CostType)minL3;
664
if( pass == npasses )
666
for( x = 0; x < width; x++ )
668
disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
669
disp2cost[x] = MAX_COST;
672
for( x = width1 - 1; x >= 0; x-- )
674
CostType* Sp = S + x*D;
675
int minS = MAX_COST, bestDisp = -1;
679
int xm = x*NR2, xd = xm*D2;
681
int minL0 = MAX_COST;
682
int delta0 = minLr[0][xm + NR2] + P2;
683
CostType* Lr_p0 = Lr[0] + xd + NRD2;
684
Lr_p0[-1] = Lr_p0[D] = MAX_COST;
685
CostType* Lr_p = Lr[0] + xd;
687
const CostType* Cp = C + x*D;
692
__m128i _P1 = _mm_set1_epi16((short)P1);
693
__m128i _delta0 = _mm_set1_epi16((short)delta0);
695
__m128i _minL0 = _mm_set1_epi16((short)minL0);
696
__m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
697
__m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
699
for( d = 0; d < D; d += 8 )
701
__m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)), L0;
703
L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
704
L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
705
L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
706
L0 = _mm_min_epi16(L0, _delta0);
707
L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
709
_mm_store_si128((__m128i*)(Lr_p + d), L0);
710
_minL0 = _mm_min_epi16(_minL0, L0);
711
L0 = _mm_adds_epi16(L0, *(__m128i*)(Sp + d));
712
_mm_store_si128((__m128i*)(Sp + d), L0);
714
__m128i mask = _mm_cmpgt_epi16(_minS, L0);
715
_minS = _mm_min_epi16(_minS, L0);
716
_bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp,_d8), mask));
717
_d8 = _mm_adds_epi16(_d8, _8);
720
short CV_DECL_ALIGNED(16) bestDispBuf[8];
721
_mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
723
_minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
724
_minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4));
725
_minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2));
727
__m128i qS = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
728
qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 4));
729
qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 2));
731
minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0);
732
minS = (CostType)_mm_cvtsi128_si32(qS);
734
qS = _mm_shuffle_epi32(_mm_unpacklo_epi16(qS, qS), 0);
735
qS = _mm_cmpeq_epi16(_minS, qS);
736
int idx = _mm_movemask_epi8(_mm_packs_epi16(qS, qS)) & 255;
738
bestDisp = bestDispBuf[LSBTab[idx]];
743
for( d = 0; d < D; d++ )
745
int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
747
Lr_p[d] = (CostType)L0;
748
minL0 = std::min(minL0, L0);
750
int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
757
minLr[0][xm] = (CostType)minL0;
762
for( d = 0; d < D; d++ )
773
for( d = 0; d < D; d++ )
775
if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
781
int _x2 = x + minX1 - d - minD;
782
if( disp2cost[_x2] > minS )
784
disp2cost[_x2] = (CostType)minS;
785
disp2ptr[_x2] = (DispType)(d + minD);
788
if( 0 < d && d < D-1 )
790
// do subpixel quadratic interpolation:
791
// fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
792
// then find minimum of the parabola.
793
int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
794
d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
798
disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
801
for( x = minX1; x < maxX1; x++ )
803
// we round the computed disparity both towards -inf and +inf and check
804
// if either of the corresponding disparities in disp2 is consistent.
805
// This is to give the computed disparity a chance to look valid if it is.
806
int d1 = disp1ptr[x];
807
if( d1 == INVALID_DISP_SCALED )
809
int _d = d1 >> DISP_SHIFT;
810
int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
811
int _x = x - _d, x_ = x - d_;
812
if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
813
0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
814
disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
818
// now shift the cyclic buffers
819
std::swap( Lr[0], Lr[1] );
820
std::swap( minLr[0], minLr[1] );
825
//////////////////////////////////////////////////////////////////////////////////////////////////////
827
void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
828
CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
829
PixType*& tmpBuf, CostType*& horPassCostVolume,
830
CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
831
CostType*& disp2CostBuf, short*& disp2Buf);
833
struct SGBM3WayMainLoop : public ParallelLoopBody
836
const Mat *img1, *img2;
839
int nstripes, stripe_sz;
844
int minX1, maxX1, width1;
848
int uniquenessRatio, disp12MaxDiff;
850
int costBufSize, hsumBufNRows;
855
SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap);
856
void getRawMatchingCost(CostType* C, CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, int y, int src_start_idx) const;
857
void operator () (const Range& range) const;
860
SGBM3WayMainLoop::SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap):
861
buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_clipTab)
863
nstripes = _nstripes;
864
stripe_overlap = _stripe_overlap;
865
stripe_sz = (int)ceil(img1->rows/(double)nstripes);
867
width = img1->cols; height = img1->rows;
868
minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD;
869
minX1 = std::max(maxD, 0); maxX1 = width + std::min(minD, 0); width1 = maxX1 - minX1;
870
CV_Assert( D % 16 == 0 );
872
SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1;
874
P1 = params.P1 > 0 ? params.P1 : 2; P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
875
uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
876
disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
878
costBufSize = width1*D;
879
hsumBufNRows = SH2*2 + 2;
881
ftzero = std::max(params.preFilterCap, 15) | 1;
884
void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
885
CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
886
PixType*& tmpBuf, CostType*& horPassCostVolume,
887
CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
888
CostType*& disp2CostBuf, short*& disp2Buf)
890
// allocating all the required memory:
891
int costVolumeLineSize = width1*D;
892
int width1_ext = width1+2;
893
int costVolumeLineSize_ext = width1_ext*D;
894
int hsumBufNRows = SH2*2 + 2;
896
// main buffer to store matching costs for the current line:
897
int curCostVolumeLineSize = costVolumeLineSize*sizeof(CostType);
899
// auxiliary buffers for the raw matching cost computation:
900
int hsumBufSize = costVolumeLineSize*hsumBufNRows*sizeof(CostType);
901
int pixDiffSize = costVolumeLineSize*sizeof(CostType);
902
int tmpBufSize = width*16*num_ch*sizeof(PixType);
904
// auxiliary buffers for the matching cost aggregation:
905
int horPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the 2-pass horizontal cost aggregation
906
int vertPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the vertical cost aggregation
907
int vertPassMinSize = width1_ext*sizeof(CostType); // buffer for storing minimum costs from the previous line
908
int rightPassBufSize = D*sizeof(CostType); // additional small buffer for the right-to-left pass
910
// buffers for the pseudo-LRC check:
911
int disp2CostBufSize = width*sizeof(CostType);
912
int disp2BufSize = width*sizeof(short);
914
// sum up the sizes of all the buffers:
915
size_t totalBufSize = curCostVolumeLineSize +
919
horPassCostVolumeSize +
920
vertPassCostVolumeSize +
925
16; //to compensate for the alignPtr shifts
927
if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
928
buffer.create(1, (int)totalBufSize, CV_8U);
930
// set up all the pointers:
931
curCostVolumeLine = (CostType*)alignPtr(buffer.ptr(), 16);
932
hsumBuf = curCostVolumeLine + costVolumeLineSize;
933
pixDiff = hsumBuf + costVolumeLineSize*hsumBufNRows;
934
tmpBuf = (PixType*)(pixDiff + costVolumeLineSize);
935
horPassCostVolume = (CostType*)(tmpBuf + width*16*num_ch);
936
vertPassCostVolume = horPassCostVolume + costVolumeLineSize_ext;
937
rightPassBuf = vertPassCostVolume + costVolumeLineSize_ext;
938
vertPassMin = rightPassBuf + D;
939
disp2CostBuf = vertPassMin + width1_ext;
940
disp2Buf = disp2CostBuf + width;
942
// initialize memory:
943
memset(buffer.ptr(),0,totalBufSize);
944
for(int i=0;i<costVolumeLineSize;i++)
945
curCostVolumeLine[i] = (CostType)P2; //such initialization simplifies the cost aggregation loops a bit
948
// performing block matching and building raw cost-volume for the current row
949
void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
950
CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, //buffers
951
int y, int src_start_idx) const
954
int dy1 = (y == src_start_idx) ? src_start_idx : y + SH2, dy2 = (y == src_start_idx) ? src_start_idx+SH2 : dy1;
956
for(int k = dy1; k <= dy2; k++ )
958
CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
961
calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab, TAB_OFS, ftzero );
963
memset(hsumAdd, 0, D*sizeof(CostType));
964
for(x = 0; x <= SW2*D; x += D )
966
int scale = x == 0 ? SW2 + 1 : 1;
968
for( d = 0; d < D; d++ )
969
hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
972
if( y > src_start_idx )
974
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize;
976
for( x = D; x < width1*D; x += D )
978
const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
979
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
983
for( d = 0; d < D; d+=8 )
985
hv_reg = v_load_aligned(hsumAdd+x-D+d) + (v_load_aligned(pixAdd+d) - v_load_aligned(pixSub+d));
986
v_store_aligned(hsumAdd+x+d,hv_reg);
987
v_store_aligned(C+x+d,v_load_aligned(C+x+d)+(hv_reg-v_load_aligned(hsumSub+x+d)));
990
for( d = 0; d < D; d++ )
992
int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
993
C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]);
1000
for( x = D; x < width1*D; x += D )
1002
const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
1003
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
1005
for( d = 0; d < D; d++ )
1006
hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
1011
if( y == src_start_idx )
1013
int scale = k == src_start_idx ? SH2 + 1 : 1;
1014
for( x = 0; x < width1*D; x++ )
1015
C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
1021
// define some additional reduce operations:
1022
inline short min(const v_int16x8& a)
1024
short CV_DECL_ALIGNED(16) buf[8];
1025
v_store_aligned(buf, a);
1026
short s0 = std::min(buf[0], buf[1]);
1027
short s1 = std::min(buf[2], buf[3]);
1028
short s2 = std::min(buf[4], buf[5]);
1029
short s3 = std::min(buf[6], buf[7]);
1030
return std::min(std::min(s0, s1),std::min(s2, s3));
1033
inline short min_pos(const v_int16x8& val,const v_int16x8& pos)
1035
short CV_DECL_ALIGNED(16) val_buf[8];
1036
v_store_aligned(val_buf, val);
1037
short CV_DECL_ALIGNED(16) pos_buf[8];
1038
v_store_aligned(pos_buf, pos);
1040
short min_val = SHRT_MAX;
1041
if(val_buf[0]<min_val) {min_val=val_buf[0]; res_pos=pos_buf[0];}
1042
if(val_buf[1]<min_val) {min_val=val_buf[1]; res_pos=pos_buf[1];}
1043
if(val_buf[2]<min_val) {min_val=val_buf[2]; res_pos=pos_buf[2];}
1044
if(val_buf[3]<min_val) {min_val=val_buf[3]; res_pos=pos_buf[3];}
1045
if(val_buf[4]<min_val) {min_val=val_buf[4]; res_pos=pos_buf[4];}
1046
if(val_buf[5]<min_val) {min_val=val_buf[5]; res_pos=pos_buf[5];}
1047
if(val_buf[6]<min_val) {min_val=val_buf[6]; res_pos=pos_buf[6];}
1048
if(val_buf[7]<min_val) {min_val=val_buf[7]; res_pos=pos_buf[7];}
1053
// performing SGM cost accumulation from left to right (result is stored in leftBuf) and
1054
// in-place cost accumulation from top to bottom (result is stored in topBuf)
1055
inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, CostType* topBuf, CostType* costs,
1056
CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2)
1059
v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));
1061
v_int16x8 leftMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));
1062
v_int16x8 leftMinCost_new_reg = v_setall_s16(SHRT_MAX);
1063
v_int16x8 src0_leftBuf = v_setall_s16(SHRT_MAX);
1064
v_int16x8 src1_leftBuf = v_load_aligned(leftBuf_prev);
1066
v_int16x8 topMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(topMinCost+P2));
1067
v_int16x8 topMinCost_new_reg = v_setall_s16(SHRT_MAX);
1068
v_int16x8 src0_topBuf = v_setall_s16(SHRT_MAX);
1069
v_int16x8 src1_topBuf = v_load_aligned(topBuf);
1072
v_int16x8 src_shifted_left,src_shifted_right;
1075
for(int i=0;i<D-8;i+=8)
1079
src2 = v_load_aligned(leftBuf_prev+i+8);
1081
//get shifted versions of the current block and add P1:
1082
src_shifted_left = v_extract<7> (src0_leftBuf,src1_leftBuf) + P1_reg;
1083
src_shifted_right = v_extract<1> (src1_leftBuf,src2 ) + P1_reg;
1085
// process and save current block:
1086
res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
1087
leftMinCost_new_reg = v_min(leftMinCost_new_reg,res);
1088
v_store_aligned(leftBuf+i, res);
1090
//update src buffers:
1091
src0_leftBuf = src1_leftBuf;
1092
src1_leftBuf = src2;
1096
src2 = v_load_aligned(topBuf+i+8);
1098
//get shifted versions of the current block and add P1:
1099
src_shifted_left = v_extract<7> (src0_topBuf,src1_topBuf) + P1_reg;
1100
src_shifted_right = v_extract<1> (src1_topBuf,src2 ) + P1_reg;
1102
// process and save current block:
1103
res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
1104
topMinCost_new_reg = v_min(topMinCost_new_reg,res);
1105
v_store_aligned(topBuf+i, res);
1107
//update src buffers:
1108
src0_topBuf = src1_topBuf;
1112
// a bit different processing for the last cycle of the loop:
1114
src2 = v_setall_s16(SHRT_MAX);
1115
src_shifted_left = v_extract<7> (src0_leftBuf,src1_leftBuf) + P1_reg;
1116
src_shifted_right = v_extract<1> (src1_leftBuf,src2 ) + P1_reg;
1118
res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
1119
leftMinCost = min(v_min(leftMinCost_new_reg,res));
1120
v_store_aligned(leftBuf+D-8, res);
1123
src2 = v_setall_s16(SHRT_MAX);
1124
src_shifted_left = v_extract<7> (src0_topBuf,src1_topBuf) + P1_reg;
1125
src_shifted_right = v_extract<1> (src1_topBuf,src2 ) + P1_reg;
1127
res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
1128
topMinCost = min(v_min(topMinCost_new_reg,res));
1129
v_store_aligned(topBuf+D-8, res);
1131
CostType leftMinCost_new = SHRT_MAX;
1132
CostType topMinCost_new = SHRT_MAX;
1133
int leftMinCost_P2 = leftMinCost + P2;
1134
int topMinCost_P2 = topMinCost + P2;
1135
CostType leftBuf_prev_i_minus_1 = SHRT_MAX;
1136
CostType topBuf_i_minus_1 = SHRT_MAX;
1139
for(int i=0;i<D-1;i++)
1141
leftBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(leftBuf_prev_i_minus_1+P1,leftBuf_prev[i+1]+P1),std::min((int)leftBuf_prev[i],leftMinCost_P2))-leftMinCost_P2);
1142
leftBuf_prev_i_minus_1 = leftBuf_prev[i];
1143
leftMinCost_new = std::min(leftMinCost_new,leftBuf[i]);
1146
topBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(topBuf_i_minus_1+P1,topBuf[i+1]+P1),std::min((int)topBuf[i],topMinCost_P2))-topMinCost_P2);
1147
topBuf_i_minus_1 = tmp;
1148
topMinCost_new = std::min(topMinCost_new,topBuf[i]);
1151
leftBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(leftBuf_prev_i_minus_1+P1,std::min((int)leftBuf_prev[D-1],leftMinCost_P2))-leftMinCost_P2);
1152
leftMinCost = std::min(leftMinCost_new,leftBuf[D-1]);
1154
topBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(topBuf_i_minus_1+P1,std::min((int)topBuf[D-1],topMinCost_P2))-topMinCost_P2);
1155
topMinCost = std::min(topMinCost_new,topBuf[D-1]);
1159
// performing in-place SGM cost accumulation from right to left (the result is stored in rightBuf) and
1160
// summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the
1161
// optimal disparity value with minimum accumulated cost
1162
inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf, CostType* costs,
1163
CostType& rightMinCost, int D, int P1, int P2, int& optimal_disp, CostType& min_cost)
1166
v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));
1168
v_int16x8 rightMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));
1169
v_int16x8 rightMinCost_new_reg = v_setall_s16(SHRT_MAX);
1170
v_int16x8 src0_rightBuf = v_setall_s16(SHRT_MAX);
1171
v_int16x8 src1_rightBuf = v_load(rightBuf);
1174
v_int16x8 src_shifted_left,src_shifted_right;
1177
v_int16x8 min_sum_cost_reg = v_setall_s16(SHRT_MAX);
1178
v_int16x8 min_sum_pos_reg = v_setall_s16(0);
1179
v_int16x8 loop_idx(0,1,2,3,4,5,6,7);
1180
v_int16x8 eight_reg = v_setall_s16(8);
1182
for(int i=0;i<D-8;i+=8)
1185
src2 = v_load_aligned(rightBuf+i+8);
1187
//get shifted versions of the current block and add P1:
1188
src_shifted_left = v_extract<7> (src0_rightBuf,src1_rightBuf) + P1_reg;
1189
src_shifted_right = v_extract<1> (src1_rightBuf,src2 ) + P1_reg;
1191
// process and save current block:
1192
res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
1193
rightMinCost_new_reg = v_min(rightMinCost_new_reg,res);
1194
v_store_aligned(rightBuf+i, res);
1196
// compute and save total cost:
1197
res = res + v_load_aligned(leftBuf+i) + v_load_aligned(topBuf+i);
1198
v_store_aligned(leftBuf+i, res);
1200
// track disparity value with the minimum cost:
1201
min_sum_cost_reg = v_min(min_sum_cost_reg,res);
1202
min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));
1203
loop_idx = loop_idx+eight_reg;
1206
src0_rightBuf = src1_rightBuf;
1207
src1_rightBuf = src2;
1210
// a bit different processing for the last cycle of the loop:
1211
src2 = v_setall_s16(SHRT_MAX);
1212
src_shifted_left = v_extract<7> (src0_rightBuf,src1_rightBuf) + P1_reg;
1213
src_shifted_right = v_extract<1> (src1_rightBuf,src2 ) + P1_reg;
1215
res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
1216
rightMinCost = min(v_min(rightMinCost_new_reg,res));
1217
v_store_aligned(rightBuf+D-8, res);
1219
res = res + v_load_aligned(leftBuf+D-8) + v_load_aligned(topBuf+D-8);
1220
v_store_aligned(leftBuf+D-8, res);
1222
min_sum_cost_reg = v_min(min_sum_cost_reg,res);
1223
min_cost = min(min_sum_cost_reg);
1224
min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));
1225
optimal_disp = min_pos(min_sum_cost_reg,min_sum_pos_reg);
1227
CostType rightMinCost_new = SHRT_MAX;
1228
int rightMinCost_P2 = rightMinCost + P2;
1229
CostType rightBuf_i_minus_1 = SHRT_MAX;
1231
min_cost = SHRT_MAX;
1233
for(int i=0;i<D-1;i++)
1236
rightBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(rightBuf_i_minus_1+P1,rightBuf[i+1]+P1),std::min((int)rightBuf[i],rightMinCost_P2))-rightMinCost_P2);
1237
rightBuf_i_minus_1 = tmp;
1238
rightMinCost_new = std::min(rightMinCost_new,rightBuf[i]);
1239
leftBuf[i] = cv::saturate_cast<CostType>((int)leftBuf[i]+rightBuf[i]+topBuf[i]);
1240
if(leftBuf[i]<min_cost)
1243
min_cost = leftBuf[i];
1247
rightBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(rightBuf_i_minus_1+P1,std::min((int)rightBuf[D-1],rightMinCost_P2))-rightMinCost_P2);
1248
rightMinCost = std::min(rightMinCost_new,rightBuf[D-1]);
1249
leftBuf[D-1] = cv::saturate_cast<CostType>((int)leftBuf[D-1]+rightBuf[D-1]+topBuf[D-1]);
1250
if(leftBuf[D-1]<min_cost)
1253
min_cost = leftBuf[D-1];
1258
void SGBM3WayMainLoop::operator () (const Range& range) const
1260
// force separate processing of stripes:
1261
if(range.end>range.start+1)
1263
for(int n=range.start;n<range.end;n++)
1264
(*this)(Range(n,n+1));
1268
const int DISP_SCALE = (1 << StereoMatcher::DISP_SHIFT);
1269
int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
1271
// setting up the ranges:
1272
int src_start_idx = std::max(std::min(range.start * stripe_sz - stripe_overlap, height),0);
1273
int src_end_idx = std::min(range.end * stripe_sz, height);
1277
dst_offset=stripe_overlap;
1281
Mat cur_buffer = buffers [range.start];
1282
Mat cur_disp = dst_disp[range.start];
1283
cur_disp = Scalar(INVALID_DISP_SCALED);
1286
CostType *curCostVolumeLine, *hsumBuf, *pixDiff;
1288
CostType *horPassCostVolume, *vertPassCostVolume, *vertPassMin, *rightPassBuf, *disp2CostBuf;
1290
getBufferPointers(cur_buffer,width,width1,D,img1->channels(),SH2,P2,
1291
curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,horPassCostVolume,
1292
vertPassCostVolume,vertPassMin,rightPassBuf,disp2CostBuf,disp2Buf);
1294
// start real processing:
1295
for(int y=src_start_idx;y<src_end_idx;y++)
1297
getRawMatchingCost(curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,y,src_start_idx);
1299
short* disp_row = (short*)cur_disp.ptr(dst_offset+(y-src_start_idx));
1301
// initialize the auxiliary buffers for the pseudo left-right consistency check:
1302
for(int x=0;x<width;x++)
1304
disp2Buf[x] = (short)INVALID_DISP_SCALED;
1305
disp2CostBuf[x] = SHRT_MAX;
1307
CostType* C = curCostVolumeLine - D;
1308
CostType prev_min, min_cost;
1314
for (int x=D;x<(1+width1)*D;x+=D)
1315
accumulateCostsLeftTop(horPassCostVolume+x,horPassCostVolume+x-D,vertPassCostVolume+x,C+x,prev_min,vertPassMin[x/D],D,P1,P2);
1318
memset(rightPassBuf,0,D*sizeof(CostType));
1320
for (int x=width1*D;x>=D;x-=D)
1322
accumulateCostsRight(rightPassBuf,vertPassCostVolume+x,horPassCostVolume+x,C+x,prev_min,D,P1,P2,best_d,min_cost);
1324
if(uniquenessRatio>0)
1327
horPassCostVolume+=x;
1328
int thresh = (100*min_cost)/(100-uniquenessRatio);
1329
v_int16x8 thresh_reg = v_setall_s16((short)(thresh+1));
1330
v_int16x8 d1 = v_setall_s16((short)(best_d-1));
1331
v_int16x8 d2 = v_setall_s16((short)(best_d+1));
1332
v_int16x8 eight_reg = v_setall_s16(8);
1333
v_int16x8 cur_d(0,1,2,3,4,5,6,7);
1334
v_int16x8 mask,cost1,cost2;
1336
for( d = 0; d < D; d+=16 )
1338
cost1 = v_load_aligned(horPassCostVolume+d);
1339
cost2 = v_load_aligned(horPassCostVolume+d+8);
1341
mask = cost1 < thresh_reg;
1342
mask = mask & ( (cur_d<d1) | (cur_d>d2) );
1343
if( v_signmask(mask) )
1346
cur_d = cur_d+eight_reg;
1348
mask = cost2 < thresh_reg;
1349
mask = mask & ( (cur_d<d1) | (cur_d>d2) );
1350
if( v_signmask(mask) )
1353
cur_d = cur_d+eight_reg;
1355
horPassCostVolume-=x;
1357
for( d = 0; d < D; d++ )
1359
if( horPassCostVolume[x+d]*(100 - uniquenessRatio) < min_cost*100 && std::abs(d - best_d) > 1 )
1368
int _x2 = x/D - 1 + minX1 - d - minD;
1369
if( _x2>=0 && _x2<width && disp2CostBuf[_x2] > min_cost )
1371
disp2CostBuf[_x2] = min_cost;
1372
disp2Buf[_x2] = (short)(d + minD);
1375
if( 0 < d && d < D-1 )
1377
// do subpixel quadratic interpolation:
1378
// fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
1379
// then find minimum of the parabola.
1380
int denom2 = std::max(horPassCostVolume[x+d-1] + horPassCostVolume[x+d+1] - 2*horPassCostVolume[x+d], 1);
1381
d = d*DISP_SCALE + ((horPassCostVolume[x+d-1] - horPassCostVolume[x+d+1])*DISP_SCALE + denom2)/(denom2*2);
1386
disp_row[(x/D)-1 + minX1] = (DispType)(d + minD*DISP_SCALE);
1389
for(int x = minX1; x < maxX1; x++ )
1391
// pseudo LRC consistency check using only one disparity map;
1392
// pixels with difference more than disp12MaxDiff are invalidated
1393
int d1 = disp_row[x];
1394
if( d1 == INVALID_DISP_SCALED )
1396
int _d = d1 >> StereoMatcher::DISP_SHIFT;
1397
int d_ = (d1 + DISP_SCALE-1) >> StereoMatcher::DISP_SHIFT;
1398
int _x = x - _d, x_ = x - d_;
1399
if( 0 <= _x && _x < width && disp2Buf[_x] >= minD && std::abs(disp2Buf[_x] - _d) > disp12MaxDiff &&
1400
0 <= x_ && x_ < width && disp2Buf[x_] >= minD && std::abs(disp2Buf[x_] - d_) > disp12MaxDiff )
1401
disp_row[x] = (short)INVALID_DISP_SCALED;
1406
static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2,
1407
Mat& disp1, const StereoSGBMParams& params,
1408
Mat* buffers, int nstripes )
1410
// precompute a lookup table for the raw matching cost computation:
1411
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
1412
PixType* clipTab = new PixType[TAB_SIZE];
1413
int ftzero = std::max(params.preFilterCap, 15) | 1;
1414
for(int k = 0; k < TAB_SIZE; k++ )
1415
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
1417
// allocate separate dst_disp arrays to avoid conflicts due to stripe overlap:
1418
int stripe_sz = (int)ceil(img1.rows/(double)nstripes);
1419
int stripe_overlap = (params.SADWindowSize/2+1) + (int)ceil(0.1*stripe_sz);
1420
Mat* dst_disp = new Mat[nstripes];
1421
for(int i=0;i<nstripes;i++)
1422
dst_disp[i].create(stripe_sz+stripe_overlap,img1.cols,CV_16S);
1424
parallel_for_(Range(0,nstripes),SGBM3WayMainLoop(buffers,img1,img2,dst_disp,params,clipTab,nstripes,stripe_overlap));
1426
//assemble disp1 from dst_disp:
1429
for(int i=0;i<disp1.rows;i++)
1431
dst_row = (short*)disp1.ptr(i);
1432
src_row = (short*)dst_disp[i/stripe_sz].ptr(stripe_overlap+i%stripe_sz);
1433
memcpy(dst_row,src_row,disp1.cols*sizeof(short));
1440
class StereoSGBMImpl : public StereoSGBM
1445
params = StereoSGBMParams();
1448
StereoSGBMImpl( int _minDisparity, int _numDisparities, int _SADWindowSize,
1449
int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
1450
int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
1453
params = StereoSGBMParams( _minDisparity, _numDisparities, _SADWindowSize,
1454
_P1, _P2, _disp12MaxDiff, _preFilterCap,
1455
_uniquenessRatio, _speckleWindowSize, _speckleRange,
1459
void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr )
1461
Mat left = leftarr.getMat(), right = rightarr.getMat();
1462
CV_Assert( left.size() == right.size() && left.type() == right.type() &&
1463
left.depth() == CV_8U );
1465
disparr.create( left.size(), CV_16S );
1466
Mat disp = disparr.getMat();
1468
if(params.mode==MODE_SGBM_3WAY)
1469
computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes );
1471
computeDisparitySGBM( left, right, disp, params, buffer );
1473
medianBlur(disp, disp, 3);
1475
if( params.speckleWindowSize > 0 )
1476
filterSpeckles(disp, (params.minDisparity - 1)*StereoMatcher::DISP_SCALE, params.speckleWindowSize,
1477
StereoMatcher::DISP_SCALE*params.speckleRange, buffer);
1480
int getMinDisparity() const { return params.minDisparity; }
1481
void setMinDisparity(int minDisparity) { params.minDisparity = minDisparity; }
1483
int getNumDisparities() const { return params.numDisparities; }
1484
void setNumDisparities(int numDisparities) { params.numDisparities = numDisparities; }
1486
int getBlockSize() const { return params.SADWindowSize; }
1487
void setBlockSize(int blockSize) { params.SADWindowSize = blockSize; }
1489
int getSpeckleWindowSize() const { return params.speckleWindowSize; }
1490
void setSpeckleWindowSize(int speckleWindowSize) { params.speckleWindowSize = speckleWindowSize; }
1492
int getSpeckleRange() const { return params.speckleRange; }
1493
void setSpeckleRange(int speckleRange) { params.speckleRange = speckleRange; }
1495
int getDisp12MaxDiff() const { return params.disp12MaxDiff; }
1496
void setDisp12MaxDiff(int disp12MaxDiff) { params.disp12MaxDiff = disp12MaxDiff; }
1498
int getPreFilterCap() const { return params.preFilterCap; }
1499
void setPreFilterCap(int preFilterCap) { params.preFilterCap = preFilterCap; }
1501
int getUniquenessRatio() const { return params.uniquenessRatio; }
1502
void setUniquenessRatio(int uniquenessRatio) { params.uniquenessRatio = uniquenessRatio; }
1504
int getP1() const { return params.P1; }
1505
void setP1(int P1) { params.P1 = P1; }
1507
int getP2() const { return params.P2; }
1508
void setP2(int P2) { params.P2 = P2; }
1510
int getMode() const { return params.mode; }
1511
void setMode(int mode) { params.mode = mode; }
1513
void write(FileStorage& fs) const
1515
fs << "name" << name_
1516
<< "minDisparity" << params.minDisparity
1517
<< "numDisparities" << params.numDisparities
1518
<< "blockSize" << params.SADWindowSize
1519
<< "speckleWindowSize" << params.speckleWindowSize
1520
<< "speckleRange" << params.speckleRange
1521
<< "disp12MaxDiff" << params.disp12MaxDiff
1522
<< "preFilterCap" << params.preFilterCap
1523
<< "uniquenessRatio" << params.uniquenessRatio
1524
<< "P1" << params.P1
1525
<< "P2" << params.P2
1526
<< "mode" << params.mode;
1529
void read(const FileNode& fn)
1531
FileNode n = fn["name"];
1532
CV_Assert( n.isString() && String(n) == name_ );
1533
params.minDisparity = (int)fn["minDisparity"];
1534
params.numDisparities = (int)fn["numDisparities"];
1535
params.SADWindowSize = (int)fn["blockSize"];
1536
params.speckleWindowSize = (int)fn["speckleWindowSize"];
1537
params.speckleRange = (int)fn["speckleRange"];
1538
params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
1539
params.preFilterCap = (int)fn["preFilterCap"];
1540
params.uniquenessRatio = (int)fn["uniquenessRatio"];
1541
params.P1 = (int)fn["P1"];
1542
params.P2 = (int)fn["P2"];
1543
params.mode = (int)fn["mode"];
1546
StereoSGBMParams params;
1549
// the number of stripes is fixed, disregarding the number of threads/processors
1550
// to make the results fully reproducible:
1551
static const int num_stripes = 4;
1552
Mat buffers[num_stripes];
1554
static const char* name_;
1557
const char* StereoSGBMImpl::name_ = "StereoMatcher.SGBM";
1560
Ptr<StereoSGBM> StereoSGBM::create(int minDisparity, int numDisparities, int SADWindowSize,
1561
int P1, int P2, int disp12MaxDiff,
1562
int preFilterCap, int uniquenessRatio,
1563
int speckleWindowSize, int speckleRange,
1566
return Ptr<StereoSGBM>(
1567
new StereoSGBMImpl(minDisparity, numDisparities, SADWindowSize,
1568
P1, P2, disp12MaxDiff,
1569
preFilterCap, uniquenessRatio,
1570
speckleWindowSize, speckleRange,
1574
Rect getValidDisparityROI( Rect roi1, Rect roi2,
1576
int numberOfDisparities,
1579
int SW2 = SADWindowSize/2;
1580
int minD = minDisparity, maxD = minDisparity + numberOfDisparities - 1;
1582
int xmin = std::max(roi1.x, roi2.x + maxD) + SW2;
1583
int xmax = std::min(roi1.x + roi1.width, roi2.x + roi2.width - minD) - SW2;
1584
int ymin = std::max(roi1.y, roi2.y) + SW2;
1585
int ymax = std::min(roi1.y + roi1.height, roi2.y + roi2.height) - SW2;
1587
Rect r(xmin, ymin, xmax - xmin, ymax - ymin);
1589
return r.width > 0 && r.height > 0 ? r : Rect();
1592
typedef cv::Point_<short> Point2s;
1594
template <typename T>
1595
void filterSpecklesImpl(cv::Mat& img, int newVal, int maxSpeckleSize, int maxDiff, cv::Mat& _buf)
1599
int width = img.cols, height = img.rows, npixels = width*height;
1600
size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));
1601
if( !_buf.isContinuous() || _buf.empty() || _buf.cols*_buf.rows*_buf.elemSize() < bufSize )
1602
_buf.create(1, (int)bufSize, CV_8U);
1604
uchar* buf = _buf.ptr();
1605
int i, j, dstep = (int)(img.step/sizeof(T));
1606
int* labels = (int*)buf;
1607
buf += npixels*sizeof(labels[0]);
1608
Point2s* wbuf = (Point2s*)buf;
1609
buf += npixels*sizeof(wbuf[0]);
1610
uchar* rtype = (uchar*)buf;
1613
// clear out label assignments
1614
memset(labels, 0, npixels*sizeof(labels[0]));
1616
for( i = 0; i < height; i++ )
1618
T* ds = img.ptr<T>(i);
1619
int* ls = labels + width*i;
1621
for( j = 0; j < width; j++ )
1623
if( ds[j] != newVal ) // not a bad disparity
1625
if( ls[j] ) // has a label, check for bad label
1627
if( rtype[ls[j]] ) // small region, zero out disparity
1630
// no label, assign and propagate
1633
Point2s* ws = wbuf; // initialize wavefront
1634
Point2s p((short)j, (short)i); // current pixel
1635
curlabel++; // next label
1636
int count = 0; // current region size
1639
// wavefront propagation
1640
while( ws >= wbuf ) // wavefront not empty
1643
// put neighbors onto wavefront
1644
T* dpp = &img.at<T>(p.y, p.x);
1646
int* lpp = labels + width*p.y + p.x;
1648
if( p.y < height-1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff )
1650
lpp[+width] = curlabel;
1651
*ws++ = Point2s(p.x, p.y+1);
1654
if( p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff )
1656
lpp[-width] = curlabel;
1657
*ws++ = Point2s(p.x, p.y-1);
1660
if( p.x < width-1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff )
1663
*ws++ = Point2s(p.x+1, p.y);
1666
if( p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff )
1669
*ws++ = Point2s(p.x-1, p.y);
1672
// pop most recent and propagate
1673
// NB: could try least recent, maybe better convergence
1677
// assign label type
1678
if( count <= maxSpeckleSize ) // speckle region
1680
rtype[ls[j]] = 1; // small region label
1684
rtype[ls[j]] = 0; // large region label
1692
static bool ipp_filterSpeckles(Mat &img, int maxSpeckleSize, int newVal, int maxDiff)
1694
#if IPP_VERSION_X100 >= 810
1695
int type = img.type();
1697
IppiSize roisize = { img.cols, img.rows };
1698
IppDataType datatype = type == CV_8UC1 ? ipp8u : ipp16s;
1699
Ipp8u *pBuffer = NULL;
1700
IppStatus status = ippStsNoErr;
1702
if(ippiMarkSpecklesGetBufferSize(roisize, datatype, CV_MAT_CN(type), &bufsize) < 0)
1705
pBuffer = (Ipp8u*)ippMalloc(bufsize);
1706
if(!pBuffer && bufsize)
1709
if (type == CV_8UC1)
1711
status = ippiMarkSpeckles_8u_C1IR(img.ptr<Ipp8u>(), (int)img.step, roisize,
1712
(Ipp8u)newVal, maxSpeckleSize, (Ipp8u)maxDiff, ippiNormL1, pBuffer);
1716
status = ippiMarkSpeckles_16s_C1IR(img.ptr<Ipp16s>(), (int)img.step, roisize,
1717
(Ipp16s)newVal, maxSpeckleSize, (Ipp16s)maxDiff, ippiNormL1, pBuffer);
1719
if(pBuffer) ippFree(pBuffer);
1724
CV_UNUSED(img); CV_UNUSED(maxSpeckleSize); CV_UNUSED(newVal); CV_UNUSED(maxDiff);
1732
void cv::filterSpeckles( InputOutputArray _img, double _newval, int maxSpeckleSize,
1733
double _maxDiff, InputOutputArray __buf )
1735
Mat img = _img.getMat();
1736
int type = img.type();
1737
Mat temp, &_buf = __buf.needed() ? __buf.getMatRef() : temp;
1738
CV_Assert( type == CV_8UC1 || type == CV_16SC1 );
1740
int newVal = cvRound(_newval), maxDiff = cvRound(_maxDiff);
1742
CV_IPP_RUN(IPP_VERSION_X100 >= 810 && !__buf.needed() && (type == CV_8UC1 || type == CV_16SC1), ipp_filterSpeckles(img, maxSpeckleSize, newVal, maxDiff));
1744
if (type == CV_8UC1)
1745
filterSpecklesImpl<uchar>(img, newVal, maxSpeckleSize, maxDiff, _buf);
1747
filterSpecklesImpl<short>(img, newVal, maxSpeckleSize, maxDiff, _buf);
1750
void cv::validateDisparity( InputOutputArray _disp, InputArray _cost, int minDisparity,
1751
int numberOfDisparities, int disp12MaxDiff )
1753
Mat disp = _disp.getMat(), cost = _cost.getMat();
1754
int cols = disp.cols, rows = disp.rows;
1755
int minD = minDisparity, maxD = minDisparity + numberOfDisparities;
1756
int x, minX1 = std::max(maxD, 0), maxX1 = cols + std::min(minD, 0);
1757
AutoBuffer<int> _disp2buf(cols*2);
1758
int* disp2buf = _disp2buf;
1759
int* disp2cost = disp2buf + cols;
1760
const int DISP_SHIFT = 4, DISP_SCALE = 1 << DISP_SHIFT;
1761
int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
1762
int costType = cost.type();
1764
disp12MaxDiff *= DISP_SCALE;
1766
CV_Assert( numberOfDisparities > 0 && disp.type() == CV_16S &&
1767
(costType == CV_16S || costType == CV_32S) &&
1768
disp.size() == cost.size() );
1770
for( int y = 0; y < rows; y++ )
1772
short* dptr = disp.ptr<short>(y);
1774
for( x = 0; x < cols; x++ )
1776
disp2buf[x] = INVALID_DISP_SCALED;
1777
disp2cost[x] = INT_MAX;
1780
if( costType == CV_16S )
1782
const short* cptr = cost.ptr<short>(y);
1784
for( x = minX1; x < maxX1; x++ )
1786
int d = dptr[x], c = cptr[x];
1788
if( d == INVALID_DISP_SCALED )
1791
int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
1793
if( disp2cost[x2] > c )
1802
const int* cptr = cost.ptr<int>(y);
1804
for( x = minX1; x < maxX1; x++ )
1806
int d = dptr[x], c = cptr[x];
1808
if( d == INVALID_DISP_SCALED )
1811
int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
1813
if( disp2cost[x2] > c )
1821
for( x = minX1; x < maxX1; x++ )
1823
// we round the computed disparity both towards -inf and +inf and check
1824
// if either of the corresponding disparities in disp2 is consistent.
1825
// This is to give the computed disparity a chance to look valid if it is.
1827
if( d == INVALID_DISP_SCALED )
1829
int d0 = d >> DISP_SHIFT;
1830
int d1 = (d + DISP_SCALE-1) >> DISP_SHIFT;
1831
int x0 = x - d0, x1 = x - d1;
1832
if( (0 <= x0 && x0 < cols && disp2buf[x0] > INVALID_DISP_SCALED && std::abs(disp2buf[x0] - d) > disp12MaxDiff) &&
1833
(0 <= x1 && x1 < cols && disp2buf[x1] > INVALID_DISP_SCALED && std::abs(disp2buf[x1] - d) > disp12MaxDiff) )
1834
dptr[x] = (short)INVALID_DISP_SCALED;