/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
43
#include "precomp.hpp"
44
#include "opencl_kernels_imgproc.hpp"
/****************************************************************************************\
*                                    Base Image Filter                                  *
\****************************************************************************************/
#if IPP_VERSION_X100 >= 710
#define USE_IPP_SEP_FILTERS 1
#else
#undef USE_IPP_SEP_FILTERS
#endif

namespace cv
{
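
// The filtering pipeline is built from three primitive interfaces:
//  - BaseRowFilter applies a 1D kernel horizontally to a single row;
//  - BaseColumnFilter applies a 1D kernel vertically across several buffered
//    rows (it may keep internal state, hence reset());
//  - BaseFilter applies an arbitrary 2D kernel to a window of buffered rows.
// FilterEngine combines either a row+column pair (the separable case) or a
// single BaseFilter (the non-separable case) with border handling and row
// buffering.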

BaseRowFilter::BaseRowFilter() { ksize = anchor = -1; }
BaseRowFilter::~BaseRowFilter() {}

BaseColumnFilter::BaseColumnFilter() { ksize = anchor = -1; }
BaseColumnFilter::~BaseColumnFilter() {}
void BaseColumnFilter::reset() {}

BaseFilter::BaseFilter() { ksize = Size(-1,-1); anchor = Point(-1,-1); }
BaseFilter::~BaseFilter() {}
void BaseFilter::reset() {}

FilterEngine::FilterEngine()
{
    srcType = dstType = bufType = -1;
    rowBorderType = columnBorderType = BORDER_REPLICATE;
    bufStep = startY = startY0 = endY = rowCount = dstY = 0;
    maxWidth = 0;

    wholeSize = Size(-1,-1);
}

FilterEngine::FilterEngine( const Ptr<BaseFilter>& _filter2D,
                            const Ptr<BaseRowFilter>& _rowFilter,
                            const Ptr<BaseColumnFilter>& _columnFilter,
                            int _srcType, int _dstType, int _bufType,
                            int _rowBorderType, int _columnBorderType,
                            const Scalar& _borderValue )
{
    init(_filter2D, _rowFilter, _columnFilter, _srcType, _dstType, _bufType,
         _rowBorderType, _columnBorderType, _borderValue);
}

FilterEngine::~FilterEngine()
{
}

void FilterEngine::init( const Ptr<BaseFilter>& _filter2D,
                         const Ptr<BaseRowFilter>& _rowFilter,
                         const Ptr<BaseColumnFilter>& _columnFilter,
                         int _srcType, int _dstType, int _bufType,
                         int _rowBorderType, int _columnBorderType,
                         const Scalar& _borderValue )
{
    _srcType = CV_MAT_TYPE(_srcType);
    _bufType = CV_MAT_TYPE(_bufType);
    _dstType = CV_MAT_TYPE(_dstType);

    srcType = _srcType;
    int srcElemSize = (int)getElemSize(srcType);
    dstType = _dstType;
    bufType = _bufType;

    filter2D = _filter2D;
    rowFilter = _rowFilter;
    columnFilter = _columnFilter;

    if( _columnBorderType < 0 )
        _columnBorderType = _rowBorderType;

    rowBorderType = _rowBorderType;
    columnBorderType = _columnBorderType;

    CV_Assert( columnBorderType != BORDER_WRAP );

    if( isSeparable() )
    {
        CV_Assert( rowFilter && columnFilter );
        ksize = Size(rowFilter->ksize, columnFilter->ksize);
        anchor = Point(rowFilter->anchor, columnFilter->anchor);
    }
    else
    {
        CV_Assert( bufType == srcType );
        ksize = filter2D->ksize;
        anchor = filter2D->anchor;
    }

    CV_Assert( 0 <= anchor.x && anchor.x < ksize.width &&
               0 <= anchor.y && anchor.y < ksize.height );

    borderElemSize = srcElemSize/(CV_MAT_DEPTH(srcType) >= CV_32S ? sizeof(int) : 1);
    int borderLength = std::max(ksize.width - 1, 1);
    borderTab.resize(borderLength*borderElemSize);

    maxWidth = bufStep = 0;
    constBorderRow.clear();

    if( rowBorderType == BORDER_CONSTANT || columnBorderType == BORDER_CONSTANT )
    {
        constBorderValue.resize(srcElemSize*borderLength);
        int srcType1 = CV_MAKETYPE(CV_MAT_DEPTH(srcType), MIN(CV_MAT_CN(srcType), 4));
        scalarToRawData(_borderValue, &constBorderValue[0], srcType1,
                        borderLength*CV_MAT_CN(srcType));
    }

    wholeSize = Size(-1,-1);
}

#define VEC_ALIGN CV_MALLOC_ALIGN
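
// start() computes the buffer geometry for a given ROI inside the whole image:
//  - "rows" is a ring buffer of row pointers; it must hold at least enough
//    rows to cover the vertical kernel extent around the anchor, which is
//    what the std::max() clamp on _maxBufRows enforces;
//  - dx1/dx2 count the border pixels to synthesize on the left/right of each
//    row; for non-constant borders the source offsets are precomputed once
//    into borderTab, so per-row border handling is a simple gather loop.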

int FilterEngine::start(Size _wholeSize, Rect _roi, int _maxBufRows)
{
    int i, j;

    wholeSize = _wholeSize;
    roi = _roi;
    CV_Assert( roi.x >= 0 && roi.y >= 0 && roi.width >= 0 && roi.height >= 0 &&
               roi.x + roi.width <= wholeSize.width &&
               roi.y + roi.height <= wholeSize.height );

    int esz = (int)getElemSize(srcType);
    int bufElemSize = (int)getElemSize(bufType);
    const uchar* constVal = !constBorderValue.empty() ? &constBorderValue[0] : 0;

    if( _maxBufRows < 0 )
        _maxBufRows = ksize.height + 3;
    _maxBufRows = std::max(_maxBufRows, std::max(anchor.y, ksize.height-anchor.y-1)*2+1);

    if( maxWidth < roi.width || _maxBufRows != (int)rows.size() )
    {
        rows.resize(_maxBufRows);
        maxWidth = std::max(maxWidth, roi.width);
        int cn = CV_MAT_CN(srcType);
        srcRow.resize(esz*(maxWidth + ksize.width - 1));
        if( columnBorderType == BORDER_CONSTANT )
        {
            constBorderRow.resize(getElemSize(bufType)*(maxWidth + ksize.width - 1 + VEC_ALIGN));
            uchar *dst = alignPtr(&constBorderRow[0], VEC_ALIGN), *tdst;
            int n = (int)constBorderValue.size(), N;
            N = (maxWidth + ksize.width - 1)*esz;
            tdst = isSeparable() ? &srcRow[0] : dst;

            for( i = 0; i < N; i += n )
            {
                n = std::min( n, N - i );
                for(j = 0; j < n; j++)
                    tdst[i+j] = constVal[j];
            }

            if( isSeparable() )
                (*rowFilter)(&srcRow[0], dst, maxWidth, cn);
        }

        int maxBufStep = bufElemSize*(int)alignSize(maxWidth +
            (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN);
        ringBuf.resize(maxBufStep*rows.size()+VEC_ALIGN);
    }

    // adjust bufstep so that the used part of the ring buffer stays compact in memory
    bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),16);

    dx1 = std::max(anchor.x - roi.x, 0);
    dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0);

    // recompute border tables
    if( dx1 > 0 || dx2 > 0 )
    {
        if( rowBorderType == BORDER_CONSTANT )
        {
            int nr = isSeparable() ? 1 : (int)rows.size();
            for( i = 0; i < nr; i++ )
            {
                uchar* dst = isSeparable() ? &srcRow[0] : alignPtr(&ringBuf[0],VEC_ALIGN) + bufStep*i;
                memcpy( dst, constVal, dx1*esz );
                memcpy( dst + (roi.width + ksize.width - 1 - dx2)*esz, constVal, dx2*esz );
            }
        }
        else
        {
            int xofs1 = std::min(roi.x, anchor.x) - roi.x;

            int btab_esz = borderElemSize, wholeWidth = wholeSize.width;
            int* btab = (int*)&borderTab[0];

            for( i = 0; i < dx1; i++ )
            {
                int p0 = (borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*btab_esz;
                for( j = 0; j < btab_esz; j++ )
                    btab[i*btab_esz + j] = p0 + j;
            }

            for( i = 0; i < dx2; i++ )
            {
                int p0 = (borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*btab_esz;
                for( j = 0; j < btab_esz; j++ )
                    btab[(i + dx1)*btab_esz + j] = p0 + j;
            }
        }
    }

    rowCount = dstY = 0;
    startY = startY0 = std::max(roi.y - anchor.y, 0);
    endY = std::min(roi.y + roi.height + ksize.height - anchor.y - 1, wholeSize.height);
    if( columnFilter )
        columnFilter->reset();
    if( filter2D )
        filter2D->reset();

    return startY;
}
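
// The Mat-based overload maps the ROI to coordinates of the parent image via
// locateROI() (unless "isolated" is set), so border pixels can be taken from
// real pixels outside the ROI rather than extrapolated.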

int FilterEngine::start(const Mat& src, const Rect& _srcRoi,
                        bool isolated, int maxBufRows)
{
    Rect srcRoi = _srcRoi;

    if( srcRoi == Rect(0,0,-1,-1) )
        srcRoi = Rect(0,0,src.cols,src.rows);

    CV_Assert( srcRoi.x >= 0 && srcRoi.y >= 0 &&
               srcRoi.width >= 0 && srcRoi.height >= 0 &&
               srcRoi.x + srcRoi.width <= src.cols &&
               srcRoi.y + srcRoi.height <= src.rows );

    Point ofs;
    Size wsz(src.cols, src.rows);
    if( !isolated )
        src.locateROI( wsz, ofs );
    start( wsz, srcRoi + ofs, maxBufRows );

    return startY - ofs.y;
}

int FilterEngine::remainingInputRows() const
{
    return endY - startY - rowCount;
}

int FilterEngine::remainingOutputRows() const
{
    return roi.height - dstY;
}
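
// proceed() is the streaming core: it consumes up to "count" input rows,
// pushes each through the row filter into the ring buffer, and, whenever
// enough rows are buffered to cover the vertical kernel, emits output rows
// through the column filter (or the 2D filter). It may be called repeatedly
// on consecutive chunks of rows; apply() below feeds it the whole ROI at once.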

int FilterEngine::proceed( const uchar* src, int srcstep, int count,
                           uchar* dst, int dststep )
{
    CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 );

    const int *btab = &borderTab[0];
    int esz = (int)getElemSize(srcType), btab_esz = borderElemSize;
    uchar** brows = &rows[0];
    int bufRows = (int)rows.size();
    int cn = CV_MAT_CN(bufType);
    int width = roi.width, kwidth = ksize.width;
    int kheight = ksize.height, ay = anchor.y;
    int _dx1 = dx1, _dx2 = dx2;
    int width1 = roi.width + kwidth - 1;
    int xofs1 = std::min(roi.x, anchor.x);
    bool isSep = isSeparable();
    bool makeBorder = (_dx1 > 0 || _dx2 > 0) && rowBorderType != BORDER_CONSTANT;
    int dy = 0, i = 0;

    src -= xofs1*esz;
    count = std::min(count, remainingInputRows());

    CV_Assert( src && dst && count > 0 );

    for(;; dst += dststep*i, dy += i)
    {
        int dcount = bufRows - ay - startY - rowCount + roi.y;
        dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
        dcount = std::min(dcount, count);
        count -= dcount;
        for( ; dcount-- > 0; src += srcstep )
        {
            int bi = (startY - startY0 + rowCount) % bufRows;
            uchar* brow = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
            uchar* row = isSep ? &srcRow[0] : brow;

            if( ++rowCount > bufRows )
            {
                --rowCount;
                ++startY;
            }

            memcpy( row + _dx1*esz, src, (width1 - _dx2 - _dx1)*esz );

            if( makeBorder )
            {
                if( btab_esz*(int)sizeof(int) == esz )
                {
                    const int* isrc = (const int*)src;
                    int* irow = (int*)row;

                    for( i = 0; i < _dx1*btab_esz; i++ )
                        irow[i] = isrc[btab[i]];
                    for( i = 0; i < _dx2*btab_esz; i++ )
                        irow[i + (width1 - _dx2)*btab_esz] = isrc[btab[i+_dx1*btab_esz]];
                }
                else
                {
                    for( i = 0; i < _dx1*esz; i++ )
                        row[i] = src[btab[i]];
                    for( i = 0; i < _dx2*esz; i++ )
                        row[i + (width1 - _dx2)*esz] = src[btab[i+_dx1*esz]];
                }
            }

            if( isSep )
                (*rowFilter)(row, brow, width, CV_MAT_CN(srcType));
        }

        int max_i = std::min(bufRows, roi.height - (dstY + dy) + (kheight - 1));
        for( i = 0; i < max_i; i++ )
        {
            int srcY = borderInterpolate(dstY + dy + i + roi.y - ay,
                            wholeSize.height, columnBorderType);
            if( srcY < 0 ) // can happen only with constant border type
                brows[i] = alignPtr(&constBorderRow[0], VEC_ALIGN);
            else
            {
                CV_Assert( srcY >= startY );
                if( srcY >= startY + rowCount )
                    break;
                int bi = (srcY - startY0) % bufRows;
                brows[i] = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
            }
        }
        if( i < kheight )
            break;
        i -= kheight - 1;
        if( isSeparable() )
            (*columnFilter)((const uchar**)brows, dst, dststep, i, roi.width*cn);
        else
            (*filter2D)((const uchar**)brows, dst, dststep, i, roi.width, cn);
    }

    dstY += dy;
    CV_Assert( dstY <= roi.height );

    return dy;
}
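
// apply() ties start() and proceed() together for the common single-pass
// case. A minimal usage sketch (kernel setup is illustrative, not taken from
// this file):
//
//     Ptr<FilterEngine> f = createSeparableLinearFilter(CV_8UC1, CV_8UC1,
//                                                       rowKernel, columnKernel);
//     f->apply(src, dst);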

void FilterEngine::apply(const Mat& src, Mat& dst,
    const Rect& _srcRoi, Point dstOfs, bool isolated)
{
    CV_Assert( src.type() == srcType && dst.type() == dstType );

    Rect srcRoi = _srcRoi;
    if( srcRoi == Rect(0,0,-1,-1) )
        srcRoi = Rect(0,0,src.cols,src.rows);

    if( srcRoi.area() == 0 )
        return;

    CV_Assert( dstOfs.x >= 0 && dstOfs.y >= 0 &&
        dstOfs.x + srcRoi.width <= dst.cols &&
        dstOfs.y + srcRoi.height <= dst.rows );

    int y = start(src, srcRoi, isolated);
    proceed( src.ptr() + y*src.step + srcRoi.x*src.elemSize(),
             (int)src.step, endY - startY,
             dst.ptr(dstOfs.y) +
             dstOfs.x*dst.elemSize(), (int)dst.step );
}

}

/****************************************************************************************\
*                                 Separable linear filter                                *
\****************************************************************************************/

int cv::getKernelType(InputArray filter_kernel, Point anchor)
{
    Mat _kernel = filter_kernel.getMat();
    CV_Assert( _kernel.channels() == 1 );
    int i, sz = _kernel.rows*_kernel.cols;

    Mat kernel;
    _kernel.convertTo(kernel, CV_64F);

    const double* coeffs = kernel.ptr<double>();
    double sum = 0;
    int type = KERNEL_SMOOTH + KERNEL_INTEGER;
    if( (_kernel.rows == 1 || _kernel.cols == 1) &&
        anchor.x*2 + 1 == _kernel.cols &&
        anchor.y*2 + 1 == _kernel.rows )
        type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL);

    for( i = 0; i < sz; i++ )
    {
        double a = coeffs[i], b = coeffs[sz - i - 1];
        if( a != b )
            type &= ~KERNEL_SYMMETRICAL;
        if( a != -b )
            type &= ~KERNEL_ASYMMETRICAL;
        if( a < 0 )
            type &= ~KERNEL_SMOOTH;
        if( a != saturate_cast<int>(a) )
            type &= ~KERNEL_INTEGER;
        sum += a;
    }

    if( fabs(sum - 1) > FLT_EPSILON*(fabs(sum) + 1) )
        type &= ~KERNEL_SMOOTH;
    return type;
}
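
// The KERNEL_* flags computed above drive the dispatch to the specialized
// loops below: symmetric/asymmetric small-kernel paths, integer fixed-point
// paths, and the "smooth" (non-negative, sum == 1) case. The *NoVec structs
// are no-op fallbacks: by reporting 0 processed pixels they make the scalar
// code of the corresponding filter do all the work on platforms without the
// matching SIMD extension.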

namespace cv
{

struct RowNoVec
{
    RowNoVec() {}
    RowNoVec(const Mat&) {}
    int operator()(const uchar*, uchar*, int, int) const { return 0; }
};

struct ColumnNoVec
{
    ColumnNoVec() {}
    ColumnNoVec(const Mat&, int, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};

struct SymmRowSmallNoVec
{
    SymmRowSmallNoVec() {}
    SymmRowSmallNoVec(const Mat&, int) {}
    int operator()(const uchar*, uchar*, int, int) const { return 0; }
};

struct SymmColumnSmallNoVec
{
    SymmColumnSmallNoVec() {}
    SymmColumnSmallNoVec(const Mat&, int, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};

struct FilterNoVec
{
    FilterNoVec() {}
    FilterNoVec(const Mat&, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};


#if CV_SSE2

///////////////////////////////////// 8u-16s & 8u-8u //////////////////////////////////

struct RowVec_8u32s
{
    RowVec_8u32s() { smallValues = false; }
    RowVec_8u32s( const Mat& _kernel )
    {
        kernel = _kernel;
        smallValues = true;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
        {
            int v = kernel.ptr<int>()[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
            {
                smallValues = false;
                break;
            }
        }
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        const int* _kx = kernel.ptr<int>();
        width *= cn;

        if( smallValues )
        {
            for( ; i <= width - 16; i += 16 )
            {
                const uchar* src = _src + i;
                __m128i f, z = _mm_setzero_si128(), s0 = z, s1 = z, s2 = z, s3 = z;
                __m128i x0, x1, x2, x3;

                for( k = 0; k < _ksize; k++, src += cn )
                {
                    f = _mm_cvtsi32_si128(_kx[k]);
                    f = _mm_shuffle_epi32(f, 0);
                    f = _mm_packs_epi32(f, f);

                    x0 = _mm_loadu_si128((const __m128i*)src);
                    x2 = _mm_unpackhi_epi8(x0, z);
                    x0 = _mm_unpacklo_epi8(x0, z);
                    x1 = _mm_mulhi_epi16(x0, f);
                    x3 = _mm_mulhi_epi16(x2, f);
                    x0 = _mm_mullo_epi16(x0, f);
                    x2 = _mm_mullo_epi16(x2, f);

                    s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
                    s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(x0, x1));
                    s2 = _mm_add_epi32(s2, _mm_unpacklo_epi16(x2, x3));
                    s3 = _mm_add_epi32(s3, _mm_unpackhi_epi16(x2, x3));
                }

                _mm_store_si128((__m128i*)(dst + i), s0);
                _mm_store_si128((__m128i*)(dst + i + 4), s1);
                _mm_store_si128((__m128i*)(dst + i + 8), s2);
                _mm_store_si128((__m128i*)(dst + i + 12), s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                const uchar* src = _src + i;
                __m128i f, z = _mm_setzero_si128(), s0 = z, x0, x1;

                for( k = 0; k < _ksize; k++, src += cn )
                {
                    f = _mm_cvtsi32_si128(_kx[k]);
                    f = _mm_shuffle_epi32(f, 0);
                    f = _mm_packs_epi32(f, f);

                    x0 = _mm_cvtsi32_si128(*(const int*)src);
                    x0 = _mm_unpacklo_epi8(x0, z);
                    x1 = _mm_mulhi_epi16(x0, f);
                    x0 = _mm_mullo_epi16(x0, f);
                    s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
                }
                _mm_store_si128((__m128i*)(dst + i), s0);
            }
        }
        return i;
    }

    Mat kernel;
    bool smallValues;
};
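
// Note on RowVec_8u32s: the kernel must fit in 16 bits (see the constructor
// check against SHRT_MIN/SHRT_MAX), because each product is formed as a
// mullo/mulhi pair of 16-bit multiplies whose halves are recombined with
// unpacklo/unpackhi into 32-bit accumulators.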

struct SymmRowSmallVec_8u32s
{
    SymmRowSmallVec_8u32s() { smallValues = false; }
    SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
    {
        kernel = _kernel;
        symmetryType = _symmetryType;
        smallValues = true;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
        {
            int v = kernel.ptr<int>()[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
            {
                smallValues = false;
                break;
            }
        }
    }

    int operator()(const uchar* src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int* kx = kernel.ptr<int>() + _ksize/2;
        if( !smallValues )
            return 0;

        src += (_ksize/2)*cn;
        width *= cn;

        __m128i z = _mm_setzero_si128();
        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            if( _ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2));
                        y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2));
                        _mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z));
                    }
                else if( kx[0] == -2 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
                        y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                            k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
                    k0 = _mm_packs_epi32(k0, k0);
                    k1 = _mm_packs_epi32(k1, k1);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                        x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);

                        t1 = _mm_mulhi_epi16(x1, k0);
                        t0 = _mm_mullo_epi16(x1, k0);
                        x2 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(t0, t1);
                        z1 = _mm_unpackhi_epi16(t0, t1);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));

                        t1 = _mm_mulhi_epi16(y1, k0);
                        t0 = _mm_mullo_epi16(y1, k0);
                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(t0, t1);
                        z3 = _mm_unpackhi_epi16(t0, t1);
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn*2));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
                        y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                            k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
                            k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
                    k0 = _mm_packs_epi32(k0, k0);
                    k1 = _mm_packs_epi32(k1, k1);
                    k2 = _mm_packs_epi32(k2, k2);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                        x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);

                        t1 = _mm_mulhi_epi16(x1, k0);
                        t0 = _mm_mullo_epi16(x1, k0);
                        x2 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(t0, t1);
                        z1 = _mm_unpackhi_epi16(t0, t1);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));

                        t1 = _mm_mulhi_epi16(y1, k0);
                        t0 = _mm_mullo_epi16(y1, k0);
                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(t0, t1);
                        z3 = _mm_unpackhi_epi16(t0, t1);
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                        x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
                        x1 = _mm_loadu_si128((__m128i*)(src + cn*2));
                        y1 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        y0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                        t1 = _mm_mulhi_epi16(y0, k2);
                        t0 = _mm_mullo_epi16(y0, k2);
                        y0 = _mm_mullo_epi16(y1, k2);
                        y1 = _mm_mulhi_epi16(y1, k2);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
        }
        else
        {
            if( _ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, y0;
                        x0 = _mm_loadu_si128((__m128i*)(src + cn));
                        x1 = _mm_loadu_si128((__m128i*)(src - cn));
                        y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
                    k1 = _mm_packs_epi32(k1, k1);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, y0, y1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src + cn));
                        x1 = _mm_loadu_si128((__m128i*)(src - cn));
                        y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                        x1 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(x0, x1);
                        z1 = _mm_unpackhi_epi16(x0, x1);

                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(y0, y1);
                        z3 = _mm_unpackhi_epi16(y0, y1);
                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                        k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
                        k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
                k0 = _mm_packs_epi32(k0, k0);
                k1 = _mm_packs_epi32(k1, k1);
                k2 = _mm_packs_epi32(k2, k2);

                for( ; i <= width - 16; i += 16, src += 16 )
                {
                    __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                    x0 = _mm_loadu_si128((__m128i*)(src + cn));
                    x2 = _mm_loadu_si128((__m128i*)(src - cn));
                    y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                    x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));

                    x2 = _mm_mulhi_epi16(x0, k1);
                    x0 = _mm_mullo_epi16(x0, k1);
                    z0 = _mm_unpacklo_epi16(x0, x2);
                    z1 = _mm_unpackhi_epi16(x0, x2);
                    y1 = _mm_mulhi_epi16(y0, k1);
                    y0 = _mm_mullo_epi16(y0, k1);
                    z2 = _mm_unpacklo_epi16(y0, y1);
                    z3 = _mm_unpackhi_epi16(y0, y1);

                    x0 = _mm_loadu_si128((__m128i*)(src + cn*2));
                    x1 = _mm_loadu_si128((__m128i*)(src - cn*2));
                    y1 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                    y0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                    t1 = _mm_mulhi_epi16(y0, k2);
                    t0 = _mm_mullo_epi16(y0, k2);
                    y0 = _mm_mullo_epi16(y1, k2);
                    y1 = _mm_mulhi_epi16(y1, k2);
                    z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
                    z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
                    z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                    z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                    _mm_store_si128((__m128i*)(dst + i), z0);
                    _mm_store_si128((__m128i*)(dst + i + 4), z1);
                    _mm_store_si128((__m128i*)(dst + i + 8), z2);
                    _mm_store_si128((__m128i*)(dst + i + 12), z3);
                }
            }
        }

        src -= (_ksize/2)*cn;
        kx -= _ksize/2;
        for( ; i <= width - 4; i += 4, src += 4 )
        {
            __m128i f, s0 = z, x0, x1;

            for( k = j = 0; k < _ksize; k++, j += cn )
            {
                f = _mm_cvtsi32_si128(kx[k]);
                f = _mm_shuffle_epi32(f, 0);
                f = _mm_packs_epi32(f, f);

                x0 = _mm_cvtsi32_si128(*(const int*)(src + j));
                x0 = _mm_unpacklo_epi8(x0, z);
                x1 = _mm_mulhi_epi16(x0, f);
                x0 = _mm_mullo_epi16(x0, f);
                s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
            }
            _mm_store_si128((__m128i*)(dst + i), s0);
        }

        return i;
    }

    Mat kernel;
    int symmetryType;
    bool smallValues;
};
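
// SymmRowSmallVec_8u32s specializes 3- and 5-tap symmetric/antisymmetric
// kernels, with dedicated branches for the classic smoothing and derivative
// coefficient patterns (1,2,1), (1,-2,1), (-1,0,1) and (1,0,-2,0,1); any other
// coefficients fall through to the generic multiply-accumulate tail loop at
// the bottom of operator().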

struct SymmColumnVec_32s8u
{
    SymmColumnVec_32s8u() { symmetryType=0; }
    SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const __m128i *S, *S2;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128i x0, x1;
                S = (const __m128i*)(src[0] + i);
                s0 = _mm_cvtepi32_ps(_mm_load_si128(S));
                s1 = _mm_cvtepi32_ps(_mm_load_si128(S+1));
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_cvtepi32_ps(_mm_load_si128(S+2));
                s3 = _mm_cvtepi32_ps(_mm_load_si128(S+3));
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    x1 = _mm_add_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                    x0 = _mm_add_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
                    x1 = _mm_add_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                }

                x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
                x0 = _mm_packus_epi16(x0, x1);
                _mm_storeu_si128((__m128i*)(dst + i), x0);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128i x0;
                __m128 s0 = _mm_cvtepi32_ps(_mm_load_si128((const __m128i*)(src[0] + i)));
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                }

                x0 = _mm_cvtps_epi32(s0);
                x0 = _mm_packs_epi32(x0, x0);
                x0 = _mm_packus_epi16(x0, x0);
                *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
            }
        }
        else
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
                __m128i x0, x1;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    x1 = _mm_sub_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                    x0 = _mm_sub_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
                    x1 = _mm_sub_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                }

                x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
                x0 = _mm_packus_epi16(x0, x1);
                _mm_storeu_si128((__m128i*)(dst + i), x0);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f, s0 = d4;
                __m128i x0;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                }

                x0 = _mm_cvtps_epi32(s0);
                x0 = _mm_packs_epi32(x0, x0);
                x0 = _mm_packus_epi16(x0, x0);
                *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};
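
// The column-filter vectors convert the 32-bit fixed-point row sums back to
// the destination type: the kernel was rescaled by 1/(1 << bits) into floats
// in the constructor, so the inner loops are plain float multiply-accumulate;
// SymmColumnVec_32s8u finishes with packs/packus saturation to 8 bits.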

struct SymmColumnSmallVec_32s16s
{
    SymmColumnSmallVec_32s16s() { symmetryType=0; }
    SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
        short* dst = (short*)_dst;
        __m128 df4 = _mm_set1_ps(delta);
        __m128i d4 = _mm_cvtps_epi32(df4);

        if( symmetrical )
        {
            if( ky[0] == 2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_si128((__m128i*)(S0 + i));
                    s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S1 + i));
                    s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
                    s4 = _mm_load_si128((__m128i*)(S2 + i));
                    s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s0 = _mm_add_epi32(s0, _mm_add_epi32(s4, _mm_add_epi32(s2, s2)));
                    s1 = _mm_add_epi32(s1, _mm_add_epi32(s5, _mm_add_epi32(s3, s3)));
                    s0 = _mm_add_epi32(s0, d4);
                    s1 = _mm_add_epi32(s1, d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else if( ky[0] == -2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_si128((__m128i*)(S0 + i));
                    s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S1 + i));
                    s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
                    s4 = _mm_load_si128((__m128i*)(S2 + i));
                    s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s0 = _mm_add_epi32(s0, _mm_sub_epi32(s4, _mm_add_epi32(s2, s2)));
                    s1 = _mm_add_epi32(s1, _mm_sub_epi32(s5, _mm_add_epi32(s3, s3)));
                    s0 = _mm_add_epi32(s0, d4);
                    s1 = _mm_add_epi32(s1, d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else
            {
                __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1;
                    __m128i x0, x1;
                    s0 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i)));
                    s1 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i + 4)));
                    s0 = _mm_add_ps(_mm_mul_ps(s0, k0), df4);
                    s1 = _mm_add_ps(_mm_mul_ps(s1, k0), df4);

                    x0 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i)),
                                       _mm_load_si128((__m128i*)(S2 + i)));
                    x1 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
                                       _mm_load_si128((__m128i*)(S2 + i + 4)));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
                    x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                    _mm_storeu_si128((__m128i*)(dst + i), x0);
                }
            }
        }
        else
        {
            if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
            {
                if( ky[1] < 0 )
                    std::swap(S0, S2);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3;
                    s0 = _mm_load_si128((__m128i*)(S2 + i));
                    s1 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S0 + i));
                    s3 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s0 = _mm_add_epi32(_mm_sub_epi32(s0, s2), d4);
                    s1 = _mm_add_epi32(_mm_sub_epi32(s1, s3), d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else
            {
                __m128 k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0 = df4, s1 = df4;
                    __m128i x0, x1;
                    x0 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i)),
                                       _mm_load_si128((__m128i*)(S0 + i)));
                    x1 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i + 4)),
                                       _mm_load_si128((__m128i*)(S0 + i + 4)));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
                    x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                    _mm_storeu_si128((__m128i*)(dst + i), x0);
                }
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};

/////////////////////////////////////// 16s //////////////////////////////////

struct RowVec_16s32f
{
    RowVec_16s32f() {}
    RowVec_16s32f( const Mat& _kernel )
    {
        kernel = _kernel;
        sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        if( !sse2_supported )
            return 0;

        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
        float* dst = (float*)_dst;
        const float* _kx = kernel.ptr<float>();
        width *= cn;

        for( ; i <= width - 8; i += 8 )
        {
            const short* src = (const short*)_src + i;
            __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
            for( k = 0; k < _ksize; k++, src += cn )
            {
                f = _mm_load_ss(_kx+k);
                f = _mm_shuffle_ps(f, f, 0);

                __m128i x0i = _mm_loadu_si128((const __m128i*)src);
                __m128i x1i = _mm_srai_epi32(_mm_unpackhi_epi16(x0i, x0i), 16);
                x0i = _mm_srai_epi32(_mm_unpacklo_epi16(x0i, x0i), 16);
                x0 = _mm_cvtepi32_ps(x0i);
                x1 = _mm_cvtepi32_ps(x1i);
                s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
            }
            _mm_store_ps(dst + i, s0);
            _mm_store_ps(dst + i + 4, s1);
        }
        return i;
    }

    Mat kernel;
    bool sse2_supported;
};

struct SymmColumnVec_32f16s
{
    SymmColumnVec_32f16s() { symmetryType=0; }
    SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
    {
        symmetryType = _symmetryType;
        kernel = _kernel;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
        sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !sse2_supported )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S, *S2;
        short* dst = (short*)_dst;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128 x0, x1;
                S = src[0] + i;
                s0 = _mm_load_ps(S);
                s1 = _mm_load_ps(S+4);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_load_ps(S+8);
                s3 = _mm_load_ps(S+12);
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                __m128i s0i = _mm_cvtps_epi32(s0);
                __m128i s1i = _mm_cvtps_epi32(s1);
                __m128i s2i = _mm_cvtps_epi32(s2);
                __m128i s3i = _mm_cvtps_epi32(s3);

                _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
                _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 x0, s0 = _mm_load_ps(src[0] + i);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);

                    x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }

                __m128i s0i = _mm_cvtps_epi32(s0);
                _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
            }
        }
        else
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
                __m128 x0, x1;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                __m128i s0i = _mm_cvtps_epi32(s0);
                __m128i s1i = _mm_cvtps_epi32(s1);
                __m128i s2i = _mm_cvtps_epi32(s2);
                __m128i s3i = _mm_cvtps_epi32(s3);

                _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
                _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f, x0, s0 = d4;

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }

                __m128i s0i = _mm_cvtps_epi32(s0);
                _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
    bool sse2_supported;
};

/////////////////////////////////////// 32f //////////////////////////////////

struct RowVec_32f
{
    RowVec_32f()
    {
        haveSSE = checkHardwareSupport(CV_CPU_SSE);
#if defined USE_IPP_SEP_FILTERS && IPP_DISABLE_BLOCK
        bufsz = -1;
#endif
    }

    RowVec_32f( const Mat& _kernel )
    {
        kernel = _kernel;
        haveSSE = checkHardwareSupport(CV_CPU_SSE);
#if defined USE_IPP_SEP_FILTERS && IPP_DISABLE_BLOCK
        bufsz = -1;
#endif
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
#if defined USE_IPP_SEP_FILTERS && IPP_DISABLE_BLOCK
        CV_IPP_CHECK()
        {
            int ret = ippiOperator(_src, _dst, width, cn);
            if( ret > 0 )
                return ret;
        }
#endif
        int _ksize = kernel.rows + kernel.cols - 1;
        const float* src0 = (const float*)_src;
        float* dst = (float*)_dst;
        const float* _kx = kernel.ptr<float>();

        if( !haveSSE )
            return 0;

        int i = 0, k;
        for( ; i <= width - 8; i += 8 )
        {
            const float* src = src0 + i;
            __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
            for( k = 0; k < _ksize; k++, src += cn )
            {
                f = _mm_load_ss(_kx+k);
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_loadu_ps(src);
                x1 = _mm_loadu_ps(src + 4);
                s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
            }
            _mm_store_ps(dst + i, s0);
            _mm_store_ps(dst + i + 4, s1);
        }
        return i;
    }

    Mat kernel;
    bool haveSSE;

#if defined USE_IPP_SEP_FILTERS && IPP_DISABLE_BLOCK
private:
    mutable int bufsz;
    int ippiOperator(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        int _ksize = kernel.rows + kernel.cols - 1;
        if ((1 != cn && 3 != cn) || width < _ksize*8)
            return 0;

        const float* src = (const float*)_src;
        float* dst = (float*)_dst;
        const float* _kx = (const float*)kernel.data;

        IppiSize roisz = { width, 1 };
        if( bufsz < 0 )
        {
            if( (cn == 1 && ippiFilterRowBorderPipelineGetBufferSize_32f_C1R(roisz, _ksize, &bufsz) < 0) ||
                (cn == 3 && ippiFilterRowBorderPipelineGetBufferSize_32f_C3R(roisz, _ksize, &bufsz) < 0))
                return 0;
        }
        AutoBuffer<uchar> buf(bufsz + 64);
        uchar* bufptr = alignPtr((uchar*)buf, 32);
        int step = (int)(width*sizeof(dst[0])*cn);
        float borderValue[] = {0.f, 0.f, 0.f};
        // here is the trick. IPP needs border type and extrapolates the row. We did it already.
        // So we pass anchor=0 and ignore the right tail of results since they are incorrect there.
        if( (cn == 1 && ippiFilterRowBorderPipeline_32f_C1R(src, step, &dst, roisz, _kx, _ksize, 0,
                                                            ippBorderRepl, borderValue[0], bufptr) < 0) ||
            (cn == 3 && ippiFilterRowBorderPipeline_32f_C3R(src, step, &dst, roisz, _kx, _ksize, 0,
                                                            ippBorderRepl, borderValue, bufptr) < 0))
        {
            setIppErrorStatus();
            return 0;
        }
        CV_IMPL_ADD(CV_IMPL_IPP);
        return width - _ksize + 1;
    }
#endif
};
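
// The optional IPP branch above replaces the generic SSE loop with
// ippiFilterRowBorderPipeline. FilterEngine has already extrapolated the row,
// so the call passes anchor 0 and reports only width - _ksize + 1 valid
// pixels; the caller finishes the tail itself.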

struct SymmRowSmallVec_32f
{
    SymmRowSmallVec_32f() {}
    SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
    {
        kernel = _kernel;
        symmetryType = _symmetryType;
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        int i = 0, _ksize = kernel.rows + kernel.cols - 1;
        float* dst = (float*)_dst;
        const float* src = (const float*)_src + (_ksize/2)*cn;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float* kx = kernel.ptr<float>() + _ksize/2;
        width *= cn;

        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            if( _ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);
                        x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2));
                        y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else if( kx[0] == -2 && kx[1] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);
                        x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
                        y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else
                {
                    __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]);
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);

                        x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
                        y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
                        x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
                        y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn*2);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn*2);
                        y0 = _mm_loadu_ps(src - cn*2 + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn*2 + 4);
                        x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
                        y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else
                {
                    __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);

                        x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
                        y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
                        x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
                        y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));

                        x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
                        y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
                        x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
                        y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));

                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                }
            }
        }
        else
        {
            if( _ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x2, y0, y2;
                        x0 = _mm_loadu_ps(src + cn);
                        x2 = _mm_loadu_ps(src - cn);
                        y0 = _mm_loadu_ps(src + cn + 4);
                        y2 = _mm_loadu_ps(src - cn + 4);
                        x0 = _mm_sub_ps(x0, x2);
                        y0 = _mm_sub_ps(y0, y2);
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else
                {
                    __m128 k1 = _mm_set1_ps(kx[1]);
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x2, y0, y2;
                        x0 = _mm_loadu_ps(src + cn);
                        x2 = _mm_loadu_ps(src - cn);
                        y0 = _mm_loadu_ps(src + cn + 4);
                        y2 = _mm_loadu_ps(src - cn + 4);

                        x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
                        y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                __m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
                for( ; i <= width - 8; i += 8, src += 8 )
                {
                    __m128 x0, x2, y0, y2;
                    x0 = _mm_loadu_ps(src + cn);
                    x2 = _mm_loadu_ps(src - cn);
                    y0 = _mm_loadu_ps(src + cn + 4);
                    y2 = _mm_loadu_ps(src - cn + 4);

                    x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
                    y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);

                    x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
                    y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
                    x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
                    y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));

                    _mm_store_ps(dst + i, x0);
                    _mm_store_ps(dst + i + 4, y0);
                }
            }
        }

        return i;
    }

    Mat kernel;
    int symmetryType;
};

struct SymmColumnVec_32f
{
    SymmColumnVec_32f() { symmetryType=0; }
    SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
    {
        symmetryType = _symmetryType;
        kernel = _kernel;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S, *S2;
        float* dst = (float*)_dst;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128 x0, x1;
                S = src[0] + i;
                s0 = _mm_load_ps(S);
                s1 = _mm_load_ps(S+4);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_load_ps(S+8);
                s3 = _mm_load_ps(S+12);
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                _mm_storeu_ps(dst + i, s0);
                _mm_storeu_ps(dst + i + 4, s1);
                _mm_storeu_ps(dst + i + 8, s2);
                _mm_storeu_ps(dst + i + 12, s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 x0, s0 = _mm_load_ps(src[0] + i);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);

                    x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }

                _mm_storeu_ps(dst + i, s0);
            }
        }
        else
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
                __m128 x0, x1;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                _mm_storeu_ps(dst + i, s0);
                _mm_storeu_ps(dst + i + 4, s1);
                _mm_storeu_ps(dst + i + 8, s2);
                _mm_storeu_ps(dst + i + 12, s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f, x0, s0 = d4;

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }

                _mm_storeu_ps(dst + i, s0);
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};

struct SymmColumnSmallVec_32f
{
    SymmColumnSmallVec_32f() { symmetryType=0; }
    SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
    {
        symmetryType = _symmetryType;
        kernel = _kernel;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
        float* dst = (float*)_dst;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            if( ky[0] == 2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_ps(S0 + i);
                    s1 = _mm_load_ps(S0 + i + 4);
                    s2 = _mm_load_ps(S1 + i);
                    s3 = _mm_load_ps(S1 + i + 4);
                    s4 = _mm_load_ps(S2 + i);
                    s5 = _mm_load_ps(S2 + i + 4);
                    s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2)));
                    s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3)));
                    s0 = _mm_add_ps(s0, d4);
                    s1 = _mm_add_ps(s1, d4);
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
            else if( ky[0] == -2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_ps(S0 + i);
                    s1 = _mm_load_ps(S0 + i + 4);
                    s2 = _mm_load_ps(S1 + i);
                    s3 = _mm_load_ps(S1 + i + 4);
                    s4 = _mm_load_ps(S2 + i);
                    s5 = _mm_load_ps(S2 + i + 4);
                    s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2)));
                    s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3)));
                    s0 = _mm_add_ps(s0, d4);
                    s1 = _mm_add_ps(s1, d4);
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
            else
            {
                __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, x0, x1;
                    s0 = _mm_load_ps(S1 + i);
                    s1 = _mm_load_ps(S1 + i + 4);
                    s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4);
                    s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4);
                    x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
                    x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
        }
        else
        {
            if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
            {
                if( ky[1] < 0 )
                    std::swap(S0, S2);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3;
                    s0 = _mm_load_ps(S2 + i);
                    s1 = _mm_load_ps(S2 + i + 4);
                    s2 = _mm_load_ps(S0 + i);
                    s3 = _mm_load_ps(S0 + i + 4);
                    s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4);
                    s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4);
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
            else
            {
                __m128 k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0 = d4, s1 = d4, x0, x1;
                    x0 = _mm_sub_ps(_mm_load_ps(S2 + i), _mm_load_ps(S0 + i));
                    x1 = _mm_sub_ps(_mm_load_ps(S2 + i + 4), _mm_load_ps(S0 + i + 4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};

/////////////////////////////// non-separable filters ///////////////////////////////

///////////////////////////////// 8u<->8u, 8u<->16s /////////////////////////////////

struct FilterVec_8u
{
    FilterVec_8u() {}
    FilterVec_8u(const Mat& _kernel, int _bits, double _delta)
    {
        Mat kernel;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        std::vector<Point> coords;
        preprocess2DKernel(kernel, coords, coeffs);
        _nz = (int)coords.size();
    }

    int operator()(const uchar** src, uchar* dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float* kf = (const float*)&coeffs[0];
        int i = 0, k, nz = _nz;
        __m128 d4 = _mm_set1_ps(delta);

        for( ; i <= width - 16; i += 16 )
        {
            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
            __m128i x0, x1, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0, t1;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
                x1 = _mm_unpackhi_epi8(x0, z);
                x0 = _mm_unpacklo_epi8(x0, z);

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
            x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
            x0 = _mm_packus_epi16(x0, x1);
            _mm_storeu_si128((__m128i*)(dst + i), x0);
        }

        for( ; i <= width - 4; i += 4 )
        {
            __m128 s0 = d4;
            __m128i x0, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
                x0 = _mm_unpacklo_epi8(x0, z);
                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
            x0 = _mm_packus_epi16(x0, x0);
            *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
        }

        return i;
    }

    int _nz;
    std::vector<uchar> coeffs;
    float delta;
};
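
// FilterVec_8u and the variants below handle the non-separable case:
// preprocess2DKernel() flattens the 2D kernel into parallel arrays of point
// offsets and float coefficients with zero taps dropped, so the loops only
// visit the nz non-zero coefficients regardless of the kernel shape.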

struct FilterVec_8u16s
{
    FilterVec_8u16s() {}
    FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta)
    {
        Mat kernel;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        std::vector<Point> coords;
        preprocess2DKernel(kernel, coords, coeffs);
        _nz = (int)coords.size();
    }

    int operator()(const uchar** src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float* kf = (const float*)&coeffs[0];
        short* dst = (short*)_dst;
        int i = 0, k, nz = _nz;
        __m128 d4 = _mm_set1_ps(delta);

        for( ; i <= width - 16; i += 16 )
        {
            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
            __m128i x0, x1, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0, t1;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
                x1 = _mm_unpackhi_epi8(x0, z);
                x0 = _mm_unpacklo_epi8(x0, z);

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
            x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
            _mm_storeu_si128((__m128i*)(dst + i), x0);
            _mm_storeu_si128((__m128i*)(dst + i + 8), x1);
        }

        for( ; i <= width - 4; i += 4 )
        {
            __m128 s0 = d4;
            __m128i x0, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
                x0 = _mm_unpacklo_epi8(x0, z);
                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
            _mm_storel_epi64((__m128i*)(dst + i), x0);
        }

        return i;
    }

    int _nz;
    std::vector<uchar> coeffs;
    float delta;
};
struct FilterVec_32f
    FilterVec_32f(const Mat& _kernel, int, double _delta)
        delta = (float)_delta;
        std::vector<Point> coords;
        preprocess2DKernel(_kernel, coords, coeffs);
        _nz = (int)coords.size();

    int operator()(const uchar** _src, uchar* _dst, int width) const
        if( !checkHardwareSupport(CV_CPU_SSE) )

        const float* kf = (const float*)&coeffs[0];
        const float** src = (const float**)_src;
        float* dst = (float*)_dst;
        int i = 0, k, nz = _nz;
        __m128 d4 = _mm_set1_ps(delta);

        for( ; i <= width - 16; i += 16 )
            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;

            for( k = 0; k < nz; k++ )
                __m128 f = _mm_load_ss(kf+k), t0, t1;
                f = _mm_shuffle_ps(f, f, 0);
                const float* S = src[k] + i;

                t0 = _mm_loadu_ps(S);
                t1 = _mm_loadu_ps(S + 4);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_loadu_ps(S + 8);
                t1 = _mm_loadu_ps(S + 12);
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));

            _mm_storeu_ps(dst + i, s0);
            _mm_storeu_ps(dst + i + 4, s1);
            _mm_storeu_ps(dst + i + 8, s2);
            _mm_storeu_ps(dst + i + 12, s3);

        for( ; i <= width - 4; i += 4 )
            __m128 s0 = d4;

            for( k = 0; k < nz; k++ )
                __m128 f = _mm_load_ss(kf+k), t0;
                f = _mm_shuffle_ps(f, f, 0);
                t0 = _mm_loadu_ps(src[k] + i);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));

            _mm_storeu_ps(dst + i, s0);

    std::vector<uchar> coeffs;
struct SymmRowSmallVec_8u32s
    SymmRowSmallVec_8u32s() { smallValues = false; }
    SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
        symmetryType = _symmetryType;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
            int v = kernel.ptr<int>()[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
                smallValues = false;

    int operator()(const uchar* src, uchar* _dst, int width, int cn) const
        if( !checkHardwareSupport(CV_CPU_NEON) )

        int i = 0, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int* kx = kernel.ptr<int>() + _ksize/2;
        src += (_ksize/2)*cn;

            if( kx[0] == 2 && kx[1] == 1 )
                uint16x8_t zq = vdupq_n_u16(0);

                for( ; i <= width - 8; i += 8, src += 8 )
                    uint8x8_t x0, x1, x2;
                    x0 = vld1_u8( (uint8_t *) (src - cn) );
                    x1 = vld1_u8( (uint8_t *) (src) );
                    x2 = vld1_u8( (uint8_t *) (src + cn) );

                    uint16x8_t y0, y1, y2;
                    y0 = vaddl_u8(x0, x2);
                    y1 = vshll_n_u8(x1, 1);
                    y2 = vaddq_u16(y0, y1);

                    str.val[0] = y2; str.val[1] = zq;
                    vst2q_u16( (uint16_t *) (dst + i), str );

            else if( kx[0] == -2 && kx[1] == 1 )
                int32x4_t k32 = vdupq_n_s32(0);
                k32 = vld1q_lane_s32(kx, k32, 0);
                k32 = vld1q_lane_s32(kx + 1, k32, 1);

                int16x4_t k = vqmovn_s32(k32);

                uint8x8_t z = vdup_n_u8(0);

                for( ; i <= width - 8; i += 8, src += 8 )
                    uint8x8_t x0, x1, x2;
                    x0 = vld1_u8( (uint8_t *) (src - cn) );
                    x1 = vld1_u8( (uint8_t *) (src) );
                    x2 = vld1_u8( (uint8_t *) (src + cn) );

                    y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
                    y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
                    y2 = vmull_lane_s16(vget_low_s16(y0), k, 0);
                    y2 = vmlal_lane_s16(y2, vget_low_s16(y1), k, 1);
                    y3 = vmull_lane_s16(vget_high_s16(y0), k, 0);
                    y3 = vmlal_lane_s16(y3, vget_high_s16(y1), k, 1);

                    vst1q_s32((int32_t *)(dst + i), y2);
                    vst1q_s32((int32_t *)(dst + i + 4), y3);

        else if( _ksize == 5 )
            if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                int32x4_t k32 = vdupq_n_s32(0);
                k32 = vld1q_lane_s32(kx, k32, 0);
                k32 = vld1q_lane_s32(kx + 1, k32, 1);
                k32 = vld1q_lane_s32(kx + 2, k32, 2);

                int16x4_t k = vqmovn_s32(k32);

                uint8x8_t z = vdup_n_u8(0);

                for( ; i <= width - 8; i += 8, src += 8 )
                    uint8x8_t x0, x1, x2, x3, x4;
                    x0 = vld1_u8( (uint8_t *) (src - cn) );
                    x1 = vld1_u8( (uint8_t *) (src) );
                    x2 = vld1_u8( (uint8_t *) (src + cn) );

                    int32x4_t accl, acch;
                    y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
                    y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
                    accl = vmull_lane_s16(vget_low_s16(y0), k, 0);
                    accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 1);
                    acch = vmull_lane_s16(vget_high_s16(y0), k, 0);
                    acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 1);

                    x3 = vld1_u8( (uint8_t *) (src - cn*2) );
                    x4 = vld1_u8( (uint8_t *) (src + cn*2) );
                    y2 = vreinterpretq_s16_u16(vaddl_u8(x3, x4));
                    accl = vmlal_lane_s16(accl, vget_low_s16(y2), k, 2);
                    acch = vmlal_lane_s16(acch, vget_high_s16(y2), k, 2);

                    vst1q_s32((int32_t *)(dst + i), accl);
                    vst1q_s32((int32_t *)(dst + i + 4), acch);
            if( kx[0] == 0 && kx[1] == 1 )
                uint8x8_t z = vdup_n_u8(0);

                for( ; i <= width - 8; i += 8, src += 8 )
                    x0 = vld1_u8( (uint8_t *) (src - cn) );
                    x1 = vld1_u8( (uint8_t *) (src + cn) );

                    y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
                                   vreinterpretq_s16_u16(vaddl_u8(x0, z)));

                    vst1q_s32((int32_t *)(dst + i), vmovl_s16(vget_low_s16(y0)));
                    vst1q_s32((int32_t *)(dst + i + 4), vmovl_s16(vget_high_s16(y0)));

                int32x4_t k32 = vdupq_n_s32(0);
                k32 = vld1q_lane_s32(kx + 1, k32, 1);

                int16x4_t k = vqmovn_s32(k32);

                uint8x8_t z = vdup_n_u8(0);

                for( ; i <= width - 8; i += 8, src += 8 )
                    x0 = vld1_u8( (uint8_t *) (src - cn) );
                    x1 = vld1_u8( (uint8_t *) (src + cn) );

                    y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
                                   vreinterpretq_s16_u16(vaddl_u8(x0, z)));
                    y1 = vmull_lane_s16(vget_low_s16(y0), k, 1);
                    y2 = vmull_lane_s16(vget_high_s16(y0), k, 1);

                    vst1q_s32((int32_t *)(dst + i), y1);
                    vst1q_s32((int32_t *)(dst + i + 4), y2);

        else if( _ksize == 5 )
            int32x4_t k32 = vdupq_n_s32(0);
            k32 = vld1q_lane_s32(kx + 1, k32, 1);
            k32 = vld1q_lane_s32(kx + 2, k32, 2);

            int16x4_t k = vqmovn_s32(k32);

            uint8x8_t z = vdup_n_u8(0);

            for( ; i <= width - 8; i += 8, src += 8 )
                x0 = vld1_u8( (uint8_t *) (src - cn) );
                x1 = vld1_u8( (uint8_t *) (src + cn) );

                int32x4_t accl, acch;

                y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
                               vreinterpretq_s16_u16(vaddl_u8(x0, z)));
                accl = vmull_lane_s16(vget_low_s16(y0), k, 1);
                acch = vmull_lane_s16(vget_high_s16(y0), k, 1);

                x2 = vld1_u8( (uint8_t *) (src - cn*2) );
                x3 = vld1_u8( (uint8_t *) (src + cn*2) );

                y1 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x3, z)),
                               vreinterpretq_s16_u16(vaddl_u8(x2, z)));
                accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 2);
                acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 2);

                vst1q_s32((int32_t *)(dst + i), accl);
                vst1q_s32((int32_t *)(dst + i + 4), acch);
struct SymmColumnVec_32s8u
    SymmColumnVec_32s8u() { symmetryType=0; }
    SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );

    int operator()(const uchar** _src, uchar* dst, int width) const
        if( !checkHardwareSupport(CV_CPU_NEON) )

        int _ksize = kernel.rows + kernel.cols - 1;
        int ksize2 = _ksize / 2;
        const float* ky = kernel.ptr<float>() + ksize2;

        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;

        float32x4_t d4 = vdupq_n_f32(delta);

            k32 = vdup_n_f32(0);
            k32 = vld1_lane_f32(ky, k32, 0);
            k32 = vld1_lane_f32(ky + 1, k32, 1);

            for( ; i <= width - 8; i += 8 )
                float32x4_t accl, acch;
                float32x4_t f0l, f0h, f1l, f1h, f2l, f2h;

                f0l = vcvtq_f32_s32( vld1q_s32(S) );
                f0h = vcvtq_f32_s32( vld1q_s32(S + 4) );

                f1l = vcvtq_f32_s32( vld1q_s32(S) );
                f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
                f2l = vcvtq_f32_s32( vld1q_s32(S2) );
                f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );

                accl = vmlaq_lane_f32(accl, f0l, k32, 0);
                acch = vmlaq_lane_f32(acch, f0h, k32, 0);
                accl = vmlaq_lane_f32(accl, vaddq_f32(f1l, f2l), k32, 1);
                acch = vmlaq_lane_f32(acch, vaddq_f32(f1h, f2h), k32, 1);

                for( k = 2; k <= ksize2; k++ )
                    float32x4_t f3l, f3h, f4l, f4h;
                    f3l = vcvtq_f32_s32( vld1q_s32(S) );
                    f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
                    f4l = vcvtq_f32_s32( vld1q_s32(S2) );
                    f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );

                    accl = vmlaq_n_f32(accl, vaddq_f32(f3l, f4l), ky[k]);
                    acch = vmlaq_n_f32(acch, vaddq_f32(f3h, f4h), ky[k]);

                int32x4_t s32l, s32h;
                s32l = vcvtq_s32_f32(accl);
                s32h = vcvtq_s32_f32(acch);

                int16x4_t s16l, s16h;
                s16l = vqmovn_s32(s32l);
                s16h = vqmovn_s32(s32h);

                u8 = vqmovun_s16(vcombine_s16(s16l, s16h));
                vst1_u8((uint8_t *)(dst + i), u8);

            k32 = vdup_n_f32(0);
            k32 = vld1_lane_f32(ky + 1, k32, 1);

            for( ; i <= width - 8; i += 8 )
                float32x4_t accl, acch;
                float32x4_t f1l, f1h, f2l, f2h;

                f1l = vcvtq_f32_s32( vld1q_s32(S) );
                f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
                f2l = vcvtq_f32_s32( vld1q_s32(S2) );
                f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );

                accl = vmlaq_lane_f32(accl, vsubq_f32(f1l, f2l), k32, 1);
                acch = vmlaq_lane_f32(acch, vsubq_f32(f1h, f2h), k32, 1);

                for( k = 2; k <= ksize2; k++ )
                    float32x4_t f3l, f3h, f4l, f4h;
                    f3l = vcvtq_f32_s32( vld1q_s32(S) );
                    f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
                    f4l = vcvtq_f32_s32( vld1q_s32(S2) );
                    f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );

                    accl = vmlaq_n_f32(accl, vsubq_f32(f3l, f4l), ky[k]);
                    acch = vmlaq_n_f32(acch, vsubq_f32(f3h, f4h), ky[k]);

                int32x4_t s32l, s32h;
                s32l = vcvtq_s32_f32(accl);
                s32h = vcvtq_s32_f32(acch);

                int16x4_t s16l, s16h;
                s16l = vqmovn_s32(s32l);
                s16h = vqmovn_s32(s32h);

                u8 = vqmovun_s16(vcombine_s16(s16l, s16h));
                vst1_u8((uint8_t *)(dst + i), u8);
struct SymmColumnSmallVec_32s16s
    SymmColumnSmallVec_32s16s() { symmetryType=0; }
    SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );

    int operator()(const uchar** _src, uchar* _dst, int width) const
        if( !checkHardwareSupport(CV_CPU_NEON) )

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;

        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
        short* dst = (short*)_dst;
        float32x4_t df4 = vdupq_n_f32(delta);
        int32x4_t d4 = vcvtq_s32_f32(df4);

            if( ky[0] == 2 && ky[1] == 1 )
                for( ; i <= width - 4; i += 4 )
                    int32x4_t x0, x1, x2;
                    x0 = vld1q_s32((int32_t const *)(S0 + i));
                    x1 = vld1q_s32((int32_t const *)(S1 + i));
                    x2 = vld1q_s32((int32_t const *)(S2 + i));

                    int32x4_t y0, y1, y2, y3;
                    y0 = vaddq_s32(x0, x2);
                    y1 = vqshlq_n_s32(x1, 1);
                    y2 = vaddq_s32(y0, y1);
                    y3 = vaddq_s32(y2, d4);

                    vst1_s16((int16_t *)(dst + i), t);

            else if( ky[0] == -2 && ky[1] == 1 )
                for( ; i <= width - 4; i += 4 )
                    int32x4_t x0, x1, x2;
                    x0 = vld1q_s32((int32_t const *)(S0 + i));
                    x1 = vld1q_s32((int32_t const *)(S1 + i));
                    x2 = vld1q_s32((int32_t const *)(S2 + i));

                    int32x4_t y0, y1, y2, y3;
                    y0 = vaddq_s32(x0, x2);
                    y1 = vqshlq_n_s32(x1, 1);
                    y2 = vsubq_s32(y0, y1);
                    y3 = vaddq_s32(y2, d4);

                    vst1_s16((int16_t *)(dst + i), t);

            else if( ky[0] == 10 && ky[1] == 3 )
                for( ; i <= width - 4; i += 4 )
                    int32x4_t x0, x1, x2, x3;
                    x0 = vld1q_s32((int32_t const *)(S0 + i));
                    x1 = vld1q_s32((int32_t const *)(S1 + i));
                    x2 = vld1q_s32((int32_t const *)(S2 + i));

                    x3 = vaddq_s32(x0, x2);

                    y0 = vmlaq_n_s32(d4, x1, 10);
                    y0 = vmlaq_n_s32(y0, x3, 3);

                    vst1_s16((int16_t *)(dst + i), t);

                float32x2_t k32 = vdup_n_f32(0);
                k32 = vld1_lane_f32(ky, k32, 0);
                k32 = vld1_lane_f32(ky + 1, k32, 1);

                for( ; i <= width - 4; i += 4 )
                    int32x4_t x0, x1, x2, x3, x4;
                    x0 = vld1q_s32((int32_t const *)(S0 + i));
                    x1 = vld1q_s32((int32_t const *)(S1 + i));
                    x2 = vld1q_s32((int32_t const *)(S2 + i));

                    x3 = vaddq_s32(x0, x2);

                    float32x4_t s0, s1, s2;
                    s0 = vcvtq_f32_s32(x1);
                    s1 = vcvtq_f32_s32(x3);
                    s2 = vmlaq_lane_f32(df4, s0, k32, 0);
                    s2 = vmlaq_lane_f32(s2, s1, k32, 1);

                    x4 = vcvtq_s32_f32(s2);

                    x5 = vqmovn_s32(x4);

                    vst1_s16((int16_t *)(dst + i), x5);

            if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
                for( ; i <= width - 4; i += 4 )
                    x0 = vld1q_s32((int32_t const *)(S0 + i));
                    x1 = vld1q_s32((int32_t const *)(S2 + i));

                    y0 = vsubq_s32(x1, x0);
                    y1 = vqaddq_s32(y0, d4);

                    vst1_s16((int16_t *)(dst + i), t);

                float32x2_t k32 = vdup_n_f32(0);
                k32 = vld1_lane_f32(ky + 1, k32, 1);

                for( ; i <= width - 4; i += 4 )
                    int32x4_t x0, x1, x2, x3;
                    x0 = vld1q_s32((int32_t const *)(S0 + i));
                    x1 = vld1q_s32((int32_t const *)(S2 + i));

                    x2 = vsubq_s32(x1, x0);

                    s0 = vcvtq_f32_s32(x2);
                    s1 = vmlaq_lane_f32(df4, s0, k32, 1);

                    x3 = vcvtq_s32_f32(s1);

                    x4 = vqmovn_s32(x3);

                    vst1_s16((int16_t *)(dst + i), x4);
struct SymmColumnVec_32f16s
    SymmColumnVec_32f16s() { symmetryType=0; }
    SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
        symmetryType = _symmetryType;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
        neon_supported = checkHardwareSupport(CV_CPU_NEON);

    int operator()(const uchar** _src, uchar* _dst, int width) const
        if( !neon_supported )

        int _ksize = kernel.rows + kernel.cols - 1;
        int ksize2 = _ksize / 2;
        const float* ky = kernel.ptr<float>() + ksize2;

        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S, *S2;
        short* dst = (short*)_dst;

        float32x4_t d4 = vdupq_n_f32(delta);

            k32 = vdup_n_f32(0);
            k32 = vld1_lane_f32(ky, k32, 0);
            k32 = vld1_lane_f32(ky + 1, k32, 1);

            for( ; i <= width - 8; i += 8 )
                float32x4_t x0l, x0h, x1l, x1h, x2l, x2h;
                float32x4_t accl, acch;

                x0h = vld1q_f32(S + 4);

                x1h = vld1q_f32(S + 4);
                x2l = vld1q_f32(S2);
                x2h = vld1q_f32(S2 + 4);

                accl = vmlaq_lane_f32(accl, x0l, k32, 0);
                acch = vmlaq_lane_f32(acch, x0h, k32, 0);
                accl = vmlaq_lane_f32(accl, vaddq_f32(x1l, x2l), k32, 1);
                acch = vmlaq_lane_f32(acch, vaddq_f32(x1h, x2h), k32, 1);

                for( k = 2; k <= ksize2; k++ )
                    float32x4_t x3l, x3h, x4l, x4h;
                    x3h = vld1q_f32(S + 4);
                    x4l = vld1q_f32(S2);
                    x4h = vld1q_f32(S2 + 4);

                    accl = vmlaq_n_f32(accl, vaddq_f32(x3l, x4l), ky[k]);
                    acch = vmlaq_n_f32(acch, vaddq_f32(x3h, x4h), ky[k]);

                int32x4_t s32l, s32h;
                s32l = vcvtq_s32_f32(accl);
                s32h = vcvtq_s32_f32(acch);

                int16x4_t s16l, s16h;
                s16l = vqmovn_s32(s32l);
                s16h = vqmovn_s32(s32h);

                vst1_s16((int16_t *)(dst + i), s16l);
                vst1_s16((int16_t *)(dst + i + 4), s16h);

            k32 = vdup_n_f32(0);
            k32 = vld1_lane_f32(ky + 1, k32, 1);

            for( ; i <= width - 8; i += 8 )
                float32x4_t x1l, x1h, x2l, x2h;
                float32x4_t accl, acch;

                x1h = vld1q_f32(S + 4);
                x2l = vld1q_f32(S2);
                x2h = vld1q_f32(S2 + 4);

                accl = vmlaq_lane_f32(accl, vsubq_f32(x1l, x2l), k32, 1);
                acch = vmlaq_lane_f32(acch, vsubq_f32(x1h, x2h), k32, 1);

                for( k = 2; k <= ksize2; k++ )
                    float32x4_t x3l, x3h, x4l, x4h;
                    x3h = vld1q_f32(S + 4);
                    x4l = vld1q_f32(S2);
                    x4h = vld1q_f32(S2 + 4);

                    accl = vmlaq_n_f32(accl, vsubq_f32(x3l, x4l), ky[k]);
                    acch = vmlaq_n_f32(acch, vsubq_f32(x3h, x4h), ky[k]);

                int32x4_t s32l, s32h;
                s32l = vcvtq_s32_f32(accl);
                s32h = vcvtq_s32_f32(acch);

                int16x4_t s16l, s16h;
                s16l = vqmovn_s32(s32l);
                s16h = vqmovn_s32(s32h);

                vst1_s16((int16_t *)(dst + i), s16l);
                vst1_s16((int16_t *)(dst + i + 4), s16h);

    bool neon_supported;
struct SymmRowSmallVec_32f
    SymmRowSmallVec_32f() {}
    SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
        symmetryType = _symmetryType;

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
        if( !checkHardwareSupport(CV_CPU_NEON) )

        int i = 0, _ksize = kernel.rows + kernel.cols - 1;
        float* dst = (float*)_dst;
        const float* src = (const float*)_src + (_ksize/2)*cn;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float* kx = kernel.ptr<float>() + _ksize/2;

            if( kx[0] == 2 && kx[1] == 1 )

            else if( kx[0] == -2 && kx[1] == 1 )

        else if( _ksize == 5 )
            if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )

                k0 = k1 = vdup_n_f32(0);
                k0 = vld1_lane_f32(kx + 0, k0, 0);
                k0 = vld1_lane_f32(kx + 1, k0, 1);
                k1 = vld1_lane_f32(kx + 2, k1, 0);

                for( ; i <= width - 4; i += 4, src += 4 )
                    float32x4_t x0, x1, x2, x3, x4;
                    x0 = vld1q_f32(src);
                    x1 = vld1q_f32(src - cn);
                    x2 = vld1q_f32(src + cn);
                    x3 = vld1q_f32(src - cn*2);
                    x4 = vld1q_f32(src + cn*2);

                    y0 = vmulq_lane_f32(x0, k0, 0);
                    y0 = vmlaq_lane_f32(y0, vaddq_f32(x1, x2), k0, 1);
                    y0 = vmlaq_lane_f32(y0, vaddq_f32(x3, x4), k1, 0);

                    vst1q_f32(dst + i, y0);

            if( kx[0] == 0 && kx[1] == 1 )

        else if( _ksize == 5 )
            k = vld1_lane_f32(kx + 1, k, 0);
            k = vld1_lane_f32(kx + 2, k, 1);

            for( ; i <= width - 4; i += 4, src += 4 )
                float32x4_t x0, x1, x2, x3;
                x0 = vld1q_f32(src - cn);
                x1 = vld1q_f32(src + cn);
                x2 = vld1q_f32(src - cn*2);
                x3 = vld1q_f32(src + cn*2);

                y0 = vmulq_lane_f32(vsubq_f32(x1, x0), k, 0);
                y0 = vmlaq_lane_f32(y0, vsubq_f32(x3, x2), k, 1);

                vst1q_f32(dst + i, y0);
typedef RowNoVec RowVec_8u32s;
typedef RowNoVec RowVec_16s32f;
typedef RowNoVec RowVec_32f;
typedef ColumnNoVec SymmColumnVec_32f;
typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
typedef FilterNoVec FilterVec_8u;
typedef FilterNoVec FilterVec_8u16s;
typedef FilterNoVec FilterVec_32f;

typedef RowNoVec RowVec_8u32s;
typedef RowNoVec RowVec_16s32f;
typedef RowNoVec RowVec_32f;
typedef SymmRowSmallNoVec SymmRowSmallVec_8u32s;
typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
typedef ColumnNoVec SymmColumnVec_32s8u;
typedef ColumnNoVec SymmColumnVec_32f16s;
typedef ColumnNoVec SymmColumnVec_32f;
typedef SymmColumnSmallNoVec SymmColumnSmallVec_32s16s;
typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
typedef FilterNoVec FilterVec_8u;
typedef FilterNoVec FilterVec_8u16s;
typedef FilterNoVec FilterVec_32f;
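// Depending on which SIMD support was compiled in, some or all of the vector
// "ops" above are aliased to the *NoVec stubs, whose operator() simply
// returns 0. The templated filters below always call vecOp() first and then
// run their scalar loops from the returned index, so the same code path
// serves both the vectorized and the plain build.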
template<typename ST, typename DT, class VecOp> struct RowFilter : public BaseRowFilter
    RowFilter( const Mat& _kernel, int _anchor, const VecOp& _vecOp=VecOp() )
        if( _kernel.isContinuous() )
            kernel = _kernel;
        else
            _kernel.copyTo(kernel);
        ksize = kernel.rows + kernel.cols - 1;
        CV_Assert( kernel.type() == DataType<DT>::type &&
                   (kernel.rows == 1 || kernel.cols == 1));

    void operator()(const uchar* src, uchar* dst, int width, int cn)
        const DT* kx = kernel.ptr<DT>();

        i = vecOp(src, dst, width, cn);
        #if CV_ENABLE_UNROLLED
        for( ; i <= width - 4; i += 4 )
            S = (const ST*)src + i;
            DT s0 = f*S[0], s1 = f*S[1], s2 = f*S[2], s3 = f*S[3];

            for( k = 1; k < _ksize; k++ )
                s0 += f*S[0]; s1 += f*S[1];
                s2 += f*S[2]; s3 += f*S[3];

            D[i] = s0; D[i+1] = s1;
            D[i+2] = s2; D[i+3] = s3;

        for( ; i < width; i++ )
            S = (const ST*)src + i;

            for( k = 1; k < _ksize; k++ )
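// RowFilter computes a plain 1-D correlation over interleaved channels: for
// each output element,
//
//     D[i] = kx[0]*S[i] + kx[1]*S[i + cn] + ... + kx[ksize-1]*S[i + (ksize-1)*cn],
//
// i.e. the taps are spaced cn samples apart so every channel is filtered
// independently. The unrolled path above carries four accumulators (s0..s3)
// to hide the multiply-add latency.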
template<typename ST, typename DT, class VecOp> struct SymmRowSmallFilter :
    public RowFilter<ST, DT, VecOp>
    SymmRowSmallFilter( const Mat& _kernel, int _anchor, int _symmetryType,
                        const VecOp& _vecOp = VecOp())
        : RowFilter<ST, DT, VecOp>( _kernel, _anchor, _vecOp )
        symmetryType = _symmetryType;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 && this->ksize <= 5 );

    void operator()(const uchar* src, uchar* dst, int width, int cn)
        int ksize2 = this->ksize/2, ksize2n = ksize2*cn;
        const DT* kx = this->kernel.template ptr<DT>() + ksize2;
        bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;

        int i = this->vecOp(src, dst, width, cn), j, k;
        const ST* S = (const ST*)src + i + ksize2n;

            if( this->ksize == 1 && kx[0] == 1 )
                for( ; i <= width - 2; i += 2 )
                    DT s0 = S[i], s1 = S[i+1];
                    D[i] = s0; D[i+1] = s1;

            else if( this->ksize == 3 )
                if( kx[0] == 2 && kx[1] == 1 )
                    for( ; i <= width - 2; i += 2, S += 2 )
                        DT s0 = S[-cn] + S[0]*2 + S[cn], s1 = S[1-cn] + S[1]*2 + S[1+cn];
                        D[i] = s0; D[i+1] = s1;
                else if( kx[0] == -2 && kx[1] == 1 )
                    for( ; i <= width - 2; i += 2, S += 2 )
                        DT s0 = S[-cn] - S[0]*2 + S[cn], s1 = S[1-cn] - S[1]*2 + S[1+cn];
                        D[i] = s0; D[i+1] = s1;

                    DT k0 = kx[0], k1 = kx[1];
                    for( ; i <= width - 2; i += 2, S += 2 )
                        DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1, s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1;
                        D[i] = s0; D[i+1] = s1;

            else if( this->ksize == 5 )
                DT k0 = kx[0], k1 = kx[1], k2 = kx[2];
                if( k0 == -2 && k1 == 0 && k2 == 1 )
                    for( ; i <= width - 2; i += 2, S += 2 )
                        DT s0 = -2*S[0] + S[-cn*2] + S[cn*2];
                        DT s1 = -2*S[1] + S[1-cn*2] + S[1+cn*2];
                        D[i] = s0; D[i+1] = s1;

                    for( ; i <= width - 2; i += 2, S += 2 )
                        DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1 + (S[-cn*2] + S[cn*2])*k2;
                        DT s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1 + (S[1-cn*2] + S[1+cn*2])*k2;
                        D[i] = s0; D[i+1] = s1;

            for( ; i < width; i++, S++ )
                for( k = 1, j = cn; k <= ksize2; k++, j += cn )
                    s0 += kx[k]*(S[j] + S[-j]);

            if( this->ksize == 3 )
                if( kx[0] == 0 && kx[1] == 1 )
                    for( ; i <= width - 2; i += 2, S += 2 )
                        DT s0 = S[cn] - S[-cn], s1 = S[1+cn] - S[1-cn];
                        D[i] = s0; D[i+1] = s1;

                    for( ; i <= width - 2; i += 2, S += 2 )
                        DT s0 = (S[cn] - S[-cn])*k1, s1 = (S[1+cn] - S[1-cn])*k1;
                        D[i] = s0; D[i+1] = s1;

            else if( this->ksize == 5 )
                DT k1 = kx[1], k2 = kx[2];
                for( ; i <= width - 2; i += 2, S += 2 )
                    DT s0 = (S[cn] - S[-cn])*k1 + (S[cn*2] - S[-cn*2])*k2;
                    DT s1 = (S[1+cn] - S[1-cn])*k1 + (S[1+cn*2] - S[1-cn*2])*k2;
                    D[i] = s0; D[i+1] = s1;

            for( ; i < width; i++, S++ )
                for( k = 1, j = cn; k <= ksize2; k++, j += cn )
                    s0 += kx[k]*(S[j] - S[-j]);
template<class CastOp, class VecOp> struct ColumnFilter : public BaseColumnFilter
    typedef typename CastOp::type1 ST;
    typedef typename CastOp::rtype DT;

    ColumnFilter( const Mat& _kernel, int _anchor,
                  double _delta, const CastOp& _castOp=CastOp(),
                  const VecOp& _vecOp=VecOp() )
        if( _kernel.isContinuous() )
            kernel = _kernel;
        else
            _kernel.copyTo(kernel);
        ksize = kernel.rows + kernel.cols - 1;
        delta = saturate_cast<ST>(_delta);
        CV_Assert( kernel.type() == DataType<ST>::type &&
                   (kernel.rows == 1 || kernel.cols == 1));

    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
        const ST* ky = kernel.template ptr<ST>();
        CastOp castOp = castOp0;

        for( ; count--; dst += dststep, src++ )
            i = vecOp(src, dst, width);
            #if CV_ENABLE_UNROLLED
            for( ; i <= width - 4; i += 4 )
                const ST* S = (const ST*)src[0] + i;
                ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
                   s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;

                for( k = 1; k < _ksize; k++ )
                    S = (const ST*)src[k] + i; f = ky[k];
                    s0 += f*S[0]; s1 += f*S[1];
                    s2 += f*S[2]; s3 += f*S[3];

                D[i] = castOp(s0); D[i+1] = castOp(s1);
                D[i+2] = castOp(s2); D[i+3] = castOp(s3);

            for( ; i < width; i++ )
                ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
                for( k = 1; k < _ksize; k++ )
                    s0 += ky[k]*((const ST*)src[k])[i];
template<class CastOp, class VecOp> struct SymmColumnFilter : public ColumnFilter<CastOp, VecOp>
    typedef typename CastOp::type1 ST;
    typedef typename CastOp::rtype DT;

    SymmColumnFilter( const Mat& _kernel, int _anchor,
                      double _delta, int _symmetryType,
                      const CastOp& _castOp=CastOp(),
                      const VecOp& _vecOp=VecOp())
        : ColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _castOp, _vecOp )
        symmetryType = _symmetryType;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );

    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
        int ksize2 = this->ksize/2;
        const ST* ky = this->kernel.template ptr<ST>() + ksize2;

        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        ST _delta = this->delta;
        CastOp castOp = this->castOp0;

            for( ; count--; dst += dststep, src++ )
                i = (this->vecOp)(src, dst, width);
                #if CV_ENABLE_UNROLLED
                for( ; i <= width - 4; i += 4 )
                    const ST* S = (const ST*)src[0] + i, *S2;
                    ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
                       s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;

                    for( k = 1; k <= ksize2; k++ )
                        S = (const ST*)src[k] + i;
                        S2 = (const ST*)src[-k] + i;

                        s0 += f*(S[0] + S2[0]);
                        s1 += f*(S[1] + S2[1]);
                        s2 += f*(S[2] + S2[2]);
                        s3 += f*(S[3] + S2[3]);

                    D[i] = castOp(s0); D[i+1] = castOp(s1);
                    D[i+2] = castOp(s2); D[i+3] = castOp(s3);

                for( ; i < width; i++ )
                    ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
                    for( k = 1; k <= ksize2; k++ )
                        s0 += ky[k]*(((const ST*)src[k])[i] + ((const ST*)src[-k])[i]);

            for( ; count--; dst += dststep, src++ )
                i = this->vecOp(src, dst, width);
                #if CV_ENABLE_UNROLLED
                for( ; i <= width - 4; i += 4 )
                    ST s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;

                    for( k = 1; k <= ksize2; k++ )
                        S = (const ST*)src[k] + i;
                        S2 = (const ST*)src[-k] + i;

                        s0 += f*(S[0] - S2[0]);
                        s1 += f*(S[1] - S2[1]);
                        s2 += f*(S[2] - S2[2]);
                        s3 += f*(S[3] - S2[3]);

                    D[i] = castOp(s0); D[i+1] = castOp(s1);
                    D[i+2] = castOp(s2); D[i+3] = castOp(s3);

                for( ; i < width; i++ )
                    for( k = 1; k <= ksize2; k++ )
                        s0 += ky[k]*(((const ST*)src[k])[i] - ((const ST*)src[-k])[i]);
template<class CastOp, class VecOp>
struct SymmColumnSmallFilter : public SymmColumnFilter<CastOp, VecOp>
    typedef typename CastOp::type1 ST;
    typedef typename CastOp::rtype DT;

    SymmColumnSmallFilter( const Mat& _kernel, int _anchor,
                           double _delta, int _symmetryType,
                           const CastOp& _castOp=CastOp(),
                           const VecOp& _vecOp=VecOp())
        : SymmColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _symmetryType, _castOp, _vecOp )
        CV_Assert( this->ksize == 3 );

    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
        int ksize2 = this->ksize/2;
        const ST* ky = this->kernel.template ptr<ST>() + ksize2;

        bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
        bool is_1_2_1 = ky[0] == 2 && ky[1] == 1;
        bool is_1_m2_1 = ky[0] == -2 && ky[1] == 1;
        bool is_m1_0_1 = ky[0] == 0 && (ky[1] == 1 || ky[1] == -1);
        ST f0 = ky[0], f1 = ky[1];
        ST _delta = this->delta;
        CastOp castOp = this->castOp0;

        for( ; count--; dst += dststep, src++ )
            i = (this->vecOp)(src, dst, width);
            const ST* S0 = (const ST*)src[-1];
            const ST* S1 = (const ST*)src[0];
            const ST* S2 = (const ST*)src[1];

                    #if CV_ENABLE_UNROLLED
                    for( ; i <= width - 4; i += 4 )
                        ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta;
                        ST s1 = S0[i+1] + S1[i+1]*2 + S2[i+1] + _delta;
                        D[i+1] = castOp(s1);

                        s0 = S0[i+2] + S1[i+2]*2 + S2[i+2] + _delta;
                        s1 = S0[i+3] + S1[i+3]*2 + S2[i+3] + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);

                    for( ; i < width; i ++ )
                        ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta;

                else if( is_1_m2_1 )
                    #if CV_ENABLE_UNROLLED
                    for( ; i <= width - 4; i += 4 )
                        ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta;
                        ST s1 = S0[i+1] - S1[i+1]*2 + S2[i+1] + _delta;
                        D[i+1] = castOp(s1);

                        s0 = S0[i+2] - S1[i+2]*2 + S2[i+2] + _delta;
                        s1 = S0[i+3] - S1[i+3]*2 + S2[i+3] + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);

                    for( ; i < width; i ++ )
                        ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta;

                    #if CV_ENABLE_UNROLLED
                    for( ; i <= width - 4; i += 4 )
                        ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta;
                        ST s1 = (S0[i+1] + S2[i+1])*f1 + S1[i+1]*f0 + _delta;
                        D[i+1] = castOp(s1);

                        s0 = (S0[i+2] + S2[i+2])*f1 + S1[i+2]*f0 + _delta;
                        s1 = (S0[i+3] + S2[i+3])*f1 + S1[i+3]*f0 + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);

                    for( ; i < width; i ++ )
                        ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta;

                    #if CV_ENABLE_UNROLLED
                    for( ; i <= width - 4; i += 4 )
                        ST s0 = S2[i] - S0[i] + _delta;
                        ST s1 = S2[i+1] - S0[i+1] + _delta;
                        D[i+1] = castOp(s1);

                        s0 = S2[i+2] - S0[i+2] + _delta;
                        s1 = S2[i+3] - S0[i+3] + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);

                    for( ; i < width; i ++ )
                        ST s0 = S2[i] - S0[i] + _delta;

                    #if CV_ENABLE_UNROLLED
                    for( ; i <= width - 4; i += 4 )
                        ST s0 = (S2[i] - S0[i])*f1 + _delta;
                        ST s1 = (S2[i+1] - S0[i+1])*f1 + _delta;
                        D[i+1] = castOp(s1);

                        s0 = (S2[i+2] - S0[i+2])*f1 + _delta;
                        s1 = (S2[i+3] - S0[i+3])*f1 + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);

                    for( ; i < width; i++ )
                        D[i] = castOp((S2[i] - S0[i])*f1 + _delta);
template<typename ST, typename DT> struct Cast
    DT operator()(ST val) const { return saturate_cast<DT>(val); }

template<typename ST, typename DT, int bits> struct FixedPtCast
    enum { SHIFT = bits, DELTA = 1 << (bits-1) };
    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }

template<typename ST, typename DT> struct FixedPtCastEx
    FixedPtCastEx() : SHIFT(0), DELTA(0) {}
    FixedPtCastEx(int bits) : SHIFT(bits), DELTA(bits ? 1 << (bits-1) : 0) {}
    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
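// FixedPtCast/FixedPtCastEx convert a fixed-point accumulator back to the
// destination type: adding DELTA = 2^(bits-1) before the right shift rounds
// to the nearest integer instead of truncating. Worked example with bits = 8:
// the accumulator value 384 represents 384/256 = 1.5, and
// (384 + 128) >> 8 == 2, whereas plain truncation (384 >> 8) would give 1.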
cv::Ptr<cv::BaseRowFilter> cv::getLinearRowFilter( int srcType, int bufType,
                                                   InputArray _kernel, int anchor,
    Mat kernel = _kernel.getMat();
    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(bufType);
    int cn = CV_MAT_CN(srcType);
    CV_Assert( cn == CV_MAT_CN(bufType) &&
               ddepth >= std::max(sdepth, CV_32S) &&
               kernel.type() == ddepth );
    int ksize = kernel.rows + kernel.cols - 1;

    if( (symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) != 0 && ksize <= 5 )
        if( sdepth == CV_8U && ddepth == CV_32S )
            return makePtr<SymmRowSmallFilter<uchar, int, SymmRowSmallVec_8u32s> >
                (kernel, anchor, symmetryType, SymmRowSmallVec_8u32s(kernel, symmetryType));
        if( sdepth == CV_32F && ddepth == CV_32F )
            return makePtr<SymmRowSmallFilter<float, float, SymmRowSmallVec_32f> >
                (kernel, anchor, symmetryType, SymmRowSmallVec_32f(kernel, symmetryType));

    if( sdepth == CV_8U && ddepth == CV_32S )
        return makePtr<RowFilter<uchar, int, RowVec_8u32s> >
            (kernel, anchor, RowVec_8u32s(kernel));
    if( sdepth == CV_8U && ddepth == CV_32F )
        return makePtr<RowFilter<uchar, float, RowNoVec> >(kernel, anchor);
    if( sdepth == CV_8U && ddepth == CV_64F )
        return makePtr<RowFilter<uchar, double, RowNoVec> >(kernel, anchor);
    if( sdepth == CV_16U && ddepth == CV_32F )
        return makePtr<RowFilter<ushort, float, RowNoVec> >(kernel, anchor);
    if( sdepth == CV_16U && ddepth == CV_64F )
        return makePtr<RowFilter<ushort, double, RowNoVec> >(kernel, anchor);
    if( sdepth == CV_16S && ddepth == CV_32F )
        return makePtr<RowFilter<short, float, RowVec_16s32f> >
            (kernel, anchor, RowVec_16s32f(kernel));
    if( sdepth == CV_16S && ddepth == CV_64F )
        return makePtr<RowFilter<short, double, RowNoVec> >(kernel, anchor);
    if( sdepth == CV_32F && ddepth == CV_32F )
        return makePtr<RowFilter<float, float, RowVec_32f> >
            (kernel, anchor, RowVec_32f(kernel));
    if( sdepth == CV_32F && ddepth == CV_64F )
        return makePtr<RowFilter<float, double, RowNoVec> >(kernel, anchor);
    if( sdepth == CV_64F && ddepth == CV_64F )
        return makePtr<RowFilter<double, double, RowNoVec> >(kernel, anchor);

    CV_Error_( CV_StsNotImplemented,
               ("Unsupported combination of source format (=%d), and buffer format (=%d)",

    return Ptr<BaseRowFilter>();
cv::Ptr<cv::BaseColumnFilter> cv::getLinearColumnFilter( int bufType, int dstType,
                                                         InputArray _kernel, int anchor,
                                                         int symmetryType, double delta,
    Mat kernel = _kernel.getMat();
    int sdepth = CV_MAT_DEPTH(bufType), ddepth = CV_MAT_DEPTH(dstType);
    int cn = CV_MAT_CN(dstType);
    CV_Assert( cn == CV_MAT_CN(bufType) &&
               sdepth >= std::max(ddepth, CV_32S) &&
               kernel.type() == sdepth );

    if( !(symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) )
        if( ddepth == CV_8U && sdepth == CV_32S )
            return makePtr<ColumnFilter<FixedPtCastEx<int, uchar>, ColumnNoVec> >
                (kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits));
        if( ddepth == CV_8U && sdepth == CV_32F )
            return makePtr<ColumnFilter<Cast<float, uchar>, ColumnNoVec> >(kernel, anchor, delta);
        if( ddepth == CV_8U && sdepth == CV_64F )
            return makePtr<ColumnFilter<Cast<double, uchar>, ColumnNoVec> >(kernel, anchor, delta);
        if( ddepth == CV_16U && sdepth == CV_32F )
            return makePtr<ColumnFilter<Cast<float, ushort>, ColumnNoVec> >(kernel, anchor, delta);
        if( ddepth == CV_16U && sdepth == CV_64F )
            return makePtr<ColumnFilter<Cast<double, ushort>, ColumnNoVec> >(kernel, anchor, delta);
        if( ddepth == CV_16S && sdepth == CV_32F )
            return makePtr<ColumnFilter<Cast<float, short>, ColumnNoVec> >(kernel, anchor, delta);
        if( ddepth == CV_16S && sdepth == CV_64F )
            return makePtr<ColumnFilter<Cast<double, short>, ColumnNoVec> >(kernel, anchor, delta);
        if( ddepth == CV_32F && sdepth == CV_32F )
            return makePtr<ColumnFilter<Cast<float, float>, ColumnNoVec> >(kernel, anchor, delta);
        if( ddepth == CV_64F && sdepth == CV_64F )
            return makePtr<ColumnFilter<Cast<double, double>, ColumnNoVec> >(kernel, anchor, delta);

        int ksize = kernel.rows + kernel.cols - 1;

            if( ddepth == CV_8U && sdepth == CV_32S )
                return makePtr<SymmColumnSmallFilter<
                    FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u> >
                    (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
                     SymmColumnVec_32s8u(kernel, symmetryType, bits, delta));
            if( ddepth == CV_16S && sdepth == CV_32S && bits == 0 )
                return makePtr<SymmColumnSmallFilter<Cast<int, short>,
                    SymmColumnSmallVec_32s16s> >(kernel, anchor, delta, symmetryType,
                        Cast<int, short>(), SymmColumnSmallVec_32s16s(kernel, symmetryType, bits, delta));
            if( ddepth == CV_32F && sdepth == CV_32F )
                return makePtr<SymmColumnSmallFilter<
                    Cast<float, float>, SymmColumnSmallVec_32f> >
                    (kernel, anchor, delta, symmetryType, Cast<float, float>(),
                     SymmColumnSmallVec_32f(kernel, symmetryType, 0, delta));

        if( ddepth == CV_8U && sdepth == CV_32S )
            return makePtr<SymmColumnFilter<FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u> >
                (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
                 SymmColumnVec_32s8u(kernel, symmetryType, bits, delta));
        if( ddepth == CV_8U && sdepth == CV_32F )
            return makePtr<SymmColumnFilter<Cast<float, uchar>, ColumnNoVec> >
                (kernel, anchor, delta, symmetryType);
        if( ddepth == CV_8U && sdepth == CV_64F )
            return makePtr<SymmColumnFilter<Cast<double, uchar>, ColumnNoVec> >
                (kernel, anchor, delta, symmetryType);
        if( ddepth == CV_16U && sdepth == CV_32F )
            return makePtr<SymmColumnFilter<Cast<float, ushort>, ColumnNoVec> >
                (kernel, anchor, delta, symmetryType);
        if( ddepth == CV_16U && sdepth == CV_64F )
            return makePtr<SymmColumnFilter<Cast<double, ushort>, ColumnNoVec> >
                (kernel, anchor, delta, symmetryType);
        if( ddepth == CV_16S && sdepth == CV_32S )
            return makePtr<SymmColumnFilter<Cast<int, short>, ColumnNoVec> >
                (kernel, anchor, delta, symmetryType);
        if( ddepth == CV_16S && sdepth == CV_32F )
            return makePtr<SymmColumnFilter<Cast<float, short>, SymmColumnVec_32f16s> >
                (kernel, anchor, delta, symmetryType, Cast<float, short>(),
                 SymmColumnVec_32f16s(kernel, symmetryType, 0, delta));
        if( ddepth == CV_16S && sdepth == CV_64F )
            return makePtr<SymmColumnFilter<Cast<double, short>, ColumnNoVec> >
                (kernel, anchor, delta, symmetryType);
        if( ddepth == CV_32F && sdepth == CV_32F )
            return makePtr<SymmColumnFilter<Cast<float, float>, SymmColumnVec_32f> >
                (kernel, anchor, delta, symmetryType, Cast<float, float>(),
                 SymmColumnVec_32f(kernel, symmetryType, 0, delta));
        if( ddepth == CV_64F && sdepth == CV_64F )
            return makePtr<SymmColumnFilter<Cast<double, double>, ColumnNoVec> >
                (kernel, anchor, delta, symmetryType);

    CV_Error_( CV_StsNotImplemented,
               ("Unsupported combination of buffer format (=%d), and destination format (=%d)",

    return Ptr<BaseColumnFilter>();
cv::Ptr<cv::FilterEngine> cv::createSeparableLinearFilter(
    int _srcType, int _dstType,
    InputArray __rowKernel, InputArray __columnKernel,
    Point _anchor, double _delta,
    int _rowBorderType, int _columnBorderType,
    const Scalar& _borderValue )
    Mat _rowKernel = __rowKernel.getMat(), _columnKernel = __columnKernel.getMat();
    _srcType = CV_MAT_TYPE(_srcType);
    _dstType = CV_MAT_TYPE(_dstType);
    int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
    int cn = CV_MAT_CN(_srcType);
    CV_Assert( cn == CV_MAT_CN(_dstType) );
    int rsize = _rowKernel.rows + _rowKernel.cols - 1;
    int csize = _columnKernel.rows + _columnKernel.cols - 1;
        _anchor.x = rsize/2;
        _anchor.y = csize/2;
    int rtype = getKernelType(_rowKernel,
        _rowKernel.rows == 1 ? Point(_anchor.x, 0) : Point(0, _anchor.x));
    int ctype = getKernelType(_columnKernel,
        _columnKernel.rows == 1 ? Point(_anchor.y, 0) : Point(0, _anchor.y));
    Mat rowKernel, columnKernel;

    int bdepth = std::max(CV_32F,std::max(sdepth, ddepth));

    if( sdepth == CV_8U &&
        ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
          ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
         ((rtype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
          (ctype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
          (rtype & ctype & KERNEL_INTEGER) &&
          ddepth == CV_16S)) )
        bits = ddepth == CV_8U ? 8 : 0;
        _rowKernel.convertTo( rowKernel, CV_32S, 1 << bits );
        _columnKernel.convertTo( columnKernel, CV_32S, 1 << bits );
        _delta *= (1 << bits);

        if( _rowKernel.type() != bdepth )
            _rowKernel.convertTo( rowKernel, bdepth );
        else
            rowKernel = _rowKernel;
        if( _columnKernel.type() != bdepth )
            _columnKernel.convertTo( columnKernel, bdepth );
        else
            columnKernel = _columnKernel;

    int _bufType = CV_MAKETYPE(bdepth, cn);
    Ptr<BaseRowFilter> _rowFilter = getLinearRowFilter(
        _srcType, _bufType, rowKernel, _anchor.x, rtype);
    Ptr<BaseColumnFilter> _columnFilter = getLinearColumnFilter(
        _bufType, _dstType, columnKernel, _anchor.y, ctype, _delta, bits );

    return Ptr<FilterEngine>( new FilterEngine(Ptr<BaseFilter>(), _rowFilter, _columnFilter,
        _srcType, _dstType, _bufType, _rowBorderType, _columnBorderType, _borderValue ));
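// Illustrative use of the separable path (a sketch, not part of the library
// code; the Mat names are hypothetical). A 3x3 Gaussian applied through the
// engine created above could look like:
//
//     Mat src = ..., dst;                        // e.g. a CV_8UC1 image
//     Mat k = getGaussianKernel(3, -1, CV_32F);  // [0.25, 0.5, 0.25]
//     Ptr<FilterEngine> f = createSeparableLinearFilter(
//         src.type(), src.type(), k, k, Point(-1,-1), 0,
//         BORDER_DEFAULT, BORDER_DEFAULT);
//     f->apply(src, dst);
//
// With an 8U source and destination this takes the fixed-point branch above
// (bits == 8): both kernels are converted to CV_32S scaled by 1 << 8 and the
// column filter's cast shifts the accumulated result back down.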
/****************************************************************************************\
*                               Non-separable linear filter                              *
\****************************************************************************************/

void preprocess2DKernel( const Mat& kernel, std::vector<Point>& coords, std::vector<uchar>& coeffs )
    int i, j, k, nz = countNonZero(kernel), ktype = kernel.type();
    CV_Assert( ktype == CV_8U || ktype == CV_32S || ktype == CV_32F || ktype == CV_64F );
    coords.resize(nz);
    coeffs.resize(nz*getElemSize(ktype));
    uchar* _coeffs = &coeffs[0];

    for( i = k = 0; i < kernel.rows; i++ )
        const uchar* krow = kernel.ptr(i);
        for( j = 0; j < kernel.cols; j++ )
            if( ktype == CV_8U )
                uchar val = krow[j];
                coords[k] = Point(j,i);
                _coeffs[k++] = val;
            else if( ktype == CV_32S )
                int val = ((const int*)krow)[j];
                coords[k] = Point(j,i);
                ((int*)_coeffs)[k++] = val;
            else if( ktype == CV_32F )
                float val = ((const float*)krow)[j];
                coords[k] = Point(j,i);
                ((float*)_coeffs)[k++] = val;

                double val = ((const double*)krow)[j];
                coords[k] = Point(j,i);
                ((double*)_coeffs)[k++] = val;
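// Example of what preprocess2DKernel() produces (worked out by hand, for
// illustration only): for the CV_32F sharpening kernel
//
//     [  0 -1  0 ]
//     [ -1  5 -1 ]
//     [  0 -1  0 ]
//
// nz == 5, the scan is row-major, and so
//     coords = { (1,0), (0,1), (1,1), (2,1), (1,2) }   // Point(x = j, y = i)
//     coeffs = { -1, -1, 5, -1, -1 }                    // packed as float[]
// Filter2D below then dereferences one source pointer per non-zero tap only,
// which is what makes sparse kernels cheap.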
template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFilter
    typedef typename CastOp::type1 KT;
    typedef typename CastOp::rtype DT;

    Filter2D( const Mat& _kernel, Point _anchor,
              double _delta, const CastOp& _castOp=CastOp(),
              const VecOp& _vecOp=VecOp() )
        ksize = _kernel.size();
        delta = saturate_cast<KT>(_delta);
        CV_Assert( _kernel.type() == DataType<KT>::type );
        preprocess2DKernel( _kernel, coords, coeffs );
        ptrs.resize( coords.size() );

    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn)
        const Point* pt = &coords[0];
        const KT* kf = (const KT*)&coeffs[0];
        const ST** kp = (const ST**)&ptrs[0];
        int i, k, nz = (int)coords.size();
        CastOp castOp = castOp0;

        for( ; count > 0; count--, dst += dststep, src++ )
            for( k = 0; k < nz; k++ )
                kp[k] = (const ST*)src[pt[k].y] + pt[k].x*cn;

            i = vecOp((const uchar**)kp, dst, width);
            #if CV_ENABLE_UNROLLED
            for( ; i <= width - 4; i += 4 )
                KT s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;

                for( k = 0; k < nz; k++ )
                    const ST* sptr = kp[k] + i;

                D[i] = castOp(s0); D[i+1] = castOp(s1);
                D[i+2] = castOp(s2); D[i+3] = castOp(s3);

            for( ; i < width; i++ )
                for( k = 0; k < nz; k++ )
                    s0 += kf[k]*kp[k][i];

    std::vector<Point> coords;
    std::vector<uchar> coeffs;
    std::vector<uchar*> ptrs;
#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain))
#define ROUNDUP(sz, n) ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n)))
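// Both macros assume a positive grain: DIVUP rounds an integer quotient up
// and ROUNDUP rounds a size up to the next multiple of n. For example,
// DIVUP(10, 4) == 3, ROUNDUP(10, 4) == 12 and ROUNDUP(8, 4) == 8
// (already-aligned values are left unchanged).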
// Prepare the kernel: transpose it and double each row (+align).
// Returns the size of the aligned row; the trailing zeros are alignment padding.
template <typename T>
static int _prepareKernelFilter2D(std::vector<T> & data, const Mat & kernel)
    Mat _kernel; kernel.convertTo(_kernel, DataDepth<T>::value);
    int size_y_aligned = ROUNDUP(kernel.rows * 2, 4);
    data.clear(); data.resize(size_y_aligned * kernel.cols, 0);
    for (int x = 0; x < kernel.cols; x++)
        for (int y = 0; y < kernel.rows; y++)
            data[x * size_y_aligned + y] = _kernel.at<T>(y, x);
            data[x * size_y_aligned + y + kernel.rows] = _kernel.at<T>(y, x);
    return size_y_aligned;
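// Worked example of the layout (for illustration): for a kernel with 3 rows,
// size_y_aligned == ROUNDUP(6, 4) == 8 and each source column x is stored as
//
//     data[x*8 + 0..2] = kernel(0..2, x)   // the column itself
//     data[x*8 + 3..5] = kernel(0..2, x)   // the duplicate ("double rows")
//     data[x*8 + 6..7] = 0                 // alignment padding
//
// i.e. the matrix is laid out transposed (column-major) with every column
// doubled, matching what the "filter2D" OpenCL kernel expects.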
static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
                          InputArray _kernel, Point anchor,
                          double delta, int borderType )
    int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    ddepth = ddepth < 0 ? sdepth : ddepth;
    int dtype = CV_MAKE_TYPE(ddepth, cn), wdepth = std::max(std::max(sdepth, ddepth), CV_32F),
        wtype = CV_MAKE_TYPE(wdepth, cn);

    Size ksize = _kernel.size();
        anchor.x = ksize.width / 2;
        anchor.y = ksize.height / 2;

    bool isolated = (borderType & BORDER_ISOLATED) != 0;
    borderType &= ~BORDER_ISOLATED;
    const cv::ocl::Device &device = cv::ocl::Device::getDefault();
    bool doubleSupport = device.doubleFPConfig() > 0;
    if (wdepth == CV_64F && !doubleSupport)

    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT",
                                       "BORDER_WRAP", "BORDER_REFLECT_101" };

    cv::Mat kernelMat = _kernel.getMat();
    cv::Size sz = _src.size(), wholeSize;
    size_t globalsize[2] = { (size_t)sz.width, (size_t)sz.height };
    size_t localsize_general[2] = {0, 1};
    size_t* localsize = NULL;

    UMat src = _src.getUMat();
    src.locateROI(wholeSize, ofs);

    size_t tryWorkItems = device.maxWorkGroupSize();
    if (device.isIntel() && 128 < tryWorkItems)

    // For smaller filter kernels, there is a special kernel that is more
    // efficient than the general one.
    UMat kernelDataUMat;
    if (device.isIntel() && (device.type() & ocl::Device::TYPE_GPU) &&
        ((ksize.width < 5 && ksize.height < 5) ||
         (ksize.width == 5 && ksize.height == 5 && cn == 1)))
        kernelMat = kernelMat.reshape(0, 1);
        String kerStr = ocl::kernelToStr(kernelMat, CV_32F);
        int h = isolated ? sz.height : wholeSize.height;
        int w = isolated ? sz.width : wholeSize.width;

        if (w < ksize.width || h < ksize.height)

        // Figure out what vector size to use for loading the pixels.
        int pxLoadNumPixels = cn != 1 || sz.width % 4 ? 1 : 4;
        int pxLoadVecSize = cn * pxLoadNumPixels;

        // Figure out how many pixels per work item to compute in X and Y
        // directions. Too many and we run out of registers.
        int pxPerWorkItemX = 1;
        int pxPerWorkItemY = 1;
        if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4)
            pxPerWorkItemX = sz.width % 8 ? sz.width % 4 ? sz.width % 2 ? 1 : 2 : 4 : 8;
            pxPerWorkItemY = sz.height % 2 ? 1 : 2;
        else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4))
            pxPerWorkItemX = sz.width % 2 ? 1 : 2;
            pxPerWorkItemY = sz.height % 2 ? 1 : 2;
        globalsize[0] = sz.width / pxPerWorkItemX;
        globalsize[1] = sz.height / pxPerWorkItemY;

        // Need some padding in the private array for pixels
        int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels);

        // Make the global size a nice round number so the runtime can pick
        // from reasonable choices for the workgroup size
        const int wgRound = 256;
        globalsize[0] = ROUNDUP(globalsize[0], wgRound);

        char build_options[1024];
        sprintf(build_options, "-D cn=%d "
                "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
                "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d "
                "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s "
                "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d "
                "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
                "-D convertToWT=%s -D convertToDstT=%s %s",
                cn, anchor.x, anchor.y, ksize.width, ksize.height,
                pxLoadVecSize, pxLoadNumPixels,
                pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType],
                isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
                privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1,
                ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
                ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
                ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
                ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), kerStr.c_str());

        if (!k.create("filter2DSmall", cv::ocl::imgproc::filter2DSmall_oclsrc, build_options))
        localsize = localsize_general;
        std::vector<float> kernelMatDataFloat;
        int kernel_size_y2_aligned = _prepareKernelFilter2D<float>(kernelMatDataFloat, kernelMat);
        String kerStr = ocl::kernelToStr(kernelMatDataFloat, CV_32F);

            size_t BLOCK_SIZE = tryWorkItems;
            while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)sz.width * 2)

            if ((size_t)ksize.width > BLOCK_SIZE)

            int requiredTop = anchor.y;
            int requiredLeft = (int)BLOCK_SIZE; // not this: anchor.x;
            int requiredBottom = ksize.height - 1 - anchor.y;
            int requiredRight = (int)BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
            int h = isolated ? sz.height : wholeSize.height;
            int w = isolated ? sz.width : wholeSize.width;
            bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;

            if ((w < ksize.width) || (h < ksize.height))

            String opts = format("-D LOCAL_SIZE=%d -D cn=%d "
                                 "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
                                 "-D KERNEL_SIZE_Y2_ALIGNED=%d -D %s -D %s -D %s%s%s "
                                 "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
                                 "-D convertToWT=%s -D convertToDstT=%s",
                                 (int)BLOCK_SIZE, cn, anchor.x, anchor.y,
                                 ksize.width, ksize.height, kernel_size_y2_aligned, borderMap[borderType],
                                 extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
                                 isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
                                 doubleSupport ? " -D DOUBLE_SUPPORT" : "", kerStr.c_str(),
                                 ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
                                 ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
                                 ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
                                 ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]));

            localsize[0] = BLOCK_SIZE;
            globalsize[0] = DIVUP(sz.width, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE;
            globalsize[1] = sz.height;

            if (!k.create("filter2D", cv::ocl::imgproc::filter2D_oclsrc, opts))

            size_t kernelWorkGroupSize = k.workGroupSize();
            if (localsize[0] <= kernelWorkGroupSize)
            if (BLOCK_SIZE < kernelWorkGroupSize)
            tryWorkItems = kernelWorkGroupSize;

    _dst.create(sz, dtype);
    UMat dst = _dst.getUMat();

    int srcOffsetX = (int)((src.offset % src.step) / src.elemSize());
    int srcOffsetY = (int)(src.offset / src.step);
    int srcEndX = (isolated ? (srcOffsetX + sz.width) : wholeSize.width);
    int srcEndY = (isolated ? (srcOffsetY + sz.height) : wholeSize.height);

    k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffsetX, srcOffsetY,
           srcEndX, srcEndY, ocl::KernelArg::WriteOnly(dst), (float)delta);

    return k.run(2, globalsize, localsize, false);
const int shift_bits = 8;
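// With INTEGER_ARITHMETIC enabled (the 8U fast path selected further below),
// each separable kernel is scaled to integers by 1 << shift_bits, so the row
// pass produces results carrying one factor of 2^8 and the product of both
// passes carries 2^16; the column pass therefore compensates with
// SHIFT_BITS = 2*shift_bits when converting back to 8-bit.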
static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor,
                               int borderType, int ddepth, bool fast8uc1, bool int_arithm)
    int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type);
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    Size bufSize = buf.size();
    int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type);

    if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))

#ifdef ANDROID
    size_t localsize[2] = {16, 10};
#else
    size_t localsize[2] = {16, 16};
#endif

    size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]};
    if (fast8uc1)
        globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0];

    int radiusX = anchor, radiusY = (buf.rows - src.rows) >> 1;

    bool isolated = (borderType & BORDER_ISOLATED) != 0;
    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" },
        * const btype = borderMap[borderType & ~BORDER_ISOLATED];

    bool extra_extrapolation = src.rows < (int)((-radiusY + globalsize[1]) >> 1) + 1;
    extra_extrapolation |= src.rows < radiusY;
    extra_extrapolation |= src.cols < (int)((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1;
    extra_extrapolation |= src.cols < radiusX;

    cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s"
                                          " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s%s",
                                          radiusX, (int)localsize[0], (int)localsize[1], cn, btype,
                                          extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
                                          isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
                                          ocl::typeToStr(type), ocl::typeToStr(buf_type),
                                          ocl::convertTypeStr(sdepth, bdepth, cn, cvt),
                                          ocl::typeToStr(sdepth), ocl::typeToStr(bdepth),
                                          doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                                          int_arithm ? " -D INTEGER_ARITHMETIC" : "");
    build_options += ocl::kernelToStr(kernelX, bdepth);

    Size srcWholeSize; Point srcOffset;
    src.locateROI(srcWholeSize, srcOffset);

    String kernelName("row_filter");
    if (fast8uc1)
        kernelName += "_C1_D0";

    ocl::Kernel k(kernelName.c_str(), cv::ocl::imgproc::filterSepRow_oclsrc,

    if (fast8uc1)
        k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x,
               srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height,
               ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()),
               buf.cols, buf.rows, radiusY);
    else
        k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffset.x,
               srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height,
               ocl::KernelArg::PtrWriteOnly(buf), (int)buf.step, buf.cols, buf.rows, radiusY);

    return k.run(2, globalsize, localsize, false);
static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm)
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    if (dst.depth() == CV_64F && !doubleSupport)

#ifdef ANDROID
    size_t localsize[2] = { 16, 10 };
#else
    size_t localsize[2] = { 16, 16 };
#endif
    size_t globalsize[2] = { 0, 0 };

    int dtype = dst.type(), cn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype);
    Size sz = dst.size();
    int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type);

    globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1];
    globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];

    cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d"
                                          " -D srcT=%s -D dstT=%s -D convertToDstT=%s"
                                          " -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s",
                                          anchor, (int)localsize[0], (int)localsize[1], cn,
                                          ocl::typeToStr(buf_type), ocl::typeToStr(dtype),
                                          ocl::convertTypeStr(bdepth, ddepth, cn, cvt),
                                          ocl::typeToStr(bdepth), ocl::typeToStr(ddepth),
                                          2*shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                                          int_arithm ? " -D INTEGER_ARITHMETIC" : "");
    build_options += ocl::kernelToStr(kernelY, bdepth);

    ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc,

    k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst),
           static_cast<float>(delta));

    return k.run(2, globalsize, localsize, false);
const int optimizedSepFilterLocalWidth  = 16;
const int optimizedSepFilterLocalHeight = 8;

static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
                                       Mat row_kernel, Mat col_kernel,
                                       double delta, int borderType, int ddepth, int bdepth, bool int_arithm)
    Size size = _src.size(), wholeSize;
    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
        esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth),
        dtype = CV_MAKE_TYPE(ddepth, cn);
    size_t src_step = _src.step(), src_offset = _src.offset();
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;

    if ((src_offset % src_step) % esz != 0 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
        !(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE ||
          borderType == BORDER_REFLECT || borderType == BORDER_WRAP ||
          borderType == BORDER_REFLECT_101))

    size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight };
    size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1]};

    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
                                       "BORDER_REFLECT_101" };

    String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s"
                             " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s"
                             " -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s -D CN=%d -D SHIFT_BITS=%d%s",
                             (int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2,
                             ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(),
                             ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(),
                             ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
                             ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype),
                             ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType],
                             ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), ocl::typeToStr(wdepth),
                             cn, 2*shift_bits, int_arithm ? " -D INTEGER_ARITHMETIC" : "");

    ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts);

    UMat src = _src.getUMat();
    _dst.create(size, dtype);
    UMat dst = _dst.getUMat();

    int src_offset_x = static_cast<int>((src_offset % src_step) / esz);
    int src_offset_y = static_cast<int>(src_offset / src_step);

    src.locateROI(wholeSize, origin);

    k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y,
           wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst),
           static_cast<float>(delta));

    return k.run(2, gt2, lt2, false);
4363
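// Illustrative sketch (excluded from the build): the single-pass kernel still
// assumes the 2D filter factors as K = col * row^T; with such a pair,
// cv::sepFilter2D and cv::filter2D agree up to rounding. A quick self-check
// with a 3x3 box kernel:
#if 0
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
int main()
{
    cv::Mat row = (cv::Mat_<float>(1, 3) << 1.f/3, 1.f/3, 1.f/3);
    cv::Mat col = row.t();
    cv::Mat k2d = col * row; // 3x3 box kernel, every tap = 1/9

    cv::Mat img(64, 64, CV_8UC1), a, b, d;
    cv::randu(img, 0, 255);
    cv::sepFilter2D(img, a, -1, row, col);
    cv::filter2D(img, b, -1, k2d);
    cv::absdiff(a, b, d);
    return cv::countNonZero(d > 1); // expect 0
}
#endif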
static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
                             InputArray _kernelX, InputArray _kernelY, Point anchor,
                             double delta, int borderType )
{
    const ocl::Device & d = ocl::Device::getDefault();
    Size imgSize = _src.size();

    int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    if (cn > 4)
        return false;

    Mat kernelX = _kernelX.getMat().reshape(1, 1);
    if (kernelX.cols % 2 != 1)
        return false;
    Mat kernelY = _kernelY.getMat().reshape(1, 1);
    if (kernelY.cols % 2 != 1)
        return false;

    if (ddepth < 0)
        ddepth = sdepth;

    if (anchor.x < 0)
        anchor.x = kernelX.cols >> 1;
    if (anchor.y < 0)
        anchor.y = kernelY.cols >> 1;

    int rtype = getKernelType(kernelX,
        kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x));
    int ctype = getKernelType(kernelY,
        kernelY.rows == 1 ? Point(anchor.y, 0) : Point(0, anchor.y));

    int bdepth = CV_32F;
    bool int_arithm = false;
    if( sdepth == CV_8U && ddepth == CV_8U &&
        rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
        ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL)
    {
        if (ocl::Device::getDefault().isIntel())
        {
            for (int i=0; i<kernelX.cols; i++)
                kernelX.at<float>(0, i) = (float) cvRound(kernelX.at<float>(0, i) * (1 << shift_bits));
            if (kernelX.data != kernelY.data)
                for (int i=0; i<kernelX.cols; i++)
                    kernelY.at<float>(0, i) = (float) cvRound(kernelY.at<float>(0, i) * (1 << shift_bits));
        }
        else
        {
            bdepth = CV_32S;
            kernelX.convertTo( kernelX, bdepth, 1 << shift_bits );
            kernelY.convertTo( kernelY, bdepth, 1 << shift_bits );
        }
        int_arithm = true;
    }

    CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 &&
                imgSize.width > optimizedSepFilterLocalWidth + anchor.x &&
                imgSize.height > optimizedSepFilterLocalHeight + anchor.y &&
                (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
                anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
                (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())),
                ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta,
                                           borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true)

    UMat src = _src.getUMat();
    Size srcWholeSize; Point srcOffset;
    src.locateROI(srcWholeSize, srcOffset);

    bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 &&
            src.cols % 4 == 0 && src.step % 4 == 0;

    Size srcSize = src.size();
    Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
    UMat buf(bufSize, CV_MAKETYPE(bdepth, cn));
    if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm))
        return false;

    _dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
    UMat dst = _dst.getUMat();

    return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm);
}

#endif

}
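// Illustrative sketch (excluded from the build): the integer-arithmetic path
// above pre-scales each 1D kernel by (1 << shift_bits), so after the row and
// column passes the accumulator carries 2*shift_bits extra bits (hence the
// SHIFT_BITS=2*shift_bits macro in ocl_sepColFilter2D). kShift below is a
// stand-in for this file's shift_bits constant.
#if 0
#include <cstdio>
int main()
{
    const int kShift = 8;
    const double w = 0.25;                               // one fractional tap
    const int wFixed = (int)(w * (1 << kShift) + 0.5);   // 64
    const int pixel = 200;
    long long acc = (long long)pixel * wFixed * wFixed;  // row pass, then column pass
    std::printf("%d\n", (int)(acc >> (2 * kShift)));     // 12 ~= 200 * 0.25 * 0.25
    return 0;
}
#endif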
cv::Ptr<cv::BaseFilter> cv::getLinearFilter(int srcType, int dstType,
                                InputArray filter_kernel, Point anchor,
                                double delta, int bits)
{
    Mat _kernel = filter_kernel.getMat();
    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
    int cn = CV_MAT_CN(srcType), kdepth = _kernel.depth();
    CV_Assert( cn == CV_MAT_CN(dstType) && ddepth >= sdepth );

    anchor = normalizeAnchor(anchor, _kernel.size());

    /*if( sdepth == CV_8U && ddepth == CV_8U && kdepth == CV_32S )
        return makePtr<Filter2D<uchar, FixedPtCastEx<int, uchar>, FilterVec_8u> >
            (_kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits),
            FilterVec_8u(_kernel, bits, delta));
    if( sdepth == CV_8U && ddepth == CV_16S && kdepth == CV_32S )
        return makePtr<Filter2D<uchar, FixedPtCastEx<int, short>, FilterVec_8u16s> >
            (_kernel, anchor, delta, FixedPtCastEx<int, short>(bits),
            FilterVec_8u16s(_kernel, bits, delta));*/

    kdepth = sdepth == CV_64F || ddepth == CV_64F ? CV_64F : CV_32F;
    Mat kernel;
    if( _kernel.type() == kdepth )
        kernel = _kernel;
    else
        _kernel.convertTo(kernel, kdepth, _kernel.type() == CV_32S ? 1./(1 << bits) : 1.);

    if( sdepth == CV_8U && ddepth == CV_8U )
        return makePtr<Filter2D<uchar, Cast<float, uchar>, FilterVec_8u> >
            (kernel, anchor, delta, Cast<float, uchar>(), FilterVec_8u(kernel, 0, delta));
    if( sdepth == CV_8U && ddepth == CV_16U )
        return makePtr<Filter2D<uchar,
            Cast<float, ushort>, FilterNoVec> >(kernel, anchor, delta);
    if( sdepth == CV_8U && ddepth == CV_16S )
        return makePtr<Filter2D<uchar, Cast<float, short>, FilterVec_8u16s> >
            (kernel, anchor, delta, Cast<float, short>(), FilterVec_8u16s(kernel, 0, delta));
    if( sdepth == CV_8U && ddepth == CV_32F )
        return makePtr<Filter2D<uchar,
            Cast<float, float>, FilterNoVec> >(kernel, anchor, delta);
    if( sdepth == CV_8U && ddepth == CV_64F )
        return makePtr<Filter2D<uchar,
            Cast<double, double>, FilterNoVec> >(kernel, anchor, delta);

    if( sdepth == CV_16U && ddepth == CV_16U )
        return makePtr<Filter2D<ushort,
            Cast<float, ushort>, FilterNoVec> >(kernel, anchor, delta);
    if( sdepth == CV_16U && ddepth == CV_32F )
        return makePtr<Filter2D<ushort,
            Cast<float, float>, FilterNoVec> >(kernel, anchor, delta);
    if( sdepth == CV_16U && ddepth == CV_64F )
        return makePtr<Filter2D<ushort,
            Cast<double, double>, FilterNoVec> >(kernel, anchor, delta);

    if( sdepth == CV_16S && ddepth == CV_16S )
        return makePtr<Filter2D<short,
            Cast<float, short>, FilterNoVec> >(kernel, anchor, delta);
    if( sdepth == CV_16S && ddepth == CV_32F )
        return makePtr<Filter2D<short,
            Cast<float, float>, FilterNoVec> >(kernel, anchor, delta);
    if( sdepth == CV_16S && ddepth == CV_64F )
        return makePtr<Filter2D<short,
            Cast<double, double>, FilterNoVec> >(kernel, anchor, delta);

    if( sdepth == CV_32F && ddepth == CV_32F )
        return makePtr<Filter2D<float, Cast<float, float>, FilterVec_32f> >
            (kernel, anchor, delta, Cast<float, float>(), FilterVec_32f(kernel, 0, delta));
    if( sdepth == CV_64F && ddepth == CV_64F )
        return makePtr<Filter2D<double,
            Cast<double, double>, FilterNoVec> >(kernel, anchor, delta);

    CV_Error_( CV_StsNotImplemented,
        ("Unsupported combination of source format (=%d), and destination format (=%d)",
        srcType, dstType));

    return Ptr<BaseFilter>();
}
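// Illustrative sketch (excluded from the build): the Cast<float, uchar> and
// related functors selected above boil down to cv::saturate_cast, which
// rounds and clamps to the destination range:
#if 0
#include <opencv2/core.hpp>
#include <cstdio>
int main()
{
    std::printf("%d %d %d\n",
                cv::saturate_cast<uchar>(-5.0f),    // 0   (clamped low)
                cv::saturate_cast<uchar>(86.7f),    // 87  (rounded)
                cv::saturate_cast<uchar>(300.0f));  // 255 (clamped high)
    return 0;
}
#endif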
cv::Ptr<cv::FilterEngine> cv::createLinearFilter( int _srcType, int _dstType,
                                                  InputArray filter_kernel,
                                                  Point _anchor, double _delta,
                                                  int _rowBorderType, int _columnBorderType,
                                                  const Scalar& _borderValue )
{
    Mat _kernel = filter_kernel.getMat();
    _srcType = CV_MAT_TYPE(_srcType);
    _dstType = CV_MAT_TYPE(_dstType);
    int cn = CV_MAT_CN(_srcType);
    CV_Assert( cn == CV_MAT_CN(_dstType) );

    Mat kernel = _kernel;
    int bits = 0;

    /*int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
    int ktype = _kernel.depth() == CV_32S ? KERNEL_INTEGER : getKernelType(_kernel, _anchor);
    if( sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S) &&
        _kernel.rows*_kernel.cols <= (1 << 10) )
    {
        bits = (ktype & KERNEL_INTEGER) ? 0 : 11;
        _kernel.convertTo(kernel, CV_32S, 1 << bits);
    }*/

    Ptr<BaseFilter> _filter2D = getLinearFilter(_srcType, _dstType,
        kernel, _anchor, _delta, bits);

    return makePtr<FilterEngine>(_filter2D, Ptr<BaseRowFilter>(),
        Ptr<BaseColumnFilter>(), _srcType, _dstType, _srcType,
        _rowBorderType, _columnBorderType, _borderValue );
}
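// Illustrative sketch (excluded from the build): driving the engine returned
// above directly, the same way cv::filter2D does on its small-kernel path.
// Uses only declarations already visible in this translation unit; the helper
// name and demo values are hypothetical.
#if 0
static void demoLinearFilterEngine()
{
    cv::Mat src(100, 100, CV_8UC1, cv::Scalar(42)), dst(src.size(), src.type());
    cv::Mat kernel = cv::Mat::ones(3, 3, CV_32F) / 9.f; // normalized box blur
    cv::Ptr<cv::FilterEngine> f = cv::createLinearFilter(src.type(), dst.type(),
        kernel, cv::Point(-1,-1), 0, cv::BORDER_REPLICATE, cv::BORDER_REPLICATE);
    f->apply(src, dst, cv::Rect(0,0,-1,-1), cv::Point(), false);
}
#endif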
#ifdef HAVE_IPP
static bool ipp_filter2D( InputArray _src, OutputArray _dst, int ddepth,
                          InputArray _kernel, Point anchor0,
                          double delta, int borderType )
{
#if IPP_VERSION_X100 >= 710
    Mat src = _src.getMat(), kernel = _kernel.getMat();

    if( ddepth < 0 )
        ddepth = src.depth();

    _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
    Mat dst = _dst.getMat();
    Point anchor = normalizeAnchor(anchor0, kernel.size());

    typedef IppStatus (CV_STDCALL * ippiFilterBorder)(const void * pSrc, int srcStep, void * pDst, int dstStep, IppiSize dstRoiSize,
                                                      IppiBorderType border, const void * borderValue,
                                                      const IppiFilterBorderSpec* pSpec, Ipp8u* pBuffer);

    int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
            ktype = kernel.type(), kdepth = CV_MAT_DEPTH(ktype);
    bool isolated = (borderType & BORDER_ISOLATED) != 0;
#if IPP_VERSION_X100 >= 900
    Point ippAnchor((kernel.cols-1)/2, (kernel.rows-1)/2);
#else
    Point ippAnchor(kernel.cols >> 1, kernel.rows >> 1);
#endif
    int borderTypeNI = borderType & ~BORDER_ISOLATED;
    IppiBorderType ippBorderType = ippiGetBorderType(borderTypeNI);

    if (borderTypeNI == BORDER_CONSTANT || borderTypeNI == BORDER_REPLICATE)
    {
        ippiFilterBorder ippFunc =
            stype == CV_8UC1 ? (ippiFilterBorder)ippiFilterBorder_8u_C1R :
            stype == CV_8UC3 ? (ippiFilterBorder)ippiFilterBorder_8u_C3R :
            stype == CV_8UC4 ? (ippiFilterBorder)ippiFilterBorder_8u_C4R :
            stype == CV_16UC1 ? (ippiFilterBorder)ippiFilterBorder_16u_C1R :
            stype == CV_16UC3 ? (ippiFilterBorder)ippiFilterBorder_16u_C3R :
            stype == CV_16UC4 ? (ippiFilterBorder)ippiFilterBorder_16u_C4R :
            stype == CV_16SC1 ? (ippiFilterBorder)ippiFilterBorder_16s_C1R :
            stype == CV_16SC3 ? (ippiFilterBorder)ippiFilterBorder_16s_C3R :
            stype == CV_16SC4 ? (ippiFilterBorder)ippiFilterBorder_16s_C4R :
            stype == CV_32FC1 ? (ippiFilterBorder)ippiFilterBorder_32f_C1R :
            stype == CV_32FC3 ? (ippiFilterBorder)ippiFilterBorder_32f_C3R :
            stype == CV_32FC4 ? (ippiFilterBorder)ippiFilterBorder_32f_C4R : 0;

        if (sdepth == ddepth && (ktype == CV_16SC1 || ktype == CV_32FC1) &&
            ippFunc && (int)ippBorderType >= 0 && (!src.isSubmatrix() || isolated) &&
            std::fabs(delta - 0) < DBL_EPSILON && ippAnchor == anchor && dst.data != src.data)
        {
            IppiSize kernelSize = { kernel.cols, kernel.rows }, dstRoiSize = { dst.cols, dst.rows };
            IppDataType dataType = ippiGetDataType(ddepth), kernelType = ippiGetDataType(kdepth);
            Ipp32s specSize = 0, bufsize = 0;
            IppStatus status = (IppStatus)-1;

            if ((status = ippiFilterBorderGetSize(kernelSize, dstRoiSize, dataType, kernelType, cn, &specSize, &bufsize)) >= 0)
            {
                IppAutoBuffer<IppiFilterBorderSpec> spec(specSize);
                IppAutoBuffer<Ipp8u> buffer(bufsize);
                Ipp32f borderValue[4] = { 0, 0, 0, 0 };

                if(kdepth == CV_32F)
                {
                    Ipp32f *pKerBuffer = (Ipp32f*)kernel.data;
                    IppAutoBuffer<Ipp32f> kerTmp;
                    int kerStep = sizeof(Ipp32f)*kernelSize.width;
#if IPP_VERSION_X100 >= 900
                    if((int)kernel.step != kerStep)
                    {
                        kerTmp.Alloc(kerStep*kernelSize.height);
                        if(ippiCopy_32f_C1R((Ipp32f*)kernel.data, (int)kernel.step, kerTmp, kerStep, kernelSize) < 0)
                            return false;
                        pKerBuffer = kerTmp;
                    }
#else
                    kerTmp.Alloc(kerStep*kernelSize.height);
                    Mat kerFlip(Size(kernelSize.width, kernelSize.height), CV_32FC1, kerTmp, kerStep);
                    flip(kernel, kerFlip, -1);
                    pKerBuffer = kerTmp;
#endif

                    if((status = ippiFilterBorderInit_32f(pKerBuffer, kernelSize,
                                                          dataType, cn, ippRndFinancial, spec)) >= 0 )
                    {
                        status = ippFunc(src.data, (int)src.step, dst.data, (int)dst.step, dstRoiSize,
                                         ippBorderType, borderValue, spec, buffer);
                    }
                }
                else if(kdepth == CV_16S)
                {
                    Ipp16s *pKerBuffer = (Ipp16s*)kernel.data;
                    IppAutoBuffer<Ipp16s> kerTmp;
                    int kerStep = sizeof(Ipp16s)*kernelSize.width;
#if IPP_VERSION_X100 >= 900
                    if((int)kernel.step != kerStep)
                    {
                        kerTmp.Alloc(kerStep*kernelSize.height);
                        if(ippiCopy_16s_C1R((Ipp16s*)kernel.data, (int)kernel.step, kerTmp, kerStep, kernelSize) < 0)
                            return false;
                        pKerBuffer = kerTmp;
                    }
#else
                    kerTmp.Alloc(kerStep*kernelSize.height);
                    Mat kerFlip(Size(kernelSize.width, kernelSize.height), CV_16SC1, kerTmp, kerStep);
                    flip(kernel, kerFlip, -1);
                    pKerBuffer = kerTmp;
#endif

                    if((status = ippiFilterBorderInit_16s(pKerBuffer, kernelSize,
                                                          0, dataType, cn, ippRndFinancial, spec)) >= 0)
                    {
                        status = ippFunc(src.data, (int)src.step, dst.data, (int)dst.step, dstRoiSize,
                                         ippBorderType, borderValue, spec, buffer);
                    }
                }

                if (status >= 0)
                {
                    CV_IMPL_ADD(CV_IMPL_IPP);
                    return true;
                }
            }
        }
    }
#else
    CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(ddepth); CV_UNUSED(_kernel); CV_UNUSED(anchor0); CV_UNUSED(delta); CV_UNUSED(borderType);
#endif
    return false;
}
#endif
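// Illustrative sketch (excluded from the build): flip(kernel, kerFlip, -1)
// above mirrors the kernel about both axes. That converts between correlation
// (the filter2D convention) and true convolution, which is what the pre-9.0
// IPP FilterBorder entry points expect:
#if 0
#include <opencv2/core.hpp>
int main()
{
    cv::Mat k = (cv::Mat_<float>(2, 2) << 1, 2,
                                          3, 4), kf;
    cv::flip(k, kf, -1); // kf = [4 3; 2 1]
    return 0;
}
#endif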
void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth,
                   InputArray _kernel, Point anchor0,
                   double delta, int borderType )
{
    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
               ocl_filter2D(_src, _dst, ddepth, _kernel, anchor0, delta, borderType))

    Mat src = _src.getMat(), kernel = _kernel.getMat();

    if( ddepth < 0 )
        ddepth = src.depth();

#if CV_SSE2
    int dft_filter_size = ((src.depth() == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) ||
        (src.depth() == CV_32F && ddepth == CV_32F)) && checkHardwareSupport(CV_CPU_SSE3)? 130 : 50;
#else
    int dft_filter_size = 50;
#endif

    _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
    Mat dst = _dst.getMat();
    Point anchor = normalizeAnchor(anchor0, kernel.size());

    CV_IPP_RUN(true, ipp_filter2D(_src, _dst, ddepth, _kernel, anchor0, delta, borderType));

#ifdef HAVE_TEGRA_OPTIMIZATION
    if( tegra::useTegra() && tegra::filter2D(src, dst, kernel, anchor, delta, borderType) )
        return;
#endif

    if( kernel.cols*kernel.rows >= dft_filter_size )
    {
        Mat temp;
        // crossCorr doesn't accept non-zero delta with multiple channels
        if( src.channels() != 1 && delta != 0 )
        {
            // The semantics of filter2D require that the delta be applied
            // as floating-point math. So we need an intermediate Mat
            // with a float datatype. If the dest is already floats,
            // we just use that.
            int corrDepth = dst.depth();
            if( (dst.depth() == CV_32F || dst.depth() == CV_64F) &&
                src.data != dst.data )
            {
                temp = dst;
            }
            else
            {
                corrDepth = dst.depth() == CV_64F ? CV_64F : CV_32F;
                temp.create( dst.size(), CV_MAKETYPE(corrDepth, dst.channels()) );
            }
            crossCorr( src, kernel, temp, src.size(),
                       CV_MAKETYPE(corrDepth, src.channels()),
                       anchor, 0, borderType );
            add( temp, delta, temp );
            if ( temp.data != dst.data )
            {
                temp.convertTo( dst, dst.type() );
            }
        }
        else
        {
            if( src.data != dst.data )
                temp = dst;
            else
                temp.create(dst.size(), dst.type());
            crossCorr( src, kernel, temp, src.size(),
                       CV_MAKETYPE(ddepth, src.channels()),
                       anchor, delta, borderType );
            if( temp.data != dst.data )
                temp.copyTo(dst);
        }
        return;
    }

    Ptr<FilterEngine> f = createLinearFilter(src.type(), dst.type(), kernel,
                                             anchor, delta, borderType & ~BORDER_ISOLATED );
    f->apply(src, dst, Rect(0,0,-1,-1), Point(), (borderType & BORDER_ISOLATED) != 0 );
}
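// Usage sketch (excluded from the build): small kernels take the direct
// FilterEngine path above; once kernel.cols*kernel.rows reaches
// dft_filter_size, the crossCorr (DFT-based) branch runs instead. The kernel
// sizes below are chosen to hit each branch.
#if 0
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
int main()
{
    cv::Mat src(512, 512, CV_8UC1, cv::Scalar(128)), out1, out2;
    cv::Mat sharpen = (cv::Mat_<float>(3, 3) <<  0, -1,  0,
                                                -1,  5, -1,
                                                 0, -1,  0);
    cv::filter2D(src, out1, -1, sharpen);             // 9 taps: direct path
    cv::Mat box = cv::Mat::ones(15, 15, CV_32F) / 225.f;
    cv::filter2D(src, out2, -1, box);                 // 225 taps: DFT path
    return 0;
}
#endif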
void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
                      InputArray _kernelX, InputArray _kernelY, Point anchor,
                      double delta, int borderType )
{
    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
               ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType))

    Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat();

    if( ddepth < 0 )
        ddepth = src.depth();

    _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
    Mat dst = _dst.getMat();

    Ptr<FilterEngine> f = createSeparableLinearFilter(src.type(),
        dst.type(), kernelX, kernelY, anchor, delta, borderType & ~BORDER_ISOLATED );
    f->apply(src, dst, Rect(0,0,-1,-1), Point(), (borderType & BORDER_ISOLATED) != 0 );
}
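// Usage sketch (excluded from the build): the classic separable decomposition;
// the 3x3 Sobel-x kernel factors as [1;2;1] * [-1 0 1], so one call here does
// the work of a 2D filter in two 1D passes. CV_16S keeps the negative
// responses from being clipped.
#if 0
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
int main()
{
    cv::Mat src(240, 320, CV_8UC1, cv::Scalar(0)), dst;
    cv::Mat kx = (cv::Mat_<float>(1, 3) << -1, 0, 1);
    cv::Mat ky = (cv::Mat_<float>(3, 1) <<  1, 2, 1);
    cv::sepFilter2D(src, dst, CV_16S, kx, ky);
    return 0;
}
#endif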
CV_IMPL void
cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
    cv::Mat kernel = cv::cvarrToMat(_kernel);

    CV_Assert( src.size() == dst.size() && src.channels() == dst.channels() );

    cv::filter2D( src, dst, dst.depth(), kernel, anchor, 0, cv::BORDER_REPLICATE );
}
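/* Usage sketch (excluded from the build): the legacy C interface above simply
   forwards to cv::filter2D with delta = 0 and BORDER_REPLICATE. */
#if 0
#include <opencv2/imgproc/imgproc_c.h>
int main()
{
    CvMat* src = cvCreateMat(100, 100, CV_8UC1);
    CvMat* dst = cvCreateMat(100, 100, CV_8UC1);
    float kdata[] = { 0, -1, 0, -1, 5, -1, 0, -1, 0 };
    CvMat kernel = cvMat(3, 3, CV_32FC1, kdata);
    cvFilter2D(src, dst, &kernel, cvPoint(-1, -1));
    cvReleaseMat(&src);
    cvReleaseMat(&dst);
    return 0;
}
#endif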