1
/*M///////////////////////////////////////////////////////////////////////////////////////
3
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5
// By downloading, copying, installing or using the software you agree to this license.
6
// If you do not agree to this license, do not download, install,
7
// copy or use the software.
11
// For Open Source Computer Vision Library
13
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15
// Third party copyrights are property of their respective owners.
17
// Redistribution and use in source and binary forms, with or without modification,
18
// are permitted provided that the following conditions are met:
20
// * Redistribution's of source code must retain the above copyright notice,
21
// this list of conditions and the following disclaimer.
23
// * Redistribution's in binary form must reproduce the above copyright notice,
24
// this list of conditions and the following disclaimer in the documentation
25
// and/or other materials provided with the distribution.
27
// * The name of the copyright holders may not be used to endorse or promote products
28
// derived from this software without specific prior written permission.
30
// This software is provided by the copyright holders and contributors "as is" and
31
// any express or implied warranties, including, but not limited to, the implied
32
// warranties of merchantability and fitness for a particular purpose are disclaimed.
33
// In no event shall the Intel Corporation or contributors be liable for any direct,
34
// indirect, incidental, special, exemplary, or consequential damages
35
// (including, but not limited to, procurement of substitute goods or services;
36
// loss of use, data, or profits; or business interruption) however caused
37
// and on any theory of liability, whether in contract, strict liability,
38
// or tort (including negligence or otherwise) arising in any way out of
39
// the use of this software, even if advised of the possibility of such damage.
43
#if !defined CUDA_DISABLER
45
#include "opencv2/core/cuda/common.hpp"
46
#include "opencv2/core/cuda/functional.hpp"
47
#include "opencv2/core/cuda/emulation.hpp"
48
#include "opencv2/core/cuda/scan.hpp"
49
#include "opencv2/core/cuda/reduce.hpp"
50
#include "opencv2/core/cuda/saturate_cast.hpp"
52
using namespace cv::cuda;
53
using namespace cv::cuda::device;
57
__global__ void calcLutKernel(const PtrStepb src, PtrStepb lut,
58
const int2 tileSize, const int tilesX,
59
const int clipLimit, const float lutScale)
61
__shared__ int smem[512];
63
const int tx = blockIdx.x;
64
const int ty = blockIdx.y;
65
const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
70
for (int i = threadIdx.y; i < tileSize.y; i += blockDim.y)
72
const uchar* srcPtr = src.ptr(ty * tileSize.y + i) + tx * tileSize.x;
73
for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
75
const int data = srcPtr[j];
76
Emulation::smem::atomicAdd(&smem[data], 1);
82
int tHistVal = smem[tid];
91
if (tHistVal > clipLimit)
93
clipped = tHistVal - clipLimit;
97
// find number of overall clipped samples
99
reduce<256>(smem, clipped, tid, plus<int>());
101
// broadcast evaluated value
103
__shared__ int totalClipped;
106
totalClipped = clipped;
109
// redistribute clipped samples evenly
111
int redistBatch = totalClipped / 256;
112
tHistVal += redistBatch;
114
int residual = totalClipped - redistBatch * 256;
119
const int lutVal = blockScanInclusive<256>(tHistVal, smem, tid);
121
lut(ty * tilesX + tx, tid) = saturate_cast<uchar>(__float2int_rn(lutScale * lutVal));
124
void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream)
126
const dim3 block(32, 8);
127
const dim3 grid(tilesX, tilesY);
129
calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);
131
cudaSafeCall( cudaGetLastError() );
134
cudaSafeCall( cudaDeviceSynchronize() );
137
__global__ void tranformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
139
const int x = blockIdx.x * blockDim.x + threadIdx.x;
140
const int y = blockIdx.y * blockDim.y + threadIdx.y;
142
if (x >= src.cols || y >= src.rows)
145
const float tyf = (static_cast<float>(y) / tileSize.y) - 0.5f;
146
int ty1 = __float2int_rd(tyf);
148
const float ya = tyf - ty1;
150
ty2 = ::min(ty2, tilesY - 1);
152
const float txf = (static_cast<float>(x) / tileSize.x) - 0.5f;
153
int tx1 = __float2int_rd(txf);
155
const float xa = txf - tx1;
157
tx2 = ::min(tx2, tilesX - 1);
159
const int srcVal = src(y, x);
163
res += lut(ty1 * tilesX + tx1, srcVal) * ((1.0f - xa) * (1.0f - ya));
164
res += lut(ty1 * tilesX + tx2, srcVal) * ((xa) * (1.0f - ya));
165
res += lut(ty2 * tilesX + tx1, srcVal) * ((1.0f - xa) * (ya));
166
res += lut(ty2 * tilesX + tx2, srcVal) * ((xa) * (ya));
168
dst(y, x) = saturate_cast<uchar>(res);
171
void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream)
173
const dim3 block(32, 8);
174
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
176
cudaSafeCall( cudaFuncSetCacheConfig(tranformKernel, cudaFuncCachePreferL1) );
178
tranformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
179
cudaSafeCall( cudaGetLastError() );
182
cudaSafeCall( cudaDeviceSynchronize() );
186
#endif // CUDA_DISABLER