/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/
/**
 * \file
 * cub::BlockHistogramTiles implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram.
 */

#pragma once

#include "specializations/block_histo_tiles_gatomic.cuh"
#include "specializations/block_histo_tiles_satomic.cuh"
#include "specializations/block_histo_tiles_sort.cuh"
#include "../../util_type.cuh"
#include "../../grid/grid_mapping.cuh"
#include "../../grid/grid_even_share.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************************
 * Algorithmic variants
 ******************************************************************************/

/**
 * \brief BlockHistogramTilesAlgorithm enumerates alternative algorithms for BlockHistogramTiles.
 */
enum BlockHistogramTilesAlgorithm
{

    /**
     * \par Overview
     * A two-kernel approach in which:
     * -# Thread blocks in the first kernel aggregate their own privatized
     *    histograms using block-wide sorting (see BlockHistogramAlgorithm::BLOCK_HISTO_SORT).
     * -# A single thread block in the second kernel reduces them into the output histogram(s).
     *
     * \par Performance Considerations
     * Delivers consistent throughput regardless of sample bin distribution.
     *
     * However, because histograms are privatized in shared memory, a large
     * number of bins (e.g., thousands) may adversely affect occupancy and
     * performance (or even the ability to launch).
     */
    GRID_HISTO_SORT,

    /**
     * \par Overview
     * A two-kernel approach in which:
     * -# Thread blocks in the first kernel aggregate their own privatized
     *    histograms using shared-memory \p atomicAdd().
     * -# A single thread block in the second kernel reduces them into the
     *    output histogram(s).
     *
     * \par Performance Considerations
     * Performance is strongly tied to the hardware implementation of atomic
     * addition, and may be significantly degraded for input distributions that
     * are not uniformly random, i.e., where many concurrent updates are likely
     * to be made to the same bin counter.
     *
     * However, because histograms are privatized in shared memory, a large
     * number of bins (e.g., thousands) may adversely affect occupancy and
     * performance (or even the ability to launch).
     */
    GRID_HISTO_SHARED_ATOMIC,

    /**
     * \par Overview
     * A single-kernel approach in which thread blocks update the output histogram(s) directly
     * using global-memory \p atomicAdd().
     *
     * \par Performance Considerations
     * Performance is strongly tied to the hardware implementation of atomic
     * addition, and may be significantly degraded for input distributions that
     * are not uniformly random, i.e., where many concurrent updates are likely
     * to be made to the same bin counter.
     *
     * Performance is not significantly impacted when computing histograms having large
     * numbers of bins (e.g., thousands).
     */
    GRID_HISTO_GLOBAL_ATOMIC,

};


/******************************************************************************
 * Tuning policy
 ******************************************************************************/

/**
 * Tuning policy for BlockHistogramTiles
 */
template <
    int                             _BLOCK_THREADS,
    int                             _ITEMS_PER_THREAD,
    BlockHistogramTilesAlgorithm    _GRID_ALGORITHM,
    GridMappingStrategy             _GRID_MAPPING,
    int                             _SM_OCCUPANCY>
struct BlockHistogramTilesPolicy
{
    enum
    {
        BLOCK_THREADS       = _BLOCK_THREADS,
        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,
        SM_OCCUPANCY        = _SM_OCCUPANCY,
    };

    static const BlockHistogramTilesAlgorithm   GRID_ALGORITHM  = _GRID_ALGORITHM;
    static const GridMappingStrategy            GRID_MAPPING    = _GRID_MAPPING;
};
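
// Example: a plausible instantiation of the tuning policy above.  The constants
// are illustrative values (128 threads, 8 samples per thread, a target
// occupancy of 4 blocks/SM), not vetted defaults:
//
//     typedef BlockHistogramTilesPolicy<
//         128,                        // _BLOCK_THREADS
//         8,                          // _ITEMS_PER_THREAD
//         GRID_HISTO_SHARED_ATOMIC,   // _GRID_ALGORITHM
//         GRID_MAPPING_EVEN_SHARE,    // _GRID_MAPPING
//         4>                          // _SM_OCCUPANCY
//         CustomHistoTilesPolicy;
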

/******************************************************************************
 * Thread block abstractions
 ******************************************************************************/

/**
 * Implements a stateful abstraction of CUDA thread blocks for histogramming multiple
 * tiles as part of device-wide histogram.  The underlying strategy (block-wide sorting,
 * shared-memory atomics, or global-memory atomics) is selected by the policy's GRID_ALGORITHM.
 */
template <
    typename    BlockHistogramTilesPolicy,  ///< Tuning policy
    int         BINS,                       ///< Number of histogram bins per channel
    int         CHANNELS,                   ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
    int         ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
    typename    InputIteratorRA,            ///< The input iterator type (may be a simple pointer type).  Must have a value type that can be cast as an integer in the range [0..BINS-1]
    typename    HistoCounter,               ///< Integral type for counting sample occurrences per histogram bin
    typename    SizeT>                      ///< Integer type for offsets
struct BlockHistogramTiles
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    // Histogram grid algorithm
    static const BlockHistogramTilesAlgorithm GRID_ALGORITHM = BlockHistogramTilesPolicy::GRID_ALGORITHM;

    // Alternative internal implementation types
    typedef BlockHistogramTilesSort<         BlockHistogramTilesPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT> BlockHistogramTilesSortT;
    typedef BlockHistogramTilesSharedAtomic< BlockHistogramTilesPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT> BlockHistogramTilesSharedAtomicT;
    typedef BlockHistogramTilesGlobalAtomic< BlockHistogramTilesPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT> BlockHistogramTilesGlobalAtomicT;

    // Internal block sweep histogram type, selected at compile time by the
    // policy's GRID_ALGORITHM
    typedef typename If<(GRID_ALGORITHM == GRID_HISTO_SORT),
        BlockHistogramTilesSortT,
        typename If<(GRID_ALGORITHM == GRID_HISTO_SHARED_ATOMIC),
            BlockHistogramTilesSharedAtomicT,
            BlockHistogramTilesGlobalAtomicT>::Type>::Type InternalBlockDelegate;

    enum
    {
        TILE_ITEMS = InternalBlockDelegate::TILE_ITEMS,
    };

    // Temporary storage type
    typedef typename InternalBlockDelegate::TempStorage TempStorage;

    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    // Internal block delegate
    InternalBlockDelegate internal_delegate;

    //---------------------------------------------------------------------
    // Interface
    //---------------------------------------------------------------------

    /**
     * Constructor
     */
    __device__ __forceinline__ BlockHistogramTiles(
        TempStorage         &temp_storage,                          ///< Reference to temp_storage
        InputIteratorRA     d_in,                                   ///< Input data to reduce
        HistoCounter*       (&d_out_histograms)[ACTIVE_CHANNELS])   ///< Reference to output histograms
    :
        internal_delegate(temp_storage, d_in, d_out_histograms)
    {}

    /**
     * \brief Reduce a consecutive segment of input tiles
     */
    __device__ __forceinline__ void ConsumeTiles(
        SizeT   block_offset,                   ///< [in] Threadblock begin offset (inclusive)
        SizeT   block_oob)                      ///< [in] Threadblock end offset (exclusive)
    {
        // Consume subsequent full tiles of input
        while (block_offset + TILE_ITEMS <= block_oob)
        {
            internal_delegate.ConsumeTile<true>(block_offset);
            block_offset += TILE_ITEMS;
        }

        // Consume a partially-full tile
        if (block_offset < block_oob)
        {
            int valid_items = block_oob - block_offset;
            internal_delegate.ConsumeTile<false>(block_offset, valid_items);
        }

        internal_delegate.AggregateOutput();
    }
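
    // A minimal usage sketch (illustrative only; the kernel, policy, and type
    // names below are hypothetical, not part of this header).  One active
    // channel of unsigned char samples is histogrammed into 256 int bins:
    //
    //     typedef BlockHistogramTilesPolicy<
    //         128, 8, GRID_HISTO_SORT, GRID_MAPPING_EVEN_SHARE, 4> Policy;
    //     typedef BlockHistogramTiles<
    //         Policy, 256, 1, 1, unsigned char*, int, int> TilesT;
    //
    //     __global__ void HistoKernel(unsigned char *d_samples, int *d_histo, int num_items)
    //     {
    //         __shared__ typename TilesT::TempStorage temp_storage;
    //
    //         int *d_histograms[1] = { d_histo };
    //         TilesT tiles(temp_storage, d_samples, d_histograms);
    //
    //         // Evenly partition the input into tile-aligned per-block ranges
    //         int tiles_per_block = (num_items + (gridDim.x * TilesT::TILE_ITEMS) - 1) /
    //                               (gridDim.x * TilesT::TILE_ITEMS);
    //         int block_offset    = blockIdx.x * tiles_per_block * TilesT::TILE_ITEMS;
    //         int block_oob       = min(num_items, block_offset + (tiles_per_block * TilesT::TILE_ITEMS));
    //         tiles.ConsumeTiles(block_offset, block_oob);
    //     }
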
    /**
     * \brief Reduce a consecutive segment of input tiles assigned in even-share fashion
     */
    __device__ __forceinline__ void ConsumeTiles(
        SizeT                               num_items,          ///< [in] Total number of global input items
        GridEvenShare<SizeT>                &even_share,        ///< [in] GridEvenShare descriptor
        GridQueue<SizeT>                    &queue,             ///< [in,out] GridQueue descriptor
        Int2Type<GRID_MAPPING_EVEN_SHARE>   is_even_share)      ///< [in] Marker type indicating this is an even-share mapping
    {
        even_share.BlockInit();
        ConsumeTiles(even_share.block_offset, even_share.block_oob);
    }
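
    // Note: under even-share mapping, GridEvenShare::BlockInit() initializes this
    // block's [block_offset, block_oob) range, after which the work reduces to the
    // offset-based ConsumeTiles() above; the queue parameter is unused here.
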
    /**
     * \brief Dequeue and reduce tiles of items as part of an inter-block histogram
     */
    __device__ __forceinline__ void ConsumeTiles(
        int                 num_items,          ///< Total number of input items
        GridQueue<SizeT>    queue)              ///< Queue descriptor for assigning tiles of work to thread blocks
    {
        // Shared dequeued offset, communicated from thread-0 to the rest of the block
        __shared__ SizeT shared_block_offset;

        // We give each thread block at least one tile of input.
        SizeT block_offset      = blockIdx.x * TILE_ITEMS;
        SizeT even_share_base   = gridDim.x * TILE_ITEMS;

        // Process full tiles of input
        while (block_offset + TILE_ITEMS <= num_items)
        {
            internal_delegate.ConsumeTile<true>(block_offset);

            // Dequeue up to TILE_ITEMS
            if (threadIdx.x == 0)
                shared_block_offset = queue.Drain(TILE_ITEMS) + even_share_base;

            // Barrier: ensure the newly dequeued offset is visible to all threads
            __syncthreads();

            block_offset = shared_block_offset;

            // Barrier: prevent thread-0 from overwriting the shared offset on the
            // next iteration before all threads have read it
            __syncthreads();
        }

        // Consume a partially-full tile
        if (block_offset < num_items)
        {
            int valid_items = num_items - block_offset;
            internal_delegate.ConsumeTile<false>(block_offset, valid_items);
        }

        internal_delegate.AggregateOutput();
    }
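
    // Worked example of the queue-based mapping above (illustrative numbers):
    // with gridDim.x = 4 and TILE_ITEMS = 1024, blocks 0..3 start at offsets
    // 0, 1024, 2048, and 3072, and even_share_base = 4096.  Once a block
    // finishes a tile, successive queue.Drain(1024) calls across the grid
    // return 0, 1024, 2048, ..., so the claimed offsets (4096, 5120, ...)
    // continue exactly where the static per-block assignment left off.
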
    /**
     * \brief Dequeue and reduce tiles of items as part of an inter-block histogram using dynamic (queue-based) mapping
     */
    __device__ __forceinline__ void ConsumeTiles(
        SizeT                           num_items,          ///< [in] Total number of global input items
        GridEvenShare<SizeT>            &even_share,        ///< [in] GridEvenShare descriptor
        GridQueue<SizeT>                &queue,             ///< [in,out] GridQueue descriptor
        Int2Type<GRID_MAPPING_DYNAMIC>  is_dynamic)         ///< [in] Marker type indicating this is a dynamic mapping
    {
        ConsumeTiles(num_items, queue);
    }

};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)