1
/******************************************************************************
2
* Copyright (c) 2011, Duane Merrill. All rights reserved.
3
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions are met:
7
* * Redistributions of source code must retain the above copyright
8
* notice, this list of conditions and the following disclaimer.
9
* * Redistributions in binary form must reproduce the above copyright
10
* notice, this list of conditions and the following disclaimer in the
11
* documentation and/or other materials provided with the distribution.
12
* * Neither the name of the NVIDIA CORPORATION nor the
13
* names of its contributors may be used to endorse or promote products
14
* derived from this software without specific prior written permission.
16
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
******************************************************************************/
/**
 * \file
 * The cub::BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
 */
36
#include "specializations/block_histogram_sort.cuh"
37
#include "specializations/block_histogram_atomic.cuh"
38
#include "../util_arch.cuh"
39
#include "../util_namespace.cuh"
41
/// Optional outer namespace(s)
48
/******************************************************************************
 * Algorithmic variants
 ******************************************************************************/

/**
 * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms.
 */
enum BlockHistogramAlgorithm
{

    /**
     * \par Overview
     * Sorting followed by differentiation.  Execution is comprised of two phases:
     * -# Sort the data using efficient radix sort
     * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts.
     *
     * \par Performance Considerations
     * Delivers consistent throughput regardless of sample bin distribution.
     */
    BLOCK_HISTO_SORT,

    /**
     * \par Overview
     * Use atomic addition to update byte counts directly
     *
     * \par Performance Considerations
     * Performance is strongly tied to the hardware implementation of atomic
     * addition, and may be significantly degraded for non uniformly-random
     * input distributions where many concurrent updates are likely to be
     * made to the same bin counter.
     */
    BLOCK_HISTO_ATOMIC,
};
85
/******************************************************************************
 * Block histogram
 ******************************************************************************/

/**
 * \brief The BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png)
 * \ingroup BlockModule
 *
 * \par Overview
 * A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
 * counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>).
 *
 * \par
 * Optionally, BlockHistogram can be specialized to use different algorithms:
 *   -# <b>cub::BLOCK_HISTO_SORT</b>.  Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm)
 *   -# <b>cub::BLOCK_HISTO_ATOMIC</b>.  Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm)
 *
 * \tparam T                    The sample type being histogrammed (must be castable to an integer bin identifier)
 * \tparam BLOCK_THREADS        The thread block size in threads
 * \tparam ITEMS_PER_THREAD     The number of items per thread
 * \tparam BINS                 The number bins within the histogram
 * \tparam ALGORITHM            <b>[optional]</b> cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT)
 *
 * \par A Simple Example
 * \blockcollective{BlockHistogram}
 * \par
 * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
 * are partitioned across 128 threads where each thread owns 4 samples.
 * \par
 * \code
 * #include <cub/cub.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each
 *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
 *
 *     // Allocate shared memory for BlockHistogram
 *     __shared__ typename BlockHistogram::TempStorage temp_storage;
 *
 *     // Allocate shared memory for block-wide histogram bin counts
 *     __shared__ unsigned int smem_histogram[256];
 *
 *     // Obtain input samples per thread
 *     unsigned char data[4];
 *     ...
 *
 *     // Compute the block-wide histogram
 *     BlockHistogram(temp_storage).Histogram(data, smem_histogram);
 *
 * \endcode
 *
 * \par Performance and Usage Considerations
 * - The histogram output can be constructed in shared or global memory
 * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives
 */
template <
    typename                T,
    int                     BLOCK_THREADS,
    int                     ITEMS_PER_THREAD,
    int                     BINS,
    BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT>
class BlockHistogram
{
private:

    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    /**
     * Ensure the template parameterization meets the requirements of the
     * targeted device architecture.  BLOCK_HISTO_ATOMIC can only be used
     * on version SM120 or later.  Otherwise BLOCK_HISTO_SORT is used
     * regardless.
     */
    static const BlockHistogramAlgorithm SAFE_ALGORITHM =
        ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (CUB_PTX_ARCH < 120)) ?
            BLOCK_HISTO_SORT :
            ALGORITHM;

    /// Internal specialization.
    typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT),
        BlockHistogramSort<T, BLOCK_THREADS, ITEMS_PER_THREAD, BINS>,
        BlockHistogramAtomic<T, BLOCK_THREADS, ITEMS_PER_THREAD, BINS> >::Type InternalBlockHistogram;

    /// Shared memory storage layout type for BlockHistogram
    typedef typename InternalBlockHistogram::TempStorage _TempStorage;


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Shared storage reference
    _TempStorage &temp_storage;

    /// Linear thread-id of the calling thread within the block
    int linear_tid;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator.  NOTE: intentionally returns a reference to a
    /// function-scope __shared__ allocation; valid for the lifetime of the block.
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


public:

    /// \smemstorage{BlockHistogram}
    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************//**
     * \name Collective constructors
     *********************************************************************/
    //@{

    /**
     * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage.  Threads are identified using <tt>threadIdx.x</tt>.
     */
    __device__ __forceinline__ BlockHistogram()
    :
        temp_storage(PrivateStorage()),
        linear_tid(threadIdx.x)
    {}


    /**
     * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage.  Threads are identified using <tt>threadIdx.x</tt>.
     */
    __device__ __forceinline__ BlockHistogram(
        TempStorage &temp_storage)              ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(threadIdx.x)
    {}


    /**
     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.  Each thread is identified using the supplied linear thread identifier
     */
    __device__ __forceinline__ BlockHistogram(
        int linear_tid)                         ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
    :
        temp_storage(PrivateStorage()),
        linear_tid(linear_tid)
    {}


    /**
     * \brief Collective constructor using the specified memory allocation as temporary storage.  Each thread is identified using the supplied linear thread identifier.
     */
    __device__ __forceinline__ BlockHistogram(
        TempStorage &temp_storage,              ///< [in] Reference to memory allocation having layout type TempStorage
        int         linear_tid)                 ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(linear_tid)
    {}


    //@}  end member group
    /******************************************************************//**
     * \name Histogram operations
     *********************************************************************/
    //@{

    /**
     * \brief Initialize the shared histogram counters to zero.
     *
     * \par
     * The code snippet below illustrates a the initialization and update of a
     * histogram of 512 integer samples that are partitioned across 128 threads
     * where each thread owns 4 samples.
     * \par
     * \code
     * #include <cub/cub.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each
     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
     *
     *     // Allocate shared memory for BlockHistogram
     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
     *
     *     // Allocate shared memory for block-wide histogram bin counts
     *     __shared__ unsigned int smem_histogram[256];
     *
     *     // Obtain input samples per thread
     *     unsigned char thread_samples[4];
     *     ...
     *
     *     // Initialize the block-wide histogram
     *     BlockHistogram(temp_storage).InitHistogram(smem_histogram);
     *
     *     // Update the block-wide histogram
     *     BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
     *
     * \endcode
     *
     * \tparam HistoCounter         <b>[inferred]</b> Histogram counter type
     */
    template <typename HistoCounter>
    __device__ __forceinline__ void InitHistogram(HistoCounter histogram[BINS])
    {
        // Initialize histogram bin counts to zeros
        int histo_offset = 0;

        // Full, unguarded tiles of BLOCK_THREADS counters
        #pragma unroll
        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
        {
            histogram[histo_offset + linear_tid] = 0;
        }

        // Finish up with guarded initialization if necessary
        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
        {
            histogram[histo_offset + linear_tid] = 0;
        }
    }


    /**
     * \brief Constructs a block-wide histogram in shared/global memory.  Each thread contributes an array of input elements.
     *
     * \par
     * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
     * are partitioned across 128 threads where each thread owns 4 samples.
     * \par
     * \code
     * #include <cub/cub.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each
     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
     *
     *     // Allocate shared memory for BlockHistogram
     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
     *
     *     // Allocate shared memory for block-wide histogram bin counts
     *     __shared__ unsigned int smem_histogram[256];
     *
     *     // Obtain input samples per thread
     *     unsigned char thread_samples[4];
     *     ...
     *
     *     // Compute the block-wide histogram
     *     BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
     *
     * \endcode
     *
     * \tparam HistoCounter         <b>[inferred]</b> Histogram counter type
     */
    template <
        typename            HistoCounter>
    __device__ __forceinline__ void Histogram(
        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
        HistoCounter        histogram[BINS])                ///< [out] Reference to shared/global memory histogram
    {
        // Initialize histogram bin counts to zeros
        InitHistogram(histogram);

        // Composite the histogram
        InternalBlockHistogram(temp_storage, linear_tid).Composite(items, histogram);
    }


    /**
     * \brief Updates an existing block-wide histogram in shared/global memory.  Each thread composites an array of input elements.
     *
     * \par
     * The code snippet below illustrates a the initialization and update of a
     * histogram of 512 integer samples that are partitioned across 128 threads
     * where each thread owns 4 samples.
     * \par
     * \code
     * #include <cub/cub.cuh>
     *
     * __global__ void ExampleKernel(...)
     * {
     *     // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each
     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
     *
     *     // Allocate shared memory for BlockHistogram
     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
     *
     *     // Allocate shared memory for block-wide histogram bin counts
     *     __shared__ unsigned int smem_histogram[256];
     *
     *     // Obtain input samples per thread
     *     unsigned char thread_samples[4];
     *     ...
     *
     *     // Initialize the block-wide histogram
     *     BlockHistogram(temp_storage).InitHistogram(smem_histogram);
     *
     *     // Update the block-wide histogram
     *     BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
     *
     * \endcode
     *
     * \tparam HistoCounter         <b>[inferred]</b> Histogram counter type
     */
    template <
        typename            HistoCounter>
    __device__ __forceinline__ void Composite(
        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
        HistoCounter        histogram[BINS])                ///< [out] Reference to shared/global memory histogram
    {
        InternalBlockHistogram(temp_storage, linear_tid).Composite(items, histogram);
    }

    //@}  end member group
};
413
CUB_NS_POSTFIX // Optional outer namespace(s)