/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/
/**
 * \file
 * cub::BlockHistogramTilesSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics
 */

#pragma once

#include <iterator>

#include "../../../util_type.cuh"
#include "../../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * BlockHistogramTilesSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics
 */
template <
    typename    BlockHistogramTilesPolicy,      ///< Tuning policy
    int         BINS,                           ///< Number of histogram bins
    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
    typename    InputIteratorRA,                ///< The input iterator type (may be a simple pointer type).  Must have a value type that can be cast as an integer in the range [0..BINS-1]
    typename    HistoCounter,                   ///< Integral type for counting sample occurrences per histogram bin
    typename    SizeT>                          ///< Integer type for offsets
struct BlockHistogramTilesSharedAtomic
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// The sample value type of the input iterator
    typedef typename std::iterator_traits<InputIteratorRA>::value_type SampleT;

    /// Constants
    enum
    {
        BLOCK_THREADS       = BlockHistogramTilesPolicy::BLOCK_THREADS,
        ITEMS_PER_THREAD    = BlockHistogramTilesPolicy::ITEMS_PER_THREAD,
        TILE_CHANNEL_ITEMS  = BLOCK_THREADS * ITEMS_PER_THREAD,
        TILE_ITEMS          = TILE_CHANNEL_ITEMS * CHANNELS,
    };

    /// Shared memory type required by this thread block
    struct _TempStorage
    {
        HistoCounter histograms[ACTIVE_CHANNELS][BINS + 1];     // One word of padding between channel histograms to prevent warps working on different histograms from hammering on the same bank
    };

    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};
    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    /// Reference to temp_storage
    _TempStorage &temp_storage;

    /// Reference to output histograms
    HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];

    /// Input data to reduce
    InputIteratorRA d_in;
    //---------------------------------------------------------------------
    // Interface
    //---------------------------------------------------------------------

    /**
     * Constructor
     */
    __device__ __forceinline__ BlockHistogramTilesSharedAtomic(
        TempStorage         &temp_storage,                                  ///< Reference to temp_storage
        InputIteratorRA     d_in,                                           ///< Input data to reduce
        HistoCounter*       (&d_out_histograms)[ACTIVE_CHANNELS])           ///< Reference to output histograms
    :
        temp_storage(temp_storage.Alias()),
        d_in(d_in),
        d_out_histograms(d_out_histograms)
    {
        // Initialize histogram bin counts to zeros
        #pragma unroll
        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
        {
            int histo_offset = 0;

            // Unguarded initialization of whole strips of bins
            #pragma unroll
            for (; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
            {
                this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0;
            }

            // Finish up with guarded initialization if necessary
            if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS))
            {
                this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0;
            }
        }
    }
    /**
     * Process a single tile of input
     */
    template <bool FULL_TILE>
    __device__ __forceinline__ void ConsumeTile(
        SizeT   block_offset,                   ///< The offset of the tile to consume
        int     valid_items = TILE_ITEMS)       ///< The number of valid items in the tile
    {
        if (FULL_TILE)
        {
            // Full tile of samples to read and composite
            SampleT items[ITEMS_PER_THREAD][CHANNELS];

            // Read striped, channel-interleaved samples
            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                #pragma unroll
                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
                {
                    if (CHANNEL < ACTIVE_CHANNELS)
                    {
                        items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
                    }
                }
            }

            __threadfence_block();

            // Composite samples into the shared-memory histograms using atomics
            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                #pragma unroll
                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
                {
                    if (CHANNEL < ACTIVE_CHANNELS)
                    {
                        atomicAdd(temp_storage.histograms[CHANNEL] + items[ITEM][CHANNEL], 1);
                    }
                }
            }

            __threadfence_block();
        }
        else
        {
            // Only a partially-full tile of samples to read and composite
            int bounds = valid_items - (threadIdx.x * CHANNELS);

            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
            {
                #pragma unroll
                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
                {
                    if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds))
                    {
                        SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
                        atomicAdd(temp_storage.histograms[CHANNEL] + item, 1);
                    }
                }
            }
        }
    }
    /**
     * Aggregate results into output
     */
    __device__ __forceinline__ void AggregateOutput()
    {
        // Barrier to ensure shared memory histograms are coherent
        __syncthreads();

        // Copy shared memory histograms to output
        #pragma unroll
        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
        {
            int channel_offset  = (blockIdx.x * BINS);
            int histo_offset    = 0;

            // Unguarded copy of whole strips of bins
            #pragma unroll
            for (; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
            {
                d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x];
            }

            // Finish up with a guarded copy if necessary
            if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS))
            {
                d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x];
            }
        }
    }
};

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
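
/*
 * Illustrative usage sketch (not part of the CUB library): a minimal, hypothetical
 * kernel that drives BlockHistogramTilesSharedAtomic with one tile per thread block.
 * The kernel name, the single-channel assumption (CHANNELS == ACTIVE_CHANNELS == 1),
 * and the one-tile-per-block partitioning are assumptions for illustration only; the
 * actual CUB dispatch layer partitions tiles across blocks differently.  Each block
 * writes its privatized histogram to the segment of d_block_histograms starting at
 * (blockIdx.x * BINS); a subsequent reduction across blocks would produce the final
 * histogram.
 */
template <
    typename    BlockHistogramTilesT,   ///< Parameterization of cub::BlockHistogramTilesSharedAtomic (assumes ACTIVE_CHANNELS == 1)
    typename    InputIteratorRA,        ///< Input iterator type
    typename    HistoCounter,           ///< Histogram counter type
    typename    SizeT>                  ///< Offset type
__global__ void ExampleSingleChannelHistoKernel(
    InputIteratorRA     d_samples,              ///< [in] Input samples
    HistoCounter        *d_block_histograms,    ///< [out] Privatized per-block histograms (gridDim.x * BINS counters)
    SizeT               num_items)              ///< [in] Total number of input samples
{
    // Shared memory for this block's privatized histogram
    __shared__ typename BlockHistogramTilesT::TempStorage temp_storage;

    // Bind the single output channel (hypothetical single-channel configuration)
    HistoCounter* d_out_histograms[1] = { d_block_histograms };

    BlockHistogramTilesT block_histo(temp_storage, d_samples, d_out_histograms);

    // Naive partitioning: block i consumes tile i (illustrative only)
    SizeT block_offset = SizeT(blockIdx.x) * BlockHistogramTilesT::TILE_ITEMS;

    if (block_offset + BlockHistogramTilesT::TILE_ITEMS <= num_items)
    {
        // Full tile: no bounds checking needed
        block_histo.template ConsumeTile<true>(block_offset);
    }
    else if (block_offset < num_items)
    {
        // Partial tile: guard loads against num_items
        block_histo.template ConsumeTile<false>(block_offset, int(num_items - block_offset));
    }

    // Synchronize and copy the shared-memory histogram to global memory
    block_histo.AggregateOutput();
}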