1
/******************************************************************************
2
* Copyright (c) 2011, Duane Merrill. All rights reserved.
3
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions are met:
7
* * Redistributions of source code must retain the above copyright
8
* notice, this list of conditions and the following disclaimer.
9
* * Redistributions in binary form must reproduce the above copyright
10
* notice, this list of conditions and the following disclaimer in the
11
* documentation and/or other materials provided with the distribution.
12
* * Neither the name of the NVIDIA CORPORATION nor the
13
* names of its contributors may be used to endorse or promote products
14
* derived from this software without specific prior written permission.
16
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
******************************************************************************/
31
* The cub::WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across CUDA warp threads.
36
#include "specializations/warp_reduce_shfl.cuh"
37
#include "specializations/warp_reduce_smem.cuh"
38
#include "../thread/thread_operators.cuh"
39
#include "../util_arch.cuh"
40
#include "../util_type.cuh"
41
#include "../util_namespace.cuh"
43
/// Optional outer namespace(s)
51
* \addtogroup WarpModule
56
* \brief The WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across CUDA warp threads. ![](warp_reduce_logo.png)
59
* A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
60
* uses a binary combining operator to compute a single aggregate from a list of input elements.
62
* \tparam T The reduction input/output element type
63
* \tparam LOGICAL_WARPS <b>[optional]</b> The number of entrant "logical" warps performing concurrent warp reductions. Default is 1.
64
* \tparam LOGICAL_WARP_THREADS <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20).
66
* \par Simple Examples
67
* \warpcollective{WarpReduce}
69
* The code snippet below illustrates four concurrent warp sum reductions within a block of
70
* 128 threads (one per each of the 32-thread warps).
73
* #include <cub/cub.cuh>
75
* __global__ void ExampleKernel(...)
77
* // Specialize WarpReduce for 4 warps on type int
78
* typedef cub::WarpReduce<int, 4> WarpReduce;
80
* // Allocate shared memory for WarpReduce
81
* __shared__ typename WarpReduce::TempStorage temp_storage;
83
* // Obtain one input item per thread
84
* int thread_data = ...
86
* // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
87
* int aggregate = WarpReduce(temp_storage).Sum(thread_data);
91
* Suppose the set of input \p thread_data across the block of threads is <tt>0, 1, 2, 3, ..., 127</tt>.
92
* The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 496, \p 1520,
93
* \p 2544, and \p 3568, respectively (and is undefined in other threads).
96
* The code snippet below illustrates a single warp sum reduction within a block of
100
* #include <cub/cub.cuh>
102
* __global__ void ExampleKernel(...)
104
* // Specialize WarpReduce for one warp on type int
105
* typedef cub::WarpReduce<int, 1> WarpReduce;
107
* // Allocate shared memory for WarpReduce
108
* __shared__ typename WarpReduce::TempStorage temp_storage;
111
* // Only the first warp performs a reduction
112
* if (threadIdx.x < 32)
114
* // Obtain one input item per thread
115
* int thread_data = ...
117
* // Return the warp-wide sum to lane0
118
* int aggregate = WarpReduce(temp_storage).Sum(thread_data);
122
* Suppose the set of input \p thread_data across the warp of threads is <tt>0, 1, 2, 3, ..., 31</tt>.
123
* The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads).
125
* \par Usage and Performance Considerations
126
* - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads)
127
* - The number of entrant threads must be a multiple of \p LOGICAL_WARP_THREADS
128
* - Warp reductions are concurrent if more than one logical warp is participating
129
* - Uses special instructions when applicable (e.g., warp \p SHFL instructions)
130
* - Uses synchronization-free communication between warp lanes when applicable
131
* - Zero bank conflicts for most types
132
* - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
133
* - Summation (<b><em>vs.</em></b> generic reduction)
134
* - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
139
int LOGICAL_WARPS = 1,
140
int LOGICAL_WARP_THREADS = PtxArchProps::WARP_THREADS>
/******************************************************************************
 * Constants and typedefs
 ******************************************************************************/

/// Constants
enum
{
    /// Whether the logical warp size is a power-of-two (enables SHFL-based reduction for multi-warp configurations)
    POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
};

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/// Internal specialization.  Use SHFL-based reduction if (architecture is >= SM30) and ((only one logical warp) or (LOGICAL_WARP_THREADS is a power-of-two))
typedef typename If<(CUB_PTX_ARCH >= 300) && ((LOGICAL_WARPS == 1) || POW_OF_TWO),
    WarpReduceShfl<T, LOGICAL_WARPS, LOGICAL_WARP_THREADS>,
    WarpReduceSmem<T, LOGICAL_WARPS, LOGICAL_WARP_THREADS> >::Type InternalWarpReduce;

#endif // DOXYGEN_SHOULD_SKIP_THIS

/// Shared memory storage layout type for WarpReduce
typedef typename InternalWarpReduce::TempStorage _TempStorage;
/******************************************************************************
 * Thread fields
 ******************************************************************************/

/// Shared storage reference
_TempStorage    &temp_storage;

/// Logical warp membership identifier for this thread (0 when LOGICAL_WARPS == 1)
// NOTE(review): declaration restored — the constructors below initialize warp_id/lane_id,
// so these members must exist; their declarations were lost in extraction.
int             warp_id;

/// Lane identifier of this thread within its logical warp
int             lane_id;
/******************************************************************************
 * Utility methods
 ******************************************************************************/

/// Internal storage allocator.  Returns a reference to a function-local static
/// shared-memory allocation (one per invocation site) for constructors that are
/// not handed an explicit TempStorage by the caller.
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
    // Use the raw layout type: the public TempStorage (Uninitialized) wrapper is
    // declared later in the class and cannot bind directly to a _TempStorage&.
    __shared__ _TempStorage private_storage;
    return private_storage;
}
/// \smemstorage{WarpReduce}
// Wrapped in Uninitialized so that placing it in __shared__ does not require a
// default constructor; access the usable layout via Alias().
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************//**
 * \name Collective constructors
 *********************************************************************/
//@{

/**
 * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage.  Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
 */
__device__ __forceinline__ WarpReduce()
:
    temp_storage(PrivateStorage()),
    // Single logical warp: always warp 0; otherwise partition threadIdx.x
    warp_id((LOGICAL_WARPS == 1) ?
        0 :
        threadIdx.x / LOGICAL_WARP_THREADS),
    // When logical warps coincide with hardware warps, the hardware lane id applies
    lane_id(((LOGICAL_WARPS == 1) || (LOGICAL_WARP_THREADS == PtxArchProps::WARP_THREADS)) ?
        LaneId() :
        threadIdx.x % LOGICAL_WARP_THREADS)
{}
/**
 * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
 */
__device__ __forceinline__ WarpReduce(
    TempStorage &temp_storage)          ///< [in] Reference to memory allocation having layout type TempStorage
:
    temp_storage(temp_storage.Alias()),
    // Single logical warp: always warp 0; otherwise partition threadIdx.x
    warp_id((LOGICAL_WARPS == 1) ?
        0 :
        threadIdx.x / LOGICAL_WARP_THREADS),
    // When logical warps coincide with hardware warps, the hardware lane id applies
    lane_id(((LOGICAL_WARPS == 1) || (LOGICAL_WARP_THREADS == PtxArchProps::WARP_THREADS)) ?
        LaneId() :
        threadIdx.x % LOGICAL_WARP_THREADS)
{}
/**
 * \brief Collective constructor using a private static allocation of shared memory as temporary storage.  Threads are identified using the given warp and lane identifiers.
 */
__device__ __forceinline__ WarpReduce(
    int warp_id,                        ///< [in] A suitable warp membership identifier
    int lane_id)                        ///< [in] A lane identifier within the warp
:
    temp_storage(PrivateStorage()),
    warp_id(warp_id),
    lane_id(lane_id)
{}
/**
 * \brief Collective constructor using the specified memory allocation as temporary storage.  Threads are identified using the given warp and lane identifiers.
 */
__device__ __forceinline__ WarpReduce(
    TempStorage &temp_storage,          ///< [in] Reference to memory allocation having layout type TempStorage
    int warp_id,                        ///< [in] A suitable warp membership identifier
    int lane_id)                        ///< [in] A lane identifier within the warp
:
    temp_storage(temp_storage.Alias()),
    warp_id(warp_id),
    lane_id(lane_id)
{}

//@} end member group
/******************************************************************//**
 * \name Summation reductions
 *********************************************************************/
//@{

/**
 * \brief Computes a warp-wide sum in each active warp.  The output is valid in warp <em>lane</em><sub>0</sub>.
 *
 * \par Snippet
 * The code snippet below illustrates four concurrent warp sum reductions within a block of
 * 128 threads (one per each of the 32-thread warps).
 * \par
 * \code
 * #include <cub/cub.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize WarpReduce for 4 warps on type int
 *     typedef cub::WarpReduce<int, 4> WarpReduce;
 *
 *     // Allocate shared memory for WarpReduce
 *     __shared__ typename WarpReduce::TempStorage temp_storage;
 *
 *     // Obtain one input item per thread
 *     int thread_data = ...
 *
 *     // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
 *     int aggregate = WarpReduce(temp_storage).Sum(thread_data);
 * }
 * \endcode
 * \par
 * Suppose the set of input \p thread_data across the block of threads is <tt>0, 1, 2, 3, ..., 127</tt>.
 * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 496, \p 1520,
 * \p 2544, and \p 3568, respectively (and is undefined in other threads).
 */
__device__ __forceinline__ T Sum(
    T input)                            ///< [in] Calling thread's input
{
    // Full warp participates: no bounds checking (FULL_WARPS = true, items-per-thread = 1)
    return InternalWarpReduce(temp_storage, warp_id, lane_id).template Sum<true, 1>(input, LOGICAL_WARP_THREADS);
}
/**
 * \brief Computes a partially-full warp-wide sum in each active warp.  The output is valid in warp <em>lane</em><sub>0</sub>.
 *
 * All threads in each logical warp must agree on the same value for \p valid_items.  Otherwise the result is undefined.
 *
 * \par Snippet
 * The code snippet below illustrates a sum reduction within a single, partially-full
 * block of 32 threads (one warp).
 * \par
 * \code
 * #include <cub/cub.cuh>
 *
 * __global__ void ExampleKernel(int *d_data, int valid_items)
 * {
 *     // Specialize WarpReduce for a single warp on type int
 *     typedef cub::WarpReduce<int, 1> WarpReduce;
 *
 *     // Allocate shared memory for WarpReduce
 *     __shared__ typename WarpReduce::TempStorage temp_storage;
 *
 *     // Obtain one input item per thread if in range
 *     int thread_data;
 *     if (threadIdx.x < valid_items)
 *         thread_data = d_data[threadIdx.x];
 *
 *     // Return the warp-wide sums to each lane0
 *     int aggregate = WarpReduce(temp_storage).Sum(
 *         thread_data, valid_items);
 * }
 * \endcode
 * \par
 * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, ...</tt> and \p valid_items
 * is \p 4.  The corresponding output \p aggregate in thread0 is \p 6 (and is
 * undefined in other threads).
 */
__device__ __forceinline__ T Sum(
    T input,                            ///< [in] Calling thread's input
    int valid_items)                    ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
{
    // Determine if we don't need bounds checking
    if (valid_items >= LOGICAL_WARP_THREADS)
    {
        return InternalWarpReduce(temp_storage, warp_id, lane_id).template Sum<true, 1>(input, valid_items);
    }
    else
    {
        return InternalWarpReduce(temp_storage, warp_id, lane_id).template Sum<false, 1>(input, valid_items);
    }
}
/**
 * \brief Computes a segmented sum in each active warp where segments are defined by head-flags.  The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
 *
 * \par Snippet
 * The code snippet below illustrates a head-segmented warp sum
 * reduction within a block of 32 threads (one warp).
 * \par
 * \code
 * #include <cub/cub.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize WarpReduce for a single warp on type int
 *     typedef cub::WarpReduce<int, 1> WarpReduce;
 *
 *     // Allocate shared memory for WarpReduce
 *     __shared__ typename WarpReduce::TempStorage temp_storage;
 *
 *     // Obtain one input item and flag per thread
 *     int thread_data = ...
 *     int head_flag = ...
 *
 *     // Return the warp-wide sums to each lane0
 *     int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(
 *         thread_data, head_flag);
 * }
 * \endcode
 * \par
 * Suppose the set of input \p thread_data and \p head_flag across the block of threads
 * is <tt>0, 1, 2, 3, ..., 31</tt> and is <tt>1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0</tt>,
 * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
 * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
 *
 * \tparam Flag <b>[inferred]</b> The flag type (must be an integer type)
 */
template <typename Flag>
__device__ __forceinline__ T HeadSegmentedSum(
    T       input,                      ///< [in] Calling thread's input
    Flag    head_flag)                  ///< [in] Head flag denoting whether or not \p input is the start of a new segment
{
    // Delegate to the generic head-segmented reduction with a sum operator
    return HeadSegmentedReduce(input, head_flag, cub::Sum());
}
/**
 * \brief Computes a segmented sum in each active warp where segments are defined by tail-flags.  The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
 *
 * \par Snippet
 * The code snippet below illustrates a tail-segmented warp sum
 * reduction within a block of 32 threads (one warp).
 * \par
 * \code
 * #include <cub/cub.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize WarpReduce for a single warp on type int
 *     typedef cub::WarpReduce<int, 1> WarpReduce;
 *
 *     // Allocate shared memory for WarpReduce
 *     __shared__ typename WarpReduce::TempStorage temp_storage;
 *
 *     // Obtain one input item and flag per thread
 *     int thread_data = ...
 *     int tail_flag = ...
 *
 *     // Return the warp-wide sums to each lane0
 *     int aggregate = WarpReduce(temp_storage).TailSegmentedSum(
 *         thread_data, tail_flag);
 * }
 * \endcode
 * \par
 * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
 * is <tt>0, 1, 2, 3, ..., 31</tt> and is <tt>0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1</tt>,
 * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
 * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
 *
 * \tparam Flag <b>[inferred]</b> The flag type (must be an integer type)
 */
template <typename Flag>
__device__ __forceinline__ T TailSegmentedSum(
    T       input,                      ///< [in] Calling thread's input
    Flag    tail_flag)                  ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
{
    // Delegate to the generic tail-segmented reduction with a sum operator
    return TailSegmentedReduce(input, tail_flag, cub::Sum());
}
//@} end member group
/******************************************************************//**
 * \name Generic reductions
 *********************************************************************/
//@{

/**
 * \brief Computes a warp-wide reduction in each active warp using the specified binary reduction functor.  The output is valid in warp <em>lane</em><sub>0</sub>.
 *
 * Supports non-commutative reduction operators
 *
 * \par Snippet
 * The code snippet below illustrates four concurrent warp max reductions within a block of
 * 128 threads (one per each of the 32-thread warps).
 * \par
 * \code
 * #include <cub/cub.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize WarpReduce for 4 warps on type int
 *     typedef cub::WarpReduce<int, 4> WarpReduce;
 *
 *     // Allocate shared memory for WarpReduce
 *     __shared__ typename WarpReduce::TempStorage temp_storage;
 *
 *     // Obtain one input item per thread
 *     int thread_data = ...
 *
 *     // Return the warp-wide reductions to each lane0
 *     int aggregate = WarpReduce(temp_storage).Reduce(
 *         thread_data, cub::Max());
 * }
 * \endcode
 * \par
 * Suppose the set of input \p thread_data across the block of threads is <tt>0, 1, 2, 3, ..., 127</tt>.
 * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 31, \p 63,
 * \p 95, and \p 127, respectively (and is undefined in other threads).
 *
 * \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
 */
template <typename ReductionOp>
__device__ __forceinline__ T Reduce(
    T           input,                  ///< [in] Calling thread's input
    ReductionOp reduction_op)           ///< [in] Binary reduction operator
{
    // Full warp participates: no bounds checking (FULL_WARPS = true, items-per-thread = 1)
    return InternalWarpReduce(temp_storage, warp_id, lane_id).template Reduce<true, 1>(input, LOGICAL_WARP_THREADS, reduction_op);
}
/**
 * \brief Computes a partially-full warp-wide reduction in each active warp using the specified binary reduction functor.  The output is valid in warp <em>lane</em><sub>0</sub>.
 *
 * All threads in each logical warp must agree on the same value for \p valid_items.  Otherwise the result is undefined.
 *
 * Supports non-commutative reduction operators
 *
 * \par Snippet
 * The code snippet below illustrates a max reduction within a single, partially-full
 * block of 32 threads (one warp).
 * \par
 * \code
 * #include <cub/cub.cuh>
 *
 * __global__ void ExampleKernel(int *d_data, int valid_items)
 * {
 *     // Specialize WarpReduce for a single warp on type int
 *     typedef cub::WarpReduce<int, 1> WarpReduce;
 *
 *     // Allocate shared memory for WarpReduce
 *     __shared__ typename WarpReduce::TempStorage temp_storage;
 *
 *     // Obtain one input item per thread if in range
 *     int thread_data;
 *     if (threadIdx.x < valid_items)
 *         thread_data = d_data[threadIdx.x];
 *
 *     // Return the warp-wide reductions to each lane0
 *     int aggregate = WarpReduce(temp_storage).Reduce(
 *         thread_data, cub::Max(), valid_items);
 * }
 * \endcode
 * \par
 * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, ...</tt> and \p valid_items
 * is \p 4.  The corresponding output \p aggregate in thread0 is \p 3 (and is
 * undefined in other threads).
 *
 * \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
 */
template <typename ReductionOp>
__device__ __forceinline__ T Reduce(
    T           input,                  ///< [in] Calling thread's input
    ReductionOp reduction_op,           ///< [in] Binary reduction operator
    int         valid_items)            ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
{
    // Determine if we don't need bounds checking
    if (valid_items >= LOGICAL_WARP_THREADS)
    {
        return InternalWarpReduce(temp_storage, warp_id, lane_id).template Reduce<true, 1>(input, valid_items, reduction_op);
    }
    else
    {
        return InternalWarpReduce(temp_storage, warp_id, lane_id).template Reduce<false, 1>(input, valid_items, reduction_op);
    }
}
/**
 * \brief Computes a segmented reduction in each active warp where segments are defined by head-flags.  The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
 *
 * Supports non-commutative reduction operators
 *
 * \par Snippet
 * The code snippet below illustrates a head-segmented warp max
 * reduction within a block of 32 threads (one warp).
 * \par
 * \code
 * #include <cub/cub.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize WarpReduce for a single warp on type int
 *     typedef cub::WarpReduce<int, 1> WarpReduce;
 *
 *     // Allocate shared memory for WarpReduce
 *     __shared__ typename WarpReduce::TempStorage temp_storage;
 *
 *     // Obtain one input item and flag per thread
 *     int thread_data = ...
 *     int head_flag = ...
 *
 *     // Return the warp-wide reductions to each lane0
 *     int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce(
 *         thread_data, head_flag, cub::Max());
 * }
 * \endcode
 * \par
 * Suppose the set of input \p thread_data and \p head_flag across the block of threads
 * is <tt>0, 1, 2, 3, ..., 31</tt> and is <tt>1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0</tt>,
 * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
 * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
 *
 * \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
 * \tparam Flag <b>[inferred]</b> The flag type (must be an integer type)
 */
template <
    typename ReductionOp,
    typename Flag>
__device__ __forceinline__ T HeadSegmentedReduce(
    T           input,                  ///< [in] Calling thread's input
    Flag        head_flag,              ///< [in] Head flag denoting whether or not \p input is the start of a new segment
    ReductionOp reduction_op)           ///< [in] Reduction operator
{
    return InternalWarpReduce(temp_storage, warp_id, lane_id).template SegmentedReduce<true>(input, head_flag, reduction_op);
}
/**
 * \brief Computes a segmented reduction in each active warp where segments are defined by tail-flags.  The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
 *
 * Supports non-commutative reduction operators
 *
 * \par Snippet
 * The code snippet below illustrates a tail-segmented warp max
 * reduction within a block of 32 threads (one warp).
 * \par
 * \code
 * #include <cub/cub.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize WarpReduce for a single warp on type int
 *     typedef cub::WarpReduce<int, 1> WarpReduce;
 *
 *     // Allocate shared memory for WarpReduce
 *     __shared__ typename WarpReduce::TempStorage temp_storage;
 *
 *     // Obtain one input item and flag per thread
 *     int thread_data = ...
 *     int tail_flag = ...
 *
 *     // Return the warp-wide reductions to each lane0
 *     int aggregate = WarpReduce(temp_storage).TailSegmentedReduce(
 *         thread_data, tail_flag, cub::Max());
 * }
 * \endcode
 * \par
 * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
 * is <tt>0, 1, 2, 3, ..., 31</tt> and is <tt>0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1</tt>,
 * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
 * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
 *
 * \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
 * \tparam Flag <b>[inferred]</b> The flag type (must be an integer type)
 */
template <
    typename ReductionOp,
    typename Flag>
__device__ __forceinline__ T TailSegmentedReduce(
    T           input,                  ///< [in] Calling thread's input
    Flag        tail_flag,              ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
    ReductionOp reduction_op)           ///< [in] Reduction operator
{
    return InternalWarpReduce(temp_storage, warp_id, lane_id).template SegmentedReduce<false>(input, tail_flag, reduction_op);
}

//@} end member group
674
/** @} */ // end group WarpModule
677
CUB_NS_POSTFIX // Optional outer namespace(s)