1
/******************************************************************************
2
* Copyright (c) 2011, Duane Merrill. All rights reserved.
3
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions are met:
7
* * Redistributions of source code must retain the above copyright
8
* notice, this list of conditions and the following disclaimer.
9
* * Redistributions in binary form must reproduce the above copyright
10
* notice, this list of conditions and the following disclaimer in the
11
* documentation and/or other materials provided with the distribution.
12
* * Neither the name of the NVIDIA CORPORATION nor the
13
* names of its contributors may be used to endorse or promote products
14
* derived from this software without specific prior written permission.
16
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
******************************************************************************/
31
* Static architectural properties by SM version.
35
/******************************************************************************
36
* Static architectural properties by SM version.
38
* "Device" reflects the PTX architecture targeted by the active compiler
39
* pass. It provides useful compile-time statics within device code. E.g.,:
41
* __shared__ int[Device::WARP_THREADS];
43
* int padded_offset = threadIdx.x + (threadIdx.x >> Device::LOG_SMEM_BANKS);
45
******************************************************************************/
49
#include "util_namespace.cuh"
51
/// Optional outer namespace(s)
59
* \addtogroup UtilModule
64
/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
66
#define CUB_PTX_ARCH 0
68
#define CUB_PTX_ARCH __CUDA_ARCH__
72
/// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API.
73
#if !defined(__CUDA_ARCH__) || defined(CUB_CDP)
74
#define CUB_RUNTIME_ENABLED
78
/// Execution space for destructors
79
#if ((CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH < 200))
80
#define CUB_DESTRUCTOR __host__
82
#define CUB_DESTRUCTOR __host__ __device__
87
* \brief Structure for statically reporting CUDA device properties, parameterized by SM architecture.
89
* The default specialization is for SM10.
91
template <int SM_ARCH>
97
5, /// Log of the number of threads per warp
99
1 << LOG_WARP_THREADS, /// Number of threads per warp
101
4, /// Log of the number of smem banks
103
1 << LOG_SMEM_BANKS, /// The number of smem banks
105
4, /// Size of smem bank words
107
16 * 1024, /// Maximum SM shared memory
109
512, /// Smem allocation size in bytes
111
true, /// Whether or not the architecture allocates registers by block (or by warp)
113
256, /// Number of registers allocated at a time per block (or by warp)
115
2, /// Granularity of warps for which registers are allocated
117
768, /// Maximum number of threads per SM
118
MAX_SM_THREADBLOCKS =
119
8, /// Maximum number of thread blocks per SM
121
512, /// Maximum number of thread per thread block
123
8 * 1024, /// Maximum number of registers per SM
130
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
133
* Architecture properties for SM30
136
struct ArchProps<300>
140
LOG_WARP_THREADS = 5, // 32 threads per warp
141
WARP_THREADS = 1 << LOG_WARP_THREADS,
142
LOG_SMEM_BANKS = 5, // 32 banks
143
SMEM_BANKS = 1 << LOG_SMEM_BANKS,
144
SMEM_BANK_BYTES = 4, // 4 byte bank words
145
SMEM_BYTES = 48 * 1024, // 48KB shared memory
146
SMEM_ALLOC_UNIT = 256, // 256B smem allocation segment size
147
REGS_BY_BLOCK = false, // Allocates registers by warp
148
REG_ALLOC_UNIT = 256, // 256 registers allocated at a time per warp
149
WARP_ALLOC_UNIT = 4, // Registers are allocated at a granularity of every 4 warps per threadblock
150
MAX_SM_THREADS = 2048, // 2K max threads per SM
151
MAX_SM_THREADBLOCKS = 16, // 16 max threadblocks per SM
152
MAX_BLOCK_THREADS = 1024, // 1024 max threads per threadblock
153
MAX_SM_REGISTERS = 64 * 1024, // 64K max registers per SM
157
template <typename T>
158
static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
160
target.template Callback<ArchProps>();
166
* Architecture properties for SM20
169
struct ArchProps<200>
173
LOG_WARP_THREADS = 5, // 32 threads per warp
174
WARP_THREADS = 1 << LOG_WARP_THREADS,
175
LOG_SMEM_BANKS = 5, // 32 banks
176
SMEM_BANKS = 1 << LOG_SMEM_BANKS,
177
SMEM_BANK_BYTES = 4, // 4 byte bank words
178
SMEM_BYTES = 48 * 1024, // 48KB shared memory
179
SMEM_ALLOC_UNIT = 128, // 128B smem allocation segment size
180
REGS_BY_BLOCK = false, // Allocates registers by warp
181
REG_ALLOC_UNIT = 64, // 64 registers allocated at a time per warp
182
WARP_ALLOC_UNIT = 2, // Registers are allocated at a granularity of every 2 warps per threadblock
183
MAX_SM_THREADS = 1536, // 1536 max threads per SM
184
MAX_SM_THREADBLOCKS = 8, // 8 max threadblocks per SM
185
MAX_BLOCK_THREADS = 1024, // 1024 max threads per threadblock
186
MAX_SM_REGISTERS = 32 * 1024, // 32K max registers per SM
190
template <typename T>
191
static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
193
if (sm_version > 200) {
194
ArchProps<300>::Callback(target, sm_version);
196
target.template Callback<ArchProps>();
203
* Architecture properties for SM12
206
struct ArchProps<120>
210
LOG_WARP_THREADS = 5, // 32 threads per warp
211
WARP_THREADS = 1 << LOG_WARP_THREADS,
212
LOG_SMEM_BANKS = 4, // 16 banks
213
SMEM_BANKS = 1 << LOG_SMEM_BANKS,
214
SMEM_BANK_BYTES = 4, // 4 byte bank words
215
SMEM_BYTES = 16 * 1024, // 16KB shared memory
216
SMEM_ALLOC_UNIT = 512, // 512B smem allocation segment size
217
REGS_BY_BLOCK = true, // Allocates registers by threadblock
218
REG_ALLOC_UNIT = 512, // 512 registers allocated at time per threadblock
219
WARP_ALLOC_UNIT = 2, // Registers are allocated at a granularity of every 2 warps per threadblock
220
MAX_SM_THREADS = 1024, // 1024 max threads per SM
221
MAX_SM_THREADBLOCKS = 8, // 8 max threadblocks per SM
222
MAX_BLOCK_THREADS = 512, // 512 max threads per threadblock
223
MAX_SM_REGISTERS = 16 * 1024, // 16K max registers per SM
227
template <typename T>
228
static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
230
if (sm_version > 120) {
231
ArchProps<200>::Callback(target, sm_version);
233
target.template Callback<ArchProps>();
240
* Architecture properties for SM10. Derives from the default ArchProps specialization.
243
struct ArchProps<100> : ArchProps<0>
246
template <typename T>
247
static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
249
if (sm_version > 100) {
250
ArchProps<120>::Callback(target, sm_version);
252
target.template Callback<ArchProps>();
259
* Architecture properties for SM35
262
struct ArchProps<350> : ArchProps<300> {}; // Derives from SM30
265
* Architecture properties for SM21
268
struct ArchProps<210> : ArchProps<200> {}; // Derives from SM20
271
* Architecture properties for SM13
274
struct ArchProps<130> : ArchProps<120> {}; // Derives from SM12
277
* Architecture properties for SM11
280
struct ArchProps<110> : ArchProps<100> {}; // Derives from SM10
283
#endif // DOXYGEN_SHOULD_SKIP_THIS
287
* \brief The architectural properties for the PTX version targeted by the active compiler pass.
289
struct PtxArchProps : ArchProps<CUB_PTX_ARCH> {};
292
/** @} */ // end group UtilModule
295
CUB_NS_POSTFIX // Optional outer namespace(s)