// ************************************************************************
//
//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
//              Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
44
#ifndef KOKKOS_CUDA_INTERNAL_HPP
45
#define KOKKOS_CUDA_INTERNAL_HPP
50
void cuda_internal_error_throw( cudaError e , const char * name, const char * file = NULL, const int line = 0 );
52
void cuda_device_synchronize();
55
void cuda_internal_safe_call( cudaError e , const char * name, const char * file = NULL, const int line = 0)
57
if ( cudaSuccess != e ) { cuda_internal_error_throw( e , name, file, line ); }
namespace Kokkos {
namespace Impl {

// Search for the largest block size at which the launch kernel for DriverType
// can still be resident on a multiprocessor.
//
// The block size is doubled until the occupancy query reports zero resident
// blocks or the size reaches 1024.  The dynamic shared memory requirement is
// re-evaluated through FunctorTeamShmemSize for every candidate size.
//
//   f       : functor handed to FunctorTeamShmemSize::value
//   returns : the last block size with a non-zero occupancy; half of the
//             failing size when the search stopped on a zero result
//
// A failed occupancy query is treated as "zero blocks fit" so the search
// ends with a defined value instead of reading an unset counter.
template<class DriverType>
int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
#if ( CUDA_VERSION < 6050 )
  // NOTE(review): fallback value reconstructed - this line is missing from the listing.
  return 256;
#else
  // Drivers larger than the threshold are queried through the
  // constant-memory launch kernel, all others through the local-memory one.
  // NOTE(review): the if/else pairing is reconstructed from statement order - confirm.
  const bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) );

  int numBlocks = 0;
  if ( Large ) {
    // NOTE(review): starting size reconstructed - this line is missing from the listing.
    int blockSize = 32;
    int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
    if ( cudaSuccess != cudaOccupancyMaxActiveBlocksPerMultiprocessor(
                          &numBlocks,
                          cuda_parallel_launch_constant_memory<DriverType>,
                          blockSize,
                          sharedmem ) ) {
      numBlocks = 0;
    }

    while (blockSize<1024 && numBlocks>0) {
      blockSize*=2;
      sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );

      if ( cudaSuccess != cudaOccupancyMaxActiveBlocksPerMultiprocessor(
                            &numBlocks,
                            cuda_parallel_launch_constant_memory<DriverType>,
                            blockSize,
                            sharedmem ) ) {
        numBlocks = 0;
      }
    }
    // The loop may have stopped on a size that does not fit; step back once.
    if(numBlocks>0) return blockSize;
    else return blockSize/2;
  }
  else {
    // NOTE(review): starting size reconstructed - this line is missing from the listing.
    int blockSize = 32;
    int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
    if ( cudaSuccess != cudaOccupancyMaxActiveBlocksPerMultiprocessor(
                          &numBlocks,
                          cuda_parallel_launch_local_memory<DriverType>,
                          blockSize,
                          sharedmem ) ) {
      numBlocks = 0;
    }

    while (blockSize<1024 && numBlocks>0) {
      blockSize*=2;
      sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );

      if ( cudaSuccess != cudaOccupancyMaxActiveBlocksPerMultiprocessor(
                            &numBlocks,
                            cuda_parallel_launch_local_memory<DriverType>,
                            blockSize,
                            sharedmem ) ) {
        numBlocks = 0;
      }
    }
    // The loop may have stopped on a size that does not fit; step back once.
    if(numBlocks>0) return blockSize;
    else return blockSize/2;
  }
#endif
}

} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
namespace Impl {

// Search for the block size that maximises resident threads per
// multiprocessor for the launch kernel of DriverType.
//
// For each candidate size the occupancy query yields the number of resident
// blocks; the score is numBlocks * blockSize.  The comparison is strict, so
// when several sizes tie the smallest one is kept.
//
//   f       : functor handed to FunctorTeamShmemSize::value
//   returns : the best block size found, or 0 when no candidate had a
//             non-zero occupancy
//
// A failed occupancy query is treated as zero occupancy so a stale count
// from the previous iteration is never reused.
template<class DriverType>
int cuda_get_opt_block_size(const typename DriverType::functor_type & f) {
#if ( CUDA_VERSION < 6050 )
  // NOTE(review): fallback value reconstructed - this line is missing from the listing.
  return 256;
#else
  // Drivers larger than the threshold are queried through the
  // constant-memory launch kernel, all others through the local-memory one.
  // NOTE(review): the if/else pairing is reconstructed from statement order - confirm.
  const bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) );

  // NOTE(review): initial values reconstructed - these lines are missing from
  // the listing.  Starting at 16 with the doubling at the top of the loop
  // makes 32 the first size tried and 1024 the last.
  int blockSize = 16;
  int numBlocks = 0;
  int sharedmem = 0;
  int maxOccupancy = 0;
  int bestBlockSize = 0;

  if ( Large ) {
    while(blockSize<1024) {
      blockSize*=2;

      // Calculate the occupancy for this block size and check whether it is
      // larger than the largest one found so far.
      sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
      if ( cudaSuccess != cudaOccupancyMaxActiveBlocksPerMultiprocessor(
                            &numBlocks,
                            cuda_parallel_launch_constant_memory<DriverType>,
                            blockSize,
                            sharedmem ) ) {
        numBlocks = 0;
      }
      if(maxOccupancy < numBlocks*blockSize) {
        maxOccupancy = numBlocks*blockSize;
        bestBlockSize = blockSize;
      }
    }
  }
  else {
    while(blockSize<1024) {
      blockSize*=2;
      sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );

      if ( cudaSuccess != cudaOccupancyMaxActiveBlocksPerMultiprocessor(
                            &numBlocks,
                            cuda_parallel_launch_local_memory<DriverType>,
                            blockSize,
                            sharedmem ) ) {
        numBlocks = 0;
      }

      if(maxOccupancy < numBlocks*blockSize) {
        maxOccupancy = numBlocks*blockSize;
        bestBlockSize = blockSize;
      }
    }
  }
  return bestBlockSize;
#endif
}

} // namespace Impl
} // namespace Kokkos
// Evaluate a CUDA runtime call and pass its status to
// Kokkos::Impl::cuda_internal_safe_call together with the stringified
// expression and the file and line of the call site.
#define CUDA_SAFE_CALL( call )  \
  Kokkos::Impl::cuda_internal_safe_call( call , #call, __FILE__, __LINE__ )

#endif /* #ifndef KOKKOS_CUDA_INTERNAL_HPP */