/*
 *  Copyright 2008-2011 NVIDIA Corporation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
#include <thrust/detail/config.h>

// do not attempt to compile this file, which uses CUDA built-in variables, with any compiler other than nvcc
#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC

#include <thrust/iterator/iterator_traits.h>

namespace thrust
{
namespace detail
{
namespace backend
{
namespace cuda
{
namespace block
{
// Reduces the block_size elements in [data, data + block_size) with binary_op,
// leaving the result in data[0]. Every thread in the block must call this;
// upon return the remaining elements have unspecified values.
template <unsigned int block_size, typename ValueIterator, typename BinaryFunction>
__device__ __forceinline__
void reduce(ValueIterator data, BinaryFunction binary_op)
{
  // halve the number of active threads at each step, combining pairs of elements;
  // branches on the compile-time block_size are resolved statically by nvcc
  // TODO generalize this code with TMP
  if (block_size >= 1024) { if (threadIdx.x < 512) { data[threadIdx.x] = binary_op(data[threadIdx.x], data[threadIdx.x + 512]); } __syncthreads(); }
  if (block_size >=  512) { if (threadIdx.x < 256) { data[threadIdx.x] = binary_op(data[threadIdx.x], data[threadIdx.x + 256]); } __syncthreads(); }
  if (block_size >=  256) { if (threadIdx.x < 128) { data[threadIdx.x] = binary_op(data[threadIdx.x], data[threadIdx.x + 128]); } __syncthreads(); }
  if (block_size >=  128) { if (threadIdx.x <  64) { data[threadIdx.x] = binary_op(data[threadIdx.x], data[threadIdx.x +  64]); } __syncthreads(); }
  if (block_size >=   64) { if (threadIdx.x <  32) { data[threadIdx.x] = binary_op(data[threadIdx.x], data[threadIdx.x +  32]); } __syncthreads(); }
  if (block_size >=   32) { if (threadIdx.x <  16) { data[threadIdx.x] = binary_op(data[threadIdx.x], data[threadIdx.x +  16]); } __syncthreads(); }
  if (block_size >=   16) { if (threadIdx.x <   8) { data[threadIdx.x] = binary_op(data[threadIdx.x], data[threadIdx.x +   8]); } __syncthreads(); }
  if (block_size >=    8) { if (threadIdx.x <   4) { data[threadIdx.x] = binary_op(data[threadIdx.x], data[threadIdx.x +   4]); } __syncthreads(); }
  if (block_size >=    4) { if (threadIdx.x <   2) { data[threadIdx.x] = binary_op(data[threadIdx.x], data[threadIdx.x +   2]); } __syncthreads(); }
  if (block_size >=    2) { if (threadIdx.x <   1) { data[threadIdx.x] = binary_op(data[threadIdx.x], data[threadIdx.x +   1]); } __syncthreads(); }
} // end reduce()
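
// A minimal usage sketch (not part of this header): a hypothetical kernel that
// stages one value per thread in shared memory and combines the values with
// block::reduce. The kernel name, the shared array, and the choice of
// thrust::plus (from <thrust/functional.h>) are illustrative assumptions.
//
//   template <unsigned int block_size>
//   __global__ void block_sum(const float *in, float *out)
//   {
//     __shared__ float sdata[block_size];
//     sdata[threadIdx.x] = in[blockIdx.x * block_size + threadIdx.x];
//     __syncthreads();
//     thrust::detail::backend::cuda::block::reduce<block_size>(sdata, thrust::plus<float>());
//     if (threadIdx.x == 0) out[blockIdx.x] = sdata[0];  // block's partial sum
//   }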
// Reduces the n elements in [data, data + n) with binary_op, leaving the
// result in data[0]; n need not be a power of two and may exceed blockDim.x.
// Every thread in the block must call this function.
template <typename ValueIterator, typename BinaryFunction>
__device__ __forceinline__
void reduce_n(ValueIterator data, unsigned int n, BinaryFunction binary_op)
{
  if (blockDim.x < n)
  {
    // fold the tail elements [blockDim.x, n) into the first blockDim.x slots
    for (unsigned int i = blockDim.x + threadIdx.x; i < n; i += blockDim.x)
      data[threadIdx.x] = binary_op(data[threadIdx.x], data[i]);

    __syncthreads();

    n = blockDim.x;
  }

  // tree reduction: pair each of the first n/2 slots with one from the back half
  while (n > 1)
  {
    unsigned int half = n / 2;

    if (threadIdx.x < half)
      data[threadIdx.x] = binary_op(data[threadIdx.x], data[n - threadIdx.x - 1]);

    __syncthreads();

    n = n - half;
  }
} // end reduce_n()
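
// A minimal usage sketch for reduce_n (illustrative, not part of this header):
// reduce a tile whose length is only known at runtime, using dynamic shared
// memory. The kernel and buffer names are assumptions; launch with
// tile_sum<<<1, threads, n * sizeof(int)>>>(in, out, n).
//
//   __global__ void tile_sum(const int *in, int *out, unsigned int n)
//   {
//     extern __shared__ int tile[];
//     for (unsigned int i = threadIdx.x; i < n; i += blockDim.x)
//       tile[i] = in[i];
//     __syncthreads();
//     thrust::detail::backend::cuda::block::reduce_n(tile, n, thrust::plus<int>());
//     if (threadIdx.x == 0) *out = tile[0];  // the full reduction
//   }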
} // end namespace block
} // end namespace cuda
} // end namespace backend
} // end namespace detail
} // end namespace thrust

#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC