3
// ************************************************************************
5
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
6
// Copyright (2012) Sandia Corporation
8
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9
// the U.S. Government retains certain rights in this software.
11
// Redistribution and use in source and binary forms, with or without
12
// modification, are permitted provided that the following conditions are
15
// 1. Redistributions of source code must retain the above copyright
16
// notice, this list of conditions and the following disclaimer.
18
// 2. Redistributions in binary form must reproduce the above copyright
19
// notice, this list of conditions and the following disclaimer in the
20
// documentation and/or other materials provided with the distribution.
22
// 3. Neither the name of the Corporation nor the names of the
23
// contributors may be used to endorse or promote products derived from
24
// this software without specific prior written permission.
26
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
40
// ************************************************************************
43
#ifndef KOKKOS_CUDA_VECTORIZATION_HPP
44
#define KOKKOS_CUDA_VECTORIZATION_HPP
45
#include <Kokkos_Cuda.hpp>
50
// Shuffle only makes sense on >= Kepler GPUs; it doesn't work on CPUs
51
// or other GPUs. We provide a generic definition (which is trivial
52
// and doesn't do what it claims to do) because we don't actually use
53
// this function unless we are on a suitable GPU, with a suitable
54
// Scalar type. (For example, in the mat-vec, the "ThreadsPerRow"
55
// internal parameter depends both on the ExecutionSpace and the Scalar type,
56
// and it controls whether shfl_down() gets called.)
59
// Helper template that treats a Scalar as `n` 4-byte float words (presumably
// the Impl::shfl_union<Scalar> used by the wide-type shuffle wrappers below).
// NOTE(review): the struct header, the word-array declaration, the accessor
// signature, the loop bodies and the closing braces are not visible here;
// the notes below describe only the lines that are shown.
template< typename Scalar >
61
// Number of whole 4-byte words in Scalar (integer division: any trailing
// sizeof(Scalar) % 4 bytes are not counted).
enum {n = sizeof(Scalar)/4};
63
KOKKOS_INLINE_FUNCTION
65
// Reinterprets the storage starting at fval as a Scalar and yields it.
return *(Scalar*) fval;
67
KOKKOS_INLINE_FUNCTION
68
void operator= (Scalar& value) {
69
// View the incoming value as an array of float-sized words.
float* const val_ptr = (float*) &value;
70
// Iterates over the n words; the loop body is not visible here --
// presumably it copies val_ptr[i] into fval[i] (TODO confirm).
for(int i=0; i<n ; i++) {
74
KOKKOS_INLINE_FUNCTION
75
void operator= (const Scalar& value) {
76
// Same word view as the non-const overload; note that the C-style cast
// also removes the const qualifier from &value.
float* const val_ptr = (float*) &value;
77
for(int i=0; i<n ; i++) {
86
#if (__CUDA_ARCH__ >= 300)
88
// shfl overload for int: forwards (val, srcLane, width) unchanged to the CUDA
// __shfl intrinsic and returns its result.
KOKKOS_INLINE_FUNCTION
89
int shfl(const int &val, const int& srcLane, const int& width ) {
90
return __shfl(val,srcLane,width);
93
// shfl overload for float: forwards (val, srcLane, width) unchanged to the
// CUDA __shfl intrinsic and returns its result.
KOKKOS_INLINE_FUNCTION
94
float shfl(const float &val, const int& srcLane, const int& width ) {
95
return __shfl(val,srcLane,width);
98
// Generic shfl for 4-byte types. The Impl::enable_if on the type of `width`
// restricts this overload to sizeof(Scalar) == 4.
// The value is reinterpreted as a float, shuffled with __shfl, and the
// result is reinterpreted back as Scalar.
// NOTE(review): the declaration of `tmp1` is not visible in this excerpt --
// presumably a local, non-const copy of `val`; confirm.
template<typename Scalar>
99
KOKKOS_INLINE_FUNCTION
100
Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type& width
103
float tmp = *reinterpret_cast<float*>(&tmp1);
104
tmp = __shfl(tmp,srcLane,width);
105
return *reinterpret_cast<Scalar*>(&tmp);
108
// shfl overload for double: the value is split into two ints with
// __double2loint / __double2hiint, each int is shuffled separately with
// __shfl, and the two results are recombined with __hiloint2double.
KOKKOS_INLINE_FUNCTION
109
double shfl(const double &val, const int& srcLane, const int& width) {
110
int lo = __double2loint(val);
111
int hi = __double2hiint(val);
112
lo = __shfl(lo,srcLane,width);
113
hi = __shfl(hi,srcLane,width);
114
return __hiloint2double(hi,lo);
117
// Generic shfl for 8-byte types. The Impl::enable_if on the type of `width`
// restricts this overload to sizeof(Scalar) == 8.
// The storage of `val` is read as a double, split into two ints, each int is
// shuffled with __shfl, and the recombined double is reinterpreted as Scalar.
template<typename Scalar>
118
KOKKOS_INLINE_FUNCTION
119
Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 8) ,int>::type& width) {
120
int lo = __double2loint(*reinterpret_cast<const double*>(&val));
121
int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
122
lo = __shfl(lo,srcLane,width);
123
hi = __shfl(hi,srcLane,width);
124
const double tmp = __hiloint2double(hi,lo);
125
return *(reinterpret_cast<const Scalar*>(&tmp));
128
// Generic shfl for types wider than 8 bytes. The Impl::enable_if on the type
// of `width` restricts this overload to sizeof(Scalar) > 8.
// Each of the s_val.n float words of s_val is shuffled with __shfl into the
// matching word of r_val, and r_val.value() is returned.
// NOTE(review): the statement that loads `val` into s_val is not visible in
// this excerpt -- presumably `s_val = val;`; confirm.
template<typename Scalar>
129
KOKKOS_INLINE_FUNCTION
130
Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) > 8) ,int>::type& width) {
131
Impl::shfl_union<Scalar> s_val;
132
Impl::shfl_union<Scalar> r_val;
135
for(int i = 0; i<s_val.n; i++)
136
r_val.fval[i] = __shfl(s_val.fval[i],srcLane,width);
137
return r_val.value();
140
// shfl_down overload for int: forwards (val, delta, width) unchanged to the
// CUDA __shfl_down intrinsic and returns its result.
KOKKOS_INLINE_FUNCTION
141
int shfl_down(const int &val, const int& delta, const int& width) {
142
return __shfl_down(val,delta,width);
145
// shfl_down overload for float: forwards (val, delta, width) unchanged to the
// CUDA __shfl_down intrinsic and returns its result.
KOKKOS_INLINE_FUNCTION
146
float shfl_down(const float &val, const int& delta, const int& width) {
147
return __shfl_down(val,delta,width);
150
// Generic shfl_down for 4-byte types. The Impl::enable_if on the type of
// `width` restricts this overload to sizeof(Scalar) == 4.
// The value is reinterpreted as a float, shuffled with __shfl_down, and the
// result is reinterpreted back as Scalar.
// NOTE(review): the declaration of `tmp1` is not visible in this excerpt --
// presumably a local, non-const copy of `val`; confirm.
template<typename Scalar>
151
KOKKOS_INLINE_FUNCTION
152
Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
154
float tmp = *reinterpret_cast<float*>(&tmp1);
155
tmp = __shfl_down(tmp,delta,width);
156
return *reinterpret_cast<Scalar*>(&tmp);
159
// shfl_down overload for double: the value is split into two ints with
// __double2loint / __double2hiint, each int is shuffled separately with
// __shfl_down, and the two results are recombined with __hiloint2double.
KOKKOS_INLINE_FUNCTION
160
double shfl_down(const double &val, const int& delta, const int& width) {
161
int lo = __double2loint(val);
162
int hi = __double2hiint(val);
163
lo = __shfl_down(lo,delta,width);
164
hi = __shfl_down(hi,delta,width);
165
return __hiloint2double(hi,lo);
168
// Generic shfl_down for 8-byte types. The Impl::enable_if on the type of
// `width` restricts this overload to sizeof(Scalar) == 8.
// The storage of `val` is read as a double, split into two ints, each int is
// shuffled with __shfl_down, and the recombined double is reinterpreted as
// Scalar.
template<typename Scalar>
169
KOKKOS_INLINE_FUNCTION
170
Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
171
int lo = __double2loint(*reinterpret_cast<const double*>(&val));
172
int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
173
lo = __shfl_down(lo,delta,width);
174
hi = __shfl_down(hi,delta,width);
175
const double tmp = __hiloint2double(hi,lo);
176
return *(reinterpret_cast<const Scalar*>(&tmp));
179
// Generic shfl_down for types wider than 8 bytes. The Impl::enable_if on the
// type of `width` restricts this overload to sizeof(Scalar) > 8.
// Each of the s_val.n float words of s_val is shuffled with __shfl_down into
// the matching word of r_val, and r_val.value() is returned.
// NOTE(review): the statement that loads `val` into s_val is not visible in
// this excerpt -- presumably `s_val = val;`; confirm.
template<typename Scalar>
180
KOKKOS_INLINE_FUNCTION
181
Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
182
Impl::shfl_union<Scalar> s_val;
183
Impl::shfl_union<Scalar> r_val;
186
for(int i = 0; i<s_val.n; i++)
187
r_val.fval[i] = __shfl_down(s_val.fval[i],delta,width);
188
return r_val.value();
191
// shfl_up overload for int: forwards (val, delta, width) unchanged to the
// CUDA __shfl_up intrinsic and returns its result.
KOKKOS_INLINE_FUNCTION
192
int shfl_up(const int &val, const int& delta, const int& width ) {
193
return __shfl_up(val,delta,width);
196
// shfl_up overload for float: forwards (val, delta, width) unchanged to the
// CUDA __shfl_up intrinsic and returns its result.
KOKKOS_INLINE_FUNCTION
197
float shfl_up(const float &val, const int& delta, const int& width ) {
198
return __shfl_up(val,delta,width);
201
// Generic shfl_up for 4-byte types. The Impl::enable_if on the type of
// `width` restricts this overload to sizeof(Scalar) == 4.
// The value is reinterpreted as a float, shuffled with __shfl_up, and the
// result is reinterpreted back as Scalar.
// NOTE(review): the declaration of `tmp1` is not visible in this excerpt --
// presumably a local, non-const copy of `val`; confirm.
template<typename Scalar>
202
KOKKOS_INLINE_FUNCTION
203
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
205
float tmp = *reinterpret_cast<float*>(&tmp1);
206
tmp = __shfl_up(tmp,delta,width);
207
return *reinterpret_cast<Scalar*>(&tmp);
210
// shfl_up overload for double: the value is split into two ints with
// __double2loint / __double2hiint, each int is shuffled separately with
// __shfl_up, and the two results are recombined with __hiloint2double.
KOKKOS_INLINE_FUNCTION
211
double shfl_up(const double &val, const int& delta, const int& width ) {
212
int lo = __double2loint(val);
213
int hi = __double2hiint(val);
214
lo = __shfl_up(lo,delta,width);
215
hi = __shfl_up(hi,delta,width);
216
return __hiloint2double(hi,lo);
219
// Generic shfl_up for 8-byte types. The Impl::enable_if on the type of
// `width` restricts this overload to sizeof(Scalar) == 8.
// The storage of `val` is read as a double, split into two ints, each int is
// shuffled with __shfl_up, and the recombined double is reinterpreted as
// Scalar.
template<typename Scalar>
220
KOKKOS_INLINE_FUNCTION
221
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
222
int lo = __double2loint(*reinterpret_cast<const double*>(&val));
223
int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
224
lo = __shfl_up(lo,delta,width);
225
hi = __shfl_up(hi,delta,width);
226
const double tmp = __hiloint2double(hi,lo);
227
return *(reinterpret_cast<const Scalar*>(&tmp));
230
// Generic shfl_up for types wider than 8 bytes. The Impl::enable_if on the
// type of `width` restricts this overload to sizeof(Scalar) > 8.
// Each of the s_val.n float words of s_val is shuffled with __shfl_up into
// the matching word of r_val, and r_val.value() is returned.
// NOTE(review): the statement that loads `val` into s_val is not visible in
// this excerpt -- presumably `s_val = val;`; confirm.
template<typename Scalar>
231
KOKKOS_INLINE_FUNCTION
232
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
233
Impl::shfl_union<Scalar> s_val;
234
Impl::shfl_union<Scalar> r_val;
237
for(int i = 0; i<s_val.n; i++)
238
r_val.fval[i] = __shfl_up(s_val.fval[i],delta,width);
239
return r_val.value();
243
// Generic shfl overload that issues no shuffle intrinsic in the visible
// lines: when width > 1 it calls Kokkos::abort with the message below.
// NOTE(review): the return statement is not visible in this excerpt.
template<typename Scalar>
244
KOKKOS_INLINE_FUNCTION
245
Scalar shfl(const Scalar &val, const int& srcLane, const int& width) {
246
if(width > 1) Kokkos::abort("Error: calling shfl from a device with CC<3.0.");
250
// Generic shfl_down overload that issues no shuffle intrinsic in the visible
// lines: when width > 1 it calls Kokkos::abort with the message below.
// NOTE(review): the return statement is not visible in this excerpt.
template<typename Scalar>
251
KOKKOS_INLINE_FUNCTION
252
Scalar shfl_down(const Scalar &val, const int& delta, const int& width) {
253
if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<3.0.");
257
// Generic shfl_up overload that issues no shuffle intrinsic in the visible
// lines: when width > 1 it calls Kokkos::abort. The message names shfl_up so
// that the abort points at the function that was actually called.
// NOTE(review): the return statement is not visible in this excerpt.
template<typename Scalar>
258
KOKKOS_INLINE_FUNCTION
259
Scalar shfl_up(const Scalar &val, const int& delta, const int& width) {
260
if(width > 1) Kokkos::abort("Error: calling shfl_up from a device with CC<3.0.");
265
// Second generic shfl overload with no shuffle intrinsic in the visible
// lines: when width > 1 it calls Kokkos::abort with the message below.
// NOTE(review): the specifier line preceding the signature and the return
// statement are not visible in this excerpt.
template<typename Scalar>
267
Scalar shfl(const Scalar &val, const int& srcLane, const int& width) {
268
if(width > 1) Kokkos::abort("Error: calling shfl from a device with CC<3.0.");
272
// Second generic shfl_down overload with no shuffle intrinsic in the visible
// lines: when width > 1 it calls Kokkos::abort with the message below.
// NOTE(review): the specifier line preceding the signature and the return
// statement are not visible in this excerpt.
template<typename Scalar>
274
Scalar shfl_down(const Scalar &val, const int& delta, const int& width) {
275
if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<3.0.");
279
// Second generic shfl_up overload with no shuffle intrinsic in the visible
// lines: when width > 1 it calls Kokkos::abort. The message names shfl_up so
// that the abort points at the function that was actually called.
// NOTE(review): the specifier line preceding the signature and the return
// statement are not visible in this excerpt.
template<typename Scalar>
281
Scalar shfl_up(const Scalar &val, const int& delta, const int& width) {
282
if(width > 1) Kokkos::abort("Error: calling shfl_up from a device with CC<3.0.");