~ubuntu-branches/ubuntu/utopic/libthrust/utopic


Viewing changes to system/cuda/detail/merge.inl

  • Committer: Package Import Robot
  • Author(s): Andreas Beckmann
  • Date: 2013-07-10 12:57:33 UTC
  • mfrom: (1.1.4)
  • Revision ID: package-import@ubuntu.com-20130710125733-my19jic71sqsabaj
Tags: 1.7.0-1
* New upstream release.  (Closes: #715362)
* Update watch file.

In the diff below, removed lines are prefixed with '-', added lines with '+', and unchanged context lines with a leading space.

@@ -15 +15 @@
  */
 
 #include <thrust/detail/config.h>
-
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/system/detail/generic/select_system.h>
-
+#include <thrust/system/cuda/detail/merge.h>
+#include <thrust/pair.h>
+#include <thrust/tuple.h>
 #include <thrust/detail/minmax.h>
-#include <thrust/detail/internal_functional.h>
-#include <thrust/system/cuda/detail/arch.h>
-#include <thrust/system/cuda/detail/block/copy.h>
-#include <thrust/system/cuda/detail/block/merge.h>
-#include <thrust/system/cuda/detail/extern_shared_ptr.h>
-#include <thrust/system/cuda/detail/detail/get_set_operation_splitter_ranks.h>
-#include <thrust/detail/internal_functional.h>
-#include <thrust/system/cuda/detail/tag.h>
+#include <thrust/detail/function.h>
+#include <thrust/system/cuda/detail/detail/uninitialized.h>
 #include <thrust/system/cuda/detail/detail/launch_closure.h>
+#include <thrust/detail/util/blocking.h>
 
 namespace thrust
 {
@@ -39 +32 @@
 {
 namespace detail
 {
-
 namespace merge_detail
 {
 
-template<typename T1, typename T2>
-T1 ceil_div(T1 up, T2 down)
-{
-  T1 div = up / down;
-  T1 rem = up % down;
-  return (rem != 0) ? div + 1 : div;
-}
-
-template<unsigned int N>
-  struct static_align_size_to_int
-{
-  static const unsigned int value = (N / sizeof(int)) + ((N % sizeof(int)) ? 1 : 0);
-};
-
-__host__ __device__
-inline unsigned int align_size_to_int(unsigned int N)
-{
-  return (N / sizeof(int)) + ((N % sizeof(int)) ? 1 : 0);
-}
 
 template<typename RandomAccessIterator1,
          typename RandomAccessIterator2,
-         typename RandomAccessIterator5>
-unsigned int get_merge_kernel_per_block_dynamic_smem_usage(unsigned int block_size)
-{
+         typename Size,
+         typename Compare>
+__device__ __thrust_forceinline__
+thrust::pair<Size,Size>
+  partition_search(RandomAccessIterator1 first1,
+                   RandomAccessIterator2 first2,
+                   Size diag,
+                   Size lower_bound1,
+                   Size upper_bound1,
+                   Size lower_bound2,
+                   Size upper_bound2,
+                   Compare comp)
+{
+  Size begin = thrust::max<Size>(lower_bound1, diag - upper_bound2);
+  Size end   = thrust::min<Size>(diag - lower_bound2, upper_bound1);
+
+  while(begin < end)
+  {
+    Size mid = (begin + end) / 2;
+    Size index1 = mid;
+    Size index2 = diag - mid - 1;
+
+    if(comp(first2[index2], first1[index1]))
+    {
+      end = mid;
+    }
+    else
+    {
+      begin = mid + 1;
+    }
+  }
+
+  return thrust::make_pair(begin, diag - begin);
+}
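Editorial note: partition_search above is a merge-path style diagonal search. Given an output position diag, it returns how many elements of each sorted input precede that position in the merged result. A minimal host-side sketch of the same search, written against plain std::vector rather than the device code in the patch (the names here are illustrative, not part of Thrust):

  #include <algorithm>
  #include <cassert>
  #include <utility>
  #include <vector>

  // Host-side sketch of the diagonal search: given sorted ranges a and b and
  // an output position diag, return how many elements of a (and of b) fall
  // before position diag in the merged output.
  static std::pair<int,int> partition_search_host(const std::vector<int>& a,
                                                   const std::vector<int>& b,
                                                   int diag)
  {
    int begin = std::max(0, diag - static_cast<int>(b.size()));
    int end   = std::min(diag, static_cast<int>(a.size()));

    while(begin < end)
    {
      int mid = (begin + end) / 2;
      // b[diag - mid - 1] < a[mid] means we took too many elements from a
      if(b[diag - mid - 1] < a[mid])
        end = mid;
      else
        begin = mid + 1;
    }

    return std::make_pair(begin, diag - begin);
  }

  int main()
  {
    std::vector<int> a = {1, 3, 5, 7};
    std::vector<int> b = {2, 4, 6, 8};

    // the first 4 merged elements are 1,2,3,4: two from each input
    std::pair<int,int> p = partition_search_host(a, b, 4);
    assert(p.first == 2 && p.second == 2);
    return 0;
  }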
 
+
+
+template<typename Context, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
+__device__ __thrust_forceinline__
+void merge_n(Context &ctx,
+             RandomAccessIterator1 first1,
+             Size n1,
+             RandomAccessIterator2 first2,
+             Size n2,
+             RandomAccessIterator3 result,
+             Compare comp_,
+             unsigned int work_per_thread)
+{
+  const unsigned int block_size = ctx.block_dimension();
+  thrust::detail::device_function<Compare,bool> comp(comp_);
   typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type1;
   typedef typename thrust::iterator_value<RandomAccessIterator2>::type value_type2;
-  typedef typename thrust::iterator_value<RandomAccessIterator5>::type value_type5;
-
-  // merge_kernel allocates memory aligned to int
-  const unsigned int array_size1 = align_size_to_int(block_size * sizeof(value_type1));
-  const unsigned int array_size2 = align_size_to_int(block_size * sizeof(value_type2));
-  const unsigned int array_size3 = align_size_to_int(2 * block_size * sizeof(value_type5));
-
-  return sizeof(int) * (array_size1 + array_size2 + array_size3);
-} // end get_merge_kernel_per_block_dynamic_smem_usage()
-
-
-template<typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3,
-         typename RandomAccessIterator4,
-         typename RandomAccessIterator5,
-         typename StrictWeakOrdering,
-         typename Size,
-         typename Context>
-struct merge_closure
 
+
+  Size result_size = n1 + n2;
+
+  // this is just oversubscription_rate * block_size * work_per_thread
+  // but it makes no sense to send oversubscription_rate as an extra parameter
+  Size work_per_block = thrust::detail::util::divide_ri(result_size, ctx.grid_dimension());
+
+  using thrust::system::cuda::detail::detail::uninitialized;
+  __shared__ uninitialized<thrust::pair<Size,Size> > s_block_input_begin;
+
+  Size result_block_offset = ctx.block_index() * work_per_block;
+
+  // find where this block's input begins in both input sequences
+  if(ctx.thread_index() == 0)
+  {
+    s_block_input_begin = (ctx.block_index() == 0) ?
+      thrust::pair<Size,Size>(0,0) :
+      partition_search(first1, first2,
+                       result_block_offset,
+                       Size(0), n1,
+                       Size(0), n2,
+                       comp);
+  }
+
+  ctx.barrier();
+
+  // iterate to consume this block's input
+  Size work_per_iteration = block_size * work_per_thread;
+  thrust::pair<Size,Size> block_input_end = s_block_input_begin;
+  block_input_end.first  += work_per_iteration;
+  block_input_end.second += work_per_iteration;
+  Size result_block_offset_last = result_block_offset + thrust::min<Size>(work_per_block, result_size - result_block_offset);
+
+  for(;
+      result_block_offset < result_block_offset_last;
+      result_block_offset += work_per_iteration,
+      block_input_end.first  += work_per_iteration,
+      block_input_end.second += work_per_iteration
+     )
+  {
+    // find where this thread's input begins in both input sequences for this iteration
+    thrust::pair<Size,Size> thread_input_begin =
+      partition_search(first1, first2,
+                       Size(result_block_offset + ctx.thread_index() * work_per_thread),
+                       s_block_input_begin.get().first,  thrust::min<Size>(block_input_end.first , n1),
+                       s_block_input_begin.get().second, thrust::min<Size>(block_input_end.second, n2),
+                       comp);
+
+    ctx.barrier();
+
+    // XXX the performance impact of not keeping x1 & x2
+    //     in registers is about 10% for int32
+    uninitialized<value_type1> x1;
+    uninitialized<value_type2> x2;
+
+    // XXX this is just a serial merge -- try to simplify or abstract this loop
+    Size i = result_block_offset + ctx.thread_index() * work_per_thread;
+    Size last_i = i + thrust::min<Size>(work_per_thread, result_size - thread_input_begin.first - thread_input_begin.second);
+    for(;
+        i < last_i;
+        ++i)
+    {
+      // optionally load x1 & x2
+      bool output_x2 = true;
+      if(thread_input_begin.second < n2)
+      {
+        x2 = first2[thread_input_begin.second];
+      }
+      else
+      {
+        output_x2 = false;
+      }
+
+      if(thread_input_begin.first < n1)
+      {
+        x1 = first1[thread_input_begin.first];
+
+        if(output_x2)
+        {
+          output_x2 = comp(x2.get(), x1.get());
+        }
+      }
+
+      result[i] = output_x2 ? x2.get() : x1.get();
+
+      if(output_x2)
+      {
+        ++thread_input_begin.second;
+      }
+      else
+      {
+        ++thread_input_begin.first;
+      }
+    } // end for
+
+    // the block's last thread has conveniently located the
+    // beginning of the next iteration's input
+    if(ctx.thread_index() == block_size-1)
+    {
+      s_block_input_begin = thread_input_begin;
+    }
+    ctx.barrier();
+  } // end for
+} // end merge_n
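Editorial note: merge_n assigns each thread a fixed-size slice of the output, uses partition_search to find where that slice's inputs begin, and then runs a short serial merge per slice. A host-side sketch of the same decomposition (illustrative only; plain C++ with hypothetical names, not code from the patch) shows why the concatenated per-slice merges reproduce an ordinary merge:

  #include <algorithm>
  #include <cassert>
  #include <vector>

  // Emulates merge_n's decomposition on the host: the output is cut into
  // fixed-size slices, a diagonal search finds each slice's starting point in
  // both inputs, and each slice is then merged serially and independently.
  static std::vector<int> sliced_merge(const std::vector<int>& a,
                                       const std::vector<int>& b,
                                       int work_per_slice)
  {
    const int n = static_cast<int>(a.size() + b.size());
    std::vector<int> out(n);

    for(int diag = 0; diag < n; diag += work_per_slice)
    {
      // diagonal search: how many elements of a precede output position diag
      int lo = std::max(0, diag - static_cast<int>(b.size()));
      int hi = std::min(diag, static_cast<int>(a.size()));
      while(lo < hi)
      {
        int mid = (lo + hi) / 2;
        if(b[diag - mid - 1] < a[mid]) hi = mid; else lo = mid + 1;
      }

      // serial merge of this slice, mirroring merge_n's inner loop
      int i = lo, j = diag - lo;
      for(int k = diag; k < std::min(diag + work_per_slice, n); ++k)
      {
        bool take_b = j < static_cast<int>(b.size()) &&
                      (i >= static_cast<int>(a.size()) || b[j] < a[i]);
        out[k] = take_b ? b[j++] : a[i++];
      }
    }

    return out;
  }

  int main()
  {
    std::vector<int> a = {0, 2, 4, 6, 8, 10};
    std::vector<int> b = {1, 3, 5, 7, 9};

    std::vector<int> expected(a.size() + b.size());
    std::merge(a.begin(), a.end(), b.begin(), b.end(), expected.begin());

    // every slice width yields the same result as one big serial merge
    for(int w = 1; w <= 4; ++w)
      assert(sliced_merge(a, b, w) == expected);

    return 0;
  }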
 
+
+
+template<typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
+  struct merge_n_closure
 {
-  const RandomAccessIterator1 first1;
-  const RandomAccessIterator1 last1;
-  const RandomAccessIterator2 first2;
-  const RandomAccessIterator2 last2;
-  RandomAccessIterator3 splitter_ranks1;
-  RandomAccessIterator4 splitter_ranks2;
-  const RandomAccessIterator5 result;
-  StrictWeakOrdering comp;
-  Size num_merged_partitions;
-  Context context;
-
-  typedef Context context_type;
-
-  merge_closure(const RandomAccessIterator1 first1,
-                const RandomAccessIterator1 last1,
-                const RandomAccessIterator2 first2,
-                const RandomAccessIterator2 last2,
-                RandomAccessIterator3 splitter_ranks1,
-                RandomAccessIterator4 splitter_ranks2,
-                const RandomAccessIterator5 result,
-                StrictWeakOrdering comp,
-                Size num_merged_partitions,
-                Context context = Context())
-    : first1(first1), last1(last1), first2(first2), last2(last2),
-      splitter_ranks1(splitter_ranks1), splitter_ranks2(splitter_ranks2),
-      result(result), comp(comp), num_merged_partitions(num_merged_partitions),
-      context(context)
+  typedef thrust::system::cuda::detail::detail::blocked_thread_array context_type;
+
+  RandomAccessIterator1 first1;
+  Size n1;
+  RandomAccessIterator2 first2;
+  Size n2;
+  RandomAccessIterator3 result;
+  Compare comp;
+  Size work_per_thread;
+
+  merge_n_closure(RandomAccessIterator1 first1, Size n1, RandomAccessIterator2 first2, Size n2, RandomAccessIterator3 result, Compare comp, Size work_per_thread)
+    : first1(first1), n1(n1), first2(first2), n2(n2), result(result), comp(comp), work_per_thread(work_per_thread)
   {}
 
-  __device__ __thrust_forceinline__
-  void operator()(void)
+  __device__ __forceinline__
+  void operator()()
   {
-    typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type1;
-    typedef typename thrust::iterator_value<RandomAccessIterator2>::type value_type2;
-    typedef typename thrust::iterator_value<RandomAccessIterator5>::type value_type5;
-
-    // allocate shared storage
-    const unsigned int array_size1 = align_size_to_int(context.block_dimension() * sizeof(value_type1));
-    const unsigned int array_size2 = align_size_to_int(context.block_dimension() * sizeof(value_type2));
-    const unsigned int array_size3 = align_size_to_int(2 * context.block_dimension() * sizeof(value_type5));
-    int *_shared1 = extern_shared_ptr<int>();
-    int *_shared2 = _shared1 + array_size1;
-    int *_result  = _shared2 + array_size2;
-
-    value_type1 *s_input1 = reinterpret_cast<value_type1*>(_shared1);
-    value_type2 *s_input2 = reinterpret_cast<value_type2*>(_shared2);
-    value_type5 *s_result = reinterpret_cast<value_type5*>(_result);
-
-    // advance splitter iterators
-    splitter_ranks1 += context.block_index();
-    splitter_ranks2 += context.block_index();
-
-    for(Size partition_idx = context.block_index();
-        partition_idx < num_merged_partitions;
-        partition_idx   += context.grid_dimension(),
-        splitter_ranks1 += context.grid_dimension(),
-        splitter_ranks2 += context.grid_dimension())
-    {
-      RandomAccessIterator1 input_begin1 = first1;
-      RandomAccessIterator1 input_end1   = last1;
-      RandomAccessIterator2 input_begin2 = first2;
-      RandomAccessIterator2 input_end2   = last2;
-
-      RandomAccessIterator5 output_begin = result;
-
-      // find the end of the input if this is not the last block
-      // the end of merged partition i is at splitter_ranks1[i] + splitter_ranks2[i]
-      if(partition_idx != num_merged_partitions - 1)
-      {
-        RandomAccessIterator3 rank1 = splitter_ranks1;
-        RandomAccessIterator4 rank2 = splitter_ranks2;
-
-        input_end1 = first1 + *rank1;
-        input_end2 = first2 + *rank2;
-      }
-
-      // find the beginning of the input and output if this is not the first partition
-      // merged partition i begins at splitter_ranks1[i-1] + splitter_ranks2[i-1]
-      if(partition_idx != 0)
-      {
-        RandomAccessIterator3 rank1 = splitter_ranks1 - 1;
-        RandomAccessIterator4 rank2 = splitter_ranks2 - 1;
-
-        // advance the input to point to the beginning
-        input_begin1 += *rank1;
-        input_begin2 += *rank2;
-
-        // advance the result to point to the beginning of the output
-        output_begin += *rank1;
-        output_begin += *rank2;
-      }
-
-      if(input_begin1 < input_end1 && input_begin2 < input_end2)
-      {
-        typedef typename thrust::iterator_difference<RandomAccessIterator1>::type difference1;
-
-        typedef typename thrust::iterator_difference<RandomAccessIterator2>::type difference2;
-
-        // load the first segment
-        difference1 s_input1_size = thrust::min<difference1>(context.block_dimension(), input_end1 - input_begin1);
-
-        block::copy(context, input_begin1, input_begin1 + s_input1_size, s_input1);
-        input_begin1 += s_input1_size;
-
-        // load the second segment
-        difference2 s_input2_size = thrust::min<difference2>(context.block_dimension(), input_end2 - input_begin2);
-
-        block::copy(context, input_begin2, input_begin2 + s_input2_size, s_input2);
-        input_begin2 += s_input2_size;
-
-        context.barrier();
-
-        block::merge(context,
-                     s_input1, s_input1 + s_input1_size,
-                     s_input2, s_input2 + s_input2_size,
-                     s_result,
-                     comp);
-
-        context.barrier();
-
-        // store to gmem
-        output_begin = block::copy(context, s_result, s_result + s_input1_size + s_input2_size, output_begin);
-      }
-
-      // simply copy any remaining input
-      block::copy(context, input_begin2, input_end2, block::copy(context, input_begin1, input_end1, output_begin));
-    } // end for partition
 
+    context_type ctx;
+    merge_n(ctx, first1, n1, first2, n2, result, comp, work_per_thread);
   }
-}; // end merge_closure
-
-
-template<typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3,
-         typename RandomAccessIterator4,
-         typename Compare,
-         typename Size1,
-         typename Size2,
-         typename Size3>
-  void get_merge_splitter_ranks(RandomAccessIterator1 first1,
-                                RandomAccessIterator1 last1,
-                                RandomAccessIterator2 first2,
-                                RandomAccessIterator2 last2,
-                                RandomAccessIterator3 splitter_ranks1,
-                                RandomAccessIterator4 splitter_ranks2,
-                                Compare comp,
-                                Size1 partition_size,
-                                Size2 num_splitters_from_range1,
-                                Size3 num_splitters_from_range2)
 
+};
+
+
+// returns (work_per_thread, threads_per_block, oversubscription_factor)
+template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
+  thrust::tuple<unsigned int,unsigned int,unsigned int>
+    tunables(RandomAccessIterator1, RandomAccessIterator1, RandomAccessIterator2, RandomAccessIterator2, RandomAccessIterator3, Compare comp)
 {
-  typedef typename thrust::iterator_difference<RandomAccessIterator1>::type difference1;
-  typedef typename thrust::iterator_difference<RandomAccessIterator2>::type difference2;
-
-  const difference1 num_elements1 = last1 - first1;
-  const difference2 num_elements2 = last2 - first2;
-
-  // zip up the ranges with a counter to disambiguate repeated elements during rank-finding
-  typedef thrust::tuple<RandomAccessIterator1,thrust::counting_iterator<difference1> > iterator_tuple1;
-  typedef thrust::tuple<RandomAccessIterator2,thrust::counting_iterator<difference2> > iterator_tuple2;
-  typedef thrust::zip_iterator<iterator_tuple1> iterator_and_counter1;
-  typedef thrust::zip_iterator<iterator_tuple2> iterator_and_counter2;
-
-  iterator_and_counter1 first_and_counter1 =
-    thrust::make_zip_iterator(thrust::make_tuple(first1, thrust::make_counting_iterator<difference1>(0)));
-  iterator_and_counter1 last_and_counter1 = first_and_counter1 + num_elements1;
-
-  // make the second range begin counting at num_elements1 so they sort after elements from the first range when ambiguous
-  iterator_and_counter2 first_and_counter2 =
-    thrust::make_zip_iterator(thrust::make_tuple(first2, thrust::make_counting_iterator<difference2>(num_elements1)));
-  iterator_and_counter2 last_and_counter2 = first_and_counter2 + num_elements2;
-
-  // take into account the tuples when comparing
-  typedef thrust::detail::compare_first_less_second<Compare> splitter_compare;
-
-  return detail::get_set_operation_splitter_ranks(first_and_counter1, last_and_counter1,
-                                                  first_and_counter2, last_and_counter2,
-                                                  splitter_ranks1,
-                                                  splitter_ranks2,
-                                                  splitter_compare(comp),
-                                                  partition_size,
-                                                  num_splitters_from_range1,
-                                                  num_splitters_from_range2);
-} // end get_merge_splitter_ranks()
-
-
-template<typename Tag,
 
+  // determined by empirical testing on GTX 480
+  // ~4500 Mkeys/s on GTX 480
+  const unsigned int work_per_thread         = 5;
+  const unsigned int threads_per_block       = 128;
+  const unsigned int oversubscription_factor = 30;
+
+  return thrust::make_tuple(work_per_thread, threads_per_block, oversubscription_factor);
+}
 
+
+
+} // end merge_detail
+
+
+template<typename DerivedPolicy,
          typename RandomAccessIterator1,
          typename RandomAccessIterator2,
          typename RandomAccessIterator3,
          typename Compare>
-RandomAccessIterator3 merge(Tag,
-                            RandomAccessIterator1 first1,
-                            RandomAccessIterator1 last1,
-                            RandomAccessIterator2 first2,
-                            RandomAccessIterator2 last2,
-                            RandomAccessIterator3 result,
-                            Compare comp)
-{
-  typedef typename thrust::iterator_difference<RandomAccessIterator1>::type difference1;
-  typedef typename thrust::iterator_difference<RandomAccessIterator2>::type difference2;
-
-  const difference1 num_elements1 = last1 - first1;
-  const difference2 num_elements2 = last2 - first2;
-
-  // check for trivial problem
-  if(num_elements1 == 0 && num_elements2 == 0)
-    return result;
-  else if(num_elements2 == 0)
-    return thrust::copy(first1, last1, result);
-  else if(num_elements1 == 0)
-    return thrust::copy(first2, last2, result);
-
-  using namespace merge_detail;
-  using namespace thrust::detail;
-
-  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;
-
-  typedef detail::blocked_thread_array Context;
-  typedef merge_closure<RandomAccessIterator1,
-                        RandomAccessIterator2,
-                        typename temporary_array<difference1,Tag>::iterator,
-                        typename temporary_array<difference2,Tag>::iterator,
-                        RandomAccessIterator3,
-                        Compare,
-                        size_t,
-                        Context> Closure;
-
-  arch::function_attributes_t attributes = detail::closure_attributes<Closure>();
-  arch::device_properties_t   properties = arch::device_properties();
-
-
-  // prefer large blocks to keep the partitions as large as possible
-  const size_t block_size =
-    arch::max_blocksize_subject_to_smem_usage(properties, attributes,
-                                              get_merge_kernel_per_block_dynamic_smem_usage<
-                                                RandomAccessIterator1,
-                                                RandomAccessIterator2,
-                                                RandomAccessIterator3
-                                              >);
-
-  const size_t partition_size = block_size;
-  const difference1 num_partitions1 = ceil_div(num_elements1, partition_size);
-  const difference1 num_splitters_from_range1 = num_partitions1 - 1;
-
-  const difference2 num_partitions2 = ceil_div(num_elements2, partition_size);
-  const difference2 num_splitters_from_range2 = num_partitions2 - 1;
-
-  size_t num_merged_partitions = num_splitters_from_range1 + num_splitters_from_range2 + 1;
-
-  // allocate storage for splitter ranks
-  temporary_array<difference1, Tag> splitter_ranks1(num_splitters_from_range1 + num_splitters_from_range2);
-  temporary_array<difference2, Tag> splitter_ranks2(num_splitters_from_range1 + num_splitters_from_range2);
-
-  // select some splitters and find the rank of each splitter in the other range
-  // XXX it's possible to fuse rank-finding with the merge_kernel below
-  //     this eliminates the temporary buffers splitter_ranks1 & splitter_ranks2
-  //     but this spills to lmem and causes a 10x speeddown
-  get_merge_splitter_ranks(first1,last1,
-                           first2,last2,
-                           splitter_ranks1.begin(),
-                           splitter_ranks2.begin(),
-                           comp,
-                           partition_size,
-                           num_splitters_from_range1,
-                           num_splitters_from_range2);
-
-  // maximize the number of blocks we can launch
-  size_t max_blocks = properties.maxGridSize[0];
-  size_t num_blocks = thrust::min(num_merged_partitions, max_blocks);
-  size_t dynamic_smem_size = get_merge_kernel_per_block_dynamic_smem_usage<RandomAccessIterator1,RandomAccessIterator2,RandomAccessIterator3>(block_size);
-
-  detail::launch_closure
-    (Closure(first1, last1,
-             first2, last2,
-             splitter_ranks1.begin(),
-             splitter_ranks2.begin(),
-             result,
-             comp,
-             num_merged_partitions),
-     num_blocks, block_size, dynamic_smem_size);
-
-  return result + num_elements1 + num_elements2;
-} // end merge
-
-} // end namespace merge_detail
-
-template<typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3,
-         typename Compare>
-RandomAccessIterator3 merge(tag,
-                            RandomAccessIterator1 first1,
-                            RandomAccessIterator1 last1,
-                            RandomAccessIterator2 first2,
-                            RandomAccessIterator2 last2,
-                            RandomAccessIterator3 result,
-                            Compare comp)
-{
-  // recover the user's system tag and pass to merge_detail::merge
-  using thrust::system::detail::generic::select_system;
-
-  typedef typename thrust::iterator_system<RandomAccessIterator1>::type system1;
-  typedef typename thrust::iterator_system<RandomAccessIterator2>::type system2;
-  typedef typename thrust::iterator_system<RandomAccessIterator3>::type system3;
-
-  return merge_detail::merge(select_system(system1(), system2(), system3()), first1, last1, first2, last2, result, comp);
 
+RandomAccessIterator3 merge(execution_policy<DerivedPolicy> &exec,
+                            RandomAccessIterator1 first1,
+                            RandomAccessIterator1 last1,
+                            RandomAccessIterator2 first2,
+                            RandomAccessIterator2 last2,
+                            RandomAccessIterator3 result,
+                            Compare comp)
+{
+  typedef typename thrust::iterator_difference<RandomAccessIterator1>::type Size;
+  Size n1 = last1 - first1;
+  Size n2 = last2 - first2;
+  typename thrust::iterator_difference<RandomAccessIterator1>::type n = n1 + n2;
+
+  // empty result
+  if(n <= 0) return result;
+
+  unsigned int work_per_thread = 0, threads_per_block = 0, oversubscription_factor = 0;
+  thrust::tie(work_per_thread,threads_per_block,oversubscription_factor)
+    = merge_detail::tunables(first1, last1, first2, last2, result, comp);
+
+  const unsigned int work_per_block = work_per_thread * threads_per_block;
+
+  const unsigned int num_processors = device_properties().multiProcessorCount;
+  const unsigned int num_blocks = thrust::min<int>(oversubscription_factor * num_processors, thrust::detail::util::divide_ri(n, work_per_block));
+
+  typedef merge_detail::merge_n_closure<RandomAccessIterator1,Size,RandomAccessIterator2,RandomAccessIterator3,Compare> closure_type;
+  closure_type closure(first1, n1, first2, n2, result, comp, work_per_thread);
+
+  detail::launch_closure(closure, num_blocks, threads_per_block);
+
+  return result + n1 + n2;
 } // end merge()
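Editorial note: as a worked example of the launch configuration chosen above, with work_per_thread = 5 and threads_per_block = 128 each block covers 640 merged elements, and the grid is capped at oversubscription_factor blocks per multiprocessor. A small host-side sketch of that arithmetic (the 15-multiprocessor device and the 2^20-element input are assumed examples, not values from the patch):

  #include <algorithm>
  #include <cstdio>

  int main()
  {
    // tunables returned by merge_detail::tunables in this revision
    const unsigned int work_per_thread   = 5;
    const unsigned int threads_per_block = 128;
    const unsigned int oversubscription  = 30;

    const unsigned int work_per_block = work_per_thread * threads_per_block; // 640

    // assumed example device: 15 multiprocessors (a GTX 480 class GPU)
    const unsigned int num_processors = 15;

    const unsigned int n = 1u << 20;  // 1M merged elements

    // divide_ri(n, work_per_block) is a round-up integer division
    const unsigned int blocks_needed = (n + work_per_block - 1) / work_per_block; // 1639
    const unsigned int num_blocks = std::min(oversubscription * num_processors, blocks_needed); // 450

    std::printf("work_per_block=%u blocks_needed=%u num_blocks=%u\n",
                work_per_block, blocks_needed, num_blocks);
    return 0;
  }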
 
+
 } // end namespace detail
 } // end namespace cuda
 } // end namespace system
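Editorial note: this backend is reached through the public thrust::merge entry point when the iterators refer to device memory. A minimal caller against Thrust 1.7, assuming a CUDA-capable device and compilation with nvcc, might look like:

  #include <thrust/device_vector.h>
  #include <thrust/functional.h>
  #include <thrust/merge.h>

  int main()
  {
    // two sorted inputs resident on the device
    int a[] = {1, 3, 5, 7};
    int b[] = {2, 2, 6, 8, 9};
    thrust::device_vector<int> d_a(a, a + 4);
    thrust::device_vector<int> d_b(b, b + 5);

    thrust::device_vector<int> d_out(d_a.size() + d_b.size());

    // for device iterators this should dispatch to the CUDA backend merge shown in this diff
    thrust::merge(d_a.begin(), d_a.end(),
                  d_b.begin(), d_b.end(),
                  d_out.begin(),
                  thrust::less<int>());

    // d_out now holds 1 2 2 3 5 6 7 8 9
    return 0;
  }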