~ubuntu-branches/ubuntu/utopic/libthrust/utopic


Viewing changes to system/cuda/detail/merge.inl

  • Committer: Package Import Robot
  • Author(s): Andreas Beckmann
  • Date: 2013-07-10 12:57:33 UTC
  • mfrom: (1.1.4)
  • Revision ID: package-import@ubuntu.com-20130710125733-my19jic71sqsabaj
Tags: 1.7.0-1
* New upstream release.  (Closes: #715362)
* Update watch file.

In the diff below, removed lines are prefixed with '-', added lines with '+', and unchanged context lines with a leading space.

@@ -15 +15 @@
  */
 
 #include <thrust/detail/config.h>
-
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/system/detail/generic/select_system.h>
-
+#include <thrust/system/cuda/detail/merge.h>
+#include <thrust/pair.h>
+#include <thrust/tuple.h>
 #include <thrust/detail/minmax.h>
-#include <thrust/detail/internal_functional.h>
-#include <thrust/system/cuda/detail/arch.h>
-#include <thrust/system/cuda/detail/block/copy.h>
-#include <thrust/system/cuda/detail/block/merge.h>
-#include <thrust/system/cuda/detail/extern_shared_ptr.h>
-#include <thrust/system/cuda/detail/detail/get_set_operation_splitter_ranks.h>
-#include <thrust/detail/internal_functional.h>
-#include <thrust/system/cuda/detail/tag.h>
+#include <thrust/detail/function.h>
+#include <thrust/system/cuda/detail/detail/uninitialized.h>
 #include <thrust/system/cuda/detail/detail/launch_closure.h>
+#include <thrust/detail/util/blocking.h>
 
 namespace thrust
 {
@@ -39 +32 @@
 {
 namespace detail
 {
-
 namespace merge_detail
 {
 
-template<typename T1, typename T2>
-T1 ceil_div(T1 up, T2 down)
-{
-  T1 div = up / down;
-  T1 rem = up % down;
-  return (rem != 0) ? div + 1 : div;
-}
-
-template<unsigned int N>
-  struct static_align_size_to_int
-{
-  static const unsigned int value = (N / sizeof(int)) + ((N % sizeof(int)) ? 1 : 0);
-};
-
-__host__ __device__
-inline unsigned int align_size_to_int(unsigned int N)
-{
-  return (N / sizeof(int)) + ((N % sizeof(int)) ? 1 : 0);
-}
 
 template<typename RandomAccessIterator1,
          typename RandomAccessIterator2,
-         typename RandomAccessIterator5>
-unsigned int get_merge_kernel_per_block_dynamic_smem_usage(unsigned int block_size)
-{
+         typename Size,
+         typename Compare>
+__device__ __thrust_forceinline__
+thrust::pair<Size,Size>
+  partition_search(RandomAccessIterator1 first1,
+                   RandomAccessIterator2 first2,
+                   Size diag,
+                   Size lower_bound1,
+                   Size upper_bound1,
+                   Size lower_bound2,
+                   Size upper_bound2,
+                   Compare comp)
+{
+  Size begin = thrust::max<Size>(lower_bound1, diag - upper_bound2);
+  Size end   = thrust::min<Size>(diag - lower_bound2, upper_bound1);
+
+  while(begin < end)
+  {
+    Size mid = (begin + end) / 2;
+    Size index1 = mid;
+    Size index2 = diag - mid - 1;
+
+    if(comp(first2[index2], first1[index1]))
+    {
+      end = mid;
+    }
+    else
+    {
+      begin = mid + 1;
+    }
+  }
+
+  return thrust::make_pair(begin, diag - begin);
+}
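Editorial note: partition_search above is a merge-path style diagonal search. Given an output position diag, it returns how many elements of each sorted input precede that position in the merged result. A minimal host-side sketch of the same search, written against plain std::vector rather than the device code in the patch (the names here are illustrative, not part of Thrust):

  #include <algorithm>
  #include <cassert>
  #include <utility>
  #include <vector>

  // Host-side sketch of the diagonal search: given sorted ranges a and b and
  // an output position diag, return how many elements of a (and of b) fall
  // before position diag in the merged output.
  static std::pair<int,int> partition_search_host(const std::vector<int>& a,
                                                   const std::vector<int>& b,
                                                   int diag)
  {
    int begin = std::max(0, diag - static_cast<int>(b.size()));
    int end   = std::min(diag, static_cast<int>(a.size()));

    while(begin < end)
    {
      int mid = (begin + end) / 2;
      // b[diag - mid - 1] < a[mid] means we took too many elements from a
      if(b[diag - mid - 1] < a[mid])
        end = mid;
      else
        begin = mid + 1;
    }

    return std::make_pair(begin, diag - begin);
  }

  int main()
  {
    std::vector<int> a = {1, 3, 5, 7};
    std::vector<int> b = {2, 4, 6, 8};

    // the first 4 merged elements are 1,2,3,4: two from each input
    std::pair<int,int> p = partition_search_host(a, b, 4);
    assert(p.first == 2 && p.second == 2);
    return 0;
  }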
 
+
+
+template<typename Context, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
+__device__ __thrust_forceinline__
+void merge_n(Context &ctx,
+             RandomAccessIterator1 first1,
+             Size n1,
+             RandomAccessIterator2 first2,
+             Size n2,
+             RandomAccessIterator3 result,
+             Compare comp_,
+             unsigned int work_per_thread)
+{
+  const unsigned int block_size = ctx.block_dimension();
+  thrust::detail::device_function<Compare,bool> comp(comp_);
   typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type1;
   typedef typename thrust::iterator_value<RandomAccessIterator2>::type value_type2;
-  typedef typename thrust::iterator_value<RandomAccessIterator5>::type value_type5;
-
-  // merge_kernel allocates memory aligned to int
-  const unsigned int array_size1 = align_size_to_int(block_size * sizeof(value_type1));
-  const unsigned int array_size2 = align_size_to_int(block_size * sizeof(value_type2));
-  const unsigned int array_size3 = align_size_to_int(2 * block_size * sizeof(value_type5));
-
-  return sizeof(int) * (array_size1 + array_size2 + array_size3);
-} // end get_merge_kernel_per_block_dynamic_smem_usage()
-
-
-template<typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3,
-         typename RandomAccessIterator4,
-         typename RandomAccessIterator5,
-         typename StrictWeakOrdering,
-         typename Size,
-         typename Context>
-struct merge_closure
 
+
+  Size result_size = n1 + n2;
+
+  // this is just oversubscription_rate * block_size * work_per_thread
+  // but it makes no sense to send oversubscription_rate as an extra parameter
+  Size work_per_block = thrust::detail::util::divide_ri(result_size, ctx.grid_dimension());
+
+  using thrust::system::cuda::detail::detail::uninitialized;
+  __shared__ uninitialized<thrust::pair<Size,Size> > s_block_input_begin;
+
+  Size result_block_offset = ctx.block_index() * work_per_block;
+
+  // find where this block's input begins in both input sequences
+  if(ctx.thread_index() == 0)
+  {
+    s_block_input_begin = (ctx.block_index() == 0) ?
+      thrust::pair<Size,Size>(0,0) :
+      partition_search(first1, first2,
+                       result_block_offset,
+                       Size(0), n1,
+                       Size(0), n2,
+                       comp);
+  }
+
+  ctx.barrier();
+
+  // iterate to consume this block's input
+  Size work_per_iteration = block_size * work_per_thread;
+  thrust::pair<Size,Size> block_input_end = s_block_input_begin;
+  block_input_end.first  += work_per_iteration;
+  block_input_end.second += work_per_iteration;
+  Size result_block_offset_last = result_block_offset + thrust::min<Size>(work_per_block, result_size - result_block_offset);
+
+  for(;
+      result_block_offset < result_block_offset_last;
+      result_block_offset += work_per_iteration,
+      block_input_end.first  += work_per_iteration,
+      block_input_end.second += work_per_iteration
+     )
+  {
+    // find where this thread's input begins in both input sequences for this iteration
+    thrust::pair<Size,Size> thread_input_begin =
+      partition_search(first1, first2,
+                       Size(result_block_offset + ctx.thread_index() * work_per_thread),
+                       s_block_input_begin.get().first,  thrust::min<Size>(block_input_end.first , n1),
+                       s_block_input_begin.get().second, thrust::min<Size>(block_input_end.second, n2),
+                       comp);
+
+    ctx.barrier();
+
+    // XXX the performance impact of not keeping x1 & x2
+    //     in registers is about 10% for int32
+    uninitialized<value_type1> x1;
+    uninitialized<value_type2> x2;
+
+    // XXX this is just a serial merge -- try to simplify or abstract this loop
+    Size i = result_block_offset + ctx.thread_index() * work_per_thread;
+    Size last_i = i + thrust::min<Size>(work_per_thread, result_size - thread_input_begin.first - thread_input_begin.second);
+    for(;
+        i < last_i;
+        ++i)
+    {
+      // optionally load x1 & x2
+      bool output_x2 = true;
+      if(thread_input_begin.second < n2)
+      {
+        x2 = first2[thread_input_begin.second];
+      }
+      else
+      {
+        output_x2 = false;
+      }
+
+      if(thread_input_begin.first < n1)
+      {
+        x1 = first1[thread_input_begin.first];
+
+        if(output_x2)
+        {
+          output_x2 = comp(x2.get(), x1.get());
+        }
+      }
+
+      result[i] = output_x2 ? x2.get() : x1.get();
+
+      if(output_x2)
+      {
+        ++thread_input_begin.second;
+      }
+      else
+      {
+        ++thread_input_begin.first;
+      }
+    } // end for
+
+    // the block's last thread has conveniently located the
+    // beginning of the next iteration's input
+    if(ctx.thread_index() == block_size-1)
+    {
+      s_block_input_begin = thread_input_begin;
+    }
+    ctx.barrier();
+  } // end for
+} // end merge_n
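Editorial note: merge_n assigns each thread a fixed-size slice of the output, uses partition_search to find where that slice's inputs begin, and then runs a short serial merge per slice. A host-side sketch of the same decomposition (illustrative only; plain C++ with hypothetical names, not code from the patch) shows why the concatenated per-slice merges reproduce an ordinary merge:

  #include <algorithm>
  #include <cassert>
  #include <vector>

  // Emulates merge_n's decomposition on the host: the output is cut into
  // fixed-size slices, a diagonal search finds each slice's starting point in
  // both inputs, and each slice is then merged serially and independently.
  static std::vector<int> sliced_merge(const std::vector<int>& a,
                                       const std::vector<int>& b,
                                       int work_per_slice)
  {
    const int n = static_cast<int>(a.size() + b.size());
    std::vector<int> out(n);

    for(int diag = 0; diag < n; diag += work_per_slice)
    {
      // diagonal search: how many elements of a precede output position diag
      int lo = std::max(0, diag - static_cast<int>(b.size()));
      int hi = std::min(diag, static_cast<int>(a.size()));
      while(lo < hi)
      {
        int mid = (lo + hi) / 2;
        if(b[diag - mid - 1] < a[mid]) hi = mid; else lo = mid + 1;
      }

      // serial merge of this slice, mirroring merge_n's inner loop
      int i = lo, j = diag - lo;
      for(int k = diag; k < std::min(diag + work_per_slice, n); ++k)
      {
        bool take_b = j < static_cast<int>(b.size()) &&
                      (i >= static_cast<int>(a.size()) || b[j] < a[i]);
        out[k] = take_b ? b[j++] : a[i++];
      }
    }

    return out;
  }

  int main()
  {
    std::vector<int> a = {0, 2, 4, 6, 8, 10};
    std::vector<int> b = {1, 3, 5, 7, 9};

    std::vector<int> expected(a.size() + b.size());
    std::merge(a.begin(), a.end(), b.begin(), b.end(), expected.begin());

    // every slice width yields the same result as one big serial merge
    for(int w = 1; w <= 4; ++w)
      assert(sliced_merge(a, b, w) == expected);

    return 0;
  }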
 
+
+
+template<typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
+  struct merge_n_closure
 {
-  const RandomAccessIterator1 first1;
-  const RandomAccessIterator1 last1;
-  const RandomAccessIterator2 first2;
-  const RandomAccessIterator2 last2;
-  RandomAccessIterator3 splitter_ranks1;
-  RandomAccessIterator4 splitter_ranks2;
-  const RandomAccessIterator5 result;
-  StrictWeakOrdering comp;
-  Size num_merged_partitions;
-  Context context;
-
-  typedef Context context_type;
-
-  merge_closure(const RandomAccessIterator1 first1,
-                const RandomAccessIterator1 last1,
-                const RandomAccessIterator2 first2,
-                const RandomAccessIterator2 last2,
-                RandomAccessIterator3 splitter_ranks1,
-                RandomAccessIterator4 splitter_ranks2,
-                const RandomAccessIterator5 result,
-                StrictWeakOrdering comp,
-                Size num_merged_partitions,
-                Context context = Context())
-    : first1(first1), last1(last1), first2(first2), last2(last2),
-      splitter_ranks1(splitter_ranks1), splitter_ranks2(splitter_ranks2),
-      result(result), comp(comp), num_merged_partitions(num_merged_partitions),
-      context(context)
+  typedef thrust::system::cuda::detail::detail::blocked_thread_array context_type;
+
+  RandomAccessIterator1 first1;
+  Size n1;
+  RandomAccessIterator2 first2;
+  Size n2;
+  RandomAccessIterator3 result;
+  Compare comp;
+  Size work_per_thread;
+
+  merge_n_closure(RandomAccessIterator1 first1, Size n1, RandomAccessIterator2 first2, Size n2, RandomAccessIterator3 result, Compare comp, Size work_per_thread)
+    : first1(first1), n1(n1), first2(first2), n2(n2), result(result), comp(comp), work_per_thread(work_per_thread)
   {}
 
-  __device__ __thrust_forceinline__
-  void operator()(void)
+  __device__ __forceinline__
+  void operator()()
   {
-    typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type1;
-    typedef typename thrust::iterator_value<RandomAccessIterator2>::type value_type2;
-    typedef typename thrust::iterator_value<RandomAccessIterator5>::type value_type5;
-
-    // allocate shared storage
-    const unsigned int array_size1 = align_size_to_int(context.block_dimension() * sizeof(value_type1));
-    const unsigned int array_size2 = align_size_to_int(context.block_dimension() * sizeof(value_type2));
-    const unsigned int array_size3 = align_size_to_int(2 * context.block_dimension() * sizeof(value_type5));
-    int *_shared1 = extern_shared_ptr<int>();
-    int *_shared2 = _shared1 + array_size1;
-    int *_result  = _shared2 + array_size2;
-
-    value_type1 *s_input1 = reinterpret_cast<value_type1*>(_shared1);
-    value_type2 *s_input2 = reinterpret_cast<value_type2*>(_shared2);
-    value_type5 *s_result = reinterpret_cast<value_type5*>(_result);
-
-    // advance splitter iterators
-    splitter_ranks1 += context.block_index();
-    splitter_ranks2 += context.block_index();
-
-    for(Size partition_idx = context.block_index();
-        partition_idx < num_merged_partitions;
-        partition_idx   += context.grid_dimension(),
-        splitter_ranks1 += context.grid_dimension(),
-        splitter_ranks2 += context.grid_dimension())
-    {
-      RandomAccessIterator1 input_begin1 = first1;
-      RandomAccessIterator1 input_end1   = last1;
-      RandomAccessIterator2 input_begin2 = first2;
-      RandomAccessIterator2 input_end2   = last2;
-
-      RandomAccessIterator5 output_begin = result;
-
-      // find the end of the input if this is not the last block
-      // the end of merged partition i is at splitter_ranks1[i] + splitter_ranks2[i]
-      if(partition_idx != num_merged_partitions - 1)
-      {
-        RandomAccessIterator3 rank1 = splitter_ranks1;
-        RandomAccessIterator4 rank2 = splitter_ranks2;
-
-        input_end1 = first1 + *rank1;
-        input_end2 = first2 + *rank2;
-      }
-
-      // find the beginning of the input and output if this is not the first partition
-      // merged partition i begins at splitter_ranks1[i-1] + splitter_ranks2[i-1]
-      if(partition_idx != 0)
-      {
-        RandomAccessIterator3 rank1 = splitter_ranks1 - 1;
-        RandomAccessIterator4 rank2 = splitter_ranks2 - 1;
-
-        // advance the input to point to the beginning
-        input_begin1 += *rank1;
-        input_begin2 += *rank2;
-
-        // advance the result to point to the beginning of the output
-        output_begin += *rank1;
-        output_begin += *rank2;
-      }
-
-      if(input_begin1 < input_end1 && input_begin2 < input_end2)
-      {
-        typedef typename thrust::iterator_difference<RandomAccessIterator1>::type difference1;
-
-        typedef typename thrust::iterator_difference<RandomAccessIterator2>::type difference2;
-
-        // load the first segment
-        difference1 s_input1_size = thrust::min<difference1>(context.block_dimension(), input_end1 - input_begin1);
-
-        block::copy(context, input_begin1, input_begin1 + s_input1_size, s_input1);
-        input_begin1 += s_input1_size;
-
-        // load the second segment
-        difference2 s_input2_size = thrust::min<difference2>(context.block_dimension(), input_end2 - input_begin2);
-
-        block::copy(context, input_begin2, input_begin2 + s_input2_size, s_input2);
-        input_begin2 += s_input2_size;
-
-        context.barrier();
-
-        block::merge(context,
-                     s_input1, s_input1 + s_input1_size,
-                     s_input2, s_input2 + s_input2_size,
-                     s_result,
-                     comp);
-
-        context.barrier();
-
-        // store to gmem
-        output_begin = block::copy(context, s_result, s_result + s_input1_size + s_input2_size, output_begin);
-      }
-
-      // simply copy any remaining input
-      block::copy(context, input_begin2, input_end2, block::copy(context, input_begin1, input_end1, output_begin));
-    } // end for partition
 
+    context_type ctx;
+    merge_n(ctx, first1, n1, first2, n2, result, comp, work_per_thread);
   }
-}; // end merge_closure
-
-
-template<typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3,
-         typename RandomAccessIterator4,
-         typename Compare,
-         typename Size1,
-         typename Size2,
-         typename Size3>
-  void get_merge_splitter_ranks(RandomAccessIterator1 first1,
-                                RandomAccessIterator1 last1,
-                                RandomAccessIterator2 first2,
-                                RandomAccessIterator2 last2,
-                                RandomAccessIterator3 splitter_ranks1,
-                                RandomAccessIterator4 splitter_ranks2,
-                                Compare comp,
-                                Size1 partition_size,
-                                Size2 num_splitters_from_range1,
-                                Size3 num_splitters_from_range2)
 
+};
+
+
+// returns (work_per_thread, threads_per_block, oversubscription_factor)
+template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
+  thrust::tuple<unsigned int,unsigned int,unsigned int>
+    tunables(RandomAccessIterator1, RandomAccessIterator1, RandomAccessIterator2, RandomAccessIterator2, RandomAccessIterator3, Compare comp)
 {
-  typedef typename thrust::iterator_difference<RandomAccessIterator1>::type difference1;
-  typedef typename thrust::iterator_difference<RandomAccessIterator2>::type difference2;
-
-  const difference1 num_elements1 = last1 - first1;
-  const difference2 num_elements2 = last2 - first2;
-
-  // zip up the ranges with a counter to disambiguate repeated elements during rank-finding
-  typedef thrust::tuple<RandomAccessIterator1,thrust::counting_iterator<difference1> > iterator_tuple1;
-  typedef thrust::tuple<RandomAccessIterator2,thrust::counting_iterator<difference2> > iterator_tuple2;
-  typedef thrust::zip_iterator<iterator_tuple1> iterator_and_counter1;
-  typedef thrust::zip_iterator<iterator_tuple2> iterator_and_counter2;
-
-  iterator_and_counter1 first_and_counter1 =
-    thrust::make_zip_iterator(thrust::make_tuple(first1, thrust::make_counting_iterator<difference1>(0)));
-  iterator_and_counter1 last_and_counter1 = first_and_counter1 + num_elements1;
-
-  // make the second range begin counting at num_elements1 so they sort after elements from the first range when ambiguous
-  iterator_and_counter2 first_and_counter2 =
-    thrust::make_zip_iterator(thrust::make_tuple(first2, thrust::make_counting_iterator<difference2>(num_elements1)));
-  iterator_and_counter2 last_and_counter2 = first_and_counter2 + num_elements2;
-
-  // take into account the tuples when comparing
-  typedef thrust::detail::compare_first_less_second<Compare> splitter_compare;
-
-  return detail::get_set_operation_splitter_ranks(first_and_counter1, last_and_counter1,
-                                                  first_and_counter2, last_and_counter2,
-                                                  splitter_ranks1,
-                                                  splitter_ranks2,
-                                                  splitter_compare(comp),
-                                                  partition_size,
-                                                  num_splitters_from_range1,
-                                                  num_splitters_from_range2);
-} // end get_merge_splitter_ranks()
-
-
-template<typename Tag,
 
+  // determined by empirical testing on GTX 480
+  // ~4500 Mkeys/s on GTX 480
+  const unsigned int work_per_thread         = 5;
+  const unsigned int threads_per_block       = 128;
+  const unsigned int oversubscription_factor = 30;
+
+  return thrust::make_tuple(work_per_thread, threads_per_block, oversubscription_factor);
+}
 
+
+
+} // end merge_detail
+
+
+template<typename DerivedPolicy,
          typename RandomAccessIterator1,
          typename RandomAccessIterator2,
          typename RandomAccessIterator3,
          typename Compare>
-RandomAccessIterator3 merge(Tag,
-                            RandomAccessIterator1 first1,
-                            RandomAccessIterator1 last1,
-                            RandomAccessIterator2 first2,
-                            RandomAccessIterator2 last2,
-                            RandomAccessIterator3 result,
-                            Compare comp)
-{
-  typedef typename thrust::iterator_difference<RandomAccessIterator1>::type difference1;
-  typedef typename thrust::iterator_difference<RandomAccessIterator2>::type difference2;
-
-  const difference1 num_elements1 = last1 - first1;
-  const difference2 num_elements2 = last2 - first2;
-
-  // check for trivial problem
-  if(num_elements1 == 0 && num_elements2 == 0)
-    return result;
-  else if(num_elements2 == 0)
-    return thrust::copy(first1, last1, result);
-  else if(num_elements1 == 0)
-    return thrust::copy(first2, last2, result);
-
-  using namespace merge_detail;
-  using namespace thrust::detail;
-
-  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;
-
-  typedef detail::blocked_thread_array Context;
-  typedef merge_closure<RandomAccessIterator1,
-                        RandomAccessIterator2,
-                        typename temporary_array<difference1,Tag>::iterator,
-                        typename temporary_array<difference2,Tag>::iterator,
-                        RandomAccessIterator3,
-                        Compare,
-                        size_t,
-                        Context> Closure;
-
-  arch::function_attributes_t attributes = detail::closure_attributes<Closure>();
-  arch::device_properties_t   properties = arch::device_properties();
-
-
-  // prefer large blocks to keep the partitions as large as possible
-  const size_t block_size =
-    arch::max_blocksize_subject_to_smem_usage(properties, attributes,
-                                              get_merge_kernel_per_block_dynamic_smem_usage<
-                                                RandomAccessIterator1,
-                                                RandomAccessIterator2,
-                                                RandomAccessIterator3
-                                              >);
-
-  const size_t partition_size = block_size;
-  const difference1 num_partitions1 = ceil_div(num_elements1, partition_size);
-  const difference1 num_splitters_from_range1 = num_partitions1 - 1;
-
-  const difference2 num_partitions2 = ceil_div(num_elements2, partition_size);
-  const difference2 num_splitters_from_range2 = num_partitions2 - 1;
-
-  size_t num_merged_partitions = num_splitters_from_range1 + num_splitters_from_range2 + 1;
-
-  // allocate storage for splitter ranks
-  temporary_array<difference1, Tag> splitter_ranks1(num_splitters_from_range1 + num_splitters_from_range2);
-  temporary_array<difference2, Tag> splitter_ranks2(num_splitters_from_range1 + num_splitters_from_range2);
-
-  // select some splitters and find the rank of each splitter in the other range
-  // XXX it's possible to fuse rank-finding with the merge_kernel below
-  //     this eliminates the temporary buffers splitter_ranks1 & splitter_ranks2
-  //     but this spills to lmem and causes a 10x speeddown
-  get_merge_splitter_ranks(first1,last1,
-                           first2,last2,
-                           splitter_ranks1.begin(),
-                           splitter_ranks2.begin(),
-                           comp,
-                           partition_size,
-                           num_splitters_from_range1,
-                           num_splitters_from_range2);
-
-  // maximize the number of blocks we can launch
-  size_t max_blocks = properties.maxGridSize[0];
-  size_t num_blocks = thrust::min(num_merged_partitions, max_blocks);
-  size_t dynamic_smem_size = get_merge_kernel_per_block_dynamic_smem_usage<RandomAccessIterator1,RandomAccessIterator2,RandomAccessIterator3>(block_size);
-
-  detail::launch_closure
-    (Closure(first1, last1,
-             first2, last2,
-             splitter_ranks1.begin(),
-             splitter_ranks2.begin(),
-             result,
-             comp,
-             num_merged_partitions),
-     num_blocks, block_size, dynamic_smem_size);
-
-  return result + num_elements1 + num_elements2;
-} // end merge
-
-} // end namespace merge_detail
-
-template<typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3,
-         typename Compare>
-RandomAccessIterator3 merge(tag,
-                            RandomAccessIterator1 first1,
-                            RandomAccessIterator1 last1,
-                            RandomAccessIterator2 first2,
-                            RandomAccessIterator2 last2,
-                            RandomAccessIterator3 result,
-                            Compare comp)
-{
-  // recover the user's system tag and pass to merge_detail::merge
-  using thrust::system::detail::generic::select_system;
-
-  typedef typename thrust::iterator_system<RandomAccessIterator1>::type system1;
-  typedef typename thrust::iterator_system<RandomAccessIterator2>::type system2;
-  typedef typename thrust::iterator_system<RandomAccessIterator3>::type system3;
-
-  return merge_detail::merge(select_system(system1(), system2(), system3()), first1, last1, first2, last2, result, comp);
 
+RandomAccessIterator3 merge(execution_policy<DerivedPolicy> &exec,
+                            RandomAccessIterator1 first1,
+                            RandomAccessIterator1 last1,
+                            RandomAccessIterator2 first2,
+                            RandomAccessIterator2 last2,
+                            RandomAccessIterator3 result,
+                            Compare comp)
+{
+  typedef typename thrust::iterator_difference<RandomAccessIterator1>::type Size;
+  Size n1 = last1 - first1;
+  Size n2 = last2 - first2;
+  typename thrust::iterator_difference<RandomAccessIterator1>::type n = n1 + n2;
+
+  // empty result
+  if(n <= 0) return result;
+
+  unsigned int work_per_thread = 0, threads_per_block = 0, oversubscription_factor = 0;
+  thrust::tie(work_per_thread,threads_per_block,oversubscription_factor)
+    = merge_detail::tunables(first1, last1, first2, last2, result, comp);
+
+  const unsigned int work_per_block = work_per_thread * threads_per_block;
+
+  const unsigned int num_processors = device_properties().multiProcessorCount;
+  const unsigned int num_blocks = thrust::min<int>(oversubscription_factor * num_processors, thrust::detail::util::divide_ri(n, work_per_block));
+
+  typedef merge_detail::merge_n_closure<RandomAccessIterator1,Size,RandomAccessIterator2,RandomAccessIterator3,Compare> closure_type;
+  closure_type closure(first1, n1, first2, n2, result, comp, work_per_thread);
+
+  detail::launch_closure(closure, num_blocks, threads_per_block);
+
+  return result + n1 + n2;
 } // end merge()
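Editorial note: as a worked example of the launch configuration chosen above, with work_per_thread = 5 and threads_per_block = 128 each block covers 640 merged elements, and the grid is capped at oversubscription_factor blocks per multiprocessor. A small host-side sketch of that arithmetic (the 15-multiprocessor device and the 2^20-element input are assumed examples, not values from the patch):

  #include <algorithm>
  #include <cstdio>

  int main()
  {
    // tunables returned by merge_detail::tunables in this revision
    const unsigned int work_per_thread   = 5;
    const unsigned int threads_per_block = 128;
    const unsigned int oversubscription  = 30;

    const unsigned int work_per_block = work_per_thread * threads_per_block; // 640

    // assumed example device: 15 multiprocessors (a GTX 480 class GPU)
    const unsigned int num_processors = 15;

    const unsigned int n = 1u << 20;  // 1M merged elements

    // divide_ri(n, work_per_block) is a round-up integer division
    const unsigned int blocks_needed = (n + work_per_block - 1) / work_per_block; // 1639
    const unsigned int num_blocks = std::min(oversubscription * num_processors, blocks_needed); // 450

    std::printf("work_per_block=%u blocks_needed=%u num_blocks=%u\n",
                work_per_block, blocks_needed, num_blocks);
    return 0;
  }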
 
+
 } // end namespace detail
 } // end namespace cuda
 } // end namespace system
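Editorial note: this backend is reached through the public thrust::merge entry point when the iterators refer to device memory. A minimal caller against Thrust 1.7, assuming a CUDA-capable device and compilation with nvcc, might look like:

  #include <thrust/device_vector.h>
  #include <thrust/functional.h>
  #include <thrust/merge.h>

  int main()
  {
    // two sorted inputs resident on the device
    int a[] = {1, 3, 5, 7};
    int b[] = {2, 2, 6, 8, 9};
    thrust::device_vector<int> d_a(a, a + 4);
    thrust::device_vector<int> d_b(b, b + 5);

    thrust::device_vector<int> d_out(d_a.size() + d_b.size());

    // for device iterators this should dispatch to the CUDA backend merge shown in this diff
    thrust::merge(d_a.begin(), d_a.end(),
                  d_b.begin(), d_b.end(),
                  d_out.begin(),
                  thrust::less<int>());

    // d_out now holds 1 2 2 3 5 6 7 8 9
    return 0;
  }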