48
48
using namespace LAMMPS_NS;
50
/* ---------------------------------------------------------------------- */
52
52
Cuda::Cuda(LAMMPS* lmp) : Pointers(lmp)
54
54
cuda_exists = true;
58
printf("# Using LAMMPS_CUDA \n");
57
if (universe->me == 0) printf("# Using LAMMPS_CUDA \n");
60
59
shared_data.me = universe->me;
61
61
device_set = false;
63
64
Cuda_Cuda_GetCompileSettings(&shared_data);
65
if(shared_data.compile_settings.prec_glob != sizeof(CUDA_FLOAT) / 4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: Global Precision: cuda %i cpp %i\n\n", shared_data.compile_settings.prec_glob, sizeof(CUDA_FLOAT) / 4);
67
if(shared_data.compile_settings.prec_x != sizeof(X_FLOAT) / 4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: X Precision: cuda %i cpp %i\n\n", shared_data.compile_settings.prec_x, sizeof(X_FLOAT) / 4);
69
if(shared_data.compile_settings.prec_v != sizeof(V_FLOAT) / 4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: V Precision: cuda %i cpp %i\n\n", shared_data.compile_settings.prec_v, sizeof(V_FLOAT) / 4);
71
if(shared_data.compile_settings.prec_f != sizeof(F_FLOAT) / 4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: F Precision: cuda %i cpp %i\n\n", shared_data.compile_settings.prec_f, sizeof(F_FLOAT) / 4);
73
if(shared_data.compile_settings.prec_pppm != sizeof(PPPM_FLOAT) / 4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: PPPM Precision: cuda %i cpp %i\n\n", shared_data.compile_settings.prec_pppm, sizeof(PPPM_FLOAT) / 4);
75
if(shared_data.compile_settings.prec_fft != sizeof(FFT_FLOAT) / 4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: FFT Precision: cuda %i cpp %i\n\n", shared_data.compile_settings.prec_fft, sizeof(FFT_FLOAT) / 4);
66
if (universe->me == 0) {
68
if(shared_data.compile_settings.prec_glob != sizeof(CUDA_CFLOAT) / 4)
69
printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
70
" # CUDA WARNING: Global Precision: cuda %i cpp %i\n\n",
71
shared_data.compile_settings.prec_glob, (int) sizeof(CUDA_CFLOAT) / 4);
73
if(shared_data.compile_settings.prec_x != sizeof(X_CFLOAT) / 4)
74
printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
75
" # CUDA WARNING: X Precision: cuda %i cpp %i\n\n",
76
shared_data.compile_settings.prec_x, (int) sizeof(X_CFLOAT) / 4);
78
if(shared_data.compile_settings.prec_v != sizeof(V_CFLOAT) / 4)
79
printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
80
" # CUDA WARNING: V Precision: cuda %i cpp %i\n\n",
81
shared_data.compile_settings.prec_v, (int) sizeof(V_CFLOAT) / 4);
83
if(shared_data.compile_settings.prec_f != sizeof(F_CFLOAT) / 4)
84
printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
85
" # CUDA WARNING: F Precision: cuda %i cpp %i\n\n",
86
shared_data.compile_settings.prec_f, (int) sizeof(F_CFLOAT) / 4);
88
if(shared_data.compile_settings.prec_pppm != sizeof(PPPM_CFLOAT) / 4)
89
printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
90
" # CUDA WARNING: PPPM Precision: cuda %i cpp %i\n\n",
91
shared_data.compile_settings.prec_pppm, (int) sizeof(PPPM_CFLOAT) / 4);
93
if(shared_data.compile_settings.prec_fft != sizeof(FFT_CFLOAT) / 4)
94
printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
95
" # CUDA WARNING: FFT Precision: cuda %i cpp %i\n\n",
96
shared_data.compile_settings.prec_fft, (int) sizeof(FFT_CFLOAT) / 4);
79
if(shared_data.compile_settings.cufft != 1) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: cufft: cuda %i cpp %i\n\n", shared_data.compile_settings.cufft, 1);
99
if(shared_data.compile_settings.cufft != 1)
100
printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
101
" # CUDA WARNING: cufft: cuda %i cpp %i\n\n",
102
shared_data.compile_settings.cufft, 1);
83
if(shared_data.compile_settings.cufft != 0) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: cufft: cuda %i cpp %i\n\n", shared_data.compile_settings.cufft, 0);
104
if(shared_data.compile_settings.cufft != 0)
105
printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
106
" # CUDA WARNING: cufft: cuda %i cpp %i\n\n",
107
shared_data.compile_settings.cufft, 0);
87
// Warn when the arch value stored in shared_data.compile_settings differs from
// the CUDA_ARCH this file was compiled with. The "cuda" value printed must be
// the arch field that was compared, not the unrelated cufft flag.
if(shared_data.compile_settings.arch != CUDA_ARCH) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: arch: cuda %i cpp %i\n\n", shared_data.compile_settings.arch, CUDA_ARCH);
110
// Warn when the arch value stored in shared_data.compile_settings differs from
// the CUDA_ARCH this file was compiled with. The "cuda" value printed must be
// the arch field that was compared, not the unrelated cufft flag.
if(shared_data.compile_settings.arch != CUDA_ARCH)
  printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
         " # CUDA WARNING: arch: cuda %i cpp %i\n\n",
         shared_data.compile_settings.arch, CUDA_ARCH);
204
void Cuda::accelerator(int narg, char** arg)
234
/* ----------------------------------------------------------------------
236
can be invoked multiple times: -c on, -pk, package command
237
can only init GPUs once in activate(), so just store params here
238
------------------------------------------------------------------------- */
240
void Cuda::accelerator(int narg, char **arg)
206
if(device_set) return;
208
if(universe->me == 0)
209
printf("# CUDA: Activate GPU \n");
211
int* devicelist = NULL;
214
for(int i = 0; i < narg; i++) {
215
if(strcmp(arg[i], "gpu/node") == 0) {
217
error->all(FLERR, "Invalid Options for 'accelerator' command. Expecting a number after 'gpu/node' option.");
219
pppn = force->inumeric(FLERR,arg[i]);
222
if(strcmp(arg[i], "gpu/node/special") == 0) {
224
error->all(FLERR, "Invalid Options for 'accelerator' command. Expecting number of GPUs to be used per node after keyword 'gpu/node/special'.");
226
pppn = force->inumeric(FLERR,arg[i]);
228
if(pppn < 1) error->all(FLERR, "Invalid Options for 'accelerator' command. Expecting number of GPUs to be used per node after keyword 'gpu/node special'.");
231
error->all(FLERR, "Invalid Options for 'accelerator' command. Expecting list of device ids after keyword 'gpu/node special'.");
242
// this error should not happen
244
if (device_set) error->all(FLERR,"USER-CUDA device is already activated");
246
// pppn = # of GPUs/node
248
pppn = force->inumeric(FLERR,arg[0]);
249
if (pppn <= 0) error->all(FLERR,"Illegal package cuda command");
253
delete [] devicelist;
258
while (iarg < narg) {
259
if (strcmp(arg[iarg],"newton") == 0) {
260
if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command");
261
if (strcmp(arg[iarg+1],"off") == 0) newtonflag = 0;
262
else if (strcmp(arg[iarg+1],"on") == 0) newtonflag = 1;
263
else error->all(FLERR,"Illegal package cuda command");
264
} else if (strcmp(arg[iarg],"gpuID") == 0) {
265
if (iarg+pppn+1 > narg) error->all(FLERR,"Illegal package cuda command");
233
266
devicelist = new int[pppn];
235
for(int k = 0; k < pppn; k++) {
237
devicelist[k] = force->inumeric(FLERR,arg[i]);
241
if(strcmp(arg[i], "pinned") == 0) {
243
error->all(FLERR, "Invalid Options for 'accelerator' command. Expecting a number after 'pinned' option.");
245
pinned = force->inumeric(FLERR,arg[i]) == 0 ? false : true;
247
if((pinned == false) && (universe->me == 0)) printf(" #CUDA: Pinned memory is not used for communication\n");
250
if(strcmp(arg[i], "timing") == 0) {
267
for (int k = 0; k < pppn; k++)
268
devicelist[k] = force->inumeric(FLERR,arg[iarg+k+1]);
270
} else if (strcmp(arg[iarg],"timing") == 0) {
254
if(strcmp(arg[i], "suffix") == 0) {
256
error->all(FLERR, "Invalid Options for 'accelerator' command. Expecting a string after 'suffix' option.");
258
strcpy(lmp->suffix, arg[i]);
261
if(strcmp(arg[i], "overlap_comm") == 0) {
273
} else if (strcmp(arg[iarg],"test") == 0) {
274
if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command");
275
testatom = force->numeric(FLERR,arg[iarg+1]);
278
} else if (strcmp(arg[iarg],"thread") == 0) {
279
if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command");
280
if (strcmp(arg[iarg+1],"auto") == 0)
281
shared_data.pair.override_block_per_atom = -1;
282
else if (strcmp(arg[iarg+1],"tpa") == 0)
283
shared_data.pair.override_block_per_atom = 0;
284
else if (strcmp(arg[iarg+1],"bpa") == 0)
285
shared_data.pair.override_block_per_atom = 1;
286
else error->all(FLERR,"Illegal package cuda command");
290
// undocumented options
292
else if (strcmp(arg[iarg],"suffix") == 0) {
293
if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command");
294
strcpy(lmp->suffix,arg[iarg+1]);
296
} else if (strcmp(arg[iarg],"overlap_comm") == 0) {
262
297
shared_data.overlap_comm = 1;
265
if(strcmp(arg[i], "test") == 0) {
267
error->all(FLERR, "Invalid Options for 'accelerator' command. Expecting a number after 'test' option.");
269
testatom = force->numeric(FLERR,arg[i]);
273
if(strcmp(arg[i], "override/bpa") == 0) {
275
error->all(FLERR, "Invalid Options for 'accelerator' command. Expecting a number after 'override/bpa' option.");
277
shared_data.pair.override_block_per_atom = force->inumeric(FLERR,arg[i]);
299
} else if (strcmp(arg[iarg],"pinned") == 0) {
300
if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command");
301
pinned = force->inumeric(FLERR,arg[iarg+1]) == 0 ? false : true;
302
if ((pinned == false) && (universe->me == 0))
303
printf(" #CUDA: Pinned memory is not used for communication\n");
305
} else error->all(FLERR,"Illegal package cuda command");
310
force->newton = force->newton_pair = force->newton_bond = newtonflag;
313
/* ----------------------------------------------------------------------
315
only done once with whatever settings used by the last package command
316
------------------------------------------------------------------------- */
318
void Cuda::activate()
320
if (device_set) return;
323
if (universe->me == 0) printf("# CUDA: Activate GPU \n");
281
325
CudaWrapper_Init(0, (char**)0, universe->me, pppn, devicelist);
282
326
//if(shared_data.overlap_comm)
283
327
CudaWrapper_AddStreams(3);
362
406
void Cuda::allocate()
364
accelerator(0, NULL);
365
408
MYDBG(printf("# CUDA: Cuda::allocate ...\n");)
367
410
if(not cu_virial) {
368
cu_virial = new cCudaData<double, ENERGY_FLOAT, x > (NULL, & shared_data.pair.virial , 6);
369
cu_eng_vdwl = new cCudaData<double, ENERGY_FLOAT, x > (NULL, & shared_data.pair.eng_vdwl , 1);
370
cu_eng_coul = new cCudaData<double, ENERGY_FLOAT, x > (NULL, & shared_data.pair.eng_coul , 1);
411
cu_virial = new cCudaData<double, ENERGY_CFLOAT, x > (NULL, & shared_data.pair.virial , 6);
412
cu_eng_vdwl = new cCudaData<double, ENERGY_CFLOAT, x > (NULL, & shared_data.pair.eng_vdwl , 1);
413
cu_eng_coul = new cCudaData<double, ENERGY_CFLOAT, x > (NULL, & shared_data.pair.eng_coul , 1);
371
414
cu_extent = new cCudaData<double, double, x> (extent, 6);
372
415
shared_data.flag = CudaWrapper_AllocCudaData(sizeof(int));
373
416
int size = 2 * CUDA_MAX_DEBUG_SIZE;
448
489
// do we have more atoms to upload than currently allocated memory on device? (also true if nothing yet allocated)
449
490
if(atom->nmax > cu_atom->nmax || cu_tag == NULL) {
451
cu_x = new cCudaData<double, X_FLOAT, yx> ((double*)atom->x , & cu_atom->x , atom->nmax, 3, 0, true); //cu_x->set_buffer(&(shared_data.buffer),&(shared_data.buffersize),true);
492
cu_x = new cCudaData<double, X_CFLOAT, yx> ((double*)atom->x , & cu_atom->x , atom->nmax, 3, 0, true); //cu_x->set_buffer(&(shared_data.buffer),&(shared_data.buffersize),true);
453
cu_v = new cCudaData<double, V_FLOAT, yx> ((double*)atom->v, & cu_atom->v , atom->nmax, 3);
494
cu_v = new cCudaData<double, V_CFLOAT, yx> ((double*)atom->v, & cu_atom->v , atom->nmax, 3);
455
cu_f = new cCudaData<double, F_FLOAT, yx> ((double*)atom->f, & cu_atom->f , atom->nmax, 3, 0, true);
496
cu_f = new cCudaData<double, F_CFLOAT, yx> ((double*)atom->f, & cu_atom->f , atom->nmax, 3, 0, true);
457
498
cu_tag = new cCudaData<int , int , x > (atom->tag , & cu_atom->tag , atom->nmax, 0, true);
465
506
if(atom->rmass) {
467
cu_rmass = new cCudaData<double, V_FLOAT, x > (atom->rmass , & cu_atom->rmass , atom->nmax);
508
cu_rmass = new cCudaData<double, V_CFLOAT, x > (atom->rmass , & cu_atom->rmass , atom->nmax);
470
511
if(cu_atom->q_flag) {
472
cu_q = new cCudaData<double, F_FLOAT, x > ((double*)atom->q, & cu_atom->q , atom->nmax, 0 , true);
513
cu_q = new cCudaData<double, F_CFLOAT, x > ((double*)atom->q, & cu_atom->q , atom->nmax, 0 , true);
473
514
}// cu_q->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
475
516
if(atom->radius) {
476
517
delete cu_radius;
477
cu_radius = new cCudaData<double, X_FLOAT, x > (atom->radius , & cu_atom->radius , atom->nmax);
518
cu_radius = new cCudaData<double, X_CFLOAT, x > (atom->radius , & cu_atom->radius , atom->nmax);
478
519
delete cu_v_radius;
479
cu_v_radius = new cCudaData<V_FLOAT, V_FLOAT, x> (v_radius , & cu_atom->v_radius , atom->nmax * 4);
520
cu_v_radius = new cCudaData<V_CFLOAT, V_CFLOAT, x> (v_radius , & cu_atom->v_radius , atom->nmax * 4);
480
521
delete cu_omega_rmass;
481
cu_omega_rmass = new cCudaData<V_FLOAT, V_FLOAT, x> (omega_rmass , & cu_atom->omega_rmass , atom->nmax * 4);
522
cu_omega_rmass = new cCudaData<V_CFLOAT, V_CFLOAT, x> (omega_rmass , & cu_atom->omega_rmass , atom->nmax * 4);
484
525
if(atom->omega) {
486
cu_omega = new cCudaData<double, V_FLOAT, yx > (((double*) atom->omega) , & cu_atom->omega , atom->nmax, 3);
527
cu_omega = new cCudaData<double, V_CFLOAT, yx > (((double*) atom->omega) , & cu_atom->omega , atom->nmax, 3);
489
530
if(atom->torque) {
490
531
delete cu_torque;
491
cu_torque = new cCudaData<double, F_FLOAT, yx > (((double*) atom->torque) , & cu_atom->torque , atom->nmax, 3);
532
cu_torque = new cCudaData<double, F_CFLOAT, yx > (((double*) atom->torque) , & cu_atom->torque , atom->nmax, 3);
494
535
if(atom->special) {
514
555
cu_atom->nmax = atom->nmax;
516
557
delete cu_x_type;
517
cu_x_type = new cCudaData<X_FLOAT, X_FLOAT, x> (x_type , & cu_atom->x_type , atom->nmax * 4);
558
cu_x_type = new cCudaData<X_CFLOAT, X_CFLOAT, x> (x_type , & cu_atom->x_type , atom->nmax * 4);
520
561
if(((cu_xhold == NULL) || (cu_xhold->get_dim()[0] < neighbor->maxhold)) && neighbor->xhold) {
522
cu_xhold = new cCudaData<double, X_FLOAT, yx> ((double*)neighbor->xhold, & cu_atom->xhold , neighbor->maxhold, 3);
563
cu_xhold = new cCudaData<double, X_CFLOAT, yx> ((double*)neighbor->xhold, & cu_atom->xhold , neighbor->maxhold, 3);
523
564
shared_data.atom.maxhold = neighbor->maxhold;
526
567
if(atom->mass && !cu_mass) {
527
cu_mass = new cCudaData<double, V_FLOAT, x > (atom->mass , & cu_atom->mass , atom->ntypes + 1);
568
cu_mass = new cCudaData<double, V_CFLOAT, x > (atom->mass , & cu_atom->mass , atom->ntypes + 1);
530
571
cu_atom->mass_host = atom->mass;
532
573
if(atom->map_style == 1) {
533
if((cu_map_array == NULL)) {
574
if(cu_map_array == NULL) {
534
575
cu_map_array = new cCudaData<int, int, x > (atom->get_map_array() , & cu_atom->map_array , atom->get_map_size());
535
576
} else if(cu_map_array->dev_size() / sizeof(int) < atom->get_map_size()) {
536
577
delete cu_map_array;
619
cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)force->pair->vatom, & (shared_data.atom.vatom) , atom->nmax , 6);// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
660
cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)force->pair->vatom, & (shared_data.atom.vatom) , atom->nmax , 6);// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
621
662
if(cu_vatom->get_dim()[0] != atom->nmax) {
622
663
//delete cu_vatom;
623
//cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)force->pair->vatom, & (shared_data.atom.vatom) , atom->nmax ,6 );// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
664
//cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)force->pair->vatom, & (shared_data.atom.vatom) , atom->nmax ,6 );// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
624
665
shared_data.atom.update_nmax = 2;