86
86
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
87
87
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
88
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
89
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
90
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
88
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
89
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
90
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
91
91
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
92
92
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
93
93
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
94
94
cudaMemcpyToSymbol(MY_AP(image) , & sdata->atom.image.dev_data, sizeof(int*));
96
if(data_mask & Q_MASK) cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*));
96
if(data_mask & Q_MASK) cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_CFLOAT*));
98
98
if(data_mask & MOLECULE_MASK) cudaMemcpyToSymbol(MY_AP(molecule) , & sdata->atom.molecule.dev_data, sizeof(int*));
121
121
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
123
123
MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... post Nmax\n");)
124
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd, 3 * sizeof(X_FLOAT));
125
cudaMemcpyToSymbol(MY_AP(sublo) , & sdata->domain.sublo, 3 * sizeof(X_FLOAT));
126
cudaMemcpyToSymbol(MY_AP(subhi) , & sdata->domain.subhi, 3 * sizeof(X_FLOAT));
124
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd, 3 * sizeof(X_CFLOAT));
125
cudaMemcpyToSymbol(MY_AP(sublo) , & sdata->domain.sublo, 3 * sizeof(X_CFLOAT));
126
cudaMemcpyToSymbol(MY_AP(subhi) , & sdata->domain.subhi, 3 * sizeof(X_CFLOAT));
127
127
cudaMemcpyToSymbol(MY_AP(flag) , & sdata->flag, sizeof(int*));
128
128
cudaThreadSynchronize();
129
129
MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... end\n");)
143
143
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
145
145
int n_data_items = AtomVecCuda_CountDataItems(data_mask);
146
int size = (n * n_data_items) * sizeof(X_FLOAT);
146
int size = (n * n_data_items) * sizeof(X_CFLOAT);
148
148
if(sdata->buffer_new or (size > sdata->buffersize))
149
149
Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
155
155
if(pbc_flag != 0) {
156
156
if(sdata->domain.triclinic == 0) {
185
185
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed");
187
187
if(not sdata->overlap_comm)
188
cudaMemcpy(buf_send, sdata->buffer, n* n_data_items* sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
189
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
188
cudaMemcpy(buf_send, sdata->buffer, n* n_data_items* sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
189
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
191
191
my_gettime(CLOCK_REALTIME, &time1);
192
192
sdata->cuda_timings.comm_forward_download +=
216
216
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
218
218
int n_data_items = AtomVecCuda_CountDataItems(data_mask);
219
int size = (n * n_data_items) * sizeof(X_FLOAT);
219
int size = (n * n_data_items) * sizeof(X_CFLOAT);
221
221
if(sdata->buffer_new or (size > sdata->buffersize))
222
222
Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
224
224
static int count = -1;
230
230
if(pbc_flag != 0) {
231
231
if(sdata->domain.triclinic == 0) {
276
276
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
278
278
int n_data_items = AtomVecCuda_CountDataItems(data_mask);
279
int size = (n * n_data_items) * sizeof(X_FLOAT);
279
int size = (n * n_data_items) * sizeof(X_CFLOAT);
281
281
if(sdata->buffer_new or (size > sdata->buffersize))
282
282
Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
289
289
my_gettime(CLOCK_REALTIME, &time1);
291
291
if(not sdata->overlap_comm || iswap < 0)
292
cudaMemcpy(sdata->buffer, (void*)buf_recv, n_data_items * n * sizeof(X_FLOAT), cudaMemcpyHostToDevice);
292
cudaMemcpy(sdata->buffer, (void*)buf_recv, n_data_items * n * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);
294
294
my_gettime(CLOCK_REALTIME, &time2);
295
295
sdata->cuda_timings.comm_forward_upload +=
464
464
int n_data_items = AtomVecCuda_CountDataItems(data_mask);
466
int size = nsend * n_data_items * sizeof(X_FLOAT);
466
int size = nsend * n_data_items * sizeof(X_CFLOAT);
468
468
if(sdata->buffer_new or (size > sdata->buffersize))
469
469
Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
475
475
if(pbc_flag != 0) {
476
476
if(sdata->domain.triclinic == 0) {
523
523
int n_data_items = AtomVecCuda_CountDataItems(data_mask);
525
int size = n * n_data_items * sizeof(X_FLOAT);
525
int size = n * n_data_items * sizeof(X_CFLOAT);
527
527
if(sdata->buffer_new or (size > sdata->buffersize))
528
528
Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
534
534
if(pbc_flag != 0) {
535
535
if(sdata->domain.triclinic == 0) {
585
585
int n_data_items = AtomVecCuda_CountDataItems(data_mask);
587
int size = n * n_data_items * sizeof(X_FLOAT);
587
int size = n * n_data_items * sizeof(X_CFLOAT);
589
589
if(sdata->buffer_new or (size > sdata->buffersize))
590
590
Cuda_AtomVecCuda_UpdateBuffer(sdata, size);