 cu_pbc->upload();
 delete cu_slablo;
-cu_slablo = new cCudaData<double, X_FLOAT,x>(slablo,cuda->shared_data.comm.maxswap);
+cu_slablo = new cCudaData<double, X_CFLOAT,x>(slablo,cuda->shared_data.comm.maxswap);
 cu_slablo->upload();
 delete cu_slabhi;
-cu_slabhi = new cCudaData<double, X_FLOAT,x>(slabhi,cuda->shared_data.comm.maxswap);
+cu_slabhi = new cCudaData<double, X_CFLOAT,x>(slabhi,cuda->shared_data.comm.maxswap);
 cu_slabhi->upload();
 cuda->shared_data.comm.pbc.dev_data=cu_pbc->dev_data();
 cuda->shared_data.comm.slablo.dev_data=cu_slablo->dev_data();
 cuda->shared_data.comm.slabhi.dev_data=cu_slabhi->dev_data();
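
Aside: cu_pbc, cu_slablo and cu_slabhi are cCudaData objects, i.e. host arrays mirrored by a device buffer; upload() pushes the host values to the GPU and dev_data() exposes the raw device pointer to the kernels through shared_data. The template's second argument is the on-device element type, and this patch renames it from X_FLOAT to X_CFLOAT, the package's compile-time coordinate precision (float or double). A minimal sketch of that mirror pattern, with a hypothetical class name and none of cCudaData's actual machinery:

    // Sketch of a host/device mirror (hypothetical HostDeviceMirror, not the real cCudaData).
    #include <cuda_runtime.h>
    #include <cstddef>

    template <typename THost, typename TDev>
    struct HostDeviceMirror {
      THost* host; TDev* dev; std::size_t n;
      HostDeviceMirror(THost* h, std::size_t n_) : host(h), dev(nullptr), n(n_) {
        cudaMalloc((void**)&dev, n * sizeof(TDev));
      }
      ~HostDeviceMirror() { cudaFree(dev); }
      void upload() {                        // copy host -> device
        // (assumption: the real class converts double -> X_CFLOAT during upload)
        TDev* tmp = new TDev[n];
        for (std::size_t i = 0; i < n; i++) tmp[i] = static_cast<TDev>(host[i]);
        cudaMemcpy(dev, tmp, n * sizeof(TDev), cudaMemcpyHostToDevice);
        delete[] tmp;
      }
      void* dev_data() { return dev; }       // raw device pointer handed to kernels
    };
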
 /* ----------------------------------------------------------------------

 void CommCuda::forward_comm_cuda()
-static double kerneltime=0.0;
-static double copytime=0.0;
 my_times time1,time2,time3;
 MPI_Request request;
 AtomVec *avec = atom->avec;
-double **x = atom->x;
 cuda->shared_data.domain.xy=domain->xy;
 cuda->shared_data.domain.xz=domain->xz;

 int size_forward_recv_now=0;
-if((sizeof(X_FLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
-size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_FLOAT)/sizeof(double);
+if((sizeof(X_CFLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
+size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_CFLOAT)/sizeof(double);
 size_forward_recv_now=size_forward_recv[iswap];
 my_gettime(CLOCK_REALTIME,&time1);
 my_gettime(CLOCK_REALTIME,&time2);
-if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
-n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
+if((sizeof(X_CFLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
+n=(n+1)*sizeof(X_CFLOAT)/sizeof(double);
 //printf("RecvSize: %i SendSize: %i\n",size_forward_recv_now,n);
 MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-MPI_Wait(&request,&status);
+MPI_Wait(&request,MPI_STATUS_IGNORE);
 my_gettime(CLOCK_REALTIME,&time3);
 cuda->shared_data.cuda_timings.comm_forward_mpi_upper+=

 buf_send,pbc_flag[iswap],pbc[iswap]);
 MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-MPI_Wait(&request,&status);
+MPI_Wait(&request,MPI_STATUS_IGNORE);
 avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_recv);

 buf_send,pbc_flag[iswap],pbc[iswap]);
 MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-MPI_Wait(&request,&status);
+MPI_Wait(&request,MPI_STATUS_IGNORE);
 avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);

 n = Cuda_CommCuda_PackComm_Self(&cuda->shared_data,sendnum[iswap],iswap,firstrecv[iswap],pbc[iswap],pbc_flag[iswap]);
 if(n<0) error->all(FLERR," # CUDA ERRROR on PackComm_Self");
-if((sizeof(X_FLOAT)!=sizeof(double)) && n)
-n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
+if((sizeof(X_CFLOAT)!=sizeof(double)) && n)
+n=(n+1)*sizeof(X_CFLOAT)/sizeof(double);
 else if (ghost_velocity)
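
On the recurring (n+1)*sizeof(X_CFLOAT)/sizeof(double) lines (the comment's "safe" is presumably "save"): the pack kernels write X_CFLOAT values into the buffer, but every MPI call in this file counts MPI_DOUBLE elements, so when the coordinate type is single precision the element count has to be rescaled, and the +1 rounds up so an odd trailing float is not truncated. A small self-contained illustration of the arithmetic (helper name is ours):

    #include <cassert>
    #include <cstddef>

    // Number of MPI_DOUBLE slots needed to carry n values of size value_size.
    // Mirrors the diff's "(n+1)*sizeof(X_CFLOAT)/sizeof(double)" rounding.
    static std::size_t doubles_for(std::size_t n, std::size_t value_size) {
      if (value_size == sizeof(double) || n == 0) return n;
      return (n + 1) * value_size / sizeof(double);
    }

    int main() {
      // 5 single-precision values = 20 bytes; 3 doubles = 24 bytes hold them.
      assert(doubles_for(5, sizeof(float)) == 3);
      // With double precision the count is passed through unchanged.
      assert(doubles_for(5, sizeof(double)) == 5);
      return 0;
    }
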
 void CommCuda::forward_comm_pack_cuda()
-static double kerneltime=0.0;
-static double copytime=0.0;
-my_times time1,time2,time3;
+my_times time1,time2;
 int n; // initialize comm buffers & exchange memory
 MPI_Request request;
 AtomVec *avec = atom->avec;
-double **x = atom->x;
 cuda->shared_data.domain.xy=domain->xy;
 cuda->shared_data.domain.xz=domain->xz;

 my_gettime(CLOCK_REALTIME,&time2);
-if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
-n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
+if((sizeof(X_CFLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
+n=(n+1)*sizeof(X_CFLOAT)/sizeof(double);
 cuda->shared_data.comm.send_size[iswap]=n;
 else if (ghost_velocity)
 my_gettime(CLOCK_REALTIME,&time2);
-if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
-n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
+if((sizeof(X_CFLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
+n=(n+1)*sizeof(X_CFLOAT)/sizeof(double);
 cuda->shared_data.comm.send_size[iswap]=n;

 cuda->shared_data.comm.buf_send[iswap],pbc_flag[iswap],pbc[iswap]);
 MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-MPI_Wait(&request,&status);
+MPI_Wait(&request,MPI_STATUS_IGNORE);
 avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);

 n = Cuda_CommCuda_PackComm_Self(&cuda->shared_data,sendnum[iswap],iswap,firstrecv[iswap],pbc[iswap],pbc_flag[iswap]);
 if(n<0) error->all(FLERR," # CUDA ERRROR on PackComm_Self");
-if((sizeof(X_FLOAT)!=sizeof(double)) && n)
-n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
+if((sizeof(X_CFLOAT)!=sizeof(double)) && n)
+n=(n+1)*sizeof(X_CFLOAT)/sizeof(double);
 else if (ghost_velocity)
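
Also repeated throughout the file: every MPI_Wait (and the MPI_Sendrecv calls further down) now passes MPI_STATUS_IGNORE instead of a local MPI_Status that was never inspected, which lets the status declarations disappear from these functions. The basic post-receive / send / wait pattern, for reference (function name is ours):

    #include <mpi.h>

    // Forward-comm style exchange: post the receive, send, then wait.
    // The status carries source/tag/count; when none of that is needed,
    // MPI_STATUS_IGNORE avoids declaring a throwaway MPI_Status.
    void swap_doubles(double* recv, int nrecv, int from,
                      double* send, int nsend, int to, MPI_Comm comm) {
      MPI_Request request;
      MPI_Irecv(recv, nrecv, MPI_DOUBLE, from, 0, comm, &request);
      MPI_Send(send, nsend, MPI_DOUBLE, to, 0, comm);
      MPI_Wait(&request, MPI_STATUS_IGNORE);
    }
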
 void CommCuda::forward_comm_transfer_cuda()
-static double kerneltime=0.0;
-static double copytime=0.0;
 my_times time1,time2,time3;
 MPI_Request request;
 AtomVec *avec = atom->avec;
-double **x = atom->x;
 cuda->shared_data.domain.xy=domain->xy;
 cuda->shared_data.domain.xz=domain->xz;
 cuda->shared_data.domain.yz=domain->yz;

 int size_forward_recv_now=0;
-if((sizeof(X_FLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
-size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_FLOAT)/sizeof(double);
+if((sizeof(X_CFLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
+size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_CFLOAT)/sizeof(double);
 size_forward_recv_now=size_forward_recv[iswap];

 cuda->shared_data.cuda_timings.comm_forward_download+=
 time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
 MPI_Send(buf_send,cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
-MPI_Wait(&request,&status);
+MPI_Wait(&request,MPI_STATUS_IGNORE);
 //printf("D: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4);
 CudaWrapper_UploadCudaDataAsync((void*) buf_recv,cuda->shared_data.comm.buf_recv_dev[iswap], size_forward_recv_now*sizeof(double),2);
 my_gettime(CLOCK_REALTIME,&time1);

 /* int size_forward_recv_now=0;
-if((sizeof(X_FLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
-size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_FLOAT)/sizeof(double);
+if((sizeof(X_CFLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
+size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_CFLOAT)/sizeof(double);
 size_forward_recv_now=size_forward_recv[iswap];
 my_gettime(CLOCK_REALTIME,&time2);
 MPI_Send(cuda->shared_data.comm.buf_send[iswap],cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
-MPI_Wait(&request,&status);
+MPI_Wait(&request,MPI_STATUS_IGNORE);
 my_gettime(CLOCK_REALTIME,&time3);
 cuda->shared_data.cuda_timings.comm_forward_mpi_upper+=

 buf_send,pbc_flag[iswap],pbc[iswap]);
 MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-MPI_Wait(&request,&status);
+MPI_Wait(&request,MPI_STATUS_IGNORE);
 avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
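
forward_comm_transfer_cuda lands the ghost data in a host buffer and pushes it to the device with CudaWrapper_UploadCudaDataAsync, so the host-to-device copy of one swap can overlap the MPI traffic of the next. Assuming the wrapper is essentially a cudaMemcpyAsync on a secondary stream (the trailing 2 looks like a stream index, but that is a guess), the underlying pattern is:

    #include <cuda_runtime.h>
    #include <cstddef>

    // Hypothetical helper: queue a host->device copy on a side stream so the
    // caller can immediately go back to posting MPI receives/sends.
    void upload_async(void* host_buf, void* dev_buf, std::size_t bytes,
                      cudaStream_t stream) {
      // host_buf should be pinned (cudaMallocHost/cudaHostRegister) for the
      // copy to actually run asynchronously.
      cudaMemcpyAsync(dev_buf, host_buf, bytes, cudaMemcpyHostToDevice, stream);
    }

    // Before an unpack kernel consumes dev_buf, the stream must be synced:
    //   cudaStreamSynchronize(stream);
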
 void CommCuda::forward_comm_unpack_cuda()
-static double kerneltime=0.0;
-static double copytime=0.0;
-my_times time1,time2,time3;
 MPI_Request request;
 AtomVec *avec = atom->avec;
-double **x = atom->x;
 cuda->shared_data.domain.xy=domain->xy;
 cuda->shared_data.domain.xz=domain->xz;

 buf_send,pbc_flag[iswap],pbc[iswap]);
 MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-MPI_Wait(&request,&status);
+MPI_Wait(&request,MPI_STATUS_IGNORE);
 avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
 if(not cuda->shared_data.pair.cudable_force)
-return Comm::forward_comm_pair(pair);
+return CommBrick::forward_comm_pair(pair);
 MPI_Request request;
+int nsize = pair->comm_forward;
 for (iswap = 0; iswap < nswap; iswap++) {
-n = pair->pack_comm(sendnum[iswap],&iswap,
-buf_send,pbc_flag[iswap],pbc[iswap]);
-int nrecv = recvnum[iswap]*n;
+n = pair->pack_forward_comm(sendnum[iswap],&iswap,
+buf_send,pbc_flag[iswap],pbc[iswap]);
+int nrecv = recvnum[iswap]*nsize;
 if(nrecv<0) nrecv=-(nrecv+1)/2;
-int nsend = sendnum[iswap]*n;
 if(nsend<0) nsend=-(nsend+1)/2;
 // exchange with another proc
 MPI_Irecv(buf_recv,nrecv,MPI_DOUBLE,recvproc[iswap],0,
 MPI_Send(buf_send,nsend,MPI_DOUBLE,sendproc[iswap],0,world);
-MPI_Wait(&request,&status);
+MPI_Wait(&request,MPI_STATUS_IGNORE);
 } else buf = buf_send;
-pair->unpack_comm(recvnum[iswap],firstrecv[iswap],buf);
+pair->unpack_forward_comm(recvnum[iswap],firstrecv[iswap],buf);
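
Two separate changes meet in forward_comm_pair: the CPU fallback now goes through CommBrick (see the note after the exchange() hunk below), and the per-style hooks are renamed from pack_comm/unpack_comm to pack_forward_comm/unpack_forward_comm, with the receive size derived from pair->comm_forward (the per-atom value count, nsize) instead of the sender's return value. A toy, self-contained version of the new call sequence; the ToyPair class and its argument lists are ours, not the LAMMPS signatures:

    #include <mpi.h>
    #include <vector>

    // Toy stand-in for a pair style's forward-comm hooks (method names mirror
    // the renamed API; the class itself is invented for illustration).
    struct ToyPair {
      int comm_forward = 3;                         // values packed per atom
      int pack_forward_comm(int n, const int* /*list*/, double* buf) {
        for (int i = 0; i < n * comm_forward; i++) buf[i] = 0.0;
        return n * comm_forward;                    // total doubles packed
      }
      void unpack_forward_comm(int /*n*/, int /*first*/, const double* /*buf*/) {}
    };

    // One swap: the receiver sizes its buffer from its own recvnum and the
    // per-atom count; the sender ships exactly what pack_forward_comm packed.
    void one_swap(ToyPair& pair, int sendnum, int recvnum, const int* sendlist,
                  int sendproc, int recvproc, MPI_Comm world) {
      std::vector<double> buf_send(sendnum * pair.comm_forward);
      std::vector<double> buf_recv(recvnum * pair.comm_forward);
      int n = pair.pack_forward_comm(sendnum, sendlist, buf_send.data());
      MPI_Request request;
      MPI_Irecv(buf_recv.data(), recvnum * pair.comm_forward, MPI_DOUBLE,
                recvproc, 0, world, &request);
      MPI_Send(buf_send.data(), n, MPI_DOUBLE, sendproc, 0, world);
      MPI_Wait(&request, MPI_STATUS_IGNORE);
      pair.unpack_forward_comm(recvnum, 0, buf_recv.data());
    }
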
 if (comm_f_only) {
 int size_recv_now=size_reverse_recv[iswap];
-if((sizeof(F_FLOAT)!=sizeof(double))&& size_reverse_recv[iswap])
-size_recv_now=(size_recv_now+1)*sizeof(F_FLOAT)/sizeof(double);
+if((sizeof(F_CFLOAT)!=sizeof(double))&& size_reverse_recv[iswap])
+size_recv_now=(size_recv_now+1)*sizeof(F_CFLOAT)/sizeof(double);
 MPI_Irecv(buf_recv,size_recv_now,MPI_DOUBLE,
 sendproc[iswap],0,world,&request);
 int size_reverse_send_now=size_reverse_send[iswap];
-if((sizeof(F_FLOAT)!=sizeof(double))&& size_reverse_send[iswap])
-size_reverse_send_now=(size_reverse_send_now+1)*sizeof(F_FLOAT)/sizeof(double);
+if((sizeof(F_CFLOAT)!=sizeof(double))&& size_reverse_send[iswap])
+size_reverse_send_now=(size_reverse_send_now+1)*sizeof(F_CFLOAT)/sizeof(double);
 MPI_Send(buf,size_reverse_send_now,MPI_DOUBLE,
 recvproc[iswap],0,world);
-MPI_Wait(&request,&status);
+MPI_Wait(&request,MPI_STATUS_IGNORE);
 Cuda_CommCuda_UnpackReverse(&cuda->shared_data,sendnum[iswap],iswap,buf_recv);

 sendproc[iswap],0,world,&request);
 n = avec->pack_reverse(recvnum[iswap],firstrecv[iswap],buf_send);
 MPI_Send(buf_send,n,MPI_DOUBLE,recvproc[iswap],0,world);
-MPI_Wait(&request,&status);
+MPI_Wait(&request,MPI_STATUS_IGNORE);
 avec->unpack_reverse(sendnum[iswap],sendlist[iswap],buf_recv);
 if(not cuda->oncpu) cuda->downloadAll();
+CommBrick::exchange();
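
CommBrick::exchange() here, and the other CommBrick:: calls in this diff, follow from the refactoring that moved the concrete brick-decomposition communication out of the Comm base class into the new CommBrick subclass; CommCuda now inherits from CommBrick and simply falls back to the parent whenever the data live on the CPU or a feature has no CUDA path. A stripped-down sketch of that delegation, with invented class names standing in for the real ones:

    // Delegation sketch (hypothetical types, not the LAMMPS classes).
    struct BrickComm {                       // plays the role of CommBrick
      virtual ~BrickComm() = default;
      virtual void exchange() { /* CPU reference implementation */ }
    };

    struct GpuComm : BrickComm {             // plays the role of CommCuda
      bool data_on_gpu = true;
      void exchange() override {
        if (!data_on_gpu) {                  // no device data: use the parent
          BrickComm::exchange();
          return;
        }
        // ... GPU-resident packing / MPI exchange path ...
      }
    };
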
 void CommCuda::exchange_cuda()
-int i,m,nsend,nrecv,nrecv1,nrecv2,nlocal;
-double *sublo,*subhi,*buf;
+int nsend,nrecv,nrecv1,nrecv2,nlocal;
 MPI_Request request;
 AtomVec *avec = atom->avec;
-my_times time1,time2,time3;
+my_times time1,time2;
 // clear global->local map for owned and ghost atoms
 // b/c atoms migrate to new procs in exchange() and
 if (map_style) atom->map_clear();
-// subbox bounds for orthogonal or triclinic
-if (triclinic == 0) {
-sublo = domain->sublo;
-subhi = domain->subhi;
-sublo = domain->sublo_lamda;
-subhi = domain->subhi_lamda;
 // loop over dimensions
 for (int dim = 0; dim < 3; dim++) {
 // fill buffer with atoms leaving my box, using < and >=
 // when atom is deleted, fill it in with last atom
 cuda->shared_data.exchange_dim=dim;
 nlocal = atom->nlocal;
 avec->maxsend=&maxsend;

 MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][0],0,
-&nrecv1,1,MPI_INT,procneigh[dim][1],0,world,&status);
+&nrecv1,1,MPI_INT,procneigh[dim][1],0,world,MPI_STATUS_IGNORE);
 if (procgrid[dim] > 2) {
 MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][1],0,
-&nrecv2,1,MPI_INT,procneigh[dim][0],0,world,&status);
+&nrecv2,1,MPI_INT,procneigh[dim][0],0,world,MPI_STATUS_IGNORE);
 if (nrecv+1 > maxrecv) grow_recv(nrecv+1);
 MPI_Irecv(buf_recv,nrecv1,MPI_DOUBLE,procneigh[dim][1],0,
 MPI_Send(buf_send,nsend,MPI_DOUBLE,procneigh[dim][0],0,world);
-MPI_Wait(&request,&status);
+MPI_Wait(&request,MPI_STATUS_IGNORE);
 if (procgrid[dim] > 2) {
 MPI_Irecv(&buf_recv[nrecv1],nrecv2,MPI_DOUBLE,procneigh[dim][0],0,
 MPI_Send(buf_send,nsend,MPI_DOUBLE,procneigh[dim][1],0,world);
-MPI_Wait(&request,&status);
+MPI_Wait(&request,MPI_STATUS_IGNORE);
 if((nrecv1==0)||(nrecv2==0)) buf_recv[nrecv]=0;
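
The exchange loop is a two-phase handshake per dimension: first the number of doubles leaving in each direction is traded with MPI_Sendrecv (nsend out, nrecv1/nrecv2 in), the receive buffer is grown to hold nrecv = nrecv1 + nrecv2, and only then is the payload exchanged, with the second neighbor's data appended at buf_recv[nrecv1]. A compact sketch of that pattern for one dimension (our names; the original only performs the second exchange when procgrid[dim] > 2):

    #include <mpi.h>
    #include <vector>

    // Exchange counts first so the receive buffer can be sized exactly,
    // then exchange the payload with both neighbors along one dimension.
    void exchange_dim(std::vector<double>& send, int left, int right,
                      MPI_Comm world, std::vector<double>& recv) {
      int nsend = (int) send.size(), nrecv1 = 0, nrecv2 = 0;
      MPI_Sendrecv(&nsend, 1, MPI_INT, left,  0,
                   &nrecv1, 1, MPI_INT, right, 0, world, MPI_STATUS_IGNORE);
      MPI_Sendrecv(&nsend, 1, MPI_INT, right, 0,
                   &nrecv2, 1, MPI_INT, left,  0, world, MPI_STATUS_IGNORE);
      recv.resize(nrecv1 + nrecv2);

      MPI_Request request;
      MPI_Irecv(recv.data(), nrecv1, MPI_DOUBLE, right, 0, world, &request);
      MPI_Send(send.data(), nsend, MPI_DOUBLE, left, 0, world);
      MPI_Wait(&request, MPI_STATUS_IGNORE);

      MPI_Irecv(recv.data() + nrecv1, nrecv2, MPI_DOUBLE, left, 0, world, &request);
      MPI_Send(send.data(), nsend, MPI_DOUBLE, right, 0, world);
      MPI_Wait(&request, MPI_STATUS_IGNORE);
    }
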
 void CommCuda::borders_cuda()
-int i,n,itype,iswap,dim,ineed,twoneed,smax,rmax;
-int nsend,nrecv,nfirst,nlast,ngroup;
-double *buf,*mlo,*mhi;
+int n,iswap,dim,ineed,twoneed,smax,rmax;
+int nsend,nrecv,nfirst,nlast;
 MPI_Request request;
 AtomVec *avec = atom->avec;
-my_times time1,time2,time3;
+my_times time1,time2;
 // clear old ghosts

 my_gettime(CLOCK_REALTIME,&time1);
 if (sendproc[iswap] != me) {
 MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0,
-&nrecv,1,MPI_INT,recvproc[iswap],0,world,&status);
+&nrecv,1,MPI_INT,recvproc[iswap],0,world,MPI_STATUS_IGNORE);
 if (nrecv*size_border > maxrecv)
 grow_recv(nrecv*size_border);
 MPI_Irecv(buf_recv,nrecv*size_border,MPI_DOUBLE,
 recvproc[iswap],0,world,&request);
 MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-MPI_Wait(&request,&status);
+MPI_Wait(&request,MPI_STATUS_IGNORE);
 void CommCuda::borders_cuda_overlap_forward_comm()
-int i,n,itype,iswap,dim,ineed,twoneed,smax,rmax;
-int nsend,nrecv,nfirst,nlast,ngroup;
-double *buf,*mlo,*mhi;
+int n,iswap,dim,ineed,twoneed,smax,rmax;
+int nsend,nrecv,nfirst,nlast;
 MPI_Request request;
 AtomVec *avec = atom->avec;
-my_times time1,time2,time3;
+my_times time1,time2;
 // clear old ghosts

 my_gettime(CLOCK_REALTIME,&time1);
 if (sendproc[iswap] != me) {
 MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0,
-&nrecv,1,MPI_INT,recvproc[iswap],0,world,&status);
+&nrecv,1,MPI_INT,recvproc[iswap],0,world,MPI_STATUS_IGNORE);
 if (nrecv*size_border > maxrecv)
 grow_recv(nrecv*size_border);
 MPI_Irecv(buf_recv,nrecv*size_border,MPI_DOUBLE,
 recvproc[iswap],0,world,&request);
 MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-MPI_Wait(&request,&status);
+MPI_Wait(&request,MPI_STATUS_IGNORE);
 buf = buf_recv;
 if(sendproc[iswap] == me) {swap=-iswap-1; buf=(double*)&(firstrecv[iswap]);}
 else buf=buf_send;
-n = fix->pack_comm(sendnum[iswap],&swap,
-buf,pbc_flag[iswap],pbc[iswap]);
+n = fix->pack_forward_comm(sendnum[iswap],&swap,
+buf,pbc_flag[iswap],pbc[iswap]);
 if(sendproc[iswap] == me)
-n = fix->pack_comm(sendnum[iswap],sendlist[iswap],
-buf_send,pbc_flag[iswap],pbc[iswap]);
+n = fix->pack_forward_comm(sendnum[iswap],sendlist[iswap],
+buf_send,pbc_flag[iswap],pbc[iswap]);
 // exchange with another proc
 // if self, set recv buffer to send buffer
 if (sendproc[iswap] != me) {
-MPI_Irecv(buf_recv,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,
+MPI_Irecv(buf_recv,nsize*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,
 world,&request);
-MPI_Send(buf_send,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
-MPI_Wait(&request,&status);
+MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
+MPI_Wait(&request,MPI_STATUS_IGNORE);
 buf = buf_recv;
 } else buf = buf_send;
 // unpack buffer
-fix->unpack_comm(recvnum[iswap],firstrecv[iswap],buf);
+fix->unpack_forward_comm(recvnum[iswap],firstrecv[iswap],buf);
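
One sizing detail in the fix path: pack_forward_comm now appears to return the total number of doubles packed, so the send uses n directly, while the receive is posted with nsize*recvnum[iswap], the upper bound the receiving rank can compute locally from the fix's declared per-atom count. Posting a longer receive than the message that arrives is legal MPI; the true length is only needed if someone queries the status for it, which is exactly why MPI_STATUS_IGNORE is safe everywhere else in this file. A tiny two-rank demonstration of that rule:

    #include <mpi.h>
    #include <cstdio>

    // The receiver posts an upper bound; MPI_Get_count reveals how many
    // doubles actually arrived.  Run with two ranks.
    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      double buf[8] = {0};
      if (rank == 0) {
        MPI_Send(buf, 3, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD);          // ships 3 doubles
      } else if (rank == 1) {
        MPI_Status status;
        MPI_Recv(buf, 8, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &status); // cap of 8
        int got; MPI_Get_count(&status, MPI_DOUBLE, &got);
        printf("received %d doubles\n", got);                        // prints 3
      }
      MPI_Finalize();
      return 0;
    }
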
 if(cu_sendlist)
 cu_pbc=new cCudaData<int, int, xy> ((int*)pbc,n,6);
-cu_slablo = new cCudaData<double, X_FLOAT,x>(slablo,n);
-cu_slabhi = new cCudaData<double, X_FLOAT,x>(slabhi,n);
+cu_slablo = new cCudaData<double, X_CFLOAT,x>(slablo,n);
+cu_slabhi = new cCudaData<double, X_CFLOAT,x>(slabhi,n);
 cuda->shared_data.comm.pbc.dev_data=cu_pbc->dev_data();
 cuda->shared_data.comm.slablo.dev_data=cu_slablo->dev_data();
 void CommCuda::allocate_multi(int n)
-Comm::allocate_multi(n);
+CommBrick::allocate_multi(n);
 delete cu_multilo;
 delete cu_multihi;
-cu_multilo = new cCudaData<double, X_FLOAT,xy>(slablo,n,atom->ntypes+1);
-cu_multihi = new cCudaData<double, X_FLOAT,xy>(slabhi,n,atom->ntypes+1);
+cu_multilo = new cCudaData<double, X_CFLOAT,xy>(slablo,n,atom->ntypes+1);
+cu_multihi = new cCudaData<double, X_CFLOAT,xy>(slabhi,n,atom->ntypes+1);
 cuda->shared_data.comm.multilo.dev_data=cu_multilo->dev_data();
 cuda->shared_data.comm.multihi.dev_data=cu_multihi->dev_data();