~ubuntu-branches/debian/sid/lammps/sid

Viewing changes to src/USER-CUDA/comm_cuda.cpp

  • Committer: Package Import Robot
  • Author(s): Anton Gladky
  • Date: 2015-04-29 23:44:49 UTC
  • mfrom: (5.1.3 experimental)
  • Revision ID: package-import@ubuntu.com-20150429234449-mbhy9utku6hp6oq8
Tags: 0~20150313.gitfa668e1-1
Upload into unstable.

Changes are shown below as a unified diff: lines prefixed with '-' were removed, lines prefixed with '+' were added, and each '@@ old M / new N @@' marker gives the line at which the hunk starts in the old and new version of src/USER-CUDA/comm_cuda.cpp.
@@ old 35 / new 35 @@
 #include "fix.h"
 #include "group.h"
 #include "compute.h"
-#include "cuda.h"
+#include "user_cuda.h"
 #include "error.h"
 #include "memory.h"
 #include "comm_cuda_cu.h"
@@ old 75 / new 75 @@
   buf_send = NULL;
   buf_recv = NULL;
 
-  Comm::free_swap();
+  CommBrick::free_swap();
   allocate_swap(maxswap);
 }
 
@@ old 102 / new 102 @@
 
 void CommCuda::init()
 {
-  int factor = 1;
-  if(cuda->shared_data.overlap_comm) factor=maxswap;
   if(not buf_send)
   grow_send(maxsend,0);
   if(not buf_recv)
@@ old 121 / new 119 @@
   cu_pbc->upload();
 
   delete cu_slablo;
-  cu_slablo = new cCudaData<double, X_FLOAT,x>(slablo,cuda->shared_data.comm.maxswap);
+  cu_slablo = new cCudaData<double, X_CFLOAT,x>(slablo,cuda->shared_data.comm.maxswap);
   cu_slablo->upload();
 
   delete cu_slabhi;
-  cu_slabhi = new cCudaData<double, X_FLOAT,x>(slabhi,cuda->shared_data.comm.maxswap);
+  cu_slabhi = new cCudaData<double, X_CFLOAT,x>(slabhi,cuda->shared_data.comm.maxswap);
   cu_slabhi->upload();
 
   cuda->shared_data.comm.pbc.dev_data=cu_pbc->dev_data();
   cuda->shared_data.comm.slablo.dev_data=cu_slablo->dev_data();
   cuda->shared_data.comm.slabhi.dev_data=cu_slabhi->dev_data();
 
-  Comm::init();
+  CommBrick::init();
 }
 
 /* ----------------------------------------------------------------------
@@ old 145 / new 143 @@
 void CommCuda::setup()
 {
   if(cuda->shared_data.pair.neighall) cutghostuser = MAX(2.0*neighbor->cutneighmax,cutghostuser);
-  Comm::setup();
+  CommBrick::setup();
 
   //upload changed geometry to device
     if(style == SINGLE)
@@ old 176 / new 174 @@
 
 void CommCuda::forward_comm_cuda()
 {
-  static int count=0;
-  static double kerneltime=0.0;
-  static double copytime=0.0;
   my_times time1,time2,time3;
 
   int n;
   MPI_Request request;
-  MPI_Status status;
   AtomVec *avec = atom->avec;
-  double **x = atom->x;
 
   cuda->shared_data.domain.xy=domain->xy;
   cuda->shared_data.domain.xz=domain->xz;
@@ old 197 / new 190 @@
   if(not comm_x_only && not avec->cudable)
   {
           cuda->downloadAll();
-    Comm::forward_comm();
+    CommBrick::forward_comm();
     cuda->uploadAll();
     return;
   }
@@ old 214 / new 207 @@
 
         int size_forward_recv_now=0;
 
-        if((sizeof(X_FLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
-          size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_FLOAT)/sizeof(double);
+        if((sizeof(X_CFLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
+          size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_CFLOAT)/sizeof(double);
         else
           size_forward_recv_now=size_forward_recv[iswap];
 my_gettime(CLOCK_REALTIME,&time1);
@@ old 226 / new 219 @@
 
 my_gettime(CLOCK_REALTIME,&time2);
 
-        if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
-          n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
+        if((sizeof(X_CFLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
+          n=(n+1)*sizeof(X_CFLOAT)/sizeof(double);
 
                 //printf("RecvSize: %i SendSize: %i\n",size_forward_recv_now,n);
             MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-            MPI_Wait(&request,&status);
+            MPI_Wait(&request,MPI_STATUS_IGNORE);
 
 my_gettime(CLOCK_REALTIME,&time3);
 cuda->shared_data.cuda_timings.comm_forward_mpi_upper+=
@@ old 255 / new 248 @@
                                 buf_send,pbc_flag[iswap],pbc[iswap]);
 
             MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-            MPI_Wait(&request,&status);
+            MPI_Wait(&request,MPI_STATUS_IGNORE);
             avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_recv);
       }
       else
@@ old 271 / new 264 @@
                             buf_send,pbc_flag[iswap],pbc[iswap]);
 
             MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-            MPI_Wait(&request,&status);
+            MPI_Wait(&request,MPI_STATUS_IGNORE);
             avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
       }
 
@@ old 285 / new 278 @@
                 {
           n = Cuda_CommCuda_PackComm_Self(&cuda->shared_data,sendnum[iswap],iswap,firstrecv[iswap],pbc[iswap],pbc_flag[iswap]);
           if(n<0) error->all(FLERR," # CUDA ERRROR on PackComm_Self");
-          if((sizeof(X_FLOAT)!=sizeof(double)) && n)
-            n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
+          if((sizeof(X_CFLOAT)!=sizeof(double)) && n)
+            n=(n+1)*sizeof(X_CFLOAT)/sizeof(double);
                 }
       }
       else if (ghost_velocity)
@@ old 308 / new 301 @@
 
 void CommCuda::forward_comm_pack_cuda()
 {
-        static int count=0;
-        static double kerneltime=0.0;
-        static double copytime=0.0;
-    my_times time1,time2,time3;
+  my_times time1,time2;
   int n;  // initialize comm buffers & exchange memory
 
   MPI_Request request;
-  MPI_Status status;
   AtomVec *avec = atom->avec;
-  double **x = atom->x;
 
   cuda->shared_data.domain.xy=domain->xy;
   cuda->shared_data.domain.xz=domain->xz;
@@ old 346 / new 334 @@
 
 my_gettime(CLOCK_REALTIME,&time2);
 
-        if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
-          n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
+        if((sizeof(X_CFLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
+          n=(n+1)*sizeof(X_CFLOAT)/sizeof(double);
                 cuda->shared_data.comm.send_size[iswap]=n;
       }
       else if (ghost_velocity)
@@ old 358 / new 346 @@
 
 my_gettime(CLOCK_REALTIME,&time2);
 
-        if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
-          n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
+        if((sizeof(X_CFLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
+          n=(n+1)*sizeof(X_CFLOAT)/sizeof(double);
                 cuda->shared_data.comm.send_size[iswap]=n;
        }
       else
@@ old 375 / new 363 @@
                             cuda->shared_data.comm.buf_send[iswap],pbc_flag[iswap],pbc[iswap]);
 
             MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-            MPI_Wait(&request,&status);
+            MPI_Wait(&request,MPI_STATUS_IGNORE);
             avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
       }
 
@@ old 388 / new 376 @@
                 {
           n = Cuda_CommCuda_PackComm_Self(&cuda->shared_data,sendnum[iswap],iswap,firstrecv[iswap],pbc[iswap],pbc_flag[iswap]);
           if(n<0) error->all(FLERR," # CUDA ERRROR on PackComm_Self");
-          if((sizeof(X_FLOAT)!=sizeof(double)) && n)
-            n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
+          if((sizeof(X_CFLOAT)!=sizeof(double)) && n)
+            n=(n+1)*sizeof(X_CFLOAT)/sizeof(double);
                 }
       }
       else if (ghost_velocity)
@@ old 411 / new 399 @@
 
 void CommCuda::forward_comm_transfer_cuda()
 {
-        static int count=0;
-        static double kerneltime=0.0;
-        static double copytime=0.0;
-    my_times time1,time2,time3;
+  my_times time1,time2,time3;
   int n;
   MPI_Request request;
-  MPI_Status status;
   AtomVec *avec = atom->avec;
-  double **x = atom->x;
   cuda->shared_data.domain.xy=domain->xy;
   cuda->shared_data.domain.xz=domain->xz;
   cuda->shared_data.domain.yz=domain->yz;
@@ old 441 / new 424 @@
 
         int size_forward_recv_now=0;
 
-        if((sizeof(X_FLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
-          size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_FLOAT)/sizeof(double);
+        if((sizeof(X_CFLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
+          size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_CFLOAT)/sizeof(double);
         else
           size_forward_recv_now=size_forward_recv[iswap];
 
@@ old 464 / new 447 @@
 cuda->shared_data.cuda_timings.comm_forward_download+=
       time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
             MPI_Send(buf_send,cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
-            MPI_Wait(&request,&status);
+            MPI_Wait(&request,MPI_STATUS_IGNORE);
         //printf("D: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4);
                 CudaWrapper_UploadCudaDataAsync((void*) buf_recv,cuda->shared_data.comm.buf_recv_dev[iswap], size_forward_recv_now*sizeof(double),2);
 my_gettime(CLOCK_REALTIME,&time1);
@@ old 485 / new 468 @@
       {
  /*       int size_forward_recv_now=0;
 
-        if((sizeof(X_FLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
-          size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_FLOAT)/sizeof(double);
+        if((sizeof(X_CFLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
+          size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_CFLOAT)/sizeof(double);
         else
           size_forward_recv_now=size_forward_recv[iswap];
 
@@ old 498 / new 481 @@
 my_gettime(CLOCK_REALTIME,&time2);
 
             MPI_Send(cuda->shared_data.comm.buf_send[iswap],cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
-            MPI_Wait(&request,&status);
+            MPI_Wait(&request,MPI_STATUS_IGNORE);
 
 my_gettime(CLOCK_REALTIME,&time3);
 cuda->shared_data.cuda_timings.comm_forward_mpi_upper+=
@@ old 520 / new 503 @@
                             buf_send,pbc_flag[iswap],pbc[iswap]);
 
             MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-            MPI_Wait(&request,&status);
+            MPI_Wait(&request,MPI_STATUS_IGNORE);
             avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
       }
 
@@ old 549 / new 532 @@
 
 void CommCuda::forward_comm_unpack_cuda()
 {
-        static int count=0;
-        static double kerneltime=0.0;
-        static double copytime=0.0;
-    my_times time1,time2,time3;
   int n;
   MPI_Request request;
-  MPI_Status status;
   AtomVec *avec = atom->avec;
-  double **x = atom->x;
 
   cuda->shared_data.domain.xy=domain->xy;
   cuda->shared_data.domain.xz=domain->xz;
@@ old 599 / new 576 @@
                             buf_send,pbc_flag[iswap],pbc[iswap]);
 
             MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-            MPI_Wait(&request,&status);
+            MPI_Wait(&request,MPI_STATUS_IGNORE);
             avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
       }
 
@@ old 630 / new 607 @@
 {
   if(not cuda->shared_data.pair.cudable_force)
   {
-          return Comm::forward_comm_pair(pair);
+          return CommBrick::forward_comm_pair(pair);
   }
 
   int iswap,n;
   double *buf;
   MPI_Request request;
-  MPI_Status status;
+
+  int nsize = pair->comm_forward;
 
   for (iswap = 0; iswap < nswap; iswap++) {
 
     // pack buffer
 
-    n = pair->pack_comm(sendnum[iswap],&iswap,
-                        buf_send,pbc_flag[iswap],pbc[iswap]);
-        int nrecv = recvnum[iswap]*n;
+    n = pair->pack_forward_comm(sendnum[iswap],&iswap,
+                                buf_send,pbc_flag[iswap],pbc[iswap]);
+        int nrecv = recvnum[iswap]*nsize;
         if(nrecv<0) nrecv=-(nrecv+1)/2;
-        int nsend = sendnum[iswap]*n;
+        int nsend = n;
         if(nsend<0) nsend=-(nsend+1)/2;
 
     // exchange with another proc
@@ old 656 / new 634 @@
       MPI_Irecv(buf_recv,nrecv,MPI_DOUBLE,recvproc[iswap],0,
                 world,&request);
       MPI_Send(buf_send,nsend,MPI_DOUBLE,sendproc[iswap],0,world);
-      MPI_Wait(&request,&status);
+      MPI_Wait(&request,MPI_STATUS_IGNORE);
       buf = buf_recv;
     } else buf = buf_send;
 
     // unpack buffer
 
-    pair->unpack_comm(recvnum[iswap],firstrecv[iswap],buf);
+    pair->unpack_forward_comm(recvnum[iswap],firstrecv[iswap],buf);
   }
 }
 
@@ old 675 / new 653 @@
 {
   int n;
   MPI_Request request;
-  MPI_Status status;
   AtomVec *avec = atom->avec;
-  double **f = atom->f;
   double *buf;
 
   if(not comm_f_only && not avec->cudable) cuda->downloadAll();  //not yet implemented in CUDA but only needed for non standard atom styles
@@ old 691 / new 667 @@
       if (comm_f_only) {
 
     int size_recv_now=size_reverse_recv[iswap];
-        if((sizeof(F_FLOAT)!=sizeof(double))&& size_reverse_recv[iswap])
-          size_recv_now=(size_recv_now+1)*sizeof(F_FLOAT)/sizeof(double);
+        if((sizeof(F_CFLOAT)!=sizeof(double))&& size_reverse_recv[iswap])
+          size_recv_now=(size_recv_now+1)*sizeof(F_CFLOAT)/sizeof(double);
         MPI_Irecv(buf_recv,size_recv_now,MPI_DOUBLE,
                   sendproc[iswap],0,world,&request);
 
@@ old 703 / new 679 @@
     }
     else buf=NULL;
     int size_reverse_send_now=size_reverse_send[iswap];
-        if((sizeof(F_FLOAT)!=sizeof(double))&& size_reverse_send[iswap])
-          size_reverse_send_now=(size_reverse_send_now+1)*sizeof(F_FLOAT)/sizeof(double);
+        if((sizeof(F_CFLOAT)!=sizeof(double))&& size_reverse_send[iswap])
+          size_reverse_send_now=(size_reverse_send_now+1)*sizeof(F_CFLOAT)/sizeof(double);
         MPI_Send(buf,size_reverse_send_now,MPI_DOUBLE,
                  recvproc[iswap],0,world);
-        MPI_Wait(&request,&status);
+        MPI_Wait(&request,MPI_STATUS_IGNORE);
         Cuda_CommCuda_UnpackReverse(&cuda->shared_data,sendnum[iswap],iswap,buf_recv);
 
       } else {
@@ old 715 / new 691 @@
                   sendproc[iswap],0,world,&request);
         n = avec->pack_reverse(recvnum[iswap],firstrecv[iswap],buf_send);
         MPI_Send(buf_send,n,MPI_DOUBLE,recvproc[iswap],0,world);
-        MPI_Wait(&request,&status);
+        MPI_Wait(&request,MPI_STATUS_IGNORE);
 
       avec->unpack_reverse(sendnum[iswap],sendlist[iswap],buf_recv);
       }
@@ old 753 / new 729 @@
 
   if(not cuda->oncpu) cuda->downloadAll();
 
-  Comm::exchange();
+  CommBrick::exchange();
 }
 
 
 void CommCuda::exchange_cuda()
 {
-  int i,m,nsend,nrecv,nrecv1,nrecv2,nlocal;
-  double lo,hi,value;
-  double **x;
-  double *sublo,*subhi,*buf;
+  int nsend,nrecv,nrecv1,nrecv2,nlocal;
+  double *buf;
   MPI_Request request;
-  MPI_Status status;
   AtomVec *avec = atom->avec;
-    my_times time1,time2,time3;
+  my_times time1,time2;
 
   // clear global->local map for owned and ghost atoms
   // b/c atoms migrate to new procs in exchange() and
@@ old 778 / new 751 @@
 
   if (map_style) atom->map_clear();
 
-  // subbox bounds for orthogonal or triclinic
-
-  if (triclinic == 0) {
-    sublo = domain->sublo;
-    subhi = domain->subhi;
-  } else {
-    sublo = domain->sublo_lamda;
-    subhi = domain->subhi_lamda;
-  }
-
   // loop over dimensions
 
   for (int dim = 0; dim < 3; dim++) {
     // fill buffer with atoms leaving my box, using < and >=
     // when atom is deleted, fill it in with last atom
 
-          cuda->shared_data.exchange_dim=dim;
+    cuda->shared_data.exchange_dim=dim;
 
     nlocal = atom->nlocal;
     avec->maxsend=&maxsend;
@@ old 817 / new 780 @@
 
     } else {
       MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][0],0,
-                   &nrecv1,1,MPI_INT,procneigh[dim][1],0,world,&status);
+                   &nrecv1,1,MPI_INT,procneigh[dim][1],0,world,MPI_STATUS_IGNORE);
       nrecv = nrecv1;
       if (procgrid[dim] > 2) {
         MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][1],0,
-                     &nrecv2,1,MPI_INT,procneigh[dim][0],0,world,&status);
+                     &nrecv2,1,MPI_INT,procneigh[dim][0],0,world,MPI_STATUS_IGNORE);
         nrecv += nrecv2;
       }
       if (nrecv+1 > maxrecv) grow_recv(nrecv+1);
@@ old 829 / new 792 @@
       MPI_Irecv(buf_recv,nrecv1,MPI_DOUBLE,procneigh[dim][1],0,
                 world,&request);
       MPI_Send(buf_send,nsend,MPI_DOUBLE,procneigh[dim][0],0,world);
-      MPI_Wait(&request,&status);
+      MPI_Wait(&request,MPI_STATUS_IGNORE);
 
       if (procgrid[dim] > 2) {
         MPI_Irecv(&buf_recv[nrecv1],nrecv2,MPI_DOUBLE,procneigh[dim][0],0,
                   world,&request);
         MPI_Send(buf_send,nsend,MPI_DOUBLE,procneigh[dim][1],0,world);
-        MPI_Wait(&request,&status);
+        MPI_Wait(&request,MPI_STATUS_IGNORE);
 
             if((nrecv1==0)||(nrecv2==0)) buf_recv[nrecv]=0;
       }
@@ old 887 / new 850 @@
            return;
   }
 
-  Comm::borders();
+  CommBrick::borders();
 
   cuda->setSystemParams();
   if(cuda->finished_setup) {cuda->checkResize(); cuda->uploadAll();}
@@ old 897 / new 860 @@
 
 void CommCuda::borders_cuda()
 {
-  int i,n,itype,iswap,dim,ineed,twoneed,smax,rmax;
-  int nsend,nrecv,nfirst,nlast,ngroup;
-  double lo,hi;
-  int *type;
-  double **x;
-  double *buf,*mlo,*mhi;
+  int n,iswap,dim,ineed,twoneed,smax,rmax;
+  int nsend,nrecv,nfirst,nlast;
+  double *buf;
   MPI_Request request;
-  MPI_Status status;
   AtomVec *avec = atom->avec;
-    my_times time1,time2,time3;
+  my_times time1,time2;
 
   // clear old ghosts
 
@@ old 929 / new 888 @@
       //   for later swaps in a dim, only check newly arrived ghosts
       // store sent atom indices in list for use in future timesteps
 
-      x = atom->x;
-      if (style == SINGLE) {
-        lo = slablo[iswap];
-        hi = slabhi[iswap];
-      } else {
-        type = atom->type;
-        mlo = multilo[iswap];
-        mhi = multihi[iswap];
-      }
       if (ineed % 2 == 0) {
         nfirst = nlast;
         nlast = atom->nlocal + atom->nghost;
@@ old 973 / new 923 @@
 my_gettime(CLOCK_REALTIME,&time1);
       if (sendproc[iswap] != me) {
         MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0,
-                     &nrecv,1,MPI_INT,recvproc[iswap],0,world,&status);
+                     &nrecv,1,MPI_INT,recvproc[iswap],0,world,MPI_STATUS_IGNORE);
         if (nrecv*size_border > maxrecv)
           grow_recv(nrecv*size_border);
         MPI_Irecv(buf_recv,nrecv*size_border,MPI_DOUBLE,
                   recvproc[iswap],0,world,&request);
         MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-        MPI_Wait(&request,&status);
+        MPI_Wait(&request,MPI_STATUS_IGNORE);
         buf = buf_recv;
       } else {
         nrecv = nsend;
@@ old 1032 / new 982 @@
 
 void CommCuda::borders_cuda_overlap_forward_comm()
 {
-  int i,n,itype,iswap,dim,ineed,twoneed,smax,rmax;
-  int nsend,nrecv,nfirst,nlast,ngroup;
-  double lo,hi;
-  int *type;
-  double **x;
-  double *buf,*mlo,*mhi;
+  int n,iswap,dim,ineed,twoneed,smax,rmax;
+  int nsend,nrecv,nfirst,nlast;
+  double *buf;
   MPI_Request request;
-  MPI_Status status;
   AtomVec *avec = atom->avec;
-    my_times time1,time2,time3;
+  my_times time1,time2;
 
   // clear old ghosts
 
@@ old 1064 / new 1010 @@
       //   for later swaps in a dim, only check newly arrived ghosts
       // store sent atom indices in list for use in future timesteps
 
-      x = atom->x;
-      if (style == SINGLE) {
-        lo = slablo[iswap];
-        hi = slabhi[iswap];
-      } else {
-        type = atom->type;
-        mlo = multilo[iswap];
-        mhi = multihi[iswap];
-      }
       if (ineed % 2 == 0) {
         nfirst = nlast;
         nlast = atom->nlocal + atom->nghost;
@@ old 1109 / new 1046 @@
 my_gettime(CLOCK_REALTIME,&time1);
       if (sendproc[iswap] != me) {
         MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0,
-                     &nrecv,1,MPI_INT,recvproc[iswap],0,world,&status);
+                     &nrecv,1,MPI_INT,recvproc[iswap],0,world,MPI_STATUS_IGNORE);
         if (nrecv*size_border > maxrecv)
           grow_recv(nrecv*size_border);
         MPI_Irecv(buf_recv,nrecv*size_border,MPI_DOUBLE,
                   recvproc[iswap],0,world,&request);
         MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-        MPI_Wait(&request,&status);
+        MPI_Wait(&request,MPI_STATUS_IGNORE);
         buf = buf_recv;
       } else {
         nrecv = nsend;
@@ old 1169 / new 1106 @@
 
 
 
-void CommCuda::forward_comm_fix(Fix *fix)
+void CommCuda::forward_comm_fix(Fix *fix, int size)
 {
   int iswap,n;
   double *buf;
   MPI_Request request;
-  MPI_Status status;
+
+  int nsize = fix->comm_forward;
 
   for (iswap = 0; iswap < nswap; iswap++) {
     // pack buffer
@@ old 1184 / new 1122 @@
         if(sendproc[iswap] == me) {swap=-iswap-1; buf=(double*)&(firstrecv[iswap]);}
         else buf=buf_send;
 
-        n = fix->pack_comm(sendnum[iswap],&swap,
-                        buf,pbc_flag[iswap],pbc[iswap]);
+        n = fix->pack_forward_comm(sendnum[iswap],&swap,
+                                   buf,pbc_flag[iswap],pbc[iswap]);
         if(sendproc[iswap] == me)
         {
                 continue;
         }
     }
     else
-    n = fix->pack_comm(sendnum[iswap],sendlist[iswap],
-                       buf_send,pbc_flag[iswap],pbc[iswap]);
+      n = fix->pack_forward_comm(sendnum[iswap],sendlist[iswap],
+                                 buf_send,pbc_flag[iswap],pbc[iswap]);
 
      // exchange with another proc
     // if self, set recv buffer to send buffer
 
     if (sendproc[iswap] != me) {
-      MPI_Irecv(buf_recv,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,
+      MPI_Irecv(buf_recv,nsize*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,
                 world,&request);
-      MPI_Send(buf_send,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
-      MPI_Wait(&request,&status);
+      MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
+      MPI_Wait(&request,MPI_STATUS_IGNORE);
       buf = buf_recv;
     } else buf = buf_send;
 
     // unpack buffer
 
-    fix->unpack_comm(recvnum[iswap],firstrecv[iswap],buf);
+    fix->unpack_forward_comm(recvnum[iswap],firstrecv[iswap],buf);
   }
 }
 
@@ old 1313 / new 1251 @@
 void CommCuda::grow_swap(int n)
 {
   int oldmaxswap=maxswap;
-  Comm::grow_swap(n);
+  CommBrick::grow_swap(n);
   if(n>cu_sendlist->get_dim()[0])
   {
    MYDBG(printf(" # CUDA CommCuda::grow_swap\n");)
@@ old 1357 / new 1295 @@
 
 void CommCuda::allocate_swap(int n)
 {
-   Comm::allocate_swap(n);
+   CommBrick::allocate_swap(n);
 
           delete cu_pbc;
           delete cu_slablo;
@@ old 1367 / new 1305 @@
           if(cu_sendlist)
           {
             cu_pbc=new cCudaData<int, int, xy> ((int*)pbc,n,6);
-            cu_slablo = new cCudaData<double, X_FLOAT,x>(slablo,n);
-            cu_slabhi = new cCudaData<double, X_FLOAT,x>(slabhi,n);
+            cu_slablo = new cCudaData<double, X_CFLOAT,x>(slablo,n);
+            cu_slabhi = new cCudaData<double, X_CFLOAT,x>(slabhi,n);
 
             cuda->shared_data.comm.pbc.dev_data=cu_pbc->dev_data();
             cuda->shared_data.comm.slablo.dev_data=cu_slablo->dev_data();
@@ old 1392 / new 1330 @@
 
 void CommCuda::allocate_multi(int n)
 {
-  Comm::allocate_multi(n);
+  CommBrick::allocate_multi(n);
 
           delete cu_multilo;
           delete cu_multihi;
-          cu_multilo = new cCudaData<double, X_FLOAT,xy>(slablo,n,atom->ntypes+1);
-          cu_multihi = new cCudaData<double, X_FLOAT,xy>(slabhi,n,atom->ntypes+1);
+          cu_multilo = new cCudaData<double, X_CFLOAT,xy>(slablo,n,atom->ntypes+1);
+          cu_multihi = new cCudaData<double, X_CFLOAT,xy>(slabhi,n,atom->ntypes+1);
 
           cuda->shared_data.comm.multilo.dev_data=cu_multilo->dev_data();
           cuda->shared_data.comm.multihi.dev_data=cu_multihi->dev_data();
@@ old 1410 / new 1348 @@
 void CommCuda::free_swap()
 {
 
-  Comm::free_swap();
+  CommBrick::free_swap();
 
   delete cuda->shared_data.comm.nsend_swap; cuda->shared_data.comm.nsend_swap=NULL;
   delete cu_pbc; cu_pbc = NULL;
@@ old 1431 / new 1369 @@
 
 void CommCuda::free_multi()
 {
-  Comm::free_multi();
+  CommBrick::free_multi();
   delete cu_multilo; cu_multilo = NULL;
   delete cu_multihi; cu_multihi = NULL;
 }
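
The expression (n+1)*sizeof(X_CFLOAT)/sizeof(double) that recurs throughout the diff (the "some complicated way to safe some transfer size if single precision is used" comment) converts a count of packed coordinate values into the number of MPI_DOUBLE elements that actually have to travel over MPI when the USER-CUDA package is built in single precision: the buffers are declared and transmitted as double arrays, but the packed X_CFLOAT payload only fills half of each pair of doubles. A minimal sketch of the arithmetic, assuming a single-precision build where X_CFLOAT is float; the helper name doubles_to_send is made up for illustration and does not exist in LAMMPS:

// Sketch only: the single-precision transfer-size trick, not LAMMPS code.
#include <cstdio>

typedef float X_CFLOAT;   // would be double in a double-precision build

// n X_CFLOAT values were packed into a buffer that is allocated and sent
// as double[]; return how many MPI_DOUBLE elements cover those bytes.
static int doubles_to_send(int n)
{
  if (sizeof(X_CFLOAT) != sizeof(double) && n)
    return (int)((n + 1) * sizeof(X_CFLOAT) / sizeof(double));  // rounds up to (n+1)/2
  return n;                                                     // one double per value
}

int main()
{
  std::printf("%d\n", doubles_to_send(7));  // 7 packed floats travel in 4 doubles
  return 0;
}

The (n+1) rounds an odd count up, so the last float is never cut off when the byte count is expressed in whole doubles.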
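
The per-swap exchanges above all follow the same shape: post a non-blocking MPI_Irecv, issue a blocking MPI_Send, then MPI_Wait on the receive; since none of the callers ever inspect the status fields, the updated code passes MPI_STATUS_IGNORE instead of an MPI_Status object. A self-contained sketch of that pattern (a simple ring exchange for illustration, not code taken from comm_cuda.cpp):

// Sketch only: Irecv / Send / Wait with MPI_STATUS_IGNORE.
#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  int me, nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  double buf_send[4], buf_recv[4];
  for (int i = 0; i < 4; ++i) buf_send[i] = me + 0.1 * i;

  int next = (me + 1) % nprocs;            // ring neighbours
  int prev = (me - 1 + nprocs) % nprocs;

  MPI_Request request;
  // post the receive before the blocking send, as the code above does
  MPI_Irecv(buf_recv, 4, MPI_DOUBLE, prev, 0, MPI_COMM_WORLD, &request);
  MPI_Send(buf_send, 4, MPI_DOUBLE, next, 0, MPI_COMM_WORLD);
  MPI_Wait(&request, MPI_STATUS_IGNORE);   // status is never read, so skip it

  std::printf("rank %d received %.1f from rank %d\n", me, buf_recv[0], prev);
  MPI_Finalize();
  return 0;
}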