~ubuntu-branches/debian/sid/lammps/sid

« back to all changes in this revision

Viewing changes to lib/cuda/neighbor.cu

  • Committer: Package Import Robot
  • Author(s): Anton Gladky
  • Date: 2015-04-29 23:44:49 UTC
  • mfrom: (5.1.3 experimental)
  • Revision ID: package-import@ubuntu.com-20150429234449-mbhy9utku6hp6oq8
Tags: 0~20150313.gitfa668e1-1
Upload into unstable.

Show diffs side-by-side

added added

removed removed

Lines of Context:
38
38
#define _nex_group      MY_AP(nex_group)
39
39
#define _ex_mol_bit     MY_AP(ex_mol_bit)
40
40
#define _nex_mol        MY_AP(nex_mol)
41
 
__device__ __constant__ CUDA_FLOAT* _cutneighsq;
 
41
__device__ __constant__ CUDA_CFLOAT* _cutneighsq;
42
42
__device__ __constant__ int* _ex_type;
43
43
__device__ __constant__ int _nex_type;
44
44
__device__ __constant__ int* _ex1_bit;
54
54
{
55
55
  CUT_CHECK_ERROR("Cuda_PairLJCutCuda: before updateBuffer failed");
56
56
 
57
 
  int size = (unsigned)(sizeof(int) * 20 + sneighlist->bin_dim[0] * sneighlist->bin_dim[1] * sneighlist->bin_dim[2] * (sizeof(int) + sneighlist->bin_nmax * 3 * sizeof(CUDA_FLOAT)));
 
57
  int size = (unsigned)(sizeof(int) * 20 + sneighlist->bin_dim[0] * sneighlist->bin_dim[1] * sneighlist->bin_dim[2] * (sizeof(int) + sneighlist->bin_nmax * 3 * sizeof(CUDA_CFLOAT)));
58
58
 
59
59
  if(sdata->buffersize < size) {
60
60
    MYDBG(printf("Cuda_Neighbor Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
77
77
    Cuda_Neighbor_UpdateBuffer(sdata, sneighlist);
78
78
 
79
79
  // initialize only on first call
80
 
  CUDA_FLOAT rez_bin_size[3] = {
 
80
  CUDA_CFLOAT rez_bin_size[3] = {
81
81
    (1.0 * sneighlist->bin_dim[0] - 4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]),
82
82
    (1.0 * sneighlist->bin_dim[1] - 4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]),
83
83
    (1.0 * sneighlist->bin_dim[2] - 4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2])
87
87
 
88
88
  if(! init) {
89
89
    init = 0;
90
 
    cudaMemcpyToSymbol(MY_AP(x)              , & sdata->atom.x         .dev_data, sizeof(X_FLOAT*));
 
90
    cudaMemcpyToSymbol(MY_AP(x)              , & sdata->atom.x         .dev_data, sizeof(X_CFLOAT*));
91
91
    cudaMemcpyToSymbol(MY_AP(nall)         , & sdata->atom.nall                    , sizeof(unsigned));
92
92
    cudaMemcpyToSymbol(MY_AP(nmax)           , & sdata->atom.nmax                    , sizeof(unsigned));
93
 
    cudaMemcpyToSymbol(MY_AP(sublo)          ,   sdata->domain.sublo                 , sizeof(X_FLOAT) * 3);
 
93
    cudaMemcpyToSymbol(MY_AP(sublo)          ,   sdata->domain.sublo                 , sizeof(X_CFLOAT) * 3);
94
94
  }
95
95
 
96
96
 
101
101
  my_times starttime, endtime;
102
102
  my_gettime(CLOCK_REALTIME, &starttime);
103
103
 
104
 
  cudaMemset((int*)(sdata->buffer), 0, sizeof(int) * (20 + (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2])) + 3 * sizeof(CUDA_FLOAT) * (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2]) * (sneighlist->bin_nmax));
 
104
  cudaMemset((int*)(sdata->buffer), 0, sizeof(int) * (20 + (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2])) + 3 * sizeof(CUDA_CFLOAT) * (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2]) * (sneighlist->bin_nmax));
105
105
 
106
106
  Binning_Kernel <<< grid, threads>>> (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], sneighlist->bin_dim[2], rez_bin_size[0], rez_bin_size[1], rez_bin_size[2]);
107
107
  cudaThreadSynchronize();
126
126
int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
127
127
{
128
128
  //Cuda_Neighbor_UpdateBuffer(sdata,sneighlist);
129
 
  CUDA_FLOAT globcutoff = -1.0;
 
129
  CUDA_CFLOAT globcutoff = -1.0;
130
130
 
131
131
  short init = 0;
132
132
 
137
137
 
138
138
    unsigned cuda_ntypes = sdata->atom.ntypes + 1;
139
139
 
140
 
    unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes;
 
140
    unsigned nx = sizeof(CUDA_CFLOAT) * cuda_ntypes * cuda_ntypes;
141
141
 
142
 
    CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx);
 
142
    CUDA_CFLOAT* acutneighsq = (CUDA_CFLOAT*) malloc(nx);
143
143
    //printf("Allocate: %i\n",nx);
144
 
    sneighlist->cu_cutneighsq = (CUDA_FLOAT*) CudaWrapper_AllocCudaData(nx);
 
144
    sneighlist->cu_cutneighsq = (CUDA_CFLOAT*) CudaWrapper_AllocCudaData(nx);
145
145
 
146
146
    if(sneighlist->cutneighsq) {
147
147
      int cutoffsdiffer = 0;
149
149
 
150
150
      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
151
151
        for(int j = 1; j <= sdata->atom.ntypes; ++j) {
152
 
          acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT)(sneighlist->cutneighsq[i][j]);
 
152
          acutneighsq[i * cuda_ntypes + j] = (CUDA_CFLOAT)(sneighlist->cutneighsq[i][j]);
153
153
 
154
154
          if((sneighlist->cutneighsq[i][j] - cutoff0) * (sneighlist->cutneighsq[i][j] - cutoff0) > 1e-6) cutoffsdiffer++;
155
155
        }
156
156
      }
157
157
 
158
 
      if(not cutoffsdiffer) globcutoff = (CUDA_FLOAT) cutoff0;
 
158
      if(not cutoffsdiffer) globcutoff = (CUDA_CFLOAT) cutoff0;
159
159
    } else {
160
160
      MYEMUDBG(printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n");)
161
161
      return 0;
173
173
    }
174
174
 
175
175
    CudaWrapper_UploadCudaData(acutneighsq, sneighlist->cu_cutneighsq, nx);
176
 
    cudaMemcpyToSymbol(MY_AP(cutneighsq)       , &sneighlist->cu_cutneighsq       , sizeof(CUDA_FLOAT*));
 
176
    cudaMemcpyToSymbol(MY_AP(cutneighsq)       , &sneighlist->cu_cutneighsq       , sizeof(CUDA_CFLOAT*));
177
177
 
178
178
    cudaMemcpyToSymbol(MY_AP(cuda_ntypes)      , & cuda_ntypes                    , sizeof(unsigned));
179
179
    cudaMemcpyToSymbol(MY_AP(special_flag)     , sdata->atom.special_flag         , 4 * sizeof(int));
218
218
  dim3 threads(MIN(128, sneighlist->bin_nmax), 1, 1);
219
219
  dim3 grid(sneighlist->bin_dim[0]*sneighlist->bin_dim[1], sneighlist->bin_dim[2], 1);
220
220
 
221
 
  //printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_FLOAT))*threads.x,sneighlist->bin_nmax);
 
221
  //printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_CFLOAT))*threads.x,sneighlist->bin_nmax);
222
222
  int buffer[20];
223
223
  buffer[0] = 1;
224
224
  buffer[1] = 0;
225
225
  CudaWrapper_UploadCudaData(buffer, sdata->buffer, 2 * sizeof(int));
226
226
  CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel error");
227
227
  //cudaMemset(sdata->debugdata,0,100*sizeof(int));
228
 
  unsigned int shared_size = (sizeof(int) + 3 * sizeof(CUDA_FLOAT)) * threads.x;
 
228
  unsigned int shared_size = (sizeof(int) + 3 * sizeof(CUDA_CFLOAT)) * threads.x;
229
229
  MYDBG(printf("Configuration: %i %i %i %u %i\n", grid.x, grid.y, threads.x, shared_size, sneighlist->bin_nmax);)
230
230
  //shared_size=2056;
231
231
  my_times starttime, endtime;
245
245
        NeighborBuildFullBin_Kernel<0> <<< grid, threads, shared_size>>>
246
246
        (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom, sdata->pair.neighall);
247
247
    }
248
 
    //NeighborBuildFullBin_Kernel_Restrict<<<grid,threads,(2*sizeof(int)+3*sizeof(X_FLOAT))*threads.x+sizeof(int)>>>
 
248
    //NeighborBuildFullBin_Kernel_Restrict<<<grid,threads,(2*sizeof(int)+3*sizeof(X_CFLOAT))*threads.x+sizeof(int)>>>
249
249
    //  (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff);
250
250
 
251
251
    cudaThreadSynchronize();
301
301
             "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 "
302
302
             "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);
303
303
 
304
 
    unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes;
305
 
    CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx);
 
304
    unsigned nx = sizeof(CUDA_CFLOAT) * cuda_ntypes * cuda_ntypes;
 
305
    CUDA_CFLOAT* acutneighsq = (CUDA_CFLOAT*) malloc(nx);
306
306
 
307
307
    if(sneighlist->cutneighsq) {
308
308
      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
309
309
        for(int j = 1; j <= sdata->atom.ntypes; ++j) {
310
 
          acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT)(sneighlist->cutneighsq[i][j]);
 
310
          acutneighsq[i * cuda_ntypes + j] = (CUDA_CFLOAT)(sneighlist->cutneighsq[i][j]);
311
311
          //printf("CUTOFFS: %i %i %i %e\n",i,j,cuda_ntypes,acutneighsq[i * cuda_ntypes + j]);
312
312
        }
313
313
      }
339
339
    cudaMemcpyToSymbol(MY_AP(nmax)             , & sdata->atom.nmax               , sizeof(int));
340
340
    cudaMemcpyToSymbol(MY_AP(numneigh)         , & sneighlist->numneigh  .dev_data, sizeof(int*));
341
341
    cudaMemcpyToSymbol(MY_AP(type)             , & sdata->atom.type      .dev_data, sizeof(int*));
342
 
    cudaMemcpyToSymbol(MY_AP(x)                , & sdata->atom.x         .dev_data, sizeof(X_FLOAT*));
 
342
    cudaMemcpyToSymbol(MY_AP(x)                , & sdata->atom.x         .dev_data, sizeof(X_CFLOAT*));
343
343
    cudaMemcpyToSymbol(MY_AP(maxneighbors)     , & sneighlist->maxneighbors      , sizeof(int));
344
344
 
345
345
    free(acutneighsq);