1
/* ----------------------------------------------------------------------
2
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
3
http://lammps.sandia.gov, Sandia National Laboratories
4
Steve Plimpton, sjplimp@sandia.gov
6
This software is distributed under the GNU General Public License.
8
See the README file in the top-level LAMMPS directory.
9
------------------------------------------------------------------------- */
11
/* ----------------------------------------------------------------------
12
Contributing author: W. Michael Brown (Intel)
13
------------------------------------------------------------------------- */
16
#include "pair_lj_cut_intel.h"
23
#include "neigh_list.h"
24
#include "neigh_request.h"
27
using namespace LAMMPS_NS;
29
#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
30
#define FC_PACKED2_T typename ForceConst<flt_t>::fc_packed2
32
/* ---------------------------------------------------------------------- */
34
PairLJCutIntel::PairLJCutIntel(LAMMPS *lmp) :
37
suffix_flag |= Suffix::INTEL;
42
/* ---------------------------------------------------------------------- */
44
void PairLJCutIntel::compute(int eflag, int vflag)
46
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
47
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
49
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
50
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
53
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
60
template <class flt_t, class acc_t>
61
void PairLJCutIntel::compute(int eflag, int vflag,
62
IntelBuffers<flt_t,acc_t> *buffers,
63
const ForceConst<flt_t> &fc)
66
ev_setup(eflag, vflag);
67
} else evflag = vflag_fdotr = 0;
69
const int inum = list->inum;
70
const int nthreads = comm->nthreads;
71
const int host_start = fix->host_start_pair();
72
const int offload_end = fix->offload_end_pair();
73
const int ago = neighbor->ago;
75
if (ago != 0 && fix->separate_buffers() == 0) {
76
fix->start_watch(TIME_PACK);
79
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
83
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
84
nthreads, sizeof(ATOM_T));
85
buffers->thr_pack(ifrom,ito,ago);
88
fix->stop_watch(TIME_PACK);
91
if (evflag || vflag_fdotr) {
93
if (vflag_fdotr) ovflag = 2;
94
else if (vflag) ovflag = 1;
96
if (force->newton_pair) {
97
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
98
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
100
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
101
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
104
if (force->newton_pair) {
105
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
106
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
108
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
109
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
113
if (force->newton_pair) {
114
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
115
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
117
eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
118
eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
123
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
124
void PairLJCutIntel::eval(const int offload, const int vflag,
125
IntelBuffers<flt_t,acc_t> *buffers,
126
const ForceConst<flt_t> &fc,
127
const int astart, const int aend)
129
const int inum = aend - astart;
130
if (inum == 0) return;
131
int nlocal, nall, minlocal;
132
fix->get_buffern(offload, nlocal, nall, minlocal);
134
const int ago = neighbor->ago;
135
IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
137
ATOM_T * _noalias const x = buffers->get_x(offload);
139
const int * _noalias const numneigh = list->numneigh;
140
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
141
const int * _noalias const firstneigh = buffers->firstneigh(list);
142
const flt_t * _noalias const special_lj = fc.special_lj;
143
const FC_PACKED1_T * _noalias const ljc12o = fc.ljc12o[0];
144
const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0];
146
const int ntypes = atom->ntypes + 1;
147
const int eatom = this->eflag_atom;
149
// Determine how much data to transfer
150
int x_size, q_size, f_stride, ev_size, separate_flag;
151
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
152
buffers, offload, fix, separate_flag,
153
x_size, q_size, ev_size, f_stride);
156
FORCE_T * _noalias f_start;
157
acc_t * _noalias ev_global;
158
IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
159
const int nthreads = tc;
160
int *overflow = fix->get_off_overflow_flag();
163
*timer_compute = MIC_Wtime();
166
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
169
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
172
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
175
// loop over neighbors of my atoms
177
#pragma omp parallel default(none) \
178
shared(f_start,f_stride,nlocal,nall,minlocal) \
179
reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
182
int iifrom, iito, tid;
183
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
187
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
188
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
190
for (int i = iifrom; i < iito; ++i) {
191
const int itype = x[i].w;
193
const int ptr_off = itype * ntypes;
194
const FC_PACKED1_T * _noalias const ljc12oi = ljc12o + ptr_off;
195
const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
197
const int * _noalias const jlist = firstneigh + cnumneigh[i];
198
const int jnum = numneigh[i];
200
acc_t fxtmp, fytmp, fztmp, fwtmp;
201
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
203
const flt_t xtmp = x[i].x;
204
const flt_t ytmp = x[i].y;
205
const flt_t ztmp = x[i].z;
206
fxtmp = fytmp = fztmp = (acc_t)0;
208
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
209
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
212
#if defined(__INTEL_COMPILER)
213
#pragma vector aligned
214
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
215
sv0, sv1, sv2, sv3, sv4, sv5)
217
for (int jj = 0; jj < jnum; jj++) {
218
flt_t forcelj, evdwl;
219
forcelj = evdwl = (flt_t)0.0;
221
const int sbindex = jlist[jj] >> SBBITS & 3;
222
const int j = jlist[jj] & NEIGHMASK;
223
const flt_t delx = xtmp - x[j].x;
224
const flt_t dely = ytmp - x[j].y;
225
const flt_t delz = ztmp - x[j].z;
226
const int jtype = x[j].w;
227
const flt_t rsq = delx * delx + dely * dely + delz * delz;
230
if (rsq < ljc12oi[jtype].cutsq) {
232
flt_t factor_lj = special_lj[sbindex];
233
flt_t r2inv = 1.0 / rsq;
234
flt_t r6inv = r2inv * r2inv * r2inv;
236
if (rsq > ljc12oi[jtype].cutsq) r6inv = (flt_t)0.0;
238
forcelj = r6inv * (ljc12oi[jtype].lj1 * r6inv - ljc12oi[jtype].lj2);
239
flt_t fpair = factor_lj * forcelj * r2inv;
241
fxtmp += delx * fpair;
242
fytmp += dely * fpair;
243
fztmp += delz * fpair;
244
if (NEWTON_PAIR || j < nlocal) {
245
f[j].x -= delx * fpair;
246
f[j].y -= dely * fpair;
247
f[j].z -= delz * fpair;
251
flt_t ev_pre = (flt_t)0;
252
if (NEWTON_PAIR || i<nlocal)
253
ev_pre += (flt_t)0.5;
254
if (NEWTON_PAIR || j<nlocal)
255
ev_pre += (flt_t)0.5;
258
evdwl = r6inv * (lj34i[jtype].lj3 * r6inv-lj34i[jtype].lj4) -
259
ljc12oi[jtype].offset;
261
sevdwl += ev_pre*evdwl;
263
if (NEWTON_PAIR || i < nlocal)
264
fwtmp += 0.5 * evdwl;
265
if (NEWTON_PAIR || j < nlocal)
266
f[j].w += 0.5 * evdwl;
270
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
281
IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
287
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
288
nlocal, minlocal, nthreads, f_start, f_stride,
293
ev_global[0] = oevdwl;
294
ev_global[1] = (acc_t)0.0;
306
*timer_compute = MIC_Wtime() - *timer_compute;
311
fix->stop_watch(TIME_OFFLOAD_LATENCY);
313
fix->stop_watch(TIME_HOST_PAIR);
316
fix->add_result_array(f_start, ev_global, offload, eatom);
318
fix->add_result_array(f_start, 0, offload);
321
/* ---------------------------------------------------------------------- */
323
void PairLJCutIntel::init_style()
325
PairLJCut::init_style();
326
neighbor->requests[neighbor->nrequest-1]->intel = 1;
328
int ifix = modify->find_fix("package_intel");
331
"The 'package intel' command is required for /intel styles");
332
fix = static_cast<FixIntel *>(modify->fix[ifix]);
334
fix->pair_init_check();
335
#ifdef _LMP_INTEL_OFFLOAD
336
if (fix->offload_balance() != 0.0)
338
"Offload for lj/cut/intel is not yet available. Set balance to 0.");
341
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
342
pack_force_const(force_const_single, fix->get_mixed_buffers());
343
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
344
pack_force_const(force_const_double, fix->get_double_buffers());
346
pack_force_const(force_const_single, fix->get_single_buffers());
349
/* ---------------------------------------------------------------------- */
351
template <class flt_t, class acc_t>
352
void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc,
353
IntelBuffers<flt_t,acc_t> *buffers)
355
int tp1 = atom->ntypes + 1;
356
fc.set_ntypes(tp1,memory,_cop);
357
buffers->set_ntypes(tp1);
358
flt_t **cutneighsq = buffers->get_cutneighsq();
360
// Repeat cutsq calculation because done after call to init_style
361
double cut, cutneigh;
362
for (int i = 1; i <= atom->ntypes; i++) {
363
for (int j = i; j <= atom->ntypes; j++) {
364
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
366
cutneigh = cut + neighbor->skin;
367
cutsq[i][j] = cutsq[j][i] = cut*cut;
368
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
373
for (int i = 0; i < 4; i++) {
374
fc.special_lj[i] = force->special_lj[i];
375
fc.special_lj[0] = 1.0;
378
for (int i = 0; i < tp1; i++) {
379
for (int j = 0; j < tp1; j++) {
380
fc.ljc12o[i][j].lj1 = lj1[i][j];
381
fc.ljc12o[i][j].lj2 = lj2[i][j];
382
fc.lj34[i][j].lj3 = lj3[i][j];
383
fc.lj34[i][j].lj4 = lj4[i][j];
384
fc.ljc12o[i][j].cutsq = cutsq[i][j];
385
fc.ljc12o[i][j].offset = offset[i][j];
390
/* ---------------------------------------------------------------------- */
392
template <class flt_t>
393
void PairLJCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
396
if (ntypes != _ntypes) {
398
fc_packed1 *oljc12o = ljc12o[0];
399
fc_packed2 *olj34 = lj34[0];
401
_memory->destroy(oljc12o);
402
_memory->destroy(olj34);
406
memory->create(ljc12o,ntypes,ntypes,"fc.c12o");
407
memory->create(lj34,ntypes,ntypes,"fc.lj34");