1
/* ----------------------------------------------------------------------
2
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
3
http://lammps.sandia.gov, Sandia National Laboratories
4
Steve Plimpton, sjplimp@sandia.gov
6
This software is distributed under the GNU General Public License.
8
See the README file in the top-level LAMMPS directory.
9
------------------------------------------------------------------------- */
11
/* ----------------------------------------------------------------------
12
Contributing author: W. Michael Brown (Intel)
13
------------------------------------------------------------------------- */
16
#include "pair_gayberne_intel.h"
17
#include "math_extra_intel.h"
20
#include "atom_vec_ellipsoid.h"
25
#include "neigh_list.h"
26
#include "neigh_request.h"
29
using namespace LAMMPS_NS;
31
#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
32
#define FC_PACKED2_T typename ForceConst<flt_t>::fc_packed2
33
#define FC_PACKED3_T typename ForceConst<flt_t>::fc_packed3
35
/* ---------------------------------------------------------------------- */
37
PairGayBerneIntel::PairGayBerneIntel(LAMMPS *lmp) :
40
suffix_flag |= Suffix::INTEL;
44
/* ---------------------------------------------------------------------- */
46
void PairGayBerneIntel::compute(int eflag, int vflag)
48
if (fix->precision()==FixIntel::PREC_MODE_MIXED)
49
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
51
else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
52
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
55
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
62
template <class flt_t, class acc_t>
63
void PairGayBerneIntel::compute(int eflag, int vflag,
64
IntelBuffers<flt_t,acc_t> *buffers,
65
const ForceConst<flt_t> &fc)
68
ev_setup(eflag, vflag);
69
} else evflag = vflag_fdotr = 0;
71
const int inum = list->inum;
72
const int nall = atom->nlocal + atom->nghost;
73
const int nthreads = comm->nthreads;
74
const int host_start = fix->host_start_pair();
75
const int offload_end = fix->offload_end_pair();
76
const int ago = neighbor->ago;
78
if (fix->separate_buffers() == 0) {
79
fix->start_watch(TIME_PACK);
80
const AtomVecEllipsoid::Bonus * const bonus = avec->bonus;
81
const int * const ellipsoid = atom->ellipsoid;
82
QUAT_T * _noalias const quat = buffers->get_quat();
84
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
88
IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
90
if (ago != 0) buffers->thr_pack(ifrom,ito,ago);
92
for (int i = ifrom; i < ito; i++) {
93
int qi = ellipsoid[i];
95
quat[i].w = bonus[qi].quat[0];
96
quat[i].i = bonus[qi].quat[1];
97
quat[i].j = bonus[qi].quat[2];
98
quat[i].k = bonus[qi].quat[3];
102
quat[nall].w = (flt_t)1.0;
103
quat[nall].i = (flt_t)0.0;
104
quat[nall].j = (flt_t)0.0;
105
quat[nall].k = (flt_t)0.0;
106
fix->stop_watch(TIME_PACK);
109
if (evflag || vflag_fdotr) {
111
if (vflag_fdotr) ovflag = 2;
112
else if (vflag) ovflag = 1;
114
if (force->newton_pair) {
115
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
116
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
118
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
119
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
122
if (force->newton_pair) {
123
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
124
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
126
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
127
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
131
if (force->newton_pair) {
132
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
133
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
135
eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
136
eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
141
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
142
void PairGayBerneIntel::eval(const int offload, const int vflag,
143
IntelBuffers<flt_t,acc_t> *buffers,
144
const ForceConst<flt_t> &fc,
145
const int astart, const int aend)
147
const int inum = aend - astart;
148
if (inum == 0) return;
149
int nlocal, nall, minlocal;
150
fix->get_buffern(offload, nlocal, nall, minlocal);
152
const int ago = neighbor->ago;
153
ATOM_T * _noalias const x = buffers->get_x(offload);
154
QUAT_T * _noalias const quat = buffers->get_quat(offload);
155
const AtomVecEllipsoid::Bonus *bonus = avec->bonus;
156
const int *ellipsoid = atom->ellipsoid;
158
#ifdef _LMP_INTEL_OFFLOAD
159
if (fix->separate_buffers()) {
160
fix->start_watch(TIME_PACK);
162
#pragma omp parallel default(none) \
163
shared(buffers,nlocal,nall,bonus,ellipsoid)
166
int nthreads = comm->nthreads;
167
IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,
168
nthreads, sizeof(ATOM_T));
169
if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0);
170
for (int i = ifrom; i < ito; i++) {
171
int qi = ellipsoid[i];
173
quat[i].w = bonus[qi].quat[0];
174
quat[i].i = bonus[qi].quat[1];
175
quat[i].j = bonus[qi].quat[2];
176
quat[i].k = bonus[qi].quat[3];
179
int nghost = nall - nlocal;
181
IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,
182
nthreads, sizeof(ATOM_T));
187
offset = fix->offload_min_ghost() - nlocal;
188
buffers->thr_pack_cop(ifrom, ito, offset, ago == 1);
190
for (int i = ifrom; i < ito; i++) {
191
int qi = ellipsoid[i + offset];
193
quat[i].w = bonus[qi].quat[0];
194
quat[i].i = bonus[qi].quat[1];
195
quat[i].j = bonus[qi].quat[2];
196
quat[i].k = bonus[qi].quat[3];
202
if (ago != 0) buffers->thr_pack_host(fix->host_min_local(), nlocal, 0);
203
for (int i = fix->host_min_local(); i < nlocal; i++) {
204
int qi = ellipsoid[i];
206
quat[i].w = bonus[qi].quat[0];
207
quat[i].i = bonus[qi].quat[1];
208
quat[i].j = bonus[qi].quat[2];
209
quat[i].k = bonus[qi].quat[3];
212
int offset = fix->host_min_ghost() - nlocal;
213
if (ago != 0) buffers->thr_pack_host(nlocal, nall, offset);
214
for (int i = nlocal; i < nall; i++) {
215
int qi = ellipsoid[i + offset];
217
quat[i].w = bonus[qi].quat[0];
218
quat[i].i = bonus[qi].quat[1];
219
quat[i].j = bonus[qi].quat[2];
220
quat[i].k = bonus[qi].quat[3];
224
fix->stop_watch(TIME_PACK);
228
// const int * _noalias const ilist = list->ilist;
229
const int * _noalias const numneigh = list->numneigh;
230
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
231
const int * _noalias const firstneigh = buffers->firstneigh(list);
232
const flt_t * _noalias const special_lj = fc.special_lj;
234
const FC_PACKED1_T * _noalias const ijc = fc.ijc[0];
235
const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0];
236
const FC_PACKED3_T * _noalias const ic = fc.ic;
237
const flt_t mu = fc.mu;
238
const flt_t gamma = fc.gamma;
239
const flt_t upsilon = fc.upsilon;
241
flt_t * const rsq_formi = fc.rsq_form[0];
242
flt_t * const delx_formi = fc.delx_form[0];
243
flt_t * const dely_formi = fc.dely_form[0];
244
flt_t * const delz_formi = fc.delz_form[0];
245
int * const jtype_formi = fc.jtype_form[0];
246
int * const jlist_formi = fc.jlist_form[0];
248
const int ntypes = atom->ntypes + 1;
249
const int eatom = this->eflag_atom;
251
// Determine how much data to transfer
252
int x_size, q_size, f_stride, ev_size, separate_flag;
253
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
254
buffers, offload, fix, separate_flag,
255
x_size, q_size, ev_size, f_stride);
258
FORCE_T * _noalias f_start;
259
acc_t * _noalias ev_global;
260
IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
261
const int max_nbors = _max_nbors;
262
const int nthreads = tc;
266
if (INTEL_MIC_NBOR_PAD > 1)
267
pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
269
if (INTEL_NBOR_PAD > 1)
270
pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
272
const int pad_width = pad;
274
#ifdef _LMP_INTEL_OFFLOAD
275
int *overflow = fix->get_off_overflow_flag();
276
double *timer_compute = fix->off_watch_pair();
278
if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
279
#pragma offload target(mic:_cop) if(offload) \
280
in(special_lj:length(0) alloc_if(0) free_if(0)) \
281
in(ijc,lj34,ic:length(0) alloc_if(0) free_if(0)) \
282
in(rsq_formi, delx_formi, dely_formi: length(0) alloc_if(0) free_if(0)) \
283
in(delz_formi, jtype_formi, jlist_formi: length(0) alloc_if(0) free_if(0))\
284
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
285
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
286
in(numneigh:length(0) alloc_if(0) free_if(0)) \
287
in(x:length(x_size) alloc_if(0) free_if(0)) \
288
in(quat:length(nall+1) alloc_if(0) free_if(0)) \
289
in(overflow:length(0) alloc_if(0) free_if(0)) \
290
in(nthreads,inum,nall,ntypes,vflag,eatom,minlocal,separate_flag) \
291
in(astart,nlocal,f_stride,max_nbors,mu,gamma,upsilon,offload,pad_width) \
292
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
293
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
294
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
299
*timer_compute=MIC_Wtime();
302
#ifdef _LMP_INTEL_OFFLOAD
304
if (separate_flag < 3) {
305
int all_local = nlocal;
306
int ghost_min = overflow[LMP_GHOST_MIN];
307
nlocal = overflow[LMP_LOCAL_MAX] + 1;
308
int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;
309
if (nghost < 0) nghost = 0;
310
nall = nlocal + nghost;
313
if (NEWTON_PAIR) flength = nall;
314
else flength = nlocal;
315
IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),
318
if (nlocal < all_local || ghost_min > all_local) {
319
memmove(x + nlocal, x + ghost_min,
320
(nall - nlocal) * sizeof(ATOM_T));
321
memmove(quat + nlocal, quat + ghost_min,
322
(nall - nlocal) * sizeof(QUAT_T));
326
x[nall].x = (flt_t)INTEL_BIGP;
327
x[nall].y = (flt_t)INTEL_BIGP;
328
x[nall].z = (flt_t)INTEL_BIGP;
329
quat[nall].w = (flt_t)1.0;
330
quat[nall].i = (flt_t)0.0;
331
quat[nall].j = (flt_t)0.0;
332
quat[nall].k = (flt_t)0.0;
336
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
339
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
342
// loop over neighbors of my atoms
344
#pragma omp parallel default(none) \
345
shared(f_start,f_stride,nlocal,nall,minlocal) \
346
reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
349
int iifrom, iito, tid;
350
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
354
FORCE_T * _noalias const f = f_start - minlocal * 2 + (tid * f_stride);
355
memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T));
357
flt_t * _noalias const rsq_form = rsq_formi + tid * max_nbors;
358
flt_t * _noalias const delx_form = delx_formi + tid * max_nbors;
359
flt_t * _noalias const dely_form = dely_formi + tid * max_nbors;
360
flt_t * _noalias const delz_form = delz_formi + tid * max_nbors;
361
int * _noalias const jtype_form = jtype_formi + tid * max_nbors;
362
int * _noalias const jlist_form = jlist_formi + tid * max_nbors;
365
for (int i = iifrom; i < iito; ++i) {
366
// const int i = ilist[ii];
367
const int itype = x[i].w;
368
const int ptr_off = itype * ntypes;
369
const FC_PACKED1_T * _noalias const ijci = ijc + ptr_off;
370
const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
372
const int * _noalias const jlist = firstneigh + cnumneigh[i];
373
const int jnum = numneigh[i];
375
const flt_t xtmp = x[i].x;
376
const flt_t ytmp = x[i].y;
377
const flt_t ztmp = x[i].z;
379
flt_t a1_0, a1_1, a1_2, a1_3, a1_4, a1_5, a1_6, a1_7, a1_8;
380
flt_t b1_0, b1_1, b1_2, b1_3, b1_4, b1_5, b1_6, b1_7, b1_8;
381
flt_t g1_0, g1_1, g1_2, g1_3, g1_4, g1_5, g1_6, g1_7, g1_8;
383
if (ijci[itype].form == ELLIPSE_ELLIPSE) {
384
flt_t temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8;
385
ME_quat_to_mat_trans(quat[i],a1);
386
ME_diag_times3(ic[itype].well,a1,temp);
387
ME_transpose_times3(a1,temp,b1);
388
ME_diag_times3(ic[itype].shape2,a1,temp);
389
ME_transpose_times3(a1,temp,g1);
392
acc_t fxtmp, fytmp, fztmp, fwtmp, t1tmp, t2tmp, t3tmp;
393
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
394
fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
397
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
398
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
401
bool multiple_forms = false;
403
for (int jj = 0; jj < jnum; jj++) {
405
int j = jm & NEIGHMASK;
406
const int jtype = x[j].w;
408
if (ijci[jtype].form == ELLIPSE_ELLIPSE) {
409
flt_t delx = x[j].x-xtmp;
410
flt_t dely = x[j].y-ytmp;
411
flt_t delz = x[j].z-ztmp;
412
flt_t rsq = delx * delx + dely * dely + delz * delz;
414
if (rsq < ijci[jtype].cutsq) {
415
rsq_form[packed_j] = rsq;
416
delx_form[packed_j] = delx;
417
dely_form[packed_j] = dely;
418
delz_form[packed_j] = delz;
419
jtype_form[packed_j] = jtype;
420
jlist_form[packed_j] = jm;
424
multiple_forms = true;
426
while( (packed_j % pad_width) != 0 )
427
jlist_form[packed_j++] = nall;
429
// -------------------------------------------------------------
432
__assume(packed_j % INTEL_VECTOR_WIDTH == 0);
433
__assume(packed_j % 8 == 0);
434
__assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
436
#if defined(__INTEL_COMPILER)
437
#pragma vector aligned
438
#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
439
sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
441
for (int jj = 0; jj < packed_j; jj++) {
442
flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;
443
flt_t b2_0, b2_1, b2_2, b2_3, b2_4, b2_5, b2_6, b2_7, b2_8;
444
flt_t g2_0, g2_1, g2_2, g2_3, g2_4, g2_5, g2_6, g2_7, g2_8;
445
flt_t temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8;
446
flt_t fforce_0, fforce_1, fforce_2, ttor_0, ttor_1, ttor_2;
447
flt_t rtor_0, rtor_1, rtor_2;
449
const int sbindex = jlist_form[jj] >> SBBITS & 3;
450
const int j = jlist_form[jj] & NEIGHMASK;
451
flt_t factor_lj = special_lj[sbindex];
452
const int jtype = jtype_form[jj];
453
const flt_t sigma = ijci[jtype].sigma;
454
const flt_t epsilon = ijci[jtype].epsilon;
455
const flt_t shape2_0 = ic[jtype].shape2[0];
456
const flt_t shape2_1 = ic[jtype].shape2[1];
457
const flt_t shape2_2 = ic[jtype].shape2[2];
458
flt_t one_eng, evdwl;
460
ME_quat_to_mat_trans(quat[j], a2);
461
ME_diag_times3(ic[jtype].well, a2, temp);
462
ME_transpose_times3(a2, temp, b2);
463
ME_diag_times3a(shape2, a2, temp);
464
ME_transpose_times3(a2, temp, g2);
466
flt_t tempv_0, tempv_1, tempv_2, tempv2_0, tempv2_1, tempv2_2;
467
flt_t temp1, temp2, temp3;
469
flt_t r12hat_0, r12hat_1, r12hat_2;
470
ME_normalize3(delx_form[jj], dely_form[jj], delz_form[jj], r12hat);
471
flt_t r = sqrt(rsq_form[jj]);
473
// compute distance of closest approach
475
flt_t g12_0, g12_1, g12_2, g12_3, g12_4, g12_5, g12_6, g12_7, g12_8;
476
ME_plus3(g1, g2, g12);
477
flt_t kappa_0, kappa_1, kappa_2;
478
ME_mldivide3(g12, delx_form[jj], dely_form[jj], delz_form[jj],
481
// tempv = G12^-1*r12hat
483
flt_t inv_r = (flt_t)1.0 / r;
484
tempv_0 = kappa_0 * inv_r;
485
tempv_1 = kappa_1 * inv_r;
486
tempv_2 = kappa_2 * inv_r;
487
flt_t sigma12 = ME_dot3(r12hat, tempv);
488
sigma12 = pow((flt_t)0.5 * sigma12,(flt_t) - 0.5);
489
flt_t h12 = r - sigma12;
494
flt_t varrho = sigma / (h12 + gamma * sigma);
495
flt_t varrho6 = pow(varrho, (flt_t)6.0);
496
flt_t varrho12 = varrho6 * varrho6;
497
flt_t u_r = (flt_t)4.0 * epsilon * (varrho12 - varrho6);
501
flt_t eta = (flt_t)2.0 * ijci[jtype].lshape;
502
flt_t det_g12 = ME_det3(g12);
503
eta = pow(eta / det_g12, upsilon);
507
flt_t b12_0, b12_1, b12_2, b12_3, b12_4, b12_5, b12_6, b12_7, b12_8;
508
flt_t iota_0, iota_1, iota_2;
509
ME_plus3(b1, b2, b12);
510
ME_mldivide3(b12, delx_form[jj], dely_form[jj], delz_form[jj],
513
// tempv = G12^-1*r12hat
515
tempv_0 = iota_0 * inv_r;
516
tempv_1 = iota_1 * inv_r;
517
tempv_2 = iota_2 * inv_r;
518
flt_t chi = ME_dot3(r12hat, tempv);
519
chi = pow(chi * (flt_t)2.0, mu);
524
temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) /
526
temp1 = temp1 * (flt_t)24.0 * epsilon;
527
flt_t u_slj = temp1 * pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
528
flt_t dUr_0, dUr_1, dUr_2;
529
temp2 = ME_dot3(kappa, r12hat);
530
flt_t uslj_rsq = u_slj / rsq_form[jj];
531
dUr_0 = temp1 * r12hat_0 + uslj_rsq * (kappa_0 - temp2 * r12hat_0);
532
dUr_1 = temp1 * r12hat_1 + uslj_rsq * (kappa_1 - temp2 * r12hat_1);
533
dUr_2 = temp1 * r12hat_2 + uslj_rsq * (kappa_2 - temp2 * r12hat_2);
535
// compute dChi_12/dr
537
flt_t dchi_0, dchi_1, dchi_2;
538
temp1 = ME_dot3(iota, r12hat);
539
temp2 = (flt_t)-4.0 / rsq_form[jj] * mu *
540
pow(chi, (mu - (flt_t)1.0) / mu);
541
dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0);
542
dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1);
543
dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2);
547
fforce_0 = temp1 * dchi_0 - temp2 * dUr_0;
548
fforce_1 = temp1 * dchi_1 - temp2 * dUr_1;
549
fforce_2 = temp1 * dchi_2 - temp2 * dUr_2;
551
// torque for particle 1 and 2
554
tempv_0 = -uslj_rsq * kappa_0;
555
tempv_1 = -uslj_rsq * kappa_1;
556
tempv_2 = -uslj_rsq * kappa_2;
557
ME_vecmat(kappa, g1, tempv2);
558
ME_cross3(tempv, tempv2, dUr);
559
flt_t dUr2_0, dUr2_1, dUr2_2;
561
if (NEWTON_PAIR || j < nlocal) {
562
ME_vecmat(kappa, g2, tempv2);
563
ME_cross3(tempv, tempv2, dUr2);
568
ME_vecmat(iota, b1, tempv);
569
ME_cross3(tempv, iota, dchi);
570
temp1 = (flt_t)-4.0 / rsq_form[jj];
574
flt_t dchi2_0, dchi2_1, dchi2_2;
576
if (NEWTON_PAIR || j < nlocal) {
577
ME_vecmat(iota, b2, tempv);
578
ME_cross3(tempv, iota, dchi2);
586
flt_t deta_0, deta_1, deta_2;
587
deta_0 = deta_1 = deta_2 = (flt_t)0.0;
588
ME_compute_eta_torque(g12, a1, shape2, temp);
589
temp1 = -eta * upsilon;
591
tempv_0 = temp1 * temp_0;
592
tempv_1 = temp1 * temp_1;
593
tempv_2 = temp1 * temp_2;
594
ME_mv0_cross3(a1, tempv, tempv2);
599
tempv_0 = temp1 * temp_3;
600
tempv_1 = temp1 * temp_4;
601
tempv_2 = temp1 * temp_5;
602
ME_mv1_cross3(a1, tempv, tempv2);
607
tempv_0 = temp1 * temp_6;
608
tempv_1 = temp1 * temp_7;
609
tempv_2 = temp1 * temp_8;
610
ME_mv2_cross3(a1, tempv, tempv2);
615
// compute d_eta for particle 2
617
flt_t deta2_0, deta2_1, deta2_2;
618
if (NEWTON_PAIR || j < nlocal) {
619
deta2_0 = deta2_1 = deta2_2 = (flt_t)0.0;
620
ME_compute_eta_torque(g12, a2, shape2, temp);
622
tempv_0 = temp1 * temp_0;
623
tempv_1 = temp1 * temp_1;
624
tempv_2 = temp1 * temp_2;
625
ME_mv0_cross3(a2, tempv, tempv2);
630
tempv_0 = temp1 * temp_3;
631
tempv_1 = temp1 * temp_4;
632
tempv_2 = temp1 * temp_5;
633
ME_mv1_cross3(a2, tempv, tempv2);
638
tempv_0 = temp1 * temp_6;
639
tempv_1 = temp1 * temp_7;
640
tempv_2 = temp1 * temp_8;
641
ME_mv2_cross3(a2, tempv, tempv2);
653
ttor_0 = (temp1 * dchi_0 + temp2 * deta_0 + temp3 * dUr_0) *
655
ttor_1 = (temp1 * dchi_1 + temp2 * deta_1 + temp3 * dUr_1) *
657
ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) *
660
if (NEWTON_PAIR || j < nlocal) {
661
rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) *
663
rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) *
665
rtor_2 = (temp1 * dchi2_2 + temp2 * deta2_2 + temp3 * dUr2_2) *
669
one_eng = temp1 * chi;
671
if (jlist_form[jj] == nall) {
672
one_eng = (flt_t)0.0;
685
fforce_0 *= factor_lj;
686
fforce_1 *= factor_lj;
687
fforce_2 *= factor_lj;
693
if (jlist_form[jj] < nall) {
702
if (NEWTON_PAIR || j < nlocal) {
717
flt_t ev_pre = (flt_t)0;
718
if (NEWTON_PAIR || i < nlocal)
719
ev_pre += (flt_t)0.5;
720
if (NEWTON_PAIR || j < nlocal)
721
ev_pre += (flt_t)0.5;
724
evdwl = factor_lj * one_eng;
725
sevdwl += ev_pre * evdwl;
727
if (NEWTON_PAIR || i < nlocal)
728
fwtmp += (flt_t)0.5 * evdwl;
729
if (NEWTON_PAIR || j < nlocal)
730
f[j*2].w += (flt_t)0.5 * evdwl;
735
ev_pre *= (flt_t)-1.0;
736
sv0 += ev_pre * delx_form[jj] * fforce_0;
737
sv1 += ev_pre * dely_form[jj] * fforce_1;
738
sv2 += ev_pre * delz_form[jj] * fforce_2;
739
sv3 += ev_pre * delx_form[jj] * fforce_1;
740
sv4 += ev_pre * delx_form[jj] * fforce_2;
741
sv5 += ev_pre * dely_form[jj] * fforce_2;
749
// -------------------------------------------------------------
765
if (eatom) f[i * 2].w += fwtmp;
783
if (offload == 0) o_range -= minlocal;
784
IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,
786
const int two_iito = iito * 2;
792
acc_t *facc = &(f_start[0].x);
793
const int sto = two_iito * 4;
794
const int fst4 = f_stride * 4;
798
int t_off = f_stride;
799
if (EFLAG && eatom) {
800
for (int t = 1; t < nthreads; t++) {
801
#if defined(__INTEL_COMPILER)
802
#pragma vector nontemporal
805
for (int n = iifrom * 2; n < two_iito; n++) {
806
f_start[n].x += f_start[n + t_off].x;
807
f_start[n].y += f_start[n + t_off].y;
808
f_start[n].z += f_start[n + t_off].z;
809
f_start[n].w += f_start[n + t_off].w;
814
for (int t = 1; t < nthreads; t++) {
815
#if defined(__INTEL_COMPILER)
816
#pragma vector nontemporal
819
for (int n = iifrom * 2; n < two_iito; n++) {
820
f_start[n].x += f_start[n + t_off].x;
821
f_start[n].y += f_start[n + t_off].y;
822
f_start[n].z += f_start[n + t_off].z;
830
const ATOM_T * _noalias const xo = x + minlocal;
831
#if defined(__INTEL_COMPILER)
832
#pragma vector nontemporal
835
for (int n = iifrom; n < iito; n++) {
836
const int nt2 = n * 2;
837
ov0 += f_start[nt2].x * xo[n].x;
838
ov1 += f_start[nt2].y * xo[n].y;
839
ov2 += f_start[nt2].z * xo[n].z;
840
ov3 += f_start[nt2].y * xo[n].x;
841
ov4 += f_start[nt2].z * xo[n].x;
842
ov5 += f_start[nt2].z * xo[n].y;
848
f_start[1].w = ierror;
853
ev_global[0] = oevdwl;
854
ev_global[1] = (acc_t)0.0;
867
*timer_compute = MIC_Wtime() - *timer_compute;
872
fix->stop_watch(TIME_OFFLOAD_LATENCY);
874
fix->stop_watch(TIME_HOST_PAIR);
877
fix->add_result_array(f_start, ev_global, offload,eatom);
879
fix->add_result_array(f_start, 0, offload);
882
/* ---------------------------------------------------------------------- */
884
void PairGayBerneIntel::init_style()
886
PairGayBerne::init_style();
887
neighbor->requests[neighbor->nrequest-1]->intel = 1;
889
int ifix = modify->find_fix("package_intel");
892
"The 'package intel' command is required for /intel styles");
893
fix = static_cast<FixIntel *>(modify->fix[ifix]);
895
fix->pair_init_check();
896
#ifdef _LMP_INTEL_OFFLOAD
897
if (force->newton_pair) fix->set_offload_noghost(1);
898
_cop = fix->coprocessor_number();
901
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
902
pack_force_const(force_const_single, fix->get_mixed_buffers());
903
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
904
pack_force_const(force_const_double, fix->get_double_buffers());
906
pack_force_const(force_const_single, fix->get_single_buffers());
909
/* ---------------------------------------------------------------------- */
911
template <class flt_t, class acc_t>
912
void PairGayBerneIntel::pack_force_const(ForceConst<flt_t> &fc,
913
IntelBuffers<flt_t,acc_t> *buffers)
915
int tp1 = atom->ntypes + 1;
916
_max_nbors = buffers->get_max_nbors();
917
int mthreads = comm->nthreads;
918
if (mthreads < buffers->get_off_threads())
919
mthreads = buffers->get_off_threads();
920
fc.set_ntypes(tp1, _max_nbors, mthreads, memory, _cop);
921
buffers->set_ntypes(tp1);
922
flt_t **cutneighsq = buffers->get_cutneighsq();
924
// Repeat cutsq calculation because done after call to init_style
925
double cut, cutneigh;
926
for (int i = 1; i <= atom->ntypes; i++) {
927
for (int j = i; j <= atom->ntypes; j++) {
928
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
930
cutneigh = cut + neighbor->skin;
931
cutsq[i][j] = cutsq[j][i] = cut*cut;
932
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
937
for (int i = 0; i < 4; i++) {
938
fc.special_lj[i] = force->special_lj[i];
939
fc.special_lj[0] = 1.0;
942
fc.upsilon = upsilon;
945
for (int i = 0; i < tp1; i++) {
946
for (int j = 0; j < tp1; j++) {
947
fc.ijc[i][j].lj1 = lj1[i][j];
948
fc.ijc[i][j].lj2 = lj2[i][j];
949
fc.ijc[i][j].cutsq = cutsq[i][j];
950
fc.ijc[i][j].offset = offset[i][j];
951
fc.ijc[i][j].sigma = sigma[i][j];
952
fc.ijc[i][j].epsilon = epsilon[i][j];
953
fc.ijc[i][j].form = form[i][j];
954
fc.ijc[i][j].lshape = lshape[i] * lshape[j];
955
fc.lj34[i][j].lj3 = lj3[i][j];
956
fc.lj34[i][j].lj4 = lj4[i][j];
958
for (int j = 0; j < 4; j++) {
959
fc.ic[i].shape2[j] = shape2[i][j];
960
fc.ic[i].well[j] = well[i][j];
964
#ifdef _LMP_INTEL_OFFLOAD
965
if (_cop < 0) return;
966
flt_t * special_lj = fc.special_lj;
967
FC_PACKED1_T *oijc = fc.ijc[0];
968
FC_PACKED2_T *olj34 = fc.lj34[0];
969
FC_PACKED3_T *oic = fc.ic;
970
flt_t * ocutneighsq = cutneighsq[0];
971
int tp1sq = tp1 * tp1;
972
if (oijc != NULL && oic != NULL) {
973
#pragma offload_transfer target(mic:_cop) \
974
in(special_lj: length(4) alloc_if(0) free_if(0)) \
975
in(oijc,olj34: length(tp1sq) alloc_if(0) free_if(0)) \
976
in(oic: length(tp1) alloc_if(0) free_if(0)) \
977
in(ocutneighsq: length(tp1sq))
982
/* ---------------------------------------------------------------------- */
984
template <class flt_t>
985
void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
986
const int one_length,
990
if (ntypes != _ntypes) {
992
fc_packed3 *oic = ic;
994
#ifdef _LMP_INTEL_OFFLOAD
995
flt_t * ospecial_lj = special_lj;
996
fc_packed1 *oijc = ijc[0];
997
fc_packed2 *olj34 = lj34[0];
998
flt_t * orsq_form = rsq_form[0];
999
flt_t * odelx_form = delx_form[0];
1000
flt_t * odely_form = dely_form[0];
1001
flt_t * odelz_form = delz_form[0];
1002
int * ojtype_form = jtype_form[0];
1003
int * ojlist_form = jlist_form[0];
1005
if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
1006
orsq_form != NULL && odelx_form != NULL && odely_form != NULL &&
1007
odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL &&
1009
#pragma offload_transfer target(mic:_cop) \
1010
nocopy(ospecial_lj, oijc, olj34, oic: alloc_if(0) free_if(1)) \
1011
nocopy(orsq_form, odelx_form, odely_form: alloc_if(0) free_if(1)) \
1012
nocopy(odelz_form, ojtype_form, ojlist_form: alloc_if(0) free_if(1))
1016
_memory->destroy(oic);
1017
_memory->destroy(ijc);
1018
_memory->destroy(lj34);
1019
_memory->destroy(rsq_form);
1020
_memory->destroy(delx_form);
1021
_memory->destroy(dely_form);
1022
_memory->destroy(delz_form);
1023
_memory->destroy(jtype_form);
1024
_memory->destroy(jlist_form);
1029
memory->create(ijc, ntypes, ntypes, "fc.ijc");
1030
memory->create(lj34, ntypes, ntypes, "fc.lj34");
1031
memory->create(ic, ntypes, "fc.ic");
1032
memory->create(rsq_form, nthreads, one_length, "rsq_form");
1033
memory->create(delx_form, nthreads, one_length, "delx_form");
1034
memory->create(dely_form, nthreads, one_length, "dely_form");
1035
memory->create(delz_form, nthreads, one_length, "delz_form");
1036
memory->create(jtype_form, nthreads, one_length, "jtype_form");
1037
memory->create(jlist_form, nthreads, one_length, "jlist_form");
1039
for (int zn = 0; zn < nthreads; zn++)
1040
for (int zo = 0; zo < one_length; zo++) {
1041
rsq_form[zn][zo] = 10.0;
1042
delx_form[zn][zo] = 10.0;
1043
dely_form[zn][zo] = 10.0;
1044
delz_form[zn][zo] = 10.0;
1045
jtype_form[zn][zo] = 1;
1046
jlist_form[zn][zo] = 0;
1049
#ifdef _LMP_INTEL_OFFLOAD
1050
flt_t * ospecial_lj = special_lj;
1051
fc_packed1 *oijc = ijc[0];
1052
fc_packed2 *olj34 = lj34[0];
1053
fc_packed3 *oic = ic;
1054
flt_t * orsq_form = rsq_form[0];
1055
flt_t * odelx_form = delx_form[0];
1056
flt_t * odely_form = dely_form[0];
1057
flt_t * odelz_form = delz_form[0];
1058
int * ojtype_form = jtype_form[0];
1059
int * ojlist_form = jlist_form[0];
1060
int off_onel = one_length * nthreads;
1062
int tp1sq = ntypes*ntypes;
1063
if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
1064
oic != NULL && orsq_form != NULL && odelx_form != NULL &&
1065
odely_form != NULL && odelz_form != NULL && ojtype_form !=NULL &&
1066
ojlist_form !=NULL && cop >= 0) {
1067
#pragma offload_transfer target(mic:cop) \
1068
nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
1069
nocopy(oijc,olj34: length(tp1sq) alloc_if(1) free_if(0)) \
1070
nocopy(oic: length(ntypes) alloc_if(1) free_if(0)) \
1071
in(orsq_form: length(off_onel) alloc_if(1) free_if(0)) \
1072
in(odelx_form: length(off_onel) alloc_if(1) free_if(0)) \
1073
in(odely_form: length(off_onel) alloc_if(1) free_if(0)) \
1074
in(odelz_form: length(off_onel) alloc_if(1) free_if(0)) \
1075
in(ojtype_form: length(off_onel) alloc_if(1) free_if(0)) \
1076
in(ojlist_form: length(off_onel) alloc_if(1) free_if(0))