8
// #define min(x,y) (((x)<=(y))?(x):(y))
11
#define maxnbfn (15*maxatom)
13
#define maxnnbfn ((maxnbfn)*(maxnbfn+1)/2)
14
#define pi 3.141592653589793d0)
16
#define tol2e (1.0e-6)
24
void nga_get_(Integer *g_a,
30
void g_(double *gg, Integer *i, Integer *j, Integer *k, Integer *l);
31
double contract_matrices_(Integer *g_a, Integer *g_b);
32
void new_nga_acc_(Integer *g_a,
39
void nga_acc_(Integer *g_a,
46
void ctwoel_(Integer *g_schwarz, Integer *g_dens, Integer *g_fock,
47
double *schwmax, double *etwo,
48
long long *pnbfn, long long *icut1, long long *icut2,
49
long long *icut3, long long *icut4);
56
static void cpptwoel(Integer g_schwarz, Integer g_dens, Integer g_fock,
57
double schwmax, Integer nbfn,
58
long long *icut1, long long *icut2,
59
long long *icut3, long long *icut4);
61
void ctwoel_(Integer *g_schwarz, Integer *g_dens, Integer *g_fock,
62
double *schwmax, double *etwo,
63
long long *pnbfn, long long *icut1, long long *icut2,
64
long long *icut3, long long *icut4) {
65
const Integer nbfn = (Integer)*pnbfn;
66
long long ijcnt,klcnt,ijklcnt;
72
cpptwoel(*g_schwarz, *g_dens, *g_fock, *schwmax, nbfn,
73
icut1, icut2, icut3, icut4);
75
*etwo = 0.5*contract_matrices_(g_fock,g_dens);
77
ijcnt = *icut1 - ijcnt;
78
klcnt = *icut2 - klcnt;
79
ijklcnt = (*icut3) - ijklcnt;
82
/* no integrals may be calculated if there is no work for */
83
/* this node (ichunk too big), or, something is wrong */
84
printf("no two-electron integrals computed by node %d\n", GA_Nodeid());
90
/*************************************************************************
92
* the code involving task pool stuff follows
94
*************************************************************************/
96
//#include <UniformTaskCollectionSplit.h>
97
#include <UniformTaskCollSplitData.h>
98
#include <DenseArray.h>
102
using namespace tascel;
104
static int next_4chunk(int g_counter, Integer nbfn,
105
Integer lo[4], Integer hi[4],
106
Integer *ilo, Integer *jlo,
107
Integer *klo, Integer *llo) {
112
long long total_ntsks;
114
if (nbfn - ichunk*imax > 0) {
117
total_ntsks = imax*imax*imax*imax;
119
itask = NGA_Read_inc(g_counter, (int *)&zero, one);
120
/* printf("%d: itask=%d total_ntsks=%lf\n", armci_me, (int)itask, (double)total_ntsks); */
122
printf("next_4chunk: itask negative: %d imax:%d nbfn:%d ichunk:%d\n",
123
(int) itask, (int)imax, (int)nbfn, (int)ichunk);
124
printf("probable GA integer precision problem if %lf > 2^31\n",
129
if (itask < total_ntsks) {
144
hi[0] = min((*ilo+1)*ichunk,nbfn) - 1;
145
hi[1] = min((*jlo+1)*ichunk,nbfn) - 1;
146
hi[2] = min((*klo+1)*ichunk,nbfn) - 1;
147
hi[3] = min((*llo+1)*ichunk,nbfn) - 1;
156
static void clean_chunk(double *chunk) {
158
for(i=0; i<ichunk*ichunk; i++) {
171
long long *icut1, *icut2, *icut3, *icut4;
172
Integer g_schwarz, g_dens, g_fock;
175
static void compute_lo_hi(long itask, long nbfn, long lo[4], long hi[4],
176
long *ilo, long *jlo, long *klo, long *llo) {
179
long long total_ntsks;
181
if (nbfn - ichunk*imax > 0) {
184
total_ntsks = imax*imax*imax*imax;
187
printf("next_4chunk: itask negative: %d imax:%d nbfn:%d ichunk:%d\n",
188
(int) itask, (int)imax, (int)nbfn, (int)ichunk);
189
printf("probable GA integer precision problem if %lf > 2^31\n",
194
assert(itask < total_ntsks);
210
hi[0] = min((*ilo+1)*ichunk,nbfn) - 1;
211
hi[1] = min((*jlo+1)*ichunk,nbfn) - 1;
212
hi[2] = min((*klo+1)*ichunk,nbfn) - 1;
213
hi[3] = min((*llo+1)*ichunk,nbfn) - 1;
216
static int compute_owner(long itask, const vector<DataColl*> &colls, int arrid, long nbfn)
218
const int size = 2*sizeof(int);
220
long int ilo, jlo, klo, llo, lo[4], hi[4], tmp;
221
int *iidx = (int *)idx;
223
compute_lo_hi(itask, nbfn, lo, hi, &ilo, &jlo, &klo, &llo);
227
iidx[0] = lo[1]/ichunk;
228
iidx[1] = lo[0]/ichunk;
231
iidx[0] = lo[3]/ichunk;
232
iidx[1] = lo[2]/ichunk;
235
iidx[0] = lo[3]/ichunk;
236
iidx[1] = lo[2]/ichunk;
239
iidx[0] = lo[3]/ichunk;
240
iidx[1] = lo[1]/ichunk;
243
iidx[0] = lo[1]/ichunk;
244
iidx[1] = lo[0]/ichunk;
247
iidx[0] = lo[2]/ichunk;
248
iidx[1] = lo[0]/ichunk;
254
return colls.at(arrid)->getProc(idx,size);
257
static void compute_index(void *dscr, int dscr_len,
258
void *pldata, int pldata_len, int arrid, void *idx, int idxlen) {
259
assert(dscr_len == sizeof(task_dscr_t));
260
assert(arrid >=0 && arrid<6);
261
assert(idxlen == 2*sizeof(int));
262
const long itask = ((task_dscr_t*)dscr)->id;
263
int *iidx = (int *)idx;
264
task_plo_t *ptplo = (task_plo_t*)pldata;
265
assert(pldata_len == sizeof(task_plo_t));
266
const long int nbfn = ptplo->nbfn;
267
long int ilo, jlo, klo, llo, lo[4], hi[4], tmp;
269
compute_lo_hi(itask, nbfn, lo, hi, &ilo, &jlo, &klo, &llo);
273
iidx[0] = lo[1]/ichunk;
274
iidx[1] = lo[0]/ichunk;
277
iidx[0] = lo[3]/ichunk;
278
iidx[1] = lo[2]/ichunk;
281
iidx[0] = lo[3]/ichunk;
282
iidx[1] = lo[2]/ichunk;
285
iidx[0] = lo[3]/ichunk;
286
iidx[1] = lo[1]/ichunk;
289
iidx[0] = lo[1]/ichunk;
290
iidx[1] = lo[0]/ichunk;
293
iidx[0] = lo[2]/ichunk;
294
iidx[1] = lo[0]/ichunk;
302
static void twoel_task(UniformTaskCollection *utc, void *bigd, int bigd_len,
303
void *pldata, int pldata_len, vector<void *> data_bufs);
305
static void cpptwoel(Integer g_schwarz, Integer g_dens, Integer g_fock,
306
double schwmax, Integer nbfn,
307
long long *icut1, long long *icut2,
308
long long *icut3, long long *icut4) {
310
int g_counter, ione = 1;
316
tplo.schwmax = schwmax;
321
tplo.g_schwarz = g_schwarz;
322
tplo.g_dens = g_dens;
323
tplo.g_fock = g_fock;
326
TslFunc tf = frt.add(twoel_task);
327
vector<DataColl*> colls;
328
int block[2] = {ichunk, ichunk};
329
DenseArray coll_schwarz(g_schwarz, block, 2);
330
DenseArray coll_dens(g_dens, block, 2);
331
DenseArray coll_fock(g_fock, block, 2);
332
/**FIXME: modes are associated with arrays and not specific data
334
colls.push_back(&coll_schwarz);
335
colls.push_back(&coll_schwarz);
336
colls.push_back(&coll_dens);
337
colls.push_back(&coll_dens);
338
colls.push_back(&coll_fock);
339
colls.push_back(&coll_fock);
341
vector<AccessMode> modes(4, MODE_RONLY);
342
vector<int> idxlens(6, 2*sizeof(int));
344
modes.push_back(MODE_ACC);
345
modes.push_back(MODE_ACC);
347
const long me = GA_Nodeid();
348
const long nproc = GA_Nnodes();
349
const long imax = nbfn/ichunk + ((nbfn%ichunk) ? 1 : 0);
350
const long total_ntasks = imax*imax*imax*imax;
351
const long ntasks_per_proc = (long)ceil(1.0*total_ntasks/nproc);
352
const long tasklo = ntasks_per_proc * me;
353
const long taskhi = min(tasklo+ntasks_per_proc,total_ntasks)-1;
356
props.functions(tf,frt).taskSize(sizeof(task_dscr_t))
357
#define EVEN_DISTRIBUTION 0
358
#if EVEN_DISTRIBUTION
359
.maxTasks(ntasks_per_proc)
361
.maxTasks(total_ntasks)
363
.localData(&tplo,sizeof(tplo));
364
UniformTaskCollSplitData utc(props, colls, modes, idxlens, compute_index);
366
#if EVEN_DISTRIBUTION
367
for(long i=tasklo; i<=taskhi; i++)
369
for (long i=0; i<total_ntasks; ++i)
370
if (me == compute_owner(i, colls, 0, nbfn))
374
utc.addTask(&tdscr, sizeof(tdscr));
382
double buf[ichunk][ichunk];
385
static void twoel_task(UniformTaskCollection *utc, void *_bigd, int bigd_len,
386
void *pldata, int pldata_len, vector<void *> data_bufs) {
388
assert(bigd_len == sizeof(task_dscr_t));
389
assert(pldata!=NULL);
390
assert(pldata_len == sizeof(task_plo_t));
391
assert(data_bufs.size()==6);
393
task_dscr_t *ptdscr = (task_dscr_t*)_bigd;
394
task_plo_t *ptplo = (task_plo_t*)pldata;
396
tbuf_t *s_ij = (tbuf_t*)data_bufs[0];
397
tbuf_t *s_kl = (tbuf_t*)data_bufs[1];
398
tbuf_t *d_kl = (tbuf_t*)data_bufs[2];
399
tbuf_t *d_jl = (tbuf_t*)data_bufs[3];
400
tbuf_t *f_ij = (tbuf_t*)data_bufs[4];
401
tbuf_t *f_ik = (tbuf_t*)data_bufs[5];
405
Integer lo_ik[2],hi_ik[2],lo_jl[2],hi_jl[2];
406
Integer i,j,k,l,iloc,jloc,kloc,lloc,ld;
413
long long *icut1, *icut2, *icut3, *icut4;
414
Integer g_schwarz, g_dens, g_fock;
417
long itask = ptdscr->id;
419
schwmax = ptplo->schwmax;
420
icut1 = ptplo->icut1;
421
icut2 = ptplo->icut2;
422
icut3 = ptplo->icut3;
423
icut4 = ptplo->icut4;
424
g_schwarz = ptplo->g_schwarz;
425
g_dens = ptplo->g_dens;
426
g_fock = ptplo->g_fock;
431
compute_lo_hi(itask, nbfn, lo, hi, &it, &jt, &kt, <);
433
assert(ich == hi[0]-lo[0]+1);
434
assert(ich == hi[2]-lo[2]+1);
435
clean_chunk((double*)f_ij->buf);
436
clean_chunk((double *)f_ik->buf);
437
for(i = lo[0]; i<=hi[0]; i++) {
439
for(j = lo[1]; j<= hi[1]; j++) {
441
if (s_ij->buf[jloc][iloc]*(schwmax) < tol2e) {
442
*icut1 = *icut1 + (hi[2]-lo[2]+1)*(hi[3]-lo[3]+1);
446
for(k = lo[2]; k<=hi[2]; k++) {
448
for(l = lo[3]; l<=hi[3]; l++) {
450
if (s_ij->buf[jloc][iloc] * s_kl->buf[lloc][kloc] < tol2e) {
454
Integer _i=i+1, _j=j+1, _k=k+1, _l=l+1;
455
g_(&gg, &_i, &_j, &_k, &_l);
457
f_ij->buf[jloc][iloc] = f_ij->buf[jloc][iloc] + gg*d_kl->buf[lloc][kloc];
458
f_ik->buf[kloc][iloc] = f_ik->buf[kloc][iloc] - 0.5*gg*d_jl->buf[lloc][jloc];