6
#include<libciomr/libciomr.h>
8
#include<libpsio/psio.h>
9
#include<libint/libint.h>
10
#include<libr12/libr12.h>
16
#include"r12_quartet_data.h"
17
#include"norm_quartet.h"
19
#include"quartet_permutations.h"
20
#include"rmp2r12_energy.h"
23
/*-------------------------------------------------------
26
***Split into threads. Each thread does the following:
28
Loop over I batches (batch size num_i_per_batch) of active DOCC
30
Loop over all symmetry-unique shells UR, US<=UR
31
***if this UR, US is to be handled by this thread - proceed, else skip to next one
32
Find all symmetry-distinct shell doublets resulting from (UR US|
33
Loop over all resulting shell doublets R, S
35
Loop over all symmetry-unique shells UP, UQ<=UP
36
Find all symmetry-distinct shell quartets resulting from (R S|UP UQ)
37
Loop over the resulting set of P, Q doublets
38
Evaluate (RS|PQ), (RS|r12|PQ), and (RS|[r12,T1]|PQ)
39
Loop over p in P, q in Q, r in R, s in S, i in I
40
(rs|iq) += Cpi * (rs|pq)
41
(rs|ip) += Cqi * (rs|pq)
42
same for (rs|r12|pq) and (rs|[r12,T1]|pq)
43
End p, q, r, s, i loop
47
Loop over r in R, s in S
48
Loop over q < nao, x < num_mo, i in I
49
(rs|ix) += Cxs * (rs|is)
50
same for (rs|r12|is) and (rs|[r12,T1]|is)
53
***Lock (js| and (jr| (either individual or shell blocks depending on LOCK_RS_SHELL in rmp2r12_energy.h)
54
Loop over r in R, s in S
55
Loop over i in I, x < num_mo, j <= i
56
(js|ix) += Cjr * (rs|ix)
57
(jr|ix) += Cjs * (rs|ix)
58
same for (rs|r12|ix), but
59
(js|[r12,T1]|ix) += Cjr * (rs|[r12,T1]|ix)
60
(jr|[r12,T1]|ix) -= Cjs * (rs|[r12,T1]|ix) <---- Note the minus sign here!!!
63
***Unlock (js| and (jr|
68
***Barrier: threads wait until everyone is done
69
***Do the following in one thread only
70
Loop over i in I, j <= i
71
Loop over r < nao, x < num_mo, y < num_mo
72
(jy|ix) += Cys * (js|ix)
73
same for (js|r12|ix) and (js|[r12,T1]|ix)
81
-------------------------------------------------------*/
84
void *rmp2r12_energy_thread(void *tnum_ptr)
86
int thread_num = (int) tnum_ptr;
87
const double toler = UserOptions.cutoff;
89
extern pthread_mutex_t rmp2r12_energy_mutex;
90
extern pthread_mutex_t *rmp2r12_sindex_mutex;
91
extern pthread_cond_t rmp2r12_energy_cond;
92
extern RMP2R12_Status_t RMP2R12_Status;
93
extern double *jsix_buf[NUM_TE_TYPES]; /* buffer for (js|ia) integrals, where j runs over all d.-o. MOs,
94
s runs over all AOs, i - over I-batch, x - over all MOs */
95
extern double *jyix_buf[NUM_TE_TYPES]; /* buffer contains all MP2-R12/A-type integrals */
97
extern int *first, *last; /* first and last absolute (Pitzer) orbital indices in symblk */
98
extern int *fstocc, *lstocc; /* first and last occupied indices in Pitzer ordering for each symblk */
99
extern int *occ; /* Pitzer to "full"(no frozen core) QTS index mapping */
100
extern int *act2fullQTS; /* Maps "active"(taking frozen core into account) QTS into "frozen" QTS index */
101
extern int *ioff3; /* returns pointers to the beginning of rows in a rectangular matrix */
103
struct shell_pair *sp_ij, *sp_kl;
105
double_array_t fjt_table;
107
int ij, kl, ik, jl, ijkl;
108
int count, dum, status;
110
int n, num[NUM_TE_TYPES];
113
int pkblock_end_index = -1;
114
int i, j, m, p, q, r, s, x, y;
115
int isym, jsym, xsym, ysym;
116
int p_abs, q_abs, r_abs, s_abs;
118
int sii, sjj, skk, sll , slll;
119
int num_ij, swap_ij_kl;
121
int *sj_arr, *sk_arr, *sl_arr;
122
int sr_fao, ss_fao, sp_fao, sq_fao;
123
int usii,usjj,uskk,usll,usi,usj,usk,usl,usij;
124
int stab_i,stab_j,stab_k,stab_l,stab_ij,stab_kl;
125
int *R_list, *S_list, *T_list;
127
int dcr_ij, dcr_kl, dcr_ijkl;
129
int num_unique_quartets;
131
int max_num_unique_quartets;
132
int max_num_prim_comb;
134
int size, class_size;
136
int max_cart_class_size;
138
int np_i, np_j, np_k, np_l;
141
int num_ibatch, num_i_per_ibatch, ibatch, ibatch_first, ibatch_length;
142
int imin, imax, jmin;
143
int max_bf_per_shell;
144
int mo_i, mo_j, mo_x, mo_y;
147
int rs_offset, rsi_offset, rsp_offset;
150
char ij_key_string[80];
151
int iounits[NUM_TE_TYPES]; /* integrals file numbers */
153
double *raw_data[NUM_TE_TYPES]; /* pointers to the unnormalized target quartet of integrals */
154
double *data[NUM_TE_TYPES]; /* pointers to the transformed normalized target quartet of integrals */
157
double ssss, ss_r12_ss;
160
double *rsiq_buf[NUM_TE_TYPES]; /* buffer for (rs|iq) integrals, where r,s run over shell sets,
161
i runs over I-batch, q runs over all AOs */
163
double *ix_block_ptr;
164
double *rsix_buf[NUM_TE_TYPES]; /* buffer for (rs|iq) integrals, where r,s run over shell sets,
165
i runs over I-batch, x runs over all MOs */
171
double temp1,temp2,*iq_row,*ip_row;
177
iounits[0] = IOUnits.itapERI_MO;
178
iounits[1] = IOUnits.itapR12_MO;
179
iounits[2] = IOUnits.itapR12T2_MO;
181
/*-------------------------
182
Allocate data structures
183
-------------------------*/
184
max_bf_per_shell = ioff[BasisSet.max_am];
185
max_cart_class_size = (max_bf_per_shell)*
189
max_num_unique_quartets = Symmetry.max_stab_index*
190
Symmetry.max_stab_index*
191
Symmetry.max_stab_index;
192
sj_arr = (int *)malloc(sizeof(int)*max_num_unique_quartets);
193
sk_arr = (int *)malloc(sizeof(int)*max_num_unique_quartets);
194
sl_arr = (int *)malloc(sizeof(int)*max_num_unique_quartets);
195
if (Symmetry.nirreps > 1)
196
max_class_size = max_cart_class_size;
197
#ifdef NONDOUBLE_INTS
198
for(i=0;i<NUM_TE_TYPES;i++)
199
raw_data[i] = init_array(max_cart_class_size);
201
max_num_prim_comb = (BasisSet.max_num_prims*
202
BasisSet.max_num_prims)*
203
(BasisSet.max_num_prims*
204
BasisSet.max_num_prims);
205
init_libr12(&Libr12,BasisSet.max_am-1,max_num_prim_comb);
206
init_fjt_table(&fjt_table);
208
num_i_per_ibatch = RMP2R12_Status.num_i_per_ibatch;
209
num_ibatch = RMP2R12_Status.num_ibatch;
210
ibatch_first = RMP2R12_Status.ibatch_first;
211
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++) {
212
rsiq_buf[te_type] = init_array(num_i_per_ibatch*BasisSet.num_ao*
213
max_bf_per_shell*max_bf_per_shell);
214
rsix_buf[te_type] = init_array(num_i_per_ibatch*MOInfo.num_mo*
215
max_bf_per_shell*max_bf_per_shell);
217
jy_buf = block_matrix(MOInfo.ndocc,MOInfo.num_mo);
218
xy_buf = init_array(MOInfo.num_mo*MOInfo.num_mo);
219
scratch_buf = init_array(MAX(MOInfo.nactdocc*MOInfo.num_mo*
220
num_i_per_ibatch*MOInfo.num_mo,
221
MAX(max_cart_class_size,
222
num_i_per_ibatch*BasisSet.num_ao*
223
max_bf_per_shell*max_bf_per_shell)));
225
/*-----------------------------------
226
generate all unique shell quartets
227
-----------------------------------*/
228
/*--- I-batch loop ---*/
229
for (ibatch=ibatch_first;ibatch<num_ibatch;ibatch++) {
230
imin = ibatch * num_i_per_ibatch + MOInfo.nfrdocc;
231
imax = MIN( imin+num_i_per_ibatch , MOInfo.ndocc );
232
ibatch_length = imax - imin;
233
jmin = MOInfo.nfrdocc;
235
fprintf(outfile," Pass #%d, MO %d through MO %d\n",ibatch,imin+1,imax);
238
/*--- "unique" R,S loop ---*/
240
for (usii=0; usii<Symmetry.num_unique_shells; usii++)
241
for (usjj=0; usjj<=usii; usjj++, usij++) {
242
/*--- Decide if this thread will do this ---*/
243
if ( usij%UserOptions.num_threads != thread_num )
245
usi = usii; usj = usjj;
247
/*----------------------------------------------------------------------
248
NOTE on swapping usi, usj, etc.:
250
For 2-electron integrals of Hermitian operators it does not matter
251
if we swap si and sj, or sk and sl, or even si,sj and sk,sl. It
252
matters here though since [r12,T1] is non-Hermitian! What we want
253
in the end are the integrals of the [r12,T1] operator of this type:
254
(si sj|[r12,T1]|sk sl). If we have to swap bra and ket in the process
255
we will end up with having to compute (sk sl|[r12,T2]|si sj) instead.
256
Therefore if we have to swap bra and ket (swap_ij_kl = 1), then we
257
have to take [r12,T2] integral instead and swap it's bra and ket back.
258
----------------------------------------------------------------------*/
260
/*--- As usual, swap order usi and usj according to their angular momenta ---*/
261
if(BasisSet.shells[Symmetry.us2s[usi]].am < BasisSet.shells[Symmetry.us2s[usj]].am){
267
sii = Symmetry.us2s[usi];
268
sjj = Symmetry.us2s[usj];
269
if (Symmetry.nirreps > 1) {
270
stab_i = Symmetry.atom_positions[BasisSet.shells[sii].center-1];
271
stab_j = Symmetry.atom_positions[BasisSet.shells[sjj].center-1];
272
stab_ij = Symmetry.GnG[stab_i][stab_j];
273
R_list = Symmetry.dcr[stab_i][stab_j];
274
num_ij = Symmetry.dcr_dim[stab_i][stab_j];
280
for(dcr_ij=0;dcr_ij<num_ij;dcr_ij++) {
281
if (Symmetry.nirreps > 1)
286
sj = BasisSet.shells[sjj].trans_vec[R]-1;
288
/*--- "Unique" P,Q loop ---*/
289
for (uskk=0; uskk<Symmetry.num_unique_shells; uskk++)
290
for (usll=0; usll<=uskk; usll++){
292
/*--- For each combination of unique shells generate "petit list" of shells ---*/
293
usk = uskk; usl = usll;
294
/*--- As usual, swap order usk and usl according to their angular momenta ---*/
295
if(BasisSet.shells[Symmetry.us2s[usk]].am < BasisSet.shells[Symmetry.us2s[usl]].am){
300
/*--- DO NOT SWAP bra and ket at this time. Do it later, in the main loop ---*/
301
if(BasisSet.shells[Symmetry.us2s[usi]].am + BasisSet.shells[Symmetry.us2s[usj]].am >
302
BasisSet.shells[Symmetry.us2s[usk]].am + BasisSet.shells[Symmetry.us2s[usl]].am)
307
skk = Symmetry.us2s[usk];
308
sll = Symmetry.us2s[usl];
309
if (Symmetry.nirreps > 1) { /*--- Non-C1 symmetry case ---*/
310
/*--- Generate the petite list of shell quadruplets using DCD approach of Davidson ---*/
311
stab_k = Symmetry.atom_positions[BasisSet.shells[skk].center-1];
312
stab_l = Symmetry.atom_positions[BasisSet.shells[sll].center-1];
313
stab_kl = Symmetry.GnG[stab_k][stab_l];
314
S_list = Symmetry.dcr[stab_k][stab_l];
315
T_list = Symmetry.dcr[stab_ij][stab_kl];
316
lambda_T = Symmetry.nirreps/Symmetry.dcr_deg[stab_ij][stab_kl];
318
memset(sj_arr,0,sizeof(int)*max_num_unique_quartets);
319
memset(sk_arr,0,sizeof(int)*max_num_unique_quartets);
320
memset(sl_arr,0,sizeof(int)*max_num_unique_quartets);
323
for(dcr_ijkl=0;dcr_ijkl<Symmetry.dcr_dim[stab_ij][stab_kl];dcr_ijkl++){
324
T = T_list[dcr_ijkl];
325
sk = BasisSet.shells[skk].trans_vec[T]-1;
326
slll = BasisSet.shells[sll].trans_vec[T]-1;
327
for(dcr_kl=0;dcr_kl<Symmetry.dcr_dim[stab_k][stab_l];dcr_kl++) {
329
sl = BasisSet.shells[slll].trans_vec[S]-1;
331
total_am = BasisSet.shells[si].am +
332
BasisSet.shells[sj].am +
333
BasisSet.shells[sk].am +
334
BasisSet.shells[sl].am;
335
/*-------------------------------------------------------------
336
Obviously redundant or zero cases should be eliminated here!
337
Right now only zero case is eliminated. Redundancies arising
338
in DCD approach when usi == usj etc. may be eliminated too
339
but lambda_T will have to be replaced by an array (it won't be
340
the same for every shell quartet in petite list anymore).
341
-------------------------------------------------------------*/
343
(BasisSet.shells[si].center!=BasisSet.shells[sj].center)||
344
(BasisSet.shells[sj].center!=BasisSet.shells[sk].center)||
345
(BasisSet.shells[sk].center!=BasisSet.shells[sl].center)) {
352
} /* petite list is ready to be used */
353
num_unique_quartets = count;
355
else { /*--- C1 symmetry case ---*/
356
total_am = BasisSet.shells[si].am +
357
BasisSet.shells[usj].am +
358
BasisSet.shells[usk].am +
359
BasisSet.shells[usl].am;
361
(BasisSet.shells[si].center!=BasisSet.shells[usj].center)||
362
(BasisSet.shells[usj].center!=BasisSet.shells[usk].center)||
363
(BasisSet.shells[usk].center!=BasisSet.shells[usl].center)) {
364
num_unique_quartets = 1;
370
num_unique_quartets = 0;
373
/*----------------------------------
374
Compute the nonredundant quartets
375
----------------------------------*/
376
for(plquartet=0;plquartet<num_unique_quartets;plquartet++) {
378
sj = sj_arr[plquartet];
379
sk = sk_arr[plquartet];
380
sl = sl_arr[plquartet];
381
/*--- As usual, we have to order bra-ket so that ket has the largest angular momentum */
390
np_i = BasisSet.shells[si].n_prims;
391
np_j = BasisSet.shells[sj].n_prims;
392
np_k = BasisSet.shells[sk].n_prims;
393
np_l = BasisSet.shells[sl].n_prims;
394
orig_am[0] = BasisSet.shells[si].am-1;
395
orig_am[1] = BasisSet.shells[sj].am-1;
396
orig_am[2] = BasisSet.shells[sk].am-1;
397
orig_am[3] = BasisSet.shells[sl].am-1;
398
am = orig_am[0] + orig_am[1] + orig_am[2] + orig_am[3];
400
sp_ij = &(BasisSet.shell_pairs[si][sj]);
401
sp_kl = &(BasisSet.shell_pairs[sk][sl]);
403
Libr12.ShellQuartet.AB[0] = sp_ij->AB[0];
404
Libr12.ShellQuartet.AB[1] = sp_ij->AB[1];
405
Libr12.ShellQuartet.AB[2] = sp_ij->AB[2];
406
Libr12.ShellQuartet.CD[0] = sp_kl->AB[0];
407
Libr12.ShellQuartet.CD[1] = sp_kl->AB[1];
408
Libr12.ShellQuartet.CD[2] = sp_kl->AB[2];
409
Libr12.ShellQuartet.AC[0] = Molecule.centers[BasisSet.shells[si].center-1].x-
410
Molecule.centers[BasisSet.shells[sk].center-1].x;
411
Libr12.ShellQuartet.AC[1] = Molecule.centers[BasisSet.shells[si].center-1].y-
412
Molecule.centers[BasisSet.shells[sk].center-1].y;
413
Libr12.ShellQuartet.AC[2] = Molecule.centers[BasisSet.shells[si].center-1].z-
414
Molecule.centers[BasisSet.shells[sk].center-1].z;
415
Libr12.ShellQuartet.ABdotAC = Libr12.ShellQuartet.AB[0]*Libr12.ShellQuartet.AC[0]+
416
Libr12.ShellQuartet.AB[1]*Libr12.ShellQuartet.AC[1]+
417
Libr12.ShellQuartet.AB[2]*Libr12.ShellQuartet.AC[2];
418
Libr12.ShellQuartet.CDdotCA = -1.0*(Libr12.ShellQuartet.CD[0]*Libr12.ShellQuartet.AC[0]+
419
Libr12.ShellQuartet.CD[1]*Libr12.ShellQuartet.AC[1]+
420
Libr12.ShellQuartet.CD[2]*Libr12.ShellQuartet.AC[2]);
421
AB2 = Libr12.ShellQuartet.AB[0]*Libr12.ShellQuartet.AB[0]+
422
Libr12.ShellQuartet.AB[1]*Libr12.ShellQuartet.AB[1]+
423
Libr12.ShellQuartet.AB[2]*Libr12.ShellQuartet.AB[2];
424
CD2 = Libr12.ShellQuartet.CD[0]*Libr12.ShellQuartet.CD[0]+
425
Libr12.ShellQuartet.CD[1]*Libr12.ShellQuartet.CD[1]+
426
Libr12.ShellQuartet.CD[2]*Libr12.ShellQuartet.CD[2];
428
/*--------------------------------
429
contract by primitives out here
430
--------------------------------*/
432
for (pi = 0; pi < np_i; pi++)
433
for (pj = 0; pj < np_j; pj++)
434
for (pk = 0; pk < np_k; pk++)
435
for (pl = 0; pl < np_l; pl++){
436
r12_quartet_data(&(Libr12.PrimQuartet[num_prim_comb++]), &fjt_table, AB2, CD2,
437
sp_ij, sp_kl, am, pi, pj, pk, pl, lambda_T);
441
build_r12_grt[orig_am[0]][orig_am[1]][orig_am[2]][orig_am[3]](&Libr12,num_prim_comb);
443
/*--- (usi usj|[r12,T1]|usk usl) = (usk usl|[r12,T2]|usi usj) ---*/
444
Libr12.te_ptr[2] = Libr12.te_ptr[3];
445
#ifdef NONDOUBLE_INTS
446
size = ioff[BasisSet.shells[si].am]*ioff[BasisSet.shells[sj].am]*
447
ioff[BasisSet.shells[sk].am]*ioff[BasisSet.shells[sl].am];
448
for(i=0;i<NUM_TE_TYPES-1;i++)
450
raw_data[i][j] = (double) Libr12.te_ptr[i][j];
452
for(i=0;i<NUM_TE_TYPES-1;i++)
453
raw_data[i] = Libr12.te_ptr[i];
455
/*--- Just normalize the integrals ---*/
456
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++)
457
data[te_type] = norm_quartet(raw_data[te_type], NULL, orig_am, 0);
462
for(p=0;p<num_prim_comb;p++) {
463
ssss += (double) Libr12.PrimQuartet[p].F[0];
464
ss_r12_ss += (double) Libr12.PrimQuartet[p].ss_r12_ss;
466
build_r12_grt[0][0][0][0](&Libr12,num_prim_comb);
467
#ifdef NONDOUBLE_INTS
468
raw_data[0][0] = ssss;
469
raw_data[1][0] = ss_r12_ss;
470
raw_data[2][0] = (double) Libr12.te_ptr[2][0];
471
raw_data[3][0] = (double) Libr12.te_ptr[3][0];
472
data[0] = raw_data[0];
473
data[1] = raw_data[1];
474
data[2] = raw_data[2];
475
data[3] = raw_data[3];
477
Libr12.int_stack[2] = Libr12.te_ptr[2][0];
478
Libr12.int_stack[3] = Libr12.te_ptr[3][0];
479
Libr12.int_stack[0] = ssss;
480
Libr12.int_stack[1] = ss_r12_ss;
481
data[0] = Libr12.int_stack;
482
data[1] = Libr12.int_stack+1;
483
data[2] = Libr12.int_stack+2;
484
data[3] = Libr12.int_stack+3;
489
Swap bra and ket back to the reversed order (PQ|RS) if not done yet
490
Need this to make Step 1 a (fast) vector operation!
491
NOTE: This changes only the way integrals are stored! No need to
492
worry about non-hermiticity of [r12,T1] here!!!
501
sr_fao = BasisSet.shells[si].fao - 1;
502
ss_fao = BasisSet.shells[sj].fao - 1;
503
sp_fao = BasisSet.shells[sk].fao - 1;
504
sq_fao = BasisSet.shells[sl].fao - 1;
505
nr = ioff[BasisSet.shells[si].am];
506
ns = ioff[BasisSet.shells[sj].am];
507
np = ioff[BasisSet.shells[sk].am];
508
nq = ioff[BasisSet.shells[sl].am];
510
No need to swap bra and ket since the quartet
511
was already computed in reverse order
515
sr_fao = BasisSet.shells[si].fao - 1;
516
ss_fao = BasisSet.shells[sj].fao - 1;
517
sp_fao = BasisSet.shells[sk].fao - 1;
518
sq_fao = BasisSet.shells[sl].fao - 1;
519
nr = ioff[BasisSet.shells[si].am];
520
ns = ioff[BasisSet.shells[sj].am];
521
np = ioff[BasisSet.shells[sk].am];
522
nq = ioff[BasisSet.shells[sl].am];
524
Need to swap bra and ket, but do
525
the actual permutation in te_type-loop
529
/*--- step 1 of the transformation ---*/
530
/* timer_on("Step 1");*/
531
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++) {
533
if bra and ket were not swapped before computing
534
the quartet, swap them now so that Step 1 is
537
if ((!swap_ij_kl) && am) {
538
/* timer_on("Pre1Swap");*/
539
/*--- (rs|pq) -> (pq|rs) ---*/
540
ijkl_to_klij(data[te_type],scratch_buf,nr*ns,np*nq);
541
data[te_type] = scratch_buf;
542
/* timer_off("Pre1Swap");*/
545
for(mo_i=0;mo_i<ibatch_length;mo_i++) {
546
mo_vec = MOInfo.scf_evec_occ[0][mo_i+imin];
547
rspq_ptr = data[te_type];
548
for(p=0,p_abs=sp_fao;p<np;p++,p_abs++) {
549
for(q=0,q_abs=sq_fao;
552
ip_row = rsiq_buf[te_type] + ((mo_i*BasisSet.num_ao + p_abs) * nr * ns);
553
temp1 = mo_vec[q_abs];
555
for(rs=0;rs<nr*ns;rs++,rspq_ptr++,iq_row++,ip_row++) {
556
(*ip_row) += temp1 * (*rspq_ptr);
559
C_DAXPY(nr*ns,temp1,rspq_ptr,1,ip_row,1);
564
rspq_ptr = data[te_type];
565
for(p=0,p_abs=sp_fao;p<np;p++,p_abs++) {
566
temp = mo_vec[p_abs];
567
i_row = rsiq_buf[te_type] + ((mo_i*BasisSet.num_ao + sq_fao) * nr * ns);
569
for(qrs=0;qrs<nq*nr*ns;qrs++,i_row++,rspq_ptr++) {
570
(*i_row) += temp * (*rspq_ptr);
573
C_DAXPY(nq*nr*ns,temp,rspq_ptr,1,i_row,1);
574
rspq_ptr += nq*nr*ns;
579
for(mo_i=0;mo_i<ibatch_length;mo_i++) {
580
mo_vec = MOInfo.scf_evec_occ[0][mo_i+imin];
581
rspq_ptr = data[te_type];
582
for(p=0,p_abs=sp_fao;p<np;p++,p_abs++) {
583
temp = mo_vec[p_abs];
584
i_row = rsiq_buf[te_type] + ((mo_i*BasisSet.num_ao + sq_fao) * nr * ns);
586
for(qrs=0;qrs<nq*nr*ns;qrs++,i_row++,rspq_ptr++) {
587
(*i_row) += temp * (*rspq_ptr);
590
C_DAXPY(nq*nr*ns,temp,rspq_ptr,1,i_row,1);
591
rspq_ptr += nq*nr*ns;
596
/* timer_off("Step 1");*/
598
} /* end of computing "petit" list - end of P,Q loop */
599
} /* end of "unique" P,Q loop */
601
/*--- step 2 of the transformation ---*/
602
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++) {
603
/* timer_on("Post1Swap");*/
604
ijkl_to_klij(rsiq_buf[te_type],scratch_buf,ibatch_length*BasisSet.num_ao,nr*ns);
605
rsi_row = scratch_buf;
606
/* timer_off("Post1Swap");*/
607
ix_block_ptr = rsix_buf[te_type];
610
/*--- Can be done as a matrix multiply now ---*/
611
/* timer_on("Step 2");*/
614
for(mo_i=0;mo_i<ibatch_length;mo_i++,rsi_row+=BasisSet.num_ao) {
615
for(mo_x=0;mo_x<MOInfo.num_mo;mo_x++,ix++) {
616
mo_vec = MOInfo.scf_evec[0][mo_x];
617
temp = ix_block_ptr[ix];
618
for(q_abs=0;q_abs<BasisSet.num_ao;q_abs++) {
619
temp += mo_vec[q_abs] * rsi_row[q_abs];
621
ix_block_ptr[ix] = temp;
625
C_DGEMM('n','t',ibatch_length,MOInfo.num_mo,BasisSet.num_ao,1.0,
626
rsi_row,BasisSet.num_ao,MOInfo.scf_evec[0][0],BasisSet.num_ao,
627
0.0,ix_block_ptr,MOInfo.num_mo);
628
rsi_row += BasisSet.num_ao*ibatch_length;
630
ix_block_ptr += MOInfo.num_mo*ibatch_length;
631
/* timer_off("Step 2");*/
635
/*--- step 3 of the transformation ---*/
636
rsi_row = rsix_buf[te_type];
637
/*--- To update (JS|IA) need to lock mutex corresponding to the S and R indices ---*/
639
pthread_mutex_lock(&rmp2r12_sindex_mutex[INDEX(si,sj)]);
643
/* timer_on("Step 3");*/
646
for(mo_i=0;mo_i<ibatch_length;mo_i++,rsi_row+=MOInfo.num_mo) {
647
/*-----------------------------------------------------
648
The mo_j range for Hermitian operators needs to only
649
be <= mo_i; it's just easier to transform [r12,T1]
650
for all mo_j than transform both [r12,T1] and
651
[r12,T2] (need less memory to do the same pass in
653
-----------------------------------------------------*/
654
for(mo_j=0;mo_j<MOInfo.nactdocc;mo_j++) {
656
pthread_mutex_lock(&rmp2r12_sindex_mutex[s_abs]);
658
jsi_row = jsix_buf[te_type] + ((mo_j * BasisSet.num_ao + s_abs) * ibatch_length + mo_i) * MOInfo.num_mo;
659
temp = MOInfo.scf_evec_occ[0][mo_j+jmin][r_abs];
660
for(mo_x=0;mo_x<MOInfo.num_mo;mo_x++) {
661
jsi_row[mo_x] += temp * rsi_row[mo_x];
664
pthread_mutex_unlock(&rmp2r12_sindex_mutex[s_abs]);
668
pthread_mutex_lock(&rmp2r12_sindex_mutex[r_abs]);
670
jsi_row = jsix_buf[te_type] + ((mo_j * BasisSet.num_ao + r_abs) * ibatch_length + mo_i) * MOInfo.num_mo;
671
temp = MOInfo.scf_evec_occ[0][mo_j+jmin][s_abs];
672
if (te_type == 2) /*--- [r12,T1] integral - negative sign ---*/
674
for(mo_x=0;mo_x<MOInfo.num_mo;mo_x++) {
675
jsi_row[mo_x] += temp * rsi_row[mo_x];
678
pthread_mutex_unlock(&rmp2r12_sindex_mutex[r_abs]);
683
/* timer_off("Step 3");*/
687
pthread_mutex_unlock(&rmp2r12_sindex_mutex[INDEX(si,sj)]);
689
memset(rsiq_buf[te_type],0,nr*ns*ibatch_length*BasisSet.num_ao*sizeof(double));
690
memset(rsix_buf[te_type],0,nr*ns*ibatch_length*MOInfo.num_mo*sizeof(double));
692
} /* end of R,S loop */
693
} /* end of "unique" R,S loop */
695
pthread_mutex_lock(&rmp2r12_energy_mutex);
696
RMP2R12_Status.num_arrived++;
697
if (RMP2R12_Status.num_arrived != UserOptions.num_threads) { /*--- there are some threads still working - wait here --*/
698
pthread_cond_wait(&rmp2r12_energy_cond,&rmp2r12_energy_mutex);
699
pthread_mutex_unlock(&rmp2r12_energy_mutex);
701
else { /*--- this is the last thread to get here - do the 4th step and dumping integrals alone and wake everybody up ---*/
702
/*--- step 4 of the transformation ---*/
703
/* timer_on("Step 4");*/
704
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++) {
705
for(mo_i=0;mo_i<ibatch_length;mo_i++) {
706
for(mo_j=0;mo_j<MOInfo.nactdocc;mo_j++) {
707
for(mo_y=0;mo_y<MOInfo.num_mo;mo_y++) {
708
jyi_row = jyix_buf[te_type] + ((mo_j * MOInfo.num_mo + mo_y) * ibatch_length + mo_i) * MOInfo.num_mo;
709
for(s_abs=0;s_abs<BasisSet.num_ao;s_abs++) {
710
jsi_row = jsix_buf[te_type] + ((mo_j * BasisSet.num_ao + s_abs) * ibatch_length + mo_i) * MOInfo.num_mo;
711
temp = MOInfo.scf_evec[0][mo_y][s_abs];
712
for(mo_x=0;mo_x<MOInfo.num_mo;mo_x++) {
713
jyi_row[mo_x] += temp * jsi_row[mo_x];
720
/* timer_off("Step 4");*/
723
/*--- Print them out for now ---*/
724
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++) {
725
fprintf(outfile," Transformed integrals of type %d\n",te_type);
726
for(mo_i=0;mo_i<ibatch_length;mo_i++) {
727
for(mo_j=0;mo_j<MOInfo.nactdocc;mo_j++) {
728
for(mo_y=0;mo_y<MOInfo.num_mo;mo_y++) {
729
for(mo_x=0;mo_x<MOInfo.num_mo;mo_x++) {
730
temp = jyix_buf[te_type][((mo_j * MOInfo.num_mo + mo_y) * ibatch_length + mo_i) * MOInfo.num_mo + mo_x];
731
if (fabs(temp) > ZERO) {
733
fprintf(outfile,"<%d %d %d %d [%d] [%d] = %20.10lf\n",
734
mo_j+jmin,mo_y,mo_i+imin,mo_x,
735
mo_j*MOInfo.num_mo+mo_y,
736
mo_i*MOInfo.num_mo+mo_x,
739
fprintf(outfile,"<%d %d %d %d [%d] [%d] = %20.10lf\n",
740
mo_i+imin,mo_x,mo_j+jmin,mo_y,
741
mo_i*MOInfo.num_mo+mo_x,
742
mo_j*MOInfo.num_mo+mo_y,
752
/*--- Files are opened and closed each pass to ensure integrity of TOCs
753
if restart ever needed ---*/
754
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++)
755
psio_open(iounits[te_type], (ibatch != 0) ? PSIO_OPEN_OLD : PSIO_OPEN_NEW);
757
/*--------------------------------------------------------
758
Dump all fully transformed integrals to disk. Zero out
759
non-symmetrical integrals (what's left of the Pitzer's
760
equal contribution theorem in Abelian case).
761
--------------------------------------------------------*/
762
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++) {
763
/*--------------------------------------------------------------------
764
Write integrals out in num_mo by num_mo batches corresponding to
766
--------------------------------------------------------------------*/
767
for(mo_i=0;mo_i<ibatch_length;mo_i++) {
768
i = act2fullQTS[mo_i+imin]; /*--- mo_i+imin is the index in QTS "active" indexing scheme
769
we need i to be in QTS "full" indexing scheme, that's
770
what the MP2R12 code expects ---*/
771
isym = MOInfo.mo2symblk_occ[0][mo_i+imin];
772
for(mo_j=0;mo_j<MOInfo.nactdocc;mo_j++) {
773
jsym = MOInfo.mo2symblk_occ[0][mo_j+jmin];
774
j = act2fullQTS[mo_j+jmin]; /*--- Again, get the "full" QTS index ---*/
775
sprintf(ij_key_string,"Block_%d_x_%d_y",i,j);
776
memset(xy_buf,0,MOInfo.num_mo*MOInfo.num_mo*sizeof(double));
777
/*--- Put all integrals with common i and j into a buffer ---*/
778
for(mo_x=0,xy=0;mo_x<MOInfo.num_mo;mo_x++) {
779
x = mo_x; /*--- The second index is a Pitzer index, that's fine ---*/
780
xsym = MOInfo.mo2symblk[x];
781
for(mo_y=0;mo_y<MOInfo.num_mo;mo_y++,xy++) {
782
y = mo_y; /*--- Again, the Pitzer index here is what we need ---*/
783
ysym = MOInfo.mo2symblk[y];
784
/*--- Skip this integral if it's non-totally symmetric -
785
Pitzer's contribution theorem in Abelian case ---*/
786
if ((isym ^ jsym) ^ (xsym ^ ysym))
788
/*--- find the integral in ixjy_buf and put it in xy_buf ---*/
789
m = (((mo_j*MOInfo.num_mo + mo_y)*ibatch_length + mo_i)*MOInfo.num_mo + mo_x);
790
xy_buf[xy] = jyix_buf[te_type][m];
793
psio_write_entry(iounits[te_type], ij_key_string, (char *)xy_buf,
794
MOInfo.num_mo*MOInfo.num_mo*sizeof(double));
797
psio_close(iounits[te_type], 1);
800
if (ibatch < num_ibatch-1)
801
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++) {
802
memset(jsix_buf[te_type],0,MOInfo.nactdocc*BasisSet.num_ao*ibatch_length*MOInfo.num_mo*sizeof(double));
803
memset(jyix_buf[te_type],0,MOInfo.nactdocc*MOInfo.num_mo*ibatch_length*MOInfo.num_mo*sizeof(double));
805
/*--- Done with the non-threaded part - wake everybody up and prepare for the next ibatch ---*/
806
RMP2R12_Status.num_arrived = 0;
807
pthread_cond_broadcast(&rmp2r12_energy_cond);
808
pthread_mutex_unlock(&rmp2r12_energy_mutex);
810
} /* End of "I"-loop */
815
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++) {
816
free(rsiq_buf[te_type]);
817
free(rsix_buf[te_type]);
821
free_libr12(&Libr12);
822
free_fjt_table(&fjt_table);
823
#ifdef NONDOUBLE_INTS
824
for(i=0;i<NUM_TE_TYPES;i++)