6
#include<libciomr/libciomr.h>
8
#include<libpsio/psio.h>
9
#include<libint/libint.h>
10
#include<libr12/libr12.h>
17
#include"r12_quartet_data.h"
18
#include"norm_quartet.h"
20
#include"quartet_permutations.h"
21
#include"rmp2r12_energy.h"
23
namespace psi{ namespace CINTS{ namespace ump2r12_aa{
25
/*-------------------------------------------------------
28
***Split into threads. Each thread does the following:
30
Loop over I batches (batch size num_i_per_batch) of active DOCC
32
Loop over all symmetry-unique shells UR, US<=UR
33
***if this UR, US is to be handled by this thread - proceed, else skip to next one
34
Find all symmetry-distinct shell doublets resulting from (UR US|
35
Loop over all resulting shell doublets R, S
37
Loop over all symmetry-unique shells UP, UQ<=UP
38
Find all symmetry-distinct shell quartets resulting from (R S|UP UQ)
39
Loop over the resulting set of P, Q doublets
40
Evaluate (RS|PQ), (RS|r12|PQ), and (RS|[r12,T1]|PQ)
41
Loop over p in P, q in Q, r in R, s in S, i in I
42
(rs|iq) += Cpi * (rs|pq)
43
(rs|ip) += Cqi * (rs|pq)
44
same for (rs|r12|pq) and (rs|[r12,T1]|pq)
45
End p, q, r, s, i loop
49
Loop over r in R, s in S
50
Loop over q < nao, x < num_mo, i in I
51
(rs|ix) += Cxs * (rs|is)
52
same for (rs|r12|is) and (rs|[r12,T1]|is)
55
***Lock (js| and (jr| (either individual or shell blocks depending on LOCK_RS_SHELL in rmp2r12_energy.h)
56
Loop over r in R, s in S
57
Loop over i in I, x < num_mo, j <= i
58
(js|ix) += Cjr * (rs|ix)
59
(jr|ix) += Cjs * (rs|ix)
60
same for (rs|r12|ix), but
61
(js|[r12,T1]|ix) += Cjr * (rs|[r12,T1]|ix)
62
(jr|[r12,T1]|ix) -= Cjs * (rs|[r12,T1]|ix) <---- Note the minus sign here!!!
65
***Unlock (js| and (jr|
70
***Barrier: threads wait until everyone is done
71
***Do the following in one thread only
72
Loop over i in I, j <= i
73
Loop over r < nao, x < num_mo, y < num_mo
74
(jy|ix) += Cys * (js|ix)
75
same for (js|r12|ix) and (js|[r12,T1]|ix)
83
-------------------------------------------------------*/
86
void *ump2r12_energy_thread_aa(void *tnum_ptr)
88
const long int thread_num = (long int) tnum_ptr;
89
const double toler = UserOptions.cutoff;
91
extern pthread_mutex_t rmp2r12_energy_mutex;
92
extern pthread_mutex_t *rmp2r12_sindex_mutex;
93
extern pthread_cond_t rmp2r12_energy_cond;
94
extern RMP2R12_Status_t RMP2R12_Status;
95
extern double *jsix_buf[NUM_TE_TYPES]; /* buffer for (js|ia) integrals, where j runs over all alpha occ act MOs,
96
s runs over all AOs, i - over I-batch, x - over all MOs */
97
extern double *jyix_buf[NUM_TE_TYPES]; /* buffer contains all MP2-R12/A-type integrals (AA|AA) */
98
extern double *jsix_buf2[NUM_TE_TYPES]; /* buffer for (js|ia) integrals, where j runs over all beta occ act MOs,
99
s runs over all AOs, i - over I-batch, x - over all MOs */
100
extern double *jyix_buf2[NUM_TE_TYPES]; /* buffer contains all MP2-R12/A-type integrals (AA|BB)*/
102
extern int *first, *last; /* first and last absolute (Pitzer) orbital indices in symblk */
103
extern int *fstocc_alpha, *lstocc_alpha; /* first and last occupied indices in Pitzer ordering for each symblk */
104
extern int *fstocc_beta, *lstocc_beta; /* first and last occupied indices in Pitzer ordering for each symblk */
105
extern int *occ_alpha, *occ_beta; /* Pitzer to "full"(no frozen core) QTS index mapping */
106
extern int *act2fullQTS_alpha; /* Maps "active"(taking frozen core into account) QTS into "frozen" QTS index */
107
extern int *act2fullQTS_beta; /* Maps "active"(taking frozen core into account) QTS into "frozen" QTS index */
108
extern int *ioff3; /* returns pointers to the beginning of rows in a rectangular matrix */
110
struct shell_pair *sp_ij, *sp_kl;
112
double_array_t fjt_table;
114
int ij, kl, ik, jl, ijkl;
115
int count, dum, status;
117
int n, num[NUM_TE_TYPES];
120
int pkblock_end_index = -1;
121
int i, j, m, p, q, r, s, x, y;
122
int isym, jsym, xsym, ysym;
123
int p_abs, q_abs, r_abs, s_abs;
125
int sii, sjj, skk, sll , slll;
126
int num_ij, swap_ij_kl;
128
int *sj_arr, *sk_arr, *sl_arr;
129
int sr_fao, ss_fao, sp_fao, sq_fao;
130
int usii,usjj,uskk,usll,usi,usj,usk,usl,usij;
131
int stab_i,stab_j,stab_k,stab_l,stab_ij,stab_kl;
132
int *R_list, *S_list, *T_list;
134
int dcr_ij, dcr_kl, dcr_ijkl;
136
int num_unique_quartets;
138
int max_num_unique_quartets;
139
int max_num_prim_comb;
141
int size, class_size;
143
int max_cart_class_size;
145
int np_i, np_j, np_k, np_l;
148
int num_ibatch, num_i_per_ibatch, ibatch, ibatch_first, ibatch_length;
149
int imin, imax, jmin;
150
int max_bf_per_shell;
151
int mo_i, mo_j, mo_x, mo_y;
154
int rs_offset, rsi_offset, rsp_offset;
157
char ij_key_string[80];
158
int iounits[NUM_TE_TYPES]; /* integrals file numbers */
159
int iounits2[NUM_TE_TYPES]; /* integrals file numbers */
161
double *raw_data[NUM_TE_TYPES]; /* pointers to the unnormalized target quartet of integrals */
162
double *data[NUM_TE_TYPES]; /* pointers to the transformed normalized target quartet of integrals */
165
double ssss, ss_r12_ss;
168
double *rsiq_buf[NUM_TE_TYPES]; /* buffer for (rs|iq) integrals, where r,s run over shell sets,
169
i runs over I-batch, q runs over all AOs */
171
double *ix_block_ptr;
172
double *rsix_buf[NUM_TE_TYPES]; /* buffer for (rs|iq) integrals, where r,s run over shell sets,
173
i runs over I-batch, x runs over all MOs */
179
double temp1,temp2,*iq_row,*ip_row;
183
/*-------------------------
184
Allocate data structures
185
-------------------------*/
186
max_bf_per_shell = ioff[BasisSet.max_am];
187
max_cart_class_size = (max_bf_per_shell)*
191
max_num_unique_quartets = Symmetry.max_stab_index*
192
Symmetry.max_stab_index*
193
Symmetry.max_stab_index;
194
sj_arr = (int *)malloc(sizeof(int)*max_num_unique_quartets);
195
sk_arr = (int *)malloc(sizeof(int)*max_num_unique_quartets);
196
sl_arr = (int *)malloc(sizeof(int)*max_num_unique_quartets);
197
if (Symmetry.nirreps > 1)
198
max_class_size = max_cart_class_size;
199
#ifdef NONDOUBLE_INTS
200
for(i=0;i<NUM_TE_TYPES;i++)
201
raw_data[i] = init_array(max_cart_class_size);
203
max_num_prim_comb = (BasisSet.max_num_prims*
204
BasisSet.max_num_prims)*
205
(BasisSet.max_num_prims*
206
BasisSet.max_num_prims);
207
init_libr12(&Libr12,BasisSet.max_am-1,max_num_prim_comb);
208
init_fjt_table(&fjt_table);
210
num_i_per_ibatch = RMP2R12_Status.num_i_per_ibatch;
211
num_ibatch = RMP2R12_Status.num_ibatch;
212
ibatch_first = RMP2R12_Status.ibatch_first;
213
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++) {
214
rsiq_buf[te_type] = init_array(num_i_per_ibatch*BasisSet.num_ao*
215
max_bf_per_shell*max_bf_per_shell);
216
rsix_buf[te_type] = init_array(num_i_per_ibatch*MOInfo.num_mo*
217
max_bf_per_shell*max_bf_per_shell);
219
jy_buf = block_matrix(MOInfo.alpha_occ,MOInfo.num_mo);
220
xy_buf = init_array(MOInfo.num_mo*MOInfo.num_mo);
221
scratch_buf = init_array(MAX(MOInfo.alpha_act_occ*MOInfo.num_mo*
222
num_i_per_ibatch*MOInfo.num_mo,
223
MAX(max_cart_class_size,
224
num_i_per_ibatch*BasisSet.num_ao*
225
max_bf_per_shell*max_bf_per_shell)));
227
/*-----------------------------------
228
generate all unique shell quartets
229
-----------------------------------*/
230
/*--- I-batch loop ---*/
231
for (ibatch=ibatch_first;ibatch<num_ibatch;ibatch++) {
232
imin = ibatch * num_i_per_ibatch + MOInfo.nfrdocc;
233
imax = MIN( imin+num_i_per_ibatch , MOInfo.alpha_occ );
234
ibatch_length = imax - imin;
235
jmin = MOInfo.nfrdocc;
237
fprintf(outfile," Pass #%d, MO %d through MO %d\n",ibatch,imin+1,imax);
240
/*--- "unique" R,S loop ---*/
242
for (usii=0; usii<Symmetry.num_unique_shells; usii++)
243
for (usjj=0; usjj<=usii; usjj++, usij++) {
244
/*--- Decide if this thread will do this ---*/
245
if ( usij%UserOptions.num_threads != thread_num )
247
usi = usii; usj = usjj;
249
/*----------------------------------------------------------------------
250
NOTE on swapping usi, usj, etc.:
252
For 2-electron integrals of Hermitian operators it does not matter
253
if we swap si and sj, or sk and sl, or even si,sj and sk,sl. It
254
matters here though since [r12,T1] is non-Hermitian! What we want
255
in the end are the integrals of the [r12,T1] operator of this type:
256
(si sj|[r12,T1]|sk sl). If we have to swap bra and ket in the process
257
we will end up with having to compute (sk sl|[r12,T2]|si sj) instead.
258
Therefore if we have to swap bra and ket (swap_ij_kl = 1), then we
259
have to take [r12,T2] integral instead and swap it's bra and ket back.
260
----------------------------------------------------------------------*/
262
/*--- As usual, swap order usi and usj according to their angular momenta ---*/
263
if(BasisSet.shells[Symmetry.us2s[usi]].am < BasisSet.shells[Symmetry.us2s[usj]].am){
269
sii = Symmetry.us2s[usi];
270
sjj = Symmetry.us2s[usj];
271
if (Symmetry.nirreps > 1) {
272
stab_i = Symmetry.atom_positions[BasisSet.shells[sii].center-1];
273
stab_j = Symmetry.atom_positions[BasisSet.shells[sjj].center-1];
274
stab_ij = Symmetry.GnG[stab_i][stab_j];
275
R_list = Symmetry.dcr[stab_i][stab_j];
276
num_ij = Symmetry.dcr_dim[stab_i][stab_j];
282
for(dcr_ij=0;dcr_ij<num_ij;dcr_ij++) {
283
if (Symmetry.nirreps > 1)
288
sj = BasisSet.shells[sjj].trans_vec[R]-1;
290
/*--- "Unique" P,Q loop ---*/
291
for (uskk=0; uskk<Symmetry.num_unique_shells; uskk++)
292
for (usll=0; usll<=uskk; usll++){
294
/*--- For each combination of unique shells generate "petit list" of shells ---*/
295
usk = uskk; usl = usll;
296
/*--- As usual, swap order usk and usl according to their angular momenta ---*/
297
if(BasisSet.shells[Symmetry.us2s[usk]].am < BasisSet.shells[Symmetry.us2s[usl]].am){
302
/*--- DO NOT SWAP bra and ket at this time. Do it later, in the main loop ---*/
303
if(BasisSet.shells[Symmetry.us2s[usi]].am + BasisSet.shells[Symmetry.us2s[usj]].am >
304
BasisSet.shells[Symmetry.us2s[usk]].am + BasisSet.shells[Symmetry.us2s[usl]].am)
309
skk = Symmetry.us2s[usk];
310
sll = Symmetry.us2s[usl];
311
if (Symmetry.nirreps > 1) { /*--- Non-C1 symmetry case ---*/
312
/*--- Generate the petite list of shell quadruplets using DCD approach of Davidson ---*/
313
stab_k = Symmetry.atom_positions[BasisSet.shells[skk].center-1];
314
stab_l = Symmetry.atom_positions[BasisSet.shells[sll].center-1];
315
stab_kl = Symmetry.GnG[stab_k][stab_l];
316
S_list = Symmetry.dcr[stab_k][stab_l];
317
T_list = Symmetry.dcr[stab_ij][stab_kl];
318
lambda_T = Symmetry.nirreps/Symmetry.dcr_deg[stab_ij][stab_kl];
320
memset(sj_arr,0,sizeof(int)*max_num_unique_quartets);
321
memset(sk_arr,0,sizeof(int)*max_num_unique_quartets);
322
memset(sl_arr,0,sizeof(int)*max_num_unique_quartets);
325
for(dcr_ijkl=0;dcr_ijkl<Symmetry.dcr_dim[stab_ij][stab_kl];dcr_ijkl++){
326
T = T_list[dcr_ijkl];
327
sk = BasisSet.shells[skk].trans_vec[T]-1;
328
slll = BasisSet.shells[sll].trans_vec[T]-1;
329
for(dcr_kl=0;dcr_kl<Symmetry.dcr_dim[stab_k][stab_l];dcr_kl++) {
331
sl = BasisSet.shells[slll].trans_vec[S]-1;
333
total_am = BasisSet.shells[si].am +
334
BasisSet.shells[sj].am +
335
BasisSet.shells[sk].am +
336
BasisSet.shells[sl].am;
337
/*-------------------------------------------------------------
338
Obviously redundant or zero cases should be eliminated here!
339
Right now only zero case is eliminated. Redundancies arising
340
in DCD approach when usi == usj etc. may be eliminated too
341
but lambda_T will have to be replaced by an array (it won't
342
the same for every shell quartet in petite list anymore).
343
-------------------------------------------------------------*/
345
(BasisSet.shells[si].center!=BasisSet.shells[sj].center)||
346
(BasisSet.shells[sj].center!=BasisSet.shells[sk].center)||
347
(BasisSet.shells[sk].center!=BasisSet.shells[sl].center)) {
354
} /* petite list is ready to be used */
355
num_unique_quartets = count;
357
else { /*--- C1 symmetry case ---*/
358
total_am = BasisSet.shells[si].am +
359
BasisSet.shells[usj].am +
360
BasisSet.shells[usk].am +
361
BasisSet.shells[usl].am;
363
(BasisSet.shells[si].center!=BasisSet.shells[usj].center)||
364
(BasisSet.shells[usj].center!=BasisSet.shells[usk].center)||
365
(BasisSet.shells[usk].center!=BasisSet.shells[usl].center)) {
366
num_unique_quartets = 1;
372
num_unique_quartets = 0;
375
/*----------------------------------
376
Compute the nonredundant quartets
377
----------------------------------*/
378
for(plquartet=0;plquartet<num_unique_quartets;plquartet++) {
380
sj = sj_arr[plquartet];
381
sk = sk_arr[plquartet];
382
sl = sl_arr[plquartet];
383
/*--- As usual, we have to order bra-ket so that ket has the largest angular momentum */
392
np_i = BasisSet.shells[si].n_prims;
393
np_j = BasisSet.shells[sj].n_prims;
394
np_k = BasisSet.shells[sk].n_prims;
395
np_l = BasisSet.shells[sl].n_prims;
396
orig_am[0] = BasisSet.shells[si].am-1;
397
orig_am[1] = BasisSet.shells[sj].am-1;
398
orig_am[2] = BasisSet.shells[sk].am-1;
399
orig_am[3] = BasisSet.shells[sl].am-1;
400
am = orig_am[0] + orig_am[1] + orig_am[2] + orig_am[3];
402
sp_ij = &(BasisSet.shell_pairs[si][sj]);
403
sp_kl = &(BasisSet.shell_pairs[sk][sl]);
405
Libr12.ShellQuartet.AB[0] = sp_ij->AB[0];
406
Libr12.ShellQuartet.AB[1] = sp_ij->AB[1];
407
Libr12.ShellQuartet.AB[2] = sp_ij->AB[2];
408
Libr12.ShellQuartet.CD[0] = sp_kl->AB[0];
409
Libr12.ShellQuartet.CD[1] = sp_kl->AB[1];
410
Libr12.ShellQuartet.CD[2] = sp_kl->AB[2];
411
Libr12.ShellQuartet.AC[0] = Molecule.centers[BasisSet.shells[si].center-1].x-
412
Molecule.centers[BasisSet.shells[sk].center-1].x;
413
Libr12.ShellQuartet.AC[1] = Molecule.centers[BasisSet.shells[si].center-1].y-
414
Molecule.centers[BasisSet.shells[sk].center-1].y;
415
Libr12.ShellQuartet.AC[2] = Molecule.centers[BasisSet.shells[si].center-1].z-
416
Molecule.centers[BasisSet.shells[sk].center-1].z;
417
Libr12.ShellQuartet.ABdotAC = Libr12.ShellQuartet.AB[0]*Libr12.ShellQuartet.AC[0]+
418
Libr12.ShellQuartet.AB[1]*Libr12.ShellQuartet.AC[1]+
419
Libr12.ShellQuartet.AB[2]*Libr12.ShellQuartet.AC[2];
420
Libr12.ShellQuartet.CDdotCA = -1.0*(Libr12.ShellQuartet.CD[0]*Libr12.ShellQuartet.AC[0]+
421
Libr12.ShellQuartet.CD[1]*Libr12.ShellQuartet.AC[1]+
422
Libr12.ShellQuartet.CD[2]*Libr12.ShellQuartet.AC[2]);
423
AB2 = Libr12.ShellQuartet.AB[0]*Libr12.ShellQuartet.AB[0]+
424
Libr12.ShellQuartet.AB[1]*Libr12.ShellQuartet.AB[1]+
425
Libr12.ShellQuartet.AB[2]*Libr12.ShellQuartet.AB[2];
426
CD2 = Libr12.ShellQuartet.CD[0]*Libr12.ShellQuartet.CD[0]+
427
Libr12.ShellQuartet.CD[1]*Libr12.ShellQuartet.CD[1]+
428
Libr12.ShellQuartet.CD[2]*Libr12.ShellQuartet.CD[2];
430
/*--------------------------------
431
contract by primitives out here
432
--------------------------------*/
434
for (pi = 0; pi < np_i; pi++)
435
for (pj = 0; pj < np_j; pj++)
436
for (pk = 0; pk < np_k; pk++)
437
for (pl = 0; pl < np_l; pl++){
438
r12_quartet_data(&(Libr12.PrimQuartet[num_prim_comb++]), &fjt_table, AB2, CD2,
439
sp_ij, sp_kl, am, pi, pj, pk, pl, lambda_T);
443
build_r12_grt[orig_am[0]][orig_am[1]][orig_am[2]][orig_am[3]](&Libr12,num_prim_comb);
445
/*--- (usi usj|[r12,T1]|usk usl) = (usk usl|[r12,T2]|usi usj) ---*/
446
Libr12.te_ptr[2] = Libr12.te_ptr[3];
447
#ifdef NONDOUBLE_INTS
448
size = ioff[BasisSet.shells[si].am]*ioff[BasisSet.shells[sj].am]*
449
ioff[BasisSet.shells[sk].am]*ioff[BasisSet.shells[sl].am];
450
for(i=0;i<NUM_TE_TYPES-1;i++)
452
raw_data[i][j] = (double) Libr12.te_ptr[i][j];
454
for(i=0;i<NUM_TE_TYPES-1;i++)
455
raw_data[i] = Libr12.te_ptr[i];
457
/*--- Just normalize the integrals ---*/
458
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++)
459
data[te_type] = norm_quartet(raw_data[te_type], NULL, orig_am, 0);
464
for(p=0;p<num_prim_comb;p++) {
465
ssss += (double) Libr12.PrimQuartet[p].F[0];
466
ss_r12_ss += (double) Libr12.PrimQuartet[p].ss_r12_ss;
468
build_r12_grt[0][0][0][0](&Libr12,num_prim_comb);
469
#ifdef NONDOUBLE_INTS
470
raw_data[0][0] = ssss;
471
raw_data[1][0] = ss_r12_ss;
472
raw_data[2][0] = (double) Libr12.te_ptr[2][0];
473
raw_data[3][0] = (double) Libr12.te_ptr[3][0];
474
data[0] = raw_data[0];
475
data[1] = raw_data[1];
476
data[2] = raw_data[2];
477
data[3] = raw_data[3];
479
Libr12.int_stack[2] = Libr12.te_ptr[2][0];
480
Libr12.int_stack[3] = Libr12.te_ptr[3][0];
481
Libr12.int_stack[0] = ssss;
482
Libr12.int_stack[1] = ss_r12_ss;
483
data[0] = Libr12.int_stack;
484
data[1] = Libr12.int_stack+1;
485
data[2] = Libr12.int_stack+2;
486
data[3] = Libr12.int_stack+3;
491
Swap bra and ket back to the reversed order (PQ|RS) if not done yet
492
Need this to make Step 1 a (fast) vector operation!
493
NOTE: This changes only the way integrals are stored! No need to
494
worry about non-hermiticity of [r12,T1] here!!!
503
sr_fao = BasisSet.shells[si].fao - 1;
504
ss_fao = BasisSet.shells[sj].fao - 1;
505
sp_fao = BasisSet.shells[sk].fao - 1;
506
sq_fao = BasisSet.shells[sl].fao - 1;
507
nr = ioff[BasisSet.shells[si].am];
508
ns = ioff[BasisSet.shells[sj].am];
509
np = ioff[BasisSet.shells[sk].am];
510
nq = ioff[BasisSet.shells[sl].am];
512
No need to swap bra and ket since the quartet
513
was already computed in reverse order
517
sr_fao = BasisSet.shells[si].fao - 1;
518
ss_fao = BasisSet.shells[sj].fao - 1;
519
sp_fao = BasisSet.shells[sk].fao - 1;
520
sq_fao = BasisSet.shells[sl].fao - 1;
521
nr = ioff[BasisSet.shells[si].am];
522
ns = ioff[BasisSet.shells[sj].am];
523
np = ioff[BasisSet.shells[sk].am];
524
nq = ioff[BasisSet.shells[sl].am];
526
Need to swap bra and ket, but do
527
the actual permutation in te_type-loop
531
/*--- step 1 of the transformation ---*/
532
/* timer_on("Step 1");*/
533
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++) {
535
if bra and ket were not swapped before computing
536
the quartet, swap them now so that Step 1 is
539
if ((!swap_ij_kl) && am) {
540
/* timer_on("Pre1Swap");*/
541
/*--- (rs|pq) -> (pq|rs) ---*/
542
ijkl_to_klij(data[te_type],scratch_buf,nr*ns,np*nq);
543
data[te_type] = scratch_buf;
544
/* timer_off("Pre1Swap");*/
547
for(mo_i=0;mo_i<ibatch_length;mo_i++) {
548
mo_vec = MOInfo.scf_evec_occ_alpha[mo_i+imin];
549
rspq_ptr = data[te_type];
550
for(p=0,p_abs=sp_fao;p<np;p++,p_abs++) {
551
for(q=0,q_abs=sq_fao;
554
ip_row = rsiq_buf[te_type] + ((mo_i*BasisSet.num_ao + p_abs) * nr * ns);
555
temp1 = mo_vec[q_abs];
557
for(rs=0;rs<nr*ns;rs++,rspq_ptr++,iq_row++,ip_row++) {
558
(*ip_row) += temp1 * (*rspq_ptr);
561
C_DAXPY(nr*ns,temp1,rspq_ptr,1,ip_row,1);
566
rspq_ptr = data[te_type];
567
for(p=0,p_abs=sp_fao;p<np;p++,p_abs++) {
568
temp = mo_vec[p_abs];
569
i_row = rsiq_buf[te_type] + ((mo_i*BasisSet.num_ao + sq_fao) * nr * ns);
571
for(qrs=0;qrs<nq*nr*ns;qrs++,i_row++,rspq_ptr++) {
572
(*i_row) += temp * (*rspq_ptr);
575
C_DAXPY(nq*nr*ns,temp,rspq_ptr,1,i_row,1);
576
rspq_ptr += nq*nr*ns;
581
for(mo_i=0;mo_i<ibatch_length;mo_i++) {
582
mo_vec = MOInfo.scf_evec_occ_alpha[mo_i+imin];
583
rspq_ptr = data[te_type];
584
for(p=0,p_abs=sp_fao;p<np;p++,p_abs++) {
585
temp = mo_vec[p_abs];
586
i_row = rsiq_buf[te_type] + ((mo_i*BasisSet.num_ao + sq_fao) * nr * ns);
588
for(qrs=0;qrs<nq*nr*ns;qrs++,i_row++,rspq_ptr++) {
589
(*i_row) += temp * (*rspq_ptr);
592
C_DAXPY(nq*nr*ns,temp,rspq_ptr,1,i_row,1);
593
rspq_ptr += nq*nr*ns;
598
/* timer_off("Step 1");*/
600
} /* end of computing "petit" list - end of P,Q loop */
601
} /* end of "unique" P,Q loop */
603
/*--- step 2 of the transfromation ---*/
604
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++) {
605
/* timer_on("Post1Swap");*/
606
ijkl_to_klij(rsiq_buf[te_type],scratch_buf,ibatch_length*BasisSet.num_ao,nr*ns);
607
rsi_row = scratch_buf;
608
/* timer_off("Post1Swap");*/
609
ix_block_ptr = rsix_buf[te_type];
612
/*--- Can be done as a matrix multiply now ---*/
613
/* timer_on("Step 2");*/
616
for(mo_i=0;mo_i<ibatch_length;mo_i++,rsi_row+=BasisSet.num_ao) {
617
for(mo_x=0;mo_x<MOInfo.num_mo;mo_x++,ix++) {
618
mo_vec = MOInfo.scf_evec_alpha[mo_x];
619
temp = ix_block_ptr[ix];
620
for(q_abs=0;q_abs<BasisSet.num_ao;q_abs++) {
621
temp += mo_vec[q_abs] * rsi_row[q_abs];
623
ix_block_ptr[ix] = temp;
627
C_DGEMM('n','t',ibatch_length,MOInfo.num_mo,BasisSet.num_ao,1.0,
628
rsi_row,BasisSet.num_ao,MOInfo.scf_evec_alpha[0],BasisSet.num_ao,
629
0.0,ix_block_ptr,MOInfo.num_mo);
630
rsi_row += BasisSet.num_ao*ibatch_length;
632
ix_block_ptr += MOInfo.num_mo*ibatch_length;
633
/* timer_off("Step 2");*/
637
/*--- step 3 of the transformation (AA|AA) spin case---*/
638
rsi_row = rsix_buf[te_type];
639
/*--- To update (JS|IA) need to lock mutex corresponding to the S and R indices ---*/
641
pthread_mutex_lock(&rmp2r12_sindex_mutex[INDEX(si,sj)]);
645
/* timer_on("Step 3");*/
648
for(mo_i=0;mo_i<ibatch_length;mo_i++,rsi_row+=MOInfo.num_mo) {
649
for(mo_j=0;mo_j<(te_type==2 ? MOInfo.alpha_act_occ : mo_i);mo_j++) {
651
pthread_mutex_lock(&rmp2r12_sindex_mutex[s_abs]);
653
jsi_row = jsix_buf[te_type] + ((mo_j * BasisSet.num_ao + s_abs) * ibatch_length + mo_i) * MOInfo.num_mo;
654
temp = MOInfo.scf_evec_occ_alpha[mo_j+jmin][r_abs];
655
for(mo_x=0;mo_x<MOInfo.num_mo;mo_x++) {
656
jsi_row[mo_x] += temp * rsi_row[mo_x];
659
pthread_mutex_unlock(&rmp2r12_sindex_mutex[s_abs]);
663
pthread_mutex_lock(&rmp2r12_sindex_mutex[r_abs]);
665
jsi_row = jsix_buf[te_type] + ((mo_j * BasisSet.num_ao + r_abs) * ibatch_length + mo_i) * MOInfo.num_mo;
666
temp = MOInfo.scf_evec_occ_alpha[mo_j+jmin][s_abs];
667
if (te_type == 2) /*--- [r12,T1] integral - negative sign ---*/
669
for(mo_x=0;mo_x<MOInfo.num_mo;mo_x++) {
670
jsi_row[mo_x] += temp * rsi_row[mo_x];
673
pthread_mutex_unlock(&rmp2r12_sindex_mutex[r_abs]);
678
/* timer_off("Step 3");*/
682
pthread_mutex_unlock(&rmp2r12_sindex_mutex[INDEX(si,sj)]);
686
/*--- step 3 of the transformation (AA|BB) spin case---*/
687
rsi_row = rsix_buf[te_type];
688
/*--- To update (JS|IA) need to lock mutex corresponding to the S and R indices ---*/
690
pthread_mutex_lock(&rmp2r12_sindex_mutex[INDEX(si,sj)]);
694
/* timer_on("Step 3");*/
697
for(mo_i=0;mo_i<ibatch_length;mo_i++,rsi_row+=MOInfo.num_mo) {
698
for(mo_j=0;mo_j<MOInfo.beta_act_occ;mo_j++) {
700
pthread_mutex_lock(&rmp2r12_sindex_mutex[s_abs]);
702
jsi_row = jsix_buf2[te_type] + ((mo_j * BasisSet.num_ao + s_abs) * ibatch_length + mo_i) * MOInfo.num_mo;
703
temp = MOInfo.scf_evec_occ_beta[mo_j+jmin][r_abs];
704
for(mo_x=0;mo_x<MOInfo.num_mo;mo_x++) {
705
jsi_row[mo_x] += temp * rsi_row[mo_x];
708
pthread_mutex_unlock(&rmp2r12_sindex_mutex[s_abs]);
712
pthread_mutex_lock(&rmp2r12_sindex_mutex[r_abs]);
714
jsi_row = jsix_buf2[te_type] + ((mo_j * BasisSet.num_ao + r_abs) * ibatch_length + mo_i) * MOInfo.num_mo;
715
temp = MOInfo.scf_evec_occ_beta[mo_j+jmin][s_abs];
716
if (te_type == 2) /*--- [r12,T1] integral - negative sign ---*/
718
for(mo_x=0;mo_x<MOInfo.num_mo;mo_x++) {
719
jsi_row[mo_x] += temp * rsi_row[mo_x];
722
pthread_mutex_unlock(&rmp2r12_sindex_mutex[r_abs]);
727
/* timer_off("Step 3");*/
731
pthread_mutex_unlock(&rmp2r12_sindex_mutex[INDEX(si,sj)]);
735
memset(rsiq_buf[te_type],0,nr*ns*ibatch_length*BasisSet.num_ao*sizeof(double));
736
memset(rsix_buf[te_type],0,nr*ns*ibatch_length*MOInfo.num_mo*sizeof(double));
737
} /* end loop over te_type */
738
} /* end of R,S loop */
739
} /* end of "unique" R,S loop */
742
pthread_mutex_lock(&rmp2r12_energy_mutex);
743
RMP2R12_Status.num_arrived++;
744
if (RMP2R12_Status.num_arrived != UserOptions.num_threads) { /*--- there are some threads still working - wait here --*/
745
pthread_cond_wait(&rmp2r12_energy_cond,&rmp2r12_energy_mutex);
746
pthread_mutex_unlock(&rmp2r12_energy_mutex);
748
else { /*--- this is the last thread to get here - do the 4th step and dumping integrals alone and wake everybody up ---*/
749
/*--- step 4 of the transformation ---*/
750
/* timer_on("Step 4");*/
751
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++) {
752
/* make the alpha - alpha integrals */
753
for(mo_i=0;mo_i<ibatch_length;mo_i++) {
754
for(mo_j=0;mo_j<(te_type==2 ? MOInfo.alpha_act_occ : mo_i+1);mo_j++) {
755
for(mo_y=0;mo_y<MOInfo.num_mo;mo_y++) {
756
jyi_row = jyix_buf[te_type] + ((mo_j * MOInfo.num_mo + mo_y) * ibatch_length + mo_i) * MOInfo.num_mo;
757
for(s_abs=0;s_abs<BasisSet.num_ao;s_abs++) {
758
jsi_row = jsix_buf[te_type] + ((mo_j * BasisSet.num_ao + s_abs) * ibatch_length + mo_i) * MOInfo.num_mo;
759
temp = MOInfo.scf_evec_alpha[mo_y][s_abs];
760
for(mo_x=0;mo_x<MOInfo.num_mo;mo_x++) {
761
jyi_row[mo_x] += temp * jsi_row[mo_x];
767
/* make the alpha - beta integrals */
768
for(mo_i=0;mo_i<ibatch_length;mo_i++) {
769
for(mo_j=0;mo_j<MOInfo.beta_act_occ;mo_j++) {
770
for(mo_y=0;mo_y<MOInfo.num_mo;mo_y++) {
771
jyi_row = jyix_buf2[te_type] + ((mo_j * MOInfo.num_mo + mo_y) * ibatch_length + mo_i) * MOInfo.num_mo;
772
for(s_abs=0;s_abs<BasisSet.num_ao;s_abs++) {
773
jsi_row = jsix_buf2[te_type] + ((mo_j * BasisSet.num_ao + s_abs) * ibatch_length + mo_i) * MOInfo.num_mo;
774
temp = MOInfo.scf_evec_beta[mo_y][s_abs];
775
for(mo_x=0;mo_x<MOInfo.num_mo;mo_x++) {
776
jyi_row[mo_x] += temp * jsi_row[mo_x];
783
/* timer_off("Step 4");*/
786
/*--- Print them out for now ---*/
787
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++) {
788
fprintf(outfile," Alpha Alpha Transformed integrals of type %d\n",te_type);
789
for(mo_i=0;mo_i<ibatch_length;mo_i++) {
790
for(mo_j=0;mo_j<(te_type==2 ? MOInfo.alpha_act_occ : mo_i+1);mo_j++) {
791
for(mo_y=0;mo_y<MOInfo.num_mo;mo_y++) {
792
for(mo_x=0;mo_x<MOInfo.num_mo;mo_x++) {
793
temp = jyix_buf[te_type][((mo_j * MOInfo.num_mo + mo_y) * ibatch_length + mo_i) * MOInfo.num_mo + mo_x];
794
if (fabs(temp) > ZERO) {
796
fprintf(outfile,"<%d %d %d %d [%d] [%d] = %20.10lf\n",
797
mo_j+jmin,mo_y,mo_i+imin,mo_x,
798
mo_j*MOInfo.num_mo+mo_y,
799
mo_i*MOInfo.num_mo+mo_x,
802
fprintf(outfile,"<%d %d %d %d [%d] [%d] = %20.10lf\n",
803
mo_i+imin,mo_x,mo_j+jmin,mo_y,
804
mo_i*MOInfo.num_mo+mo_x,
805
mo_j*MOInfo.num_mo+mo_y,
812
fprintf(outfile," Alpha Beta Transformed integrals of type %d\n",te_type);
813
for(mo_i=0;mo_i<ibatch_length;mo_i++) {
814
for(mo_j=0;mo_j<MOInfo.beta_act_occ;mo_j++) {
815
for(mo_y=0;mo_y<MOInfo.num_mo;mo_y++) {
816
for(mo_x=0;mo_x<MOInfo.num_mo;mo_x++) {
817
temp = jyix_buf2[te_type][((mo_j * MOInfo.num_mo + mo_y) * ibatch_length + mo_i) * MOInfo.num_mo + mo_x];
818
if (fabs(temp) > ZERO) {
820
fprintf(outfile,"<%d %d %d %d [%d] [%d] = %20.10lf\n",
821
mo_j+jmin,mo_y,mo_i+imin,mo_x,
822
mo_j*MOInfo.num_mo+mo_y,
823
mo_i*MOInfo.num_mo+mo_x,
826
fprintf(outfile,"<%d %d %d %d [%d] [%d] = %20.10lf\n",
827
mo_i+imin,mo_x,mo_j+jmin,mo_y,
828
mo_i*MOInfo.num_mo+mo_x,
829
mo_j*MOInfo.num_mo+mo_y,
839
/*--- Files are opened and closed each pass to ensure integrity of TOCs
840
if restart ever needed, these are the alpha-beta integrals ---*/
841
iounits[0] = PSIF_MO_AA_TEI;
842
iounits[1] = PSIF_MO_AA_R12;
843
iounits[2] = PSIF_MO_AA_R12T1;
844
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++)
845
psio_open(iounits[te_type], (ibatch != 0) ? PSIO_OPEN_OLD : PSIO_OPEN_NEW);
847
/*--------------------------------------------------------
848
Dump all fully transformed integrals to disk. Zero out
849
non-symmetrical integrals (what's left of the Pitzer's
850
equal contribution theorem in Abelian case).
851
--------------------------------------------------------*/
852
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++) {
853
/*--------------------------------------------------------------------
854
Write integrals out in num_mo by num_mo batches corresponding to
856
--------------------------------------------------------------------*/
857
for(mo_i=0;mo_i<ibatch_length;mo_i++) {
858
i = act2fullQTS_alpha[mo_i+imin]; /*--- mo_i+imin is the index in QTS "active" indexing scheme
859
we need i to be in QTS "full" indexing scheme, that's
860
what the MP2R12 code expects ---*/
861
isym = MOInfo.mo2symblk_occ_alpha[mo_i+imin];
862
for(mo_j=0;mo_j<(te_type==2 ? MOInfo.alpha_act_occ : mo_i+1);mo_j++) {
863
jsym = MOInfo.mo2symblk_occ_alpha[mo_j+jmin];
864
j = act2fullQTS_alpha[mo_j+jmin]; /*--- Again, get the "full" QTS index ---*/
865
sprintf(ij_key_string,"Block_%d_x_%d_y",i,j);
866
memset(xy_buf,0,MOInfo.num_mo*MOInfo.num_mo*sizeof(double));
867
/*--- Put all integrals with common i and j into a buffer ---*/
868
for(mo_x=0,xy=0;mo_x<MOInfo.num_mo;mo_x++) {
869
x = mo_x; /*--- The second index is a Pitzer index, that's fine ---*/
870
xsym = MOInfo.mo2symblk[x];
871
for(mo_y=0;mo_y<MOInfo.num_mo;mo_y++,xy++) {
872
y = mo_y; /*--- Again, the Pitzer index here is what we need ---*/
873
ysym = MOInfo.mo2symblk[y];
874
/*--- Skip this indegral if it's non-totally symmetric -
875
Pitzer's contribution theorem in Abelian case ---*/
876
if ((isym ^ jsym) ^ (xsym ^ ysym))
878
/*--- find the integral in ixjy_buf and put it in xy_buf ---*/
879
m = (((mo_j*MOInfo.num_mo + mo_y)*ibatch_length + mo_i)*MOInfo.num_mo + mo_x);
880
xy_buf[xy] = jyix_buf[te_type][m];
883
psio_write_entry(iounits[te_type], ij_key_string, (char *)xy_buf,
884
MOInfo.num_mo*MOInfo.num_mo*sizeof(double));
887
psio_close(iounits[te_type], 1);
890
/*--- Exactly like the above, but now we're writing out the alpha - beta integrals ---*/
891
iounits2[0] = PSIF_MO_AB_TEI;
892
iounits2[1] = PSIF_MO_AB_R12;
893
iounits2[2] = PSIF_MO_AB_R12T1;
894
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++)
895
psio_open(iounits2[te_type], (ibatch != 0) ? PSIO_OPEN_OLD : PSIO_OPEN_NEW);
897
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++) {
898
/*--------------------------------------------------------------------
899
Write integrals out in num_mo by num_mo batches corresponding to
901
--------------------------------------------------------------------*/
902
for(mo_i=0;mo_i<ibatch_length;mo_i++) {
903
i = act2fullQTS_alpha[mo_i+imin]; /*--- mo_i+imin is the index in QTS "active" indexing scheme
904
we need i to be in QTS "full" indexing scheme, that's
905
what the MP2R12 code expects ---*/
906
isym = MOInfo.mo2symblk_occ_alpha[mo_i+imin];
907
for(mo_j=0;mo_j<MOInfo.beta_act_occ;mo_j++) {
908
jsym = MOInfo.mo2symblk_occ_beta[mo_j+jmin];
909
j = act2fullQTS_beta[mo_j+jmin]; /*--- Again, get the "full" QTS index ---*/
910
sprintf(ij_key_string,"Block_%d_x_%d_y",i,j);
911
memset(xy_buf,0,MOInfo.num_mo*MOInfo.num_mo*sizeof(double));
912
/*--- Put all integrals with common i and j into a buffer ---*/
913
for(mo_x=0,xy=0;mo_x<MOInfo.num_mo;mo_x++) {
914
x = mo_x; /*--- The second index is a Pitzer index, that's fine ---*/
915
xsym = MOInfo.mo2symblk[x];
916
for(mo_y=0;mo_y<MOInfo.num_mo;mo_y++,xy++) {
917
y = mo_y; /*--- Again, the Pitzer index here is what we need ---*/
918
ysym = MOInfo.mo2symblk[y];
919
/*--- Skip this indegral if it's non-totally symmetric -
920
Pitzer's contribution theorem in Abelian case ---*/
921
if ((isym ^ jsym) ^ (xsym ^ ysym))
923
/*--- find the integral in ixjy_buf and put it in xy_buf ---*/
924
m = (((mo_j*MOInfo.num_mo + mo_y)*ibatch_length + mo_i)*MOInfo.num_mo + mo_x);
925
xy_buf[xy] = jyix_buf2[te_type][m];
928
psio_write_entry(iounits2[te_type], ij_key_string, (char *)xy_buf,
929
MOInfo.num_mo*MOInfo.num_mo*sizeof(double));
932
psio_close(iounits2[te_type], 1);
936
if (ibatch < num_ibatch-1)
937
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++) {
938
memset(jsix_buf[te_type],0,MOInfo.alpha_act_occ*BasisSet.num_ao*ibatch_length*MOInfo.num_mo*sizeof(double));
939
memset(jyix_buf[te_type],0,MOInfo.alpha_act_occ*MOInfo.num_mo*ibatch_length*MOInfo.num_mo*sizeof(double));
940
memset(jsix_buf2[te_type],0,MOInfo.beta_act_occ*BasisSet.num_ao*ibatch_length*MOInfo.num_mo*sizeof(double));
941
memset(jyix_buf2[te_type],0,MOInfo.beta_act_occ*MOInfo.num_mo*ibatch_length*MOInfo.num_mo*sizeof(double));
943
/*--- Done with the non-threaded part - wake everybody up and prepare for the next ibatch ---*/
944
RMP2R12_Status.num_arrived = 0;
945
pthread_cond_broadcast(&rmp2r12_energy_cond);
946
pthread_mutex_unlock(&rmp2r12_energy_mutex);
948
} /* End of "I"-loop */
953
for(te_type=0;te_type<NUM_TE_TYPES-1;te_type++) {
954
free(rsiq_buf[te_type]);
955
free(rsix_buf[te_type]);
960
free_libr12(&Libr12);
961
free_fjt_table(&fjt_table);
962
#ifdef NONDOUBLE_INTS
963
for(i=0;i<NUM_TE_TYPES;i++)
973
}}} /* End namespaces */