4
c $Id: perf.F,v 1.8 2005-01-12 02:07:10 manoj Exp $
5
c------------------------------------------------------------------------
6
c Program perf.x is used to test performance of GA put, get, accumulate |
7
c It has to be executed on four processors. |
8
c remote operations access data on processes 1,2,3 in the round-robin way|
9
c------------------------------------------------------------------------
12
#define SLEEP my_sleep
18
#include "mafdecls.fh"
22
c*** Intitialize a message passing library
26
c*** Intitialize the GA package
28
if(ga_nnodes().ne.4 .and. ga_nodeid().eq.0)
29
$ call ga_error('Program requires 4 GA processes',ga_nnodes())
31
c*** enables MA to use ARMCI memory
32
call set_ma_use_armci_mem()
34
c*** Initialize the MA package
36
if (.not. ma_init(MT_DBL, heap,heap))
37
$ call ga_error('ma init failed',2*heap)
42
if(ga_nodeid().eq.0) print *,'All tests successful '
52
#include "mafdecls.fh"
56
integer n, nn, num_chunks
57
parameter (n = 1024*1024, nn = n/4, num_chunks=16)
58
double precision buf(nn)
61
integer ilo, ihi, jlo, jhi
62
integer nproc, me, h_d
63
MA_ACCESS_INDEX_TYPE loop, indexd
64
integer chunk(num_chunks)
65
data chunk /1,9,16,81,256,576,900,2304,4096,8281,
66
$ 16384,29241,65536,124609,193600,262144/
71
c*** Create global array
72
if (.not. ga_create(MT_DBL, n, 1, 'a', 0, 0, g_a))
73
$ call ga_error(' ga_create failed ',1)
75
c*** allocate memory for D
76
if(.not. ma_push_get(MT_DBL,nn,'d',h_d, indexd))
77
& call ga_error('memory allocation failed',0)
84
DBL_MB(indexd+loop) = .01d0
93
55 format(' Performance of GA get, put & acc',
94
$ ' for 1-dimensional sections of array[',i7,']')
102
call ga_distribution(g_a, me, ilo, ihi, jlo, jhi)
104
& (g_a, n, chunk, num_chunks, DBL_MB(indexd), ilo, ihi,
110
& (g_a, n, chunk, num_chunks, DBL_MB(indexd), ilo, ihi,
114
if(.not. ma_pop_stack(h_d)) call ga_error('invalid MA handle ?',0)
118
subroutine TestPutGetAcc1
119
& (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo,jhi, local)
122
#include "testutil.fh"
124
integer num_chunks, chunk(num_chunks)
125
integer n, ilo, ihi, jlo,jhi,g_a
126
double precision buf(*), tg, tp, ta
127
double precision time_acc1, time_get1, time_put1
131
integer loop, jump, count, bytes
137
write(6,'(26X, 25HLocal 1-D Array Section )')
139
write(6,'(26X, 25HRemote 1-D Array Section )')
142
write(6,*)' section get put',
144
write(6,*)' bytes dim usec MB/s usec MB/s',
150
do loop = 1, num_chunks
151
bytes = util_mdtob(1)*chunk(loop) ! how much data is accessed
152
jump = n/(6000*loop)!jump distance between consecutive patches
153
if(loop.eq.num_chunks)jump=0
155
c everybody touches own data
156
call ga_fill_patch(g_a, 1, n, 1, 1 , 1d0*me*loop)
158
tg=time_get1(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count,
164
c everybody touches own data
165
call ga_fill_patch(g_a, 1, n, 1, 1 , 1d0*me*loop)
167
tp=time_put1(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count,
173
c everybody touches own data
174
call ga_fill_patch(g_a, 1, n, 1, 1 , 1d0*me*loop)
176
ta=time_acc1(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count,
183
write(6,77)bytes, chunk(loop),
184
& tg/1d-6, 1d-6*bytes/tg,
185
& tp/1d-6, 1d-6*bytes/tp,
186
& ta/1d-6, 1d-6*bytes/ta
191
77 format(i7, i7,1x, 3(1x,e8.3,1x,e8.3))
196
double precision function
197
& time_acc1(g_a, is, ie, js, je, buf, chunk, jump, count, local)
201
#include "testutil.fh"
203
integer g_a, chunk, jump, count, is, js, ie, je
205
integer rows, indx, shifti(3)
207
integer ilo, ihi, jlo, jhi
208
double precision seconds, buf
218
seconds = util_timer()
220
c distance between consecutive patches increased by jump
221
c to destroy locality of reference
222
do ilo = is, ie -chunk-jump +1, chunk+jump
226
call ga_acc(g_a, ilo, ihi, jlo, jhi, buf, chunk, 1d0)
228
indx = Mod(count,3) + 1
229
call ga_acc(g_a, ilo+shifti(indx), ihi+shifti(indx),
230
$ jlo, jhi, buf, chunk, 1d0)
233
seconds = util_timer() - seconds
235
time_acc1 = seconds/count
239
double precision function
240
& time_get1(g_a, is, ie, js, je, buf, chunk, jump, count, local)
244
#include "testutil.fh"
246
integer g_a, chunk, jump, count, is, js, ie, je
247
integer rows, indx, shifti(3)
250
integer ilo, ihi, jlo, jhi
251
double precision seconds, buf
261
seconds = util_timer()
263
c distance between consecutive patches increased by jump
264
c to destroy locality of reference
265
do ilo = is, ie -chunk-jump +1, chunk+jump
269
call ga_get(g_a, ilo, ihi, jlo, jhi, buf, chunk)
271
indx = Mod(count,3) + 1
272
call ga_get(g_a, ilo+shifti(indx), ihi+shifti(indx),
273
$ jlo, jhi, buf, chunk)
276
seconds = util_timer() - seconds
278
time_get1 = seconds/count
283
double precision function
284
& time_put1(g_a, is, ie, js, je, buf, chunk, jump, count, local)
288
#include "testutil.fh"
290
integer g_a, chunk, jump, count, is, js, ie, je
291
integer rows, indx, shifti(3), shiftj(3)
294
integer ilo, ihi, jlo, jhi
295
double precision seconds, buf
305
seconds = util_timer()
307
c distance between consecutive patches increased by jump
308
c to destroy locality of reference
309
do ilo = is, ie -chunk-jump +1, chunk+jump
313
call ga_put(g_a, ilo, ihi, jlo, jhi, buf, chunk)
315
indx = Mod(count,3) + 1
316
call ga_put(g_a, ilo+shifti(indx), ihi+shifti(indx),
317
$ jlo, jhi, buf, chunk)
320
seconds = util_timer() - seconds
322
time_put1 = seconds/count
328
c test for square patches
332
#include "mafdecls.fh"
335
integer n, nn, num_chunks
336
parameter (n = 1024, nn = n*n/4, num_chunks=16)
337
double precision buf(nn)
340
integer ilo, ihi, jlo, jhi
341
integer nproc, me, h_d
342
MA_ACCESS_INDEX_TYPE indexd, loop
343
integer chunk(num_chunks)
344
data chunk /1,3,4,9,16,24,30,48,64,91,128,171,256,353,440,512/
349
c*** Create global array
350
if (.not. ga_create(MT_DBL, n, n, 'a', 0, 0, g_a))
351
$ call ga_error(' ga_create failed ',1)
353
c*** allocate memory for D
354
if(.not. ma_push_get(MT_DBL,nn,'d',h_d, indexd))
355
& call ga_error('memory allocation failed',0)
362
DBL_MB(indexd+loop) = .01d0
369
55 format(' Performance of GA get, put & acc',
370
$ ' for square sections of array[',i4,',',i4,']')
378
call ga_distribution(g_a, me, ilo, ihi, jlo, jhi)
380
& (g_a, n, chunk, num_chunks, DBL_MB(indexd), ilo, ihi,
386
& (g_a, n, chunk, num_chunks, DBL_MB(indexd), ilo, ihi,
390
if(.not. ma_pop_stack(h_d)) call ga_error('invalid MA handle ?',0)
394
subroutine TestPutGetAcc
395
& (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo,jhi, local)
398
#include "testutil.fh"
400
integer num_chunks, chunk(num_chunks)
401
integer n, ilo, ihi, jlo,jhi,g_a
402
double precision buf(*), tg, tp, ta
403
double precision time_acc, time_get, time_put
407
integer loop, jump, count, bytes
413
write(6,'(26X, 25HLocal 2-D Array Section )')
415
write(6,'(26X, 25HRemote 2-D Array Section )')
418
write(6,*)' section get put',
420
write(6,*)' bytes dim usec MB/s usec MB/s',
426
do loop = 1, num_chunks
427
bytes = util_mdtob(1)*chunk(loop)*chunk(loop) !how much data
428
jump = n/(60*loop) ! jump distance between consecutive patches
429
if(loop.eq.num_chunks)jump=0
431
c everybody touches own data
432
call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop)
434
tg=time_get(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count,
440
c everybody touches own data
441
call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop)
443
tp=time_put(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count,
449
c everybody touches own data
450
call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop)
452
ta=time_acc(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count,
459
write(6,77)bytes, chunk(loop),
460
& tg/1d-6, 1d-6*bytes/tg,
461
& tp/1d-6, 1d-6*bytes/tp,
462
& ta/1d-6, 1d-6*bytes/ta
467
77 format(i7, i7,1x, 3(1x,e8.3,1x,e8.3))
472
double precision function
473
& time_acc(g_a, is, ie, js, je, buf, chunk, jump, count, local)
477
#include "testutil.fh"
479
integer g_a, chunk, jump, count, is, js, ie, je
481
integer rows, cols, indx, shifti(3), shiftj(3)
483
integer ilo, ihi, jlo, jhi
484
double precision seconds, buf
496
seconds = util_timer()
498
c distance between consecutive patches increased by jump
499
c to destroy locality of reference
500
do ilo = is, ie -chunk-jump +1, chunk+jump
502
do jlo = js, je -chunk-jump +1, chunk+jump
506
call ga_acc(g_a, ilo, ihi, jlo, jhi, buf, chunk, 1d0)
508
indx = Mod(count,3) + 1
509
call ga_acc(g_a, ilo+shifti(indx), ihi+shifti(indx),
510
$ jlo+shiftj(indx), jhi+shiftj(indx),
515
seconds = util_timer() - seconds
517
time_acc = seconds/count
521
double precision function
522
& time_get(g_a, is, ie, js, je, buf, chunk, jump, count, local)
526
#include "testutil.fh"
528
integer g_a, chunk, jump, count, is, js, ie, je
529
integer rows, cols, indx, shifti(3), shiftj(3)
532
integer ilo, ihi, jlo, jhi
533
double precision seconds, buf
545
seconds = util_timer()
547
c distance between consecutive patches increased by jump
548
c to destroy locality of reference
549
do ilo = is, ie -chunk-jump +1, chunk+jump
551
do jlo = js, je -chunk-jump +1, chunk+jump
555
call ga_get(g_a, ilo, ihi, jlo, jhi, buf, chunk)
557
indx = Mod(count,3) + 1
558
call ga_get(g_a, ilo+shifti(indx), ihi+shifti(indx),
559
$ jlo+shiftj(indx), jhi+shiftj(indx),
564
seconds = util_timer() - seconds
566
time_get = seconds/count
571
double precision function
572
& time_put(g_a, is, ie, js, je, buf, chunk, jump, count, local)
576
#include "testutil.fh"
578
integer g_a, chunk, jump, count, is, js, ie, je
579
integer rows, cols, indx, shifti(3), shiftj(3)
582
integer ilo, ihi, jlo, jhi
583
double precision seconds, buf
595
seconds = util_timer()
597
c distance between consecutive patches increased by jump
598
c to destroy locality of reference
599
do ilo = is, ie -chunk-jump +1, chunk+jump
601
do jlo = js, je -chunk-jump +1, chunk+jump
605
call ga_put(g_a, ilo, ihi, jlo, jhi, buf, chunk)
607
indx = Mod(count,3) + 1
608
call ga_put(g_a, ilo+shifti(indx), ihi+shifti(indx),
609
$ jlo+shiftj(indx), jhi+shiftj(indx),
614
seconds = util_timer() - seconds
616
time_put = seconds/count
620
subroutine my_sleep(seconds)
623
#include "testutil.fh"
625
double precision ts, te, work
626
integer seconds, loop
627
common /sleep_block/ work
636
if(te - ts .lt. real(seconds)) goto 100
637
* print *, work, ts, te