1
c $Id: mir_perf1.F,v 1.1 2003-02-26 15:33:38 d3g293 Exp $
2
c------------------------------------------------------------------------
3
c Program perf.x is used to test performance of GA put, get, accumulate |
4
c It has to be executed on four processors. |
5
c remote operations access data on processes 1,2,3 in the round-robin way|
6
c------------------------------------------------------------------------
11
#include "mafdecls.fh"
15
c*** Intitialize a message passing library
21
integer required, provided
22
required=MPI_THREAD_MULTIPLE
23
call mpi_init_thread(required, provided, ierr)
24
if (provided.ne.MPI_THREAD_MULTIPLE) then
25
call ga_error('provided.ne.MPI_THREAD_MULTIPLE',provided)
34
c*** Intitialize the GA package
36
if(ga_nnodes().ne.4 .and. ga_nodeid().eq.0)
37
$ call ga_error('Program requires 4 GA processes',ga_nnodes())
39
c*** Initialize the MA package
41
if (.not. ma_init(MT_DBL, heap,heap))
42
$ call ga_error('ma init failed',2*heap)
49
call mpi_finalize(ierr)
58
#include "mafdecls.fh"
62
integer n, nn, num_chunks
63
parameter (n = 1024*1024, nn = n/4, num_chunks=16)
64
double precision buf(nn)
67
integer ilo, ihi, jlo, jhi
68
integer nproc, me, loop, p_handle
69
integer ndim, dims(2), chunks(2)
70
integer chunk(num_chunks)
71
data chunk /1,9,16,81,256,576,900,2304,4096,8281,
72
$ 16384,29241,65536,124609,193600,262144/
76
p_handle = ga_mirror_config()
83
c*** Create global array
84
if (.not. nga_create_config(MT_DBL, ndim, dims, 'a', chunks,
86
$ call ga_error(' ga_create failed ',1)
98
55 format(' Performance of GA get, put & acc',
99
$ ' for 1-dimensional sections of array[',i7,']')
107
call ga_distribution(g_a, me, ilo, ihi, jlo, jhi)
109
& (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo, jhi, .true.)
114
& (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo, jhi,.false.)
120
subroutine TestPutGetAcc1
121
& (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo,jhi, local)
124
#include "testutil.fh"
126
integer num_chunks, chunk(num_chunks)
127
integer n, ilo, ihi, jlo,jhi,g_a
128
integer inode, iproc, zero
129
double precision buf(*), tg, tp, ta
130
double precision time_acc1, time_get1, time_put1
134
integer loop, jump, count, bytes
138
inode = ga_cluster_nodeid()
139
iproc = me - ga_cluster_procid(inode,zero)
143
write(6,'(26X, 25HLocal 1-D Array Section )')
145
write(6,'(26X, 25HRemote 1-D Array Section )')
148
write(6,*)' section get put',
150
write(6,*)' bytes dim sec MB/s sec MB/s',
156
do loop = 1, num_chunks
157
bytes = util_mdtob(1)*chunk(loop) ! how much data is accessed
158
jump = n/(6000*loop) ! jump distance between consecutive patches
159
if(loop.eq.num_chunks)jump=0
161
c everybody touches own data
162
c this is a kind of klugy way of filling up local data, but it works
164
call ga_fill_patch(g_a, 1, n, 1, 1 , 1d0*me*loop)
166
tg=time_get1(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count,
172
c everybody touches own data
173
call ga_fill_patch(g_a, 1, n, 1, 1 , 1d0*me*loop)
175
tp=time_put1(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count,
181
c everybody touches own data
182
call ga_fill_patch(g_a, 1, n, 1, 1 , 1d0*me*loop)
184
ta=time_acc1(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count,
191
write(6,77)bytes, chunk(loop), tg,
192
& 1d-6*bytes/tg,tp, 1d-6*bytes/tp, ta, 1d-6*bytes/ta
197
77 format(i7, i7,1x, 3(1x,d8.3,1x,d8.3))
202
double precision function
203
& time_acc1(g_a, is, ie, js, je, buf, chunk, jump, count, local)
207
#include "testutil.fh"
209
integer g_a, chunk, jump, count, is, js, ie, je
211
integer rows, indx, shifti
213
integer ilo, ihi, jlo, jhi
214
double precision seconds, buf
222
seconds = util_timer()
224
c distance between consecutive patches increased by jump
225
c to destroy locality of reference
226
do ilo = is, ie -chunk-jump +1, chunk+jump
230
call ga_acc(g_a, ilo, ihi, jlo, jhi, buf, chunk, 1d0)
232
call ga_acc(g_a, ilo+shifti, ihi+shifti,
233
$ jlo, jhi, buf, chunk, 1d0)
236
seconds = util_timer() - seconds
238
time_acc1 = seconds/count
242
double precision function
243
& time_get1(g_a, is, ie, js, je, buf, chunk, jump, count, local)
247
#include "testutil.fh"
249
integer g_a, chunk, jump, count, is, js, ie, je
250
integer rows, indx, shifti
253
integer ilo, ihi, jlo, jhi
254
double precision seconds, buf
262
seconds = util_timer()
264
c distance between consecutive patches increased by jump
265
c to destroy locality of reference
266
do ilo = is, ie -chunk-jump +1, chunk+jump
270
call ga_get(g_a, ilo, ihi, jlo, jhi, buf, chunk)
272
call ga_get(g_a, ilo+shifti, ihi+shifti,
273
$ jlo, jhi, buf, chunk)
276
seconds = util_timer() - seconds
278
time_get1 = seconds/count
283
double precision function
284
& time_put1(g_a, is, ie, js, je, buf, chunk, jump, count, local)
288
#include "testutil.fh"
290
integer g_a, chunk, jump, count, is, js, ie, je
291
integer rows, indx, shifti
294
integer ilo, ihi, jlo, jhi
295
double precision seconds, buf
303
seconds = util_timer()
305
c distance between consecutive patches increased by jump
306
c to destroy locality of reference
307
do ilo = is, ie -chunk-jump +1, chunk+jump
311
call ga_put(g_a, ilo, ihi, jlo, jhi, buf, chunk)
313
call ga_put(g_a, ilo+shifti, ihi+shifti,
314
$ jlo, jhi, buf, chunk)
317
seconds = util_timer() - seconds
319
time_put1 = seconds/count
325
c test for square patches
329
#include "mafdecls.fh"
332
integer n, nn, num_chunks
333
parameter (n = 1024, nn = n*n/4, num_chunks=16)
334
double precision buf(nn)
337
integer ilo, ihi, jlo, jhi
338
integer nproc, me, loop
339
integer chunk(num_chunks)
340
data chunk /1,3,4,9,16,24,30,48,64,91,128,171,256,353,440,512/
345
c*** Create global array
346
if (.not. ga_create(MT_DBL, n, n, 'a', 0, 0, g_a))
347
$ call ga_error(' ga_create failed ',1)
357
55 format(' Performance of GA get, put & acc',
358
$ ' for square sections of array[',i4,',',i4,']')
366
call ga_distribution(g_a, me, ilo, ihi, jlo, jhi)
368
& (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo, jhi, .true.)
373
& (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo, jhi,.false.)
379
subroutine TestPutGetAcc
380
& (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo,jhi, local)
383
#include "testutil.fh"
385
integer num_chunks, chunk(num_chunks)
386
integer n, ilo, ihi, jlo,jhi,g_a
387
double precision buf(*), tg, tp, ta
388
double precision time_acc, time_get, time_put
392
integer loop, jump, count, bytes
398
write(6,'(26X, 25HLocal 2-D Array Section )')
400
write(6,'(26X, 25HRemote 2-D Array Section )')
403
write(6,*)' section get put',
405
write(6,*)' bytes dim sec MB/s sec MB/s',
411
do loop = 1, num_chunks
412
bytes = util_mdtob(1)*chunk(loop)*chunk(loop) !how much data is accessed
413
jump = n/(60*loop) ! jump distance between consecutive patches
414
if(loop.eq.num_chunks)jump=0
416
c everybody touches own data
417
call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop)
419
tg=time_get(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count,
425
c everybody touches own data
426
call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop)
428
tp=time_put(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count,
434
c everybody touches own data
435
call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop)
437
ta=time_acc(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count,
444
write(6,77)bytes, chunk(loop), tg,
445
& 1d-6*bytes/tg,tp, 1d-6*bytes/tp, ta, 1d-6*bytes/ta
450
77 format(i7, i7,1x, 3(1x,d8.3,1x,d8.3))
455
double precision function
456
& time_acc(g_a, is, ie, js, je, buf, chunk, jump, count, local)
460
#include "testutil.fh"
462
integer g_a, chunk, jump, count, is, js, ie, je
464
integer rows, cols, indx, shifti(3), shiftj(3)
466
integer ilo, ihi, jlo, jhi
467
double precision seconds, buf
479
seconds = util_timer()
481
c distance between consecutive patches increased by jump
482
c to destroy locality of reference
483
do ilo = is, ie -chunk-jump +1, chunk+jump
485
do jlo = js, je -chunk-jump +1, chunk+jump
489
call ga_acc(g_a, ilo, ihi, jlo, jhi, buf, chunk, 1d0)
491
indx = Mod(count,3) + 1
492
call ga_acc(g_a, ilo+shifti(indx), ihi+shifti(indx),
493
$ jlo+shiftj(indx), jhi+shiftj(indx),
498
seconds = util_timer() - seconds
500
time_acc = seconds/count
504
double precision function
505
& time_get(g_a, is, ie, js, je, buf, chunk, jump, count, local)
509
#include "testutil.fh"
511
integer g_a, chunk, jump, count, is, js, ie, je
512
integer rows, cols, indx, shifti(3), shiftj(3)
515
integer ilo, ihi, jlo, jhi
516
double precision seconds, buf
528
seconds = util_timer()
530
c distance between consecutive patches increased by jump
531
c to destroy locality of reference
532
do ilo = is, ie -chunk-jump +1, chunk+jump
534
do jlo = js, je -chunk-jump +1, chunk+jump
538
call ga_get(g_a, ilo, ihi, jlo, jhi, buf, chunk)
540
indx = Mod(count,3) + 1
541
call ga_get(g_a, ilo+shifti(indx), ihi+shifti(indx),
542
$ jlo+shiftj(indx), jhi+shiftj(indx),
547
seconds = util_timer() - seconds
549
time_get = seconds/count
554
double precision function
555
& time_put(g_a, is, ie, js, je, buf, chunk, jump, count, local)
559
#include "testutil.fh"
561
integer g_a, chunk, jump, count, is, js, ie, je
562
integer rows, cols, indx, shifti(3), shiftj(3)
565
integer ilo, ihi, jlo, jhi
566
double precision seconds, buf
578
seconds = util_timer()
580
c distance between consecutive patches increased by jump
581
c to destroy locality of reference
582
do ilo = is, ie -chunk-jump +1, chunk+jump
584
do jlo = js, je -chunk-jump +1, chunk+jump
588
call ga_put(g_a, ilo, ihi, jlo, jhi, buf, chunk)
590
indx = Mod(count,3) + 1
591
call ga_put(g_a, ilo+shifti(indx), ihi+shifti(indx),
592
$ jlo+shiftj(indx), jhi+shiftj(indx),
597
seconds = util_timer() - seconds
599
time_put = seconds/count
603
subroutine my_sleep(seconds)
606
#include "testutil.fh"
608
double precision ts, te, work
609
integer seconds, loop
610
common /sleep_block/ work
619
if(te - ts .lt. real(seconds)) goto 100
620
* print *, work, ts, te