5
/* $Id: armci_profile.c,v 1.8 2005-11-30 10:20:53 vinod Exp $ */
8
* Profiler can profile the following ARMCI Calls:
9
* ARMCI_Get,ARMCI_Put,ARMCI_Acc,ARMCI_NbGet,ARMCI_NbPut,ARMCI_NbAcc,
10
* ARMCI_GetS,ARMCI_PutS,ARMCI_AccS,ARMCI_NbGetS,ARMCI_NbPutS,ARMCI_NbAccS,
11
* ARMCI_GetV,ARMCI_PutV,ARMCI_AccV,ARMCI_NbGetV,ARMCI_NbPutV,ARMCI_NbAccV,
12
* ARMCI_Wait, armci_wait_notify
13
* (NOTE: As armci_notify is same as ARMCI_Put, it is not profiled.)
16
* Note #1: Right now, only process 0's profile is printed.
17
* Each and every process saves its profile in the corresponding data structure.
18
* Each process prints its profile to an output file armci_profile.<myrank>
19
* when armci_profile_terminate() is called (called in ARMCI_Finalize()).
21
* Note #2: By default profiler prints msg ranges 0 to 21. Example: range 10
22
* corresponds to message ranges from 1024 bytes to 2047 bytes.
23
* Message ranges are in the power of 2. for ex:
24
* ------------------------------------
25
* MSG_RANGE (r) BYTES (2^r to 2^(r+1)-1)
26
* ------------------------------------
35
* -------------------------------------
36
* To increase the message range, set ARMCI_MAX_MSG_RANGE accordingly.
38
* Note #3: If Stride information needs to be printed, set ARMCI_PRINT_STRIDE.
39
* Stride information is printed in armci_profile_terminate() for various
40
* selected message ranges and event types. Modify it according to your needs.
42
* Note #4: There is no profiling support for non-blocking operations yet!!
59
#include "armci_profile.h"
63
# define MP_TIMER tcg_time
66
# define MP_TIMER MPI_Wtime
70
/* Compile-time switch: when non-zero, the code guarded by
 * "#if ARMCI_PRINT_STRIDE" (per-call stride/vector records and their
 * report routines) is compiled in. */
#define ARMCI_PRINT_STRIDE 1
71
/* Number of message-size buckets; second dimension of ARMCI_PROF. */
#define ARMCI_MAX_MSG_RANGE 22 /* 0 to 21 */
73
#if ARMCI_PRINT_STRIDE
75
/* Number of per-call records allocated for each (event, range) slot in
 * armci_profile_init(); calls beyond this many are counted but not recorded. */
# define STRIDE_COUNT 1000
76
/* Length of armci_stride.count[]; armci_profile_start_strided() aborts when
 * stride_levels >= ARMCI_MAX_DIM. */
# define ARMCI_MAX_DIM 7
78
typedef struct armci_stride {
81
int count[ARMCI_MAX_DIM];
90
typedef struct armci_vector {
99
/* Number of distinct event types; first dimension of ARMCI_PROF and the
 * declared length of gEventName. */
#define ARMCI_EVENTS 24
101
/* Display names for the events, indexed by event type.  Used by the trace,
 * error and report output in this file. */
char *gEventName[ARMCI_EVENTS]={
103
"STRIDED GET", "STRIDED PUT", "STRIDED ACC",
104
"VECTOR GET", "VECTOR PUT", "VECTOR ACC",
105
"NBGET", "NBPUT", "NBACC",
106
"STRIDED NBGET", "STRIDED NBPUT", "STRIDED NBACC",
107
"VECTOR NBGET", "VECTOR NBPUT", "VECTOR NBACC",
108
"BARRIER","ARMCI_WAIT","NOTIFY_WAIT",
109
"FENCE", "ALLFENCE", "RMW"
112
/* Accumulated statistics for one (event type, message range) slot. */
typedef struct armci_profile {
113
int count; /* number of times called */
114
double time; /* total execution time for "count" calls */
115
#if ARMCI_PRINT_STRIDE
116
/* Per-call record arrays.  armci_profile_init() allocates STRIDE_COUNT
 * entries for one of them and sets the other to NULL in the slots it visits. */
armci_stride_t *stride;
117
armci_vector_t *vector;
121
/* Profile of get/put/acc for the various message ranges (i.e.
 * ARMCI_MAX_MSG_RANGE); indexed as ARMCI_PROF[event_type][range]. */
122
static armci_profile_t ARMCI_PROF[ARMCI_EVENTS][ARMCI_MAX_MSG_RANGE];
132
/* Tests whether event type e is one of the six strided events (blocking or
 * non-blocking get/put/acc).  The return statements are not visible in this
 * listing; callers in this file use the result as a truth value. */
static int strided_event(int e) {
133
if (e==ARMCI_PROF_GETS || e==ARMCI_PROF_PUTS || e==ARMCI_PROF_ACCS ||
134
e==ARMCI_PROF_NBGETS || e==ARMCI_PROF_NBPUTS || e==ARMCI_PROF_NBACCS)
139
/* Initialises the profiler.  Process 0 announces that profiling is on, the
 * current-event marker is cleared, and every ARMCI_PROF slot gets a zero
 * count and time.  When ARMCI_PRINT_STRIDE is enabled, record arrays of
 * STRIDE_COUNT entries are also allocated; armci_die() is called if a malloc
 * fails. */
void armci_profile_init() {
141
if(armci_me==0) {printf("\nProfiling ARMCI - ON\n");fflush(stdout);}
143
gCURRENT_EVNT.is_set = 0;
145
for(i=0; i<ARMCI_EVENTS; i++)
146
for(j=0; j<ARMCI_MAX_MSG_RANGE; j++) {
147
ARMCI_PROF[i][j].count = 0; ARMCI_PROF[i][j].time = 0.0;
150
#if ARMCI_PRINT_STRIDE
151
for(i=0; i<ARMCI_EVENTS; i++) {
153
/* NOTE(review): any condition guarding this inner loop is not visible in
 * this listing -- confirm which events receive stride records. */
for(j=0; j<ARMCI_MAX_MSG_RANGE; j++) {
154
/* stride records for this slot; the vector pointer is left NULL */
ARMCI_PROF[i][j].stride = (armci_stride_t*)malloc(STRIDE_COUNT*sizeof(armci_stride_t));
155
ARMCI_PROF[i][j].vector = NULL;
156
if( ARMCI_PROF[i][j].stride == NULL)
157
armci_die("armci_profile_init(): malloc failed", armci_me);
159
/* the six vector events get vector records instead, with stride left NULL */
if(i==ARMCI_PROF_GETV || i==ARMCI_PROF_PUTV || i==ARMCI_PROF_ACCV ||
160
i==ARMCI_PROF_NBGETV || i==ARMCI_PROF_NBPUTV || i==ARMCI_PROF_NBACCV)
161
for(j=0; j<ARMCI_MAX_MSG_RANGE; j++) {
162
ARMCI_PROF[i][j].vector = (armci_vector_t*)malloc(STRIDE_COUNT*sizeof(armci_vector_t));
163
ARMCI_PROF[i][j].stride = NULL;
164
if( ARMCI_PROF[i][j].vector == NULL)
165
armci_die("armci_profile_init(): malloc failed", armci_me);
171
/* Status codes returned by armci_profile_close_event() (CLOSED/NOTCLOSED)
 * and armci_profile_set_event() (SET/NOTSET).  The two "done" codes share
 * the value 0 and the two "overlap" codes share the value -1. */
#define ARMCI_EVENT_CLOSED 0
172
#define ARMCI_EVENT_NOTCLOSED -1
173
#define ARMCI_EVENT_SET 0
174
#define ARMCI_EVENT_NOTSET -1
176
/* Marks the start of a timed event.
 *
 * If no event is currently active (is_set == 0), the event type, range and
 * start time are stored in gCURRENT_EVNT and ARMCI_EVENT_SET is returned.
 * Otherwise the call overlaps an active event: the nesting counter is_set
 * is incremented, the active event is left untouched and
 * ARMCI_EVENT_NOTSET is returned. */
static int armci_profile_set_event(int event_type, int range) {
179
/* Trace output.  NOTE(review): the two printf variants are presumably
 * selected by preprocessor conditionals that are not visible here. */
printf("\n%d(s):call profile set for %s isset is %d",armci_me,
180
gEventName[event_type],gCURRENT_EVNT.is_set);
182
printf("\n%d:call profile set for %s isset is %d",armci_me,
183
gEventName[event_type],gCURRENT_EVNT.is_set);
186
if(gCURRENT_EVNT.is_set == 0) { /* set an event */
187
gCURRENT_EVNT.is_set = 1;
188
gCURRENT_EVNT.event_type = event_type;
189
gCURRENT_EVNT.range = range;
190
gCURRENT_EVNT.start_time = MP_TIMER();
191
return ARMCI_EVENT_SET;
193
else gCURRENT_EVNT.is_set++; /* event overlap */
194
return ARMCI_EVENT_NOTSET;
197
/* Marks the end of a timed event.
 *
 * When exactly one event is active (is_set == 1) the event is closed: the
 * elapsed time since start_time is stored in *time and added to
 * ARMCI_PROF[current event][range].time, is_set is reset to 0 and
 * ARMCI_EVENT_CLOSED is returned.  If event_type differs from the event
 * that was started, an error is reported and armci_die() is called.
 *
 * Otherwise the call unwinds one level of overlap: is_set is decremented
 * and ARMCI_EVENT_NOTCLOSED is returned without writing *time.  If the
 * counter drops to zero or below, a stop was issued without a matching
 * start; this is reported and armci_die() is called.
 *
 * `name` is used in the error messages to identify the caller. */
static int armci_profile_close_event(int event_type, int range, double *time,
200
int curr_event = gCURRENT_EVNT.event_type;
203
/* Trace output.  NOTE(review): the two printf variants are presumably
 * selected by preprocessor conditionals that are not visible here. */
printf("\n%d(s):call profile close for %s isset is %d",armci_me,
204
gEventName[event_type],gCURRENT_EVNT.is_set);
206
printf("\n%d:call profile close for %s isset is %d",armci_me,
207
gEventName[event_type],gCURRENT_EVNT.is_set);
212
if(gCURRENT_EVNT.is_set==1) { /* Yep, there is an event set. So close it.*/
213
/*Check if "profile stop" is called for corresponding "profile start"*/
214
if(event_type != curr_event) {
216
"%d: %s: ERROR:Profile started for %s, but stopped for %s\n",
217
armci_me,name,gEventName[curr_event],gEventName[event_type]);
219
armci_die("Profile_stop is called a different event", armci_me);
222
*time = MP_TIMER() - gCURRENT_EVNT.start_time;
223
/* time is charged to the event that was started, in the caller's range */
ARMCI_PROF[curr_event][range].time += *time;
224
gCURRENT_EVNT.is_set = 0; /* close the event */
225
return ARMCI_EVENT_CLOSED;
227
else { /* event overlapping */
228
gCURRENT_EVNT.is_set--;
229
if(gCURRENT_EVNT.is_set<=0) {
230
char *msg="Profile_stop is called before profile_start";
231
printf("%d: %s: ERROR: %s. Event Name = %s\n", armci_me,
232
name, msg, gEventName[curr_event]);
234
armci_die(" profile_stop is called before profile_start", armci_me);
237
return ARMCI_EVENT_NOTCLOSED;
240
/* Starts profiling a strided transfer.
 *
 * The message size is the product of count[0..stride_levels]; its bucket is
 * floor(log2(bytes)), forced to 0 when bytes <= 0 and clamped to
 * ARMCI_MAX_MSG_RANGE-1.  If this call starts a new event (no overlap), the
 * slot's call count is incremented and, with ARMCI_PRINT_STRIDE enabled and
 * for strided event types, the first STRIDE_COUNT calls also have their
 * stride_levels, proc and count[] recorded.
 *
 * Calls armci_die() when stride_levels >= ARMCI_MAX_DIM. */
void armci_profile_start_strided(int count[], int stride_levels, int proc,
242
int i, status, bytes=1, range;
244
if(stride_levels >= ARMCI_MAX_DIM)
245
armci_die("ARMCI_PROFILE: stride_levels >= ARMCI_MAX_DIM. Increase ARMCI_MAX_DIM.", armci_me);
247
/* find the message range */
248
/* NOTE(review): bytes is an int product and could overflow for very large
 * transfers -- confirm the expected size limits. */
for(i=0; i<= stride_levels; i++) bytes *= count[i];
249
if(bytes<=0) range=0;
250
/* NOTE(review): the bucket comes from a floating-point log ratio truncated
 * to int; confirm rounding at exact powers of two is acceptable. */
else range = (int) (log((double)bytes)/log(2.0));
251
if(range>=ARMCI_MAX_MSG_RANGE-1) range = ARMCI_MAX_MSG_RANGE-1;
253
/* set the current event for timer */
254
status = armci_profile_set_event(event_type, range);
256
if(status == ARMCI_EVENT_SET) { /* new event set */
257
/* profile update: i.e. update event count */
258
ARMCI_PROF[event_type][range].count++;
260
# if ARMCI_PRINT_STRIDE
261
if(strided_event(event_type)) {
262
/* record index is the zero-based call number within this slot */
int idx = ARMCI_PROF[event_type][range].count-1;
263
if(idx<STRIDE_COUNT) {
264
ARMCI_PROF[event_type][range].stride[idx].stride_levels = stride_levels;
265
ARMCI_PROF[event_type][range].stride[idx].proc = proc;
266
for(i=0;i<=stride_levels;i++) {
267
ARMCI_PROF[event_type][range].stride[idx].count[i]=count[i];
273
else { /* Do nothing. It is just an event overlap */ }
276
/* Stops profiling a strided transfer.  The range is taken from the current
 * event.  When the event is actually closed (not an overlap) and
 * ARMCI_PRINT_STRIDE is enabled, the elapsed time is stored in the per-call
 * record of the most recent call for strided event types. */
void armci_profile_stop_strided(int event_type) {
278
int status, range = gCURRENT_EVNT.range;
280
status = armci_profile_close_event(event_type, range, &time,
281
"armci_profile_stop_strided");
283
#if ARMCI_PRINT_STRIDE
284
if(status == ARMCI_EVENT_CLOSED) {
285
/* record the time of each strided data transfer */
286
if(strided_event(event_type)) {
287
int idx = ARMCI_PROF[event_type][range].count-1;
289
/* NOTE(review): a bounds check of idx against STRIDE_COUNT is not visible
 * in this listing -- confirm one guards this store. */
ARMCI_PROF[event_type][range].stride[idx].time = time;
295
/* Starts profiling a vector transfer.
 *
 * The message size is the sum of darr[i].bytes over the len descriptors;
 * its bucket is floor(log2(bytes)), forced to 0 when bytes <= 0 and clamped
 * to ARMCI_MAX_MSG_RANGE-1.  If this call starts a new event (no overlap),
 * the slot's call count is incremented and, with ARMCI_PRINT_STRIDE enabled,
 * the first STRIDE_COUNT calls also have vec_len, proc and a copy of each
 * descriptor's ptr_array_len/bytes recorded. */
void armci_profile_start_vector(armci_giov_t darr[], int len, int proc,
298
int i, bytes=0, range, status;
300
/* find the message range */
301
for(i=0; i<len; i++) bytes += darr[i].bytes;
302
if(bytes<=0) range=0;
303
else range = (int) (log((double)bytes)/log(2.0));
304
if(range>=ARMCI_MAX_MSG_RANGE-1) range = ARMCI_MAX_MSG_RANGE-1;
306
/* set the current event for timer */
307
status = armci_profile_set_event(event_type, range);
309
if(status == ARMCI_EVENT_SET) { /* new event set */
310
/* profile update: i.e. update event count */
311
ARMCI_PROF[event_type][range].count++;
313
# if ARMCI_PRINT_STRIDE
315
/* record index is the zero-based call number within this slot */
int idx = ARMCI_PROF[event_type][range].count-1;
316
if(idx<STRIDE_COUNT) {
317
ARMCI_PROF[event_type][range].vector[idx].vec_len = len;
318
ARMCI_PROF[event_type][range].vector[idx].proc = proc;
319
/* NOTE(review): no check of this malloc result and no matching free are
 * visible in this listing -- confirm. */
ARMCI_PROF[event_type][range].vector[idx].giov =
320
(giov_t*)malloc(len*sizeof(giov_t));
322
ARMCI_PROF[event_type][range].vector[idx].giov[i].ptr_array_len = darr[i].ptr_array_len;
323
ARMCI_PROF[event_type][range].vector[idx].giov[i].bytes =
332
/* Stops profiling a vector transfer.  The range is taken from the current
 * event.  When the event is actually closed (not an overlap) and
 * ARMCI_PRINT_STRIDE is enabled, the elapsed time is stored in the per-call
 * vector record of the most recent call. */
void armci_profile_stop_vector(int event_type) {
334
int status, range = gCURRENT_EVNT.range;
336
status = armci_profile_close_event(event_type, range, &time,
337
"armci_profile_stop_vector");
339
#if ARMCI_PRINT_STRIDE
340
if(status == ARMCI_EVENT_CLOSED) {/*record time of each data transfer*/
341
int idx = ARMCI_PROF[event_type][range].count-1;
343
/* NOTE(review): a bounds check of idx against STRIDE_COUNT is not visible
 * in this listing -- confirm one guards this store. */
ARMCI_PROF[event_type][range].vector[idx].time = time;
348
/* Starts profiling an event that has no message size.  If this call starts
 * a new event (no overlap with an active one), the slot's call count is
 * incremented. */
void armci_profile_start(int event_type) {
351
/* message range is zero for events registered using this call */
354
/* set the current event for timer */
355
status = armci_profile_set_event(event_type, range);
356
if(status == ARMCI_EVENT_SET) { /* new event set */
357
/* profile update: i.e. update event count */
358
ARMCI_PROF[event_type][range].count++;
362
/* Stops profiling an event started with armci_profile_start().  The range
 * is taken from the current event; accumulation of the elapsed time is done
 * by armci_profile_close_event(). */
void armci_profile_stop(int event_type) {
364
int status,range = gCURRENT_EVNT.range;
365
status = armci_profile_close_event(event_type, range, &time,
366
"armci_profile_stop");
369
/* Section banners and column headers for the profile report written to fp.
 * Each macro expands to a complete fprintf statement including its trailing
 * semicolon.  As shown here, ARMCI_HDR3 and ARMCI_HDR8 expand to the same
 * column header. */
#define ARMCI_HDR0(fp) fprintf(fp, "\n\n************** TOTAL DATA TRANSFERS **************\n\n");
370
#define ARMCI_HDR1(fp) fprintf(fp, "\n\n************ CONTIGUOUS DATA TRANSFER ************\n\n");
371
#define ARMCI_HDR2(fp) fprintf(fp, "\n\n********** NON-CONTIGUOUS DATA TRANSFER **********\n\n");
372
#define ARMCI_HDR3(fp) fprintf(fp, "#gets\t #puts\t #accs\t get_time put_time acc_time RANGE(bytes)\n\n");
373
#define ARMCI_HDR4(fp) fprintf(fp, "SL#\tndim\t proc\t time stride_info\n\n");
374
#define ARMCI_HDR5(fp) fprintf(fp, "SL#\tnvec\t proc\t time\t [ #arrays\t bytes\t]\n");
375
#define ARMCI_HDR6(fp) fprintf(fp, "\n\n****** NON-BLOCKING CONTIGUOUS DATA TRANSFER *****\n\n");
376
#define ARMCI_HDR7(fp) fprintf(fp, "\n\n*** NON-BLOCKING NON-CONTIGUOUS DATA TRANSFER ****\n\n");
377
#define ARMCI_HDR8(fp) fprintf(fp, "#gets\t #puts\t #accs\t get_time put_time acc_time RANGE(bytes)\n\n");
378
#define ARMCI_HDR9(fp) fprintf(fp, "\n\n******************* ARMCI MISC *******************\n\n");
380
/* Writes the "TOTAL DATA TRANSFERS" table to fp: for every message range,
 * the call counts and accumulated times of get, put and acc, each summed
 * over the contiguous, strided and vector variants in both their blocking
 * and non-blocking forms.  The last range is labelled as open-ended (>=). */
381
static void armci_print_all(FILE *fp) {
382
int i, nget, nput, nacc, nrange=ARMCI_MAX_MSG_RANGE;
383
double gtime, ptime, atime;
385
ARMCI_HDR0(fp); ARMCI_HDR3(fp);
386
for(i=0; i< nrange; i++) {
388
/* call counts */
nget =(ARMCI_PROF[ARMCI_PROF_GET][i].count +
389
ARMCI_PROF[ARMCI_PROF_GETS][i].count +
390
ARMCI_PROF[ARMCI_PROF_GETV][i].count +
391
ARMCI_PROF[ARMCI_PROF_NBGET][i].count +
392
ARMCI_PROF[ARMCI_PROF_NBGETS][i].count +
393
ARMCI_PROF[ARMCI_PROF_NBGETV][i].count);
394
nput =(ARMCI_PROF[ARMCI_PROF_PUT][i].count +
395
ARMCI_PROF[ARMCI_PROF_PUTS][i].count +
396
ARMCI_PROF[ARMCI_PROF_PUTV][i].count +
397
ARMCI_PROF[ARMCI_PROF_NBPUT][i].count +
398
ARMCI_PROF[ARMCI_PROF_NBPUTS][i].count +
399
ARMCI_PROF[ARMCI_PROF_NBPUTV][i].count);
400
nacc =(ARMCI_PROF[ARMCI_PROF_ACC][i].count +
401
ARMCI_PROF[ARMCI_PROF_ACCS][i].count +
402
ARMCI_PROF[ARMCI_PROF_ACCV][i].count +
403
ARMCI_PROF[ARMCI_PROF_NBACC][i].count +
404
ARMCI_PROF[ARMCI_PROF_NBACCS][i].count +
405
ARMCI_PROF[ARMCI_PROF_NBACCV][i].count);
407
/* accumulated times */
gtime = (ARMCI_PROF[ARMCI_PROF_GET][i].time +
408
ARMCI_PROF[ARMCI_PROF_GETS][i].time +
409
ARMCI_PROF[ARMCI_PROF_GETV][i].time +
410
ARMCI_PROF[ARMCI_PROF_NBGET][i].time +
411
ARMCI_PROF[ARMCI_PROF_NBGETS][i].time +
412
ARMCI_PROF[ARMCI_PROF_NBGETV][i].time);
413
ptime = (ARMCI_PROF[ARMCI_PROF_PUT][i].time +
414
ARMCI_PROF[ARMCI_PROF_PUTS][i].time +
415
ARMCI_PROF[ARMCI_PROF_PUTV][i].time +
416
ARMCI_PROF[ARMCI_PROF_NBPUT][i].time +
417
ARMCI_PROF[ARMCI_PROF_NBPUTS][i].time +
418
ARMCI_PROF[ARMCI_PROF_NBPUTV][i].time);
419
atime = (ARMCI_PROF[ARMCI_PROF_ACC][i].time +
420
ARMCI_PROF[ARMCI_PROF_ACCS][i].time +
421
ARMCI_PROF[ARMCI_PROF_ACCV][i].time +
422
ARMCI_PROF[ARMCI_PROF_NBACC][i].time +
423
ARMCI_PROF[ARMCI_PROF_NBACCS][i].time +
424
ARMCI_PROF[ARMCI_PROF_NBACCV][i].time);
426
fprintf(fp, "%d\t %d\t %d\t %.2e %.2e %.2e ",
427
nget, nput, nacc, gtime, ptime, atime);
428
/* range label: 2^i .. 2^(i+1)-1, or ">= 2^(last)" for the final bucket */
if (i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
429
else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
433
/* Writes the "CONTIGUOUS DATA TRANSFER" table to fp: for every message
 * range, the call counts and accumulated times of the blocking contiguous
 * get, put and acc events.  The last range is labelled as open-ended (>=). */
434
static void armci_print_contig(FILE *fp) {
435
int i, nrange=ARMCI_MAX_MSG_RANGE;
436
ARMCI_HDR1(fp); ARMCI_HDR3(fp);
437
for(i=0; i< nrange; i++) {
438
fprintf(fp, "%d\t %d\t %d\t %.2e %.2e %.2e ",
439
ARMCI_PROF[ARMCI_PROF_GET][i].count,
440
ARMCI_PROF[ARMCI_PROF_PUT][i].count,
441
ARMCI_PROF[ARMCI_PROF_ACC][i].count,
442
ARMCI_PROF[ARMCI_PROF_GET][i].time,
443
ARMCI_PROF[ARMCI_PROF_PUT][i].time,
444
ARMCI_PROF[ARMCI_PROF_ACC][i].time);
445
/* range label: 2^i .. 2^(i+1)-1, or ">= 2^(last)" for the final bucket */
if(i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
446
else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
450
/* This prints the number of non-contiguous get/put/acc/ calls for every
452
static void armci_print_noncontig(FILE *fp) {
/* Writes the "NON-CONTIGUOUS DATA TRANSFER" table to fp: for every message
 * range, get/put/acc counts and times summed over the blocking strided and
 * vector events.  The last range is labelled as open-ended (>=). */
453
int i, nget, nput, nacc, nrange=ARMCI_MAX_MSG_RANGE;
454
double gtime, ptime, atime;
456
ARMCI_HDR2(fp); ARMCI_HDR3(fp);
457
for(i=0; i< nrange; i++) {
458
nget = (ARMCI_PROF[ARMCI_PROF_GETS][i].count +
459
ARMCI_PROF[ARMCI_PROF_GETV][i].count);
460
nput = (ARMCI_PROF[ARMCI_PROF_PUTS][i].count +
461
ARMCI_PROF[ARMCI_PROF_PUTV][i].count);
462
nacc = (ARMCI_PROF[ARMCI_PROF_ACCS][i].count +
463
ARMCI_PROF[ARMCI_PROF_ACCV][i].count);
464
gtime = (ARMCI_PROF[ARMCI_PROF_GETS][i].time +
465
ARMCI_PROF[ARMCI_PROF_GETV][i].time);
466
ptime = (ARMCI_PROF[ARMCI_PROF_PUTS][i].time +
467
ARMCI_PROF[ARMCI_PROF_PUTV][i].time);
468
atime = (ARMCI_PROF[ARMCI_PROF_ACCS][i].time +
469
ARMCI_PROF[ARMCI_PROF_ACCV][i].time);
471
fprintf(fp, "%d\t %d\t %d\t %.2e %.2e %.2e ",
472
nget, nput, nacc, gtime, ptime, atime);
473
/* range label: 2^i .. 2^(i+1)-1, or ">= 2^(last)" for the final bucket */
if (i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
474
else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
478
/* print profile of non-blocking contiguous get/put/acc calls for every
480
static void armci_print_nbcontig(FILE *fp) {
/* Writes the "NON-BLOCKING CONTIGUOUS DATA TRANSFER" table to fp: for every
 * message range, the call counts and accumulated times of the non-blocking
 * contiguous get, put and acc events.  The last range is labelled as
 * open-ended (>=). */
481
int i, nrange=ARMCI_MAX_MSG_RANGE;
482
ARMCI_HDR6(fp); ARMCI_HDR8(fp);
483
for(i=0; i< nrange; i++) {
484
fprintf(fp, "%d\t %d\t %d\t %.2e %.2e %.2e ",
485
ARMCI_PROF[ARMCI_PROF_NBGET][i].count,
486
ARMCI_PROF[ARMCI_PROF_NBPUT][i].count,
487
ARMCI_PROF[ARMCI_PROF_NBACC][i].count,
488
ARMCI_PROF[ARMCI_PROF_NBGET][i].time,
489
ARMCI_PROF[ARMCI_PROF_NBPUT][i].time,
490
ARMCI_PROF[ARMCI_PROF_NBACC][i].time);
491
/* range label: 2^i .. 2^(i+1)-1, or ">= 2^(last)" for the final bucket */
if(i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
492
else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
496
/* Writes the "NON-BLOCKING NON-CONTIGUOUS DATA TRANSFER" table to fp: for
 * every message range, get/put/acc counts and times summed over the
 * non-blocking strided and vector events.  The last range is labelled as
 * open-ended (>=). */
498
static void armci_print_nbnoncontig(FILE *fp) {
499
int i, nget, nput, nacc, nrange=ARMCI_MAX_MSG_RANGE;
500
double gtime, ptime, atime;
502
ARMCI_HDR7(fp); ARMCI_HDR8(fp);
503
for(i=0; i< nrange; i++) {
504
nget = (ARMCI_PROF[ARMCI_PROF_NBGETS][i].count +
505
ARMCI_PROF[ARMCI_PROF_NBGETV][i].count);
506
nput = (ARMCI_PROF[ARMCI_PROF_NBPUTS][i].count +
507
ARMCI_PROF[ARMCI_PROF_NBPUTV][i].count);
508
nacc = (ARMCI_PROF[ARMCI_PROF_NBACCS][i].count +
509
ARMCI_PROF[ARMCI_PROF_NBACCV][i].count);
510
gtime = (ARMCI_PROF[ARMCI_PROF_NBGETS][i].time +
511
ARMCI_PROF[ARMCI_PROF_NBGETV][i].time);
512
ptime = (ARMCI_PROF[ARMCI_PROF_NBPUTS][i].time +
513
ARMCI_PROF[ARMCI_PROF_NBPUTV][i].time);
514
atime = (ARMCI_PROF[ARMCI_PROF_NBACCS][i].time +
515
ARMCI_PROF[ARMCI_PROF_NBACCV][i].time);
517
fprintf(fp, "%d\t %d\t %d\t %.2e %.2e %.2e ",
518
nget, nput, nacc, gtime, ptime, atime);
519
/* range label: 2^i .. 2^(i+1)-1, or ">= 2^(last)" for the final bucket */
if (i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
520
else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
524
/* Writes the profile of the events that carry no message size to fp: call
 * count and accumulated time of ARMCI_Wait(), armci_notify_wait(),
 * ARMCI_Barrier(), ARMCI_Fence(), ARMCI_Allfence() and ARMCI_Rmw().  All
 * values are read from range slot 0 of the corresponding event. */
525
static void armci_print_misc(FILE *fp) {
527
fprintf(fp, "#calls\t time\t EVENT\n\n");
528
fprintf(fp, "%d\t %.2e ARMCI_Wait()\n",
529
ARMCI_PROF[ARMCI_PROF_WAIT][0].count,
530
ARMCI_PROF[ARMCI_PROF_WAIT][0].time);
531
fprintf(fp, "%d\t %.2e armci_notify_wait()\n",
532
ARMCI_PROF[ARMCI_PROF_NOTIFY][0].count,
533
ARMCI_PROF[ARMCI_PROF_NOTIFY][0].time);
534
fprintf(fp, "%d\t %.2e ARMCI_Barrier()\n",
535
ARMCI_PROF[ARMCI_PROF_BARRIER][0].count,
536
ARMCI_PROF[ARMCI_PROF_BARRIER][0].time);
537
fprintf(fp, "%d\t %.2e ARMCI_Fence()\n",
538
ARMCI_PROF[ARMCI_PROF_FENCE][0].count,
539
ARMCI_PROF[ARMCI_PROF_FENCE][0].time);
540
fprintf(fp, "%d\t %.2e ARMCI_Allfence()\n",
541
ARMCI_PROF[ARMCI_PROF_ALLFENCE][0].count,
542
ARMCI_PROF[ARMCI_PROF_ALLFENCE][0].time);
543
fprintf(fp, "%d\t %.2e ARMCI_Rmw()\n",
544
ARMCI_PROF[ARMCI_PROF_RMW][0].count,
545
ARMCI_PROF[ARMCI_PROF_RMW][0].time);
548
#if ARMCI_PRINT_STRIDE
549
/* Writes a warning to fp that the number of transfers in a message range
 * (str_count) exceeds what the per-call records can hold, so only the first
 * STRIDE_COUNT transfers are listed in the report. */
static void armci_print_warning_msg(FILE *fp, int range, int str_count) {
550
fprintf(fp, "WARNING: In your program, total number of data transfers\n");
551
fprintf(fp, "for message range[%d - %d] is %d. This exceeds\n",
552
/* NOTE(review): the upper bound printed here is 1<<(range+1), whereas the
 * other range printouts in this file use (1<<(range+1))-1 -- confirm which
 * is intended. */
1<<range, 1<<(range+1), str_count);
553
fprintf(fp,"the maximum # of data transfers [%d] that can be profiled.\n",
555
fprintf(fp, "Therefore profile of only first %d data \n", STRIDE_COUNT);
556
fprintf(fp, "transfers are shown below. To increase the count, set\n");
557
fprintf(fp, "STRIDE_COUNT > %d (in armci_profile.c)\n", str_count);
560
/* Writes the per-call stride records of one (event, range) slot to fp.
 * Nothing is written when the slot has no calls.  When the call count
 * exceeds STRIDE_COUNT, a warning is written and only the first
 * STRIDE_COUNT records are listed.  Each row shows the call index, the
 * stride level count, the proc, the call's time and the count[] values
 * joined by "x". */
static void armci_print_stridedinfo(FILE *fp, int event, int range) {
561
int i, j, stride_levels, str_count;
564
str_count = ARMCI_PROF[event][range].count;
565
if(str_count <=0) return;
566
if(str_count > STRIDE_COUNT) {
567
armci_print_warning_msg(fp, range, str_count);
568
str_count = STRIDE_COUNT;
571
fprintf(fp, "\n\nSTRIDE INFORMATION FOR MSG_RANGE %d-%d for EVENT: %s\n",
572
1<<range, (1<<(range+1))-1, gEventName[event]);
575
for(i=0; i< str_count; i++) {
576
/* running total of the listed calls' times */
time += ARMCI_PROF[event][range].stride[i].time;
577
stride_levels = ARMCI_PROF[event][range].stride[i].stride_levels;
578
fprintf(fp, "%d\t%d\t %d\t %.2e (",i, stride_levels,
579
ARMCI_PROF[event][range].stride[i].proc,
580
ARMCI_PROF[event][range].stride[i].time);
581
for(j=0;j<=stride_levels;j++) {
582
fprintf(fp, "%d", ARMCI_PROF[event][range].stride[i].count[j]);
583
if(j!=stride_levels) fprintf(fp, "x");
587
/*This o/p is just for verification*/
588
fprintf(fp, "**** STRIDE_COUNT = %d ; TOTAL TIME = %.2e\n",
592
/* Writes the per-call vector records of one (event, range) slot to fp.
 * Nothing is written when the slot has no calls.  When the call count
 * exceeds STRIDE_COUNT, a warning is written and only the first
 * STRIDE_COUNT records are listed.  Each row shows the call index, the
 * vector length, the proc and the call's time, followed by one
 * "[ ptr_array_len bytes ]" entry per descriptor. */
static void armci_print_vectorinfo(FILE *fp, int event, int range) {
593
int i, j, vec_len, str_count;
596
str_count = ARMCI_PROF[event][range].count;
597
if(str_count <=0) return;
598
if(str_count > STRIDE_COUNT) {
599
armci_print_warning_msg(fp, range, str_count);
600
str_count = STRIDE_COUNT;
603
fprintf(fp, "\n\nVECTOR INFORMATION FOR MSG_RANGE %d-%d for EVENT: %s\n",
604
1<<range, (1<<(range+1))-1, gEventName[event]);
607
for(i=0; i< str_count; i++) {
608
/* running total of the listed calls' times */
time += ARMCI_PROF[event][range].vector[i].time;
609
vec_len = ARMCI_PROF[event][range].vector[i].vec_len;
610
fprintf(fp, "%d\t%d\t %d\t %.2e [ ",i, vec_len,
611
ARMCI_PROF[event][range].vector[i].proc,
612
ARMCI_PROF[event][range].vector[i].time);
613
for(j=0;j<vec_len;j++) {
614
fprintf(fp, "%-9d %d\t]\n",
615
ARMCI_PROF[event][range].vector[i].giov[j].ptr_array_len,
616
ARMCI_PROF[event][range].vector[i].giov[j].bytes);
617
/* open the bracket for the next descriptor on a continuation row */
if(j!=vec_len-1) fprintf(fp, "\t\t\t\t [ ");
620
/*This o/p is just for verification*/
621
fprintf(fp, "**** STRIDE_COUNT = %d ; TOTAL TIME = %.2e\n",
624
#endif /* end of ARMCI_PRINT_STRIDE */
626
void armci_profile_terminate() {
629
sprintf(file_name, "armci_profile.%d", armci_me);
630
fp = fopen(file_name, "w");
632
armci_print_all(fp); /* total get/put/acc calls */
633
armci_print_contig(fp); /* contiguous calls */
634
armci_print_noncontig(fp); /* non-contiguous calls */
635
armci_print_nbcontig(fp); /* non-blocking contiguous calls */
636
armci_print_nbnoncontig(fp); /* non-blocking non-contiguous calls */
638
/* miscellaneous (barrier, armci_wait, notify_wait) */
639
armci_print_misc(fp);
641
#if ARMCI_PRINT_STRIDE
644
* printing stride info for non-contiguous get (ARMCI_PROF_GETS) for message
645
* range #6. 2^6 - 2^(6+1) bytes (i.e. 64-128 bytes)
646
* Ex: armci_print_stridedinfo(ARMCI_PROF_GETS,6);
648
#define ARMCI_PRINT_EVENTS 6
650
int str_event[ARMCI_PRINT_EVENTS]={ARMCI_PROF_GETS, ARMCI_PROF_PUTS,
651
ARMCI_PROF_ACCS, ARMCI_PROF_NBGETS,
652
ARMCI_PROF_NBPUTS,ARMCI_PROF_NBACCS};
653
int vec_event[ARMCI_PRINT_EVENTS]={ARMCI_PROF_GETV, ARMCI_PROF_PUTV,
654
ARMCI_PROF_ACCV, ARMCI_PROF_NBGETV,
655
ARMCI_PROF_NBPUTV,ARMCI_PROF_NBACCV};
657
fprintf(fp,"\n\n***************************************************\n");
658
fprintf(fp, " STRIDE INFORMATION for all strided data transfers\n");
659
fprintf(fp, "***************************************************\n");
660
for(i=0; i<ARMCI_MAX_MSG_RANGE; i++)
661
for(j=0; j<ARMCI_PRINT_EVENTS; j++)
662
armci_print_stridedinfo(fp,str_event[j], i);
664
fprintf(fp,"\n\n**************************************************\n");
665
fprintf(fp, " VECTOR INFORMATION for all vector data transfers\n");
666
fprintf(fp, "**************************************************\n");
667
for(i=0; i<ARMCI_MAX_MSG_RANGE; i++)
668
for(j=0; j<ARMCI_PRINT_EVENTS; j++)
669
armci_print_vectorinfo(fp,vec_event[j], i);