/* $Id: armci_profile.c,v 1.8 2005-11-30 10:20:53 vinod Exp $ */

/**
 * Set an environment variable as follows to enable ARMCI profiling:
 *    export ARMCI_PROFILE=YES (bash)
 *    setenv ARMCI_PROFILE YES (csh/tcsh)
 *
 * The profiler can profile the following ARMCI calls:
 *    ARMCI_Get,ARMCI_Put,ARMCI_Acc,ARMCI_NbGet,ARMCI_NbPut,ARMCI_NbAcc,
 *    ARMCI_GetS,ARMCI_PutS,ARMCI_AccS,ARMCI_NbGetS,ARMCI_NbPutS,ARMCI_NbAccS,
 *    ARMCI_GetV,ARMCI_PutV,ARMCI_AccV,ARMCI_NbGetV,ARMCI_NbPutV,ARMCI_NbAccV,
 *    ARMCI_Wait, armci_wait_notify
 *      (NOTE: As armci_notify is the same as ARMCI_Put, it is not profiled.)
 *
 * Note #1: Right now, only process 0's profile is printed.
 * Each and every process saves its profile in the corresponding data structure.
 * Each process prints its profile to an output file armci_profile.<myrank>
 * when armci_profile_terminate() is called (called in ARMCI_Finalize()).
 *
 * Note #2: By default the profiler prints message ranges 0 to 21. Example:
 * range 10 corresponds to messages from 1024 bytes to 2047 bytes.
 * Message ranges are powers of 2. For example:
 *     ------------------------------------
 *     MSG_RANGE (r)        BYTES (2^r to 2^(r+1)-1)
 *     ------------------------------------
 *      0                    0-1
 *      1                    2-3
 *      ...                  ...
 *      10                   1024-2047
 *      ...                  ...
 *      21                   >= 2097152
 *     -------------------------------------
 * To increase the message range, set ARMCI_MAX_MSG_RANGE accordingly.
 *
 * Note #3: If stride information needs to be printed, set ARMCI_PRINT_STRIDE.
 * Stride information is printed in armci_profile_terminate() for a
 * selection of message ranges and event types. Modify it according to your
 * needs.
 *
 * Note #4: There is no profiling support for non-blocking operations yet!!
 */
54
#include "armci_profile.h"
58
/* MP_TIMER() is the timer read when an event is started and again when it is
 * closed; the difference is accumulated as the event time.
 * NOTE(review): two unconditional definitions of MP_TIMER appear here; the
 * preprocessor conditional that presumably selects one of them is not
 * visible in this view -- confirm before building. */
# define MP_TIMER TCGTIME_
61
# define MP_TIMER MPI_Wtime
65
/* Non-zero enables the per-transfer stride/vector records guarded by
 * "#if ARMCI_PRINT_STRIDE" throughout this file. */
#define ARMCI_PRINT_STRIDE 1
66
/* Number of message-size buckets; second dimension of ARMCI_PROF. */
#define ARMCI_MAX_MSG_RANGE 22 /* 0 to 21 */
#if ARMCI_PRINT_STRIDE

/* Maximum number of individual transfers recorded per (event, range) slot. */
#  define STRIDE_COUNT 1000
/* Capacity of armci_stride_t.count[]; stride_levels must stay below this. */
#  define ARMCI_MAX_DIM 7

/* Record of one strided transfer (filled by armci_profile_start_strided and
 * completed with the elapsed time by armci_profile_stop_strided). */
typedef struct armci_stride {
    int stride_levels;         /* entries 0..stride_levels of count[] are valid */
    int proc;                  /* "proc" argument of the profiled call */
    int count[ARMCI_MAX_DIM];  /* copy of the caller's count[] array */
    double time;               /* elapsed time of this transfer */
} armci_stride_t;

/* One descriptor of a vector transfer, copied from armci_giov_t. */
typedef struct giov {
    int ptr_array_len;
    int bytes;
} giov_t;

/* Record of one vector transfer (filled by armci_profile_start_vector and
 * completed with the elapsed time by armci_profile_stop_vector). */
typedef struct armci_vector {
    int vec_len;     /* number of entries in giov[] */
    int proc;        /* "proc" argument of the profiled call */
    giov_t *giov;    /* heap array of vec_len descriptors */
    double time;     /* elapsed time of this transfer */
} armci_vector_t;

#endif /* ARMCI_PRINT_STRIDE */
94
#define ARMCI_EVENTS 24
96
char *gEventName[ARMCI_EVENTS]={
98
"STRIDED GET", "STRIDED PUT", "STRIDED ACC",
99
"VECTOR GET", "VECTOR PUT", "VECTOR ACC",
100
"NBGET", "NBPUT", "NBACC",
101
"STRIDED NBGET", "STRIDED NBPUT", "STRIDED NBACC",
102
"VECTOR NBGET", "VECTOR NBPUT", "VECTOR NBACC",
103
"BARRIER","ARMCI_WAIT","NOTIFY_WAIT",
104
"FENCE", "ALLFENCE", "RMW"
107
typedef struct armci_profile {
108
int count; /* number of times called */
109
double time; /* total execution time for "count" calls */
110
#if ARMCI_PRINT_STRIDE
111
armci_stride_t *stride;
112
armci_vector_t *vector;
116
/* profile get/put/acc for various message ranges (i.e ARMCI_MAX_MSG_RANGE) */
117
static armci_profile_t ARMCI_PROF[ARMCI_EVENTS][ARMCI_MAX_MSG_RANGE];
127
static int strided_event(int e) {
128
if (e==ARMCI_PROF_GETS || e==ARMCI_PROF_PUTS || e==ARMCI_PROF_ACCS ||
129
e==ARMCI_PROF_NBGETS || e==ARMCI_PROF_NBPUTS || e==ARMCI_PROF_NBACCS)
134
void armci_profile_init() {
136
if(armci_me==0) {printf("\nProfiling ARMCI - ON\n");fflush(stdout);}
138
gCURRENT_EVNT.is_set = 0;
140
for(i=0; i<ARMCI_EVENTS; i++)
141
for(j=0; j<ARMCI_MAX_MSG_RANGE; j++) {
142
ARMCI_PROF[i][j].count = 0; ARMCI_PROF[i][j].time = 0.0;
145
#if ARMCI_PRINT_STRIDE
146
for(i=0; i<ARMCI_EVENTS; i++) {
148
for(j=0; j<ARMCI_MAX_MSG_RANGE; j++) {
149
ARMCI_PROF[i][j].stride = (armci_stride_t*)malloc(STRIDE_COUNT*sizeof(armci_stride_t));
150
ARMCI_PROF[i][j].vector = NULL;
151
if( ARMCI_PROF[i][j].stride == NULL)
152
armci_die("armci_profile_init(): malloc failed", armci_me);
154
if(i==ARMCI_PROF_GETV || i==ARMCI_PROF_PUTV || i==ARMCI_PROF_ACCV ||
155
i==ARMCI_PROF_NBGETV || i==ARMCI_PROF_NBPUTV || i==ARMCI_PROF_NBACCV)
156
for(j=0; j<ARMCI_MAX_MSG_RANGE; j++) {
157
ARMCI_PROF[i][j].vector = (armci_vector_t*)malloc(STRIDE_COUNT*sizeof(armci_vector_t));
158
ARMCI_PROF[i][j].stride = NULL;
159
if( ARMCI_PROF[i][j].vector == NULL)
160
armci_die("armci_profile_init(): malloc failed", armci_me);
/* Status codes returned by armci_profile_close_event(): CLOSED means the
 * timer was stopped and the time accumulated; NOTCLOSED means the call only
 * unwound one level of event overlap. */
#define ARMCI_EVENT_CLOSED     0
#define ARMCI_EVENT_NOTCLOSED -1
/* Status codes returned by armci_profile_set_event(): SET means a new event
 * was started; NOTSET means another event was already active (overlap). */
#define ARMCI_EVENT_SET        0
#define ARMCI_EVENT_NOTSET    -1
171
static int armci_profile_set_event(int event_type, int range) {
174
printf("\n%d(s):call profile set for %s isset is %d",armci_me,
175
gEventName[event_type],gCURRENT_EVNT.is_set);
177
printf("\n%d:call profile set for %s isset is %d",armci_me,
178
gEventName[event_type],gCURRENT_EVNT.is_set);
181
if(gCURRENT_EVNT.is_set == 0) { /* set an event */
182
gCURRENT_EVNT.is_set = 1;
183
gCURRENT_EVNT.event_type = event_type;
184
gCURRENT_EVNT.range = range;
185
gCURRENT_EVNT.start_time = MP_TIMER();
186
return ARMCI_EVENT_SET;
188
else gCURRENT_EVNT.is_set++; /* event overlap */
189
return ARMCI_EVENT_NOTSET;
192
static int armci_profile_close_event(int event_type, int range, double *time,
195
int curr_event = gCURRENT_EVNT.event_type;
198
printf("\n%d(s):call profile close for %s isset is %d",armci_me,
199
gEventName[event_type],gCURRENT_EVNT.is_set);
201
printf("\n%d:call profile close for %s isset is %d",armci_me,
202
gEventName[event_type],gCURRENT_EVNT.is_set);
207
if(gCURRENT_EVNT.is_set==1) { /* Yep, there is an event set. So close it.*/
208
/*Check if "profile stop" is called for corresponding "profile start"*/
209
if(event_type != curr_event) {
211
"%d: %s: ERROR:Profile started for %s, but stopped for %s\n",
212
armci_me,name,gEventName[curr_event],gEventName[event_type]);
214
armci_die("Profile_stop is called a different event", armci_me);
217
*time = MP_TIMER() - gCURRENT_EVNT.start_time;
218
ARMCI_PROF[curr_event][range].time += *time;
219
gCURRENT_EVNT.is_set = 0; /* close the event */
220
return ARMCI_EVENT_CLOSED;
222
else { /* event overlapping */
223
gCURRENT_EVNT.is_set--;
224
if(gCURRENT_EVNT.is_set<=0) {
225
char *msg="Profile_stop is called before profile_start";
226
printf("%d: %s: ERROR: %s. Event Name = %s\n", armci_me,
227
name, msg, gEventName[curr_event]);
229
armci_die(" profile_stop is called before profile_start", armci_me);
232
return ARMCI_EVENT_NOTCLOSED;
235
void armci_profile_start_strided(int count[], int stride_levels, int proc,
237
int i, status, bytes=1, range;
239
if(stride_levels >= ARMCI_MAX_DIM)
240
armci_die("ARMCI_PROFILE: stride_levels >= ARMCI_MAX_DIM. Increase ARMCI_MAX_DIM.", armci_me);
242
/* find the message range */
243
for(i=0; i<= stride_levels; i++) bytes *= count[i];
244
if(bytes<=0) range=0;
245
else range = (int) (log((double)bytes)/log(2.0));
246
if(range>=ARMCI_MAX_MSG_RANGE-1) range = ARMCI_MAX_MSG_RANGE-1;
248
/* set the curent event for timer */
249
status = armci_profile_set_event(event_type, range);
251
if(status == ARMCI_EVENT_SET) { /* new event set */
252
/* profile update: i.e. update event count */
253
ARMCI_PROF[event_type][range].count++;
255
# if ARMCI_PRINT_STRIDE
256
if(strided_event(event_type)) {
257
int idx = ARMCI_PROF[event_type][range].count-1;
258
if(idx<STRIDE_COUNT) {
259
ARMCI_PROF[event_type][range].stride[idx].stride_levels = stride_levels;
260
ARMCI_PROF[event_type][range].stride[idx].proc = proc;
261
for(i=0;i<=stride_levels;i++) {
262
ARMCI_PROF[event_type][range].stride[idx].count[i]=count[i];
268
else { /* Do nothing. It is just an event overlap */ }
271
void armci_profile_stop_strided(int event_type) {
273
int status, range = gCURRENT_EVNT.range;
275
status = armci_profile_close_event(event_type, range, &time,
276
"armci_profile_stop_strided");
278
#if ARMCI_PRINT_STRIDE
279
if(status == ARMCI_EVENT_CLOSED) {
280
/* record the time of each strided data transfer */
281
if(strided_event(event_type)) {
282
int idx = ARMCI_PROF[event_type][range].count-1;
284
ARMCI_PROF[event_type][range].stride[idx].time = time;
290
void armci_profile_start_vector(armci_giov_t darr[], int len, int proc,
293
int i, bytes=0, range, status;
295
/* find the message range */
296
for(i=0; i<len; i++) bytes += darr[i].bytes;
297
if(bytes<=0) range=0;
298
else range = (int) (log((double)bytes)/log(2.0));
299
if(range>=ARMCI_MAX_MSG_RANGE-1) range = ARMCI_MAX_MSG_RANGE-1;
301
/* set the curent event for timer */
302
status = armci_profile_set_event(event_type, range);
304
if(status == ARMCI_EVENT_SET) { /* new event set */
305
/* profile update: i.e. update event count */
306
ARMCI_PROF[event_type][range].count++;
308
# if ARMCI_PRINT_STRIDE
310
int idx = ARMCI_PROF[event_type][range].count-1;
311
if(idx<STRIDE_COUNT) {
312
ARMCI_PROF[event_type][range].vector[idx].vec_len = len;
313
ARMCI_PROF[event_type][range].vector[idx].proc = proc;
314
ARMCI_PROF[event_type][range].vector[idx].giov =
315
(giov_t*)malloc(len*sizeof(giov_t));
317
ARMCI_PROF[event_type][range].vector[idx].giov[i].ptr_array_len = darr[i].ptr_array_len;
318
ARMCI_PROF[event_type][range].vector[idx].giov[i].bytes =
327
void armci_profile_stop_vector(int event_type) {
329
int status, range = gCURRENT_EVNT.range;
331
status = armci_profile_close_event(event_type, range, &time,
332
"armci_profile_stop_vector");
334
#if ARMCI_PRINT_STRIDE
335
if(status == ARMCI_EVENT_CLOSED) {/*record time of each data transfer*/
336
int idx = ARMCI_PROF[event_type][range].count-1;
338
ARMCI_PROF[event_type][range].vector[idx].time = time;
343
void armci_profile_start(int event_type) {
346
/* message range is zero for events registered using this call */
349
/* set the curent event for timer */
350
status = armci_profile_set_event(event_type, range);
351
if(status == ARMCI_EVENT_SET) { /* new event set */
352
/* profile update: i.e. update event count */
353
ARMCI_PROF[event_type][range].count++;
357
void armci_profile_stop(int event_type) {
359
int status,range = gCURRENT_EVNT.range;
360
status = armci_profile_close_event(event_type, range, &time,
361
"armci_profile_stop");
/* Section banners and column headers written to the profile output file.
 * Each macro expands to a complete fprintf statement (including the
 * trailing ';'), so it must not be used as the unbraced body of an if/else. */
#define ARMCI_HDR0(fp) fprintf(fp, "\n\n************** TOTAL DATA TRANSFERS **************\n\n");
#define ARMCI_HDR1(fp) fprintf(fp, "\n\n************ CONTIGUOUS DATA TRANSFER ************\n\n");
#define ARMCI_HDR2(fp) fprintf(fp, "\n\n********** NON-CONTIGUOUS DATA TRANSFER **********\n\n");
#define ARMCI_HDR3(fp) fprintf(fp, "#gets\t #puts\t #accs\t get_time put_time acc_time RANGE(bytes)\n\n");
#define ARMCI_HDR4(fp) fprintf(fp, "SL#\tndim\t proc\t time stride_info\n\n");
#define ARMCI_HDR5(fp) fprintf(fp, "SL#\tnvec\t proc\t time\t [ #arrays\t bytes\t]\n");
#define ARMCI_HDR6(fp) fprintf(fp, "\n\n****** NON-BLOCKING CONTIGUOUS DATA TRANSFER *****\n\n");
#define ARMCI_HDR7(fp) fprintf(fp, "\n\n*** NON-BLOCKING NON-CONTIGUOUS DATA TRANSFER ****\n\n");
#define ARMCI_HDR8(fp) fprintf(fp, "#gets\t #puts\t #accs\t get_time put_time acc_time RANGE(bytes)\n\n");
#define ARMCI_HDR9(fp) fprintf(fp, "\n\n******************* ARMCI MISC *******************\n\n");
375
/* print profile of all get/put/acc calls for every message range */
376
static void armci_print_all(FILE *fp) {
377
int i, nget, nput, nacc, nrange=ARMCI_MAX_MSG_RANGE;
378
double gtime, ptime, atime;
380
ARMCI_HDR0(fp); ARMCI_HDR3(fp);
381
for(i=0; i< nrange; i++) {
383
nget =(ARMCI_PROF[ARMCI_PROF_GET][i].count +
384
ARMCI_PROF[ARMCI_PROF_GETS][i].count +
385
ARMCI_PROF[ARMCI_PROF_GETV][i].count +
386
ARMCI_PROF[ARMCI_PROF_NBGET][i].count +
387
ARMCI_PROF[ARMCI_PROF_NBGETS][i].count +
388
ARMCI_PROF[ARMCI_PROF_NBGETV][i].count);
389
nput =(ARMCI_PROF[ARMCI_PROF_PUT][i].count +
390
ARMCI_PROF[ARMCI_PROF_PUTS][i].count +
391
ARMCI_PROF[ARMCI_PROF_PUTV][i].count +
392
ARMCI_PROF[ARMCI_PROF_NBPUT][i].count +
393
ARMCI_PROF[ARMCI_PROF_NBPUTS][i].count +
394
ARMCI_PROF[ARMCI_PROF_NBPUTV][i].count);
395
nacc =(ARMCI_PROF[ARMCI_PROF_ACC][i].count +
396
ARMCI_PROF[ARMCI_PROF_ACCS][i].count +
397
ARMCI_PROF[ARMCI_PROF_ACCV][i].count +
398
ARMCI_PROF[ARMCI_PROF_NBACC][i].count +
399
ARMCI_PROF[ARMCI_PROF_NBACCS][i].count +
400
ARMCI_PROF[ARMCI_PROF_NBACCV][i].count);
402
gtime = (ARMCI_PROF[ARMCI_PROF_GET][i].time +
403
ARMCI_PROF[ARMCI_PROF_GETS][i].time +
404
ARMCI_PROF[ARMCI_PROF_GETV][i].time +
405
ARMCI_PROF[ARMCI_PROF_NBGET][i].time +
406
ARMCI_PROF[ARMCI_PROF_NBGETS][i].time +
407
ARMCI_PROF[ARMCI_PROF_NBGETV][i].time);
408
ptime = (ARMCI_PROF[ARMCI_PROF_PUT][i].time +
409
ARMCI_PROF[ARMCI_PROF_PUTS][i].time +
410
ARMCI_PROF[ARMCI_PROF_PUTV][i].time +
411
ARMCI_PROF[ARMCI_PROF_NBPUT][i].time +
412
ARMCI_PROF[ARMCI_PROF_NBPUTS][i].time +
413
ARMCI_PROF[ARMCI_PROF_NBPUTV][i].time);
414
atime = (ARMCI_PROF[ARMCI_PROF_ACC][i].time +
415
ARMCI_PROF[ARMCI_PROF_ACCS][i].time +
416
ARMCI_PROF[ARMCI_PROF_ACCV][i].time +
417
ARMCI_PROF[ARMCI_PROF_NBACC][i].time +
418
ARMCI_PROF[ARMCI_PROF_NBACCS][i].time +
419
ARMCI_PROF[ARMCI_PROF_NBACCV][i].time);
421
fprintf(fp, "%d\t %d\t %d\t %.2e %.2e %.2e ",
422
nget, nput, nacc, gtime, ptime, atime);
423
if (i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
424
else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
428
/* print profile of contiguous get/put/acc calls for every message range */
429
static void armci_print_contig(FILE *fp) {
430
int i, nrange=ARMCI_MAX_MSG_RANGE;
431
ARMCI_HDR1(fp); ARMCI_HDR3(fp);
432
for(i=0; i< nrange; i++) {
433
fprintf(fp, "%d\t %d\t %d\t %.2e %.2e %.2e ",
434
ARMCI_PROF[ARMCI_PROF_GET][i].count,
435
ARMCI_PROF[ARMCI_PROF_PUT][i].count,
436
ARMCI_PROF[ARMCI_PROF_ACC][i].count,
437
ARMCI_PROF[ARMCI_PROF_GET][i].time,
438
ARMCI_PROF[ARMCI_PROF_PUT][i].time,
439
ARMCI_PROF[ARMCI_PROF_ACC][i].time);
440
if(i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
441
else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
445
/* This prints the number of non-contiguous get/put/acc/ calls for every
447
static void armci_print_noncontig(FILE *fp) {
448
int i, nget, nput, nacc, nrange=ARMCI_MAX_MSG_RANGE;
449
double gtime, ptime, atime;
451
ARMCI_HDR2(fp); ARMCI_HDR3(fp);
452
for(i=0; i< nrange; i++) {
453
nget = (ARMCI_PROF[ARMCI_PROF_GETS][i].count +
454
ARMCI_PROF[ARMCI_PROF_GETV][i].count);
455
nput = (ARMCI_PROF[ARMCI_PROF_PUTS][i].count +
456
ARMCI_PROF[ARMCI_PROF_PUTV][i].count);
457
nacc = (ARMCI_PROF[ARMCI_PROF_ACCS][i].count +
458
ARMCI_PROF[ARMCI_PROF_ACCV][i].count);
459
gtime = (ARMCI_PROF[ARMCI_PROF_GETS][i].time +
460
ARMCI_PROF[ARMCI_PROF_GETV][i].time);
461
ptime = (ARMCI_PROF[ARMCI_PROF_PUTS][i].time +
462
ARMCI_PROF[ARMCI_PROF_PUTV][i].time);
463
atime = (ARMCI_PROF[ARMCI_PROF_ACCS][i].time +
464
ARMCI_PROF[ARMCI_PROF_ACCV][i].time);
466
fprintf(fp, "%d\t %d\t %d\t %.2e %.2e %.2e ",
467
nget, nput, nacc, gtime, ptime, atime);
468
if (i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
469
else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
473
/* print profile of non-blocking contiguous get/put/acc calls for every
475
static void armci_print_nbcontig(FILE *fp) {
476
int i, nrange=ARMCI_MAX_MSG_RANGE;
477
ARMCI_HDR6(fp); ARMCI_HDR8(fp);
478
for(i=0; i< nrange; i++) {
479
fprintf(fp, "%d\t %d\t %d\t %.2e %.2e %.2e ",
480
ARMCI_PROF[ARMCI_PROF_NBGET][i].count,
481
ARMCI_PROF[ARMCI_PROF_NBPUT][i].count,
482
ARMCI_PROF[ARMCI_PROF_NBACC][i].count,
483
ARMCI_PROF[ARMCI_PROF_NBGET][i].time,
484
ARMCI_PROF[ARMCI_PROF_NBPUT][i].time,
485
ARMCI_PROF[ARMCI_PROF_NBACC][i].time);
486
if(i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
487
else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
491
/* This prints the number of non-blocking non-contiguous get/put/acc/ calls
492
for every message range */
493
static void armci_print_nbnoncontig(FILE *fp) {
494
int i, nget, nput, nacc, nrange=ARMCI_MAX_MSG_RANGE;
495
double gtime, ptime, atime;
497
ARMCI_HDR7(fp); ARMCI_HDR8(fp);
498
for(i=0; i< nrange; i++) {
499
nget = (ARMCI_PROF[ARMCI_PROF_NBGETS][i].count +
500
ARMCI_PROF[ARMCI_PROF_NBGETV][i].count);
501
nput = (ARMCI_PROF[ARMCI_PROF_NBPUTS][i].count +
502
ARMCI_PROF[ARMCI_PROF_NBPUTV][i].count);
503
nacc = (ARMCI_PROF[ARMCI_PROF_NBACCS][i].count +
504
ARMCI_PROF[ARMCI_PROF_NBACCV][i].count);
505
gtime = (ARMCI_PROF[ARMCI_PROF_NBGETS][i].time +
506
ARMCI_PROF[ARMCI_PROF_NBGETV][i].time);
507
ptime = (ARMCI_PROF[ARMCI_PROF_NBPUTS][i].time +
508
ARMCI_PROF[ARMCI_PROF_NBPUTV][i].time);
509
atime = (ARMCI_PROF[ARMCI_PROF_NBACCS][i].time +
510
ARMCI_PROF[ARMCI_PROF_NBACCV][i].time);
512
fprintf(fp, "%d\t %d\t %d\t %.2e %.2e %.2e ",
513
nget, nput, nacc, gtime, ptime, atime);
514
if (i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
515
else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
519
/* Profile of armci_notify_wait(), ARMCI_Wait() and ARMCI_Barrier() */
520
static void armci_print_misc(FILE *fp) {
522
fprintf(fp, "#calls\t time\t EVENT\n\n");
523
fprintf(fp, "%d\t %.2e ARMCI_Wait()\n",
524
ARMCI_PROF[ARMCI_PROF_WAIT][0].count,
525
ARMCI_PROF[ARMCI_PROF_WAIT][0].time);
526
fprintf(fp, "%d\t %.2e armci_notify_wait()\n",
527
ARMCI_PROF[ARMCI_PROF_NOTIFY][0].count,
528
ARMCI_PROF[ARMCI_PROF_NOTIFY][0].time);
529
fprintf(fp, "%d\t %.2e ARMCI_Barrier()\n",
530
ARMCI_PROF[ARMCI_PROF_BARRIER][0].count,
531
ARMCI_PROF[ARMCI_PROF_BARRIER][0].time);
532
fprintf(fp, "%d\t %.2e ARMCI_Fence()\n",
533
ARMCI_PROF[ARMCI_PROF_FENCE][0].count,
534
ARMCI_PROF[ARMCI_PROF_FENCE][0].time);
535
fprintf(fp, "%d\t %.2e ARMCI_Allfence()\n",
536
ARMCI_PROF[ARMCI_PROF_ALLFENCE][0].count,
537
ARMCI_PROF[ARMCI_PROF_ALLFENCE][0].time);
538
fprintf(fp, "%d\t %.2e ARMCI_Rmw()\n",
539
ARMCI_PROF[ARMCI_PROF_RMW][0].count,
540
ARMCI_PROF[ARMCI_PROF_RMW][0].time);
543
#if ARMCI_PRINT_STRIDE
544
static void armci_print_warning_msg(FILE *fp, int range, int str_count) {
545
fprintf(fp, "WARNING: In your program, total number of data transfers\n");
546
fprintf(fp, "for message range[%d - %d] is %d. This exceeds\n",
547
1<<range, 1<<(range+1), str_count);
548
fprintf(fp,"the maximum # of data transfers [%d] that can be profiled.\n",
550
fprintf(fp, "Therefore profile of only first %d data \n", STRIDE_COUNT);
551
fprintf(fp, "transfers are shown below. To increase the count, set\n");
552
fprintf(fp, "STRIDE_COUNT > %d (in armci_profile.c)\n", str_count);
555
static void armci_print_stridedinfo(FILE *fp, int event, int range) {
556
int i, j, stride_levels, str_count;
559
str_count = ARMCI_PROF[event][range].count;
560
if(str_count <=0) return;
561
if(str_count > STRIDE_COUNT) {
562
armci_print_warning_msg(fp, range, str_count);
563
str_count = STRIDE_COUNT;
566
fprintf(fp, "\n\nSTRIDE INFORMATION FOR MSG_RANGE %d-%d for EVENT: %s\n",
567
1<<range, (1<<(range+1))-1, gEventName[event]);
570
for(i=0; i< str_count; i++) {
571
time += ARMCI_PROF[event][range].stride[i].time;
572
stride_levels = ARMCI_PROF[event][range].stride[i].stride_levels;
573
fprintf(fp, "%d\t%d\t %d\t %.2e (",i, stride_levels,
574
ARMCI_PROF[event][range].stride[i].proc,
575
ARMCI_PROF[event][range].stride[i].time);
576
for(j=0;j<=stride_levels;j++) {
577
fprintf(fp, "%d", ARMCI_PROF[event][range].stride[i].count[j]);
578
if(j!=stride_levels) fprintf(fp, "x");
582
/*This o/p is just for verification*/
583
fprintf(fp, "**** STRIDE_COUNT = %d ; TOTAL TIME = %.2e\n",
587
static void armci_print_vectorinfo(FILE *fp, int event, int range) {
588
int i, j, vec_len, str_count;
591
str_count = ARMCI_PROF[event][range].count;
592
if(str_count <=0) return;
593
if(str_count > STRIDE_COUNT) {
594
armci_print_warning_msg(fp, range, str_count);
595
str_count = STRIDE_COUNT;
598
fprintf(fp, "\n\nVECTOR INFORMATION FOR MSG_RANGE %d-%d for EVENT: %s\n",
599
1<<range, (1<<(range+1))-1, gEventName[event]);
602
for(i=0; i< str_count; i++) {
603
time += ARMCI_PROF[event][range].vector[i].time;
604
vec_len = ARMCI_PROF[event][range].vector[i].vec_len;
605
fprintf(fp, "%d\t%d\t %d\t %.2e [ ",i, vec_len,
606
ARMCI_PROF[event][range].vector[i].proc,
607
ARMCI_PROF[event][range].vector[i].time);
608
for(j=0;j<vec_len;j++) {
609
fprintf(fp, "%-9d %d\t]\n",
610
ARMCI_PROF[event][range].vector[i].giov[j].ptr_array_len,
611
ARMCI_PROF[event][range].vector[i].giov[j].bytes);
612
if(j!=vec_len-1) fprintf(fp, "\t\t\t\t [ ");
615
/*This o/p is just for verification*/
616
fprintf(fp, "**** STRIDE_COUNT = %d ; TOTAL TIME = %.2e\n",
619
#endif /* end of ARMCI_PRINT_STRIDE */
621
void armci_profile_terminate() {
624
sprintf(file_name, "armci_profile.%d", armci_me);
625
fp = fopen(file_name, "w");
627
armci_print_all(fp); /* total get/put/acc calls */
628
armci_print_contig(fp); /* contiguous calls */
629
armci_print_noncontig(fp); /* non-contiguous calls */
630
armci_print_nbcontig(fp); /* non-blocking contiguous calls */
631
armci_print_nbnoncontig(fp); /* non-blocking non-contiguous calls */
633
/* miscellaneous (barrier, armci_wait, notify_wait) */
634
armci_print_misc(fp);
636
#if ARMCI_PRINT_STRIDE
639
* printing stride info for non-contiguous get (ARMCI_PROF_GETS) for message
640
* range #6. 2^6 - 2^(6+1) bytes (i.e. 64-128 bytes)
641
* Ex: armci_print_stridedinfo(ARMCI_PROF_GETS,6);
643
#define ARMCI_PRINT_EVENTS 6
645
int str_event[ARMCI_PRINT_EVENTS]={ARMCI_PROF_GETS, ARMCI_PROF_PUTS,
646
ARMCI_PROF_ACCS, ARMCI_PROF_NBGETS,
647
ARMCI_PROF_NBPUTS,ARMCI_PROF_NBACCS};
648
int vec_event[ARMCI_PRINT_EVENTS]={ARMCI_PROF_GETV, ARMCI_PROF_PUTV,
649
ARMCI_PROF_ACCV, ARMCI_PROF_NBGETV,
650
ARMCI_PROF_NBPUTV,ARMCI_PROF_NBACCV};
652
fprintf(fp,"\n\n***************************************************\n");
653
fprintf(fp, " STRIDE INFORMATION for all strided data transfers\n");
654
fprintf(fp, "***************************************************\n");
655
for(i=0; i<ARMCI_MAX_MSG_RANGE; i++)
656
for(j=0; j<ARMCI_PRINT_EVENTS; j++)
657
armci_print_stridedinfo(fp,str_event[j], i);
659
fprintf(fp,"\n\n**************************************************\n");
660
fprintf(fp, " VECTOR INFORMATION for all vector data transfers\n");
661
fprintf(fp, "**************************************************\n");
662
for(i=0; i<ARMCI_MAX_MSG_RANGE; i++)
663
for(j=0; j<ARMCI_PRINT_EVENTS; j++)
664
armci_print_vectorinfo(fp,vec_event[j], i);
670
#endif /* end of ARMCI_PROFILE */