5
/* $Id: pack.c,v 1.36.10.1 2006-12-14 13:24:37 manoj Exp $ */
11
#if !defined(ACC_COPY) &&!defined(CRAY_YMP) &&!defined(CYGNUS)&&!defined(CYGWIN) &&!defined(BGML) &&!defined(DCMF)
15
/* OP_STRIDED is the routine the packing code below invokes for each chunk.
 * Two definitions are visible:
 *   - with REMOTE_OP defined it is an alias for armci_rem_strided;
 *   - a 12-argument function-like macro that forwards to armci_op_strided
 *     and drops its 10th argument (_delete1).
 * NOTE(review): the #else/#endif separating the two definitions are not
 * visible in this listing -- confirm against the full file. */
#if defined(REMOTE_OP)
16
# define OP_STRIDED armci_rem_strided
18
# define OP_STRIDED(_a,_b,_c,_d,_e,_f,_g,_h,_i,_delete1,_j,_hdl)\
19
armci_op_strided(_a,_b,_c,_d,_e,_f,_g,_h,_i,_j,_hdl)
23
/*\ determine if patch fits in the ARMCI buffer, and if not
24
* at which stride level (patch dim) it needs to be decomposed
25
* *fit_level is the value of stride level to perform packing at
26
* *nb means number of elements of count[*fit_level] that fit in buf
28
/* armci_fit_buffer: multiplies count[0..level] together level by level and
 * stops at the first level whose running product exceeds bufsize. The
 * outcome is reported through the fit_level and nb output pointers.
 * NOTE(review): the rest of the parameter list and the branch structure that
 * follows the loop are not visible in this listing; the comments below
 * describe only the visible statements. */
static void armci_fit_buffer(int count[], int stride_levels, int* fit_level,
31
/* bytes  - running product of count[] up to and including the current level
 * sbytes - value of bytes before the current level was multiplied in */
int bytes=1, sbytes=1;
34
/* find out at which stride level BUFFER becomes too small */
35
for(level=0; level<= stride_levels; level++){
36
sbytes = bytes; /* store #bytes at current level to save div cost later */
37
bytes *= count[level];
38
if(bufsize < bytes) break; /* the patch up to this level no longer fits */
41
/* buffer big enough for entire patch */
43
*fit_level = stride_levels; /* pack at the outermost level ... */
44
*nb = count[stride_levels]; /* ... and take all of its elements */
48
/* buffer too small */
51
/* smaller than a single column */
55
case -1: /* one column fits */
60
/* it could keep nb instances of (level-1)-dimensional patch */
67
/*\ The function decomposes a multi-dimensional patch so that it fits in the
68
* internal ARMCI buffer.
69
* It works by recursively reducing patch dimension until some portion of the
70
* subpatch fits in the buffer.
71
* The recursive process is controlled by "fit_level" and "nb" arguments,
72
* which have to be set to -1 at the top-level of the recursion tree.
74
* Argument last and variable looplast are used to indicate to sending/packing
75
* routine that we are dealing with the last portion of the request.
76
* Due to the recursive nature of packing code, the algorithm is as follows:
77
* if last=1 then internal for loop passes 1 for the last chunk
81
/* armci_pack_strided: applies OP_STRIDED to a strided patch in one of three
 * ways, depending on the decomposition chosen by armci_fit_buffer:
 *   - a single OP_STRIDED call when the whole patch fits (nb == chunk);
 *   - a loop of OP_STRIDED calls over pieces of count[fit_level];
 *   - a loop that recurses with stride_levels-1 for each element of the
 *     outermost level.
 * The visible code returns the OP_STRIDED result directly in the first case
 * and stores the result of the other calls in rc.
 * NOTE(review): several statements of this routine (braces, some
 * declarations, conditions, #endif lines, loop increments) are not visible
 * in this listing; comments below describe only the visible statements. */
int armci_pack_strided(int op, void* scale, int proc,
82
void *src_ptr, int src_stride_arr[],
83
void* dst_ptr, int dst_stride_arr[],
84
int count[], int stride_levels, ext_header_t *h,
85
int fit_level, int nb, int last,armci_ihdl_t nb_handle)
87
/* bufsize starts at BUFSIZE; the statements below may replace or shrink it */
int rc=0, bufsize=BUFSIZE,noswap=0;
96
/* static: keeps its value across calls; incremented once per chunk issued
 * with a non-NULL nb_handle (see the chunk loop below) */
static int call_count;
98
#ifdef STRIDED_GET_BUFLEN
99
if(op==GET)bufsize=STRIDED_GET_BUFLEN; /* GET uses its own buffer length */
102
/* NOTE(review): any preprocessor guard around the next statement is not
 * visible in this listing */
if(stride_levels || ARMCI_ACC(op))bufsize=MSG_BUFLEN_SMALL-PAGE_SIZE;
106
#if (defined(GM_) || defined(VIA_) || defined(VAPI_))
107
/*we cant assume that the entire available buffer will be used for data,
108
fact that the header and descriptor also go in the same buffer should be
109
considered while packing.
111
bufsize-=(sizeof(request_header_t)+(MAX_STRIDE_LEVEL+4)*sizeof(int)+2*sizeof(void *));
112
# if defined(PIPE_BUFSIZE) && defined(MAX_PIPELINE_CHUNKS)
113
bufsize-=8*MAX_PIPELINE_CHUNKS;
117
#ifdef BALANCE_FACTOR
118
/* Added the following for balancing buffers */
121
for(i=0; i<= stride_levels; i++)
123
/* NOTE(review): the loop body is not visible here; bytes is presumably the
 * patch size accumulated over count[0..stride_levels] -- confirm.
 * The adjustment applies only when the patch exceeds one buffer, spans fewer
 * than 3 buffers, and the remainder is below BALANCE_BUFSIZE. */
if(bytes > bufsize && bytes/bufsize < 3 && bytes%bufsize < BALANCE_BUFSIZE){
124
/* bytes div bufsize - 1 is to increase the balance factor for 3 buffer case */
125
bufsize = bytes/ (bytes/bufsize - 1 + BALANCE_FACTOR);
126
noswap = 1; /*** yuck: if set to 1, error in buffers.c ***/
133
/* determine decomposition of the patch to fit in the buffer */
135
armci_fit_buffer(count, stride_levels, &fit_level, &nb, bufsize);
139
if(fit_level == stride_levels){
141
/* we can fit subpatch into the buffer */
142
int chunk = count[fit_level];
143
int dst_stride, src_stride;
145
if(nb == chunk){ /* take shortcut when whole patch fits in the buffer */
146
if(h) h->last = last?1:0; /* propagate the caller's "last portion" flag */
147
/* a non-zero call_count means chunks were already issued with a handle,
 * so the handle is tagged NB_MULTI */
if(nb_handle && call_count ){
148
nb_handle->bufid=NB_MULTI;
151
return(OP_STRIDED(op, scale, proc, src_ptr, src_stride_arr,
152
dst_ptr,dst_stride_arr,count,stride_levels,h,flag,nb_handle));
156
/* NOTE(review): the condition selecting between the next two assignments
 * and the unit-stride assignment after them is not visible; index
 * fit_level-1 would be out of range for fit_level == 0, so the unit-stride
 * form presumably covers that case -- confirm */
dst_stride = dst_stride_arr[fit_level -1];
157
src_stride = src_stride_arr[fit_level -1];
159
dst_stride = src_stride = 1;
161
/* size of the first piece: nb for GET or when noswap is set; otherwise the
 * remainder chunk%nb (or nb when chunk is a multiple of nb) */
if(op == GET || noswap == 1) b =nb;
162
else{ b = chunk%nb; if(b==0)b=nb; } /* put smallest piece first */
164
/* the for header has no increment expression; sn must be advanced inside
 * the body (that statement is not visible in this listing) */
for(sn = 0; sn < chunk; ){
165
src = (char*)src_ptr + src_stride* sn;
166
dst = (char*)dst_ptr + dst_stride* sn;
167
count[fit_level] = ARMCI_MIN(b, chunk-sn); /*modify count for this level*/
169
/* flag the final piece only when the caller marked this call as last */
if(h) h->last = (last && ((sn+b)>=chunk))? 1: 0 ;
170
if(nb_handle)call_count++;
171
/* only levels 0..fit_level are passed down for this piece */
rc = OP_STRIDED( op, scale, proc, src, src_stride_arr,
172
dst,dst_stride_arr,count,fit_level,h,flag,nb_handle);
178
count[fit_level] = chunk; /* restore original count */
182
/* one recursive call per element of the outermost level, each with one
 * stride level fewer */
for(sn = 0; sn < count[stride_levels]; sn++){
184
src = (char*)src_ptr + src_stride_arr[stride_levels -1]* sn;
185
dst = (char*)dst_ptr + dst_stride_arr[stride_levels -1]* sn;
187
/* pass last=1 down only for the final element of a last-marked request */
if(last && (sn == count[stride_levels]-1)) looplast =1;
188
rc = armci_pack_strided(op, scale, proc, src, src_stride_arr,
189
dst, dst_stride_arr, count, stride_levels -1,
190
h,fit_level, nb, looplast,nb_handle);
194
/* tag the handle NB_MULTI when chunks were issued with it */
if(nb_handle && call_count )
195
nb_handle->bufid=NB_MULTI;
199
/*\ decompose strided data into chunks and call func on each chunk
201
/* armci_dispatch_strided: invokes fun(ptr, stride_arr, count, levels, arg)
 * on the patch described by ptr/stride_arr/count. Depending on the
 * decomposition from armci_fit_buffer, fun is called once for the whole
 * patch, once per piece of at most nb elements of count[fit_level], or the
 * routine recurses with strides-1 for each element of the outermost level.
 * count[fit_level] is modified temporarily and restored afterwards.
 * NOTE(review): some statements (braces, declarations, conditions) are not
 * visible in this listing. */
void armci_dispatch_strided(void *ptr, int stride_arr[], int count[],
202
int strides, int fit_level, int nb, int bufsize,
203
void (*fun)(void*,int*,int*,int,void*), void *arg)
208
/* determine decomposition of the patch to fit in the buffer */
211
armci_fit_buffer(count, strides, &fit_level, &nb, bufsize);
215
if(fit_level == strides){
217
/* we can fit subpatch into the buffer */
218
int chunk = count[fit_level];
221
/* NOTE(review): the guard tests PIPE_MEDIUM_BUFSIZE_ (trailing underscore)
 * while the guarded code uses PIPE_MEDIUM_BUFSIZE -- confirm whether the
 * mismatch is intentional */
# ifdef PIPE_MEDIUM_BUFSIZE_
222
/* for first call we adjust nb for performance in medium request */
223
if(first_call && strides==0)
224
if(chunk<2*bufsize && chunk>PIPE_MEDIUM_BUFSIZE)
225
nb = PIPE_MEDIUM_BUFSIZE;
228
if(nb == chunk){ /* take shortcut when whole patch fits in the buffer */
229
fun(ptr, stride_arr, count, strides, arg);
233
/* NOTE(review): the condition guarding this assignment is not visible;
 * index fit_level-1 is only valid for fit_level >= 1 */
stride_upd = stride_arr[fit_level -1];
237
/* walk count[fit_level] in steps of nb; the last piece may be shorter */
for(sn = 0; sn < chunk; sn += nb){
239
ptr_upd = (char*)ptr + stride_upd* sn;
240
count[fit_level] = ARMCI_MIN(nb, chunk-sn); /*modify count for this level*/
241
fun(ptr_upd, stride_arr, count, fit_level, arg);
243
count[fit_level] = chunk; /* restore original count */
245
/* recurse once per element of the outermost level, one level lower */
}else for(sn = 0; sn < count[strides]; sn++){
246
ptr_upd = (char*)ptr + stride_arr[strides -1]* sn;
247
armci_dispatch_strided(ptr_upd, stride_arr, count, strides -1,
248
fit_level, nb, bufsize, fun, arg);
252
/* how much space is needed to move data + reduced descriptor ? */
253
/* armci_vector_bytes: for each of the len sets in darr, adds
 * ptr_array_len * (bytes + sizeof(void*)) plus 2*sizeof(int) to a running
 * total.
 * NOTE(review): the declaration of the accumulator and the return statement
 * are not visible in this listing. */
int armci_vector_bytes( armci_giov_t darr[], int len)
256
for(i=0; i<len; i++){
257
/* # elements * (elem size + dst address ) */
258
bytes += darr[i].ptr_array_len * (darr[i].bytes + sizeof(void*));
259
bytes += 2*sizeof(int); /* ptr_array_len + bytes */
265
/* NOTE(review): no use of BUFSIZE10 is visible in this listing */
#define BUFSIZE10 26000
266
/* capacity against which the vector-descriptor splitting below measures the
 * accumulated set sizes; currently the same as BUFSIZE */
#define BUFSIZE1 BUFSIZE
268
/* armci_split_dscr_array: examines sets of darr, computing for each the size
 * ptr_array_len * (bytes + sizeof(void*)) + 2*sizeof(int), and compares the
 * accumulated size with BUFSIZE1. When a set would exceed BUFSIZE1, split is
 * the number of its elements that still fit. The visible tail of the routine
 * shortens darr[s] to split elements, advances extra past those elements,
 * and copies the original darr[s] into save if save is empty.
 * extra->src_ptr_array is set to NULL on entry.
 * NOTE(review): the loop header, the updates of bytes and *nlen, and the
 * conditions around the final split are not visible in this listing. */
void armci_split_dscr_array( armci_giov_t darr[], int len,
269
armci_giov_t* extra, int *nlen, armci_giov_t* save)
272
int bytes=0, split=0;
274
extra->src_ptr_array=NULL; /* start with "no remainder" recorded in extra */
275
/* go through the sets looking for set to be split */
279
csize = darr[s].ptr_array_len * (darr[s].bytes + sizeof(void*));
280
csize += 2*sizeof(int); /* ptr_array_len + bytes */
282
if(csize + bytes >((int)BUFSIZE1)){
284
/* elements of set s that fit in the space left after its 2-int header.
 * NOTE(review): sizeof makes this unsigned arithmetic, so a value of bytes
 * within 2*sizeof(int) of BUFSIZE1 would wrap; presumably prevented by the
 * 64-byte cutoff below -- confirm */
split =(BUFSIZE1 -bytes-2*sizeof(int))/(darr[s].bytes +sizeof(void*));
285
if(split == 0) s--; /* no room available - do not split */
290
if(BUFSIZE1 -bytes < 64) break; /* stop here if almost full */
293
if(s==len)s--; /* adjust loop counter should be < number of sets */
298
/* save the value to be overwritten only if "save" is not filled */
299
if(!save->src_ptr_array)*save= darr[s];
301
/* split the set: reduce # of elems, "extra" keeps info for rest of set*/
303
darr[s].ptr_array_len = split;
304
/* NOTE(review): extra is presumably assigned from darr[s] on a line not
 * visible here, since its src_ptr_array was set to NULL on entry */
extra->ptr_array_len -= split;
305
extra->src_ptr_array = &extra->src_ptr_array[split];
306
extra->dst_ptr_array = &extra->dst_ptr_array[split];
312
/* armcip_init_giov_t: clears a descriptor by setting both pointer arrays to
 * NULL and the element count to 0.
 * NOTE(review): any further field resets after ptr_array_len are not visible
 * in this listing. */
static inline void armcip_init_giov_t(armci_giov_t *thing)
314
thing->src_ptr_array=NULL;
315
thing->dst_ptr_array=NULL;
316
thing->ptr_array_len=0;
321
/* armci_pack_vector: processes the vector descriptor array in portions.
 * armci_split_dscr_array selects the sets (nlen of them, starting at ndarr)
 * to process now; they are passed to armci_rem_vector when REMOTE_OP is
 * defined, otherwise to armci_acc_vector or armci_copy_vector. If a set was
 * split, its slot is overwritten with the remainder (extra) so it is
 * processed again.
 * NOTE(review): the enclosing loop, the initialisation of ndarr, the
 * #else/#endif lines and the restore logic that uses save are not visible in
 * this listing. */
int armci_pack_vector(int op, void *scale, armci_giov_t darr[],int len,
322
int proc,armci_ihdl_t nb_handle)
324
armci_giov_t extra; /* keeps data remainder of set to be processed in chunks */
325
armci_giov_t save; /* keeps original value of set to be processed in chunks */
326
armci_giov_t *ndarr; /* points to first array element to be processed now */
327
int rc=0, nlen, count=0;
329
armcip_init_giov_t(&extra);
330
armcip_init_giov_t(&save);
333
/* armcip_init_giov_t already set this field to NULL; the assignment is
 * repeated here */
save.src_ptr_array=NULL; /* indicates that save slot is empty */
336
armci_split_dscr_array(ndarr, len, &extra, &nlen, &save);
337
# if defined(REMOTE_OP)
338
rc = armci_rem_vector(op, scale, ndarr,nlen,proc,0,nb_handle);
340
if(ARMCI_ACC(op))rc=armci_acc_vector(op,scale,ndarr,nlen,proc);
341
else rc = armci_copy_vector(op,ndarr,nlen,proc);
345
/* non-NULL pointer indicates that set was split */
346
if(extra.src_ptr_array){
349
/* NOTE(review): any NULL check on nb_handle before this dereference is not
 * visible in this listing -- confirm */
nb_handle->bufid = NB_MULTI; /*can be set multiple times here; but not reset here*/
352
ndarr[nlen-1]=extra; /* set the pointer to remainder of last set */
353
nlen--; /* since last set not done in full need to process it again */
357
if(save.src_ptr_array){
359
save.src_ptr_array=NULL; /* indicates that save slot is empty */
363
/* NOTE(review): the condition leading to this fatal error is not visible */
armci_die("vector packetization problem:buffer too small",BUFSIZE1);