5
/* $Id: pack.c,v 1.36.10.1 2006-12-14 13:24:37 manoj Exp $ */
9
#if !defined(ACC_COPY) &&!defined(CRAY_YMP) &&!defined(CYGNUS)&&!defined(CYGWIN) &&!defined(BGML)
13
#if defined(REMOTE_OP)
14
# define OP_STRIDED armci_rem_strided
16
# define OP_STRIDED(_a,_b,_c,_d,_e,_f,_g,_h,_i,_delete1,_j,_hdl)\
17
armci_op_strided(_a,_b,_c,_d,_e,_f,_g,_h,_i,_j,_hdl)
21
/*\ determine if patch fits in the ARMCI buffer, and if not
22
* at which stride level (patch dim) it needs to be decomposed
23
* *fit_level is the value of stride level to perform packing at
24
* *nb is the number of elements of count[*fit_level] that fit in the buffer
\*/
26
static void armci_fit_buffer(int count[], int stride_levels, int* fit_level,
29
int bytes=1, sbytes=1;
32
/* find out at which stride level BUFFER becomes too small */
33
for(level=0; level<= stride_levels; level++){
34
sbytes = bytes; /* store #bytes at current level to save div cost later */
35
bytes *= count[level];
36
if(bufsize < bytes) break;
39
/* buffer big enough for entire patch */
41
*fit_level = stride_levels;
42
*nb = count[stride_levels];
46
/* buffer too small */
49
/* smaller than a single column */
53
case -1: /* one column fits */
58
/* it could keep nb instances of (level-1)-dimensional patch */
65
/*\ The function decomposes a multi-dimensional patch so that it fits in the
66
* internal ARMCI buffer.
67
* It works by recursively reducing patch dimension until some portion of the
68
* subpatch fits in the buffer.
69
* The recursive process is controlled by "fit_level" and "nb" arguments,
70
* which have to be set to -1 at the top-level of the recursion tree.
72
* Argument last and variable looplast are used to indicate to sending/packing
73
* routine that we are dealing with the last portion of the request.
74
* Due to the recursive nature of the packing code, the algorithm is as follows:
75
* if last=1 then the internal for loop passes 1 for the last chunk
\*/
79
int armci_pack_strided(int op, void* scale, int proc,
80
void *src_ptr, int src_stride_arr[],
81
void* dst_ptr, int dst_stride_arr[],
82
int count[], int stride_levels, ext_header_t *h,
83
int fit_level, int nb, int last,armci_ihdl_t nb_handle)
85
int rc=0, sn, bufsize=BUFSIZE,noswap=0;
93
static int call_count;
95
#ifdef STRIDED_GET_BUFLEN
96
if(op==GET) bufsize=STRIDED_GET_BUFLEN;
99
if(stride_levels || ARMCI_ACC(op)) bufsize=MSG_BUFLEN_SMALL-PAGE_SIZE;
103
#if (defined(GM_) || defined(VIA_) || defined(VAPI_))
104
/* we can't assume that the entire available buffer will be used for data,
105
the fact that the header and descriptor also go in the same buffer should be
106
considered while packing. */
108
bufsize-=(sizeof(request_header_t)+(MAX_STRIDE_LEVEL+4)*sizeof(int)+2*sizeof(void *));
109
# if defined(PIPE_BUFSIZE) && defined(MAX_PIPELINE_CHUNKS)
110
bufsize-=8*MAX_PIPELINE_CHUNKS;
114
#ifdef BALANCE_FACTOR
115
/* Added the following for balancing buffers */
118
for(i=0; i<= stride_levels; i++)
120
if(bytes > bufsize && bytes/bufsize < 3 && bytes%bufsize < BALANCE_BUFSIZE){
121
/* bytes div bufsize - 1 is to increase the balance factor for the 3-buffer case */
122
bufsize = bytes/ (bytes/bufsize - 1 + BALANCE_FACTOR);
123
noswap = 1; /*** yuck: if set to 1, error in buffers.c ***/
130
/* determine decomposition of the patch to fit in the buffer */
132
armci_fit_buffer(count, stride_levels, &fit_level, &nb, bufsize);
136
// printf("%s [cp]: pack_strided: flag=%d, bufsize=%ld; fit_level=%d; stride_level=%d; nb=%d\n",Portals_ID(),flag,(long) bufsize,fit_level,stride_levels,nb);
138
if(fit_level == stride_levels){
140
/* we can fit subpatch into the buffer */
141
int chunk = count[fit_level];
142
int dst_stride, src_stride;
144
if(nb == chunk){ /* take shortcut when whole patch fits in the buffer */
145
if(h) h->last = last?1:0;
146
if(nb_handle && call_count ){
147
nb_handle->bufid=NB_MULTI;
150
return(OP_STRIDED(op, scale, proc, src_ptr, src_stride_arr,
151
dst_ptr,dst_stride_arr,count,stride_levels,h,flag,nb_handle));
155
dst_stride = dst_stride_arr[fit_level -1];
156
src_stride = src_stride_arr[fit_level -1];
158
dst_stride = src_stride = 1;
160
if(op == GET || noswap == 1) b =nb;
161
else{ b = chunk%nb; if(b==0)b=nb; } /* put smallest piece first */
163
for(sn = 0; sn < chunk; ){
164
src = (char*)src_ptr + src_stride* sn;
165
dst = (char*)dst_ptr + dst_stride* sn;
166
count[fit_level] = ARMCI_MIN(b, chunk-sn); /*modify count for this level*/
168
if(h) h->last = (last && ((sn+b)>=chunk))? 1: 0 ;
169
if(nb_handle)call_count++;
170
rc = OP_STRIDED( op, scale, proc, src, src_stride_arr,
171
dst,dst_stride_arr,count,fit_level,h,flag,nb_handle);
177
count[fit_level] = chunk; /* restore original count */
181
for(sn = 0; sn < count[stride_levels]; sn++){
183
src = (char*)src_ptr + src_stride_arr[stride_levels -1]* sn;
184
dst = (char*)dst_ptr + dst_stride_arr[stride_levels -1]* sn;
186
if(last && (sn == count[stride_levels]-1)) looplast =1;
187
rc = armci_pack_strided(op, scale, proc, src, src_stride_arr,
188
dst, dst_stride_arr, count, stride_levels -1,
189
h,fit_level, nb, looplast,nb_handle);
193
if(nb_handle && call_count )
194
nb_handle->bufid=NB_MULTI;
198
/*\ decompose strided data into chunks and call fun on each chunk
\*/
200
void armci_dispatch_strided(void *ptr, int stride_arr[], int count[],
201
int strides, int fit_level, int nb, int bufsize,
202
void (*fun)(void*,int*,int*,int,void*), void *arg)
207
/* determine decomposition of the patch to fit in the buffer */
210
armci_fit_buffer(count, strides, &fit_level, &nb, bufsize);
214
if(fit_level == strides){
216
/* we can fit subpatch into the buffer */
217
int chunk = count[fit_level];
220
# ifdef PIPE_MEDIUM_BUFSIZE_
221
/* for first call we adjust nb for performance in medium request */
222
if(first_call && strides==0)
223
if(chunk<2*bufsize && chunk>PIPE_MEDIUM_BUFSIZE)
224
nb = PIPE_MEDIUM_BUFSIZE;
227
if(nb == chunk){ /* take shortcut when whole patch fits in the buffer */
228
fun(ptr, stride_arr, count, strides, arg);
232
stride_upd = stride_arr[fit_level -1];
236
for(sn = 0; sn < chunk; sn += nb){
238
ptr_upd = (char*)ptr + stride_upd* sn;
239
count[fit_level] = ARMCI_MIN(nb, chunk-sn); /*modify count for this level*/
240
fun(ptr_upd, stride_arr, count, fit_level, arg);
242
count[fit_level] = chunk; /* restore original count */
244
}else for(sn = 0; sn < count[strides]; sn++){
245
ptr_upd = (char*)ptr + stride_arr[strides -1]* sn;
246
armci_dispatch_strided(ptr_upd, stride_arr, count, strides -1,
247
fit_level, nb, bufsize, fun, arg);
251
/* how much space is needed to move data + reduced descriptor ? */
252
int armci_vector_bytes( armci_giov_t darr[], int len)
255
for(i=0; i<len; i++){
256
/* # elements * (elem size + dst address ) */
257
bytes += darr[i].ptr_array_len * (darr[i].bytes + sizeof(void*));
258
bytes += 2*sizeof(int); /* ptr_array_len + bytes */
264
#define BUFSIZE10 26000
265
#define BUFSIZE1 BUFSIZE
267
void armci_split_dscr_array( armci_giov_t darr[], int len,
268
armci_giov_t* extra, int *nlen, armci_giov_t* save)
271
int bytes=0, split=0;
273
extra->src_ptr_array=NULL;
274
/* go through the sets looking for set to be split */
278
csize = darr[s].ptr_array_len * (darr[s].bytes + sizeof(void*));
279
csize += 2*sizeof(int); /* ptr_array_len + bytes */
281
if(csize + bytes >BUFSIZE1){
283
split =(BUFSIZE1 -bytes-2*sizeof(int))/(darr[s].bytes +sizeof(void*));
284
if(split == 0) s--; /* no room available - do not split */
289
if(BUFSIZE1 -bytes < 64) break; /* stop here if almost full */
292
if(s==len)s--; /* adjust loop counter should be < number of sets */
297
/* save the value to be overwritten only if "save" is not filled */
298
if(!save->src_ptr_array)*save= darr[s];
300
/* split the set: reduce # of elems, "extra" keeps info for rest of set*/
302
darr[s].ptr_array_len = split;
303
extra->ptr_array_len -= split;
304
extra->src_ptr_array = &extra->src_ptr_array[split];
305
extra->dst_ptr_array = &extra->dst_ptr_array[split];
311
int armci_pack_vector(int op, void *scale, armci_giov_t darr[],int len,
312
int proc,armci_ihdl_t nb_handle)
314
armci_giov_t extra; /* keeps data remainder of set to be processed in chunks */
315
armci_giov_t save; /* keeps original value of set to be processed in chunks */
316
armci_giov_t *ndarr; /* points to first array element to be processed now */
317
int rc=0, nlen, count=0;
321
save.src_ptr_array=NULL; /* indicates that save slot is empty */
324
armci_split_dscr_array(ndarr, len, &extra, &nlen, &save);
325
# if defined(REMOTE_OP)
326
rc = armci_rem_vector(op, scale, ndarr,nlen,proc,0,nb_handle);
328
if(ARMCI_ACC(op))rc=armci_acc_vector(op,scale,ndarr,nlen,proc);
329
else rc = armci_copy_vector(op,ndarr,nlen,proc);
333
/* non-NULL pointer indicates that set was split */
334
if(extra.src_ptr_array){
337
nb_handle->bufid = NB_MULTI; /*can be set multiple times here; but not reset here*/
340
ndarr[nlen-1]=extra; /* set the pointer to remainder of last set */
341
nlen--; /* since last set not done in full need to process it again */
345
if(save.src_ptr_array){
347
save.src_ptr_array=NULL; /* indicates that save slot is empty */
351
armci_die("vector packetization problem:buffer too small",BUFSIZE1);