/*
 * Copyright (C) 1999-2001 The Regents of the University of California
 * (through E.O. Lawrence Berkeley National Laboratory), subject to
 * approval by the U.S. Department of Energy.
 *
 * Use of this software is under license. The license agreement is included
 * in the file MVICH_LICENSE.TXT.
 *
 * Developed at Berkeley Lab as part of MVICH.
 *
 * Authors: Bill Saphir <wcsaphir@lbl.gov>
 *          Michael Welcome <mlwelcome@lbl.gov>
 */

/* Copyright (c) 2002-2008, The Ohio State University. All rights
 * reserved.
 *
 * This file is part of the MVAPICH software package developed by the
 * team members of The Ohio State University's Network-Based Computing
 * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * For detailed copyright and licensing information, please refer to the
 * copyright file COPYRIGHT_MVAPICH in the top level MPICH directory.
 */
/* Request X/Open 6 (SUSv3/POSIX.1-2001) interfaces, e.g. posix_memalign
 * and pthread spinlocks used below. Must precede any system #include. */
#define _XOPEN_SOURCE 600
/*
 * cbufs provide system buffers for VMPICH. They are analogous to mbufs.
 *
 * The primary motivation for cbufs is that implementing MPI on VIA
 * seems to require pre-posting a number of fixed-sized buffers.
 * These buffers must be registered (pinned). Life is easier if
 * they are all registered at once so there is only one memory
 * handle. We manage a fixed-size pool of cbufs that are
 * allocated and pinned when a program starts up. We manage
 * the free cbuf list as a singly linked list.
 *
 * Two different ways to manage the free list as a singly-linked list:
 * 1. head and tail pointers. Add to tail, remove from head.
 * 2. only head pointer, treat as a stack.
 *
 * #1 Eliminates contention between adding to list and removing from list.
 * #2 Has slightly less overhead when there is no contention, and is more
 *    likely to produce a cbuf that is already in cache.
 *
 * Currently anticipate that most access near-term will be single-threaded,
 * so go with head only. (#2)
 */
60
/* head of list of allocated cbuf regions */
61
static cbuf_region *cbuf_region_head = NULL;
64
* free_cbuf_head is the head of the free list
67
static cbuf *free_cbuf_head = NULL;
69
static int cbuf_n_allocated = 0;
70
static long num_free_cbuf = 0;
71
static long num_cbuf_get = 0;
72
static long num_cbuf_free = 0;
74
static pthread_spinlock_t cbuf_lock;
75
int viadev_cbuf_max = -1;
76
int viadev_cbuf_total_size = (2 * 1024);
77
int viadev_cbuf_secondary_pool_size = 128;
81
pthread_spin_init(&cbuf_lock, 0);
84
static void lock_cbuf()
86
pthread_spin_lock(&cbuf_lock);
90
static void unlock_cbuf()
92
pthread_spin_unlock(&cbuf_lock);
97
void dump_cbuf_region(cbuf_region * r)
101
void dump_cbuf_regions()
103
cbuf_region *r = cbuf_region_head;
110
void deallocate_cbufs()
112
cbuf_region *r = cbuf_region_head;
117
if (r->mem_handle != NULL) {
118
/* free cbufs add it later */
126
static void allocate_cbuf_region(int ncbufs)
128
struct cbuf_region *reg;
130
void *cbuf_dma_buffer;
134
int alignment_cbuf = 64;
137
alignment_dma = getpagesize();
139
if (free_cbuf_head != NULL) {
145
/* are we limiting cbuf allocation? If so, make sure
146
* we dont alloc more than allowed
149
reg = (struct cbuf_region *) malloc(sizeof(struct cbuf_region));
153
if(posix_memalign((void **) &mem, alignment_cbuf, ncbufs * sizeof(cbuf))) {
156
/* ALLOCATE THE DMA BUFFER */
158
if(posix_memalign((void **) &cbuf_dma_buffer, alignment_dma,
159
ncbufs * viadev_cbuf_total_size)) {
162
memset(mem, 0, ncbufs * sizeof(cbuf));
163
memset(cbuf_dma_buffer, 0, ncbufs * viadev_cbuf_total_size);
165
cbuf_n_allocated += ncbufs;
166
num_free_cbuf += ncbufs;
167
reg->malloc_start = mem;
169
reg->malloc_buf_start = cbuf_dma_buffer;
170
reg->malloc_end = (void *) ((char *) mem + ncbufs * sizeof(cbuf));
171
reg->malloc_buf_end = (void *) ((char *) cbuf_dma_buffer +
172
ncbufs * viadev_cbuf_total_size);
176
free_cbuf_head = (cbuf *) ((aint_t) mem);
178
reg->cbuf_head = free_cbuf_head;
181
reg->mem_handle = armci_register_memory(cbuf_dma_buffer,
182
ncbufs * viadev_cbuf_total_size);
184
if (reg->mem_handle == NULL) {
187
/* init the free list */
188
for (i = 0; i < ncbufs - 1; i++) {
189
cur = free_cbuf_head + i;
191
cur->desc.next = free_cbuf_head + i + 1;
194
#ifdef ADAPTIVE_RDMA_FAST_PATH
196
cur->buffer = (unsigned char *) ((char *)(cbuf_dma_buffer) +
197
(i * viadev_cbuf_total_size));
200
/* last one needs to be set to NULL */
201
cur = free_cbuf_head + ncbufs - 1;
203
cur->desc.next = NULL;
207
#ifdef ADAPTIVE_RDMA_FAST_PATH
209
cur->buffer = (unsigned char *) ((char *)cbuf_dma_buffer +
210
((ncbufs - 1) * viadev_cbuf_total_size));
214
/* thread region list */
215
reg->next = cbuf_region_head;
216
cbuf_region_head = reg;
/* Allocate the initial cbuf pool of ncbufs entries. */
void allocate_cbufs(int ncbufs)
{
    /* this function is only called by the init routines.
     * cache the nic handle and ptag for later cbuf_region allocations
     */

    /* now allocate the first cbuf region */
    allocate_cbuf_region(ncbufs);
}
230
* Get a cbuf off the free list
240
* It will often be possible for higher layers to recover
241
* when no cbuf is available, but waiting for more descriptors
242
* to complete. For now, just abort.
244
if (NULL == free_cbuf_head) {
245
allocate_cbuf_region(viadev_cbuf_secondary_pool_size);
246
if (NULL == free_cbuf_head) {
253
/* this correctly handles removing from single entry free list */
254
free_cbuf_head = free_cbuf_head->desc.next;
255
#ifdef ADAPTIVE_RDMA_FAST_PATH
256
/* need to change this to RPUT_CBUF_FLAG or RGET_CBUF_FLAG later
257
* if we are doing rput */
258
v->padding = NORMAL_CBUF_FLAG;
261
/* this is probably not the right place to initialize shandle to NULL.
262
* Do it here for now because it will make sure it is always initialized.
263
* Otherwise we would need to very carefully add the initialization in
264
* a dozen other places, and probably miss one.
271
v->grank = -1; /* Make sure it is not inadvertantly used anywhere */
279
* Put a cbuf back on the free list
282
void release_cbuf(cbuf * v)
287
/* note this correctly handles appending to empty free list */
290
assert(v != free_cbuf_head);
292
v->desc.next = free_cbuf_head;
294
#ifdef ADAPTIVE_RDMA_FAST_PATH
307
* fill in cbuf descriptor with all necessary info
312
void cbuf_init_send(cbuf * v, unsigned long len)
314
v->desc.u.sr.next = NULL;
315
v->desc.u.sr.send_flags = IBV_SEND_SIGNALED;
316
v->desc.u.sr.opcode = IBV_WR_SEND;
317
v->desc.u.sr.wr_id = (aint_t) v;
318
v->desc.u.sr.num_sge = 1;
319
v->desc.u.sr.sg_list = &(v->desc.sg_entry);
321
v->desc.sg_entry.addr = (uintptr_t) v->buffer;
322
v->desc.sg_entry.length = len;
323
v->desc.sg_entry.lkey = v->region->mem_handle->lkey;
326
void cbuf_init_recv(cbuf * v, unsigned long len)
328
v->desc.u.rr.next = NULL;
329
v->desc.u.rr.wr_id = (aint_t) v;
330
v->desc.u.rr.num_sge = 1;
331
v->desc.u.rr.sg_list = &(v->desc.sg_entry);
333
v->desc.sg_entry.addr = (uintptr_t) v->buffer;
334
v->desc.sg_entry.length = len;
335
v->desc.sg_entry.lkey = v->region->mem_handle->lkey;
337
#ifdef ADAPTIVE_RDMA_FAST_PATH
338
v->padding = NORMAL_CBUF_FLAG;
341
void cbuf_init_sendrecv(cbuf * v, unsigned long len)
345
void cbuf_init_rput(cbuf * v, void *local_address,
346
uint32_t lkey, void *remote_address,
347
uint32_t rkey, int len)
349
v->desc.u.sr.next = NULL;
350
v->desc.u.sr.send_flags = IBV_SEND_SIGNALED;
351
v->desc.u.sr.opcode = IBV_WR_RDMA_WRITE;
352
v->desc.u.sr.wr_id = (aint_t) v;
354
v->desc.u.sr.num_sge = 1;
355
v->desc.u.sr.sg_list = &(v->desc.sg_entry);
357
v->desc.sg_entry.length = len;
358
v->desc.sg_entry.lkey = lkey;
359
v->desc.sg_entry.addr = (uintptr_t) local_address;
361
v->desc.u.sr.wr.rdma.remote_addr = (uintptr_t) remote_address;
362
v->desc.u.sr.wr.rdma.rkey = rkey;
364
#ifdef ADAPTIVE_RDMA_FAST_PATH
365
v->padding = RPUT_CBUF_FLAG;
372
void cbuf_init_rget(cbuf * v,
375
void *remote_address,
376
uint32_t rkey, int len)
378
v->desc.u.sr.next = NULL;
379
v->desc.u.sr.send_flags = IBV_SEND_SIGNALED;
380
v->desc.u.sr.opcode = IBV_WR_RDMA_READ;
381
v->desc.u.sr.wr_id = (aint_t) v;
383
v->desc.u.sr.num_sge = 1;
384
v->desc.u.sr.sg_list = &(v->desc.sg_entry);
386
v->desc.sg_entry.length = len;
387
v->desc.sg_entry.lkey = lkey;
388
v->desc.sg_entry.addr = (uintptr_t) local_address;
390
v->desc.u.sr.wr.rdma.remote_addr = (uintptr_t) remote_address;
391
v->desc.u.sr.wr.rdma.rkey = rkey;
393
#ifdef ADAPTIVE_RDMA_FAST_PATH
394
v->padding = RGET_CBUF_FLAG;
400
* print out cbuf contents for debugging
403
void dump_cbuf(char *msg, cbuf * v)
407
#ifdef ADAPTIVE_RDMA_FAST_PATH