1
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
3
* (C) 2012 NEC Corporation
4
* Author: Masamichi Takagi
5
* (C) 2012 Oct 14 Yutaka Ishikawa, ishikawa@is.s.u-tokyo.ac.jp
6
* See COPYRIGHT in top-level directory.
14
#include <sys/types.h>
15
#include "mpid_nem_impl.h"
21
*** diff -p verbs.h dcfa.h (structures)
22
same name, same fields
23
struct ibv_device { };
24
struct ibv_context { };
26
struct ibv_ah_attr { };
28
same name, different fields
29
struct ibv_qp_init_attr {
31
- struct ibv_xrc_domain *xrc_domain;
42
+ int flag; 1: offload
50
+ struct mlx4_buf buf;
51
+ int max_inline_data;
54
+ uint32_t doorbell_qpn;
55
+ uint32_t sq_signal_bits;
59
+ uint32_t *db; // doorbell addr for post recv
61
+ ibmic_qp_conn_info_t remote_qp_info;
66
- struct ibv_context *context;
68
- uint32_t events_completed;
69
- struct ibv_xrc_domain *xrc_domain;
70
- pthread_mutex_t mutex;
71
- pthread_cond_t cond;
75
- struct ibv_comp_channel *channel;
78
- uint32_t comp_events_completed;
79
- uint32_t async_events_completed;
81
- pthread_mutex_t mutex;
82
- pthread_cond_t cond;
84
+ struct mlx4_buf buf;
85
+ uint32_t cons_index;
86
+ uint32_t wait_index;
87
+ uint32_t *set_ci_db;
96
- uint16_t pkey_index;
99
- uint8_t dlid_path_bits;
103
- struct ibv_sge *sg_list;
104
+ struct ibv_sge sg_list[WR_SG_NUM];
111
- struct ibv_sge *sg_list;
112
+ struct ibv_sge sg_list[WR_SG_NUM];
116
+ uint64_t mic_addr; // buffer address on mic
120
- struct ibv_port_attr { };
123
*** diff -p verbs.h dcfa.h (functions)
125
same name, same arguments
140
same name, different arguments
141
- int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr)
142
+ int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr);
144
- int ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr)
145
+ int ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr);
147
- struct ibv_cq *ibv_create_cq(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
148
+ struct ibv_cq *ibv_create_cq(struct ibv_context *context, int cqe_max);
151
- ibv_get_device_name
155
struct ibv_ah *ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr);
159
/* Original Infiniband */
160
#include <infiniband/verbs.h>
163
static inline unsigned long long MPID_nem_ib_rdtsc_cpuid(void)
166
__asm__ __volatile__(// serialize
167
"xorl %%eax,%%eax \n cpuid":::"%rax", "%rbx", "%rcx", "%rdx");
168
__asm__ __volatile__("rdtsc":"=a"(lo), "=d"(hi));
169
return (unsigned long long) hi << 32 | lo;
172
extern struct ibv_cq *MPID_nem_ib_rc_shared_scq;
173
extern struct ibv_cq *MPID_nem_ib_rc_shared_scq_lmt_put;
174
extern struct ibv_cq *MPID_nem_ib_rc_shared_scq_scratch_pad;
175
extern struct ibv_cq *MPID_nem_ib_ud_shared_rcq;
177
#define MPID_NEM_IB_COM_SIZE 2048 /* one process uses 2-4 fds */
178
#define MPID_NEM_IB_COM_INLINE_DATA (512-64) /* experimentally observed max is 884 */ /* this is a requested lower bound: the value actually set is at least this and grows as this value grows, so check the actual value */
180
#define MPID_NEM_IB_COM_MAX_SQ_CAPACITY (256/1)
181
#define MPID_NEM_IB_COM_MAX_RQ_CAPACITY ((MPID_NEM_IB_COM_MAX_SQ_CAPACITY)+16) /* We pre-post_recv MPID_NEM_IB_COM_MAX_SQ_CAPACITY of commands */
182
#define MPID_NEM_IB_COM_MAX_SGE_CAPACITY (32/2) /* maximum for ConnectX-3 looks like 32 */
183
#define MPID_NEM_IB_COM_MAX_CQ_CAPACITY MPID_NEM_IB_COM_MAX_RQ_CAPACITY
184
#define MPID_NEM_IB_COM_MAX_CQ_HEIGHT_DRAIN (((MPID_NEM_IB_COM_MAX_CQ_CAPACITY)>>2)+((MPID_NEM_IB_COM_MAX_CQ_CAPACITY)>>1)) /* drain when reaching this amount */
185
#define MPID_NEM_IB_COM_MAX_SQ_HEIGHT_DRAIN (((MPID_NEM_IB_COM_MAX_SQ_CAPACITY)>>2)+((MPID_NEM_IB_COM_MAX_SQ_CAPACITY)>>1)) /* drain when reaching this amount */
186
#define MPID_NEM_IB_COM_AMT_CQ_DRAIN ((MPID_NEM_IB_COM_MAX_CQ_CAPACITY)>>2) /* drain this amount */
187
#define MPID_NEM_IB_COM_MAX_RD_ATOMIC 4
189
#define MPID_NEM_IB_COM_MAX_TRIES 1
190
#define MPID_NEM_IB_COM_SCQ_FLG 1
191
#define MPID_NEM_IB_COM_RCQ_FLG 2
193
#define MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ 100
194
#define MPID_NEM_IB_COM_INFOKEY_MR_ADDR 200
195
#define MPID_NEM_IB_COM_INFOKEY_MR_LENGTH 201
196
#define MPID_NEM_IB_COM_INFOKEY_MR_RKEY 202
197
#define MPID_NEM_IB_COM_INFOKEY_QP_QPN 300
198
#define MPID_NEM_IB_COM_INFOKEY_PORT_LID 400
199
#define MPID_NEM_IB_COM_INFOKEY_PORT_GID 401
203
#define MPID_NEM_IB_COM_NBUF_RDMA 2 /* number of <addr, sz, lkey, rkey> */
204
#define MPID_NEM_IB_COM_RDMAWR_FROM 0 /* index to RDMA-write-from buffer */
205
#define MPID_NEM_IB_COM_RDMAWR_TO 1 /* index to RDMA-write-to buffer */
206
/* assuming that the unit (32768) equals the eager-RDMA-write threshold
207
assuming that the multiplier (256) is
208
equals to max number of outstanding eager-RDMA-write transactions */
209
#define MPID_NEM_IB_COM_RDMABUF_SZSEG (16384/4) //(16384+8+40+1) /* this size minus magics and headers must be 2^n because data might grow to the next 2^m boundary, see ib_impl.h, ib_com.c, src/mpid/ch3/src/mpid_isend.c */
210
#define MPID_NEM_IB_COM_RDMABUF_SZ ((MPID_NEM_IB_COM_RDMABUF_SZSEG) * 16) /* (32768 * 256) */
211
#define MPID_NEM_IB_COM_RDMABUF_NSEG ((MPID_NEM_IB_COM_RDMABUF_SZ) / (MPID_NEM_IB_COM_RDMABUF_SZSEG))
212
#define MPID_NEM_IB_COM_SMT_INLINE_NCHAIN 8 /* maximum number of chained inline-send commands */
213
#define MPID_NEM_IB_COM_RDMABUF_HIGH_WATER_MARK (((MPID_NEM_IB_COM_RDMABUF_NSEG)>>1)+((MPID_NEM_IB_COM_RDMABUF_NSEG)>>2))
214
#define MPID_NEM_IB_COM_RDMABUF_LOW_WATER_MARK (((MPID_NEM_IB_COM_RDMABUF_NSEG)>>2))
215
#define MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_HW 1
216
#define MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_STATE_LW 2
217
#define MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_RATE_HW /*1*/(((MPID_NEM_IB_COM_RDMABUF_NSEG)>>4) == 0 ? 1 : ((MPID_NEM_IB_COM_RDMABUF_NSEG)>>4))
218
#define MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_RATE_LW (((MPID_NEM_IB_COM_RDMABUF_NSEG)>>2)) /*12*/ /* receiver tries to notify sender the number of releases when receiver find not-noticed releases of more than this number */
219
/* Evaluates to 1.5 x notify_rate (integer arithmetic). The argument is
 * parenthesized so that expressions with low-precedence operators expand
 * correctly; note that it is still evaluated twice. */
#define MPID_NEM_IB_COM_RDMABUF_OCCUPANCY_NOTIFY_RATE_DELAY_MULTIPLIER(notify_rate) ((notify_rate) + ((notify_rate) >> 1)) /* (notify_rate) */ /* send seq_num to the sender side if there is no chance to embed seq_num into a packet bound for the sender side for this number of release events */
221
#define MPID_NEM_IB_COM_NBUF_UD 2 /* number of <addr, sz, lkey, rkey> */
222
#define MPID_NEM_IB_COM_UDWR_FROM 0 /* index to UD-write-from buffer */
223
#define MPID_NEM_IB_COM_UDWR_TO 1 /* index to UD-write-to buffer */
224
#define MPID_NEM_IB_COM_UDBUF_SZ (128 * 8192) /* supporting 100K ranks with 10 rounds */
225
#define MPID_NEM_IB_COM_UDBUF_SZSEG (128)
226
#define MPID_NEM_IB_COM_UDBUF_NSEG (MPID_NEM_IB_COM_UDBUF_SZ / MPID_NEM_IB_COM_UDBUF_SZSEG)
228
#define MPID_NEM_IB_COM_NBUF_SCRATCH_PAD 1 /* number of <addr, sz, lkey, rkey> */
229
#define MPID_NEM_IB_COM_SCRATCH_PAD_TO 0 /* index to RDMA-write-to buffer */
231
/* send command templates */
232
#define MPID_NEM_IB_COM_RC_SR_NTEMPLATE (8+1+2) /* number of request templates: 8 for inline-chained-smt, 1 for smt, 2 for lmt (MPID_NEM_IB_COM_LMT_INITIATOR and MPID_NEM_IB_COM_LMT_PUT) */
233
#define MPID_NEM_IB_COM_SMT_INLINE_CHAINED0 0 /* index to it */
234
#define MPID_NEM_IB_COM_SMT_INLINE_CHAINED7 7
235
#define MPID_NEM_IB_COM_SMT_NOINLINE 8
236
#define MPID_NEM_IB_COM_LMT_INITIATOR 9 /* FIXME: bad naming */
238
#define MPID_NEM_IB_COM_RC_SR_LMT_PUT_NTEMPLATE MPID_NEM_IB_COM_RC_SR_NTEMPLATE /* FIXME: TEMPLATE named MPID_NEM_IB_COM_RC_SR shares MPID_NEM_IB_COM_LMT_PUT */
239
#define MPID_NEM_IB_COM_LMT_PUT 10
241
/* recv command templates */
242
#define MPID_NEM_IB_COM_RC_RR_NTEMPLATE 1 /* 1 for smt, */
243
#define MPID_NEM_IB_COM_RDMAWR_RESPONDER 0 /* index to recv request template */
246
#define MPID_NEM_IB_COM_SMT_INLINE_INITIATOR_NSGE 4 /* MPI header, (sz;magic), data x1, magic */
247
#define MPID_NEM_IB_COM_SMT_NOINLINE_INITIATOR_NSGE 4 /* MPI header, (sz;magic), data x1, magic */
248
#define MPID_NEM_IB_COM_LMT_INITIATOR_NSGE 1 /* data x1 */
249
#define MPID_NEM_IB_COM_LMT_PUT_NSGE 1 /* data x1 */
250
#define MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR_NSGE 1 /* QP state */
252
#define MPID_NEM_IB_COM_UD_SR_NTEMPLATE 1
253
#define MPID_NEM_IB_COM_UD_RR_NTEMPLATE 1
254
#define MPID_NEM_IB_COM_UD_INITIATOR 0 /* index to send request template */
255
#define MPID_NEM_IB_COM_UD_RESPONDER 0 /* index to recv request template */
257
#define MPID_NEM_IB_COM_SCRATCH_PAD_SR_NTEMPLATE 2
258
#define MPID_NEM_IB_COM_SCRATCH_PAD_RR_NTEMPLATE 1
259
#define MPID_NEM_IB_COM_SCRATCH_PAD_INITIATOR 0 /* index to send request template */
260
#define MPID_NEM_IB_COM_SCRATCH_PAD_CAS 1
261
#define MPID_NEM_IB_COM_SCRATCH_PAD_RESPONDER 0 /* index to recv request template */
264
typedef struct MPID_nem_ib_com {
266
short icom_connected;
270
struct ibv_port_attr icom_pattr; /* IB port attributes */
272
struct ibv_qp *icom_qp;
273
struct ibv_cq *icom_scq;
274
struct ibv_cq *icom_rcq;
275
struct ibv_mr **icom_mrlist;
277
union ibv_gid icom_gid;
278
void **icom_mem; /* 0: send 1: recv 2..: rdma */
279
int *icom_msize; /* 0: send 1: recv 2..: rdma */
280
struct ibv_send_wr *icom_sr;
281
struct ibv_ah_attr *icom_ah_attr;
282
struct ibv_recv_wr *icom_rr;
287
int rsr_seq_num_poll;
288
int rsr_seq_num_tail; /* occupation status of remote Send Request (SR) queue (it covers occupation status of local RDMA-wr-to buffer) */
289
int rsr_seq_num_tail_last_sent; /* latest one sent to remote rank */
290
int lsr_seq_num_tail; /* occupation status of local Send Request (SR) queue */
291
int lsr_seq_num_tail_last_requested; /* value when lmt_start_send issued req_seq_num */
292
int rdmabuf_occupancy_notify_rstate, rdmabuf_occupancy_notify_lstate;
293
int ncom, ncom_lmt_put, ncom_scratch_pad; /* number of entries in the command queue */
295
uint32_t max_inline_data; /* actual value obtained after ibv_create_qp */
296
uint32_t max_send_wr;
297
uint32_t max_recv_wr;
299
uint32_t open_flag; /* MPID_NEM_IB_COM_OPEN_UD, ... */
300
uint16_t remote_lid; /* for debug */
302
/* other commands can be executed before RDMA-rd command */
303
/* see the "Ordering and the Fence Indicator" section in "InfiniBand Architecture" by William T. Futral */
304
uint16_t after_rdma_rd;
306
uint64_t rsr_seq_num_released[(MPID_NEM_IB_COM_RDMABUF_NSEG + 63) / 64];
310
extern int MPID_nem_ib_com_open(int ib_port, int MPID_nem_ib_com_open_flag, int *condesc);
311
extern int MPID_nem_ib_com_alloc(int condesc, int sz);
312
extern int MPID_nem_ib_com_close(int);
313
extern int MPID_nem_ib_com_rts(int condesc, int remote_qpnum, uint16_t remote_lid,
314
union ibv_gid *remote_gid);
316
extern int MPID_nem_ib_com_reg_mr_connect(int condesc, void *rmem, int rkey);
317
extern int MPID_nem_ib_com_isend(int condesc, uint64_t wr_id, void *prefix, int sz_prefix,
318
void *hdr, int sz_hdr, void *data, int sz_data, int *copied);
319
extern int MPID_nem_ib_com_isend_chain(int condesc, uint64_t wr_id, void *hdr, int sz_hdr,
320
void *data, int sz_data);
321
extern int MPID_nem_ib_com_put_scratch_pad(int condesc, uint64_t wr_id, uint64_t offset, int sz,
323
//extern int MPID_nem_ib_com_isend(int condesc, uint64_t wr_id, void* hdr, int sz_hdr, void* data, int sz_data);
324
extern int MPID_nem_ib_com_irecv(int condesc, uint64_t wr_id);
325
extern int MPID_nem_ib_com_udsend(int condesc, union ibv_gid *remote_gid, uint16_t remote_lid,
326
uint32_t remote_qpn, uint32_t imm_data, uint64_t wr_id);
327
extern int MPID_nem_ib_com_udrecv(int condesc);
328
extern int MPID_nem_ib_com_lrecv(int condesc, uint64_t wr_id, void *raddr, int sz_data,
329
uint32_t rkey, void *laddr);
330
extern int MPID_nem_ib_com_put_lmt(int condesc, uint64_t wr_id, void *raddr, int sz_data,
331
uint32_t rkey, void *laddr);
332
extern int MPID_nem_ib_com_poll_cq(int which_cq, struct ibv_wc *wc, int *result);
334
extern int MPID_nem_ib_com_obtain_pointer(int condesc, MPID_nem_ib_com_t ** MPID_nem_ib_com);
336
/* for ib_reg_mr.c */
337
extern int MPID_nem_ib_com_reg_mr(void *addr, int len, struct ibv_mr **mr);
338
extern int MPID_nem_ib_com_dereg_mr(struct ibv_mr *mr);
340
extern int MPID_nem_ib_com_get_info_conn(int condesc, int key, void *out, uint32_t out_len);
341
extern int MPID_nem_ib_com_get_info_mr(int condesc, int memid, int key, void *out, int out_len);
343
extern int MPID_nem_ib_com_sseq_num_get(int condesc, int *seq_num);
344
extern int MPID_nem_ib_com_lsr_seq_num_tail_get(int condesc, int **seq_num);
345
extern int MPID_nem_ib_com_rsr_seq_num_tail_get(int condesc, int **seq_num);
346
extern int MPID_nem_ib_com_rsr_seq_num_tail_last_sent_get(int condesc, int **seq_num);
347
extern int MPID_nem_ib_com_rdmabuf_occupancy_notify_rate_get(int condesc, int *notify_rate);
348
extern int MPID_nem_ib_com_rdmabuf_occupancy_notify_rstate_get(int condesc, int **rstate);
349
extern int MPID_nem_ib_com_rdmabuf_occupancy_notify_lstate_get(int condesc, int **lstate);
351
/* Takes an integer error code and returns a string (presumably a
 * human-readable message for an ibcom error code; the definition is not
 * visible here -- TODO confirm).
 * The parameter is deliberately not named `errno`: <errno.h> defines errno as
 * a macro, so a parameter with that name would be macro-expanded and silently
 * change the declared parameter type. */
extern char *MPID_nem_ib_com_strerror(int ibcom_errno);
353
extern int MPID_nem_ib_com_mem_rdmawr_from(int condesc, void **out);
354
extern int MPID_nem_ib_com_mem_rdmawr_to(int condesc, int seq_num, void **out);
355
extern int MPID_nem_ib_com_mem_udwr_from(int condesc, void **out);
356
extern int MPID_nem_ib_com_mem_udwr_to(int condesc, void **out);
359
extern void MPID_nem_ib_com_register_cache_init(void);
360
extern void MPID_nem_ib_com_register_cache_destroy(void);
361
extern struct ibv_mr *MPID_nem_ib_com_reg_mr_fetch(void *addr, int len);
363
extern int MPID_nem_ib_com_udbuf_init(void *q);
365
#define MPID_NEM_IB_COM_RC_SHARED_RCQ 0
366
#define MPID_NEM_IB_COM_RC_SHARED_SCQ 1
367
#define MPID_NEM_IB_COM_UD_SHARED_RCQ 2
368
#define MPID_NEM_IB_COM_UD_SHARED_SCQ 3
369
#define MPID_NEM_IB_COM_RC_SHARED_SCQ_LMT_PUT 4
372
#define MPID_NEM_IB_COM_OPEN_RC 0x01
373
/* for MPI control message, eager send, rendezvous protocol,
374
so via RC-send/recv or RDMA-write/RDMA-read */
376
#define MPID_NEM_IB_COM_OPEN_UD 0x02
377
/* obsolete, to wait for you-to-me QP to become RTR state
378
so via UD-send/recv */
380
#define MPID_NEM_IB_COM_OPEN_RC_LMT_PUT 0x03
381
/* obsolete, tried to use different CQ for LMT-PUT protocol for speed */
383
#define MPID_NEM_IB_COM_OPEN_SCRATCH_PAD 0x04
384
/* obsolete, to wait for you-to-me QP to become RTR state
387
/* Executes stmt, stores err into the caller's local `ibcom_errno`, and jumps
 * to the caller's `fn_fail` label (both must exist in the calling function).
 * Wrapped in do { } while (0) so the macro behaves as a single statement,
 * e.g. inside an unbraced if/else; the invocation must end with a semicolon.
 * The parameter is named `err` rather than `errno` to avoid confusion with
 * the <errno.h> macro. */
#define MPID_NEM_IB_COM_ERR_SETANDJUMP(err, stmt) \
    do { stmt; ibcom_errno = (err); goto fn_fail; } while (0)
388
/* If cond is true: executes stmt, stores err into the caller's local
 * `ibcom_errno`, and jumps to the caller's `fn_fail` label (both must exist
 * in the calling function). If cond is false, nothing happens.
 * Wrapped in do { } while (0) so the embedded `if` cannot capture a
 * following `else` of the caller; the invocation must end with a semicolon.
 * The parameter is named `err` rather than `errno` to avoid confusion with
 * the <errno.h> macro. */
#define MPID_NEM_IB_COM_ERR_CHKANDJUMP(cond, err, stmt) \
    do { if (cond) { stmt; ibcom_errno = (err); goto fn_fail; } } while (0)
390
#define MPID_NEM_IB_COM_QKEY 0x1234
391
#define MPID_NEM_IB_COM_MAGIC 0x55
393
typedef struct MPID_nem_ib_sz_hdrmagic_t {
396
} MPID_nem_ib_sz_hdrmagic_t;
399
typedef struct MPID_nem_ib_tailmagic_t {
401
//uint32_t traits; /* for debug */
402
} MPID_nem_ib_tailmagic_t;
404
#define MPID_NEM_IB_SZ_DATA_POW2(sz) \
405
for(sz_data_pow2 = 15; sz_data_pow2 < (sz); sz_data_pow2 = ((((sz_data_pow2 + 1) << 1) - 1) > MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_tailmagic_t)) ? MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_tailmagic_t) : (((sz_data_pow2 + 1) << 1) - 1)) { } \
406
if (sz_data_pow2 > MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_tailmagic_t)) { printf("assertion failed\n"); }; \
408
#define MPID_NEM_IB_MAX_DATA_POW2 (MPID_NEM_IB_COM_RDMABUF_SZSEG - sizeof(MPID_nem_ib_tailmagic_t))
410
typedef struct MPID_nem_ib_com_qp_state_t {
412
} MPID_nem_ib_com_qp_state_t;
414
#define MPID_NEM_IB_COM_QP_STATE_RTR 0x12345678
415
#define MPID_NEM_IB_COM_SZ_MPI_HEADER 48
416
/* NOTE(review): both arms of the conditional are 1, so this always evaluates
 * to 1 regardless of MPID_NEM_IB_COM_RDMABUF_NSEG -- confirm whether the
 * large-NSEG case was meant to use a different slack amount. */
#define MPID_NEM_IB_COM_AMT_SLACK (MPID_NEM_IB_COM_RDMABUF_NSEG > 128 ? 1 : 1)