/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 *
 * Additional copyrights may follow
 */
#include "ompi_config.h"
23
#include "ompi/types.h"
24
#include "ompi/mca/pml/base/pml_base_sendreq.h"
25
#include "orte/mca/ns/base/base.h"
26
#include "orte/mca/oob/base/base.h"
27
#include "orte/mca/rml/rml.h"
28
#include "orte/mca/errmgr/errmgr.h"
29
#include "orte/dss/dss.h"
30
#include "btl_openib.h"
31
#include "btl_openib_endpoint.h"
32
#include "btl_openib_proc.h"
33
#include "btl_openib_frag.h"
34
#include "ompi/class/ompi_free_list.h"
38
static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint);
39
static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint);
41
int mca_btl_openib_endpoint_create_qp(
42
mca_btl_openib_module_t* openib_btl,
45
#if OMPI_MCA_BTL_OPENIB_HAVE_SRQ
48
struct ibv_qp_attr* qp_attr,
54
int mca_btl_openib_endpoint_qp_init_query(
55
mca_btl_openib_module_t* openib_btl,
57
struct ibv_qp_attr* attr,
67
* post a send to the work queue
69
static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* openib_btl,
70
mca_btl_openib_endpoint_t * endpoint,
71
mca_btl_openib_frag_t * frag)
75
struct ibv_send_wr* bad_wr;
76
frag->sg_entry.addr = (unsigned long) frag->hdr;
78
if((frag->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY) &&
79
frag->size <= openib_btl->super.btl_eager_limit){
80
/* check for a send wqe */
81
if (OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,-1) < 0) {
82
OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,1);
83
opal_list_append(&endpoint->pending_frags_hp,
84
(opal_list_item_t *)frag);
87
/* check for rdma tocken */
88
if (OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens,-1) < 0) {
89
OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens,1);
90
/* check for a token */
91
if(!mca_btl_openib_component.use_srq &&
92
OPAL_THREAD_ADD32(&endpoint->sd_tokens_hp,-1) < 0) {
93
OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,1);
94
OPAL_THREAD_ADD32(&endpoint->sd_tokens_hp,1);
95
opal_list_append(&endpoint->pending_frags_hp,
96
(opal_list_item_t *)frag);
98
} else if( mca_btl_openib_component.use_srq &&
99
OPAL_THREAD_ADD32(&openib_btl->sd_tokens_hp,-1) < 0) {
100
/* queue the request */
101
OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,1);
102
OPAL_THREAD_ADD32(&openib_btl->sd_tokens_hp,1);
103
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
104
opal_list_append(&openib_btl->pending_frags_hp,
105
(opal_list_item_t *)frag);
106
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
113
(endpoint->rd_credits_hp > 0) ? endpoint->rd_credits_hp : 0;
114
OPAL_THREAD_ADD32(&endpoint->rd_credits_hp, -frag->hdr->credits);
115
frag->hdr->rdma_credits = endpoint->eager_rdma_local.credits;
116
OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
117
-frag->hdr->rdma_credits);
118
ib_qp = endpoint->lcl_qp_hp;
120
/* check for a send wqe */
121
if (OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,-1) < 0) {
123
OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,1);
124
opal_list_append(&endpoint->pending_frags_lp, (opal_list_item_t *)frag);
127
/* check for a token */
128
} else if(!mca_btl_openib_component.use_srq &&
129
OPAL_THREAD_ADD32(&endpoint->sd_tokens_lp,-1) < 0 ) {
131
OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,1);
132
OPAL_THREAD_ADD32(&endpoint->sd_tokens_lp,1);
133
opal_list_append(&endpoint->pending_frags_lp, (opal_list_item_t *)frag);
136
} else if(mca_btl_openib_component.use_srq &&
137
OPAL_THREAD_ADD32(&openib_btl->sd_tokens_lp,-1) < 0) {
139
OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,1);
140
OPAL_THREAD_ADD32(&openib_btl->sd_tokens_lp,1);
141
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
142
opal_list_append(&openib_btl->pending_frags_lp, (opal_list_item_t *)frag);
143
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
146
/* queue the request */
148
frag->hdr->credits = (endpoint->rd_credits_lp > 0) ? endpoint->rd_credits_lp : 0;
149
OPAL_THREAD_ADD32(&endpoint->rd_credits_lp, -frag->hdr->credits);
150
ib_qp = endpoint->lcl_qp_lp;
154
frag->sg_entry.length =
155
frag->segment.seg_len + sizeof(mca_btl_openib_header_t) +
156
(do_rdma ? sizeof(mca_btl_openib_footer_t) : 0);
157
if(frag->sg_entry.length <= openib_btl->ib_inline_max) {
158
frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED|IBV_SEND_INLINE;
160
frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED;
164
mca_btl_openib_footer_t* ftr =
165
(mca_btl_openib_footer_t*)(((char*)frag->segment.seg_addr.pval) +
166
frag->segment.seg_len);
167
frag->wr_desc.sr_desc.opcode = IBV_WR_RDMA_WRITE;
168
MCA_BTL_OPENIB_RDMA_FRAG_SET_SIZE(ftr, frag->sg_entry.length);
169
MCA_BTL_OPENIB_RDMA_MAKE_LOCAL(ftr);
170
#ifdef OMPI_ENABLE_DEBUG
171
((mca_btl_openib_footer_t*)(((char*)frag->segment.seg_addr.pval) +
172
frag->segment.seg_len))->seq =
173
endpoint->eager_rdma_remote.seq++;
175
frag->wr_desc.sr_desc.wr.rdma.rkey = endpoint->eager_rdma_remote.rkey;
176
frag->wr_desc.sr_desc.wr.rdma.remote_addr =
177
(uintptr_t)endpoint->eager_rdma_remote.base.pval +
178
endpoint->eager_rdma_remote.head *
179
openib_btl->eager_rdma_frag_size +
180
sizeof(mca_btl_openib_frag_t) +
181
sizeof(mca_btl_openib_header_t) +
183
sizeof(mca_btl_openib_footer_t);
184
frag->wr_desc.sr_desc.wr.rdma.remote_addr -= frag->sg_entry.length;
185
MCA_BTL_OPENIB_RDMA_NEXT_INDEX (endpoint->eager_rdma_remote.head);
187
frag->wr_desc.sr_desc.opcode = IBV_WR_SEND;
189
if(ibv_post_send(ib_qp,
190
&frag->wr_desc.sr_desc,
192
BTL_ERROR(("error posting send request errno says %s\n",
197
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
198
if(mca_btl_openib_component.use_srq) {
199
MCA_BTL_OPENIB_POST_SRR_HIGH(openib_btl, 1);
200
MCA_BTL_OPENIB_POST_SRR_LOW(openib_btl, 1);
203
MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(endpoint, 1);
204
MCA_BTL_OPENIB_ENDPOINT_POST_RR_LOW(endpoint, 1);
205
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
/* Register the endpoint class with the OPAL object system: an endpoint is
 * an opal_list_item_t so it can sit on pending/proc lists, with the
 * construct/destruct hooks declared above in this file. */
OBJ_CLASS_INSTANCE(mca_btl_openib_endpoint_t,
                   opal_list_item_t, mca_btl_openib_endpoint_construct,
                   mca_btl_openib_endpoint_destruct);
219
* Initialize state of the endpoint instance.
223
static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
225
endpoint->endpoint_btl = 0;
226
endpoint->endpoint_proc = 0;
227
endpoint->endpoint_tstamp = 0.0;
228
endpoint->endpoint_state = MCA_BTL_IB_CLOSED;
229
endpoint->endpoint_retries = 0;
230
OBJ_CONSTRUCT(&endpoint->endpoint_lock, opal_mutex_t);
231
OBJ_CONSTRUCT(&endpoint->pending_send_frags, opal_list_t);
232
OBJ_CONSTRUCT(&endpoint->pending_frags_hp, opal_list_t);
233
OBJ_CONSTRUCT(&endpoint->pending_frags_lp, opal_list_t);
235
endpoint->lcl_qp_attr_hp = (struct ibv_qp_attr *) malloc(sizeof(struct ibv_qp_attr));
236
endpoint->lcl_qp_attr_lp = (struct ibv_qp_attr *) malloc(sizeof(struct ibv_qp_attr));
237
memset(endpoint->lcl_qp_attr_hp, 0, sizeof(struct ibv_qp_attr));
238
memset(endpoint->lcl_qp_attr_lp, 0, sizeof(struct ibv_qp_attr));
240
endpoint->rd_posted_hp = 0;
241
endpoint->rd_posted_lp = 0;
243
/* number of available send wqes */
244
endpoint->sd_wqe_hp = mca_btl_openib_component.rd_num;
245
endpoint->sd_wqe_lp = mca_btl_openib_component.rd_num;
247
/* zero these out w/ initial posting, so that we start out w/
248
* zero credits to return to peer
250
endpoint->rd_credits_hp = -(mca_btl_openib_component.rd_num + mca_btl_openib_component.rd_rsv);
251
endpoint->rd_credits_lp = -(mca_btl_openib_component.rd_num + mca_btl_openib_component.rd_rsv);
252
endpoint->sd_credits_hp = 0;
253
endpoint->sd_credits_lp = 0;
255
/* initialize the high and low priority tokens */
256
endpoint->sd_tokens_hp = mca_btl_openib_component.rd_num;
257
endpoint->sd_tokens_lp = mca_btl_openib_component.rd_num;
258
endpoint->get_tokens = mca_btl_openib_component.ib_qp_ous_rd_atom;
260
/* initialize RDMA eager related parts */
261
endpoint->eager_recv_count = 0;
262
memset(&endpoint->eager_rdma_remote, 0,
263
sizeof(mca_btl_openib_eager_rdma_remote_t));
264
memset (&endpoint->eager_rdma_local, 0,
265
sizeof(mca_btl_openib_eager_rdma_local_t));
266
OBJ_CONSTRUCT(&endpoint->eager_rdma_local.lock, opal_mutex_t);
268
endpoint->rem_info.rem_qp_num_hp = 0;
269
endpoint->rem_info.rem_qp_num_lp = 0;
270
endpoint->rem_info.rem_lid = 0;
271
endpoint->rem_info.rem_psn_hp = 0;
272
endpoint->rem_info.rem_psn_lp = 0;
273
endpoint->rem_info.rem_subnet = 0;
281
static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
286
* Send connection information to remote endpoint using OOB
290
static void mca_btl_openib_endpoint_send_cb(
292
orte_process_name_t* endpoint,
293
orte_buffer_t* buffer,
301
static int mca_btl_openib_endpoint_send_connect_data(mca_btl_base_endpoint_t* endpoint)
303
orte_buffer_t* buffer = OBJ_NEW(orte_buffer_t);
306
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
307
return ORTE_ERR_OUT_OF_RESOURCE;
310
/* pack the info in the send buffer */
312
rc = orte_dss.pack(buffer, &endpoint->lcl_qp_hp->qp_num, 1, ORTE_UINT32);
313
if(rc != ORTE_SUCCESS) {
318
rc = orte_dss.pack(buffer, &endpoint->lcl_qp_lp->qp_num, 1, ORTE_UINT32);
319
if(rc != ORTE_SUCCESS) {
324
rc = orte_dss.pack(buffer, &endpoint->lcl_psn_hp, 1, ORTE_UINT32);
325
if(rc != ORTE_SUCCESS) {
330
rc = orte_dss.pack(buffer, &endpoint->lcl_psn_lp, 1, ORTE_UINT32);
331
if(rc != ORTE_SUCCESS) {
336
rc = orte_dss.pack(buffer, &endpoint->endpoint_btl->ib_port_attr->lid, 1, ORTE_UINT16);
337
if(rc != ORTE_SUCCESS) {
343
rc = orte_dss.pack(buffer, &((mca_btl_openib_endpoint_t*) endpoint)->subnet, 1, ORTE_UINT16);
344
if(rc != ORTE_SUCCESS) {
349
/* send to endpoint */
350
rc = orte_rml.send_buffer_nb(&endpoint->endpoint_proc->proc_guid, buffer, ORTE_RML_TAG_DYNAMIC-1, 0,
351
mca_btl_openib_endpoint_send_cb, NULL);
354
BTL_VERBOSE(("Sending High Priority QP num = %d, Low Priority QP num = %d, LID = %d",
355
endpoint->lcl_qp_hp->qp_num,
356
endpoint->lcl_qp_lp->qp_num,
357
endpoint->endpoint_btl->ib_port_attr->lid));
367
* Set remote connection info
368
* (from OOB connection)
371
static int mca_btl_openib_endpoint_set_remote_info(mca_btl_base_endpoint_t* endpoint, mca_btl_openib_rem_info_t* rem_info)
374
memcpy(&((mca_btl_openib_endpoint_t*) endpoint)->rem_info, rem_info, sizeof(mca_btl_openib_rem_info_t));
376
BTL_VERBOSE(("Setting High Priority QP num = %d, Low Priority QP num %d, LID = %d",
377
endpoint->rem_info.rem_qp_num_hp,
378
endpoint->rem_info.rem_qp_num_lp,
379
endpoint->rem_info.rem_lid));
388
* Start to connect to the endpoint. We send our Queue Pair
389
* information over the TCP OOB communication mechanism.
391
* On completion of our send, a send completion handler
396
static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoint)
399
mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) endpoint->endpoint_btl;
402
/* Create the High Priority Queue Pair */
403
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl,
405
openib_btl->ib_cq_hp,
406
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
409
endpoint->lcl_qp_attr_hp,
410
&endpoint->lcl_qp_hp))) {
411
BTL_ERROR(("error creating queue pair, error code %d", rc));
414
srand48(getpid() * time(NULL));
415
endpoint->lcl_psn_hp = lrand48() & 0xffffff;
417
/* Create the Low Priority Queue Pair */
418
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl,
420
openib_btl->ib_cq_lp,
421
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
424
endpoint->lcl_qp_attr_lp,
425
&endpoint->lcl_qp_lp))) {
426
BTL_ERROR(("error creating queue pair, error code %d", rc));
429
endpoint->lcl_psn_lp = lrand48() & 0xffffff;
431
BTL_VERBOSE(("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d",
432
endpoint->lcl_qp_hp->qp_num,
433
endpoint->lcl_qp_lp->qp_num,
434
openib_btl->ib_port_attr->lid));
436
/* Send connection info over to remote endpoint */
437
endpoint->endpoint_state = MCA_BTL_IB_CONNECTING;
438
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_send_connect_data(endpoint))) {
439
BTL_ERROR(("error sending connect request, error code %d", rc));
446
* Reply to a `start - connect' message
449
static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t *endpoint,
450
mca_btl_openib_rem_info_t *rem_info)
453
mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) endpoint->endpoint_btl;
456
/* Create the High Priority Queue Pair */
457
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl,
459
openib_btl->ib_cq_hp,
460
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
464
endpoint->lcl_qp_attr_hp,
465
&endpoint->lcl_qp_hp))) {
466
BTL_ERROR(("error creating queue pair, error code %d", rc));
469
srand48(getpid() * time(NULL));
470
endpoint->lcl_psn_hp = lrand48() & 0xffffff;
472
/* Create the Low Priority Queue Pair */
473
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl,
475
openib_btl->ib_cq_lp,
476
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
480
endpoint->lcl_qp_attr_lp,
481
&endpoint->lcl_qp_lp))) {
482
BTL_ERROR(("error creating queue pair, error code %d", rc));
485
endpoint->lcl_psn_lp = lrand48() & 0xffffff;
487
BTL_VERBOSE(("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d",
488
endpoint->lcl_qp_hp->qp_num,
489
endpoint->lcl_qp_lp->qp_num,
490
openib_btl->ib_port_attr->lid));
493
/* Set the remote side info */
494
mca_btl_openib_endpoint_set_remote_info(endpoint, rem_info);
496
/* Connect to endpoint */
498
rc = mca_btl_openib_endpoint_connect(endpoint);
499
if(rc != OMPI_SUCCESS) {
500
BTL_ERROR(("error in endpoint connect error code is %d", rc));
504
/* Send connection info over to remote endpoint */
505
endpoint->endpoint_state = MCA_BTL_IB_CONNECT_ACK;
506
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_send_connect_data(endpoint))) {
507
BTL_ERROR(("error in endpoint send connect request error code is %d", rc));
514
* endpoint is waiting ack to final connection establishment..
517
static void mca_btl_openib_endpoint_waiting_ack(mca_btl_openib_endpoint_t *endpoint) {
518
endpoint->endpoint_state = MCA_BTL_IB_WAITING_ACK;
522
* called when the openib has completed setup via the
526
static void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
528
opal_list_item_t *frag_item;
529
mca_btl_openib_frag_t *frag;
530
mca_btl_openib_module_t* openib_btl;
532
endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;
533
endpoint->endpoint_btl->poll_cq = true;
536
* The connection is correctly setup. Now we can decrease the event trigger.
538
opal_progress_event_decrement();
540
/* While there are frags in the list,
543
while(!opal_list_is_empty(&(endpoint->pending_send_frags))) {
544
frag_item = opal_list_remove_first(&(endpoint->pending_send_frags));
545
frag = (mca_btl_openib_frag_t *) frag_item;
546
openib_btl = endpoint->endpoint_btl;
547
/* We need to post this one */
549
if(OMPI_SUCCESS != mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag))
550
BTL_ERROR(("Error posting send"));
555
* Non blocking OOB recv callback.
556
* Read incoming QP and other info, and if this endpoint
557
* is trying to connect, reply with our QP info,
558
* otherwise try to modify QP's and establish
559
* reliable connection
563
static void mca_btl_openib_endpoint_recv(
565
orte_process_name_t* endpoint,
566
orte_buffer_t* buffer,
570
mca_btl_openib_proc_t *ib_proc;
571
mca_btl_openib_endpoint_t *ib_endpoint;
576
mca_btl_openib_rem_info_t rem_info;
578
/* start by unpacking data first so we know who is knocking at
581
rc = orte_dss.unpack(buffer, &rem_info.rem_qp_num_hp, &cnt, ORTE_UINT32);
582
if(ORTE_SUCCESS != rc) {
586
rc = orte_dss.unpack(buffer, &rem_info.rem_qp_num_lp, &cnt, ORTE_UINT32);
587
if(ORTE_SUCCESS != rc) {
591
rc = orte_dss.unpack(buffer, &rem_info.rem_psn_hp, &cnt, ORTE_UINT32);
592
if(ORTE_SUCCESS != rc) {
595
}rc = orte_dss.unpack(buffer, &rem_info.rem_psn_lp, &cnt, ORTE_UINT32);
596
if(ORTE_SUCCESS != rc) {
600
rc = orte_dss.unpack(buffer, &rem_info.rem_lid, &cnt, ORTE_UINT16);
601
if(ORTE_SUCCESS != rc) {
605
rc = orte_dss.unpack(buffer, &rem_info.rem_subnet, &cnt, ORTE_UINT16);
606
if(ORTE_SUCCESS != rc) {
611
rc = orte_dss.unpack(buffer, &ib_endpoint->rdma_buf->r_key, &cnt, ORTE_UINT32);
612
if(rc != ORTE_SUCCESS) {
617
rc = orte_dss.unpack(buffer, &ib_endpoint->rdma_buf->rem_base, &cnt, ORTE_UINT32);
618
if(rc != ORTE_SUCCESS) {
623
rc = orte_dss.unpack(buffer, &ib_endpoint->rdma_buf->rem_size, &cnt, ORTE_UINT32);
624
if(rc != ORTE_SUCCESS) {
629
rc = orte_dss.unpack(buffer, &ib_endpoint->rdma_buf->rem_cnt, &cnt, ORTE_UINT32);
630
if(rc != ORTE_SUCCESS) {
636
BTL_VERBOSE(("Received High Priority QP num = %d, Low Priority QP num %d, LID = %d",
637
rem_info.rem_qp_num_hp,
638
rem_info.rem_qp_num_lp,
641
for(ib_proc = (mca_btl_openib_proc_t*)
642
opal_list_get_first(&mca_btl_openib_component.ib_procs);
643
ib_proc != (mca_btl_openib_proc_t*)
644
opal_list_get_end(&mca_btl_openib_component.ib_procs);
645
ib_proc = (mca_btl_openib_proc_t*)opal_list_get_next(ib_proc)) {
647
if(orte_ns.compare(ORTE_NS_CMP_ALL, &ib_proc->proc_guid, endpoint) == 0) {
650
/* Try to get the endpoint instance of this proc */
652
for(i = 0; i < ib_proc->proc_endpoint_count; i++) {
653
mca_btl_openib_port_info_t port_info;
654
port_info = ib_proc->proc_ports[i];
655
ib_endpoint = ib_proc->proc_endpoints[i];
656
if(ib_endpoint->rem_info.rem_lid &&
657
ib_endpoint->rem_info.rem_lid == rem_info.rem_lid) {
658
/* we've seen them before! */
663
/* If we haven't seen this remote lid before then try to match on
665
for(i = 0; !found && i < ib_proc->proc_endpoint_count; i++) {
666
mca_btl_openib_port_info_t port_info;
667
port_info = ib_proc->proc_ports[i];
668
ib_endpoint = ib_proc->proc_endpoints[i];
669
if(!ib_endpoint->rem_info.rem_lid &&
670
ib_endpoint->subnet == rem_info.rem_subnet) {
671
/* found a match based on subnet! */
676
/* try finding an open port, even if subnets
679
for(i = 0; !found && i < ib_proc->proc_endpoint_count; i++) {
680
mca_btl_openib_port_info_t port_info;
681
port_info = ib_proc->proc_ports[i];
682
ib_endpoint = ib_proc->proc_endpoints[i];
683
if(!ib_endpoint->rem_info.rem_lid) {
684
/* found an unused end-point */
691
BTL_ERROR(("can't find suitable endpoint for this peer\n"));
695
OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
696
endpoint_state = ib_endpoint->endpoint_state;
699
switch(endpoint_state) {
700
case MCA_BTL_IB_CLOSED :
701
/* We had this connection closed before.
702
* The endpoint is trying to connect. Move the
703
* status of this connection to CONNECTING,
704
* and then reply with our QP information */
706
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_reply_start_connect(ib_endpoint, &rem_info))) {
707
BTL_ERROR(("error in endpoint reply start connect"));
711
/** As long as we expect a message from the peer (in order to setup the connection)
712
* let the event engine pool the OOB events. Note: we increment it once peer active
715
opal_progress_event_increment();
718
case MCA_BTL_IB_CONNECTING :
720
mca_btl_openib_endpoint_set_remote_info(ib_endpoint, &rem_info);
721
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_connect(ib_endpoint))) {
722
BTL_ERROR(("endpoint connect error: %d", rc));
726
/* Setup state as awaiting ack from peer */
727
mca_btl_openib_endpoint_waiting_ack(ib_endpoint);
729
/* Send him an ack */
730
mca_btl_openib_endpoint_send_connect_data(ib_endpoint);
733
case MCA_BTL_IB_WAITING_ACK:
734
mca_btl_openib_endpoint_connected(ib_endpoint);
737
case MCA_BTL_IB_CONNECT_ACK:
738
mca_btl_openib_endpoint_send_connect_data(ib_endpoint);
739
mca_btl_openib_endpoint_connected(ib_endpoint);
742
case MCA_BTL_IB_CONNECTED :
745
BTL_ERROR(("Invalid endpoint state %d", endpoint_state));
747
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
754
* Post the OOB recv (for receiving the peers information)
756
void mca_btl_openib_post_recv()
759
orte_rml.recv_buffer_nb(
761
ORTE_RML_TAG_DYNAMIC-1,
763
mca_btl_openib_endpoint_recv,
769
* Attempt to send a fragment using a given endpoint. If the endpoint is not
770
* connected, queue the fragment and start the connection as required.
773
int mca_btl_openib_endpoint_send(
774
mca_btl_base_endpoint_t* endpoint,
775
mca_btl_openib_frag_t* frag
779
bool call_progress = false;
780
mca_btl_openib_module_t *openib_btl;
782
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
783
switch(endpoint->endpoint_state) {
784
case MCA_BTL_IB_CONNECTING:
786
BTL_VERBOSE(("Queing because state is connecting"));
788
opal_list_append(&endpoint->pending_send_frags,
789
(opal_list_item_t *)frag);
790
call_progress = true;
794
case MCA_BTL_IB_CONNECT_ACK:
795
case MCA_BTL_IB_WAITING_ACK:
796
BTL_VERBOSE(("Queuing because waiting for ack"));
798
opal_list_append(&endpoint->pending_send_frags,
799
(opal_list_item_t *)frag);
800
call_progress = true;
804
case MCA_BTL_IB_CLOSED:
806
BTL_VERBOSE(("Connection to endpoint closed ... connecting ..."));
807
opal_list_append(&endpoint->pending_send_frags,
808
(opal_list_item_t *)frag);
809
rc = mca_btl_openib_endpoint_start_connect(endpoint);
811
* As long as we expect a message from the peer (in order to setup the connection)
812
* let the event engine pool the OOB events. Note: we increment it once peer active
815
opal_progress_event_increment();
816
call_progress = true;
819
case MCA_BTL_IB_FAILED:
821
rc = OMPI_ERR_UNREACH;
824
case MCA_BTL_IB_CONNECTED:
826
openib_btl = endpoint->endpoint_btl;
827
BTL_VERBOSE(("Send to : %d, len : %lu, frag : %p",
828
endpoint->endpoint_proc->proc_guid.vpid,
829
frag->sg_entry.length,
831
rc = mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag);
836
rc = OMPI_ERR_UNREACH;
838
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
839
if(call_progress) opal_progress();
844
* Complete connection to endpoint.
847
int mca_btl_openib_endpoint_connect(
848
mca_btl_openib_endpoint_t *endpoint)
851
mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) endpoint->endpoint_btl;
853
/* Connection establishment RC */
854
rc = mca_btl_openib_endpoint_qp_init_query(
857
endpoint->lcl_qp_attr_hp,
858
endpoint->lcl_psn_hp,
859
endpoint->rem_info.rem_qp_num_hp,
860
endpoint->rem_info.rem_psn_hp,
861
endpoint->rem_info.rem_lid,
867
if(rc != OMPI_SUCCESS) {
870
rc = mca_btl_openib_endpoint_qp_init_query(
873
endpoint->lcl_qp_attr_lp,
874
endpoint->lcl_psn_lp,
875
endpoint->rem_info.rem_qp_num_lp,
876
endpoint->rem_info.rem_psn_lp,
877
endpoint->rem_info.rem_lid,
883
if(rc != OMPI_SUCCESS) {
887
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
888
if(mca_btl_openib_component.use_srq) {
889
MCA_BTL_OPENIB_POST_SRR_HIGH(openib_btl, 1);
890
MCA_BTL_OPENIB_POST_SRR_LOW(openib_btl, 1);
893
MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(endpoint, 1);
894
MCA_BTL_OPENIB_ENDPOINT_POST_RR_LOW(endpoint, 1);
895
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
903
* Create the queue pair note that this is just the initial
904
* queue pair creation and we need to get the remote queue pair
905
* info from the peer before the qp is usable,
908
int mca_btl_openib_endpoint_create_qp(
909
mca_btl_openib_module_t* openib_btl,
912
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
915
struct ibv_qp_attr* qp_attr,
920
struct ibv_qp* my_qp;
921
struct ibv_qp_init_attr qp_init_attr;
923
memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr));
925
qp_init_attr.send_cq = cq;
926
qp_init_attr.recv_cq = cq;
927
qp_init_attr.cap.max_send_wr = mca_btl_openib_component.rd_num + 1;
928
qp_init_attr.cap.max_recv_wr = mca_btl_openib_component.rd_num + mca_btl_openib_component.rd_rsv;
929
qp_init_attr.cap.max_send_sge = mca_btl_openib_component.ib_sg_list_size;
930
qp_init_attr.cap.max_recv_sge = mca_btl_openib_component.ib_sg_list_size;
931
qp_init_attr.qp_type = IBV_QPT_RC;
932
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
933
if(mca_btl_openib_component.use_srq) {
934
qp_init_attr.srq = srq;
937
my_qp = ibv_create_qp(pd, &qp_init_attr);
940
BTL_ERROR(("error creating qp errno says %s", strerror(errno)));
944
if(0 == (openib_btl->ib_inline_max = qp_init_attr.cap.max_inline_data)) {
945
BTL_ERROR(("ibv_create_qp: returned 0 byte(s) for max inline data"));
950
qp_attr->qp_state = IBV_QPS_INIT;
951
qp_attr->pkey_index = mca_btl_openib_component.ib_pkey_ix;
952
qp_attr->port_num = openib_btl->port_num;
953
qp_attr->qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
955
if(ibv_modify_qp((*qp), qp_attr,
959
IBV_QP_ACCESS_FLAGS )) {
960
BTL_ERROR(("error modifying qp to INIT errno says %s", strerror(errno)));
969
* The queue pair has been created and we have received the remote
970
* queue pair information from the peer so we init this queue pair
971
* and are ready to roll.
973
int mca_btl_openib_endpoint_qp_init_query(
974
mca_btl_openib_module_t* openib_btl,
976
struct ibv_qp_attr* attr,
986
attr->qp_state = IBV_QPS_RTR;
987
attr->path_mtu = mca_btl_openib_component.ib_mtu;
988
attr->dest_qp_num = rem_qp_num;
989
attr->rq_psn = rem_psn;
990
attr->max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
991
attr->min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer;
992
attr->ah_attr.is_global = 0;
993
attr->ah_attr.dlid = rem_lid;
994
attr->ah_attr.sl = mca_btl_openib_component.ib_service_level;
995
attr->ah_attr.src_path_bits = mca_btl_openib_component.ib_src_path_bits;
996
attr->ah_attr.port_num = port_num;
998
if(ibv_modify_qp(qp, attr,
1004
IBV_QP_MAX_DEST_RD_ATOMIC |
1005
IBV_QP_MIN_RNR_TIMER)) {
1006
BTL_ERROR(("error modifing QP to RTR errno says %s", strerror(errno)));
1009
attr->qp_state = IBV_QPS_RTS;
1010
attr->timeout = mca_btl_openib_component.ib_timeout;
1011
attr->retry_cnt = mca_btl_openib_component.ib_retry_count;
1012
attr->rnr_retry = mca_btl_openib_component.ib_rnr_retry;
1013
attr->sq_psn = lcl_psn;
1014
attr->max_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
1015
if (ibv_modify_qp(qp, attr,
1021
IBV_QP_MAX_QP_RD_ATOMIC)) {
1022
BTL_ERROR(("error modifying QP to RTS errno says %s", strerror(errno)));
1025
return OMPI_SUCCESS;
1030
* Return control fragment.
1033
static void mca_btl_openib_endpoint_credits_lp(
1034
mca_btl_base_module_t* btl,
1035
struct mca_btl_base_endpoint_t* endpoint,
1036
struct mca_btl_base_descriptor_t* descriptor,
1041
/* we don't acquire a wqe or token for credit message - so decrement */
1042
OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,-1);
1044
/* check to see if there are addditional credits to return */
1045
if ((credits = OPAL_THREAD_ADD32(&endpoint->sd_credits_lp,-1)) > 0) {
1046
OPAL_THREAD_ADD32(&endpoint->sd_credits_lp,-credits);
1047
if (endpoint->rd_credits_lp >= mca_btl_openib_component.rd_win &&
1048
OPAL_THREAD_ADD32(&endpoint->sd_credits_lp,1) == 1) {
1049
mca_btl_openib_endpoint_send_credits_lp(endpoint);
1052
MCA_BTL_IB_FRAG_RETURN(((mca_btl_openib_module_t*)btl),
1053
((mca_btl_openib_frag_t*)descriptor));
1057
* Return credits to peer
1060
void mca_btl_openib_endpoint_send_credits_lp(
1061
mca_btl_openib_endpoint_t* endpoint)
1063
mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
1064
mca_btl_openib_frag_t* frag;
1065
struct ibv_send_wr* bad_wr;
1068
MCA_BTL_IB_FRAG_ALLOC_EAGER(openib_btl, frag, rc);
1070
BTL_ERROR(("error allocating fragment"));
1074
frag->base.des_cbfunc = mca_btl_openib_endpoint_credits_lp;
1075
frag->base.des_cbdata = NULL;
1076
frag->endpoint = endpoint;
1078
frag->hdr->tag = MCA_BTL_TAG_BTL;
1079
frag->hdr->credits = endpoint->rd_credits_lp;
1080
OPAL_THREAD_ADD32(&endpoint->rd_credits_lp, -frag->hdr->credits);
1081
((mca_btl_openib_control_header_t *)frag->segment.seg_addr.pval)->type = MCA_BTL_OPENIB_CONTROL_NOOP;
1083
frag->wr_desc.sr_desc.opcode = IBV_WR_SEND;
1084
frag->sg_entry.length = sizeof(mca_btl_openib_header_t) +
1085
sizeof(mca_btl_openib_control_header_t);
1086
frag->sg_entry.addr = (unsigned long) frag->hdr;
1088
if(frag->sg_entry.length <= openib_btl->ib_inline_max) {
1089
frag->wr_desc.sr_desc.send_flags = IBV_SEND_INLINE | IBV_SEND_SIGNALED;
1091
frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED;
1094
if(ibv_post_send(endpoint->lcl_qp_lp,
1095
&frag->wr_desc.sr_desc,
1097
OPAL_THREAD_ADD32(&endpoint->sd_credits_lp, -1);
1098
OPAL_THREAD_ADD32(&endpoint->rd_credits_lp, frag->hdr->credits);
1099
MCA_BTL_IB_FRAG_RETURN(openib_btl, frag);
1100
BTL_ERROR(("error posting send request errno %d says %s", strerror(errno)));
1107
* Return control fragment.
1110
static void mca_btl_openib_endpoint_credits_hp(
1111
mca_btl_base_module_t* btl,
1112
struct mca_btl_base_endpoint_t* endpoint,
1113
struct mca_btl_base_descriptor_t* descriptor,
1118
/* we don't acquire a wqe or token for credit message - so decrement */
1119
OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,-1);
1121
/* check to see if there are addditional credits to return */
1122
if ((credits = OPAL_THREAD_ADD32(&endpoint->sd_credits_hp,-1)) > 0) {
1123
OPAL_THREAD_ADD32(&endpoint->sd_credits_hp,-credits);
1124
if ((endpoint->rd_credits_hp >= mca_btl_openib_component.rd_win ||
1125
endpoint->eager_rdma_local.credits >= mca_btl_openib_component.rd_win) &&
1126
OPAL_THREAD_ADD32(&endpoint->sd_credits_hp,1) == 1) {
1127
mca_btl_openib_endpoint_send_credits_hp(endpoint);
1130
MCA_BTL_IB_FRAG_RETURN(((mca_btl_openib_module_t*)btl),
1131
((mca_btl_openib_frag_t*)descriptor));
1135
* Return credits to peer
1138
void mca_btl_openib_endpoint_send_credits_hp(
1139
mca_btl_openib_endpoint_t* endpoint)
1141
mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
1142
mca_btl_openib_frag_t* frag;
1143
struct ibv_send_wr* bad_wr;
1146
MCA_BTL_IB_FRAG_ALLOC_EAGER(openib_btl, frag, rc);
1148
BTL_ERROR(("error allocating fragment"));
1152
frag->base.des_cbfunc = mca_btl_openib_endpoint_credits_hp;
1153
frag->base.des_cbdata = NULL;
1154
frag->endpoint = endpoint;
1156
frag->hdr->tag = MCA_BTL_TAG_BTL;
1157
frag->hdr->credits =
1158
(endpoint->rd_credits_hp > 0) ? endpoint->rd_credits_hp : 0;
1159
OPAL_THREAD_ADD32(&endpoint->rd_credits_hp, -frag->hdr->credits);
1160
frag->hdr->rdma_credits = endpoint->eager_rdma_local.credits;
1161
OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
1162
-frag->hdr->rdma_credits);
1163
((mca_btl_openib_control_header_t *)frag->segment.seg_addr.pval)->type = MCA_BTL_OPENIB_CONTROL_NOOP;
1165
frag->wr_desc.sr_desc.opcode = IBV_WR_SEND;
1166
frag->sg_entry.length = sizeof(mca_btl_openib_header_t) +
1167
sizeof(mca_btl_openib_control_header_t);
1168
frag->sg_entry.addr = (unsigned long) frag->hdr;
1170
if(frag->sg_entry.length <= openib_btl->ib_inline_max) {
1171
frag->wr_desc.sr_desc.send_flags = IBV_SEND_INLINE | IBV_SEND_SIGNALED;
1173
frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED;
1176
if(ibv_post_send(endpoint->lcl_qp_hp,
1177
&frag->wr_desc.sr_desc,
1179
OPAL_THREAD_ADD32(&endpoint->sd_credits_hp, -1);
1180
OPAL_THREAD_ADD32(&endpoint->rd_credits_hp, frag->hdr->credits);
1181
MCA_BTL_IB_FRAG_RETURN(openib_btl, frag);
1182
BTL_ERROR(("error posting send request errno %d says %s", errno,
1188
static void mca_btl_openib_endpoint_eager_rdma(
1189
mca_btl_base_module_t* btl,
1190
struct mca_btl_base_endpoint_t* endpoint,
1191
struct mca_btl_base_descriptor_t* descriptor,
1194
MCA_BTL_IB_FRAG_RETURN(((mca_btl_openib_module_t*)btl),
1195
((mca_btl_openib_frag_t*)descriptor));
1198
static int mca_btl_openib_endpoint_send_eager_rdma(
1199
mca_btl_base_endpoint_t* endpoint)
1201
mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
1202
mca_btl_openib_eager_rdma_header_t *rdma_hdr;
1203
mca_btl_openib_frag_t* frag;
1206
MCA_BTL_IB_FRAG_ALLOC_EAGER(openib_btl, frag, rc);
1208
BTL_ERROR(("error allocating fragment"));
1212
frag->base.des_cbfunc = mca_btl_openib_endpoint_eager_rdma;
1213
frag->base.des_cbdata = NULL;
1214
frag->endpoint = endpoint;
1215
frag->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY;
1217
frag->hdr->tag = MCA_BTL_TAG_BTL;
1218
rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)frag->segment.seg_addr.pval;
1219
rdma_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA;
1220
rdma_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey;
1221
rdma_hdr->rdma_start.pval = endpoint->eager_rdma_local.base.pval;
1222
frag->segment.seg_len = sizeof(mca_btl_openib_eager_rdma_header_t);
1223
if (mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag) !=
1225
MCA_BTL_IB_FRAG_RETURN(openib_btl, frag);
1226
BTL_ERROR(("Error sending RDMA buffer", strerror(errno)));
1231
/* create RDMA buffer for eager messages */
1232
void mca_btl_openib_endpoint_connect_eager_rdma(
1233
mca_btl_openib_endpoint_t* endpoint)
1235
mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
1239
OPAL_THREAD_LOCK(&endpoint->eager_rdma_local.lock);
1240
if (endpoint->eager_rdma_local.base.pval)
1241
goto unlock_rdma_local;
1243
buf = openib_btl->super.btl_mpool->mpool_alloc(openib_btl->super.btl_mpool,
1244
openib_btl->eager_rdma_frag_size *
1245
mca_btl_openib_component.eager_rdma_num, 0, 0,
1246
(mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg);
1249
goto unlock_rdma_local;
1251
for(i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) {
1252
ompi_free_list_item_t *item = (ompi_free_list_item_t *)(buf +
1253
i*openib_btl->eager_rdma_frag_size);
1254
item->user_data = endpoint->eager_rdma_local.reg;
1255
OBJ_CONSTRUCT(item, mca_btl_openib_recv_frag_eager_t);
1256
((mca_btl_openib_frag_t*)item)->endpoint = endpoint;
1257
((mca_btl_openib_frag_t*)item)->type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA;
1260
OPAL_THREAD_LOCK(&openib_btl->eager_rdma_lock);
1261
if(orte_pointer_array_add (&endpoint->eager_rdma_index,
1262
openib_btl->eager_rdma_buffers, endpoint) < 0)
1265
endpoint->eager_rdma_local.base.pval = buf;
1266
openib_btl->eager_rdma_buffers_count++;
1267
if (mca_btl_openib_endpoint_send_eager_rdma(endpoint) == 0) {
1268
OPAL_THREAD_UNLOCK(&openib_btl->eager_rdma_lock);
1269
OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
1273
openib_btl->eager_rdma_buffers_count--;
1274
endpoint->eager_rdma_local.base.pval = NULL;
1275
orte_pointer_array_set_item(openib_btl->eager_rdma_buffers,
1276
endpoint->eager_rdma_index, NULL);
1279
OPAL_THREAD_UNLOCK(&openib_btl->eager_rdma_lock);
1280
openib_btl->super.btl_mpool->mpool_free(openib_btl->super.btl_mpool,
1281
buf, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
1283
OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);