~ubuntu-branches/ubuntu/hardy/openmpi/hardy-updates

« back to all changes in this revision

Viewing changes to ompi/mca/btl/openib/btl_openib_endpoint.c

  • Committer: Bazaar Package Importer
  • Author(s): Mark Hymers
  • Date: 2006-10-15 00:46:11 UTC
  • Revision ID: james.westby@ubuntu.com-20061015004611-uuhxnaxyjmuxfd5h
Tags: upstream-1.1
Import upstream version 1.1

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 
3
 *                         University Research and Technology
 
4
 *                         Corporation.  All rights reserved.
 
5
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 
6
 *                         of Tennessee Research Foundation.  All rights
 
7
 *                         reserved.
 
8
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
 
9
 *                         University of Stuttgart.  All rights reserved.
 
10
 * Copyright (c) 2004-2005 The Regents of the University of California.
 
11
 *                         All rights reserved.
 
12
 * $COPYRIGHT$
 
13
 * 
 
14
 * Additional copyrights may follow
 
15
 * 
 
16
 * $HEADER$
 
17
 */
 
18
 
 
19
 
 
20
#include "ompi_config.h"
 
21
#include <sys/time.h>
 
22
#include <time.h>
 
23
#include "ompi/types.h"
 
24
#include "ompi/mca/pml/base/pml_base_sendreq.h"
 
25
#include "orte/mca/ns/base/base.h"
 
26
#include "orte/mca/oob/base/base.h"
 
27
#include "orte/mca/rml/rml.h"
 
28
#include "orte/mca/errmgr/errmgr.h"
 
29
#include "orte/dss/dss.h"
 
30
#include "btl_openib.h"
 
31
#include "btl_openib_endpoint.h" 
 
32
#include "btl_openib_proc.h"
 
33
#include "btl_openib_frag.h"
 
34
#include "ompi/class/ompi_free_list.h" 
 
35
#include <errno.h> 
 
36
#include <string.h> 
 
37
 
 
38
static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint);
 
39
static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint);
 
40
 
 
41
int mca_btl_openib_endpoint_create_qp(
 
42
                                      mca_btl_openib_module_t* openib_btl, 
 
43
                                      struct ibv_pd* pd, 
 
44
                                      struct ibv_cq* cq, 
 
45
#if OMPI_MCA_BTL_OPENIB_HAVE_SRQ
 
46
                                      struct ibv_srq* srq, 
 
47
#endif
 
48
                                      struct ibv_qp_attr* qp_attr,                                      
 
49
                                      struct ibv_qp** qp
 
50
                                      ); 
 
51
 
 
52
 
 
53
 
 
54
int mca_btl_openib_endpoint_qp_init_query(
 
55
                                          mca_btl_openib_module_t* openib_btl, 
 
56
                                          struct ibv_qp* qp, 
 
57
                                          struct ibv_qp_attr* attr,
 
58
                                          uint32_t lcl_psn, 
 
59
                                          uint32_t rem_qp_num, 
 
60
                                          uint32_t rem_psn,  
 
61
                                          uint16_t rem_lid, 
 
62
                                          uint32_t port_num 
 
63
                                          ); 
 
64
                   
 
65
 
 
66
/* 
 
67
 * post a send to the work queue 
 
68
 */ 
 
69
static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* openib_btl, 
 
70
                                                    mca_btl_openib_endpoint_t * endpoint, 
 
71
                                                    mca_btl_openib_frag_t * frag)
 
72
 
73
    int do_rdma = 0;
 
74
    struct ibv_qp* ib_qp; 
 
75
    struct ibv_send_wr* bad_wr; 
 
76
    frag->sg_entry.addr = (unsigned long) frag->hdr; 
 
77
 
 
78
    if((frag->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY) &&
 
79
            frag->size <= openib_btl->super.btl_eager_limit){ 
 
80
        /* check for a send wqe */
 
81
        if (OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,-1) < 0) {
 
82
            OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,1);
 
83
            opal_list_append(&endpoint->pending_frags_hp, 
 
84
                    (opal_list_item_t *)frag);
 
85
            return OMPI_SUCCESS;
 
86
            } 
 
87
        /* check for rdma tocken */     
 
88
        if (OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens,-1) < 0) {
 
89
            OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens,1);
 
90
            /* check for a token */
 
91
            if(!mca_btl_openib_component.use_srq &&
 
92
                    OPAL_THREAD_ADD32(&endpoint->sd_tokens_hp,-1) < 0) {
 
93
                OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,1);
 
94
                OPAL_THREAD_ADD32(&endpoint->sd_tokens_hp,1);
 
95
                opal_list_append(&endpoint->pending_frags_hp,
 
96
                        (opal_list_item_t *)frag);
 
97
                return OMPI_SUCCESS;
 
98
          } else if( mca_btl_openib_component.use_srq &&
 
99
                  OPAL_THREAD_ADD32(&openib_btl->sd_tokens_hp,-1) < 0) { 
 
100
                /* queue the request */
 
101
                OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,1);
 
102
                OPAL_THREAD_ADD32(&openib_btl->sd_tokens_hp,1);
 
103
                OPAL_THREAD_LOCK(&openib_btl->ib_lock);
 
104
                opal_list_append(&openib_btl->pending_frags_hp,
 
105
                        (opal_list_item_t *)frag);
 
106
                OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
 
107
                return OMPI_SUCCESS;
 
108
          }
 
109
        } else {
 
110
            do_rdma = 1;
 
111
        }
 
112
        frag->hdr->credits =
 
113
            (endpoint->rd_credits_hp > 0) ? endpoint->rd_credits_hp : 0;
 
114
        OPAL_THREAD_ADD32(&endpoint->rd_credits_hp, -frag->hdr->credits);
 
115
        frag->hdr->rdma_credits = endpoint->eager_rdma_local.credits;
 
116
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
 
117
                -frag->hdr->rdma_credits);
 
118
        ib_qp = endpoint->lcl_qp_hp; 
 
119
    } else {
 
120
        /* check for a send wqe */
 
121
        if (OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,-1) < 0) {
 
122
 
 
123
            OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,1);
 
124
            opal_list_append(&endpoint->pending_frags_lp, (opal_list_item_t *)frag);
 
125
            return OMPI_SUCCESS;
 
126
 
 
127
        /* check for a token */
 
128
        } else if(!mca_btl_openib_component.use_srq &&
 
129
            OPAL_THREAD_ADD32(&endpoint->sd_tokens_lp,-1) < 0 ) {
 
130
 
 
131
            OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,1);
 
132
            OPAL_THREAD_ADD32(&endpoint->sd_tokens_lp,1);
 
133
            opal_list_append(&endpoint->pending_frags_lp, (opal_list_item_t *)frag); 
 
134
            return OMPI_SUCCESS;
 
135
 
 
136
        } else if(mca_btl_openib_component.use_srq &&
 
137
            OPAL_THREAD_ADD32(&openib_btl->sd_tokens_lp,-1) < 0) {
 
138
 
 
139
            OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,1);
 
140
            OPAL_THREAD_ADD32(&openib_btl->sd_tokens_lp,1);
 
141
            OPAL_THREAD_LOCK(&openib_btl->ib_lock);
 
142
            opal_list_append(&openib_btl->pending_frags_lp, (opal_list_item_t *)frag); 
 
143
            OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
 
144
            return OMPI_SUCCESS;
 
145
 
 
146
        /* queue the request */
 
147
        } else { 
 
148
            frag->hdr->credits = (endpoint->rd_credits_lp > 0) ? endpoint->rd_credits_lp : 0;
 
149
            OPAL_THREAD_ADD32(&endpoint->rd_credits_lp, -frag->hdr->credits);
 
150
            ib_qp = endpoint->lcl_qp_lp; 
 
151
        }
 
152
    } 
 
153
    
 
154
    frag->sg_entry.length =
 
155
        frag->segment.seg_len + sizeof(mca_btl_openib_header_t) +
 
156
        (do_rdma ? sizeof(mca_btl_openib_footer_t) : 0);
 
157
    if(frag->sg_entry.length <= openib_btl->ib_inline_max) { 
 
158
        frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED|IBV_SEND_INLINE;
 
159
    } else { 
 
160
        frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED; 
 
161
    }
 
162
 
 
163
    if(do_rdma) {
 
164
        mca_btl_openib_footer_t* ftr =
 
165
            (mca_btl_openib_footer_t*)(((char*)frag->segment.seg_addr.pval) +
 
166
                                       frag->segment.seg_len);
 
167
        frag->wr_desc.sr_desc.opcode = IBV_WR_RDMA_WRITE;
 
168
        MCA_BTL_OPENIB_RDMA_FRAG_SET_SIZE(ftr, frag->sg_entry.length);
 
169
        MCA_BTL_OPENIB_RDMA_MAKE_LOCAL(ftr);
 
170
#ifdef OMPI_ENABLE_DEBUG
 
171
        ((mca_btl_openib_footer_t*)(((char*)frag->segment.seg_addr.pval) +
 
172
                                    frag->segment.seg_len))->seq = 
 
173
            endpoint->eager_rdma_remote.seq++;
 
174
#endif
 
175
        frag->wr_desc.sr_desc.wr.rdma.rkey = endpoint->eager_rdma_remote.rkey;
 
176
        frag->wr_desc.sr_desc.wr.rdma.remote_addr =
 
177
            (uintptr_t)endpoint->eager_rdma_remote.base.pval +
 
178
            endpoint->eager_rdma_remote.head *
 
179
            openib_btl->eager_rdma_frag_size +
 
180
            sizeof(mca_btl_openib_frag_t) +
 
181
            sizeof(mca_btl_openib_header_t) +
 
182
            frag->size +
 
183
            sizeof(mca_btl_openib_footer_t);
 
184
        frag->wr_desc.sr_desc.wr.rdma.remote_addr -= frag->sg_entry.length;
 
185
        MCA_BTL_OPENIB_RDMA_NEXT_INDEX (endpoint->eager_rdma_remote.head);
 
186
    } else {
 
187
        frag->wr_desc.sr_desc.opcode = IBV_WR_SEND;
 
188
    }
 
189
    if(ibv_post_send(ib_qp, 
 
190
                     &frag->wr_desc.sr_desc, 
 
191
                     &bad_wr)) { 
 
192
        BTL_ERROR(("error posting send request errno says %s\n", 
 
193
                    strerror(errno))); 
 
194
        return OMPI_ERROR; 
 
195
    }
 
196
            
 
197
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ 
 
198
    if(mca_btl_openib_component.use_srq) { 
 
199
        MCA_BTL_OPENIB_POST_SRR_HIGH(openib_btl, 1); 
 
200
        MCA_BTL_OPENIB_POST_SRR_LOW(openib_btl, 1);         
 
201
    } else { 
 
202
#endif 
 
203
        MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(endpoint, 1); 
 
204
        MCA_BTL_OPENIB_ENDPOINT_POST_RR_LOW(endpoint, 1); 
 
205
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
 
206
    }
 
207
#endif 
 
208
    
 
209
    return OMPI_SUCCESS; 
 
210
}
 
211
 
 
212
 
 
213
 
 
214
/*
 * Class registration for mca_btl_openib_endpoint_t: listed alongside
 * opal_list_item_t together with the constructor and destructor defined
 * in this file.
 */
OBJ_CLASS_INSTANCE(mca_btl_openib_endpoint_t, opal_list_item_t,
                   mca_btl_openib_endpoint_construct,
                   mca_btl_openib_endpoint_destruct);
 
217
 
 
218
/*
 
219
 * Initialize state of the endpoint instance.
 
220
 *
 
221
 */
 
222
 
 
223
static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
 
224
{
 
225
    endpoint->endpoint_btl = 0;
 
226
    endpoint->endpoint_proc = 0;
 
227
    endpoint->endpoint_tstamp = 0.0;
 
228
    endpoint->endpoint_state = MCA_BTL_IB_CLOSED;
 
229
    endpoint->endpoint_retries = 0;
 
230
    OBJ_CONSTRUCT(&endpoint->endpoint_lock, opal_mutex_t);
 
231
    OBJ_CONSTRUCT(&endpoint->pending_send_frags, opal_list_t);
 
232
    OBJ_CONSTRUCT(&endpoint->pending_frags_hp, opal_list_t);
 
233
    OBJ_CONSTRUCT(&endpoint->pending_frags_lp, opal_list_t);
 
234
    
 
235
    endpoint->lcl_qp_attr_hp = (struct ibv_qp_attr *) malloc(sizeof(struct ibv_qp_attr)); 
 
236
    endpoint->lcl_qp_attr_lp = (struct ibv_qp_attr *) malloc(sizeof(struct ibv_qp_attr)); 
 
237
    memset(endpoint->lcl_qp_attr_hp, 0, sizeof(struct ibv_qp_attr)); 
 
238
    memset(endpoint->lcl_qp_attr_lp, 0, sizeof(struct ibv_qp_attr)); 
 
239
 
 
240
    endpoint->rd_posted_hp = 0;
 
241
    endpoint->rd_posted_lp = 0;
 
242
 
 
243
    /* number of available send wqes */
 
244
    endpoint->sd_wqe_hp = mca_btl_openib_component.rd_num;
 
245
    endpoint->sd_wqe_lp = mca_btl_openib_component.rd_num;
 
246
 
 
247
    /* zero these out w/ initial posting, so that we start out w/
 
248
     * zero credits to return to peer
 
249
     */
 
250
    endpoint->rd_credits_hp = -(mca_btl_openib_component.rd_num + mca_btl_openib_component.rd_rsv);
 
251
    endpoint->rd_credits_lp = -(mca_btl_openib_component.rd_num + mca_btl_openib_component.rd_rsv);
 
252
    endpoint->sd_credits_hp = 0;
 
253
    endpoint->sd_credits_lp = 0;
 
254
 
 
255
    /* initialize the high and low priority tokens */
 
256
    endpoint->sd_tokens_hp = mca_btl_openib_component.rd_num;
 
257
    endpoint->sd_tokens_lp = mca_btl_openib_component.rd_num;
 
258
    endpoint->get_tokens = mca_btl_openib_component.ib_qp_ous_rd_atom;
 
259
 
 
260
    /* initialize RDMA eager related parts */
 
261
    endpoint->eager_recv_count = 0;
 
262
    memset(&endpoint->eager_rdma_remote, 0,
 
263
                    sizeof(mca_btl_openib_eager_rdma_remote_t));
 
264
    memset (&endpoint->eager_rdma_local, 0,
 
265
                    sizeof(mca_btl_openib_eager_rdma_local_t));
 
266
    OBJ_CONSTRUCT(&endpoint->eager_rdma_local.lock, opal_mutex_t);
 
267
 
 
268
    endpoint->rem_info.rem_qp_num_hp = 0; 
 
269
    endpoint->rem_info.rem_qp_num_lp = 0; 
 
270
    endpoint->rem_info.rem_lid = 0; 
 
271
    endpoint->rem_info.rem_psn_hp = 0;
 
272
    endpoint->rem_info.rem_psn_lp = 0; 
 
273
    endpoint->rem_info.rem_subnet = 0; 
 
274
}
 
275
 
 
276
/*
 
277
 * Destroy a endpoint
 
278
 *
 
279
 */
 
280
 
 
281
static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
 
282
{
 
283
}
 
284
 
 
285
/*
 
286
 * Send connection information to remote endpoint using OOB
 
287
 *
 
288
 */
 
289
 
 
290
static void mca_btl_openib_endpoint_send_cb(
 
291
    int status,
 
292
    orte_process_name_t* endpoint, 
 
293
    orte_buffer_t* buffer,
 
294
    orte_rml_tag_t tag, 
 
295
    void* cbdata)
 
296
{
 
297
    OBJ_RELEASE(buffer);
 
298
}
 
299
 
 
300
 
 
301
/*
 * Pack the local connection data and send it to the peer with a
 * non-blocking RML send.
 *
 * Packed in this order: high-priority QP number, low-priority QP number,
 * high-priority PSN, low-priority PSN (all ORTE_UINT32), then the port
 * LID and the subnet (both ORTE_UINT16).
 *
 * @param endpoint  endpoint whose local QP data is sent; lcl_qp_hp and
 *                  lcl_qp_lp are dereferenced and must already be set
 *
 * @return OMPI_SUCCESS once the send has been handed to the RML,
 *         ORTE_ERR_OUT_OF_RESOURCE if no buffer could be allocated,
 *         otherwise the error code of the failing pack or send call.
 *
 * The buffer is released here when packing fails (it has not been handed
 * to anyone yet); after a successful send it is released by
 * mca_btl_openib_endpoint_send_cb().
 */
static int mca_btl_openib_endpoint_send_connect_data(mca_btl_base_endpoint_t* endpoint)
{
    orte_buffer_t* buffer = OBJ_NEW(orte_buffer_t);
    int rc;
    if(NULL == buffer) {
         ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
         return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* pack the info in the send buffer */

    rc = orte_dss.pack(buffer, &endpoint->lcl_qp_hp->qp_num, 1, ORTE_UINT32);
    if(rc != ORTE_SUCCESS) {
        goto pack_failed;
    }

    rc = orte_dss.pack(buffer, &endpoint->lcl_qp_lp->qp_num, 1, ORTE_UINT32);
    if(rc != ORTE_SUCCESS) {
        goto pack_failed;
    }

    rc = orte_dss.pack(buffer, &endpoint->lcl_psn_hp, 1, ORTE_UINT32);
    if(rc != ORTE_SUCCESS) {
        goto pack_failed;
    }

    rc = orte_dss.pack(buffer, &endpoint->lcl_psn_lp, 1, ORTE_UINT32);
    if(rc != ORTE_SUCCESS) {
        goto pack_failed;
    }

    rc = orte_dss.pack(buffer, &endpoint->endpoint_btl->ib_port_attr->lid, 1, ORTE_UINT16);
    if(rc != ORTE_SUCCESS) {
        goto pack_failed;
    }

    rc = orte_dss.pack(buffer, &((mca_btl_openib_endpoint_t*) endpoint)->subnet, 1, ORTE_UINT16);
    if(rc != ORTE_SUCCESS) {
        goto pack_failed;
    }

    /* send to endpoint */
    rc = orte_rml.send_buffer_nb(&endpoint->endpoint_proc->proc_guid, buffer, ORTE_RML_TAG_DYNAMIC-1, 0,
         mca_btl_openib_endpoint_send_cb, NULL);


    BTL_VERBOSE(("Sending High Priority QP num = %d, Low Priority QP num = %d, LID = %d",
              endpoint->lcl_qp_hp->qp_num,
              endpoint->lcl_qp_lp->qp_num,
              endpoint->endpoint_btl->ib_port_attr->lid));

    if(rc < 0) {
        /* NOTE(review): the buffer is deliberately not released here; it is
         * not visible whether the RML still invokes the callback (which
         * releases it) after a failed send_buffer_nb() -- confirm. */
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    return OMPI_SUCCESS;

pack_failed:
    /* the buffer never reached the RML, so this function still owns it */
    ORTE_ERROR_LOG(rc);
    OBJ_RELEASE(buffer);
    return rc;
}
 
365
 
 
366
/*
 
367
 * Set remote connection info
 
368
 *   (from OOB connection) 
 
369
 *
 
370
 */
 
371
static int mca_btl_openib_endpoint_set_remote_info(mca_btl_base_endpoint_t* endpoint, mca_btl_openib_rem_info_t* rem_info)
 
372
{
 
373
    
 
374
    memcpy(&((mca_btl_openib_endpoint_t*) endpoint)->rem_info, rem_info, sizeof(mca_btl_openib_rem_info_t)); 
 
375
    
 
376
    BTL_VERBOSE(("Setting High Priority QP num = %d, Low Priority QP num %d,  LID = %d",
 
377
                 endpoint->rem_info.rem_qp_num_hp,
 
378
                 endpoint->rem_info.rem_qp_num_lp, 
 
379
                 endpoint->rem_info.rem_lid));
 
380
 
 
381
    return ORTE_SUCCESS;
 
382
 
 
383
}
 
384
 
 
385
 
 
386
 
 
387
/*
 
388
 * Start to connect to the endpoint. We send our Queue Pair
 
389
 * information over the TCP OOB communication mechanism.
 
390
 
 
391
 * On completion of our send, a send completion handler 
 
392
 * is called.
 
393
 *
 
394
 */
 
395
 
 
396
static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoint)
 
397
{
 
398
    int rc;
 
399
    mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) endpoint->endpoint_btl; 
 
400
    
 
401
    
 
402
    /* Create the High Priority Queue Pair */
 
403
    if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl, 
 
404
                                                               openib_btl->ib_pd, 
 
405
                                                               openib_btl->ib_cq_hp, 
 
406
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
 
407
                                                               openib_btl->srq_hp, 
 
408
#endif
 
409
                                                               endpoint->lcl_qp_attr_hp, 
 
410
                                                               &endpoint->lcl_qp_hp))) { 
 
411
        BTL_ERROR(("error creating queue pair, error code %d", rc)); 
 
412
        return rc;
 
413
    }
 
414
    srand48(getpid() * time(NULL));
 
415
    endpoint->lcl_psn_hp = lrand48() & 0xffffff; 
 
416
    
 
417
    /* Create the Low Priority Queue Pair */
 
418
    if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl, 
 
419
                                                               openib_btl->ib_pd, 
 
420
                                                               openib_btl->ib_cq_lp, 
 
421
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
 
422
                                                               openib_btl->srq_lp, 
 
423
#endif
 
424
                                                              endpoint->lcl_qp_attr_lp, 
 
425
                                                               &endpoint->lcl_qp_lp))) { 
 
426
        BTL_ERROR(("error creating queue pair, error code %d", rc)); 
 
427
        return rc;
 
428
    }
 
429
    endpoint->lcl_psn_lp = lrand48() & 0xffffff; 
 
430
 
 
431
    BTL_VERBOSE(("Initialized High Priority QP num = %d, Low Priority QP num = %d,  LID = %d",
 
432
              endpoint->lcl_qp_hp->qp_num,
 
433
              endpoint->lcl_qp_lp->qp_num, 
 
434
              openib_btl->ib_port_attr->lid)); 
 
435
 
 
436
    /* Send connection info over to remote endpoint */
 
437
    endpoint->endpoint_state = MCA_BTL_IB_CONNECTING;
 
438
    if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_send_connect_data(endpoint))) {
 
439
        BTL_ERROR(("error sending connect request, error code %d", rc)); 
 
440
        return rc;
 
441
    }
 
442
    return OMPI_SUCCESS;
 
443
}
 
444
 
 
445
/*
 
446
 * Reply to a `start - connect' message
 
447
 *
 
448
 */
 
449
static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t *endpoint, 
 
450
                                                       mca_btl_openib_rem_info_t *rem_info)
 
451
{
 
452
    int rc;
 
453
    mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) endpoint->endpoint_btl; 
 
454
        
 
455
        
 
456
    /* Create the High Priority Queue Pair */
 
457
    if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl, 
 
458
                                                               openib_btl->ib_pd, 
 
459
                                                               openib_btl->ib_cq_hp,  
 
460
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
 
461
                                                               openib_btl->srq_hp, 
 
462
#endif
 
463
 
 
464
                                                               endpoint->lcl_qp_attr_hp, 
 
465
                                                               &endpoint->lcl_qp_hp))) { 
 
466
        BTL_ERROR(("error creating queue pair, error code %d", rc)); 
 
467
        return rc;
 
468
    }
 
469
    srand48(getpid() * time(NULL));
 
470
    endpoint->lcl_psn_hp = lrand48() & 0xffffff;
 
471
    
 
472
    /* Create the Low Priority Queue Pair */
 
473
    if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl, 
 
474
                                                               openib_btl->ib_pd, 
 
475
                                                               openib_btl->ib_cq_lp, 
 
476
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
 
477
                                                               openib_btl->srq_lp, 
 
478
#endif
 
479
 
 
480
                                                               endpoint->lcl_qp_attr_lp, 
 
481
                                                               &endpoint->lcl_qp_lp))) { 
 
482
        BTL_ERROR(("error creating queue pair, error code %d", rc)); 
 
483
        return rc;
 
484
    }
 
485
    endpoint->lcl_psn_lp = lrand48() & 0xffffff; 
 
486
 
 
487
    BTL_VERBOSE(("Initialized High Priority QP num = %d, Low Priority QP num = %d,  LID = %d",
 
488
              endpoint->lcl_qp_hp->qp_num,
 
489
              endpoint->lcl_qp_lp->qp_num, 
 
490
              openib_btl->ib_port_attr->lid)); 
 
491
 
 
492
 
 
493
    /* Set the remote side info */
 
494
    mca_btl_openib_endpoint_set_remote_info(endpoint, rem_info);
 
495
    
 
496
    /* Connect to endpoint */
 
497
 
 
498
    rc = mca_btl_openib_endpoint_connect(endpoint);
 
499
    if(rc != OMPI_SUCCESS) {
 
500
        BTL_ERROR(("error in endpoint connect error code is %d", rc)); 
 
501
        return rc;
 
502
    }
 
503
 
 
504
    /* Send connection info over to remote endpoint */
 
505
    endpoint->endpoint_state = MCA_BTL_IB_CONNECT_ACK;
 
506
    if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_send_connect_data(endpoint))) {
 
507
        BTL_ERROR(("error in endpoint send connect request error code is %d", rc)); 
 
508
        return rc;
 
509
    }
 
510
    return OMPI_SUCCESS;
 
511
}
 
512
 
 
513
/* 
 
514
 *  endpoint is waiting ack to final connection establishment.. 
 
515
 */
 
516
 
 
517
static void mca_btl_openib_endpoint_waiting_ack(mca_btl_openib_endpoint_t *endpoint) { 
 
518
    endpoint->endpoint_state = MCA_BTL_IB_WAITING_ACK; 
 
519
}
 
520
 
 
521
/*
 
522
 * called when the openib has completed setup via the 
 
523
 *  OOB channel 
 
524
 */
 
525
 
 
526
static void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
 
527
{
 
528
    opal_list_item_t *frag_item;
 
529
    mca_btl_openib_frag_t *frag;
 
530
    mca_btl_openib_module_t* openib_btl; 
 
531
    
 
532
    endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;
 
533
    endpoint->endpoint_btl->poll_cq = true; 
 
534
    
 
535
    /**
 
536
     * The connection is correctly setup. Now we can decrease the event trigger.
 
537
     */
 
538
    opal_progress_event_decrement();
 
539
 
 
540
    /* While there are frags in the list,
 
541
     * process them */
 
542
 
 
543
    while(!opal_list_is_empty(&(endpoint->pending_send_frags))) {
 
544
        frag_item = opal_list_remove_first(&(endpoint->pending_send_frags));
 
545
        frag = (mca_btl_openib_frag_t *) frag_item;
 
546
        openib_btl = endpoint->endpoint_btl;
 
547
        /* We need to post this one */
 
548
        
 
549
        if(OMPI_SUCCESS !=  mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag))
 
550
            BTL_ERROR(("Error posting send")); 
 
551
    }
 
552
}
 
553
 
 
554
/*
 
555
 * Non blocking OOB recv callback.
 
556
 * Read incoming QP and other info, and if this endpoint
 
557
 * is trying to connect, reply with our QP info, 
 
558
 * otherwise try to modify QP's and establish
 
559
 * reliable connection
 
560
 *
 
561
 */
 
562
 
 
563
static void mca_btl_openib_endpoint_recv(
 
564
    int status,
 
565
    orte_process_name_t* endpoint, 
 
566
    orte_buffer_t* buffer,
 
567
    orte_rml_tag_t tag, 
 
568
    void* cbdata)
 
569
{
 
570
    mca_btl_openib_proc_t *ib_proc;
 
571
    mca_btl_openib_endpoint_t *ib_endpoint;
 
572
    int endpoint_state;
 
573
    int rc;
 
574
    uint32_t i; 
 
575
    size_t cnt = 1; 
 
576
    mca_btl_openib_rem_info_t rem_info; 
 
577
    
 
578
    /* start by unpacking data first so we know who is knocking at 
 
579
       our door */ 
 
580
    
 
581
    rc = orte_dss.unpack(buffer, &rem_info.rem_qp_num_hp, &cnt, ORTE_UINT32);
 
582
    if(ORTE_SUCCESS != rc) {
 
583
        ORTE_ERROR_LOG(rc);
 
584
        return;
 
585
    }
 
586
    rc = orte_dss.unpack(buffer, &rem_info.rem_qp_num_lp, &cnt, ORTE_UINT32);
 
587
    if(ORTE_SUCCESS != rc) {
 
588
        ORTE_ERROR_LOG(rc);
 
589
        return;
 
590
    }
 
591
    rc = orte_dss.unpack(buffer, &rem_info.rem_psn_hp, &cnt, ORTE_UINT32);
 
592
    if(ORTE_SUCCESS != rc) {
 
593
        ORTE_ERROR_LOG(rc);
 
594
        return;
 
595
    }rc = orte_dss.unpack(buffer, &rem_info.rem_psn_lp, &cnt, ORTE_UINT32);
 
596
    if(ORTE_SUCCESS != rc) {
 
597
        ORTE_ERROR_LOG(rc);
 
598
        return;
 
599
    }
 
600
    rc = orte_dss.unpack(buffer, &rem_info.rem_lid, &cnt, ORTE_UINT16);
 
601
    if(ORTE_SUCCESS != rc) {
 
602
        ORTE_ERROR_LOG(rc);
 
603
        return;
 
604
    }
 
605
    rc = orte_dss.unpack(buffer, &rem_info.rem_subnet, &cnt, ORTE_UINT16);
 
606
    if(ORTE_SUCCESS != rc) {
 
607
        ORTE_ERROR_LOG(rc);
 
608
        return;
 
609
    }
 
610
#if 0
 
611
    rc = orte_dss.unpack(buffer, &ib_endpoint->rdma_buf->r_key, &cnt, ORTE_UINT32); 
 
612
    if(rc != ORTE_SUCCESS) { 
 
613
        ORTE_ERROR_LOG(rc); 
 
614
        return rc; 
 
615
    }
 
616
    
 
617
    rc = orte_dss.unpack(buffer, &ib_endpoint->rdma_buf->rem_base, &cnt, ORTE_UINT32); 
 
618
    if(rc != ORTE_SUCCESS) { 
 
619
        ORTE_ERROR_LOG(rc); 
 
620
        return rc; 
 
621
    }
 
622
    
 
623
    rc = orte_dss.unpack(buffer, &ib_endpoint->rdma_buf->rem_size, &cnt, ORTE_UINT32); 
 
624
    if(rc != ORTE_SUCCESS) { 
 
625
        ORTE_ERROR_LOG(rc); 
 
626
        return rc; 
 
627
    }
 
628
 
 
629
    rc = orte_dss.unpack(buffer, &ib_endpoint->rdma_buf->rem_cnt, &cnt, ORTE_UINT32); 
 
630
    if(rc != ORTE_SUCCESS) { 
 
631
        ORTE_ERROR_LOG(rc); 
 
632
        return rc; 
 
633
    }
 
634
#endif
 
635
 
 
636
    BTL_VERBOSE(("Received High Priority QP num = %d, Low Priority QP num %d,  LID = %d",
 
637
                 rem_info.rem_qp_num_hp,
 
638
                 rem_info.rem_qp_num_lp, 
 
639
                 rem_info.rem_lid));
 
640
 
 
641
    for(ib_proc = (mca_btl_openib_proc_t*)
 
642
            opal_list_get_first(&mca_btl_openib_component.ib_procs);
 
643
            ib_proc != (mca_btl_openib_proc_t*)
 
644
            opal_list_get_end(&mca_btl_openib_component.ib_procs);
 
645
            ib_proc  = (mca_btl_openib_proc_t*)opal_list_get_next(ib_proc)) {
 
646
 
 
647
        if(orte_ns.compare(ORTE_NS_CMP_ALL, &ib_proc->proc_guid, endpoint) == 0) {
 
648
            bool found = false;
 
649
            
 
650
            /* Try to get the endpoint instance of this proc */
 
651
 
 
652
            for(i = 0; i < ib_proc->proc_endpoint_count; i++) { 
 
653
                mca_btl_openib_port_info_t port_info; 
 
654
                port_info = ib_proc->proc_ports[i]; 
 
655
                ib_endpoint = ib_proc->proc_endpoints[i]; 
 
656
                if(ib_endpoint->rem_info.rem_lid && 
 
657
                   ib_endpoint->rem_info.rem_lid  == rem_info.rem_lid) { 
 
658
                    /* we've seen them before! */ 
 
659
                    found = true; 
 
660
                    break;
 
661
                }
 
662
            }
 
663
            /* If we haven't seen this remote lid before then try to match on 
 
664
               endpoint */ 
 
665
            for(i = 0; !found && i < ib_proc->proc_endpoint_count; i++) { 
 
666
                mca_btl_openib_port_info_t port_info; 
 
667
                port_info = ib_proc->proc_ports[i]; 
 
668
                ib_endpoint = ib_proc->proc_endpoints[i]; 
 
669
                if(!ib_endpoint->rem_info.rem_lid && 
 
670
                   ib_endpoint->subnet  == rem_info.rem_subnet) { 
 
671
                    /* found a match based on subnet! */ 
 
672
                    found = true; 
 
673
                    break;
 
674
                }
 
675
            }
 
676
            /* try finding an open port, even if subnets  
 
677
               don't match
 
678
            */ 
 
679
            for(i = 0; !found && i < ib_proc->proc_endpoint_count; i++) { 
 
680
                mca_btl_openib_port_info_t port_info; 
 
681
                port_info = ib_proc->proc_ports[i]; 
 
682
                ib_endpoint = ib_proc->proc_endpoints[i]; 
 
683
                if(!ib_endpoint->rem_info.rem_lid) { 
 
684
                    /* found an unused end-point */ 
 
685
                    found = true; 
 
686
                    break;
 
687
                }
 
688
            }
 
689
            
 
690
            if(!found) { 
 
691
                BTL_ERROR(("can't find suitable endpoint for this peer\n")); 
 
692
                return; 
 
693
            }
 
694
           
 
695
            OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock); 
 
696
            endpoint_state = ib_endpoint->endpoint_state;
 
697
 
 
698
            /* Update status */
 
699
            switch(endpoint_state) {
 
700
            case MCA_BTL_IB_CLOSED :
 
701
                /* We had this connection closed before.
 
702
                 * The endpoint is trying to connect. Move the
 
703
                 * status of this connection to CONNECTING,
 
704
                 * and then reply with our QP information */
 
705
                
 
706
                if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_reply_start_connect(ib_endpoint, &rem_info))) {
 
707
                    BTL_ERROR(("error in endpoint reply start connect")); 
 
708
                    break;
 
709
                }
 
710
                                                                                     
 
711
                /** As long as we expect a message from the peer (in order to setup the connection)
 
712
                 * let the event engine pool the OOB events. Note: we increment it once peer active
 
713
                 * connection.
 
714
                 */
 
715
                opal_progress_event_increment();
 
716
                break;
 
717
                
 
718
            case MCA_BTL_IB_CONNECTING :
 
719
 
 
720
                mca_btl_openib_endpoint_set_remote_info(ib_endpoint, &rem_info);
 
721
                if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_connect(ib_endpoint))) {
 
722
                    BTL_ERROR(("endpoint connect error: %d", rc)); 
 
723
                    break;
 
724
                }
 
725
                    
 
726
                /* Setup state as awaiting ack from peer */
 
727
                mca_btl_openib_endpoint_waiting_ack(ib_endpoint);
 
728
 
 
729
                /* Send him an ack */
 
730
                mca_btl_openib_endpoint_send_connect_data(ib_endpoint);
 
731
                break;
 
732
                
 
733
            case MCA_BTL_IB_WAITING_ACK:
 
734
                mca_btl_openib_endpoint_connected(ib_endpoint);
 
735
                break; 
 
736
                
 
737
            case MCA_BTL_IB_CONNECT_ACK:
 
738
                mca_btl_openib_endpoint_send_connect_data(ib_endpoint);
 
739
                mca_btl_openib_endpoint_connected(ib_endpoint);
 
740
                break;
 
741
 
 
742
            case MCA_BTL_IB_CONNECTED :
 
743
                break;
 
744
            default :
 
745
                BTL_ERROR(("Invalid endpoint state %d", endpoint_state));
 
746
            }
 
747
            OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
 
748
            break;
 
749
        }
 
750
    }
 
751
}
 
752
 
 
753
/* 
 
754
 *  Post the OOB recv (for receiving the peers information)
 
755
 */ 
 
756
void mca_btl_openib_post_recv()
 
757
{
 
758
    
 
759
    orte_rml.recv_buffer_nb(
 
760
        ORTE_RML_NAME_ANY, 
 
761
        ORTE_RML_TAG_DYNAMIC-1, 
 
762
        ORTE_RML_PERSISTENT,
 
763
        mca_btl_openib_endpoint_recv,
 
764
        NULL);
 
765
}
 
766
 
 
767
 
 
768
/*
 
769
 * Attempt to send a fragment using a given endpoint. If the endpoint is not
 
770
 * connected, queue the fragment and start the connection as required.
 
771
 */
 
772
 
 
773
int mca_btl_openib_endpoint_send(
 
774
                             mca_btl_base_endpoint_t* endpoint,
 
775
                             mca_btl_openib_frag_t* frag
 
776
                             )
 
777
{
 
778
    int rc;
 
779
    bool call_progress = false;
 
780
    mca_btl_openib_module_t *openib_btl; 
 
781
    
 
782
    OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
 
783
    switch(endpoint->endpoint_state) {
 
784
        case MCA_BTL_IB_CONNECTING:
 
785
 
 
786
            BTL_VERBOSE(("Queing because state is connecting"));
 
787
            
 
788
            opal_list_append(&endpoint->pending_send_frags,
 
789
                    (opal_list_item_t *)frag);
 
790
            call_progress = true;
 
791
            rc = OMPI_SUCCESS;
 
792
            break;
 
793
 
 
794
        case MCA_BTL_IB_CONNECT_ACK:
 
795
        case MCA_BTL_IB_WAITING_ACK:
 
796
            BTL_VERBOSE(("Queuing because waiting for ack"));
 
797
 
 
798
            opal_list_append(&endpoint->pending_send_frags,
 
799
                    (opal_list_item_t *)frag);
 
800
            call_progress = true;
 
801
            rc = OMPI_SUCCESS;
 
802
            break;
 
803
 
 
804
        case MCA_BTL_IB_CLOSED:
 
805
 
 
806
            BTL_VERBOSE(("Connection to endpoint closed ... connecting ..."));
 
807
            opal_list_append(&endpoint->pending_send_frags,
 
808
                    (opal_list_item_t *)frag);
 
809
            rc = mca_btl_openib_endpoint_start_connect(endpoint);
 
810
            /**
 
811
             * As long as we expect a message from the peer (in order to setup the connection)
 
812
             * let the event engine pool the OOB events. Note: we increment it once peer active
 
813
             * connection.
 
814
             */
 
815
            opal_progress_event_increment();
 
816
            call_progress = true;
 
817
            break;
 
818
 
 
819
        case MCA_BTL_IB_FAILED:
 
820
 
 
821
            rc = OMPI_ERR_UNREACH;
 
822
            break;
 
823
 
 
824
        case MCA_BTL_IB_CONNECTED:
 
825
            {
 
826
                openib_btl = endpoint->endpoint_btl;
 
827
                BTL_VERBOSE(("Send to : %d, len : %lu, frag : %p", 
 
828
                              endpoint->endpoint_proc->proc_guid.vpid,
 
829
                              frag->sg_entry.length,
 
830
                              frag));
 
831
                rc = mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag); 
 
832
                break; 
 
833
            }
 
834
 
 
835
    default:
 
836
        rc = OMPI_ERR_UNREACH;
 
837
    }
 
838
    OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
 
839
    if(call_progress) opal_progress();
 
840
    return rc;
 
841
}
 
842
 
 
843
/*
 
844
 * Complete connection to endpoint.
 
845
 */
 
846
 
 
847
int mca_btl_openib_endpoint_connect(
 
848
    mca_btl_openib_endpoint_t *endpoint)
 
849
{
 
850
    int rc;
 
851
    mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) endpoint->endpoint_btl; 
 
852
 
 
853
    /* Connection establishment RC */
 
854
    rc = mca_btl_openib_endpoint_qp_init_query(
 
855
                                               openib_btl, 
 
856
                                               endpoint->lcl_qp_hp, 
 
857
                                               endpoint->lcl_qp_attr_hp, 
 
858
                                               endpoint->lcl_psn_hp, 
 
859
                                               endpoint->rem_info.rem_qp_num_hp, 
 
860
                                               endpoint->rem_info.rem_psn_hp, 
 
861
                                               endpoint->rem_info.rem_lid, 
 
862
                                               openib_btl->port_num
 
863
                                               ); 
 
864
    
 
865
    
 
866
    
 
867
    if(rc != OMPI_SUCCESS) {
 
868
        return rc;
 
869
    }
 
870
    rc = mca_btl_openib_endpoint_qp_init_query(
 
871
                                               openib_btl, 
 
872
                                               endpoint->lcl_qp_lp, 
 
873
                                               endpoint->lcl_qp_attr_lp, 
 
874
                                               endpoint->lcl_psn_lp, 
 
875
                                               endpoint->rem_info.rem_qp_num_lp, 
 
876
                                               endpoint->rem_info.rem_psn_lp, 
 
877
                                               endpoint->rem_info.rem_lid, 
 
878
                                               openib_btl->port_num
 
879
                                               ); 
 
880
    
 
881
    
 
882
    
 
883
    if(rc != OMPI_SUCCESS) {
 
884
        return rc;
 
885
    }
 
886
             
 
887
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ 
 
888
    if(mca_btl_openib_component.use_srq) { 
 
889
        MCA_BTL_OPENIB_POST_SRR_HIGH(openib_btl, 1); 
 
890
        MCA_BTL_OPENIB_POST_SRR_LOW(openib_btl, 1);         
 
891
    } else { 
 
892
#endif 
 
893
        MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(endpoint, 1); 
 
894
        MCA_BTL_OPENIB_ENDPOINT_POST_RR_LOW(endpoint, 1); 
 
895
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
 
896
    }
 
897
#endif 
 
898
    
 
899
    return OMPI_SUCCESS;
 
900
}
 
901
 
 
902
/* 
 
903
 * Create the queue pair note that this is just the initial 
 
904
 *  queue pair creation and we need to get the remote queue pair 
 
905
 *  info from the peer before the qp is usable, 
 
906
 */ 
 
907
 
 
908
int mca_btl_openib_endpoint_create_qp(
 
909
                                      mca_btl_openib_module_t* openib_btl, 
 
910
                                      struct ibv_pd* pd, 
 
911
                                      struct ibv_cq* cq, 
 
912
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
 
913
                                      struct ibv_srq* srq, 
 
914
#endif
 
915
                                      struct ibv_qp_attr* qp_attr,
 
916
                                      struct ibv_qp** qp
 
917
                                      )
 
918
{
 
919
    {
 
920
        struct ibv_qp* my_qp; 
 
921
        struct ibv_qp_init_attr qp_init_attr; 
 
922
 
 
923
        memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr)); 
 
924
 
 
925
        qp_init_attr.send_cq = cq; 
 
926
        qp_init_attr.recv_cq = cq; 
 
927
        qp_init_attr.cap.max_send_wr = mca_btl_openib_component.rd_num + 1;
 
928
        qp_init_attr.cap.max_recv_wr = mca_btl_openib_component.rd_num + mca_btl_openib_component.rd_rsv;
 
929
        qp_init_attr.cap.max_send_sge = mca_btl_openib_component.ib_sg_list_size;
 
930
        qp_init_attr.cap.max_recv_sge = mca_btl_openib_component.ib_sg_list_size;
 
931
        qp_init_attr.qp_type = IBV_QPT_RC; 
 
932
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ 
 
933
        if(mca_btl_openib_component.use_srq) { 
 
934
            qp_init_attr.srq = srq; 
 
935
        }
 
936
#endif
 
937
        my_qp = ibv_create_qp(pd, &qp_init_attr); 
 
938
    
 
939
        if(NULL == my_qp) { 
 
940
            BTL_ERROR(("error creating qp errno says %s", strerror(errno))); 
 
941
            return OMPI_ERROR; 
 
942
        }
 
943
        (*qp) = my_qp; 
 
944
        if(0 == (openib_btl->ib_inline_max = qp_init_attr.cap.max_inline_data)) {
 
945
            BTL_ERROR(("ibv_create_qp: returned 0 byte(s) for max inline data"));
 
946
        }
 
947
    }
 
948
    
 
949
    {
 
950
        qp_attr->qp_state = IBV_QPS_INIT; 
 
951
        qp_attr->pkey_index = mca_btl_openib_component.ib_pkey_ix; 
 
952
        qp_attr->port_num = openib_btl->port_num; 
 
953
        qp_attr->qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; 
 
954
        
 
955
        if(ibv_modify_qp((*qp), qp_attr, 
 
956
                         IBV_QP_STATE | 
 
957
                         IBV_QP_PKEY_INDEX | 
 
958
                         IBV_QP_PORT | 
 
959
                         IBV_QP_ACCESS_FLAGS )) { 
 
960
            BTL_ERROR(("error modifying qp to INIT errno says %s", strerror(errno))); 
 
961
            return OMPI_ERROR; 
 
962
        } 
 
963
    } 
 
964
 
 
965
    return OMPI_SUCCESS;
 
966
}
 
967
 
 
968
/* 
 
969
 * The queue pair has been created and we have received the remote 
 
970
 *  queue pair information from the peer so we init this queue pair 
 
971
 *  and are ready to roll. 
 
972
 */ 
 
973
int mca_btl_openib_endpoint_qp_init_query(
 
974
                                          mca_btl_openib_module_t* openib_btl, 
 
975
                                          struct ibv_qp* qp, 
 
976
                                          struct ibv_qp_attr* attr,
 
977
                                          uint32_t lcl_psn, 
 
978
                                          uint32_t rem_qp_num, 
 
979
                                          uint32_t rem_psn,  
 
980
                                          uint16_t rem_lid, 
 
981
                                          uint32_t port_num 
 
982
                                          )
 
983
     
 
984
     
 
985
{
 
986
    attr->qp_state = IBV_QPS_RTR; 
 
987
    attr->path_mtu = mca_btl_openib_component.ib_mtu; 
 
988
    attr->dest_qp_num = rem_qp_num; 
 
989
    attr->rq_psn = rem_psn; 
 
990
    attr->max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops; 
 
991
    attr->min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer; 
 
992
    attr->ah_attr.is_global = 0; 
 
993
    attr->ah_attr.dlid = rem_lid; 
 
994
    attr->ah_attr.sl = mca_btl_openib_component.ib_service_level; 
 
995
    attr->ah_attr.src_path_bits = mca_btl_openib_component.ib_src_path_bits; 
 
996
    attr->ah_attr.port_num = port_num; 
 
997
    
 
998
    if(ibv_modify_qp(qp, attr, 
 
999
                     IBV_QP_STATE              |
 
1000
                     IBV_QP_AV                 |
 
1001
                     IBV_QP_PATH_MTU           |
 
1002
                     IBV_QP_DEST_QPN           |
 
1003
                     IBV_QP_RQ_PSN             |
 
1004
                     IBV_QP_MAX_DEST_RD_ATOMIC |
 
1005
                     IBV_QP_MIN_RNR_TIMER)) {
 
1006
        BTL_ERROR(("error modifing QP to RTR errno says %s",  strerror(errno))); 
 
1007
        return OMPI_ERROR; 
 
1008
    }
 
1009
    attr->qp_state           = IBV_QPS_RTS;
 
1010
    attr->timeout            = mca_btl_openib_component.ib_timeout;
 
1011
    attr->retry_cnt      = mca_btl_openib_component.ib_retry_count;
 
1012
    attr->rnr_retry      = mca_btl_openib_component.ib_rnr_retry;
 
1013
    attr->sq_psn             = lcl_psn;
 
1014
    attr->max_rd_atomic  = mca_btl_openib_component.ib_max_rdma_dst_ops;
 
1015
    if (ibv_modify_qp(qp, attr,
 
1016
                      IBV_QP_STATE              |
 
1017
                      IBV_QP_TIMEOUT            |
 
1018
                      IBV_QP_RETRY_CNT          |
 
1019
                      IBV_QP_RNR_RETRY          |
 
1020
                      IBV_QP_SQ_PSN             |
 
1021
                      IBV_QP_MAX_QP_RD_ATOMIC)) {
 
1022
        BTL_ERROR(("error modifying QP to RTS errno says %s", strerror(errno))); 
 
1023
        return OMPI_ERROR;
 
1024
    }
 
1025
    return OMPI_SUCCESS;
 
1026
}
 
1027
 
 
1028
 
 
1029
/**
 
1030
 * Return control fragment.
 
1031
 */
 
1032
 
 
1033
static void mca_btl_openib_endpoint_credits_lp(
 
1034
    mca_btl_base_module_t* btl,
 
1035
    struct mca_btl_base_endpoint_t* endpoint,
 
1036
    struct mca_btl_base_descriptor_t* descriptor,
 
1037
    int status)
 
1038
{
 
1039
    int32_t credits;
 
1040
 
 
1041
    /* we don't acquire a wqe or token for credit message - so decrement */
 
1042
    OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,-1);
 
1043
 
 
1044
    /* check to see if there are addditional credits to return */
 
1045
    if ((credits = OPAL_THREAD_ADD32(&endpoint->sd_credits_lp,-1)) > 0) {
 
1046
        OPAL_THREAD_ADD32(&endpoint->sd_credits_lp,-credits);
 
1047
        if (endpoint->rd_credits_lp >= mca_btl_openib_component.rd_win &&
 
1048
            OPAL_THREAD_ADD32(&endpoint->sd_credits_lp,1) == 1) {
 
1049
            mca_btl_openib_endpoint_send_credits_lp(endpoint);
 
1050
        }
 
1051
    }
 
1052
    MCA_BTL_IB_FRAG_RETURN(((mca_btl_openib_module_t*)btl), 
 
1053
                           ((mca_btl_openib_frag_t*)descriptor));
 
1054
}
 
1055
 
 
1056
/**
 
1057
 * Return credits to peer
 
1058
 */
 
1059
                                                                                                                             
 
1060
void mca_btl_openib_endpoint_send_credits_lp(
 
1061
    mca_btl_openib_endpoint_t* endpoint)
 
1062
{
 
1063
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
 
1064
    mca_btl_openib_frag_t* frag;
 
1065
    struct ibv_send_wr* bad_wr; 
 
1066
    int rc;
 
1067
 
 
1068
    MCA_BTL_IB_FRAG_ALLOC_EAGER(openib_btl, frag, rc);
 
1069
    if(NULL == frag) {
 
1070
        BTL_ERROR(("error allocating fragment"));
 
1071
        return;
 
1072
    }
 
1073
 
 
1074
    frag->base.des_cbfunc = mca_btl_openib_endpoint_credits_lp;
 
1075
    frag->base.des_cbdata = NULL;
 
1076
    frag->endpoint = endpoint;
 
1077
 
 
1078
    frag->hdr->tag = MCA_BTL_TAG_BTL;
 
1079
    frag->hdr->credits = endpoint->rd_credits_lp;
 
1080
    OPAL_THREAD_ADD32(&endpoint->rd_credits_lp, -frag->hdr->credits);
 
1081
    ((mca_btl_openib_control_header_t *)frag->segment.seg_addr.pval)->type = MCA_BTL_OPENIB_CONTROL_NOOP;
 
1082
 
 
1083
    frag->wr_desc.sr_desc.opcode = IBV_WR_SEND;
 
1084
    frag->sg_entry.length = sizeof(mca_btl_openib_header_t) +
 
1085
                            sizeof(mca_btl_openib_control_header_t);
 
1086
    frag->sg_entry.addr = (unsigned long) frag->hdr; 
 
1087
    
 
1088
    if(frag->sg_entry.length <= openib_btl->ib_inline_max) { 
 
1089
        frag->wr_desc.sr_desc.send_flags = IBV_SEND_INLINE | IBV_SEND_SIGNALED; 
 
1090
    } else {
 
1091
        frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED; 
 
1092
    }
 
1093
 
 
1094
    if(ibv_post_send(endpoint->lcl_qp_lp,
 
1095
                         &frag->wr_desc.sr_desc,
 
1096
                         &bad_wr)) {
 
1097
        OPAL_THREAD_ADD32(&endpoint->sd_credits_lp, -1);
 
1098
        OPAL_THREAD_ADD32(&endpoint->rd_credits_lp, frag->hdr->credits);
 
1099
        MCA_BTL_IB_FRAG_RETURN(openib_btl, frag);
 
1100
        BTL_ERROR(("error posting send request errno %d says %s", strerror(errno)));
 
1101
        return;
 
1102
    }
 
1103
}
 
1104
 
 
1105
 
 
1106
/**
 
1107
 * Return control fragment.
 
1108
 */
 
1109
 
 
1110
static void mca_btl_openib_endpoint_credits_hp(
 
1111
    mca_btl_base_module_t* btl,
 
1112
    struct mca_btl_base_endpoint_t* endpoint,
 
1113
    struct mca_btl_base_descriptor_t* descriptor,
 
1114
    int status)
 
1115
{
 
1116
    int32_t credits;
 
1117
 
 
1118
    /* we don't acquire a wqe or token for credit message - so decrement */
 
1119
    OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,-1);
 
1120
 
 
1121
    /* check to see if there are addditional credits to return */
 
1122
    if ((credits = OPAL_THREAD_ADD32(&endpoint->sd_credits_hp,-1)) > 0) {
 
1123
        OPAL_THREAD_ADD32(&endpoint->sd_credits_hp,-credits);
 
1124
        if ((endpoint->rd_credits_hp >= mca_btl_openib_component.rd_win ||
 
1125
            endpoint->eager_rdma_local.credits >= mca_btl_openib_component.rd_win) &&
 
1126
            OPAL_THREAD_ADD32(&endpoint->sd_credits_hp,1) == 1) {
 
1127
            mca_btl_openib_endpoint_send_credits_hp(endpoint);
 
1128
        } 
 
1129
    }
 
1130
    MCA_BTL_IB_FRAG_RETURN(((mca_btl_openib_module_t*)btl), 
 
1131
                           ((mca_btl_openib_frag_t*)descriptor));
 
1132
}
 
1133
 
 
1134
/**
 
1135
 * Return credits to peer
 
1136
 */
 
1137
                                                                                                                             
 
1138
void mca_btl_openib_endpoint_send_credits_hp(
 
1139
    mca_btl_openib_endpoint_t* endpoint)
 
1140
{
 
1141
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
 
1142
    mca_btl_openib_frag_t* frag;
 
1143
    struct ibv_send_wr* bad_wr; 
 
1144
    int rc;
 
1145
 
 
1146
    MCA_BTL_IB_FRAG_ALLOC_EAGER(openib_btl, frag, rc);
 
1147
    if(NULL == frag) {
 
1148
        BTL_ERROR(("error allocating fragment"));
 
1149
        return;
 
1150
    }
 
1151
 
 
1152
    frag->base.des_cbfunc = mca_btl_openib_endpoint_credits_hp;
 
1153
    frag->base.des_cbdata = NULL;
 
1154
    frag->endpoint = endpoint;
 
1155
 
 
1156
    frag->hdr->tag = MCA_BTL_TAG_BTL;
 
1157
    frag->hdr->credits =
 
1158
        (endpoint->rd_credits_hp > 0) ? endpoint->rd_credits_hp : 0;
 
1159
    OPAL_THREAD_ADD32(&endpoint->rd_credits_hp, -frag->hdr->credits);
 
1160
    frag->hdr->rdma_credits = endpoint->eager_rdma_local.credits;
 
1161
    OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
 
1162
            -frag->hdr->rdma_credits);
 
1163
    ((mca_btl_openib_control_header_t *)frag->segment.seg_addr.pval)->type = MCA_BTL_OPENIB_CONTROL_NOOP;
 
1164
 
 
1165
    frag->wr_desc.sr_desc.opcode = IBV_WR_SEND;
 
1166
    frag->sg_entry.length = sizeof(mca_btl_openib_header_t) +
 
1167
                            sizeof(mca_btl_openib_control_header_t);
 
1168
    frag->sg_entry.addr = (unsigned long) frag->hdr; 
 
1169
    
 
1170
    if(frag->sg_entry.length <= openib_btl->ib_inline_max) { 
 
1171
        frag->wr_desc.sr_desc.send_flags = IBV_SEND_INLINE | IBV_SEND_SIGNALED; 
 
1172
    } else {
 
1173
        frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED; 
 
1174
    }
 
1175
 
 
1176
    if(ibv_post_send(endpoint->lcl_qp_hp,
 
1177
                         &frag->wr_desc.sr_desc,
 
1178
                         &bad_wr)) {
 
1179
        OPAL_THREAD_ADD32(&endpoint->sd_credits_hp, -1);
 
1180
        OPAL_THREAD_ADD32(&endpoint->rd_credits_hp, frag->hdr->credits);
 
1181
        MCA_BTL_IB_FRAG_RETURN(openib_btl, frag);
 
1182
        BTL_ERROR(("error posting send request errno %d says %s", errno, 
 
1183
                    strerror(errno)));
 
1184
        return;
 
1185
    }
 
1186
}
 
1187
 
 
1188
static void mca_btl_openib_endpoint_eager_rdma(
 
1189
    mca_btl_base_module_t* btl,
 
1190
    struct mca_btl_base_endpoint_t* endpoint,
 
1191
    struct mca_btl_base_descriptor_t* descriptor,
 
1192
    int status)
 
1193
{
 
1194
    MCA_BTL_IB_FRAG_RETURN(((mca_btl_openib_module_t*)btl),
 
1195
                           ((mca_btl_openib_frag_t*)descriptor));
 
1196
}
 
1197
 
 
1198
static int mca_btl_openib_endpoint_send_eager_rdma(
 
1199
    mca_btl_base_endpoint_t* endpoint)
 
1200
{
 
1201
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
 
1202
    mca_btl_openib_eager_rdma_header_t *rdma_hdr;
 
1203
    mca_btl_openib_frag_t* frag;
 
1204
    int rc;
 
1205
 
 
1206
    MCA_BTL_IB_FRAG_ALLOC_EAGER(openib_btl, frag, rc);
 
1207
    if(NULL == frag) {
 
1208
        BTL_ERROR(("error allocating fragment"));
 
1209
        return -1;
 
1210
    }
 
1211
 
 
1212
    frag->base.des_cbfunc = mca_btl_openib_endpoint_eager_rdma;
 
1213
    frag->base.des_cbdata = NULL;
 
1214
    frag->endpoint = endpoint;
 
1215
    frag->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY;
 
1216
 
 
1217
    frag->hdr->tag = MCA_BTL_TAG_BTL;
 
1218
    rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)frag->segment.seg_addr.pval;
 
1219
    rdma_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA;
 
1220
    rdma_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey;
 
1221
    rdma_hdr->rdma_start.pval = endpoint->eager_rdma_local.base.pval;
 
1222
    frag->segment.seg_len = sizeof(mca_btl_openib_eager_rdma_header_t);
 
1223
    if (mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag) !=
 
1224
            OMPI_SUCCESS) {
 
1225
        MCA_BTL_IB_FRAG_RETURN(openib_btl, frag);
 
1226
        BTL_ERROR(("Error sending RDMA buffer", strerror(errno)));
 
1227
        return -1;
 
1228
    }
 
1229
    return 0;
 
1230
}
 
1231
/* create RDMA buffer for eager messages */
 
1232
void mca_btl_openib_endpoint_connect_eager_rdma(
 
1233
        mca_btl_openib_endpoint_t* endpoint)
 
1234
{
 
1235
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
 
1236
    char *buf;
 
1237
    unsigned int i;
 
1238
 
 
1239
    OPAL_THREAD_LOCK(&endpoint->eager_rdma_local.lock);
 
1240
    if (endpoint->eager_rdma_local.base.pval)
 
1241
        goto unlock_rdma_local;
 
1242
 
 
1243
    buf = openib_btl->super.btl_mpool->mpool_alloc(openib_btl->super.btl_mpool,
 
1244
            openib_btl->eager_rdma_frag_size * 
 
1245
            mca_btl_openib_component.eager_rdma_num, 0, 0,
 
1246
            (mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg);
 
1247
 
 
1248
    if(!buf)
 
1249
       goto unlock_rdma_local;
 
1250
 
 
1251
    for(i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) {
 
1252
        ompi_free_list_item_t *item = (ompi_free_list_item_t *)(buf +
 
1253
                i*openib_btl->eager_rdma_frag_size);
 
1254
        item->user_data = endpoint->eager_rdma_local.reg;
 
1255
        OBJ_CONSTRUCT(item, mca_btl_openib_recv_frag_eager_t);
 
1256
        ((mca_btl_openib_frag_t*)item)->endpoint = endpoint;
 
1257
        ((mca_btl_openib_frag_t*)item)->type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA;
 
1258
    }
 
1259
 
 
1260
    OPAL_THREAD_LOCK(&openib_btl->eager_rdma_lock);
 
1261
    if(orte_pointer_array_add (&endpoint->eager_rdma_index,
 
1262
                openib_btl->eager_rdma_buffers, endpoint) < 0)
 
1263
           goto cleanup;
 
1264
 
 
1265
    endpoint->eager_rdma_local.base.pval = buf;
 
1266
    openib_btl->eager_rdma_buffers_count++;
 
1267
    if (mca_btl_openib_endpoint_send_eager_rdma(endpoint) == 0) {
 
1268
        OPAL_THREAD_UNLOCK(&openib_btl->eager_rdma_lock);
 
1269
        OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
 
1270
        return;
 
1271
    }
 
1272
 
 
1273
    openib_btl->eager_rdma_buffers_count--;
 
1274
    endpoint->eager_rdma_local.base.pval = NULL;
 
1275
    orte_pointer_array_set_item(openib_btl->eager_rdma_buffers,
 
1276
            endpoint->eager_rdma_index, NULL);
 
1277
 
 
1278
cleanup:
 
1279
    OPAL_THREAD_UNLOCK(&openib_btl->eager_rdma_lock);
 
1280
    openib_btl->super.btl_mpool->mpool_free(openib_btl->super.btl_mpool,
 
1281
            buf, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
 
1282
unlock_rdma_local:
 
1283
    OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
 
1284
}