2 * Unix SMB/CIFS implementation.
3 * Wrap Infiniband calls.
5 * Copyright (C) Sven Oehme <oehmes@de.ibm.com> 2006
7 * Major code contributions by Peter Somogyi <psomogyi@gamax.hu>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28 #include <sys/types.h>
29 #include <netinet/in.h>
30 #include <sys/socket.h>
32 #include <arpa/inet.h>
38 #include "lib/events/events.h"
39 #include "ibwrapper.h"
41 #include <rdma/rdma_cma.h>
43 #include "ibwrapper_internal.h"
44 #include "lib/util/dlinklist.h"
/* Size in bytes of the module-wide "last error" text buffer below. */
46 #define IBW_LASTERR_BUFSIZE 512
/* Text of the most recent error; the functions in this file fill it with sprintf(). */
47 static char ibw_lasterr[IBW_LASTERR_BUFSIZE];
/* Forward declarations: both functions are referenced by ibw_setup_cq_qp() before they are defined. */
49 static void ibw_event_handler_verbs(struct event_context *ev,
50 struct fd_event *fde, uint16_t flags, void *private_data);
51 static int ibw_fill_cq(struct ibw_conn *conn);
54 static int ibw_init_memory(struct ibw_conn *conn)
56 struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
57 struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
62 pconn->buf = memalign(pctx->pagesize, pctx->max_msg_size);
64 sprintf(ibw_lasterr, "couldn't allocate work buf\n");
67 pconn->mr = ibv_reg_mr(pctx->pd, pconn->buf,
68 pctx->qsize * pctx->max_msg_size, IBV_ACCESS_LOCAL_WRITE);
70 sprintf(ibw_lasterr, "couldn't allocate mr\n");
74 pconn->wr_index = talloc_size(pconn, pctx->qsize * sizeof(struct ibw_wr *));
76 for(i=0; i<pctx->qsize; i++) {
77 p = pconn->wr_index[i] = talloc_zero(pconn, struct ibw_wr);
78 p->msg = pconn->buf + (i * pctx->max_msg_size);
81 DLIST_ADD(pconn->wr_list_avail, p);
87 static int ibw_ctx_priv_destruct(struct ibw_ctx_priv *pctx)
90 ibv_dealloc_pd(pctx->pd);
95 if (pctx->cm_channel) {
96 rdma_destroy_event_channel(pctx->cm_channel);
97 pctx->cm_channel = NULL;
99 if (pctx->cm_channel_event) {
100 /* TODO: do we have to do this here? */
101 talloc_free(pctx->cm_channel_event);
102 pctx->cm_channel_event = NULL;
105 rdma_destroy_id(pctx->cm_id);
/*
 * talloc destructor for struct ibw_ctx; ibw_init() installs it with
 * talloc_set_destructor().
 */
112 static int ibw_ctx_destruct(struct ibw_ctx *ctx)
117 static int ibw_conn_priv_destruct(struct ibw_conn_priv *pconn)
119 /* free memory regions */
121 ibv_dereg_mr(pconn->mr);
125 free(pconn->buf); /* memalign-ed */
129 /* pconn->wr_index is freed by talloc */
130 /* pconn->wr_index[i] are freed by talloc */
133 if (pconn->cm_id->qp) {
134 ibv_destroy_qp(pconn->cm_id->qp);
135 pconn->cm_id->qp = NULL;
138 ibv_destroy_cq(pconn->cq);
141 if (pconn->verbs_channel) {
142 ibv_destroy_comp_channel(pconn->verbs_channel);
143 pconn->verbs_channel = NULL;
145 if (pconn->verbs_channel_event) {
146 /* TODO: do we have to do this here? */
147 talloc_free(pconn->verbs_channel_event);
148 pconn->verbs_channel_event = NULL;
151 rdma_destroy_id(pconn->cm_id);
/*
 * talloc destructor for struct ibw_conn (installed by ibw_conn_new()):
 * unlinks the connection from the conn_list of the context it belongs to.
 */
157 static int ibw_conn_destruct(struct ibw_conn *conn)
159 /* important here: ctx is a talloc _parent_ */
/* (ibw_conn_new() allocates conn as a talloc child of ctx) */
160 DLIST_REMOVE(conn->ctx->conn_list, conn);
164 static struct ibw_conn *ibw_conn_new(struct ibw_ctx *ctx)
166 struct ibw_conn *conn;
167 struct ibw_conn_priv *pconn;
169 conn = talloc_zero(ctx, struct ibw_conn);
171 talloc_set_destructor(conn, ibw_conn_destruct);
173 pconn = talloc_zero(ctx, struct ibw_conn_priv);
175 talloc_set_destructor(pconn, ibw_conn_priv_destruct);
179 DLIST_ADD(ctx->conn_list, conn);
/*
 * Set up the verbs objects of one connection: message buffers
 * (ibw_init_memory), a completion channel hooked into the event loop, a
 * completion queue of pctx->qsize entries with notification armed, and a
 * reliable-connected queue pair created on pconn->cm_id. Finally the receive
 * queue is pre-filled via ibw_fill_cq(), whose result is returned.
 *
 * On the failure paths visible here a message is written to ibw_lasterr.
 */
184 static int ibw_setup_cq_qp(struct ibw_conn *conn)
186 struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
187 struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
188 struct ibv_qp_init_attr init_attr;
/* buffers + memory region first: ibw_fill_cq() below needs pconn->mr */
192 if (ibw_init_memory(conn))
/* completion channel: its fd is what the event loop watches */
196 pconn->verbs_channel = ibv_create_comp_channel(pconn->cm_id->verbs);
197 if (!pconn->verbs_channel) {
198 sprintf(ibw_lasterr, "ibv_create_comp_channel failed %d\n", errno);
201 DEBUG(10, ("created channel %p\n", pconn->verbs_channel));
/* completions are dispatched to ibw_event_handler_verbs() with conn as private data */
/* NOTE(review): the result of event_add_fd() is not checked on the lines visible here */
203 pconn->verbs_channel_event = event_add_fd(pctx->ectx, conn,
204 pconn->verbs_channel->fd, EVENT_FD_READ, ibw_event_handler_verbs, conn);
/* one CQ, sized for all work requests (qsize), shared by send and receive */
207 pconn->cq = ibv_create_cq(pconn->cm_id->verbs, pctx->qsize,
208 conn, pconn->verbs_channel, 0);
209 if (pconn->cq==NULL) {
210 sprintf(ibw_lasterr, "ibv_create_cq failed\n");
/* arm the CQ so the next completion raises an event on the channel */
214 rc = ibv_req_notify_cq(pconn->cq, 0);
216 sprintf(ibw_lasterr, "ibv_req_notify_cq failed with %d\n", rc);
/* queue pair: RC type, one scatter/gather element per request, same CQ for both directions */
221 memset(&init_attr, 0, sizeof(init_attr));
222 init_attr.cap.max_send_wr = pctx->opts.max_send_wr;
223 init_attr.cap.max_recv_wr = pctx->opts.max_recv_wr;
224 init_attr.cap.max_recv_sge = 1;
225 init_attr.cap.max_send_sge = 1;
226 init_attr.qp_type = IBV_QPT_RC;
227 init_attr.send_cq = pconn->cq;
228 init_attr.recv_cq = pconn->cq;
230 rc = rdma_create_qp(pconn->cm_id, pctx->pd, &init_attr);
232 sprintf(ibw_lasterr, "rdma_create_qp failed with %d\n", rc);
235 /* else: the resulting QP is in pconn->cm_id->qp */
/* post the initial receive work requests */
237 return ibw_fill_cq(conn);
/*
 * Post one receive work request: take the first free ibw_wr from
 * pconn->wr_list_avail, move it to pconn->wr_list_used and post its buffer
 * (max_msg_size bytes, local key of pconn->mr) with ibv_post_recv().
 *
 * Failures are recorded in ibw_lasterr and logged at debug level 0.
 *
 * NOTE(review): DEBUG(0, (ibw_lasterr)) passes the buffer as the format
 * string; DEBUG(0, ("%s", ibw_lasterr)) would be the safer form (the cm
 * event handler already logs it that way).
 */
240 static int ibw_refill_cq_recv(struct ibw_conn *conn)
242 struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
243 struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
/* scatter/gather entry; .addr is filled in once a free buffer has been picked */
245 struct ibv_sge list = {
246 .addr = (uintptr_t) NULL,
247 .length = pctx->max_msg_size,
248 .lkey = pconn->mr->lkey
250 struct ibv_recv_wr wr = {
255 struct ibv_recv_wr *bad_wr;
/* head of the free list, NULL when every work request is in use */
256 struct ibw_wr *p = pconn->wr_list_avail;
259 sprintf(ibw_lasterr, "out of wr_list_avail");
260 DEBUG(0, (ibw_lasterr));
/* mark the work request as in use and point the sge at its buffer */
263 DLIST_REMOVE(pconn->wr_list_avail, p);
264 DLIST_ADD(pconn->wr_list_used, p);
265 list.addr = (uintptr_t) p->msg;
268 rc = ibv_post_recv(pconn->cm_id->qp, &wr, &bad_wr);
270 sprintf(ibw_lasterr, "ibv_post_recv failed with %d\n", rc);
271 DEBUG(0, (ibw_lasterr));
/*
 * Pre-post the receive queue: loops pctx->opts.max_recv_wr times, each time
 * moving the head of pconn->wr_list_avail to pconn->wr_list_used and posting
 * its buffer with ibv_post_recv(). Called at the end of ibw_setup_cq_qp().
 *
 * Failures are recorded in ibw_lasterr and logged at debug level 0.
 *
 * NOTE(review): the loop body repeats ibw_refill_cq_recv(); calling that
 * helper in the loop would remove the duplication.
 * NOTE(review): the loop ends only on i==0 - presumably max_recv_wr is never
 * negative; confirm against ibw_process_init_attrs().
 */
278 static int ibw_fill_cq(struct ibw_conn *conn)
280 struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
281 struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
/* scatter/gather entry reused for every post; only .addr changes per iteration */
283 struct ibv_sge list = {
284 .addr = (uintptr_t) NULL,
285 .length = pctx->max_msg_size,
286 .lkey = pconn->mr->lkey
288 struct ibv_recv_wr wr = {
293 struct ibv_recv_wr *bad_wr;
296 for(i = pctx->opts.max_recv_wr; i!=0; i--) {
297 p = pconn->wr_list_avail;
299 sprintf(ibw_lasterr, "out of wr_list_avail");
300 DEBUG(0, (ibw_lasterr));
/* mark the work request as in use and point the sge at its buffer */
303 DLIST_REMOVE(pconn->wr_list_avail, p);
304 DLIST_ADD(pconn->wr_list_used, p);
305 list.addr = (uintptr_t) p->msg;
308 rc = ibv_post_recv(pconn->cm_id->qp, &wr, &bad_wr);
310 sprintf(ibw_lasterr, "ibv_post_recv failed with %d\n", rc);
311 DEBUG(0, (ibw_lasterr));
/*
 * Client side, called from the CM event handler once the route is resolved:
 * creates the CQ/QP for the connection (ibw_setup_cq_qp) and then starts the
 * connection handshake with rdma_connect() on cma_id.
 *
 * A failing rdma_connect() is recorded in ibw_lasterr.
 */
319 static int ibw_manage_connect(struct ibw_conn *conn, struct rdma_cm_id *cma_id)
321 struct rdma_conn_param conn_param;
324 rc = ibw_setup_cq_qp(conn);
/* connection parameters: everything zero except the three fields set below */
329 memset(&conn_param, 0, sizeof conn_param);
330 conn_param.responder_resources = 1;
331 conn_param.initiator_depth = 1;
332 conn_param.retry_count = 10;
334 rc = rdma_connect(cma_id, &conn_param);
336 sprintf(ibw_lasterr, "rdma_connect error %d\n", rc);
/*
 * fd-event callback for the RDMA CM event channel; ibw_init() registers it
 * with the ibw_ctx as private_data.
 *
 * Fetches a CM event with rdma_get_cm_event(), dispatches on event->event and
 * acknowledges the event with rdma_ack_cm_event(). State changes are
 * reported to the user through pctx->connstate_func(ctx, conn), where the
 * argument that did not change is passed as NULL.
 *
 * NOTE(review): the ESTABLISHED debug message prints cma_id->context through
 * an (unsigned int) cast with %u; that truncates the pointer where pointers
 * are wider than int - %p would be the matching conversion.
 * NOTE(review): the error handling at the end reads cma_id->context; confirm
 * cma_id cannot still be NULL there when rdma_get_cm_event() itself failed.
 * NOTE(review): confirm that the event is also acknowledged on the error
 * paths.
 */
341 static void ibw_event_handler_cm(struct event_context *ev,
342 struct fd_event *fde, uint16_t flags, void *private_data)
345 struct ibw_ctx *ctx = talloc_get_type(private_data, struct ibw_ctx);
346 struct ibw_ctx_priv *pctx = talloc_get_type(ctx->internal, struct ibw_ctx_priv);
347 struct ibw_conn *conn = NULL;
348 struct ibw_conn_priv *pconn = NULL;
349 struct rdma_cm_id *cma_id = NULL;
350 struct rdma_cm_event *event = NULL;
354 rc = rdma_get_cm_event(pctx->cm_channel, &event);
356 ctx->state = IBWS_ERROR;
357 sprintf(ibw_lasterr, "rdma_get_cm_event error %d\n", rc);
/* "parent" = the context's own cm_id, "child" = the id of a single connection */
362 DEBUG(10, ("cma_event type %d cma_id %p (%s)\n", event->event, cma_id,
363 (cma_id == pctx->cm_id) ? "parent" : "child"));
365 switch (event->event) {
/* client side, step 1: address resolved -> resolve the route (2000 ms timeout) */
366 case RDMA_CM_EVENT_ADDR_RESOLVED:
367 /* continuing from ibw_connect ... */
368 rc = rdma_resolve_route(cma_id, 2000);
370 sprintf(ibw_lasterr, "rdma_resolve_route error %d\n", rc);
373 /* continued at RDMA_CM_EVENT_ROUTE_RESOLVED */
/* client side, step 2: route resolved -> set up the QP and connect */
376 case RDMA_CM_EVENT_ROUTE_RESOLVED:
377 /* after RDMA_CM_EVENT_ADDR_RESOLVED: */
378 assert(cma_id->context!=NULL);
379 conn = talloc_get_type(cma_id->context, struct ibw_conn);
381 rc = ibw_manage_connect(conn, cma_id);
/* server side: a peer wants to connect; create a conn around the new cm_id and ask the user */
387 case RDMA_CM_EVENT_CONNECT_REQUEST:
388 ctx->state = IBWS_CONNECT_REQUEST;
389 conn = ibw_conn_new(ctx);
390 pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
391 pconn->cm_id = cma_id; /* !!! event will be freed but id not */
392 cma_id->context = (void *)conn;
393 DEBUG(10, ("pconn->cm_id %p\n", pconn->cm_id));
395 conn->state = IBWC_INIT;
396 pctx->connstate_func(ctx, conn);
398 /* continued at ibw_accept when invoked by the func above */
399 if (!pconn->is_accepted) {
401 DEBUG(10, ("pconn->cm_id %p wasn't accepted\n", pconn->cm_id));
403 if (ibw_setup_cq_qp(conn))
407 /* TODO: clarify whether if it's needed by upper layer: */
408 ctx->state = IBWS_READY;
409 pctx->connstate_func(ctx, NULL);
411 /* NOTE: more requests can arrive until RDMA_CM_EVENT_ESTABLISHED ! */
/* connection is up: mark it connected and tell the user */
414 case RDMA_CM_EVENT_ESTABLISHED:
415 /* expected after ibw_accept and ibw_connect[not directly] */
416 DEBUG(0, ("ESTABLISHED (conn: %u)\n", (unsigned int)cma_id->context));
417 conn = talloc_get_type(cma_id->context, struct ibw_conn);
418 assert(conn!=NULL); /* important assumption */
420 /* client conn is up */
421 conn->state = IBWC_CONNECTED;
423 /* both ctx and conn have changed */
424 pctx->connstate_func(ctx, conn);
/* the failure events below share one message carrying event type and status */
427 case RDMA_CM_EVENT_ADDR_ERROR:
428 case RDMA_CM_EVENT_ROUTE_ERROR:
429 case RDMA_CM_EVENT_CONNECT_ERROR:
430 case RDMA_CM_EVENT_UNREACHABLE:
431 case RDMA_CM_EVENT_REJECTED:
432 sprintf(ibw_lasterr, "cma event %d, error %d\n", event->event, event->status);
/* disconnect of a single connection (child id) or of the context's own id */
435 case RDMA_CM_EVENT_DISCONNECTED:
436 if (cma_id!=pctx->cm_id) {
437 DEBUG(0, ("client DISCONNECT event\n"));
438 conn = talloc_get_type(cma_id->context, struct ibw_conn);
439 conn->state = IBWC_DISCONNECTED;
440 pctx->connstate_func(NULL, conn);
444 /* if we are the last... */
445 if (ctx->conn_list==NULL)
446 rdma_disconnect(pctx->cm_id);
448 DEBUG(0, ("server DISCONNECT event\n"));
449 ctx->state = IBWS_STOPPED; /* ??? TODO: try it... */
450 /* talloc_free(ctx) should be called within or after this func */
451 pctx->connstate_func(ctx, NULL);
455 case RDMA_CM_EVENT_DEVICE_REMOVAL:
456 sprintf(ibw_lasterr, "cma detected device removal!\n");
460 sprintf(ibw_lasterr, "unknown event %d\n", event->event);
/* hand the event back to librdmacm */
464 if ((rc=rdma_ack_cm_event(event))) {
465 sprintf(ibw_lasterr, "rdma_ack_cm_event failed with %d\n", rc);
/* error reporting: log ibw_lasterr, then flag either the connection or the context as failed */
471 DEBUG(0, ("cm event handler: %s", ibw_lasterr));
472 if (cma_id!=pctx->cm_id) {
473 conn = talloc_get_type(cma_id->context, struct ibw_conn);
475 conn->state = IBWC_ERROR;
476 pctx->connstate_func(NULL, conn);
478 ctx->state = IBWS_ERROR;
479 pctx->connstate_func(ctx, NULL);
/*
 * fd-event callback for a connection's verbs completion channel;
 * ibw_setup_cq_qp() registers it with the ibw_conn as private_data.
 *
 * Polls one work completion from pconn->cq and handles it by opcode. For the
 * send and receive completions the work request is looked up through
 * wc.wr_id in pconn->wr_index[] and moved from wr_list_used back to
 * wr_list_avail; received data is handed to pctx->receive_func() and a new
 * receive is posted with ibw_refill_cq_recv().
 *
 * The error handling at the end logs ibw_lasterr, sets the connection state
 * to IBWC_ERROR and calls connstate_func(NULL, conn).
 *
 * NOTE(review): wc.wr_id is range-checked with assert() only, so the check
 * disappears when NDEBUG is defined.
 */
483 static void ibw_event_handler_verbs(struct event_context *ev,
484 struct fd_event *fde, uint16_t flags, void *private_data)
486 struct ibw_conn *conn = talloc_get_type(private_data, struct ibw_conn);
487 struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
488 struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
/* fetch at most one completion */
493 rc = ibv_poll_cq(pconn->cq, 1, &wc);
495 sprintf(ibw_lasterr, "ibv_poll_cq error %d\n", rc);
499 sprintf(ibw_lasterr, "cq completion failed status %d\n",
/* send finished: the buffer can be reused, return its work request to the free list */
509 DEBUG(10, ("send completion\n"));
510 assert(pconn->cm_id->qp->qp_num==wc.qp_num);
511 assert(wc.wr_id < pctx->qsize);
512 p = pconn->wr_index[wc.wr_id];
513 DLIST_REMOVE(pconn->wr_list_used, p);
514 DLIST_ADD(pconn->wr_list_avail, p);
518 case IBV_WC_RDMA_WRITE:
519 DEBUG(10, ("rdma write completion\n"));
522 case IBV_WC_RDMA_READ:
523 DEBUG(10, ("rdma read completion\n"));
/* receive finished: locate the work request by its id */
530 assert(pconn->cm_id->qp->qp_num==wc.qp_num);
531 assert(wc.wr_id < pctx->qsize);
532 p = pconn->wr_index[wc.wr_id];
/* the work request goes back to the free list before the user callback runs */
534 DLIST_REMOVE(pconn->wr_list_used, p);
535 DLIST_ADD(pconn->wr_list_avail, p);
537 DEBUG(10, ("recv completion\n"));
538 assert(wc.byte_len <= pctx->max_msg_size);
/* deliver the payload, then post a replacement receive */
540 pctx->receive_func(conn, p->msg, wc.byte_len);
541 if (ibw_refill_cq_recv(conn))
547 sprintf(ibw_lasterr, "unknown completion %d\n", wc.opcode);
/* error reporting: log, mark the connection as failed and notify the user */
553 DEBUG(0, (ibw_lasterr));
554 conn->state = IBWC_ERROR;
555 pctx->connstate_func(NULL, conn);
558 static int ibw_process_init_attrs(struct ibw_initattr *attr, int nattr, struct ibw_opts *opts)
561 const char *name, *value;
563 opts->max_send_wr = 256;
564 opts->max_recv_wr = 1024;
566 for(i=0; i<nattr; i++) {
568 value = attr[i].value;
570 assert(name!=NULL && value!=NULL);
571 if (strcmp(name, "max_send_wr")==0)
572 opts->max_send_wr = atoi(value);
573 else if (strcmp(name, "max_recv_wr")==0)
574 opts->max_recv_wr = atoi(value);
576 sprintf(ibw_lasterr, "ibw_init: unknown name %s\n", name);
/*
 * Create and initialise an ibw context.
 *
 * Allocates the ibw_ctx as a talloc root together with its private
 * ibw_ctx_priv, stores the user callbacks, parses the name/value attributes
 * (ibw_process_init_attrs), creates the RDMA CM event channel and hooks its
 * fd into the event loop (ibw_event_handler_cm), creates the context's
 * cm_id, allocates a protection domain and derives
 * qsize = max_send_wr + max_recv_wr.
 *
 * attr, nattr:   option name/value pairs
 * ibw_connstate: stored as pctx->connstate_func
 * ibw_receive:   stored as pctx->receive_func
 * ectx:          event context - presumably what ends up in pctx->ectx
 *
 * The failure paths visible here leave their reason in ibw_lasterr.
 *
 * NOTE(review): ibv_alloc_pd() is given pctx->cm_id->verbs directly after
 * rdma_create_id(); confirm that verbs is already valid for an id that has
 * not been bound to a device yet.
 */
583 struct ibw_ctx *ibw_init(struct ibw_initattr *attr, int nattr,
585 ibw_connstate_fn_t ibw_connstate,
586 ibw_receive_fn_t ibw_receive,
587 struct event_context *ectx,
590 struct ibw_ctx *ctx = talloc_zero(NULL, struct ibw_ctx);
591 struct ibw_ctx_priv *pctx;
594 /* initialize basic data structures */
595 memset(ibw_lasterr, 0, IBW_LASTERR_BUFSIZE);
598 ibw_lasterr[0] = '\0';
599 talloc_set_destructor(ctx, ibw_ctx_destruct);
600 ctx->ctx_userdata = ctx_userdata;
/* the private part is a talloc child of ctx */
602 pctx = talloc_zero(ctx, struct ibw_ctx_priv);
603 talloc_set_destructor(pctx, ibw_ctx_priv_destruct);
604 ctx->internal = (void *)pctx;
607 pctx->connstate_func = ibw_connstate;
608 pctx->receive_func = ibw_receive;
612 /* process attributes */
613 if (ibw_process_init_attrs(attr, nattr, &pctx->opts))
/* connection manager: event channel, fd event and the context's own id */
617 pctx->cm_channel = rdma_create_event_channel();
618 if (!pctx->cm_channel) {
619 sprintf(ibw_lasterr, "rdma_create_event_channel error %d\n", errno);
623 pctx->cm_channel_event = event_add_fd(pctx->ectx, pctx,
624 pctx->cm_channel->fd, EVENT_FD_READ, ibw_event_handler_cm, ctx);
626 rc = rdma_create_id(pctx->cm_channel, &pctx->cm_id, ctx, RDMA_PS_TCP);
629 sprintf(ibw_lasterr, "rdma_create_id error %d\n", rc);
632 DEBUG(10, ("created cm_id %p\n", pctx->cm_id));
/* protection domain used by every connection of this context */
635 pctx->pd = ibv_alloc_pd(pctx->cm_id->verbs);
637 sprintf(ibw_lasterr, "ibv_alloc_pd failed %d\n", errno);
640 DEBUG(10, ("created pd %p\n", pctx->pd));
/* sizing: buffer alignment, number of work requests, size of one message slot */
642 pctx->pagesize = sysconf(_SC_PAGESIZE);
643 pctx->qsize = pctx->opts.max_send_wr + pctx->opts.max_recv_wr;
644 pctx->max_msg_size = max_msg_size;
647 /* don't put code here */
/* NOTE(review): ibw_lasterr is used as the format string here; "%s" would be safer */
649 DEBUG(0, (ibw_lasterr));
657 int ibw_stop(struct ibw_ctx *ctx)
661 for(p=ctx->conn_list; p!=NULL; p=p->next) {
662 if (ctx->state==IBWC_ERROR || ctx->state==IBWC_CONNECTED) {
663 if (ibw_disconnect(p))
/*
 * Bind the context's cm_id to the local address my_addr with
 * rdma_bind_addr(). A failure is recorded in ibw_lasterr and logged at
 * debug level 0.
 *
 * NOTE(review): ctx->internal is cast directly here, whereas the rest of the
 * file goes through talloc_get_type().
 */
671 int ibw_bind(struct ibw_ctx *ctx, struct sockaddr_in *my_addr)
673 struct ibw_ctx_priv *pctx = (struct ibw_ctx_priv *)ctx->internal;
676 rc = rdma_bind_addr(pctx->cm_id, (struct sockaddr *) my_addr);
678 sprintf(ibw_lasterr, "rdma_bind_addr error %d\n", rc);
679 DEBUG(0, (ibw_lasterr));
682 DEBUG(10, ("rdma_bind_addr successful\n"));
/*
 * Put the context's cm_id into listening mode with rdma_listen(), using the
 * given backlog. A failure is recorded in ibw_lasterr and logged at debug
 * level 0.
 */
687 int ibw_listen(struct ibw_ctx *ctx, int backlog)
689 struct ibw_ctx_priv *pctx = talloc_get_type(ctx->internal, struct ibw_ctx_priv);
692 DEBUG(10, ("rdma_listen...\n"));
693 rc = rdma_listen(pctx->cm_id, backlog);
695 sprintf(ibw_lasterr, "rdma_listen failed: %d\n", rc);
696 DEBUG(0, (ibw_lasterr));
/*
 * Accept an incoming connection: stores conn_userdata in the connection,
 * calls rdma_accept() on the connection's cm_id and marks the connection as
 * accepted (pconn->is_accepted), which the CM event handler checks after the
 * user's connstate callback returns.
 *
 * A failing rdma_accept() is recorded in ibw_lasterr and logged at debug
 * level 0.
 *
 * NOTE(review): the CM event handler calls ibw_setup_cq_qp() only after the
 * connstate callback has returned, i.e. after this rdma_accept(); confirm
 * that accepting before the QP exists is intended.
 */
703 int ibw_accept(struct ibw_ctx *ctx, struct ibw_conn *conn, void *conn_userdata)
705 struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
706 struct rdma_conn_param conn_param;
709 conn->conn_userdata = conn_userdata;
/* connection parameters: everything zero except the two fields set below */
711 memset(&conn_param, 0, sizeof(struct rdma_conn_param));
712 conn_param.responder_resources = 1;
713 conn_param.initiator_depth = 1;
714 rc = rdma_accept(pconn->cm_id, &conn_param);
716 sprintf(ibw_lasterr, "rdma_accept failed %d\n", rc);
717 DEBUG(0, (ibw_lasterr));
721 pconn->is_accepted = 1;
723 /* continued at RDMA_CM_EVENT_ESTABLISHED */
728 int ibw_connect(struct ibw_ctx *ctx, struct sockaddr_in *serv_addr, void *conn_userdata)
730 struct ibw_ctx_priv *pctx = talloc_get_type(ctx->internal, struct ibw_ctx_priv);
731 struct ibw_conn *conn = NULL;
732 struct ibw_conn_priv *pconn = NULL;
735 conn = ibw_conn_new(ctx);
736 conn->conn_userdata = conn_userdata;
737 pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
739 rc = rdma_create_id(pctx->cm_channel, &pconn->cm_id, conn, RDMA_PS_TCP);
742 sprintf(ibw_lasterr, "rdma_create_id error %d\n", rc);
746 rc = rdma_resolve_addr(pconn->cm_id, NULL, (struct sockaddr *) &serv_addr, 2000);
748 sprintf(ibw_lasterr, "rdma_resolve_addr error %d\n", rc);
749 DEBUG(0, (ibw_lasterr));
753 /* continued at RDMA_CM_EVENT_ADDR_RESOLVED */
758 int ibw_disconnect(struct ibw_conn *conn)
761 struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
763 rc = rdma_disconnect(pctx->cm_id);
765 sprintf(ibw_lasterr, "ibw_disconnect failed with %d", rc);
766 DEBUG(0, (ibw_lasterr));
770 /* continued at RDMA_CM_EVENT_DISCONNECTED */
/*
 * Reserve a send buffer: takes the first free work request from
 * pconn->wr_list_avail, moves it to pconn->wr_list_used and returns its
 * message buffer through *buf. If no work request is free, "insufficient wr
 * chunks" is written to ibw_lasterr.
 *
 * key is an output parameter as well; ibw_send() expects the ibw_wr itself
 * as its key argument.
 */
775 int ibw_alloc_send_buf(struct ibw_conn *conn, void **buf, void **key)
777 struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
/* head of the free list, NULL when every work request is in use */
778 struct ibw_wr *p = pconn->wr_list_avail;
781 sprintf(ibw_lasterr, "insufficient wr chunks\n");
785 DLIST_REMOVE(pconn->wr_list_avail, p);
786 DLIST_ADD(pconn->wr_list_used, p);
788 *buf = (void *)p->msg;
/*
 * Post a signaled IBV_WR_SEND for a buffer obtained from
 * ibw_alloc_send_buf().
 *
 * buf: the message buffer; must be the buffer belonging to key
 * key: the struct ibw_wr that owns buf
 * n:   size argument; must not exceed pctx->max_msg_size
 *
 * Returns the result of ibv_post_send(). The work request is returned to the
 * free list by ibw_event_handler_verbs() when the send completion arrives.
 *
 * NOTE(review): both preconditions are enforced with assert() only, and p is
 * already dereferenced in the initializer of list before they are checked.
 */
794 int ibw_send(struct ibw_conn *conn, void *buf, void *key, int n)
796 struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
797 struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
798 struct ibw_wr *p = talloc_get_type(key, struct ibw_wr);
799 struct ibv_sge list = {
800 .addr = (uintptr_t) p->msg,
802 .lkey = pconn->mr->lkey
804 struct ibv_send_wr wr = {
808 .opcode = IBV_WR_SEND,
809 .send_flags = IBV_SEND_SIGNALED,
811 struct ibv_send_wr *bad_wr;
/* sanity checks: buf must belong to key and the message must fit into one slot */
813 assert(p->msg==(char *)buf);
814 assert(n<=pctx->max_msg_size);
816 return ibv_post_send(pconn->cm_id->qp, &wr, &bad_wr);
819 const char *ibw_getLastError(void)