2 Unix SMB/CIFS implementation.
3 Infrastructure for SMB-Direct RDMA as transport
4 Copyright (C) Stefan Metzmacher 2012
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
21 #include "system/network.h"
23 #include "../util/tevent_ntstatus.h"
24 #include "../lib/tsocket/tsocket.h"
26 #include "lib/util/util_net.h" //TODO
28 #ifdef SMB_TRANSPORT_ENABLE_RDMA
29 #include <rdma/rdma_cma_abi.h>
30 #include <rdma/rdma_cma.h>
31 #include <infiniband/verbs.h>
/*
 * Per-connection state for an SMB-Direct (SMB over RDMA) transport.
 *
 * NOTE(review): lines are elided from this excerpt.  The accesses later
 * in the file (t->rdma.cm_id, t->ibv.comp_channel, t->ibv.pd,
 * t->ibv.cq, t->ibv.qp, ...) show these members live in nested "rdma"
 * and "ibv" sub-structs — confirm against the complete source.
 */
33 struct smb_direct_transport {
35 struct rdma_cm_id *cm_id; /* librdmacm connection identifier */
36 struct rdma_event_channel *cm_channel; /* fd-backed CM event source */
37 struct tevent_fd *fde_channel; /* tevent watcher on cm_channel->fd */
38 enum rdma_cm_event_type expected_event; /* the only CM event accepted next */
39 struct rdma_cm_event *cm_event; /* last fetched event; acked before reuse/free */
43 struct ibv_comp_channel *comp_channel; /* CQ completion notifications */
44 struct tevent_fd *fde_channel; /* tevent watcher on comp_channel->fd */
/* Forward declarations — definitions follow below. */
50 static int smb_direct_transport_destructor(struct smb_direct_transport *t);
52 struct smb_direct_transport *smb_direct_transport_create(TALLOC_CTX *mem_ctx);
/*
 * Create a transport object on mem_ctx.
 *
 * Allocates the zeroed state, installs a talloc destructor so all
 * rdmacm/verbs resources are released whenever the object is freed,
 * then creates the rdma_cm event channel and the cm_id bound to it.
 *
 * NOTE(review): the error paths and the function tail are elided in
 * this excerpt.
 */
54 struct smb_direct_transport *smb_direct_transport_create(TALLOC_CTX *mem_ctx)
56 struct smb_direct_transport *t;
59 t = talloc_zero(mem_ctx, struct smb_direct_transport);
/* From here on, freeing t tears down everything set up below. */
63 talloc_set_destructor(t, smb_direct_transport_destructor);
65 t->rdma.cm_channel = rdma_create_event_channel();
66 if (t->rdma.cm_channel == NULL) {
/* rdma_create_id()'s signature differs across librdmacm ABI versions. */
71 #if RDMA_USER_CM_MAX_ABI_VERSION >= 2
72 ret = rdma_create_id(t->rdma.cm_channel,
76 ret = rdma_create_id(t->rdma.cm_channel,
/*
 * talloc destructor: release verbs and rdmacm resources in reverse
 * order of creation.  The tevent fd watchers go first so no handler
 * can fire on an fd that is about to be closed; then QP -> CQ ->
 * completion channel -> PD, any un-acked CM event, the cm_id, and
 * finally the CM event channel.
 */
88 static int smb_direct_transport_destructor(struct smb_direct_transport *t)
90 TALLOC_FREE(t->ibv.fde_channel);
91 TALLOC_FREE(t->rdma.fde_channel);
93 if (t->ibv.qp != NULL) {
94 ibv_destroy_qp(t->ibv.qp);
98 if (t->ibv.cq != NULL) {
99 ibv_destroy_cq(t->ibv.cq);
103 if (t->ibv.comp_channel != NULL) {
104 ibv_destroy_comp_channel(t->ibv.comp_channel);
105 t->ibv.comp_channel = NULL;
108 if (t->ibv.pd != NULL) {
109 ibv_dealloc_pd(t->ibv.pd);
/* A fetched-but-unacked CM event pins librdmacm state; ack it now. */
113 if (t->rdma.cm_event != NULL) {
114 rdma_ack_cm_event(t->rdma.cm_event);
115 t->rdma.cm_event = NULL;
118 if (t->rdma.cm_id != NULL) {
119 rdma_destroy_id(t->rdma.cm_id);
120 t->rdma.cm_id = NULL;
123 if (t->rdma.cm_channel != NULL) {
124 rdma_destroy_event_channel(t->rdma.cm_channel);
125 t->rdma.cm_channel = NULL;
/* Async state for one RDMA connect attempt. */
131 struct smb_direct_rdma_connect_state {
132 struct smb_direct_transport *t; /* transport being connected (not owned) */
/* tevent_req-style API: _send starts the connect, _recv reaps it. */
135 struct tevent_req *smb_direct_rdma_connect_send(TALLOC_CTX *mem_ctx,
136 struct tevent_context *ev,
137 struct smb_direct_transport *transport,
138 const struct sockaddr_storage *addr,
139 struct tsocket_address *local_addr,
140 struct tsocket_address *remote_addr);
141 NTSTATUS smb_direct_rdma_connect_recv(struct tevent_req *req);
/* fd handler driving the CM-event state machine below. */
143 static void smb_direct_rdma_connect_handler(struct tevent_context *ev,
144 struct tevent_fd *fde,
/*
 * Start an asynchronous RDMA connect to _addr.
 *
 * Forces the SMB-Direct port onto a local copy of the address, watches
 * the CM event channel fd via tevent, and kicks off the asynchronous
 * rdma_resolve_addr(); the state machine then advances in
 * smb_direct_rdma_connect_handler() one expected CM event at a time.
 *
 * NOTE(review): several lines (error returns, tevent_add_fd flags,
 * rdma_resolve_addr arguments) are elided in this excerpt.
 */
148 struct tevent_req *smb_direct_rdma_connect_send(TALLOC_CTX *mem_ctx,
149 struct tevent_context *ev,
150 struct smb_direct_transport *transport,
151 const struct sockaddr_storage *_addr,
152 struct tsocket_address *local_addr,
153 struct tsocket_address *remote_addr)
155 struct tevent_req *req;
156 struct smb_direct_rdma_connect_state *state;
/* Work on a copy so the caller's address stays untouched. */
158 struct sockaddr_storage addr = *_addr;
159 struct sockaddr *src_addr = NULL, *dst_addr = &addr;
/* 5445 is the well-known SMB-Direct port. */
161 set_sockaddr_port(dst_addr, 5445);
163 req = tevent_req_create(mem_ctx, &state,
164 struct smb_direct_rdma_connect_state);
168 state->t = transport;
/* Deliver CM events through tevent instead of blocking on the fd. */
170 transport->rdma.fde_channel = tevent_add_fd(ev, transport,
171 transport->rdma.cm_channel->fd,
173 smb_direct_rdma_connect_handler,
175 if (tevent_req_nomem(transport->rdma.fde_channel, req)) {
176 return tevent_req_post(req, ev);
180 ret = rdma_resolve_addr(state->t->rdma.cm_id,
184 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
185 __location__, __FUNCTION__, ret, errno));
/* Arm the state machine: only ADDR_RESOLVED is accepted next. */
187 state->t->rdma.expected_event = RDMA_CM_EVENT_ADDR_RESOLVED;
/*
 * CM-event-driven connect state machine:
 *   ADDR_RESOLVED  -> rdma_resolve_route()
 *   ROUTE_RESOLVED -> allocate PD / comp channel / CQ / QP, rdma_connect()
 *   ESTABLISHED    -> tevent_req_done()
 * Each invocation fetches exactly one event, verifies it matches the
 * event armed in rdma.expected_event, and acks it before returning.
 *
 * NOTE(review): many lines (error handling, break statements, the
 * remaining ibv_create_cq/rdma_create_qp arguments) are elided in this
 * excerpt.
 */
192 static void smb_direct_rdma_connect_handler(struct tevent_context *ev,
193 struct tevent_fd *fde,
197 struct tevent_req *req =
198 talloc_get_type_abort(private_data,
200 struct smb_direct_rdma_connect_state *state =
202 struct smb_direct_rdma_connect_state);
203 struct ibv_qp_init_attr init_attr;
204 struct rdma_conn_param conn_param;
/* Fetch exactly one pending CM event; acked at the bottom. */
209 ret = rdma_get_cm_event(state->t->rdma.cm_channel,
210 &state->t->rdma.cm_event);
212 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
213 __location__, __FUNCTION__, ret, errno));
216 if (state->t->rdma.cm_event->status != 0) {
217 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
218 __location__, __FUNCTION__, ret, errno));
/* Anything other than the armed event is treated as a failure. */
222 if (state->t->rdma.cm_event->event != state->t->rdma.expected_event) {
223 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
224 __location__, __FUNCTION__, ret, errno));
228 switch (state->t->rdma.cm_event->event) {
229 case RDMA_CM_EVENT_ADDR_RESOLVED:
/* Address resolved; resolve the route with a 5000 ms timeout. */
231 ret = rdma_resolve_route(state->t->rdma.cm_id, 5000);
233 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
234 __location__, __FUNCTION__, ret, errno));
237 state->t->rdma.expected_event = RDMA_CM_EVENT_ROUTE_RESOLVED;
239 case RDMA_CM_EVENT_ADDR_ERROR:
241 case RDMA_CM_EVENT_ROUTE_RESOLVED:
/* Route known: build the verbs objects on the resolved device. */
244 state->t->ibv.pd = ibv_alloc_pd(state->t->rdma.cm_id->verbs);
245 if (state->t->ibv.pd == NULL) {
249 state->t->ibv.comp_channel = ibv_create_comp_channel(state->t->rdma.cm_id->verbs);
250 if (state->t->ibv.comp_channel == NULL) {
253 ZERO_STRUCT(init_attr);
254 init_attr.cap.max_send_wr = 16;
255 init_attr.cap.max_recv_wr = 2;
256 init_attr.cap.max_recv_sge = 2;
257 init_attr.cap.max_send_sge = 2;
258 init_attr.qp_type = IBV_QPT_RC; /* reliable connected QP */
/* One CQ shared by send and recv; sized 2x the send queue depth. */
260 state->t->ibv.cq = ibv_create_cq(state->t->rdma.cm_id->verbs,
261 init_attr.cap.max_send_wr * 2,
263 state->t->ibv.comp_channel,
265 if (state->t->ibv.cq == NULL) {
268 init_attr.send_cq = state->t->ibv.cq;
269 init_attr.recv_cq = state->t->ibv.cq;
/* Request a completion notification for the next CQ entry. */
272 ret = ibv_req_notify_cq(state->t->ibv.cq, 0);
274 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
275 __location__, __FUNCTION__, ret, errno));
281 ret = rdma_create_qp(state->t->rdma.cm_id, state->t->ibv.pd,
284 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
285 __location__, __FUNCTION__, ret, errno));
/* rdma_create_qp() stored the QP on the cm_id; keep a handle. */
288 state->t->ibv.qp = state->t->rdma.cm_id->qp;
290 ZERO_STRUCT(conn_param);
291 conn_param.responder_resources = 1;
292 conn_param.initiator_depth = 1;
293 conn_param.retry_count = 10;
296 ret = rdma_connect(state->t->rdma.cm_id, &conn_param);
298 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
299 __location__, __FUNCTION__, ret, errno));
302 state->t->rdma.expected_event = RDMA_CM_EVENT_ESTABLISHED;
305 case RDMA_CM_EVENT_ESTABLISHED:
308 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
309 __location__, __FUNCTION__, ret, errno));
/* Connected: stop watching the CM fd and complete the request. */
311 state->t->rdma.expected_event = RDMA_CM_EVENT_DISCONNECTED;
312 TALLOC_FREE(state->t->rdma.fde_channel);
313 tevent_req_done(req);
316 case RDMA_CM_EVENT_ROUTE_ERROR:
317 case RDMA_CM_EVENT_CONNECT_REQUEST:
318 case RDMA_CM_EVENT_CONNECT_RESPONSE:
319 case RDMA_CM_EVENT_CONNECT_ERROR:
320 case RDMA_CM_EVENT_UNREACHABLE:
321 case RDMA_CM_EVENT_REJECTED:
322 case RDMA_CM_EVENT_DISCONNECTED:
323 case RDMA_CM_EVENT_DEVICE_REMOVAL:
324 case RDMA_CM_EVENT_MULTICAST_JOIN:
325 case RDMA_CM_EVENT_MULTICAST_ERROR:
326 case RDMA_CM_EVENT_ADDR_CHANGE:
327 case RDMA_CM_EVENT_TIMEWAIT_EXIT:
/* Ack the event so librdmacm can release it. */
332 if (state->t->rdma.cm_event != NULL) {
333 rdma_ack_cm_event(state->t->rdma.cm_event);
334 state->t->rdma.cm_event = NULL;
/*
 * Reap the async connect: drop the CM fd watcher and return the
 * request's NTSTATUS.  NOTE(review): the return statements are elided
 * in this excerpt.
 */
338 NTSTATUS smb_direct_rdma_connect_recv(struct tevent_req *req)
340 struct smb_direct_rdma_connect_state *state =
342 struct smb_direct_rdma_connect_state);
345 TALLOC_FREE(state->t->rdma.fde_channel);
347 if (tevent_req_is_nterror(req, &status)) {
348 tevent_req_received(req);
352 tevent_req_received(req);
/*
 * Async state for the SMB-Direct negotiate exchange.
 *
 * NOTE(review): lines are elided; judging by the "state->req.*" and
 * "state->rep.*" accesses below, the buffer/sge/wr members live in
 * nested "req" (send) and "rep" (receive) sub-structs.  The 0x14-byte
 * request and 0x1C-byte response buffers look like the MS-SMBD
 * negotiate message sizes — confirm against the spec.
 */
356 struct smb_direct_negotiate_state {
357 struct smb_direct_transport *t; /* transport being negotiated (not owned) */
359 uint8_t buffer[0x14]; /* wire image of the negotiate request */
361 struct ibv_sge sge[1];
362 struct ibv_send_wr wr;
365 uint8_t buffer[0x1C]; /* receive buffer for the negotiate response */
367 struct ibv_sge sge[1];
368 struct ibv_recv_wr wr;
/* tevent_req-style API plus the two fd handlers that drive it. */
372 struct tevent_req *smb_direct_negotiate_send(TALLOC_CTX *mem_ctx,
373 struct tevent_context *ev,
374 struct smb_direct_transport *transport);
375 NTSTATUS smb_direct_negotiate_recv(struct tevent_req *req);
377 static void smb_direct_negotiate_rdma_handler(struct tevent_context *ev,
378 struct tevent_fd *fde,
381 static void smb_direct_negotiate_ibv_handler(struct tevent_context *ev,
382 struct tevent_fd *fde,
/*
 * Start the SMB-Direct negotiate on an already-connected transport:
 * watch both the completion channel and the CM channel, build the
 * negotiate request, register the buffers with the PD, post the
 * receive for the response first, then post the send.
 *
 * NOTE(review): several lines (error returns, ibv_reg_mr buffer
 * arguments, tevent_add_fd flags) are elided in this excerpt.
 */
386 struct tevent_req *smb_direct_negotiate_send(TALLOC_CTX *mem_ctx,
387 struct tevent_context *ev,
388 struct smb_direct_transport *transport)
390 struct tevent_req *req;
391 struct smb_direct_negotiate_state *state;
392 struct ibv_recv_wr *bad_recv_wr = NULL;
393 struct ibv_send_wr *bad_send_wr = NULL;
396 req = tevent_req_create(mem_ctx, &state,
397 struct smb_direct_negotiate_state);
401 state->t = transport;
/* Completion-queue events arrive on the verbs completion channel. */
403 transport->ibv.fde_channel = tevent_add_fd(ev, transport,
404 transport->ibv.comp_channel->fd,
406 smb_direct_negotiate_ibv_handler,
408 if (tevent_req_nomem(transport->ibv.fde_channel, req)) {
409 return tevent_req_post(req, ev);
/* Keep watching the CM channel for e.g. DISCONNECTED. */
411 transport->rdma.fde_channel = tevent_add_fd(ev, transport,
412 transport->rdma.cm_channel->fd,
414 smb_direct_negotiate_rdma_handler,
416 if (tevent_req_nomem(transport->rdma.fde_channel, req)) {
417 return tevent_req_post(req, ev);
/*
 * Build the negotiate request.  Field layout appears to follow the
 * MS-SMBD negotiate request (MinVersion=0x0100, MaxVersion=0x0100,
 * Reserved, CreditsRequested=10, then three 32-bit size fields) —
 * TODO confirm against [MS-SMBD].
 */
420 SSVAL(state->req.buffer, 0x00, 0x0100);
421 SSVAL(state->req.buffer, 0x02, 0x0100);
422 SSVAL(state->req.buffer, 0x04, 0x0000);
423 SSVAL(state->req.buffer, 0x06, 0x000A);
424 SIVAL(state->req.buffer, 0x08, 0x00000400);
425 SIVAL(state->req.buffer, 0x0C, 0x00000400);
426 SIVAL(state->req.buffer, 0x10, 0x00020000);
/* Register the send buffer with the protection domain. */
428 state->req.mr = ibv_reg_mr(transport->ibv.pd,
430 sizeof(state->req.buffer),
431 IBV_ACCESS_LOCAL_WRITE);
432 if (tevent_req_nomem(state->req.mr, req)) {
433 return tevent_req_post(req, ev);
436 state->req.mr2 = ibv_reg_mr(transport->ibv.pd,
438 sizeof(state->req.buffer2),
439 IBV_ACCESS_LOCAL_WRITE);
440 if (tevent_req_nomem(state->req.mr2, req)) {
441 return tevent_req_post(req, ev);
/* Describe the send: one SGE covering the whole request buffer. */
443 state->req.sge[0].addr = (uint64_t) (uintptr_t) state->req.buffer;
444 state->req.sge[0].length = sizeof(state->req.buffer);
445 state->req.sge[0].lkey = state->req.mr->lkey;
446 state->req.wr.opcode = IBV_WR_SEND;
447 state->req.wr.send_flags = IBV_SEND_SIGNALED;
448 state->req.wr.sg_list = state->req.sge;
449 state->req.wr.num_sge = ARRAY_SIZE(state->req.sge);
/* Register and describe the receive buffer for the response. */
451 state->rep.mr = ibv_reg_mr(transport->ibv.pd,
453 sizeof(state->rep.buffer),
455 if (tevent_req_nomem(state->rep.mr, req)) {
456 return tevent_req_post(req, ev);
459 state->rep.sge[0].addr = (uint64_t) (uintptr_t) state->rep.buffer;
460 state->rep.sge[0].length = sizeof(state->rep.buffer);;
461 state->rep.sge[0].lkey = state->rep.mr->lkey;
462 state->rep.wr.sg_list = state->rep.sge;
463 state->rep.wr.num_sge = ARRAY_SIZE(state->rep.sge);
/* Post the receive before the send so the reply cannot be dropped. */
466 ret = ibv_post_recv(transport->ibv.qp, &state->rep.wr, &bad_recv_wr);
468 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
469 __location__, __FUNCTION__, ret, errno));
474 ret = ibv_post_send(transport->ibv.qp, &state->req.wr, &bad_send_wr);
476 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
477 __location__, __FUNCTION__, ret, errno));
481 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
482 __location__, __FUNCTION__, ret, errno));
/*
 * Completion-channel handler: fetch the CQ event, verify it belongs to
 * our CQ/context, re-arm notification, poll one work completion and
 * dispatch on its opcode; finally ack the CQ event.
 *
 * NOTE(review): lines (error handling, most opcode cases, breaks) are
 * elided in this excerpt.
 */
486 static void smb_direct_negotiate_ibv_handler(struct tevent_context *ev,
487 struct tevent_fd *fde,
491 struct tevent_req *req =
492 talloc_get_type_abort(private_data,
494 struct smb_direct_negotiate_state *state =
496 struct smb_direct_negotiate_state);
497 struct ibv_cq *cq = NULL;
498 void *cq_context = NULL;
500 struct ibv_recv_wr *bad_wr;
505 ret = ibv_get_cq_event(state->t->ibv.comp_channel,
508 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
509 __location__, __FUNCTION__, ret, errno));
/* Sanity: the event must be for our CQ and our context pointer. */
512 if (cq != state->t->ibv.cq) {
514 if (cq_context != state->t) {
/* Re-arm notification before polling so no completion is missed. */
517 ret = ibv_req_notify_cq(state->t->ibv.cq, 0);
519 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
520 __location__, __FUNCTION__, ret, errno));
523 ret = ibv_poll_cq(state->t->ibv.cq, 1, &wc);
525 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
526 __location__, __FUNCTION__, ret, errno));
531 if (wc.status != IBV_WC_SUCCESS) {
537 case IBV_WC_RDMA_WRITE:
539 case IBV_WC_RDMA_READ:
542 //ret = ibv_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
548 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
549 __location__, __FUNCTION__, ret, errno));
/* Ack the single CQ event fetched above. */
550 ibv_ack_cq_events(state->t->ibv.cq, 1);
/*
 * CM-channel handler during negotiate: fetch one CM event, check its
 * status and that it matches rdma.expected_event, dispatch (any CM
 * event during negotiate appears to be a failure path here), then ack.
 *
 * NOTE(review): error handling and the per-case bodies are elided in
 * this excerpt.
 */
553 static void smb_direct_negotiate_rdma_handler(struct tevent_context *ev,
554 struct tevent_fd *fde,
558 struct tevent_req *req =
559 talloc_get_type_abort(private_data,
561 struct smb_direct_negotiate_state *state =
563 struct smb_direct_negotiate_state);
568 ret = rdma_get_cm_event(state->t->rdma.cm_channel,
569 &state->t->rdma.cm_event);
571 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
572 __location__, __FUNCTION__, ret, errno));
575 if (state->t->rdma.cm_event->status != 0) {
576 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
577 __location__, __FUNCTION__, ret, errno));
581 if (state->t->rdma.cm_event->event != state->t->rdma.expected_event) {
582 DEBUG(0,("%s:%s: ret[%d] errno[%d]\n",
583 __location__, __FUNCTION__, ret, errno));
587 switch (state->t->rdma.cm_event->event) {
588 case RDMA_CM_EVENT_ADDR_RESOLVED:
589 case RDMA_CM_EVENT_ADDR_ERROR:
590 case RDMA_CM_EVENT_ROUTE_RESOLVED:
591 case RDMA_CM_EVENT_ESTABLISHED:
592 case RDMA_CM_EVENT_ROUTE_ERROR:
593 case RDMA_CM_EVENT_CONNECT_REQUEST:
594 case RDMA_CM_EVENT_CONNECT_RESPONSE:
595 case RDMA_CM_EVENT_CONNECT_ERROR:
596 case RDMA_CM_EVENT_UNREACHABLE:
597 case RDMA_CM_EVENT_REJECTED:
598 case RDMA_CM_EVENT_DISCONNECTED:
599 case RDMA_CM_EVENT_DEVICE_REMOVAL:
600 case RDMA_CM_EVENT_MULTICAST_JOIN:
601 case RDMA_CM_EVENT_MULTICAST_ERROR:
602 case RDMA_CM_EVENT_ADDR_CHANGE:
603 case RDMA_CM_EVENT_TIMEWAIT_EXIT:
/* Ack the event so librdmacm can release it. */
608 if (state->t->rdma.cm_event != NULL) {
609 rdma_ack_cm_event(state->t->rdma.cm_event);
610 state->t->rdma.cm_event = NULL;
/*
 * Reap the async negotiate: drop both fd watchers and return the
 * request's NTSTATUS.  NOTE(review): the return statements are elided
 * in this excerpt.
 */
614 NTSTATUS smb_direct_negotiate_recv(struct tevent_req *req)
616 struct smb_direct_negotiate_state *state =
618 struct smb_direct_negotiate_state);
621 TALLOC_FREE(state->t->ibv.fde_channel);
622 TALLOC_FREE(state->t->rdma.fde_channel);
624 if (tevent_req_is_nterror(req, &status)) {
625 tevent_req_received(req);
629 tevent_req_received(req);
632 #endif /* SMB_TRANSPORT_ENABLE_RDMA */