qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git

rdma_backend.c (41105B)


      1 /*
      2  * QEMU paravirtual RDMA - Generic RDMA backend
      3  *
      4  * Copyright (C) 2018 Oracle
      5  * Copyright (C) 2018 Red Hat Inc
      6  *
      7  * Authors:
      8  *     Yuval Shaia <yuval.shaia@oracle.com>
      9  *     Marcel Apfelbaum <marcel@redhat.com>
     10  *
     11  * This work is licensed under the terms of the GNU GPL, version 2 or later.
     12  * See the COPYING file in the top-level directory.
     13  *
     14  */
     15 
     16 #include "qemu/osdep.h"
     17 #include "qapi/qapi-events-rdma.h"
     18 
     19 #include <infiniband/verbs.h>
     20 
     21 #include "contrib/rdmacm-mux/rdmacm-mux.h"
     22 #include "trace.h"
     23 #include "rdma_utils.h"
     24 #include "rdma_rm.h"
     25 #include "rdma_backend.h"
     26 
     27 #define THR_NAME_LEN 16
     28 #define THR_POLL_TO  5000
     29 
     30 #define MAD_HDR_SIZE sizeof(struct ibv_grh)
     31 
     32 typedef struct BackendCtx {
     33     void *up_ctx;
     34     struct ibv_sge sge; /* Used to save MAD recv buffer */
     35     RdmaBackendQP *backend_qp; /* To maintain recv buffers */
     36     RdmaBackendSRQ *backend_srq;
     37 } BackendCtx;
     38 
     39 struct backend_umad {
     40     struct ib_user_mad hdr;
     41     char mad[RDMA_MAX_PRIVATE_DATA];
     42 };
     43 
     44 static void (*comp_handler)(void *ctx, struct ibv_wc *wc);
     45 
     46 static void dummy_comp_handler(void *ctx, struct ibv_wc *wc)
     47 {
     48     rdma_error_report("No completion handler is registered");
     49 }
     50 
     51 static inline void complete_work(enum ibv_wc_status status, uint32_t vendor_err,
     52                                  void *ctx)
     53 {
     54     struct ibv_wc wc = {};
     55 
     56     wc.status = status;
     57     wc.vendor_err = vendor_err;
     58 
     59     comp_handler(ctx, &wc);
     60 }
     61 
     62 static void free_cqe_ctx(gpointer data, gpointer user_data)
     63 {
     64     BackendCtx *bctx;
     65     RdmaDeviceResources *rdma_dev_res = user_data;
     66     unsigned long cqe_ctx_id = GPOINTER_TO_INT(data);
     67 
     68     bctx = rdma_rm_get_cqe_ctx(rdma_dev_res, cqe_ctx_id);
     69     if (bctx) {
     70         rdma_rm_dealloc_cqe_ctx(rdma_dev_res, cqe_ctx_id);
     71         qatomic_dec(&rdma_dev_res->stats.missing_cqe);
     72     }
     73     g_free(bctx);
     74 }
     75 
     76 static void clean_recv_mads(RdmaBackendDev *backend_dev)
     77 {
     78     unsigned long cqe_ctx_id;
     79 
     80     do {
     81         cqe_ctx_id = rdma_protected_gqueue_pop_int64(&backend_dev->
     82                                                     recv_mads_list);
     83         if (cqe_ctx_id != -ENOENT) {
     84             qatomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
     85             free_cqe_ctx(GINT_TO_POINTER(cqe_ctx_id),
     86                          backend_dev->rdma_dev_res);
     87         }
     88     } while (cqe_ctx_id != -ENOENT);
     89 }
     90 
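        /*
         * Drain all available completions from the host CQ while holding the
         * device-resource lock.  Each wr_id is the cqe_ctx id stored at post
         * time; it is resolved back to its BackendCtx so the registered
         * completion handler can be called with the guest's context, after
         * which the context is dropped from its QP/SRQ list and freed.
         */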
     91 static int rdma_poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
     92 {
     93     int i, ne, total_ne = 0;
     94     BackendCtx *bctx;
     95     struct ibv_wc wc[2];
     96     RdmaProtectedGSList *cqe_ctx_list;
     97 
     98     WITH_QEMU_LOCK_GUARD(&rdma_dev_res->lock) {
     99         do {
    100             ne = ibv_poll_cq(ibcq, ARRAY_SIZE(wc), wc);
    101 
    102             trace_rdma_poll_cq(ne, ibcq);
    103 
    104             for (i = 0; i < ne; i++) {
    105                 bctx = rdma_rm_get_cqe_ctx(rdma_dev_res, wc[i].wr_id);
    106                 if (unlikely(!bctx)) {
    107                     rdma_error_report("No matching ctx for req %"PRId64,
    108                                       wc[i].wr_id);
    109                     continue;
    110                 }
    111 
    112                 comp_handler(bctx->up_ctx, &wc[i]);
    113 
    114                 if (bctx->backend_qp) {
    115                     cqe_ctx_list = &bctx->backend_qp->cqe_ctx_list;
    116                 } else {
    117                     cqe_ctx_list = &bctx->backend_srq->cqe_ctx_list;
    118                 }
    119 
    120                 rdma_protected_gslist_remove_int32(cqe_ctx_list, wc[i].wr_id);
    121                 rdma_rm_dealloc_cqe_ctx(rdma_dev_res, wc[i].wr_id);
    122                 g_free(bctx);
    123             }
    124             total_ne += ne;
    125         } while (ne > 0);
    126         qatomic_sub(&rdma_dev_res->stats.missing_cqe, total_ne);
    127     }
    128 
    129     if (ne < 0) {
    130         rdma_error_report("ibv_poll_cq fail, rc=%d, errno=%d", ne, errno);
    131     }
    132 
    133     rdma_dev_res->stats.completions += total_ne;
    134 
    135     return total_ne;
    136 }
    137 
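        /*
         * Completion-channel thread: the channel fd is put into non-blocking
         * mode and polled with a timeout so the thread can notice a stop
         * request.  When an event arrives the CQ notification is re-armed
         * with ibv_req_notify_cq() before the CQ is drained, and the event
         * is acknowledged with ibv_ack_cq_events().
         */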
    138 static void *comp_handler_thread(void *arg)
    139 {
    140     RdmaBackendDev *backend_dev = (RdmaBackendDev *)arg;
    141     int rc;
    142     struct ibv_cq *ev_cq;
    143     void *ev_ctx;
    144     int flags;
    145     GPollFD pfds[1];
    146 
    147     /* Change to non-blocking mode */
    148     flags = fcntl(backend_dev->channel->fd, F_GETFL);
    149     rc = fcntl(backend_dev->channel->fd, F_SETFL, flags | O_NONBLOCK);
    150     if (rc < 0) {
    151         rdma_error_report("Failed to change backend channel FD to non-blocking");
    152         return NULL;
    153     }
    154 
    155     pfds[0].fd = backend_dev->channel->fd;
    156     pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
    157 
    158     backend_dev->comp_thread.is_running = true;
    159 
    160     while (backend_dev->comp_thread.run) {
    161         do {
    162             rc = qemu_poll_ns(pfds, 1, THR_POLL_TO * (int64_t)SCALE_MS);
    163             if (!rc) {
    164                 backend_dev->rdma_dev_res->stats.poll_cq_ppoll_to++;
    165             }
    166         } while (!rc && backend_dev->comp_thread.run);
    167 
    168         if (backend_dev->comp_thread.run) {
    169             rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx);
    170             if (unlikely(rc)) {
    171                 rdma_error_report("ibv_get_cq_event fail, rc=%d, errno=%d", rc,
    172                                   errno);
    173                 continue;
    174             }
    175 
    176             rc = ibv_req_notify_cq(ev_cq, 0);
    177             if (unlikely(rc)) {
    178                 rdma_error_report("ibv_req_notify_cq fail, rc=%d, errno=%d", rc,
    179                                   errno);
    180             }
    181 
    182             backend_dev->rdma_dev_res->stats.poll_cq_from_bk++;
    183             rdma_poll_cq(backend_dev->rdma_dev_res, ev_cq);
    184 
    185             ibv_ack_cq_events(ev_cq, 1);
    186         }
    187     }
    188 
    189     backend_dev->comp_thread.is_running = false;
    190 
    191     qemu_thread_exit(0);
    192 
    193     return NULL;
    194 }
    195 
    196 static inline void disable_rdmacm_mux_async(RdmaBackendDev *backend_dev)
    197 {
    198     qatomic_set(&backend_dev->rdmacm_mux.can_receive, 0);
    199 }
    200 
    201 static inline void enable_rdmacm_mux_async(RdmaBackendDev *backend_dev)
    202 {
    203     qatomic_set(&backend_dev->rdmacm_mux.can_receive, sizeof(RdmaCmMuxMsg));
    204 }
    205 
    206 static inline int rdmacm_mux_can_process_async(RdmaBackendDev *backend_dev)
    207 {
    208     return qatomic_read(&backend_dev->rdmacm_mux.can_receive);
    209 }
    210 
    211 static int rdmacm_mux_check_op_status(CharBackend *mad_chr_be)
    212 {
    213     RdmaCmMuxMsg msg = {};
    214     int ret;
    215 
    216     ret = qemu_chr_fe_read_all(mad_chr_be, (uint8_t *)&msg, sizeof(msg));
    217     if (ret != sizeof(msg)) {
    218         rdma_error_report("Got invalid message from mux: size %d, expecting %d",
    219                           ret, (int)sizeof(msg));
    220         return -EIO;
    221     }
    222 
    223     trace_rdmacm_mux_check_op_status(msg.hdr.msg_type, msg.hdr.op_code,
    224                                      msg.hdr.err_code);
    225 
    226     if (msg.hdr.msg_type != RDMACM_MUX_MSG_TYPE_RESP) {
    227         rdma_error_report("Got invalid message type %d", msg.hdr.msg_type);
    228         return -EIO;
    229     }
    230 
    231     if (msg.hdr.err_code != RDMACM_MUX_ERR_CODE_OK) {
    232         rdma_error_report("Operation failed in mux, error code %d",
    233                           msg.hdr.err_code);
    234         return -EIO;
    235     }
    236 
    237     return 0;
    238 }
    239 
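        /*
         * Send one request to the external rdmacm-mux helper over the chardev
         * and wait synchronously for its status reply.  Async delivery from
         * the mux is disabled for the duration so the reply is picked up by
         * rdmacm_mux_check_op_status() rather than by rdmacm_mux_read().
         */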
    240 static int rdmacm_mux_send(RdmaBackendDev *backend_dev, RdmaCmMuxMsg *msg)
    241 {
    242     int rc = 0;
    243 
    244     msg->hdr.msg_type = RDMACM_MUX_MSG_TYPE_REQ;
    245     trace_rdmacm_mux("send", msg->hdr.msg_type, msg->hdr.op_code);
    246     disable_rdmacm_mux_async(backend_dev);
    247     rc = qemu_chr_fe_write(backend_dev->rdmacm_mux.chr_be,
    248                            (const uint8_t *)msg, sizeof(*msg));
    249     if (rc != sizeof(*msg)) {
    250         enable_rdmacm_mux_async(backend_dev);
    251         rdma_error_report("Failed to send request to rdmacm_mux (rc=%d)", rc);
    252         return -EIO;
    253     }
    254 
    255     rc = rdmacm_mux_check_op_status(backend_dev->rdmacm_mux.chr_be);
    256     if (rc) {
    257         rdma_error_report("Failed to execute rdmacm_mux request %d (rc=%d)",
    258                           msg->hdr.op_code, rc);
    259     }
    260 
    261     enable_rdmacm_mux_async(backend_dev);
    262 
    263     return 0;
    264 }
    265 
    266 static void stop_backend_thread(RdmaBackendThread *thread)
    267 {
    268     thread->run = false;
    269     while (thread->is_running) {
    270         sleep(THR_POLL_TO / SCALE_US / 2);
    271     }
    272 }
    273 
    274 static void start_comp_thread(RdmaBackendDev *backend_dev)
    275 {
    276     char thread_name[THR_NAME_LEN] = {};
    277 
    278     stop_backend_thread(&backend_dev->comp_thread);
    279 
    280     snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s",
    281              ibv_get_device_name(backend_dev->ib_dev));
    282     backend_dev->comp_thread.run = true;
    283     qemu_thread_create(&backend_dev->comp_thread.thread, thread_name,
    284                        comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
    285 }
    286 
    287 void rdma_backend_register_comp_handler(void (*handler)(void *ctx,
    288                                                          struct ibv_wc *wc))
    289 {
    290     comp_handler = handler;
    291 }
    292 
    293 void rdma_backend_unregister_comp_handler(void)
    294 {
    295     rdma_backend_register_comp_handler(dummy_comp_handler);
    296 }
    297 
    298 int rdma_backend_query_port(RdmaBackendDev *backend_dev,
    299                             struct ibv_port_attr *port_attr)
    300 {
    301     int rc;
    302 
    303     rc = ibv_query_port(backend_dev->context, backend_dev->port_num, port_attr);
    304     if (rc) {
    305         rdma_error_report("ibv_query_port fail, rc=%d, errno=%d", rc, errno);
    306         return -EIO;
    307     }
    308 
    309     return 0;
    310 }
    311 
    312 void rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq)
    313 {
    314     int polled;
    315 
    316     rdma_dev_res->stats.poll_cq_from_guest++;
    317     polled = rdma_poll_cq(rdma_dev_res, cq->ibcq);
    318     if (!polled) {
    319         rdma_dev_res->stats.poll_cq_from_guest_empty++;
    320     }
    321 }
    322 
    323 static GHashTable *ah_hash;
    324 
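        /*
         * Address handles are cached in ah_hash, keyed by the destination GID
         * bytes, so repeated sends to the same peer reuse a single ibv_ah
         * instead of creating a new one for every work request.
         */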
    325 static struct ibv_ah *create_ah(RdmaBackendDev *backend_dev, struct ibv_pd *pd,
    326                                 uint8_t sgid_idx, union ibv_gid *dgid)
    327 {
    328     GBytes *ah_key = g_bytes_new(dgid, sizeof(*dgid));
    329     struct ibv_ah *ah = g_hash_table_lookup(ah_hash, ah_key);
    330 
    331     if (ah) {
    332         trace_rdma_create_ah_cache_hit(be64_to_cpu(dgid->global.subnet_prefix),
    333                                        be64_to_cpu(dgid->global.interface_id));
    334         g_bytes_unref(ah_key);
    335     } else {
    336         struct ibv_ah_attr ah_attr = {
    337             .is_global     = 1,
    338             .port_num      = backend_dev->port_num,
    339             .grh.hop_limit = 1,
    340         };
    341 
    342         ah_attr.grh.dgid = *dgid;
    343         ah_attr.grh.sgid_index = sgid_idx;
    344 
    345         ah = ibv_create_ah(pd, &ah_attr);
    346         if (ah) {
    347             g_hash_table_insert(ah_hash, ah_key, ah);
    348         } else {
    349             g_bytes_unref(ah_key);
    350             rdma_error_report("Failed to create AH for gid <0x%" PRIx64", 0x%"PRIx64">",
    351                               be64_to_cpu(dgid->global.subnet_prefix),
    352                               be64_to_cpu(dgid->global.interface_id));
    353         }
    354 
    355         trace_rdma_create_ah_cache_miss(be64_to_cpu(dgid->global.subnet_prefix),
    356                                         be64_to_cpu(dgid->global.interface_id));
    357     }
    358 
    359     return ah;
    360 }
    361 
    362 static void destroy_ah_hash_key(gpointer data)
    363 {
    364     g_bytes_unref(data);
    365 }
    366 
    367 static void destroy_ah_hash_data(gpointer data)
    368 {
    369     struct ibv_ah *ah = data;
    370 
    371     ibv_destroy_ah(ah);
    372 }
    373 
    374 static void ah_cache_init(void)
    375 {
    376     ah_hash = g_hash_table_new_full(g_bytes_hash, g_bytes_equal,
    377                                     destroy_ah_hash_key, destroy_ah_hash_data);
    378 }
    379 
    380 #ifdef LEGACY_RDMA_REG_MR
    381 static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res,
    382                                 struct ibv_sge *sge, uint8_t num_sge,
    383                                 uint64_t *total_length)
    384 {
    385     RdmaRmMR *mr;
    386     int idx;
    387 
    388     for (idx = 0; idx < num_sge; idx++) {
    389         mr = rdma_rm_get_mr(rdma_dev_res, sge[idx].lkey);
    390         if (unlikely(!mr)) {
    391             rdma_error_report("Invalid lkey 0x%x", sge[idx].lkey);
    392             return VENDOR_ERR_INVLKEY | sge[idx].lkey;
    393         }
    394 
    395         sge[idx].addr = (uintptr_t)mr->virt + sge[idx].addr - mr->start;
    396         sge[idx].lkey = rdma_backend_mr_lkey(&mr->backend_mr);
    397 
    398         *total_length += sge[idx].length;
    399     }
    400 
    401     return 0;
    402 }
    403 #else
    404 static inline int build_host_sge_array(RdmaDeviceResources *rdma_dev_res,
    405                                        struct ibv_sge *sge, uint8_t num_sge,
    406                                        uint64_t *total_length)
    407 {
    408     int idx;
    409 
    410     for (idx = 0; idx < num_sge; idx++) {
    411         *total_length += sge[idx].length;
    412     }
    413     return 0;
    414 }
    415 #endif
    416 
    417 static void trace_mad_message(const char *title, char *buf, int len)
    418 {
    419     int i;
    420     char *b = g_malloc0(len * 3 + 1);
    421     char b1[4];
    422 
    423     for (i = 0; i < len; i++) {
    424         sprintf(b1, "%.2X ", buf[i] & 0x000000FF);
    425         strcat(b, b1);
    426     }
    427 
    428     trace_rdma_mad_message(title, len, b);
    429 
    430     g_free(b);
    431 }
    432 
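        /*
         * MADs sent on the GSI (QP1) are not posted to the host HCA; they are
         * wrapped in a RdmaCmMuxMsg and forwarded to the rdmacm-mux helper.
         * The guest supplies two SGEs, sge[0] with the header and sge[1] with
         * the payload, both copied out of guest memory via rdma_pci_dma_map().
         */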
    433 static int mad_send(RdmaBackendDev *backend_dev, uint8_t sgid_idx,
    434                     union ibv_gid *sgid, struct ibv_sge *sge, uint32_t num_sge)
    435 {
    436     RdmaCmMuxMsg msg = {};
    437     char *hdr, *data;
    438     int ret;
    439 
    440     if (num_sge != 2) {
    441         return -EINVAL;
    442     }
    443 
    444     msg.hdr.op_code = RDMACM_MUX_OP_CODE_MAD;
    445     memcpy(msg.hdr.sgid.raw, sgid->raw, sizeof(msg.hdr.sgid));
    446 
    447     msg.umad_len = sge[0].length + sge[1].length;
    448 
    449     if (msg.umad_len > sizeof(msg.umad.mad)) {
    450         return -ENOMEM;
    451     }
    452 
    453     msg.umad.hdr.addr.qpn = htobe32(1);
    454     msg.umad.hdr.addr.grh_present = 1;
    455     msg.umad.hdr.addr.gid_index = sgid_idx;
    456     memcpy(msg.umad.hdr.addr.gid, sgid->raw, sizeof(msg.umad.hdr.addr.gid));
    457     msg.umad.hdr.addr.hop_limit = 0xFF;
    458 
    459     hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
    460     if (!hdr) {
    461         return -ENOMEM;
    462     }
    463     data = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
    464     if (!data) {
    465         rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
    466         return -ENOMEM;
    467     }
    468 
    469     memcpy(&msg.umad.mad[0], hdr, sge[0].length);
    470     memcpy(&msg.umad.mad[sge[0].length], data, sge[1].length);
    471 
    472     rdma_pci_dma_unmap(backend_dev->dev, data, sge[1].length);
    473     rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
    474 
    475     trace_mad_message("send", msg.umad.mad, msg.umad_len);
    476 
    477     ret = rdmacm_mux_send(backend_dev, &msg);
    478     if (ret) {
    479         rdma_error_report("Failed to send MAD to rdma_umadmux (%d)", ret);
    480         return -EIO;
    481     }
    482 
    483     return 0;
    484 }
    485 
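        /*
         * QP0 and QP1 are not backed by a host QP (qp->ibqp is NULL): QP0
         * requests are rejected and QP1 traffic is diverted to mad_send().
         * For real QPs a BackendCtx is allocated and its cqe_ctx id is used
         * as the wr_id, a cached AH is attached for UD sends, and a signaled
         * IBV_WR_SEND is posted to the host QP.
         */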
    486 void rdma_backend_post_send(RdmaBackendDev *backend_dev,
    487                             RdmaBackendQP *qp, uint8_t qp_type,
    488                             struct ibv_sge *sge, uint32_t num_sge,
    489                             uint8_t sgid_idx, union ibv_gid *sgid,
    490                             union ibv_gid *dgid, uint32_t dqpn, uint32_t dqkey,
    491                             void *ctx)
    492 {
    493     BackendCtx *bctx;
    494     uint32_t bctx_id;
    495     int rc;
    496     struct ibv_send_wr wr = {}, *bad_wr;
    497 
    498     if (!qp->ibqp) { /* This field is not initialized for QP0 and QP1 */
    499         if (qp_type == IBV_QPT_SMI) {
    500             rdma_error_report("Got QP0 request");
    501             complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
    502         } else if (qp_type == IBV_QPT_GSI) {
    503             rc = mad_send(backend_dev, sgid_idx, sgid, sge, num_sge);
    504             if (rc) {
    505                 complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
    506                 backend_dev->rdma_dev_res->stats.mad_tx_err++;
    507             } else {
    508                 complete_work(IBV_WC_SUCCESS, 0, ctx);
    509                 backend_dev->rdma_dev_res->stats.mad_tx++;
    510             }
    511         }
    512         return;
    513     }
    514 
    515     bctx = g_malloc0(sizeof(*bctx));
    516     bctx->up_ctx = ctx;
    517     bctx->backend_qp = qp;
    518 
    519     rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    520     if (unlikely(rc)) {
    521         complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
    522         goto err_free_bctx;
    523     }
    524 
    525     rdma_protected_gslist_append_int32(&qp->cqe_ctx_list, bctx_id);
    526 
    527     rc = build_host_sge_array(backend_dev->rdma_dev_res, sge, num_sge,
    528                               &backend_dev->rdma_dev_res->stats.tx_len);
    529     if (rc) {
    530         complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
    531         goto err_dealloc_cqe_ctx;
    532     }
    533 
    534     if (qp_type == IBV_QPT_UD) {
    535         wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd, sgid_idx, dgid);
    536         if (!wr.wr.ud.ah) {
    537             complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
    538             goto err_dealloc_cqe_ctx;
    539         }
    540         wr.wr.ud.remote_qpn = dqpn;
    541         wr.wr.ud.remote_qkey = dqkey;
    542     }
    543 
    544     wr.num_sge = num_sge;
    545     wr.opcode = IBV_WR_SEND;
    546     wr.send_flags = IBV_SEND_SIGNALED;
    547     wr.sg_list = sge;
    548     wr.wr_id = bctx_id;
    549 
    550     rc = ibv_post_send(qp->ibqp, &wr, &bad_wr);
    551     if (rc) {
    552         rdma_error_report("ibv_post_send fail, qpn=0x%x, rc=%d, errno=%d",
    553                           qp->ibqp->qp_num, rc, errno);
    554         complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
    555         goto err_dealloc_cqe_ctx;
    556     }
    557 
    558     qatomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
    559     backend_dev->rdma_dev_res->stats.tx++;
    560 
    561     return;
    562 
    563 err_dealloc_cqe_ctx:
    564     backend_dev->rdma_dev_res->stats.tx_err++;
    565     rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id);
    566 
    567 err_free_bctx:
    568     g_free(bctx);
    569 }
    570 
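        /*
         * GSI receive buffers are likewise never posted to the host HCA; the
         * single SGE, which must be large enough for a GRH plus the maximum
         * MAD payload, is parked on recv_mads_list until a MAD arrives from
         * the mux and is consumed by process_incoming_mad_req().
         */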
    571 static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev,
    572                                          struct ibv_sge *sge, uint32_t num_sge,
    573                                          void *ctx)
    574 {
    575     BackendCtx *bctx;
    576     int rc;
    577     uint32_t bctx_id;
    578 
    579     if (num_sge != 1) {
    580         rdma_error_report("Invalid num_sge (%d), expecting 1", num_sge);
    581         return VENDOR_ERR_INV_NUM_SGE;
    582     }
    583 
    584     if (sge[0].length < RDMA_MAX_PRIVATE_DATA + sizeof(struct ibv_grh)) {
    585         rdma_error_report("Too small buffer for MAD");
    586         return VENDOR_ERR_INV_MAD_BUFF;
    587     }
    588 
    589     bctx = g_malloc0(sizeof(*bctx));
    590 
    591     rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    592     if (unlikely(rc)) {
    593         g_free(bctx);
    594         return VENDOR_ERR_NOMEM;
    595     }
    596 
    597     bctx->up_ctx = ctx;
    598     bctx->sge = *sge;
    599 
    600     rdma_protected_gqueue_append_int64(&backend_dev->recv_mads_list, bctx_id);
    601 
    602     return 0;
    603 }
    604 
    605 void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
    606                             RdmaBackendQP *qp, uint8_t qp_type,
    607                             struct ibv_sge *sge, uint32_t num_sge, void *ctx)
    608 {
    609     BackendCtx *bctx;
    610     uint32_t bctx_id;
    611     int rc;
    612     struct ibv_recv_wr wr = {}, *bad_wr;
    613 
    614     if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */
    615         if (qp_type == IBV_QPT_SMI) {
    616             rdma_error_report("Got QP0 request");
    617             complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
    618         }
    619         if (qp_type == IBV_QPT_GSI) {
    620             rc = save_mad_recv_buffer(backend_dev, sge, num_sge, ctx);
    621             if (rc) {
    622                 complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
    623                 backend_dev->rdma_dev_res->stats.mad_rx_bufs_err++;
    624             } else {
    625                 backend_dev->rdma_dev_res->stats.mad_rx_bufs++;
    626             }
    627         }
    628         return;
    629     }
    630 
    631     bctx = g_malloc0(sizeof(*bctx));
    632     bctx->up_ctx = ctx;
    633     bctx->backend_qp = qp;
    634 
    635     rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    636     if (unlikely(rc)) {
    637         complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
    638         goto err_free_bctx;
    639     }
    640 
    641     rdma_protected_gslist_append_int32(&qp->cqe_ctx_list, bctx_id);
    642 
    643     rc = build_host_sge_array(backend_dev->rdma_dev_res, sge, num_sge,
    644                               &backend_dev->rdma_dev_res->stats.rx_bufs_len);
    645     if (rc) {
    646         complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
    647         goto err_dealloc_cqe_ctx;
    648     }
    649 
    650     wr.num_sge = num_sge;
    651     wr.sg_list = sge;
    652     wr.wr_id = bctx_id;
    653     rc = ibv_post_recv(qp->ibqp, &wr, &bad_wr);
    654     if (rc) {
    655         rdma_error_report("ibv_post_recv fail, qpn=0x%x, rc=%d, errno=%d",
    656                           qp->ibqp->qp_num, rc, errno);
    657         complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
    658         goto err_dealloc_cqe_ctx;
    659     }
    660 
    661     qatomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
    662     backend_dev->rdma_dev_res->stats.rx_bufs++;
    663 
    664     return;
    665 
    666 err_dealloc_cqe_ctx:
    667     backend_dev->rdma_dev_res->stats.rx_bufs_err++;
    668     rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id);
    669 
    670 err_free_bctx:
    671     g_free(bctx);
    672 }
    673 
    674 void rdma_backend_post_srq_recv(RdmaBackendDev *backend_dev,
    675                                 RdmaBackendSRQ *srq, struct ibv_sge *sge,
    676                                 uint32_t num_sge, void *ctx)
    677 {
    678     BackendCtx *bctx;
    679     uint32_t bctx_id;
    680     int rc;
    681     struct ibv_recv_wr wr = {}, *bad_wr;
    682 
    683     bctx = g_malloc0(sizeof(*bctx));
    684     bctx->up_ctx = ctx;
    685     bctx->backend_srq = srq;
    686 
    687     rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    688     if (unlikely(rc)) {
    689         complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
    690         goto err_free_bctx;
    691     }
    692 
    693     rdma_protected_gslist_append_int32(&srq->cqe_ctx_list, bctx_id);
    694 
    695     rc = build_host_sge_array(backend_dev->rdma_dev_res, sge, num_sge,
    696                               &backend_dev->rdma_dev_res->stats.rx_bufs_len);
    697     if (rc) {
    698         complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
    699         goto err_dealloc_cqe_ctx;
    700     }
    701 
    702     wr.num_sge = num_sge;
    703     wr.sg_list = sge;
    704     wr.wr_id = bctx_id;
    705     rc = ibv_post_srq_recv(srq->ibsrq, &wr, &bad_wr);
    706     if (rc) {
    707         rdma_error_report("ibv_post_srq_recv fail, srqn=0x%x, rc=%d, errno=%d",
    708                           srq->ibsrq->handle, rc, errno);
    709         complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
    710         goto err_dealloc_cqe_ctx;
    711     }
    712 
    713     qatomic_inc(&backend_dev->rdma_dev_res->stats.missing_cqe);
    714     backend_dev->rdma_dev_res->stats.rx_bufs++;
    715     backend_dev->rdma_dev_res->stats.rx_srq++;
    716 
    717     return;
    718 
    719 err_dealloc_cqe_ctx:
    720     backend_dev->rdma_dev_res->stats.rx_bufs_err++;
    721     rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id);
    722 
    723 err_free_bctx:
    724     g_free(bctx);
    725 }
    726 
    727 int rdma_backend_create_pd(RdmaBackendDev *backend_dev, RdmaBackendPD *pd)
    728 {
    729     pd->ibpd = ibv_alloc_pd(backend_dev->context);
    730 
    731     if (!pd->ibpd) {
    732         rdma_error_report("ibv_alloc_pd fail, errno=%d", errno);
    733         return -EIO;
    734     }
    735 
    736     return 0;
    737 }
    738 
    739 void rdma_backend_destroy_pd(RdmaBackendPD *pd)
    740 {
    741     if (pd->ibpd) {
    742         ibv_dealloc_pd(pd->ibpd);
    743     }
    744 }
    745 
    746 int rdma_backend_create_mr(RdmaBackendMR *mr, RdmaBackendPD *pd, void *addr,
    747                            size_t length, uint64_t guest_start, int access)
    748 {
    749 #ifdef LEGACY_RDMA_REG_MR
    750     mr->ibmr = ibv_reg_mr(pd->ibpd, addr, length, access);
    751 #else
    752     mr->ibmr = ibv_reg_mr_iova(pd->ibpd, addr, length, guest_start, access);
    753 #endif
    754     if (!mr->ibmr) {
    755         rdma_error_report("ibv_reg_mr fail, errno=%d", errno);
    756         return -EIO;
    757     }
    758 
    759     mr->ibpd = pd->ibpd;
    760 
    761     return 0;
    762 }
    763 
    764 void rdma_backend_destroy_mr(RdmaBackendMR *mr)
    765 {
    766     if (mr->ibmr) {
    767         ibv_dereg_mr(mr->ibmr);
    768     }
    769 }
    770 
    771 int rdma_backend_create_cq(RdmaBackendDev *backend_dev, RdmaBackendCQ *cq,
    772                            int cqe)
    773 {
    774     int rc;
    775 
    776     cq->ibcq = ibv_create_cq(backend_dev->context, cqe + 1, NULL,
    777                              backend_dev->channel, 0);
    778     if (!cq->ibcq) {
    779         rdma_error_report("ibv_create_cq fail, errno=%d", errno);
    780         return -EIO;
    781     }
    782 
    783     rc = ibv_req_notify_cq(cq->ibcq, 0);
    784     if (rc) {
    785         rdma_warn_report("ibv_req_notify_cq fail, rc=%d, errno=%d", rc, errno);
    786     }
    787 
    788     cq->backend_dev = backend_dev;
    789 
    790     return 0;
    791 }
    792 
    793 void rdma_backend_destroy_cq(RdmaBackendCQ *cq)
    794 {
    795     if (cq->ibcq) {
    796         ibv_destroy_cq(cq->ibcq);
    797     }
    798 }
    799 
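        /*
         * A GSI QP has no host counterpart, so qp->ibqp stays NULL and the
         * function returns early.  RC and UD QPs are created on the host
         * device with the guest-requested capabilities and, when one is
         * provided, attached to the shared receive queue.
         */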
    800 int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,
    801                            RdmaBackendPD *pd, RdmaBackendCQ *scq,
    802                            RdmaBackendCQ *rcq, RdmaBackendSRQ *srq,
    803                            uint32_t max_send_wr, uint32_t max_recv_wr,
    804                            uint32_t max_send_sge, uint32_t max_recv_sge)
    805 {
    806     struct ibv_qp_init_attr attr = {};
    807 
    808     qp->ibqp = 0;
    809 
    810     switch (qp_type) {
    811     case IBV_QPT_GSI:
    812         return 0;
    813 
    814     case IBV_QPT_RC:
    815         /* fall through */
    816     case IBV_QPT_UD:
    817         /* do nothing */
    818         break;
    819 
    820     default:
    821         rdma_error_report("Unsupported QP type %d", qp_type);
    822         return -EIO;
    823     }
    824 
    825     attr.qp_type = qp_type;
    826     attr.send_cq = scq->ibcq;
    827     attr.recv_cq = rcq->ibcq;
    828     attr.cap.max_send_wr = max_send_wr;
    829     attr.cap.max_recv_wr = max_recv_wr;
    830     attr.cap.max_send_sge = max_send_sge;
    831     attr.cap.max_recv_sge = max_recv_sge;
    832     if (srq) {
    833         attr.srq = srq->ibsrq;
    834     }
    835 
    836     qp->ibqp = ibv_create_qp(pd->ibpd, &attr);
    837     if (!qp->ibqp) {
    838         rdma_error_report("ibv_create_qp fail, errno=%d", errno);
    839         return -EIO;
    840     }
    841 
    842     rdma_protected_gslist_init(&qp->cqe_ctx_list);
    843 
    844     qp->ibpd = pd->ibpd;
    845 
    846     /* TODO: Query QP to get max_inline_data and save it to be used in send */
    847 
    848     return 0;
    849 }
    850 
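        /*
         * The guest drives the INIT -> RTR -> RTS transitions; the three
         * helpers below translate each step into an ibv_modify_qp() call
         * with the attribute mask appropriate for an RC or UD QP.
         */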
    851 int rdma_backend_qp_state_init(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
    852                                uint8_t qp_type, uint32_t qkey)
    853 {
    854     struct ibv_qp_attr attr = {};
    855     int rc, attr_mask;
    856 
    857     attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT;
    858     attr.qp_state        = IBV_QPS_INIT;
    859     attr.pkey_index      = 0;
    860     attr.port_num        = backend_dev->port_num;
    861 
    862     switch (qp_type) {
    863     case IBV_QPT_RC:
    864         attr_mask |= IBV_QP_ACCESS_FLAGS;
    865         trace_rdma_backend_rc_qp_state_init(qp->ibqp->qp_num);
    866         break;
    867 
    868     case IBV_QPT_UD:
    869         attr.qkey = qkey;
    870         attr_mask |= IBV_QP_QKEY;
    871         trace_rdma_backend_ud_qp_state_init(qp->ibqp->qp_num, qkey);
    872         break;
    873 
    874     default:
    875         rdma_error_report("Unsupported QP type %d", qp_type);
    876         return -EIO;
    877     }
    878 
    879     rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    880     if (rc) {
    881         rdma_error_report("ibv_modify_qp fail, rc=%d, errno=%d", rc, errno);
    882         return -EIO;
    883     }
    884 
    885     return 0;
    886 }
    887 
    888 int rdma_backend_qp_state_rtr(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
    889                               uint8_t qp_type, uint8_t sgid_idx,
    890                               union ibv_gid *dgid, uint32_t dqpn,
    891                               uint32_t rq_psn, uint32_t qkey, bool use_qkey)
    892 {
    893     struct ibv_qp_attr attr = {};
    894     union ibv_gid ibv_gid = {
    895         .global.interface_id = dgid->global.interface_id,
    896         .global.subnet_prefix = dgid->global.subnet_prefix
    897     };
    898     int rc, attr_mask;
    899 
    900     attr.qp_state = IBV_QPS_RTR;
    901     attr_mask = IBV_QP_STATE;
    902 
    903     qp->sgid_idx = sgid_idx;
    904 
    905     switch (qp_type) {
    906     case IBV_QPT_RC:
    907         attr.path_mtu               = IBV_MTU_1024;
    908         attr.dest_qp_num            = dqpn;
    909         attr.max_dest_rd_atomic     = 1;
    910         attr.min_rnr_timer          = 12;
    911         attr.ah_attr.port_num       = backend_dev->port_num;
    912         attr.ah_attr.is_global      = 1;
    913         attr.ah_attr.grh.hop_limit  = 1;
    914         attr.ah_attr.grh.dgid       = ibv_gid;
    915         attr.ah_attr.grh.sgid_index = qp->sgid_idx;
    916         attr.rq_psn                 = rq_psn;
    917 
    918         attr_mask |= IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
    919                      IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC |
    920                      IBV_QP_MIN_RNR_TIMER;
    921 
    922         trace_rdma_backend_rc_qp_state_rtr(qp->ibqp->qp_num,
    923                                            be64_to_cpu(ibv_gid.global.
    924                                                        subnet_prefix),
    925                                            be64_to_cpu(ibv_gid.global.
    926                                                        interface_id),
    927                                            qp->sgid_idx, dqpn, rq_psn);
    928         break;
    929 
    930     case IBV_QPT_UD:
    931         if (use_qkey) {
    932             attr.qkey = qkey;
    933             attr_mask |= IBV_QP_QKEY;
    934         }
    935         trace_rdma_backend_ud_qp_state_rtr(qp->ibqp->qp_num, use_qkey ? qkey :
    936                                            0);
    937         break;
    938     }
    939 
    940     rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    941     if (rc) {
    942         rdma_error_report("ibv_modify_qp fail, rc=%d, errno=%d", rc, errno);
    943         return -EIO;
    944     }
    945 
    946     return 0;
    947 }
    948 
    949 int rdma_backend_qp_state_rts(RdmaBackendQP *qp, uint8_t qp_type,
    950                               uint32_t sq_psn, uint32_t qkey, bool use_qkey)
    951 {
    952     struct ibv_qp_attr attr = {};
    953     int rc, attr_mask;
    954 
    955     attr.qp_state = IBV_QPS_RTS;
    956     attr.sq_psn = sq_psn;
    957     attr_mask = IBV_QP_STATE | IBV_QP_SQ_PSN;
    958 
    959     switch (qp_type) {
    960     case IBV_QPT_RC:
    961         attr.timeout       = 14;
    962         attr.retry_cnt     = 7;
    963         attr.rnr_retry     = 7;
    964         attr.max_rd_atomic = 1;
    965 
    966         attr_mask |= IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY |
    967                      IBV_QP_MAX_QP_RD_ATOMIC;
    968         trace_rdma_backend_rc_qp_state_rts(qp->ibqp->qp_num, sq_psn);
    969         break;
    970 
    971     case IBV_QPT_UD:
    972         if (use_qkey) {
    973             attr.qkey = qkey;
    974             attr_mask |= IBV_QP_QKEY;
    975         }
    976         trace_rdma_backend_ud_qp_state_rts(qp->ibqp->qp_num, sq_psn,
    977                                            use_qkey ? qkey : 0);
    978         break;
    979     }
    980 
    981     rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    982     if (rc) {
    983         rdma_error_report("ibv_modify_qp fail, rc=%d, errno=%d", rc, errno);
    984         return -EIO;
    985     }
    986 
    987     return 0;
    988 }
    989 
    990 int rdma_backend_query_qp(RdmaBackendQP *qp, struct ibv_qp_attr *attr,
    991                           int attr_mask, struct ibv_qp_init_attr *init_attr)
    992 {
    993     if (!qp->ibqp) {
    994         attr->qp_state = IBV_QPS_RTS;
    995         return 0;
    996     }
    997 
    998     return ibv_query_qp(qp->ibqp, attr, attr_mask, init_attr);
    999 }
   1000 
   1001 void rdma_backend_destroy_qp(RdmaBackendQP *qp, RdmaDeviceResources *dev_res)
   1002 {
   1003     if (qp->ibqp) {
   1004         ibv_destroy_qp(qp->ibqp);
   1005     }
   1006     g_slist_foreach(qp->cqe_ctx_list.list, free_cqe_ctx, dev_res);
   1007     rdma_protected_gslist_destroy(&qp->cqe_ctx_list);
   1008 }
   1009 
   1010 int rdma_backend_create_srq(RdmaBackendSRQ *srq, RdmaBackendPD *pd,
   1011                             uint32_t max_wr, uint32_t max_sge,
   1012                             uint32_t srq_limit)
   1013 {
   1014     struct ibv_srq_init_attr srq_init_attr = {};
   1015 
   1016     srq_init_attr.attr.max_wr = max_wr;
   1017     srq_init_attr.attr.max_sge = max_sge;
   1018     srq_init_attr.attr.srq_limit = srq_limit;
   1019 
   1020     srq->ibsrq = ibv_create_srq(pd->ibpd, &srq_init_attr);
   1021     if (!srq->ibsrq) {
   1022         rdma_error_report("ibv_create_srq failed, errno=%d", errno);
   1023         return -EIO;
   1024     }
   1025 
   1026     rdma_protected_gslist_init(&srq->cqe_ctx_list);
   1027 
   1028     return 0;
   1029 }
   1030 
   1031 int rdma_backend_query_srq(RdmaBackendSRQ *srq, struct ibv_srq_attr *srq_attr)
   1032 {
   1033     if (!srq->ibsrq) {
   1034         return -EINVAL;
   1035     }
   1036 
   1037     return ibv_query_srq(srq->ibsrq, srq_attr);
   1038 }
   1039 
   1040 int rdma_backend_modify_srq(RdmaBackendSRQ *srq, struct ibv_srq_attr *srq_attr,
   1041                 int srq_attr_mask)
   1042 {
   1043     if (!srq->ibsrq) {
   1044         return -EINVAL;
   1045     }
   1046 
   1047     return ibv_modify_srq(srq->ibsrq, srq_attr, srq_attr_mask);
   1048 }
   1049 
   1050 void rdma_backend_destroy_srq(RdmaBackendSRQ *srq, RdmaDeviceResources *dev_res)
   1051 {
   1052     if (srq->ibsrq) {
   1053         ibv_destroy_srq(srq->ibsrq);
   1054     }
   1055     g_slist_foreach(srq->cqe_ctx_list.list, free_cqe_ctx, dev_res);
   1056     rdma_protected_gslist_destroy(&srq->cqe_ctx_list);
   1057 }
   1058 
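        /*
         * CHK_ATTR clamps a guest-requested device attribute to the value
         * reported by the host device, warning when the request had to be
         * lowered.  init_device_caps() applies it to the capabilities that
         * the emulated device exposes to the guest.
         */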
   1059 #define CHK_ATTR(req, dev, member, fmt) ({ \
   1060     trace_rdma_check_dev_attr(#member, dev.member, req->member); \
   1061     if (req->member > dev.member) { \
   1062         rdma_warn_report("%s = "fmt" is higher than host device capability "fmt, \
   1063                          #member, req->member, dev.member); \
   1064         req->member = dev.member; \
   1065     } \
   1066 })
   1067 
   1068 static int init_device_caps(RdmaBackendDev *backend_dev,
   1069                             struct ibv_device_attr *dev_attr)
   1070 {
   1071     struct ibv_device_attr bk_dev_attr;
   1072     int rc;
   1073 
   1074     rc = ibv_query_device(backend_dev->context, &bk_dev_attr);
   1075     if (rc) {
   1076         rdma_error_report("ibv_query_device fail, rc=%d, errno=%d", rc, errno);
   1077         return -EIO;
   1078     }
   1079 
   1080     dev_attr->max_sge = MAX_SGE;
   1081     dev_attr->max_srq_sge = MAX_SGE;
   1082 
   1083     CHK_ATTR(dev_attr, bk_dev_attr, max_mr_size, "%" PRId64);
   1084     CHK_ATTR(dev_attr, bk_dev_attr, max_qp, "%d");
   1085     CHK_ATTR(dev_attr, bk_dev_attr, max_sge, "%d");
   1086     CHK_ATTR(dev_attr, bk_dev_attr, max_cq, "%d");
   1087     CHK_ATTR(dev_attr, bk_dev_attr, max_mr, "%d");
   1088     CHK_ATTR(dev_attr, bk_dev_attr, max_pd, "%d");
   1089     CHK_ATTR(dev_attr, bk_dev_attr, max_qp_rd_atom, "%d");
   1090     CHK_ATTR(dev_attr, bk_dev_attr, max_qp_init_rd_atom, "%d");
   1091     CHK_ATTR(dev_attr, bk_dev_attr, max_ah, "%d");
   1092     CHK_ATTR(dev_attr, bk_dev_attr, max_srq, "%d");
   1093 
   1094     return 0;
   1095 }
   1096 
   1097 static inline void build_mad_hdr(struct ibv_grh *grh, union ibv_gid *sgid,
   1098                                  union ibv_gid *my_gid, int paylen)
   1099 {
   1100     grh->paylen = htons(paylen);
   1101     grh->sgid = *sgid;
   1102     grh->dgid = *my_gid;
   1103 }
   1104 
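        /*
         * A MAD delivered by the mux is copied into the next buffer parked by
         * save_mad_recv_buffer(): a GRH is synthesized in front of the
         * payload with build_mad_hdr() and the completion is reported to the
         * guest with IBV_WC_GRH set.
         */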
   1105 static void process_incoming_mad_req(RdmaBackendDev *backend_dev,
   1106                                      RdmaCmMuxMsg *msg)
   1107 {
   1108     unsigned long cqe_ctx_id;
   1109     BackendCtx *bctx;
   1110     char *mad;
   1111 
   1112     trace_mad_message("recv", msg->umad.mad, msg->umad_len);
   1113 
   1114     cqe_ctx_id = rdma_protected_gqueue_pop_int64(&backend_dev->recv_mads_list);
   1115     if (cqe_ctx_id == -ENOENT) {
   1116         rdma_warn_report("No more free MAD buffers, waiting for a while");
   1117         sleep(THR_POLL_TO);
   1118         return;
   1119     }
   1120 
   1121     bctx = rdma_rm_get_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
   1122     if (unlikely(!bctx)) {
   1123         rdma_error_report("No matching ctx for req %ld", cqe_ctx_id);
   1124         backend_dev->rdma_dev_res->stats.mad_rx_err++;
   1125         return;
   1126     }
   1127 
   1128     mad = rdma_pci_dma_map(backend_dev->dev, bctx->sge.addr,
   1129                            bctx->sge.length);
   1130     if (!mad || bctx->sge.length < msg->umad_len + MAD_HDR_SIZE) {
   1131         backend_dev->rdma_dev_res->stats.mad_rx_err++;
   1132         complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_MAD_BUFF,
   1133                       bctx->up_ctx);
   1134     } else {
   1135         struct ibv_wc wc = {};
   1136         memset(mad, 0, bctx->sge.length);
   1137         build_mad_hdr((struct ibv_grh *)mad,
   1138                       (union ibv_gid *)&msg->umad.hdr.addr.gid, &msg->hdr.sgid,
   1139                       msg->umad_len);
   1140         memcpy(&mad[MAD_HDR_SIZE], msg->umad.mad, msg->umad_len);
   1141         rdma_pci_dma_unmap(backend_dev->dev, mad, bctx->sge.length);
   1142 
   1143         wc.byte_len = msg->umad_len;
   1144         wc.status = IBV_WC_SUCCESS;
   1145         wc.wc_flags = IBV_WC_GRH;
   1146         backend_dev->rdma_dev_res->stats.mad_rx++;
   1147         comp_handler(bctx->up_ctx, &wc);
   1148     }
   1149 
   1150     g_free(bctx);
   1151     rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
   1152 }
   1153 
   1154 static inline int rdmacm_mux_can_receive(void *opaque)
   1155 {
   1156     RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;
   1157 
   1158     return rdmacm_mux_can_process_async(backend_dev);
   1159 }
   1160 
   1161 static void rdmacm_mux_read(void *opaque, const uint8_t *buf, int size)
   1162 {
   1163     RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;
   1164     RdmaCmMuxMsg *msg = (RdmaCmMuxMsg *)buf;
   1165 
   1166     trace_rdmacm_mux("read", msg->hdr.msg_type, msg->hdr.op_code);
   1167 
   1168     if (msg->hdr.msg_type != RDMACM_MUX_MSG_TYPE_REQ &&
   1169         msg->hdr.op_code != RDMACM_MUX_OP_CODE_MAD) {
   1170             rdma_error_report("Error: Not a MAD request, skipping");
   1171             return;
   1172     }
   1173     process_incoming_mad_req(backend_dev, msg);
   1174 }
   1175 
   1176 static int mad_init(RdmaBackendDev *backend_dev, CharBackend *mad_chr_be)
   1177 {
   1178     int ret;
   1179 
   1180     backend_dev->rdmacm_mux.chr_be = mad_chr_be;
   1181 
   1182     ret = qemu_chr_fe_backend_connected(backend_dev->rdmacm_mux.chr_be);
   1183     if (!ret) {
   1184         rdma_error_report("Missing chardev for MAD multiplexer");
   1185         return -EIO;
   1186     }
   1187 
   1188     rdma_protected_gqueue_init(&backend_dev->recv_mads_list);
   1189 
   1190     enable_rdmacm_mux_async(backend_dev);
   1191 
   1192     qemu_chr_fe_set_handlers(backend_dev->rdmacm_mux.chr_be,
   1193                              rdmacm_mux_can_receive, rdmacm_mux_read, NULL,
   1194                              NULL, backend_dev, NULL, true);
   1195 
   1196     return 0;
   1197 }
   1198 
   1199 static void mad_stop(RdmaBackendDev *backend_dev)
   1200 {
   1201     clean_recv_mads(backend_dev);
   1202 }
   1203 
   1204 static void mad_fini(RdmaBackendDev *backend_dev)
   1205 {
   1206     disable_rdmacm_mux_async(backend_dev);
   1207     qemu_chr_fe_disconnect(backend_dev->rdmacm_mux.chr_be);
   1208     rdma_protected_gqueue_destroy(&backend_dev->recv_mads_list);
   1209 }
   1210 
   1211 int rdma_backend_get_gid_index(RdmaBackendDev *backend_dev,
   1212                                union ibv_gid *gid)
   1213 {
   1214     union ibv_gid sgid;
   1215     int ret;
   1216     int i = 0;
   1217 
   1218     do {
   1219         ret = ibv_query_gid(backend_dev->context, backend_dev->port_num, i,
   1220                             &sgid);
   1221         i++;
   1222     } while (!ret && (memcmp(&sgid, gid, sizeof(*gid))));
   1223 
   1224     trace_rdma_backend_get_gid_index(be64_to_cpu(gid->global.subnet_prefix),
   1225                                      be64_to_cpu(gid->global.interface_id),
   1226                                      i - 1);
   1227 
   1228     return ret ? ret : i - 1;
   1229 }
   1230 
   1231 int rdma_backend_add_gid(RdmaBackendDev *backend_dev, const char *ifname,
   1232                          union ibv_gid *gid)
   1233 {
   1234     RdmaCmMuxMsg msg = {};
   1235     int ret;
   1236 
   1237     trace_rdma_backend_gid_change("add", be64_to_cpu(gid->global.subnet_prefix),
   1238                                   be64_to_cpu(gid->global.interface_id));
   1239 
   1240     msg.hdr.op_code = RDMACM_MUX_OP_CODE_REG;
   1241     memcpy(msg.hdr.sgid.raw, gid->raw, sizeof(msg.hdr.sgid));
   1242 
   1243     ret = rdmacm_mux_send(backend_dev, &msg);
   1244     if (ret) {
   1245         rdma_error_report("Failed to register GID to rdma_umadmux (%d)", ret);
   1246         return -EIO;
   1247     }
   1248 
   1249     qapi_event_send_rdma_gid_status_changed(ifname, true,
   1250                                             gid->global.subnet_prefix,
   1251                                             gid->global.interface_id);
   1252 
   1253     return ret;
   1254 }
   1255 
   1256 int rdma_backend_del_gid(RdmaBackendDev *backend_dev, const char *ifname,
   1257                          union ibv_gid *gid)
   1258 {
   1259     RdmaCmMuxMsg msg = {};
   1260     int ret;
   1261 
   1262     trace_rdma_backend_gid_change("del", be64_to_cpu(gid->global.subnet_prefix),
   1263                                   be64_to_cpu(gid->global.interface_id));
   1264 
   1265     msg.hdr.op_code = RDMACM_MUX_OP_CODE_UNREG;
   1266     memcpy(msg.hdr.sgid.raw, gid->raw, sizeof(msg.hdr.sgid));
   1267 
   1268     ret = rdmacm_mux_send(backend_dev, &msg);
   1269     if (ret) {
   1270         rdma_error_report("Failed to unregister GID from rdma_umadmux (%d)",
   1271                           ret);
   1272         return -EIO;
   1273     }
   1274 
   1275     qapi_event_send_rdma_gid_status_changed(ifname, false,
   1276                                             gid->global.subnet_prefix,
   1277                                             gid->global.interface_id);
   1278 
   1279     return 0;
   1280 }
   1281 
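        /*
         * Device bring-up: pick the ibverbs device (by name when one was
         * given, otherwise the first one found), open it, create the
         * completion channel, clamp the advertised capabilities to those of
         * the host device and connect the MAD chardev to the rdmacm-mux
         * helper.  The completion thread itself is started later, from
         * rdma_backend_start().
         */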
   1282 int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
   1283                       RdmaDeviceResources *rdma_dev_res,
   1284                       const char *backend_device_name, uint8_t port_num,
   1285                       struct ibv_device_attr *dev_attr, CharBackend *mad_chr_be)
   1286 {
   1287     int i;
   1288     int ret = 0;
   1289     int num_ibv_devices;
   1290     struct ibv_device **dev_list;
   1291 
   1292     memset(backend_dev, 0, sizeof(*backend_dev));
   1293 
   1294     backend_dev->dev = pdev;
   1295     backend_dev->port_num = port_num;
   1296     backend_dev->rdma_dev_res = rdma_dev_res;
   1297 
   1298     rdma_backend_register_comp_handler(dummy_comp_handler);
   1299 
   1300     dev_list = ibv_get_device_list(&num_ibv_devices);
   1301     if (!dev_list) {
   1302         rdma_error_report("Failed to get IB devices list");
   1303         return -EIO;
   1304     }
   1305 
   1306     if (num_ibv_devices == 0) {
   1307         rdma_error_report("No IB devices were found");
   1308         ret = -ENXIO;
   1309         goto out_free_dev_list;
   1310     }
   1311 
   1312     if (backend_device_name) {
   1313         for (i = 0; dev_list[i]; ++i) {
   1314             if (!strcmp(ibv_get_device_name(dev_list[i]),
   1315                         backend_device_name)) {
   1316                 break;
   1317             }
   1318         }
   1319 
   1320         backend_dev->ib_dev = dev_list[i];
   1321         if (!backend_dev->ib_dev) {
   1322             rdma_error_report("Failed to find IB device %s",
   1323                               backend_device_name);
   1324             ret = -EIO;
   1325             goto out_free_dev_list;
   1326         }
   1327     } else {
   1328         backend_dev->ib_dev = *dev_list;
   1329     }
   1330 
   1331     rdma_info_report("uverb device %s", backend_dev->ib_dev->dev_name);
   1332 
   1333     backend_dev->context = ibv_open_device(backend_dev->ib_dev);
   1334     if (!backend_dev->context) {
   1335         rdma_error_report("Failed to open IB device %s",
   1336                           ibv_get_device_name(backend_dev->ib_dev));
   1337         ret = -EIO;
   1338         goto out;
   1339     }
   1340 
   1341     backend_dev->channel = ibv_create_comp_channel(backend_dev->context);
   1342     if (!backend_dev->channel) {
   1343         rdma_error_report("Failed to create IB communication channel");
   1344         ret = -EIO;
   1345         goto out_close_device;
   1346     }
   1347 
   1348     ret = init_device_caps(backend_dev, dev_attr);
   1349     if (ret) {
   1350         rdma_error_report("Failed to initialize device capabilities");
   1351         ret = -EIO;
   1352         goto out_destroy_comm_channel;
   1353     }
   1354 
   1355 
   1356     ret = mad_init(backend_dev, mad_chr_be);
   1357     if (ret) {
   1358         rdma_error_report("Failed to initialize mad");
   1359         ret = -EIO;
   1360         goto out_destroy_comm_channel;
   1361     }
   1362 
   1363     backend_dev->comp_thread.run = false;
   1364     backend_dev->comp_thread.is_running = false;
   1365 
   1366     ah_cache_init();
   1367 
   1368     goto out_free_dev_list;
   1369 
   1370 out_destroy_comm_channel:
   1371     ibv_destroy_comp_channel(backend_dev->channel);
   1372 
   1373 out_close_device:
   1374     ibv_close_device(backend_dev->context);
   1375 
   1376 out_free_dev_list:
   1377     ibv_free_device_list(dev_list);
   1378 
   1379 out:
   1380     return ret;
   1381 }
   1382 
   1383 
   1384 void rdma_backend_start(RdmaBackendDev *backend_dev)
   1385 {
   1386     start_comp_thread(backend_dev);
   1387 }
   1388 
   1389 void rdma_backend_stop(RdmaBackendDev *backend_dev)
   1390 {
   1391     mad_stop(backend_dev);
   1392     stop_backend_thread(&backend_dev->comp_thread);
   1393 }
   1394 
   1395 void rdma_backend_fini(RdmaBackendDev *backend_dev)
   1396 {
   1397     mad_fini(backend_dev);
   1398     g_hash_table_destroy(ah_hash);
   1399     ibv_destroy_comp_channel(backend_dev->channel);
   1400     ibv_close_device(backend_dev->context);
   1401 }