qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git

nvme.c (50291B)


      1 /*
      2  * NVMe block driver based on vfio
      3  *
      4  * Copyright 2016 - 2018 Red Hat, Inc.
      5  *
      6  * Authors:
      7  *   Fam Zheng <famz@redhat.com>
      8  *   Paolo Bonzini <pbonzini@redhat.com>
      9  *
     10  * This work is licensed under the terms of the GNU GPL, version 2 or later.
     11  * See the COPYING file in the top-level directory.
     12  */
     13 
     14 #include "qemu/osdep.h"
     15 #include <linux/vfio.h>
     16 #include "qapi/error.h"
     17 #include "qapi/qmp/qdict.h"
     18 #include "qapi/qmp/qstring.h"
     19 #include "qemu/error-report.h"
     20 #include "qemu/main-loop.h"
     21 #include "qemu/module.h"
     22 #include "qemu/cutils.h"
     23 #include "qemu/option.h"
     24 #include "qemu/memalign.h"
     25 #include "qemu/vfio-helpers.h"
     26 #include "block/block_int.h"
     27 #include "sysemu/replay.h"
     28 #include "trace.h"
     29 
     30 #include "block/nvme.h"
     31 
     32 #define NVME_SQ_ENTRY_BYTES 64
     33 #define NVME_CQ_ENTRY_BYTES 16
     34 #define NVME_QUEUE_SIZE 128
     35 #define NVME_DOORBELL_SIZE 4096
     36 
     37 /*
     38  * We have to leave one slot empty as that is the full queue case where
     39  * head == tail + 1.
     40  */
     41 #define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
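         /*
          * Worked example: with NVME_QUEUE_SIZE of 128 there are slots 0..127,
          * but at most 127 requests are kept in flight.  If every slot could be
          * used, a full ring would have head == tail, indistinguishable from an
          * empty one; keeping one slot free lets "full" be detected as
          * head == tail + 1, as the comment above says.
          */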
     42 
     43 typedef struct BDRVNVMeState BDRVNVMeState;
     44 
     45 /* Same index is used for queues and IRQs */
     46 #define INDEX_ADMIN     0
     47 #define INDEX_IO(n)     (1 + n)
     48 
     49 /* This driver shares a single MSIX IRQ for the admin and I/O queues */
     50 enum {
     51     MSIX_SHARED_IRQ_IDX = 0,
     52     MSIX_IRQ_COUNT = 1
     53 };
     54 
     55 typedef struct {
     56     int32_t  head, tail;
     57     uint8_t  *queue;
     58     uint64_t iova;
     59     /* Hardware MMIO register */
     60     volatile uint32_t *doorbell;
     61 } NVMeQueue;
     62 
     63 typedef struct {
     64     BlockCompletionFunc *cb;
     65     void *opaque;
     66     int cid;
     67     void *prp_list_page;
     68     uint64_t prp_list_iova;
     69     int free_req_next; /* q->reqs[] index of next free req */
     70 } NVMeRequest;
     71 
     72 typedef struct {
     73     QemuMutex   lock;
     74 
     75     /* Read from I/O code path, initialized under BQL */
     76     BDRVNVMeState   *s;
     77     int             index;
     78 
     79     /* Fields protected by BQL */
     80     uint8_t     *prp_list_pages;
     81 
     82     /* Fields protected by @lock */
     83     CoQueue     free_req_queue;
     84     NVMeQueue   sq, cq;
     85     int         cq_phase;
     86     int         free_req_head;
     87     NVMeRequest reqs[NVME_NUM_REQS];
     88     int         need_kick;
     89     int         inflight;
     90 
     91     /* Thread-safe, no lock necessary */
     92     QEMUBH      *completion_bh;
     93 } NVMeQueuePair;
     94 
     95 struct BDRVNVMeState {
     96     AioContext *aio_context;
     97     QEMUVFIOState *vfio;
     98     void *bar0_wo_map;
     99     /* Memory mapped registers */
    100     volatile struct {
    101         uint32_t sq_tail;
    102         uint32_t cq_head;
    103     } *doorbells;
    104     /* The submission/completion queue pairs.
    105      * [0]: admin queue.
    106      * [1..]: io queues.
    107      */
    108     NVMeQueuePair **queues;
    109     unsigned queue_count;
    110     size_t page_size;
    111     /* How many uint32_t elements does each doorbell entry take. */
    112     size_t doorbell_scale;
    113     bool write_cache_supported;
    114     EventNotifier irq_notifier[MSIX_IRQ_COUNT];
    115 
    116     uint64_t nsze; /* Namespace size reported by identify command */
    117     int nsid;      /* The namespace id to read/write data. */
    118     int blkshift;
    119 
    120     uint64_t max_transfer;
    121     bool plugged;
    122 
    123     bool supports_write_zeroes;
    124     bool supports_discard;
    125 
    126     CoMutex dma_map_lock;
    127     CoQueue dma_flush_queue;
    128 
    129     /* Total size of mapped qiov, accessed under dma_map_lock */
    130     int dma_map_count;
    131 
    132     /* PCI address (required for nvme_refresh_filename()) */
    133     char *device;
    134 
    135     struct {
    136         uint64_t completion_errors;
    137         uint64_t aligned_accesses;
    138         uint64_t unaligned_accesses;
    139     } stats;
    140 };
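         /*
          * Doorbell layout, assuming the common case of CAP.DSTRD == 0 (4-byte
          * stride): doorbell_scale is 1 and the doorbells[] array starts
          * sizeof(NvmeBar) bytes into BAR0, so queue i rings its submission
          * queue with doorbells[i].sq_tail and its completion queue with
          * doorbells[i].cq_head, two consecutive 32-bit registers.
          */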
    141 
    142 #define NVME_BLOCK_OPT_DEVICE "device"
    143 #define NVME_BLOCK_OPT_NAMESPACE "namespace"
    144 
    145 static void nvme_process_completion_bh(void *opaque);
    146 
    147 static QemuOptsList runtime_opts = {
    148     .name = "nvme",
    149     .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    150     .desc = {
    151         {
    152             .name = NVME_BLOCK_OPT_DEVICE,
    153             .type = QEMU_OPT_STRING,
    154             .help = "NVMe PCI device address",
    155         },
    156         {
    157             .name = NVME_BLOCK_OPT_NAMESPACE,
    158             .type = QEMU_OPT_NUMBER,
    159             .help = "NVMe namespace",
    160         },
    161         { /* end of list */ }
    162     },
    163 };
    164 
    165 /* Returns true on success, false on failure. */
    166 static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
    167                             unsigned nentries, size_t entry_bytes, Error **errp)
    168 {
    169     size_t bytes;
    170     int r;
    171 
    172     bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size());
    173     q->head = q->tail = 0;
    174     q->queue = qemu_try_memalign(qemu_real_host_page_size(), bytes);
    175     if (!q->queue) {
    176         error_setg(errp, "Cannot allocate queue");
    177         return false;
    178     }
    179     memset(q->queue, 0, bytes);
    180     r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova, errp);
    181     if (r) {
    182         error_prepend(errp, "Cannot map queue: ");
    183     }
    184     return r == 0;
    185 }
    186 
    187 static void nvme_free_queue(NVMeQueue *q)
    188 {
    189     qemu_vfree(q->queue);
    190 }
    191 
    192 static void nvme_free_queue_pair(NVMeQueuePair *q)
    193 {
    194     trace_nvme_free_queue_pair(q->index, q, &q->cq, &q->sq);
    195     if (q->completion_bh) {
    196         qemu_bh_delete(q->completion_bh);
    197     }
    198     nvme_free_queue(&q->sq);
    199     nvme_free_queue(&q->cq);
    200     qemu_vfree(q->prp_list_pages);
    201     qemu_mutex_destroy(&q->lock);
    202     g_free(q);
    203 }
    204 
    205 static void nvme_free_req_queue_cb(void *opaque)
    206 {
    207     NVMeQueuePair *q = opaque;
    208 
    209     qemu_mutex_lock(&q->lock);
    210     while (q->free_req_head != -1 &&
    211            qemu_co_enter_next(&q->free_req_queue, &q->lock)) {
    212         /* Retry waiting requests */
    213     }
    214     qemu_mutex_unlock(&q->lock);
    215 }
    216 
    217 static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
    218                                              AioContext *aio_context,
    219                                              unsigned idx, size_t size,
    220                                              Error **errp)
    221 {
    222     int i, r;
    223     NVMeQueuePair *q;
    224     uint64_t prp_list_iova;
    225     size_t bytes;
    226 
    227     q = g_try_new0(NVMeQueuePair, 1);
    228     if (!q) {
    229         error_setg(errp, "Cannot allocate queue pair");
    230         return NULL;
    231     }
    232     trace_nvme_create_queue_pair(idx, q, size, aio_context,
    233                                  event_notifier_get_fd(s->irq_notifier));
    234     bytes = QEMU_ALIGN_UP(s->page_size * NVME_NUM_REQS,
    235                           qemu_real_host_page_size());
    236     q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size(), bytes);
    237     if (!q->prp_list_pages) {
    238         error_setg(errp, "Cannot allocate PRP page list");
    239         goto fail;
    240     }
    241     memset(q->prp_list_pages, 0, bytes);
    242     qemu_mutex_init(&q->lock);
    243     q->s = s;
    244     q->index = idx;
    245     qemu_co_queue_init(&q->free_req_queue);
    246     q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q);
    247     r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, bytes,
    248                           false, &prp_list_iova, errp);
    249     if (r) {
    250         error_prepend(errp, "Cannot map buffer for DMA: ");
    251         goto fail;
    252     }
    253     q->free_req_head = -1;
    254     for (i = 0; i < NVME_NUM_REQS; i++) {
    255         NVMeRequest *req = &q->reqs[i];
    256         req->cid = i + 1;
    257         req->free_req_next = q->free_req_head;
    258         q->free_req_head = i;
    259         req->prp_list_page = q->prp_list_pages + i * s->page_size;
    260         req->prp_list_iova = prp_list_iova + i * s->page_size;
    261     }
    262 
    263     if (!nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, errp)) {
    264         goto fail;
    265     }
    266     q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;
    267 
    268     if (!nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, errp)) {
    269         goto fail;
    270     }
    271     q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;
    272 
    273     return q;
    274 fail:
    275     nvme_free_queue_pair(q);
    276     return NULL;
    277 }
    278 
    279 /* With q->lock */
    280 static void nvme_kick(NVMeQueuePair *q)
    281 {
    282     BDRVNVMeState *s = q->s;
    283 
    284     if (s->plugged || !q->need_kick) {
    285         return;
    286     }
    287     trace_nvme_kick(s, q->index);
    288     assert(!(q->sq.tail & 0xFF00));
    289     /* Fence the write to submission queue entry before notifying the device. */
    290     smp_wmb();
    291     *q->sq.doorbell = cpu_to_le32(q->sq.tail);
    292     q->inflight += q->need_kick;
    293     q->need_kick = 0;
    294 }
    295 
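         /*
          * Free requests are kept on a singly linked list threaded through
          * reqs[]: free_req_head holds the index of the first free slot (-1 when
          * none is free) and each request's free_req_next holds the index of
          * the next free slot.
          */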
    296 static NVMeRequest *nvme_get_free_req_nofail_locked(NVMeQueuePair *q)
    297 {
    298     NVMeRequest *req;
    299 
    300     req = &q->reqs[q->free_req_head];
    301     q->free_req_head = req->free_req_next;
    302     req->free_req_next = -1;
    303     return req;
    304 }
    305 
    306 /* Return a free request element if any, otherwise return NULL.  */
    307 static NVMeRequest *nvme_get_free_req_nowait(NVMeQueuePair *q)
    308 {
    309     QEMU_LOCK_GUARD(&q->lock);
    310     if (q->free_req_head == -1) {
    311         return NULL;
    312     }
    313     return nvme_get_free_req_nofail_locked(q);
    314 }
    315 
    316 /*
    317  * Wait for a free request to become available if necessary, then
    318  * return it.
    319  */
    320 static coroutine_fn NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
    321 {
    322     QEMU_LOCK_GUARD(&q->lock);
    323 
    324     while (q->free_req_head == -1) {
    325         trace_nvme_free_req_queue_wait(q->s, q->index);
    326         qemu_co_queue_wait(&q->free_req_queue, &q->lock);
    327     }
    328 
    329     return nvme_get_free_req_nofail_locked(q);
    330 }
    331 
    332 /* With q->lock */
    333 static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
    334 {
    335     req->free_req_next = q->free_req_head;
    336     q->free_req_head = req - q->reqs;
    337 }
    338 
    339 /* With q->lock */
    340 static void nvme_wake_free_req_locked(NVMeQueuePair *q)
    341 {
    342     if (!qemu_co_queue_empty(&q->free_req_queue)) {
    343         replay_bh_schedule_oneshot_event(q->s->aio_context,
    344                 nvme_free_req_queue_cb, q);
    345     }
    346 }
    347 
    348 /* Insert a request in the freelist and wake waiters */
    349 static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req)
    350 {
    351     qemu_mutex_lock(&q->lock);
    352     nvme_put_free_req_locked(q, req);
    353     nvme_wake_free_req_locked(q);
    354     qemu_mutex_unlock(&q->lock);
    355 }
    356 
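         /*
          * The CQE status halfword keeps the phase tag in bit 0 and the status
          * code in bits 8:1 (the status code type in bits 11:9 is not examined).
          * Status code 0 means success; 1 (Invalid Command Opcode) maps to
          * -ENOSYS, 2 (Invalid Field in Command) to -EINVAL, and anything else
          * to -EIO.
          */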
    357 static inline int nvme_translate_error(const NvmeCqe *c)
    358 {
    359     uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
    360     if (status) {
    361         trace_nvme_error(le32_to_cpu(c->result),
    362                          le16_to_cpu(c->sq_head),
    363                          le16_to_cpu(c->sq_id),
    364                          le16_to_cpu(c->cid),
    365                          le16_to_cpu(status));
    366     }
    367     switch (status) {
    368     case 0:
    369         return 0;
    370     case 1:
    371         return -ENOSYS;
    372     case 2:
    373         return -EINVAL;
    374     default:
    375         return -EIO;
    376     }
    377 }
    378 
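         /*
          * Completion detection relies on the NVMe phase tag: the controller
          * flips the phase bit each time it wraps around the CQ, so an entry is
          * new only while its phase bit differs from q->cq_phase.  q->cq_phase
          * starts out as 0 (the CQ memory is zeroed) and is toggled below each
          * time cq.head wraps back to 0.
          */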
    379 /* With q->lock */
    380 static bool nvme_process_completion(NVMeQueuePair *q)
    381 {
    382     BDRVNVMeState *s = q->s;
    383     bool progress = false;
    384     NVMeRequest *preq;
    385     NVMeRequest req;
    386     NvmeCqe *c;
    387 
    388     trace_nvme_process_completion(s, q->index, q->inflight);
    389     if (s->plugged) {
    390         trace_nvme_process_completion_queue_plugged(s, q->index);
    391         return false;
    392     }
    393 
    394     /*
    395      * Support re-entrancy when a request cb() function invokes aio_poll().
    396      * Pending completions must be visible to aio_poll() so that a cb()
    397      * function can wait for the completion of another request.
    398      *
    399      * The aio_poll() loop will execute our BH and we'll resume completion
    400      * processing there.
    401      */
    402     qemu_bh_schedule(q->completion_bh);
    403 
    404     assert(q->inflight >= 0);
    405     while (q->inflight) {
    406         int ret;
    407         int16_t cid;
    408 
    409         c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
    410         if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
    411             break;
    412         }
    413         ret = nvme_translate_error(c);
    414         if (ret) {
    415             s->stats.completion_errors++;
    416         }
    417         q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
    418         if (!q->cq.head) {
    419             q->cq_phase = !q->cq_phase;
    420         }
    421         cid = le16_to_cpu(c->cid);
    422         if (cid == 0 || cid > NVME_QUEUE_SIZE) {
    423             warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", "
    424                         "queue size: %u", cid, NVME_QUEUE_SIZE);
    425             continue;
    426         }
    427         trace_nvme_complete_command(s, q->index, cid);
    428         preq = &q->reqs[cid - 1];
    429         req = *preq;
    430         assert(req.cid == cid);
    431         assert(req.cb);
    432         nvme_put_free_req_locked(q, preq);
    433         preq->cb = preq->opaque = NULL;
    434         q->inflight--;
    435         qemu_mutex_unlock(&q->lock);
    436         req.cb(req.opaque, ret);
    437         qemu_mutex_lock(&q->lock);
    438         progress = true;
    439     }
    440     if (progress) {
    441         /* Notify the device so it can post more completions. */
    442         smp_mb_release();
    443         *q->cq.doorbell = cpu_to_le32(q->cq.head);
    444         nvme_wake_free_req_locked(q);
    445     }
    446 
    447     qemu_bh_cancel(q->completion_bh);
    448 
    449     return progress;
    450 }
    451 
    452 static void nvme_process_completion_bh(void *opaque)
    453 {
    454     NVMeQueuePair *q = opaque;
    455 
    456     /*
    457      * We're being invoked because a nvme_process_completion() cb() function
    458      * called aio_poll(). The callback may be waiting for further completions
    459      * so notify the device that it has space to fill in more completions now.
    460      */
    461     smp_mb_release();
    462     *q->cq.doorbell = cpu_to_le32(q->cq.head);
    463     nvme_wake_free_req_locked(q);
    464 
    465     nvme_process_completion(q);
    466 }
    467 
    468 static void nvme_trace_command(const NvmeCmd *cmd)
    469 {
    470     int i;
    471 
    472     if (!trace_event_get_state_backends(TRACE_NVME_SUBMIT_COMMAND_RAW)) {
    473         return;
    474     }
    475     for (i = 0; i < 8; ++i) {
    476         uint8_t *cmdp = (uint8_t *)cmd + i * 8;
    477         trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3],
    478                                       cmdp[4], cmdp[5], cmdp[6], cmdp[7]);
    479     }
    480 }
    481 
    482 static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
    483                                 NvmeCmd *cmd, BlockCompletionFunc cb,
    484                                 void *opaque)
    485 {
    486     assert(!req->cb);
    487     req->cb = cb;
    488     req->opaque = opaque;
    489     cmd->cid = cpu_to_le16(req->cid);
    490 
    491     trace_nvme_submit_command(q->s, q->index, req->cid);
    492     nvme_trace_command(cmd);
    493     qemu_mutex_lock(&q->lock);
    494     memcpy((uint8_t *)q->sq.queue +
    495            q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
    496     q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
    497     q->need_kick++;
    498     nvme_kick(q);
    499     nvme_process_completion(q);
    500     qemu_mutex_unlock(&q->lock);
    501 }
    502 
    503 static void nvme_admin_cmd_sync_cb(void *opaque, int ret)
    504 {
    505     int *pret = opaque;
    506     *pret = ret;
    507     aio_wait_kick();
    508 }
    509 
    510 static int nvme_admin_cmd_sync(BlockDriverState *bs, NvmeCmd *cmd)
    511 {
    512     BDRVNVMeState *s = bs->opaque;
    513     NVMeQueuePair *q = s->queues[INDEX_ADMIN];
    514     AioContext *aio_context = bdrv_get_aio_context(bs);
    515     NVMeRequest *req;
    516     int ret = -EINPROGRESS;
    517     req = nvme_get_free_req_nowait(q);
    518     if (!req) {
    519         return -EBUSY;
    520     }
    521     nvme_submit_command(q, req, cmd, nvme_admin_cmd_sync_cb, &ret);
    522 
    523     AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS);
    524     return ret;
    525 }
    526 
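         /*
          * Two Identify admin commands are issued: CNS 0x1 (Identify Controller)
          * to learn global limits such as MDTS and the supported optional
          * commands, then CNS 0x0 with the namespace ID to learn the namespace
          * size and LBA format.
          */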
    527 /* Returns true on success, false on failure. */
    528 static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
    529 {
    530     BDRVNVMeState *s = bs->opaque;
    531     bool ret = false;
    532     QEMU_AUTO_VFREE union {
    533         NvmeIdCtrl ctrl;
    534         NvmeIdNs ns;
    535     } *id = NULL;
    536     NvmeLBAF *lbaf;
    537     uint16_t oncs;
    538     int r;
    539     uint64_t iova;
    540     NvmeCmd cmd = {
    541         .opcode = NVME_ADM_CMD_IDENTIFY,
    542         .cdw10 = cpu_to_le32(0x1),
    543     };
    544     size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size());
    545 
    546     id = qemu_try_memalign(qemu_real_host_page_size(), id_size);
    547     if (!id) {
    548         error_setg(errp, "Cannot allocate buffer for identify response");
    549         goto out;
    550     }
    551     r = qemu_vfio_dma_map(s->vfio, id, id_size, true, &iova, errp);
    552     if (r) {
    553         error_prepend(errp, "Cannot map buffer for DMA: ");
    554         goto out;
    555     }
    556 
    557     memset(id, 0, id_size);
    558     cmd.dptr.prp1 = cpu_to_le64(iova);
    559     if (nvme_admin_cmd_sync(bs, &cmd)) {
    560         error_setg(errp, "Failed to identify controller");
    561         goto out;
    562     }
    563 
    564     if (le32_to_cpu(id->ctrl.nn) < namespace) {
    565         error_setg(errp, "Invalid namespace");
    566         goto out;
    567     }
    568     s->write_cache_supported = le32_to_cpu(id->ctrl.vwc) & 0x1;
    569     s->max_transfer = (id->ctrl.mdts ? 1 << id->ctrl.mdts : 0) * s->page_size;
    570     /* For now the page list buffer per command is one page, to hold at most
    571      * s->page_size / sizeof(uint64_t) entries. */
    572     s->max_transfer = MIN_NON_ZERO(s->max_transfer,
    573                           s->page_size / sizeof(uint64_t) * s->page_size);
    574 
    575     oncs = le16_to_cpu(id->ctrl.oncs);
    576     s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES);
    577     s->supports_discard = !!(oncs & NVME_ONCS_DSM);
    578 
    579     memset(id, 0, id_size);
    580     cmd.cdw10 = 0;
    581     cmd.nsid = cpu_to_le32(namespace);
    582     if (nvme_admin_cmd_sync(bs, &cmd)) {
    583         error_setg(errp, "Failed to identify namespace");
    584         goto out;
    585     }
    586 
    587     s->nsze = le64_to_cpu(id->ns.nsze);
    588     lbaf = &id->ns.lbaf[NVME_ID_NS_FLBAS_INDEX(id->ns.flbas)];
    589 
    590     if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(id->ns.dlfeat) &&
    591             NVME_ID_NS_DLFEAT_READ_BEHAVIOR(id->ns.dlfeat) ==
    592                     NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) {
    593         bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
    594     }
    595 
    596     if (lbaf->ms) {
    597         error_setg(errp, "Namespaces with metadata are not yet supported");
    598         goto out;
    599     }
    600 
    601     if (lbaf->ds < BDRV_SECTOR_BITS || lbaf->ds > 12 ||
    602         (1 << lbaf->ds) > s->page_size)
    603     {
    604         error_setg(errp, "Namespace has unsupported block size (2^%d)",
    605                    lbaf->ds);
    606         goto out;
    607     }
    608 
    609     ret = true;
    610     s->blkshift = lbaf->ds;
    611 out:
    612     qemu_vfio_dma_unmap(s->vfio, id);
    613 
    614     return ret;
    615 }
    616 
    617 static void nvme_poll_queue(NVMeQueuePair *q)
    618 {
    619     const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
    620     NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
    621 
    622     trace_nvme_poll_queue(q->s, q->index);
    623     /*
    624      * Do an early check for completions. q->lock isn't needed because
    625      * nvme_process_completion() only runs in the event loop thread and
    626      * cannot race with itself.
    627      */
    628     if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
    629         return;
    630     }
    631 
    632     qemu_mutex_lock(&q->lock);
    633     while (nvme_process_completion(q)) {
    634         /* Keep polling */
    635     }
    636     qemu_mutex_unlock(&q->lock);
    637 }
    638 
    639 static void nvme_poll_queues(BDRVNVMeState *s)
    640 {
    641     int i;
    642 
    643     for (i = 0; i < s->queue_count; i++) {
    644         nvme_poll_queue(s->queues[i]);
    645     }
    646 }
    647 
    648 static void nvme_handle_event(EventNotifier *n)
    649 {
    650     BDRVNVMeState *s = container_of(n, BDRVNVMeState,
    651                                     irq_notifier[MSIX_SHARED_IRQ_IDX]);
    652 
    653     trace_nvme_handle_event(s);
    654     event_notifier_test_and_clear(n);
    655     nvme_poll_queues(s);
    656 }
    657 
    658 static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
    659 {
    660     BDRVNVMeState *s = bs->opaque;
    661     unsigned n = s->queue_count;
    662     NVMeQueuePair *q;
    663     NvmeCmd cmd;
    664     unsigned queue_size = NVME_QUEUE_SIZE;
    665 
    666     assert(n <= UINT16_MAX);
    667     q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
    668                                n, queue_size, errp);
    669     if (!q) {
    670         return false;
    671     }
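             /*
              * Create the completion queue first, then the submission queue that
              * feeds it.  In both commands cdw10 carries the 0-based queue size
              * in its upper 16 bits and the queue ID in the lower 16; for the
              * submission queue, cdw11 additionally names the completion queue
              * (same ID, upper 16 bits) that completions are posted to.
              */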
    672     cmd = (NvmeCmd) {
    673         .opcode = NVME_ADM_CMD_CREATE_CQ,
    674         .dptr.prp1 = cpu_to_le64(q->cq.iova),
    675         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
    676         .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC),
    677     };
    678     if (nvme_admin_cmd_sync(bs, &cmd)) {
    679         error_setg(errp, "Failed to create CQ io queue [%u]", n);
    680         goto out_error;
    681     }
    682     cmd = (NvmeCmd) {
    683         .opcode = NVME_ADM_CMD_CREATE_SQ,
    684         .dptr.prp1 = cpu_to_le64(q->sq.iova),
    685         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
    686         .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)),
    687     };
    688     if (nvme_admin_cmd_sync(bs, &cmd)) {
    689         error_setg(errp, "Failed to create SQ io queue [%u]", n);
    690         goto out_error;
    691     }
    692     s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
    693     s->queues[n] = q;
    694     s->queue_count++;
    695     return true;
    696 out_error:
    697     nvme_free_queue_pair(q);
    698     return false;
    699 }
    700 
    701 static bool nvme_poll_cb(void *opaque)
    702 {
    703     EventNotifier *e = opaque;
    704     BDRVNVMeState *s = container_of(e, BDRVNVMeState,
    705                                     irq_notifier[MSIX_SHARED_IRQ_IDX]);
    706     int i;
    707 
    708     for (i = 0; i < s->queue_count; i++) {
    709         NVMeQueuePair *q = s->queues[i];
    710         const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
    711         NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
    712 
    713         /*
    714          * q->lock isn't needed because nvme_process_completion() only runs in
    715          * the event loop thread and cannot race with itself.
    716          */
    717         if ((le16_to_cpu(cqe->status) & 0x1) != q->cq_phase) {
    718             return true;
    719         }
    720     }
    721     return false;
    722 }
    723 
    724 static void nvme_poll_ready(EventNotifier *e)
    725 {
    726     BDRVNVMeState *s = container_of(e, BDRVNVMeState,
    727                                     irq_notifier[MSIX_SHARED_IRQ_IDX]);
    728 
    729     nvme_poll_queues(s);
    730 }
    731 
    732 static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
    733                      Error **errp)
    734 {
    735     BDRVNVMeState *s = bs->opaque;
    736     NVMeQueuePair *q;
    737     AioContext *aio_context = bdrv_get_aio_context(bs);
    738     int ret;
    739     uint64_t cap;
    740     uint32_t ver;
    741     uint64_t timeout_ms;
    742     uint64_t deadline, now;
    743     volatile NvmeBar *regs = NULL;
    744 
    745     qemu_co_mutex_init(&s->dma_map_lock);
    746     qemu_co_queue_init(&s->dma_flush_queue);
    747     s->device = g_strdup(device);
    748     s->nsid = namespace;
    749     s->aio_context = bdrv_get_aio_context(bs);
    750     ret = event_notifier_init(&s->irq_notifier[MSIX_SHARED_IRQ_IDX], 0);
    751     if (ret) {
    752         error_setg(errp, "Failed to init event notifier");
    753         return ret;
    754     }
    755 
    756     s->vfio = qemu_vfio_open_pci(device, errp);
    757     if (!s->vfio) {
    758         ret = -EINVAL;
    759         goto out;
    760     }
    761 
    762     regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, sizeof(NvmeBar),
    763                                  PROT_READ | PROT_WRITE, errp);
    764     if (!regs) {
    765         ret = -EINVAL;
    766         goto out;
    767     }
     768     /* Perform the initialization sequence described in the NVMe spec,
     769      * section 7.6.1 "Initialization". */
    770 
    771     cap = le64_to_cpu(regs->cap);
    772     trace_nvme_controller_capability_raw(cap);
    773     trace_nvme_controller_capability("Maximum Queue Entries Supported",
    774                                      1 + NVME_CAP_MQES(cap));
    775     trace_nvme_controller_capability("Contiguous Queues Required",
    776                                      NVME_CAP_CQR(cap));
    777     trace_nvme_controller_capability("Doorbell Stride",
    778                                      1 << (2 + NVME_CAP_DSTRD(cap)));
    779     trace_nvme_controller_capability("Subsystem Reset Supported",
    780                                      NVME_CAP_NSSRS(cap));
    781     trace_nvme_controller_capability("Memory Page Size Minimum",
    782                                      1 << (12 + NVME_CAP_MPSMIN(cap)));
    783     trace_nvme_controller_capability("Memory Page Size Maximum",
    784                                      1 << (12 + NVME_CAP_MPSMAX(cap)));
    785     if (!NVME_CAP_CSS(cap)) {
    786         error_setg(errp, "Device doesn't support NVMe command set");
    787         ret = -EINVAL;
    788         goto out;
    789     }
    790 
    791     s->page_size = 1u << (12 + NVME_CAP_MPSMIN(cap));
    792     s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t);
    793     bs->bl.opt_mem_alignment = s->page_size;
    794     bs->bl.request_alignment = s->page_size;
    795     timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000);
    796 
    797     ver = le32_to_cpu(regs->vs);
    798     trace_nvme_controller_spec_version(extract32(ver, 16, 16),
    799                                        extract32(ver, 8, 8),
    800                                        extract32(ver, 0, 8));
    801 
    802     /* Reset device to get a clean state. */
    803     regs->cc = cpu_to_le32(le32_to_cpu(regs->cc) & 0xFE);
    804     /* Wait for CSTS.RDY = 0. */
    805     deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * SCALE_MS;
    806     while (NVME_CSTS_RDY(le32_to_cpu(regs->csts))) {
    807         if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
    808             error_setg(errp, "Timeout while waiting for device to reset (%"
    809                              PRId64 " ms)",
    810                        timeout_ms);
    811             ret = -ETIMEDOUT;
    812             goto out;
    813         }
    814     }
    815 
    816     s->bar0_wo_map = qemu_vfio_pci_map_bar(s->vfio, 0, 0,
    817                                            sizeof(NvmeBar) + NVME_DOORBELL_SIZE,
    818                                            PROT_WRITE, errp);
    819     s->doorbells = (void *)((uintptr_t)s->bar0_wo_map + sizeof(NvmeBar));
    820     if (!s->doorbells) {
    821         ret = -EINVAL;
    822         goto out;
    823     }
    824 
    825     /* Set up admin queue. */
    826     s->queues = g_new(NVMeQueuePair *, 1);
    827     q = nvme_create_queue_pair(s, aio_context, 0, NVME_QUEUE_SIZE, errp);
    828     if (!q) {
    829         ret = -EINVAL;
    830         goto out;
    831     }
    832     s->queues[INDEX_ADMIN] = q;
    833     s->queue_count = 1;
    834     QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000);
    835     regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) |
    836                             ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT));
    837     regs->asq = cpu_to_le64(q->sq.iova);
    838     regs->acq = cpu_to_le64(q->cq.iova);
    839 
    840     /* After setting up all control registers we can enable device now. */
    841     regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) |
    842                            (ctz32(NVME_SQ_ENTRY_BYTES) << CC_IOSQES_SHIFT) |
    843                            CC_EN_MASK);
    844     /* Wait for CSTS.RDY = 1. */
    845     now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    846     deadline = now + timeout_ms * SCALE_MS;
    847     while (!NVME_CSTS_RDY(le32_to_cpu(regs->csts))) {
    848         if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
    849             error_setg(errp, "Timeout while waiting for device to start (%"
    850                              PRId64 " ms)",
    851                        timeout_ms);
    852             ret = -ETIMEDOUT;
    853             goto out;
    854         }
    855     }
    856 
    857     ret = qemu_vfio_pci_init_irq(s->vfio, s->irq_notifier,
    858                                  VFIO_PCI_MSIX_IRQ_INDEX, errp);
    859     if (ret) {
    860         goto out;
    861     }
    862     aio_set_event_notifier(bdrv_get_aio_context(bs),
    863                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
    864                            false, nvme_handle_event, nvme_poll_cb,
    865                            nvme_poll_ready);
    866 
    867     if (!nvme_identify(bs, namespace, errp)) {
    868         ret = -EIO;
    869         goto out;
    870     }
    871 
    872     /* Set up command queues. */
    873     if (!nvme_add_io_queue(bs, errp)) {
    874         ret = -EIO;
    875     }
    876 out:
    877     if (regs) {
    878         qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)regs, 0, sizeof(NvmeBar));
    879     }
    880 
    881     /* Cleaning up is done in nvme_file_open() upon error. */
    882     return ret;
    883 }
    884 
    885 /* Parse a filename in the format of nvme://XXXX:XX:XX.X/X. Example:
    886  *
    887  *     nvme://0000:44:00.0/1
    888  *
    889  * where the "nvme://" is a fixed form of the protocol prefix, the middle part
    890  * is the PCI address, and the last part is the namespace number starting from
    891  * 1 according to the NVMe spec. */
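         /*
          * Illustrative invocations (option spellings follow runtime_opts above):
          *
          *     -drive file=nvme://0000:44:00.0/1,if=none,id=nvme0
          *
          * or, naming the "device" and "namespace" options directly:
          *
          *     -blockdev driver=nvme,node-name=nvme0,device=0000:44:00.0,namespace=1
          */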
    892 static void nvme_parse_filename(const char *filename, QDict *options,
    893                                 Error **errp)
    894 {
    895     int pref = strlen("nvme://");
    896 
    897     if (strlen(filename) > pref && !strncmp(filename, "nvme://", pref)) {
    898         const char *tmp = filename + pref;
    899         char *device;
    900         const char *namespace;
    901         unsigned long ns;
    902         const char *slash = strchr(tmp, '/');
    903         if (!slash) {
    904             qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, tmp);
    905             return;
    906         }
    907         device = g_strndup(tmp, slash - tmp);
    908         qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, device);
    909         g_free(device);
    910         namespace = slash + 1;
    911         if (*namespace && qemu_strtoul(namespace, NULL, 10, &ns)) {
    912             error_setg(errp, "Invalid namespace '%s', positive number expected",
    913                        namespace);
    914             return;
    915         }
    916         qdict_put_str(options, NVME_BLOCK_OPT_NAMESPACE,
    917                       *namespace ? namespace : "1");
    918     }
    919 }
    920 
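         /*
          * The volatile write cache is toggled with a Set Features admin
          * command: cdw10 selects feature 0x06 (Volatile Write Cache) and bit 0
          * of cdw11 is the enable flag.
          */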
    921 static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
    922                                            Error **errp)
    923 {
    924     int ret;
    925     BDRVNVMeState *s = bs->opaque;
    926     NvmeCmd cmd = {
    927         .opcode = NVME_ADM_CMD_SET_FEATURES,
    928         .nsid = cpu_to_le32(s->nsid),
    929         .cdw10 = cpu_to_le32(0x06),
    930         .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
    931     };
    932 
    933     ret = nvme_admin_cmd_sync(bs, &cmd);
    934     if (ret) {
    935         error_setg(errp, "Failed to configure NVMe write cache");
    936     }
    937     return ret;
    938 }
    939 
    940 static void nvme_close(BlockDriverState *bs)
    941 {
    942     BDRVNVMeState *s = bs->opaque;
    943 
    944     for (unsigned i = 0; i < s->queue_count; ++i) {
    945         nvme_free_queue_pair(s->queues[i]);
    946     }
    947     g_free(s->queues);
    948     aio_set_event_notifier(bdrv_get_aio_context(bs),
    949                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
    950                            false, NULL, NULL, NULL);
    951     event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
    952     qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map,
    953                             0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE);
    954     qemu_vfio_close(s->vfio);
    955 
    956     g_free(s->device);
    957 }
    958 
    959 static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
    960                           Error **errp)
    961 {
    962     const char *device;
    963     QemuOpts *opts;
    964     int namespace;
    965     int ret;
    966     BDRVNVMeState *s = bs->opaque;
    967 
    968     bs->supported_write_flags = BDRV_REQ_FUA;
    969 
    970     opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
    971     qemu_opts_absorb_qdict(opts, options, &error_abort);
    972     device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
    973     if (!device) {
    974         error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required");
    975         qemu_opts_del(opts);
    976         return -EINVAL;
    977     }
    978 
    979     namespace = qemu_opt_get_number(opts, NVME_BLOCK_OPT_NAMESPACE, 1);
    980     ret = nvme_init(bs, device, namespace, errp);
    981     qemu_opts_del(opts);
    982     if (ret) {
    983         goto fail;
    984     }
    985     if (flags & BDRV_O_NOCACHE) {
    986         if (!s->write_cache_supported) {
    987             error_setg(errp,
    988                        "NVMe controller doesn't support write cache configuration");
    989             ret = -EINVAL;
    990         } else {
    991             ret = nvme_enable_disable_write_cache(bs, !(flags & BDRV_O_NOCACHE),
    992                                                   errp);
    993         }
    994         if (ret) {
    995             goto fail;
    996         }
    997     }
    998     return 0;
    999 fail:
   1000     nvme_close(bs);
   1001     return ret;
   1002 }
   1003 
   1004 static int64_t nvme_getlength(BlockDriverState *bs)
   1005 {
   1006     BDRVNVMeState *s = bs->opaque;
   1007     return s->nsze << s->blkshift;
   1008 }
   1009 
   1010 static uint32_t nvme_get_blocksize(BlockDriverState *bs)
   1011 {
   1012     BDRVNVMeState *s = bs->opaque;
   1013     assert(s->blkshift >= BDRV_SECTOR_BITS && s->blkshift <= 12);
   1014     return UINT32_C(1) << s->blkshift;
   1015 }
   1016 
   1017 static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
   1018 {
   1019     uint32_t blocksize = nvme_get_blocksize(bs);
   1020     bsz->phys = blocksize;
   1021     bsz->log = blocksize;
   1022     return 0;
   1023 }
   1024 
   1025 /* Called with s->dma_map_lock */
   1026 static coroutine_fn int nvme_cmd_unmap_qiov(BlockDriverState *bs,
   1027                                             QEMUIOVector *qiov)
   1028 {
   1029     int r = 0;
   1030     BDRVNVMeState *s = bs->opaque;
   1031 
   1032     s->dma_map_count -= qiov->size;
   1033     if (!s->dma_map_count && !qemu_co_queue_empty(&s->dma_flush_queue)) {
   1034         r = qemu_vfio_dma_reset_temporary(s->vfio);
   1035         if (!r) {
   1036             qemu_co_queue_restart_all(&s->dma_flush_queue);
   1037         }
   1038     }
   1039     return r;
   1040 }
   1041 
   1042 /* Called with s->dma_map_lock */
   1043 static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
   1044                                           NVMeRequest *req, QEMUIOVector *qiov)
   1045 {
   1046     BDRVNVMeState *s = bs->opaque;
   1047     uint64_t *pagelist = req->prp_list_page;
   1048     int i, j, r;
   1049     int entries = 0;
   1050     Error *local_err = NULL, **errp = NULL;
   1051 
   1052     assert(qiov->size);
   1053     assert(QEMU_IS_ALIGNED(qiov->size, s->page_size));
   1054     assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t));
   1055     for (i = 0; i < qiov->niov; ++i) {
   1056         bool retry = true;
   1057         uint64_t iova;
   1058         size_t len = QEMU_ALIGN_UP(qiov->iov[i].iov_len,
   1059                                    qemu_real_host_page_size());
   1060 try_map:
   1061         r = qemu_vfio_dma_map(s->vfio,
   1062                               qiov->iov[i].iov_base,
   1063                               len, true, &iova, errp);
   1064         if (r == -ENOSPC) {
   1065             /*
   1066              * In addition to the -ENOMEM error, the VFIO_IOMMU_MAP_DMA
   1067              * ioctl returns -ENOSPC to signal the user exhausted the DMA
   1068              * mappings available for a container since Linux kernel commit
   1069              * 492855939bdb ("vfio/type1: Limit DMA mappings per container",
   1070              * April 2019, see CVE-2019-3882).
   1071              *
   1072              * This block driver already handles this error path by checking
   1073              * for the -ENOMEM error, so we directly replace -ENOSPC by
   1074              * -ENOMEM. Beside, -ENOSPC has a specific meaning for blockdev
   1075              * coroutines: it triggers BLOCKDEV_ON_ERROR_ENOSPC and
   1076              * BLOCK_ERROR_ACTION_STOP which stops the VM, asking the operator
   1077              * to add more storage to the blockdev. Not something we can do
   1078              * easily with an IOMMU :)
   1079              */
   1080             r = -ENOMEM;
   1081         }
   1082         if (r == -ENOMEM && retry) {
   1083             /*
   1084              * We exhausted the DMA mappings available for our container:
   1085              * recycle the volatile IOVA mappings.
   1086              */
   1087             retry = false;
   1088             trace_nvme_dma_flush_queue_wait(s);
   1089             if (s->dma_map_count) {
   1090                 trace_nvme_dma_map_flush(s);
   1091                 qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock);
   1092             } else {
   1093                 r = qemu_vfio_dma_reset_temporary(s->vfio);
   1094                 if (r) {
   1095                     goto fail;
   1096                 }
   1097             }
   1098             errp = &local_err;
   1099 
   1100             goto try_map;
   1101         }
   1102         if (r) {
   1103             goto fail;
   1104         }
   1105 
   1106         for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) {
   1107             pagelist[entries++] = cpu_to_le64(iova + j * s->page_size);
   1108         }
   1109         trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base,
   1110                                     qiov->iov[i].iov_len / s->page_size);
   1111     }
   1112 
   1113     s->dma_map_count += qiov->size;
   1114 
   1115     assert(entries <= s->page_size / sizeof(uint64_t));
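             /*
              * Fill in the data pointer: a single page goes in PRP1 alone,
              * exactly two pages use PRP1 and PRP2, and anything larger keeps
              * the first page in PRP1 while PRP2 points at the PRP list.  The
              * list address skips the first element because pagelist[0] was
              * already consumed by PRP1.
              */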
   1116     switch (entries) {
   1117     case 0:
   1118         abort();
   1119     case 1:
   1120         cmd->dptr.prp1 = pagelist[0];
   1121         cmd->dptr.prp2 = 0;
   1122         break;
   1123     case 2:
   1124         cmd->dptr.prp1 = pagelist[0];
   1125         cmd->dptr.prp2 = pagelist[1];
   1126         break;
   1127     default:
   1128         cmd->dptr.prp1 = pagelist[0];
   1129         cmd->dptr.prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
   1130         break;
   1131     }
   1132     trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
   1133     for (i = 0; i < entries; ++i) {
   1134         trace_nvme_cmd_map_qiov_pages(s, i, pagelist[i]);
   1135     }
   1136     return 0;
   1137 fail:
   1138     /* No need to unmap [0 - i) iovs even if we've failed, since we don't
   1139      * increment s->dma_map_count. This is okay for fixed mapping memory areas
   1140      * because they are already mapped before calling this function; for
   1141      * temporary mappings, a later nvme_cmd_(un)map_qiov will reclaim by
   1142      * calling qemu_vfio_dma_reset_temporary when necessary. */
   1143     if (local_err) {
   1144         error_reportf_err(local_err, "Cannot map buffer for DMA: ");
   1145     }
   1146     return r;
   1147 }
   1148 
   1149 typedef struct {
   1150     Coroutine *co;
   1151     int ret;
   1152     AioContext *ctx;
   1153 } NVMeCoData;
   1154 
   1155 static void nvme_rw_cb_bh(void *opaque)
   1156 {
   1157     NVMeCoData *data = opaque;
   1158     qemu_coroutine_enter(data->co);
   1159 }
   1160 
   1161 static void nvme_rw_cb(void *opaque, int ret)
   1162 {
   1163     NVMeCoData *data = opaque;
   1164     data->ret = ret;
   1165     if (!data->co) {
   1166         /* The rw coroutine hasn't yielded, don't try to enter. */
   1167         return;
   1168     }
   1169     replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data);
   1170 }
   1171 
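         /*
          * Issue one NVMe READ or WRITE for an already aligned request: cdw10
          * and cdw11 carry the 64-bit starting LBA, and cdw12 carries the
          * 0-based block count (hence the "- 1") plus the FUA bit (bit 30) when
          * requested.  The coroutine yields until nvme_rw_cb() delivers the
          * result.
          */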
   1172 static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
   1173                                             uint64_t offset, uint64_t bytes,
   1174                                             QEMUIOVector *qiov,
   1175                                             bool is_write,
   1176                                             int flags)
   1177 {
   1178     int r;
   1179     BDRVNVMeState *s = bs->opaque;
   1180     NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
   1181     NVMeRequest *req;
   1182 
   1183     uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) |
   1184                        (flags & BDRV_REQ_FUA ? 1 << 30 : 0);
   1185     NvmeCmd cmd = {
   1186         .opcode = is_write ? NVME_CMD_WRITE : NVME_CMD_READ,
   1187         .nsid = cpu_to_le32(s->nsid),
   1188         .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
   1189         .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
   1190         .cdw12 = cpu_to_le32(cdw12),
   1191     };
   1192     NVMeCoData data = {
   1193         .ctx = bdrv_get_aio_context(bs),
   1194         .ret = -EINPROGRESS,
   1195     };
   1196 
   1197     trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
   1198     assert(s->queue_count > 1);
   1199     req = nvme_get_free_req(ioq);
   1200     assert(req);
   1201 
   1202     qemu_co_mutex_lock(&s->dma_map_lock);
   1203     r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
   1204     qemu_co_mutex_unlock(&s->dma_map_lock);
   1205     if (r) {
   1206         nvme_put_free_req_and_wake(ioq, req);
   1207         return r;
   1208     }
   1209     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
   1210 
   1211     data.co = qemu_coroutine_self();
   1212     while (data.ret == -EINPROGRESS) {
   1213         qemu_coroutine_yield();
   1214     }
   1215 
   1216     qemu_co_mutex_lock(&s->dma_map_lock);
   1217     r = nvme_cmd_unmap_qiov(bs, qiov);
   1218     qemu_co_mutex_unlock(&s->dma_map_lock);
   1219     if (r) {
   1220         return r;
   1221     }
   1222 
   1223     trace_nvme_rw_done(s, is_write, offset, bytes, data.ret);
   1224     return data.ret;
   1225 }
   1226 
   1227 static inline bool nvme_qiov_aligned(BlockDriverState *bs,
   1228                                      const QEMUIOVector *qiov)
   1229 {
   1230     int i;
   1231     BDRVNVMeState *s = bs->opaque;
   1232 
   1233     for (i = 0; i < qiov->niov; ++i) {
   1234         if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base,
   1235                                  qemu_real_host_page_size()) ||
   1236             !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size())) {
   1237             trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
   1238                                       qiov->iov[i].iov_len, s->page_size);
   1239             return false;
   1240         }
   1241     }
   1242     return true;
   1243 }
   1244 
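         /*
          * Top-level read/write path: iovecs that are already aligned to the
          * host page size go straight to nvme_co_prw_aligned(); anything else is
          * bounced through a temporary aligned buffer, copied in before a write
          * and copied out after a successful read.
          */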
   1245 static coroutine_fn int nvme_co_prw(BlockDriverState *bs,
   1246                                     uint64_t offset, uint64_t bytes,
   1247                                     QEMUIOVector *qiov, bool is_write,
   1248                                     int flags)
   1249 {
   1250     BDRVNVMeState *s = bs->opaque;
   1251     int r;
   1252     QEMU_AUTO_VFREE uint8_t *buf = NULL;
   1253     QEMUIOVector local_qiov;
   1254     size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size());
   1255     assert(QEMU_IS_ALIGNED(offset, s->page_size));
   1256     assert(QEMU_IS_ALIGNED(bytes, s->page_size));
   1257     assert(bytes <= s->max_transfer);
   1258     if (nvme_qiov_aligned(bs, qiov)) {
   1259         s->stats.aligned_accesses++;
   1260         return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
   1261     }
   1262     s->stats.unaligned_accesses++;
   1263     trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
   1264     buf = qemu_try_memalign(qemu_real_host_page_size(), len);
   1265 
   1266     if (!buf) {
   1267         return -ENOMEM;
   1268     }
   1269     qemu_iovec_init(&local_qiov, 1);
   1270     if (is_write) {
   1271         qemu_iovec_to_buf(qiov, 0, buf, bytes);
   1272     }
   1273     qemu_iovec_add(&local_qiov, buf, bytes);
   1274     r = nvme_co_prw_aligned(bs, offset, bytes, &local_qiov, is_write, flags);
   1275     qemu_iovec_destroy(&local_qiov);
   1276     if (!r && !is_write) {
   1277         qemu_iovec_from_buf(qiov, 0, buf, bytes);
   1278     }
   1279     return r;
   1280 }
   1281 
   1282 static coroutine_fn int nvme_co_preadv(BlockDriverState *bs,
   1283                                        int64_t offset, int64_t bytes,
   1284                                        QEMUIOVector *qiov,
   1285                                        BdrvRequestFlags flags)
   1286 {
   1287     return nvme_co_prw(bs, offset, bytes, qiov, false, flags);
   1288 }
   1289 
   1290 static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs,
   1291                                         int64_t offset, int64_t bytes,
   1292                                         QEMUIOVector *qiov,
   1293                                         BdrvRequestFlags flags)
   1294 {
   1295     return nvme_co_prw(bs, offset, bytes, qiov, true, flags);
   1296 }
   1297 
   1298 static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
   1299 {
   1300     BDRVNVMeState *s = bs->opaque;
   1301     NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
   1302     NVMeRequest *req;
   1303     NvmeCmd cmd = {
   1304         .opcode = NVME_CMD_FLUSH,
   1305         .nsid = cpu_to_le32(s->nsid),
   1306     };
   1307     NVMeCoData data = {
   1308         .ctx = bdrv_get_aio_context(bs),
   1309         .ret = -EINPROGRESS,
   1310     };
   1311 
   1312     assert(s->queue_count > 1);
   1313     req = nvme_get_free_req(ioq);
   1314     assert(req);
   1315     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
   1316 
   1317     data.co = qemu_coroutine_self();
   1318     if (data.ret == -EINPROGRESS) {
   1319         qemu_coroutine_yield();
   1320     }
   1321 
   1322     return data.ret;
   1323 }
   1324 
   1325 
   1326 static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
   1327                                               int64_t offset,
   1328                                               int64_t bytes,
   1329                                               BdrvRequestFlags flags)
   1330 {
   1331     BDRVNVMeState *s = bs->opaque;
   1332     NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
   1333     NVMeRequest *req;
   1334     uint32_t cdw12;
   1335 
   1336     if (!s->supports_write_zeroes) {
   1337         return -ENOTSUP;
   1338     }
   1339 
   1340     if (bytes == 0) {
   1341         return 0;
   1342     }
   1343 
   1344     cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF;
   1345     /*
   1346      * We should not lose information. pwrite_zeroes_alignment and
   1347      * max_pwrite_zeroes guarantees it.
   1348      */
   1349     assert(((cdw12 + 1) << s->blkshift) == bytes);
   1350 
   1351     NvmeCmd cmd = {
   1352         .opcode = NVME_CMD_WRITE_ZEROES,
   1353         .nsid = cpu_to_le32(s->nsid),
   1354         .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
   1355         .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
   1356     };
   1357 
   1358     NVMeCoData data = {
   1359         .ctx = bdrv_get_aio_context(bs),
   1360         .ret = -EINPROGRESS,
   1361     };
   1362 
   1363     if (flags & BDRV_REQ_MAY_UNMAP) {
   1364         cdw12 |= (1 << 25);
   1365     }
   1366 
   1367     if (flags & BDRV_REQ_FUA) {
   1368         cdw12 |= (1 << 30);
   1369     }
   1370 
   1371     cmd.cdw12 = cpu_to_le32(cdw12);
   1372 
   1373     trace_nvme_write_zeroes(s, offset, bytes, flags);
   1374     assert(s->queue_count > 1);
   1375     req = nvme_get_free_req(ioq);
   1376     assert(req);
   1377 
   1378     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
   1379 
   1380     data.co = qemu_coroutine_self();
   1381     while (data.ret == -EINPROGRESS) {
   1382         qemu_coroutine_yield();
   1383     }
   1384 
   1385     trace_nvme_rw_done(s, true, offset, bytes, data.ret);
   1386     return data.ret;
   1387 }
   1388 
   1389 
   1390 static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
   1391                                          int64_t offset,
   1392                                          int64_t bytes)
   1393 {
   1394     BDRVNVMeState *s = bs->opaque;
   1395     NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
   1396     NVMeRequest *req;
   1397     QEMU_AUTO_VFREE NvmeDsmRange *buf = NULL;
   1398     QEMUIOVector local_qiov;
   1399     int ret;
   1400 
   1401     NvmeCmd cmd = {
   1402         .opcode = NVME_CMD_DSM,
   1403         .nsid = cpu_to_le32(s->nsid),
   1404         .cdw10 = cpu_to_le32(0), /*number of ranges - 0 based*/
   1405         .cdw11 = cpu_to_le32(1 << 2), /*deallocate bit*/
   1406     };
   1407 
   1408     NVMeCoData data = {
   1409         .ctx = bdrv_get_aio_context(bs),
   1410         .ret = -EINPROGRESS,
   1411     };
   1412 
   1413     if (!s->supports_discard) {
   1414         return -ENOTSUP;
   1415     }
   1416 
   1417     assert(s->queue_count > 1);
   1418 
   1419     /*
   1420      * Filling the @buf requires @offset and @bytes to satisfy restrictions
   1421      * defined in nvme_refresh_limits().
   1422      */
   1423     assert(QEMU_IS_ALIGNED(bytes, 1UL << s->blkshift));
   1424     assert(QEMU_IS_ALIGNED(offset, 1UL << s->blkshift));
   1425     assert((bytes >> s->blkshift) <= UINT32_MAX);
   1426 
   1427     buf = qemu_try_memalign(s->page_size, s->page_size);
   1428     if (!buf) {
   1429         return -ENOMEM;
   1430     }
   1431     memset(buf, 0, s->page_size);
   1432     buf->nlb = cpu_to_le32(bytes >> s->blkshift);
   1433     buf->slba = cpu_to_le64(offset >> s->blkshift);
   1434     buf->cattr = 0;
   1435 
   1436     qemu_iovec_init(&local_qiov, 1);
   1437     qemu_iovec_add(&local_qiov, buf, 4096);
   1438 
   1439     req = nvme_get_free_req(ioq);
   1440     assert(req);
   1441 
   1442     qemu_co_mutex_lock(&s->dma_map_lock);
   1443     ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
   1444     qemu_co_mutex_unlock(&s->dma_map_lock);
   1445 
   1446     if (ret) {
   1447         nvme_put_free_req_and_wake(ioq, req);
   1448         goto out;
   1449     }
   1450 
   1451     trace_nvme_dsm(s, offset, bytes);
   1452 
   1453     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
   1454 
   1455     data.co = qemu_coroutine_self();
   1456     while (data.ret == -EINPROGRESS) {
   1457         qemu_coroutine_yield();
   1458     }
   1459 
   1460     qemu_co_mutex_lock(&s->dma_map_lock);
   1461     ret = nvme_cmd_unmap_qiov(bs, &local_qiov);
   1462     qemu_co_mutex_unlock(&s->dma_map_lock);
   1463 
   1464     if (ret) {
   1465         goto out;
   1466     }
   1467 
   1468     ret = data.ret;
   1469     trace_nvme_dsm_done(s, offset, bytes, ret);
   1470 out:
   1471     qemu_iovec_destroy(&local_qiov);
   1472     return ret;
   1473 
   1474 }
   1475 
   1476 static int coroutine_fn nvme_co_truncate(BlockDriverState *bs, int64_t offset,
   1477                                          bool exact, PreallocMode prealloc,
   1478                                          BdrvRequestFlags flags, Error **errp)
   1479 {
   1480     int64_t cur_length;
   1481 
   1482     if (prealloc != PREALLOC_MODE_OFF) {
   1483         error_setg(errp, "Unsupported preallocation mode '%s'",
   1484                    PreallocMode_str(prealloc));
   1485         return -ENOTSUP;
   1486     }
   1487 
   1488     cur_length = nvme_getlength(bs);
   1489     if (offset != cur_length && exact) {
   1490         error_setg(errp, "Cannot resize NVMe devices");
   1491         return -ENOTSUP;
   1492     } else if (offset > cur_length) {
   1493         error_setg(errp, "Cannot grow NVMe devices");
   1494         return -EINVAL;
   1495     }
   1496 
   1497     return 0;
   1498 }
   1499 
   1500 static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
   1501                                BlockReopenQueue *queue, Error **errp)
   1502 {
   1503     return 0;
   1504 }
   1505 
   1506 static void nvme_refresh_filename(BlockDriverState *bs)
   1507 {
   1508     BDRVNVMeState *s = bs->opaque;
   1509 
   1510     snprintf(bs->exact_filename, sizeof(bs->exact_filename), "nvme://%s/%i",
   1511              s->device, s->nsid);
   1512 }
   1513 
   1514 static void nvme_refresh_limits(BlockDriverState *bs, Error **errp)
   1515 {
   1516     BDRVNVMeState *s = bs->opaque;
   1517 
   1518     bs->bl.opt_mem_alignment = s->page_size;
   1519     bs->bl.request_alignment = s->page_size;
   1520     bs->bl.max_transfer = s->max_transfer;
   1521 
   1522     /*
   1523      * Look at nvme_co_pwrite_zeroes: after shift and decrement we should get
   1524      * at most 0xFFFF
   1525      */
   1526     bs->bl.max_pwrite_zeroes = 1ULL << (s->blkshift + 16);
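             /* e.g. with 512-byte blocks (blkshift == 9) this is 2^25 bytes = 32 MiB */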
   1527     bs->bl.pwrite_zeroes_alignment = MAX(bs->bl.request_alignment,
   1528                                          1UL << s->blkshift);
   1529 
   1530     bs->bl.max_pdiscard = (uint64_t)UINT32_MAX << s->blkshift;
   1531     bs->bl.pdiscard_alignment = MAX(bs->bl.request_alignment,
   1532                                     1UL << s->blkshift);
   1533 }
   1534 
   1535 static void nvme_detach_aio_context(BlockDriverState *bs)
   1536 {
   1537     BDRVNVMeState *s = bs->opaque;
   1538 
   1539     for (unsigned i = 0; i < s->queue_count; i++) {
   1540         NVMeQueuePair *q = s->queues[i];
   1541 
   1542         qemu_bh_delete(q->completion_bh);
   1543         q->completion_bh = NULL;
   1544     }
   1545 
   1546     aio_set_event_notifier(bdrv_get_aio_context(bs),
   1547                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
   1548                            false, NULL, NULL, NULL);
   1549 }
   1550 
   1551 static void nvme_attach_aio_context(BlockDriverState *bs,
   1552                                     AioContext *new_context)
   1553 {
   1554     BDRVNVMeState *s = bs->opaque;
   1555 
   1556     s->aio_context = new_context;
   1557     aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
   1558                            false, nvme_handle_event, nvme_poll_cb,
   1559                            nvme_poll_ready);
   1560 
   1561     for (unsigned i = 0; i < s->queue_count; i++) {
   1562         NVMeQueuePair *q = s->queues[i];
   1563 
   1564         q->completion_bh =
   1565             aio_bh_new(new_context, nvme_process_completion_bh, q);
   1566     }
   1567 }
   1568 
   1569 static void nvme_aio_plug(BlockDriverState *bs)
   1570 {
   1571     BDRVNVMeState *s = bs->opaque;
   1572     assert(!s->plugged);
   1573     s->plugged = true;
   1574 }
   1575 
   1576 static void nvme_aio_unplug(BlockDriverState *bs)
   1577 {
   1578     BDRVNVMeState *s = bs->opaque;
   1579     assert(s->plugged);
   1580     s->plugged = false;
   1581     for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
   1582         NVMeQueuePair *q = s->queues[i];
   1583         qemu_mutex_lock(&q->lock);
   1584         nvme_kick(q);
   1585         nvme_process_completion(q);
   1586         qemu_mutex_unlock(&q->lock);
   1587     }
   1588 }
   1589 
   1590 static bool nvme_register_buf(BlockDriverState *bs, void *host, size_t size,
   1591                               Error **errp)
   1592 {
   1593     int ret;
   1594     BDRVNVMeState *s = bs->opaque;
   1595 
   1596     /*
   1597      * FIXME: we may run out of IOVA addresses after repeated
   1598      * bdrv_register_buf/bdrv_unregister_buf, because nvme_vfio_dma_unmap
   1599      * doesn't reclaim addresses for fixed mappings.
   1600      */
   1601     ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL, errp);
   1602     return ret == 0;
   1603 }
   1604 
   1605 static void nvme_unregister_buf(BlockDriverState *bs, void *host, size_t size)
   1606 {
   1607     BDRVNVMeState *s = bs->opaque;
   1608 
   1609     qemu_vfio_dma_unmap(s->vfio, host);
   1610 }
   1611 
   1612 static BlockStatsSpecific *nvme_get_specific_stats(BlockDriverState *bs)
   1613 {
   1614     BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
   1615     BDRVNVMeState *s = bs->opaque;
   1616 
   1617     stats->driver = BLOCKDEV_DRIVER_NVME;
   1618     stats->u.nvme = (BlockStatsSpecificNvme) {
   1619         .completion_errors = s->stats.completion_errors,
   1620         .aligned_accesses = s->stats.aligned_accesses,
   1621         .unaligned_accesses = s->stats.unaligned_accesses,
   1622     };
   1623 
   1624     return stats;
   1625 }
   1626 
   1627 static const char *const nvme_strong_runtime_opts[] = {
   1628     NVME_BLOCK_OPT_DEVICE,
   1629     NVME_BLOCK_OPT_NAMESPACE,
   1630 
   1631     NULL
   1632 };
   1633 
   1634 static BlockDriver bdrv_nvme = {
   1635     .format_name              = "nvme",
   1636     .protocol_name            = "nvme",
   1637     .instance_size            = sizeof(BDRVNVMeState),
   1638 
   1639     .bdrv_co_create_opts      = bdrv_co_create_opts_simple,
   1640     .create_opts              = &bdrv_create_opts_simple,
   1641 
   1642     .bdrv_parse_filename      = nvme_parse_filename,
   1643     .bdrv_file_open           = nvme_file_open,
   1644     .bdrv_close               = nvme_close,
   1645     .bdrv_getlength           = nvme_getlength,
   1646     .bdrv_probe_blocksizes    = nvme_probe_blocksizes,
   1647     .bdrv_co_truncate         = nvme_co_truncate,
   1648 
   1649     .bdrv_co_preadv           = nvme_co_preadv,
   1650     .bdrv_co_pwritev          = nvme_co_pwritev,
   1651 
   1652     .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
   1653     .bdrv_co_pdiscard         = nvme_co_pdiscard,
   1654 
   1655     .bdrv_co_flush_to_disk    = nvme_co_flush,
   1656     .bdrv_reopen_prepare      = nvme_reopen_prepare,
   1657 
   1658     .bdrv_refresh_filename    = nvme_refresh_filename,
   1659     .bdrv_refresh_limits      = nvme_refresh_limits,
   1660     .strong_runtime_opts      = nvme_strong_runtime_opts,
   1661     .bdrv_get_specific_stats  = nvme_get_specific_stats,
   1662 
   1663     .bdrv_detach_aio_context  = nvme_detach_aio_context,
   1664     .bdrv_attach_aio_context  = nvme_attach_aio_context,
   1665 
   1666     .bdrv_io_plug             = nvme_aio_plug,
   1667     .bdrv_io_unplug           = nvme_aio_unplug,
   1668 
   1669     .bdrv_register_buf        = nvme_register_buf,
   1670     .bdrv_unregister_buf      = nvme_unregister_buf,
   1671 };
   1672 
   1673 static void bdrv_nvme_init(void)
   1674 {
   1675     bdrv_register(&bdrv_nvme);
   1676 }
   1677 
   1678 block_init(bdrv_nvme_init);