qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git

vfio-helpers.c (26648B)


/*
 * VFIO utility
 *
 * Copyright 2016 - 2018 Red Hat, Inc.
 *
 * Authors:
 *   Fam Zheng <famz@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#include "qapi/error.h"
#include "exec/ramlist.h"
#include "exec/cpu-common.h"
#include "exec/memory.h"
#include "trace.h"
#include "qemu/error-report.h"
#include "standard-headers/linux/pci_regs.h"
#include "qemu/event_notifier.h"
#include "qemu/vfio-helpers.h"
#include "qemu/lockable.h"

#define QEMU_VFIO_DEBUG 0

#define QEMU_VFIO_IOVA_MIN 0x10000ULL
/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
 * we can use a runtime limit; alternatively it's also possible to do platform
 * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
 */
#define QEMU_VFIO_IOVA_MAX (1ULL << 39)

typedef struct {
    /* Page aligned addr. */
    void *host;
    size_t size;
    uint64_t iova;
} IOVAMapping;

struct IOVARange {
    uint64_t start;
    uint64_t end;
};

struct QEMUVFIOState {
    QemuMutex lock;

    /* These fields are protected by BQL */
    int container;
    int group;
    int device;
    RAMBlockNotifier ram_notifier;
    struct vfio_region_info config_region_info, bar_region_info[6];
    struct IOVARange *usable_iova_ranges;
    uint8_t nb_iova_ranges;

    /* These fields are protected by @lock */
    /* VFIO's IO virtual address space is managed by splitting it into a few
     * sections:
     *
     * ---------------       <= 0
     * |xxxxxxxxxxxxx|
     * |-------------|       <= QEMU_VFIO_IOVA_MIN
     * |             |
     * |    Fixed    |
     * |             |
     * |-------------|       <= low_water_mark
     * |             |
     * |    Free     |
     * |             |
     * |-------------|       <= high_water_mark
     * |             |
     * |    Temp     |
     * |             |
     * |-------------|       <= QEMU_VFIO_IOVA_MAX
     * |xxxxxxxxxxxxx|
     * |xxxxxxxxxxxxx|
     * ---------------
     *
     * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
     *
     * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
     *   [QEMU_VFIO_IOVA_MIN, low_water_mark).  Once allocated they will not be
     *   reclaimed - low_water_mark never shrinks;
     *
     * - IOVAs in range [low_water_mark, high_water_mark) are free;
     *
     * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
     *   mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
     *   is recycled. The caller should make sure I/Os depending on these
     *   mappings are completed before calling.
     */
    uint64_t low_water_mark;
    uint64_t high_water_mark;
    IOVAMapping *mappings;
    int nr_mappings;
};
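
/*
 * Illustrative sketch (editor's note, not part of the original file): how the
 * water marks move. Starting from low_water_mark == QEMU_VFIO_IOVA_MIN
 * (0x10000) and high_water_mark == QEMU_VFIO_IOVA_MAX (1ULL << 39):
 *
 *   - A fixed mapping of 0x20000 bytes is assigned iova 0x10000 and bumps
 *     low_water_mark to 0x30000, where it stays for the state's lifetime.
 *
 *   - A temporary mapping of 0x10000 bytes is assigned
 *     iova = high_water_mark - 0x10000 and lowers high_water_mark to that
 *     value; a later qemu_vfio_dma_reset_temporary() restores it to
 *     QEMU_VFIO_IOVA_MAX with a single VFIO_IOMMU_UNMAP_DMA call.
 */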

/**
 * Find the group file by the PCI device address specified by @device, and
 * return the path. The returned string is owned by the caller and should be
 * g_free'd later.
 */
static char *sysfs_find_group_file(const char *device, Error **errp)
{
    char *sysfs_link;
    char *sysfs_group;
    char *p;
    char *path = NULL;

    sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device);
    sysfs_group = g_malloc0(PATH_MAX);
    if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) {
        error_setg_errno(errp, errno, "Failed to find iommu group sysfs path");
        goto out;
    }
    p = strrchr(sysfs_group, '/');
    if (!p) {
        error_setg(errp, "Failed to find iommu group number");
        goto out;
    }

    path = g_strdup_printf("/dev/vfio/%s", p + 1);
out:
    g_free(sysfs_link);
    g_free(sysfs_group);
    return path;
}
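
/*
 * Example (editor's note; hypothetical device address and group number, for
 * illustration only): for device "0000:01:00.0", the symlink
 * /sys/bus/pci/devices/0000:01:00.0/iommu_group might point at
 * /sys/kernel/iommu_groups/42; the basename "42" then yields the group file
 * path "/dev/vfio/42".
 */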

static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
{
    assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info));
}

static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
{
    g_autofree char *barname = NULL;
    assert_bar_index_valid(s, index);
    s->bar_region_info[index] = (struct vfio_region_info) {
        .index = VFIO_PCI_BAR0_REGION_INDEX + index,
        .argsz = sizeof(struct vfio_region_info),
    };
    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) {
        error_setg_errno(errp, errno, "Failed to get BAR region info");
        return -errno;
    }
    barname = g_strdup_printf("bar[%d]", index);
    trace_qemu_vfio_region_info(barname, s->bar_region_info[index].offset,
                                s->bar_region_info[index].size,
                                s->bar_region_info[index].cap_offset);

    return 0;
}

/**
 * Map a PCI BAR area.
 */
void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
                            uint64_t offset, uint64_t size, int prot,
                            Error **errp)
{
    void *p;
    assert(QEMU_IS_ALIGNED(offset, qemu_real_host_page_size()));
    assert_bar_index_valid(s, index);
    p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
             prot, MAP_SHARED,
             s->device, s->bar_region_info[index].offset + offset);
    trace_qemu_vfio_pci_map_bar(index, s->bar_region_info[index].offset,
                                size, offset, p);
    if (p == MAP_FAILED) {
        error_setg_errno(errp, errno, "Failed to map BAR region");
        p = NULL;
    }
    return p;
}
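
/*
 * Usage sketch (editor's note, illustrative only; "s" is an open
 * QEMUVFIOState and the device is assumed to expose a BAR 0 of at least one
 * host page):
 *
 *     Error *err = NULL;
 *     void *regs = qemu_vfio_pci_map_bar(s, 0, 0, 4096,
 *                                        PROT_READ | PROT_WRITE, &err);
 *     if (!regs) {
 *         error_report_err(err);
 *     } else {
 *         ... access device registers through "regs" ...
 *         qemu_vfio_pci_unmap_bar(s, 0, regs, 0, 4096);
 *     }
 */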

/**
 * Unmap a PCI BAR area.
 */
void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
                             uint64_t offset, uint64_t size)
{
    if (bar) {
        munmap(bar, MIN(size, s->bar_region_info[index].size - offset));
    }
}

/**
 * Initialize device IRQ with @irq_type and register an event notifier.
 */
int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
                           int irq_type, Error **errp)
{
    int r;
    struct vfio_irq_set *irq_set;
    size_t irq_set_size;
    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };

    irq_info.index = irq_type;
    if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
        error_setg_errno(errp, errno, "Failed to get device interrupt info");
        return -errno;
    }
    if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
        error_setg(errp, "Device interrupt doesn't support eventfd");
        return -EINVAL;
    }

    irq_set_size = sizeof(*irq_set) + sizeof(int);
    irq_set = g_malloc0(irq_set_size);

    /* Get to a known IRQ state */
    *irq_set = (struct vfio_irq_set) {
        .argsz = irq_set_size,
        .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = irq_info.index,
        .start = 0,
        .count = 1,
    };

    *(int *)&irq_set->data = event_notifier_get_fd(e);
    r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (r) {
        error_setg_errno(errp, errno, "Failed to setup device interrupt");
        return -errno;
    }
    return 0;
}
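
/*
 * Usage sketch (editor's note, illustrative only; assumes the device
 * supports MSI-X): wiring vector 0 to an EventNotifier so completions can be
 * waited on via its file descriptor:
 *
 *     EventNotifier e;
 *     Error *err = NULL;
 *     event_notifier_init(&e, 0);
 *     if (qemu_vfio_pci_init_irq(s, &e, VFIO_PCI_MSIX_IRQ_INDEX, &err)) {
 *         error_report_err(err);
 *     }
 *     ... the eventfd from event_notifier_get_fd(&e) is now signalled on
 *     each interrupt delivery ...
 */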

static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
                                     int size, int ofs)
{
    int ret;

    trace_qemu_vfio_pci_read_config(buf, ofs, size,
                                    s->config_region_info.offset,
                                    s->config_region_info.size);
    assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
    do {
        ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
    } while (ret == -1 && errno == EINTR);
    return ret == size ? 0 : -errno;
}

static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs)
{
    int ret;

    trace_qemu_vfio_pci_write_config(buf, ofs, size,
                                     s->config_region_info.offset,
                                     s->config_region_info.size);
    assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
    do {
        ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
    } while (ret == -1 && errno == EINTR);
    return ret == size ? 0 : -errno;
}

static void collect_usable_iova_ranges(QEMUVFIOState *s, void *buf)
{
    struct vfio_iommu_type1_info *info = (struct vfio_iommu_type1_info *)buf;
    struct vfio_info_cap_header *cap = (void *)buf + info->cap_offset;
    struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
    int i;

    while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
        if (!cap->next) {
            return;
        }
        cap = (struct vfio_info_cap_header *)(buf + cap->next);
    }

    cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;

    s->nb_iova_ranges = cap_iova_range->nr_iovas;
    if (s->nb_iova_ranges > 1) {
        s->usable_iova_ranges =
            g_renew(struct IOVARange, s->usable_iova_ranges,
                    s->nb_iova_ranges);
    }

    for (i = 0; i < s->nb_iova_ranges; i++) {
        s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
        s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
    }
}

static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
                              Error **errp)
{
    int ret;
    int i;
    uint16_t pci_cmd;
    struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
    struct vfio_iommu_type1_info *iommu_info = NULL;
    size_t iommu_info_size = sizeof(*iommu_info);
    struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
    char *group_file = NULL;

    s->usable_iova_ranges = NULL;

    /* Create a new container */
    s->container = open("/dev/vfio/vfio", O_RDWR);

    if (s->container == -1) {
        error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
        return -errno;
    }
    if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
        error_setg(errp, "Invalid VFIO version");
        ret = -EINVAL;
        goto fail_container;
    }

    if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
        error_setg_errno(errp, errno, "VFIO IOMMU Type1 is not supported");
        ret = -EINVAL;
        goto fail_container;
    }

    /* Open the group */
    group_file = sysfs_find_group_file(device, errp);
    if (!group_file) {
        ret = -EINVAL;
        goto fail_container;
    }

    s->group = open(group_file, O_RDWR);
    if (s->group == -1) {
        error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
                         group_file);
        g_free(group_file);
        ret = -errno;
        goto fail_container;
    }
    g_free(group_file);

    /* Test that the group is viable and available */
    if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
        error_setg_errno(errp, errno, "Failed to get VFIO group status");
        ret = -errno;
        goto fail;
    }

    if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_setg(errp, "VFIO group is not viable");
        ret = -EINVAL;
        goto fail;
    }

    /* Add the group to the container */
    if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
        error_setg_errno(errp, errno, "Failed to add group to VFIO container");
        ret = -errno;
        goto fail;
    }

    /* Enable the IOMMU model we want */
    if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
        error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
        ret = -errno;
        goto fail;
    }

    iommu_info = g_malloc0(iommu_info_size);
    iommu_info->argsz = iommu_info_size;

    /* Get additional IOMMU info */
    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
        error_setg_errno(errp, errno, "Failed to get IOMMU info");
        ret = -errno;
        goto fail;
    }

    /*
     * If the kernel does not report usable IOVA regions, choose
     * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX - 1] region
     */
    s->nb_iova_ranges = 1;
    s->usable_iova_ranges = g_new0(struct IOVARange, 1);
    s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
    s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;

    if (iommu_info->argsz > iommu_info_size) {
        iommu_info_size = iommu_info->argsz;
        iommu_info = g_realloc(iommu_info, iommu_info_size);
        if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
            ret = -errno;
            goto fail;
        }
        collect_usable_iova_ranges(s, iommu_info);
    }

    s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);

    if (s->device < 0) {
        error_setg_errno(errp, errno, "Failed to get device fd");
        ret = -errno;
        goto fail;
    }

    /* Test and setup the device */
    if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
        error_setg_errno(errp, errno, "Failed to get device info");
        ret = -errno;
        goto fail;
    }

    if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
        error_setg(errp, "Invalid device regions");
        ret = -EINVAL;
        goto fail;
    }

    s->config_region_info = (struct vfio_region_info) {
        .index = VFIO_PCI_CONFIG_REGION_INDEX,
        .argsz = sizeof(struct vfio_region_info),
    };
    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
        error_setg_errno(errp, errno, "Failed to get config region info");
        ret = -errno;
        goto fail;
    }
    trace_qemu_vfio_region_info("config", s->config_region_info.offset,
                                s->config_region_info.size,
                                s->config_region_info.cap_offset);

    for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
        ret = qemu_vfio_pci_init_bar(s, i, errp);
        if (ret) {
            goto fail;
        }
    }

    /* Enable bus master */
    ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
    if (ret) {
        goto fail;
    }
    pci_cmd |= PCI_COMMAND_MASTER;
    ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
    if (ret) {
        goto fail;
    }
    g_free(iommu_info);
    return 0;
fail:
    g_free(s->usable_iova_ranges);
    s->usable_iova_ranges = NULL;
    s->nb_iova_ranges = 0;
    g_free(iommu_info);
    close(s->group);
fail_container:
    close(s->container);
    return ret;
}
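
/*
 * Recap (editor's note): the bring-up above follows the canonical VFIO type1
 * flow, in this order:
 *
 *   1. open("/dev/vfio/vfio")                 -> container fd
 *   2. VFIO_GET_API_VERSION / VFIO_CHECK_EXTENSION(VFIO_TYPE1_IOMMU)
 *   3. open("/dev/vfio/<group>")              -> group fd
 *   4. VFIO_GROUP_GET_STATUS (group must be viable)
 *   5. VFIO_GROUP_SET_CONTAINER(&container)
 *   6. VFIO_SET_IOMMU(VFIO_TYPE1_IOMMU)
 *   7. VFIO_IOMMU_GET_INFO (optionally yields usable IOVA ranges)
 *   8. VFIO_GROUP_GET_DEVICE_FD("<BDF>")      -> device fd
 */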

static void qemu_vfio_ram_block_added(RAMBlockNotifier *n, void *host,
                                      size_t size, size_t max_size)
{
    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
    Error *local_err = NULL;
    int ret;

    trace_qemu_vfio_ram_block_added(s, host, max_size);
    ret = qemu_vfio_dma_map(s, host, max_size, false, NULL, &local_err);
    if (ret) {
        error_reportf_err(local_err,
                          "qemu_vfio_dma_map(%p, %zu) failed: ",
                          host, max_size);
    }
}

static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n, void *host,
                                        size_t size, size_t max_size)
{
    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
    if (host) {
        trace_qemu_vfio_ram_block_removed(s, host, max_size);
        qemu_vfio_dma_unmap(s, host);
    }
}

static void qemu_vfio_open_common(QEMUVFIOState *s)
{
    qemu_mutex_init(&s->lock);
    s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
    s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
    s->low_water_mark = QEMU_VFIO_IOVA_MIN;
    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
    ram_block_notifier_add(&s->ram_notifier);
}

/**
 * Open a PCI device, e.g. "0000:00:01.0".
 */
QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
{
    int r;
    QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);

    /*
     * VFIO may pin all memory inside mappings, resulting in it pinning
     * all memory inside RAM blocks unconditionally.
     */
    r = ram_block_discard_disable(true);
    if (r) {
        error_setg_errno(errp, -r, "Cannot set discarding of RAM broken");
        g_free(s);
        return NULL;
    }

    r = qemu_vfio_init_pci(s, device, errp);
    if (r) {
        ram_block_discard_disable(false);
        g_free(s);
        return NULL;
    }
    qemu_vfio_open_common(s);
    return s;
}
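
/*
 * Lifecycle sketch (editor's note, illustrative only; the device address is
 * hypothetical):
 *
 *     Error *err = NULL;
 *     QEMUVFIOState *s = qemu_vfio_open_pci("0000:01:00.0", &err);
 *     if (!s) {
 *         error_report_err(err);
 *         return;
 *     }
 *     ... map BARs, set up IRQs, issue DMA ...
 *     qemu_vfio_close(s);
 */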

static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
{
    for (int i = 0; i < s->nr_mappings; ++i) {
        trace_qemu_vfio_dump_mapping(s->mappings[i].host,
                                     s->mappings[i].iova,
                                     s->mappings[i].size);
    }
}

/**
 * Find the mapping entry that contains [host, host + size) and set @index to
 * its position. If no entry contains it, @index is the position _after_ which
 * to insert the new mapping. IOW, it is the index of the largest element that
 * is smaller than @host, or -1 if there is none.
 */
static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
                                           int *index)
{
    IOVAMapping *p = s->mappings;
    IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
    IOVAMapping *mid;
    trace_qemu_vfio_find_mapping(s, host);
    if (!p) {
        *index = -1;
        return NULL;
    }
    while (true) {
        mid = p + (q - p) / 2;
        if (mid == p) {
            break;
        }
        if (mid->host > host) {
            q = mid;
        } else if (mid->host < host) {
            p = mid;
        } else {
            break;
        }
    }
    if (mid->host > host) {
        mid--;
    } else if (mid < &s->mappings[s->nr_mappings - 1]
               && (mid + 1)->host <= host) {
        mid++;
    }
    *index = mid - &s->mappings[0];
    if (mid >= &s->mappings[0] &&
        mid->host <= host && mid->host + mid->size > host) {
        assert(mid < &s->mappings[s->nr_mappings]);
        return mid;
    }
    /* At this point *index + 1 is the right position to insert the new
     * mapping. */
    return NULL;
}

/**
 * Allocate IOVA and create a new mapping record and insert it in @s.
 */
static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
                                          void *host, size_t size,
                                          int index, uint64_t iova)
{
    int shift;
    IOVAMapping m = {.host = host, .size = size, .iova = iova};
    IOVAMapping *insert;

    assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size()));
    assert(QEMU_IS_ALIGNED(s->low_water_mark, qemu_real_host_page_size()));
    assert(QEMU_IS_ALIGNED(s->high_water_mark, qemu_real_host_page_size()));
    trace_qemu_vfio_new_mapping(s, host, size, index, iova);

    assert(index >= 0);
    s->nr_mappings++;
    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
    insert = &s->mappings[index];
    shift = s->nr_mappings - index - 1;
    if (shift) {
        memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
    }
    *insert = m;
    return insert;
}

/* Do the DMA mapping with VFIO. */
static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
                                uint64_t iova, Error **errp)
{
    struct vfio_iommu_type1_dma_map dma_map = {
        .argsz = sizeof(dma_map),
        .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
        .iova = iova,
        .vaddr = (uintptr_t)host,
        .size = size,
    };
    trace_qemu_vfio_do_mapping(s, host, iova, size);

    if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
        error_setg_errno(errp, errno, "VFIO_MAP_DMA failed");
        return -errno;
    }
    return 0;
}

/**
 * Undo the DMA mapping from @s with VFIO, and remove it from the mapping list.
 */
static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
                                   Error **errp)
{
    int index;
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = mapping->iova,
        .size = mapping->size,
    };

    index = mapping - s->mappings;
    assert(mapping->size > 0);
    assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size()));
    assert(index >= 0 && index < s->nr_mappings);
    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
    }
    memmove(mapping, &s->mappings[index + 1],
            sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
    s->nr_mappings--;
    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
}

/* Check if the mapping list is (ascending) ordered. */
static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
{
    int i;
    if (QEMU_VFIO_DEBUG) {
        for (i = 0; i < s->nr_mappings - 1; ++i) {
            if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
                error_report("item %d not sorted!", i);
                qemu_vfio_dump_mappings(s);
                return false;
            }
            if (!(s->mappings[i].host + s->mappings[i].size <=
                  s->mappings[i + 1].host)) {
                error_report("item %d overlap with next!", i);
                qemu_vfio_dump_mappings(s);
                return false;
            }
        }
    }
    return true;
}

static bool qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size,
                                      uint64_t *iova, Error **errp)
{
    int i;

    for (i = 0; i < s->nb_iova_ranges; i++) {
        if (s->usable_iova_ranges[i].end < s->low_water_mark) {
            continue;
        }
        s->low_water_mark =
            MAX(s->low_water_mark, s->usable_iova_ranges[i].start);

        if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
            s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
            *iova = s->low_water_mark;
            s->low_water_mark += size;
            return true;
        }
    }
    error_setg(errp, "fixed iova range not found");

    return false;
}
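
/*
 * Worked example (editor's note; hypothetical numbers): suppose the kernel
 * reports usable ranges [0x10000, 0xfedfffff] and [0xfef00000, ...], and
 * low_water_mark is 0xfede0000. A fixed allocation of size 0x40000 does not
 * fit in the first range (only 0x20000 bytes remain), so the loop advances
 * low_water_mark to 0xfef00000, returns that as *iova, and leaves
 * low_water_mark at 0xfef40000. The "+ 1 == 0" clause covers the overflow
 * case where a range ends at the very top of the 64-bit IOVA space.
 */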

static bool qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size,
                                     uint64_t *iova, Error **errp)
{
    int i;

    for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
        if (s->usable_iova_ranges[i].start > s->high_water_mark) {
            continue;
        }
        s->high_water_mark =
            MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);

        if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size ||
            s->high_water_mark - s->usable_iova_ranges[i].start + 1 == 0) {
            *iova = s->high_water_mark - size;
            s->high_water_mark = *iova;
            return true;
        }
    }
    error_setg(errp, "temporary iova range not found");

    return false;
}

/**
 * qemu_vfio_water_mark_reached:
 *
 * Returns %true if the high water mark has been reached, %false otherwise.
 */
static bool qemu_vfio_water_mark_reached(QEMUVFIOState *s, size_t size,
                                         Error **errp)
{
    if (s->high_water_mark - s->low_water_mark + 1 < size) {
        error_setg(errp, "iova exhausted (water mark reached)");
        return true;
    }
    return false;
}

/* Map the [host, host + size) area into a contiguous IOVA address space, and
 * store the result in @iova if not NULL. The caller needs to make sure the
 * area is aligned to the host page size, and mustn't overlap existing mapping
 * areas (split mapping status within this area is not allowed).
 */
int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
                      bool temporary, uint64_t *iova, Error **errp)
{
    int index;
    IOVAMapping *mapping;
    uint64_t iova0;

    assert(QEMU_PTR_IS_ALIGNED(host, qemu_real_host_page_size()));
    assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size()));
    trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
    QEMU_LOCK_GUARD(&s->lock);
    mapping = qemu_vfio_find_mapping(s, host, &index);
    if (mapping) {
        iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
    } else {
        int ret;

        if (qemu_vfio_water_mark_reached(s, size, errp)) {
            return -ENOMEM;
        }
        if (!temporary) {
            if (!qemu_vfio_find_fixed_iova(s, size, &iova0, errp)) {
                return -ENOMEM;
            }

            mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
            assert(qemu_vfio_verify_mappings(s));
            ret = qemu_vfio_do_mapping(s, host, size, iova0, errp);
            if (ret < 0) {
                qemu_vfio_undo_mapping(s, mapping, NULL);
                return ret;
            }
            qemu_vfio_dump_mappings(s);
        } else {
            if (!qemu_vfio_find_temp_iova(s, size, &iova0, errp)) {
                return -ENOMEM;
            }
            ret = qemu_vfio_do_mapping(s, host, size, iova0, errp);
            if (ret < 0) {
                return ret;
            }
        }
    }
    trace_qemu_vfio_dma_mapped(s, host, iova0, size);
    if (iova) {
        *iova = iova0;
    }
    return 0;
}
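
/*
 * Usage sketch (editor's note, illustrative only): mapping a page-aligned
 * bounce buffer temporarily for one I/O, then recycling the temporary IOVA
 * space once the I/O has completed:
 *
 *     uint64_t iova;
 *     Error *err = NULL;
 *     void *buf = qemu_memalign(qemu_real_host_page_size(),
 *                               qemu_real_host_page_size());
 *     if (qemu_vfio_dma_map(s, buf, qemu_real_host_page_size(),
 *                           true, &iova, &err) == 0) {
 *         ... program the device with "iova" and wait for completion ...
 *         qemu_vfio_dma_reset_temporary(s);
 *     }
 */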

/* Reset the high water mark and free all "temporary" mappings. */
int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = s->high_water_mark,
        .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
    };
    trace_qemu_vfio_dma_reset_temporary(s);
    QEMU_LOCK_GUARD(&s->lock);
    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
        return -errno;
    }
    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
    return 0;
}

/* Unmap the whole area that was previously mapped with
 * qemu_vfio_dma_map(). */
void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
{
    int index = 0;
    IOVAMapping *m;

    if (!host) {
        return;
    }

    trace_qemu_vfio_dma_unmap(s, host);
    QEMU_LOCK_GUARD(&s->lock);
    m = qemu_vfio_find_mapping(s, host, &index);
    if (!m) {
        return;
    }
    qemu_vfio_undo_mapping(s, m, NULL);
}

static void qemu_vfio_reset(QEMUVFIOState *s)
{
    ioctl(s->device, VFIO_DEVICE_RESET);
}

/* Close and free the VFIO resources. */
void qemu_vfio_close(QEMUVFIOState *s)
{
    if (!s) {
        return;
    }

    ram_block_notifier_remove(&s->ram_notifier);

    /*
     * qemu_vfio_undo_mapping() removes an entry and shifts the rest of the
     * array down, so drain the list from the tail instead of indexing
     * forward over a shrinking list (which would skip entries).
     */
    while (s->nr_mappings) {
        qemu_vfio_undo_mapping(s, &s->mappings[s->nr_mappings - 1], NULL);
    }

    g_free(s->usable_iova_ranges);
    s->nb_iova_ranges = 0;
    qemu_vfio_reset(s);
    close(s->device);
    close(s->group);
    close(s->container);
    ram_block_discard_disable(false);
}