qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git

spapr.c (9090B)


/*
 * DMA memory preregistration
 *
 * Authors:
 *  Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>

#include "hw/vfio/vfio-common.h"
#include "hw/hw.h"
#include "exec/ram_addr.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "trace.h"

static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
{
    if (memory_region_is_iommu(section->mr)) {
        hw_error("Cannot possibly preregister IOMMU memory");
    }

    return !memory_region_is_ram(section->mr) ||
            memory_region_is_ram_device(section->mr);
}

static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
{
    return memory_region_get_ram_ptr(section->mr) +
        section->offset_within_region +
        (gpa - section->offset_within_address_space);
}
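
/*
 * Illustrative example (numbers are made up): for a section that starts at
 * guest physical 0x80000000 with offset_within_region 0, a gpa of
 * 0x80042000 resolves to memory_region_get_ram_ptr(section->mr) + 0x42000.
 */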

static void vfio_prereg_listener_region_add(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            prereg_listener);
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask();
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_add_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    memory_region_ref(section->mr);

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
    trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0);
    if (ret) {
        /*
         * On the initfn path, store the first error in the container so we
         * can fail gracefully.  At runtime there's not much we can do other
         * than throw a hardware error.
         */
        if (!container->initialized) {
            if (!container->error) {
                error_setg_errno(&container->error, -ret,
                                 "Memory registering failed");
            }
        } else {
            hw_error("vfio: Memory registering failed, unable to continue");
        }
    }
}

static void vfio_prereg_listener_region_del(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            prereg_listener);
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask();
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_del_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
    trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0);
}

const MemoryListener vfio_prereg_listener = {
    .name = "vfio-pre-reg",
    .region_add = vfio_prereg_listener_region_add,
    .region_del = vfio_prereg_listener_region_del,
};
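
/*
 * This listener only walks guest RAM.  In QEMU's sPAPR VFIO container setup
 * it is registered against the guest physical address space, roughly like
 * the sketch below (the exact call site lives in the common VFIO container
 * code and varies between QEMU versions):
 *
 *     container->prereg_listener = vfio_prereg_listener;
 *     memory_listener_register(&container->prereg_listener,
 *                              &address_space_memory);
 */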

int vfio_spapr_create_window(VFIOContainer *container,
                             MemoryRegionSection *section,
                             hwaddr *pgsize)
{
    int ret = 0;
    IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
    uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask;
    unsigned entries, bits_total, bits_per_level, max_levels;
    struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
    long rampagesize = qemu_minrampagesize();

    /*
     * The host might not support the IOMMU page size that the guest
     * supports, so fall back to smaller physical IOMMU pages to back
     * the guest mappings.
     */
    if (pagesize > rampagesize) {
        pagesize = rampagesize;
    }
    pgmask = container->pgsizes & (pagesize | (pagesize - 1));
    pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0;
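    /*
     * Worked example (illustrative numbers): if the guest IOMMU requests
     * 16M pages but container->pgsizes only advertises 4K and 64K
     * (0x11000), then pagesize | (pagesize - 1) is 0x1ffffff, pgmask is
     * 0x11000 and the highest set bit selects 64K as the backing IOMMU
     * page size.
     */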
    if (!pagesize) {
        error_report("Host doesn't support page size 0x%"PRIx64
                     ", the supported mask is 0x%lx",
                     memory_region_iommu_get_min_page_size(iommu_mr),
                     container->pgsizes);
        return -EINVAL;
    }

    /*
     * FIXME: For VFIO iommu types which have KVM acceleration to
     * avoid bouncing all map/unmaps through qemu this way, this
     * would be the right place to wire that up (tell the KVM
     * device emulation the VFIO iommu handles to use).
     */
    create.window_size = int128_get64(section->size);
    create.page_shift = ctz64(pagesize);
    /*
     * The SPAPR host supports multilevel TCE tables. We try to guess the
     * optimal number of levels and, if that fails (for example due to
     * host memory fragmentation), we increase the number of levels. The
     * DMA address structure is:
     * rrrrrrrr rxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx iiiiiiii
     * where:
     *   r = reserved (bits >= 55 are reserved in the existing hardware)
     *   i = IOMMU page offset (64K in this example)
     *   x = bits to index a TCE, which can be split into equal chunks to
     *       index within each level.
     * The aim is to split "x" across the smallest possible number of levels.
     */
    entries = create.window_size >> create.page_shift;
    /*
     * bits_total is the number of bits needed to index a byte of the TCE
     * table: the "x" bits above plus 3 for the 8-byte TCE entries.
     */
    bits_total = ctz64(entries * sizeof(uint64_t));
    /*
     * bits_per_level is a safe guess of how much we can allocate per level:
     * 8 is the current minimum for CONFIG_FORCE_MAX_ZONEORDER and MAX_ORDER
     * is usually bigger than that.
     * Below we look at qemu_real_host_page_size as TCEs are allocated from
     * system pages.
     */
    bits_per_level = ctz64(qemu_real_host_page_size()) + 8;
    create.levels = bits_total / bits_per_level;
    if (bits_total % bits_per_level) {
        ++create.levels;
    }
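    /*
     * Worked example (illustrative numbers): a 1TB window with 4K IOMMU
     * pages needs 2^28 TCEs, i.e. a 2^31 byte table, so bits_total is 31.
     * With 4K host pages bits_per_level is 12 + 8 = 20, so the loop below
     * first asks the host for a 2-level table and adds levels only if the
     * kernel cannot allocate that.
     */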
    max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
    for ( ; create.levels <= max_levels; ++create.levels) {
        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
        if (!ret) {
            break;
        }
    }
    if (ret) {
        error_report("Failed to create a window, ret = %d (%m)", ret);
        return -errno;
    }

    if (create.start_addr != section->offset_within_address_space) {
        vfio_spapr_remove_window(container, create.start_addr);

        error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64,
                     section->offset_within_address_space,
                     (uint64_t)create.start_addr);
        return -EINVAL;
    }
    trace_vfio_spapr_create_window(create.page_shift,
                                   create.levels,
                                   create.window_size,
                                   create.start_addr);
    *pgsize = pagesize;

    return 0;
}
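
/*
 * A rough sketch of how a caller might use this from the VFIO memory
 * listener's region_add path (error handling and the bookkeeping of the
 * created window are omitted and depend on the QEMU version):
 *
 *     hwaddr pgsize;
 *
 *     if (vfio_spapr_create_window(container, section, &pgsize)) {
 *         // fail the hotplug or abort machine init
 *     }
 *
 * The window is torn down again with vfio_spapr_remove_window() from the
 * matching region_del path.
 */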

int vfio_spapr_remove_window(VFIOContainer *container,
                             hwaddr offset_within_address_space)
{
    struct vfio_iommu_spapr_tce_remove remove = {
        .argsz = sizeof(remove),
        .start_addr = offset_within_address_space,
    };
    int ret;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
    if (ret) {
        error_report("Failed to remove window at %"PRIx64,
                     (uint64_t)remove.start_addr);
        return -errno;
    }

    trace_vfio_spapr_remove_window(offset_within_address_space);

    return 0;
}