qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git

pci.c (111149B)


      1 /*
      2  * vfio based device assignment support
      3  *
      4  * Copyright Red Hat, Inc. 2012
      5  *
      6  * Authors:
      7  *  Alex Williamson <alex.williamson@redhat.com>
      8  *
      9  * This work is licensed under the terms of the GNU GPL, version 2.  See
     10  * the COPYING file in the top-level directory.
     11  *
     12  * Based on qemu-kvm device-assignment:
     13  *  Adapted for KVM by Qumranet.
     14  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
     15  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
     16  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
     17  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
     18  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
     19  */
     20 
     21 #include "qemu/osdep.h"
     22 #include <linux/vfio.h>
     23 #include <sys/ioctl.h>
     24 
     25 #include "hw/hw.h"
     26 #include "hw/pci/msi.h"
     27 #include "hw/pci/msix.h"
     28 #include "hw/pci/pci_bridge.h"
     29 #include "hw/qdev-properties.h"
     30 #include "hw/qdev-properties-system.h"
     31 #include "migration/vmstate.h"
     32 #include "qapi/qmp/qdict.h"
     33 #include "qemu/error-report.h"
     34 #include "qemu/main-loop.h"
     35 #include "qemu/module.h"
     36 #include "qemu/range.h"
     37 #include "qemu/units.h"
     38 #include "sysemu/kvm.h"
     39 #include "sysemu/runstate.h"
     40 #include "pci.h"
     41 #include "trace.h"
     42 #include "qapi/error.h"
     43 #include "migration/blocker.h"
     44 #include "migration/qemu-file.h"
     45 
     46 #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"
     47 
     48 /* Protected by BQL */
     49 static KVMRouteChange vfio_route_change;
     50 
     51 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
     52 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
     53 static void vfio_msi_disable_common(VFIOPCIDevice *vdev);
     54 
     55 /*
      56  * Disabling BAR mmapping can be slow, but toggling it around INTx can
     57  * also be a huge overhead.  We try to get the best of both worlds by
     58  * waiting until an interrupt to disable mmaps (subsequent transitions
     59  * to the same state are effectively no overhead).  If the interrupt has
     60  * been serviced and the time gap is long enough, we re-enable mmaps for
     61  * performance.  This works well for things like graphics cards, which
     62  * may not use their interrupt at all and are penalized to an unusable
     63  * level by read/write BAR traps.  Other devices, like NICs, have more
     64  * regular interrupts and see much better latency by staying in non-mmap
     65  * mode.  We therefore set the default mmap_timeout such that a ping
     66  * is just enough to keep the mmap disabled.  Users can experiment with
     67  * other options with the x-intx-mmap-timeout-ms parameter (a value of
     68  * zero disables the timer).
     69  */
     70 static void vfio_intx_mmap_enable(void *opaque)
     71 {
     72     VFIOPCIDevice *vdev = opaque;
     73 
     74     if (vdev->intx.pending) {
     75         timer_mod(vdev->intx.mmap_timer,
     76                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
     77         return;
     78     }
     79 
     80     vfio_mmap_set_enabled(vdev, true);
     81 }
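/*
 * A minimal usage sketch of the knob described above (the host address
 * 0000:01:00.0 is hypothetical):
 *
 *   -device vfio-pci,host=0000:01:00.0,x-intx-mmap-timeout-ms=0
 *
 * A value of zero disables the re-enable timer, so BAR accesses stay
 * trapped once an INTx interrupt has fired, until INTx is torn down.
 */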
     82 
     83 static void vfio_intx_interrupt(void *opaque)
     84 {
     85     VFIOPCIDevice *vdev = opaque;
     86 
     87     if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
     88         return;
     89     }
     90 
     91     trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin);
     92 
     93     vdev->intx.pending = true;
     94     pci_irq_assert(&vdev->pdev);
     95     vfio_mmap_set_enabled(vdev, false);
     96     if (vdev->intx.mmap_timeout) {
     97         timer_mod(vdev->intx.mmap_timer,
     98                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
     99     }
    100 }
    101 
    102 static void vfio_intx_eoi(VFIODevice *vbasedev)
    103 {
    104     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
    105 
    106     if (!vdev->intx.pending) {
    107         return;
    108     }
    109 
    110     trace_vfio_intx_eoi(vbasedev->name);
    111 
    112     vdev->intx.pending = false;
    113     pci_irq_deassert(&vdev->pdev);
    114     vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    115 }
    116 
    117 static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
    118 {
    119 #ifdef CONFIG_KVM
    120     int irq_fd = event_notifier_get_fd(&vdev->intx.interrupt);
    121 
    122     if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
    123         vdev->intx.route.mode != PCI_INTX_ENABLED ||
    124         !kvm_resamplefds_enabled()) {
    125         return;
    126     }
    127 
    128     /* Get to a known interrupt state */
    129     qemu_set_fd_handler(irq_fd, NULL, NULL, vdev);
    130     vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    131     vdev->intx.pending = false;
    132     pci_irq_deassert(&vdev->pdev);
    133 
    134     /* Get an eventfd for resample/unmask */
    135     if (event_notifier_init(&vdev->intx.unmask, 0)) {
    136         error_setg(errp, "event_notifier_init failed eoi");
    137         goto fail;
    138     }
    139 
    140     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
    141                                            &vdev->intx.interrupt,
    142                                            &vdev->intx.unmask,
    143                                            vdev->intx.route.irq)) {
    144         error_setg_errno(errp, errno, "failed to setup resample irqfd");
    145         goto fail_irqfd;
    146     }
    147 
    148     if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
    149                                VFIO_IRQ_SET_ACTION_UNMASK,
    150                                event_notifier_get_fd(&vdev->intx.unmask),
    151                                errp)) {
    152         goto fail_vfio;
    153     }
    154 
    155     /* Let'em rip */
    156     vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    157 
    158     vdev->intx.kvm_accel = true;
    159 
    160     trace_vfio_intx_enable_kvm(vdev->vbasedev.name);
    161 
    162     return;
    163 
    164 fail_vfio:
    165     kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
    166                                           vdev->intx.route.irq);
    167 fail_irqfd:
    168     event_notifier_cleanup(&vdev->intx.unmask);
    169 fail:
    170     qemu_set_fd_handler(irq_fd, vfio_intx_interrupt, NULL, vdev);
    171     vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    172 #endif
    173 }
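/*
 * A rough sketch of the two INTx delivery modes set up above:
 *
 *   QEMU mode: device INTx --> intx.interrupt eventfd --> fd handler -->
 *              vfio_intx_interrupt() --> pci_irq_assert(), with the EOI
 *              handled by vfio_intx_eoi() unmasking the device.
 *
 *   KVM mode:  device INTx --> intx.interrupt eventfd --> KVM irqfd -->
 *              guest, with the guest EOI resampled through the
 *              intx.unmask eventfd, which VFIO consumes to unmask the
 *              device.
 */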
    174 
    175 static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
    176 {
    177 #ifdef CONFIG_KVM
    178     if (!vdev->intx.kvm_accel) {
    179         return;
    180     }
    181 
    182     /*
    183      * Get to a known state, hardware masked, QEMU ready to accept new
    184      * interrupts, QEMU IRQ de-asserted.
    185      */
    186     vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    187     vdev->intx.pending = false;
    188     pci_irq_deassert(&vdev->pdev);
    189 
    190     /* Tell KVM to stop listening for an INTx irqfd */
    191     if (kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
    192                                               vdev->intx.route.irq)) {
    193         error_report("vfio: Error: Failed to disable INTx irqfd: %m");
    194     }
    195 
     196     /* We only need to close the eventfd for VFIO to clean up the kernel side */
    197     event_notifier_cleanup(&vdev->intx.unmask);
    198 
    199     /* QEMU starts listening for interrupt events. */
    200     qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt),
    201                         vfio_intx_interrupt, NULL, vdev);
    202 
    203     vdev->intx.kvm_accel = false;
    204 
    205     /* If we've missed an event, let it re-fire through QEMU */
    206     vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    207 
    208     trace_vfio_intx_disable_kvm(vdev->vbasedev.name);
    209 #endif
    210 }
    211 
    212 static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route)
    213 {
    214     Error *err = NULL;
    215 
    216     trace_vfio_intx_update(vdev->vbasedev.name,
    217                            vdev->intx.route.irq, route->irq);
    218 
    219     vfio_intx_disable_kvm(vdev);
    220 
    221     vdev->intx.route = *route;
    222 
    223     if (route->mode != PCI_INTX_ENABLED) {
    224         return;
    225     }
    226 
    227     vfio_intx_enable_kvm(vdev, &err);
    228     if (err) {
    229         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    230     }
    231 
     232     /* Re-enable the interrupt in case we missed an EOI */
    233     vfio_intx_eoi(&vdev->vbasedev);
    234 }
    235 
    236 static void vfio_intx_routing_notifier(PCIDevice *pdev)
    237 {
    238     VFIOPCIDevice *vdev = VFIO_PCI(pdev);
    239     PCIINTxRoute route;
    240 
    241     if (vdev->interrupt != VFIO_INT_INTx) {
    242         return;
    243     }
    244 
    245     route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
    246 
    247     if (pci_intx_route_changed(&vdev->intx.route, &route)) {
    248         vfio_intx_update(vdev, &route);
    249     }
    250 }
    251 
    252 static void vfio_irqchip_change(Notifier *notify, void *data)
    253 {
    254     VFIOPCIDevice *vdev = container_of(notify, VFIOPCIDevice,
    255                                        irqchip_change_notifier);
    256 
    257     vfio_intx_update(vdev, &vdev->intx.route);
    258 }
    259 
    260 static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
    261 {
    262     uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
    263     Error *err = NULL;
    264     int32_t fd;
    265     int ret;
    266 
    267 
    268     if (!pin) {
    269         return 0;
    270     }
    271 
    272     vfio_disable_interrupts(vdev);
    273 
    274     vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
    275     pci_config_set_interrupt_pin(vdev->pdev.config, pin);
    276 
    277 #ifdef CONFIG_KVM
    278     /*
     279      * This is conditional only to avoid generating error messages on
     280      * platforms where we won't actually use the result anyway.
    281      */
    282     if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
    283         vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
    284                                                         vdev->intx.pin);
    285     }
    286 #endif
    287 
    288     ret = event_notifier_init(&vdev->intx.interrupt, 0);
    289     if (ret) {
    290         error_setg_errno(errp, -ret, "event_notifier_init failed");
    291         return ret;
    292     }
    293     fd = event_notifier_get_fd(&vdev->intx.interrupt);
    294     qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
    295 
    296     if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
    297                                VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
    298         qemu_set_fd_handler(fd, NULL, NULL, vdev);
    299         event_notifier_cleanup(&vdev->intx.interrupt);
    300         return -errno;
    301     }
    302 
    303     vfio_intx_enable_kvm(vdev, &err);
    304     if (err) {
    305         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    306     }
    307 
    308     vdev->interrupt = VFIO_INT_INTx;
    309 
    310     trace_vfio_intx_enable(vdev->vbasedev.name);
    311     return 0;
    312 }
    313 
    314 static void vfio_intx_disable(VFIOPCIDevice *vdev)
    315 {
    316     int fd;
    317 
    318     timer_del(vdev->intx.mmap_timer);
    319     vfio_intx_disable_kvm(vdev);
    320     vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    321     vdev->intx.pending = false;
    322     pci_irq_deassert(&vdev->pdev);
    323     vfio_mmap_set_enabled(vdev, true);
    324 
    325     fd = event_notifier_get_fd(&vdev->intx.interrupt);
    326     qemu_set_fd_handler(fd, NULL, NULL, vdev);
    327     event_notifier_cleanup(&vdev->intx.interrupt);
    328 
    329     vdev->interrupt = VFIO_INT_NONE;
    330 
    331     trace_vfio_intx_disable(vdev->vbasedev.name);
    332 }
    333 
    334 /*
    335  * MSI/X
    336  */
    337 static void vfio_msi_interrupt(void *opaque)
    338 {
    339     VFIOMSIVector *vector = opaque;
    340     VFIOPCIDevice *vdev = vector->vdev;
    341     MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector);
    342     void (*notify)(PCIDevice *dev, unsigned vector);
    343     MSIMessage msg;
    344     int nr = vector - vdev->msi_vectors;
    345 
    346     if (!event_notifier_test_and_clear(&vector->interrupt)) {
    347         return;
    348     }
    349 
    350     if (vdev->interrupt == VFIO_INT_MSIX) {
    351         get_msg = msix_get_message;
    352         notify = msix_notify;
    353 
     354         /* A masked vector firing needs to use the PBA, so enable it */
    355         if (msix_is_masked(&vdev->pdev, nr)) {
    356             set_bit(nr, vdev->msix->pending);
    357             memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true);
    358             trace_vfio_msix_pba_enable(vdev->vbasedev.name);
    359         }
    360     } else if (vdev->interrupt == VFIO_INT_MSI) {
    361         get_msg = msi_get_message;
    362         notify = msi_notify;
    363     } else {
    364         abort();
    365     }
    366 
    367     msg = get_msg(&vdev->pdev, nr);
    368     trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data);
    369     notify(&vdev->pdev, nr);
    370 }
    371 
    372 static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
    373 {
    374     struct vfio_irq_set *irq_set;
    375     int ret = 0, i, argsz;
    376     int32_t *fds;
    377 
    378     argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
    379 
    380     irq_set = g_malloc0(argsz);
    381     irq_set->argsz = argsz;
    382     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    383     irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
    384     irq_set->start = 0;
    385     irq_set->count = vdev->nr_vectors;
    386     fds = (int32_t *)&irq_set->data;
    387 
    388     for (i = 0; i < vdev->nr_vectors; i++) {
    389         int fd = -1;
    390 
    391         /*
    392          * MSI vs MSI-X - The guest has direct access to MSI mask and pending
     393  * bits, therefore we always use the KVM signaling path when setting them up.
    394          * MSI-X mask and pending bits are emulated, so we want to use the
    395          * KVM signaling path only when configured and unmasked.
    396          */
    397         if (vdev->msi_vectors[i].use) {
    398             if (vdev->msi_vectors[i].virq < 0 ||
    399                 (msix && msix_is_masked(&vdev->pdev, i))) {
    400                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
    401             } else {
    402                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
    403             }
    404         }
    405 
    406         fds[i] = fd;
    407     }
    408 
    409     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
    410 
    411     g_free(irq_set);
    412 
    413     return ret;
    414 }
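/*
 * As an illustrative sketch, for a hypothetical device using two MSI
 * vectors through the QEMU eventfd path, the buffer handed to
 * VFIO_DEVICE_SET_IRQS above would be built like:
 *
 *   int32_t fds[2] = { fd0, fd1 };              // hypothetical eventfds
 *   int argsz = sizeof(struct vfio_irq_set) + sizeof(fds);
 *   struct vfio_irq_set *s = g_malloc0(argsz);
 *
 *   s->argsz = argsz;
 *   s->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 *   s->index = VFIO_PCI_MSI_IRQ_INDEX;
 *   s->start = 0;
 *   s->count = 2;
 *   memcpy(&s->data, fds, sizeof(fds));
 *   ioctl(device_fd, VFIO_DEVICE_SET_IRQS, s);   // device_fd: vbasedev.fd
 *
 * A -1 in the fd array de-assigns the trigger for that sub-index, which
 * is how vectors not in use are skipped.
 */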
    415 
    416 static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
    417                                   int vector_n, bool msix)
    418 {
    419     if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) {
    420         return;
    421     }
    422 
    423     vector->virq = kvm_irqchip_add_msi_route(&vfio_route_change,
    424                                              vector_n, &vdev->pdev);
    425 }
    426 
    427 static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector)
    428 {
    429     if (vector->virq < 0) {
    430         return;
    431     }
    432 
    433     if (event_notifier_init(&vector->kvm_interrupt, 0)) {
    434         goto fail_notifier;
    435     }
    436 
    437     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
    438                                            NULL, vector->virq) < 0) {
    439         goto fail_kvm;
    440     }
    441 
    442     return;
    443 
    444 fail_kvm:
    445     event_notifier_cleanup(&vector->kvm_interrupt);
    446 fail_notifier:
    447     kvm_irqchip_release_virq(kvm_state, vector->virq);
    448     vector->virq = -1;
    449 }
    450 
    451 static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
    452 {
    453     kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
    454                                           vector->virq);
    455     kvm_irqchip_release_virq(kvm_state, vector->virq);
    456     vector->virq = -1;
    457     event_notifier_cleanup(&vector->kvm_interrupt);
    458 }
    459 
    460 static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg,
    461                                      PCIDevice *pdev)
    462 {
    463     kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg, pdev);
    464     kvm_irqchip_commit_routes(kvm_state);
    465 }
    466 
    467 static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
    468                                    MSIMessage *msg, IOHandler *handler)
    469 {
    470     VFIOPCIDevice *vdev = VFIO_PCI(pdev);
    471     VFIOMSIVector *vector;
    472     int ret;
    473 
    474     trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
    475 
    476     vector = &vdev->msi_vectors[nr];
    477 
    478     if (!vector->use) {
    479         vector->vdev = vdev;
    480         vector->virq = -1;
    481         if (event_notifier_init(&vector->interrupt, 0)) {
    482             error_report("vfio: Error: event_notifier_init failed");
    483         }
    484         vector->use = true;
    485         msix_vector_use(pdev, nr);
    486     }
    487 
    488     qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
    489                         handler, NULL, vector);
    490 
    491     /*
    492      * Attempt to enable route through KVM irqchip,
    493      * default to userspace handling if unavailable.
    494      */
    495     if (vector->virq >= 0) {
    496         if (!msg) {
    497             vfio_remove_kvm_msi_virq(vector);
    498         } else {
    499             vfio_update_kvm_msi_virq(vector, *msg, pdev);
    500         }
    501     } else {
    502         if (msg) {
    503             if (vdev->defer_kvm_irq_routing) {
    504                 vfio_add_kvm_msi_virq(vdev, vector, nr, true);
    505             } else {
    506                 vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state);
    507                 vfio_add_kvm_msi_virq(vdev, vector, nr, true);
    508                 kvm_irqchip_commit_route_changes(&vfio_route_change);
    509                 vfio_connect_kvm_msi_virq(vector);
    510             }
    511         }
    512     }
    513 
    514     /*
    515      * We don't want to have the host allocate all possible MSI vectors
     516      * for a device if they're not in use, so we shut down and incrementally
    517      * increase them as needed.
    518      */
    519     if (vdev->nr_vectors < nr + 1) {
    520         vdev->nr_vectors = nr + 1;
    521         if (!vdev->defer_kvm_irq_routing) {
    522             vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
    523             ret = vfio_enable_vectors(vdev, true);
    524             if (ret) {
    525                 error_report("vfio: failed to enable vectors, %d", ret);
    526             }
    527         }
    528     } else {
    529         Error *err = NULL;
    530         int32_t fd;
    531 
    532         if (vector->virq >= 0) {
    533             fd = event_notifier_get_fd(&vector->kvm_interrupt);
    534         } else {
    535             fd = event_notifier_get_fd(&vector->interrupt);
    536         }
    537 
    538         if (vfio_set_irq_signaling(&vdev->vbasedev,
    539                                      VFIO_PCI_MSIX_IRQ_INDEX, nr,
    540                                      VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
    541             error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    542         }
    543     }
    544 
    545     /* Disable PBA emulation when nothing more is pending. */
    546     clear_bit(nr, vdev->msix->pending);
    547     if (find_first_bit(vdev->msix->pending,
    548                        vdev->nr_vectors) == vdev->nr_vectors) {
    549         memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
    550         trace_vfio_msix_pba_disable(vdev->vbasedev.name);
    551     }
    552 
    553     return 0;
    554 }
    555 
    556 static int vfio_msix_vector_use(PCIDevice *pdev,
    557                                 unsigned int nr, MSIMessage msg)
    558 {
    559     return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
    560 }
    561 
    562 static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
    563 {
    564     VFIOPCIDevice *vdev = VFIO_PCI(pdev);
    565     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
    566 
    567     trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
    568 
    569     /*
    570      * There are still old guests that mask and unmask vectors on every
    571      * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
    572      * the KVM setup in place, simply switch VFIO to use the non-bypass
    573      * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
    574      * core will mask the interrupt and set pending bits, allowing it to
    575      * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
    576      */
    577     if (vector->virq >= 0) {
    578         int32_t fd = event_notifier_get_fd(&vector->interrupt);
    579         Error *err = NULL;
    580 
    581         if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr,
    582                                    VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
    583             error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    584         }
    585     }
    586 }
    587 
    588 static void vfio_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
    589 {
    590     assert(!vdev->defer_kvm_irq_routing);
    591     vdev->defer_kvm_irq_routing = true;
    592     vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state);
    593 }
    594 
    595 static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
    596 {
    597     int i;
    598 
    599     assert(vdev->defer_kvm_irq_routing);
    600     vdev->defer_kvm_irq_routing = false;
    601 
    602     kvm_irqchip_commit_route_changes(&vfio_route_change);
    603 
    604     for (i = 0; i < vdev->nr_vectors; i++) {
    605         vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i]);
    606     }
    607 }
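/*
 * The intended calling pattern, as used by vfio_msix_enable() and
 * vfio_msi_enable() below:
 *
 *   vfio_prepare_kvm_msi_virq_batch(vdev);
 *   ... add per-vector routes (directly or via vector-use callbacks) ...
 *   vfio_commit_kvm_msi_virq_batch(vdev);  // single route commit + connect
 */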
    608 
    609 static void vfio_msix_enable(VFIOPCIDevice *vdev)
    610 {
    611     vfio_disable_interrupts(vdev);
    612 
    613     vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries);
    614 
    615     vdev->interrupt = VFIO_INT_MSIX;
    616 
    617     /*
    618      * Setting vector notifiers triggers synchronous vector-use
     619      * callbacks for each active vector.  Deferring the KVM route commit
     620      * so that it happens once rather than per vector provides a substantial
    621      * performance improvement.
    622      */
    623     vfio_prepare_kvm_msi_virq_batch(vdev);
    624 
    625     if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
    626                                   vfio_msix_vector_release, NULL)) {
    627         error_report("vfio: msix_set_vector_notifiers failed");
    628     }
    629 
    630     vfio_commit_kvm_msi_virq_batch(vdev);
    631 
    632     if (vdev->nr_vectors) {
    633         int ret;
    634 
    635         ret = vfio_enable_vectors(vdev, true);
    636         if (ret) {
    637             error_report("vfio: failed to enable vectors, %d", ret);
    638         }
    639     } else {
    640         /*
    641          * Some communication channels between VF & PF or PF & fw rely on the
    642          * physical state of the device and expect that enabling MSI-X from the
    643          * guest enables the same on the host.  When our guest is Linux, the
    644          * guest driver call to pci_enable_msix() sets the enabling bit in the
    645          * MSI-X capability, but leaves the vector table masked.  We therefore
    646          * can't rely on a vector_use callback (from request_irq() in the guest)
    647          * to switch the physical device into MSI-X mode because that may come a
    648          * long time after pci_enable_msix().  This code enables vector 0 with
    649          * triggering to userspace, then immediately release the vector, leaving
    650          * the physical device with no vectors enabled, but MSI-X enabled, just
    651          * like the guest view.
    652          */
    653         vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
    654         vfio_msix_vector_release(&vdev->pdev, 0);
    655     }
    656 
    657     trace_vfio_msix_enable(vdev->vbasedev.name);
    658 }
    659 
    660 static void vfio_msi_enable(VFIOPCIDevice *vdev)
    661 {
    662     int ret, i;
    663 
    664     vfio_disable_interrupts(vdev);
    665 
    666     /*
     667      * Each vector being set up needs a KVM route to be added.  Deferring
     668      * the KVM route commit so that it happens once rather than per vector
    669      * provides a substantial performance improvement.
    670      */
    671     vfio_prepare_kvm_msi_virq_batch(vdev);
    672 
    673     vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
    674 retry:
    675     vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);
    676 
    677     for (i = 0; i < vdev->nr_vectors; i++) {
    678         VFIOMSIVector *vector = &vdev->msi_vectors[i];
    679 
    680         vector->vdev = vdev;
    681         vector->virq = -1;
    682         vector->use = true;
    683 
    684         if (event_notifier_init(&vector->interrupt, 0)) {
    685             error_report("vfio: Error: event_notifier_init failed");
    686         }
    687 
    688         qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
    689                             vfio_msi_interrupt, NULL, vector);
    690 
    691         /*
    692          * Attempt to enable route through KVM irqchip,
    693          * default to userspace handling if unavailable.
    694          */
    695         vfio_add_kvm_msi_virq(vdev, vector, i, false);
    696     }
    697 
    698     vfio_commit_kvm_msi_virq_batch(vdev);
    699 
    700     /* Set interrupt type prior to possible interrupts */
    701     vdev->interrupt = VFIO_INT_MSI;
    702 
    703     ret = vfio_enable_vectors(vdev, false);
    704     if (ret) {
    705         if (ret < 0) {
    706             error_report("vfio: Error: Failed to setup MSI fds: %m");
    707         } else {
    708             error_report("vfio: Error: Failed to enable %d "
    709                          "MSI vectors, retry with %d", vdev->nr_vectors, ret);
    710         }
    711 
    712         vfio_msi_disable_common(vdev);
    713 
    714         if (ret > 0) {
    715             vdev->nr_vectors = ret;
    716             goto retry;
    717         }
    718 
    719         /*
     720      * Failing to set up MSI doesn't really fall within any specification.
    721          * Let's try leaving interrupts disabled and hope the guest figures
    722          * out to fall back to INTx for this device.
    723          */
    724         error_report("vfio: Error: Failed to enable MSI");
    725 
    726         return;
    727     }
    728 
    729     trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
    730 }
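/*
 * Note the retry path above: when the host cannot back the full set of
 * vectors, VFIO_DEVICE_SET_IRQS reports the number it can support (e.g.
 * 4 when the guest programmed 8), and we tear everything down and retry
 * with that smaller count.
 */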
    731 
    732 static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
    733 {
    734     int i;
    735 
    736     for (i = 0; i < vdev->nr_vectors; i++) {
    737         VFIOMSIVector *vector = &vdev->msi_vectors[i];
    738         if (vdev->msi_vectors[i].use) {
    739             if (vector->virq >= 0) {
    740                 vfio_remove_kvm_msi_virq(vector);
    741             }
    742             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
    743                                 NULL, NULL, NULL);
    744             event_notifier_cleanup(&vector->interrupt);
    745         }
    746     }
    747 
    748     g_free(vdev->msi_vectors);
    749     vdev->msi_vectors = NULL;
    750     vdev->nr_vectors = 0;
    751     vdev->interrupt = VFIO_INT_NONE;
    752 }
    753 
    754 static void vfio_msix_disable(VFIOPCIDevice *vdev)
    755 {
    756     Error *err = NULL;
    757     int i;
    758 
    759     msix_unset_vector_notifiers(&vdev->pdev);
    760 
    761     /*
    762      * MSI-X will only release vectors if MSI-X is still enabled on the
     763      * device, so check through the rest and release them ourselves if necessary.
    764      */
    765     for (i = 0; i < vdev->nr_vectors; i++) {
    766         if (vdev->msi_vectors[i].use) {
    767             vfio_msix_vector_release(&vdev->pdev, i);
    768             msix_vector_unuse(&vdev->pdev, i);
    769         }
    770     }
    771 
    772     if (vdev->nr_vectors) {
    773         vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
    774     }
    775 
    776     vfio_msi_disable_common(vdev);
    777     vfio_intx_enable(vdev, &err);
    778     if (err) {
    779         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    780     }
    781 
    782     memset(vdev->msix->pending, 0,
    783            BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));
    784 
    785     trace_vfio_msix_disable(vdev->vbasedev.name);
    786 }
    787 
    788 static void vfio_msi_disable(VFIOPCIDevice *vdev)
    789 {
    790     Error *err = NULL;
    791 
    792     vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
    793     vfio_msi_disable_common(vdev);
    794     vfio_intx_enable(vdev, &err);
    795     if (err) {
    796         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    797     }
    798 
    799     trace_vfio_msi_disable(vdev->vbasedev.name);
    800 }
    801 
    802 static void vfio_update_msi(VFIOPCIDevice *vdev)
    803 {
    804     int i;
    805 
    806     for (i = 0; i < vdev->nr_vectors; i++) {
    807         VFIOMSIVector *vector = &vdev->msi_vectors[i];
    808         MSIMessage msg;
    809 
    810         if (!vector->use || vector->virq < 0) {
    811             continue;
    812         }
    813 
    814         msg = msi_get_message(&vdev->pdev, i);
    815         vfio_update_kvm_msi_virq(vector, msg, &vdev->pdev);
    816     }
    817 }
    818 
    819 static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
    820 {
    821     struct vfio_region_info *reg_info;
    822     uint64_t size;
    823     off_t off = 0;
    824     ssize_t bytes;
    825 
    826     if (vfio_get_region_info(&vdev->vbasedev,
    827                              VFIO_PCI_ROM_REGION_INDEX, &reg_info)) {
    828         error_report("vfio: Error getting ROM info: %m");
    829         return;
    830     }
    831 
    832     trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info->size,
    833                             (unsigned long)reg_info->offset,
    834                             (unsigned long)reg_info->flags);
    835 
    836     vdev->rom_size = size = reg_info->size;
    837     vdev->rom_offset = reg_info->offset;
    838 
    839     g_free(reg_info);
    840 
    841     if (!vdev->rom_size) {
    842         vdev->rom_read_failed = true;
    843         error_report("vfio-pci: Cannot read device rom at "
    844                     "%s", vdev->vbasedev.name);
    845         error_printf("Device option ROM contents are probably invalid "
    846                     "(check dmesg).\nSkip option ROM probe with rombar=0, "
    847                     "or load from file with romfile=\n");
    848         return;
    849     }
    850 
    851     vdev->rom = g_malloc(size);
    852     memset(vdev->rom, 0xff, size);
    853 
    854     while (size) {
    855         bytes = pread(vdev->vbasedev.fd, vdev->rom + off,
    856                       size, vdev->rom_offset + off);
    857         if (bytes == 0) {
    858             break;
    859         } else if (bytes > 0) {
    860             off += bytes;
    861             size -= bytes;
    862         } else {
    863             if (errno == EINTR || errno == EAGAIN) {
    864                 continue;
    865             }
    866             error_report("vfio: Error reading device ROM: %m");
    867             break;
    868         }
    869     }
    870 
    871     /*
     872  * Test the ROM signature against our device: if the vendor ID is correct
    873      * but the device ID doesn't match, store the correct device ID and
    874      * recompute the checksum.  Intel IGD devices need this and are known
    875      * to have bogus checksums so we can't simply adjust the checksum.
    876      */
    877     if (pci_get_word(vdev->rom) == 0xaa55 &&
    878         pci_get_word(vdev->rom + 0x18) + 8 < vdev->rom_size &&
    879         !memcmp(vdev->rom + pci_get_word(vdev->rom + 0x18), "PCIR", 4)) {
    880         uint16_t vid, did;
    881 
    882         vid = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 4);
    883         did = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6);
    884 
    885         if (vid == vdev->vendor_id && did != vdev->device_id) {
    886             int i;
    887             uint8_t csum, *data = vdev->rom;
    888 
    889             pci_set_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6,
    890                          vdev->device_id);
    891             data[6] = 0;
    892 
    893             for (csum = 0, i = 0; i < vdev->rom_size; i++) {
    894                 csum += data[i];
    895             }
    896 
    897             data[6] = -csum;
    898         }
    899     }
    900 }
    901 
    902 static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
    903 {
    904     VFIOPCIDevice *vdev = opaque;
    905     union {
    906         uint8_t byte;
    907         uint16_t word;
    908         uint32_t dword;
    909         uint64_t qword;
     910     } val = { .qword = 0 };    /* zero-fill so out-of-range reads return zeros */
    911     uint64_t data = 0;
    912 
    913     /* Load the ROM lazily when the guest tries to read it */
    914     if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
    915         vfio_pci_load_rom(vdev);
    916     }
    917 
    918     memcpy(&val, vdev->rom + addr,
    919            (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
    920 
    921     switch (size) {
    922     case 1:
    923         data = val.byte;
    924         break;
    925     case 2:
    926         data = le16_to_cpu(val.word);
    927         break;
    928     case 4:
    929         data = le32_to_cpu(val.dword);
    930         break;
    931     default:
    932         hw_error("vfio: unsupported read size, %d bytes\n", size);
    933         break;
    934     }
    935 
    936     trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data);
    937 
    938     return data;
    939 }
    940 
    941 static void vfio_rom_write(void *opaque, hwaddr addr,
    942                            uint64_t data, unsigned size)
    943 {
    944 }
    945 
    946 static const MemoryRegionOps vfio_rom_ops = {
    947     .read = vfio_rom_read,
    948     .write = vfio_rom_write,
    949     .endianness = DEVICE_LITTLE_ENDIAN,
    950 };
    951 
    952 static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
    953 {
    954     uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
    955     off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
    956     DeviceState *dev = DEVICE(vdev);
    957     char *name;
    958     int fd = vdev->vbasedev.fd;
    959 
    960     if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
    961         /* Since pci handles romfile, just print a message and return */
    962         if (vfio_opt_rom_in_denylist(vdev) && vdev->pdev.romfile) {
    963             warn_report("Device at %s is known to cause system instability"
    964                         " issues during option rom execution",
    965                         vdev->vbasedev.name);
    966             error_printf("Proceeding anyway since user specified romfile\n");
    967         }
    968         return;
    969     }
    970 
    971     /*
    972      * Use the same size ROM BAR as the physical device.  The contents
    973      * will get filled in later when the guest tries to read it.
    974      */
    975     if (pread(fd, &orig, 4, offset) != 4 ||
    976         pwrite(fd, &size, 4, offset) != 4 ||
    977         pread(fd, &size, 4, offset) != 4 ||
    978         pwrite(fd, &orig, 4, offset) != 4) {
    979         error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name);
    980         return;
    981     }
    982 
    983     size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
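    /*
     * Worked example: a read-back of 0xfffe0000 masks to 0xfffe0000, and
     * ~0xfffe0000 + 1 = 0x20000, i.e. a 128 KiB ROM BAR.
     */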
    984 
    985     if (!size) {
    986         return;
    987     }
    988 
    989     if (vfio_opt_rom_in_denylist(vdev)) {
    990         if (dev->opts && qdict_haskey(dev->opts, "rombar")) {
    991             warn_report("Device at %s is known to cause system instability"
    992                         " issues during option rom execution",
    993                         vdev->vbasedev.name);
    994             error_printf("Proceeding anyway since user specified"
     995                          " non-zero value for rombar\n");
    996         } else {
    997             warn_report("Rom loading for device at %s has been disabled"
    998                         " due to system instability issues",
    999                         vdev->vbasedev.name);
   1000             error_printf("Specify rombar=1 or romfile to force\n");
   1001             return;
   1002         }
   1003     }
   1004 
   1005     trace_vfio_pci_size_rom(vdev->vbasedev.name, size);
   1006 
   1007     name = g_strdup_printf("vfio[%s].rom", vdev->vbasedev.name);
   1008 
   1009     memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
   1010                           &vfio_rom_ops, vdev, name, size);
   1011     g_free(name);
   1012 
   1013     pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
   1014                      PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);
   1015 
   1016     vdev->rom_read_failed = false;
   1017 }
   1018 
   1019 void vfio_vga_write(void *opaque, hwaddr addr,
   1020                            uint64_t data, unsigned size)
   1021 {
   1022     VFIOVGARegion *region = opaque;
   1023     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
   1024     union {
   1025         uint8_t byte;
   1026         uint16_t word;
   1027         uint32_t dword;
   1028         uint64_t qword;
   1029     } buf;
   1030     off_t offset = vga->fd_offset + region->offset + addr;
   1031 
   1032     switch (size) {
   1033     case 1:
   1034         buf.byte = data;
   1035         break;
   1036     case 2:
   1037         buf.word = cpu_to_le16(data);
   1038         break;
   1039     case 4:
   1040         buf.dword = cpu_to_le32(data);
   1041         break;
   1042     default:
   1043         hw_error("vfio: unsupported write size, %d bytes", size);
   1044         break;
   1045     }
   1046 
   1047     if (pwrite(vga->fd, &buf, size, offset) != size) {
   1048         error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
   1049                      __func__, region->offset + addr, data, size);
   1050     }
   1051 
   1052     trace_vfio_vga_write(region->offset + addr, data, size);
   1053 }
   1054 
   1055 uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
   1056 {
   1057     VFIOVGARegion *region = opaque;
   1058     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
   1059     union {
   1060         uint8_t byte;
   1061         uint16_t word;
   1062         uint32_t dword;
   1063         uint64_t qword;
   1064     } buf;
   1065     uint64_t data = 0;
   1066     off_t offset = vga->fd_offset + region->offset + addr;
   1067 
   1068     if (pread(vga->fd, &buf, size, offset) != size) {
   1069         error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
   1070                      __func__, region->offset + addr, size);
   1071         return (uint64_t)-1;
   1072     }
   1073 
   1074     switch (size) {
   1075     case 1:
   1076         data = buf.byte;
   1077         break;
   1078     case 2:
   1079         data = le16_to_cpu(buf.word);
   1080         break;
   1081     case 4:
   1082         data = le32_to_cpu(buf.dword);
   1083         break;
   1084     default:
   1085         hw_error("vfio: unsupported read size, %d bytes", size);
   1086         break;
   1087     }
   1088 
   1089     trace_vfio_vga_read(region->offset + addr, size, data);
   1090 
   1091     return data;
   1092 }
   1093 
   1094 static const MemoryRegionOps vfio_vga_ops = {
   1095     .read = vfio_vga_read,
   1096     .write = vfio_vga_write,
   1097     .endianness = DEVICE_LITTLE_ENDIAN,
   1098 };
   1099 
    1100 /*
    1101  * Expand the memory region of a sub-page (size < PAGE_SIZE) MMIO BAR
    1102  * to page size if the BAR occupies an exclusive page on the host, so
    1103  * that we can map this BAR to the guest.  The sub-page BAR may not
    1104  * occupy an exclusive page in the guest, however, so the expanded
    1105  * memory region gets priority zero in case it overlaps BARs that
    1106  * share the same guest page as the sub-page BAR.  We must also
    1107  * restore the size of this sub-page BAR when its base address is
    1108  * changed in the guest and is no longer page aligned.
    1109  */
   1110 static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
   1111 {
   1112     VFIOPCIDevice *vdev = VFIO_PCI(pdev);
   1113     VFIORegion *region = &vdev->bars[bar].region;
   1114     MemoryRegion *mmap_mr, *region_mr, *base_mr;
   1115     PCIIORegion *r;
   1116     pcibus_t bar_addr;
   1117     uint64_t size = region->size;
   1118 
   1119     /* Make sure that the whole region is allowed to be mmapped */
   1120     if (region->nr_mmaps != 1 || !region->mmaps[0].mmap ||
   1121         region->mmaps[0].size != region->size) {
   1122         return;
   1123     }
   1124 
   1125     r = &pdev->io_regions[bar];
   1126     bar_addr = r->addr;
   1127     base_mr = vdev->bars[bar].mr;
   1128     region_mr = region->mem;
   1129     mmap_mr = &region->mmaps[0].mem;
   1130 
   1131     /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */
   1132     if (bar_addr != PCI_BAR_UNMAPPED &&
   1133         !(bar_addr & ~qemu_real_host_page_mask())) {
   1134         size = qemu_real_host_page_size();
   1135     }
   1136 
   1137     memory_region_transaction_begin();
   1138 
   1139     if (vdev->bars[bar].size < size) {
   1140         memory_region_set_size(base_mr, size);
   1141     }
   1142     memory_region_set_size(region_mr, size);
   1143     memory_region_set_size(mmap_mr, size);
   1144     if (size != vdev->bars[bar].size && memory_region_is_mapped(base_mr)) {
   1145         memory_region_del_subregion(r->address_space, base_mr);
   1146         memory_region_add_subregion_overlap(r->address_space,
   1147                                             bar_addr, base_mr, 0);
   1148     }
   1149 
   1150     memory_region_transaction_commit();
   1151 }
   1152 
   1153 /*
   1154  * PCI config space
   1155  */
   1156 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
   1157 {
   1158     VFIOPCIDevice *vdev = VFIO_PCI(pdev);
   1159     uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
   1160 
   1161     memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
   1162     emu_bits = le32_to_cpu(emu_bits);
   1163 
   1164     if (emu_bits) {
   1165         emu_val = pci_default_read_config(pdev, addr, len);
   1166     }
   1167 
   1168     if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
   1169         ssize_t ret;
   1170 
   1171         ret = pread(vdev->vbasedev.fd, &phys_val, len,
   1172                     vdev->config_offset + addr);
   1173         if (ret != len) {
   1174             error_report("%s(%s, 0x%x, 0x%x) failed: %m",
   1175                          __func__, vdev->vbasedev.name, addr, len);
   1176             return -errno;
   1177         }
   1178         phys_val = le32_to_cpu(phys_val);
   1179     }
   1180 
   1181     val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
   1182 
   1183     trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val);
   1184 
   1185     return val;
   1186 }
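/*
 * For example, a 2-byte read where emu_bits is 0x00ff merges the low
 * byte from QEMU's emulated config space with the high byte from the
 * physical device:
 *
 *   val = (emu_val & 0x00ff) | (phys_val & 0xff00);
 */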
   1187 
   1188 void vfio_pci_write_config(PCIDevice *pdev,
   1189                            uint32_t addr, uint32_t val, int len)
   1190 {
   1191     VFIOPCIDevice *vdev = VFIO_PCI(pdev);
   1192     uint32_t val_le = cpu_to_le32(val);
   1193 
   1194     trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
   1195 
   1196     /* Write everything to VFIO, let it filter out what we can't write */
   1197     if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr)
   1198                 != len) {
   1199         error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m",
   1200                      __func__, vdev->vbasedev.name, addr, val, len);
   1201     }
   1202 
   1203     /* MSI/MSI-X Enabling/Disabling */
   1204     if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
   1205         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
   1206         int is_enabled, was_enabled = msi_enabled(pdev);
   1207 
   1208         pci_default_write_config(pdev, addr, val, len);
   1209 
   1210         is_enabled = msi_enabled(pdev);
   1211 
   1212         if (!was_enabled) {
   1213             if (is_enabled) {
   1214                 vfio_msi_enable(vdev);
   1215             }
   1216         } else {
   1217             if (!is_enabled) {
   1218                 vfio_msi_disable(vdev);
   1219             } else {
   1220                 vfio_update_msi(vdev);
   1221             }
   1222         }
   1223     } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
   1224         ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
   1225         int is_enabled, was_enabled = msix_enabled(pdev);
   1226 
   1227         pci_default_write_config(pdev, addr, val, len);
   1228 
   1229         is_enabled = msix_enabled(pdev);
   1230 
   1231         if (!was_enabled && is_enabled) {
   1232             vfio_msix_enable(vdev);
   1233         } else if (was_enabled && !is_enabled) {
   1234             vfio_msix_disable(vdev);
   1235         }
   1236     } else if (ranges_overlap(addr, len, PCI_BASE_ADDRESS_0, 24) ||
   1237         range_covers_byte(addr, len, PCI_COMMAND)) {
   1238         pcibus_t old_addr[PCI_NUM_REGIONS - 1];
   1239         int bar;
   1240 
   1241         for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
   1242             old_addr[bar] = pdev->io_regions[bar].addr;
   1243         }
   1244 
   1245         pci_default_write_config(pdev, addr, val, len);
   1246 
   1247         for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
   1248             if (old_addr[bar] != pdev->io_regions[bar].addr &&
   1249                 vdev->bars[bar].region.size > 0 &&
   1250                 vdev->bars[bar].region.size < qemu_real_host_page_size()) {
   1251                 vfio_sub_page_bar_update_mapping(pdev, bar);
   1252             }
   1253         }
   1254     } else {
   1255         /* Write everything to QEMU to keep emulated bits correct */
   1256         pci_default_write_config(pdev, addr, val, len);
   1257     }
   1258 }
   1259 
   1260 /*
   1261  * Interrupt setup
   1262  */
   1263 static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
   1264 {
   1265     /*
   1266      * More complicated than it looks.  Disabling MSI/X transitions the
   1267      * device to INTx mode (if supported).  Therefore we need to first
    1268      * disable MSI/X and then clean up by disabling INTx.
   1269      */
   1270     if (vdev->interrupt == VFIO_INT_MSIX) {
   1271         vfio_msix_disable(vdev);
   1272     } else if (vdev->interrupt == VFIO_INT_MSI) {
   1273         vfio_msi_disable(vdev);
   1274     }
   1275 
   1276     if (vdev->interrupt == VFIO_INT_INTx) {
   1277         vfio_intx_disable(vdev);
   1278     }
   1279 }
   1280 
   1281 static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
   1282 {
   1283     uint16_t ctrl;
   1284     bool msi_64bit, msi_maskbit;
   1285     int ret, entries;
   1286     Error *err = NULL;
   1287 
   1288     if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
   1289               vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
   1290         error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
   1291         return -errno;
   1292     }
   1293     ctrl = le16_to_cpu(ctrl);
   1294 
   1295     msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
   1296     msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
   1297     entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
   1298 
   1299     trace_vfio_msi_setup(vdev->vbasedev.name, pos);
   1300 
   1301     ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit, &err);
   1302     if (ret < 0) {
   1303         if (ret == -ENOTSUP) {
   1304             return 0;
   1305         }
   1306         error_propagate_prepend(errp, err, "msi_init failed: ");
   1307         return ret;
   1308     }
   1309     vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
   1310 
   1311     return 0;
   1312 }
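/*
 * Sizing example: a function advertising 64-bit addressing and per-vector
 * masking yields msi_cap_size = 0xa + 0xa + 0x4 = 0x18 (24 bytes), the
 * largest MSI capability layout.
 */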
   1313 
   1314 static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
   1315 {
   1316     off_t start, end;
   1317     VFIORegion *region = &vdev->bars[vdev->msix->table_bar].region;
   1318 
   1319     /*
    1320      * If the host driver allows mapping of the MSI-X data, we are going
    1321      * to map the entire BAR and emulate the MSI-X table on top of it.
   1322      */
   1323     if (vfio_has_region_cap(&vdev->vbasedev, region->nr,
   1324                             VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) {
   1325         return;
   1326     }
   1327 
   1328     /*
    1329      * We expect to find a single mmap covering the whole BAR; anything else
    1330      * means it's either unsupported or already set up.
   1331      */
   1332     if (region->nr_mmaps != 1 || region->mmaps[0].offset ||
   1333         region->size != region->mmaps[0].size) {
   1334         return;
   1335     }
   1336 
   1337     /* MSI-X table start and end aligned to host page size */
   1338     start = vdev->msix->table_offset & qemu_real_host_page_mask();
   1339     end = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
   1340                                (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
   1341 
   1342     /*
   1343      * Does the MSI-X table cover the beginning of the BAR?  The whole BAR?
   1344      * NB - Host page size is necessarily a power of two and so is the PCI
   1345      * BAR (not counting EA yet), therefore if we have host page aligned
   1346      * @start and @end, then any remainder of the BAR before or after those
   1347      * must be at least host page sized and therefore mmap'able.
   1348      */
   1349     if (!start) {
   1350         if (end >= region->size) {
   1351             region->nr_mmaps = 0;
   1352             g_free(region->mmaps);
   1353             region->mmaps = NULL;
   1354             trace_vfio_msix_fixup(vdev->vbasedev.name,
   1355                                   vdev->msix->table_bar, 0, 0);
   1356         } else {
   1357             region->mmaps[0].offset = end;
   1358             region->mmaps[0].size = region->size - end;
   1359             trace_vfio_msix_fixup(vdev->vbasedev.name,
   1360                               vdev->msix->table_bar, region->mmaps[0].offset,
   1361                               region->mmaps[0].offset + region->mmaps[0].size);
   1362         }
   1363 
   1364     /* Maybe it's aligned at the end of the BAR */
   1365     } else if (end >= region->size) {
   1366         region->mmaps[0].size = start;
   1367         trace_vfio_msix_fixup(vdev->vbasedev.name,
   1368                               vdev->msix->table_bar, region->mmaps[0].offset,
   1369                               region->mmaps[0].offset + region->mmaps[0].size);
   1370 
   1371     /* Otherwise it must split the BAR */
   1372     } else {
   1373         region->nr_mmaps = 2;
   1374         region->mmaps = g_renew(VFIOMmap, region->mmaps, 2);
   1375 
   1376         memcpy(&region->mmaps[1], &region->mmaps[0], sizeof(VFIOMmap));
   1377 
   1378         region->mmaps[0].size = start;
   1379         trace_vfio_msix_fixup(vdev->vbasedev.name,
   1380                               vdev->msix->table_bar, region->mmaps[0].offset,
   1381                               region->mmaps[0].offset + region->mmaps[0].size);
   1382 
   1383         region->mmaps[1].offset = end;
   1384         region->mmaps[1].size = region->size - end;
   1385         trace_vfio_msix_fixup(vdev->vbasedev.name,
   1386                               vdev->msix->table_bar, region->mmaps[1].offset,
   1387                               region->mmaps[1].offset + region->mmaps[1].size);
   1388     }
   1389 }
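/*
 * Worked example with 4 KiB host pages: a 64 KiB BAR holding a 16-entry
 * table at offset 0x3000 gives start = 0x3000 and end = 0x4000, so the
 * BAR splits into mmaps [0x0, 0x3000) and [0x4000, 0x10000), leaving the
 * table page to trapped access.
 */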
   1390 
   1391 static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
   1392 {
   1393     int target_bar = -1;
   1394     size_t msix_sz;
   1395 
   1396     if (!vdev->msix || vdev->msix_relo == OFF_AUTOPCIBAR_OFF) {
   1397         return;
   1398     }
   1399 
   1400     /* The actual minimum size of MSI-X structures */
   1401     msix_sz = (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE) +
   1402               (QEMU_ALIGN_UP(vdev->msix->entries, 64) / 8);
    1403     /* Round up to host pages; we don't want to share a page */
   1404     msix_sz = REAL_HOST_PAGE_ALIGN(msix_sz);
   1405     /* PCI BARs must be a power of 2 */
   1406     msix_sz = pow2ceil(msix_sz);
   1407 
   1408     if (vdev->msix_relo == OFF_AUTOPCIBAR_AUTO) {
   1409         /*
   1410          * TODO: Lookup table for known devices.
   1411          *
   1412          * Logically we might use an algorithm here to select the BAR adding
   1413          * the least additional MMIO space, but we cannot programmatically
   1414          * predict the driver dependency on BAR ordering or sizing, therefore
   1415          * 'auto' becomes a lookup for combinations reported to work.
   1416          */
   1417         if (target_bar < 0) {
   1418             error_setg(errp, "No automatic MSI-X relocation available for "
   1419                        "device %04x:%04x", vdev->vendor_id, vdev->device_id);
   1420             return;
   1421         }
   1422     } else {
   1423         target_bar = (int)(vdev->msix_relo - OFF_AUTOPCIBAR_BAR0);
   1424     }
   1425 
   1426     /* I/O port BARs cannot host MSI-X structures */
   1427     if (vdev->bars[target_bar].ioport) {
   1428         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
   1429                    "I/O port BAR", target_bar);
   1430         return;
   1431     }
   1432 
   1433     /* Cannot use a BAR in the "shadow" of a 64-bit BAR */
   1434     if (!vdev->bars[target_bar].size &&
   1435          target_bar > 0 && vdev->bars[target_bar - 1].mem64) {
   1436         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
   1437                    "consumed by 64-bit BAR %d", target_bar, target_bar - 1);
   1438         return;
   1439     }
   1440 
   1441     /* 2GB max size for 32-bit BARs; cannot double if already > 1G */
   1442     if (vdev->bars[target_bar].size > 1 * GiB &&
   1443         !vdev->bars[target_bar].mem64) {
   1444         error_setg(errp, "Invalid MSI-X relocation BAR %d, "
   1445                    "no space to extend 32-bit BAR", target_bar);
   1446         return;
   1447     }
   1448 
   1449     /*
   1450      * If adding a new BAR, test if we can make it 64-bit.  We make it
   1451      * prefetchable since QEMU MSI-X emulation has no read side effects
   1452      * and doing so makes mapping more flexible.
   1453      */
   1454     if (!vdev->bars[target_bar].size) {
   1455         if (target_bar < (PCI_ROM_SLOT - 1) &&
   1456             !vdev->bars[target_bar + 1].size) {
   1457             vdev->bars[target_bar].mem64 = true;
   1458             vdev->bars[target_bar].type = PCI_BASE_ADDRESS_MEM_TYPE_64;
   1459         }
   1460         vdev->bars[target_bar].type |= PCI_BASE_ADDRESS_MEM_PREFETCH;
   1461         vdev->bars[target_bar].size = msix_sz;
   1462         vdev->msix->table_offset = 0;
   1463     } else {
   1464         vdev->bars[target_bar].size = MAX(vdev->bars[target_bar].size * 2,
   1465                                           msix_sz * 2);
   1466         /*
   1467          * Due to above size calc, MSI-X always starts halfway into the BAR,
   1468          * which will always be a separate host page.
   1469          */
   1470         vdev->msix->table_offset = vdev->bars[target_bar].size / 2;
   1471     }
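            /*
             * For illustration: relocating into an existing 8 KiB BAR with an
             * msix_sz of 4 KiB grows the BAR to MAX(16 KiB, 8 KiB) = 16 KiB
             * and places the table at offset 8 KiB, on its own host page.
             */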
   1472 
   1473     vdev->msix->table_bar = target_bar;
   1474     vdev->msix->pba_bar = target_bar;
   1475     /* Requires 8-byte alignment, but PCI_MSIX_ENTRY_SIZE guarantees that */
   1476     vdev->msix->pba_offset = vdev->msix->table_offset +
   1477                                   (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE);
   1478 
   1479     trace_vfio_msix_relo(vdev->vbasedev.name,
   1480                          vdev->msix->table_bar, vdev->msix->table_offset);
   1481 }
   1482 
   1483 /*
   1484  * We don't have any control over how pci_add_capability() inserts
   1485  * capabilities into the chain.  In order to set up MSI-X we need a
   1486  * MemoryRegion for the BAR.  In order to set up the BAR without
   1487  * attempting to mmap the MSI-X table area, which VFIO won't allow, we
   1488  * first need to find where the MSI-X table lives.  So we
   1489  * unfortunately split MSI-X setup across two functions.
   1490  */
   1491 static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
   1492 {
   1493     uint8_t pos;
   1494     uint16_t ctrl;
   1495     uint32_t table, pba;
   1496     int fd = vdev->vbasedev.fd;
   1497     VFIOMSIXInfo *msix;
   1498 
   1499     pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
   1500     if (!pos) {
   1501         return;
   1502     }
   1503 
   1504     if (pread(fd, &ctrl, sizeof(ctrl),
   1505               vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) {
   1506         error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS");
   1507         return;
   1508     }
   1509 
   1510     if (pread(fd, &table, sizeof(table),
   1511               vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
   1512         error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE");
   1513         return;
   1514     }
   1515 
   1516     if (pread(fd, &pba, sizeof(pba),
   1517               vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
   1518         error_setg_errno(errp, errno, "failed to read PCI MSIX PBA");
   1519         return;
   1520     }
   1521 
   1522     ctrl = le16_to_cpu(ctrl);
   1523     table = le32_to_cpu(table);
   1524     pba = le32_to_cpu(pba);
   1525 
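            /*
             * Decode the raw capability registers.  For illustration: a table
             * register value of 0x2003 selects BIR 3 (0x2003 & 0x7) with the
             * table at offset 0x2000 (0x2003 & ~0x7).
             */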
   1526     msix = g_malloc0(sizeof(*msix));
   1527     msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
   1528     msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
   1529     msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
   1530     msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
   1531     msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
   1532 
   1533     /*
   1534      * Check whether the PBA offset extends beyond the end of the specified
   1535      * BAR.  If it does, apply a hardware-specific quirk for known devices;
   1536      * otherwise we have a broken configuration.
   1537      */
   1538     if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) {
   1539         /*
   1540          * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5
   1541          * adapters. The T5 hardware returns an incorrect value of 0x8000 for
   1542          * the VF PBA offset while the BAR itself is only 8k. The correct value
   1543          * is 0x1000, so we hard code that here.
   1544          */
   1545         if (vdev->vendor_id == PCI_VENDOR_ID_CHELSIO &&
   1546             (vdev->device_id & 0xff00) == 0x5800) {
   1547             msix->pba_offset = 0x1000;
   1548         /*
   1549          * BAIDU KUNLUN Virtual Function devices for KUNLUN AI processor
   1550          * return an incorrect value of 0x460000 for the VF PBA offset while
   1551          * the BAR itself is only 0x10000.  The correct value is 0xb400.
   1552          */
   1553         } else if (vfio_pci_is(vdev, PCI_VENDOR_ID_BAIDU,
   1554                                PCI_DEVICE_ID_KUNLUN_VF)) {
   1555             msix->pba_offset = 0xb400;
   1556         } else if (vdev->msix_relo == OFF_AUTOPCIBAR_OFF) {
   1557             error_setg(errp, "hardware reports invalid configuration, "
   1558                        "MSIX PBA outside of specified BAR");
   1559             g_free(msix);
   1560             return;
   1561         }
   1562     }
   1563 
   1564     trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
   1565                                 msix->table_offset, msix->entries);
   1566     vdev->msix = msix;
   1567 
   1568     vfio_pci_fixup_msix_region(vdev);
   1569 
   1570     vfio_pci_relocate_msix(vdev, errp);
   1571 }
   1572 
   1573 static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
   1574 {
   1575     int ret;
   1576     Error *err = NULL;
   1577 
   1578     vdev->msix->pending = g_new0(unsigned long,
   1579                                  BITS_TO_LONGS(vdev->msix->entries));
   1580     ret = msix_init(&vdev->pdev, vdev->msix->entries,
   1581                     vdev->bars[vdev->msix->table_bar].mr,
   1582                     vdev->msix->table_bar, vdev->msix->table_offset,
   1583                     vdev->bars[vdev->msix->pba_bar].mr,
   1584                     vdev->msix->pba_bar, vdev->msix->pba_offset, pos,
   1585                     &err);
   1586     if (ret < 0) {
   1587         if (ret == -ENOTSUP) {
   1588             warn_report_err(err);
   1589             return 0;
   1590         }
   1591 
   1592         error_propagate(errp, err);
   1593         return ret;
   1594     }
   1595 
   1596     /*
   1597      * The PCI spec suggests that devices provide additional alignment for
   1598      * MSI-X structures and avoid overlapping non-MSI-X related registers.
   1599      * For an assigned device, this hopefully means that emulation of MSI-X
   1600      * structures does not affect the performance of the device.  If devices
   1601      * fail to provide that alignment, a significant performance penalty may
   1602      * result, for instance Mellanox MT27500 VFs:
   1603      * http://www.spinics.net/lists/kvm/msg125881.html
   1604      *
   1605      * Emulating the PBA is simply not worth such a serious regression, and
   1606      * most drivers do not appear to look at it anyway.  The solution is to
   1607      * disable the PBA MemoryRegion unless it's being used.  We disable it
   1608      * here and only enable it if a masked vector fires through QEMU.  When
   1609      * the vector-use notifier is called, which occurs on unmask, we test
   1610      * whether PBA emulation is needed and disable it again if not.
   1611      */
   1612     memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
   1613 
   1614     /*
   1615      * The emulated machine may provide a paravirt interface for MSI-X setup,
   1616      * so it is not strictly necessary to emulate MSI-X here.  This becomes
   1617      * helpful when frequently accessed MMIO registers are located in
   1618      * subpages adjacent to the MSI-X table but the page containing the MSI-X
   1619      * data cannot be mapped because the host page size is bigger than the
   1620      * MSI-X table alignment.
   1621      */
   1622     if (object_property_get_bool(OBJECT(qdev_get_machine()),
   1623                                  "vfio-no-msix-emulation", NULL)) {
   1624         memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
   1625     }
   1626 
   1627     return 0;
   1628 }
   1629 
   1630 static void vfio_teardown_msi(VFIOPCIDevice *vdev)
   1631 {
   1632     msi_uninit(&vdev->pdev);
   1633 
   1634     if (vdev->msix) {
   1635         msix_uninit(&vdev->pdev,
   1636                     vdev->bars[vdev->msix->table_bar].mr,
   1637                     vdev->bars[vdev->msix->pba_bar].mr);
   1638         g_free(vdev->msix->pending);
   1639     }
   1640 }
   1641 
   1642 /*
   1643  * Resource setup
   1644  */
   1645 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
   1646 {
   1647     int i;
   1648 
   1649     for (i = 0; i < PCI_ROM_SLOT; i++) {
   1650         vfio_region_mmaps_set_enabled(&vdev->bars[i].region, enabled);
   1651     }
   1652 }
   1653 
   1654 static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
   1655 {
   1656     VFIOBAR *bar = &vdev->bars[nr];
   1657 
   1658     uint32_t pci_bar;
   1659     int ret;
   1660 
   1661     /* Skip both unimplemented BARs and the upper half of 64-bit BARs. */
   1662     if (!bar->region.size) {
   1663         return;
   1664     }
   1665 
   1666     /* Determine what type of BAR this is for registration */
   1667     ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar),
   1668                 vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
   1669     if (ret != sizeof(pci_bar)) {
   1670         error_report("vfio: Failed to read BAR %d (%m)", nr);
   1671         return;
   1672     }
   1673 
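            /*
             * The low BAR bits encode its type: bit 0 set means I/O port,
             * bits 2:1 select 32- vs 64-bit memory, and bit 3 marks it
             * prefetchable.  For illustration, a raw value of 0xfe00000c
             * decodes as a 64-bit prefetchable memory BAR.
             */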
   1674     pci_bar = le32_to_cpu(pci_bar);
   1675     bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
   1676     bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
   1677     bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
   1678                                          ~PCI_BASE_ADDRESS_MEM_MASK);
   1679     bar->size = bar->region.size;
   1680 }
   1681 
   1682 static void vfio_bars_prepare(VFIOPCIDevice *vdev)
   1683 {
   1684     int i;
   1685 
   1686     for (i = 0; i < PCI_ROM_SLOT; i++) {
   1687         vfio_bar_prepare(vdev, i);
   1688     }
   1689 }
   1690 
   1691 static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
   1692 {
   1693     VFIOBAR *bar = &vdev->bars[nr];
   1694     char *name;
   1695 
   1696     if (!bar->size) {
   1697         return;
   1698     }
   1699 
   1700     bar->mr = g_new0(MemoryRegion, 1);
   1701     name = g_strdup_printf("%s base BAR %d", vdev->vbasedev.name, nr);
   1702     memory_region_init_io(bar->mr, OBJECT(vdev), NULL, NULL, name, bar->size);
   1703     g_free(name);
   1704 
   1705     if (bar->region.size) {
   1706         memory_region_add_subregion(bar->mr, 0, bar->region.mem);
   1707 
   1708         if (vfio_region_mmap(&bar->region)) {
   1709             error_report("Failed to mmap %s BAR %d. Performance may be slow",
   1710                          vdev->vbasedev.name, nr);
   1711         }
   1712     }
   1713 
   1714     pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr);
   1715 }
   1716 
   1717 static void vfio_bars_register(VFIOPCIDevice *vdev)
   1718 {
   1719     int i;
   1720 
   1721     for (i = 0; i < PCI_ROM_SLOT; i++) {
   1722         vfio_bar_register(vdev, i);
   1723     }
   1724 }
   1725 
   1726 static void vfio_bars_exit(VFIOPCIDevice *vdev)
   1727 {
   1728     int i;
   1729 
   1730     for (i = 0; i < PCI_ROM_SLOT; i++) {
   1731         VFIOBAR *bar = &vdev->bars[i];
   1732 
   1733         vfio_bar_quirk_exit(vdev, i);
   1734         vfio_region_exit(&bar->region);
   1735         if (bar->region.size) {
   1736             memory_region_del_subregion(bar->mr, bar->region.mem);
   1737         }
   1738     }
   1739 
   1740     if (vdev->vga) {
   1741         pci_unregister_vga(&vdev->pdev);
   1742         vfio_vga_quirk_exit(vdev);
   1743     }
   1744 }
   1745 
   1746 static void vfio_bars_finalize(VFIOPCIDevice *vdev)
   1747 {
   1748     int i;
   1749 
   1750     for (i = 0; i < PCI_ROM_SLOT; i++) {
   1751         VFIOBAR *bar = &vdev->bars[i];
   1752 
   1753         vfio_bar_quirk_finalize(vdev, i);
   1754         vfio_region_finalize(&bar->region);
   1755         if (bar->size) {
   1756             object_unparent(OBJECT(bar->mr));
   1757             g_free(bar->mr);
   1758         }
   1759     }
   1760 
   1761     if (vdev->vga) {
   1762         vfio_vga_quirk_finalize(vdev);
   1763         for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
   1764             object_unparent(OBJECT(&vdev->vga->region[i].mem));
   1765         }
   1766         g_free(vdev->vga);
   1767     }
   1768 }
   1769 
   1770 /*
   1771  * General setup
   1772  */
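
        /*
         * The size of a standard capability is implied by the chain layout:
         * it runs from @pos to the next-closest capability above it, or to
         * the end of standard config space (0x100) if nothing follows.
         */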
   1773 static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
   1774 {
   1775     uint8_t tmp;
   1776     uint16_t next = PCI_CONFIG_SPACE_SIZE;
   1777 
   1778     for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
   1779          tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT]) {
   1780         if (tmp > pos && tmp < next) {
   1781             next = tmp;
   1782         }
   1783     }
   1784 
   1785     return next - pos;
   1786 }
   1787 
   1788 
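        /*
         * Likewise for extended capabilities, bounding the walk at the end
         * of extended config space (PCIE_CONFIG_SPACE_SIZE).
         */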
   1789 static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos)
   1790 {
   1791     uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE;
   1792 
   1793     for (tmp = PCI_CONFIG_SPACE_SIZE; tmp;
   1794         tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) {
   1795         if (tmp > pos && tmp < next) {
   1796             next = tmp;
   1797         }
   1798     }
   1799 
   1800     return next - pos;
   1801 }
   1802 
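        /*
         * Helpers for emulating config space fields: store the value in the
         * QEMU copy of config space, make those bits read-only to the guest
         * via wmask, and flag them in emulated_config_bits so reads of those
         * bits are served from the QEMU copy rather than the device.
         */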
   1803 static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
   1804 {
   1805     pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
   1806 }
   1807 
   1808 static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos,
   1809                                    uint16_t val, uint16_t mask)
   1810 {
   1811     vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
   1812     vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
   1813     vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
   1814 }
   1815 
   1816 static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
   1817 {
   1818     pci_set_long(buf, (pci_get_long(buf) & ~mask) | val);
   1819 }
   1820 
   1821 static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos,
   1822                                    uint32_t val, uint32_t mask)
   1823 {
   1824     vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
   1825     vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
   1826     vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
   1827 }
   1828 
   1829 static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size,
   1830                                Error **errp)
   1831 {
   1832     uint16_t flags;
   1833     uint8_t type;
   1834 
   1835     flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
   1836     type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
   1837 
   1838     if (type != PCI_EXP_TYPE_ENDPOINT &&
   1839         type != PCI_EXP_TYPE_LEG_END &&
   1840         type != PCI_EXP_TYPE_RC_END) {
   1841 
   1842         error_setg(errp, "assignment of PCIe type 0x%x "
   1843                    "devices is not currently supported", type);
   1844         return -EINVAL;
   1845     }
   1846 
   1847     if (!pci_bus_is_express(pci_get_bus(&vdev->pdev))) {
   1848         PCIBus *bus = pci_get_bus(&vdev->pdev);
   1849         PCIDevice *bridge;
   1850 
   1851         /*
   1852          * Traditionally PCI device assignment exposes the PCIe capability
   1853          * as-is on non-express buses.  The reason being that some drivers
   1854          * simply assume that it's there, for example tg3.  However when
   1855          * we're running on a native PCIe machine type, like Q35, we need
   1856          * to hide the PCIe capability.  The reason for this is twofold;
   1857          * first Windows guests get a Code 10 error when the PCIe capability
   1858          * is exposed in this configuration.  Therefore express devices won't
   1859          * work at all unless they're attached to express buses in the VM.
   1860          * Second, a native PCIe machine introduces the possibility of fine
   1861          * granularity IOMMUs supporting both translation and isolation.
   1862          * Guest code to discover the IOMMU visibility of a device, such as
   1863          * IOMMU grouping code on Linux, is very aware of device types and
   1864          * valid transitions between bus types.  An express device on a non-
   1865          * express bus is not a valid combination on bare metal systems.
   1866          *
   1867          * Drivers that require a PCIe capability to make the device
   1868          * functional are simply going to need to have their devices placed
   1869          * on a PCIe bus in the VM.
   1870          */
   1871         while (!pci_bus_is_root(bus)) {
   1872             bridge = pci_bridge_get_device(bus);
   1873             bus = pci_get_bus(bridge);
   1874         }
   1875 
   1876         if (pci_bus_is_express(bus)) {
   1877             return 0;
   1878         }
   1879 
   1880     } else if (pci_bus_is_root(pci_get_bus(&vdev->pdev))) {
   1881         /*
   1882          * On a Root Complex bus Endpoints become Root Complex Integrated
   1883          * Endpoints, which changes the type and clears the LNK & LNK2 fields.
   1884          */
   1885         if (type == PCI_EXP_TYPE_ENDPOINT) {
   1886             vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
   1887                                    PCI_EXP_TYPE_RC_END << 4,
   1888                                    PCI_EXP_FLAGS_TYPE);
   1889 
   1890             /* Link Capabilities, Status, and Control go away */
   1891             if (size > PCI_EXP_LNKCTL) {
   1892                 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
   1893                 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
   1894                 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);
   1895 
   1896 #ifndef PCI_EXP_LNKCAP2
   1897 #define PCI_EXP_LNKCAP2 44
   1898 #endif
   1899 #ifndef PCI_EXP_LNKSTA2
   1900 #define PCI_EXP_LNKSTA2 50
   1901 #endif
   1902                 /* Link 2 Capabilities, Status, and Control go away */
   1903                 if (size > PCI_EXP_LNKCAP2) {
   1904                     vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
   1905                     vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
   1906                     vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
   1907                 }
   1908             }
   1909 
   1910         } else if (type == PCI_EXP_TYPE_LEG_END) {
   1911             /*
   1912              * Legacy endpoints don't belong on the root complex.  Windows
   1913              * seems to be happier with devices if we skip the capability.
   1914              */
   1915             return 0;
   1916         }
   1917 
   1918     } else {
   1919         /*
   1920          * Convert Root Complex Integrated Endpoints to regular endpoints.
   1921          * These devices don't support LNK/LNK2 capabilities, so make them up.
   1922          */
   1923         if (type == PCI_EXP_TYPE_RC_END) {
   1924             vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
   1925                                    PCI_EXP_TYPE_ENDPOINT << 4,
   1926                                    PCI_EXP_FLAGS_TYPE);
   1927             vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
   1928                            QEMU_PCI_EXP_LNKCAP_MLW(QEMU_PCI_EXP_LNK_X1) |
   1929                            QEMU_PCI_EXP_LNKCAP_MLS(QEMU_PCI_EXP_LNK_2_5GT), ~0);
   1930             vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
   1931         }
   1932     }
   1933 
   1934     /*
   1935      * Intel 82599 SR-IOV VFs report an invalid PCIe capability version 0
   1936      * (Niantic errata #35), causing Windows to error with a Code 10 for the
   1937      * device on Q35.  Fix up any such devices to report version 1.  If we
   1938      * were to remove the capability entirely the guest would lose extended
   1939      * config space.
   1940      */
   1941     if ((flags & PCI_EXP_FLAGS_VERS) == 0) {
   1942         vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
   1943                                1, PCI_EXP_FLAGS_VERS);
   1944     }
   1945 
   1946     pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size,
   1947                              errp);
   1948     if (pos < 0) {
   1949         return pos;
   1950     }
   1951 
   1952     vdev->pdev.exp.exp_cap = pos;
   1953 
   1954     return pos;
   1955 }
   1956 
   1957 static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
   1958 {
   1959     uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
   1960 
   1961     if (cap & PCI_EXP_DEVCAP_FLR) {
   1962         trace_vfio_check_pcie_flr(vdev->vbasedev.name);
   1963         vdev->has_flr = true;
   1964     }
   1965 }
   1966 
   1967 static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos)
   1968 {
   1969     uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);
   1970 
   1971     if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
   1972         trace_vfio_check_pm_reset(vdev->vbasedev.name);
   1973         vdev->has_pm_reset = true;
   1974     }
   1975 }
   1976 
   1977 static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
   1978 {
   1979     uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);
   1980 
   1981     if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
   1982         trace_vfio_check_af_flr(vdev->vbasedev.name);
   1983         vdev->has_flr = true;
   1984     }
   1985 }
   1986 
   1987 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
   1988 {
   1989     PCIDevice *pdev = &vdev->pdev;
   1990     uint8_t cap_id, next, size;
   1991     int ret;
   1992 
   1993     cap_id = pdev->config[pos];
   1994     next = pdev->config[pos + PCI_CAP_LIST_NEXT];
   1995 
   1996     /*
   1997      * If it becomes important to configure capabilities to their actual
   1998      * size, use this as the default when it's something we don't recognize.
   1999      * Since QEMU doesn't actually handle many of the config accesses,
   2000      * tracking the exact size doesn't seem worthwhile.
   2001      */
   2002     size = vfio_std_cap_max_size(pdev, pos);
   2003 
   2004     /*
   2005      * pci_add_capability always inserts the new capability at the head
   2006      * of the chain.  Therefore to end up with a chain that matches the
   2007      * physical device, we insert from the end by making this recursive.
   2008      * This is also why we pre-calculate size above as cached config space
   2009      * will be changed as we unwind the stack.
   2010      */
   2011     if (next) {
   2012         ret = vfio_add_std_cap(vdev, next, errp);
   2013         if (ret) {
   2014             return ret;
   2015         }
   2016     } else {
   2017         /* Begin the rebuild, use QEMU emulated list bits */
   2018         pdev->config[PCI_CAPABILITY_LIST] = 0;
   2019         vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
   2020         vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
   2021 
   2022         ret = vfio_add_virt_caps(vdev, errp);
   2023         if (ret) {
   2024             return ret;
   2025         }
   2026     }
   2027 
   2028     /* Scale down size, especially in case virt caps were added above */
   2029     size = MIN(size, vfio_std_cap_max_size(pdev, pos));
   2030 
   2031     /* Use emulated next pointer to allow dropping caps */
   2032     pci_set_byte(vdev->emulated_config_bits + pos + PCI_CAP_LIST_NEXT, 0xff);
   2033 
   2034     switch (cap_id) {
   2035     case PCI_CAP_ID_MSI:
   2036         ret = vfio_msi_setup(vdev, pos, errp);
   2037         break;
   2038     case PCI_CAP_ID_EXP:
   2039         vfio_check_pcie_flr(vdev, pos);
   2040         ret = vfio_setup_pcie_cap(vdev, pos, size, errp);
   2041         break;
   2042     case PCI_CAP_ID_MSIX:
   2043         ret = vfio_msix_setup(vdev, pos, errp);
   2044         break;
   2045     case PCI_CAP_ID_PM:
   2046         vfio_check_pm_reset(vdev, pos);
   2047         vdev->pm_cap = pos;
   2048         ret = pci_add_capability(pdev, cap_id, pos, size, errp);
   2049         break;
   2050     case PCI_CAP_ID_AF:
   2051         vfio_check_af_flr(vdev, pos);
   2052         ret = pci_add_capability(pdev, cap_id, pos, size, errp);
   2053         break;
   2054     default:
   2055         ret = pci_add_capability(pdev, cap_id, pos, size, errp);
   2056         break;
   2057     }
   2058 
   2059     if (ret < 0) {
   2060         error_prepend(errp,
   2061                       "failed to add PCI capability 0x%x[0x%x]@0x%x: ",
   2062                       cap_id, size, pos);
   2063         return ret;
   2064     }
   2065 
   2066     return 0;
   2067 }
   2068 
   2069 static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
   2070 {
   2071     PCIDevice *pdev = &vdev->pdev;
   2072     uint32_t header;
   2073     uint16_t cap_id, next, size;
   2074     uint8_t cap_ver;
   2075     uint8_t *config;
   2076 
   2077     /* Only add extended caps if we have them and the guest can see them */
   2078     if (!pci_is_express(pdev) || !pci_bus_is_express(pci_get_bus(pdev)) ||
   2079         !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
   2080         return;
   2081     }
   2082 
   2083     /*
   2084      * pcie_add_capability always inserts the new capability at the tail
   2085      * of the chain.  Therefore to end up with a chain that matches the
   2086      * physical device, we cache the config space to avoid overwriting
   2087      * the original config space when we parse the extended capabilities.
   2088      */
   2089     config = g_memdup(pdev->config, vdev->config_size);
   2090 
   2091     /*
   2092      * Extended capabilities are chained with each pointing to the next, so we
   2093      * can drop anything other than the head of the chain simply by modifying
   2094      * the previous next pointer.  Seed the head of the chain here such that
   2095      * we can simply skip any capabilities we want to drop below, regardless
   2096      * of their position in the chain.  If this stub capability still exists
   2097      * after we add the capabilities we want to expose, update the capability
   2098      * ID to zero.  Note that we cannot seed with the capability header being
   2099      * zero as this conflicts with the definition of an absent capability chain
   2100      * and prevents capabilities beyond the head of the list from being added.
   2101      * By replacing the dummy capability ID with zero after walking the device
   2102      * chain, we also transparently mark extended capabilities as absent if
   2103      * no capabilities were added.  Note that the PCIe spec defines an absence
   2104      * of extended capabilities to be determined by a value of zero for the
   2105      * capability ID, version, AND next pointer.  A non-zero next pointer
   2106      * should be sufficient to indicate additional capabilities are present,
   2107      * which will occur if we call pcie_add_capability() below.  The entire
   2108      * first dword is emulated to support this.
   2109      *
   2110      * NB. The kernel side does similar masking, so be prepared that our
   2111      * view of the device may also contain a capability ID zero in the head
   2112      * of the chain.  Skip it for the same reason that we cannot seed the
   2113      * chain with a zero capability.
   2114      */
   2115     pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE,
   2116                  PCI_EXT_CAP(0xFFFF, 0, 0));
   2117     pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0);
   2118     pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0);
   2119 
   2120     for (next = PCI_CONFIG_SPACE_SIZE; next;
   2121          next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
   2122         header = pci_get_long(config + next);
   2123         cap_id = PCI_EXT_CAP_ID(header);
   2124         cap_ver = PCI_EXT_CAP_VER(header);
   2125 
   2126         /*
   2127          * If it becomes important to configure extended capabilities to their
   2128          * actual size, use this as the default when it's something we don't
   2129          * recognize. Since QEMU doesn't actually handle many of the config
   2130          * accesses, tracking the exact size doesn't seem worthwhile.
   2131          */
   2132         size = vfio_ext_cap_max_size(config, next);
   2133 
   2134         /* Use emulated next pointer to allow dropping extended caps */
   2135         pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
   2136                                    PCI_EXT_CAP_NEXT_MASK);
   2137 
   2138         switch (cap_id) {
   2139         case 0: /* kernel masked capability */
   2140         case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */
   2141         case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */
   2142         case PCI_EXT_CAP_ID_REBAR: /* Can't expose read-only */
   2143             trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
   2144             break;
   2145         default:
   2146             pcie_add_capability(pdev, cap_id, cap_ver, next, size);
   2147         }
   2148 
   2149     }
   2150 
   2151     /* Cleanup chain head ID if necessary */
   2152     if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
   2153         pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
   2154     }
   2155 
   2156     g_free(config);
   2157     return;
   2158 }
   2159 
   2160 static int vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp)
   2161 {
   2162     PCIDevice *pdev = &vdev->pdev;
   2163     int ret;
   2164 
   2165     if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
   2166         !pdev->config[PCI_CAPABILITY_LIST]) {
   2167         return 0; /* Nothing to add */
   2168     }
   2169 
   2170     ret = vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST], errp);
   2171     if (ret) {
   2172         return ret;
   2173     }
   2174 
   2175     vfio_add_ext_cap(vdev);
   2176     return 0;
   2177 }
   2178 
   2179 static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
   2180 {
   2181     PCIDevice *pdev = &vdev->pdev;
   2182     uint16_t cmd;
   2183 
   2184     vfio_disable_interrupts(vdev);
   2185 
   2186     /* Make sure the device is in D0 */
   2187     if (vdev->pm_cap) {
   2188         uint16_t pmcsr;
   2189         uint8_t state;
   2190 
   2191         pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
   2192         state = pmcsr & PCI_PM_CTRL_STATE_MASK;
   2193         if (state) {
   2194             pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
   2195             vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
   2196             /* vfio handles the necessary delay here */
   2197             pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
   2198             state = pmcsr & PCI_PM_CTRL_STATE_MASK;
   2199             if (state) {
   2200                 error_report("vfio: Unable to power on device, stuck in D%d",
   2201                              state);
   2202             }
   2203         }
   2204     }
   2205 
   2206     /*
   2207      * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
   2208      * Also put INTx Disable in known state.
   2209      */
   2210     cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
   2211     cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
   2212              PCI_COMMAND_INTX_DISABLE);
   2213     vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
   2214 }
   2215 
   2216 static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
   2217 {
   2218     Error *err = NULL;
   2219     int nr;
   2220 
   2221     vfio_intx_enable(vdev, &err);
   2222     if (err) {
   2223         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
   2224     }
   2225 
   2226     for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) {
   2227         off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr);
   2228         uint32_t val = 0;
   2229         uint32_t len = sizeof(val);
   2230 
   2231         if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) {
   2232             error_report("%s(%s) reset bar %d failed: %m", __func__,
   2233                          vdev->vbasedev.name, nr);
   2234         }
   2235     }
   2236 
   2237     vfio_quirk_reset(vdev);
   2238 }
   2239 
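        /*
         * Match a host address against a VFIO device name in the canonical
         * "DDDD:BB:DD.F" sysfs form; tmp[13] exactly fits the 12-character
         * string plus its NUL terminator.
         */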
   2240 static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
   2241 {
   2242     char tmp[13];
   2243 
   2244     sprintf(tmp, "%04x:%02x:%02x.%1x", addr->domain,
   2245             addr->bus, addr->slot, addr->function);
   2246 
   2247     return (strcmp(tmp, name) == 0);
   2248 }
   2249 
   2250 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
   2251 {
   2252     VFIOGroup *group;
   2253     struct vfio_pci_hot_reset_info *info;
   2254     struct vfio_pci_dependent_device *devices;
   2255     struct vfio_pci_hot_reset *reset;
   2256     int32_t *fds;
   2257     int ret, i, count;
   2258     bool multi = false;
   2259 
   2260     trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
   2261 
   2262     if (!single) {
   2263         vfio_pci_pre_reset(vdev);
   2264     }
   2265     vdev->vbasedev.needs_reset = false;
   2266 
   2267     info = g_malloc0(sizeof(*info));
   2268     info->argsz = sizeof(*info);
   2269 
   2270     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
   2271     if (ret && errno != ENOSPC) {
   2272         ret = -errno;
   2273         if (!vdev->has_pm_reset) {
   2274             error_report("vfio: Cannot reset device %s, "
   2275                          "no available reset mechanism.", vdev->vbasedev.name);
   2276         }
   2277         goto out_single;
   2278     }
   2279 
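            /*
             * The call above used a bare header; it is expected to fail with
             * ENOSPC while still reporting the dependent device count.  Size
             * the array to match and query again for the full list.
             */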
   2280     count = info->count;
   2281     info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
   2282     info->argsz = sizeof(*info) + (count * sizeof(*devices));
   2283     devices = &info->devices[0];
   2284 
   2285     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
   2286     if (ret) {
   2287         ret = -errno;
   2288         error_report("vfio: hot reset info failed: %m");
   2289         goto out_single;
   2290     }
   2291 
   2292     trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
   2293 
   2294     /* Verify that we have all the groups required */
   2295     for (i = 0; i < info->count; i++) {
   2296         PCIHostDeviceAddress host;
   2297         VFIOPCIDevice *tmp;
   2298         VFIODevice *vbasedev_iter;
   2299 
   2300         host.domain = devices[i].segment;
   2301         host.bus = devices[i].bus;
   2302         host.slot = PCI_SLOT(devices[i].devfn);
   2303         host.function = PCI_FUNC(devices[i].devfn);
   2304 
   2305         trace_vfio_pci_hot_reset_dep_devices(host.domain,
   2306                 host.bus, host.slot, host.function, devices[i].group_id);
   2307 
   2308         if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
   2309             continue;
   2310         }
   2311 
   2312         QLIST_FOREACH(group, &vfio_group_list, next) {
   2313             if (group->groupid == devices[i].group_id) {
   2314                 break;
   2315             }
   2316         }
   2317 
   2318         if (!group) {
   2319             if (!vdev->has_pm_reset) {
   2320                 error_report("vfio: Cannot reset device %s, "
   2321                              "depends on group %d which is not owned.",
   2322                              vdev->vbasedev.name, devices[i].group_id);
   2323             }
   2324             ret = -EPERM;
   2325             goto out;
   2326         }
   2327 
   2328         /* Prep dependent devices for reset and clear our marker. */
   2329         QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
   2330             if (!vbasedev_iter->dev->realized ||
   2331                 vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
   2332                 continue;
   2333             }
   2334             tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
   2335             if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
   2336                 if (single) {
   2337                     ret = -EINVAL;
   2338                     goto out_single;
   2339                 }
   2340                 vfio_pci_pre_reset(tmp);
   2341                 tmp->vbasedev.needs_reset = false;
   2342                 multi = true;
   2343                 break;
   2344             }
   2345         }
   2346     }
   2347 
   2348     if (!single && !multi) {
   2349         ret = -EINVAL;
   2350         goto out_single;
   2351     }
   2352 
   2353     /* Determine how many group fds need to be passed */
   2354     count = 0;
   2355     QLIST_FOREACH(group, &vfio_group_list, next) {
   2356         for (i = 0; i < info->count; i++) {
   2357             if (group->groupid == devices[i].group_id) {
   2358                 count++;
   2359                 break;
   2360             }
   2361         }
   2362     }
   2363 
   2364     reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
   2365     reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
   2366     fds = &reset->group_fds[0];
   2367 
   2368     /* Fill in group fds */
   2369     QLIST_FOREACH(group, &vfio_group_list, next) {
   2370         for (i = 0; i < info->count; i++) {
   2371             if (group->groupid == devices[i].group_id) {
   2372                 fds[reset->count++] = group->fd;
   2373                 break;
   2374             }
   2375         }
   2376     }
   2377 
   2378     /* Bus reset! */
   2379     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
   2380     g_free(reset);
   2381 
   2382     trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
   2383                                     ret ? strerror(errno) : "Success");
   2384 
   2385 out:
   2386     /* Re-enable INTx on affected devices */
   2387     for (i = 0; i < info->count; i++) {
   2388         PCIHostDeviceAddress host;
   2389         VFIOPCIDevice *tmp;
   2390         VFIODevice *vbasedev_iter;
   2391 
   2392         host.domain = devices[i].segment;
   2393         host.bus = devices[i].bus;
   2394         host.slot = PCI_SLOT(devices[i].devfn);
   2395         host.function = PCI_FUNC(devices[i].devfn);
   2396 
   2397         if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
   2398             continue;
   2399         }
   2400 
   2401         QLIST_FOREACH(group, &vfio_group_list, next) {
   2402             if (group->groupid == devices[i].group_id) {
   2403                 break;
   2404             }
   2405         }
   2406 
   2407         if (!group) {
   2408             break;
   2409         }
   2410 
   2411         QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
   2412             if (!vbasedev_iter->dev->realized ||
   2413                 vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
   2414                 continue;
   2415             }
   2416             tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
   2417             if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
   2418                 vfio_pci_post_reset(tmp);
   2419                 break;
   2420             }
   2421         }
   2422     }
   2423 out_single:
   2424     if (!single) {
   2425         vfio_pci_post_reset(vdev);
   2426     }
   2427     g_free(info);
   2428 
   2429     return ret;
   2430 }
   2431 
   2432 /*
   2433  * We want to differentiate hot reset of multiple in-use devices vs hot reset
   2434  * of a single in-use device.  VFIO_DEVICE_RESET will already handle the case
   2435  * of doing hot resets when there is only a single device per bus.  "In-use"
   2436  * here refers to how many VFIODevices are affected.  A hot reset that affects
   2437  * multiple devices, but only a single in-use device, means that we can call
   2438  * it from our bus ->reset() callback since the extent is effectively a single
   2439  * device.  This allows us to make use of it in the hotplug path.  When there
   2440  * are multiple in-use devices, we can only trigger the hot reset during a
   2441  * system reset and thus from our reset handler.  We separate _one vs _multi
   2442  * here so that we don't overlap and do a double reset on the system reset
   2443  * path where both our reset handler and ->reset() callback are used.  Calling
   2444  * _one() will only do a hot reset for the one in-use devices case, calling
   2445  * _one() will only do a hot reset in the single in-use device case; calling
   2446  */
   2447 static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev)
   2448 {
   2449     return vfio_pci_hot_reset(vdev, true);
   2450 }
   2451 
   2452 static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev)
   2453 {
   2454     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
   2455     return vfio_pci_hot_reset(vdev, false);
   2456 }
   2457 
   2458 static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev)
   2459 {
   2460     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
   2461     if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
   2462         vbasedev->needs_reset = true;
   2463     }
   2464 }
   2465 
   2466 static Object *vfio_pci_get_object(VFIODevice *vbasedev)
   2467 {
   2468     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
   2469 
   2470     return OBJECT(vdev);
   2471 }
   2472 
   2473 static bool vfio_msix_present(void *opaque, int version_id)
   2474 {
   2475     PCIDevice *pdev = opaque;
   2476 
   2477     return msix_present(pdev);
   2478 }
   2479 
   2480 const VMStateDescription vmstate_vfio_pci_config = {
   2481     .name = "VFIOPCIDevice",
   2482     .version_id = 1,
   2483     .minimum_version_id = 1,
   2484     .fields = (VMStateField[]) {
   2485         VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
   2486         VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present),
   2487         VMSTATE_END_OF_LIST()
   2488     }
   2489 };
   2490 
   2491 static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
   2492 {
   2493     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
   2494 
   2495     vmstate_save_state(f, &vmstate_vfio_pci_config, vdev, NULL);
   2496 }
   2497 
   2498 static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
   2499 {
   2500     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
   2501     PCIDevice *pdev = &vdev->pdev;
   2502     pcibus_t old_addr[PCI_NUM_REGIONS - 1];
   2503     int bar, ret;
   2504 
   2505     for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
   2506         old_addr[bar] = pdev->io_regions[bar].addr;
   2507     }
   2508 
   2509     ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1);
   2510     if (ret) {
   2511         return ret;
   2512     }
   2513 
   2514     vfio_pci_write_config(pdev, PCI_COMMAND,
   2515                           pci_get_word(pdev->config + PCI_COMMAND), 2);
   2516 
   2517     for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
   2518         /*
   2519          * The address may not be changed in some scenarios
   2520          * (e.g. the VF driver isn't loaded in the VM).
   2521          */
   2522         if (old_addr[bar] != pdev->io_regions[bar].addr &&
   2523             vdev->bars[bar].region.size > 0 &&
   2524             vdev->bars[bar].region.size < qemu_real_host_page_size()) {
   2525             vfio_sub_page_bar_update_mapping(pdev, bar);
   2526         }
   2527     }
   2528 
   2529     if (msi_enabled(pdev)) {
   2530         vfio_msi_enable(vdev);
   2531     } else if (msix_enabled(pdev)) {
   2532         vfio_msix_enable(vdev);
   2533     }
   2534 
   2535     return ret;
   2536 }
   2537 
   2538 static VFIODeviceOps vfio_pci_ops = {
   2539     .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
   2540     .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
   2541     .vfio_eoi = vfio_intx_eoi,
   2542     .vfio_get_object = vfio_pci_get_object,
   2543     .vfio_save_config = vfio_pci_save_config,
   2544     .vfio_load_config = vfio_pci_load_config,
   2545 };
   2546 
   2547 int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
   2548 {
   2549     VFIODevice *vbasedev = &vdev->vbasedev;
   2550     struct vfio_region_info *reg_info;
   2551     int ret;
   2552 
   2553     ret = vfio_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info);
   2554     if (ret) {
   2555         error_setg_errno(errp, -ret,
   2556                          "failed getting region info for VGA region index %d",
   2557                          VFIO_PCI_VGA_REGION_INDEX);
   2558         return ret;
   2559     }
   2560 
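            /*
             * The region must be readable and writable and must cover the
             * legacy VGA ranges through the MMIO window at 0xa0000-0xbffff,
             * hence the minimum size of 0xc0000 bytes checked below.
             */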
   2561     if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) ||
   2562         !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) ||
   2563         reg_info->size < 0xbffff + 1) {
   2564         error_setg(errp, "unexpected VGA info, flags 0x%lx, size 0x%lx",
   2565                    (unsigned long)reg_info->flags,
   2566                    (unsigned long)reg_info->size);
   2567         g_free(reg_info);
   2568         return -EINVAL;
   2569     }
   2570 
   2571     vdev->vga = g_new0(VFIOVGA, 1);
   2572 
   2573     vdev->vga->fd_offset = reg_info->offset;
   2574     vdev->vga->fd = vdev->vbasedev.fd;
   2575 
   2576     g_free(reg_info);
   2577 
   2578     vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
   2579     vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
   2580     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks);
   2581 
   2582     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
   2583                           OBJECT(vdev), &vfio_vga_ops,
   2584                           &vdev->vga->region[QEMU_PCI_VGA_MEM],
   2585                           "vfio-vga-mmio@0xa0000",
   2586                           QEMU_PCI_VGA_MEM_SIZE);
   2587 
   2588     vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
   2589     vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
   2590     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks);
   2591 
   2592     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
   2593                           OBJECT(vdev), &vfio_vga_ops,
   2594                           &vdev->vga->region[QEMU_PCI_VGA_IO_LO],
   2595                           "vfio-vga-io@0x3b0",
   2596                           QEMU_PCI_VGA_IO_LO_SIZE);
   2597 
   2598     vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
   2599     vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
   2600     QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks);
   2601 
   2602     memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
   2603                           OBJECT(vdev), &vfio_vga_ops,
   2604                           &vdev->vga->region[QEMU_PCI_VGA_IO_HI],
   2605                           "vfio-vga-io@0x3c0",
   2606                           QEMU_PCI_VGA_IO_HI_SIZE);
   2607 
   2608     pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
   2609                      &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
   2610                      &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem);
   2611 
   2612     return 0;
   2613 }
   2614 
   2615 static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
   2616 {
   2617     VFIODevice *vbasedev = &vdev->vbasedev;
   2618     struct vfio_region_info *reg_info;
   2619     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
   2620     int i, ret = -1;
   2621 
   2622     /* Sanity check device */
   2623     if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) {
   2624         error_setg(errp, "this isn't a PCI device");
   2625         return;
   2626     }
   2627 
   2628     if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
   2629         error_setg(errp, "unexpected number of io regions %u",
   2630                    vbasedev->num_regions);
   2631         return;
   2632     }
   2633 
   2634     if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
   2635         error_setg(errp, "unexpected number of irqs %u", vbasedev->num_irqs);
   2636         return;
   2637     }
   2638 
   2639     for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
   2640         char *name = g_strdup_printf("%s BAR %d", vbasedev->name, i);
   2641 
   2642         ret = vfio_region_setup(OBJECT(vdev), vbasedev,
   2643                                 &vdev->bars[i].region, i, name);
   2644         g_free(name);
   2645 
   2646         if (ret) {
   2647             error_setg_errno(errp, -ret, "failed to get region %d info", i);
   2648             return;
   2649         }
   2650 
   2651         QLIST_INIT(&vdev->bars[i].quirks);
   2652     }
   2653 
   2654     ret = vfio_get_region_info(vbasedev,
   2655                                VFIO_PCI_CONFIG_REGION_INDEX, &reg_info);
   2656     if (ret) {
   2657         error_setg_errno(errp, -ret, "failed to get config info");
   2658         return;
   2659     }
   2660 
   2661     trace_vfio_populate_device_config(vdev->vbasedev.name,
   2662                                       (unsigned long)reg_info->size,
   2663                                       (unsigned long)reg_info->offset,
   2664                                       (unsigned long)reg_info->flags);
   2665 
   2666     vdev->config_size = reg_info->size;
   2667     if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
   2668         vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
   2669     }
   2670     vdev->config_offset = reg_info->offset;
   2671 
   2672     g_free(reg_info);
   2673 
   2674     if (vdev->features & VFIO_FEATURE_ENABLE_VGA) {
   2675         ret = vfio_populate_vga(vdev, errp);
   2676         if (ret) {
   2677             error_append_hint(errp, "device does not support "
   2678                               "requested feature x-vga\n");
   2679             return;
   2680         }
   2681     }
   2682 
   2683     irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
   2684 
   2685     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
   2686     if (ret) {
   2687         /* This can fail for an old kernel or legacy PCI dev */
   2688         trace_vfio_populate_device_get_irq_info_failure(strerror(errno));
   2689     } else if (irq_info.count == 1) {
   2690         vdev->pci_aer = true;
   2691     } else {
   2692         warn_report(VFIO_MSG_PREFIX
   2693                     "Could not enable error recovery for the device",
   2694                     vbasedev->name);
   2695     }
   2696 }
   2697 
   2698 static void vfio_put_device(VFIOPCIDevice *vdev)
   2699 {
   2700     g_free(vdev->vbasedev.name);
   2701     g_free(vdev->msix);
   2702 
   2703     vfio_put_base_device(&vdev->vbasedev);
   2704 }
   2705 
   2706 static void vfio_err_notifier_handler(void *opaque)
   2707 {
   2708     VFIOPCIDevice *vdev = opaque;
   2709 
   2710     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
   2711         return;
   2712     }
   2713 
   2714     /*
   2715      * TBD. Retrieve the error details and decide what action
   2716      * needs to be taken. One of the actions could be to pass
   2717      * the error to the guest and have the guest driver recover
   2718      * from the error. This requires that PCIe capabilities be
   2719      * exposed to the guest. For now, we just terminate the
   2720      * guest to contain the error.
   2721      */
   2722 
   2723     error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);
   2724 
   2725     vm_stop(RUN_STATE_INTERNAL_ERROR);
   2726 }
   2727 
   2728 /*
   2729  * Registers error notifier for devices supporting error recovery.
   2730  * If we encounter a failure in this function, we report an error
   2731  * and continue after disabling error recovery support for the
   2732  * device.
   2733  */
   2734 static void vfio_register_err_notifier(VFIOPCIDevice *vdev)
   2735 {
   2736     Error *err = NULL;
   2737     int32_t fd;
   2738 
   2739     if (!vdev->pci_aer) {
   2740         return;
   2741     }
   2742 
   2743     if (event_notifier_init(&vdev->err_notifier, 0)) {
   2744         error_report("vfio: Unable to init event notifier for error detection");
   2745         vdev->pci_aer = false;
   2746         return;
   2747     }
   2748 
   2749     fd = event_notifier_get_fd(&vdev->err_notifier);
   2750     qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev);
   2751 
   2752     if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
   2753                                VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
   2754         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
   2755         qemu_set_fd_handler(fd, NULL, NULL, vdev);
   2756         event_notifier_cleanup(&vdev->err_notifier);
   2757         vdev->pci_aer = false;
   2758     }
   2759 }
   2760 
   2761 static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
   2762 {
   2763     Error *err = NULL;
   2764 
   2765     if (!vdev->pci_aer) {
   2766         return;
   2767     }
   2768 
   2769     if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
   2770                                VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
   2771         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
   2772     }
   2773     qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
   2774                         NULL, NULL, vdev);
   2775     event_notifier_cleanup(&vdev->err_notifier);
   2776 }
   2777 
   2778 static void vfio_req_notifier_handler(void *opaque)
   2779 {
   2780     VFIOPCIDevice *vdev = opaque;
   2781     Error *err = NULL;
   2782 
   2783     if (!event_notifier_test_and_clear(&vdev->req_notifier)) {
   2784         return;
   2785     }
   2786 
   2787     qdev_unplug(DEVICE(vdev), &err);
   2788     if (err) {
   2789         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
   2790     }
   2791 }
   2792 
   2793 static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
   2794 {
   2795     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
   2796                                       .index = VFIO_PCI_REQ_IRQ_INDEX };
   2797     Error *err = NULL;
   2798     int32_t fd;
   2799 
   2800     if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) {
   2801         return;
   2802     }
   2803 
   2804     if (ioctl(vdev->vbasedev.fd,
   2805               VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) {
   2806         return;
   2807     }
   2808 
   2809     if (event_notifier_init(&vdev->req_notifier, 0)) {
   2810         error_report("vfio: Unable to init event notifier for device request");
   2811         return;
   2812     }
   2813 
   2814     fd = event_notifier_get_fd(&vdev->req_notifier);
   2815     qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev);
   2816 
   2817     if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
   2818                                VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
   2819         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
   2820         qemu_set_fd_handler(fd, NULL, NULL, vdev);
   2821         event_notifier_cleanup(&vdev->req_notifier);
   2822     } else {
   2823         vdev->req_enabled = true;
   2824     }
   2825 }
   2826 
   2827 static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
   2828 {
   2829     Error *err = NULL;
   2830 
   2831     if (!vdev->req_enabled) {
   2832         return;
   2833     }
   2834 
   2835     if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
   2836                                VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
   2837         error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
   2838     }
   2839     qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier),
   2840                         NULL, NULL, vdev);
   2841     event_notifier_cleanup(&vdev->req_notifier);
   2842 
   2843     vdev->req_enabled = false;
   2844 }
   2845 
   2846 static void vfio_realize(PCIDevice *pdev, Error **errp)
   2847 {
   2848     VFIOPCIDevice *vdev = VFIO_PCI(pdev);
   2849     VFIODevice *vbasedev = &vdev->vbasedev;
   2850     VFIODevice *vbasedev_iter;
   2851     VFIOGroup *group;
   2852     char *tmp, *subsys, group_path[PATH_MAX], *group_name;
   2853     Error *err = NULL;
   2854     ssize_t len;
   2855     struct stat st;
   2856     int groupid;
   2857     int i, ret;
   2858     bool is_mdev;
   2859 
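            /*
             * host.{domain,bus,slot,function} default to ~0U (see
             * vfio_instance_init()), so ~field is non-zero iff the user set
             * it; the error below fires only when neither sysfsdev nor any
             * host field was given.
             */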
   2860     if (!vbasedev->sysfsdev) {
   2861         if (!(~vdev->host.domain || ~vdev->host.bus ||
   2862               ~vdev->host.slot || ~vdev->host.function)) {
   2863             error_setg(errp, "No host device provided");
   2864             error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F "
   2865                               "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n");
   2866             return;
   2867         }
   2868         vbasedev->sysfsdev =
   2869             g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x",
   2870                             vdev->host.domain, vdev->host.bus,
   2871                             vdev->host.slot, vdev->host.function);
   2872     }
   2873 
   2874     if (stat(vbasedev->sysfsdev, &st) < 0) {
   2875         error_setg_errno(errp, errno, "no such host device");
   2876         error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev);
   2877         return;
   2878     }
   2879 
   2880     vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
   2881     vbasedev->ops = &vfio_pci_ops;
   2882     vbasedev->type = VFIO_DEVICE_TYPE_PCI;
   2883     vbasedev->dev = DEVICE(vdev);
   2884 
   2885     tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev);
   2886     len = readlink(tmp, group_path, sizeof(group_path));
   2887     g_free(tmp);
   2888 
   2889     if (len <= 0 || len >= sizeof(group_path)) {
   2890         error_setg_errno(errp, len < 0 ? errno : ENAMETOOLONG,
   2891                          "no iommu_group found");
   2892         goto error;
   2893     }
   2894 
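            /* readlink() does not NUL-terminate its result */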
   2895     group_path[len] = 0;
   2896 
   2897     group_name = basename(group_path);
   2898     if (sscanf(group_name, "%d", &groupid) != 1) {
   2899         error_setg_errno(errp, errno, "failed to read %s", group_path);
   2900         goto error;
   2901     }
   2902 
   2903     trace_vfio_realize(vbasedev->name, groupid);
   2904 
   2905     group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev), errp);
   2906     if (!group) {
   2907         goto error;
   2908     }
   2909 
   2910     QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
   2911         if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
   2912             error_setg(errp, "device is already attached");
   2913             vfio_put_group(group);
   2914             goto error;
   2915         }
   2916     }
   2917 
   2918     /*
   2919      * Mediated devices *might* operate compatibly with discarding of RAM, but
   2920      * we cannot know for certain; it depends on whether the mdev vendor driver
   2921      * stays in sync with the active working set of the guest driver.  Refuse
   2922      * the x-balloon-allowed option unless the device is at least an mdev.
   2923      */
   2924     tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev);
   2925     subsys = realpath(tmp, NULL);
   2926     g_free(tmp);
   2927     is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0);
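            /* realpath() allocates with malloc(), hence free(), not g_free() */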
   2928     free(subsys);
   2929 
   2930     trace_vfio_mdev(vbasedev->name, is_mdev);
   2931 
   2932     if (vbasedev->ram_block_discard_allowed && !is_mdev) {
   2933         error_setg(errp, "x-balloon-allowed only potentially compatible "
   2934                    "with mdev devices");
   2935         vfio_put_group(group);
   2936         goto error;
   2937     }
   2938 
   2939     ret = vfio_get_device(group, vbasedev->name, vbasedev, errp);
   2940     if (ret) {
   2941         vfio_put_group(group);
   2942         goto error;
   2943     }
   2944 
   2945     vfio_populate_device(vdev, &err);
   2946     if (err) {
   2947         error_propagate(errp, err);
   2948         goto error;
   2949     }
   2950 
   2951     /* Get a copy of config space */
   2952     ret = pread(vbasedev->fd, vdev->pdev.config,
   2953                 MIN(pci_config_size(&vdev->pdev), vdev->config_size),
   2954                 vdev->config_offset);
   2955     if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
   2956         ret = ret < 0 ? -errno : -EFAULT;
   2957         error_setg_errno(errp, -ret, "failed to read device config space");
   2958         goto error;
   2959     }
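            /*
             * config_offset/config_size were filled in by vfio_populate_device()
             * using VFIO_DEVICE_GET_REGION_INFO on VFIO_PCI_CONFIG_REGION_INDEX;
             * config space is accessed like any other region of the device fd.
             */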
   2960 
   2961     /* vfio emulates a lot for us, but some bits need extra love */
   2962     vdev->emulated_config_bits = g_malloc0(vdev->config_size);
   2963 
   2964     /* QEMU can choose to expose the ROM or not */
   2965     memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
   2966     /* QEMU can also add or extend BARs */
   2967     memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4);
   2968 
   2969     /*
   2970      * The PCI spec reserves vendor ID 0xffff as an invalid value.  The
   2971      * device ID is managed by the vendor and need only be a 16-bit value.
   2972      * Allow any 16-bit value for subsystem so they can be hidden or changed.
   2973      */
   2974     if (vdev->vendor_id != PCI_ANY_ID) {
   2975         if (vdev->vendor_id >= 0xffff) {
   2976             error_setg(errp, "invalid PCI vendor ID provided");
   2977             goto error;
   2978         }
   2979         vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
   2980         trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id);
   2981     } else {
   2982         vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
   2983     }
   2984 
   2985     if (vdev->device_id != PCI_ANY_ID) {
   2986         if (vdev->device_id > 0xffff) {
   2987             error_setg(errp, "invalid PCI device ID provided");
   2988             goto error;
   2989         }
   2990         vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
   2991         trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id);
   2992     } else {
   2993         vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
   2994     }
   2995 
   2996     if (vdev->sub_vendor_id != PCI_ANY_ID) {
   2997         if (vdev->sub_vendor_id > 0xffff) {
   2998             error_setg(errp, "invalid PCI subsystem vendor ID provided");
   2999             goto error;
   3000         }
   3001         vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
   3002                                vdev->sub_vendor_id, ~0);
   3003         trace_vfio_pci_emulated_sub_vendor_id(vbasedev->name,
   3004                                               vdev->sub_vendor_id);
   3005     }
   3006 
   3007     if (vdev->sub_device_id != PCI_ANY_ID) {
   3008         if (vdev->sub_device_id > 0xffff) {
   3009             error_setg(errp, "invalid PCI subsystem device ID provided");
   3010             goto error;
   3011         }
   3012         vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0);
   3013         trace_vfio_pci_emulated_sub_device_id(vbasedev->name,
   3014                                               vdev->sub_device_id);
   3015     }
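
            /*
             * Together these let the guest-visible identity be overridden,
             * e.g. (values purely illustrative):
             *
             *   -device vfio-pci,host=0000:01:00.0,x-pci-vendor-id=0x1234,x-pci-device-id=0x5678
             *
             * Anything left at PCI_ANY_ID keeps the physical value read from
             * config space above.
             */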
   3016 
   3017     /* QEMU can change multi-function devices to single function, or the reverse */
   3018     vdev->emulated_config_bits[PCI_HEADER_TYPE] =
   3019                                               PCI_HEADER_TYPE_MULTI_FUNCTION;
   3020 
   3021     /* Restore or clear multifunction, this is always controlled by QEMU */
   3022     if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
   3023         vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
   3024     } else {
   3025         vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
   3026     }
   3027 
   3028     /*
   3029      * Clear host resource mapping info.  If we choose not to register a
   3030      * BAR, such as might be the case with the option ROM, we can get
   3031      * confusing, unwritable, residual addresses from the host here.
   3032      */
   3033     memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24); /* 6 BARs */
   3034     memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
   3035 
   3036     vfio_pci_size_rom(vdev);
   3037 
   3038     vfio_bars_prepare(vdev);
   3039 
   3040     vfio_msix_early_setup(vdev, &err);
   3041     if (err) {
   3042         error_propagate(errp, err);
   3043         goto error;
   3044     }
   3045 
   3046     vfio_bars_register(vdev);
   3047 
   3048     ret = vfio_add_capabilities(vdev, errp);
   3049     if (ret) {
   3050         goto out_teardown;
   3051     }
   3052 
   3053     if (vdev->vga) {
   3054         vfio_vga_quirk_setup(vdev);
   3055     }
   3056 
   3057     for (i = 0; i < PCI_ROM_SLOT; i++) {
   3058         vfio_bar_quirk_setup(vdev, i);
   3059     }
   3060 
   3061     if (!vdev->igd_opregion &&
   3062         vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) {
   3063         struct vfio_region_info *opregion;
   3064 
   3065         if (vdev->pdev.qdev.hotplugged) {
   3066             error_setg(errp,
   3067                        "cannot support IGD OpRegion feature on hotplugged "
   3068                        "device");
   3069             goto out_teardown;
   3070         }
   3071 
   3072         ret = vfio_get_dev_region_info(vbasedev,
   3073                         VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
   3074                         VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion);
   3075         if (ret) {
   3076             error_setg_errno(errp, -ret,
   3077                              "does not support requested IGD OpRegion feature");
   3078             goto out_teardown;
   3079         }
   3080 
   3081         ret = vfio_pci_igd_opregion_init(vdev, opregion, errp);
   3082         g_free(opregion);
   3083         if (ret) {
   3084             goto out_teardown;
   3085         }
   3086     }
   3087 
   3088     /* QEMU emulates all of MSI & MSIX */
   3089     if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
   3090         memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
   3091                MSIX_CAP_LENGTH);
   3092     }
   3093 
   3094     if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
   3095         memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
   3096                vdev->msi_cap_size);
   3097     }
   3098 
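            /*
             * A non-zero PCI_INTERRUPT_PIN register means the device may
             * assert legacy INTx, so wire up routing notification and the
             * INTx mmap-disable timer.
             */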
   3099     if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
   3100         vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
   3101                                                   vfio_intx_mmap_enable, vdev);
   3102         pci_device_set_intx_routing_notifier(&vdev->pdev,
   3103                                              vfio_intx_routing_notifier);
   3104         vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
   3105         kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
   3106         ret = vfio_intx_enable(vdev, errp);
   3107         if (ret) {
   3108             goto out_deregister;
   3109         }
   3110     }
   3111 
   3112     if (vdev->display != ON_OFF_AUTO_OFF) {
   3113         ret = vfio_display_probe(vdev, errp);
   3114         if (ret) {
   3115             goto out_deregister;
   3116         }
   3117     }
   3118     if (vdev->enable_ramfb && vdev->dpy == NULL) {
   3119         error_setg(errp, "ramfb=on requires display=on");
   3120         goto out_deregister;
   3121     }
   3122     if (vdev->display_xres || vdev->display_yres) {
   3123         if (vdev->dpy == NULL) {
   3124             error_setg(errp, "xres and yres properties require display=on");
   3125             goto out_deregister;
   3126         }
   3127         if (vdev->dpy->edid_regs == NULL) {
   3128             error_setg(errp, "xres and yres properties need edid support");
   3129             goto out_deregister;
   3130         }
   3131     }
   3132 
   3133     if (vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) {
   3134         ret = vfio_pci_nvidia_v100_ram_init(vdev, errp);
   3135         if (ret && ret != -ENODEV) {
   3136             error_report("Failed to set up NVIDIA V100 GPU RAM");
   3137         }
   3138     }
   3139 
   3140     if (vfio_pci_is(vdev, PCI_VENDOR_ID_IBM, PCI_ANY_ID)) {
   3141         ret = vfio_pci_nvlink2_init(vdev, errp);
   3142         if (ret && ret != -ENODEV) {
   3143             error_report("Failed to set up NVlink2 bridge");
   3144         }
   3145     }
   3146 
   3147     if (!pdev->failover_pair_id) {
   3148         ret = vfio_migration_probe(vbasedev, errp);
   3149         if (ret) {
   3150             error_report("%s: Migration disabled", vbasedev->name);
   3151         }
   3152     }
   3153 
   3154     vfio_register_err_notifier(vdev);
   3155     vfio_register_req_notifier(vdev);
   3156     vfio_setup_resetfn_quirk(vdev);
   3157 
   3158     return;
   3159 
   3160 out_deregister:
   3161     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
   3162     kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
   3163 out_teardown:
   3164     vfio_teardown_msi(vdev);
   3165     vfio_bars_exit(vdev);
   3166 error:
   3167     error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name);
   3168 }
   3169 
   3170 static void vfio_instance_finalize(Object *obj)
   3171 {
   3172     VFIOPCIDevice *vdev = VFIO_PCI(obj);
   3173     VFIOGroup *group = vdev->vbasedev.group;
   3174 
   3175     vfio_display_finalize(vdev);
   3176     vfio_bars_finalize(vdev);
   3177     g_free(vdev->emulated_config_bits);
   3178     g_free(vdev->rom);
   3179     /*
   3180      * XXX Leaking igd_opregion is not an oversight; we can't remove the
   3181      * fw_cfg entry, so leaking this allocation seems like the safest
   3182      * option.
   3183      *
   3184      * g_free(vdev->igd_opregion);
   3185      */
   3186     vfio_put_device(vdev);
   3187     vfio_put_group(group);
   3188 }
   3189 
   3190 static void vfio_exitfn(PCIDevice *pdev)
   3191 {
   3192     VFIOPCIDevice *vdev = VFIO_PCI(pdev);
   3193 
   3194     vfio_unregister_req_notifier(vdev);
   3195     vfio_unregister_err_notifier(vdev);
   3196     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
   3197     if (vdev->irqchip_change_notifier.notify) {
   3198         kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
   3199     }
   3200     vfio_disable_interrupts(vdev);
   3201     if (vdev->intx.mmap_timer) {
   3202         timer_free(vdev->intx.mmap_timer);
   3203     }
   3204     vfio_teardown_msi(vdev);
   3205     vfio_bars_exit(vdev);
   3206     vfio_migration_finalize(&vdev->vbasedev);
   3207 }
   3208 
   3209 static void vfio_pci_reset(DeviceState *dev)
   3210 {
   3211     VFIOPCIDevice *vdev = VFIO_PCI(dev);
   3212 
   3213     trace_vfio_pci_reset(vdev->vbasedev.name);
   3214 
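            /*
             * Reset methods below are tried in order of preference: a
             * device-specific quirk, VFIO_DEVICE_RESET (preferred when the
             * device has FLR), a VFIO hot (bus) reset, and finally a PM
             * reset as the last resort.
             */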
   3215     vfio_pci_pre_reset(vdev);
   3216 
   3217     if (vdev->display != ON_OFF_AUTO_OFF) {
   3218         vfio_display_reset(vdev);
   3219     }
   3220 
   3221     if (vdev->resetfn && !vdev->resetfn(vdev)) {
   3222         goto post_reset;
   3223     }
   3224 
   3225     if (vdev->vbasedev.reset_works &&
   3226         (vdev->has_flr || !vdev->has_pm_reset) &&
   3227         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
   3228         trace_vfio_pci_reset_flr(vdev->vbasedev.name);
   3229         goto post_reset;
   3230     }
   3231 
   3232     /* See if we can do our own bus reset */
   3233     if (!vfio_pci_hot_reset_one(vdev)) {
   3234         goto post_reset;
   3235     }
   3236 
   3237     /* If nothing else works and the device supports PM reset, use it */
   3238     if (vdev->vbasedev.reset_works && vdev->has_pm_reset &&
   3239         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
   3240         trace_vfio_pci_reset_pm(vdev->vbasedev.name);
   3241         goto post_reset;
   3242     }
   3243 
   3244 post_reset:
   3245     vfio_pci_post_reset(vdev);
   3246 }
   3247 
   3248 static void vfio_instance_init(Object *obj)
   3249 {
   3250     PCIDevice *pci_dev = PCI_DEVICE(obj);
   3251     VFIOPCIDevice *vdev = VFIO_PCI(obj);
   3252 
   3253     device_add_bootindex_property(obj, &vdev->bootindex,
   3254                                   "bootindex", NULL,
   3255                                   &pci_dev->qdev);
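            /* ~0U marks the host address as unset; vfio_realize() checks this */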
   3256     vdev->host.domain = ~0U;
   3257     vdev->host.bus = ~0U;
   3258     vdev->host.slot = ~0U;
   3259     vdev->host.function = ~0U;
   3260 
   3261     vdev->nv_gpudirect_clique = 0xFF;
   3262 
   3263     /* QEMU_PCI_CAP_EXPRESS initialization does not depend on the QEMU command
   3264      * line, so there is no need to defer it to realize as other devices do. */
   3265     pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS;
   3266 }
   3267 
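        /*
         * Properties prefixed with "x-" are experimental and may change or
         * disappear without the usual deprecation cycle.
         */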
   3268 static Property vfio_pci_dev_properties[] = {
   3269     DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
   3270     DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
   3271     DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice,
   3272                             vbasedev.pre_copy_dirty_page_tracking,
   3273                             ON_OFF_AUTO_ON),
   3274     DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice,
   3275                             display, ON_OFF_AUTO_OFF),
   3276     DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0),
   3277     DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0),
   3278     DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
   3279                        intx.mmap_timeout, 1100),
   3280     DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
   3281                     VFIO_FEATURE_ENABLE_VGA_BIT, false),
   3282     DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
   3283                     VFIO_FEATURE_ENABLE_REQ_BIT, true),
   3284     DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
   3285                     VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
   3286     DEFINE_PROP_BOOL("x-enable-migration", VFIOPCIDevice,
   3287                      vbasedev.enable_migration, false),
   3288     DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
   3289     DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
   3290                      vbasedev.ram_block_discard_allowed, false),
   3291     DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
   3292     DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
   3293     DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
   3294     DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
   3295                      no_geforce_quirks, false),
   3296     DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
   3297                      false),
   3298     DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
   3299                      false),
   3300     DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
   3301     DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
   3302     DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
   3303                        sub_vendor_id, PCI_ANY_ID),
   3304     DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
   3305                        sub_device_id, PCI_ANY_ID),
   3306     DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
   3307     DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
   3308                                    nv_gpudirect_clique,
   3309                                    qdev_prop_nv_gpudirect_clique, uint8_t),
   3310     DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
   3311                                 OFF_AUTOPCIBAR_OFF),
   3312     /*
   3313      * TODO - support passed fds... is this necessary?
   3314      * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
   3315      * DEFINE_PROP_STRING("vfiogroupfd", VFIOPCIDevice, vfiogroupfd_name),
   3316      */
   3317     DEFINE_PROP_END_OF_LIST(),
   3318 };
   3319 
   3320 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
   3321 {
   3322     DeviceClass *dc = DEVICE_CLASS(klass);
   3323     PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
   3324 
   3325     dc->reset = vfio_pci_reset;
   3326     device_class_set_props(dc, vfio_pci_dev_properties);
   3327     dc->desc = "VFIO-based PCI device assignment";
   3328     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
   3329     pdc->realize = vfio_realize;
   3330     pdc->exit = vfio_exitfn;
   3331     pdc->config_read = vfio_pci_read_config;
   3332     pdc->config_write = vfio_pci_write_config;
   3333 }
   3334 
   3335 static const TypeInfo vfio_pci_dev_info = {
   3336     .name = TYPE_VFIO_PCI,
   3337     .parent = TYPE_PCI_DEVICE,
   3338     .instance_size = sizeof(VFIOPCIDevice),
   3339     .class_init = vfio_pci_dev_class_init,
   3340     .instance_init = vfio_instance_init,
   3341     .instance_finalize = vfio_instance_finalize,
   3342     .interfaces = (InterfaceInfo[]) {
   3343         { INTERFACE_PCIE_DEVICE },
   3344         { INTERFACE_CONVENTIONAL_PCI_DEVICE },
   3345         { }
   3346     },
   3347 };
   3348 
   3349 static Property vfio_pci_dev_nohotplug_properties[] = {
   3350     DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false),
   3351     DEFINE_PROP_END_OF_LIST(),
   3352 };
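
        /*
         * ramfb does not support hotplug, hence the property only exists on
         * this non-hotpluggable variant.  Illustrative usage (host address
         * hypothetical):
         *
         *   -device vfio-pci-nohotplug,host=0000:00:02.0,display=on,ramfb=on
         *
         * vfio_realize() rejects ramfb=on unless a display was probed.
         */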
   3353 
   3354 static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, void *data)
   3355 {
   3356     DeviceClass *dc = DEVICE_CLASS(klass);
   3357 
   3358     device_class_set_props(dc, vfio_pci_dev_nohotplug_properties);
   3359     dc->hotpluggable = false;
   3360 }
   3361 
   3362 static const TypeInfo vfio_pci_nohotplug_dev_info = {
   3363     .name = TYPE_VFIO_PCI_NOHOTPLUG,
   3364     .parent = TYPE_VFIO_PCI,
   3365     .instance_size = sizeof(VFIOPCIDevice),
   3366     .class_init = vfio_pci_nohotplug_dev_class_init,
   3367 };
   3368 
   3369 static void register_vfio_pci_dev_type(void)
   3370 {
   3371     type_register_static(&vfio_pci_dev_info);
   3372     type_register_static(&vfio_pci_nohotplug_dev_info);
   3373 }
   3374 
   3375 type_init(register_vfio_pci_dev_type)