qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git

whpx-all.c (87034B)


      1 /*
      2  * QEMU Windows Hypervisor Platform accelerator (WHPX)
      3  *
      4  * Copyright Microsoft Corp. 2017
      5  *
      6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
      7  * See the COPYING file in the top-level directory.
      8  *
      9  */
     10 
     11 #include "qemu/osdep.h"
     12 #include "cpu.h"
     13 #include "exec/address-spaces.h"
     14 #include "exec/ioport.h"
     15 #include "exec/gdbstub.h"
     16 #include "qemu/accel.h"
     17 #include "sysemu/whpx.h"
     18 #include "sysemu/cpus.h"
     19 #include "sysemu/runstate.h"
     20 #include "qemu/main-loop.h"
     21 #include "hw/boards.h"
     22 #include "hw/i386/ioapic.h"
     23 #include "hw/i386/apic_internal.h"
     24 #include "qemu/error-report.h"
     25 #include "qapi/error.h"
     26 #include "qapi/qapi-types-common.h"
     27 #include "qapi/qapi-visit-common.h"
     28 #include "migration/blocker.h"
     29 #include <winerror.h>
     30 
     31 #include "whpx-internal.h"
     32 #include "whpx-accel-ops.h"
     33 
     34 #include <WinHvPlatform.h>
     35 #include <WinHvEmulation.h>
     36 
     37 #define HYPERV_APIC_BUS_FREQUENCY      (200000000ULL)
     38 
     39 static const WHV_REGISTER_NAME whpx_register_names[] = {
     40 
     41     /* X64 General purpose registers */
     42     WHvX64RegisterRax,
     43     WHvX64RegisterRcx,
     44     WHvX64RegisterRdx,
     45     WHvX64RegisterRbx,
     46     WHvX64RegisterRsp,
     47     WHvX64RegisterRbp,
     48     WHvX64RegisterRsi,
     49     WHvX64RegisterRdi,
     50     WHvX64RegisterR8,
     51     WHvX64RegisterR9,
     52     WHvX64RegisterR10,
     53     WHvX64RegisterR11,
     54     WHvX64RegisterR12,
     55     WHvX64RegisterR13,
     56     WHvX64RegisterR14,
     57     WHvX64RegisterR15,
     58     WHvX64RegisterRip,
     59     WHvX64RegisterRflags,
     60 
     61     /* X64 Segment registers */
     62     WHvX64RegisterEs,
     63     WHvX64RegisterCs,
     64     WHvX64RegisterSs,
     65     WHvX64RegisterDs,
     66     WHvX64RegisterFs,
     67     WHvX64RegisterGs,
     68     WHvX64RegisterLdtr,
     69     WHvX64RegisterTr,
     70 
     71     /* X64 Table registers */
     72     WHvX64RegisterIdtr,
     73     WHvX64RegisterGdtr,
     74 
     75     /* X64 Control Registers */
     76     WHvX64RegisterCr0,
     77     WHvX64RegisterCr2,
     78     WHvX64RegisterCr3,
     79     WHvX64RegisterCr4,
     80     WHvX64RegisterCr8,
     81 
     82     /* X64 Debug Registers */
     83     /*
     84      * WHvX64RegisterDr0,
     85      * WHvX64RegisterDr1,
     86      * WHvX64RegisterDr2,
     87      * WHvX64RegisterDr3,
     88      * WHvX64RegisterDr6,
     89      * WHvX64RegisterDr7,
     90      */
     91 
     92     /* X64 Floating Point and Vector Registers */
     93     WHvX64RegisterXmm0,
     94     WHvX64RegisterXmm1,
     95     WHvX64RegisterXmm2,
     96     WHvX64RegisterXmm3,
     97     WHvX64RegisterXmm4,
     98     WHvX64RegisterXmm5,
     99     WHvX64RegisterXmm6,
    100     WHvX64RegisterXmm7,
    101     WHvX64RegisterXmm8,
    102     WHvX64RegisterXmm9,
    103     WHvX64RegisterXmm10,
    104     WHvX64RegisterXmm11,
    105     WHvX64RegisterXmm12,
    106     WHvX64RegisterXmm13,
    107     WHvX64RegisterXmm14,
    108     WHvX64RegisterXmm15,
    109     WHvX64RegisterFpMmx0,
    110     WHvX64RegisterFpMmx1,
    111     WHvX64RegisterFpMmx2,
    112     WHvX64RegisterFpMmx3,
    113     WHvX64RegisterFpMmx4,
    114     WHvX64RegisterFpMmx5,
    115     WHvX64RegisterFpMmx6,
    116     WHvX64RegisterFpMmx7,
    117     WHvX64RegisterFpControlStatus,
    118     WHvX64RegisterXmmControlStatus,
    119 
    120     /* X64 MSRs */
    121     WHvX64RegisterEfer,
    122 #ifdef TARGET_X86_64
    123     WHvX64RegisterKernelGsBase,
    124 #endif
    125     WHvX64RegisterApicBase,
    126     /* WHvX64RegisterPat, */
    127     WHvX64RegisterSysenterCs,
    128     WHvX64RegisterSysenterEip,
    129     WHvX64RegisterSysenterEsp,
    130     WHvX64RegisterStar,
    131 #ifdef TARGET_X86_64
    132     WHvX64RegisterLstar,
    133     WHvX64RegisterCstar,
    134     WHvX64RegisterSfmask,
    135 #endif
    136 
    137     /* Interrupt / Event Registers */
    138     /*
    139      * WHvRegisterPendingInterruption,
    140      * WHvRegisterInterruptState,
    141      * WHvRegisterPendingEvent0,
    142      * WHvRegisterPendingEvent1
    143      * WHvX64RegisterDeliverabilityNotifications,
    144      */
    145 };
    146 
    147 struct whpx_register_set {
    148     WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
    149 };
    150 
    151 /*
    152  * The current implementation of instruction stepping sets the TF flag
    153  * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
    154  * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
    155  *
    156  * This approach has a few limitations:
    157  *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
    158  *        along with the other flags, possibly restoring it later. It would
    159  *        result in another INT1 when the flags are restored, triggering
    160  *        a stop in gdb that could be cleared by doing another step.
    161  *
    162  *        Stepping over a POPF/LAHF instruction will let it overwrite the
    163  *        TF flags, ending the stepping mode.
    164  *
    165  *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
    166  *        or anything that could result in a page fault) will save the flags
    167  *        to the stack, clear the TF flag, and let the guest execute the
    168  *        handler. Normally, the guest will restore the original flags,
    169  *        which will resume single-stepping.
    170  *
    171  *     3. Debuggers running inside the guest may wish to set TF to do
    172  *        instruction stepping. The INT1 events they generate would be
    173  *        intercepted by us as long as gdb is connected to QEMU.
    174  *
    175  * In practice this means that:
    176  *     1. Stepping through flags-modifying instructions may cause gdb to
    177  *        continue or stop in unexpected places. This will be fully recoverable
    178  *        and will not crash the target.
    179  *
    180  *     2. Stepping over an instruction that triggers an exception will step
    181  *        over the exception handler, not into it.
    182  *
    183  *     3. Debugging the guest via gdb while simultaneously running a debugger
    184  *        inside the guest may lead to unexpected effects. Removing all
    185  *        breakpoints set via QEMU will prevent any further interference
    186  *        with the guest-level debuggers.
    187  *
    188  * The limitations can be addressed as shown below:
    189  *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
    190  *        stepping through them. The exact semantics of the instructions are
    191  *        defined in the "Combined Volume Set of Intel 64 and IA-32
    192  *        Architectures Software Developer's Manuals", however it involves a
    193  *        fair amount of corner cases due to compatibility with real mode,
    194  *        virtual 8086 mode, and differences between 64-bit and 32-bit modes.
    195  *
    196  *     2. We could step into the guest's exception handlers using the following
    197  *        sequence:
    198  *          a. Temporarily enable catching of all exception types via
    199  *             whpx_set_exception_exit_bitmap().
    200  *          b. Once an exception is intercepted, read the IDT/GDT and locate
    201  *             the original handler.
    202  *          c. Patch the original handler, injecting an INT3 at the beginning.
    203  *          d. Update the exception exit bitmap to only catch the
    204  *             WHvX64ExceptionTypeBreakpointTrap exception.
    205  *          e. Let the affected CPU run in the exclusive mode.
    206  *          f. Restore the original handler and the exception exit bitmap.
    207  *        Note that handling all corner cases related to IDT/GDT is harder
    208  *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
    209  *        rough idea.
    210  *
    211  *     3. In order to properly support guest-level debugging in parallel with
    212  *        the QEMU-level debugging, we would need to be able to pass some INT1
    213  *        events to the guest. This could be done via the following methods:
    214  *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
    215  *             it seems to only work for interrupts and not software
    216  *             exceptions.
    217  *          b. Locating and patching the original handler by parsing IDT/GDT.
    218  *             This involves relatively complex logic outlined in the previous
    219  *             paragraph.
    220  *          c. Emulating the exception invocation (i.e. manually updating RIP,
    221  *             RFLAGS, and pushing the old values to stack). This is even more
    222  *             complicated than the previous option, since it involves checking
    223  *             CPL, gate attributes, and doing various adjustments depending
    224  *             on the current CPU mode, whether the CPL is changing, etc.
    225  */
    226 typedef enum WhpxStepMode {
    227     WHPX_STEP_NONE = 0,
    228     /* Halt other VCPUs */
    229     WHPX_STEP_EXCLUSIVE,
    230 } WhpxStepMode;
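
/*
 * Editor's illustration (not part of the original file): the TF-based
 * stepping described above amounts to toggling bit 8 (TF) of RFLAGS before
 * resuming the vCPU, which whpx_vcpu_configure_single_stepping() below does
 * through WHvGet/SetVirtualProcessorRegisters(). A minimal standalone
 * sketch, assuming only that TF is bit 8 of RFLAGS:
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

#define SKETCH_TF_MASK (1ULL << 8)  /* Trap Flag; matches QEMU's TF_MASK */

static uint64_t sketch_configure_single_stepping(uint64_t rflags, bool set)
{
    /* Arm: raise #DB (INT1) after the next instruction. Disarm: clear TF. */
    return set ? (rflags | SKETCH_TF_MASK) : (rflags & ~SKETCH_TF_MASK);
}
#endif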
    231 
    232 struct whpx_vcpu {
    233     WHV_EMULATOR_HANDLE emulator;
    234     bool window_registered;
    235     bool interruptable;
    236     bool ready_for_pic_interrupt;
    237     uint64_t tpr;
    238     uint64_t apic_base;
    239     bool interruption_pending;
    240 
    241     /* Must be the last field as it may have a tail */
    242     WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
    243 };
    244 
    245 static bool whpx_allowed;
    246 static bool whp_dispatch_initialized;
    247 static HMODULE hWinHvPlatform, hWinHvEmulation;
    248 static uint32_t max_vcpu_index;
    249 static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;
    250 
    251 struct whpx_state whpx_global;
    252 struct WHPDispatch whp_dispatch;
    253 
    254 static bool whpx_has_xsave(void)
    255 {
    256     return whpx_xsave_cap.XsaveSupport;
    257 }
    258 
    259 /*
    260  * VP support
    261  */
    262 
    263 static struct whpx_vcpu *get_whpx_vcpu(CPUState *cpu)
    264 {
    265     return (struct whpx_vcpu *)cpu->hax_vcpu;
    266 }
    267 
    268 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
    269                                              int r86)
    270 {
    271     WHV_X64_SEGMENT_REGISTER hs;
    272     unsigned flags = qs->flags;
    273 
    274     hs.Base = qs->base;
    275     hs.Limit = qs->limit;
    276     hs.Selector = qs->selector;
    277 
    278     if (v86) {
    279         hs.Attributes = 0;
    280         hs.SegmentType = 3;
    281         hs.Present = 1;
    282         hs.DescriptorPrivilegeLevel = 3;
    283         hs.NonSystemSegment = 1;
    284 
    285     } else {
    286         hs.Attributes = (flags >> DESC_TYPE_SHIFT);
    287 
    288         if (r86) {
    289             /* hs.Base &= 0xfffff; */
    290         }
    291     }
    292 
    293     return hs;
    294 }
    295 
    296 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
    297 {
    298     SegmentCache qs;
    299 
    300     qs.base = hs->Base;
    301     qs.limit = hs->Limit;
    302     qs.selector = hs->Selector;
    303 
    304     qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;
    305 
    306     return qs;
    307 }
    308 
    309 /* X64 Extended Control Registers */
    310 static void whpx_set_xcrs(CPUState *cpu)
    311 {
    312     CPUX86State *env = cpu->env_ptr;
    313     HRESULT hr;
    314     struct whpx_state *whpx = &whpx_global;
    315     WHV_REGISTER_VALUE xcr0;
    316     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
    317 
    318     if (!whpx_has_xsave()) {
    319         return;
    320     }
    321 
    322     /* Only xcr0 is supported by the hypervisor currently */
    323     xcr0.Reg64 = env->xcr0;
    324     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
    325         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
    326     if (FAILED(hr)) {
    327         error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
    328     }
    329 }
    330 
    331 static int whpx_set_tsc(CPUState *cpu)
    332 {
    333     CPUX86State *env = cpu->env_ptr;
    334     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
    335     WHV_REGISTER_VALUE tsc_val;
    336     HRESULT hr;
    337     struct whpx_state *whpx = &whpx_global;
    338 
    339     /*
    340      * Suspend the partition prior to setting the TSC to reduce the variance
    341      * in TSC across vCPUs. When the first vCPU runs post suspend, the
    342      * partition is automatically resumed.
    343      */
    344     if (whp_dispatch.WHvSuspendPartitionTime) {
    345 
    346         /*
    347          * Failing to suspend the partition while setting the TSC is not a
    348          * fatal error. It just increases the likelihood of TSC variance
    349          * between vCPUs, which some guest OSes can handle just fine.
    350          */
    351         hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
    352         if (FAILED(hr)) {
    353             warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
    354         }
    355     }
    356 
    357     tsc_val.Reg64 = env->tsc;
    358     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
    359         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
    360     if (FAILED(hr)) {
    361         error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
    362         return -1;
    363     }
    364 
    365     return 0;
    366 }
    367 
    368 /*
    369  * The CR8 register in the CPU is mapped to the TPR register of the APIC,
    370  * however, they use a slightly different encoding. Specifically:
    371  *
    372  *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
    373  *
    374  * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
    375  * and IA-32 Architectures Software Developer's Manual.
    376  *
    377  * The functions below translate the value of CR8 to TPR and vice versa.
    378  */
    379 
    380 static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
    381 {
    382     return tpr >> 4;
    383 }
    384 
    385 static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
    386 {
    387     return cr8 << 4;
    388 }
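
/*
 * Editor's note (not part of the original file): a quick worked example of
 * the CR8 <-> TPR encoding documented above. CR8 holds only the 4-bit
 * priority class, which maps to bits 7:4 of the APIC TPR, so CR8 == 0x3
 * corresponds to TPR == 0x30. A sketch using the two helpers above:
 */
#if 0
#include <assert.h>

static void sketch_cr8_tpr_roundtrip(void)
{
    assert(whpx_cr8_to_apic_tpr(0x3) == 0x30);  /* CR8[3:0] -> TPR[7:4] */
    assert(whpx_apic_tpr_to_cr8(0x30) == 0x3);  /* ...and back again */
}
#endif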
    389 
    390 static void whpx_set_registers(CPUState *cpu, int level)
    391 {
    392     struct whpx_state *whpx = &whpx_global;
    393     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    394     CPUX86State *env = cpu->env_ptr;
    395     X86CPU *x86_cpu = X86_CPU(cpu);
    396     struct whpx_register_set vcxt;
    397     HRESULT hr;
    398     int idx;
    399     int idx_next;
    400     int i;
    401     int v86, r86;
    402 
    403     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
    404 
    405     /*
    406      * The following MSRs have side effects on the guest or are too heavy
    407      * to set at runtime. Limit them to the full state update.
    408      */
    409     if (level >= WHPX_SET_RESET_STATE) {
    410         whpx_set_tsc(cpu);
    411     }
    412 
    413     memset(&vcxt, 0, sizeof(struct whpx_register_set));
    414 
    415     v86 = (env->eflags & VM_MASK);
    416     r86 = !(env->cr[0] & CR0_PE_MASK);
    417 
    418     vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    419     vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);
    420 
    421     idx = 0;
    422 
    423     /* Indexes for first 16 registers match between HV and QEMU definitions */
    424     idx_next = 16;
    425     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
    426         vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
    427     }
    428     idx = idx_next;
    429 
    430     /* Same goes for RIP and RFLAGS */
    431     assert(whpx_register_names[idx] == WHvX64RegisterRip);
    432     vcxt.values[idx++].Reg64 = env->eip;
    433 
    434     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    435     vcxt.values[idx++].Reg64 = env->eflags;
    436 
    437     /* Translate 6+4 segment registers. HV and QEMU order matches  */
    438     assert(idx == WHvX64RegisterEs);
    439     for (i = 0; i < 6; i += 1, idx += 1) {
    440         vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
    441     }
    442 
    443     assert(idx == WHvX64RegisterLdtr);
    444     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);
    445 
    446     assert(idx == WHvX64RegisterTr);
    447     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);
    448 
    449     assert(idx == WHvX64RegisterIdtr);
    450     vcxt.values[idx].Table.Base = env->idt.base;
    451     vcxt.values[idx].Table.Limit = env->idt.limit;
    452     idx += 1;
    453 
    454     assert(idx == WHvX64RegisterGdtr);
    455     vcxt.values[idx].Table.Base = env->gdt.base;
    456     vcxt.values[idx].Table.Limit = env->gdt.limit;
    457     idx += 1;
    458 
    459     /* CR0, 2, 3, 4, 8 */
    460     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    461     vcxt.values[idx++].Reg64 = env->cr[0];
    462     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    463     vcxt.values[idx++].Reg64 = env->cr[2];
    464     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    465     vcxt.values[idx++].Reg64 = env->cr[3];
    466     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    467     vcxt.values[idx++].Reg64 = env->cr[4];
    468     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    469     vcxt.values[idx++].Reg64 = vcpu->tpr;
    470 
    471     /* 8 Debug Registers - Skipped */
    472 
    473     /*
    474      * Extended control registers need to be handled separately depending
    475      * on whether xsave is supported/enabled or not.
    476      */
    477     whpx_set_xcrs(cpu);
    478 
    479     /* 16 XMM registers */
    480     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    481     idx_next = idx + 16;
    482     for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
    483         vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
    484         vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
    485     }
    486     idx = idx_next;
    487 
    488     /* 8 FP registers */
    489     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    490     for (i = 0; i < 8; i += 1, idx += 1) {
    491         vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
    492         /* vcxt.values[idx].Fp.AsUINT128.High64 =
    493                env->fpregs[i].mmx.MMX_Q(1);
    494         */
    495     }
    496 
    497     /* FP control status register */
    498     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    499     vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
    500     vcxt.values[idx].FpControlStatus.FpStatus =
    501         (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
    502     vcxt.values[idx].FpControlStatus.FpTag = 0;
    503     for (i = 0; i < 8; ++i) {
    504         vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
    505     }
    506     vcxt.values[idx].FpControlStatus.Reserved = 0;
    507     vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
    508     vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
    509     idx += 1;
    510 
    511     /* XMM control status register */
    512     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    513     vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
    514     vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
    515     vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
    516     idx += 1;
    517 
    518     /* MSRs */
    519     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    520     vcxt.values[idx++].Reg64 = env->efer;
    521 #ifdef TARGET_X86_64
    522     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    523     vcxt.values[idx++].Reg64 = env->kernelgsbase;
    524 #endif
    525 
    526     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    527     vcxt.values[idx++].Reg64 = vcpu->apic_base;
    528 
    529     /* WHvX64RegisterPat - Skipped */
    530 
    531     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    532     vcxt.values[idx++].Reg64 = env->sysenter_cs;
    533     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    534     vcxt.values[idx++].Reg64 = env->sysenter_eip;
    535     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    536     vcxt.values[idx++].Reg64 = env->sysenter_esp;
    537     assert(whpx_register_names[idx] == WHvX64RegisterStar);
    538     vcxt.values[idx++].Reg64 = env->star;
    539 #ifdef TARGET_X86_64
    540     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    541     vcxt.values[idx++].Reg64 = env->lstar;
    542     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    543     vcxt.values[idx++].Reg64 = env->cstar;
    544     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    545     vcxt.values[idx++].Reg64 = env->fmask;
    546 #endif
    547 
    548     /* Interrupt / Event Registers - Skipped */
    549 
    550     assert(idx == RTL_NUMBER_OF(whpx_register_names));
    551 
    552     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
    553         whpx->partition, cpu->cpu_index,
    554         whpx_register_names,
    555         RTL_NUMBER_OF(whpx_register_names),
    556         &vcxt.values[0]);
    557 
    558     if (FAILED(hr)) {
    559         error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
    560                      hr);
    561     }
    562 
    563     return;
    564 }
    565 
    566 static int whpx_get_tsc(CPUState *cpu)
    567 {
    568     CPUX86State *env = cpu->env_ptr;
    569     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
    570     WHV_REGISTER_VALUE tsc_val;
    571     HRESULT hr;
    572     struct whpx_state *whpx = &whpx_global;
    573 
    574     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
    575         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
    576     if (FAILED(hr)) {
    577         error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
    578         return -1;
    579     }
    580 
    581     env->tsc = tsc_val.Reg64;
    582     return 0;
    583 }
    584 
    585 /* X64 Extended Control Registers */
    586 static void whpx_get_xcrs(CPUState *cpu)
    587 {
    588     CPUX86State *env = cpu->env_ptr;
    589     HRESULT hr;
    590     struct whpx_state *whpx = &whpx_global;
    591     WHV_REGISTER_VALUE xcr0;
    592     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
    593 
    594     if (!whpx_has_xsave()) {
    595         return;
    596     }
    597 
    598     /* Only xcr0 is supported by the hypervisor currently */
    599     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
    600         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
    601     if (FAILED(hr)) {
    602         error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
    603         return;
    604     }
    605 
    606     env->xcr0 = xcr0.Reg64;
    607 }
    608 
    609 static void whpx_get_registers(CPUState *cpu)
    610 {
    611     struct whpx_state *whpx = &whpx_global;
    612     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    613     CPUX86State *env = cpu->env_ptr;
    614     X86CPU *x86_cpu = X86_CPU(cpu);
    615     struct whpx_register_set vcxt;
    616     uint64_t tpr, apic_base;
    617     HRESULT hr;
    618     int idx;
    619     int idx_next;
    620     int i;
    621 
    622     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
    623 
    624     if (!env->tsc_valid) {
    625         whpx_get_tsc(cpu);
    626         env->tsc_valid = !runstate_is_running();
    627     }
    628 
    629     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
    630         whpx->partition, cpu->cpu_index,
    631         whpx_register_names,
    632         RTL_NUMBER_OF(whpx_register_names),
    633         &vcxt.values[0]);
    634     if (FAILED(hr)) {
    635         error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
    636                      hr);
    637     }
    638 
    639     if (whpx_apic_in_platform()) {
    640         /*
    641          * Fetch the TPR value from the emulated APIC. It may get overwritten
    642          * below with the value from CR8 returned by
    643          * WHvGetVirtualProcessorRegisters().
    644          */
    645         whpx_apic_get(x86_cpu->apic_state);
    646         vcpu->tpr = whpx_apic_tpr_to_cr8(
    647             cpu_get_apic_tpr(x86_cpu->apic_state));
    648     }
    649 
    650     idx = 0;
    651 
    652     /* Indexes for first 16 registers match between HV and QEMU definitions */
    653     idx_next = 16;
    654     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
    655         env->regs[idx] = vcxt.values[idx].Reg64;
    656     }
    657     idx = idx_next;
    658 
    659     /* Same goes for RIP and RFLAGS */
    660     assert(whpx_register_names[idx] == WHvX64RegisterRip);
    661     env->eip = vcxt.values[idx++].Reg64;
    662     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    663     env->eflags = vcxt.values[idx++].Reg64;
    664 
    665     /* Translate 6+4 segment registers. HV and QEMU order matches  */
    666     assert(idx == WHvX64RegisterEs);
    667     for (i = 0; i < 6; i += 1, idx += 1) {
    668         env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
    669     }
    670 
    671     assert(idx == WHvX64RegisterLdtr);
    672     env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    673     assert(idx == WHvX64RegisterTr);
    674     env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    675     assert(idx == WHvX64RegisterIdtr);
    676     env->idt.base = vcxt.values[idx].Table.Base;
    677     env->idt.limit = vcxt.values[idx].Table.Limit;
    678     idx += 1;
    679     assert(idx == WHvX64RegisterGdtr);
    680     env->gdt.base = vcxt.values[idx].Table.Base;
    681     env->gdt.limit = vcxt.values[idx].Table.Limit;
    682     idx += 1;
    683 
    684     /* CR0, 2, 3, 4, 8 */
    685     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    686     env->cr[0] = vcxt.values[idx++].Reg64;
    687     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    688     env->cr[2] = vcxt.values[idx++].Reg64;
    689     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    690     env->cr[3] = vcxt.values[idx++].Reg64;
    691     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    692     env->cr[4] = vcxt.values[idx++].Reg64;
    693     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    694     tpr = vcxt.values[idx++].Reg64;
    695     if (tpr != vcpu->tpr) {
    696         vcpu->tpr = tpr;
    697         cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
    698     }
    699 
    700     /* 8 Debug Registers - Skipped */
    701 
    702     /*
    703      * Extended control registers need to be handled separately depending
    704      * on whether xsave is supported/enabled or not.
    705      */
    706     whpx_get_xcrs(cpu);
    707 
    708     /* 16 XMM registers */
    709     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    710     idx_next = idx + 16;
    711     for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
    712         env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
    713         env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
    714     }
    715     idx = idx_next;
    716 
    717     /* 8 FP registers */
    718     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    719     for (i = 0; i < 8; i += 1, idx += 1) {
    720         env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
    721         /* env->fpregs[i].mmx.MMX_Q(1) =
    722                vcxt.values[idx].Fp.AsUINT128.High64;
    723         */
    724     }
    725 
    726     /* FP control status register */
    727     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    728     env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
    729     env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
    730     env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
    731     for (i = 0; i < 8; ++i) {
    732         env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
    733     }
    734     env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
    735     env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
    736     idx += 1;
    737 
    738     /* XMM control status register */
    739     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    740     env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
    741     idx += 1;
    742 
    743     /* MSRs */
    744     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    745     env->efer = vcxt.values[idx++].Reg64;
    746 #ifdef TARGET_X86_64
    747     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    748     env->kernelgsbase = vcxt.values[idx++].Reg64;
    749 #endif
    750 
    751     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    752     apic_base = vcxt.values[idx++].Reg64;
    753     if (apic_base != vcpu->apic_base) {
    754         vcpu->apic_base = apic_base;
    755         cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
    756     }
    757 
    758     /* WHvX64RegisterPat - Skipped */
    759 
    760     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    761     env->sysenter_cs = vcxt.values[idx++].Reg64;
    762     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    763     env->sysenter_eip = vcxt.values[idx++].Reg64;
    764     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    765     env->sysenter_esp = vcxt.values[idx++].Reg64;
    766     assert(whpx_register_names[idx] == WHvX64RegisterStar);
    767     env->star = vcxt.values[idx++].Reg64;
    768 #ifdef TARGET_X86_64
    769     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    770     env->lstar = vcxt.values[idx++].Reg64;
    771     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    772     env->cstar = vcxt.values[idx++].Reg64;
    773     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    774     env->fmask = vcxt.values[idx++].Reg64;
    775 #endif
    776 
    777     /* Interrupt / Event Registers - Skipped */
    778 
    779     assert(idx == RTL_NUMBER_OF(whpx_register_names));
    780 
    781     if (whpx_apic_in_platform()) {
    782         whpx_apic_get(x86_cpu->apic_state);
    783     }
    784 
    785     x86_update_hflags(env);
    786 
    787     return;
    788 }
    789 
    790 static HRESULT CALLBACK whpx_emu_ioport_callback(
    791     void *ctx,
    792     WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
    793 {
    794     MemTxAttrs attrs = { 0 };
    795     address_space_rw(&address_space_io, IoAccess->Port, attrs,
    796                      &IoAccess->Data, IoAccess->AccessSize,
    797                      IoAccess->Direction);
    798     return S_OK;
    799 }
    800 
    801 static HRESULT CALLBACK whpx_emu_mmio_callback(
    802     void *ctx,
    803     WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
    804 {
    805     cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
    806                            ma->Direction);
    807     return S_OK;
    808 }
    809 
    810 static HRESULT CALLBACK whpx_emu_getreg_callback(
    811     void *ctx,
    812     const WHV_REGISTER_NAME *RegisterNames,
    813     UINT32 RegisterCount,
    814     WHV_REGISTER_VALUE *RegisterValues)
    815 {
    816     HRESULT hr;
    817     struct whpx_state *whpx = &whpx_global;
    818     CPUState *cpu = (CPUState *)ctx;
    819 
    820     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
    821         whpx->partition, cpu->cpu_index,
    822         RegisterNames, RegisterCount,
    823         RegisterValues);
    824     if (FAILED(hr)) {
    825         error_report("WHPX: Failed to get virtual processor registers,"
    826                      " hr=%08lx", hr);
    827     }
    828 
    829     return hr;
    830 }
    831 
    832 static HRESULT CALLBACK whpx_emu_setreg_callback(
    833     void *ctx,
    834     const WHV_REGISTER_NAME *RegisterNames,
    835     UINT32 RegisterCount,
    836     const WHV_REGISTER_VALUE *RegisterValues)
    837 {
    838     HRESULT hr;
    839     struct whpx_state *whpx = &whpx_global;
    840     CPUState *cpu = (CPUState *)ctx;
    841 
    842     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
    843         whpx->partition, cpu->cpu_index,
    844         RegisterNames, RegisterCount,
    845         RegisterValues);
    846     if (FAILED(hr)) {
    847         error_report("WHPX: Failed to set virtual processor registers,"
    848                      " hr=%08lx", hr);
    849     }
    850 
    851     /*
    852      * The emulator just successfully wrote the register state. We clear the
    853      * dirty state so we avoid the double write on resume of the VP.
    854      */
    855     cpu->vcpu_dirty = false;
    856 
    857     return hr;
    858 }
    859 
    860 static HRESULT CALLBACK whpx_emu_translate_callback(
    861     void *ctx,
    862     WHV_GUEST_VIRTUAL_ADDRESS Gva,
    863     WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
    864     WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
    865     WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
    866 {
    867     HRESULT hr;
    868     struct whpx_state *whpx = &whpx_global;
    869     CPUState *cpu = (CPUState *)ctx;
    870     WHV_TRANSLATE_GVA_RESULT res;
    871 
    872     hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
    873                                       Gva, TranslateFlags, &res, Gpa);
    874     if (FAILED(hr)) {
    875         error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
    876     } else {
    877         *TranslationResult = res.ResultCode;
    878     }
    879 
    880     return hr;
    881 }
    882 
    883 static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
    884     .Size = sizeof(WHV_EMULATOR_CALLBACKS),
    885     .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
    886     .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
    887     .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
    888     .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
    889     .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
    890 };
    891 
    892 static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
    893 {
    894     HRESULT hr;
    895     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    896     WHV_EMULATOR_STATUS emu_status;
    897 
    898     hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
    899         vcpu->emulator, cpu,
    900         &vcpu->exit_ctx.VpContext, ctx,
    901         &emu_status);
    902     if (FAILED(hr)) {
    903         error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
    904         return -1;
    905     }
    906 
    907     if (!emu_status.EmulationSuccessful) {
    908         error_report("WHPX: Failed to emulate MMIO access with"
    909                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
    910         return -1;
    911     }
    912 
    913     return 0;
    914 }
    915 
    916 static int whpx_handle_portio(CPUState *cpu,
    917                               WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
    918 {
    919     HRESULT hr;
    920     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    921     WHV_EMULATOR_STATUS emu_status;
    922 
    923     hr = whp_dispatch.WHvEmulatorTryIoEmulation(
    924         vcpu->emulator, cpu,
    925         &vcpu->exit_ctx.VpContext, ctx,
    926         &emu_status);
    927     if (FAILED(hr)) {
    928         error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
    929         return -1;
    930     }
    931 
    932     if (!emu_status.EmulationSuccessful) {
    933         error_report("WHPX: Failed to emulate PortIO access with"
    934                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
    935         return -1;
    936     }
    937 
    938     return 0;
    939 }
    940 
    941 /*
    942  * Controls whether we should intercept various exceptions on the guest,
    943  * namely breakpoint/single-step events.
    944  *
    945  * The 'exceptions' argument accepts a bitmask, e.g.:
    946  * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
    947  */
    948 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
    949 {
    950     struct whpx_state *whpx = &whpx_global;
    951     WHV_PARTITION_PROPERTY prop = { 0, };
    952     HRESULT hr;
    953 
    954     if (exceptions == whpx->exception_exit_bitmap) {
    955         return S_OK;
    956     }
    957 
    958     prop.ExceptionExitBitmap = exceptions;
    959 
    960     hr = whp_dispatch.WHvSetPartitionProperty(
    961         whpx->partition,
    962         WHvPartitionPropertyCodeExceptionExitBitmap,
    963         &prop,
    964         sizeof(WHV_PARTITION_PROPERTY));
    965 
    966     if (SUCCEEDED(hr)) {
    967         whpx->exception_exit_bitmap = exceptions;
    968     }
    969 
    970     return hr;
    971 }
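
/*
 * Editor's usage sketch (not part of the original file): callers pass a
 * bitmask of WHV_EXCEPTION_TYPE bit positions. The real caller in this
 * section is whpx_first_vcpu_starting(); a hypothetical call site that
 * intercepts both debug traps and breakpoint traps could look like this:
 */
#if 0
    hr = whpx_set_exception_exit_bitmap(
        (1UL << WHvX64ExceptionTypeDebugTrapOrFault) |
        (1UL << WHvX64ExceptionTypeBreakpointTrap));
    if (FAILED(hr)) {
        error_report("WHPX: Failed to update exception exit bitmap,"
                     " hr=%08lx", hr);
    }
#endif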
    972 
    973 
    974 /*
    975  * This function is called before/after stepping over a single instruction.
    976  * It will update the CPU registers to arm/disarm the instruction stepping
    977  * accordingly.
    978  */
    979 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
    980     bool set,
    981     uint64_t *exit_context_rflags)
    982 {
    983     WHV_REGISTER_NAME reg_name;
    984     WHV_REGISTER_VALUE reg_value;
    985     HRESULT hr;
    986     struct whpx_state *whpx = &whpx_global;
    987 
    988     /*
    989      * If we are trying to step over a single instruction, we need to set the
    990      * TF bit in rflags. Otherwise, clear it.
    991      */
    992     reg_name = WHvX64RegisterRflags;
    993     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
    994         whpx->partition,
    995         cpu->cpu_index,
    996         &reg_name,
    997         1,
    998         &reg_value);
    999 
   1000     if (FAILED(hr)) {
   1001         error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
   1002         return hr;
   1003     }
   1004 
   1005     if (exit_context_rflags) {
   1006         assert(*exit_context_rflags == reg_value.Reg64);
   1007     }
   1008 
   1009     if (set) {
   1010         /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
   1011         reg_value.Reg64 |= TF_MASK;
   1012     } else {
   1013         reg_value.Reg64 &= ~TF_MASK;
   1014     }
   1015 
   1016     if (exit_context_rflags) {
   1017         *exit_context_rflags = reg_value.Reg64;
   1018     }
   1019 
   1020     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
   1021         whpx->partition,
   1022         cpu->cpu_index,
   1023         &reg_name,
   1024         1,
   1025         &reg_value);
   1026 
   1027     if (FAILED(hr)) {
   1028         error_report("WHPX: Failed to set rflags,"
   1029             " hr=%08lx",
   1030             hr);
   1031         return hr;
   1032     }
   1033 
   1034     reg_name = WHvRegisterInterruptState;
   1035     reg_value.Reg64 = 0;
   1036 
   1037     /* Suspend delivery of hardware interrupts during single-stepping. */
   1038     reg_value.InterruptState.InterruptShadow = set != 0;
   1039 
   1040     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
   1041         whpx->partition,
   1042         cpu->cpu_index,
   1043         &reg_name,
   1044         1,
   1045         &reg_value);
   1046 
   1047     if (FAILED(hr)) {
   1048         error_report("WHPX: Failed to set InterruptState,"
   1049             " hr=%08lx",
   1050             hr);
   1051         return hr;
   1052     }
   1053 
   1054     if (!set) {
   1055         /*
   1056          * We have just finished stepping over a single instruction
   1057          * and intercepted the INT1 generated by it.
   1058          * We now need to hide that INT1 from the guest,
   1059          * as it is not expecting it.
   1060          */
   1061 
   1062         reg_name = WHvX64RegisterPendingDebugException;
   1063         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
   1064             whpx->partition,
   1065             cpu->cpu_index,
   1066             &reg_name,
   1067             1,
   1068             &reg_value);
   1069 
   1070         if (FAILED(hr)) {
   1071             error_report("WHPX: Failed to get pending debug exceptions,"
   1072                          " hr=%08lx", hr);
   1073             return hr;
   1074         }
   1075 
   1076         if (reg_value.PendingDebugException.SingleStep) {
   1077             reg_value.PendingDebugException.SingleStep = 0;
   1078 
   1079             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
   1080                 whpx->partition,
   1081                 cpu->cpu_index,
   1082                 &reg_name,
   1083                 1,
   1084                 &reg_value);
   1085 
   1086             if (FAILED(hr)) {
   1087                 error_report("WHPX: Failed to clear pending debug exceptions,"
   1088                              " hr=%08lx", hr);
   1089                 return hr;
   1090             }
   1091         }
   1092 
   1093     }
   1094 
   1095     return S_OK;
   1096 }
   1097 
   1098 /* Tries to find a breakpoint at the specified address. */
   1099 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
   1100 {
   1101     struct whpx_state *whpx = &whpx_global;
   1102     int i;
   1103 
   1104     if (whpx->breakpoints.breakpoints) {
   1105         for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
   1106             if (address == whpx->breakpoints.breakpoints->data[i].address) {
   1107                 return &whpx->breakpoints.breakpoints->data[i];
   1108             }
   1109         }
   1110     }
   1111 
   1112     return NULL;
   1113 }
   1114 
   1115 /*
   1116  * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
   1117  * debugging user-mode applications. Since the WHPX API does not offer
   1118  * an easy way to pass the intercepted exception back to the guest, we
   1119  * resort to using INT1 instead, and let the guest always handle INT3.
   1120  */
   1121 static const uint8_t whpx_breakpoint_instruction = 0xF1;
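
/*
 * Editor's note (not part of the original file): 0xF1 is the single-byte
 * ICEBP ("INT1") opcode. Unlike the conventional 0xCC (INT3) breakpoint
 * byte, it raises a debug exception, which surfaces to this backend as the
 * WHvX64ExceptionTypeDebugTrapOrFault exit it already intercepts.
 */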
   1122 
   1123 /*
   1124  * The WHPX QEMU backend implements breakpoints by writing the INT1
   1125  * instruction into memory (ignoring the DRx registers). This raises a few
   1126  * issues that need to be carefully handled:
   1127  *
   1128  * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
   1129  *    at the same location, and later remove them in arbitrary order.
   1130  *    This should not cause memory corruption, and should only remove the
   1131  *    physical breakpoint instruction when the last QEMU breakpoint is gone.
   1132  *
   1133  * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
   1134  *    physical location. Hence, physically adding/removing a breakpoint can
   1135  *    theoretically fail at any time. We need to keep track of it.
   1136  *
   1137  * The function below rebuilds a list of low-level breakpoints (one per
   1138  * address, tracking the original instruction and any errors) from the list of
   1139  * high-level breakpoints (set via cpu_breakpoint_insert()).
   1140  *
   1141  * In order to optimize performance, this function stores the list of
   1142  * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
   1143  * low-level ones, so that it won't be re-invoked until these breakpoints
   1144  * change.
   1145  *
   1146  * Note that this function decides which breakpoints should be inserted into
   1147  * memory, but doesn't actually do it. The memory accesses are done in
   1148  * whpx_apply_breakpoints().
   1149  */
   1150 static void whpx_translate_cpu_breakpoints(
   1151     struct whpx_breakpoints *breakpoints,
   1152     CPUState *cpu,
   1153     int cpu_breakpoint_count)
   1154 {
   1155     CPUBreakpoint *bp;
   1156     int cpu_bp_index = 0;
   1157 
   1158     breakpoints->original_addresses =
   1159         g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);
   1160 
   1161     breakpoints->original_address_count = cpu_breakpoint_count;
   1162 
   1163     int max_breakpoints = cpu_breakpoint_count +
   1164         (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);
   1165 
   1166     struct whpx_breakpoint_collection *new_breakpoints =
   1167         g_malloc0(sizeof(struct whpx_breakpoint_collection)
   1168                   + max_breakpoints * sizeof(struct whpx_breakpoint));
   1169 
   1170     new_breakpoints->allocated = max_breakpoints;
   1171     new_breakpoints->used = 0;
   1172 
   1173     /*
   1174      * 1. Preserve all old breakpoints that could not be automatically
   1175      * cleared when the CPU got stopped.
   1176      */
   1177     if (breakpoints->breakpoints) {
   1178         int i;
   1179         for (i = 0; i < breakpoints->breakpoints->used; i++) {
   1180             if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
   1181                 new_breakpoints->data[new_breakpoints->used++] =
   1182                     breakpoints->breakpoints->data[i];
   1183             }
   1184         }
   1185     }
   1186 
   1187     /* 2. Map all CPU breakpoints to WHPX breakpoints */
   1188     QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
   1189         int i;
   1190         bool found = false;
   1191 
   1192         /* This will be used to detect changed CPU breakpoints later. */
   1193         breakpoints->original_addresses[cpu_bp_index++] = bp->pc;
   1194 
   1195         for (i = 0; i < new_breakpoints->used; i++) {
   1196             /*
   1197              * WARNING: This loop has O(N^2) complexity, where N is the
   1198              * number of breakpoints. It should not be a bottleneck in
   1199              * real-world scenarios, since it only needs to run once after
   1200              * the breakpoints have been modified.
   1201              * If this ever becomes a concern, it can be optimized by storing
   1202              * high-level breakpoint objects in a tree or hash map.
   1203              */
   1204 
   1205             if (new_breakpoints->data[i].address == bp->pc) {
   1206                 /* There was already a breakpoint at this address. */
   1207                 if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
   1208                     new_breakpoints->data[i].state = WHPX_BP_SET;
   1209                 } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
   1210                     new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
   1211                 }
   1212 
   1213                 found = true;
   1214                 break;
   1215             }
   1216         }
   1217 
   1218         if (!found && new_breakpoints->used < new_breakpoints->allocated) {
   1219             /* No WHPX breakpoint at this address. Create one. */
   1220             new_breakpoints->data[new_breakpoints->used].address = bp->pc;
   1221             new_breakpoints->data[new_breakpoints->used].state =
   1222                 WHPX_BP_SET_PENDING;
   1223             new_breakpoints->used++;
   1224         }
   1225     }
   1226 
   1227     /*
   1228      * Free the previous breakpoint list. This can be optimized by keeping
   1229      * it as shadow buffer for the next computation instead of freeing
   1230      * it immediately.
   1231      */
   1232     g_free(breakpoints->breakpoints);
   1233 
   1234     breakpoints->breakpoints = new_breakpoints;
   1235 }
   1236 
   1237 /*
   1238  * Physically inserts/removes the breakpoints by reading and writing the
   1239  * physical memory, keeping track of the failed attempts.
   1240  *
   1241  * Passing resuming=true  will try to set all previously unset breakpoints.
   1242  * Passing resuming=false will remove all inserted ones.
   1243  */
   1244 static void whpx_apply_breakpoints(
   1245     struct whpx_breakpoint_collection *breakpoints,
   1246     CPUState *cpu,
   1247     bool resuming)
   1248 {
   1249     int i, rc;
   1250     if (!breakpoints) {
   1251         return;
   1252     }
   1253 
   1254     for (i = 0; i < breakpoints->used; i++) {
   1255         /* Decide what to do right now based on the last known state. */
   1256         WhpxBreakpointState state = breakpoints->data[i].state;
   1257         switch (state) {
   1258         case WHPX_BP_CLEARED:
   1259             if (resuming) {
   1260                 state = WHPX_BP_SET_PENDING;
   1261             }
   1262             break;
   1263         case WHPX_BP_SET_PENDING:
   1264             if (!resuming) {
   1265                 state = WHPX_BP_CLEARED;
   1266             }
   1267             break;
   1268         case WHPX_BP_SET:
   1269             if (!resuming) {
   1270                 state = WHPX_BP_CLEAR_PENDING;
   1271             }
   1272             break;
   1273         case WHPX_BP_CLEAR_PENDING:
   1274             if (resuming) {
   1275                 state = WHPX_BP_SET;
   1276             }
   1277             break;
   1278         }
   1279 
   1280         if (state == WHPX_BP_SET_PENDING) {
   1281             /* Remember the original instruction. */
   1282             rc = cpu_memory_rw_debug(cpu,
   1283                 breakpoints->data[i].address,
   1284                 &breakpoints->data[i].original_instruction,
   1285                 1,
   1286                 false);
   1287 
   1288             if (!rc) {
   1289                 /* Write the breakpoint instruction. */
   1290                 rc = cpu_memory_rw_debug(cpu,
   1291                     breakpoints->data[i].address,
   1292                     (void *)&whpx_breakpoint_instruction,
   1293                     1,
   1294                     true);
   1295             }
   1296 
   1297             if (!rc) {
   1298                 state = WHPX_BP_SET;
   1299             }
   1300 
   1301         }
   1302 
   1303         if (state == WHPX_BP_CLEAR_PENDING) {
   1304             /* Restore the original instruction. */
   1305             rc = cpu_memory_rw_debug(cpu,
   1306                 breakpoints->data[i].address,
   1307                 &breakpoints->data[i].original_instruction,
   1308                 1,
   1309                 true);
   1310 
   1311             if (!rc) {
   1312                 state = WHPX_BP_CLEARED;
   1313             }
   1314         }
   1315 
   1316         breakpoints->data[i].state = state;
   1317     }
   1318 }
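
/*
 * Editor's sketch (not part of the original file): the intended call pattern
 * around the two helpers above, mirroring whpx_first_vcpu_starting() and
 * whpx_last_vcpu_stopping() below. 'cpu_bp_count' stands in for the number
 * of entries on cpu->breakpoints and is hypothetical here:
 */
#if 0
    /* Recompute the low-level list, then patch guest memory. */
    whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, cpu_bp_count);
    whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);

    /* ... VCPUs run ... */

    /* On stop, restore the original instructions. */
    whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, false);
#endif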
   1319 
   1320 /*
   1321  * This function is called when a VCPU is about to start and no other
   1322  * VCPUs have been started so far. Since the VCPU start order could be
   1323  * arbitrary, it doesn't have to be VCPU#0.
   1324  *
   1325  * It is used to commit the breakpoints into memory, and configure WHPX
   1326  * to intercept debug exceptions.
   1327  *
   1328  * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
   1329  * more VCPUs are already running, so this is the best place to do it.
   1330  */
   1331 static int whpx_first_vcpu_starting(CPUState *cpu)
   1332 {
   1333     struct whpx_state *whpx = &whpx_global;
   1334     HRESULT hr;
   1335 
   1336     g_assert(qemu_mutex_iothread_locked());
   1337 
   1338     if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
   1339             (whpx->breakpoints.breakpoints &&
   1340              whpx->breakpoints.breakpoints->used)) {
   1341         CPUBreakpoint *bp;
   1342         int i = 0;
   1343         bool update_pending = false;
   1344 
   1345         QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
   1346             if (i >= whpx->breakpoints.original_address_count ||
   1347                 bp->pc != whpx->breakpoints.original_addresses[i]) {
   1348                 update_pending = true;
   1349             }
   1350 
   1351             i++;
   1352         }
   1353 
   1354         if (i != whpx->breakpoints.original_address_count) {
   1355             update_pending = true;
   1356         }
   1357 
   1358         if (update_pending) {
   1359             /*
   1360              * The CPU breakpoints have changed since the last call to
   1361              * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
   1362              * now be recomputed.
   1363              */
   1364             whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
   1365         }
   1366 
   1367         /* Actually insert the breakpoints into the memory. */
   1368         whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
   1369     }
   1370 
   1371     uint64_t exception_mask;
   1372     if (whpx->step_pending ||
   1373         (whpx->breakpoints.breakpoints &&
   1374          whpx->breakpoints.breakpoints->used)) {
   1375         /*
   1376          * We are either attempting to single-step one or more CPUs, or
   1377          * have one or more breakpoints enabled. Both require intercepting
   1378          * the WHvX64ExceptionTypeDebugTrapOrFault exception.
   1379          */
   1380 
   1381         exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
   1382     } else {
   1383         /* Let the guest handle all exceptions. */
   1384         exception_mask = 0;
   1385     }
   1386 
   1387     hr = whpx_set_exception_exit_bitmap(exception_mask);
   1388     if (!SUCCEEDED(hr)) {
   1389         error_report("WHPX: Failed to update exception exit mask,"
   1390                      " hr=%08lx.", hr);
   1391         return 1;
   1392     }
   1393 
   1394     return 0;
   1395 }
   1396 
   1397 /*
   1398  * This function is called when the last VCPU has finished running.
   1399  * It is used to remove any previously set breakpoints from memory.
   1400  */
   1401 static int whpx_last_vcpu_stopping(CPUState *cpu)
   1402 {
   1403     whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
   1404     return 0;
   1405 }
   1406 
   1407 /* Returns the address of the next instruction that is about to be executed. */
   1408 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
   1409 {
   1410     if (cpu->vcpu_dirty) {
   1411         /* The CPU registers have been modified by other parts of QEMU. */
   1412         CPUArchState *env = (CPUArchState *)(cpu->env_ptr);
   1413         return env->eip;
   1414     } else if (exit_context_valid) {
   1415         /*
   1416          * The CPU registers have not been modified by other parts of QEMU,
   1417          * nor by this backend calling WHvSetVirtualProcessorRegisters().
   1418          * This is the most common case.
   1419          */
   1420         struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
   1421         return vcpu->exit_ctx.VpContext.Rip;
   1422     } else {
   1423         /*
   1424          * The CPU registers have been modified by a call to
   1425          * WHvSetVirtualProcessorRegisters() and must be re-queried from
   1426          * the target.
   1427          */
   1428         WHV_REGISTER_VALUE reg_value;
   1429         WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
   1430         HRESULT hr;
   1431         struct whpx_state *whpx = &whpx_global;
   1432 
   1433         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
   1434             whpx->partition,
   1435             cpu->cpu_index,
   1436             &reg_name,
   1437             1,
   1438             &reg_value);
   1439 
   1440         if (FAILED(hr)) {
   1441             error_report("WHPX: Failed to get PC, hr=%08lx", hr);
   1442             return 0;
   1443         }
   1444 
   1445         return reg_value.Reg64;
   1446     }
   1447 }
   1448 
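         /*
          * Emulate a HLT exit: if no interrupt can currently be delivered
          * (no pending hard interrupt with IF set and no pending NMI), mark
          * the vCPU as halted and report EXCP_HLT. Returns 1 when the vCPU
          * was halted, 0 when execution should simply continue.
          */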
   1449 static int whpx_handle_halt(CPUState *cpu)
   1450 {
   1451     CPUX86State *env = cpu->env_ptr;
   1452     int ret = 0;
   1453 
   1454     qemu_mutex_lock_iothread();
   1455     if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
   1456           (env->eflags & IF_MASK)) &&
   1457         !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
   1458         cpu->exception_index = EXCP_HLT;
   1459         cpu->halted = true;
   1460         ret = 1;
   1461     }
   1462     qemu_mutex_unlock_iothread();
   1463 
   1464     return ret;
   1465 }
   1466 
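         /*
          * Prepare the vCPU for the next WHvRunVirtualProcessor() call:
          * inject any pending NMI or PIC interrupt, sync the TPR into CR8,
          * and request an interrupt-window notification when a hard
          * interrupt is pending but cannot be delivered yet.
          */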
   1467 static void whpx_vcpu_pre_run(CPUState *cpu)
   1468 {
   1469     HRESULT hr;
   1470     struct whpx_state *whpx = &whpx_global;
   1471     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
   1472     CPUX86State *env = cpu->env_ptr;
   1473     X86CPU *x86_cpu = X86_CPU(cpu);
   1474     int irq;
   1475     uint8_t tpr;
   1476     WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
   1477     UINT32 reg_count = 0;
   1478     WHV_REGISTER_VALUE reg_values[3];
   1479     WHV_REGISTER_NAME reg_names[3];
   1480 
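             /*
              * At most three registers are updated below in a single
              * WHvSetVirtualProcessorRegisters() call: the pending
              * interruption (or pending event), CR8, and the deliverability
              * notifications register.
              */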
   1481     memset(&new_int, 0, sizeof(new_int));
   1482     memset(reg_values, 0, sizeof(reg_values));
   1483 
   1484     qemu_mutex_lock_iothread();
   1485 
   1486     /* Inject NMI */
   1487     if (!vcpu->interruption_pending &&
   1488         cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
   1489         if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
   1490             cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
   1491             vcpu->interruptable = false;
   1492             new_int.InterruptionType = WHvX64PendingNmi;
   1493             new_int.InterruptionPending = 1;
   1494             new_int.InterruptionVector = 2;
   1495         }
   1496         if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
   1497             cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
   1498         }
   1499     }
   1500 
   1501     /*
   1502      * Force the VCPU out of its inner loop to process any INIT requests or
   1503      * commit pending TPR access.
   1504      */
   1505     if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
   1506         if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
   1507             !(env->hflags & HF_SMM_MASK)) {
   1508             cpu->exit_request = 1;
   1509         }
   1510         if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
   1511             cpu->exit_request = 1;
   1512         }
   1513     }
   1514 
   1515     /* Get pending hard interruption or replay one that was overwritten */
   1516     if (!whpx_apic_in_platform()) {
   1517         if (!vcpu->interruption_pending &&
   1518             vcpu->interruptable && (env->eflags & IF_MASK)) {
   1519             assert(!new_int.InterruptionPending);
   1520             if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
   1521                 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
   1522                 irq = cpu_get_pic_interrupt(env);
   1523                 if (irq >= 0) {
   1524                     new_int.InterruptionType = WHvX64PendingInterrupt;
   1525                     new_int.InterruptionPending = 1;
   1526                     new_int.InterruptionVector = irq;
   1527                 }
   1528             }
   1529         }
   1530 
    1531         /* Set up the interrupt state if a new one was prepared */
   1532         if (new_int.InterruptionPending) {
   1533             reg_values[reg_count].PendingInterruption = new_int;
   1534             reg_names[reg_count] = WHvRegisterPendingInterruption;
   1535             reg_count += 1;
   1536         }
   1537     } else if (vcpu->ready_for_pic_interrupt &&
   1538                (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
   1539         cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
   1540         irq = cpu_get_pic_interrupt(env);
   1541         if (irq >= 0) {
   1542             reg_names[reg_count] = WHvRegisterPendingEvent;
   1543             reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
   1544             {
   1545                 .EventPending = 1,
   1546                 .EventType = WHvX64PendingEventExtInt,
   1547                 .Vector = irq,
   1548             };
   1549             reg_count += 1;
   1550         }
    1551     }
   1552 
    1553     /* Sync the TPR to CR8 if it was modified during the intercept */
   1554     tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
   1555     if (tpr != vcpu->tpr) {
   1556         vcpu->tpr = tpr;
   1557         reg_values[reg_count].Reg64 = tpr;
   1558         cpu->exit_request = 1;
   1559         reg_names[reg_count] = WHvX64RegisterCr8;
   1560         reg_count += 1;
   1561     }
   1562 
   1563     /* Update the state of the interrupt delivery notification */
   1564     if (!vcpu->window_registered &&
   1565         cpu->interrupt_request & CPU_INTERRUPT_HARD) {
   1566         reg_values[reg_count].DeliverabilityNotifications =
   1567             (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
   1568                 .InterruptNotification = 1
   1569             };
   1570         vcpu->window_registered = 1;
   1571         reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
   1572         reg_count += 1;
   1573     }
   1574 
   1575     qemu_mutex_unlock_iothread();
   1576     vcpu->ready_for_pic_interrupt = false;
   1577 
   1578     if (reg_count) {
   1579         hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
   1580             whpx->partition, cpu->cpu_index,
   1581             reg_names, reg_count, reg_values);
   1582         if (FAILED(hr)) {
   1583             error_report("WHPX: Failed to set interrupt state registers,"
   1584                          " hr=%08lx", hr);
   1585         }
   1586     }
   1587 
   1588     return;
   1589 }
   1590 
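         /*
          * Pull the guest state that QEMU needs immediately out of the exit
          * context: RFLAGS, the TPR (via CR8), and the interruption/interrupt
          * shadow state used to decide whether new interrupts can be injected.
          */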
   1591 static void whpx_vcpu_post_run(CPUState *cpu)
   1592 {
   1593     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
   1594     CPUX86State *env = cpu->env_ptr;
   1595     X86CPU *x86_cpu = X86_CPU(cpu);
   1596 
   1597     env->eflags = vcpu->exit_ctx.VpContext.Rflags;
   1598 
   1599     uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
   1600     if (vcpu->tpr != tpr) {
   1601         vcpu->tpr = tpr;
   1602         qemu_mutex_lock_iothread();
   1603         cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
   1604         qemu_mutex_unlock_iothread();
   1605     }
   1606 
   1607     vcpu->interruption_pending =
   1608         vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;
   1609 
   1610     vcpu->interruptable =
   1611         !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
   1612 
   1613     return;
   1614 }
   1615 
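         /*
          * Handle interrupt requests that must be processed outside of the
          * main run loop: INIT, SIPI, APIC polling, TPR access reports, and
          * waking a halted vCPU when an interrupt or NMI becomes deliverable.
          */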
   1616 static void whpx_vcpu_process_async_events(CPUState *cpu)
   1617 {
   1618     CPUX86State *env = cpu->env_ptr;
   1619     X86CPU *x86_cpu = X86_CPU(cpu);
   1620     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
   1621 
   1622     if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
   1623         !(env->hflags & HF_SMM_MASK)) {
   1624         whpx_cpu_synchronize_state(cpu);
   1625         do_cpu_init(x86_cpu);
   1626         vcpu->interruptable = true;
   1627     }
   1628 
   1629     if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
   1630         cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
   1631         apic_poll_irq(x86_cpu->apic_state);
   1632     }
   1633 
   1634     if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
   1635          (env->eflags & IF_MASK)) ||
   1636         (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
   1637         cpu->halted = false;
   1638     }
   1639 
   1640     if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
   1641         whpx_cpu_synchronize_state(cpu);
   1642         do_cpu_sipi(x86_cpu);
   1643     }
   1644 
   1645     if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
   1646         cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
   1647         whpx_cpu_synchronize_state(cpu);
   1648         apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
   1649                                       env->tpr_access_type);
   1650     }
   1651 
   1652     return;
   1653 }
   1654 
   1655 static int whpx_vcpu_run(CPUState *cpu)
   1656 {
   1657     HRESULT hr;
   1658     struct whpx_state *whpx = &whpx_global;
   1659     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
   1660     struct whpx_breakpoint *stepped_over_bp = NULL;
   1661     WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
   1662     int ret;
   1663 
   1664     g_assert(qemu_mutex_iothread_locked());
   1665 
   1666     if (whpx->running_cpus++ == 0) {
   1667         /* Insert breakpoints into memory, update exception exit bitmap. */
   1668         ret = whpx_first_vcpu_starting(cpu);
   1669         if (ret != 0) {
   1670             return ret;
   1671         }
   1672     }
   1673 
   1674     if (whpx->breakpoints.breakpoints &&
   1675         whpx->breakpoints.breakpoints->used > 0)
   1676     {
   1677         uint64_t pc = whpx_vcpu_get_pc(cpu, true);
   1678         stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
   1679         if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
   1680             stepped_over_bp = NULL;
   1681         }
   1682 
   1683         if (stepped_over_bp) {
   1684             /*
   1685              * We are trying to run the instruction overwritten by an active
   1686              * breakpoint. We will temporarily disable the breakpoint, suspend
   1687              * other CPUs, and step over the instruction.
   1688              */
   1689             exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
   1690         }
   1691     }
   1692 
   1693     if (exclusive_step_mode == WHPX_STEP_NONE) {
   1694         whpx_vcpu_process_async_events(cpu);
   1695         if (cpu->halted && !whpx_apic_in_platform()) {
   1696             cpu->exception_index = EXCP_HLT;
   1697             qatomic_set(&cpu->exit_request, false);
   1698             return 0;
   1699         }
   1700     }
   1701 
   1702     qemu_mutex_unlock_iothread();
   1703 
   1704     if (exclusive_step_mode != WHPX_STEP_NONE) {
   1705         start_exclusive();
   1706         g_assert(cpu == current_cpu);
   1707         g_assert(!cpu->running);
   1708         cpu->running = true;
   1709 
   1710         hr = whpx_set_exception_exit_bitmap(
   1711             1UL << WHvX64ExceptionTypeDebugTrapOrFault);
   1712         if (!SUCCEEDED(hr)) {
   1713             error_report("WHPX: Failed to update exception exit mask, "
   1714                          "hr=%08lx.", hr);
   1715             return 1;
   1716         }
   1717 
   1718         if (stepped_over_bp) {
   1719             /* Temporarily disable the triggered breakpoint. */
   1720             cpu_memory_rw_debug(cpu,
   1721                 stepped_over_bp->address,
   1722                 &stepped_over_bp->original_instruction,
   1723                 1,
   1724                 true);
   1725         }
   1726     } else {
   1727         cpu_exec_start(cpu);
   1728     }
   1729 
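             /*
              * Main run loop: flush any dirty register state to the
              * hypervisor, inject pending events, run the virtual processor,
              * then dispatch on the exit reason. The loop ends when a handler
              * returns non-zero (request to leave) or an error occurs.
              */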
   1730     do {
   1731         if (cpu->vcpu_dirty) {
   1732             whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
   1733             cpu->vcpu_dirty = false;
   1734         }
   1735 
   1736         if (exclusive_step_mode == WHPX_STEP_NONE) {
   1737             whpx_vcpu_pre_run(cpu);
   1738 
   1739             if (qatomic_read(&cpu->exit_request)) {
   1740                 whpx_vcpu_kick(cpu);
   1741             }
   1742         }
   1743 
   1744         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
   1745             whpx_vcpu_configure_single_stepping(cpu, true, NULL);
   1746         }
   1747 
   1748         hr = whp_dispatch.WHvRunVirtualProcessor(
   1749             whpx->partition, cpu->cpu_index,
   1750             &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
   1751 
   1752         if (FAILED(hr)) {
   1753             error_report("WHPX: Failed to exec a virtual processor,"
   1754                          " hr=%08lx", hr);
   1755             ret = -1;
   1756             break;
   1757         }
   1758 
   1759         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
   1760             whpx_vcpu_configure_single_stepping(cpu,
   1761                 false,
   1762                 &vcpu->exit_ctx.VpContext.Rflags);
   1763         }
   1764 
   1765         whpx_vcpu_post_run(cpu);
   1766 
   1767         switch (vcpu->exit_ctx.ExitReason) {
   1768         case WHvRunVpExitReasonMemoryAccess:
   1769             ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
   1770             break;
   1771 
   1772         case WHvRunVpExitReasonX64IoPortAccess:
   1773             ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
   1774             break;
   1775 
   1776         case WHvRunVpExitReasonX64InterruptWindow:
   1777             vcpu->ready_for_pic_interrupt = 1;
   1778             vcpu->window_registered = 0;
   1779             ret = 0;
   1780             break;
   1781 
   1782         case WHvRunVpExitReasonX64ApicEoi:
   1783             assert(whpx_apic_in_platform());
   1784             ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
   1785             break;
   1786 
   1787         case WHvRunVpExitReasonX64Halt:
   1788             /*
   1789              * WARNING: as of build 19043.1526 (21H1), this exit reason is no
   1790              * longer used.
   1791              */
   1792             ret = whpx_handle_halt(cpu);
   1793             break;
   1794 
   1795         case WHvRunVpExitReasonX64ApicInitSipiTrap: {
   1796             WHV_INTERRUPT_CONTROL ipi = {0};
   1797             uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
   1798             uint32_t delivery_mode =
   1799                 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
   1800             int dest_shorthand =
   1801                 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
   1802             bool broadcast = false;
   1803             bool include_self = false;
   1804             uint32_t i;
   1805 
   1806             /* We only registered for INIT and SIPI exits. */
   1807             if ((delivery_mode != APIC_DM_INIT) &&
   1808                 (delivery_mode != APIC_DM_SIPI)) {
   1809                 error_report(
   1810                     "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
   1811                 break;
   1812             }
   1813 
   1814             if (delivery_mode == APIC_DM_INIT) {
   1815                 ipi.Type = WHvX64InterruptTypeInit;
   1816             } else {
   1817                 ipi.Type = WHvX64InterruptTypeSipi;
   1818             }
   1819 
   1820             ipi.DestinationMode =
   1821                 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
   1822                     WHvX64InterruptDestinationModeLogical :
   1823                     WHvX64InterruptDestinationModePhysical;
   1824 
   1825             ipi.TriggerMode =
   1826                 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
   1827                     WHvX64InterruptTriggerModeLevel :
   1828                     WHvX64InterruptTriggerModeEdge;
   1829 
   1830             ipi.Vector = icr & APIC_VECTOR_MASK;
   1831             switch (dest_shorthand) {
    1832             /* No shorthand: bits 56-63 contain the destination. */
   1833             case 0:
   1834                 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
   1835                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
   1836                         &ipi, sizeof(ipi));
   1837                 if (FAILED(hr)) {
   1838                     error_report("WHPX: Failed to request interrupt  hr=%08lx",
   1839                         hr);
   1840                 }
   1841 
   1842                 break;
   1843 
   1844             /* self */
   1845             case 1:
   1846                 include_self = true;
   1847                 break;
   1848 
   1849             /* broadcast, including self */
   1850             case 2:
   1851                 broadcast = true;
   1852                 include_self = true;
   1853                 break;
   1854 
   1855             /* broadcast, excluding self */
   1856             case 3:
   1857                 broadcast = true;
   1858                 break;
   1859             }
   1860 
   1861             if (!broadcast && !include_self) {
   1862                 break;
   1863             }
   1864 
   1865             for (i = 0; i <= max_vcpu_index; i++) {
   1866                 if (i == cpu->cpu_index && !include_self) {
   1867                     continue;
   1868                 }
   1869 
   1870                 /*
    1871                  * Assuming that APIC IDs are identity mapped since
   1872                  * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
   1873                  * are not handled yet and the hypervisor doesn't allow the
   1874                  * guest to modify the APIC ID.
   1875                  */
   1876                 ipi.Destination = i;
   1877                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
   1878                         &ipi, sizeof(ipi));
   1879                 if (FAILED(hr)) {
   1880                     error_report(
   1881                         "WHPX: Failed to request SIPI for %d,  hr=%08lx",
   1882                         i, hr);
   1883                 }
   1884             }
   1885 
   1886             break;
   1887         }
   1888 
   1889         case WHvRunVpExitReasonCanceled:
   1890             if (exclusive_step_mode != WHPX_STEP_NONE) {
   1891                 /*
   1892                  * We are trying to step over a single instruction, and
   1893                  * likely got a request to stop from another thread.
    1894                  * Delay it until we are done stepping over.
   1896                  */
   1897                 ret = 0;
   1898             } else {
   1899                 cpu->exception_index = EXCP_INTERRUPT;
   1900                 ret = 1;
   1901             }
   1902             break;
   1903         case WHvRunVpExitReasonX64MsrAccess: {
   1904             WHV_REGISTER_VALUE reg_values[3] = {0};
   1905             WHV_REGISTER_NAME reg_names[3];
   1906             UINT32 reg_count;
   1907 
   1908             reg_names[0] = WHvX64RegisterRip;
   1909             reg_names[1] = WHvX64RegisterRax;
   1910             reg_names[2] = WHvX64RegisterRdx;
   1911 
   1912             reg_values[0].Reg64 =
   1913                 vcpu->exit_ctx.VpContext.Rip +
   1914                 vcpu->exit_ctx.VpContext.InstructionLength;
   1915 
   1916             /*
   1917              * For all unsupported MSR access we:
   1918              *     ignore writes
   1919              *     return 0 on read.
   1920              */
   1921             reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
   1922                         1 : 3;
   1923 
   1924             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
   1925                 whpx->partition,
   1926                 cpu->cpu_index,
   1927                 reg_names, reg_count,
   1928                 reg_values);
   1929 
   1930             if (FAILED(hr)) {
   1931                 error_report("WHPX: Failed to set MsrAccess state "
   1932                              " registers, hr=%08lx", hr);
   1933             }
   1934             ret = 0;
   1935             break;
   1936         }
   1937         case WHvRunVpExitReasonX64Cpuid: {
   1938             WHV_REGISTER_VALUE reg_values[5];
   1939             WHV_REGISTER_NAME reg_names[5];
   1940             UINT32 reg_count = 5;
   1941             UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
   1942             X86CPU *x86_cpu = X86_CPU(cpu);
   1943             CPUX86State *env = &x86_cpu->env;
   1944 
   1945             memset(reg_values, 0, sizeof(reg_values));
   1946 
   1947             rip = vcpu->exit_ctx.VpContext.Rip +
   1948                   vcpu->exit_ctx.VpContext.InstructionLength;
   1949             cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;
   1950 
   1951             /*
   1952              * Ideally, these should be supplied to the hypervisor during VCPU
   1953              * initialization and it should be able to satisfy this request.
   1954              * But, currently, WHPX doesn't support setting CPUID values in the
    1955              * hypervisor once the partition has been set up, which is too late
   1956              * since VCPUs are realized later. For now, use the values from
   1957              * QEMU to satisfy these requests, until WHPX adds support for
   1958              * being able to set these values in the hypervisor at runtime.
   1959              */
   1960             cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
   1961                 (UINT32 *)&rcx, (UINT32 *)&rdx);
   1962             switch (cpuid_fn) {
   1963             case 0x40000000:
   1964                 /* Expose the vmware cpu frequency cpuid leaf */
   1965                 rax = 0x40000010;
   1966                 rbx = rcx = rdx = 0;
   1967                 break;
   1968 
   1969             case 0x40000010:
   1970                 rax = env->tsc_khz;
   1971                 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
   1972                 rcx = rdx = 0;
   1973                 break;
   1974 
   1975             case 0x80000001:
   1976                 /* Remove any support of OSVW */
   1977                 rcx &= ~CPUID_EXT3_OSVW;
   1978                 break;
   1979             }
   1980 
   1981             reg_names[0] = WHvX64RegisterRip;
   1982             reg_names[1] = WHvX64RegisterRax;
   1983             reg_names[2] = WHvX64RegisterRcx;
   1984             reg_names[3] = WHvX64RegisterRdx;
   1985             reg_names[4] = WHvX64RegisterRbx;
   1986 
   1987             reg_values[0].Reg64 = rip;
   1988             reg_values[1].Reg64 = rax;
   1989             reg_values[2].Reg64 = rcx;
   1990             reg_values[3].Reg64 = rdx;
   1991             reg_values[4].Reg64 = rbx;
   1992 
   1993             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
   1994                 whpx->partition, cpu->cpu_index,
   1995                 reg_names,
   1996                 reg_count,
   1997                 reg_values);
   1998 
   1999             if (FAILED(hr)) {
   2000                 error_report("WHPX: Failed to set CpuidAccess state registers,"
   2001                              " hr=%08lx", hr);
   2002             }
   2003             ret = 0;
   2004             break;
   2005         }
   2006         case WHvRunVpExitReasonException:
   2007             whpx_get_registers(cpu);
   2008 
   2009             if ((vcpu->exit_ctx.VpException.ExceptionType ==
   2010                  WHvX64ExceptionTypeDebugTrapOrFault) &&
   2011                 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
   2012                 (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
   2013                  whpx_breakpoint_instruction)) {
   2014                 /* Stopped at a software breakpoint. */
   2015                 cpu->exception_index = EXCP_DEBUG;
   2016             } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
   2017                         WHvX64ExceptionTypeDebugTrapOrFault) &&
   2018                        !cpu->singlestep_enabled) {
   2019                 /*
    2020                  * Just finished stepping over a breakpoint, but gdb does
    2021                  * not expect us to do single-stepping.
   2022                  * Don't do anything special.
   2023                  */
   2024                 cpu->exception_index = EXCP_INTERRUPT;
   2025             } else {
   2026                 /* Another exception or debug event. Report it to GDB. */
   2027                 cpu->exception_index = EXCP_DEBUG;
   2028             }
   2029 
   2030             ret = 1;
   2031             break;
   2032         case WHvRunVpExitReasonNone:
   2033         case WHvRunVpExitReasonUnrecoverableException:
   2034         case WHvRunVpExitReasonInvalidVpRegisterValue:
   2035         case WHvRunVpExitReasonUnsupportedFeature:
   2036         default:
   2037             error_report("WHPX: Unexpected VP exit code %d",
   2038                          vcpu->exit_ctx.ExitReason);
   2039             whpx_get_registers(cpu);
   2040             qemu_mutex_lock_iothread();
   2041             qemu_system_guest_panicked(cpu_get_crash_info(cpu));
   2042             qemu_mutex_unlock_iothread();
   2043             break;
   2044         }
   2045 
   2046     } while (!ret);
   2047 
   2048     if (stepped_over_bp) {
   2049         /* Restore the breakpoint we stepped over */
   2050         cpu_memory_rw_debug(cpu,
   2051             stepped_over_bp->address,
   2052             (void *)&whpx_breakpoint_instruction,
   2053             1,
   2054             true);
   2055     }
   2056 
   2057     if (exclusive_step_mode != WHPX_STEP_NONE) {
   2058         g_assert(cpu_in_exclusive_context(cpu));
   2059         cpu->running = false;
   2060         end_exclusive();
   2061 
   2062         exclusive_step_mode = WHPX_STEP_NONE;
   2063     } else {
   2064         cpu_exec_end(cpu);
   2065     }
   2066 
   2067     qemu_mutex_lock_iothread();
   2068     current_cpu = cpu;
   2069 
   2070     if (--whpx->running_cpus == 0) {
   2071         whpx_last_vcpu_stopping(cpu);
   2072     }
   2073 
   2074     qatomic_set(&cpu->exit_request, false);
   2075 
   2076     return ret < 0;
   2077 }
   2078 
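         /*
          * The following helpers run on the vCPU thread via run_on_cpu() and
          * keep cpu->vcpu_dirty consistent with whether the QEMU-side register
          * copy or the hypervisor-side copy is authoritative.
          */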
   2079 static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
   2080 {
   2081     if (!cpu->vcpu_dirty) {
   2082         whpx_get_registers(cpu);
   2083         cpu->vcpu_dirty = true;
   2084     }
   2085 }
   2086 
   2087 static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
   2088                                                run_on_cpu_data arg)
   2089 {
   2090     whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
   2091     cpu->vcpu_dirty = false;
   2092 }
   2093 
   2094 static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
   2095                                               run_on_cpu_data arg)
   2096 {
   2097     whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
   2098     cpu->vcpu_dirty = false;
   2099 }
   2100 
   2101 static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
   2102                                                run_on_cpu_data arg)
   2103 {
   2104     cpu->vcpu_dirty = true;
   2105 }
   2106 
   2107 /*
   2108  * CPU support.
   2109  */
   2110 
   2111 void whpx_cpu_synchronize_state(CPUState *cpu)
   2112 {
   2113     if (!cpu->vcpu_dirty) {
   2114         run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
   2115     }
   2116 }
   2117 
   2118 void whpx_cpu_synchronize_post_reset(CPUState *cpu)
   2119 {
   2120     run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
   2121 }
   2122 
   2123 void whpx_cpu_synchronize_post_init(CPUState *cpu)
   2124 {
   2125     run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
   2126 }
   2127 
   2128 void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
   2129 {
   2130     run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
   2131 }
   2132 
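         /*
          * Record whether single-stepping was requested before the VM resumes;
          * whpx_first_vcpu_starting() uses this flag to decide whether debug
          * exceptions must be intercepted.
          */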
   2133 void whpx_cpu_synchronize_pre_resume(bool step_pending)
   2134 {
   2135     whpx_global.step_pending = step_pending;
   2136 }
   2137 
   2138 /*
   2139  * Vcpu support.
   2140  */
   2141 
   2142 static Error *whpx_migration_blocker;
   2143 
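         /*
          * VM state change handler: mark the cached TSC value as stale
          * whenever the VM (re)starts running.
          */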
   2144 static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
   2145 {
   2146     CPUX86State *env = opaque;
   2147 
   2148     if (running) {
   2149         env->tsc_valid = false;
   2150     }
   2151 }
   2152 
   2153 int whpx_init_vcpu(CPUState *cpu)
   2154 {
   2155     HRESULT hr;
   2156     struct whpx_state *whpx = &whpx_global;
   2157     struct whpx_vcpu *vcpu = NULL;
   2158     Error *local_error = NULL;
   2159     CPUX86State *env = cpu->env_ptr;
   2160     X86CPU *x86_cpu = X86_CPU(cpu);
   2161     UINT64 freq = 0;
   2162     int ret;
   2163 
   2164     /* Add migration blockers for all unsupported features of the
   2165      * Windows Hypervisor Platform
   2166      */
   2167     if (whpx_migration_blocker == NULL) {
   2168         error_setg(&whpx_migration_blocker,
   2169                "State blocked due to non-migratable CPUID feature support,"
   2170                "dirty memory tracking support, and XSAVE/XRSTOR support");
   2171 
   2172         if (migrate_add_blocker(whpx_migration_blocker, &local_error) < 0) {
   2173             error_report_err(local_error);
   2174             error_free(whpx_migration_blocker);
   2175             ret = -EINVAL;
   2176             goto error;
   2177         }
   2178     }
   2179 
   2180     vcpu = g_new0(struct whpx_vcpu, 1);
   2181 
   2182     if (!vcpu) {
   2183         error_report("WHPX: Failed to allocte VCPU context.");
   2184         ret = -ENOMEM;
   2185         goto error;
   2186     }
   2187 
   2188     hr = whp_dispatch.WHvEmulatorCreateEmulator(
   2189         &whpx_emu_callbacks,
   2190         &vcpu->emulator);
   2191     if (FAILED(hr)) {
   2192         error_report("WHPX: Failed to setup instruction completion support,"
   2193                      " hr=%08lx", hr);
   2194         ret = -EINVAL;
   2195         goto error;
   2196     }
   2197 
   2198     hr = whp_dispatch.WHvCreateVirtualProcessor(
   2199         whpx->partition, cpu->cpu_index, 0);
   2200     if (FAILED(hr)) {
   2201         error_report("WHPX: Failed to create a virtual processor,"
   2202                      " hr=%08lx", hr);
   2203         whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
   2204         ret = -EINVAL;
   2205         goto error;
   2206     }
   2207 
   2208     /*
    2209      * The vcpu's TSC frequency is either specified by the user, or taken from
    2210      * the value provided by Hyper-V if the former is not present. In the latter
    2211      * case, we query it from Hyper-V and record it in env->tsc_khz, so that the
    2212      * vcpu's TSC frequency can be migrated later via this field.
   2213      */
   2214     if (!env->tsc_khz) {
   2215         hr = whp_dispatch.WHvGetCapability(
   2216             WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
   2217                 NULL);
   2218         if (hr != WHV_E_UNKNOWN_CAPABILITY) {
   2219             if (FAILED(hr)) {
   2220                 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
   2221             } else {
   2222                 env->tsc_khz = freq / 1000; /* Hz to KHz */
   2223             }
   2224         }
   2225     }
   2226 
   2227     env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
   2228     hr = whp_dispatch.WHvGetCapability(
   2229         WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
   2230     if (hr != WHV_E_UNKNOWN_CAPABILITY) {
   2231         if (FAILED(hr)) {
   2232             printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
   2233         } else {
   2234             env->apic_bus_freq = freq;
   2235         }
   2236     }
   2237 
   2238     /*
   2239      * If the vmware cpuid frequency leaf option is set, and we have a valid
    2240      * tsc value, trap the corresponding cpuid leaves.
   2241      */
   2242     if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
   2243         UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};
   2244 
   2245         hr = whp_dispatch.WHvSetPartitionProperty(
   2246                 whpx->partition,
   2247                 WHvPartitionPropertyCodeCpuidExitList,
   2248                 cpuidExitList,
   2249                 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
   2250 
   2251         if (FAILED(hr)) {
   2252             error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
   2253                         hr);
   2254             ret = -EINVAL;
   2255             goto error;
   2256         }
   2257     }
   2258 
   2259     vcpu->interruptable = true;
   2260     cpu->vcpu_dirty = true;
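             /* Reuse the generic hax_vcpu pointer to store the WHPX per-vCPU state. */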
   2261     cpu->hax_vcpu = (struct hax_vcpu_state *)vcpu;
   2262     max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
   2263     qemu_add_vm_change_state_handler(whpx_cpu_update_state, cpu->env_ptr);
   2264 
   2265     return 0;
   2266 
   2267 error:
   2268     g_free(vcpu);
   2269 
   2270     return ret;
   2271 }
   2272 
   2273 int whpx_vcpu_exec(CPUState *cpu)
   2274 {
   2275     int ret;
   2276     int fatal;
   2277 
   2278     for (;;) {
   2279         if (cpu->exception_index >= EXCP_INTERRUPT) {
   2280             ret = cpu->exception_index;
   2281             cpu->exception_index = -1;
   2282             break;
   2283         }
   2284 
   2285         fatal = whpx_vcpu_run(cpu);
   2286 
   2287         if (fatal) {
   2288             error_report("WHPX: Failed to exec a virtual processor");
   2289             abort();
   2290         }
   2291     }
   2292 
   2293     return ret;
   2294 }
   2295 
   2296 void whpx_destroy_vcpu(CPUState *cpu)
   2297 {
   2298     struct whpx_state *whpx = &whpx_global;
   2299     struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
   2300 
   2301     whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
   2302     whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
   2303     g_free(cpu->hax_vcpu);
   2304     return;
   2305 }
   2306 
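         /*
          * Kick a vCPU out of WHvRunVirtualProcessor() so that its thread can
          * notice exit requests and pending work.
          */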
   2307 void whpx_vcpu_kick(CPUState *cpu)
   2308 {
   2309     struct whpx_state *whpx = &whpx_global;
   2310     whp_dispatch.WHvCancelRunVirtualProcessor(
   2311         whpx->partition, cpu->cpu_index, 0);
   2312 }
   2313 
   2314 /*
   2315  * Memory support.
   2316  */
   2317 
   2318 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
   2319                                 void *host_va, int add, int rom,
   2320                                 const char *name)
   2321 {
   2322     struct whpx_state *whpx = &whpx_global;
   2323     HRESULT hr;
   2324 
   2325     /*
   2326     if (add) {
   2327         printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
   2328                (void*)start_pa, (void*)size, host_va,
   2329                (rom ? "ROM" : "RAM"), name);
   2330     } else {
   2331         printf("WHPX: DEL PA:%p Size:%p, Host:%p,      '%s'\n",
   2332                (void*)start_pa, (void*)size, host_va, name);
   2333     }
   2334     */
   2335 
   2336     if (add) {
   2337         hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
   2338                                          host_va,
   2339                                          start_pa,
   2340                                          size,
   2341                                          (WHvMapGpaRangeFlagRead |
   2342                                           WHvMapGpaRangeFlagExecute |
   2343                                           (rom ? 0 : WHvMapGpaRangeFlagWrite)));
   2344     } else {
   2345         hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
   2346                                            start_pa,
   2347                                            size);
   2348     }
   2349 
   2350     if (FAILED(hr)) {
   2351         error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
   2352                      " Host:%p, hr=%08lx",
   2353                      (add ? "MAP" : "UNMAP"), name,
   2354                      (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
   2355     }
   2356 }
   2357 
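         /*
          * Map or unmap the RAM portion of a memory section, trimming it to
          * host page boundaries before handing it to whpx_update_mapping().
          */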
   2358 static void whpx_process_section(MemoryRegionSection *section, int add)
   2359 {
   2360     MemoryRegion *mr = section->mr;
   2361     hwaddr start_pa = section->offset_within_address_space;
   2362     ram_addr_t size = int128_get64(section->size);
   2363     unsigned int delta;
   2364     uint64_t host_va;
   2365 
   2366     if (!memory_region_is_ram(mr)) {
   2367         return;
   2368     }
   2369 
   2370     delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
   2371     delta &= ~qemu_real_host_page_mask();
   2372     if (delta > size) {
   2373         return;
   2374     }
   2375     start_pa += delta;
   2376     size -= delta;
   2377     size &= qemu_real_host_page_mask();
   2378     if (!size || (start_pa & ~qemu_real_host_page_mask())) {
   2379         return;
   2380     }
   2381 
   2382     host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
   2383             + section->offset_within_region + delta;
   2384 
   2385     whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
   2386                         memory_region_is_rom(mr), mr->name);
   2387 }
   2388 
   2389 static void whpx_region_add(MemoryListener *listener,
   2390                            MemoryRegionSection *section)
   2391 {
   2392     memory_region_ref(section->mr);
   2393     whpx_process_section(section, 1);
   2394 }
   2395 
   2396 static void whpx_region_del(MemoryListener *listener,
   2397                            MemoryRegionSection *section)
   2398 {
   2399     whpx_process_section(section, 0);
   2400     memory_region_unref(section->mr);
   2401 }
   2402 
   2403 static void whpx_transaction_begin(MemoryListener *listener)
   2404 {
   2405 }
   2406 
   2407 static void whpx_transaction_commit(MemoryListener *listener)
   2408 {
   2409 }
   2410 
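         /*
          * WHPX provides no dirty-page tracking, so conservatively mark the
          * whole RAM section dirty on every log sync.
          */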
   2411 static void whpx_log_sync(MemoryListener *listener,
   2412                          MemoryRegionSection *section)
   2413 {
   2414     MemoryRegion *mr = section->mr;
   2415 
   2416     if (!memory_region_is_ram(mr)) {
   2417         return;
   2418     }
   2419 
   2420     memory_region_set_dirty(mr, 0, int128_get64(section->size));
   2421 }
   2422 
   2423 static MemoryListener whpx_memory_listener = {
   2424     .name = "whpx",
   2425     .begin = whpx_transaction_begin,
   2426     .commit = whpx_transaction_commit,
   2427     .region_add = whpx_region_add,
   2428     .region_del = whpx_region_del,
   2429     .log_sync = whpx_log_sync,
   2430     .priority = 10,
   2431 };
   2432 
   2433 static void whpx_memory_init(void)
   2434 {
   2435     memory_listener_register(&whpx_memory_listener, &address_space_memory);
   2436 }
   2437 
   2438 /*
    2439  * Load the functions from the given library. If a handle is provided, it is
    2440  * used; otherwise the library is opened and the handle is updated on return
    2441  * with the newly opened one.
   2442  */
   2443 static bool load_whp_dispatch_fns(HMODULE *handle,
   2444     WHPFunctionList function_list)
   2445 {
   2446     HMODULE hLib = *handle;
   2447 
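             /*
              * WHP_LOAD_FIELD below fails the whole load when a function is
              * missing, while WHP_LOAD_FIELD_OPTIONAL leaves the dispatch
              * pointer NULL so that callers can probe for optional entry points.
              */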
   2448     #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
   2449     #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
   2450     #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
   2451         whp_dispatch.function_name = \
   2452             (function_name ## _t)GetProcAddress(hLib, #function_name); \
   2453 
   2454     #define WHP_LOAD_FIELD(return_type, function_name, signature) \
   2455         whp_dispatch.function_name = \
   2456             (function_name ## _t)GetProcAddress(hLib, #function_name); \
   2457         if (!whp_dispatch.function_name) { \
   2458             error_report("Could not load function %s", #function_name); \
   2459             goto error; \
   2460         } \
   2461 
   2462     #define WHP_LOAD_LIB(lib_name, handle_lib) \
   2463     if (!handle_lib) { \
   2464         handle_lib = LoadLibrary(lib_name); \
   2465         if (!handle_lib) { \
   2466             error_report("Could not load library %s.", lib_name); \
   2467             goto error; \
   2468         } \
   2469     } \
   2470 
   2471     switch (function_list) {
   2472     case WINHV_PLATFORM_FNS_DEFAULT:
   2473         WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
   2474         LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
   2475         break;
   2476 
   2477     case WINHV_EMULATION_FNS_DEFAULT:
   2478         WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
   2479         LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
   2480         break;
   2481 
   2482     case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
   2483         WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
   2484         LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
   2485         break;
   2486     }
   2487 
   2488     *handle = hLib;
   2489     return true;
   2490 
   2491 error:
   2492     if (hLib) {
   2493         FreeLibrary(hLib);
   2494     }
   2495 
   2496     return false;
   2497 }
   2498 
   2499 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
   2500                                    const char *name, void *opaque,
   2501                                    Error **errp)
   2502 {
   2503     struct whpx_state *whpx = &whpx_global;
   2504     OnOffSplit mode;
   2505 
   2506     if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
   2507         return;
   2508     }
   2509 
   2510     switch (mode) {
   2511     case ON_OFF_SPLIT_ON:
   2512         whpx->kernel_irqchip_allowed = true;
   2513         whpx->kernel_irqchip_required = true;
   2514         break;
   2515 
   2516     case ON_OFF_SPLIT_OFF:
   2517         whpx->kernel_irqchip_allowed = false;
   2518         whpx->kernel_irqchip_required = false;
   2519         break;
   2520 
   2521     case ON_OFF_SPLIT_SPLIT:
   2522         error_setg(errp, "WHPX: split irqchip currently not supported");
   2523         error_append_hint(errp,
   2524             "Try without kernel-irqchip or with kernel-irqchip=on|off");
   2525         break;
   2526 
   2527     default:
   2528         /*
   2529          * The value was checked in visit_type_OnOffSplit() above. If
   2530          * we get here, then something is wrong in QEMU.
   2531          */
   2532         abort();
   2533     }
   2534 }
   2535 
   2536 /*
   2537  * Partition support
   2538  */
   2539 
   2540 static int whpx_accel_init(MachineState *ms)
   2541 {
   2542     struct whpx_state *whpx;
   2543     int ret;
   2544     HRESULT hr;
   2545     WHV_CAPABILITY whpx_cap;
   2546     UINT32 whpx_cap_size;
   2547     WHV_PARTITION_PROPERTY prop;
   2548     UINT32 cpuidExitList[] = {1, 0x80000001};
   2549     WHV_CAPABILITY_FEATURES features = {0};
   2550 
   2551     whpx = &whpx_global;
   2552 
   2553     if (!init_whp_dispatch()) {
   2554         ret = -ENOSYS;
   2555         goto error;
   2556     }
   2557 
   2558     whpx->mem_quota = ms->ram_size;
   2559 
   2560     hr = whp_dispatch.WHvGetCapability(
   2561         WHvCapabilityCodeHypervisorPresent, &whpx_cap,
   2562         sizeof(whpx_cap), &whpx_cap_size);
   2563     if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
   2564         error_report("WHPX: No accelerator found, hr=%08lx", hr);
   2565         ret = -ENOSPC;
   2566         goto error;
   2567     }
   2568 
   2569     hr = whp_dispatch.WHvGetCapability(
   2570         WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
   2571     if (FAILED(hr)) {
   2572         error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
   2573         ret = -EINVAL;
   2574         goto error;
   2575     }
   2576 
   2577     hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
   2578     if (FAILED(hr)) {
   2579         error_report("WHPX: Failed to create partition, hr=%08lx", hr);
   2580         ret = -EINVAL;
   2581         goto error;
   2582     }
   2583 
   2584     /*
   2585      * Query the XSAVE capability of the partition. Any error here is not
   2586      * considered fatal.
   2587      */
   2588     hr = whp_dispatch.WHvGetPartitionProperty(
   2589         whpx->partition,
   2590         WHvPartitionPropertyCodeProcessorXsaveFeatures,
   2591         &whpx_xsave_cap,
   2592         sizeof(whpx_xsave_cap),
   2593         &whpx_cap_size);
   2594 
   2595     /*
    2596      * Windows versions that don't support this property fail with a specific
    2597      * error code.
   2598      */
   2599     if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
   2600         error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
   2601     }
   2602 
   2603     if (!whpx_has_xsave()) {
   2604         printf("WHPX: Partition is not XSAVE capable\n");
   2605     }
   2606 
   2607     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
   2608     prop.ProcessorCount = ms->smp.cpus;
   2609     hr = whp_dispatch.WHvSetPartitionProperty(
   2610         whpx->partition,
   2611         WHvPartitionPropertyCodeProcessorCount,
   2612         &prop,
   2613         sizeof(WHV_PARTITION_PROPERTY));
   2614 
   2615     if (FAILED(hr)) {
   2616         error_report("WHPX: Failed to set partition core count to %d,"
   2617                      " hr=%08lx", ms->smp.cores, hr);
   2618         ret = -EINVAL;
   2619         goto error;
   2620     }
   2621 
   2622     /*
    2623      * Error out if WHP doesn't support APIC emulation and the user
    2624      * requires it.
   2625      */
   2626     if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
   2627             !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
   2628         error_report("WHPX: kernel irqchip requested, but unavailable. "
   2629             "Try without kernel-irqchip or with kernel-irqchip=off");
   2630         ret = -EINVAL;
   2631         goto error;
   2632     }
   2633 
   2634     if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
   2635         whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
   2636         WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
   2637             WHvX64LocalApicEmulationModeXApic;
   2638         printf("WHPX: setting APIC emulation mode in the hypervisor\n");
   2639         hr = whp_dispatch.WHvSetPartitionProperty(
   2640             whpx->partition,
   2641             WHvPartitionPropertyCodeLocalApicEmulationMode,
   2642             &mode,
   2643             sizeof(mode));
   2644         if (FAILED(hr)) {
   2645             error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
   2646             if (whpx->kernel_irqchip_required) {
   2647                 error_report("WHPX: kernel irqchip requested, but unavailable");
   2648                 ret = -EINVAL;
   2649                 goto error;
   2650             }
   2651         } else {
   2652             whpx->apic_in_platform = true;
   2653         }
   2654     }
   2655 
   2656     /* Register for MSR and CPUID exits */
   2657     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
   2658     prop.ExtendedVmExits.X64MsrExit = 1;
   2659     prop.ExtendedVmExits.X64CpuidExit = 1;
   2660     prop.ExtendedVmExits.ExceptionExit = 1;
   2661     if (whpx_apic_in_platform()) {
   2662         prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
   2663     }
   2664 
   2665     hr = whp_dispatch.WHvSetPartitionProperty(
   2666             whpx->partition,
   2667             WHvPartitionPropertyCodeExtendedVmExits,
   2668             &prop,
   2669             sizeof(WHV_PARTITION_PROPERTY));
   2670     if (FAILED(hr)) {
   2671         error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
   2672         ret = -EINVAL;
   2673         goto error;
   2674     }
   2675 
   2676     hr = whp_dispatch.WHvSetPartitionProperty(
   2677         whpx->partition,
   2678         WHvPartitionPropertyCodeCpuidExitList,
   2679         cpuidExitList,
   2680         RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
   2681 
   2682     if (FAILED(hr)) {
   2683         error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
   2684                      hr);
   2685         ret = -EINVAL;
   2686         goto error;
   2687     }
   2688 
   2689     /*
    2690      * We do not want to intercept any exceptions from the guest until
    2691      * we actually start debugging with gdb.
   2692      */
   2693     whpx->exception_exit_bitmap = -1;
   2694     hr = whpx_set_exception_exit_bitmap(0);
   2695 
   2696     if (FAILED(hr)) {
   2697         error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
   2698         ret = -EINVAL;
   2699         goto error;
   2700     }
   2701 
   2702     hr = whp_dispatch.WHvSetupPartition(whpx->partition);
   2703     if (FAILED(hr)) {
   2704         error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
   2705         ret = -EINVAL;
   2706         goto error;
   2707     }
   2708 
   2709     whpx_memory_init();
   2710 
   2711     printf("Windows Hypervisor Platform accelerator is operational\n");
   2712     return 0;
   2713 
   2714 error:
   2715 
   2716     if (NULL != whpx->partition) {
   2717         whp_dispatch.WHvDeletePartition(whpx->partition);
   2718         whpx->partition = NULL;
   2719     }
   2720 
   2721     return ret;
   2722 }
   2723 
   2724 int whpx_enabled(void)
   2725 {
   2726     return whpx_allowed;
   2727 }
   2728 
    2729 bool whpx_apic_in_platform(void)
         {
   2730     return whpx_global.apic_in_platform;
   2731 }
   2732 
   2733 static void whpx_accel_class_init(ObjectClass *oc, void *data)
   2734 {
   2735     AccelClass *ac = ACCEL_CLASS(oc);
   2736     ac->name = "WHPX";
   2737     ac->init_machine = whpx_accel_init;
   2738     ac->allowed = &whpx_allowed;
   2739 
   2740     object_class_property_add(oc, "kernel-irqchip", "on|off|split",
   2741         NULL, whpx_set_kernel_irqchip,
   2742         NULL, NULL);
   2743     object_class_property_set_description(oc, "kernel-irqchip",
   2744         "Configure WHPX in-kernel irqchip");
   2745 }
   2746 
   2747 static void whpx_accel_instance_init(Object *obj)
   2748 {
   2749     struct whpx_state *whpx = &whpx_global;
   2750 
   2751     memset(whpx, 0, sizeof(struct whpx_state));
    2752     /* Turn on kernel-irqchip by default */
   2753     whpx->kernel_irqchip_allowed = true;
   2754 }
   2755 
   2756 static const TypeInfo whpx_accel_type = {
   2757     .name = ACCEL_CLASS_NAME("whpx"),
   2758     .parent = TYPE_ACCEL,
   2759     .instance_init = whpx_accel_instance_init,
   2760     .class_init = whpx_accel_class_init,
   2761 };
   2762 
   2763 static void whpx_type_init(void)
   2764 {
   2765     type_register_static(&whpx_accel_type);
   2766 }
   2767 
   2768 bool init_whp_dispatch(void)
   2769 {
   2770     if (whp_dispatch_initialized) {
   2771         return true;
   2772     }
   2773 
   2774     if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
   2775         goto error;
   2776     }
   2777 
   2778     if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
   2779         goto error;
   2780     }
   2781 
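             /*
              * Supplemental platform functions are resolved with
              * WHP_LOAD_FIELD_OPTIONAL, so missing entry points do not cause
              * this load to fail.
              */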
   2782     assert(load_whp_dispatch_fns(&hWinHvPlatform,
   2783         WINHV_PLATFORM_FNS_SUPPLEMENTAL));
   2784     whp_dispatch_initialized = true;
   2785 
   2786     return true;
   2787 error:
   2788     if (hWinHvPlatform) {
   2789         FreeLibrary(hWinHvPlatform);
   2790     }
   2791 
   2792     if (hWinHvEmulation) {
   2793         FreeLibrary(hWinHvEmulation);
   2794     }
   2795 
   2796     return false;
   2797 }
   2798 
   2799 type_init(whpx_type_init);