qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git

kvm.c (180953B)


      1 /*
      2  * QEMU KVM support
      3  *
      4  * Copyright (C) 2006-2008 Qumranet Technologies
      5  * Copyright IBM, Corp. 2008
      6  *
      7  * Authors:
      8  *  Anthony Liguori   <aliguori@us.ibm.com>
      9  *
     10  * This work is licensed under the terms of the GNU GPL, version 2 or later.
     11  * See the COPYING file in the top-level directory.
     12  *
     13  */
     14 
     15 #include "qemu/osdep.h"
     16 #include "qapi/qapi-events-run-state.h"
     17 #include "qapi/error.h"
     18 #include "qapi/visitor.h"
     19 #include <sys/ioctl.h>
     20 #include <sys/utsname.h>
     21 #include <sys/syscall.h>
     22 
     23 #include <linux/kvm.h>
     24 #include "standard-headers/asm-x86/kvm_para.h"
     25 
     26 #include "cpu.h"
     27 #include "host-cpu.h"
     28 #include "sysemu/sysemu.h"
     29 #include "sysemu/hw_accel.h"
     30 #include "sysemu/kvm_int.h"
     31 #include "sysemu/runstate.h"
     32 #include "kvm_i386.h"
     33 #include "sev.h"
     34 #include "hyperv.h"
     35 #include "hyperv-proto.h"
     36 
     37 #include "exec/gdbstub.h"
     38 #include "qemu/host-utils.h"
     39 #include "qemu/main-loop.h"
     40 #include "qemu/config-file.h"
     41 #include "qemu/error-report.h"
     42 #include "qemu/memalign.h"
     43 #include "hw/i386/x86.h"
     44 #include "hw/i386/apic.h"
     45 #include "hw/i386/apic_internal.h"
     46 #include "hw/i386/apic-msidef.h"
     47 #include "hw/i386/intel_iommu.h"
     48 #include "hw/i386/x86-iommu.h"
     49 #include "hw/i386/e820_memory_layout.h"
     50 
     51 #include "hw/pci/pci.h"
     52 #include "hw/pci/msi.h"
     53 #include "hw/pci/msix.h"
     54 #include "migration/blocker.h"
     55 #include "exec/memattrs.h"
     56 #include "trace.h"
     57 
     58 #include CONFIG_DEVICES
     59 
     60 //#define DEBUG_KVM
     61 
     62 #ifdef DEBUG_KVM
     63 #define DPRINTF(fmt, ...) \
     64     do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
     65 #else
     66 #define DPRINTF(fmt, ...) \
     67     do { } while (0)
     68 #endif
     69 
     70 /* From arch/x86/kvm/lapic.h */
     71 #define KVM_APIC_BUS_CYCLE_NS       1
     72 #define KVM_APIC_BUS_FREQUENCY      (1000000000ULL / KVM_APIC_BUS_CYCLE_NS)
     73 
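        /*
         * These are the original kvmclock MSR numbers from the KVM paravirtual
         * ABI (Linux uapi kvm_para.h); newer guests negotiate the
         * 0x4b564d00-based replacements via KVM_FEATURE_CLOCKSOURCE2.
         */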
     74 #define MSR_KVM_WALL_CLOCK  0x11
     75 #define MSR_KVM_SYSTEM_TIME 0x12
     76 
     77 /* A 4096-byte buffer can hold the 8-byte kvm_msrs header, plus
     78  * 255 kvm_msr_entry structs */
     79 #define MSR_BUF_SIZE 4096
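        /* (With the Linux UAPI layout, struct kvm_msr_entry is 16 bytes, so
         *  8 + 255 * 16 = 4088 bytes, which fits in the 4096-byte buffer.) */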
     80 
     81 static void kvm_init_msrs(X86CPU *cpu);
     82 
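        /*
         * Capabilities listed here are checked by the generic KVM accelerator
         * init code; if the kernel lacks any of them, KVM initialization fails
         * instead of starting a guest with degraded behaviour.
         */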
     83 const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
     84     KVM_CAP_INFO(SET_TSS_ADDR),
     85     KVM_CAP_INFO(EXT_CPUID),
     86     KVM_CAP_INFO(MP_STATE),
     87     KVM_CAP_LAST_INFO
     88 };
     89 
     90 static bool has_msr_star;
     91 static bool has_msr_hsave_pa;
     92 static bool has_msr_tsc_aux;
     93 static bool has_msr_tsc_adjust;
     94 static bool has_msr_tsc_deadline;
     95 static bool has_msr_feature_control;
     96 static bool has_msr_misc_enable;
     97 static bool has_msr_smbase;
     98 static bool has_msr_bndcfgs;
     99 static int lm_capable_kernel;
    100 static bool has_msr_hv_hypercall;
    101 static bool has_msr_hv_crash;
    102 static bool has_msr_hv_reset;
    103 static bool has_msr_hv_vpindex;
    104 static bool hv_vpindex_settable;
    105 static bool has_msr_hv_runtime;
    106 static bool has_msr_hv_synic;
    107 static bool has_msr_hv_stimer;
    108 static bool has_msr_hv_frequencies;
    109 static bool has_msr_hv_reenlightenment;
    110 static bool has_msr_hv_syndbg_options;
    111 static bool has_msr_xss;
    112 static bool has_msr_umwait;
    113 static bool has_msr_spec_ctrl;
    114 static bool has_tsc_scale_msr;
    115 static bool has_msr_tsx_ctrl;
    116 static bool has_msr_virt_ssbd;
    117 static bool has_msr_smi_count;
    118 static bool has_msr_arch_capabs;
    119 static bool has_msr_core_capabs;
    120 static bool has_msr_vmx_vmfunc;
    121 static bool has_msr_ucode_rev;
    122 static bool has_msr_vmx_procbased_ctls2;
    123 static bool has_msr_perf_capabs;
    124 static bool has_msr_pkrs;
    125 
    126 static uint32_t has_architectural_pmu_version;
    127 static uint32_t num_architectural_pmu_gp_counters;
    128 static uint32_t num_architectural_pmu_fixed_counters;
    129 
    130 static int has_xsave;
    131 static int has_xsave2;
    132 static int has_xcrs;
    133 static int has_pit_state2;
    134 static int has_sregs2;
    135 static int has_exception_payload;
    136 static int has_triple_fault_event;
    137 
    138 static bool has_msr_mcg_ext_ctl;
    139 
    140 static struct kvm_cpuid2 *cpuid_cache;
    141 static struct kvm_cpuid2 *hv_cpuid_cache;
    142 static struct kvm_msr_list *kvm_feature_msrs;
    143 
    144 static KVMMSRHandlers msr_handlers[KVM_MSR_FILTER_MAX_RANGES];
    145 
    146 #define BUS_LOCK_SLICE_TIME 1000000000ULL /* ns */
    147 static RateLimit bus_lock_ratelimit_ctrl;
    148 static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value);
    149 
    150 int kvm_has_pit_state2(void)
    151 {
    152     return has_pit_state2;
    153 }
    154 
    155 bool kvm_has_smm(void)
    156 {
    157     return kvm_vm_check_extension(kvm_state, KVM_CAP_X86_SMM);
    158 }
    159 
    160 bool kvm_has_adjust_clock_stable(void)
    161 {
    162     int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
    163 
    164     return (ret & KVM_CLOCK_TSC_STABLE);
    165 }
    166 
    167 bool kvm_has_adjust_clock(void)
    168 {
    169     return kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
    170 }
    171 
    172 bool kvm_has_exception_payload(void)
    173 {
    174     return has_exception_payload;
    175 }
    176 
    177 static bool kvm_x2apic_api_set_flags(uint64_t flags)
    178 {
    179     KVMState *s = KVM_STATE(current_accel());
    180 
    181     return !kvm_vm_enable_cap(s, KVM_CAP_X2APIC_API, 0, flags);
    182 }
    183 
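        /*
         * MEMORIZE(fn, _result) evaluates 'fn' once and caches the value in
         * '_result' (a variable supplied by the caller, e.g. has_x2apic_api
         * below).  On subsequent calls the embedded 'return _result;' returns
         * from the *enclosing* function, so the macro is only usable in a
         * function whose return type matches '_result'.
         */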
    184 #define MEMORIZE(fn, _result) \
    185     ({ \
    186         static bool _memorized; \
    187         \
    188         if (_memorized) { \
    189             return _result; \
    190         } \
    191         _memorized = true; \
    192         _result = fn; \
    193     })
    194 
    195 static bool has_x2apic_api;
    196 
    197 bool kvm_has_x2apic_api(void)
    198 {
    199     return has_x2apic_api;
    200 }
    201 
    202 bool kvm_enable_x2apic(void)
    203 {
    204     return MEMORIZE(
    205              kvm_x2apic_api_set_flags(KVM_X2APIC_API_USE_32BIT_IDS |
    206                                       KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK),
    207              has_x2apic_api);
    208 }
    209 
    210 bool kvm_hv_vpindex_settable(void)
    211 {
    212     return hv_vpindex_settable;
    213 }
    214 
    215 static int kvm_get_tsc(CPUState *cs)
    216 {
    217     X86CPU *cpu = X86_CPU(cs);
    218     CPUX86State *env = &cpu->env;
    219     uint64_t value;
    220     int ret;
    221 
    222     if (env->tsc_valid) {
    223         return 0;
    224     }
    225 
    226     env->tsc_valid = !runstate_is_running();
    227 
    228     ret = kvm_get_one_msr(cpu, MSR_IA32_TSC, &value);
    229     if (ret < 0) {
    230         return ret;
    231     }
    232 
    233     env->tsc = value;
    234     return 0;
    235 }
    236 
    237 static inline void do_kvm_synchronize_tsc(CPUState *cpu, run_on_cpu_data arg)
    238 {
    239     kvm_get_tsc(cpu);
    240 }
    241 
    242 void kvm_synchronize_all_tsc(void)
    243 {
    244     CPUState *cpu;
    245 
    246     if (kvm_enabled()) {
    247         CPU_FOREACH(cpu) {
    248             run_on_cpu(cpu, do_kvm_synchronize_tsc, RUN_ON_CPU_NULL);
    249         }
    250     }
    251 }
    252 
    253 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
    254 {
    255     struct kvm_cpuid2 *cpuid;
    256     int r, size;
    257 
    258     size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
    259     cpuid = g_malloc0(size);
    260     cpuid->nent = max;
    261     r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
    262     if (r == 0 && cpuid->nent >= max) {
    263         r = -E2BIG;
    264     }
    265     if (r < 0) {
    266         if (r == -E2BIG) {
    267             g_free(cpuid);
    268             return NULL;
    269         } else {
    270             fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
    271                     strerror(-r));
    272             exit(1);
    273         }
    274     }
    275     return cpuid;
    276 }
    277 
    278 /* Run KVM_GET_SUPPORTED_CPUID ioctl(), allocating a buffer large enough
    279  * for all entries.
    280  */
    281 static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
    282 {
    283     struct kvm_cpuid2 *cpuid;
    284     int max = 1;
    285 
    286     if (cpuid_cache != NULL) {
    287         return cpuid_cache;
    288     }
    289     while ((cpuid = try_get_cpuid(s, max)) == NULL) {
    290         max *= 2;
    291     }
    292     cpuid_cache = cpuid;
    293     return cpuid;
    294 }
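        /*
         * try_get_cpuid() folds a "full" buffer (nent >= max after a successful
         * ioctl) into -E2BIG and returns NULL, so the doubling loop above keeps
         * growing the allocation until the kernel's complete CPUID list fits;
         * the result is then cached in cpuid_cache.
         */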
    295 
    296 static bool host_tsx_broken(void)
    297 {
    298     int family, model, stepping;
    299     char vendor[CPUID_VENDOR_SZ + 1];
    300 
    301     host_cpu_vendor_fms(vendor, &family, &model, &stepping);
    302 
    303     /* Check if we are running on a Haswell host known to have broken TSX */
    304     return !strcmp(vendor, CPUID_VENDOR_INTEL) &&
    305            (family == 6) &&
    306            ((model == 63 && stepping < 4) ||
    307             model == 60 || model == 69 || model == 70);
    308 }
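        /*
         * The family 6 models above are the Haswell variants (60 = client,
         * 63 = Haswell-E/EP, 69 = ULT, 70 = Crystal Well), per Linux's
         * intel-family.h numbering; only Haswell-E/EP at stepping >= 4 is
         * trusted to have usable TSX here.
         */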
    309 
    310 /* Returns the value for a specific register on the cpuid entry
    311  */
    312 static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
    313 {
    314     uint32_t ret = 0;
    315     switch (reg) {
    316     case R_EAX:
    317         ret = entry->eax;
    318         break;
    319     case R_EBX:
    320         ret = entry->ebx;
    321         break;
    322     case R_ECX:
    323         ret = entry->ecx;
    324         break;
    325     case R_EDX:
    326         ret = entry->edx;
    327         break;
    328     }
    329     return ret;
    330 }
    331 
    332 /* Find matching entry for function/index on kvm_cpuid2 struct
    333  */
    334 static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
    335                                                  uint32_t function,
    336                                                  uint32_t index)
    337 {
    338     int i;
    339     for (i = 0; i < cpuid->nent; ++i) {
    340         if (cpuid->entries[i].function == function &&
    341             cpuid->entries[i].index == index) {
    342             return &cpuid->entries[i];
    343         }
    344     }
    345     /* not found: */
    346     return NULL;
    347 }
    348 
    349 uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
    350                                       uint32_t index, int reg)
    351 {
    352     struct kvm_cpuid2 *cpuid;
    353     uint32_t ret = 0;
    354     uint32_t cpuid_1_edx;
    355     uint64_t bitmask;
    356 
    357     cpuid = get_supported_cpuid(s);
    358 
    359     struct kvm_cpuid_entry2 *entry = cpuid_find_entry(cpuid, function, index);
    360     if (entry) {
    361         ret = cpuid_entry_get_reg(entry, reg);
    362     }
    363 
    364     /* Fixups for the data returned by KVM, below */
    365 
    366     if (function == 1 && reg == R_EDX) {
    367         /* KVM before 2.6.30 misreports the following features */
    368         ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
    369     } else if (function == 1 && reg == R_ECX) {
    370         /* We can set the hypervisor flag, even if KVM does not return it on
    371          * GET_SUPPORTED_CPUID
    372          */
    373         ret |= CPUID_EXT_HYPERVISOR;
    374         /* tsc-deadline flag is not returned by GET_SUPPORTED_CPUID, but it
    375          * can be enabled if the kernel has KVM_CAP_TSC_DEADLINE_TIMER,
    376          * and the irqchip is in the kernel.
    377          */
    378         if (kvm_irqchip_in_kernel() &&
    379                 kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
    380             ret |= CPUID_EXT_TSC_DEADLINE_TIMER;
    381         }
    382 
    383         /* x2apic is reported by GET_SUPPORTED_CPUID, but it can't be enabled
    384          * without the in-kernel irqchip
    385          */
    386         if (!kvm_irqchip_in_kernel()) {
    387             ret &= ~CPUID_EXT_X2APIC;
    388         }
    389 
    390         if (enable_cpu_pm) {
    391             int disable_exits = kvm_check_extension(s,
    392                                                     KVM_CAP_X86_DISABLE_EXITS);
    393 
    394             if (disable_exits & KVM_X86_DISABLE_EXITS_MWAIT) {
    395                 ret |= CPUID_EXT_MONITOR;
    396             }
    397         }
    398     } else if (function == 6 && reg == R_EAX) {
    399         ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */
    400     } else if (function == 7 && index == 0 && reg == R_EBX) {
    401         if (host_tsx_broken()) {
    402             ret &= ~(CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_HLE);
    403         }
    404     } else if (function == 7 && index == 0 && reg == R_EDX) {
    405         /*
    406          * Linux v4.17-v4.20 incorrectly return ARCH_CAPABILITIES on SVM hosts.
    407          * We can detect the bug by checking if MSR_IA32_ARCH_CAPABILITIES is
    408          * returned by KVM_GET_MSR_INDEX_LIST.
    409          */
    410         if (!has_msr_arch_capabs) {
    411             ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES;
    412         }
    413     } else if (function == 0xd && index == 0 &&
    414                (reg == R_EAX || reg == R_EDX)) {
    415         /*
    416          * The value returned by KVM_GET_SUPPORTED_CPUID does not include
    417          * features that still have to be enabled with the arch_prctl
    418          * system call.  QEMU needs the full value, which is retrieved
    419          * with KVM_GET_DEVICE_ATTR.
    420          */
    421         struct kvm_device_attr attr = {
    422             .group = 0,
    423             .attr = KVM_X86_XCOMP_GUEST_SUPP,
    424             .addr = (unsigned long) &bitmask
    425         };
    426 
    427         bool sys_attr = kvm_check_extension(s, KVM_CAP_SYS_ATTRIBUTES);
    428         if (!sys_attr) {
    429             return ret;
    430         }
    431 
    432         int rc = kvm_ioctl(s, KVM_GET_DEVICE_ATTR, &attr);
    433         if (rc < 0) {
    434             if (rc != -ENXIO) {
    435                 warn_report("KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) "
    436                             "error: %d", rc);
    437             }
    438             return ret;
    439         }
    440         ret = (reg == R_EAX) ? bitmask : bitmask >> 32;
    441     } else if (function == 0x80000001 && reg == R_ECX) {
    442         /*
    443          * It's safe to enable TOPOEXT even if it's not returned by
    444          * GET_SUPPORTED_CPUID.  Unconditionally enabling TOPOEXT here allows
    445          * us to keep CPU models including TOPOEXT runnable on older kernels.
    446          */
    447         ret |= CPUID_EXT3_TOPOEXT;
    448     } else if (function == 0x80000001 && reg == R_EDX) {
    449         /* On Intel, kvm returns cpuid according to the Intel spec,
    450          * so add missing bits according to the AMD spec:
    451          */
    452         cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
    453         ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES;
    454     } else if (function == KVM_CPUID_FEATURES && reg == R_EAX) {
    455         /* kvm_pv_unhalt is reported by GET_SUPPORTED_CPUID, but it can't
    456          * be enabled without the in-kernel irqchip
    457          */
    458         if (!kvm_irqchip_in_kernel()) {
    459             ret &= ~(1U << KVM_FEATURE_PV_UNHALT);
    460         }
    461         if (kvm_irqchip_is_split()) {
    462             ret |= 1U << KVM_FEATURE_MSI_EXT_DEST_ID;
    463         }
    464     } else if (function == KVM_CPUID_FEATURES && reg == R_EDX) {
    465         ret |= 1U << KVM_HINTS_REALTIME;
    466     }
    467 
    468     return ret;
    469 }
    470 
    471 uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index)
    472 {
    473     struct {
    474         struct kvm_msrs info;
    475         struct kvm_msr_entry entries[1];
    476     } msr_data = {};
    477     uint64_t value;
    478     uint32_t ret, can_be_one, must_be_one;
    479 
    480     if (kvm_feature_msrs == NULL) { /* Host doesn't support feature MSRs */
    481         return 0;
    482     }
    483 
    484     /* Check if requested MSR is supported feature MSR */
    485     int i;
    486     for (i = 0; i < kvm_feature_msrs->nmsrs; i++)
    487         if (kvm_feature_msrs->indices[i] == index) {
    488             break;
    489         }
    490     if (i == kvm_feature_msrs->nmsrs) {
    491         return 0; /* if the feature MSR is not supported, simply return 0 */
    492     }
    493 
    494     msr_data.info.nmsrs = 1;
    495     msr_data.entries[0].index = index;
    496 
    497     ret = kvm_ioctl(s, KVM_GET_MSRS, &msr_data);
    498     if (ret != 1) {
    499         error_report("KVM get MSR (index=0x%x) feature failed, %s",
    500             index, strerror(-ret));
    501         exit(1);
    502     }
    503 
    504     value = msr_data.entries[0].data;
    505     switch (index) {
    506     case MSR_IA32_VMX_PROCBASED_CTLS2:
    507         if (!has_msr_vmx_procbased_ctls2) {
    508             /* KVM forgot to add these bits for some time, do this ourselves. */
    509             if (kvm_arch_get_supported_cpuid(s, 0xD, 1, R_ECX) &
    510                 CPUID_XSAVE_XSAVES) {
    511                 value |= (uint64_t)VMX_SECONDARY_EXEC_XSAVES << 32;
    512             }
    513             if (kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX) &
    514                 CPUID_EXT_RDRAND) {
    515                 value |= (uint64_t)VMX_SECONDARY_EXEC_RDRAND_EXITING << 32;
    516             }
    517             if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
    518                 CPUID_7_0_EBX_INVPCID) {
    519                 value |= (uint64_t)VMX_SECONDARY_EXEC_ENABLE_INVPCID << 32;
    520             }
    521             if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
    522                 CPUID_7_0_EBX_RDSEED) {
    523                 value |= (uint64_t)VMX_SECONDARY_EXEC_RDSEED_EXITING << 32;
    524             }
    525             if (kvm_arch_get_supported_cpuid(s, 0x80000001, 0, R_EDX) &
    526                 CPUID_EXT2_RDTSCP) {
    527                 value |= (uint64_t)VMX_SECONDARY_EXEC_RDTSCP << 32;
    528             }
    529         }
    530         /* fall through */
    531     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
    532     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
    533     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
    534     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
    535         /*
    536          * Return true for bits that can be one, but do not have to be one.
    537          * The SDM tells us which bits could have a "must be one" setting,
    538          * so we can do the opposite transformation in make_vmx_msr_value.
    539          */
    540         must_be_one = (uint32_t)value;
    541         can_be_one = (uint32_t)(value >> 32);
    542         return can_be_one & ~must_be_one;
    543 
    544     default:
    545         return value;
    546     }
    547 }
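        /*
         * Example of the TRUE_*_CTLS transformation above: if the host reports
         * value = 0x0000001600000006, then must_be_one = 0x00000006 (controls
         * that cannot be cleared) and can_be_one = 0x00000016 (controls that
         * may be set), so the function returns 0x16 & ~0x6 = 0x10, i.e. only
         * bit 4 is genuinely optional for the guest.
         */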
    548 
    549 static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
    550                                      int *max_banks)
    551 {
    552     int r;
    553 
    554     r = kvm_check_extension(s, KVM_CAP_MCE);
    555     if (r > 0) {
    556         *max_banks = r;
    557         return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
    558     }
    559     return -ENOSYS;
    560 }
    561 
    562 static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
    563 {
    564     CPUState *cs = CPU(cpu);
    565     CPUX86State *env = &cpu->env;
    566     uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
    567                       MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
    568     uint64_t mcg_status = MCG_STATUS_MCIP;
    569     int flags = 0;
    570 
    571     if (code == BUS_MCEERR_AR) {
    572         status |= MCI_STATUS_AR | 0x134;
    573         mcg_status |= MCG_STATUS_RIPV | MCG_STATUS_EIPV;
    574     } else {
    575         status |= 0xc0;
    576         mcg_status |= MCG_STATUS_RIPV;
    577     }
    578 
    579     flags = cpu_x86_support_mca_broadcast(env) ? MCE_INJECT_BROADCAST : 0;
    580     /* We need to read back the value of MSR_EXT_MCG_CTL that was set by the
    581      * guest kernel back into env->mcg_ext_ctl.
    582      */
    583     cpu_synchronize_state(cs);
    584     if (env->mcg_ext_ctl & MCG_EXT_CTL_LMCE_EN) {
    585         mcg_status |= MCG_STATUS_LMCE;
    586         flags = 0;
    587     }
    588 
    589     cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr,
    590                        (MCM_ADDR_PHYS << 6) | 0xc, flags);
    591 }
    592 
    593 static void emit_hypervisor_memory_failure(MemoryFailureAction action, bool ar)
    594 {
    595     MemoryFailureFlags mff = {.action_required = ar, .recursive = false};
    596 
    597     qapi_event_send_memory_failure(MEMORY_FAILURE_RECIPIENT_HYPERVISOR, action,
    598                                    &mff);
    599 }
    600 
    601 static void hardware_memory_error(void *host_addr)
    602 {
    603     emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_FATAL, true);
    604     error_report("QEMU got Hardware memory error at addr %p", host_addr);
    605     exit(1);
    606 }
    607 
    608 void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
    609 {
    610     X86CPU *cpu = X86_CPU(c);
    611     CPUX86State *env = &cpu->env;
    612     ram_addr_t ram_addr;
    613     hwaddr paddr;
    614 
    615     /* If we get an action required MCE, it has been injected by KVM
    616      * while the VM was running.  An action optional MCE instead should
    617      * be coming from the main thread, which qemu_init_sigbus identifies
    618      * as the "early kill" thread.
    619      */
    620     assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
    621 
    622     if ((env->mcg_cap & MCG_SER_P) && addr) {
    623         ram_addr = qemu_ram_addr_from_host(addr);
    624         if (ram_addr != RAM_ADDR_INVALID &&
    625             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
    626             kvm_hwpoison_page_add(ram_addr);
    627             kvm_mce_inject(cpu, paddr, code);
    628 
    629             /*
    630              * Use different logging severity based on error type.
    631              * If there is additional MCE reporting on the hypervisor, QEMU VA
    632              * could be another source to identify the PA and MCE details.
    633              */
    634             if (code == BUS_MCEERR_AR) {
    635                 error_report("Guest MCE Memory Error at QEMU addr %p and "
    636                     "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
    637                     addr, paddr, "BUS_MCEERR_AR");
    638             } else {
    639                  warn_report("Guest MCE Memory Error at QEMU addr %p and "
    640                      "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
    641                      addr, paddr, "BUS_MCEERR_AO");
    642             }
    643 
    644             return;
    645         }
    646 
    647         if (code == BUS_MCEERR_AO) {
    648             warn_report("Hardware memory error at addr %p of type %s "
    649                 "for memory used by QEMU itself instead of guest system!",
    650                  addr, "BUS_MCEERR_AO");
    651         }
    652     }
    653 
    654     if (code == BUS_MCEERR_AR) {
    655         hardware_memory_error(addr);
    656     }
    657 
    658     /* Hope we are lucky for AO MCE, just notify an event */
    659     emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_IGNORE, false);
    660 }
    661 
    662 static void kvm_reset_exception(CPUX86State *env)
    663 {
    664     env->exception_nr = -1;
    665     env->exception_pending = 0;
    666     env->exception_injected = 0;
    667     env->exception_has_payload = false;
    668     env->exception_payload = 0;
    669 }
    670 
    671 static void kvm_queue_exception(CPUX86State *env,
    672                                 int32_t exception_nr,
    673                                 uint8_t exception_has_payload,
    674                                 uint64_t exception_payload)
    675 {
    676     assert(env->exception_nr == -1);
    677     assert(!env->exception_pending);
    678     assert(!env->exception_injected);
    679     assert(!env->exception_has_payload);
    680 
    681     env->exception_nr = exception_nr;
    682 
    683     if (has_exception_payload) {
    684         env->exception_pending = 1;
    685 
    686         env->exception_has_payload = exception_has_payload;
    687         env->exception_payload = exception_payload;
    688     } else {
    689         env->exception_injected = 1;
    690 
    691         if (exception_nr == EXCP01_DB) {
    692             assert(exception_has_payload);
    693             env->dr[6] = exception_payload;
    694         } else if (exception_nr == EXCP0E_PAGE) {
    695             assert(exception_has_payload);
    696             env->cr[2] = exception_payload;
    697         } else {
    698             assert(!exception_has_payload);
    699         }
    700     }
    701 }
    702 
    703 static int kvm_inject_mce_oldstyle(X86CPU *cpu)
    704 {
    705     CPUX86State *env = &cpu->env;
    706 
    707     if (!kvm_has_vcpu_events() && env->exception_nr == EXCP12_MCHK) {
    708         unsigned int bank, bank_num = env->mcg_cap & 0xff;
    709         struct kvm_x86_mce mce;
    710 
    711         kvm_reset_exception(env);
    712 
    713         /*
    714          * There must be at least one bank in use if an MCE is pending.
    715          * Find it and use its values for the event injection.
    716          */
    717         for (bank = 0; bank < bank_num; bank++) {
    718             if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) {
    719                 break;
    720             }
    721         }
    722         assert(bank < bank_num);
    723 
    724         mce.bank = bank;
    725         mce.status = env->mce_banks[bank * 4 + 1];
    726         mce.mcg_status = env->mcg_status;
    727         mce.addr = env->mce_banks[bank * 4 + 2];
    728         mce.misc = env->mce_banks[bank * 4 + 3];
    729 
    730         return kvm_vcpu_ioctl(CPU(cpu), KVM_X86_SET_MCE, &mce);
    731     }
    732     return 0;
    733 }
    734 
    735 static void cpu_update_state(void *opaque, bool running, RunState state)
    736 {
    737     CPUX86State *env = opaque;
    738 
    739     if (running) {
    740         env->tsc_valid = false;
    741     }
    742 }
    743 
    744 unsigned long kvm_arch_vcpu_id(CPUState *cs)
    745 {
    746     X86CPU *cpu = X86_CPU(cs);
    747     return cpu->apic_id;
    748 }
    749 
    750 #ifndef KVM_CPUID_SIGNATURE_NEXT
    751 #define KVM_CPUID_SIGNATURE_NEXT                0x40000100
    752 #endif
    753 
    754 static bool hyperv_enabled(X86CPU *cpu)
    755 {
    756     return kvm_check_extension(kvm_state, KVM_CAP_HYPERV) > 0 &&
    757         ((cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_NOTIFY) ||
    758          cpu->hyperv_features || cpu->hyperv_passthrough);
    759 }
    760 
    761 /*
    762  * Check whether target_freq is within conservative
    763  * ntp correctable bounds (250ppm) of freq
    764  */
    765 static inline bool freq_within_bounds(int freq, int target_freq)
    766 {
    767         int max_freq = freq + (freq * 250 / 1000000);
    768         int min_freq = freq - (freq * 250 / 1000000);
    769 
    770         if (target_freq >= min_freq && target_freq <= max_freq) {
    771                 return true;
    772         }
    773 
    774         return false;
    775 }
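        /*
         * Example: for a 2 GHz host TSC (freq = 2000000 kHz) the 250 ppm slack
         * is 2000000 * 250 / 1000000 = 500 kHz, so target frequencies in the
         * range 1999500..2000500 kHz are considered NTP-correctable.
         */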
    776 
    777 static int kvm_arch_set_tsc_khz(CPUState *cs)
    778 {
    779     X86CPU *cpu = X86_CPU(cs);
    780     CPUX86State *env = &cpu->env;
    781     int r, cur_freq;
    782     bool set_ioctl = false;
    783 
    784     if (!env->tsc_khz) {
    785         return 0;
    786     }
    787 
    788     cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
    789                kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP;
    790 
    791     /*
    792      * If TSC scaling is supported, attempt to set TSC frequency.
    793      */
    794     if (kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL)) {
    795         set_ioctl = true;
    796     }
    797 
    798     /*
    799      * If desired TSC frequency is within bounds of NTP correction,
    800      * attempt to set TSC frequency.
    801      */
    802     if (cur_freq != -ENOTSUP && freq_within_bounds(cur_freq, env->tsc_khz)) {
    803         set_ioctl = true;
    804     }
    805 
    806     r = set_ioctl ?
    807         kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) :
    808         -ENOTSUP;
    809 
    810     if (r < 0) {
    811         /* When KVM_SET_TSC_KHZ fails, it's an error only if the current
    812          * TSC frequency doesn't match the one we want.
    813          */
    814         cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
    815                    kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
    816                    -ENOTSUP;
    817         if (cur_freq <= 0 || cur_freq != env->tsc_khz) {
    818             warn_report("TSC frequency mismatch between "
    819                         "VM (%" PRId64 " kHz) and host (%d kHz), "
    820                         "and TSC scaling unavailable",
    821                         env->tsc_khz, cur_freq);
    822             return r;
    823         }
    824     }
    825 
    826     return 0;
    827 }
    828 
    829 static bool tsc_is_stable_and_known(CPUX86State *env)
    830 {
    831     if (!env->tsc_khz) {
    832         return false;
    833     }
    834     return (env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC)
    835         || env->user_tsc_khz;
    836 }
    837 
    838 #define DEFAULT_EVMCS_VERSION ((1 << 8) | 1)
    839 
    840 static struct {
    841     const char *desc;
    842     struct {
    843         uint32_t func;
    844         int reg;
    845         uint32_t bits;
    846     } flags[2];
    847     uint64_t dependencies;
    848 } kvm_hyperv_properties[] = {
    849     [HYPERV_FEAT_RELAXED] = {
    850         .desc = "relaxed timing (hv-relaxed)",
    851         .flags = {
    852             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
    853              .bits = HV_RELAXED_TIMING_RECOMMENDED}
    854         }
    855     },
    856     [HYPERV_FEAT_VAPIC] = {
    857         .desc = "virtual APIC (hv-vapic)",
    858         .flags = {
    859             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
    860              .bits = HV_APIC_ACCESS_AVAILABLE}
    861         }
    862     },
    863     [HYPERV_FEAT_TIME] = {
    864         .desc = "clocksources (hv-time)",
    865         .flags = {
    866             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
    867              .bits = HV_TIME_REF_COUNT_AVAILABLE | HV_REFERENCE_TSC_AVAILABLE}
    868         }
    869     },
    870     [HYPERV_FEAT_CRASH] = {
    871         .desc = "crash MSRs (hv-crash)",
    872         .flags = {
    873             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
    874              .bits = HV_GUEST_CRASH_MSR_AVAILABLE}
    875         }
    876     },
    877     [HYPERV_FEAT_RESET] = {
    878         .desc = "reset MSR (hv-reset)",
    879         .flags = {
    880             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
    881              .bits = HV_RESET_AVAILABLE}
    882         }
    883     },
    884     [HYPERV_FEAT_VPINDEX] = {
    885         .desc = "VP_INDEX MSR (hv-vpindex)",
    886         .flags = {
    887             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
    888              .bits = HV_VP_INDEX_AVAILABLE}
    889         }
    890     },
    891     [HYPERV_FEAT_RUNTIME] = {
    892         .desc = "VP_RUNTIME MSR (hv-runtime)",
    893         .flags = {
    894             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
    895              .bits = HV_VP_RUNTIME_AVAILABLE}
    896         }
    897     },
    898     [HYPERV_FEAT_SYNIC] = {
    899         .desc = "synthetic interrupt controller (hv-synic)",
    900         .flags = {
    901             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
    902              .bits = HV_SYNIC_AVAILABLE}
    903         }
    904     },
    905     [HYPERV_FEAT_STIMER] = {
    906         .desc = "synthetic timers (hv-stimer)",
    907         .flags = {
    908             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
    909              .bits = HV_SYNTIMERS_AVAILABLE}
    910         },
    911         .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_TIME)
    912     },
    913     [HYPERV_FEAT_FREQUENCIES] = {
    914         .desc = "frequency MSRs (hv-frequencies)",
    915         .flags = {
    916             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
    917              .bits = HV_ACCESS_FREQUENCY_MSRS},
    918             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
    919              .bits = HV_FREQUENCY_MSRS_AVAILABLE}
    920         }
    921     },
    922     [HYPERV_FEAT_REENLIGHTENMENT] = {
    923         .desc = "reenlightenment MSRs (hv-reenlightenment)",
    924         .flags = {
    925             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
    926              .bits = HV_ACCESS_REENLIGHTENMENTS_CONTROL}
    927         }
    928     },
    929     [HYPERV_FEAT_TLBFLUSH] = {
    930         .desc = "paravirtualized TLB flush (hv-tlbflush)",
    931         .flags = {
    932             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
    933              .bits = HV_REMOTE_TLB_FLUSH_RECOMMENDED |
    934              HV_EX_PROCESSOR_MASKS_RECOMMENDED}
    935         },
    936         .dependencies = BIT(HYPERV_FEAT_VPINDEX)
    937     },
    938     [HYPERV_FEAT_EVMCS] = {
    939         .desc = "enlightened VMCS (hv-evmcs)",
    940         .flags = {
    941             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
    942              .bits = HV_ENLIGHTENED_VMCS_RECOMMENDED}
    943         },
    944         .dependencies = BIT(HYPERV_FEAT_VAPIC)
    945     },
    946     [HYPERV_FEAT_IPI] = {
    947         .desc = "paravirtualized IPI (hv-ipi)",
    948         .flags = {
    949             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
    950              .bits = HV_CLUSTER_IPI_RECOMMENDED |
    951              HV_EX_PROCESSOR_MASKS_RECOMMENDED}
    952         },
    953         .dependencies = BIT(HYPERV_FEAT_VPINDEX)
    954     },
    955     [HYPERV_FEAT_STIMER_DIRECT] = {
    956         .desc = "direct mode synthetic timers (hv-stimer-direct)",
    957         .flags = {
    958             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
    959              .bits = HV_STIMER_DIRECT_MODE_AVAILABLE}
    960         },
    961         .dependencies = BIT(HYPERV_FEAT_STIMER)
    962     },
    963     [HYPERV_FEAT_AVIC] = {
    964         .desc = "AVIC/APICv support (hv-avic/hv-apicv)",
    965         .flags = {
    966             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
    967              .bits = HV_DEPRECATING_AEOI_RECOMMENDED}
    968         }
    969     },
    970 #ifdef CONFIG_SYNDBG
    971     [HYPERV_FEAT_SYNDBG] = {
    972         .desc = "Enable synthetic kernel debugger channel (hv-syndbg)",
    973         .flags = {
    974             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
    975              .bits = HV_FEATURE_DEBUG_MSRS_AVAILABLE}
    976         },
    977         .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_RELAXED)
    978     },
    979 #endif
    980     [HYPERV_FEAT_MSR_BITMAP] = {
    981         .desc = "enlightened MSR-Bitmap (hv-emsr-bitmap)",
    982         .flags = {
    983             {.func = HV_CPUID_NESTED_FEATURES, .reg = R_EAX,
    984              .bits = HV_NESTED_MSR_BITMAP}
    985         }
    986     },
    987     [HYPERV_FEAT_XMM_INPUT] = {
    988         .desc = "XMM fast hypercall input (hv-xmm-input)",
    989         .flags = {
    990             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
    991              .bits = HV_HYPERCALL_XMM_INPUT_AVAILABLE}
    992         }
    993     },
    994     [HYPERV_FEAT_TLBFLUSH_EXT] = {
    995         .desc = "Extended gva ranges for TLB flush hypercalls (hv-tlbflush-ext)",
    996         .flags = {
    997             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
    998              .bits = HV_EXT_GVA_RANGES_FLUSH_AVAILABLE}
    999         },
   1000         .dependencies = BIT(HYPERV_FEAT_TLBFLUSH)
   1001     },
   1002     [HYPERV_FEAT_TLBFLUSH_DIRECT] = {
   1003         .desc = "direct TLB flush (hv-tlbflush-direct)",
   1004         .flags = {
   1005             {.func = HV_CPUID_NESTED_FEATURES, .reg = R_EAX,
   1006              .bits = HV_NESTED_DIRECT_FLUSH}
   1007         },
   1008         .dependencies = BIT(HYPERV_FEAT_VAPIC)
   1009     },
   1010 };
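        /*
         * Each entry above ties an hv-* feature to the Hyper-V CPUID
         * leaf/register/bits that (a) must be reported by the host for the
         * feature to be usable (hyperv_feature_supported()) and (b) are ORed
         * into the guest-visible leaves when the feature is enabled
         * (hv_build_cpuid_leaf()).  'dependencies' is a bitmap of other
         * HYPERV_FEAT_* values that must be enabled first.
         */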
   1011 
   1012 static struct kvm_cpuid2 *try_get_hv_cpuid(CPUState *cs, int max,
   1013                                            bool do_sys_ioctl)
   1014 {
   1015     struct kvm_cpuid2 *cpuid;
   1016     int r, size;
   1017 
   1018     size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
   1019     cpuid = g_malloc0(size);
   1020     cpuid->nent = max;
   1021 
   1022     if (do_sys_ioctl) {
   1023         r = kvm_ioctl(kvm_state, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
   1024     } else {
   1025         r = kvm_vcpu_ioctl(cs, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
   1026     }
   1027     if (r == 0 && cpuid->nent >= max) {
   1028         r = -E2BIG;
   1029     }
   1030     if (r < 0) {
   1031         if (r == -E2BIG) {
   1032             g_free(cpuid);
   1033             return NULL;
   1034         } else {
   1035             fprintf(stderr, "KVM_GET_SUPPORTED_HV_CPUID failed: %s\n",
   1036                     strerror(-r));
   1037             exit(1);
   1038         }
   1039     }
   1040     return cpuid;
   1041 }
   1042 
   1043 /*
   1044  * Run KVM_GET_SUPPORTED_HV_CPUID ioctl(), allocating a buffer large enough
   1045  * for all entries.
   1046  */
   1047 static struct kvm_cpuid2 *get_supported_hv_cpuid(CPUState *cs)
   1048 {
   1049     struct kvm_cpuid2 *cpuid;
   1050     /* 0x40000000..0x40000005, 0x4000000A, 0x40000080..0x40000082 leaves */
   1051     int max = 11;
   1052     int i;
   1053     bool do_sys_ioctl;
   1054 
   1055     do_sys_ioctl =
   1056         kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID) > 0;
   1057 
   1058     /*
   1059      * Non-empty KVM context is needed when KVM_CAP_SYS_HYPERV_CPUID is
   1060      * unsupported; kvm_hyperv_expand_features() checks for that.
   1061      */
   1062     assert(do_sys_ioctl || cs->kvm_state);
   1063 
   1064     /*
   1065      * When the buffer is too small, KVM_GET_SUPPORTED_HV_CPUID fails with
   1066      * -E2BIG, however, it doesn't report back the right size. Keep increasing
   1067      * it and re-trying until we succeed.
   1068      */
   1069     while ((cpuid = try_get_hv_cpuid(cs, max, do_sys_ioctl)) == NULL) {
   1070         max++;
   1071     }
   1072 
   1073     /*
   1074      * KVM_GET_SUPPORTED_HV_CPUID does not set EVMCS CPUID bit before
   1075      * KVM_CAP_HYPERV_ENLIGHTENED_VMCS is enabled but we want to get the
   1076      * information early, just check for the capability and set the bit
   1077      * manually.
   1078      */
   1079     if (!do_sys_ioctl && kvm_check_extension(cs->kvm_state,
   1080                             KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
   1081         for (i = 0; i < cpuid->nent; i++) {
   1082             if (cpuid->entries[i].function == HV_CPUID_ENLIGHTMENT_INFO) {
   1083                 cpuid->entries[i].eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
   1084             }
   1085         }
   1086     }
   1087 
   1088     return cpuid;
   1089 }
   1090 
   1091 /*
   1092  * When KVM_GET_SUPPORTED_HV_CPUID is not supported we fill CPUID feature
   1093  * leaves from KVM_CAP_HYPERV* and present MSRs data.
   1094  */
   1095 static struct kvm_cpuid2 *get_supported_hv_cpuid_legacy(CPUState *cs)
   1096 {
   1097     X86CPU *cpu = X86_CPU(cs);
   1098     struct kvm_cpuid2 *cpuid;
   1099     struct kvm_cpuid_entry2 *entry_feat, *entry_recomm;
   1100 
   1101     /* HV_CPUID_FEATURES, HV_CPUID_ENLIGHTMENT_INFO */
   1102     cpuid = g_malloc0(sizeof(*cpuid) + 2 * sizeof(*cpuid->entries));
   1103     cpuid->nent = 2;
   1104 
   1105     /* HV_CPUID_FEATURES */
   1106     entry_feat = &cpuid->entries[0];
   1107     entry_feat->function = HV_CPUID_FEATURES;
   1108 
   1109     entry_recomm = &cpuid->entries[1];
   1110     entry_recomm->function = HV_CPUID_ENLIGHTMENT_INFO;
   1111     entry_recomm->ebx = cpu->hyperv_spinlock_attempts;
   1112 
   1113     if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0) {
   1114         entry_feat->eax |= HV_HYPERCALL_AVAILABLE;
   1115         entry_feat->eax |= HV_APIC_ACCESS_AVAILABLE;
   1116         entry_feat->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
   1117         entry_recomm->eax |= HV_RELAXED_TIMING_RECOMMENDED;
   1118         entry_recomm->eax |= HV_APIC_ACCESS_RECOMMENDED;
   1119     }
   1120 
   1121     if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) > 0) {
   1122         entry_feat->eax |= HV_TIME_REF_COUNT_AVAILABLE;
   1123         entry_feat->eax |= HV_REFERENCE_TSC_AVAILABLE;
   1124     }
   1125 
   1126     if (has_msr_hv_frequencies) {
   1127         entry_feat->eax |= HV_ACCESS_FREQUENCY_MSRS;
   1128         entry_feat->edx |= HV_FREQUENCY_MSRS_AVAILABLE;
   1129     }
   1130 
   1131     if (has_msr_hv_crash) {
   1132         entry_feat->edx |= HV_GUEST_CRASH_MSR_AVAILABLE;
   1133     }
   1134 
   1135     if (has_msr_hv_reenlightenment) {
   1136         entry_feat->eax |= HV_ACCESS_REENLIGHTENMENTS_CONTROL;
   1137     }
   1138 
   1139     if (has_msr_hv_reset) {
   1140         entry_feat->eax |= HV_RESET_AVAILABLE;
   1141     }
   1142 
   1143     if (has_msr_hv_vpindex) {
   1144         entry_feat->eax |= HV_VP_INDEX_AVAILABLE;
   1145     }
   1146 
   1147     if (has_msr_hv_runtime) {
   1148         entry_feat->eax |= HV_VP_RUNTIME_AVAILABLE;
   1149     }
   1150 
   1151     if (has_msr_hv_synic) {
   1152         unsigned int cap = cpu->hyperv_synic_kvm_only ?
   1153             KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
   1154 
   1155         if (kvm_check_extension(cs->kvm_state, cap) > 0) {
   1156             entry_feat->eax |= HV_SYNIC_AVAILABLE;
   1157         }
   1158     }
   1159 
   1160     if (has_msr_hv_stimer) {
   1161         entry_feat->eax |= HV_SYNTIMERS_AVAILABLE;
   1162     }
   1163 
   1164     if (has_msr_hv_syndbg_options) {
   1165         entry_feat->edx |= HV_GUEST_DEBUGGING_AVAILABLE;
   1166         entry_feat->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE;
   1167         entry_feat->ebx |= HV_PARTITION_DEBUGGING_ALLOWED;
   1168     }
   1169 
   1170     if (kvm_check_extension(cs->kvm_state,
   1171                             KVM_CAP_HYPERV_TLBFLUSH) > 0) {
   1172         entry_recomm->eax |= HV_REMOTE_TLB_FLUSH_RECOMMENDED;
   1173         entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
   1174     }
   1175 
   1176     if (kvm_check_extension(cs->kvm_state,
   1177                             KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
   1178         entry_recomm->eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
   1179     }
   1180 
   1181     if (kvm_check_extension(cs->kvm_state,
   1182                             KVM_CAP_HYPERV_SEND_IPI) > 0) {
   1183         entry_recomm->eax |= HV_CLUSTER_IPI_RECOMMENDED;
   1184         entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
   1185     }
   1186 
   1187     return cpuid;
   1188 }
   1189 
   1190 static uint32_t hv_cpuid_get_host(CPUState *cs, uint32_t func, int reg)
   1191 {
   1192     struct kvm_cpuid_entry2 *entry;
   1193     struct kvm_cpuid2 *cpuid;
   1194 
   1195     if (hv_cpuid_cache) {
   1196         cpuid = hv_cpuid_cache;
   1197     } else {
   1198         if (kvm_check_extension(kvm_state, KVM_CAP_HYPERV_CPUID) > 0) {
   1199             cpuid = get_supported_hv_cpuid(cs);
   1200         } else {
   1201             /*
   1202              * 'cs->kvm_state' may be NULL when Hyper-V features are expanded
   1203              * before KVM context is created but this is only done when
   1204              * KVM_CAP_SYS_HYPERV_CPUID is supported and it implies
   1205              * KVM_CAP_HYPERV_CPUID.
   1206              */
   1207             assert(cs->kvm_state);
   1208 
   1209             cpuid = get_supported_hv_cpuid_legacy(cs);
   1210         }
   1211         hv_cpuid_cache = cpuid;
   1212     }
   1213 
   1214     if (!cpuid) {
   1215         return 0;
   1216     }
   1217 
   1218     entry = cpuid_find_entry(cpuid, func, 0);
   1219     if (!entry) {
   1220         return 0;
   1221     }
   1222 
   1223     return cpuid_entry_get_reg(entry, reg);
   1224 }
   1225 
   1226 static bool hyperv_feature_supported(CPUState *cs, int feature)
   1227 {
   1228     uint32_t func, bits;
   1229     int i, reg;
   1230 
   1231     for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties[feature].flags); i++) {
   1232 
   1233         func = kvm_hyperv_properties[feature].flags[i].func;
   1234         reg = kvm_hyperv_properties[feature].flags[i].reg;
   1235         bits = kvm_hyperv_properties[feature].flags[i].bits;
   1236 
   1237         if (!func) {
   1238             continue;
   1239         }
   1240 
   1241         if ((hv_cpuid_get_host(cs, func, reg) & bits) != bits) {
   1242             return false;
   1243         }
   1244     }
   1245 
   1246     return true;
   1247 }
   1248 
   1249 /* Checks that all feature dependencies are enabled */
   1250 static bool hv_feature_check_deps(X86CPU *cpu, int feature, Error **errp)
   1251 {
   1252     uint64_t deps;
   1253     int dep_feat;
   1254 
   1255     deps = kvm_hyperv_properties[feature].dependencies;
   1256     while (deps) {
   1257         dep_feat = ctz64(deps);
   1258         if (!(hyperv_feat_enabled(cpu, dep_feat))) {
   1259             error_setg(errp, "Hyper-V %s requires Hyper-V %s",
   1260                        kvm_hyperv_properties[feature].desc,
   1261                        kvm_hyperv_properties[dep_feat].desc);
   1262             return false;
   1263         }
   1264         deps &= ~(1ull << dep_feat);
   1265     }
   1266 
   1267     return true;
   1268 }
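        /*
         * The loop above walks the dependency bitmap one bit at a time:
         * ctz64() yields the lowest set bit's index (a HYPERV_FEAT_* value),
         * which is then cleared.  E.g. for HYPERV_FEAT_STIMER, deps =
         * BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_TIME), so both hv-synic and
         * hv-time must already be enabled.
         */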
   1269 
   1270 static uint32_t hv_build_cpuid_leaf(CPUState *cs, uint32_t func, int reg)
   1271 {
   1272     X86CPU *cpu = X86_CPU(cs);
   1273     uint32_t r = 0;
   1274     int i, j;
   1275 
   1276     for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties); i++) {
   1277         if (!hyperv_feat_enabled(cpu, i)) {
   1278             continue;
   1279         }
   1280 
   1281         for (j = 0; j < ARRAY_SIZE(kvm_hyperv_properties[i].flags); j++) {
   1282             if (kvm_hyperv_properties[i].flags[j].func != func) {
   1283                 continue;
   1284             }
   1285             if (kvm_hyperv_properties[i].flags[j].reg != reg) {
   1286                 continue;
   1287             }
   1288 
   1289             r |= kvm_hyperv_properties[i].flags[j].bits;
   1290         }
   1291     }
   1292 
   1293     /* HV_CPUID_NESTED_FEATURES.EAX also encodes the supported eVMCS range */
   1294     if (func == HV_CPUID_NESTED_FEATURES && reg == R_EAX) {
   1295         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
   1296             r |= DEFAULT_EVMCS_VERSION;
   1297         }
   1298     }
   1299 
   1300     return r;
   1301 }
   1302 
   1303 /*
   1304  * Expand Hyper-V CPU features. In particular, check that all the requested
   1305  * features are supported by the host and the sanity of the configuration
   1306  * (that all the required dependencies are included). Also, this takes care
   1307  * of 'hv_passthrough' mode and fills the environment with all supported
   1308  * Hyper-V features.
   1309  */
   1310 bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp)
   1311 {
   1312     CPUState *cs = CPU(cpu);
   1313     Error *local_err = NULL;
   1314     int feat;
   1315 
   1316     if (!hyperv_enabled(cpu))
   1317         return true;
   1318 
   1319     /*
   1320      * When kvm_hyperv_expand_features is called at CPU feature expansion
   1321      * time per-CPU kvm_state is not available yet so we can only proceed
   1322      * when KVM_CAP_SYS_HYPERV_CPUID is supported.
   1323      */
   1324     if (!cs->kvm_state &&
   1325         !kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID))
   1326         return true;
   1327 
   1328     if (cpu->hyperv_passthrough) {
   1329         cpu->hyperv_vendor_id[0] =
   1330             hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EBX);
   1331         cpu->hyperv_vendor_id[1] =
   1332             hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_ECX);
   1333         cpu->hyperv_vendor_id[2] =
   1334             hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EDX);
   1335         cpu->hyperv_vendor = g_realloc(cpu->hyperv_vendor,
   1336                                        sizeof(cpu->hyperv_vendor_id) + 1);
   1337         memcpy(cpu->hyperv_vendor, cpu->hyperv_vendor_id,
   1338                sizeof(cpu->hyperv_vendor_id));
   1339         cpu->hyperv_vendor[sizeof(cpu->hyperv_vendor_id)] = 0;
   1340 
   1341         cpu->hyperv_interface_id[0] =
   1342             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EAX);
   1343         cpu->hyperv_interface_id[1] =
   1344             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EBX);
   1345         cpu->hyperv_interface_id[2] =
   1346             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_ECX);
   1347         cpu->hyperv_interface_id[3] =
   1348             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EDX);
   1349 
   1350         cpu->hyperv_ver_id_build =
   1351             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EAX);
   1352         cpu->hyperv_ver_id_major =
   1353             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) >> 16;
   1354         cpu->hyperv_ver_id_minor =
   1355             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) & 0xffff;
   1356         cpu->hyperv_ver_id_sp =
   1357             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_ECX);
   1358         cpu->hyperv_ver_id_sb =
   1359             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) >> 24;
   1360         cpu->hyperv_ver_id_sn =
   1361             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) & 0xffffff;
   1362 
   1363         cpu->hv_max_vps = hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS,
   1364                                             R_EAX);
   1365         cpu->hyperv_limits[0] =
   1366             hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EBX);
   1367         cpu->hyperv_limits[1] =
   1368             hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_ECX);
   1369         cpu->hyperv_limits[2] =
   1370             hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EDX);
   1371 
   1372         cpu->hyperv_spinlock_attempts =
   1373             hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EBX);
   1374 
   1375         /*
   1376          * Mark feature as enabled in 'cpu->hyperv_features' as
   1377          * hv_build_cpuid_leaf() uses this info to build guest CPUIDs.
   1378          */
   1379         for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
   1380             if (hyperv_feature_supported(cs, feat)) {
   1381                 cpu->hyperv_features |= BIT(feat);
   1382             }
   1383         }
   1384     } else {
   1385         /* Check features availability and dependencies */
   1386         for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
   1387             /* If the feature was not requested skip it. */
   1388             if (!hyperv_feat_enabled(cpu, feat)) {
   1389                 continue;
   1390             }
   1391 
   1392             /* Check if the feature is supported by KVM */
   1393             if (!hyperv_feature_supported(cs, feat)) {
   1394                 error_setg(errp, "Hyper-V %s is not supported by kernel",
   1395                            kvm_hyperv_properties[feat].desc);
   1396                 return false;
   1397             }
   1398 
   1399             /* Check dependencies */
   1400             if (!hv_feature_check_deps(cpu, feat, &local_err)) {
   1401                 error_propagate(errp, local_err);
   1402                 return false;
   1403             }
   1404         }
   1405     }
   1406 
   1407     /* Additional dependencies not covered by kvm_hyperv_properties[] */
   1408     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
   1409         !cpu->hyperv_synic_kvm_only &&
   1410         !hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)) {
   1411         error_setg(errp, "Hyper-V %s requires Hyper-V %s",
   1412                    kvm_hyperv_properties[HYPERV_FEAT_SYNIC].desc,
   1413                    kvm_hyperv_properties[HYPERV_FEAT_VPINDEX].desc);
   1414         return false;
   1415     }
   1416 
   1417     return true;
   1418 }
   1419 
   1420 /*
   1421  * Fill in Hyper-V CPUIDs. Returns the number of entries filled in cpuid_ent.
   1422  */
   1423 static int hyperv_fill_cpuids(CPUState *cs,
   1424                               struct kvm_cpuid_entry2 *cpuid_ent)
   1425 {
   1426     X86CPU *cpu = X86_CPU(cs);
   1427     struct kvm_cpuid_entry2 *c;
   1428     uint32_t signature[3];
   1429     uint32_t cpuid_i = 0, max_cpuid_leaf = 0;
   1430     uint32_t nested_eax =
   1431         hv_build_cpuid_leaf(cs, HV_CPUID_NESTED_FEATURES, R_EAX);
   1432 
   1433     max_cpuid_leaf = nested_eax ? HV_CPUID_NESTED_FEATURES :
   1434         HV_CPUID_IMPLEMENT_LIMITS;
   1435 
   1436     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG)) {
   1437         max_cpuid_leaf =
   1438             MAX(max_cpuid_leaf, HV_CPUID_SYNDBG_PLATFORM_CAPABILITIES);
   1439     }
   1440 
   1441     c = &cpuid_ent[cpuid_i++];
   1442     c->function = HV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
   1443     c->eax = max_cpuid_leaf;
   1444     c->ebx = cpu->hyperv_vendor_id[0];
   1445     c->ecx = cpu->hyperv_vendor_id[1];
   1446     c->edx = cpu->hyperv_vendor_id[2];
   1447 
   1448     c = &cpuid_ent[cpuid_i++];
   1449     c->function = HV_CPUID_INTERFACE;
   1450     c->eax = cpu->hyperv_interface_id[0];
   1451     c->ebx = cpu->hyperv_interface_id[1];
   1452     c->ecx = cpu->hyperv_interface_id[2];
   1453     c->edx = cpu->hyperv_interface_id[3];
   1454 
   1455     c = &cpuid_ent[cpuid_i++];
   1456     c->function = HV_CPUID_VERSION;
   1457     c->eax = cpu->hyperv_ver_id_build;
   1458     c->ebx = (uint32_t)cpu->hyperv_ver_id_major << 16 |
   1459         cpu->hyperv_ver_id_minor;
   1460     c->ecx = cpu->hyperv_ver_id_sp;
   1461     c->edx = (uint32_t)cpu->hyperv_ver_id_sb << 24 |
   1462         (cpu->hyperv_ver_id_sn & 0xffffff);
   1463 
   1464     c = &cpuid_ent[cpuid_i++];
   1465     c->function = HV_CPUID_FEATURES;
   1466     c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EAX);
   1467     c->ebx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EBX);
   1468     c->edx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EDX);
   1469 
   1470     /* Unconditionally required with any Hyper-V enlightenment */
   1471     c->eax |= HV_HYPERCALL_AVAILABLE;
   1472 
   1473     /* SynIC and Vmbus devices require messages/signals hypercalls */
   1474     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
   1475         !cpu->hyperv_synic_kvm_only) {
   1476         c->ebx |= HV_POST_MESSAGES | HV_SIGNAL_EVENTS;
   1477     }
   1478 
   1479 
   1480     /* Not exposed by KVM but needed to make CPU hotplug in Windows work */
   1481     c->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
   1482 
   1483     c = &cpuid_ent[cpuid_i++];
   1484     c->function = HV_CPUID_ENLIGHTMENT_INFO;
   1485     c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX);
   1486     c->ebx = cpu->hyperv_spinlock_attempts;
   1487 
   1488     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC) &&
   1489         !hyperv_feat_enabled(cpu, HYPERV_FEAT_AVIC)) {
   1490         c->eax |= HV_APIC_ACCESS_RECOMMENDED;
   1491     }
   1492 
   1493     if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_ON) {
   1494         c->eax |= HV_NO_NONARCH_CORESHARING;
   1495     } else if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO) {
   1496         c->eax |= hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX) &
   1497             HV_NO_NONARCH_CORESHARING;
   1498     }
   1499 
   1500     c = &cpuid_ent[cpuid_i++];
   1501     c->function = HV_CPUID_IMPLEMENT_LIMITS;
   1502     c->eax = cpu->hv_max_vps;
   1503     c->ebx = cpu->hyperv_limits[0];
   1504     c->ecx = cpu->hyperv_limits[1];
   1505     c->edx = cpu->hyperv_limits[2];
   1506 
   1507     if (nested_eax) {
   1508         uint32_t function;
   1509 
   1510         /* Create zeroed 0x40000006..0x40000009 leaves */
   1511         for (function = HV_CPUID_IMPLEMENT_LIMITS + 1;
   1512              function < HV_CPUID_NESTED_FEATURES; function++) {
   1513             c = &cpuid_ent[cpuid_i++];
   1514             c->function = function;
   1515         }
   1516 
   1517         c = &cpuid_ent[cpuid_i++];
   1518         c->function = HV_CPUID_NESTED_FEATURES;
   1519         c->eax = nested_eax;
   1520     }
   1521 
   1522     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG)) {
   1523         c = &cpuid_ent[cpuid_i++];
   1524         c->function = HV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS;
   1525         c->eax = hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ?
   1526             HV_CPUID_NESTED_FEATURES : HV_CPUID_IMPLEMENT_LIMITS;
   1527         memcpy(signature, "Microsoft VS", 12);
   1528         c->eax = 0;
   1529         c->ebx = signature[0];
   1530         c->ecx = signature[1];
   1531         c->edx = signature[2];
   1532 
   1533         c = &cpuid_ent[cpuid_i++];
   1534         c->function = HV_CPUID_SYNDBG_INTERFACE;
   1535         memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12);
   1536         c->eax = signature[0];
   1537         c->ebx = 0;
   1538         c->ecx = 0;
   1539         c->edx = 0;
   1540 
   1541         c = &cpuid_ent[cpuid_i++];
   1542         c->function = HV_CPUID_SYNDBG_PLATFORM_CAPABILITIES;
   1543         c->eax = HV_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
   1544         c->ebx = 0;
   1545         c->ecx = 0;
   1546         c->edx = 0;
   1547     }
   1548 
   1549     return cpuid_i;
   1550 }
   1551 
   1552 static Error *hv_passthrough_mig_blocker;
   1553 static Error *hv_no_nonarch_cs_mig_blocker;
   1554 
   1555 /* Checks that the exposed eVMCS version range is supported by KVM */
   1556 static bool evmcs_version_supported(uint16_t evmcs_version,
   1557                                     uint16_t supported_evmcs_version)
   1558 {
   1559     uint8_t min_version = evmcs_version & 0xff;
   1560     uint8_t max_version = evmcs_version >> 8;
   1561     uint8_t min_supported_version = supported_evmcs_version & 0xff;
   1562     uint8_t max_supported_version = supported_evmcs_version >> 8;
   1563 
   1564     return (min_version >= min_supported_version) &&
   1565         (max_version <= max_supported_version);
   1566 }
   1567 
   1568 static int hyperv_init_vcpu(X86CPU *cpu)
   1569 {
   1570     CPUState *cs = CPU(cpu);
   1571     Error *local_err = NULL;
   1572     int ret;
   1573 
   1574     if (cpu->hyperv_passthrough && hv_passthrough_mig_blocker == NULL) {
   1575         error_setg(&hv_passthrough_mig_blocker,
   1576                    "'hv-passthrough' CPU flag prevents migration, use explicit"
   1577                    " set of hv-* flags instead");
   1578         ret = migrate_add_blocker(hv_passthrough_mig_blocker, &local_err);
   1579         if (ret < 0) {
   1580             error_report_err(local_err);
   1581             return ret;
   1582         }
   1583     }
   1584 
   1585     if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO &&
   1586         hv_no_nonarch_cs_mig_blocker == NULL) {
   1587         error_setg(&hv_no_nonarch_cs_mig_blocker,
    1588                    "'hv-no-nonarch-coresharing=auto' CPU flag prevents migration,"
   1589                    " use explicit 'hv-no-nonarch-coresharing=on' instead (but"
   1590                    " make sure SMT is disabled and/or that vCPUs are properly"
   1591                    " pinned)");
   1592         ret = migrate_add_blocker(hv_no_nonarch_cs_mig_blocker, &local_err);
   1593         if (ret < 0) {
   1594             error_report_err(local_err);
   1595             return ret;
   1596         }
   1597     }
   1598 
   1599     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) && !hv_vpindex_settable) {
   1600         /*
   1601          * the kernel doesn't support setting vp_index; assert that its value
   1602          * is in sync
   1603          */
   1604         uint64_t value;
   1605 
   1606         ret = kvm_get_one_msr(cpu, HV_X64_MSR_VP_INDEX, &value);
   1607         if (ret < 0) {
   1608             return ret;
   1609         }
   1610 
   1611         if (value != hyperv_vp_index(CPU(cpu))) {
   1612             error_report("kernel's vp_index != QEMU's vp_index");
   1613             return -ENXIO;
   1614         }
   1615     }
   1616 
   1617     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
   1618         uint32_t synic_cap = cpu->hyperv_synic_kvm_only ?
   1619             KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
   1620         ret = kvm_vcpu_enable_cap(cs, synic_cap, 0);
   1621         if (ret < 0) {
   1622             error_report("failed to turn on HyperV SynIC in KVM: %s",
   1623                          strerror(-ret));
   1624             return ret;
   1625         }
   1626 
   1627         if (!cpu->hyperv_synic_kvm_only) {
   1628             ret = hyperv_x86_synic_add(cpu);
   1629             if (ret < 0) {
   1630                 error_report("failed to create HyperV SynIC: %s",
   1631                              strerror(-ret));
   1632                 return ret;
   1633             }
   1634         }
   1635     }
   1636 
   1637     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
   1638         uint16_t evmcs_version = DEFAULT_EVMCS_VERSION;
   1639         uint16_t supported_evmcs_version;
   1640 
   1641         ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, 0,
   1642                                   (uintptr_t)&supported_evmcs_version);
   1643 
   1644         /*
    1645          * KVM is required to support EVMCS ver.1, as that's what the
    1646          * 'hv-evmcs' option sets. Note: we hardcode the maximum supported
    1647          * eVMCS version to '1' as well, so the 'hv-evmcs' feature stays
    1648          * migratable even when (and if) ver.2 is implemented. A new option
    1649          * (e.g. 'hv-evmcs=2') will then have to be added.
   1650          */
   1651         if (ret < 0) {
   1652             error_report("Hyper-V %s is not supported by kernel",
   1653                          kvm_hyperv_properties[HYPERV_FEAT_EVMCS].desc);
   1654             return ret;
   1655         }
   1656 
   1657         if (!evmcs_version_supported(evmcs_version, supported_evmcs_version)) {
   1658             error_report("eVMCS version range [%d..%d] is not supported by "
   1659                          "kernel (supported: [%d..%d])", evmcs_version & 0xff,
   1660                          evmcs_version >> 8, supported_evmcs_version & 0xff,
   1661                          supported_evmcs_version >> 8);
   1662             return -ENOTSUP;
   1663         }
   1664     }
   1665 
   1666     if (cpu->hyperv_enforce_cpuid) {
   1667         ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENFORCE_CPUID, 0, 1);
   1668         if (ret < 0) {
   1669             error_report("failed to enable KVM_CAP_HYPERV_ENFORCE_CPUID: %s",
   1670                          strerror(-ret));
   1671             return ret;
   1672         }
   1673     }
   1674 
   1675     return 0;
   1676 }
   1677 
   1678 static Error *invtsc_mig_blocker;
   1679 
   1680 #define KVM_MAX_CPUID_ENTRIES  100
   1681 
   1682 static void kvm_init_xsave(CPUX86State *env)
   1683 {
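             /*
              * KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) returns the size of the state
              * buffer needed for KVM_GET_XSAVE2; round it up to a page below.
              */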
   1684     if (has_xsave2) {
   1685         env->xsave_buf_len = QEMU_ALIGN_UP(has_xsave2, 4096);
   1686     } else if (has_xsave) {
   1687         env->xsave_buf_len = sizeof(struct kvm_xsave);
   1688     } else {
   1689         return;
   1690     }
   1691 
   1692     env->xsave_buf = qemu_memalign(4096, env->xsave_buf_len);
   1693     memset(env->xsave_buf, 0, env->xsave_buf_len);
   1694     /*
   1695      * The allocated storage must be large enough for all of the
   1696      * possible XSAVE state components.
   1697      */
   1698     assert(kvm_arch_get_supported_cpuid(kvm_state, 0xd, 0, R_ECX) <=
   1699            env->xsave_buf_len);
   1700 }
   1701 
   1702 static void kvm_init_nested_state(CPUX86State *env)
   1703 {
   1704     struct kvm_vmx_nested_state_hdr *vmx_hdr;
   1705     uint32_t size;
   1706 
   1707     if (!env->nested_state) {
   1708         return;
   1709     }
   1710 
   1711     size = env->nested_state->size;
   1712 
   1713     memset(env->nested_state, 0, size);
   1714     env->nested_state->size = size;
   1715 
   1716     if (cpu_has_vmx(env)) {
   1717         env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
   1718         vmx_hdr = &env->nested_state->hdr.vmx;
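                 /* -1ull marks "no VMXON region" / "no current VMCS" for KVM */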
   1719         vmx_hdr->vmxon_pa = -1ull;
   1720         vmx_hdr->vmcs12_pa = -1ull;
   1721     } else if (cpu_has_svm(env)) {
   1722         env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM;
   1723     }
   1724 }
   1725 
   1726 int kvm_arch_init_vcpu(CPUState *cs)
   1727 {
   1728     struct {
   1729         struct kvm_cpuid2 cpuid;
   1730         struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES];
   1731     } cpuid_data;
   1732     /*
   1733      * The kernel defines these structs with padding fields so there
   1734      * should be no extra padding in our cpuid_data struct.
   1735      */
   1736     QEMU_BUILD_BUG_ON(sizeof(cpuid_data) !=
   1737                       sizeof(struct kvm_cpuid2) +
   1738                       sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES);
   1739 
   1740     X86CPU *cpu = X86_CPU(cs);
   1741     CPUX86State *env = &cpu->env;
   1742     uint32_t limit, i, j, cpuid_i;
   1743     uint32_t unused;
   1744     struct kvm_cpuid_entry2 *c;
   1745     uint32_t signature[3];
   1746     int kvm_base = KVM_CPUID_SIGNATURE;
   1747     int max_nested_state_len;
   1748     int r;
   1749     Error *local_err = NULL;
   1750 
   1751     memset(&cpuid_data, 0, sizeof(cpuid_data));
   1752 
   1753     cpuid_i = 0;
   1754 
   1755     has_xsave2 = kvm_check_extension(cs->kvm_state, KVM_CAP_XSAVE2);
   1756 
   1757     r = kvm_arch_set_tsc_khz(cs);
   1758     if (r < 0) {
   1759         return r;
   1760     }
   1761 
    1762     /* The vCPU's TSC frequency is either specified by the user or, if
    1763      * absent, follows the value used by KVM. In the latter case, we query
    1764      * it from KVM and record it in env->tsc_khz, so that the vCPU's TSC
    1765      * frequency can be migrated later via this field.
   1766      */
   1767     if (!env->tsc_khz) {
   1768         r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
   1769             kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
   1770             -ENOTSUP;
   1771         if (r > 0) {
   1772             env->tsc_khz = r;
   1773         }
   1774     }
   1775 
   1776     env->apic_bus_freq = KVM_APIC_BUS_FREQUENCY;
   1777 
   1778     /*
   1779      * kvm_hyperv_expand_features() is called here for the second time in case
   1780      * KVM_CAP_SYS_HYPERV_CPUID is not supported. While we can't possibly handle
   1781      * 'query-cpu-model-expansion' in this case as we don't have a KVM vCPU to
   1782      * check which Hyper-V enlightenments are supported and which are not, we
   1783      * can still proceed and check/expand Hyper-V enlightenments here so legacy
   1784      * behavior is preserved.
   1785      */
   1786     if (!kvm_hyperv_expand_features(cpu, &local_err)) {
   1787         error_report_err(local_err);
   1788         return -ENOSYS;
   1789     }
   1790 
   1791     if (hyperv_enabled(cpu)) {
   1792         r = hyperv_init_vcpu(cpu);
   1793         if (r) {
   1794             return r;
   1795         }
   1796 
   1797         cpuid_i = hyperv_fill_cpuids(cs, cpuid_data.entries);
   1798         kvm_base = KVM_CPUID_SIGNATURE_NEXT;
   1799         has_msr_hv_hypercall = true;
   1800     }
   1801 
   1802     if (cpu->expose_kvm) {
   1803         memcpy(signature, "KVMKVMKVM\0\0\0", 12);
   1804         c = &cpuid_data.entries[cpuid_i++];
   1805         c->function = KVM_CPUID_SIGNATURE | kvm_base;
   1806         c->eax = KVM_CPUID_FEATURES | kvm_base;
   1807         c->ebx = signature[0];
   1808         c->ecx = signature[1];
   1809         c->edx = signature[2];
   1810 
   1811         c = &cpuid_data.entries[cpuid_i++];
   1812         c->function = KVM_CPUID_FEATURES | kvm_base;
   1813         c->eax = env->features[FEAT_KVM];
   1814         c->edx = env->features[FEAT_KVM_HINTS];
   1815     }
   1816 
   1817     cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);
   1818 
   1819     if (cpu->kvm_pv_enforce_cpuid) {
   1820         r = kvm_vcpu_enable_cap(cs, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 0, 1);
   1821         if (r < 0) {
   1822             fprintf(stderr,
   1823                     "failed to enable KVM_CAP_ENFORCE_PV_FEATURE_CPUID: %s",
   1824                     strerror(-r));
   1825             abort();
   1826         }
   1827     }
   1828 
   1829     for (i = 0; i <= limit; i++) {
   1830         if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
   1831             fprintf(stderr, "unsupported level value: 0x%x\n", limit);
   1832             abort();
   1833         }
   1834         c = &cpuid_data.entries[cpuid_i++];
   1835 
   1836         switch (i) {
   1837         case 2: {
   1838             /* Keep reading function 2 till all the input is received */
   1839             int times;
   1840 
   1841             c->function = i;
   1842             c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
   1843                        KVM_CPUID_FLAG_STATE_READ_NEXT;
   1844             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
   1845             times = c->eax & 0xff;
   1846 
   1847             for (j = 1; j < times; ++j) {
   1848                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
   1849                     fprintf(stderr, "cpuid_data is full, no space for "
    1850                             "cpuid(eax:2):eax & 0xff = 0x%x\n", times);
   1851                     abort();
   1852                 }
   1853                 c = &cpuid_data.entries[cpuid_i++];
   1854                 c->function = i;
   1855                 c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
   1856                 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
   1857             }
   1858             break;
   1859         }
   1860         case 0x1f:
   1861             if (env->nr_dies < 2) {
   1862                 break;
   1863             }
   1864             /* fallthrough */
   1865         case 4:
   1866         case 0xb:
   1867         case 0xd:
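                     /*
                      * These leaves take ECX sub-indices; enumerate sub-leaves until
                      * a leaf-specific terminator (0xd and 0x1f are also capped at 64).
                      */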
   1868             for (j = 0; ; j++) {
   1869                 if (i == 0xd && j == 64) {
   1870                     break;
   1871                 }
   1872 
   1873                 if (i == 0x1f && j == 64) {
   1874                     break;
   1875                 }
   1876 
   1877                 c->function = i;
   1878                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
   1879                 c->index = j;
   1880                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
   1881 
   1882                 if (i == 4 && c->eax == 0) {
   1883                     break;
   1884                 }
   1885                 if (i == 0xb && !(c->ecx & 0xff00)) {
   1886                     break;
   1887                 }
   1888                 if (i == 0x1f && !(c->ecx & 0xff00)) {
   1889                     break;
   1890                 }
   1891                 if (i == 0xd && c->eax == 0) {
   1892                     continue;
   1893                 }
   1894                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
   1895                     fprintf(stderr, "cpuid_data is full, no space for "
   1896                             "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
   1897                     abort();
   1898                 }
   1899                 c = &cpuid_data.entries[cpuid_i++];
   1900             }
   1901             break;
   1902         case 0x7:
   1903         case 0x12:
   1904             for (j = 0; ; j++) {
   1905                 c->function = i;
   1906                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
   1907                 c->index = j;
   1908                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
   1909 
   1910                 if (j > 1 && (c->eax & 0xf) != 1) {
   1911                     break;
   1912                 }
   1913 
   1914                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
    1915                     fprintf(stderr, "cpuid_data is full, no space for "
    1916                                 "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
   1917                     abort();
   1918                 }
   1919                 c = &cpuid_data.entries[cpuid_i++];
   1920             }
   1921             break;
   1922         case 0x14:
   1923         case 0x1d:
   1924         case 0x1e: {
   1925             uint32_t times;
   1926 
   1927             c->function = i;
   1928             c->index = 0;
   1929             c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
   1930             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
   1931             times = c->eax;
   1932 
   1933             for (j = 1; j <= times; ++j) {
   1934                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
   1935                     fprintf(stderr, "cpuid_data is full, no space for "
   1936                                 "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
   1937                     abort();
   1938                 }
   1939                 c = &cpuid_data.entries[cpuid_i++];
   1940                 c->function = i;
   1941                 c->index = j;
   1942                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
   1943                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
   1944             }
   1945             break;
   1946         }
   1947         default:
   1948             c->function = i;
   1949             c->flags = 0;
   1950             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
   1951             if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
   1952                 /*
   1953                  * KVM already returns all zeroes if a CPUID entry is missing,
   1954                  * so we can omit it and avoid hitting KVM's 80-entry limit.
   1955                  */
   1956                 cpuid_i--;
   1957             }
   1958             break;
   1959         }
   1960     }
   1961 
   1962     if (limit >= 0x0a) {
   1963         uint32_t eax, edx;
   1964 
   1965         cpu_x86_cpuid(env, 0x0a, 0, &eax, &unused, &unused, &edx);
   1966 
   1967         has_architectural_pmu_version = eax & 0xff;
   1968         if (has_architectural_pmu_version > 0) {
   1969             num_architectural_pmu_gp_counters = (eax & 0xff00) >> 8;
   1970 
   1971             /* Shouldn't be more than 32, since that's the number of bits
   1972              * available in EBX to tell us _which_ counters are available.
   1973              * Play it safe.
   1974              */
   1975             if (num_architectural_pmu_gp_counters > MAX_GP_COUNTERS) {
   1976                 num_architectural_pmu_gp_counters = MAX_GP_COUNTERS;
   1977             }
   1978 
   1979             if (has_architectural_pmu_version > 1) {
   1980                 num_architectural_pmu_fixed_counters = edx & 0x1f;
   1981 
   1982                 if (num_architectural_pmu_fixed_counters > MAX_FIXED_COUNTERS) {
   1983                     num_architectural_pmu_fixed_counters = MAX_FIXED_COUNTERS;
   1984                 }
   1985             }
   1986         }
   1987     }
   1988 
   1989     cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);
   1990 
   1991     for (i = 0x80000000; i <= limit; i++) {
   1992         if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
   1993             fprintf(stderr, "unsupported xlevel value: 0x%x\n", limit);
   1994             abort();
   1995         }
   1996         c = &cpuid_data.entries[cpuid_i++];
   1997 
   1998         switch (i) {
   1999         case 0x8000001d:
   2000             /* Query for all AMD cache information leaves */
   2001             for (j = 0; ; j++) {
   2002                 c->function = i;
   2003                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
   2004                 c->index = j;
   2005                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
   2006 
   2007                 if (c->eax == 0) {
   2008                     break;
   2009                 }
   2010                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
   2011                     fprintf(stderr, "cpuid_data is full, no space for "
   2012                             "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
   2013                     abort();
   2014                 }
   2015                 c = &cpuid_data.entries[cpuid_i++];
   2016             }
   2017             break;
   2018         default:
   2019             c->function = i;
   2020             c->flags = 0;
   2021             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
   2022             if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
   2023                 /*
   2024                  * KVM already returns all zeroes if a CPUID entry is missing,
   2025                  * so we can omit it and avoid hitting KVM's 80-entry limit.
   2026                  */
   2027                 cpuid_i--;
   2028             }
   2029             break;
   2030         }
   2031     }
   2032 
    2033     /* Call Centaur's CPUID instructions if they are supported. */
   2034     if (env->cpuid_xlevel2 > 0) {
   2035         cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused);
   2036 
   2037         for (i = 0xC0000000; i <= limit; i++) {
   2038             if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
   2039                 fprintf(stderr, "unsupported xlevel2 value: 0x%x\n", limit);
   2040                 abort();
   2041             }
   2042             c = &cpuid_data.entries[cpuid_i++];
   2043 
   2044             c->function = i;
   2045             c->flags = 0;
   2046             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
   2047         }
   2048     }
   2049 
   2050     cpuid_data.cpuid.nent = cpuid_i;
   2051 
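             /*
              * Set up MCE emulation only when the guest CPU (family >= 6) exposes
              * MCE/MCA and the kernel supports KVM_CAP_MCE.
              */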
   2052     if (((env->cpuid_version >> 8)&0xF) >= 6
   2053         && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
   2054            (CPUID_MCE | CPUID_MCA)
   2055         && kvm_check_extension(cs->kvm_state, KVM_CAP_MCE) > 0) {
   2056         uint64_t mcg_cap, unsupported_caps;
   2057         int banks;
   2058         int ret;
   2059 
   2060         ret = kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks);
   2061         if (ret < 0) {
   2062             fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret));
   2063             return ret;
   2064         }
   2065 
   2066         if (banks < (env->mcg_cap & MCG_CAP_BANKS_MASK)) {
   2067             error_report("kvm: Unsupported MCE bank count (QEMU = %d, KVM = %d)",
   2068                          (int)(env->mcg_cap & MCG_CAP_BANKS_MASK), banks);
   2069             return -ENOTSUP;
   2070         }
   2071 
   2072         unsupported_caps = env->mcg_cap & ~(mcg_cap | MCG_CAP_BANKS_MASK);
   2073         if (unsupported_caps) {
   2074             if (unsupported_caps & MCG_LMCE_P) {
   2075                 error_report("kvm: LMCE not supported");
   2076                 return -ENOTSUP;
   2077             }
   2078             warn_report("Unsupported MCG_CAP bits: 0x%" PRIx64,
   2079                         unsupported_caps);
   2080         }
   2081 
   2082         env->mcg_cap &= mcg_cap | MCG_CAP_BANKS_MASK;
   2083         ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &env->mcg_cap);
   2084         if (ret < 0) {
   2085             fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret));
   2086             return ret;
   2087         }
   2088     }
   2089 
   2090     cpu->vmsentry = qemu_add_vm_change_state_handler(cpu_update_state, env);
   2091 
   2092     c = cpuid_find_entry(&cpuid_data.cpuid, 1, 0);
   2093     if (c) {
   2094         has_msr_feature_control = !!(c->ecx & CPUID_EXT_VMX) ||
   2095                                   !!(c->ecx & CPUID_EXT_SMX);
   2096     }
   2097 
   2098     c = cpuid_find_entry(&cpuid_data.cpuid, 7, 0);
   2099     if (c && (c->ebx & CPUID_7_0_EBX_SGX)) {
   2100         has_msr_feature_control = true;
   2101     }
   2102 
   2103     if (env->mcg_cap & MCG_LMCE_P) {
   2104         has_msr_mcg_ext_ctl = has_msr_feature_control = true;
   2105     }
   2106 
   2107     if (!env->user_tsc_khz) {
   2108         if ((env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
   2109             invtsc_mig_blocker == NULL) {
   2110             error_setg(&invtsc_mig_blocker,
   2111                        "State blocked by non-migratable CPU device"
   2112                        " (invtsc flag)");
   2113             r = migrate_add_blocker(invtsc_mig_blocker, &local_err);
   2114             if (r < 0) {
   2115                 error_report_err(local_err);
   2116                 return r;
   2117             }
   2118         }
   2119     }
   2120 
   2121     if (cpu->vmware_cpuid_freq
   2122         /* Guests depend on 0x40000000 to detect this feature, so only expose
   2123          * it if KVM exposes leaf 0x40000000. (Conflicts with Hyper-V) */
   2124         && cpu->expose_kvm
   2125         && kvm_base == KVM_CPUID_SIGNATURE
   2126         /* TSC clock must be stable and known for this feature. */
   2127         && tsc_is_stable_and_known(env)) {
   2128 
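                 /*
                  * Leaf 0x40000010 (VMware-compatible timing leaf): EAX holds the
                  * TSC frequency and EBX the APIC bus frequency, both in kHz.
                  */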
   2129         c = &cpuid_data.entries[cpuid_i++];
   2130         c->function = KVM_CPUID_SIGNATURE | 0x10;
   2131         c->eax = env->tsc_khz;
   2132         c->ebx = env->apic_bus_freq / 1000; /* Hz to KHz */
   2133         c->ecx = c->edx = 0;
   2134 
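                 /* Raise the KVM signature leaf's limit so guests can find 0x40000010 */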
   2135         c = cpuid_find_entry(&cpuid_data.cpuid, kvm_base, 0);
   2136         c->eax = MAX(c->eax, KVM_CPUID_SIGNATURE | 0x10);
   2137     }
   2138 
   2139     cpuid_data.cpuid.nent = cpuid_i;
   2140 
   2141     cpuid_data.cpuid.padding = 0;
   2142     r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data);
   2143     if (r) {
   2144         goto fail;
   2145     }
   2146     kvm_init_xsave(env);
   2147 
   2148     max_nested_state_len = kvm_max_nested_state_length();
   2149     if (max_nested_state_len > 0) {
   2150         assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data));
   2151 
   2152         if (cpu_has_vmx(env) || cpu_has_svm(env)) {
   2153             env->nested_state = g_malloc0(max_nested_state_len);
   2154             env->nested_state->size = max_nested_state_len;
   2155 
   2156             kvm_init_nested_state(env);
   2157         }
   2158     }
   2159 
   2160     cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
   2161 
   2162     if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {
   2163         has_msr_tsc_aux = false;
   2164     }
   2165 
   2166     kvm_init_msrs(cpu);
   2167 
   2168     return 0;
   2169 
   2170  fail:
   2171     migrate_del_blocker(invtsc_mig_blocker);
   2172 
   2173     return r;
   2174 }
   2175 
   2176 int kvm_arch_destroy_vcpu(CPUState *cs)
   2177 {
   2178     X86CPU *cpu = X86_CPU(cs);
   2179     CPUX86State *env = &cpu->env;
   2180 
   2181     g_free(env->xsave_buf);
   2182 
   2183     g_free(cpu->kvm_msr_buf);
   2184     cpu->kvm_msr_buf = NULL;
   2185 
   2186     g_free(env->nested_state);
   2187     env->nested_state = NULL;
   2188 
   2189     qemu_del_vm_change_state_handler(cpu->vmsentry);
   2190 
   2191     return 0;
   2192 }
   2193 
   2194 void kvm_arch_reset_vcpu(X86CPU *cpu)
   2195 {
   2196     CPUX86State *env = &cpu->env;
   2197 
   2198     env->xcr0 = 1;
   2199     if (kvm_irqchip_in_kernel()) {
   2200         env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE :
   2201                                           KVM_MP_STATE_UNINITIALIZED;
   2202     } else {
   2203         env->mp_state = KVM_MP_STATE_RUNNABLE;
   2204     }
   2205 
   2206     /* enabled by default */
   2207     env->poll_control_msr = 1;
   2208 
   2209     kvm_init_nested_state(env);
   2210 
   2211     sev_es_set_reset_vector(CPU(cpu));
   2212 }
   2213 
   2214 void kvm_arch_after_reset_vcpu(X86CPU *cpu)
   2215 {
   2216     CPUX86State *env = &cpu->env;
   2217     int i;
   2218 
   2219     /*
   2220      * Reset SynIC after all other devices have been reset to let them remove
   2221      * their SINT routes first.
   2222      */
   2223     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
   2224         for (i = 0; i < ARRAY_SIZE(env->msr_hv_synic_sint); i++) {
   2225             env->msr_hv_synic_sint[i] = HV_SINT_MASKED;
   2226         }
   2227 
   2228         hyperv_x86_synic_reset(cpu);
   2229     }
   2230 }
   2231 
   2232 void kvm_arch_do_init_vcpu(X86CPU *cpu)
   2233 {
   2234     CPUX86State *env = &cpu->env;
   2235 
   2236     /* APs get directly into wait-for-SIPI state.  */
   2237     if (env->mp_state == KVM_MP_STATE_UNINITIALIZED) {
   2238         env->mp_state = KVM_MP_STATE_INIT_RECEIVED;
   2239     }
   2240 }
   2241 
   2242 static int kvm_get_supported_feature_msrs(KVMState *s)
   2243 {
   2244     int ret = 0;
   2245 
   2246     if (kvm_feature_msrs != NULL) {
   2247         return 0;
   2248     }
   2249 
   2250     if (!kvm_check_extension(s, KVM_CAP_GET_MSR_FEATURES)) {
   2251         return 0;
   2252     }
   2253 
   2254     struct kvm_msr_list msr_list;
   2255 
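             /*
              * First call with nmsrs == 0: KVM fails it with E2BIG but fills in
              * the number of feature MSRs, which sizes the real query below.
              */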
   2256     msr_list.nmsrs = 0;
   2257     ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, &msr_list);
   2258     if (ret < 0 && ret != -E2BIG) {
   2259         error_report("Fetch KVM feature MSR list failed: %s",
   2260             strerror(-ret));
   2261         return ret;
   2262     }
   2263 
   2264     assert(msr_list.nmsrs > 0);
   2265     kvm_feature_msrs = g_malloc0(sizeof(msr_list) +
   2266                  msr_list.nmsrs * sizeof(msr_list.indices[0]));
   2267 
   2268     kvm_feature_msrs->nmsrs = msr_list.nmsrs;
   2269     ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, kvm_feature_msrs);
   2270 
   2271     if (ret < 0) {
   2272         error_report("Fetch KVM feature MSR list failed: %s",
   2273             strerror(-ret));
   2274         g_free(kvm_feature_msrs);
   2275         kvm_feature_msrs = NULL;
   2276         return ret;
   2277     }
   2278 
   2279     return 0;
   2280 }
   2281 
   2282 static int kvm_get_supported_msrs(KVMState *s)
   2283 {
   2284     int ret = 0;
   2285     struct kvm_msr_list msr_list, *kvm_msr_list;
   2286 
   2287     /*
   2288      *  Obtain MSR list from KVM.  These are the MSRs that we must
   2289      *  save/restore.
   2290      */
   2291     msr_list.nmsrs = 0;
   2292     ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
   2293     if (ret < 0 && ret != -E2BIG) {
   2294         return ret;
   2295     }
   2296     /*
   2297      * Old kernel modules had a bug and could write beyond the provided
   2298      * memory. Allocate at least a safe amount of 1K.
    2299      * memory. Allocate at least 1K to be safe.
   2300     kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) +
   2301                                           msr_list.nmsrs *
   2302                                           sizeof(msr_list.indices[0])));
   2303 
   2304     kvm_msr_list->nmsrs = msr_list.nmsrs;
   2305     ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
   2306     if (ret >= 0) {
   2307         int i;
   2308 
   2309         for (i = 0; i < kvm_msr_list->nmsrs; i++) {
   2310             switch (kvm_msr_list->indices[i]) {
   2311             case MSR_STAR:
   2312                 has_msr_star = true;
   2313                 break;
   2314             case MSR_VM_HSAVE_PA:
   2315                 has_msr_hsave_pa = true;
   2316                 break;
   2317             case MSR_TSC_AUX:
   2318                 has_msr_tsc_aux = true;
   2319                 break;
   2320             case MSR_TSC_ADJUST:
   2321                 has_msr_tsc_adjust = true;
   2322                 break;
   2323             case MSR_IA32_TSCDEADLINE:
   2324                 has_msr_tsc_deadline = true;
   2325                 break;
   2326             case MSR_IA32_SMBASE:
   2327                 has_msr_smbase = true;
   2328                 break;
   2329             case MSR_SMI_COUNT:
   2330                 has_msr_smi_count = true;
   2331                 break;
   2332             case MSR_IA32_MISC_ENABLE:
   2333                 has_msr_misc_enable = true;
   2334                 break;
   2335             case MSR_IA32_BNDCFGS:
   2336                 has_msr_bndcfgs = true;
   2337                 break;
   2338             case MSR_IA32_XSS:
   2339                 has_msr_xss = true;
   2340                 break;
   2341             case MSR_IA32_UMWAIT_CONTROL:
   2342                 has_msr_umwait = true;
   2343                 break;
   2344             case HV_X64_MSR_CRASH_CTL:
   2345                 has_msr_hv_crash = true;
   2346                 break;
   2347             case HV_X64_MSR_RESET:
   2348                 has_msr_hv_reset = true;
   2349                 break;
   2350             case HV_X64_MSR_VP_INDEX:
   2351                 has_msr_hv_vpindex = true;
   2352                 break;
   2353             case HV_X64_MSR_VP_RUNTIME:
   2354                 has_msr_hv_runtime = true;
   2355                 break;
   2356             case HV_X64_MSR_SCONTROL:
   2357                 has_msr_hv_synic = true;
   2358                 break;
   2359             case HV_X64_MSR_STIMER0_CONFIG:
   2360                 has_msr_hv_stimer = true;
   2361                 break;
   2362             case HV_X64_MSR_TSC_FREQUENCY:
   2363                 has_msr_hv_frequencies = true;
   2364                 break;
   2365             case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
   2366                 has_msr_hv_reenlightenment = true;
   2367                 break;
   2368             case HV_X64_MSR_SYNDBG_OPTIONS:
   2369                 has_msr_hv_syndbg_options = true;
   2370                 break;
   2371             case MSR_IA32_SPEC_CTRL:
   2372                 has_msr_spec_ctrl = true;
   2373                 break;
   2374             case MSR_AMD64_TSC_RATIO:
   2375                 has_tsc_scale_msr = true;
   2376                 break;
   2377             case MSR_IA32_TSX_CTRL:
   2378                 has_msr_tsx_ctrl = true;
   2379                 break;
   2380             case MSR_VIRT_SSBD:
   2381                 has_msr_virt_ssbd = true;
   2382                 break;
   2383             case MSR_IA32_ARCH_CAPABILITIES:
   2384                 has_msr_arch_capabs = true;
   2385                 break;
   2386             case MSR_IA32_CORE_CAPABILITY:
   2387                 has_msr_core_capabs = true;
   2388                 break;
   2389             case MSR_IA32_PERF_CAPABILITIES:
   2390                 has_msr_perf_capabs = true;
   2391                 break;
   2392             case MSR_IA32_VMX_VMFUNC:
   2393                 has_msr_vmx_vmfunc = true;
   2394                 break;
   2395             case MSR_IA32_UCODE_REV:
   2396                 has_msr_ucode_rev = true;
   2397                 break;
   2398             case MSR_IA32_VMX_PROCBASED_CTLS2:
   2399                 has_msr_vmx_procbased_ctls2 = true;
   2400                 break;
   2401             case MSR_IA32_PKRS:
   2402                 has_msr_pkrs = true;
   2403                 break;
   2404             }
   2405         }
   2406     }
   2407 
   2408     g_free(kvm_msr_list);
   2409 
   2410     return ret;
   2411 }
   2412 
   2413 static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, uint32_t msr,
   2414                                         uint64_t *val)
   2415 {
   2416     CPUState *cs = CPU(cpu);
   2417 
   2418     *val = cs->nr_threads * cs->nr_cores; /* thread count, bits 15..0 */
   2419     *val |= ((uint32_t)cs->nr_cores << 16); /* core count, bits 31..16 */
   2420 
   2421     return true;
   2422 }
   2423 
   2424 static Notifier smram_machine_done;
   2425 static KVMMemoryListener smram_listener;
   2426 static AddressSpace smram_address_space;
   2427 static MemoryRegion smram_as_root;
   2428 static MemoryRegion smram_as_mem;
   2429 
   2430 static void register_smram_listener(Notifier *n, void *unused)
   2431 {
   2432     MemoryRegion *smram =
   2433         (MemoryRegion *) object_resolve_path("/machine/smram", NULL);
   2434 
   2435     /* Outer container... */
   2436     memory_region_init(&smram_as_root, OBJECT(kvm_state), "mem-container-smram", ~0ull);
   2437     memory_region_set_enabled(&smram_as_root, true);
   2438 
   2439     /* ... with two regions inside: normal system memory with low
   2440      * priority, and...
   2441      */
   2442     memory_region_init_alias(&smram_as_mem, OBJECT(kvm_state), "mem-smram",
   2443                              get_system_memory(), 0, ~0ull);
   2444     memory_region_add_subregion_overlap(&smram_as_root, 0, &smram_as_mem, 0);
   2445     memory_region_set_enabled(&smram_as_mem, true);
   2446 
   2447     if (smram) {
   2448         /* ... SMRAM with higher priority */
   2449         memory_region_add_subregion_overlap(&smram_as_root, 0, smram, 10);
   2450         memory_region_set_enabled(smram, true);
   2451     }
   2452 
   2453     address_space_init(&smram_address_space, &smram_as_root, "KVM-SMRAM");
   2454     kvm_memory_listener_register(kvm_state, &smram_listener,
   2455                                  &smram_address_space, 1, "kvm-smram");
   2456 }
   2457 
   2458 int kvm_arch_init(MachineState *ms, KVMState *s)
   2459 {
   2460     uint64_t identity_base = 0xfffbc000;
   2461     uint64_t shadow_mem;
   2462     int ret;
   2463     struct utsname utsname;
   2464     Error *local_err = NULL;
   2465 
   2466     /*
   2467      * Initialize SEV context, if required
   2468      *
   2469      * If no memory encryption is requested (ms->cgs == NULL) this is
   2470      * a no-op.
   2471      *
   2472      * It's also a no-op if a non-SEV confidential guest support
   2473      * mechanism is selected.  SEV is the only mechanism available to
   2474      * select on x86 at present, so this doesn't arise, but if new
   2475      * mechanisms are supported in future (e.g. TDX), they'll need
   2476      * their own initialization either here or elsewhere.
   2477      */
   2478     ret = sev_kvm_init(ms->cgs, &local_err);
   2479     if (ret < 0) {
   2480         error_report_err(local_err);
   2481         return ret;
   2482     }
   2483 
   2484     if (!kvm_check_extension(s, KVM_CAP_IRQ_ROUTING)) {
   2485         error_report("kvm: KVM_CAP_IRQ_ROUTING not supported by KVM");
   2486         return -ENOTSUP;
   2487     }
   2488 
   2489     has_xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
   2490     has_xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
   2491     has_pit_state2 = kvm_check_extension(s, KVM_CAP_PIT_STATE2);
   2492     has_sregs2 = kvm_check_extension(s, KVM_CAP_SREGS2) > 0;
   2493 
   2494     hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
   2495 
   2496     has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD);
   2497     if (has_exception_payload) {
   2498         ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true);
   2499         if (ret < 0) {
   2500             error_report("kvm: Failed to enable exception payload cap: %s",
   2501                          strerror(-ret));
   2502             return ret;
   2503         }
   2504     }
   2505 
   2506     has_triple_fault_event = kvm_check_extension(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT);
   2507     if (has_triple_fault_event) {
   2508         ret = kvm_vm_enable_cap(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 0, true);
   2509         if (ret < 0) {
   2510             error_report("kvm: Failed to enable triple fault event cap: %s",
   2511                          strerror(-ret));
   2512             return ret;
   2513         }
   2514     }
   2515 
   2516     ret = kvm_get_supported_msrs(s);
   2517     if (ret < 0) {
   2518         return ret;
   2519     }
   2520 
   2521     kvm_get_supported_feature_msrs(s);
   2522 
   2523     uname(&utsname);
   2524     lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;
   2525 
   2526     /*
   2527      * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
    2528      * In order to use vm86 mode, an EPT identity map and a TSS are needed.
   2529      * Since these must be part of guest physical memory, we need to allocate
   2530      * them, both by setting their start addresses in the kernel and by
   2531      * creating a corresponding e820 entry. We need 4 pages before the BIOS.
   2532      *
   2533      * Older KVM versions may not support setting the identity map base. In
   2534      * that case we need to stick with the default, i.e. a 256K maximum BIOS
   2535      * size.
   2536      */
   2537     if (kvm_check_extension(s, KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
   2538         /* Allows up to 16M BIOSes. */
   2539         identity_base = 0xfeffc000;
   2540 
   2541         ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
   2542         if (ret < 0) {
   2543             return ret;
   2544         }
   2545     }
   2546 
   2547     /* Set TSS base one page after EPT identity map. */
   2548     ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000);
   2549     if (ret < 0) {
   2550         return ret;
   2551     }
   2552 
   2553     /* Tell fw_cfg to notify the BIOS to reserve the range. */
   2554     ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED);
   2555     if (ret < 0) {
   2556         fprintf(stderr, "e820_add_entry() table is full\n");
   2557         return ret;
   2558     }
   2559 
   2560     shadow_mem = object_property_get_int(OBJECT(s), "kvm-shadow-mem", &error_abort);
   2561     if (shadow_mem != -1) {
   2562         shadow_mem /= 4096;
   2563         ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem);
   2564         if (ret < 0) {
   2565             return ret;
   2566         }
   2567     }
   2568 
   2569     if (kvm_check_extension(s, KVM_CAP_X86_SMM) &&
   2570         object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE) &&
   2571         x86_machine_is_smm_enabled(X86_MACHINE(ms))) {
   2572         smram_machine_done.notify = register_smram_listener;
   2573         qemu_add_machine_init_done_notifier(&smram_machine_done);
   2574     }
   2575 
   2576     if (enable_cpu_pm) {
   2577         int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS);
   2578         int ret;
   2579 
    2580 /* Workaround for a kernel header with a typo. TODO: fix header and drop. */
   2581 #if defined(KVM_X86_DISABLE_EXITS_HTL) && !defined(KVM_X86_DISABLE_EXITS_HLT)
   2582 #define KVM_X86_DISABLE_EXITS_HLT KVM_X86_DISABLE_EXITS_HTL
   2583 #endif
   2584         if (disable_exits) {
   2585             disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT |
   2586                               KVM_X86_DISABLE_EXITS_HLT |
   2587                               KVM_X86_DISABLE_EXITS_PAUSE |
   2588                               KVM_X86_DISABLE_EXITS_CSTATE);
   2589         }
   2590 
   2591         ret = kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0,
   2592                                 disable_exits);
   2593         if (ret < 0) {
   2594             error_report("kvm: guest stopping CPU not supported: %s",
   2595                          strerror(-ret));
   2596         }
   2597     }
   2598 
   2599     if (object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE)) {
   2600         X86MachineState *x86ms = X86_MACHINE(ms);
   2601 
   2602         if (x86ms->bus_lock_ratelimit > 0) {
   2603             ret = kvm_check_extension(s, KVM_CAP_X86_BUS_LOCK_EXIT);
   2604             if (!(ret & KVM_BUS_LOCK_DETECTION_EXIT)) {
   2605                 error_report("kvm: bus lock detection unsupported");
   2606                 return -ENOTSUP;
   2607             }
   2608             ret = kvm_vm_enable_cap(s, KVM_CAP_X86_BUS_LOCK_EXIT, 0,
   2609                                     KVM_BUS_LOCK_DETECTION_EXIT);
   2610             if (ret < 0) {
   2611                 error_report("kvm: Failed to enable bus lock detection cap: %s",
   2612                              strerror(-ret));
   2613                 return ret;
   2614             }
   2615             ratelimit_init(&bus_lock_ratelimit_ctrl);
   2616             ratelimit_set_speed(&bus_lock_ratelimit_ctrl,
   2617                                 x86ms->bus_lock_ratelimit, BUS_LOCK_SLICE_TIME);
   2618         }
   2619     }
   2620 
   2621     if (s->notify_vmexit != NOTIFY_VMEXIT_OPTION_DISABLE &&
   2622         kvm_check_extension(s, KVM_CAP_X86_NOTIFY_VMEXIT)) {
   2623             uint64_t notify_window_flags =
   2624                 ((uint64_t)s->notify_window << 32) |
   2625                 KVM_X86_NOTIFY_VMEXIT_ENABLED |
   2626                 KVM_X86_NOTIFY_VMEXIT_USER;
   2627             ret = kvm_vm_enable_cap(s, KVM_CAP_X86_NOTIFY_VMEXIT, 0,
   2628                                     notify_window_flags);
   2629             if (ret < 0) {
   2630                 error_report("kvm: Failed to enable notify vmexit cap: %s",
   2631                              strerror(-ret));
   2632                 return ret;
   2633             }
   2634     }
   2635     if (kvm_vm_check_extension(s, KVM_CAP_X86_USER_SPACE_MSR)) {
   2636         bool r;
   2637 
   2638         ret = kvm_vm_enable_cap(s, KVM_CAP_X86_USER_SPACE_MSR, 0,
   2639                                 KVM_MSR_EXIT_REASON_FILTER);
   2640         if (ret) {
   2641             error_report("Could not enable user space MSRs: %s",
   2642                          strerror(-ret));
   2643             exit(1);
   2644         }
   2645 
   2646         r = kvm_filter_msr(s, MSR_CORE_THREAD_COUNT,
   2647                            kvm_rdmsr_core_thread_count, NULL);
   2648         if (!r) {
   2649             error_report("Could not install MSR_CORE_THREAD_COUNT handler: %s",
   2650                          strerror(-ret));
   2651             exit(1);
   2652         }
   2653     }
   2654 
   2655     return 0;
   2656 }
   2657 
   2658 static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
   2659 {
   2660     lhs->selector = rhs->selector;
   2661     lhs->base = rhs->base;
   2662     lhs->limit = rhs->limit;
   2663     lhs->type = 3;
   2664     lhs->present = 1;
   2665     lhs->dpl = 3;
   2666     lhs->db = 0;
   2667     lhs->s = 1;
   2668     lhs->l = 0;
   2669     lhs->g = 0;
   2670     lhs->avl = 0;
   2671     lhs->unusable = 0;
   2672 }
   2673 
   2674 static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
   2675 {
   2676     unsigned flags = rhs->flags;
   2677     lhs->selector = rhs->selector;
   2678     lhs->base = rhs->base;
   2679     lhs->limit = rhs->limit;
   2680     lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
   2681     lhs->present = (flags & DESC_P_MASK) != 0;
   2682     lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3;
   2683     lhs->db = (flags >> DESC_B_SHIFT) & 1;
   2684     lhs->s = (flags & DESC_S_MASK) != 0;
   2685     lhs->l = (flags >> DESC_L_SHIFT) & 1;
   2686     lhs->g = (flags & DESC_G_MASK) != 0;
   2687     lhs->avl = (flags & DESC_AVL_MASK) != 0;
   2688     lhs->unusable = !lhs->present;
   2689     lhs->padding = 0;
   2690 }
   2691 
   2692 static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
   2693 {
   2694     lhs->selector = rhs->selector;
   2695     lhs->base = rhs->base;
   2696     lhs->limit = rhs->limit;
   2697     lhs->flags = (rhs->type << DESC_TYPE_SHIFT) |
   2698                  ((rhs->present && !rhs->unusable) * DESC_P_MASK) |
   2699                  (rhs->dpl << DESC_DPL_SHIFT) |
   2700                  (rhs->db << DESC_B_SHIFT) |
   2701                  (rhs->s * DESC_S_MASK) |
   2702                  (rhs->l << DESC_L_SHIFT) |
   2703                  (rhs->g * DESC_G_MASK) |
   2704                  (rhs->avl * DESC_AVL_MASK);
   2705 }
   2706 
   2707 static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
   2708 {
   2709     if (set) {
   2710         *kvm_reg = *qemu_reg;
   2711     } else {
   2712         *qemu_reg = *kvm_reg;
   2713     }
   2714 }
   2715 
   2716 static int kvm_getput_regs(X86CPU *cpu, int set)
   2717 {
   2718     CPUX86State *env = &cpu->env;
   2719     struct kvm_regs regs;
   2720     int ret = 0;
   2721 
   2722     if (!set) {
   2723         ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_REGS, &regs);
   2724         if (ret < 0) {
   2725             return ret;
   2726         }
   2727     }
   2728 
   2729     kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
   2730     kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
   2731     kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
   2732     kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
   2733     kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
   2734     kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
   2735     kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
   2736     kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
   2737 #ifdef TARGET_X86_64
   2738     kvm_getput_reg(&regs.r8, &env->regs[8], set);
   2739     kvm_getput_reg(&regs.r9, &env->regs[9], set);
   2740     kvm_getput_reg(&regs.r10, &env->regs[10], set);
   2741     kvm_getput_reg(&regs.r11, &env->regs[11], set);
   2742     kvm_getput_reg(&regs.r12, &env->regs[12], set);
   2743     kvm_getput_reg(&regs.r13, &env->regs[13], set);
   2744     kvm_getput_reg(&regs.r14, &env->regs[14], set);
   2745     kvm_getput_reg(&regs.r15, &env->regs[15], set);
   2746 #endif
   2747 
   2748     kvm_getput_reg(&regs.rflags, &env->eflags, set);
   2749     kvm_getput_reg(&regs.rip, &env->eip, set);
   2750 
   2751     if (set) {
   2752         ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_REGS, &regs);
   2753     }
   2754 
   2755     return ret;
   2756 }
   2757 
   2758 static int kvm_put_fpu(X86CPU *cpu)
   2759 {
   2760     CPUX86State *env = &cpu->env;
   2761     struct kvm_fpu fpu;
   2762     int i;
   2763 
   2764     memset(&fpu, 0, sizeof fpu);
   2765     fpu.fsw = env->fpus & ~(7 << 11);
   2766     fpu.fsw |= (env->fpstt & 7) << 11;
   2767     fpu.fcw = env->fpuc;
   2768     fpu.last_opcode = env->fpop;
   2769     fpu.last_ip = env->fpip;
   2770     fpu.last_dp = env->fpdp;
   2771     for (i = 0; i < 8; ++i) {
   2772         fpu.ftwx |= (!env->fptags[i]) << i;
   2773     }
   2774     memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
   2775     for (i = 0; i < CPU_NB_REGS; i++) {
   2776         stq_p(&fpu.xmm[i][0], env->xmm_regs[i].ZMM_Q(0));
   2777         stq_p(&fpu.xmm[i][8], env->xmm_regs[i].ZMM_Q(1));
   2778     }
   2779     fpu.mxcsr = env->mxcsr;
   2780 
   2781     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_FPU, &fpu);
   2782 }
   2783 
   2784 static int kvm_put_xsave(X86CPU *cpu)
   2785 {
   2786     CPUX86State *env = &cpu->env;
   2787     void *xsave = env->xsave_buf;
   2788 
   2789     if (!has_xsave) {
   2790         return kvm_put_fpu(cpu);
   2791     }
   2792     x86_cpu_xsave_all_areas(cpu, xsave, env->xsave_buf_len);
   2793 
   2794     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
   2795 }
   2796 
   2797 static int kvm_put_xcrs(X86CPU *cpu)
   2798 {
   2799     CPUX86State *env = &cpu->env;
   2800     struct kvm_xcrs xcrs = {};
   2801 
   2802     if (!has_xcrs) {
   2803         return 0;
   2804     }
   2805 
   2806     xcrs.nr_xcrs = 1;
   2807     xcrs.flags = 0;
   2808     xcrs.xcrs[0].xcr = 0;
   2809     xcrs.xcrs[0].value = env->xcr0;
   2810     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XCRS, &xcrs);
   2811 }
   2812 
   2813 static int kvm_put_sregs(X86CPU *cpu)
   2814 {
   2815     CPUX86State *env = &cpu->env;
   2816     struct kvm_sregs sregs;
   2817 
   2818     /*
   2819      * The interrupt_bitmap is ignored because KVM_SET_SREGS is
   2820      * always followed by KVM_SET_VCPU_EVENTS.
   2821      */
   2822     memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
   2823 
   2824     if ((env->eflags & VM_MASK)) {
   2825         set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
   2826         set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
   2827         set_v8086_seg(&sregs.es, &env->segs[R_ES]);
   2828         set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
   2829         set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
   2830         set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
   2831     } else {
   2832         set_seg(&sregs.cs, &env->segs[R_CS]);
   2833         set_seg(&sregs.ds, &env->segs[R_DS]);
   2834         set_seg(&sregs.es, &env->segs[R_ES]);
   2835         set_seg(&sregs.fs, &env->segs[R_FS]);
   2836         set_seg(&sregs.gs, &env->segs[R_GS]);
   2837         set_seg(&sregs.ss, &env->segs[R_SS]);
   2838     }
   2839 
   2840     set_seg(&sregs.tr, &env->tr);
   2841     set_seg(&sregs.ldt, &env->ldt);
   2842 
   2843     sregs.idt.limit = env->idt.limit;
   2844     sregs.idt.base = env->idt.base;
   2845     memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
   2846     sregs.gdt.limit = env->gdt.limit;
   2847     sregs.gdt.base = env->gdt.base;
   2848     memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);
   2849 
   2850     sregs.cr0 = env->cr[0];
   2851     sregs.cr2 = env->cr[2];
   2852     sregs.cr3 = env->cr[3];
   2853     sregs.cr4 = env->cr[4];
   2854 
   2855     sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
   2856     sregs.apic_base = cpu_get_apic_base(cpu->apic_state);
   2857 
   2858     sregs.efer = env->efer;
   2859 
   2860     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
   2861 }
   2862 
   2863 static int kvm_put_sregs2(X86CPU *cpu)
   2864 {
   2865     CPUX86State *env = &cpu->env;
   2866     struct kvm_sregs2 sregs;
   2867     int i;
   2868 
   2869     sregs.flags = 0;
   2870 
   2871     if ((env->eflags & VM_MASK)) {
   2872         set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
   2873         set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
   2874         set_v8086_seg(&sregs.es, &env->segs[R_ES]);
   2875         set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
   2876         set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
   2877         set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
   2878     } else {
   2879         set_seg(&sregs.cs, &env->segs[R_CS]);
   2880         set_seg(&sregs.ds, &env->segs[R_DS]);
   2881         set_seg(&sregs.es, &env->segs[R_ES]);
   2882         set_seg(&sregs.fs, &env->segs[R_FS]);
   2883         set_seg(&sregs.gs, &env->segs[R_GS]);
   2884         set_seg(&sregs.ss, &env->segs[R_SS]);
   2885     }
   2886 
   2887     set_seg(&sregs.tr, &env->tr);
   2888     set_seg(&sregs.ldt, &env->ldt);
   2889 
   2890     sregs.idt.limit = env->idt.limit;
   2891     sregs.idt.base = env->idt.base;
   2892     memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
   2893     sregs.gdt.limit = env->gdt.limit;
   2894     sregs.gdt.base = env->gdt.base;
   2895     memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);
   2896 
   2897     sregs.cr0 = env->cr[0];
   2898     sregs.cr2 = env->cr[2];
   2899     sregs.cr3 = env->cr[3];
   2900     sregs.cr4 = env->cr[4];
   2901 
   2902     sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
   2903     sregs.apic_base = cpu_get_apic_base(cpu->apic_state);
   2904 
   2905     sregs.efer = env->efer;
   2906 
   2907     if (env->pdptrs_valid) {
   2908         for (i = 0; i < 4; i++) {
   2909             sregs.pdptrs[i] = env->pdptrs[i];
   2910         }
   2911         sregs.flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
   2912     }
   2913 
   2914     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS2, &sregs);
   2915 }
   2916 
   2917 
   2918 static void kvm_msr_buf_reset(X86CPU *cpu)
   2919 {
   2920     memset(cpu->kvm_msr_buf, 0, MSR_BUF_SIZE);
   2921 }
   2922 
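        /*
         * Append one MSR entry to the vCPU's kvm_msr_buf scratch buffer; the
         * assert guards against overflowing its fixed MSR_BUF_SIZE bytes.
         */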
   2923 static void kvm_msr_entry_add(X86CPU *cpu, uint32_t index, uint64_t value)
   2924 {
   2925     struct kvm_msrs *msrs = cpu->kvm_msr_buf;
   2926     void *limit = ((void *)msrs) + MSR_BUF_SIZE;
   2927     struct kvm_msr_entry *entry = &msrs->entries[msrs->nmsrs];
   2928 
   2929     assert((void *)(entry + 1) <= limit);
   2930 
   2931     entry->index = index;
   2932     entry->reserved = 0;
   2933     entry->data = value;
   2934     msrs->nmsrs++;
   2935 }
   2936 
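        /*
         * Reset the MSR buffer, queue a single entry and write it immediately
         * with KVM_SET_MSRS; returns the number of MSRs written (1 on
         * success) or a negative error.
         */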
   2937 static int kvm_put_one_msr(X86CPU *cpu, int index, uint64_t value)
   2938 {
   2939     kvm_msr_buf_reset(cpu);
   2940     kvm_msr_entry_add(cpu, index, value);
   2941 
   2942     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
   2943 }
   2944 
   2945 static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value)
   2946 {
   2947     int ret;
   2948     struct {
   2949         struct kvm_msrs info;
   2950         struct kvm_msr_entry entries[1];
   2951     } msr_data = {
   2952         .info.nmsrs = 1,
   2953         .entries[0].index = index,
   2954     };
   2955 
   2956     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
   2957     if (ret < 0) {
   2958         return ret;
   2959     }
   2960     assert(ret == 1);
   2961     *value = msr_data.entries[0].data;
   2962     return ret;
   2963 }
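
        /*
         * Write MSR_IA32_APICBASE for this vCPU right away; a value the
         * kernel refuses to accept trips the assertion.
         */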
   2964 void kvm_put_apicbase(X86CPU *cpu, uint64_t value)
   2965 {
   2966     int ret;
   2967 
   2968     ret = kvm_put_one_msr(cpu, MSR_IA32_APICBASE, value);
   2969     assert(ret == 1);
   2970 }
   2971 
   2972 static int kvm_put_tscdeadline_msr(X86CPU *cpu)
   2973 {
   2974     CPUX86State *env = &cpu->env;
   2975     int ret;
   2976 
   2977     if (!has_msr_tsc_deadline) {
   2978         return 0;
   2979     }
   2980 
   2981     ret = kvm_put_one_msr(cpu, MSR_IA32_TSCDEADLINE, env->tsc_deadline);
   2982     if (ret < 0) {
   2983         return ret;
   2984     }
   2985 
   2986     assert(ret == 1);
   2987     return 0;
   2988 }
   2989 
   2990 /*
   2991  * Provide a separate write service for the feature control MSR in order to
   2992  * kick the VCPU out of VMXON or even guest mode on reset. This has to be done
   2993  * before writing any other state because forcibly leaving nested mode
   2994  * invalidates the VCPU state.
   2995  */
   2996 static int kvm_put_msr_feature_control(X86CPU *cpu)
   2997 {
   2998     int ret;
   2999 
   3000     if (!has_msr_feature_control) {
   3001         return 0;
   3002     }
   3003 
   3004     ret = kvm_put_one_msr(cpu, MSR_IA32_FEATURE_CONTROL,
   3005                           cpu->env.msr_ia32_feature_control);
   3006     if (ret < 0) {
   3007         return ret;
   3008     }
   3009 
   3010     assert(ret == 1);
   3011     return 0;
   3012 }
   3013 
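        /*
         * Encode a VMX capability control MSR from the feature word exposed
         * to the guest: bits 0-31 hold the controls that must be one, bits
         * 32-63 the controls that may be one.  default1 supplies the controls
         * that architecturally default to one for the given MSR.  For
         * example, with default1 == 0x16 and features == 0 the result is
         * 0x16 | ((uint64_t)0x16 << 32): the default-to-one controls both may
         * and must be one, and every other control must stay zero.
         */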
   3014 static uint64_t make_vmx_msr_value(uint32_t index, uint32_t features)
   3015 {
   3016     uint32_t default1, can_be_one, can_be_zero;
   3017     uint32_t must_be_one;
   3018 
   3019     switch (index) {
   3020     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
   3021         default1 = 0x00000016;
   3022         break;
   3023     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
   3024         default1 = 0x0401e172;
   3025         break;
   3026     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
   3027         default1 = 0x000011ff;
   3028         break;
   3029     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
   3030         default1 = 0x00036dff;
   3031         break;
   3032     case MSR_IA32_VMX_PROCBASED_CTLS2:
   3033         default1 = 0;
   3034         break;
   3035     default:
   3036         abort();
   3037     }
   3038 
   3039     /* If a feature bit is set, the control can be either set or clear.
   3040      * Otherwise the value is limited to either 0 or 1 by default1.
   3041      */
   3042     can_be_one = features | default1;
   3043     can_be_zero = features | ~default1;
   3044     must_be_one = ~can_be_zero;
   3045 
   3046     /*
   3047      * Bits 0-31  -> 0 if the control bit can be zero (i.e. 1 if it must be one).
   3048      * Bits 32-63 -> 1 if the control bit can be one.
   3049      */
   3050     return must_be_one | (((uint64_t)can_be_one) << 32);
   3051 }
   3052 
   3053 static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f)
   3054 {
   3055     uint64_t kvm_vmx_basic =
   3056         kvm_arch_get_supported_msr_feature(kvm_state,
   3057                                            MSR_IA32_VMX_BASIC);
   3058 
   3059     if (!kvm_vmx_basic) {
   3060         /* If the kernel doesn't support the VMX feature (kvm_intel.nested=0),
   3061          * then kvm_vmx_basic will be 0 and KVM_SET_MSR will fail.
   3062          */
   3063         return;
   3064     }
   3065 
   3066     uint64_t kvm_vmx_misc =
   3067         kvm_arch_get_supported_msr_feature(kvm_state,
   3068                                            MSR_IA32_VMX_MISC);
   3069     uint64_t kvm_vmx_ept_vpid =
   3070         kvm_arch_get_supported_msr_feature(kvm_state,
   3071                                            MSR_IA32_VMX_EPT_VPID_CAP);
   3072 
   3073     /*
   3074      * If the guest is 64-bit, a value of 1 is allowed for the host address
   3075      * space size vmexit control.
   3076      */
   3077     uint64_t fixed_vmx_exit = f[FEAT_8000_0001_EDX] & CPUID_EXT2_LM
   3078         ? (uint64_t)VMX_VM_EXIT_HOST_ADDR_SPACE_SIZE << 32 : 0;
   3079 
   3080     /*
   3081      * Bits 0-30, 32-44 and 50-53 come from the host.  KVM should
   3082      * not change them for backwards compatibility.
   3083      */
   3084     uint64_t fixed_vmx_basic = kvm_vmx_basic &
   3085         (MSR_VMX_BASIC_VMCS_REVISION_MASK |
   3086          MSR_VMX_BASIC_VMXON_REGION_SIZE_MASK |
   3087          MSR_VMX_BASIC_VMCS_MEM_TYPE_MASK);
   3088 
   3089     /*
   3090      * Same for bits 0-4 and 25-27.  Bits 16-24 (CR3 target count) can
   3091      * change in the future but are always zero for now; clear them to be
   3092      * future-proof.  Bits 32-63 in theory could change, though KVM does
   3093      * not support dual-monitor treatment and probably never will; mask
   3094      * them out as well.
   3095      */
   3096     uint64_t fixed_vmx_misc = kvm_vmx_misc &
   3097         (MSR_VMX_MISC_PREEMPTION_TIMER_SHIFT_MASK |
   3098          MSR_VMX_MISC_MAX_MSR_LIST_SIZE_MASK);
   3099 
   3100     /*
   3101      * EPT memory types should not change either, so we do not bother
   3102      * adding features for them.
   3103      */
   3104     uint64_t fixed_vmx_ept_mask =
   3105             (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_ENABLE_EPT ?
   3106              MSR_VMX_EPT_UC | MSR_VMX_EPT_WB : 0);
   3107     uint64_t fixed_vmx_ept_vpid = kvm_vmx_ept_vpid & fixed_vmx_ept_mask;
   3108 
   3109     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
   3110                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
   3111                                          f[FEAT_VMX_PROCBASED_CTLS]));
   3112     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS,
   3113                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_PINBASED_CTLS,
   3114                                          f[FEAT_VMX_PINBASED_CTLS]));
   3115     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_EXIT_CTLS,
   3116                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_EXIT_CTLS,
   3117                                          f[FEAT_VMX_EXIT_CTLS]) | fixed_vmx_exit);
   3118     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS,
   3119                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_ENTRY_CTLS,
   3120                                          f[FEAT_VMX_ENTRY_CTLS]));
   3121     kvm_msr_entry_add(cpu, MSR_IA32_VMX_PROCBASED_CTLS2,
   3122                       make_vmx_msr_value(MSR_IA32_VMX_PROCBASED_CTLS2,
   3123                                          f[FEAT_VMX_SECONDARY_CTLS]));
   3124     kvm_msr_entry_add(cpu, MSR_IA32_VMX_EPT_VPID_CAP,
   3125                       f[FEAT_VMX_EPT_VPID_CAPS] | fixed_vmx_ept_vpid);
   3126     kvm_msr_entry_add(cpu, MSR_IA32_VMX_BASIC,
   3127                       f[FEAT_VMX_BASIC] | fixed_vmx_basic);
   3128     kvm_msr_entry_add(cpu, MSR_IA32_VMX_MISC,
   3129                       f[FEAT_VMX_MISC] | fixed_vmx_misc);
   3130     if (has_msr_vmx_vmfunc) {
   3131         kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMFUNC, f[FEAT_VMX_VMFUNC]);
   3132     }
   3133 
   3134     /*
   3135      * Just to be safe, write these with constant values.  The CRn_FIXED1
   3136      * MSRs are generated by KVM based on the vCPU's CPUID.
   3137      */
   3138     kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR0_FIXED0,
   3139                       CR0_PE_MASK | CR0_PG_MASK | CR0_NE_MASK);
   3140     kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR4_FIXED0,
   3141                       CR4_VMXE_MASK);
   3142 
   3143     if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) {
   3144         /* TSC multiplier (0x2032).  */
   3145         kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x32);
   3146     } else {
   3147         /* Preemption timer (0x482E).  */
   3148         kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x2E);
   3149     }
   3150 }
   3151 
   3152 static void kvm_msr_entry_add_perf(X86CPU *cpu, FeatureWordArray f)
   3153 {
   3154     uint64_t kvm_perf_cap =
   3155         kvm_arch_get_supported_msr_feature(kvm_state,
   3156                                            MSR_IA32_PERF_CAPABILITIES);
   3157 
   3158     if (kvm_perf_cap) {
   3159         kvm_msr_entry_add(cpu, MSR_IA32_PERF_CAPABILITIES,
   3160                         kvm_perf_cap & f[FEAT_PERF_CAPABILITIES]);
   3161     }
   3162 }
   3163 
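        /*
         * Flush the batched kvm_msr_buf with a single KVM_SET_MSRS call.  KVM
         * stops at the first MSR it cannot write, so on a short count the
         * offending index and value are reported before the assertion fires.
         */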
   3164 static int kvm_buf_set_msrs(X86CPU *cpu)
   3165 {
   3166     int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
   3167     if (ret < 0) {
   3168         return ret;
   3169     }
   3170 
   3171     if (ret < cpu->kvm_msr_buf->nmsrs) {
   3172         struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
   3173         error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64,
   3174                      (uint32_t)e->index, (uint64_t)e->data);
   3175     }
   3176 
   3177     assert(ret == cpu->kvm_msr_buf->nmsrs);
   3178     return 0;
   3179 }
   3180 
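        /*
         * Program the feature MSRs that describe the vCPU to the guest
         * (ARCH_CAPABILITIES, CORE_CAPABILITY, PERF_CAPABILITIES, UCODE_REV
         * and the VMX capability MSRs), each one only when the corresponding
         * support was detected.
         */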
   3181 static void kvm_init_msrs(X86CPU *cpu)
   3182 {
   3183     CPUX86State *env = &cpu->env;
   3184 
   3185     kvm_msr_buf_reset(cpu);
   3186     if (has_msr_arch_capabs) {
   3187         kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES,
   3188                           env->features[FEAT_ARCH_CAPABILITIES]);
   3189     }
   3190 
   3191     if (has_msr_core_capabs) {
   3192         kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY,
   3193                           env->features[FEAT_CORE_CAPABILITY]);
   3194     }
   3195 
   3196     if (has_msr_perf_capabs && cpu->enable_pmu) {
   3197         kvm_msr_entry_add_perf(cpu, env->features);
   3198     }
   3199 
   3200     if (has_msr_ucode_rev) {
   3201         kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev);
   3202     }
   3203 
   3204     /*
   3205      * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but
   3206      * all kernels with MSR features should have them.
   3207      */
   3208     if (kvm_feature_msrs && cpu_has_vmx(env)) {
   3209         kvm_msr_entry_add_vmx(cpu, env->features);
   3210     }
   3211 
   3212     assert(kvm_buf_set_msrs(cpu) == 0);
   3213 }
   3214 
   3215 static int kvm_put_msrs(X86CPU *cpu, int level)
   3216 {
   3217     CPUX86State *env = &cpu->env;
   3218     int i;
   3219 
   3220     kvm_msr_buf_reset(cpu);
   3221 
   3222     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, env->sysenter_cs);
   3223     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
   3224     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
   3225     kvm_msr_entry_add(cpu, MSR_PAT, env->pat);
   3226     if (has_msr_star) {
   3227         kvm_msr_entry_add(cpu, MSR_STAR, env->star);
   3228     }
   3229     if (has_msr_hsave_pa) {
   3230         kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, env->vm_hsave);
   3231     }
   3232     if (has_msr_tsc_aux) {
   3233         kvm_msr_entry_add(cpu, MSR_TSC_AUX, env->tsc_aux);
   3234     }
   3235     if (has_msr_tsc_adjust) {
   3236         kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, env->tsc_adjust);
   3237     }
   3238     if (has_msr_misc_enable) {
   3239         kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE,
   3240                           env->msr_ia32_misc_enable);
   3241     }
   3242     if (has_msr_smbase) {
   3243         kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, env->smbase);
   3244     }
   3245     if (has_msr_smi_count) {
   3246         kvm_msr_entry_add(cpu, MSR_SMI_COUNT, env->msr_smi_count);
   3247     }
   3248     if (has_msr_pkrs) {
   3249         kvm_msr_entry_add(cpu, MSR_IA32_PKRS, env->pkrs);
   3250     }
   3251     if (has_msr_bndcfgs) {
   3252         kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, env->msr_bndcfgs);
   3253     }
   3254     if (has_msr_xss) {
   3255         kvm_msr_entry_add(cpu, MSR_IA32_XSS, env->xss);
   3256     }
   3257     if (has_msr_umwait) {
   3258         kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, env->umwait);
   3259     }
   3260     if (has_msr_spec_ctrl) {
   3261         kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, env->spec_ctrl);
   3262     }
   3263     if (has_tsc_scale_msr) {
   3264         kvm_msr_entry_add(cpu, MSR_AMD64_TSC_RATIO, env->amd_tsc_scale_msr);
   3265     }
   3266 
   3267     if (has_msr_tsx_ctrl) {
   3268         kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, env->tsx_ctrl);
   3269     }
   3270     if (has_msr_virt_ssbd) {
   3271         kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, env->virt_ssbd);
   3272     }
   3273 
   3274 #ifdef TARGET_X86_64
   3275     if (lm_capable_kernel) {
   3276         kvm_msr_entry_add(cpu, MSR_CSTAR, env->cstar);
   3277         kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase);
   3278         kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask);
   3279         kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar);
   3280     }
   3281 #endif
   3282 
   3283     /*
   3284      * The following MSRs have side effects on the guest or are too heavy
   3285      * for normal writeback. Limit them to reset or full state updates.
   3286      */
   3287     if (level >= KVM_PUT_RESET_STATE) {
   3288         kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc);
   3289         kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr);
   3290         kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
   3291         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
   3292             kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, env->async_pf_int_msr);
   3293         }
   3294         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
   3295             kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, env->async_pf_en_msr);
   3296         }
   3297         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
   3298             kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, env->pv_eoi_en_msr);
   3299         }
   3300         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
   3301             kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, env->steal_time_msr);
   3302         }
   3303 
   3304         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
   3305             kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, env->poll_control_msr);
   3306         }
   3307 
   3308         if (has_architectural_pmu_version > 0) {
   3309             if (has_architectural_pmu_version > 1) {
   3310                 /* Stop the counter.  */
   3311                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
   3312                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
   3313             }
   3314 
   3315             /* Set the counter values.  */
   3316             for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
   3317                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i,
   3318                                   env->msr_fixed_counters[i]);
   3319             }
   3320             for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
   3321                 kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i,
   3322                                   env->msr_gp_counters[i]);
   3323                 kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i,
   3324                                   env->msr_gp_evtsel[i]);
   3325             }
   3326             if (has_architectural_pmu_version > 1) {
   3327                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS,
   3328                                   env->msr_global_status);
   3329                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
   3330                                   env->msr_global_ovf_ctrl);
   3331 
   3332                 /* Now start the PMU.  */
   3333                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL,
   3334                                   env->msr_fixed_ctr_ctrl);
   3335                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL,
   3336                                   env->msr_global_ctrl);
   3337             }
   3338         }
   3339         /*
   3340          * Hyper-V partition-wide MSRs: to avoid clearing them on CPU hot-add,
   3341          * only sync them to KVM on the first CPU.
   3342          */
   3343         if (current_cpu == first_cpu) {
   3344             if (has_msr_hv_hypercall) {
   3345                 kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID,
   3346                                   env->msr_hv_guest_os_id);
   3347                 kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL,
   3348                                   env->msr_hv_hypercall);
   3349             }
   3350             if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
   3351                 kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC,
   3352                                   env->msr_hv_tsc);
   3353             }
   3354             if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
   3355                 kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL,
   3356                                   env->msr_hv_reenlightenment_control);
   3357                 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL,
   3358                                   env->msr_hv_tsc_emulation_control);
   3359                 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS,
   3360                                   env->msr_hv_tsc_emulation_status);
   3361             }
   3362 #ifdef CONFIG_SYNDBG
   3363             if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG) &&
   3364                 has_msr_hv_syndbg_options) {
   3365                 kvm_msr_entry_add(cpu, HV_X64_MSR_SYNDBG_OPTIONS,
   3366                                   hyperv_syndbg_query_options());
   3367             }
   3368 #endif
   3369         }
   3370         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
   3371             kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE,
   3372                               env->msr_hv_vapic);
   3373         }
   3374         if (has_msr_hv_crash) {
   3375             int j;
   3376 
   3377             for (j = 0; j < HV_CRASH_PARAMS; j++) {
   3378                 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j,
   3379                                   env->msr_hv_crash_params[j]);
   3380             }
   3381             kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_NOTIFY);
   3382         }
   3383         if (has_msr_hv_runtime) {
   3384             kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, env->msr_hv_runtime);
   3385         }
   3386         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)
   3387             && hv_vpindex_settable) {
   3388             kvm_msr_entry_add(cpu, HV_X64_MSR_VP_INDEX,
   3389                               hyperv_vp_index(CPU(cpu)));
   3390         }
   3391         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
   3392             int j;
   3393 
   3394             kvm_msr_entry_add(cpu, HV_X64_MSR_SVERSION, HV_SYNIC_VERSION);
   3395 
   3396             kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL,
   3397                               env->msr_hv_synic_control);
   3398             kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP,
   3399                               env->msr_hv_synic_evt_page);
   3400             kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP,
   3401                               env->msr_hv_synic_msg_page);
   3402 
   3403             for (j = 0; j < ARRAY_SIZE(env->msr_hv_synic_sint); j++) {
   3404                 kvm_msr_entry_add(cpu, HV_X64_MSR_SINT0 + j,
   3405                                   env->msr_hv_synic_sint[j]);
   3406             }
   3407         }
   3408         if (has_msr_hv_stimer) {
   3409             int j;
   3410 
   3411             for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_config); j++) {
   3412                 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_CONFIG + j * 2,
   3413                                 env->msr_hv_stimer_config[j]);
   3414             }
   3415 
   3416             for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_count); j++) {
   3417                 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_COUNT + j * 2,
   3418                                 env->msr_hv_stimer_count[j]);
   3419             }
   3420         }
   3421         if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
   3422             uint64_t phys_mask = MAKE_64BIT_MASK(0, cpu->phys_bits);
   3423 
   3424             kvm_msr_entry_add(cpu, MSR_MTRRdefType, env->mtrr_deftype);
   3425             kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, env->mtrr_fixed[0]);
   3426             kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, env->mtrr_fixed[1]);
   3427             kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, env->mtrr_fixed[2]);
   3428             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, env->mtrr_fixed[3]);
   3429             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, env->mtrr_fixed[4]);
   3430             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, env->mtrr_fixed[5]);
   3431             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, env->mtrr_fixed[6]);
   3432             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, env->mtrr_fixed[7]);
   3433             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, env->mtrr_fixed[8]);
   3434             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, env->mtrr_fixed[9]);
   3435             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, env->mtrr_fixed[10]);
   3436             for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
   3437                 /* The CPU GPs if we write to a bit above the physical limit of
   3438                  * the host CPU (and KVM emulates that)
   3439                  */
   3440                 uint64_t mask = env->mtrr_var[i].mask;
   3441                 mask &= phys_mask;
   3442 
   3443                 kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i),
   3444                                   env->mtrr_var[i].base);
   3445                 kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), mask);
   3446             }
   3447         }
   3448         if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
   3449             int addr_num = kvm_arch_get_supported_cpuid(kvm_state,
   3450                                                     0x14, 1, R_EAX) & 0x7;
   3451 
   3452             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL,
   3453                             env->msr_rtit_ctrl);
   3454             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS,
   3455                             env->msr_rtit_status);
   3456             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE,
   3457                             env->msr_rtit_output_base);
   3458             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK,
   3459                             env->msr_rtit_output_mask);
   3460             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH,
   3461                             env->msr_rtit_cr3_match);
   3462             for (i = 0; i < addr_num; i++) {
   3463                 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i,
   3464                             env->msr_rtit_addrs[i]);
   3465             }
   3466         }
   3467 
   3468         if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) {
   3469             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0,
   3470                               env->msr_ia32_sgxlepubkeyhash[0]);
   3471             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1,
   3472                               env->msr_ia32_sgxlepubkeyhash[1]);
   3473             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2,
   3474                               env->msr_ia32_sgxlepubkeyhash[2]);
   3475             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3,
   3476                               env->msr_ia32_sgxlepubkeyhash[3]);
   3477         }
   3478 
   3479         if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) {
   3480             kvm_msr_entry_add(cpu, MSR_IA32_XFD,
   3481                               env->msr_xfd);
   3482             kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR,
   3483                               env->msr_xfd_err);
   3484         }
   3485 
   3486         if (kvm_enabled() && cpu->enable_pmu &&
   3487             (env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_ARCH_LBR)) {
   3488             uint64_t depth;
   3489             int i, ret;
   3490 
   3491             /*
   3492              * Only migrate Arch LBR state when the host Arch LBR depth
   3493              * equals that of the source guest; this avoids a guest/host
   3494              * mismatch in the MSR configuration and the unexpected
   3495              * misbehavior it would cause.
   3496              */
   3497             ret = kvm_get_one_msr(cpu, MSR_ARCH_LBR_DEPTH, &depth);
   3498 
   3499             if (ret == 1 && !!depth && depth == env->msr_lbr_depth) {
   3500                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_CTL, env->msr_lbr_ctl);
   3501                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_DEPTH, env->msr_lbr_depth);
   3502 
   3503                 for (i = 0; i < ARCH_LBR_NR_ENTRIES; i++) {
   3504                     if (!env->lbr_records[i].from) {
   3505                         continue;
   3506                     }
   3507                     kvm_msr_entry_add(cpu, MSR_ARCH_LBR_FROM_0 + i,
   3508                                       env->lbr_records[i].from);
   3509                     kvm_msr_entry_add(cpu, MSR_ARCH_LBR_TO_0 + i,
   3510                                       env->lbr_records[i].to);
   3511                     kvm_msr_entry_add(cpu, MSR_ARCH_LBR_INFO_0 + i,
   3512                                       env->lbr_records[i].info);
   3513                 }
   3514             }
   3515         }
   3516 
   3517         /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
   3518          *       kvm_put_msr_feature_control. */
   3519     }
   3520 
   3521     if (env->mcg_cap) {
   3522         int i;
   3523 
   3524         kvm_msr_entry_add(cpu, MSR_MCG_STATUS, env->mcg_status);
   3525         kvm_msr_entry_add(cpu, MSR_MCG_CTL, env->mcg_ctl);
   3526         if (has_msr_mcg_ext_ctl) {
   3527             kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, env->mcg_ext_ctl);
   3528         }
   3529         for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
   3530             kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, env->mce_banks[i]);
   3531         }
   3532     }
   3533 
   3534     return kvm_buf_set_msrs(cpu);
   3535 }
   3536 
   3537 
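        /*
         * Read the legacy FPU/SSE state with KVM_GET_FPU and unpack it into
         * env; kvm_get_xsave() falls back to this when XSAVE is unavailable.
         */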
   3538 static int kvm_get_fpu(X86CPU *cpu)
   3539 {
   3540     CPUX86State *env = &cpu->env;
   3541     struct kvm_fpu fpu;
   3542     int i, ret;
   3543 
   3544     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_FPU, &fpu);
   3545     if (ret < 0) {
   3546         return ret;
   3547     }
   3548 
   3549     env->fpstt = (fpu.fsw >> 11) & 7;
   3550     env->fpus = fpu.fsw;
   3551     env->fpuc = fpu.fcw;
   3552     env->fpop = fpu.last_opcode;
   3553     env->fpip = fpu.last_ip;
   3554     env->fpdp = fpu.last_dp;
   3555     for (i = 0; i < 8; ++i) {
   3556         env->fptags[i] = !((fpu.ftwx >> i) & 1);
   3557     }
   3558     memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
   3559     for (i = 0; i < CPU_NB_REGS; i++) {
   3560         env->xmm_regs[i].ZMM_Q(0) = ldq_p(&fpu.xmm[i][0]);
   3561         env->xmm_regs[i].ZMM_Q(1) = ldq_p(&fpu.xmm[i][8]);
   3562     }
   3563     env->mxcsr = fpu.mxcsr;
   3564 
   3565     return 0;
   3566 }
   3567 
   3568 static int kvm_get_xsave(X86CPU *cpu)
   3569 {
   3570     CPUX86State *env = &cpu->env;
   3571     void *xsave = env->xsave_buf;
   3572     int type, ret;
   3573 
   3574     if (!has_xsave) {
   3575         return kvm_get_fpu(cpu);
   3576     }
   3577 
   3578     type = has_xsave2 ? KVM_GET_XSAVE2 : KVM_GET_XSAVE;
   3579     ret = kvm_vcpu_ioctl(CPU(cpu), type, xsave);
   3580     if (ret < 0) {
   3581         return ret;
   3582     }
   3583     x86_cpu_xrstor_all_areas(cpu, xsave, env->xsave_buf_len);
   3584 
   3585     return 0;
   3586 }
   3587 
   3588 static int kvm_get_xcrs(X86CPU *cpu)
   3589 {
   3590     CPUX86State *env = &cpu->env;
   3591     int i, ret;
   3592     struct kvm_xcrs xcrs;
   3593 
   3594     if (!has_xcrs) {
   3595         return 0;
   3596     }
   3597 
   3598     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XCRS, &xcrs);
   3599     if (ret < 0) {
   3600         return ret;
   3601     }
   3602 
   3603     for (i = 0; i < xcrs.nr_xcrs; i++) {
   3604         /* Only XCR0 is supported for now */
   3605         if (xcrs.xcrs[i].xcr == 0) {
   3606             env->xcr0 = xcrs.xcrs[i].value;
   3607             break;
   3608         }
   3609     }
   3610     return 0;
   3611 }
   3612 
   3613 static int kvm_get_sregs(X86CPU *cpu)
   3614 {
   3615     CPUX86State *env = &cpu->env;
   3616     struct kvm_sregs sregs;
   3617     int ret;
   3618 
   3619     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
   3620     if (ret < 0) {
   3621         return ret;
   3622     }
   3623 
   3624     /*
   3625      * The interrupt_bitmap is ignored because KVM_GET_SREGS is
   3626      * always preceded by KVM_GET_VCPU_EVENTS.
   3627      */
   3628 
   3629     get_seg(&env->segs[R_CS], &sregs.cs);
   3630     get_seg(&env->segs[R_DS], &sregs.ds);
   3631     get_seg(&env->segs[R_ES], &sregs.es);
   3632     get_seg(&env->segs[R_FS], &sregs.fs);
   3633     get_seg(&env->segs[R_GS], &sregs.gs);
   3634     get_seg(&env->segs[R_SS], &sregs.ss);
   3635 
   3636     get_seg(&env->tr, &sregs.tr);
   3637     get_seg(&env->ldt, &sregs.ldt);
   3638 
   3639     env->idt.limit = sregs.idt.limit;
   3640     env->idt.base = sregs.idt.base;
   3641     env->gdt.limit = sregs.gdt.limit;
   3642     env->gdt.base = sregs.gdt.base;
   3643 
   3644     env->cr[0] = sregs.cr0;
   3645     env->cr[2] = sregs.cr2;
   3646     env->cr[3] = sregs.cr3;
   3647     env->cr[4] = sregs.cr4;
   3648 
   3649     env->efer = sregs.efer;
   3650 
   3651     /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
   3652     x86_update_hflags(env);
   3653 
   3654     return 0;
   3655 }
   3656 
   3657 static int kvm_get_sregs2(X86CPU *cpu)
   3658 {
   3659     CPUX86State *env = &cpu->env;
   3660     struct kvm_sregs2 sregs;
   3661     int i, ret;
   3662 
   3663     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS2, &sregs);
   3664     if (ret < 0) {
   3665         return ret;
   3666     }
   3667 
   3668     get_seg(&env->segs[R_CS], &sregs.cs);
   3669     get_seg(&env->segs[R_DS], &sregs.ds);
   3670     get_seg(&env->segs[R_ES], &sregs.es);
   3671     get_seg(&env->segs[R_FS], &sregs.fs);
   3672     get_seg(&env->segs[R_GS], &sregs.gs);
   3673     get_seg(&env->segs[R_SS], &sregs.ss);
   3674 
   3675     get_seg(&env->tr, &sregs.tr);
   3676     get_seg(&env->ldt, &sregs.ldt);
   3677 
   3678     env->idt.limit = sregs.idt.limit;
   3679     env->idt.base = sregs.idt.base;
   3680     env->gdt.limit = sregs.gdt.limit;
   3681     env->gdt.base = sregs.gdt.base;
   3682 
   3683     env->cr[0] = sregs.cr0;
   3684     env->cr[2] = sregs.cr2;
   3685     env->cr[3] = sregs.cr3;
   3686     env->cr[4] = sregs.cr4;
   3687 
   3688     env->efer = sregs.efer;
   3689 
   3690     env->pdptrs_valid = sregs.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
   3691 
   3692     if (env->pdptrs_valid) {
   3693         for (i = 0; i < 4; i++) {
   3694             env->pdptrs[i] = sregs.pdptrs[i];
   3695         }
   3696     }
   3697 
   3698     /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
   3699     x86_update_hflags(env);
   3700 
   3701     return 0;
   3702 }
   3703 
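        /*
         * Read the MSR state back from KVM into env: the buffer is filled
         * with the indices to query, KVM_GET_MSRS is issued once, and the
         * returned entries are scattered into the corresponding CPUX86State
         * fields.
         */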
   3704 static int kvm_get_msrs(X86CPU *cpu)
   3705 {
   3706     CPUX86State *env = &cpu->env;
   3707     struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries;
   3708     int ret, i;
   3709     uint64_t mtrr_top_bits;
   3710 
   3711     kvm_msr_buf_reset(cpu);
   3712 
   3713     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, 0);
   3714     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, 0);
   3715     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, 0);
   3716     kvm_msr_entry_add(cpu, MSR_PAT, 0);
   3717     if (has_msr_star) {
   3718         kvm_msr_entry_add(cpu, MSR_STAR, 0);
   3719     }
   3720     if (has_msr_hsave_pa) {
   3721         kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, 0);
   3722     }
   3723     if (has_msr_tsc_aux) {
   3724         kvm_msr_entry_add(cpu, MSR_TSC_AUX, 0);
   3725     }
   3726     if (has_msr_tsc_adjust) {
   3727         kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, 0);
   3728     }
   3729     if (has_msr_tsc_deadline) {
   3730         kvm_msr_entry_add(cpu, MSR_IA32_TSCDEADLINE, 0);
   3731     }
   3732     if (has_msr_misc_enable) {
   3733         kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE, 0);
   3734     }
   3735     if (has_msr_smbase) {
   3736         kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, 0);
   3737     }
   3738     if (has_msr_smi_count) {
   3739         kvm_msr_entry_add(cpu, MSR_SMI_COUNT, 0);
   3740     }
   3741     if (has_msr_feature_control) {
   3742         kvm_msr_entry_add(cpu, MSR_IA32_FEATURE_CONTROL, 0);
   3743     }
   3744     if (has_msr_pkrs) {
   3745         kvm_msr_entry_add(cpu, MSR_IA32_PKRS, 0);
   3746     }
   3747     if (has_msr_bndcfgs) {
   3748         kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, 0);
   3749     }
   3750     if (has_msr_xss) {
   3751         kvm_msr_entry_add(cpu, MSR_IA32_XSS, 0);
   3752     }
   3753     if (has_msr_umwait) {
   3754         kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, 0);
   3755     }
   3756     if (has_msr_spec_ctrl) {
   3757         kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, 0);
   3758     }
   3759     if (has_tsc_scale_msr) {
   3760         kvm_msr_entry_add(cpu, MSR_AMD64_TSC_RATIO, 0);
   3761     }
   3762 
   3763     if (has_msr_tsx_ctrl) {
   3764         kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, 0);
   3765     }
   3766     if (has_msr_virt_ssbd) {
   3767         kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, 0);
   3768     }
   3769     if (!env->tsc_valid) {
   3770         kvm_msr_entry_add(cpu, MSR_IA32_TSC, 0);
   3771         env->tsc_valid = !runstate_is_running();
   3772     }
   3773 
   3774 #ifdef TARGET_X86_64
   3775     if (lm_capable_kernel) {
   3776         kvm_msr_entry_add(cpu, MSR_CSTAR, 0);
   3777         kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, 0);
   3778         kvm_msr_entry_add(cpu, MSR_FMASK, 0);
   3779         kvm_msr_entry_add(cpu, MSR_LSTAR, 0);
   3780     }
   3781 #endif
   3782     kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0);
   3783     kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, 0);
   3784     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
   3785         kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, 0);
   3786     }
   3787     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
   3788         kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, 0);
   3789     }
   3790     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
   3791         kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, 0);
   3792     }
   3793     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
   3794         kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, 0);
   3795     }
   3796     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
   3797         kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1);
   3798     }
   3799     if (has_architectural_pmu_version > 0) {
   3800         if (has_architectural_pmu_version > 1) {
   3801             kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
   3802             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
   3803             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS, 0);
   3804             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 0);
   3805         }
   3806         for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
   3807             kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i, 0);
   3808         }
   3809         for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
   3810             kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i, 0);
   3811             kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i, 0);
   3812         }
   3813     }
   3814 
   3815     if (env->mcg_cap) {
   3816         kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0);
   3817         kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0);
   3818         if (has_msr_mcg_ext_ctl) {
   3819             kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, 0);
   3820         }
   3821         for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
   3822             kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, 0);
   3823         }
   3824     }
   3825 
   3826     if (has_msr_hv_hypercall) {
   3827         kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL, 0);
   3828         kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID, 0);
   3829     }
   3830     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
   3831         kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, 0);
   3832     }
   3833     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
   3834         kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC, 0);
   3835     }
   3836     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
   3837         kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0);
   3838         kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL, 0);
   3839         kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, 0);
   3840     }
   3841     if (has_msr_hv_syndbg_options) {
   3842         kvm_msr_entry_add(cpu, HV_X64_MSR_SYNDBG_OPTIONS, 0);
   3843     }
   3844     if (has_msr_hv_crash) {
   3845         int j;
   3846 
   3847         for (j = 0; j < HV_CRASH_PARAMS; j++) {
   3848             kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j, 0);
   3849         }
   3850     }
   3851     if (has_msr_hv_runtime) {
   3852         kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, 0);
   3853     }
   3854     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
   3855         uint32_t msr;
   3856 
   3857         kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL, 0);
   3858         kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP, 0);
   3859         kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP, 0);
   3860         for (msr = HV_X64_MSR_SINT0; msr <= HV_X64_MSR_SINT15; msr++) {
   3861             kvm_msr_entry_add(cpu, msr, 0);
   3862         }
   3863     }
   3864     if (has_msr_hv_stimer) {
   3865         uint32_t msr;
   3866 
   3867         for (msr = HV_X64_MSR_STIMER0_CONFIG; msr <= HV_X64_MSR_STIMER3_COUNT;
   3868              msr++) {
   3869             kvm_msr_entry_add(cpu, msr, 0);
   3870         }
   3871     }
   3872     if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
   3873         kvm_msr_entry_add(cpu, MSR_MTRRdefType, 0);
   3874         kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, 0);
   3875         kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, 0);
   3876         kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, 0);
   3877         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, 0);
   3878         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, 0);
   3879         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, 0);
   3880         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, 0);
   3881         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, 0);
   3882         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, 0);
   3883         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, 0);
   3884         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, 0);
   3885         for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
   3886             kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i), 0);
   3887             kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), 0);
   3888         }
   3889     }
   3890 
   3891     if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
   3892         int addr_num =
   3893             kvm_arch_get_supported_cpuid(kvm_state, 0x14, 1, R_EAX) & 0x7;
   3894 
   3895         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL, 0);
   3896         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS, 0);
   3897         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE, 0);
   3898         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK, 0);
   3899         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH, 0);
   3900         for (i = 0; i < addr_num; i++) {
   3901             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i, 0);
   3902         }
   3903     }
   3904 
   3905     if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) {
   3906         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0, 0);
   3907         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1, 0);
   3908         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2, 0);
   3909         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3, 0);
   3910     }
   3911 
   3912     if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) {
   3913         kvm_msr_entry_add(cpu, MSR_IA32_XFD, 0);
   3914         kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR, 0);
   3915     }
   3916 
   3917     if (kvm_enabled() && cpu->enable_pmu &&
   3918         (env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_ARCH_LBR)) {
   3919         uint64_t depth;
   3920         int i, ret;
   3921 
   3922         ret = kvm_get_one_msr(cpu, MSR_ARCH_LBR_DEPTH, &depth);
   3923         if (ret == 1 && depth == ARCH_LBR_NR_ENTRIES) {
   3924             kvm_msr_entry_add(cpu, MSR_ARCH_LBR_CTL, 0);
   3925             kvm_msr_entry_add(cpu, MSR_ARCH_LBR_DEPTH, 0);
   3926 
   3927             for (i = 0; i < ARCH_LBR_NR_ENTRIES; i++) {
   3928                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_FROM_0 + i, 0);
   3929                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_TO_0 + i, 0);
   3930                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_INFO_0 + i, 0);
   3931             }
   3932         }
   3933     }
   3934 
   3935     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf);
   3936     if (ret < 0) {
   3937         return ret;
   3938     }
   3939 
   3940     if (ret < cpu->kvm_msr_buf->nmsrs) {
   3941         struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
   3942         error_report("error: failed to get MSR 0x%" PRIx32,
   3943                      (uint32_t)e->index);
   3944     }
   3945 
   3946     assert(ret == cpu->kvm_msr_buf->nmsrs);
   3947     /*
   3948      * MTRR masks: Each mask consists of 5 parts
   3949      * a  10..0  : must be zero
   3950      * b  11     : valid bit
   3951      * c  n-1..12: actual mask bits
   3952      * d  51..n  : reserved, must be zero
   3953      * e  63..52 : reserved, must be zero
   3954      *
   3955      * 'n' is the number of physical bits supported by the CPU and is
   3956      * apparently always <= 52.  We know our 'n' but don't know what
   3957      * the destination's 'n' is; it might be smaller, in which case
   3958      * it masks (c) on loading.  It might be larger, in which case
   3959      * we fill 'd' so that d..c is consistent irrespective of the 'n'
   3960      * we're migrating to.
   3961      */
   3962 
   3963     if (cpu->fill_mtrr_mask) {
   3964         QEMU_BUILD_BUG_ON(TARGET_PHYS_ADDR_SPACE_BITS > 52);
   3965         assert(cpu->phys_bits <= TARGET_PHYS_ADDR_SPACE_BITS);
   3966         mtrr_top_bits = MAKE_64BIT_MASK(cpu->phys_bits, 52 - cpu->phys_bits);
   3967     } else {
   3968         mtrr_top_bits = 0;
   3969     }
   3970 
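            /*
             * Example: with fill_mtrr_mask set and cpu->phys_bits == 40,
             * mtrr_top_bits covers bits 51..40, which are ORed into every
             * variable-range mask read back in the loop below.
             */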
   3971     for (i = 0; i < ret; i++) {
   3972         uint32_t index = msrs[i].index;
   3973         switch (index) {
   3974         case MSR_IA32_SYSENTER_CS:
   3975             env->sysenter_cs = msrs[i].data;
   3976             break;
   3977         case MSR_IA32_SYSENTER_ESP:
   3978             env->sysenter_esp = msrs[i].data;
   3979             break;
   3980         case MSR_IA32_SYSENTER_EIP:
   3981             env->sysenter_eip = msrs[i].data;
   3982             break;
   3983         case MSR_PAT:
   3984             env->pat = msrs[i].data;
   3985             break;
   3986         case MSR_STAR:
   3987             env->star = msrs[i].data;
   3988             break;
   3989 #ifdef TARGET_X86_64
   3990         case MSR_CSTAR:
   3991             env->cstar = msrs[i].data;
   3992             break;
   3993         case MSR_KERNELGSBASE:
   3994             env->kernelgsbase = msrs[i].data;
   3995             break;
   3996         case MSR_FMASK:
   3997             env->fmask = msrs[i].data;
   3998             break;
   3999         case MSR_LSTAR:
   4000             env->lstar = msrs[i].data;
   4001             break;
   4002 #endif
   4003         case MSR_IA32_TSC:
   4004             env->tsc = msrs[i].data;
   4005             break;
   4006         case MSR_TSC_AUX:
   4007             env->tsc_aux = msrs[i].data;
   4008             break;
   4009         case MSR_TSC_ADJUST:
   4010             env->tsc_adjust = msrs[i].data;
   4011             break;
   4012         case MSR_IA32_TSCDEADLINE:
   4013             env->tsc_deadline = msrs[i].data;
   4014             break;
   4015         case MSR_VM_HSAVE_PA:
   4016             env->vm_hsave = msrs[i].data;
   4017             break;
   4018         case MSR_KVM_SYSTEM_TIME:
   4019             env->system_time_msr = msrs[i].data;
   4020             break;
   4021         case MSR_KVM_WALL_CLOCK:
   4022             env->wall_clock_msr = msrs[i].data;
   4023             break;
   4024         case MSR_MCG_STATUS:
   4025             env->mcg_status = msrs[i].data;
   4026             break;
   4027         case MSR_MCG_CTL:
   4028             env->mcg_ctl = msrs[i].data;
   4029             break;
   4030         case MSR_MCG_EXT_CTL:
   4031             env->mcg_ext_ctl = msrs[i].data;
   4032             break;
   4033         case MSR_IA32_MISC_ENABLE:
   4034             env->msr_ia32_misc_enable = msrs[i].data;
   4035             break;
   4036         case MSR_IA32_SMBASE:
   4037             env->smbase = msrs[i].data;
   4038             break;
   4039         case MSR_SMI_COUNT:
   4040             env->msr_smi_count = msrs[i].data;
   4041             break;
   4042         case MSR_IA32_FEATURE_CONTROL:
   4043             env->msr_ia32_feature_control = msrs[i].data;
   4044             break;
   4045         case MSR_IA32_BNDCFGS:
   4046             env->msr_bndcfgs = msrs[i].data;
   4047             break;
   4048         case MSR_IA32_XSS:
   4049             env->xss = msrs[i].data;
   4050             break;
   4051         case MSR_IA32_UMWAIT_CONTROL:
   4052             env->umwait = msrs[i].data;
   4053             break;
   4054         case MSR_IA32_PKRS:
   4055             env->pkrs = msrs[i].data;
   4056             break;
   4057         default:
   4058             if (msrs[i].index >= MSR_MC0_CTL &&
   4059                 msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
   4060                 env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
   4061             }
   4062             break;
   4063         case MSR_KVM_ASYNC_PF_EN:
   4064             env->async_pf_en_msr = msrs[i].data;
   4065             break;
   4066         case MSR_KVM_ASYNC_PF_INT:
   4067             env->async_pf_int_msr = msrs[i].data;
   4068             break;
   4069         case MSR_KVM_PV_EOI_EN:
   4070             env->pv_eoi_en_msr = msrs[i].data;
   4071             break;
   4072         case MSR_KVM_STEAL_TIME:
   4073             env->steal_time_msr = msrs[i].data;
   4074             break;
   4075         case MSR_KVM_POLL_CONTROL: {
   4076             env->poll_control_msr = msrs[i].data;
   4077             break;
   4078         }
   4079         case MSR_CORE_PERF_FIXED_CTR_CTRL:
   4080             env->msr_fixed_ctr_ctrl = msrs[i].data;
   4081             break;
   4082         case MSR_CORE_PERF_GLOBAL_CTRL:
   4083             env->msr_global_ctrl = msrs[i].data;
   4084             break;
   4085         case MSR_CORE_PERF_GLOBAL_STATUS:
   4086             env->msr_global_status = msrs[i].data;
   4087             break;
   4088         case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
   4089             env->msr_global_ovf_ctrl = msrs[i].data;
   4090             break;
   4091         case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR0 + MAX_FIXED_COUNTERS - 1:
   4092             env->msr_fixed_counters[index - MSR_CORE_PERF_FIXED_CTR0] = msrs[i].data;
   4093             break;
   4094         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR0 + MAX_GP_COUNTERS - 1:
   4095             env->msr_gp_counters[index - MSR_P6_PERFCTR0] = msrs[i].data;
   4096             break;
   4097         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
   4098             env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
   4099             break;
   4100         case HV_X64_MSR_HYPERCALL:
   4101             env->msr_hv_hypercall = msrs[i].data;
   4102             break;
   4103         case HV_X64_MSR_GUEST_OS_ID:
   4104             env->msr_hv_guest_os_id = msrs[i].data;
   4105             break;
   4106         case HV_X64_MSR_APIC_ASSIST_PAGE:
   4107             env->msr_hv_vapic = msrs[i].data;
   4108             break;
   4109         case HV_X64_MSR_REFERENCE_TSC:
   4110             env->msr_hv_tsc = msrs[i].data;
   4111             break;
   4112         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
   4113             env->msr_hv_crash_params[index - HV_X64_MSR_CRASH_P0] = msrs[i].data;
   4114             break;
   4115         case HV_X64_MSR_VP_RUNTIME:
   4116             env->msr_hv_runtime = msrs[i].data;
   4117             break;
   4118         case HV_X64_MSR_SCONTROL:
   4119             env->msr_hv_synic_control = msrs[i].data;
   4120             break;
   4121         case HV_X64_MSR_SIEFP:
   4122             env->msr_hv_synic_evt_page = msrs[i].data;
   4123             break;
   4124         case HV_X64_MSR_SIMP:
   4125             env->msr_hv_synic_msg_page = msrs[i].data;
   4126             break;
   4127         case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
   4128             env->msr_hv_synic_sint[index - HV_X64_MSR_SINT0] = msrs[i].data;
   4129             break;
   4130         case HV_X64_MSR_STIMER0_CONFIG:
   4131         case HV_X64_MSR_STIMER1_CONFIG:
   4132         case HV_X64_MSR_STIMER2_CONFIG:
   4133         case HV_X64_MSR_STIMER3_CONFIG:
   4134             env->msr_hv_stimer_config[(index - HV_X64_MSR_STIMER0_CONFIG)/2] =
   4135                                 msrs[i].data;
   4136             break;
   4137         case HV_X64_MSR_STIMER0_COUNT:
   4138         case HV_X64_MSR_STIMER1_COUNT:
   4139         case HV_X64_MSR_STIMER2_COUNT:
   4140         case HV_X64_MSR_STIMER3_COUNT:
   4141             env->msr_hv_stimer_count[(index - HV_X64_MSR_STIMER0_COUNT)/2] =
   4142                                 msrs[i].data;
   4143             break;
   4144         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
   4145             env->msr_hv_reenlightenment_control = msrs[i].data;
   4146             break;
   4147         case HV_X64_MSR_TSC_EMULATION_CONTROL:
   4148             env->msr_hv_tsc_emulation_control = msrs[i].data;
   4149             break;
   4150         case HV_X64_MSR_TSC_EMULATION_STATUS:
   4151             env->msr_hv_tsc_emulation_status = msrs[i].data;
   4152             break;
   4153         case HV_X64_MSR_SYNDBG_OPTIONS:
   4154             env->msr_hv_syndbg_options = msrs[i].data;
   4155             break;
   4156         case MSR_MTRRdefType:
   4157             env->mtrr_deftype = msrs[i].data;
   4158             break;
   4159         case MSR_MTRRfix64K_00000:
   4160             env->mtrr_fixed[0] = msrs[i].data;
   4161             break;
   4162         case MSR_MTRRfix16K_80000:
   4163             env->mtrr_fixed[1] = msrs[i].data;
   4164             break;
   4165         case MSR_MTRRfix16K_A0000:
   4166             env->mtrr_fixed[2] = msrs[i].data;
   4167             break;
   4168         case MSR_MTRRfix4K_C0000:
   4169             env->mtrr_fixed[3] = msrs[i].data;
   4170             break;
   4171         case MSR_MTRRfix4K_C8000:
   4172             env->mtrr_fixed[4] = msrs[i].data;
   4173             break;
   4174         case MSR_MTRRfix4K_D0000:
   4175             env->mtrr_fixed[5] = msrs[i].data;
   4176             break;
   4177         case MSR_MTRRfix4K_D8000:
   4178             env->mtrr_fixed[6] = msrs[i].data;
   4179             break;
   4180         case MSR_MTRRfix4K_E0000:
   4181             env->mtrr_fixed[7] = msrs[i].data;
   4182             break;
   4183         case MSR_MTRRfix4K_E8000:
   4184             env->mtrr_fixed[8] = msrs[i].data;
   4185             break;
   4186         case MSR_MTRRfix4K_F0000:
   4187             env->mtrr_fixed[9] = msrs[i].data;
   4188             break;
   4189         case MSR_MTRRfix4K_F8000:
   4190             env->mtrr_fixed[10] = msrs[i].data;
   4191             break;
   4192         case MSR_MTRRphysBase(0) ... MSR_MTRRphysMask(MSR_MTRRcap_VCNT - 1):
   4193             if (index & 1) {
   4194                 env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data |
   4195                                                                mtrr_top_bits;
   4196             } else {
   4197                 env->mtrr_var[MSR_MTRRphysIndex(index)].base = msrs[i].data;
   4198             }
   4199             break;
   4200         case MSR_IA32_SPEC_CTRL:
   4201             env->spec_ctrl = msrs[i].data;
   4202             break;
   4203         case MSR_AMD64_TSC_RATIO:
   4204             env->amd_tsc_scale_msr = msrs[i].data;
   4205             break;
   4206         case MSR_IA32_TSX_CTRL:
   4207             env->tsx_ctrl = msrs[i].data;
   4208             break;
   4209         case MSR_VIRT_SSBD:
   4210             env->virt_ssbd = msrs[i].data;
   4211             break;
   4212         case MSR_IA32_RTIT_CTL:
   4213             env->msr_rtit_ctrl = msrs[i].data;
   4214             break;
   4215         case MSR_IA32_RTIT_STATUS:
   4216             env->msr_rtit_status = msrs[i].data;
   4217             break;
   4218         case MSR_IA32_RTIT_OUTPUT_BASE:
   4219             env->msr_rtit_output_base = msrs[i].data;
   4220             break;
   4221         case MSR_IA32_RTIT_OUTPUT_MASK:
   4222             env->msr_rtit_output_mask = msrs[i].data;
   4223             break;
   4224         case MSR_IA32_RTIT_CR3_MATCH:
   4225             env->msr_rtit_cr3_match = msrs[i].data;
   4226             break;
   4227         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
   4228             env->msr_rtit_addrs[index - MSR_IA32_RTIT_ADDR0_A] = msrs[i].data;
   4229             break;
   4230         case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
   4231             env->msr_ia32_sgxlepubkeyhash[index - MSR_IA32_SGXLEPUBKEYHASH0] =
   4232                            msrs[i].data;
   4233             break;
   4234         case MSR_IA32_XFD:
   4235             env->msr_xfd = msrs[i].data;
   4236             break;
   4237         case MSR_IA32_XFD_ERR:
   4238             env->msr_xfd_err = msrs[i].data;
   4239             break;
   4240         case MSR_ARCH_LBR_CTL:
   4241             env->msr_lbr_ctl = msrs[i].data;
   4242             break;
   4243         case MSR_ARCH_LBR_DEPTH:
   4244             env->msr_lbr_depth = msrs[i].data;
   4245             break;
   4246         case MSR_ARCH_LBR_FROM_0 ... MSR_ARCH_LBR_FROM_0 + 31:
   4247             env->lbr_records[index - MSR_ARCH_LBR_FROM_0].from = msrs[i].data;
   4248             break;
   4249         case MSR_ARCH_LBR_TO_0 ... MSR_ARCH_LBR_TO_0 + 31:
   4250             env->lbr_records[index - MSR_ARCH_LBR_TO_0].to = msrs[i].data;
   4251             break;
   4252         case MSR_ARCH_LBR_INFO_0 ... MSR_ARCH_LBR_INFO_0 + 31:
   4253             env->lbr_records[index - MSR_ARCH_LBR_INFO_0].info = msrs[i].data;
   4254             break;
   4255         }
   4256     }
   4257 
   4258     return 0;
   4259 }
   4260 
   4261 static int kvm_put_mp_state(X86CPU *cpu)
   4262 {
   4263     struct kvm_mp_state mp_state = { .mp_state = cpu->env.mp_state };
   4264 
   4265     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state);
   4266 }
   4267 
   4268 static int kvm_get_mp_state(X86CPU *cpu)
   4269 {
   4270     CPUState *cs = CPU(cpu);
   4271     CPUX86State *env = &cpu->env;
   4272     struct kvm_mp_state mp_state;
   4273     int ret;
   4274 
   4275     ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state);
   4276     if (ret < 0) {
   4277         return ret;
   4278     }
   4279     env->mp_state = mp_state.mp_state;
   4280     if (kvm_irqchip_in_kernel()) {
   4281         cs->halted = (mp_state.mp_state == KVM_MP_STATE_HALTED);
   4282     }
   4283     return 0;
   4284 }
   4285 
   4286 static int kvm_get_apic(X86CPU *cpu)
   4287 {
   4288     DeviceState *apic = cpu->apic_state;
   4289     struct kvm_lapic_state kapic;
   4290     int ret;
   4291 
   4292     if (apic && kvm_irqchip_in_kernel()) {
   4293         ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_LAPIC, &kapic);
   4294         if (ret < 0) {
   4295             return ret;
   4296         }
   4297 
   4298         kvm_get_apic_state(apic, &kapic);
   4299     }
   4300     return 0;
   4301 }
   4302 
   4303 static int kvm_put_vcpu_events(X86CPU *cpu, int level)
   4304 {
   4305     CPUState *cs = CPU(cpu);
   4306     CPUX86State *env = &cpu->env;
   4307     struct kvm_vcpu_events events = {};
   4308 
   4309     if (!kvm_has_vcpu_events()) {
   4310         return 0;
   4311     }
   4312 
   4313     events.flags = 0;
   4314 
   4315     if (has_exception_payload) {
   4316         events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
   4317         events.exception.pending = env->exception_pending;
   4318         events.exception_has_payload = env->exception_has_payload;
   4319         events.exception_payload = env->exception_payload;
   4320     }
   4321     events.exception.nr = env->exception_nr;
   4322     events.exception.injected = env->exception_injected;
   4323     events.exception.has_error_code = env->has_error_code;
   4324     events.exception.error_code = env->error_code;
   4325 
   4326     events.interrupt.injected = (env->interrupt_injected >= 0);
   4327     events.interrupt.nr = env->interrupt_injected;
   4328     events.interrupt.soft = env->soft_interrupt;
   4329 
   4330     events.nmi.injected = env->nmi_injected;
   4331     events.nmi.pending = env->nmi_pending;
   4332     events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
   4333 
   4334     events.sipi_vector = env->sipi_vector;
   4335 
   4336     if (has_msr_smbase) {
   4337         events.smi.smm = !!(env->hflags & HF_SMM_MASK);
   4338         events.smi.smm_inside_nmi = !!(env->hflags2 & HF2_SMM_INSIDE_NMI_MASK);
   4339         if (kvm_irqchip_in_kernel()) {
   4340             /* As soon as these are moved to the kernel, remove them
   4341              * from cs->interrupt_request.
   4342              */
   4343             events.smi.pending = cs->interrupt_request & CPU_INTERRUPT_SMI;
   4344             events.smi.latched_init = cs->interrupt_request & CPU_INTERRUPT_INIT;
   4345             cs->interrupt_request &= ~(CPU_INTERRUPT_INIT | CPU_INTERRUPT_SMI);
   4346         } else {
   4347             /* Keep these in cs->interrupt_request.  */
   4348             events.smi.pending = 0;
   4349             events.smi.latched_init = 0;
   4350         }
   4351         /* Stop SMI delivery on old machine types to avoid a reboot
    4352          * on an incoming migration of an old VM.
   4353          */
   4354         if (!cpu->kvm_no_smi_migration) {
   4355             events.flags |= KVM_VCPUEVENT_VALID_SMM;
   4356         }
   4357     }
   4358 
   4359     if (level >= KVM_PUT_RESET_STATE) {
   4360         events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING;
   4361         if (env->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
   4362             events.flags |= KVM_VCPUEVENT_VALID_SIPI_VECTOR;
   4363         }
   4364     }
   4365 
   4366     if (has_triple_fault_event) {
   4367         events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
   4368         events.triple_fault.pending = env->triple_fault_pending;
   4369     }
   4370 
   4371     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events);
   4372 }
   4373 
   4374 static int kvm_get_vcpu_events(X86CPU *cpu)
   4375 {
   4376     CPUX86State *env = &cpu->env;
   4377     struct kvm_vcpu_events events;
   4378     int ret;
   4379 
   4380     if (!kvm_has_vcpu_events()) {
   4381         return 0;
   4382     }
   4383 
   4384     memset(&events, 0, sizeof(events));
   4385     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events);
   4386     if (ret < 0) {
   4387        return ret;
   4388     }
   4389 
   4390     if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
   4391         env->exception_pending = events.exception.pending;
   4392         env->exception_has_payload = events.exception_has_payload;
   4393         env->exception_payload = events.exception_payload;
   4394     } else {
   4395         env->exception_pending = 0;
   4396         env->exception_has_payload = false;
   4397     }
   4398     env->exception_injected = events.exception.injected;
   4399     env->exception_nr =
   4400         (env->exception_pending || env->exception_injected) ?
   4401         events.exception.nr : -1;
   4402     env->has_error_code = events.exception.has_error_code;
   4403     env->error_code = events.exception.error_code;
   4404 
   4405     env->interrupt_injected =
   4406         events.interrupt.injected ? events.interrupt.nr : -1;
   4407     env->soft_interrupt = events.interrupt.soft;
   4408 
   4409     env->nmi_injected = events.nmi.injected;
   4410     env->nmi_pending = events.nmi.pending;
   4411     if (events.nmi.masked) {
   4412         env->hflags2 |= HF2_NMI_MASK;
   4413     } else {
   4414         env->hflags2 &= ~HF2_NMI_MASK;
   4415     }
   4416 
   4417     if (events.flags & KVM_VCPUEVENT_VALID_SMM) {
   4418         if (events.smi.smm) {
   4419             env->hflags |= HF_SMM_MASK;
   4420         } else {
   4421             env->hflags &= ~HF_SMM_MASK;
   4422         }
   4423         if (events.smi.pending) {
   4424             cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
   4425         } else {
   4426             cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
   4427         }
   4428         if (events.smi.smm_inside_nmi) {
   4429             env->hflags2 |= HF2_SMM_INSIDE_NMI_MASK;
   4430         } else {
   4431             env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK;
   4432         }
   4433         if (events.smi.latched_init) {
   4434             cpu_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
   4435         } else {
   4436             cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
   4437         }
   4438     }
   4439 
   4440     if (events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
   4441         env->triple_fault_pending = events.triple_fault.pending;
   4442     }
   4443 
   4444     env->sipi_vector = events.sipi_vector;
   4445 
   4446     return 0;
   4447 }
   4448 
   4449 static int kvm_guest_debug_workarounds(X86CPU *cpu)
   4450 {
   4451     CPUState *cs = CPU(cpu);
   4452     CPUX86State *env = &cpu->env;
   4453     int ret = 0;
   4454     unsigned long reinject_trap = 0;
   4455 
   4456     if (!kvm_has_vcpu_events()) {
   4457         if (env->exception_nr == EXCP01_DB) {
   4458             reinject_trap = KVM_GUESTDBG_INJECT_DB;
   4459         } else if (env->exception_injected == EXCP03_INT3) {
   4460             reinject_trap = KVM_GUESTDBG_INJECT_BP;
   4461         }
   4462         kvm_reset_exception(env);
   4463     }
   4464 
   4465     /*
   4466      * Kernels before KVM_CAP_X86_ROBUST_SINGLESTEP overwrote flags.TF
   4467      * injected via SET_GUEST_DEBUG while updating GP regs. Work around this
   4468      * by updating the debug state once again if single-stepping is on.
   4469      * Another reason to call kvm_update_guest_debug here is a pending debug
    4470      * trap raised by the guest. On kernels without SET_VCPU_EVENTS we have to
   4471      * reinject them via SET_GUEST_DEBUG.
   4472      */
   4473     if (reinject_trap ||
   4474         (!kvm_has_robust_singlestep() && cs->singlestep_enabled)) {
   4475         ret = kvm_update_guest_debug(cs, reinject_trap);
   4476     }
   4477     return ret;
   4478 }
   4479 
   4480 static int kvm_put_debugregs(X86CPU *cpu)
   4481 {
   4482     CPUX86State *env = &cpu->env;
   4483     struct kvm_debugregs dbgregs;
   4484     int i;
   4485 
   4486     if (!kvm_has_debugregs()) {
   4487         return 0;
   4488     }
   4489 
   4490     memset(&dbgregs, 0, sizeof(dbgregs));
   4491     for (i = 0; i < 4; i++) {
   4492         dbgregs.db[i] = env->dr[i];
   4493     }
   4494     dbgregs.dr6 = env->dr[6];
   4495     dbgregs.dr7 = env->dr[7];
   4496     dbgregs.flags = 0;
   4497 
   4498     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_DEBUGREGS, &dbgregs);
   4499 }
   4500 
   4501 static int kvm_get_debugregs(X86CPU *cpu)
   4502 {
   4503     CPUX86State *env = &cpu->env;
   4504     struct kvm_debugregs dbgregs;
   4505     int i, ret;
   4506 
   4507     if (!kvm_has_debugregs()) {
   4508         return 0;
   4509     }
   4510 
   4511     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_DEBUGREGS, &dbgregs);
   4512     if (ret < 0) {
   4513         return ret;
   4514     }
   4515     for (i = 0; i < 4; i++) {
   4516         env->dr[i] = dbgregs.db[i];
   4517     }
   4518     env->dr[4] = env->dr[6] = dbgregs.dr6;
   4519     env->dr[5] = env->dr[7] = dbgregs.dr7;
   4520 
   4521     return 0;
   4522 }
   4523 
   4524 static int kvm_put_nested_state(X86CPU *cpu)
   4525 {
   4526     CPUX86State *env = &cpu->env;
   4527     int max_nested_state_len = kvm_max_nested_state_length();
   4528 
   4529     if (!env->nested_state) {
   4530         return 0;
   4531     }
   4532 
   4533     /*
   4534      * Copy flags that are affected by reset from env->hflags and env->hflags2.
   4535      */
   4536     if (env->hflags & HF_GUEST_MASK) {
   4537         env->nested_state->flags |= KVM_STATE_NESTED_GUEST_MODE;
   4538     } else {
   4539         env->nested_state->flags &= ~KVM_STATE_NESTED_GUEST_MODE;
   4540     }
   4541 
   4542     /* Don't set KVM_STATE_NESTED_GIF_SET on VMX as it is illegal */
   4543     if (cpu_has_svm(env) && (env->hflags2 & HF2_GIF_MASK)) {
   4544         env->nested_state->flags |= KVM_STATE_NESTED_GIF_SET;
   4545     } else {
   4546         env->nested_state->flags &= ~KVM_STATE_NESTED_GIF_SET;
   4547     }
   4548 
   4549     assert(env->nested_state->size <= max_nested_state_len);
   4550     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
   4551 }
   4552 
   4553 static int kvm_get_nested_state(X86CPU *cpu)
   4554 {
   4555     CPUX86State *env = &cpu->env;
   4556     int max_nested_state_len = kvm_max_nested_state_length();
   4557     int ret;
   4558 
   4559     if (!env->nested_state) {
   4560         return 0;
   4561     }
   4562 
   4563     /*
   4564      * It is possible that migration restored a smaller size into
    4565      * nested_state->hdr.size than what our kernel supports.
    4566      * We preserve the migration origin's nested_state->hdr.size for
    4567      * the call to KVM_SET_NESTED_STATE, but want our next call
    4568      * to KVM_GET_NESTED_STATE to use the maximum size our kernel supports.
   4569      */
   4570     env->nested_state->size = max_nested_state_len;
   4571 
   4572     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state);
   4573     if (ret < 0) {
   4574         return ret;
   4575     }
   4576 
   4577     /*
   4578      * Copy flags that are affected by reset to env->hflags and env->hflags2.
   4579      */
   4580     if (env->nested_state->flags & KVM_STATE_NESTED_GUEST_MODE) {
   4581         env->hflags |= HF_GUEST_MASK;
   4582     } else {
   4583         env->hflags &= ~HF_GUEST_MASK;
   4584     }
   4585 
   4586     /* Keep HF2_GIF_MASK set on !SVM as x86_cpu_pending_interrupt() needs it */
   4587     if (cpu_has_svm(env)) {
   4588         if (env->nested_state->flags & KVM_STATE_NESTED_GIF_SET) {
   4589             env->hflags2 |= HF2_GIF_MASK;
   4590         } else {
   4591             env->hflags2 &= ~HF2_GIF_MASK;
   4592         }
   4593     }
   4594 
   4595     return ret;
   4596 }
   4597 
   4598 int kvm_arch_put_registers(CPUState *cpu, int level)
   4599 {
   4600     X86CPU *x86_cpu = X86_CPU(cpu);
   4601     int ret;
   4602 
   4603     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
   4604 
   4605     /*
    4606      * Put MSR_IA32_FEATURE_CONTROL first; this ensures the VM gets out of VMX
    4607      * root operation upon vCPU reset. kvm_put_msr_feature_control() should also
    4608      * precede kvm_put_nested_state() when 'real' nested state is set.
   4609      */
   4610     if (level >= KVM_PUT_RESET_STATE) {
   4611         ret = kvm_put_msr_feature_control(x86_cpu);
   4612         if (ret < 0) {
   4613             return ret;
   4614         }
   4615     }
   4616 
   4617     /* must be before kvm_put_nested_state so that EFER.SVME is set */
   4618     ret = has_sregs2 ? kvm_put_sregs2(x86_cpu) : kvm_put_sregs(x86_cpu);
   4619     if (ret < 0) {
   4620         return ret;
   4621     }
   4622 
   4623     if (level >= KVM_PUT_RESET_STATE) {
   4624         ret = kvm_put_nested_state(x86_cpu);
   4625         if (ret < 0) {
   4626             return ret;
   4627         }
   4628     }
   4629 
   4630     if (level == KVM_PUT_FULL_STATE) {
   4631         /* We don't check for kvm_arch_set_tsc_khz() errors here,
   4632          * because TSC frequency mismatch shouldn't abort migration,
   4633          * unless the user explicitly asked for a more strict TSC
   4634          * setting (e.g. using an explicit "tsc-freq" option).
   4635          */
   4636         kvm_arch_set_tsc_khz(cpu);
   4637     }
   4638 
   4639     ret = kvm_getput_regs(x86_cpu, 1);
   4640     if (ret < 0) {
   4641         return ret;
   4642     }
   4643     ret = kvm_put_xsave(x86_cpu);
   4644     if (ret < 0) {
   4645         return ret;
   4646     }
   4647     ret = kvm_put_xcrs(x86_cpu);
   4648     if (ret < 0) {
   4649         return ret;
   4650     }
   4651     /* must be before kvm_put_msrs */
   4652     ret = kvm_inject_mce_oldstyle(x86_cpu);
   4653     if (ret < 0) {
   4654         return ret;
   4655     }
   4656     ret = kvm_put_msrs(x86_cpu, level);
   4657     if (ret < 0) {
   4658         return ret;
   4659     }
   4660     ret = kvm_put_vcpu_events(x86_cpu, level);
   4661     if (ret < 0) {
   4662         return ret;
   4663     }
   4664     if (level >= KVM_PUT_RESET_STATE) {
   4665         ret = kvm_put_mp_state(x86_cpu);
   4666         if (ret < 0) {
   4667             return ret;
   4668         }
   4669     }
   4670 
   4671     ret = kvm_put_tscdeadline_msr(x86_cpu);
   4672     if (ret < 0) {
   4673         return ret;
   4674     }
   4675     ret = kvm_put_debugregs(x86_cpu);
   4676     if (ret < 0) {
   4677         return ret;
   4678     }
   4679     /* must be last */
   4680     ret = kvm_guest_debug_workarounds(x86_cpu);
   4681     if (ret < 0) {
   4682         return ret;
   4683     }
   4684     return 0;
   4685 }
   4686 
   4687 int kvm_arch_get_registers(CPUState *cs)
   4688 {
   4689     X86CPU *cpu = X86_CPU(cs);
   4690     int ret;
   4691 
   4692     assert(cpu_is_stopped(cs) || qemu_cpu_is_self(cs));
   4693 
   4694     ret = kvm_get_vcpu_events(cpu);
   4695     if (ret < 0) {
   4696         goto out;
   4697     }
   4698     /*
    4699      * KVM_GET_MPSTATE can modify CS and RIP, so call it before
   4700      * KVM_GET_REGS and KVM_GET_SREGS.
   4701      */
   4702     ret = kvm_get_mp_state(cpu);
   4703     if (ret < 0) {
   4704         goto out;
   4705     }
   4706     ret = kvm_getput_regs(cpu, 0);
   4707     if (ret < 0) {
   4708         goto out;
   4709     }
   4710     ret = kvm_get_xsave(cpu);
   4711     if (ret < 0) {
   4712         goto out;
   4713     }
   4714     ret = kvm_get_xcrs(cpu);
   4715     if (ret < 0) {
   4716         goto out;
   4717     }
   4718     ret = has_sregs2 ? kvm_get_sregs2(cpu) : kvm_get_sregs(cpu);
   4719     if (ret < 0) {
   4720         goto out;
   4721     }
   4722     ret = kvm_get_msrs(cpu);
   4723     if (ret < 0) {
   4724         goto out;
   4725     }
   4726     ret = kvm_get_apic(cpu);
   4727     if (ret < 0) {
   4728         goto out;
   4729     }
   4730     ret = kvm_get_debugregs(cpu);
   4731     if (ret < 0) {
   4732         goto out;
   4733     }
   4734     ret = kvm_get_nested_state(cpu);
   4735     if (ret < 0) {
   4736         goto out;
   4737     }
   4738     ret = 0;
   4739  out:
   4740     cpu_sync_bndcs_hflags(&cpu->env);
   4741     return ret;
   4742 }
   4743 
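         /*
          * Runs on the vCPU thread just before KVM_RUN.  Injects any pending
          * NMI/SMI, and with a userspace irqchip also injects external
          * interrupts, requests an interrupt-window exit while an external
          * interrupt remains pending, and mirrors the APIC TPR into run->cr8.
          */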
   4744 void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
   4745 {
   4746     X86CPU *x86_cpu = X86_CPU(cpu);
   4747     CPUX86State *env = &x86_cpu->env;
   4748     int ret;
   4749 
   4750     /* Inject NMI */
   4751     if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
   4752         if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
   4753             qemu_mutex_lock_iothread();
   4754             cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
   4755             qemu_mutex_unlock_iothread();
   4756             DPRINTF("injected NMI\n");
   4757             ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
   4758             if (ret < 0) {
   4759                 fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
   4760                         strerror(-ret));
   4761             }
   4762         }
   4763         if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
   4764             qemu_mutex_lock_iothread();
   4765             cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
   4766             qemu_mutex_unlock_iothread();
   4767             DPRINTF("injected SMI\n");
   4768             ret = kvm_vcpu_ioctl(cpu, KVM_SMI);
   4769             if (ret < 0) {
   4770                 fprintf(stderr, "KVM: injection failed, SMI lost (%s)\n",
   4771                         strerror(-ret));
   4772             }
   4773         }
   4774     }
   4775 
   4776     if (!kvm_pic_in_kernel()) {
   4777         qemu_mutex_lock_iothread();
   4778     }
   4779 
   4780     /* Force the VCPU out of its inner loop to process any INIT requests
   4781      * or (for userspace APIC, but it is cheap to combine the checks here)
   4782      * pending TPR access reports.
   4783      */
   4784     if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
   4785         if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
   4786             !(env->hflags & HF_SMM_MASK)) {
   4787             cpu->exit_request = 1;
   4788         }
   4789         if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
   4790             cpu->exit_request = 1;
   4791         }
   4792     }
   4793 
   4794     if (!kvm_pic_in_kernel()) {
   4795         /* Try to inject an interrupt if the guest can accept it */
   4796         if (run->ready_for_interrupt_injection &&
   4797             (cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
   4798             (env->eflags & IF_MASK)) {
   4799             int irq;
   4800 
   4801             cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
   4802             irq = cpu_get_pic_interrupt(env);
   4803             if (irq >= 0) {
   4804                 struct kvm_interrupt intr;
   4805 
   4806                 intr.irq = irq;
   4807                 DPRINTF("injected interrupt %d\n", irq);
   4808                 ret = kvm_vcpu_ioctl(cpu, KVM_INTERRUPT, &intr);
   4809                 if (ret < 0) {
   4810                     fprintf(stderr,
   4811                             "KVM: injection failed, interrupt lost (%s)\n",
   4812                             strerror(-ret));
   4813                 }
   4814             }
   4815         }
   4816 
   4817         /* If we have an interrupt but the guest is not ready to receive an
   4818          * interrupt, request an interrupt window exit.  This will
   4819          * cause a return to userspace as soon as the guest is ready to
   4820          * receive interrupts. */
   4821         if ((cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
   4822             run->request_interrupt_window = 1;
   4823         } else {
   4824             run->request_interrupt_window = 0;
   4825         }
   4826 
   4827         DPRINTF("setting tpr\n");
   4828         run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state);
   4829 
   4830         qemu_mutex_unlock_iothread();
   4831     }
   4832 }
   4833 
   4834 static void kvm_rate_limit_on_bus_lock(void)
   4835 {
   4836     uint64_t delay_ns = ratelimit_calculate_delay(&bus_lock_ratelimit_ctrl, 1);
   4837 
   4838     if (delay_ns) {
   4839         g_usleep(delay_ns / SCALE_US);
   4840     }
   4841 }
   4842 
   4843 MemTxAttrs kvm_arch_post_run(CPUState *cpu, struct kvm_run *run)
   4844 {
   4845     X86CPU *x86_cpu = X86_CPU(cpu);
   4846     CPUX86State *env = &x86_cpu->env;
   4847 
   4848     if (run->flags & KVM_RUN_X86_SMM) {
   4849         env->hflags |= HF_SMM_MASK;
   4850     } else {
   4851         env->hflags &= ~HF_SMM_MASK;
   4852     }
   4853     if (run->if_flag) {
   4854         env->eflags |= IF_MASK;
   4855     } else {
   4856         env->eflags &= ~IF_MASK;
   4857     }
   4858     if (run->flags & KVM_RUN_X86_BUS_LOCK) {
   4859         kvm_rate_limit_on_bus_lock();
   4860     }
   4861 
   4862     /* We need to protect the apic state against concurrent accesses from
   4863      * different threads in case the userspace irqchip is used. */
   4864     if (!kvm_irqchip_in_kernel()) {
   4865         qemu_mutex_lock_iothread();
   4866     }
   4867     cpu_set_apic_tpr(x86_cpu->apic_state, run->cr8);
   4868     cpu_set_apic_base(x86_cpu->apic_state, run->apic_base);
   4869     if (!kvm_irqchip_in_kernel()) {
   4870         qemu_mutex_unlock_iothread();
   4871     }
   4872     return cpu_get_mem_attrs(env);
   4873 }
   4874 
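         /*
          * Handle interrupt requests that must be serviced outside KVM_RUN:
          * MCE injection and INIT, plus (with the userspace irqchip) APIC
          * polling, SIPI and TPR access reports.  Returns non-zero if the
          * vCPU should remain halted instead of re-entering the guest.
          */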
   4875 int kvm_arch_process_async_events(CPUState *cs)
   4876 {
   4877     X86CPU *cpu = X86_CPU(cs);
   4878     CPUX86State *env = &cpu->env;
   4879 
   4880     if (cs->interrupt_request & CPU_INTERRUPT_MCE) {
   4881         /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
   4882         assert(env->mcg_cap);
   4883 
   4884         cs->interrupt_request &= ~CPU_INTERRUPT_MCE;
   4885 
   4886         kvm_cpu_synchronize_state(cs);
   4887 
   4888         if (env->exception_nr == EXCP08_DBLE) {
   4889             /* this means triple fault */
   4890             qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
   4891             cs->exit_request = 1;
   4892             return 0;
   4893         }
   4894         kvm_queue_exception(env, EXCP12_MCHK, 0, 0);
   4895         env->has_error_code = 0;
   4896 
   4897         cs->halted = 0;
   4898         if (kvm_irqchip_in_kernel() && env->mp_state == KVM_MP_STATE_HALTED) {
   4899             env->mp_state = KVM_MP_STATE_RUNNABLE;
   4900         }
   4901     }
   4902 
   4903     if ((cs->interrupt_request & CPU_INTERRUPT_INIT) &&
   4904         !(env->hflags & HF_SMM_MASK)) {
   4905         kvm_cpu_synchronize_state(cs);
   4906         do_cpu_init(cpu);
   4907     }
   4908 
   4909     if (kvm_irqchip_in_kernel()) {
   4910         return 0;
   4911     }
   4912 
   4913     if (cs->interrupt_request & CPU_INTERRUPT_POLL) {
   4914         cs->interrupt_request &= ~CPU_INTERRUPT_POLL;
   4915         apic_poll_irq(cpu->apic_state);
   4916     }
   4917     if (((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
   4918          (env->eflags & IF_MASK)) ||
   4919         (cs->interrupt_request & CPU_INTERRUPT_NMI)) {
   4920         cs->halted = 0;
   4921     }
   4922     if (cs->interrupt_request & CPU_INTERRUPT_SIPI) {
   4923         kvm_cpu_synchronize_state(cs);
   4924         do_cpu_sipi(cpu);
   4925     }
   4926     if (cs->interrupt_request & CPU_INTERRUPT_TPR) {
   4927         cs->interrupt_request &= ~CPU_INTERRUPT_TPR;
   4928         kvm_cpu_synchronize_state(cs);
   4929         apic_handle_tpr_access_report(cpu->apic_state, env->eip,
   4930                                       env->tpr_access_type);
   4931     }
   4932 
   4933     return cs->halted;
   4934 }
   4935 
   4936 static int kvm_handle_halt(X86CPU *cpu)
   4937 {
   4938     CPUState *cs = CPU(cpu);
   4939     CPUX86State *env = &cpu->env;
   4940 
   4941     if (!((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
   4942           (env->eflags & IF_MASK)) &&
   4943         !(cs->interrupt_request & CPU_INTERRUPT_NMI)) {
   4944         cs->halted = 1;
   4945         return EXCP_HLT;
   4946     }
   4947 
   4948     return 0;
   4949 }
   4950 
   4951 static int kvm_handle_tpr_access(X86CPU *cpu)
   4952 {
   4953     CPUState *cs = CPU(cpu);
   4954     struct kvm_run *run = cs->kvm_run;
   4955 
   4956     apic_handle_tpr_access_report(cpu->apic_state, run->tpr_access.rip,
   4957                                   run->tpr_access.is_write ? TPR_ACCESS_WRITE
   4958                                                            : TPR_ACCESS_READ);
   4959     return 1;
   4960 }
   4961 
   4962 int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
   4963 {
   4964     static const uint8_t int3 = 0xcc;
   4965 
   4966     if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) ||
   4967         cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&int3, 1, 1)) {
   4968         return -EINVAL;
   4969     }
   4970     return 0;
   4971 }
   4972 
   4973 int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
   4974 {
   4975     uint8_t int3;
   4976 
   4977     if (cpu_memory_rw_debug(cs, bp->pc, &int3, 1, 0)) {
   4978         return -EINVAL;
   4979     }
   4980     if (int3 != 0xcc) {
   4981         return 0;
   4982     }
   4983     if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1)) {
   4984         return -EINVAL;
   4985     }
   4986     return 0;
   4987 }
   4988 
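         /*
          * Software copy of the guest-debug hardware breakpoints; x86 provides
          * four debug address registers (DR0-DR3), hence the fixed size.
          */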
   4989 static struct {
   4990     target_ulong addr;
   4991     int len;
   4992     int type;
   4993 } hw_breakpoint[4];
   4994 
   4995 static int nb_hw_breakpoint;
   4996 
   4997 static int find_hw_breakpoint(target_ulong addr, int len, int type)
   4998 {
   4999     int n;
   5000 
   5001     for (n = 0; n < nb_hw_breakpoint; n++) {
   5002         if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type &&
   5003             (hw_breakpoint[n].len == len || len == -1)) {
   5004             return n;
   5005         }
   5006     }
   5007     return -1;
   5008 }
   5009 
   5010 int kvm_arch_insert_hw_breakpoint(target_ulong addr,
   5011                                   target_ulong len, int type)
   5012 {
   5013     switch (type) {
   5014     case GDB_BREAKPOINT_HW:
   5015         len = 1;
   5016         break;
   5017     case GDB_WATCHPOINT_WRITE:
   5018     case GDB_WATCHPOINT_ACCESS:
   5019         switch (len) {
   5020         case 1:
   5021             break;
   5022         case 2:
   5023         case 4:
   5024         case 8:
   5025             if (addr & (len - 1)) {
   5026                 return -EINVAL;
   5027             }
   5028             break;
   5029         default:
   5030             return -EINVAL;
   5031         }
   5032         break;
   5033     default:
   5034         return -ENOSYS;
   5035     }
   5036 
   5037     if (nb_hw_breakpoint == 4) {
   5038         return -ENOBUFS;
   5039     }
   5040     if (find_hw_breakpoint(addr, len, type) >= 0) {
   5041         return -EEXIST;
   5042     }
   5043     hw_breakpoint[nb_hw_breakpoint].addr = addr;
   5044     hw_breakpoint[nb_hw_breakpoint].len = len;
   5045     hw_breakpoint[nb_hw_breakpoint].type = type;
   5046     nb_hw_breakpoint++;
   5047 
   5048     return 0;
   5049 }
   5050 
   5051 int kvm_arch_remove_hw_breakpoint(target_ulong addr,
   5052                                   target_ulong len, int type)
   5053 {
   5054     int n;
   5055 
   5056     n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 1 : len, type);
   5057     if (n < 0) {
   5058         return -ENOENT;
   5059     }
   5060     nb_hw_breakpoint--;
   5061     hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint];
   5062 
   5063     return 0;
   5064 }
   5065 
   5066 void kvm_arch_remove_all_hw_breakpoints(void)
   5067 {
   5068     nb_hw_breakpoint = 0;
   5069 }
   5070 
   5071 static CPUWatchpoint hw_watchpoint;
   5072 
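         /*
          * Decode a KVM_EXIT_DEBUG exit.  DR6.BS indicates a single step; the low
          * four DR6 bits identify which hardware slot fired, and the matching DR7
          * R/W field (00 = execution, 01 = data write, 11 = data read/write) tells
          * us whether to report a breakpoint or a watchpoint to the gdbstub.  If
          * the exception is not ours, it is re-queued for injection into the guest.
          */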
   5073 static int kvm_handle_debug(X86CPU *cpu,
   5074                             struct kvm_debug_exit_arch *arch_info)
   5075 {
   5076     CPUState *cs = CPU(cpu);
   5077     CPUX86State *env = &cpu->env;
   5078     int ret = 0;
   5079     int n;
   5080 
   5081     if (arch_info->exception == EXCP01_DB) {
   5082         if (arch_info->dr6 & DR6_BS) {
   5083             if (cs->singlestep_enabled) {
   5084                 ret = EXCP_DEBUG;
   5085             }
   5086         } else {
   5087             for (n = 0; n < 4; n++) {
   5088                 if (arch_info->dr6 & (1 << n)) {
   5089                     switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) {
   5090                     case 0x0:
   5091                         ret = EXCP_DEBUG;
   5092                         break;
   5093                     case 0x1:
   5094                         ret = EXCP_DEBUG;
   5095                         cs->watchpoint_hit = &hw_watchpoint;
   5096                         hw_watchpoint.vaddr = hw_breakpoint[n].addr;
   5097                         hw_watchpoint.flags = BP_MEM_WRITE;
   5098                         break;
   5099                     case 0x3:
   5100                         ret = EXCP_DEBUG;
   5101                         cs->watchpoint_hit = &hw_watchpoint;
   5102                         hw_watchpoint.vaddr = hw_breakpoint[n].addr;
   5103                         hw_watchpoint.flags = BP_MEM_ACCESS;
   5104                         break;
   5105                     }
   5106                 }
   5107             }
   5108         }
   5109     } else if (kvm_find_sw_breakpoint(cs, arch_info->pc)) {
   5110         ret = EXCP_DEBUG;
   5111     }
   5112     if (ret == 0) {
   5113         cpu_synchronize_state(cs);
   5114         assert(env->exception_nr == -1);
   5115 
   5116         /* pass to guest */
   5117         kvm_queue_exception(env, arch_info->exception,
   5118                             arch_info->exception == EXCP01_DB,
   5119                             arch_info->dr6);
   5120         env->has_error_code = 0;
   5121     }
   5122 
   5123     return ret;
   5124 }
   5125 
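         /*
          * Translate QEMU's software and hardware breakpoint lists into the
          * kvm_guest_debug layout: DR0-DR3 hold the breakpoint addresses and
          * DR7 carries the per-slot enable, type and length fields encoded by
          * type_code[]/len_code[] below.
          */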
   5126 void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg)
   5127 {
   5128     const uint8_t type_code[] = {
   5129         [GDB_BREAKPOINT_HW] = 0x0,
   5130         [GDB_WATCHPOINT_WRITE] = 0x1,
   5131         [GDB_WATCHPOINT_ACCESS] = 0x3
   5132     };
   5133     const uint8_t len_code[] = {
   5134         [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2
   5135     };
   5136     int n;
   5137 
   5138     if (kvm_sw_breakpoints_active(cpu)) {
   5139         dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
   5140     }
   5141     if (nb_hw_breakpoint > 0) {
   5142         dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
   5143         dbg->arch.debugreg[7] = 0x0600;
   5144         for (n = 0; n < nb_hw_breakpoint; n++) {
   5145             dbg->arch.debugreg[n] = hw_breakpoint[n].addr;
   5146             dbg->arch.debugreg[7] |= (2 << (n * 2)) |
   5147                 (type_code[hw_breakpoint[n].type] << (16 + n*4)) |
   5148                 ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4));
   5149         }
   5150     }
   5151 }
   5152 
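         /*
          * Program one single-MSR filter range per registered handler.  The
          * all-zero bitmap marks that MSR as filtered, so guest accesses exit to
          * userspace (KVM_EXIT_X86_RDMSR/WRMSR) and are serviced by
          * kvm_handle_rdmsr()/kvm_handle_wrmsr().  Unlisted MSRs keep their
          * default in-kernel handling (KVM_MSR_FILTER_DEFAULT_ALLOW).
          */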
   5153 static bool kvm_install_msr_filters(KVMState *s)
   5154 {
   5155     uint64_t zero = 0;
   5156     struct kvm_msr_filter filter = {
   5157         .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
   5158     };
   5159     int r, i, j = 0;
   5160 
   5161     for (i = 0; i < KVM_MSR_FILTER_MAX_RANGES; i++) {
   5162         KVMMSRHandlers *handler = &msr_handlers[i];
   5163         if (handler->msr) {
   5164             struct kvm_msr_filter_range *range = &filter.ranges[j++];
   5165 
   5166             *range = (struct kvm_msr_filter_range) {
   5167                 .flags = 0,
   5168                 .nmsrs = 1,
   5169                 .base = handler->msr,
   5170                 .bitmap = (__u8 *)&zero,
   5171             };
   5172 
   5173             if (handler->rdmsr) {
   5174                 range->flags |= KVM_MSR_FILTER_READ;
   5175             }
   5176 
   5177             if (handler->wrmsr) {
   5178                 range->flags |= KVM_MSR_FILTER_WRITE;
   5179             }
   5180         }
   5181     }
   5182 
   5183     r = kvm_vm_ioctl(s, KVM_X86_SET_MSR_FILTER, &filter);
   5184     if (r) {
   5185         return false;
   5186     }
   5187 
   5188     return true;
   5189 }
   5190 
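         /*
          * Register userspace handlers for a single MSR and (re)install the VM's
          * MSR filter.  A rough usage sketch (MSR_EXAMPLE and the handler below
          * are hypothetical, for illustration only):
          *
          *     static bool example_rdmsr(X86CPU *cpu, uint32_t msr, uint64_t *val)
          *     {
          *         *val = 0;               // value returned to the guest
          *         return true;            // true => success, run->msr.error = 0
          *     }
          *
          *     kvm_filter_msr(kvm_state, MSR_EXAMPLE, example_rdmsr, NULL);
          *
          * Returns false if all KVM_MSR_FILTER_MAX_RANGES slots are in use or the
          * filter could not be installed.
          */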
   5191 bool kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr,
   5192                     QEMUWRMSRHandler *wrmsr)
   5193 {
   5194     int i;
   5195 
   5196     for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) {
   5197         if (!msr_handlers[i].msr) {
   5198             msr_handlers[i] = (KVMMSRHandlers) {
   5199                 .msr = msr,
   5200                 .rdmsr = rdmsr,
   5201                 .wrmsr = wrmsr,
   5202             };
   5203 
   5204             if (!kvm_install_msr_filters(s)) {
   5205                 msr_handlers[i] = (KVMMSRHandlers) { };
   5206                 return false;
   5207             }
   5208 
   5209             return true;
   5210         }
   5211     }
   5212 
   5213     return false;
   5214 }
   5215 
   5216 static int kvm_handle_rdmsr(X86CPU *cpu, struct kvm_run *run)
   5217 {
   5218     int i;
   5219     bool r;
   5220 
   5221     for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) {
   5222         KVMMSRHandlers *handler = &msr_handlers[i];
   5223         if (run->msr.index == handler->msr) {
   5224             if (handler->rdmsr) {
   5225                 r = handler->rdmsr(cpu, handler->msr,
   5226                                    (uint64_t *)&run->msr.data);
   5227                 run->msr.error = r ? 0 : 1;
   5228                 return 0;
   5229             }
   5230         }
   5231     }
   5232 
   5233     assert(false);
   5234 }
   5235 
   5236 static int kvm_handle_wrmsr(X86CPU *cpu, struct kvm_run *run)
   5237 {
   5238     int i;
   5239     bool r;
   5240 
   5241     for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) {
   5242         KVMMSRHandlers *handler = &msr_handlers[i];
   5243         if (run->msr.index == handler->msr) {
   5244             if (handler->wrmsr) {
   5245                 r = handler->wrmsr(cpu, handler->msr, run->msr.data);
   5246                 run->msr.error = r ? 0 : 1;
   5247                 return 0;
   5248             }
   5249         }
   5250     }
   5251 
   5252     assert(false);
   5253 }
   5254 
   5255 static bool has_sgx_provisioning;
   5256 
   5257 static bool __kvm_enable_sgx_provisioning(KVMState *s)
   5258 {
   5259     int fd, ret;
   5260 
   5261     if (!kvm_vm_check_extension(s, KVM_CAP_SGX_ATTRIBUTE)) {
   5262         return false;
   5263     }
   5264 
   5265     fd = qemu_open_old("/dev/sgx_provision", O_RDONLY);
   5266     if (fd < 0) {
   5267         return false;
   5268     }
   5269 
   5270     ret = kvm_vm_enable_cap(s, KVM_CAP_SGX_ATTRIBUTE, 0, fd);
   5271     if (ret) {
   5272         error_report("Could not enable SGX PROVISIONKEY: %s", strerror(-ret));
   5273         exit(1);
   5274     }
   5275     close(fd);
   5276     return true;
   5277 }
   5278 
   5279 bool kvm_enable_sgx_provisioning(KVMState *s)
   5280 {
   5281     return MEMORIZE(__kvm_enable_sgx_provisioning(s), has_sgx_provisioning);
   5282 }
   5283 
   5284 static bool host_supports_vmx(void)
   5285 {
   5286     uint32_t ecx, unused;
   5287 
   5288     host_cpuid(1, 0, &unused, &unused, &ecx, &unused);
   5289     return ecx & CPUID_EXT_VMX;
   5290 }
   5291 
   5292 #define VMX_INVALID_GUEST_STATE 0x80000021
   5293 
   5294 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
   5295 {
   5296     X86CPU *cpu = X86_CPU(cs);
   5297     uint64_t code;
   5298     int ret;
   5299     bool ctx_invalid;
   5300     char str[256];
   5301     KVMState *state;
   5302 
   5303     switch (run->exit_reason) {
   5304     case KVM_EXIT_HLT:
   5305         DPRINTF("handle_hlt\n");
   5306         qemu_mutex_lock_iothread();
   5307         ret = kvm_handle_halt(cpu);
   5308         qemu_mutex_unlock_iothread();
   5309         break;
   5310     case KVM_EXIT_SET_TPR:
   5311         ret = 0;
   5312         break;
   5313     case KVM_EXIT_TPR_ACCESS:
   5314         qemu_mutex_lock_iothread();
   5315         ret = kvm_handle_tpr_access(cpu);
   5316         qemu_mutex_unlock_iothread();
   5317         break;
   5318     case KVM_EXIT_FAIL_ENTRY:
   5319         code = run->fail_entry.hardware_entry_failure_reason;
   5320         fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n",
   5321                 code);
   5322         if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) {
   5323             fprintf(stderr,
   5324                     "\nIf you're running a guest on an Intel machine without "
   5325                         "unrestricted mode\n"
    5326                     "support, the failure is most likely due to the guest "
    5327                         "entering an invalid\n"
    5328                     "state for Intel VT. For example, the guest may be running "
    5329                         "in big real mode\n"
    5330                     "which is not supported on older Intel processors."
   5331                         "\n\n");
   5332         }
   5333         ret = -1;
   5334         break;
   5335     case KVM_EXIT_EXCEPTION:
   5336         fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n",
   5337                 run->ex.exception, run->ex.error_code);
   5338         ret = -1;
   5339         break;
   5340     case KVM_EXIT_DEBUG:
   5341         DPRINTF("kvm_exit_debug\n");
   5342         qemu_mutex_lock_iothread();
   5343         ret = kvm_handle_debug(cpu, &run->debug.arch);
   5344         qemu_mutex_unlock_iothread();
   5345         break;
   5346     case KVM_EXIT_HYPERV:
   5347         ret = kvm_hv_handle_exit(cpu, &run->hyperv);
   5348         break;
   5349     case KVM_EXIT_IOAPIC_EOI:
   5350         ioapic_eoi_broadcast(run->eoi.vector);
   5351         ret = 0;
   5352         break;
   5353     case KVM_EXIT_X86_BUS_LOCK:
   5354         /* already handled in kvm_arch_post_run */
   5355         ret = 0;
   5356         break;
   5357     case KVM_EXIT_NOTIFY:
   5358         ctx_invalid = !!(run->notify.flags & KVM_NOTIFY_CONTEXT_INVALID);
   5359         state = KVM_STATE(current_accel());
    5360         sprintf(str, "Encountered a notify exit with %svalid context in"
    5361                      " guest. The guest may be misbehaving."
    5362                      " Please investigate.", ctx_invalid ? "in" : "");
   5363         if (ctx_invalid ||
   5364             state->notify_vmexit == NOTIFY_VMEXIT_OPTION_INTERNAL_ERROR) {
   5365             warn_report("KVM internal error: %s", str);
   5366             ret = -1;
   5367         } else {
   5368             warn_report_once("KVM: %s", str);
   5369             ret = 0;
   5370         }
   5371         break;
   5372     case KVM_EXIT_X86_RDMSR:
    5373         /* We only enable MSR filtering; any other exit is bogus */
   5374         assert(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER);
   5375         ret = kvm_handle_rdmsr(cpu, run);
   5376         break;
   5377     case KVM_EXIT_X86_WRMSR:
    5378         /* We only enable MSR filtering; any other exit is bogus */
   5379         assert(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER);
   5380         ret = kvm_handle_wrmsr(cpu, run);
   5381         break;
   5382     default:
   5383         fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
   5384         ret = -1;
   5385         break;
   5386     }
   5387 
   5388     return ret;
   5389 }
   5390 
   5391 bool kvm_arch_stop_on_emulation_error(CPUState *cs)
   5392 {
   5393     X86CPU *cpu = X86_CPU(cs);
   5394     CPUX86State *env = &cpu->env;
   5395 
   5396     kvm_cpu_synchronize_state(cs);
   5397     return !(env->cr[0] & CR0_PE_MASK) ||
   5398            ((env->segs[R_CS].selector  & 3) != 3);
   5399 }
   5400 
   5401 void kvm_arch_init_irq_routing(KVMState *s)
   5402 {
   5403     /* We know at this point that we're using the in-kernel
   5404      * irqchip, so we can use irqfds, and on x86 we know
   5405      * we can use msi via irqfd and GSI routing.
   5406      */
   5407     kvm_msi_via_irqfd_allowed = true;
   5408     kvm_gsi_routing_allowed = true;
   5409 
   5410     if (kvm_irqchip_is_split()) {
   5411         KVMRouteChange c = kvm_irqchip_begin_route_changes(s);
   5412         int i;
   5413 
   5414         /* If the ioapic is in QEMU and the lapics are in KVM, reserve
   5415            MSI routes for signaling interrupts to the local apics. */
   5416         for (i = 0; i < IOAPIC_NUM_PINS; i++) {
   5417             if (kvm_irqchip_add_msi_route(&c, 0, NULL) < 0) {
   5418                 error_report("Could not enable split IRQ mode.");
   5419                 exit(1);
   5420             }
   5421         }
   5422         kvm_irqchip_commit_route_changes(&c);
   5423     }
   5424 }
   5425 
   5426 int kvm_arch_irqchip_create(KVMState *s)
   5427 {
   5428     int ret;
   5429     if (kvm_kernel_irqchip_split()) {
   5430         ret = kvm_vm_enable_cap(s, KVM_CAP_SPLIT_IRQCHIP, 0, 24);
   5431         if (ret) {
   5432             error_report("Could not enable split irqchip mode: %s",
   5433                          strerror(-ret));
   5434             exit(1);
   5435         } else {
   5436             DPRINTF("Enabled KVM_CAP_SPLIT_IRQCHIP\n");
   5437             kvm_split_irqchip = true;
   5438             return 1;
   5439         }
   5440     } else {
   5441         return 0;
   5442     }
   5443 }
   5444 
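         /*
          * With KVM_FEATURE_MSI_EXT_DEST_ID, a guest may place the high bits of a
          * destination APIC ID above 255 into bits 11:5 of the MSI address.  Move
          * those bits up into the upper address word (bits 46:40) where KVM's MSI
          * routing expects them; leave remappable-format or already-extended
          * addresses untouched.
          */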
   5445 uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address)
   5446 {
   5447     CPUX86State *env;
   5448     uint64_t ext_id;
   5449 
   5450     if (!first_cpu) {
   5451         return address;
   5452     }
   5453     env = &X86_CPU(first_cpu)->env;
   5454     if (!(env->features[FEAT_KVM] & (1 << KVM_FEATURE_MSI_EXT_DEST_ID))) {
   5455         return address;
   5456     }
   5457 
   5458     /*
   5459      * If the remappable format bit is set, or the upper bits are
   5460      * already set in address_hi, or the low extended bits aren't
   5461      * there anyway, do nothing.
   5462      */
   5463     ext_id = address & (0xff << MSI_ADDR_DEST_IDX_SHIFT);
   5464     if (!ext_id || (ext_id & (1 << MSI_ADDR_DEST_IDX_SHIFT)) || (address >> 32)) {
   5465         return address;
   5466     }
   5467 
   5468     address &= ~ext_id;
   5469     address |= ext_id << 35;
   5470     return address;
   5471 }
   5472 
   5473 int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
   5474                              uint64_t address, uint32_t data, PCIDevice *dev)
   5475 {
   5476     X86IOMMUState *iommu = x86_iommu_get_default();
   5477 
   5478     if (iommu) {
   5479         X86IOMMUClass *class = X86_IOMMU_DEVICE_GET_CLASS(iommu);
   5480 
   5481         if (class->int_remap) {
   5482             int ret;
   5483             MSIMessage src, dst;
   5484 
   5485             src.address = route->u.msi.address_hi;
   5486             src.address <<= VTD_MSI_ADDR_HI_SHIFT;
   5487             src.address |= route->u.msi.address_lo;
   5488             src.data = route->u.msi.data;
   5489 
   5490             ret = class->int_remap(iommu, &src, &dst, dev ?     \
   5491                                    pci_requester_id(dev) :      \
   5492                                    X86_IOMMU_SID_INVALID);
   5493             if (ret) {
   5494                 trace_kvm_x86_fixup_msi_error(route->gsi);
   5495                 return 1;
   5496             }
   5497 
   5498             /*
    5499              * Handle untranslated compatibility format interrupts with
    5500              * the extended destination ID in the low bits 11-5. */
   5501             dst.address = kvm_swizzle_msi_ext_dest_id(dst.address);
   5502 
   5503             route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT;
   5504             route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK;
   5505             route->u.msi.data = dst.data;
   5506             return 0;
   5507         }
   5508     }
   5509 
   5510     address = kvm_swizzle_msi_ext_dest_id(address);
   5511     route->u.msi.address_hi = address >> VTD_MSI_ADDR_HI_SHIFT;
   5512     route->u.msi.address_lo = address & VTD_MSI_ADDR_LO_MASK;
   5513     return 0;
   5514 }
   5515 
   5516 typedef struct MSIRouteEntry MSIRouteEntry;
   5517 
   5518 struct MSIRouteEntry {
   5519     PCIDevice *dev;             /* Device pointer */
   5520     int vector;                 /* MSI/MSIX vector index */
   5521     int virq;                   /* Virtual IRQ index */
   5522     QLIST_ENTRY(MSIRouteEntry) list;
   5523 };
   5524 
   5525 /* List of used GSI routes */
   5526 static QLIST_HEAD(, MSIRouteEntry) msi_route_list = \
   5527     QLIST_HEAD_INITIALIZER(msi_route_list);
   5528 
   5529 static void kvm_update_msi_routes_all(void *private, bool global,
   5530                                       uint32_t index, uint32_t mask)
   5531 {
   5532     int cnt = 0, vector;
   5533     MSIRouteEntry *entry;
   5534     MSIMessage msg;
   5535     PCIDevice *dev;
   5536 
   5537     /* TODO: explicit route update */
   5538     QLIST_FOREACH(entry, &msi_route_list, list) {
   5539         cnt++;
   5540         vector = entry->vector;
   5541         dev = entry->dev;
   5542         if (msix_enabled(dev) && !msix_is_masked(dev, vector)) {
   5543             msg = msix_get_message(dev, vector);
   5544         } else if (msi_enabled(dev) && !msi_is_masked(dev, vector)) {
   5545             msg = msi_get_message(dev, vector);
   5546         } else {
   5547             /*
   5548              * Either MSI/MSIX is disabled for the device, or the
   5549              * specific message was masked out.  Skip this one.
   5550              */
   5551             continue;
   5552         }
   5553         kvm_irqchip_update_msi_route(kvm_state, entry->virq, msg, dev);
   5554     }
   5555     kvm_irqchip_commit_routes(kvm_state);
   5556     trace_kvm_x86_update_msi_routes(cnt);
   5557 }
   5558 
   5559 int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route,
   5560                                 int vector, PCIDevice *dev)
   5561 {
   5562     static bool notify_list_inited = false;
   5563     MSIRouteEntry *entry;
   5564 
   5565     if (!dev) {
   5566         /* These are (possibly) IOAPIC routes only used for split
    5567          * kernel irqchip mode, while we only keep track of routes
    5568          * that belong to PCI devices. */
   5569         return 0;
   5570     }
   5571 
   5572     entry = g_new0(MSIRouteEntry, 1);
   5573     entry->dev = dev;
   5574     entry->vector = vector;
   5575     entry->virq = route->gsi;
   5576     QLIST_INSERT_HEAD(&msi_route_list, entry, list);
   5577 
   5578     trace_kvm_x86_add_msi_route(route->gsi);
   5579 
   5580     if (!notify_list_inited) {
    5581         /* The first time we add a route, register ourselves with the
    5582          * IOMMU's IEC notifier list if needed. */
   5583         X86IOMMUState *iommu = x86_iommu_get_default();
   5584         if (iommu) {
   5585             x86_iommu_iec_register_notifier(iommu,
   5586                                             kvm_update_msi_routes_all,
   5587                                             NULL);
   5588         }
   5589         notify_list_inited = true;
   5590     }
   5591     return 0;
   5592 }
   5593 
   5594 int kvm_arch_release_virq_post(int virq)
   5595 {
   5596     MSIRouteEntry *entry, *next;
   5597     QLIST_FOREACH_SAFE(entry, &msi_route_list, list, next) {
   5598         if (entry->virq == virq) {
   5599             trace_kvm_x86_remove_msi_route(virq);
   5600             QLIST_REMOVE(entry, list);
   5601             g_free(entry);
   5602             break;
   5603         }
   5604     }
   5605     return 0;
   5606 }
   5607 
   5608 int kvm_arch_msi_data_to_gsi(uint32_t data)
   5609 {
   5610     abort();
   5611 }
   5612 
   5613 bool kvm_has_waitpkg(void)
   5614 {
   5615     return has_msr_umwait;
   5616 }
   5617 
   5618 bool kvm_arch_cpu_check_are_resettable(void)
   5619 {
   5620     return !sev_es_enabled();
   5621 }
   5622 
   5623 #define ARCH_REQ_XCOMP_GUEST_PERM       0x1025
   5624 
   5625 void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask)
   5626 {
   5627     KVMState *s = kvm_state;
   5628     uint64_t supported;
   5629 
   5630     mask &= XSTATE_DYNAMIC_MASK;
   5631     if (!mask) {
   5632         return;
   5633     }
   5634     /*
   5635      * Just ignore bits that are not in CPUID[EAX=0xD,ECX=0].
   5636      * ARCH_REQ_XCOMP_GUEST_PERM would fail, and QEMU has warned
   5637      * about them already because they are not supported features.
   5638      */
   5639     supported = kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EAX);
   5640     supported |= (uint64_t)kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EDX) << 32;
   5641     mask &= supported;
   5642 
   5643     while (mask) {
   5644         int bit = ctz64(mask);
   5645         int rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit);
   5646         if (rc) {
   5647             /*
    5648              * Older kernel versions (<5.17) do not support
   5649              * ARCH_REQ_XCOMP_GUEST_PERM, but also do not return
   5650              * any dynamic feature from kvm_arch_get_supported_cpuid.
   5651              */
   5652             warn_report("prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure "
   5653                         "for feature bit %d", bit);
   5654         }
   5655         mask &= ~BIT_ULL(bit);
   5656     }
   5657 }
   5658 
   5659 static int kvm_arch_get_notify_vmexit(Object *obj, Error **errp)
   5660 {
   5661     KVMState *s = KVM_STATE(obj);
   5662     return s->notify_vmexit;
   5663 }
   5664 
   5665 static void kvm_arch_set_notify_vmexit(Object *obj, int value, Error **errp)
   5666 {
   5667     KVMState *s = KVM_STATE(obj);
   5668 
   5669     if (s->fd != -1) {
   5670         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
   5671         return;
   5672     }
   5673 
   5674     s->notify_vmexit = value;
   5675 }
   5676 
   5677 static void kvm_arch_get_notify_window(Object *obj, Visitor *v,
   5678                                        const char *name, void *opaque,
   5679                                        Error **errp)
   5680 {
   5681     KVMState *s = KVM_STATE(obj);
   5682     uint32_t value = s->notify_window;
   5683 
   5684     visit_type_uint32(v, name, &value, errp);
   5685 }
   5686 
   5687 static void kvm_arch_set_notify_window(Object *obj, Visitor *v,
   5688                                        const char *name, void *opaque,
   5689                                        Error **errp)
   5690 {
   5691     KVMState *s = KVM_STATE(obj);
   5692     Error *error = NULL;
   5693     uint32_t value;
   5694 
   5695     if (s->fd != -1) {
   5696         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
   5697         return;
   5698     }
   5699 
   5700     visit_type_uint32(v, name, &value, &error);
   5701     if (error) {
   5702         error_propagate(errp, error);
   5703         return;
   5704     }
   5705 
   5706     s->notify_window = value;
   5707 }
   5708 
   5709 void kvm_arch_accel_class_init(ObjectClass *oc)
   5710 {
   5711     object_class_property_add_enum(oc, "notify-vmexit", "NotifyVMexitOption",
   5712                                    &NotifyVmexitOption_lookup,
   5713                                    kvm_arch_get_notify_vmexit,
   5714                                    kvm_arch_set_notify_vmexit);
   5715     object_class_property_set_description(oc, "notify-vmexit",
   5716                                           "Enable notify VM exit");
   5717 
   5718     object_class_property_add(oc, "notify-window", "uint32",
   5719                               kvm_arch_get_notify_window,
   5720                               kvm_arch_set_notify_window,
   5721                               NULL, NULL);
   5722     object_class_property_set_description(oc, "notify-window",
   5723                                           "Clock cycles without an event window "
   5724                                           "after which a notification VM exit occurs");
   5725 }
   5726 
   5727 void kvm_set_max_apic_id(uint32_t max_apic_id)
   5728 {
   5729     kvm_vm_enable_cap(kvm_state, KVM_CAP_MAX_VCPU_ID, 0, max_apic_id);
   5730 }