qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git

kvm64.c (50115B)


      1 /*
      2  * ARM implementation of KVM hooks, 64 bit specific code
      3  *
      4  * Copyright Mian-M. Hamayun 2013, Virtual Open Systems
      5  * Copyright Alex Bennée 2014, Linaro
      6  *
      7  * This work is licensed under the terms of the GNU GPL, version 2 or later.
      8  * See the COPYING file in the top-level directory.
      9  *
     10  */
     11 
     12 #include "qemu/osdep.h"
     13 #include <sys/ioctl.h>
     14 #include <sys/ptrace.h>
     15 
     16 #include <linux/elf.h>
     17 #include <linux/kvm.h>
     18 
     19 #include "qapi/error.h"
     20 #include "cpu.h"
     21 #include "qemu/timer.h"
     22 #include "qemu/error-report.h"
     23 #include "qemu/host-utils.h"
     24 #include "qemu/main-loop.h"
     25 #include "exec/gdbstub.h"
     26 #include "sysemu/runstate.h"
     27 #include "sysemu/kvm.h"
     28 #include "sysemu/kvm_int.h"
     29 #include "kvm_arm.h"
     30 #include "internals.h"
     31 #include "hw/acpi/acpi.h"
     32 #include "hw/acpi/ghes.h"
     33 #include "hw/arm/virt.h"
     34 
     35 static bool have_guest_debug;
     36 
     37 /*
     38  * Although the ARM implementation of hardware assisted debugging
     39  * allows for different breakpoints per-core, the current GDB
     40  * interface treats them as a global pool of registers (which seems to
     41  * be the case for x86, ppc and s390). As a result we store one copy
     42  * of registers which is used for all active cores.
     43  *
     44  * Write access is serialised by virtue of the GDB protocol which
     45  * updates things. Read access (i.e. when the values are copied to the
     46  * vCPU) is also gated by GDB's run control.
     47  *
     48  * This is not unreasonable as most of the time debugging kernels you
     49  * never know which core will eventually execute your function.
     50  */
     51 
     52 typedef struct {
     53     uint64_t bcr;
     54     uint64_t bvr;
     55 } HWBreakpoint;
     56 
     57 /* The watchpoint registers can cover more area than the requested
     58  * watchpoint so we need to store the additional information
     59  * somewhere. We also need to supply a CPUWatchpoint to the GDB stub
     60  * when the watchpoint is hit.
     61  */
     62 typedef struct {
     63     uint64_t wcr;
     64     uint64_t wvr;
     65     CPUWatchpoint details;
     66 } HWWatchpoint;
     67 
     68 /* Maximum and current break/watch point counts */
     69 int max_hw_bps, max_hw_wps;
     70 GArray *hw_breakpoints, *hw_watchpoints;
     71 
     72 #define cur_hw_wps      (hw_watchpoints->len)
     73 #define cur_hw_bps      (hw_breakpoints->len)
     74 #define get_hw_bp(i)    (&g_array_index(hw_breakpoints, HWBreakpoint, i))
     75 #define get_hw_wp(i)    (&g_array_index(hw_watchpoints, HWWatchpoint, i))
     76 
     77 /**
     78  * kvm_arm_init_debug() - check for guest debug capabilities
     79  * @cs: CPUState
     80  *
     81  * kvm_check_extension returns the number of debug registers we have
     82  * or 0 if we have none.
     83  *
     84  */
     85 static void kvm_arm_init_debug(CPUState *cs)
     86 {
     87     have_guest_debug = kvm_check_extension(cs->kvm_state,
     88                                            KVM_CAP_SET_GUEST_DEBUG);
     89 
     90     max_hw_wps = kvm_check_extension(cs->kvm_state, KVM_CAP_GUEST_DEBUG_HW_WPS);
     91     hw_watchpoints = g_array_sized_new(true, true,
     92                                        sizeof(HWWatchpoint), max_hw_wps);
     93 
     94     max_hw_bps = kvm_check_extension(cs->kvm_state, KVM_CAP_GUEST_DEBUG_HW_BPS);
     95     hw_breakpoints = g_array_sized_new(true, true,
     96                                        sizeof(HWBreakpoint), max_hw_bps);
     97     return;
     98 }
     99 
    100 /**
    101  * insert_hw_breakpoint()
    102  * @addr: address of breakpoint
    103  *
    104  * See ARM ARM D2.9.1 for details but here we are only going to create
    105  * simple un-linked breakpoints (i.e. we don't chain breakpoints
    106  * together to match address and context or vmid). The hardware is
    107  * capable of fancier matching but that will require exposing that
    108  * fanciness to GDB's interface
    109  *
    110  * DBGBCR<n>_EL1, Debug Breakpoint Control Registers
    111  *
    112  *  31  24 23  20 19   16 15 14  13  12   9 8   5 4    3 2   1  0
    113  * +------+------+-------+-----+----+------+-----+------+-----+---+
    114  * | RES0 |  BT  |  LBN  | SSC | HMC| RES0 | BAS | RES0 | PMC | E |
    115  * +------+------+-------+-----+----+------+-----+------+-----+---+
    116  *
    117  * BT: Breakpoint type (0 = unlinked address match)
    118  * LBN: Linked BP number (0 = unused)
    119  * SSC/HMC/PMC: Security, Higher and Priv access control (Table D-12)
    120  * BAS: Byte Address Select (RES1 for AArch64)
    121  * E: Enable bit
    122  *
    123  * DBGBVR<n>_EL1, Debug Breakpoint Value Registers
    124  *
    125  *  63  53 52       49 48       2  1 0
    126  * +------+-----------+----------+-----+
    127  * | RESS | VA[52:49] | VA[48:2] | 0 0 |
    128  * +------+-----------+----------+-----+
    129  *
    130  * Depending on the addressing mode bits the top bits of the register
    131  * are a sign extension of the highest applicable VA bit. Some
    132  * versions of GDB don't do it correctly so we ensure they are correct
    133  * here so future PC comparisons will work properly.
    134  */
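          /*
           * Worked example (illustrative values only): for a hypothetical
           * breakpoint at 0xffff800010000000, insert_hw_breakpoint() below
           * stores BVR = sextract64(addr, 0, 53) = 0xffff800010000000 (bits
           * [63:53] are already a sign extension of bit 52) and builds
           * BCR = E (bit 0) | PMC = 0b11 (bits 2:1) | BAS = 0xf (bits 8:5),
           * i.e. 0x1e7.
           */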
    135 
    136 static int insert_hw_breakpoint(target_ulong addr)
    137 {
    138     HWBreakpoint brk = {
    139         .bcr = 0x1,                             /* BCR E=1, enable */
    140         .bvr = sextract64(addr, 0, 53)
    141     };
    142 
    143     if (cur_hw_bps >= max_hw_bps) {
    144         return -ENOBUFS;
    145     }
    146 
    147     brk.bcr = deposit32(brk.bcr, 1, 2, 0x3);   /* PMC = 11 */
    148     brk.bcr = deposit32(brk.bcr, 5, 4, 0xf);   /* BAS = RES1 */
    149 
    150     g_array_append_val(hw_breakpoints, brk);
    151 
    152     return 0;
    153 }
    154 
    155 /**
    156  * delete_hw_breakpoint()
    157  * @pc: address of breakpoint
    158  *
    159  * Delete a breakpoint and shuffle any above down
    160  */
    161 
    162 static int delete_hw_breakpoint(target_ulong pc)
    163 {
    164     int i;
    165     for (i = 0; i < hw_breakpoints->len; i++) {
    166         HWBreakpoint *brk = get_hw_bp(i);
    167         if (brk->bvr == pc) {
    168             g_array_remove_index(hw_breakpoints, i);
    169             return 0;
    170         }
    171     }
    172     return -ENOENT;
    173 }
    174 
    175 /**
    176  * insert_hw_watchpoint()
    177  * @addr: address of watch point
    178  * @len: size of area
    179  * @type: type of watch point
    180  *
    181  * See ARM ARM D2.10. As with the breakpoints we can do some advanced
    182  * stuff if we want to. The watch points can be linked with the break
    183  * points above to make them context aware. However for simplicity
    184  * currently we only deal with simple read/write watch points.
    185  *
    186  * D7.3.11 DBGWCR<n>_EL1, Debug Watchpoint Control Registers
    187  *
    188  *  31  29 28   24 23  21  20  19 16 15 14  13   12  5 4   3 2   1  0
    189  * +------+-------+------+----+-----+-----+-----+-----+-----+-----+---+
    190  * | RES0 |  MASK | RES0 | WT | LBN | SSC | HMC | BAS | LSC | PAC | E |
    191  * +------+-------+------+----+-----+-----+-----+-----+-----+-----+---+
    192  *
    193  * MASK: num bits addr mask (0=none,01/10=res,11=3 bits (8 bytes))
    194  * WT: 0 - unlinked, 1 - linked (not currently used)
    195  * LBN: Linked BP number (not currently used)
    196  * SSC/HMC/PAC: Security, Higher and Priv access control (Table D2-11)
    197  * BAS: Byte Address Select
    198  * LSC: Load/Store control (01: load, 10: store, 11: both)
    199  * E: Enable
    200  *
    201  * The bottom 2 bits of the value register are masked. Therefore to
    202  * break on any sizes smaller than an unaligned word you need to set
    203  * MASK=0, BAS=bit per byte in question. For larger regions (^2) you
    204  * need to ensure you mask the address as required and set BAS=0xff
    205  */
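          /*
           * Worked examples (illustrative values only): a hypothetical 2-byte
           * watchpoint at 0x1006 yields WVR = 0x1000 and BAS = 0b11000000
           * (bas = 0b11 deposited at bit 5 + offset 6), while a 16-byte
           * watchpoint at 0x2000 yields WVR = 0x2000, MASK = 4 (ctz64(16))
           * and BAS = 0xff.
           */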
    206 
    207 static int insert_hw_watchpoint(target_ulong addr,
    208                                 target_ulong len, int type)
    209 {
    210     HWWatchpoint wp = {
    211         .wcr = R_DBGWCR_E_MASK, /* E=1, enable */
    212         .wvr = addr & (~0x7ULL),
    213         .details = { .vaddr = addr, .len = len }
    214     };
    215 
    216     if (cur_hw_wps >= max_hw_wps) {
    217         return -ENOBUFS;
    218     }
    219 
    220     /*
    221      * HMC=0 SSC=0 PAC=3 will hit EL0 or EL1, any security state,
    222      * valid whether EL3 is implemented or not
    223      */
    224     wp.wcr = FIELD_DP64(wp.wcr, DBGWCR, PAC, 3);
    225 
    226     switch (type) {
    227     case GDB_WATCHPOINT_READ:
    228         wp.wcr = FIELD_DP64(wp.wcr, DBGWCR, LSC, 1);
    229         wp.details.flags = BP_MEM_READ;
    230         break;
    231     case GDB_WATCHPOINT_WRITE:
    232         wp.wcr = FIELD_DP64(wp.wcr, DBGWCR, LSC, 2);
    233         wp.details.flags = BP_MEM_WRITE;
    234         break;
    235     case GDB_WATCHPOINT_ACCESS:
    236         wp.wcr = FIELD_DP64(wp.wcr, DBGWCR, LSC, 3);
    237         wp.details.flags = BP_MEM_ACCESS;
    238         break;
    239     default:
    240         g_assert_not_reached();
    241         break;
    242     }
    243     if (len <= 8) {
    244         /* we align the address and set the bits in BAS */
    245         int off = addr & 0x7;
    246         int bas = (1 << len) - 1;
    247 
    248         wp.wcr = deposit32(wp.wcr, 5 + off, 8 - off, bas);
    249     } else {
    250         /* For ranges above 8 bytes we need to be a power of 2 */
    251         if (is_power_of_2(len)) {
    252             int bits = ctz64(len);
    253 
    254             wp.wvr &= ~((1 << bits) - 1);
    255             wp.wcr = FIELD_DP64(wp.wcr, DBGWCR, MASK, bits);
    256             wp.wcr = FIELD_DP64(wp.wcr, DBGWCR, BAS, 0xff);
    257         } else {
    258             return -ENOBUFS;
    259         }
    260     }
    261 
    262     g_array_append_val(hw_watchpoints, wp);
    263     return 0;
    264 }
    265 
    266 
    267 static bool check_watchpoint_in_range(int i, target_ulong addr)
    268 {
    269     HWWatchpoint *wp = get_hw_wp(i);
    270     uint64_t addr_top, addr_bottom = wp->wvr;
    271     int bas = extract32(wp->wcr, 5, 8);
    272     int mask = extract32(wp->wcr, 24, 4);
    273 
    274     if (mask) {
    275         addr_top = addr_bottom + (1 << mask);
    276     } else {
    277         /* BAS must be contiguous but can offset against the base
    278          * address in DBGWVR */
    279         addr_bottom = addr_bottom + ctz32(bas);
    280         addr_top = addr_bottom + clo32(bas);
    281     }
    282 
    283     if (addr >= addr_bottom && addr <= addr_top) {
    284         return true;
    285     }
    286 
    287     return false;
    288 }
    289 
    290 /**
    291  * delete_hw_watchpoint()
     292  * @addr: address of watchpoint
     293  *
     294  * Delete a watchpoint and shuffle any above down
    295  */
    296 
    297 static int delete_hw_watchpoint(target_ulong addr,
    298                                 target_ulong len, int type)
    299 {
    300     int i;
    301     for (i = 0; i < cur_hw_wps; i++) {
    302         if (check_watchpoint_in_range(i, addr)) {
    303             g_array_remove_index(hw_watchpoints, i);
    304             return 0;
    305         }
    306     }
    307     return -ENOENT;
    308 }
    309 
    310 
    311 int kvm_arch_insert_hw_breakpoint(target_ulong addr,
    312                                   target_ulong len, int type)
    313 {
    314     switch (type) {
    315     case GDB_BREAKPOINT_HW:
    316         return insert_hw_breakpoint(addr);
    317         break;
    318     case GDB_WATCHPOINT_READ:
    319     case GDB_WATCHPOINT_WRITE:
    320     case GDB_WATCHPOINT_ACCESS:
    321         return insert_hw_watchpoint(addr, len, type);
    322     default:
    323         return -ENOSYS;
    324     }
    325 }
    326 
    327 int kvm_arch_remove_hw_breakpoint(target_ulong addr,
    328                                   target_ulong len, int type)
    329 {
    330     switch (type) {
    331     case GDB_BREAKPOINT_HW:
    332         return delete_hw_breakpoint(addr);
    333     case GDB_WATCHPOINT_READ:
    334     case GDB_WATCHPOINT_WRITE:
    335     case GDB_WATCHPOINT_ACCESS:
    336         return delete_hw_watchpoint(addr, len, type);
    337     default:
    338         return -ENOSYS;
    339     }
    340 }
    341 
    342 
    343 void kvm_arch_remove_all_hw_breakpoints(void)
    344 {
    345     if (cur_hw_wps > 0) {
    346         g_array_remove_range(hw_watchpoints, 0, cur_hw_wps);
    347     }
    348     if (cur_hw_bps > 0) {
    349         g_array_remove_range(hw_breakpoints, 0, cur_hw_bps);
    350     }
    351 }
    352 
    353 void kvm_arm_copy_hw_debug_data(struct kvm_guest_debug_arch *ptr)
    354 {
    355     int i;
    356     memset(ptr, 0, sizeof(struct kvm_guest_debug_arch));
    357 
    358     for (i = 0; i < max_hw_wps; i++) {
    359         HWWatchpoint *wp = get_hw_wp(i);
    360         ptr->dbg_wcr[i] = wp->wcr;
    361         ptr->dbg_wvr[i] = wp->wvr;
    362     }
    363     for (i = 0; i < max_hw_bps; i++) {
    364         HWBreakpoint *bp = get_hw_bp(i);
    365         ptr->dbg_bcr[i] = bp->bcr;
    366         ptr->dbg_bvr[i] = bp->bvr;
    367     }
    368 }
    369 
    370 bool kvm_arm_hw_debug_active(CPUState *cs)
    371 {
    372     return ((cur_hw_wps > 0) || (cur_hw_bps > 0));
    373 }
    374 
    375 static bool find_hw_breakpoint(CPUState *cpu, target_ulong pc)
    376 {
    377     int i;
    378 
    379     for (i = 0; i < cur_hw_bps; i++) {
    380         HWBreakpoint *bp = get_hw_bp(i);
    381         if (bp->bvr == pc) {
    382             return true;
    383         }
    384     }
    385     return false;
    386 }
    387 
    388 static CPUWatchpoint *find_hw_watchpoint(CPUState *cpu, target_ulong addr)
    389 {
    390     int i;
    391 
    392     for (i = 0; i < cur_hw_wps; i++) {
    393         if (check_watchpoint_in_range(i, addr)) {
    394             return &get_hw_wp(i)->details;
    395         }
    396     }
    397     return NULL;
    398 }
    399 
    400 static bool kvm_arm_set_device_attr(CPUState *cs, struct kvm_device_attr *attr,
    401                                     const char *name)
    402 {
    403     int err;
    404 
    405     err = kvm_vcpu_ioctl(cs, KVM_HAS_DEVICE_ATTR, attr);
    406     if (err != 0) {
    407         error_report("%s: KVM_HAS_DEVICE_ATTR: %s", name, strerror(-err));
    408         return false;
    409     }
    410 
    411     err = kvm_vcpu_ioctl(cs, KVM_SET_DEVICE_ATTR, attr);
    412     if (err != 0) {
    413         error_report("%s: KVM_SET_DEVICE_ATTR: %s", name, strerror(-err));
    414         return false;
    415     }
    416 
    417     return true;
    418 }
    419 
    420 void kvm_arm_pmu_init(CPUState *cs)
    421 {
    422     struct kvm_device_attr attr = {
    423         .group = KVM_ARM_VCPU_PMU_V3_CTRL,
    424         .attr = KVM_ARM_VCPU_PMU_V3_INIT,
    425     };
    426 
    427     if (!ARM_CPU(cs)->has_pmu) {
    428         return;
    429     }
    430     if (!kvm_arm_set_device_attr(cs, &attr, "PMU")) {
    431         error_report("failed to init PMU");
    432         abort();
    433     }
    434 }
    435 
    436 void kvm_arm_pmu_set_irq(CPUState *cs, int irq)
    437 {
    438     struct kvm_device_attr attr = {
    439         .group = KVM_ARM_VCPU_PMU_V3_CTRL,
    440         .addr = (intptr_t)&irq,
    441         .attr = KVM_ARM_VCPU_PMU_V3_IRQ,
    442     };
    443 
    444     if (!ARM_CPU(cs)->has_pmu) {
    445         return;
    446     }
    447     if (!kvm_arm_set_device_attr(cs, &attr, "PMU")) {
    448         error_report("failed to set irq for PMU");
    449         abort();
    450     }
    451 }
    452 
    453 void kvm_arm_pvtime_init(CPUState *cs, uint64_t ipa)
    454 {
    455     struct kvm_device_attr attr = {
    456         .group = KVM_ARM_VCPU_PVTIME_CTRL,
    457         .attr = KVM_ARM_VCPU_PVTIME_IPA,
    458         .addr = (uint64_t)&ipa,
    459     };
    460 
    461     if (ARM_CPU(cs)->kvm_steal_time == ON_OFF_AUTO_OFF) {
    462         return;
    463     }
    464     if (!kvm_arm_set_device_attr(cs, &attr, "PVTIME IPA")) {
    465         error_report("failed to init PVTIME IPA");
    466         abort();
    467     }
    468 }
    469 
    470 static int read_sys_reg32(int fd, uint32_t *pret, uint64_t id)
    471 {
    472     uint64_t ret;
    473     struct kvm_one_reg idreg = { .id = id, .addr = (uintptr_t)&ret };
    474     int err;
    475 
    476     assert((id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U64);
    477     err = ioctl(fd, KVM_GET_ONE_REG, &idreg);
    478     if (err < 0) {
    479         return -1;
    480     }
    481     *pret = ret;
    482     return 0;
    483 }
    484 
    485 static int read_sys_reg64(int fd, uint64_t *pret, uint64_t id)
    486 {
    487     struct kvm_one_reg idreg = { .id = id, .addr = (uintptr_t)pret };
    488 
    489     assert((id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U64);
    490     return ioctl(fd, KVM_GET_ONE_REG, &idreg);
    491 }
    492 
    493 static bool kvm_arm_pauth_supported(void)
    494 {
    495     return (kvm_check_extension(kvm_state, KVM_CAP_ARM_PTRAUTH_ADDRESS) &&
    496             kvm_check_extension(kvm_state, KVM_CAP_ARM_PTRAUTH_GENERIC));
    497 }
    498 
    499 bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf)
    500 {
    501     /* Identify the feature bits corresponding to the host CPU, and
    502      * fill out the ARMHostCPUClass fields accordingly. To do this
    503      * we have to create a scratch VM, create a single CPU inside it,
    504      * and then query that CPU for the relevant ID registers.
    505      */
    506     int fdarray[3];
    507     bool sve_supported;
    508     bool pmu_supported = false;
    509     uint64_t features = 0;
    510     int err;
    511 
    512     /* Old kernels may not know about the PREFERRED_TARGET ioctl: however
    513      * we know these will only support creating one kind of guest CPU,
    514      * which is its preferred CPU type. Fortunately these old kernels
    515      * support only a very limited number of CPUs.
    516      */
    517     static const uint32_t cpus_to_try[] = {
    518         KVM_ARM_TARGET_AEM_V8,
    519         KVM_ARM_TARGET_FOUNDATION_V8,
    520         KVM_ARM_TARGET_CORTEX_A57,
    521         QEMU_KVM_ARM_TARGET_NONE
    522     };
    523     /*
    524      * target = -1 informs kvm_arm_create_scratch_host_vcpu()
    525      * to use the preferred target
    526      */
    527     struct kvm_vcpu_init init = { .target = -1, };
    528 
    529     /*
    530      * Ask for SVE if supported, so that we can query ID_AA64ZFR0,
    531      * which is otherwise RAZ.
    532      */
    533     sve_supported = kvm_arm_sve_supported();
    534     if (sve_supported) {
    535         init.features[0] |= 1 << KVM_ARM_VCPU_SVE;
    536     }
    537 
    538     /*
    539      * Ask for Pointer Authentication if supported, so that we get
    540      * the unsanitized field values for AA64ISAR1_EL1.
    541      */
    542     if (kvm_arm_pauth_supported()) {
    543         init.features[0] |= (1 << KVM_ARM_VCPU_PTRAUTH_ADDRESS |
    544                              1 << KVM_ARM_VCPU_PTRAUTH_GENERIC);
    545     }
    546 
    547     if (kvm_arm_pmu_supported()) {
    548         init.features[0] |= 1 << KVM_ARM_VCPU_PMU_V3;
    549         pmu_supported = true;
    550     }
    551 
    552     if (!kvm_arm_create_scratch_host_vcpu(cpus_to_try, fdarray, &init)) {
    553         return false;
    554     }
    555 
    556     ahcf->target = init.target;
    557     ahcf->dtb_compatible = "arm,arm-v8";
    558 
    559     err = read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64pfr0,
    560                          ARM64_SYS_REG(3, 0, 0, 4, 0));
    561     if (unlikely(err < 0)) {
    562         /*
    563          * Before v4.15, the kernel only exposed a limited number of system
    564          * registers, not including any of the interesting AArch64 ID regs.
    565          * For the most part we could leave these fields as zero with minimal
    566          * effect, since this does not affect the values seen by the guest.
    567          *
    568          * However, it could cause problems down the line for QEMU,
    569          * so provide a minimal v8.0 default.
    570          *
    571          * ??? Could read MIDR and use knowledge from cpu64.c.
    572          * ??? Could map a page of memory into our temp guest and
    573          *     run the tiniest of hand-crafted kernels to extract
    574          *     the values seen by the guest.
    575          * ??? Either of these sounds like too much effort just
     576  *     to work around running a very old kernel.
    577          */
    578         ahcf->isar.id_aa64pfr0 = 0x00000011; /* EL1&0, AArch64 only */
    579         err = 0;
    580     } else {
    581         err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64pfr1,
    582                               ARM64_SYS_REG(3, 0, 0, 4, 1));
    583         err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64smfr0,
    584                               ARM64_SYS_REG(3, 0, 0, 4, 5));
    585         err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64dfr0,
    586                               ARM64_SYS_REG(3, 0, 0, 5, 0));
    587         err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64dfr1,
    588                               ARM64_SYS_REG(3, 0, 0, 5, 1));
    589         err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64isar0,
    590                               ARM64_SYS_REG(3, 0, 0, 6, 0));
    591         err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64isar1,
    592                               ARM64_SYS_REG(3, 0, 0, 6, 1));
    593         err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64mmfr0,
    594                               ARM64_SYS_REG(3, 0, 0, 7, 0));
    595         err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64mmfr1,
    596                               ARM64_SYS_REG(3, 0, 0, 7, 1));
    597         err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64mmfr2,
    598                               ARM64_SYS_REG(3, 0, 0, 7, 2));
    599 
    600         /*
    601          * Note that if AArch32 support is not present in the host,
    602          * the AArch32 sysregs are present to be read, but will
    603          * return UNKNOWN values.  This is neither better nor worse
    604          * than skipping the reads and leaving 0, as we must avoid
    605          * considering the values in every case.
    606          */
    607         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_pfr0,
    608                               ARM64_SYS_REG(3, 0, 0, 1, 0));
    609         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_pfr1,
    610                               ARM64_SYS_REG(3, 0, 0, 1, 1));
    611         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_dfr0,
    612                               ARM64_SYS_REG(3, 0, 0, 1, 2));
    613         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_mmfr0,
    614                               ARM64_SYS_REG(3, 0, 0, 1, 4));
    615         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_mmfr1,
    616                               ARM64_SYS_REG(3, 0, 0, 1, 5));
    617         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_mmfr2,
    618                               ARM64_SYS_REG(3, 0, 0, 1, 6));
    619         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_mmfr3,
    620                               ARM64_SYS_REG(3, 0, 0, 1, 7));
    621         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar0,
    622                               ARM64_SYS_REG(3, 0, 0, 2, 0));
    623         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar1,
    624                               ARM64_SYS_REG(3, 0, 0, 2, 1));
    625         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar2,
    626                               ARM64_SYS_REG(3, 0, 0, 2, 2));
    627         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar3,
    628                               ARM64_SYS_REG(3, 0, 0, 2, 3));
    629         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar4,
    630                               ARM64_SYS_REG(3, 0, 0, 2, 4));
    631         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar5,
    632                               ARM64_SYS_REG(3, 0, 0, 2, 5));
    633         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_mmfr4,
    634                               ARM64_SYS_REG(3, 0, 0, 2, 6));
    635         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar6,
    636                               ARM64_SYS_REG(3, 0, 0, 2, 7));
    637 
    638         err |= read_sys_reg32(fdarray[2], &ahcf->isar.mvfr0,
    639                               ARM64_SYS_REG(3, 0, 0, 3, 0));
    640         err |= read_sys_reg32(fdarray[2], &ahcf->isar.mvfr1,
    641                               ARM64_SYS_REG(3, 0, 0, 3, 1));
    642         err |= read_sys_reg32(fdarray[2], &ahcf->isar.mvfr2,
    643                               ARM64_SYS_REG(3, 0, 0, 3, 2));
    644         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_pfr2,
    645                               ARM64_SYS_REG(3, 0, 0, 3, 4));
    646         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_dfr1,
    647                               ARM64_SYS_REG(3, 0, 0, 3, 5));
    648         err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_mmfr5,
    649                               ARM64_SYS_REG(3, 0, 0, 3, 6));
    650 
    651         /*
    652          * DBGDIDR is a bit complicated because the kernel doesn't
    653          * provide an accessor for it in 64-bit mode, which is what this
    654          * scratch VM is in, and there's no architected "64-bit sysreg
    655          * which reads the same as the 32-bit register" the way there is
    656          * for other ID registers. Instead we synthesize a value from the
    657          * AArch64 ID_AA64DFR0, the same way the kernel code in
    658          * arch/arm64/kvm/sys_regs.c:trap_dbgidr() does.
    659          * We only do this if the CPU supports AArch32 at EL1.
    660          */
    661         if (FIELD_EX32(ahcf->isar.id_aa64pfr0, ID_AA64PFR0, EL1) >= 2) {
    662             int wrps = FIELD_EX64(ahcf->isar.id_aa64dfr0, ID_AA64DFR0, WRPS);
    663             int brps = FIELD_EX64(ahcf->isar.id_aa64dfr0, ID_AA64DFR0, BRPS);
    664             int ctx_cmps =
    665                 FIELD_EX64(ahcf->isar.id_aa64dfr0, ID_AA64DFR0, CTX_CMPS);
    666             int version = 6; /* ARMv8 debug architecture */
    667             bool has_el3 =
    668                 !!FIELD_EX32(ahcf->isar.id_aa64pfr0, ID_AA64PFR0, EL3);
    669             uint32_t dbgdidr = 0;
    670 
    671             dbgdidr = FIELD_DP32(dbgdidr, DBGDIDR, WRPS, wrps);
    672             dbgdidr = FIELD_DP32(dbgdidr, DBGDIDR, BRPS, brps);
    673             dbgdidr = FIELD_DP32(dbgdidr, DBGDIDR, CTX_CMPS, ctx_cmps);
    674             dbgdidr = FIELD_DP32(dbgdidr, DBGDIDR, VERSION, version);
    675             dbgdidr = FIELD_DP32(dbgdidr, DBGDIDR, NSUHD_IMP, has_el3);
    676             dbgdidr = FIELD_DP32(dbgdidr, DBGDIDR, SE_IMP, has_el3);
    677             dbgdidr |= (1 << 15); /* RES1 bit */
    678             ahcf->isar.dbgdidr = dbgdidr;
    679         }
    680 
    681         if (pmu_supported) {
    682             /* PMCR_EL0 is only accessible if the vCPU has feature PMU_V3 */
    683             err |= read_sys_reg64(fdarray[2], &ahcf->isar.reset_pmcr_el0,
    684                                   ARM64_SYS_REG(3, 3, 9, 12, 0));
    685         }
    686 
    687         if (sve_supported) {
    688             /*
    689              * There is a range of kernels between kernel commit 73433762fcae
    690              * and f81cb2c3ad41 which have a bug where the kernel doesn't
    691              * expose SYS_ID_AA64ZFR0_EL1 via the ONE_REG API unless the VM has
    692              * enabled SVE support, which resulted in an error rather than RAZ.
    693              * So only read the register if we set KVM_ARM_VCPU_SVE above.
    694              */
    695             err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64zfr0,
    696                                   ARM64_SYS_REG(3, 0, 0, 4, 4));
    697         }
    698     }
    699 
    700     kvm_arm_destroy_scratch_host_vcpu(fdarray);
    701 
    702     if (err < 0) {
    703         return false;
    704     }
    705 
    706     /*
    707      * We can assume any KVM supporting CPU is at least a v8
    708      * with VFPv4+Neon; this in turn implies most of the other
    709      * feature bits.
    710      */
    711     features |= 1ULL << ARM_FEATURE_V8;
    712     features |= 1ULL << ARM_FEATURE_NEON;
    713     features |= 1ULL << ARM_FEATURE_AARCH64;
    714     features |= 1ULL << ARM_FEATURE_PMU;
    715     features |= 1ULL << ARM_FEATURE_GENERIC_TIMER;
    716 
    717     ahcf->features = features;
    718 
    719     return true;
    720 }
    721 
    722 void kvm_arm_steal_time_finalize(ARMCPU *cpu, Error **errp)
    723 {
    724     bool has_steal_time = kvm_arm_steal_time_supported();
    725 
    726     if (cpu->kvm_steal_time == ON_OFF_AUTO_AUTO) {
    727         if (!has_steal_time || !arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
    728             cpu->kvm_steal_time = ON_OFF_AUTO_OFF;
    729         } else {
    730             cpu->kvm_steal_time = ON_OFF_AUTO_ON;
    731         }
    732     } else if (cpu->kvm_steal_time == ON_OFF_AUTO_ON) {
    733         if (!has_steal_time) {
    734             error_setg(errp, "'kvm-steal-time' cannot be enabled "
    735                              "on this host");
    736             return;
    737         } else if (!arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
    738             /*
    739              * DEN0057A chapter 2 says "This specification only covers
    740              * systems in which the Execution state of the hypervisor
    741              * as well as EL1 of virtual machines is AArch64.". And,
    742              * to ensure that, the smc/hvc calls are only specified as
    743              * smc64/hvc64.
    744              */
    745             error_setg(errp, "'kvm-steal-time' cannot be enabled "
    746                              "for AArch32 guests");
    747             return;
    748         }
    749     }
    750 }
    751 
    752 bool kvm_arm_aarch32_supported(void)
    753 {
    754     return kvm_check_extension(kvm_state, KVM_CAP_ARM_EL1_32BIT);
    755 }
    756 
    757 bool kvm_arm_sve_supported(void)
    758 {
    759     return kvm_check_extension(kvm_state, KVM_CAP_ARM_SVE);
    760 }
    761 
    762 bool kvm_arm_steal_time_supported(void)
    763 {
    764     return kvm_check_extension(kvm_state, KVM_CAP_STEAL_TIME);
    765 }
    766 
    767 QEMU_BUILD_BUG_ON(KVM_ARM64_SVE_VQ_MIN != 1);
    768 
    769 uint32_t kvm_arm_sve_get_vls(CPUState *cs)
    770 {
    771     /* Only call this function if kvm_arm_sve_supported() returns true. */
    772     static uint64_t vls[KVM_ARM64_SVE_VLS_WORDS];
    773     static bool probed;
    774     uint32_t vq = 0;
    775     int i;
    776 
    777     /*
    778      * KVM ensures all host CPUs support the same set of vector lengths.
    779      * So we only need to create the scratch VCPUs once and then cache
    780      * the results.
    781      */
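              /*
               * Illustrative decoding (assuming the KVM SVE_VLS bitmap layout,
               * with KVM_ARM64_SVE_VQ_MIN asserted to be 1 above): bit (vq - 1)
               * of vls[] is set when vectors of vq * 128 bits are supported, so
               * a hypothetical vls[0] == 0xb would mean 128, 256 and 512-bit
               * vectors and the loop below would compute vq = 4.
               */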
    782     if (!probed) {
    783         struct kvm_vcpu_init init = {
    784             .target = -1,
    785             .features[0] = (1 << KVM_ARM_VCPU_SVE),
    786         };
    787         struct kvm_one_reg reg = {
    788             .id = KVM_REG_ARM64_SVE_VLS,
    789             .addr = (uint64_t)&vls[0],
    790         };
    791         int fdarray[3], ret;
    792 
    793         probed = true;
    794 
    795         if (!kvm_arm_create_scratch_host_vcpu(NULL, fdarray, &init)) {
    796             error_report("failed to create scratch VCPU with SVE enabled");
    797             abort();
    798         }
    799         ret = ioctl(fdarray[2], KVM_GET_ONE_REG, &reg);
    800         kvm_arm_destroy_scratch_host_vcpu(fdarray);
    801         if (ret) {
    802             error_report("failed to get KVM_REG_ARM64_SVE_VLS: %s",
    803                          strerror(errno));
    804             abort();
    805         }
    806 
    807         for (i = KVM_ARM64_SVE_VLS_WORDS - 1; i >= 0; --i) {
    808             if (vls[i]) {
    809                 vq = 64 - clz64(vls[i]) + i * 64;
    810                 break;
    811             }
    812         }
    813         if (vq > ARM_MAX_VQ) {
    814             warn_report("KVM supports vector lengths larger than "
    815                         "QEMU can enable");
    816             vls[0] &= MAKE_64BIT_MASK(0, ARM_MAX_VQ);
    817         }
    818     }
    819 
    820     return vls[0];
    821 }
    822 
    823 static int kvm_arm_sve_set_vls(CPUState *cs)
    824 {
    825     ARMCPU *cpu = ARM_CPU(cs);
    826     uint64_t vls[KVM_ARM64_SVE_VLS_WORDS] = { cpu->sve_vq.map };
    827     struct kvm_one_reg reg = {
    828         .id = KVM_REG_ARM64_SVE_VLS,
    829         .addr = (uint64_t)&vls[0],
    830     };
    831 
    832     assert(cpu->sve_max_vq <= KVM_ARM64_SVE_VQ_MAX);
    833 
    834     return kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
    835 }
    836 
    837 #define ARM_CPU_ID_MPIDR       3, 0, 0, 0, 5
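          /* op0, op1, CRn, CRm, op2 for MPIDR_EL1 (s3_0_c0_c0_5), consumed by ARM64_SYS_REG() below */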
    838 
    839 int kvm_arch_init_vcpu(CPUState *cs)
    840 {
    841     int ret;
    842     uint64_t mpidr;
    843     ARMCPU *cpu = ARM_CPU(cs);
    844     CPUARMState *env = &cpu->env;
    845     uint64_t psciver;
    846 
    847     if (cpu->kvm_target == QEMU_KVM_ARM_TARGET_NONE ||
    848         !object_dynamic_cast(OBJECT(cpu), TYPE_AARCH64_CPU)) {
    849         error_report("KVM is not supported for this guest CPU type");
    850         return -EINVAL;
    851     }
    852 
    853     qemu_add_vm_change_state_handler(kvm_arm_vm_state_change, cs);
    854 
    855     /* Determine init features for this CPU */
    856     memset(cpu->kvm_init_features, 0, sizeof(cpu->kvm_init_features));
    857     if (cs->start_powered_off) {
    858         cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_POWER_OFF;
    859     }
    860     if (kvm_check_extension(cs->kvm_state, KVM_CAP_ARM_PSCI_0_2)) {
    861         cpu->psci_version = QEMU_PSCI_VERSION_0_2;
    862         cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_PSCI_0_2;
    863     }
    864     if (!arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
    865         cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_EL1_32BIT;
    866     }
    867     if (!kvm_check_extension(cs->kvm_state, KVM_CAP_ARM_PMU_V3)) {
    868         cpu->has_pmu = false;
    869     }
    870     if (cpu->has_pmu) {
    871         cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_PMU_V3;
    872     } else {
    873         env->features &= ~(1ULL << ARM_FEATURE_PMU);
    874     }
    875     if (cpu_isar_feature(aa64_sve, cpu)) {
    876         assert(kvm_arm_sve_supported());
    877         cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_SVE;
    878     }
    879     if (cpu_isar_feature(aa64_pauth, cpu)) {
    880         cpu->kvm_init_features[0] |= (1 << KVM_ARM_VCPU_PTRAUTH_ADDRESS |
    881                                       1 << KVM_ARM_VCPU_PTRAUTH_GENERIC);
    882     }
    883 
    884     /* Do KVM_ARM_VCPU_INIT ioctl */
    885     ret = kvm_arm_vcpu_init(cs);
    886     if (ret) {
    887         return ret;
    888     }
    889 
    890     if (cpu_isar_feature(aa64_sve, cpu)) {
    891         ret = kvm_arm_sve_set_vls(cs);
    892         if (ret) {
    893             return ret;
    894         }
    895         ret = kvm_arm_vcpu_finalize(cs, KVM_ARM_VCPU_SVE);
    896         if (ret) {
    897             return ret;
    898         }
    899     }
    900 
    901     /*
    902      * KVM reports the exact PSCI version it is implementing via a
    903      * special sysreg. If it is present, use its contents to determine
    904      * what to report to the guest in the dtb (it is the PSCI version,
    905      * in the same 15-bits major 16-bits minor format that PSCI_VERSION
    906      * returns).
    907      */
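              /*
               * For instance, a host implementing PSCI v1.1 would report
               * (1 << 16) | 1 = 0x00010001 here.
               */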
    908     if (!kvm_get_one_reg(cs, KVM_REG_ARM_PSCI_VERSION, &psciver)) {
    909         cpu->psci_version = psciver;
    910     }
    911 
    912     /*
    913      * When KVM is in use, PSCI is emulated in-kernel and not by qemu.
    914      * Currently KVM has its own idea about MPIDR assignment, so we
    915      * override our defaults with what we get from KVM.
    916      */
    917     ret = kvm_get_one_reg(cs, ARM64_SYS_REG(ARM_CPU_ID_MPIDR), &mpidr);
    918     if (ret) {
    919         return ret;
    920     }
    921     cpu->mp_affinity = mpidr & ARM64_AFFINITY_MASK;
    922 
    923     kvm_arm_init_debug(cs);
    924 
    925     /* Check whether user space can specify guest syndrome value */
    926     kvm_arm_init_serror_injection(cs);
    927 
    928     return kvm_arm_init_cpreg_list(cpu);
    929 }
    930 
    931 int kvm_arch_destroy_vcpu(CPUState *cs)
    932 {
    933     return 0;
    934 }
    935 
    936 bool kvm_arm_reg_syncs_via_cpreg_list(uint64_t regidx)
    937 {
    938     /* Return true if the regidx is a register we should synchronize
    939      * via the cpreg_tuples array (ie is not a core or sve reg that
    940      * we sync by hand in kvm_arch_get/put_registers())
    941      */
    942     switch (regidx & KVM_REG_ARM_COPROC_MASK) {
    943     case KVM_REG_ARM_CORE:
    944     case KVM_REG_ARM64_SVE:
    945         return false;
    946     default:
    947         return true;
    948     }
    949 }
    950 
    951 typedef struct CPRegStateLevel {
    952     uint64_t regidx;
    953     int level;
    954 } CPRegStateLevel;
    955 
    956 /* All system registers not listed in the following table are assumed to be
    957  * of the level KVM_PUT_RUNTIME_STATE. If a register should be written less
    958  * often, you must add it to this table with a state of either
    959  * KVM_PUT_RESET_STATE or KVM_PUT_FULL_STATE.
    960  */
    961 static const CPRegStateLevel non_runtime_cpregs[] = {
    962     { KVM_REG_ARM_TIMER_CNT, KVM_PUT_FULL_STATE },
    963 };
    964 
    965 int kvm_arm_cpreg_level(uint64_t regidx)
    966 {
    967     int i;
    968 
    969     for (i = 0; i < ARRAY_SIZE(non_runtime_cpregs); i++) {
    970         const CPRegStateLevel *l = &non_runtime_cpregs[i];
    971         if (l->regidx == regidx) {
    972             return l->level;
    973         }
    974     }
    975 
    976     return KVM_PUT_RUNTIME_STATE;
    977 }
    978 
    979 /* Callers must hold the iothread mutex lock */
    980 static void kvm_inject_arm_sea(CPUState *c)
    981 {
    982     ARMCPU *cpu = ARM_CPU(c);
    983     CPUARMState *env = &cpu->env;
    984     uint32_t esr;
    985     bool same_el;
    986 
    987     c->exception_index = EXCP_DATA_ABORT;
    988     env->exception.target_el = 1;
    989 
    990     /*
    991      * Set the DFSC to synchronous external abort and set FnV to not valid,
    992      * this will tell guest the FAR_ELx is UNKNOWN for this abort.
    993      */
    994     same_el = arm_current_el(env) == env->exception.target_el;
    995     esr = syn_data_abort_no_iss(same_el, 1, 0, 0, 0, 0, 0x10);
    996 
    997     env->exception.syndrome = esr;
    998 
    999     arm_cpu_do_interrupt(c);
   1000 }
   1001 
   1002 #define AARCH64_CORE_REG(x)   (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \
   1003                  KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x))
   1004 
   1005 #define AARCH64_SIMD_CORE_REG(x)   (KVM_REG_ARM64 | KVM_REG_SIZE_U128 | \
   1006                  KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x))
   1007 
   1008 #define AARCH64_SIMD_CTRL_REG(x)   (KVM_REG_ARM64 | KVM_REG_SIZE_U32 | \
   1009                  KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x))
   1010 
   1011 static int kvm_arch_put_fpsimd(CPUState *cs)
   1012 {
   1013     CPUARMState *env = &ARM_CPU(cs)->env;
   1014     struct kvm_one_reg reg;
   1015     int i, ret;
   1016 
   1017     for (i = 0; i < 32; i++) {
   1018         uint64_t *q = aa64_vfp_qreg(env, i);
   1019 #if HOST_BIG_ENDIAN
   1020         uint64_t fp_val[2] = { q[1], q[0] };
   1021         reg.addr = (uintptr_t)fp_val;
   1022 #else
   1023         reg.addr = (uintptr_t)q;
   1024 #endif
   1025         reg.id = AARCH64_SIMD_CORE_REG(fp_regs.vregs[i]);
   1026         ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
   1027         if (ret) {
   1028             return ret;
   1029         }
   1030     }
   1031 
   1032     return 0;
   1033 }
   1034 
   1035 /*
   1036  * KVM SVE registers come in slices where ZREGs have a slice size of 2048 bits
   1037  * and PREGS and the FFR have a slice size of 256 bits. However we simply hard
   1038  * code the slice index to zero for now as it's unlikely we'll need more than
   1039  * one slice for quite some time.
   1040  */
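          /*
           * QEMU's ARM_MAX_VQ (16, i.e. 2048-bit vectors) fits in a single
           * ZREG slice, so slice index 0 covers every length we can configure.
           */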
   1041 static int kvm_arch_put_sve(CPUState *cs)
   1042 {
   1043     ARMCPU *cpu = ARM_CPU(cs);
   1044     CPUARMState *env = &cpu->env;
   1045     uint64_t tmp[ARM_MAX_VQ * 2];
   1046     uint64_t *r;
   1047     struct kvm_one_reg reg;
   1048     int n, ret;
   1049 
   1050     for (n = 0; n < KVM_ARM64_SVE_NUM_ZREGS; ++n) {
   1051         r = sve_bswap64(tmp, &env->vfp.zregs[n].d[0], cpu->sve_max_vq * 2);
   1052         reg.addr = (uintptr_t)r;
   1053         reg.id = KVM_REG_ARM64_SVE_ZREG(n, 0);
   1054         ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
   1055         if (ret) {
   1056             return ret;
   1057         }
   1058     }
   1059 
   1060     for (n = 0; n < KVM_ARM64_SVE_NUM_PREGS; ++n) {
   1061         r = sve_bswap64(tmp, r = &env->vfp.pregs[n].p[0],
   1062                         DIV_ROUND_UP(cpu->sve_max_vq * 2, 8));
   1063         reg.addr = (uintptr_t)r;
   1064         reg.id = KVM_REG_ARM64_SVE_PREG(n, 0);
   1065         ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
   1066         if (ret) {
   1067             return ret;
   1068         }
   1069     }
   1070 
   1071     r = sve_bswap64(tmp, &env->vfp.pregs[FFR_PRED_NUM].p[0],
   1072                     DIV_ROUND_UP(cpu->sve_max_vq * 2, 8));
   1073     reg.addr = (uintptr_t)r;
   1074     reg.id = KVM_REG_ARM64_SVE_FFR(0);
   1075     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
   1076     if (ret) {
   1077         return ret;
   1078     }
   1079 
   1080     return 0;
   1081 }
   1082 
   1083 int kvm_arch_put_registers(CPUState *cs, int level)
   1084 {
   1085     struct kvm_one_reg reg;
   1086     uint64_t val;
   1087     uint32_t fpr;
   1088     int i, ret;
   1089     unsigned int el;
   1090 
   1091     ARMCPU *cpu = ARM_CPU(cs);
   1092     CPUARMState *env = &cpu->env;
   1093 
   1094     /* If we are in AArch32 mode then we need to copy the AArch32 regs to the
   1095      * AArch64 registers before pushing them out to 64-bit KVM.
   1096      */
   1097     if (!is_a64(env)) {
   1098         aarch64_sync_32_to_64(env);
   1099     }
   1100 
   1101     for (i = 0; i < 31; i++) {
   1102         reg.id = AARCH64_CORE_REG(regs.regs[i]);
   1103         reg.addr = (uintptr_t) &env->xregs[i];
   1104         ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
   1105         if (ret) {
   1106             return ret;
   1107         }
   1108     }
   1109 
   1110     /* KVM puts SP_EL0 in regs.sp and SP_EL1 in regs.sp_el1. On the
   1111      * QEMU side we keep the current SP in xregs[31] as well.
   1112      */
   1113     aarch64_save_sp(env, 1);
   1114 
   1115     reg.id = AARCH64_CORE_REG(regs.sp);
   1116     reg.addr = (uintptr_t) &env->sp_el[0];
   1117     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
   1118     if (ret) {
   1119         return ret;
   1120     }
   1121 
   1122     reg.id = AARCH64_CORE_REG(sp_el1);
   1123     reg.addr = (uintptr_t) &env->sp_el[1];
   1124     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
   1125     if (ret) {
   1126         return ret;
   1127     }
   1128 
   1129     /* Note that KVM thinks pstate is 64 bit but we use a uint32_t */
   1130     if (is_a64(env)) {
   1131         val = pstate_read(env);
   1132     } else {
   1133         val = cpsr_read(env);
   1134     }
   1135     reg.id = AARCH64_CORE_REG(regs.pstate);
   1136     reg.addr = (uintptr_t) &val;
   1137     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
   1138     if (ret) {
   1139         return ret;
   1140     }
   1141 
   1142     reg.id = AARCH64_CORE_REG(regs.pc);
   1143     reg.addr = (uintptr_t) &env->pc;
   1144     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
   1145     if (ret) {
   1146         return ret;
   1147     }
   1148 
   1149     reg.id = AARCH64_CORE_REG(elr_el1);
   1150     reg.addr = (uintptr_t) &env->elr_el[1];
   1151     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
   1152     if (ret) {
   1153         return ret;
   1154     }
   1155 
   1156     /* Saved Program State Registers
   1157      *
   1158      * Before we restore from the banked_spsr[] array we need to
   1159      * ensure that any modifications to env->spsr are correctly
   1160      * reflected in the banks.
   1161      */
   1162     el = arm_current_el(env);
   1163     if (el > 0 && !is_a64(env)) {
   1164         i = bank_number(env->uncached_cpsr & CPSR_M);
   1165         env->banked_spsr[i] = env->spsr;
   1166     }
   1167 
   1168     /* KVM 0-4 map to QEMU banks 1-5 */
   1169     for (i = 0; i < KVM_NR_SPSR; i++) {
   1170         reg.id = AARCH64_CORE_REG(spsr[i]);
   1171         reg.addr = (uintptr_t) &env->banked_spsr[i + 1];
   1172         ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
   1173         if (ret) {
   1174             return ret;
   1175         }
   1176     }
   1177 
   1178     if (cpu_isar_feature(aa64_sve, cpu)) {
   1179         ret = kvm_arch_put_sve(cs);
   1180     } else {
   1181         ret = kvm_arch_put_fpsimd(cs);
   1182     }
   1183     if (ret) {
   1184         return ret;
   1185     }
   1186 
   1187     reg.addr = (uintptr_t)(&fpr);
   1188     fpr = vfp_get_fpsr(env);
   1189     reg.id = AARCH64_SIMD_CTRL_REG(fp_regs.fpsr);
   1190     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
   1191     if (ret) {
   1192         return ret;
   1193     }
   1194 
   1195     reg.addr = (uintptr_t)(&fpr);
   1196     fpr = vfp_get_fpcr(env);
   1197     reg.id = AARCH64_SIMD_CTRL_REG(fp_regs.fpcr);
   1198     ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
   1199     if (ret) {
   1200         return ret;
   1201     }
   1202 
   1203     write_cpustate_to_list(cpu, true);
   1204 
   1205     if (!write_list_to_kvmstate(cpu, level)) {
   1206         return -EINVAL;
   1207     }
   1208 
   1209    /*
   1210     * Setting VCPU events should be triggered after syncing the registers
   1211     * to avoid overwriting potential changes made by KVM upon calling
   1212     * KVM_SET_VCPU_EVENTS ioctl
   1213     */
   1214     ret = kvm_put_vcpu_events(cpu);
   1215     if (ret) {
   1216         return ret;
   1217     }
   1218 
   1219     kvm_arm_sync_mpstate_to_kvm(cpu);
   1220 
   1221     return ret;
   1222 }
   1223 
   1224 static int kvm_arch_get_fpsimd(CPUState *cs)
   1225 {
   1226     CPUARMState *env = &ARM_CPU(cs)->env;
   1227     struct kvm_one_reg reg;
   1228     int i, ret;
   1229 
   1230     for (i = 0; i < 32; i++) {
   1231         uint64_t *q = aa64_vfp_qreg(env, i);
   1232         reg.id = AARCH64_SIMD_CORE_REG(fp_regs.vregs[i]);
   1233         reg.addr = (uintptr_t)q;
   1234         ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
   1235         if (ret) {
   1236             return ret;
   1237         } else {
   1238 #if HOST_BIG_ENDIAN
   1239             uint64_t t;
   1240             t = q[0], q[0] = q[1], q[1] = t;
   1241 #endif
   1242         }
   1243     }
   1244 
   1245     return 0;
   1246 }
   1247 
   1248 /*
   1249  * KVM SVE registers come in slices where ZREGs have a slice size of 2048 bits
   1250  * and PREGS and the FFR have a slice size of 256 bits. However we simply hard
   1251  * code the slice index to zero for now as it's unlikely we'll need more than
   1252  * one slice for quite some time.
   1253  */
   1254 static int kvm_arch_get_sve(CPUState *cs)
   1255 {
   1256     ARMCPU *cpu = ARM_CPU(cs);
   1257     CPUARMState *env = &cpu->env;
   1258     struct kvm_one_reg reg;
   1259     uint64_t *r;
   1260     int n, ret;
   1261 
   1262     for (n = 0; n < KVM_ARM64_SVE_NUM_ZREGS; ++n) {
   1263         r = &env->vfp.zregs[n].d[0];
   1264         reg.addr = (uintptr_t)r;
   1265         reg.id = KVM_REG_ARM64_SVE_ZREG(n, 0);
   1266         ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
   1267         if (ret) {
   1268             return ret;
   1269         }
   1270         sve_bswap64(r, r, cpu->sve_max_vq * 2);
   1271     }
   1272 
   1273     for (n = 0; n < KVM_ARM64_SVE_NUM_PREGS; ++n) {
   1274         r = &env->vfp.pregs[n].p[0];
   1275         reg.addr = (uintptr_t)r;
   1276         reg.id = KVM_REG_ARM64_SVE_PREG(n, 0);
   1277         ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
   1278         if (ret) {
   1279             return ret;
   1280         }
   1281         sve_bswap64(r, r, DIV_ROUND_UP(cpu->sve_max_vq * 2, 8));
   1282     }
   1283 
   1284     r = &env->vfp.pregs[FFR_PRED_NUM].p[0];
   1285     reg.addr = (uintptr_t)r;
   1286     reg.id = KVM_REG_ARM64_SVE_FFR(0);
   1287     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
   1288     if (ret) {
   1289         return ret;
   1290     }
   1291     sve_bswap64(r, r, DIV_ROUND_UP(cpu->sve_max_vq * 2, 8));
   1292 
   1293     return 0;
   1294 }
   1295 
   1296 int kvm_arch_get_registers(CPUState *cs)
   1297 {
   1298     struct kvm_one_reg reg;
   1299     uint64_t val;
   1300     unsigned int el;
   1301     uint32_t fpr;
   1302     int i, ret;
   1303 
   1304     ARMCPU *cpu = ARM_CPU(cs);
   1305     CPUARMState *env = &cpu->env;
   1306 
   1307     for (i = 0; i < 31; i++) {
   1308         reg.id = AARCH64_CORE_REG(regs.regs[i]);
   1309         reg.addr = (uintptr_t) &env->xregs[i];
   1310         ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
   1311         if (ret) {
   1312             return ret;
   1313         }
   1314     }
   1315 
   1316     reg.id = AARCH64_CORE_REG(regs.sp);
   1317     reg.addr = (uintptr_t) &env->sp_el[0];
   1318     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
   1319     if (ret) {
   1320         return ret;
   1321     }
   1322 
   1323     reg.id = AARCH64_CORE_REG(sp_el1);
   1324     reg.addr = (uintptr_t) &env->sp_el[1];
   1325     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
   1326     if (ret) {
   1327         return ret;
   1328     }
   1329 
   1330     reg.id = AARCH64_CORE_REG(regs.pstate);
   1331     reg.addr = (uintptr_t) &val;
   1332     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
   1333     if (ret) {
   1334         return ret;
   1335     }
   1336 
   1337     env->aarch64 = ((val & PSTATE_nRW) == 0);
   1338     if (is_a64(env)) {
   1339         pstate_write(env, val);
   1340     } else {
   1341         cpsr_write(env, val, 0xffffffff, CPSRWriteRaw);
   1342     }
   1343 
   1344     /* KVM puts SP_EL0 in regs.sp and SP_EL1 in regs.sp_el1. On the
   1345      * QEMU side we keep the current SP in xregs[31] as well.
   1346      */
   1347     aarch64_restore_sp(env, 1);
   1348 
   1349     reg.id = AARCH64_CORE_REG(regs.pc);
   1350     reg.addr = (uintptr_t) &env->pc;
   1351     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
   1352     if (ret) {
   1353         return ret;
   1354     }
   1355 
   1356     /* If we are in AArch32 mode then we need to sync the AArch32 regs with the
   1357      * incoming AArch64 regs received from 64-bit KVM.
   1358      * We must perform this after all of the registers have been acquired from
   1359      * the kernel.
   1360      */
   1361     if (!is_a64(env)) {
   1362         aarch64_sync_64_to_32(env);
   1363     }
   1364 
   1365     reg.id = AARCH64_CORE_REG(elr_el1);
   1366     reg.addr = (uintptr_t) &env->elr_el[1];
   1367     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
   1368     if (ret) {
   1369         return ret;
   1370     }
   1371 
   1372     /* Fetch the SPSR registers
   1373      *
   1374      * KVM SPSRs 0-4 map to QEMU banks 1-5
   1375      */
   1376     for (i = 0; i < KVM_NR_SPSR; i++) {
   1377         reg.id = AARCH64_CORE_REG(spsr[i]);
   1378         reg.addr = (uintptr_t) &env->banked_spsr[i + 1];
   1379         ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
   1380         if (ret) {
   1381             return ret;
   1382         }
   1383     }
   1384 
   1385     el = arm_current_el(env);
   1386     if (el > 0 && !is_a64(env)) {
   1387         i = bank_number(env->uncached_cpsr & CPSR_M);
   1388         env->spsr = env->banked_spsr[i];
   1389     }
   1390 
   1391     if (cpu_isar_feature(aa64_sve, cpu)) {
   1392         ret = kvm_arch_get_sve(cs);
   1393     } else {
   1394         ret = kvm_arch_get_fpsimd(cs);
   1395     }
   1396     if (ret) {
   1397         return ret;
   1398     }
   1399 
   1400     reg.addr = (uintptr_t)(&fpr);
   1401     reg.id = AARCH64_SIMD_CTRL_REG(fp_regs.fpsr);
   1402     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
   1403     if (ret) {
   1404         return ret;
   1405     }
   1406     vfp_set_fpsr(env, fpr);
   1407 
   1408     reg.addr = (uintptr_t)(&fpr);
   1409     reg.id = AARCH64_SIMD_CTRL_REG(fp_regs.fpcr);
   1410     ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
   1411     if (ret) {
   1412         return ret;
   1413     }
   1414     vfp_set_fpcr(env, fpr);
   1415 
   1416     ret = kvm_get_vcpu_events(cpu);
   1417     if (ret) {
   1418         return ret;
   1419     }
   1420 
   1421     if (!write_kvmstate_to_list(cpu)) {
   1422         return -EINVAL;
   1423     }
   1424     /* Note that it's OK to have registers which aren't in CPUState,
   1425      * so we can ignore a failure return here.
   1426      */
   1427     write_list_to_cpustate(cpu);
   1428 
   1429     kvm_arm_sync_mpstate_to_qemu(cpu);
   1430 
   1431     /* TODO: other registers */
   1432     return ret;
   1433 }
   1434 
   1435 void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
   1436 {
   1437     ram_addr_t ram_addr;
   1438     hwaddr paddr;
   1439 
   1440     assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
   1441 
   1442     if (acpi_ghes_present() && addr) {
   1443         ram_addr = qemu_ram_addr_from_host(addr);
   1444         if (ram_addr != RAM_ADDR_INVALID &&
   1445             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
   1446             kvm_hwpoison_page_add(ram_addr);
   1447             /*
   1448              * If this is a BUS_MCEERR_AR, we know we have been called
   1449              * synchronously from the vCPU thread, so we can easily
   1450              * synchronize the state and inject an error.
   1451              *
   1452              * TODO: we currently don't tell the guest at all about
   1453              * BUS_MCEERR_AO. In that case we might either be being
   1454              * called synchronously from the vCPU thread, or a bit
   1455              * later from the main thread, so doing the injection of
   1456              * the error would be more complicated.
   1457              */
   1458             if (code == BUS_MCEERR_AR) {
   1459                 kvm_cpu_synchronize_state(c);
   1460                 if (!acpi_ghes_record_errors(ACPI_HEST_SRC_ID_SEA, paddr)) {
   1461                     kvm_inject_arm_sea(c);
   1462                 } else {
   1463                     error_report("failed to record the error");
   1464                     abort();
   1465                 }
   1466             }
   1467             return;
   1468         }
   1469         if (code == BUS_MCEERR_AO) {
   1470             error_report("Hardware memory error at addr %p for memory used by "
   1471                 "QEMU itself instead of guest system!", addr);
   1472         }
   1473     }
   1474 
   1475     if (code == BUS_MCEERR_AR) {
   1476         error_report("Hardware memory error!");
   1477         exit(1);
   1478     }
   1479 }
   1480 
   1481 /* C6.6.29 BRK instruction */
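          /* 0xd4200000 encodes BRK #0 (the 16-bit immediate field is zero) */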
   1482 static const uint32_t brk_insn = 0xd4200000;
   1483 
   1484 int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
   1485 {
   1486     if (have_guest_debug) {
   1487         if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 4, 0) ||
   1488             cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&brk_insn, 4, 1)) {
   1489             return -EINVAL;
   1490         }
   1491         return 0;
   1492     } else {
   1493         error_report("guest debug not supported on this kernel");
   1494         return -EINVAL;
   1495     }
   1496 }
   1497 
   1498 int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
   1499 {
   1500     static uint32_t brk;
   1501 
   1502     if (have_guest_debug) {
   1503         if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&brk, 4, 0) ||
   1504             brk != brk_insn ||
   1505             cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 4, 1)) {
   1506             return -EINVAL;
   1507         }
   1508         return 0;
   1509     } else {
   1510         error_report("guest debug not supported on this kernel");
   1511         return -EINVAL;
   1512     }
   1513 }
   1514 
   1515 /* See v8 ARM ARM D7.2.27 ESR_ELx, Exception Syndrome Register
   1516  *
   1517  * To minimise translating between kernel and user-space the kernel
   1518  * ABI just provides user-space with the full exception syndrome
   1519  * register value to be decoded in QEMU.
   1520  */
   1521 
   1522 bool kvm_arm_handle_debug(CPUState *cs, struct kvm_debug_exit_arch *debug_exit)
   1523 {
   1524     int hsr_ec = syn_get_ec(debug_exit->hsr);
   1525     ARMCPU *cpu = ARM_CPU(cs);
   1526     CPUARMState *env = &cpu->env;
   1527 
   1528     /* Ensure PC is synchronised */
   1529     kvm_cpu_synchronize_state(cs);
   1530 
   1531     switch (hsr_ec) {
   1532     case EC_SOFTWARESTEP:
   1533         if (cs->singlestep_enabled) {
   1534             return true;
   1535         } else {
   1536             /*
   1537              * The kernel should have suppressed the guest's ability to
   1538              * single step at this point so something has gone wrong.
   1539              */
   1540             error_report("%s: guest single-step while debugging unsupported"
   1541                          " (%"PRIx64", %"PRIx32")",
   1542                          __func__, env->pc, debug_exit->hsr);
   1543             return false;
   1544         }
   1545         break;
   1546     case EC_AA64_BKPT:
   1547         if (kvm_find_sw_breakpoint(cs, env->pc)) {
   1548             return true;
   1549         }
   1550         break;
   1551     case EC_BREAKPOINT:
   1552         if (find_hw_breakpoint(cs, env->pc)) {
   1553             return true;
   1554         }
   1555         break;
   1556     case EC_WATCHPOINT:
   1557     {
   1558         CPUWatchpoint *wp = find_hw_watchpoint(cs, debug_exit->far);
   1559         if (wp) {
   1560             cs->watchpoint_hit = wp;
   1561             return true;
   1562         }
   1563         break;
   1564     }
   1565     default:
   1566         error_report("%s: unhandled debug exit (%"PRIx32", %"PRIx64")",
   1567                      __func__, debug_exit->hsr, env->pc);
   1568     }
   1569 
   1570     /* If we are not handling the debug exception it must belong to
   1571      * the guest. Let's re-use the existing TCG interrupt code to set
   1572      * everything up properly.
   1573      */
   1574     cs->exception_index = EXCP_BKPT;
   1575     env->exception.syndrome = debug_exit->hsr;
   1576     env->exception.vaddress = debug_exit->far;
   1577     env->exception.target_el = 1;
   1578     qemu_mutex_lock_iothread();
   1579     arm_cpu_do_interrupt(cs);
   1580     qemu_mutex_unlock_iothread();
   1581 
   1582     return false;
   1583 }
   1584 
   1585 #define ARM64_REG_ESR_EL1 ARM64_SYS_REG(3, 0, 5, 2, 0)
   1586 #define ARM64_REG_TCR_EL1 ARM64_SYS_REG(3, 0, 2, 0, 2)
   1587 
   1588 /*
   1589  * ESR_EL1
   1590  * ISS encoding
   1591  * AARCH64: DFSC,   bits [5:0]
   1592  * AARCH32:
   1593  *      TTBCR.EAE == 0
   1594  *          FS[4]   - DFSR[10]
   1595  *          FS[3:0] - DFSR[3:0]
   1596  *      TTBCR.EAE == 1
   1597  *          FS, bits [5:0]
    1598  *          FS, bits [5:0]
          */
   1599 #define ESR_DFSC(aarch64, lpae, v)        \
   1600     ((aarch64 || (lpae)) ? ((v) & 0x3F)   \
   1601                : (((v) >> 6) | ((v) & 0x1F)))
   1602 
   1603 #define ESR_DFSC_EXTABT(aarch64, lpae) \
   1604     ((aarch64) ? 0x10 : (lpae) ? 0x10 : 0x8)
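          /*
           * 0x10 is the DFSC/FS code for a synchronous external abort (not on
           * a translation table walk) in the AArch64 and LPAE formats; 0x8 is
           * the equivalent FS code in the AArch32 short-descriptor format.
           */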
   1605 
   1606 bool kvm_arm_verify_ext_dabt_pending(CPUState *cs)
   1607 {
   1608     uint64_t dfsr_val;
   1609 
   1610     if (!kvm_get_one_reg(cs, ARM64_REG_ESR_EL1, &dfsr_val)) {
   1611         ARMCPU *cpu = ARM_CPU(cs);
   1612         CPUARMState *env = &cpu->env;
   1613         int aarch64_mode = arm_feature(env, ARM_FEATURE_AARCH64);
   1614         int lpae = 0;
   1615 
   1616         if (!aarch64_mode) {
   1617             uint64_t ttbcr;
   1618 
   1619             if (!kvm_get_one_reg(cs, ARM64_REG_TCR_EL1, &ttbcr)) {
   1620                 lpae = arm_feature(env, ARM_FEATURE_LPAE)
   1621                         && (ttbcr & TTBCR_EAE);
   1622             }
   1623         }
   1624         /*
   1625          * The verification here is based on the DFSC bits
   1626          * of the ESR_EL1 reg only
   1627          */
   1628          return (ESR_DFSC(aarch64_mode, lpae, dfsr_val) ==
   1629                 ESR_DFSC_EXTABT(aarch64_mode, lpae));
   1630     }
   1631     return false;
   1632 }