qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

spapr.c (172198B)


      1 /*
      2  * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
      3  *
      4  * Copyright (c) 2004-2007 Fabrice Bellard
      5  * Copyright (c) 2007 Jocelyn Mayer
      6  * Copyright (c) 2010 David Gibson, IBM Corporation.
      7  *
      8  * Permission is hereby granted, free of charge, to any person obtaining a copy
      9  * of this software and associated documentation files (the "Software"), to deal
     10  * in the Software without restriction, including without limitation the rights
     11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     12  * copies of the Software, and to permit persons to whom the Software is
     13  * furnished to do so, subject to the following conditions:
     14  *
     15  * The above copyright notice and this permission notice shall be included in
     16  * all copies or substantial portions of the Software.
     17  *
     18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     21  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     24  * THE SOFTWARE.
     25  */
     26 
     27 #include "qemu/osdep.h"
     28 #include "qemu/datadir.h"
     29 #include "qemu/memalign.h"
     30 #include "qemu/guest-random.h"
     31 #include "qapi/error.h"
     32 #include "qapi/qapi-events-machine.h"
     33 #include "qapi/qapi-events-qdev.h"
     34 #include "qapi/visitor.h"
     35 #include "sysemu/sysemu.h"
     36 #include "sysemu/hostmem.h"
     37 #include "sysemu/numa.h"
     38 #include "sysemu/qtest.h"
     39 #include "sysemu/reset.h"
     40 #include "sysemu/runstate.h"
     41 #include "qemu/log.h"
     42 #include "hw/fw-path-provider.h"
     43 #include "elf.h"
     44 #include "net/net.h"
     45 #include "sysemu/device_tree.h"
     46 #include "sysemu/cpus.h"
     47 #include "sysemu/hw_accel.h"
     48 #include "kvm_ppc.h"
     49 #include "migration/misc.h"
     50 #include "migration/qemu-file-types.h"
     51 #include "migration/global_state.h"
     52 #include "migration/register.h"
     53 #include "migration/blocker.h"
     54 #include "mmu-hash64.h"
     55 #include "mmu-book3s-v3.h"
     56 #include "cpu-models.h"
     57 #include "hw/core/cpu.h"
     58 
     59 #include "hw/ppc/ppc.h"
     60 #include "hw/loader.h"
     61 
     62 #include "hw/ppc/fdt.h"
     63 #include "hw/ppc/spapr.h"
     64 #include "hw/ppc/spapr_vio.h"
     65 #include "hw/qdev-properties.h"
     66 #include "hw/pci-host/spapr.h"
     67 #include "hw/pci/msi.h"
     68 
     69 #include "hw/pci/pci.h"
     70 #include "hw/scsi/scsi.h"
     71 #include "hw/virtio/virtio-scsi.h"
     72 #include "hw/virtio/vhost-scsi-common.h"
     73 
     74 #include "exec/ram_addr.h"
     75 #include "hw/usb.h"
     76 #include "qemu/config-file.h"
     77 #include "qemu/error-report.h"
     78 #include "trace.h"
     79 #include "hw/nmi.h"
     80 #include "hw/intc/intc.h"
     81 
     82 #include "hw/ppc/spapr_cpu_core.h"
     83 #include "hw/mem/memory-device.h"
     84 #include "hw/ppc/spapr_tpm_proxy.h"
     85 #include "hw/ppc/spapr_nvdimm.h"
     86 #include "hw/ppc/spapr_numa.h"
     87 #include "hw/ppc/pef.h"
     88 
     89 #include "monitor/monitor.h"
     90 
     91 #include <libfdt.h>
     92 
     93 /* SLOF memory layout:
     94  *
 * SLOF raw image loaded at 0, copies its romfs right below the flat
 * device-tree, then positions SLOF itself 31M below that
     97  *
     98  * So we set FW_OVERHEAD to 40MB which should account for all of that
     99  * and more
    100  *
    101  * We load our kernel at 4M, leaving space for SLOF initial image
    102  */
#define FDT_MAX_ADDR            0x80000000 /* FDT must stay below that */
/* 0x400000 == 4 MiB; KERNEL_LOAD_ADDR below is defined as this same value */
#define FW_MAX_SIZE             0x400000
/* Firmware image file names (SLOF, and the "VOF" alternative) */
#define FW_FILE_NAME            "slof.bin"
#define FW_FILE_NAME_VOF        "vof.bin"
/* 0x2800000 == 40 MiB, the firmware overhead described in the comment above */
#define FW_OVERHEAD             0x2800000
/* The kernel is loaded right after the space left for the firmware image */
#define KERNEL_LOAD_ADDR        FW_MAX_SIZE

/* NOTE(review): presumably the minimum RMA size needed with SLOF - confirm */
#define MIN_RMA_SLOF            (128 * MiB)

/* NOTE(review): presumably the interrupt controller's phandle - confirm */
#define PHANDLE_INTC            0x00001111
    113 
    114 /* These two functions implement the VCPU id numbering: one to compute them
    115  * all and one to identify thread 0 of a VCORE. Any change to the first one
    116  * is likely to have an impact on the second one, so let's keep them close.
    117  */
    118 static int spapr_vcpu_id(SpaprMachineState *spapr, int cpu_index)
    119 {
    120     MachineState *ms = MACHINE(spapr);
    121     unsigned int smp_threads = ms->smp.threads;
    122 
    123     assert(spapr->vsmt);
    124     return
    125         (cpu_index / smp_threads) * spapr->vsmt + cpu_index % smp_threads;
    126 }
    127 static bool spapr_is_thread0_in_vcore(SpaprMachineState *spapr,
    128                                       PowerPCCPU *cpu)
    129 {
    130     assert(spapr->vsmt);
    131     return spapr_get_vcpu_id(cpu) % spapr->vsmt == 0;
    132 }
    133 
    134 static bool pre_2_10_vmstate_dummy_icp_needed(void *opaque)
    135 {
    136     /* Dummy entries correspond to unused ICPState objects in older QEMUs,
    137      * and newer QEMUs don't even have them. In both cases, we don't want
    138      * to send anything on the wire.
    139      */
    140     return false;
    141 }
    142 
/*
 * Placeholder migration description registered under the "icp/server" name.
 * Its .needed callback always returns false, and its fields are only
 * VMSTATE_UNUSED padding of 4 + 1 + 1 bytes, sized after the xirr,
 * pending_priority and mfrr members named in the per-field comments.
 */
static const VMStateDescription pre_2_10_vmstate_dummy_icp = {
    .name = "icp/server",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = pre_2_10_vmstate_dummy_icp_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UNUSED(4), /* uint32_t xirr */
        VMSTATE_UNUSED(1), /* uint8_t pending_priority */
        VMSTATE_UNUSED(1), /* uint8_t mfrr */
        VMSTATE_END_OF_LIST()
    },
};
    155 
    156 static void pre_2_10_vmstate_register_dummy_icp(int i)
    157 {
    158     vmstate_register(NULL, i, &pre_2_10_vmstate_dummy_icp,
    159                      (void *)(uintptr_t) i);
    160 }
    161 
    162 static void pre_2_10_vmstate_unregister_dummy_icp(int i)
    163 {
    164     vmstate_unregister(NULL, &pre_2_10_vmstate_dummy_icp,
    165                        (void *)(uintptr_t) i);
    166 }
    167 
    168 int spapr_max_server_number(SpaprMachineState *spapr)
    169 {
    170     MachineState *ms = MACHINE(spapr);
    171 
    172     assert(spapr->vsmt);
    173     return DIV_ROUND_UP(ms->smp.max_cpus * spapr->vsmt, ms->smp.threads);
    174 }
    175 
    176 static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
    177                                   int smt_threads)
    178 {
    179     int i, ret = 0;
    180     g_autofree uint32_t *servers_prop = g_new(uint32_t, smt_threads);
    181     g_autofree uint32_t *gservers_prop = g_new(uint32_t, smt_threads * 2);
    182     int index = spapr_get_vcpu_id(cpu);
    183 
    184     if (cpu->compat_pvr) {
    185         ret = fdt_setprop_cell(fdt, offset, "cpu-version", cpu->compat_pvr);
    186         if (ret < 0) {
    187             return ret;
    188         }
    189     }
    190 
    191     /* Build interrupt servers and gservers properties */
    192     for (i = 0; i < smt_threads; i++) {
    193         servers_prop[i] = cpu_to_be32(index + i);
    194         /* Hack, direct the group queues back to cpu 0 */
    195         gservers_prop[i*2] = cpu_to_be32(index + i);
    196         gservers_prop[i*2 + 1] = 0;
    197     }
    198     ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-server#s",
    199                       servers_prop, sizeof(*servers_prop) * smt_threads);
    200     if (ret < 0) {
    201         return ret;
    202     }
    203     ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-gserver#s",
    204                       gservers_prop, sizeof(*gservers_prop) * smt_threads * 2);
    205 
    206     return ret;
    207 }
    208 
    209 static void spapr_dt_pa_features(SpaprMachineState *spapr,
    210                                  PowerPCCPU *cpu,
    211                                  void *fdt, int offset)
    212 {
    213     uint8_t pa_features_206[] = { 6, 0,
    214         0xf6, 0x1f, 0xc7, 0x00, 0x80, 0xc0 };
    215     uint8_t pa_features_207[] = { 24, 0,
    216         0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0,
    217         0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
    218         0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
    219         0x80, 0x00, 0x80, 0x00, 0x00, 0x00 };
    220     uint8_t pa_features_300[] = { 66, 0,
    221         /* 0: MMU|FPU|SLB|RUN|DABR|NX, 1: fri[nzpm]|DABRX|SPRG3|SLB0|PP110 */
    222         /* 2: VPM|DS205|PPR|DS202|DS206, 3: LSD|URG, SSO, 5: LE|CFAR|EB|LSQ */
    223         0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0, /* 0 - 5 */
    224         /* 6: DS207 */
    225         0x80, 0x00, 0x00, 0x00, 0x00, 0x00, /* 6 - 11 */
    226         /* 16: Vector */
    227         0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 12 - 17 */
    228         /* 18: Vec. Scalar, 20: Vec. XOR, 22: HTM */
    229         0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 18 - 23 */
    230         /* 24: Ext. Dec, 26: 64 bit ftrs, 28: PM ftrs */
    231         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 24 - 29 */
    232         /* 30: MMR, 32: LE atomic, 34: EBB + ext EBB */
    233         0x80, 0x00, 0x80, 0x00, 0xC0, 0x00, /* 30 - 35 */
    234         /* 36: SPR SO, 38: Copy/Paste, 40: Radix MMU */
    235         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 36 - 41 */
    236         /* 42: PM, 44: PC RA, 46: SC vec'd */
    237         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 42 - 47 */
    238         /* 48: SIMD, 50: QP BFP, 52: String */
    239         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */
    240         /* 54: DecFP, 56: DecI, 58: SHA */
    241         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */
    242         /* 60: NM atomic, 62: RNG */
    243         0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */
    244     };
    245     uint8_t *pa_features = NULL;
    246     size_t pa_size;
    247 
    248     if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_06, 0, cpu->compat_pvr)) {
    249         pa_features = pa_features_206;
    250         pa_size = sizeof(pa_features_206);
    251     }
    252     if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_07, 0, cpu->compat_pvr)) {
    253         pa_features = pa_features_207;
    254         pa_size = sizeof(pa_features_207);
    255     }
    256     if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0, cpu->compat_pvr)) {
    257         pa_features = pa_features_300;
    258         pa_size = sizeof(pa_features_300);
    259     }
    260     if (!pa_features) {
    261         return;
    262     }
    263 
    264     if (ppc_hash64_has(cpu, PPC_HASH64_CI_LARGEPAGE)) {
    265         /*
    266          * Note: we keep CI large pages off by default because a 64K capable
    267          * guest provisioned with large pages might otherwise try to map a qemu
    268          * framebuffer (or other kind of memory mapped PCI BAR) using 64K pages
    269          * even if that qemu runs on a 4k host.
    270          * We dd this bit back here if we are confident this is not an issue
    271          */
    272         pa_features[3] |= 0x20;
    273     }
    274     if ((spapr_get_cap(spapr, SPAPR_CAP_HTM) != 0) && pa_size > 24) {
    275         pa_features[24] |= 0x80;    /* Transactional memory support */
    276     }
    277     if (spapr->cas_pre_isa3_guest && pa_size > 40) {
    278         /* Workaround for broken kernels that attempt (guest) radix
    279          * mode when they can't handle it, if they see the radix bit set
    280          * in pa-features. So hide it from them. */
    281         pa_features[40 + 2] &= ~0x80; /* Radix MMU */
    282     }
    283 
    284     _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
    285 }
    286 
    287 static hwaddr spapr_node0_size(MachineState *machine)
    288 {
    289     if (machine->numa_state->num_nodes) {
    290         int i;
    291         for (i = 0; i < machine->numa_state->num_nodes; ++i) {
    292             if (machine->numa_state->nodes[i].node_mem) {
    293                 return MIN(pow2floor(machine->numa_state->nodes[i].node_mem),
    294                            machine->ram_size);
    295             }
    296         }
    297     }
    298     return machine->ram_size;
    299 }
    300 
    301 static void add_str(GString *s, const gchar *s1)
    302 {
    303     g_string_append_len(s, s1, strlen(s1) + 1);
    304 }
    305 
    306 static int spapr_dt_memory_node(SpaprMachineState *spapr, void *fdt, int nodeid,
    307                                 hwaddr start, hwaddr size)
    308 {
    309     char mem_name[32];
    310     uint64_t mem_reg_property[2];
    311     int off;
    312 
    313     mem_reg_property[0] = cpu_to_be64(start);
    314     mem_reg_property[1] = cpu_to_be64(size);
    315 
    316     sprintf(mem_name, "memory@%" HWADDR_PRIx, start);
    317     off = fdt_add_subnode(fdt, 0, mem_name);
    318     _FDT(off);
    319     _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
    320     _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
    321                       sizeof(mem_reg_property))));
    322     spapr_numa_write_associativity_dt(spapr, fdt, off, nodeid);
    323     return off;
    324 }
    325 
    326 static uint32_t spapr_pc_dimm_node(MemoryDeviceInfoList *list, ram_addr_t addr)
    327 {
    328     MemoryDeviceInfoList *info;
    329 
    330     for (info = list; info; info = info->next) {
    331         MemoryDeviceInfo *value = info->value;
    332 
    333         if (value && value->type == MEMORY_DEVICE_INFO_KIND_DIMM) {
    334             PCDIMMDeviceInfo *pcdimm_info = value->u.dimm.data;
    335 
    336             if (addr >= pcdimm_info->addr &&
    337                 addr < (pcdimm_info->addr + pcdimm_info->size)) {
    338                 return pcdimm_info->node;
    339             }
    340         }
    341     }
    342 
    343     return -1;
    344 }
    345 
/*
 * One entry of the "ibm,dynamic-memory-v2" property. The structure is packed
 * because entries are copied byte-for-byte into the property buffer; the
 * fields are filled in big-endian by spapr_get_drconf_cell().
 */
struct sPAPRDrconfCellV2 {
     uint32_t seq_lmbs;   /* number of consecutive LMBs this entry covers */
     uint64_t base_addr;  /* address of the first LMB */
     uint32_t drc_index;  /* DRC index of the first LMB */
     uint32_t aa_index;   /* node of the DIMM, or -1 when not assigned */
     uint32_t flags;      /* SPAPR_LMB_FLAGS_* bits */
} QEMU_PACKED;
    353 
/*
 * Queue element wrapping one dynamic-memory-v2 cell, so that the cells can
 * be collected in a QSIMPLEQ before the property buffer is sized and filled.
 */
typedef struct DrconfCellQueue {
    struct sPAPRDrconfCellV2 cell;
    QSIMPLEQ_ENTRY(DrconfCellQueue) entry;
} DrconfCellQueue;
    358 
    359 static DrconfCellQueue *
    360 spapr_get_drconf_cell(uint32_t seq_lmbs, uint64_t base_addr,
    361                       uint32_t drc_index, uint32_t aa_index,
    362                       uint32_t flags)
    363 {
    364     DrconfCellQueue *elem;
    365 
    366     elem = g_malloc0(sizeof(*elem));
    367     elem->cell.seq_lmbs = cpu_to_be32(seq_lmbs);
    368     elem->cell.base_addr = cpu_to_be64(base_addr);
    369     elem->cell.drc_index = cpu_to_be32(drc_index);
    370     elem->cell.aa_index = cpu_to_be32(aa_index);
    371     elem->cell.flags = cpu_to_be32(flags);
    372 
    373     return elem;
    374 }
    375 
    376 static int spapr_dt_dynamic_memory_v2(SpaprMachineState *spapr, void *fdt,
    377                                       int offset, MemoryDeviceInfoList *dimms)
    378 {
    379     MachineState *machine = MACHINE(spapr);
    380     uint8_t *int_buf, *cur_index;
    381     int ret;
    382     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
    383     uint64_t addr, cur_addr, size;
    384     uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
    385     uint64_t mem_end = machine->device_memory->base +
    386                        memory_region_size(&machine->device_memory->mr);
    387     uint32_t node, buf_len, nr_entries = 0;
    388     SpaprDrc *drc;
    389     DrconfCellQueue *elem, *next;
    390     MemoryDeviceInfoList *info;
    391     QSIMPLEQ_HEAD(, DrconfCellQueue) drconf_queue
    392         = QSIMPLEQ_HEAD_INITIALIZER(drconf_queue);
    393 
    394     /* Entry to cover RAM and the gap area */
    395     elem = spapr_get_drconf_cell(nr_boot_lmbs, 0, 0, -1,
    396                                  SPAPR_LMB_FLAGS_RESERVED |
    397                                  SPAPR_LMB_FLAGS_DRC_INVALID);
    398     QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
    399     nr_entries++;
    400 
    401     cur_addr = machine->device_memory->base;
    402     for (info = dimms; info; info = info->next) {
    403         PCDIMMDeviceInfo *di = info->value->u.dimm.data;
    404 
    405         addr = di->addr;
    406         size = di->size;
    407         node = di->node;
    408 
    409         /*
    410          * The NVDIMM area is hotpluggable after the NVDIMM is unplugged. The
    411          * area is marked hotpluggable in the next iteration for the bigger
    412          * chunk including the NVDIMM occupied area.
    413          */
    414         if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM)
    415             continue;
    416 
    417         /* Entry for hot-pluggable area */
    418         if (cur_addr < addr) {
    419             drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
    420             g_assert(drc);
    421             elem = spapr_get_drconf_cell((addr - cur_addr) / lmb_size,
    422                                          cur_addr, spapr_drc_index(drc), -1, 0);
    423             QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
    424             nr_entries++;
    425         }
    426 
    427         /* Entry for DIMM */
    428         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
    429         g_assert(drc);
    430         elem = spapr_get_drconf_cell(size / lmb_size, addr,
    431                                      spapr_drc_index(drc), node,
    432                                      (SPAPR_LMB_FLAGS_ASSIGNED |
    433                                       SPAPR_LMB_FLAGS_HOTREMOVABLE));
    434         QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
    435         nr_entries++;
    436         cur_addr = addr + size;
    437     }
    438 
    439     /* Entry for remaining hotpluggable area */
    440     if (cur_addr < mem_end) {
    441         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
    442         g_assert(drc);
    443         elem = spapr_get_drconf_cell((mem_end - cur_addr) / lmb_size,
    444                                      cur_addr, spapr_drc_index(drc), -1, 0);
    445         QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
    446         nr_entries++;
    447     }
    448 
    449     buf_len = nr_entries * sizeof(struct sPAPRDrconfCellV2) + sizeof(uint32_t);
    450     int_buf = cur_index = g_malloc0(buf_len);
    451     *(uint32_t *)int_buf = cpu_to_be32(nr_entries);
    452     cur_index += sizeof(nr_entries);
    453 
    454     QSIMPLEQ_FOREACH_SAFE(elem, &drconf_queue, entry, next) {
    455         memcpy(cur_index, &elem->cell, sizeof(elem->cell));
    456         cur_index += sizeof(elem->cell);
    457         QSIMPLEQ_REMOVE(&drconf_queue, elem, DrconfCellQueue, entry);
    458         g_free(elem);
    459     }
    460 
    461     ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory-v2", int_buf, buf_len);
    462     g_free(int_buf);
    463     if (ret < 0) {
    464         return -1;
    465     }
    466     return 0;
    467 }
    468 
    469 static int spapr_dt_dynamic_memory(SpaprMachineState *spapr, void *fdt,
    470                                    int offset, MemoryDeviceInfoList *dimms)
    471 {
    472     MachineState *machine = MACHINE(spapr);
    473     int i, ret;
    474     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
    475     uint32_t device_lmb_start = machine->device_memory->base / lmb_size;
    476     uint32_t nr_lmbs = (machine->device_memory->base +
    477                        memory_region_size(&machine->device_memory->mr)) /
    478                        lmb_size;
    479     uint32_t *int_buf, *cur_index, buf_len;
    480 
    481     /*
    482      * Allocate enough buffer size to fit in ibm,dynamic-memory
    483      */
    484     buf_len = (nr_lmbs * SPAPR_DR_LMB_LIST_ENTRY_SIZE + 1) * sizeof(uint32_t);
    485     cur_index = int_buf = g_malloc0(buf_len);
    486     int_buf[0] = cpu_to_be32(nr_lmbs);
    487     cur_index++;
    488     for (i = 0; i < nr_lmbs; i++) {
    489         uint64_t addr = i * lmb_size;
    490         uint32_t *dynamic_memory = cur_index;
    491 
    492         if (i >= device_lmb_start) {
    493             SpaprDrc *drc;
    494 
    495             drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, i);
    496             g_assert(drc);
    497 
    498             dynamic_memory[0] = cpu_to_be32(addr >> 32);
    499             dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
    500             dynamic_memory[2] = cpu_to_be32(spapr_drc_index(drc));
    501             dynamic_memory[3] = cpu_to_be32(0); /* reserved */
    502             dynamic_memory[4] = cpu_to_be32(spapr_pc_dimm_node(dimms, addr));
    503             if (memory_region_present(get_system_memory(), addr)) {
    504                 dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_ASSIGNED);
    505             } else {
    506                 dynamic_memory[5] = cpu_to_be32(0);
    507             }
    508         } else {
    509             /*
    510              * LMB information for RMA, boot time RAM and gap b/n RAM and
    511              * device memory region -- all these are marked as reserved
    512              * and as having no valid DRC.
    513              */
    514             dynamic_memory[0] = cpu_to_be32(addr >> 32);
    515             dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
    516             dynamic_memory[2] = cpu_to_be32(0);
    517             dynamic_memory[3] = cpu_to_be32(0); /* reserved */
    518             dynamic_memory[4] = cpu_to_be32(-1);
    519             dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_RESERVED |
    520                                             SPAPR_LMB_FLAGS_DRC_INVALID);
    521         }
    522 
    523         cur_index += SPAPR_DR_LMB_LIST_ENTRY_SIZE;
    524     }
    525     ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory", int_buf, buf_len);
    526     g_free(int_buf);
    527     if (ret < 0) {
    528         return -1;
    529     }
    530     return 0;
    531 }
    532 
    533 /*
    534  * Adds ibm,dynamic-reconfiguration-memory node.
    535  * Refer to docs/specs/ppc-spapr-hotplug.txt for the documentation
    536  * of this device tree node.
    537  */
    538 static int spapr_dt_dynamic_reconfiguration_memory(SpaprMachineState *spapr,
    539                                                    void *fdt)
    540 {
    541     MachineState *machine = MACHINE(spapr);
    542     int ret, offset;
    543     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
    544     uint32_t prop_lmb_size[] = {cpu_to_be32(lmb_size >> 32),
    545                                 cpu_to_be32(lmb_size & 0xffffffff)};
    546     MemoryDeviceInfoList *dimms = NULL;
    547 
    548     /*
    549      * Don't create the node if there is no device memory
    550      */
    551     if (machine->ram_size == machine->maxram_size) {
    552         return 0;
    553     }
    554 
    555     offset = fdt_add_subnode(fdt, 0, "ibm,dynamic-reconfiguration-memory");
    556 
    557     ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size,
    558                     sizeof(prop_lmb_size));
    559     if (ret < 0) {
    560         return ret;
    561     }
    562 
    563     ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff);
    564     if (ret < 0) {
    565         return ret;
    566     }
    567 
    568     ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", 0x0);
    569     if (ret < 0) {
    570         return ret;
    571     }
    572 
    573     /* ibm,dynamic-memory or ibm,dynamic-memory-v2 */
    574     dimms = qmp_memory_device_list();
    575     if (spapr_ovec_test(spapr->ov5_cas, OV5_DRMEM_V2)) {
    576         ret = spapr_dt_dynamic_memory_v2(spapr, fdt, offset, dimms);
    577     } else {
    578         ret = spapr_dt_dynamic_memory(spapr, fdt, offset, dimms);
    579     }
    580     qapi_free_MemoryDeviceInfoList(dimms);
    581 
    582     if (ret < 0) {
    583         return ret;
    584     }
    585 
    586     ret = spapr_numa_write_assoc_lookup_arrays(spapr, fdt, offset);
    587 
    588     return ret;
    589 }
    590 
    591 static int spapr_dt_memory(SpaprMachineState *spapr, void *fdt)
    592 {
    593     MachineState *machine = MACHINE(spapr);
    594     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
    595     hwaddr mem_start, node_size;
    596     int i, nb_nodes = machine->numa_state->num_nodes;
    597     NodeInfo *nodes = machine->numa_state->nodes;
    598 
    599     for (i = 0, mem_start = 0; i < nb_nodes; ++i) {
    600         if (!nodes[i].node_mem) {
    601             continue;
    602         }
    603         if (mem_start >= machine->ram_size) {
    604             node_size = 0;
    605         } else {
    606             node_size = nodes[i].node_mem;
    607             if (node_size > machine->ram_size - mem_start) {
    608                 node_size = machine->ram_size - mem_start;
    609             }
    610         }
    611         if (!mem_start) {
    612             /* spapr_machine_init() checks for rma_size <= node0_size
    613              * already */
    614             spapr_dt_memory_node(spapr, fdt, i, 0, spapr->rma_size);
    615             mem_start += spapr->rma_size;
    616             node_size -= spapr->rma_size;
    617         }
    618         for ( ; node_size; ) {
    619             hwaddr sizetmp = pow2floor(node_size);
    620 
    621             /* mem_start != 0 here */
    622             if (ctzl(mem_start) < ctzl(sizetmp)) {
    623                 sizetmp = 1ULL << ctzl(mem_start);
    624             }
    625 
    626             spapr_dt_memory_node(spapr, fdt, i, mem_start, sizetmp);
    627             node_size -= sizetmp;
    628             mem_start += sizetmp;
    629         }
    630     }
    631 
    632     /* Generate ibm,dynamic-reconfiguration-memory node if required */
    633     if (spapr_ovec_test(spapr->ov5_cas, OV5_DRCONF_MEMORY)) {
    634         int ret;
    635 
    636         g_assert(smc->dr_lmb_enabled);
    637         ret = spapr_dt_dynamic_reconfiguration_memory(spapr, fdt);
    638         if (ret) {
    639             return ret;
    640         }
    641     }
    642 
    643     return 0;
    644 }
    645 
    646 static void spapr_dt_cpu(CPUState *cs, void *fdt, int offset,
    647                          SpaprMachineState *spapr)
    648 {
    649     MachineState *ms = MACHINE(spapr);
    650     PowerPCCPU *cpu = POWERPC_CPU(cs);
    651     CPUPPCState *env = &cpu->env;
    652     PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
    653     int index = spapr_get_vcpu_id(cpu);
    654     uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
    655                        0xffffffff, 0xffffffff};
    656     uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq()
    657         : SPAPR_TIMEBASE_FREQ;
    658     uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000;
    659     uint32_t page_sizes_prop[64];
    660     size_t page_sizes_prop_size;
    661     unsigned int smp_threads = ms->smp.threads;
    662     uint32_t vcpus_per_socket = smp_threads * ms->smp.cores;
    663     uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
    664     int compat_smt = MIN(smp_threads, ppc_compat_max_vthreads(cpu));
    665     SpaprDrc *drc;
    666     int drc_index;
    667     uint32_t radix_AP_encodings[PPC_PAGE_SIZES_MAX_SZ];
    668     int i;
    669 
    670     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU, index);
    671     if (drc) {
    672         drc_index = spapr_drc_index(drc);
    673         _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_index)));
    674     }
    675 
    676     _FDT((fdt_setprop_cell(fdt, offset, "reg", index)));
    677     _FDT((fdt_setprop_string(fdt, offset, "device_type", "cpu")));
    678 
    679     _FDT((fdt_setprop_cell(fdt, offset, "cpu-version", env->spr[SPR_PVR])));
    680     _FDT((fdt_setprop_cell(fdt, offset, "d-cache-block-size",
    681                            env->dcache_line_size)));
    682     _FDT((fdt_setprop_cell(fdt, offset, "d-cache-line-size",
    683                            env->dcache_line_size)));
    684     _FDT((fdt_setprop_cell(fdt, offset, "i-cache-block-size",
    685                            env->icache_line_size)));
    686     _FDT((fdt_setprop_cell(fdt, offset, "i-cache-line-size",
    687                            env->icache_line_size)));
    688 
    689     if (pcc->l1_dcache_size) {
    690         _FDT((fdt_setprop_cell(fdt, offset, "d-cache-size",
    691                                pcc->l1_dcache_size)));
    692     } else {
    693         warn_report("Unknown L1 dcache size for cpu");
    694     }
    695     if (pcc->l1_icache_size) {
    696         _FDT((fdt_setprop_cell(fdt, offset, "i-cache-size",
    697                                pcc->l1_icache_size)));
    698     } else {
    699         warn_report("Unknown L1 icache size for cpu");
    700     }
    701 
    702     _FDT((fdt_setprop_cell(fdt, offset, "timebase-frequency", tbfreq)));
    703     _FDT((fdt_setprop_cell(fdt, offset, "clock-frequency", cpufreq)));
    704     _FDT((fdt_setprop_cell(fdt, offset, "slb-size", cpu->hash64_opts->slb_size)));
    705     _FDT((fdt_setprop_cell(fdt, offset, "ibm,slb-size", cpu->hash64_opts->slb_size)));
    706     _FDT((fdt_setprop_string(fdt, offset, "status", "okay")));
    707     _FDT((fdt_setprop(fdt, offset, "64-bit", NULL, 0)));
    708 
    709     if (ppc_has_spr(cpu, SPR_PURR)) {
    710         _FDT((fdt_setprop_cell(fdt, offset, "ibm,purr", 1)));
    711     }
    712     if (ppc_has_spr(cpu, SPR_PURR)) {
    713         _FDT((fdt_setprop_cell(fdt, offset, "ibm,spurr", 1)));
    714     }
    715 
    716     if (ppc_hash64_has(cpu, PPC_HASH64_1TSEG)) {
    717         _FDT((fdt_setprop(fdt, offset, "ibm,processor-segment-sizes",
    718                           segs, sizeof(segs))));
    719     }
    720 
    721     /* Advertise VSX (vector extensions) if available
    722      *   1               == VMX / Altivec available
    723      *   2               == VSX available
    724      *
    725      * Only CPUs for which we create core types in spapr_cpu_core.c
    726      * are possible, and all of those have VMX */
    727     if (env->insns_flags & PPC_ALTIVEC) {
    728         if (spapr_get_cap(spapr, SPAPR_CAP_VSX) != 0) {
    729             _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 2)));
    730         } else {
    731             _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 1)));
    732         }
    733     }
    734 
    735     /* Advertise DFP (Decimal Floating Point) if available
    736      *   0 / no property == no DFP
    737      *   1               == DFP available */
    738     if (spapr_get_cap(spapr, SPAPR_CAP_DFP) != 0) {
    739         _FDT((fdt_setprop_cell(fdt, offset, "ibm,dfp", 1)));
    740     }
    741 
    742     page_sizes_prop_size = ppc_create_page_sizes_prop(cpu, page_sizes_prop,
    743                                                       sizeof(page_sizes_prop));
    744     if (page_sizes_prop_size) {
    745         _FDT((fdt_setprop(fdt, offset, "ibm,segment-page-sizes",
    746                           page_sizes_prop, page_sizes_prop_size)));
    747     }
    748 
    749     spapr_dt_pa_features(spapr, cpu, fdt, offset);
    750 
    751     _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id",
    752                            cs->cpu_index / vcpus_per_socket)));
    753 
    754     _FDT((fdt_setprop(fdt, offset, "ibm,pft-size",
    755                       pft_size_prop, sizeof(pft_size_prop))));
    756 
    757     if (ms->numa_state->num_nodes > 1) {
    758         _FDT(spapr_numa_fixup_cpu_dt(spapr, fdt, offset, cpu));
    759     }
    760 
    761     _FDT(spapr_fixup_cpu_smt_dt(fdt, offset, cpu, compat_smt));
    762 
    763     if (pcc->radix_page_info) {
    764         for (i = 0; i < pcc->radix_page_info->count; i++) {
    765             radix_AP_encodings[i] =
    766                 cpu_to_be32(pcc->radix_page_info->entries[i]);
    767         }
    768         _FDT((fdt_setprop(fdt, offset, "ibm,processor-radix-AP-encodings",
    769                           radix_AP_encodings,
    770                           pcc->radix_page_info->count *
    771                           sizeof(radix_AP_encodings[0]))));
    772     }
    773 
    774     /*
    775      * We set this property to let the guest know that it can use the large
    776      * decrementer and its width in bits.
    777      */
    778     if (spapr_get_cap(spapr, SPAPR_CAP_LARGE_DECREMENTER) != SPAPR_CAP_OFF)
    779         _FDT((fdt_setprop_u32(fdt, offset, "ibm,dec-bits",
    780                               pcc->lrg_decr_bits)));
    781 }
    782 
    783 static void spapr_dt_cpus(void *fdt, SpaprMachineState *spapr)
    784 {
    785     CPUState **rev;
    786     CPUState *cs;
    787     int n_cpus;
    788     int cpus_offset;
    789     int i;
    790 
    791     cpus_offset = fdt_add_subnode(fdt, 0, "cpus");
    792     _FDT(cpus_offset);
    793     _FDT((fdt_setprop_cell(fdt, cpus_offset, "#address-cells", 0x1)));
    794     _FDT((fdt_setprop_cell(fdt, cpus_offset, "#size-cells", 0x0)));
    795 
    796     /*
    797      * We walk the CPUs in reverse order to ensure that CPU DT nodes
    798      * created by fdt_add_subnode() end up in the right order in FDT
    799      * for the guest kernel the enumerate the CPUs correctly.
    800      *
    801      * The CPU list cannot be traversed in reverse order, so we need
    802      * to do extra work.
    803      */
    804     n_cpus = 0;
    805     rev = NULL;
    806     CPU_FOREACH(cs) {
    807         rev = g_renew(CPUState *, rev, n_cpus + 1);
    808         rev[n_cpus++] = cs;
    809     }
    810 
    811     for (i = n_cpus - 1; i >= 0; i--) {
    812         CPUState *cs = rev[i];
    813         PowerPCCPU *cpu = POWERPC_CPU(cs);
    814         int index = spapr_get_vcpu_id(cpu);
    815         DeviceClass *dc = DEVICE_GET_CLASS(cs);
    816         g_autofree char *nodename = NULL;
    817         int offset;
    818 
    819         if (!spapr_is_thread0_in_vcore(spapr, cpu)) {
    820             continue;
    821         }
    822 
    823         nodename = g_strdup_printf("%s@%x", dc->fw_name, index);
    824         offset = fdt_add_subnode(fdt, cpus_offset, nodename);
    825         _FDT(offset);
    826         spapr_dt_cpu(cs, fdt, offset, spapr);
    827     }
    828 
    829     g_free(rev);
    830 }
    831 
    832 static int spapr_dt_rng(void *fdt)
    833 {
    834     int node;
    835     int ret;
    836 
    837     node = qemu_fdt_add_subnode(fdt, "/ibm,platform-facilities");
    838     if (node <= 0) {
    839         return -1;
    840     }
    841     ret = fdt_setprop_string(fdt, node, "device_type",
    842                              "ibm,platform-facilities");
    843     ret |= fdt_setprop_cell(fdt, node, "#address-cells", 0x1);
    844     ret |= fdt_setprop_cell(fdt, node, "#size-cells", 0x0);
    845 
    846     node = fdt_add_subnode(fdt, node, "ibm,random-v1");
    847     if (node <= 0) {
    848         return -1;
    849     }
    850     ret |= fdt_setprop_string(fdt, node, "compatible", "ibm,random");
    851 
    852     return ret ? -1 : 0;
    853 }
    854 
/*
 * Create and populate the /rtas node.
 *
 * This emits the "ibm,hypertas-functions" and "qemu,hypertas-functions"
 * string lists, the RTAS size/error-log/event-scan properties, the
 * "ibm,lrdr-capacity" property, and finally hands over to
 * spapr_numa_write_rtas_dt() and spapr_dt_rtas_tokens() for their parts.
 * Any libfdt failure is fatal via _FDT().
 */
static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
{
    MachineState *ms = MACHINE(spapr);
    int rtas;
    GString *hypertas = g_string_sized_new(256);
    GString *qemu_hypertas = g_string_sized_new(256);
    /* End address of the device memory region (base + size) */
    uint64_t max_device_addr = MACHINE(spapr)->device_memory->base +
        memory_region_size(&MACHINE(spapr)->device_memory->mr);
    /*
     * Five big-endian cells: max_device_addr (hi, lo),
     * SPAPR_MEMORY_BLOCK_SIZE (hi, lo), and max_cpus / threads.
     */
    uint32_t lrdr_capacity[] = {
        cpu_to_be32(max_device_addr >> 32),
        cpu_to_be32(max_device_addr & 0xffffffff),
        cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE >> 32),
        cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE & 0xffffffff),
        cpu_to_be32(ms->smp.max_cpus / ms->smp.threads),
    };

    _FDT(rtas = fdt_add_subnode(fdt, 0, "rtas"));

    /* hypertas: entries advertised unconditionally */
    add_str(hypertas, "hcall-pft");
    add_str(hypertas, "hcall-term");
    add_str(hypertas, "hcall-dabr");
    add_str(hypertas, "hcall-interrupt");
    add_str(hypertas, "hcall-tce");
    add_str(hypertas, "hcall-vio");
    add_str(hypertas, "hcall-splpar");
    add_str(hypertas, "hcall-join");
    add_str(hypertas, "hcall-bulk");
    add_str(hypertas, "hcall-set-mode");
    add_str(hypertas, "hcall-sprg0");
    add_str(hypertas, "hcall-copy");
    add_str(hypertas, "hcall-debug");
    add_str(hypertas, "hcall-vphn");
    /* Only advertised when the RPT_INVALIDATE capability is on */
    if (spapr_get_cap(spapr, SPAPR_CAP_RPT_INVALIDATE) == SPAPR_CAP_ON) {
        add_str(hypertas, "hcall-rpt-invalidate");
    }

    /* QEMU-specific list, emitted as "qemu,hypertas-functions" below */
    add_str(qemu_hypertas, "hcall-memop1");

    /* Without KVM always; with KVM only if it reports multi-TCE support */
    if (!kvm_enabled() || kvmppc_spapr_use_multitce()) {
        add_str(hypertas, "hcall-multi-tce");
    }

    if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) {
        add_str(hypertas, "hcall-hpt-resize");
    }

    add_str(hypertas, "hcall-watchdog");

    /* The GStrings are only staging buffers; free them once written */
    _FDT(fdt_setprop(fdt, rtas, "ibm,hypertas-functions",
                     hypertas->str, hypertas->len));
    g_string_free(hypertas, TRUE);
    _FDT(fdt_setprop(fdt, rtas, "qemu,hypertas-functions",
                     qemu_hypertas->str, qemu_hypertas->len));
    g_string_free(qemu_hypertas, TRUE);

    spapr_numa_write_rtas_dt(spapr, fdt, rtas);

    /*
     * FWNMI reserves RTAS_ERROR_LOG_MAX for the machine check error log,
     * and 16 bytes per CPU for system reset error log plus an extra 8 bytes.
     *
     * The system reset requirements are driven by existing Linux and PowerVM
     * implementation which (contrary to PAPR) saves r3 in the error log
     * structure like machine check, so Linux expects to find the saved r3
     * value at the address in r3 upon FWNMI-enabled sreset interrupt (and
     * does not look at the error value).
     *
     * System reset interrupts are not subject to interlock like machine
     * check, so this memory area could be corrupted if the sreset is
     * interrupted by a machine check (or vice versa) if it was shared. To
     * prevent this, system reset uses per-CPU areas for the sreset save
     * area. A system reset that interrupts a system reset handler could
     * still overwrite this area, but Linux doesn't try to recover in that
     * case anyway.
     *
     * The extra 8 bytes is required because Linux's FWNMI error log check
     * is off-by-one.
     *
     * RTAS_MIN_SIZE is required for the RTAS blob itself.
     */
    _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_MIN_SIZE +
                          RTAS_ERROR_LOG_MAX +
                          ms->smp.max_cpus * sizeof(uint64_t) * 2 +
                          sizeof(uint64_t)));
    _FDT(fdt_setprop_cell(fdt, rtas, "rtas-error-log-max",
                          RTAS_ERROR_LOG_MAX));
    _FDT(fdt_setprop_cell(fdt, rtas, "rtas-event-scan-rate",
                          RTAS_EVENT_SCAN_RATE));

    /* The MSI-X property below is only emitted if MSIs are usable */
    g_assert(msi_nonbroken);
    _FDT(fdt_setprop(fdt, rtas, "ibm,change-msix-capable", NULL, 0));

    /*
     * According to PAPR, rtas ibm,os-term does not guarantee a return
     * back to the guest cpu.
     *
     * While an additional ibm,extended-os-term property indicates
     * that rtas call return will always occur. Set this property.
     */
    _FDT(fdt_setprop(fdt, rtas, "ibm,extended-os-term", NULL, 0));

    _FDT(fdt_setprop(fdt, rtas, "ibm,lrdr-capacity",
                     lrdr_capacity, sizeof(lrdr_capacity)));

    spapr_dt_rtas_tokens(fdt, rtas);
}
    962 
    963 /*
    964  * Prepare ibm,arch-vec-5-platform-support, which indicates the MMU
    965  * and the XIVE features that the guest may request and thus the valid
    966  * values for bytes 23..26 of option vector 5:
    967  */
    968 static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
    969                                           int chosen)
    970 {
    971     PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
    972 
    973     char val[2 * 4] = {
    974         23, 0x00, /* XICS / XIVE mode */
    975         24, 0x00, /* Hash/Radix, filled in below. */
    976         25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */
    977         26, 0x40, /* Radix options: GTSE == yes. */
    978     };
    979 
    980     if (spapr->irq->xics && spapr->irq->xive) {
    981         val[1] = SPAPR_OV5_XIVE_BOTH;
    982     } else if (spapr->irq->xive) {
    983         val[1] = SPAPR_OV5_XIVE_EXPLOIT;
    984     } else {
    985         assert(spapr->irq->xics);
    986         val[1] = SPAPR_OV5_XIVE_LEGACY;
    987     }
    988 
    989     if (!ppc_check_compat(first_ppc_cpu, CPU_POWERPC_LOGICAL_3_00, 0,
    990                           first_ppc_cpu->compat_pvr)) {
    991         /*
    992          * If we're in a pre POWER9 compat mode then the guest should
    993          * do hash and use the legacy interrupt mode
    994          */
    995         val[1] = SPAPR_OV5_XIVE_LEGACY; /* XICS */
    996         val[3] = 0x00; /* Hash */
    997         spapr_check_mmu_mode(false);
    998     } else if (kvm_enabled()) {
    999         if (kvmppc_has_cap_mmu_radix() && kvmppc_has_cap_mmu_hash_v3()) {
   1000             val[3] = 0x80; /* OV5_MMU_BOTH */
   1001         } else if (kvmppc_has_cap_mmu_radix()) {
   1002             val[3] = 0x40; /* OV5_MMU_RADIX_300 */
   1003         } else {
   1004             val[3] = 0x00; /* Hash */
   1005         }
   1006     } else {
   1007         /* V3 MMU supports both hash and radix in tcg (with dynamic switching) */
   1008         val[3] = 0xC0;
   1009     }
   1010     _FDT(fdt_setprop(fdt, chosen, "ibm,arch-vec-5-platform-support",
   1011                      val, sizeof(val)));
   1012 }
   1013 
   1014 static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset)
   1015 {
   1016     MachineState *machine = MACHINE(spapr);
   1017     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
   1018     uint8_t rng_seed[32];
   1019     int chosen;
   1020 
   1021     _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen"));
   1022 
   1023     if (reset) {
   1024         const char *boot_device = spapr->boot_device;
   1025         g_autofree char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus);
   1026         size_t cb = 0;
   1027         g_autofree char *bootlist = get_boot_devices_list(&cb);
   1028 
   1029         if (machine->kernel_cmdline && machine->kernel_cmdline[0]) {
   1030             _FDT(fdt_setprop_string(fdt, chosen, "bootargs",
   1031                                     machine->kernel_cmdline));
   1032         }
   1033 
   1034         if (spapr->initrd_size) {
   1035             _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-start",
   1036                                   spapr->initrd_base));
   1037             _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-end",
   1038                                   spapr->initrd_base + spapr->initrd_size));
   1039         }
   1040 
   1041         if (spapr->kernel_size) {
   1042             uint64_t kprop[2] = { cpu_to_be64(spapr->kernel_addr),
   1043                                   cpu_to_be64(spapr->kernel_size) };
   1044 
   1045             _FDT(fdt_setprop(fdt, chosen, "qemu,boot-kernel",
   1046                          &kprop, sizeof(kprop)));
   1047             if (spapr->kernel_le) {
   1048                 _FDT(fdt_setprop(fdt, chosen, "qemu,boot-kernel-le", NULL, 0));
   1049             }
   1050         }
   1051         if (machine->boot_config.has_menu && machine->boot_config.menu) {
   1052             _FDT((fdt_setprop_cell(fdt, chosen, "qemu,boot-menu", true)));
   1053         }
   1054         _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-width", graphic_width));
   1055         _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-height", graphic_height));
   1056         _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-depth", graphic_depth));
   1057 
   1058         if (cb && bootlist) {
   1059             int i;
   1060 
   1061             for (i = 0; i < cb; i++) {
   1062                 if (bootlist[i] == '\n') {
   1063                     bootlist[i] = ' ';
   1064                 }
   1065             }
   1066             _FDT(fdt_setprop_string(fdt, chosen, "qemu,boot-list", bootlist));
   1067         }
   1068 
   1069         if (boot_device && strlen(boot_device)) {
   1070             _FDT(fdt_setprop_string(fdt, chosen, "qemu,boot-device", boot_device));
   1071         }
   1072 
   1073         if (spapr->want_stdout_path && stdout_path) {
   1074             /*
   1075              * "linux,stdout-path" and "stdout" properties are
   1076              * deprecated by linux kernel. New platforms should only
   1077              * use the "stdout-path" property. Set the new property
   1078              * and continue using older property to remain compatible
   1079              * with the existing firmware.
   1080              */
   1081             _FDT(fdt_setprop_string(fdt, chosen, "linux,stdout-path", stdout_path));
   1082             _FDT(fdt_setprop_string(fdt, chosen, "stdout-path", stdout_path));
   1083         }
   1084 
   1085         /*
   1086          * We can deal with BAR reallocation just fine, advertise it
   1087          * to the guest
   1088          */
   1089         if (smc->linux_pci_probe) {
   1090             _FDT(fdt_setprop_cell(fdt, chosen, "linux,pci-probe-only", 0));
   1091         }
   1092 
   1093         spapr_dt_ov5_platform_support(spapr, fdt, chosen);
   1094     }
   1095 
   1096     qemu_guest_getrandom_nofail(rng_seed, sizeof(rng_seed));
   1097     _FDT(fdt_setprop(fdt, chosen, "rng-seed", rng_seed, sizeof(rng_seed)));
   1098 
   1099     _FDT(spapr_dt_ovec(fdt, chosen, spapr->ov5_cas, "ibm,architecture-vec-5"));
   1100 }
   1101 
   1102 static void spapr_dt_hypervisor(SpaprMachineState *spapr, void *fdt)
   1103 {
   1104     /* The /hypervisor node isn't in PAPR - this is a hack to allow PR
   1105      * KVM to work under pHyp with some guest co-operation */
   1106     int hypervisor;
   1107     uint8_t hypercall[16];
   1108 
   1109     _FDT(hypervisor = fdt_add_subnode(fdt, 0, "hypervisor"));
   1110     /* indicate KVM hypercall interface */
   1111     _FDT(fdt_setprop_string(fdt, hypervisor, "compatible", "linux,kvm"));
   1112     if (kvmppc_has_cap_fixup_hcalls()) {
   1113         /*
   1114          * Older KVM versions with older guest kernels were broken
   1115          * with the magic page, don't allow the guest to map it.
   1116          */
   1117         if (!kvmppc_get_hypercall(first_cpu->env_ptr, hypercall,
   1118                                   sizeof(hypercall))) {
   1119             _FDT(fdt_setprop(fdt, hypervisor, "hcall-instructions",
   1120                              hypercall, sizeof(hypercall)));
   1121         }
   1122     }
   1123 }
   1124 
/*
 * Build the complete guest device tree for the machine.
 *
 * @spapr: machine state the tree is generated from
 * @reset: when true, boot-related /chosen properties and the memory
 *         reservation map entries for kernel and initrd are added as well
 * @space: size in bytes of the buffer allocated for the tree
 *
 * Returns the g_malloc0()-allocated blob of @space bytes.  Every failure
 * while building the tree terminates the process, either through _FDT()
 * or through an explicit error_report() + exit(1).
 */
void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, size_t space)
{
    MachineState *machine = MACHINE(spapr);
    MachineClass *mc = MACHINE_GET_CLASS(machine);
    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
    uint32_t root_drc_type_mask = 0;
    int ret;
    void *fdt;
    SpaprPhbState *phb;
    char *buf;

    fdt = g_malloc0(space);
    _FDT((fdt_create_empty_tree(fdt, space)));

    /* Root node */
    _FDT(fdt_setprop_string(fdt, 0, "device_type", "chrp"));
    _FDT(fdt_setprop_string(fdt, 0, "model", "IBM pSeries (emulated by qemu)"));
    _FDT(fdt_setprop_string(fdt, 0, "compatible", "qemu,pseries"));

    /* Guest UUID & Name: "system-id" is only set for an explicit UUID */
    buf = qemu_uuid_unparse_strdup(&qemu_uuid);
    _FDT(fdt_setprop_string(fdt, 0, "vm,uuid", buf));
    if (qemu_uuid_set) {
        _FDT(fdt_setprop_string(fdt, 0, "system-id", buf));
    }
    g_free(buf);

    if (qemu_get_vm_name()) {
        _FDT(fdt_setprop_string(fdt, 0, "ibm,partition-name",
                                qemu_get_vm_name()));
    }

    /*
     * Host Model & Serial Number: an explicitly configured value wins;
     * otherwise machine classes flagged broken_host_serial_model fall
     * back to what kvmppc_get_host_model()/_serial() report.
     */
    if (spapr->host_model) {
        _FDT(fdt_setprop_string(fdt, 0, "host-model", spapr->host_model));
    } else if (smc->broken_host_serial_model && kvmppc_get_host_model(&buf)) {
        _FDT(fdt_setprop_string(fdt, 0, "host-model", buf));
        g_free(buf);
    }

    if (spapr->host_serial) {
        _FDT(fdt_setprop_string(fdt, 0, "host-serial", spapr->host_serial));
    } else if (smc->broken_host_serial_model && kvmppc_get_host_serial(&buf)) {
        _FDT(fdt_setprop_string(fdt, 0, "host-serial", buf));
        g_free(buf);
    }

    _FDT(fdt_setprop_cell(fdt, 0, "#address-cells", 2));
    _FDT(fdt_setprop_cell(fdt, 0, "#size-cells", 2));

    /* /interrupt controller */
    spapr_irq_dt(spapr, spapr_max_server_number(spapr), fdt, PHANDLE_INTC);

    /* memory nodes */
    ret = spapr_dt_memory(spapr, fdt);
    if (ret < 0) {
        error_report("couldn't setup memory nodes in fdt");
        exit(1);
    }

    /* /vdevice */
    spapr_dt_vdevice(spapr->vio_bus, fdt);

    /* RNG node, only if an object of type TYPE_SPAPR_RNG can be resolved */
    if (object_resolve_path_type("", TYPE_SPAPR_RNG, NULL)) {
        ret = spapr_dt_rng(fdt);
        if (ret < 0) {
            error_report("could not set up rng device in the fdt");
            exit(1);
        }
    }

    /* One node per PCI host bridge */
    QLIST_FOREACH(phb, &spapr->phbs, list) {
        ret = spapr_dt_phb(spapr, phb, PHANDLE_INTC, fdt, NULL);
        if (ret < 0) {
            error_report("couldn't setup PCI devices in fdt");
            exit(1);
        }
    }

    /* /cpus */
    spapr_dt_cpus(fdt, spapr);

    /* ibm,drc-indexes and friends */
    if (smc->dr_lmb_enabled) {
        root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_LMB;
    }
    if (smc->dr_phb_enabled) {
        root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_PHB;
    }
    if (mc->nvdimm_supported) {
        root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_PMEM;
    }
    if (root_drc_type_mask) {
        _FDT(spapr_dt_drc(fdt, 0, NULL, root_drc_type_mask));
    }

    /* CPU DR connector properties live under /cpus, not the root node */
    if (mc->has_hotpluggable_cpus) {
        int offset = fdt_path_offset(fdt, "/cpus");
        ret = spapr_dt_drc(fdt, offset, NULL, SPAPR_DR_CONNECTOR_TYPE_CPU);
        if (ret < 0) {
            error_report("Couldn't set up CPU DR device tree properties");
            exit(1);
        }
    }

    /* /event-sources */
    spapr_dt_events(spapr, fdt);

    /* /rtas */
    spapr_dt_rtas(spapr, fdt);

    /* /chosen */
    spapr_dt_chosen(spapr, fdt, reset);

    /* /hypervisor */
    if (kvm_enabled()) {
        spapr_dt_hypervisor(spapr, fdt);
    }

    /* Build memory reserve map */
    if (reset) {
        if (spapr->kernel_size) {
            _FDT((fdt_add_mem_rsv(fdt, spapr->kernel_addr,
                                  spapr->kernel_size)));
        }
        if (spapr->initrd_size) {
            _FDT((fdt_add_mem_rsv(fdt, spapr->initrd_base,
                                  spapr->initrd_size)));
        }
    }

    /* NVDIMM devices */
    if (mc->nvdimm_supported) {
        spapr_dt_persistent_memory(spapr, fdt);
    }

    return fdt;
}
   1261 
   1262 static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
   1263 {
   1264     SpaprMachineState *spapr = opaque;
   1265 
   1266     return (addr & 0x0fffffff) + spapr->kernel_addr;
   1267 }
   1268 
   1269 static void emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp,
   1270                                     PowerPCCPU *cpu)
   1271 {
   1272     CPUPPCState *env = &cpu->env;
   1273 
   1274     /* The TCG path should also be holding the BQL at this point */
   1275     g_assert(qemu_mutex_iothread_locked());
   1276 
   1277     g_assert(!vhyp_cpu_in_nested(cpu));
   1278 
   1279     if (FIELD_EX64(env->msr, MSR, PR)) {
   1280         hcall_dprintf("Hypercall made with MSR[PR]=1\n");
   1281         env->gpr[3] = H_PRIVILEGE;
   1282     } else {
   1283         env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
   1284     }
   1285 }
   1286 
/*
 * Argument bundle passed to do_lpcr_sync(): the LPCR bits in @mask are
 * cleared and then @value is OR-ed in.
 */
struct LPCRSyncState {
    target_ulong value; /* bits to set in LPCR */
    target_ulong mask;  /* bits to clear in LPCR before setting @value */
};
   1291 
   1292 static void do_lpcr_sync(CPUState *cs, run_on_cpu_data arg)
   1293 {
   1294     struct LPCRSyncState *s = arg.host_ptr;
   1295     PowerPCCPU *cpu = POWERPC_CPU(cs);
   1296     CPUPPCState *env = &cpu->env;
   1297     target_ulong lpcr;
   1298 
   1299     cpu_synchronize_state(cs);
   1300     lpcr = env->spr[SPR_LPCR];
   1301     lpcr &= ~s->mask;
   1302     lpcr |= s->value;
   1303     ppc_store_lpcr(cpu, lpcr);
   1304 }
   1305 
   1306 void spapr_set_all_lpcrs(target_ulong value, target_ulong mask)
   1307 {
   1308     CPUState *cs;
   1309     struct LPCRSyncState s = {
   1310         .value = value,
   1311         .mask = mask
   1312     };
   1313     CPU_FOREACH(cs) {
   1314         run_on_cpu(cs, do_lpcr_sync, RUN_ON_CPU_HOST_PTR(&s));
   1315     }
   1316 }
   1317 
   1318 static bool spapr_get_pate(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu,
   1319                            target_ulong lpid, ppc_v3_pate_t *entry)
   1320 {
   1321     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
   1322     SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
   1323 
   1324     if (!spapr_cpu->in_nested) {
   1325         assert(lpid == 0);
   1326 
   1327         /* Copy PATE1:GR into PATE0:HR */
   1328         entry->dw0 = spapr->patb_entry & PATE0_HR;
   1329         entry->dw1 = spapr->patb_entry;
   1330 
   1331     } else {
   1332         uint64_t patb, pats;
   1333 
   1334         assert(lpid != 0);
   1335 
   1336         patb = spapr->nested_ptcr & PTCR_PATB;
   1337         pats = spapr->nested_ptcr & PTCR_PATS;
   1338 
   1339         /* Check if partition table is properly aligned */
   1340         if (patb & MAKE_64BIT_MASK(0, pats + 12)) {
   1341             return false;
   1342         }
   1343 
   1344         /* Calculate number of entries */
   1345         pats = 1ull << (pats + 12 - 4);
   1346         if (pats <= lpid) {
   1347             return false;
   1348         }
   1349 
   1350         /* Grab entry */
   1351         patb += 16 * lpid;
   1352         entry->dw0 = ldq_phys(CPU(cpu)->as, patb);
   1353         entry->dw1 = ldq_phys(CPU(cpu)->as, patb + 8);
   1354     }
   1355 
   1356     return true;
   1357 }
   1358 
/*
 * Accessors for a hash page table held in host memory.
 *
 * HPTE(table, i) yields a pointer to entry i, each entry being two
 * 64-bit words.  The remaining macros test or modify flag bits in the
 * first word of an entry; the in-memory word is converted with tswap64()
 * before being compared against (or combined with) the flag constants.
 */
#define HPTE(_table, _i)   (void *)(((uint64_t *)(_table)) + ((_i) * 2))
/* Non-zero if the entry has HPTE64_V_VALID set */
#define HPTE_VALID(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_VALID)
/* Non-zero if the entry has HPTE64_V_HPTE_DIRTY set */
#define HPTE_DIRTY(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_HPTE_DIRTY)
/* Clear HPTE64_V_HPTE_DIRTY in the entry */
#define CLEAN_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) &= tswap64(~HPTE64_V_HPTE_DIRTY))
/* Set HPTE64_V_HPTE_DIRTY in the entry */
#define DIRTY_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) |= tswap64(HPTE64_V_HPTE_DIRTY))
   1364 
   1365 /*
   1366  * Get the fd to access the kernel htab, re-opening it if necessary
   1367  */
   1368 static int get_htab_fd(SpaprMachineState *spapr)
   1369 {
   1370     Error *local_err = NULL;
   1371 
   1372     if (spapr->htab_fd >= 0) {
   1373         return spapr->htab_fd;
   1374     }
   1375 
   1376     spapr->htab_fd = kvmppc_get_htab_fd(false, 0, &local_err);
   1377     if (spapr->htab_fd < 0) {
   1378         error_report_err(local_err);
   1379     }
   1380 
   1381     return spapr->htab_fd;
   1382 }
   1383 
   1384 void close_htab_fd(SpaprMachineState *spapr)
   1385 {
   1386     if (spapr->htab_fd >= 0) {
   1387         close(spapr->htab_fd);
   1388     }
   1389     spapr->htab_fd = -1;
   1390 }
   1391 
   1392 static hwaddr spapr_hpt_mask(PPCVirtualHypervisor *vhyp)
   1393 {
   1394     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
   1395 
   1396     return HTAB_SIZE(spapr) / HASH_PTEG_SIZE_64 - 1;
   1397 }
   1398 
   1399 static target_ulong spapr_encode_hpt_for_kvm_pr(PPCVirtualHypervisor *vhyp)
   1400 {
   1401     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
   1402 
   1403     assert(kvm_enabled());
   1404 
   1405     if (!spapr->htab) {
   1406         return 0;
   1407     }
   1408 
   1409     return (target_ulong)(uintptr_t)spapr->htab | (spapr->htab_shift - 18);
   1410 }
   1411 
   1412 static const ppc_hash_pte64_t *spapr_map_hptes(PPCVirtualHypervisor *vhyp,
   1413                                                 hwaddr ptex, int n)
   1414 {
   1415     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
   1416     hwaddr pte_offset = ptex * HASH_PTE_SIZE_64;
   1417 
   1418     if (!spapr->htab) {
   1419         /*
   1420          * HTAB is controlled by KVM. Fetch into temporary buffer
   1421          */
   1422         ppc_hash_pte64_t *hptes = g_malloc(n * HASH_PTE_SIZE_64);
   1423         kvmppc_read_hptes(hptes, ptex, n);
   1424         return hptes;
   1425     }
   1426 
   1427     /*
   1428      * HTAB is controlled by QEMU. Just point to the internally
   1429      * accessible PTEG.
   1430      */
   1431     return (const ppc_hash_pte64_t *)(spapr->htab + pte_offset);
   1432 }
   1433 
   1434 static void spapr_unmap_hptes(PPCVirtualHypervisor *vhyp,
   1435                               const ppc_hash_pte64_t *hptes,
   1436                               hwaddr ptex, int n)
   1437 {
   1438     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
   1439 
   1440     if (!spapr->htab) {
   1441         g_free((void *)hptes);
   1442     }
   1443 
   1444     /* Nothing to do for qemu managed HPT */
   1445 }
   1446 
   1447 void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex,
   1448                       uint64_t pte0, uint64_t pte1)
   1449 {
   1450     SpaprMachineState *spapr = SPAPR_MACHINE(cpu->vhyp);
   1451     hwaddr offset = ptex * HASH_PTE_SIZE_64;
   1452 
   1453     if (!spapr->htab) {
   1454         kvmppc_write_hpte(ptex, pte0, pte1);
   1455     } else {
   1456         if (pte0 & HPTE64_V_VALID) {
   1457             stq_p(spapr->htab + offset + HPTE64_DW1, pte1);
   1458             /*
   1459              * When setting valid, we write PTE1 first. This ensures
   1460              * proper synchronization with the reading code in
   1461              * ppc_hash64_pteg_search()
   1462              */
   1463             smp_wmb();
   1464             stq_p(spapr->htab + offset, pte0);
   1465         } else {
   1466             stq_p(spapr->htab + offset, pte0);
   1467             /*
   1468              * When clearing it we set PTE0 first. This ensures proper
   1469              * synchronization with the reading code in
   1470              * ppc_hash64_pteg_search()
   1471              */
   1472             smp_wmb();
   1473             stq_p(spapr->htab + offset + HPTE64_DW1, pte1);
   1474         }
   1475     }
   1476 }
   1477 
   1478 static void spapr_hpte_set_c(PPCVirtualHypervisor *vhyp, hwaddr ptex,
   1479                              uint64_t pte1)
   1480 {
   1481     hwaddr offset = ptex * HASH_PTE_SIZE_64 + HPTE64_DW1_C;
   1482     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
   1483 
   1484     if (!spapr->htab) {
   1485         /* There should always be a hash table when this is called */
   1486         error_report("spapr_hpte_set_c called with no hash table !");
   1487         return;
   1488     }
   1489 
   1490     /* The HW performs a non-atomic byte update */
   1491     stb_p(spapr->htab + offset, (pte1 & 0xff) | 0x80);
   1492 }
   1493 
   1494 static void spapr_hpte_set_r(PPCVirtualHypervisor *vhyp, hwaddr ptex,
   1495                              uint64_t pte1)
   1496 {
   1497     hwaddr offset = ptex * HASH_PTE_SIZE_64 + HPTE64_DW1_R;
   1498     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
   1499 
   1500     if (!spapr->htab) {
   1501         /* There should always be a hash table when this is called */
   1502         error_report("spapr_hpte_set_r called with no hash table !");
   1503         return;
   1504     }
   1505 
   1506     /* The HW performs a non-atomic byte update */
   1507     stb_p(spapr->htab + offset, ((pte1 >> 8) & 0xff) | 0x01);
   1508 }
   1509 
   1510 int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
   1511 {
   1512     int shift;
   1513 
   1514     /* We aim for a hash table of size 1/128 the size of RAM (rounded
   1515      * up).  The PAPR recommendation is actually 1/64 of RAM size, but
   1516      * that's much more than is needed for Linux guests */
   1517     shift = ctz64(pow2ceil(ramsize)) - 7;
   1518     shift = MAX(shift, 18); /* Minimum architected size */
   1519     shift = MIN(shift, 46); /* Maximum architected size */
   1520     return shift;
   1521 }
   1522 
   1523 void spapr_free_hpt(SpaprMachineState *spapr)
   1524 {
   1525     qemu_vfree(spapr->htab);
   1526     spapr->htab = NULL;
   1527     spapr->htab_shift = 0;
   1528     close_htab_fd(spapr);
   1529 }
   1530 
   1531 int spapr_reallocate_hpt(SpaprMachineState *spapr, int shift, Error **errp)
   1532 {
   1533     ERRP_GUARD();
   1534     long rc;
   1535 
   1536     /* Clean up any HPT info from a previous boot */
   1537     spapr_free_hpt(spapr);
   1538 
   1539     rc = kvmppc_reset_htab(shift);
   1540 
   1541     if (rc == -EOPNOTSUPP) {
   1542         error_setg(errp, "HPT not supported in nested guests");
   1543         return -EOPNOTSUPP;
   1544     }
   1545 
   1546     if (rc < 0) {
   1547         /* kernel-side HPT needed, but couldn't allocate one */
   1548         error_setg_errno(errp, errno, "Failed to allocate KVM HPT of order %d",
   1549                          shift);
   1550         error_append_hint(errp, "Try smaller maxmem?\n");
   1551         return -errno;
   1552     } else if (rc > 0) {
   1553         /* kernel-side HPT allocated */
   1554         if (rc != shift) {
   1555             error_setg(errp,
   1556                        "Requested order %d HPT, but kernel allocated order %ld",
   1557                        shift, rc);
   1558             error_append_hint(errp, "Try smaller maxmem?\n");
   1559             return -ENOSPC;
   1560         }
   1561 
   1562         spapr->htab_shift = shift;
   1563         spapr->htab = NULL;
   1564     } else {
   1565         /* kernel-side HPT not needed, allocate in userspace instead */
   1566         size_t size = 1ULL << shift;
   1567         int i;
   1568 
   1569         spapr->htab = qemu_memalign(size, size);
   1570         memset(spapr->htab, 0, size);
   1571         spapr->htab_shift = shift;
   1572 
   1573         for (i = 0; i < size / HASH_PTE_SIZE_64; i++) {
   1574             DIRTY_HPTE(HPTE(spapr->htab, i));
   1575         }
   1576     }
   1577     /* We're setting up a hash table, so that means we're not radix */
   1578     spapr->patb_entry = 0;
   1579     spapr_set_all_lpcrs(0, LPCR_HR | LPCR_UPRT);
   1580     return 0;
   1581 }
   1582 
   1583 void spapr_setup_hpt(SpaprMachineState *spapr)
   1584 {
   1585     int hpt_shift;
   1586 
   1587     if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED) {
   1588         hpt_shift = spapr_hpt_shift_for_ramsize(MACHINE(spapr)->maxram_size);
   1589     } else {
   1590         uint64_t current_ram_size;
   1591 
   1592         current_ram_size = MACHINE(spapr)->ram_size + get_plugged_memory_size();
   1593         hpt_shift = spapr_hpt_shift_for_ramsize(current_ram_size);
   1594     }
   1595     spapr_reallocate_hpt(spapr, hpt_shift, &error_fatal);
   1596 
   1597     if (kvm_enabled()) {
   1598         hwaddr vrma_limit = kvmppc_vrma_limit(spapr->htab_shift);
   1599 
   1600         /* Check our RMA fits in the possible VRMA */
   1601         if (vrma_limit < spapr->rma_size) {
   1602             error_report("Unable to create %" HWADDR_PRIu
   1603                          "MiB RMA (VRMA only allows %" HWADDR_PRIu "MiB",
   1604                          spapr->rma_size / MiB, vrma_limit / MiB);
   1605             exit(EXIT_FAILURE);
   1606         }
   1607     }
   1608 }
   1609 
   1610 void spapr_check_mmu_mode(bool guest_radix)
   1611 {
   1612     if (guest_radix) {
   1613         if (kvm_enabled() && !kvmppc_has_cap_mmu_radix()) {
   1614             error_report("Guest requested unavailable MMU mode (radix).");
   1615             exit(EXIT_FAILURE);
   1616         }
   1617     } else {
   1618         if (kvm_enabled() && kvmppc_has_cap_mmu_radix()
   1619             && !kvmppc_has_cap_mmu_hash_v3()) {
   1620             error_report("Guest requested unavailable MMU mode (hash).");
   1621             exit(EXIT_FAILURE);
   1622         }
   1623     }
   1624 }
   1625 
        /*
         * Machine reset handler.
         *
         * In order: resets PEF/KVM state and re-applies the machine
         * capabilities, selects the initial MMU mode (radix with no HPT when
         * KVM offers radix and the CPU is ISA 3.00 compatible, a fresh HPT
         * otherwise), resets all devices, starts with an empty CAS-negotiated
         * option vector, resets the interrupt controller and the DRCs, rebuilds
         * the device tree and finally clears the FWNMI registration state.
         *
         * @machine: the machine being reset
         * @reason:  shutdown cause, forwarded to qemu_devices_reset()
         *
         * Any failure in the helpers that take &error_fatal terminates QEMU.
         */
   1626 static void spapr_machine_reset(MachineState *machine, ShutdownCause reason)
   1627 {
   1628     SpaprMachineState *spapr = SPAPR_MACHINE(machine);
   1629     PowerPCCPU *first_ppc_cpu;
   1630     hwaddr fdt_addr;
   1631     void *fdt;
   1632     int rc;
   1633
   1634     pef_kvm_reset(machine->cgs, &error_fatal);
   1635     spapr_caps_apply(spapr);
   1636
   1637     first_ppc_cpu = POWERPC_CPU(first_cpu);
   1638     if (kvm_enabled() && kvmppc_has_cap_mmu_radix() &&
   1639         ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
   1640                               spapr->max_compat_pvr)) {
   1641         /*
   1642          * If using KVM with radix mode available, VCPUs can be started
   1643          * without a HPT because KVM will start them in radix mode.
   1644          * Set the GR bit in PATE so that we know there is no HPT.
   1645          */
   1646         spapr->patb_entry = PATE1_GR;
   1647         spapr_set_all_lpcrs(LPCR_HR | LPCR_UPRT, LPCR_HR | LPCR_UPRT);
   1648     } else {
   1649         spapr_setup_hpt(spapr);
   1650     }
   1651
   1652     qemu_devices_reset(reason);
   1653
            /* Forget whatever the previous boot negotiated through CAS */
   1654     spapr_ovec_cleanup(spapr->ov5_cas);
   1655     spapr->ov5_cas = spapr_ovec_new();
   1656
   1657     ppc_set_compat_all(spapr->max_compat_pvr, &error_fatal);
   1658
   1659     /*
   1660      * This is fixing some of the default configuration of the XIVE
   1661      * devices. To be called after the reset of the machine devices.
   1662      */
   1663     spapr_irq_reset(spapr, &error_fatal);
   1664
   1665     /*
   1666      * There is no CAS under qtest. Simulate one to please the code that
   1667      * depends on spapr->ov5_cas. This is especially needed to test device
   1668      * unplug, so we do that before resetting the DRCs.
   1669      */
   1670     if (qtest_enabled()) {
   1671         spapr_ovec_cleanup(spapr->ov5_cas);
   1672         spapr->ov5_cas = spapr_ovec_clone(spapr->ov5);
   1673     }
   1674
   1675     spapr_nvdimm_finish_flushes();
   1676
   1677     /* DRC reset may cause a device to be unplugged. This will cause troubles
   1678      * if this device is used by another device (eg, a running vhost backend
   1679      * will crash QEMU if the DIMM holding the vring goes away). To avoid such
   1680      * situations, we reset DRCs after all devices have been reset.
   1681      */
   1682     spapr_drc_reset_all(spapr);
   1683
   1684     spapr_clear_pending_events(spapr);
   1685
   1686     /*
   1687      * We place the device tree just below either the top of the RMA,
   1688      * or just below 2GB, whichever is lower, so that it can be
   1689      * processed with 32-bit real mode code if necessary
   1690      */
   1691     fdt_addr = MIN(spapr->rma_size, FDT_MAX_ADDR) - FDT_MAX_SIZE;
   1692
   1693     fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE);
   1694     if (spapr->vof) {
   1695         spapr_vof_reset(spapr, fdt, &error_fatal);
   1696         /*
   1697          * Do not pack the FDT as the client may change properties.
   1698          * VOF client does not expect the FDT so we do not load it to the VM.
   1699          */
   1700     } else {
   1701         rc = fdt_pack(fdt);
   1702         /* Should only fail if we've built a corrupted tree */
   1703         assert(rc == 0);
   1704
   1705         spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
   1706                                   0, fdt_addr, 0);
   1707         cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
   1708     }
   1709     qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
   1710
            /* The new tree replaces the one kept from the previous reset */
   1711     g_free(spapr->fdt_blob);
   1712     spapr->fdt_size = fdt_totalsize(fdt);
   1713     spapr->fdt_initial_size = spapr->fdt_size;
   1714     spapr->fdt_blob = fdt;
   1715
   1716     /* Set machine->fdt for 'dumpdtb' QMP/HMP command */
   1717     machine->fdt = fdt;
   1718
   1719     /* Set up the entry state */
   1720     first_ppc_cpu->env.gpr[5] = 0;
   1721
            /* -1 marks the FWNMI handlers and the interlock as unset */
   1722     spapr->fwnmi_system_reset_addr = -1;
   1723     spapr->fwnmi_machine_check_addr = -1;
   1724     spapr->fwnmi_machine_check_interlock = -1;
   1725
   1726     /* Signal all vCPUs waiting on this condition */
   1727     qemu_cond_broadcast(&spapr->fwnmi_machine_check_interlock_cond);
   1728
   1729     migrate_del_blocker(spapr->fwnmi_migration_blocker);
   1730 }
   1731 
   1732 static void spapr_create_nvram(SpaprMachineState *spapr)
   1733 {
   1734     DeviceState *dev = qdev_new("spapr-nvram");
   1735     DriveInfo *dinfo = drive_get(IF_PFLASH, 0, 0);
   1736 
   1737     if (dinfo) {
   1738         qdev_prop_set_drive_err(dev, "drive", blk_by_legacy_dinfo(dinfo),
   1739                                 &error_fatal);
   1740     }
   1741 
   1742     qdev_realize_and_unref(dev, &spapr->vio_bus->bus, &error_fatal);
   1743 
   1744     spapr->nvram = (struct SpaprNvram *)dev;
   1745 }
   1746 
   1747 static void spapr_rtc_create(SpaprMachineState *spapr)
   1748 {
   1749     object_initialize_child_with_props(OBJECT(spapr), "rtc", &spapr->rtc,
   1750                                        sizeof(spapr->rtc), TYPE_SPAPR_RTC,
   1751                                        &error_fatal, NULL);
   1752     qdev_realize(DEVICE(&spapr->rtc), NULL, &error_fatal);
   1753     object_property_add_alias(OBJECT(spapr), "rtc-time", OBJECT(&spapr->rtc),
   1754                               "date");
   1755 }
   1756 
   1757 /* Returns whether we want to use VGA or not */
   1758 static bool spapr_vga_init(PCIBus *pci_bus, Error **errp)
   1759 {
   1760     vga_interface_created = true;
   1761     switch (vga_interface_type) {
   1762     case VGA_NONE:
   1763         return false;
   1764     case VGA_DEVICE:
   1765         return true;
   1766     case VGA_STD:
   1767     case VGA_VIRTIO:
   1768     case VGA_CIRRUS:
   1769         return pci_vga_init(pci_bus) != NULL;
   1770     default:
   1771         error_setg(errp,
   1772                    "Unsupported VGA mode, only -vga std or -vga virtio is supported");
   1773         return false;
   1774     }
   1775 }
   1776 
   1777 static int spapr_pre_load(void *opaque)
   1778 {
   1779     int rc;
   1780 
   1781     rc = spapr_caps_pre_load(opaque);
   1782     if (rc) {
   1783         return rc;
   1784     }
   1785 
   1786     return 0;
   1787 }
   1788 
   1789 static int spapr_post_load(void *opaque, int version_id)
   1790 {
   1791     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
   1792     int err = 0;
   1793 
   1794     err = spapr_caps_post_migration(spapr);
   1795     if (err) {
   1796         return err;
   1797     }
   1798 
   1799     /*
   1800      * In earlier versions, there was no separate qdev for the PAPR
   1801      * RTC, so the RTC offset was stored directly in sPAPREnvironment.
   1802      * So when migrating from those versions, poke the incoming offset
   1803      * value into the RTC device
   1804      */
   1805     if (version_id < 3) {
   1806         err = spapr_rtc_import_offset(&spapr->rtc, spapr->rtc_offset);
   1807         if (err) {
   1808             return err;
   1809         }
   1810     }
   1811 
   1812     if (kvm_enabled() && spapr->patb_entry) {
   1813         PowerPCCPU *cpu = POWERPC_CPU(first_cpu);
   1814         bool radix = !!(spapr->patb_entry & PATE1_GR);
   1815         bool gtse = !!(cpu->env.spr[SPR_LPCR] & LPCR_GTSE);
   1816 
   1817         /*
   1818          * Update LPCR:HR and UPRT as they may not be set properly in
   1819          * the stream
   1820          */
   1821         spapr_set_all_lpcrs(radix ? (LPCR_HR | LPCR_UPRT) : 0,
   1822                             LPCR_HR | LPCR_UPRT);
   1823 
   1824         err = kvmppc_configure_v3_mmu(cpu, radix, gtse, spapr->patb_entry);
   1825         if (err) {
   1826             error_report("Process table config unsupported by the host");
   1827             return -EINVAL;
   1828         }
   1829     }
   1830 
   1831     err = spapr_irq_post_load(spapr, version_id);
   1832     if (err) {
   1833         return err;
   1834     }
   1835 
   1836     return err;
   1837 }
   1838 
   1839 static int spapr_pre_save(void *opaque)
   1840 {
   1841     int rc;
   1842 
   1843     rc = spapr_caps_pre_save(opaque);
   1844     if (rc) {
   1845         return rc;
   1846     }
   1847 
   1848     return 0;
   1849 }
   1850 
   1851 static bool version_before_3(void *opaque, int version_id)
   1852 {
   1853     return version_id < 3;
   1854 }
   1855 
   1856 static bool spapr_pending_events_needed(void *opaque)
   1857 {
   1858     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
   1859     return !QTAILQ_EMPTY(&spapr->pending_events);
   1860 }
   1861 
        /*
         * Migration format of one SpaprEventLogEntry: the summary word, the
         * length of the extended log, and the extended log itself as a
         * variable-length buffer whose size is taken from extended_length.
         */
   1862 static const VMStateDescription vmstate_spapr_event_entry = {
   1863     .name = "spapr_event_log_entry",
   1864     .version_id = 1,
   1865     .minimum_version_id = 1,
   1866     .fields = (VMStateField[]) {
   1867         VMSTATE_UINT32(summary, SpaprEventLogEntry),
   1868         VMSTATE_UINT32(extended_length, SpaprEventLogEntry),
   1869         VMSTATE_VBUFFER_ALLOC_UINT32(extended_log, SpaprEventLogEntry, 0,
   1870                                      NULL, extended_length),
   1871         VMSTATE_END_OF_LIST()
   1872     },
   1873 };
   1874 
        /*
         * Optional subsection carrying the machine's queue of pending events.
         * It is only emitted when spapr_pending_events_needed() reports a
         * non-empty queue; each element uses vmstate_spapr_event_entry.
         */
   1875 static const VMStateDescription vmstate_spapr_pending_events = {
   1876     .name = "spapr_pending_events",
   1877     .version_id = 1,
   1878     .minimum_version_id = 1,
   1879     .needed = spapr_pending_events_needed,
   1880     .fields = (VMStateField[]) {
   1881         VMSTATE_QTAILQ_V(pending_events, SpaprMachineState, 1,
   1882                          vmstate_spapr_event_entry, SpaprEventLogEntry, next),
   1883         VMSTATE_END_OF_LIST()
   1884     },
   1885 };
   1886 
   1887 static bool spapr_ov5_cas_needed(void *opaque)
   1888 {
   1889     SpaprMachineState *spapr = opaque;
   1890     SpaprOptionVector *ov5_mask = spapr_ovec_new();
   1891     bool cas_needed;
   1892 
   1893     /* Prior to the introduction of SpaprOptionVector, we had two option
   1894      * vectors we dealt with: OV5_FORM1_AFFINITY, and OV5_DRCONF_MEMORY.
   1895      * Both of these options encode machine topology into the device-tree
   1896      * in such a way that the now-booted OS should still be able to interact
   1897      * appropriately with QEMU regardless of what options were actually
   1898      * negotiatied on the source side.
   1899      *
   1900      * As such, we can avoid migrating the CAS-negotiated options if these
   1901      * are the only options available on the current machine/platform.
   1902      * Since these are the only options available for pseries-2.7 and
   1903      * earlier, this allows us to maintain old->new/new->old migration
   1904      * compatibility.
   1905      *
   1906      * For QEMU 2.8+, there are additional CAS-negotiatable options available
   1907      * via default pseries-2.8 machines and explicit command-line parameters.
   1908      * Some of these options, like OV5_HP_EVT, *do* require QEMU to be aware
   1909      * of the actual CAS-negotiated values to continue working properly. For
   1910      * example, availability of memory unplug depends on knowing whether
   1911      * OV5_HP_EVT was negotiated via CAS.
   1912      *
   1913      * Thus, for any cases where the set of available CAS-negotiatable
   1914      * options extends beyond OV5_FORM1_AFFINITY and OV5_DRCONF_MEMORY, we
   1915      * include the CAS-negotiated options in the migration stream, unless
   1916      * if they affect boot time behaviour only.
   1917      */
   1918     spapr_ovec_set(ov5_mask, OV5_FORM1_AFFINITY);
   1919     spapr_ovec_set(ov5_mask, OV5_DRCONF_MEMORY);
   1920     spapr_ovec_set(ov5_mask, OV5_DRMEM_V2);
   1921 
   1922     /* We need extra information if we have any bits outside the mask
   1923      * defined above */
   1924     cas_needed = !spapr_ovec_subset(spapr->ov5, ov5_mask);
   1925 
   1926     spapr_ovec_cleanup(ov5_mask);
   1927 
   1928     return cas_needed;
   1929 }
   1930 
        /*
         * Optional subsection carrying the CAS-negotiated option vector
         * (ov5_cas).  Emitted only when spapr_ov5_cas_needed() returns true.
         */
   1931 static const VMStateDescription vmstate_spapr_ov5_cas = {
   1932     .name = "spapr_option_vector_ov5_cas",
   1933     .version_id = 1,
   1934     .minimum_version_id = 1,
   1935     .needed = spapr_ov5_cas_needed,
   1936     .fields = (VMStateField[]) {
   1937         VMSTATE_STRUCT_POINTER_V(ov5_cas, SpaprMachineState, 1,
   1938                                  vmstate_spapr_ovec, SpaprOptionVector),
   1939         VMSTATE_END_OF_LIST()
   1940     },
   1941 };
   1942 
   1943 static bool spapr_patb_entry_needed(void *opaque)
   1944 {
   1945     SpaprMachineState *spapr = opaque;
   1946 
   1947     return !!spapr->patb_entry;
   1948 }
   1949 
        /*
         * Optional subsection carrying the 64-bit patb_entry value.  Emitted
         * only when spapr_patb_entry_needed() sees a non-zero entry.
         */
   1950 static const VMStateDescription vmstate_spapr_patb_entry = {
   1951     .name = "spapr_patb_entry",
   1952     .version_id = 1,
   1953     .minimum_version_id = 1,
   1954     .needed = spapr_patb_entry_needed,
   1955     .fields = (VMStateField[]) {
   1956         VMSTATE_UINT64(patb_entry, SpaprMachineState),
   1957         VMSTATE_END_OF_LIST()
   1958     },
   1959 };
   1960 
   1961 static bool spapr_irq_map_needed(void *opaque)
   1962 {
   1963     SpaprMachineState *spapr = opaque;
   1964 
   1965     return spapr->irq_map && !bitmap_empty(spapr->irq_map, spapr->irq_map_nr);
   1966 }
   1967 
        /*
         * Optional subsection carrying the irq_map bitmap, whose length is
         * given by irq_map_nr.  Emitted only when spapr_irq_map_needed()
         * finds a non-empty bitmap.
         */
   1968 static const VMStateDescription vmstate_spapr_irq_map = {
   1969     .name = "spapr_irq_map",
   1970     .version_id = 1,
   1971     .minimum_version_id = 1,
   1972     .needed = spapr_irq_map_needed,
   1973     .fields = (VMStateField[]) {
   1974         VMSTATE_BITMAP(irq_map, SpaprMachineState, 0, irq_map_nr),
   1975         VMSTATE_END_OF_LIST()
   1976     },
   1977 };
   1978 
   1979 static bool spapr_dtb_needed(void *opaque)
   1980 {
   1981     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(opaque);
   1982 
   1983     return smc->update_dt_enabled;
   1984 }
   1985 
   1986 static int spapr_dtb_pre_load(void *opaque)
   1987 {
   1988     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
   1989 
   1990     g_free(spapr->fdt_blob);
   1991     spapr->fdt_blob = NULL;
   1992     spapr->fdt_size = 0;
   1993 
   1994     return 0;
   1995 }
   1996 
        /*
         * Optional subsection carrying the device tree: its initial size, its
         * current size, and the blob itself as a variable-length buffer sized
         * by fdt_size.  Emitted only when spapr_dtb_needed() returns true;
         * spapr_dtb_pre_load() drops the local blob before the fields load.
         */
   1997 static const VMStateDescription vmstate_spapr_dtb = {
   1998     .name = "spapr_dtb",
   1999     .version_id = 1,
   2000     .minimum_version_id = 1,
   2001     .needed = spapr_dtb_needed,
   2002     .pre_load = spapr_dtb_pre_load,
   2003     .fields = (VMStateField[]) {
   2004         VMSTATE_UINT32(fdt_initial_size, SpaprMachineState),
   2005         VMSTATE_UINT32(fdt_size, SpaprMachineState),
   2006         VMSTATE_VBUFFER_ALLOC_UINT32(fdt_blob, SpaprMachineState, 0, NULL,
   2007                                      fdt_size),
   2008         VMSTATE_END_OF_LIST()
   2009     },
   2010 };
   2011 
   2012 static bool spapr_fwnmi_needed(void *opaque)
   2013 {
   2014     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
   2015 
   2016     return spapr->fwnmi_machine_check_addr != -1;
   2017 }
   2018 
   2019 static int spapr_fwnmi_pre_save(void *opaque)
   2020 {
   2021     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
   2022 
   2023     /*
   2024      * Check if machine check handling is in progress and print a
   2025      * warning message.
   2026      */
   2027     if (spapr->fwnmi_machine_check_interlock != -1) {
   2028         warn_report("A machine check is being handled during migration. The"
   2029                 "handler may run and log hardware error on the destination");
   2030     }
   2031 
   2032     return 0;
   2033 }
   2034 
        /*
         * Optional subsection carrying the FWNMI registration: the system
         * reset and machine check handler addresses and the machine check
         * interlock.  Emitted only when spapr_fwnmi_needed() returns true;
         * spapr_fwnmi_pre_save() may print a warning before saving.
         */
   2035 static const VMStateDescription vmstate_spapr_fwnmi = {
   2036     .name = "spapr_fwnmi",
   2037     .version_id = 1,
   2038     .minimum_version_id = 1,
   2039     .needed = spapr_fwnmi_needed,
   2040     .pre_save = spapr_fwnmi_pre_save,
   2041     .fields = (VMStateField[]) {
   2042         VMSTATE_UINT64(fwnmi_system_reset_addr, SpaprMachineState),
   2043         VMSTATE_UINT64(fwnmi_machine_check_addr, SpaprMachineState),
   2044         VMSTATE_INT32(fwnmi_machine_check_interlock, SpaprMachineState),
   2045         VMSTATE_END_OF_LIST()
   2046     },
   2047 };
   2048 
        /*
         * Top-level migration description of the sPAPR machine state.
         *
         * Current stream version is 3; streams back to version 1 are accepted.
         * The two leading fields are only present in streams older than
         * version 3 (see version_before_3()); the timebase was added in
         * version 2.  Everything else travels in the optional subsections
         * listed below.
         */
   2049 static const VMStateDescription vmstate_spapr = {
   2050     .name = "spapr",
   2051     .version_id = 3,
   2052     .minimum_version_id = 1,
   2053     .pre_load = spapr_pre_load,
   2054     .post_load = spapr_post_load,
   2055     .pre_save = spapr_pre_save,
   2056     .fields = (VMStateField[]) {
   2057         /* used to be @next_irq */
   2058         VMSTATE_UNUSED_BUFFER(version_before_3, 0, 4),
   2059
   2060         /* RTC offset */
   2061         VMSTATE_UINT64_TEST(rtc_offset, SpaprMachineState, version_before_3),
   2062
   2063         VMSTATE_PPC_TIMEBASE_V(tb, SpaprMachineState, 2),
   2064         VMSTATE_END_OF_LIST()
   2065     },
   2066     .subsections = (const VMStateDescription*[]) {
   2067         &vmstate_spapr_ov5_cas,
   2068         &vmstate_spapr_patb_entry,
   2069         &vmstate_spapr_pending_events,
   2070         &vmstate_spapr_cap_htm,
   2071         &vmstate_spapr_cap_vsx,
   2072         &vmstate_spapr_cap_dfp,
   2073         &vmstate_spapr_cap_cfpc,
   2074         &vmstate_spapr_cap_sbbc,
   2075         &vmstate_spapr_cap_ibs,
   2076         &vmstate_spapr_cap_hpt_maxpagesize,
   2077         &vmstate_spapr_irq_map,
   2078         &vmstate_spapr_cap_nested_kvm_hv,
   2079         &vmstate_spapr_dtb,
   2080         &vmstate_spapr_cap_large_decr,
   2081         &vmstate_spapr_cap_ccf_assist,
   2082         &vmstate_spapr_cap_fwnmi,
   2083         &vmstate_spapr_fwnmi,
   2084         &vmstate_spapr_cap_rpt_invalidate,
   2085         NULL
   2086     }
   2087 };
   2088 
   2089 static int htab_save_setup(QEMUFile *f, void *opaque)
   2090 {
   2091     SpaprMachineState *spapr = opaque;
   2092 
   2093     /* "Iteration" header */
   2094     if (!spapr->htab_shift) {
   2095         qemu_put_be32(f, -1);
   2096     } else {
   2097         qemu_put_be32(f, spapr->htab_shift);
   2098     }
   2099 
   2100     if (spapr->htab) {
   2101         spapr->htab_save_index = 0;
   2102         spapr->htab_first_pass = true;
   2103     } else {
   2104         if (spapr->htab_shift) {
   2105             assert(kvm_enabled());
   2106         }
   2107     }
   2108 
   2109 
   2110     return 0;
   2111 }
   2112 
   2113 static void htab_save_chunk(QEMUFile *f, SpaprMachineState *spapr,
   2114                             int chunkstart, int n_valid, int n_invalid)
   2115 {
   2116     qemu_put_be32(f, chunkstart);
   2117     qemu_put_be16(f, n_valid);
   2118     qemu_put_be16(f, n_invalid);
   2119     qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
   2120                     HASH_PTE_SIZE_64 * n_valid);
   2121 }
   2122 
   2123 static void htab_save_end_marker(QEMUFile *f)
   2124 {
   2125     qemu_put_be32(f, 0);
   2126     qemu_put_be16(f, 0);
   2127     qemu_put_be16(f, 0);
   2128 }
   2129 
        /*
         * First pass over a userspace hash table: send every valid HPTE once.
         *
         * Resumes at spapr->htab_save_index.  Runs of invalid entries are
         * skipped without being sent; runs of valid entries are sent as chunks
         * of at most USHRT_MAX entries.  Every visited entry is passed through
         * CLEAN_HPTE().
         *
         * @max_ns: time budget in nanoseconds for this call, or -1 for no
         *          limit.  The budget is only checked after a chunk was sent.
         *
         * The walk also stops when the migration rate limit is hit.  Once the
         * end of the table is reached the index wraps to 0 and
         * htab_first_pass is cleared so later calls use the dirty-only pass.
         */
   2130 static void htab_save_first_pass(QEMUFile *f, SpaprMachineState *spapr,
   2131                                  int64_t max_ns)
   2132 {
   2133     bool has_timeout = max_ns != -1;
   2134     int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
   2135     int index = spapr->htab_save_index;
   2136     int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
   2137
   2138     assert(spapr->htab_first_pass);
   2139
   2140     do {
   2141         int chunkstart;
   2142
   2143         /* Consume invalid HPTEs */
   2144         while ((index < htabslots)
   2145                && !HPTE_VALID(HPTE(spapr->htab, index))) {
   2146             CLEAN_HPTE(HPTE(spapr->htab, index));
   2147             index++;
   2148         }
   2149
   2150         /* Consume valid HPTEs */
   2151         chunkstart = index;
                /* Chunk length is capped because the count is sent as 16 bits */
   2152         while ((index < htabslots) && (index - chunkstart < USHRT_MAX)
   2153                && HPTE_VALID(HPTE(spapr->htab, index))) {
   2154             CLEAN_HPTE(HPTE(spapr->htab, index));
   2155             index++;
   2156         }
   2157
   2158         if (index > chunkstart) {
   2159             int n_valid = index - chunkstart;
   2160
   2161             htab_save_chunk(f, spapr, chunkstart, n_valid, 0);
   2162
   2163             if (has_timeout &&
   2164                 (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
   2165                 break;
   2166             }
   2167         }
   2168     } while ((index < htabslots) && !qemu_file_rate_limit(f));
   2169
            /* Whole table visited: switch to dirty tracking passes */
   2170     if (index >= htabslots) {
   2171         assert(index == htabslots);
   2172         index = 0;
   2173         spapr->htab_first_pass = false;
   2174     }
   2175     spapr->htab_save_index = index;
   2176 }
   2177 
        /*
         * Later pass over a userspace hash table: send only dirty HPTEs.
         *
         * Resumes at spapr->htab_save_index and wraps around the table until
         * every slot has been examined once in this call.  Each chunk is a run
         * of dirty valid entries (sent with their contents) followed by a run
         * of dirty invalid entries (sent as a count), each run at most
         * USHRT_MAX long; sent entries are passed through CLEAN_HPTE().
         *
         * @max_ns: time budget in nanoseconds; a negative value marks the
         *          final pass, which ignores both the time budget and the
         *          migration rate limit.
         *
         * Returns 1 when the whole table was examined and nothing had to be
         * sent, 0 otherwise.
         */
   2178 static int htab_save_later_pass(QEMUFile *f, SpaprMachineState *spapr,
   2179                                 int64_t max_ns)
   2180 {
   2181     bool final = max_ns < 0;
   2182     int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
   2183     int examined = 0, sent = 0;
   2184     int index = spapr->htab_save_index;
   2185     int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
   2186
   2187     assert(!spapr->htab_first_pass);
   2188
   2189     do {
   2190         int chunkstart, invalidstart;
   2191
   2192         /* Consume non-dirty HPTEs */
   2193         while ((index < htabslots)
   2194                && !HPTE_DIRTY(HPTE(spapr->htab, index))) {
   2195             index++;
   2196             examined++;
   2197         }
   2198
   2199         chunkstart = index;
   2200         /* Consume valid dirty HPTEs */
   2201         while ((index < htabslots) && (index - chunkstart < USHRT_MAX)
   2202                && HPTE_DIRTY(HPTE(spapr->htab, index))
   2203                && HPTE_VALID(HPTE(spapr->htab, index))) {
   2204             CLEAN_HPTE(HPTE(spapr->htab, index));
   2205             index++;
   2206             examined++;
   2207         }
   2208
   2209         invalidstart = index;
   2210         /* Consume invalid dirty HPTEs */
   2211         while ((index < htabslots) && (index - invalidstart < USHRT_MAX)
   2212                && HPTE_DIRTY(HPTE(spapr->htab, index))
   2213                && !HPTE_VALID(HPTE(spapr->htab, index))) {
   2214             CLEAN_HPTE(HPTE(spapr->htab, index));
   2215             index++;
   2216             examined++;
   2217         }
   2218
   2219         if (index > chunkstart) {
   2220             int n_valid = invalidstart - chunkstart;
   2221             int n_invalid = index - invalidstart;
   2222
   2223             htab_save_chunk(f, spapr, chunkstart, n_valid, n_invalid);
   2224             sent += index - chunkstart;
   2225
   2226             if (!final && (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
   2227                 break;
   2228             }
   2229         }
   2230
   2231         if (examined >= htabslots) {
   2232             break;
   2233         }
   2234
                /* Reached the end before examining everything: wrap around */
   2235         if (index >= htabslots) {
   2236             assert(index == htabslots);
   2237             index = 0;
   2238         }
   2239     } while ((examined < htabslots) && (!qemu_file_rate_limit(f) || final));
   2240
   2241     if (index >= htabslots) {
   2242         assert(index == htabslots);
   2243         index = 0;
   2244     }
   2245
   2246     spapr->htab_save_index = index;
   2247
   2248     return (examined >= htabslots) && (sent == 0) ? 1 : 0;
   2249 }
   2250 
        /*
         * Limits used by the hash table save handlers below:
         * MAX_ITERATION_NS is the time budget handed to each live iteration,
         * MAX_KVM_BUF_SIZE is the buffer size passed to kvmppc_save_htab().
         */
   2251 #define MAX_ITERATION_NS    5000000 /* 5 ms */
   2252 #define MAX_KVM_BUF_SIZE    2048
   2253 
   2254 static int htab_save_iterate(QEMUFile *f, void *opaque)
   2255 {
   2256     SpaprMachineState *spapr = opaque;
   2257     int fd;
   2258     int rc = 0;
   2259 
   2260     /* Iteration header */
   2261     if (!spapr->htab_shift) {
   2262         qemu_put_be32(f, -1);
   2263         return 1;
   2264     } else {
   2265         qemu_put_be32(f, 0);
   2266     }
   2267 
   2268     if (!spapr->htab) {
   2269         assert(kvm_enabled());
   2270 
   2271         fd = get_htab_fd(spapr);
   2272         if (fd < 0) {
   2273             return fd;
   2274         }
   2275 
   2276         rc = kvmppc_save_htab(f, fd, MAX_KVM_BUF_SIZE, MAX_ITERATION_NS);
   2277         if (rc < 0) {
   2278             return rc;
   2279         }
   2280     } else  if (spapr->htab_first_pass) {
   2281         htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
   2282     } else {
   2283         rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
   2284     }
   2285 
   2286     htab_save_end_marker(f);
   2287 
   2288     return rc;
   2289 }
   2290 
   2291 static int htab_save_complete(QEMUFile *f, void *opaque)
   2292 {
   2293     SpaprMachineState *spapr = opaque;
   2294     int fd;
   2295 
   2296     /* Iteration header */
   2297     if (!spapr->htab_shift) {
   2298         qemu_put_be32(f, -1);
   2299         return 0;
   2300     } else {
   2301         qemu_put_be32(f, 0);
   2302     }
   2303 
   2304     if (!spapr->htab) {
   2305         int rc;
   2306 
   2307         assert(kvm_enabled());
   2308 
   2309         fd = get_htab_fd(spapr);
   2310         if (fd < 0) {
   2311             return fd;
   2312         }
   2313 
   2314         rc = kvmppc_save_htab(f, fd, MAX_KVM_BUF_SIZE, -1);
   2315         if (rc < 0) {
   2316             return rc;
   2317         }
   2318     } else {
   2319         if (spapr->htab_first_pass) {
   2320             htab_save_first_pass(f, spapr, -1);
   2321         }
   2322         htab_save_later_pass(f, spapr, -1);
   2323     }
   2324 
   2325     /* End marker */
   2326     htab_save_end_marker(f);
   2327 
   2328     return 0;
   2329 }
   2330 
   2331 static int htab_load(QEMUFile *f, void *opaque, int version_id)
   2332 {
   2333     SpaprMachineState *spapr = opaque;
   2334     uint32_t section_hdr;
   2335     int fd = -1;
   2336     Error *local_err = NULL;
   2337 
   2338     if (version_id < 1 || version_id > 1) {
   2339         error_report("htab_load() bad version");
   2340         return -EINVAL;
   2341     }
   2342 
   2343     section_hdr = qemu_get_be32(f);
   2344 
   2345     if (section_hdr == -1) {
   2346         spapr_free_hpt(spapr);
   2347         return 0;
   2348     }
   2349 
   2350     if (section_hdr) {
   2351         int ret;
   2352 
   2353         /* First section gives the htab size */
   2354         ret = spapr_reallocate_hpt(spapr, section_hdr, &local_err);
   2355         if (ret < 0) {
   2356             error_report_err(local_err);
   2357             return ret;
   2358         }
   2359         return 0;
   2360     }
   2361 
   2362     if (!spapr->htab) {
   2363         assert(kvm_enabled());
   2364 
   2365         fd = kvmppc_get_htab_fd(true, 0, &local_err);
   2366         if (fd < 0) {
   2367             error_report_err(local_err);
   2368             return fd;
   2369         }
   2370     }
   2371 
   2372     while (true) {
   2373         uint32_t index;
   2374         uint16_t n_valid, n_invalid;
   2375 
   2376         index = qemu_get_be32(f);
   2377         n_valid = qemu_get_be16(f);
   2378         n_invalid = qemu_get_be16(f);
   2379 
   2380         if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) {
   2381             /* End of Stream */
   2382             break;
   2383         }
   2384 
   2385         if ((index + n_valid + n_invalid) >
   2386             (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) {
   2387             /* Bad index in stream */
   2388             error_report(
   2389                 "htab_load() bad index %d (%hd+%hd entries) in htab stream (htab_shift=%d)",
   2390                 index, n_valid, n_invalid, spapr->htab_shift);
   2391             return -EINVAL;
   2392         }
   2393 
   2394         if (spapr->htab) {
   2395             if (n_valid) {
   2396                 qemu_get_buffer(f, HPTE(spapr->htab, index),
   2397                                 HASH_PTE_SIZE_64 * n_valid);
   2398             }
   2399             if (n_invalid) {
   2400                 memset(HPTE(spapr->htab, index + n_valid), 0,
   2401                        HASH_PTE_SIZE_64 * n_invalid);
   2402             }
   2403         } else {
   2404             int rc;
   2405 
   2406             assert(fd >= 0);
   2407 
   2408             rc = kvmppc_load_htab_chunk(f, fd, index, n_valid, n_invalid,
   2409                                         &local_err);
   2410             if (rc < 0) {
   2411                 error_report_err(local_err);
   2412                 return rc;
   2413             }
   2414         }
   2415     }
   2416 
   2417     if (!spapr->htab) {
   2418         assert(fd >= 0);
   2419         close(fd);
   2420     }
   2421 
   2422     return 0;
   2423 }
   2424 
   2425 static void htab_save_cleanup(void *opaque)
   2426 {
   2427     SpaprMachineState *spapr = opaque;
   2428 
   2429     close_htab_fd(spapr);
   2430 }
   2431 
/*
 * Migration callbacks for the hash page table stream.  This table is
 * registered under the name "spapr/htab" by spapr_machine_init().
 */
static SaveVMHandlers savevm_htab_handlers = {
    .save_setup = htab_save_setup,
    .save_live_iterate = htab_save_iterate,
    .save_live_complete_precopy = htab_save_complete,
    .save_cleanup = htab_save_cleanup,
    .load_state = htab_load,
};
   2439 
   2440 static void spapr_boot_set(void *opaque, const char *boot_device,
   2441                            Error **errp)
   2442 {
   2443     SpaprMachineState *spapr = SPAPR_MACHINE(opaque);
   2444 
   2445     g_free(spapr->boot_device);
   2446     spapr->boot_device = g_strdup(boot_device);
   2447 }
   2448 
   2449 static void spapr_create_lmb_dr_connectors(SpaprMachineState *spapr)
   2450 {
   2451     MachineState *machine = MACHINE(spapr);
   2452     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
   2453     uint32_t nr_lmbs = (machine->maxram_size - machine->ram_size)/lmb_size;
   2454     int i;
   2455 
   2456     for (i = 0; i < nr_lmbs; i++) {
   2457         uint64_t addr;
   2458 
   2459         addr = i * lmb_size + machine->device_memory->base;
   2460         spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_LMB,
   2461                                addr / lmb_size);
   2462     }
   2463 }
   2464 
   2465 /*
   2466  * If RAM size, maxmem size and individual node mem sizes aren't aligned
   2467  * to SPAPR_MEMORY_BLOCK_SIZE(256MB), then refuse to start the guest
   2468  * since we can't support such unaligned sizes with DRCONF_MEMORY.
   2469  */
   2470 static void spapr_validate_node_memory(MachineState *machine, Error **errp)
   2471 {
   2472     int i;
   2473 
   2474     if (machine->ram_size % SPAPR_MEMORY_BLOCK_SIZE) {
   2475         error_setg(errp, "Memory size 0x" RAM_ADDR_FMT
   2476                    " is not aligned to %" PRIu64 " MiB",
   2477                    machine->ram_size,
   2478                    SPAPR_MEMORY_BLOCK_SIZE / MiB);
   2479         return;
   2480     }
   2481 
   2482     if (machine->maxram_size % SPAPR_MEMORY_BLOCK_SIZE) {
   2483         error_setg(errp, "Maximum memory size 0x" RAM_ADDR_FMT
   2484                    " is not aligned to %" PRIu64 " MiB",
   2485                    machine->ram_size,
   2486                    SPAPR_MEMORY_BLOCK_SIZE / MiB);
   2487         return;
   2488     }
   2489 
   2490     for (i = 0; i < machine->numa_state->num_nodes; i++) {
   2491         if (machine->numa_state->nodes[i].node_mem % SPAPR_MEMORY_BLOCK_SIZE) {
   2492             error_setg(errp,
   2493                        "Node %d memory size 0x%" PRIx64
   2494                        " is not aligned to %" PRIu64 " MiB",
   2495                        i, machine->numa_state->nodes[i].node_mem,
   2496                        SPAPR_MEMORY_BLOCK_SIZE / MiB);
   2497             return;
   2498         }
   2499     }
   2500 }
   2501 
   2502 /* find cpu slot in machine->possible_cpus by core_id */
   2503 static CPUArchId *spapr_find_cpu_slot(MachineState *ms, uint32_t id, int *idx)
   2504 {
   2505     int index = id / ms->smp.threads;
   2506 
   2507     if (index >= ms->possible_cpus->len) {
   2508         return NULL;
   2509     }
   2510     if (idx) {
   2511         *idx = index;
   2512     }
   2513     return &ms->possible_cpus->cpus[index];
   2514 }
   2515 
   2516 static void spapr_set_vsmt_mode(SpaprMachineState *spapr, Error **errp)
   2517 {
   2518     MachineState *ms = MACHINE(spapr);
   2519     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
   2520     Error *local_err = NULL;
   2521     bool vsmt_user = !!spapr->vsmt;
   2522     int kvm_smt = kvmppc_smt_threads();
   2523     int ret;
   2524     unsigned int smp_threads = ms->smp.threads;
   2525 
   2526     if (!kvm_enabled() && (smp_threads > 1)) {
   2527         error_setg(errp, "TCG cannot support more than 1 thread/core "
   2528                    "on a pseries machine");
   2529         return;
   2530     }
   2531     if (!is_power_of_2(smp_threads)) {
   2532         error_setg(errp, "Cannot support %d threads/core on a pseries "
   2533                    "machine because it must be a power of 2", smp_threads);
   2534         return;
   2535     }
   2536 
   2537     /* Detemine the VSMT mode to use: */
   2538     if (vsmt_user) {
   2539         if (spapr->vsmt < smp_threads) {
   2540             error_setg(errp, "Cannot support VSMT mode %d"
   2541                        " because it must be >= threads/core (%d)",
   2542                        spapr->vsmt, smp_threads);
   2543             return;
   2544         }
   2545         /* In this case, spapr->vsmt has been set by the command line */
   2546     } else if (!smc->smp_threads_vsmt) {
   2547         /*
   2548          * Default VSMT value is tricky, because we need it to be as
   2549          * consistent as possible (for migration), but this requires
   2550          * changing it for at least some existing cases.  We pick 8 as
   2551          * the value that we'd get with KVM on POWER8, the
   2552          * overwhelmingly common case in production systems.
   2553          */
   2554         spapr->vsmt = MAX(8, smp_threads);
   2555     } else {
   2556         spapr->vsmt = smp_threads;
   2557     }
   2558 
   2559     /* KVM: If necessary, set the SMT mode: */
   2560     if (kvm_enabled() && (spapr->vsmt != kvm_smt)) {
   2561         ret = kvmppc_set_smt_threads(spapr->vsmt);
   2562         if (ret) {
   2563             /* Looks like KVM isn't able to change VSMT mode */
   2564             error_setg(&local_err,
   2565                        "Failed to set KVM's VSMT mode to %d (errno %d)",
   2566                        spapr->vsmt, ret);
   2567             /* We can live with that if the default one is big enough
   2568              * for the number of threads, and a submultiple of the one
   2569              * we want.  In this case we'll waste some vcpu ids, but
   2570              * behaviour will be correct */
   2571             if ((kvm_smt >= smp_threads) && ((spapr->vsmt % kvm_smt) == 0)) {
   2572                 warn_report_err(local_err);
   2573             } else {
   2574                 if (!vsmt_user) {
   2575                     error_append_hint(&local_err,
   2576                                       "On PPC, a VM with %d threads/core"
   2577                                       " on a host with %d threads/core"
   2578                                       " requires the use of VSMT mode %d.\n",
   2579                                       smp_threads, kvm_smt, spapr->vsmt);
   2580                 }
   2581                 kvmppc_error_append_smt_possible_hint(&local_err);
   2582                 error_propagate(errp, local_err);
   2583             }
   2584         }
   2585     }
   2586     /* else TCG: nothing to do currently */
   2587 }
   2588 
   2589 static void spapr_init_cpus(SpaprMachineState *spapr)
   2590 {
   2591     MachineState *machine = MACHINE(spapr);
   2592     MachineClass *mc = MACHINE_GET_CLASS(machine);
   2593     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
   2594     const char *type = spapr_get_cpu_core_type(machine->cpu_type);
   2595     const CPUArchIdList *possible_cpus;
   2596     unsigned int smp_cpus = machine->smp.cpus;
   2597     unsigned int smp_threads = machine->smp.threads;
   2598     unsigned int max_cpus = machine->smp.max_cpus;
   2599     int boot_cores_nr = smp_cpus / smp_threads;
   2600     int i;
   2601 
   2602     possible_cpus = mc->possible_cpu_arch_ids(machine);
   2603     if (mc->has_hotpluggable_cpus) {
   2604         if (smp_cpus % smp_threads) {
   2605             error_report("smp_cpus (%u) must be multiple of threads (%u)",
   2606                          smp_cpus, smp_threads);
   2607             exit(1);
   2608         }
   2609         if (max_cpus % smp_threads) {
   2610             error_report("max_cpus (%u) must be multiple of threads (%u)",
   2611                          max_cpus, smp_threads);
   2612             exit(1);
   2613         }
   2614     } else {
   2615         if (max_cpus != smp_cpus) {
   2616             error_report("This machine version does not support CPU hotplug");
   2617             exit(1);
   2618         }
   2619         boot_cores_nr = possible_cpus->len;
   2620     }
   2621 
   2622     if (smc->pre_2_10_has_unused_icps) {
   2623         int i;
   2624 
   2625         for (i = 0; i < spapr_max_server_number(spapr); i++) {
   2626             /* Dummy entries get deregistered when real ICPState objects
   2627              * are registered during CPU core hotplug.
   2628              */
   2629             pre_2_10_vmstate_register_dummy_icp(i);
   2630         }
   2631     }
   2632 
   2633     for (i = 0; i < possible_cpus->len; i++) {
   2634         int core_id = i * smp_threads;
   2635 
   2636         if (mc->has_hotpluggable_cpus) {
   2637             spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_CPU,
   2638                                    spapr_vcpu_id(spapr, core_id));
   2639         }
   2640 
   2641         if (i < boot_cores_nr) {
   2642             Object *core  = object_new(type);
   2643             int nr_threads = smp_threads;
   2644 
   2645             /* Handle the partially filled core for older machine types */
   2646             if ((i + 1) * smp_threads >= smp_cpus) {
   2647                 nr_threads = smp_cpus - i * smp_threads;
   2648             }
   2649 
   2650             object_property_set_int(core, "nr-threads", nr_threads,
   2651                                     &error_fatal);
   2652             object_property_set_int(core, CPU_CORE_PROP_CORE_ID, core_id,
   2653                                     &error_fatal);
   2654             qdev_realize(DEVICE(core), NULL, &error_fatal);
   2655 
   2656             object_unref(core);
   2657         }
   2658     }
   2659 }
   2660 
   2661 static PCIHostState *spapr_create_default_phb(void)
   2662 {
   2663     DeviceState *dev;
   2664 
   2665     dev = qdev_new(TYPE_SPAPR_PCI_HOST_BRIDGE);
   2666     qdev_prop_set_uint32(dev, "index", 0);
   2667     sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
   2668 
   2669     return PCI_HOST_BRIDGE(dev);
   2670 }
   2671 
   2672 static hwaddr spapr_rma_size(SpaprMachineState *spapr, Error **errp)
   2673 {
   2674     MachineState *machine = MACHINE(spapr);
   2675     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
   2676     hwaddr rma_size = machine->ram_size;
   2677     hwaddr node0_size = spapr_node0_size(machine);
   2678 
   2679     /* RMA has to fit in the first NUMA node */
   2680     rma_size = MIN(rma_size, node0_size);
   2681 
   2682     /*
   2683      * VRMA access is via a special 1TiB SLB mapping, so the RMA can
   2684      * never exceed that
   2685      */
   2686     rma_size = MIN(rma_size, 1 * TiB);
   2687 
   2688     /*
   2689      * Clamp the RMA size based on machine type.  This is for
   2690      * migration compatibility with older qemu versions, which limited
   2691      * the RMA size for complicated and mostly bad reasons.
   2692      */
   2693     if (smc->rma_limit) {
   2694         rma_size = MIN(rma_size, smc->rma_limit);
   2695     }
   2696 
   2697     if (rma_size < MIN_RMA_SLOF) {
   2698         error_setg(errp,
   2699                    "pSeries SLOF firmware requires >= %" HWADDR_PRIx
   2700                    "ldMiB guest RMA (Real Mode Area memory)",
   2701                    MIN_RMA_SLOF / MiB);
   2702         return 0;
   2703     }
   2704 
   2705     return rma_size;
   2706 }
   2707 
   2708 static void spapr_create_nvdimm_dr_connectors(SpaprMachineState *spapr)
   2709 {
   2710     MachineState *machine = MACHINE(spapr);
   2711     int i;
   2712 
   2713     for (i = 0; i < machine->ram_slots; i++) {
   2714         spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM, i);
   2715     }
   2716 }
   2717 
   2718 /* pSeries LPAR / sPAPR hardware init */
   2719 static void spapr_machine_init(MachineState *machine)
   2720 {
   2721     SpaprMachineState *spapr = SPAPR_MACHINE(machine);
   2722     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
   2723     MachineClass *mc = MACHINE_GET_CLASS(machine);
   2724     const char *bios_default = spapr->vof ? FW_FILE_NAME_VOF : FW_FILE_NAME;
   2725     const char *bios_name = machine->firmware ?: bios_default;
   2726     g_autofree char *filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
   2727     const char *kernel_filename = machine->kernel_filename;
   2728     const char *initrd_filename = machine->initrd_filename;
   2729     PCIHostState *phb;
   2730     bool has_vga;
   2731     int i;
   2732     MemoryRegion *sysmem = get_system_memory();
   2733     long load_limit, fw_size;
   2734     Error *resize_hpt_err = NULL;
   2735 
   2736     if (!filename) {
   2737         error_report("Could not find LPAR firmware '%s'", bios_name);
   2738         exit(1);
   2739     }
   2740     fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
   2741     if (fw_size <= 0) {
   2742         error_report("Could not load LPAR firmware '%s'", filename);
   2743         exit(1);
   2744     }
   2745 
   2746     /*
   2747      * if Secure VM (PEF) support is configured, then initialize it
   2748      */
   2749     pef_kvm_init(machine->cgs, &error_fatal);
   2750 
   2751     msi_nonbroken = true;
   2752 
   2753     QLIST_INIT(&spapr->phbs);
   2754     QTAILQ_INIT(&spapr->pending_dimm_unplugs);
   2755 
   2756     /* Determine capabilities to run with */
   2757     spapr_caps_init(spapr);
   2758 
   2759     kvmppc_check_papr_resize_hpt(&resize_hpt_err);
   2760     if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DEFAULT) {
   2761         /*
   2762          * If the user explicitly requested a mode we should either
   2763          * supply it, or fail completely (which we do below).  But if
   2764          * it's not set explicitly, we reset our mode to something
   2765          * that works
   2766          */
   2767         if (resize_hpt_err) {
   2768             spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED;
   2769             error_free(resize_hpt_err);
   2770             resize_hpt_err = NULL;
   2771         } else {
   2772             spapr->resize_hpt = smc->resize_hpt_default;
   2773         }
   2774     }
   2775 
   2776     assert(spapr->resize_hpt != SPAPR_RESIZE_HPT_DEFAULT);
   2777 
   2778     if ((spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) && resize_hpt_err) {
   2779         /*
   2780          * User requested HPT resize, but this host can't supply it.  Bail out
   2781          */
   2782         error_report_err(resize_hpt_err);
   2783         exit(1);
   2784     }
   2785     error_free(resize_hpt_err);
   2786 
   2787     spapr->rma_size = spapr_rma_size(spapr, &error_fatal);
   2788 
   2789     /* Setup a load limit for the ramdisk leaving room for SLOF and FDT */
   2790     load_limit = MIN(spapr->rma_size, FDT_MAX_ADDR) - FW_OVERHEAD;
   2791 
   2792     /*
   2793      * VSMT must be set in order to be able to compute VCPU ids, ie to
   2794      * call spapr_max_server_number() or spapr_vcpu_id().
   2795      */
   2796     spapr_set_vsmt_mode(spapr, &error_fatal);
   2797 
   2798     /* Set up Interrupt Controller before we create the VCPUs */
   2799     spapr_irq_init(spapr, &error_fatal);
   2800 
   2801     /* Set up containers for ibm,client-architecture-support negotiated options
   2802      */
   2803     spapr->ov5 = spapr_ovec_new();
   2804     spapr->ov5_cas = spapr_ovec_new();
   2805 
   2806     if (smc->dr_lmb_enabled) {
   2807         spapr_ovec_set(spapr->ov5, OV5_DRCONF_MEMORY);
   2808         spapr_validate_node_memory(machine, &error_fatal);
   2809     }
   2810 
   2811     spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY);
   2812 
   2813     /* Do not advertise FORM2 NUMA support for pseries-6.1 and older */
   2814     if (!smc->pre_6_2_numa_affinity) {
   2815         spapr_ovec_set(spapr->ov5, OV5_FORM2_AFFINITY);
   2816     }
   2817 
   2818     /* advertise support for dedicated HP event source to guests */
   2819     if (spapr->use_hotplug_event_source) {
   2820         spapr_ovec_set(spapr->ov5, OV5_HP_EVT);
   2821     }
   2822 
   2823     /* advertise support for HPT resizing */
   2824     if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) {
   2825         spapr_ovec_set(spapr->ov5, OV5_HPT_RESIZE);
   2826     }
   2827 
   2828     /* advertise support for ibm,dyamic-memory-v2 */
   2829     spapr_ovec_set(spapr->ov5, OV5_DRMEM_V2);
   2830 
   2831     /* advertise XIVE on POWER9 machines */
   2832     if (spapr->irq->xive) {
   2833         spapr_ovec_set(spapr->ov5, OV5_XIVE_EXPLOIT);
   2834     }
   2835 
   2836     /* init CPUs */
   2837     spapr_init_cpus(spapr);
   2838 
   2839     spapr->gpu_numa_id = spapr_numa_initial_nvgpu_numa_id(machine);
   2840 
   2841     /* Init numa_assoc_array */
   2842     spapr_numa_associativity_init(spapr, machine);
   2843 
   2844     if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
   2845         ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
   2846                               spapr->max_compat_pvr)) {
   2847         spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
   2848         /* KVM and TCG always allow GTSE with radix... */
   2849         spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
   2850     }
   2851     /* ... but not with hash (currently). */
   2852 
   2853     if (kvm_enabled()) {
   2854         /* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */
   2855         kvmppc_enable_logical_ci_hcalls();
   2856         kvmppc_enable_set_mode_hcall();
   2857 
   2858         /* H_CLEAR_MOD/_REF are mandatory in PAPR, but off by default */
   2859         kvmppc_enable_clear_ref_mod_hcalls();
   2860 
   2861         /* Enable H_PAGE_INIT */
   2862         kvmppc_enable_h_page_init();
   2863     }
   2864 
   2865     /* map RAM */
   2866     memory_region_add_subregion(sysmem, 0, machine->ram);
   2867 
   2868     /* always allocate the device memory information */
   2869     machine->device_memory = g_malloc0(sizeof(*machine->device_memory));
   2870 
   2871     /* initialize hotplug memory address space */
   2872     if (machine->ram_size < machine->maxram_size) {
   2873         ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size;
   2874         /*
   2875          * Limit the number of hotpluggable memory slots to half the number
   2876          * slots that KVM supports, leaving the other half for PCI and other
   2877          * devices. However ensure that number of slots doesn't drop below 32.
   2878          */
   2879         int max_memslots = kvm_enabled() ? kvm_get_max_memslots() / 2 :
   2880                            SPAPR_MAX_RAM_SLOTS;
   2881 
   2882         if (max_memslots < SPAPR_MAX_RAM_SLOTS) {
   2883             max_memslots = SPAPR_MAX_RAM_SLOTS;
   2884         }
   2885         if (machine->ram_slots > max_memslots) {
   2886             error_report("Specified number of memory slots %"
   2887                          PRIu64" exceeds max supported %d",
   2888                          machine->ram_slots, max_memslots);
   2889             exit(1);
   2890         }
   2891 
   2892         machine->device_memory->base = ROUND_UP(machine->ram_size,
   2893                                                 SPAPR_DEVICE_MEM_ALIGN);
   2894         memory_region_init(&machine->device_memory->mr, OBJECT(spapr),
   2895                            "device-memory", device_mem_size);
   2896         memory_region_add_subregion(sysmem, machine->device_memory->base,
   2897                                     &machine->device_memory->mr);
   2898     }
   2899 
   2900     if (smc->dr_lmb_enabled) {
   2901         spapr_create_lmb_dr_connectors(spapr);
   2902     }
   2903 
   2904     if (spapr_get_cap(spapr, SPAPR_CAP_FWNMI) == SPAPR_CAP_ON) {
   2905         /* Create the error string for live migration blocker */
   2906         error_setg(&spapr->fwnmi_migration_blocker,
   2907             "A machine check is being handled during migration. The handler"
   2908             "may run and log hardware error on the destination");
   2909     }
   2910 
   2911     if (mc->nvdimm_supported) {
   2912         spapr_create_nvdimm_dr_connectors(spapr);
   2913     }
   2914 
   2915     /* Set up RTAS event infrastructure */
   2916     spapr_events_init(spapr);
   2917 
   2918     /* Set up the RTC RTAS interfaces */
   2919     spapr_rtc_create(spapr);
   2920 
   2921     /* Set up VIO bus */
   2922     spapr->vio_bus = spapr_vio_bus_init();
   2923 
   2924     for (i = 0; serial_hd(i); i++) {
   2925         spapr_vty_create(spapr->vio_bus, serial_hd(i));
   2926     }
   2927 
   2928     /* We always have at least the nvram device on VIO */
   2929     spapr_create_nvram(spapr);
   2930 
   2931     /*
   2932      * Setup hotplug / dynamic-reconfiguration connectors. top-level
   2933      * connectors (described in root DT node's "ibm,drc-types" property)
   2934      * are pre-initialized here. additional child connectors (such as
   2935      * connectors for a PHBs PCI slots) are added as needed during their
   2936      * parent's realization.
   2937      */
   2938     if (smc->dr_phb_enabled) {
   2939         for (i = 0; i < SPAPR_MAX_PHBS; i++) {
   2940             spapr_dr_connector_new(OBJECT(machine), TYPE_SPAPR_DRC_PHB, i);
   2941         }
   2942     }
   2943 
   2944     /* Set up PCI */
   2945     spapr_pci_rtas_init();
   2946 
   2947     phb = spapr_create_default_phb();
   2948 
   2949     for (i = 0; i < nb_nics; i++) {
   2950         NICInfo *nd = &nd_table[i];
   2951 
   2952         if (!nd->model) {
   2953             nd->model = g_strdup("spapr-vlan");
   2954         }
   2955 
   2956         if (g_str_equal(nd->model, "spapr-vlan") ||
   2957             g_str_equal(nd->model, "ibmveth")) {
   2958             spapr_vlan_create(spapr->vio_bus, nd);
   2959         } else {
   2960             pci_nic_init_nofail(&nd_table[i], phb->bus, nd->model, NULL);
   2961         }
   2962     }
   2963 
   2964     for (i = 0; i <= drive_get_max_bus(IF_SCSI); i++) {
   2965         spapr_vscsi_create(spapr->vio_bus);
   2966     }
   2967 
   2968     /* Graphics */
   2969     has_vga = spapr_vga_init(phb->bus, &error_fatal);
   2970     if (has_vga) {
   2971         spapr->want_stdout_path = !machine->enable_graphics;
   2972         machine->usb |= defaults_enabled() && !machine->usb_disabled;
   2973     } else {
   2974         spapr->want_stdout_path = true;
   2975     }
   2976 
   2977     if (machine->usb) {
   2978         if (smc->use_ohci_by_default) {
   2979             pci_create_simple(phb->bus, -1, "pci-ohci");
   2980         } else {
   2981             pci_create_simple(phb->bus, -1, "nec-usb-xhci");
   2982         }
   2983 
   2984         if (has_vga) {
   2985             USBBus *usb_bus = usb_bus_find(-1);
   2986 
   2987             usb_create_simple(usb_bus, "usb-kbd");
   2988             usb_create_simple(usb_bus, "usb-mouse");
   2989         }
   2990     }
   2991 
   2992     if (kernel_filename) {
   2993         uint64_t loaded_addr = 0;
   2994 
   2995         spapr->kernel_size = load_elf(kernel_filename, NULL,
   2996                                       translate_kernel_address, spapr,
   2997                                       NULL, &loaded_addr, NULL, NULL, 1,
   2998                                       PPC_ELF_MACHINE, 0, 0);
   2999         if (spapr->kernel_size == ELF_LOAD_WRONG_ENDIAN) {
   3000             spapr->kernel_size = load_elf(kernel_filename, NULL,
   3001                                           translate_kernel_address, spapr,
   3002                                           NULL, &loaded_addr, NULL, NULL, 0,
   3003                                           PPC_ELF_MACHINE, 0, 0);
   3004             spapr->kernel_le = spapr->kernel_size > 0;
   3005         }
   3006         if (spapr->kernel_size < 0) {
   3007             error_report("error loading %s: %s", kernel_filename,
   3008                          load_elf_strerror(spapr->kernel_size));
   3009             exit(1);
   3010         }
   3011 
   3012         if (spapr->kernel_addr != loaded_addr) {
   3013             warn_report("spapr: kernel_addr changed from 0x%"PRIx64
   3014                         " to 0x%"PRIx64,
   3015                         spapr->kernel_addr, loaded_addr);
   3016             spapr->kernel_addr = loaded_addr;
   3017         }
   3018 
   3019         /* load initrd */
   3020         if (initrd_filename) {
   3021             /* Try to locate the initrd in the gap between the kernel
   3022              * and the firmware. Add a bit of space just in case
   3023              */
   3024             spapr->initrd_base = (spapr->kernel_addr + spapr->kernel_size
   3025                                   + 0x1ffff) & ~0xffff;
   3026             spapr->initrd_size = load_image_targphys(initrd_filename,
   3027                                                      spapr->initrd_base,
   3028                                                      load_limit
   3029                                                      - spapr->initrd_base);
   3030             if (spapr->initrd_size < 0) {
   3031                 error_report("could not load initial ram disk '%s'",
   3032                              initrd_filename);
   3033                 exit(1);
   3034             }
   3035         }
   3036     }
   3037 
   3038     /* FIXME: Should register things through the MachineState's qdev
   3039      * interface, this is a legacy from the sPAPREnvironment structure
   3040      * which predated MachineState but had a similar function */
   3041     vmstate_register(NULL, 0, &vmstate_spapr, spapr);
   3042     register_savevm_live("spapr/htab", VMSTATE_INSTANCE_ID_ANY, 1,
   3043                          &savevm_htab_handlers, spapr);
   3044 
   3045     qbus_set_hotplug_handler(sysbus_get_default(), OBJECT(machine));
   3046 
   3047     qemu_register_boot_set(spapr_boot_set, spapr);
   3048 
   3049     /*
   3050      * Nothing needs to be done to resume a suspended guest because
   3051      * suspending does not change the machine state, so no need for
   3052      * a ->wakeup method.
   3053      */
   3054     qemu_register_wakeup_support();
   3055 
   3056     if (kvm_enabled()) {
   3057         /* to stop and start vmclock */
   3058         qemu_add_vm_change_state_handler(cpu_ppc_clock_vm_state_change,
   3059                                          &spapr->tb);
   3060 
   3061         kvmppc_spapr_enable_inkernel_multitce();
   3062     }
   3063 
   3064     qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
   3065     if (spapr->vof) {
   3066         spapr->vof->fw_size = fw_size; /* for claim() on itself */
   3067         spapr_register_hypercall(KVMPPC_H_VOF_CLIENT, spapr_h_vof_client);
   3068     }
   3069 
   3070     spapr_watchdog_init(spapr);
   3071 }
   3072 
   3073 #define DEFAULT_KVM_TYPE "auto"
   3074 static int spapr_kvm_type(MachineState *machine, const char *vm_type)
   3075 {
   3076     /*
   3077      * The use of g_ascii_strcasecmp() for 'hv' and 'pr' is to
   3078      * accomodate the 'HV' and 'PV' formats that exists in the
   3079      * wild. The 'auto' mode is being introduced already as
   3080      * lower-case, thus we don't need to bother checking for
   3081      * "AUTO".
   3082      */
   3083     if (!vm_type || !strcmp(vm_type, DEFAULT_KVM_TYPE)) {
   3084         return 0;
   3085     }
   3086 
   3087     if (!g_ascii_strcasecmp(vm_type, "hv")) {
   3088         return 1;
   3089     }
   3090 
   3091     if (!g_ascii_strcasecmp(vm_type, "pr")) {
   3092         return 2;
   3093     }
   3094 
   3095     error_report("Unknown kvm-type specified '%s'", vm_type);
   3096     exit(1);
   3097 }
   3098 
   3099 /*
   3100  * Implementation of an interface to adjust firmware path
   3101  * for the bootindex property handling.
   3102  */
   3103 static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus,
   3104                                    DeviceState *dev)
   3105 {
   3106 #define CAST(type, obj, name) \
   3107     ((type *)object_dynamic_cast(OBJECT(obj), (name)))
   3108     SCSIDevice *d = CAST(SCSIDevice,  dev, TYPE_SCSI_DEVICE);
   3109     SpaprPhbState *phb = CAST(SpaprPhbState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE);
   3110     VHostSCSICommon *vsc = CAST(VHostSCSICommon, dev, TYPE_VHOST_SCSI_COMMON);
   3111     PCIDevice *pcidev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE);
   3112 
   3113     if (d && bus) {
   3114         void *spapr = CAST(void, bus->parent, "spapr-vscsi");
   3115         VirtIOSCSI *virtio = CAST(VirtIOSCSI, bus->parent, TYPE_VIRTIO_SCSI);
   3116         USBDevice *usb = CAST(USBDevice, bus->parent, TYPE_USB_DEVICE);
   3117 
   3118         if (spapr) {
   3119             /*
   3120              * Replace "channel@0/disk@0,0" with "disk@8000000000000000":
   3121              * In the top 16 bits of the 64-bit LUN, we use SRP luns of the form
   3122              * 0x8000 | (target << 8) | (bus << 5) | lun
   3123              * (see the "Logical unit addressing format" table in SAM5)
   3124              */
   3125             unsigned id = 0x8000 | (d->id << 8) | (d->channel << 5) | d->lun;
   3126             return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
   3127                                    (uint64_t)id << 48);
   3128         } else if (virtio) {
   3129             /*
   3130              * We use SRP luns of the form 01000000 | (target << 8) | lun
   3131              * in the top 32 bits of the 64-bit LUN
   3132              * Note: the quote above is from SLOF and it is wrong,
   3133              * the actual binding is:
   3134              * swap 0100 or 10 << or 20 << ( target lun-id -- srplun )
   3135              */
   3136             unsigned id = 0x1000000 | (d->id << 16) | d->lun;
   3137             if (d->lun >= 256) {
   3138                 /* Use the LUN "flat space addressing method" */
   3139                 id |= 0x4000;
   3140             }
   3141             return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
   3142                                    (uint64_t)id << 32);
   3143         } else if (usb) {
   3144             /*
   3145              * We use SRP luns of the form 01000000 | (usb-port << 16) | lun
   3146              * in the top 32 bits of the 64-bit LUN
   3147              */
   3148             unsigned usb_port = atoi(usb->port->path);
   3149             unsigned id = 0x1000000 | (usb_port << 16) | d->lun;
   3150             return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
   3151                                    (uint64_t)id << 32);
   3152         }
   3153     }
   3154 
   3155     /*
   3156      * SLOF probes the USB devices, and if it recognizes that the device is a
   3157      * storage device, it changes its name to "storage" instead of "usb-host",
   3158      * and additionally adds a child node for the SCSI LUN, so the correct
   3159      * boot path in SLOF is something like .../storage@1/disk@xxx" instead.
   3160      */
   3161     if (strcmp("usb-host", qdev_fw_name(dev)) == 0) {
   3162         USBDevice *usbdev = CAST(USBDevice, dev, TYPE_USB_DEVICE);
   3163         if (usb_device_is_scsi_storage(usbdev)) {
   3164             return g_strdup_printf("storage@%s/disk", usbdev->port->path);
   3165         }
   3166     }
   3167 
   3168     if (phb) {
   3169         /* Replace "pci" with "pci@800000020000000" */
   3170         return g_strdup_printf("pci@%"PRIX64, phb->buid);
   3171     }
   3172 
   3173     if (vsc) {
   3174         /* Same logic as virtio above */
   3175         unsigned id = 0x1000000 | (vsc->target << 16) | vsc->lun;
   3176         return g_strdup_printf("disk@%"PRIX64, (uint64_t)id << 32);
   3177     }
   3178 
   3179     if (g_str_equal("pci-bridge", qdev_fw_name(dev))) {
   3180         /* SLOF uses "pci" instead of "pci-bridge" for PCI bridges */
   3181         PCIDevice *pcidev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE);
   3182         return g_strdup_printf("pci@%x", PCI_SLOT(pcidev->devfn));
   3183     }
   3184 
   3185     if (pcidev) {
   3186         return spapr_pci_fw_dev_name(pcidev);
   3187     }
   3188 
   3189     return NULL;
   3190 }
   3191 
   3192 static char *spapr_get_kvm_type(Object *obj, Error **errp)
   3193 {
   3194     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
   3195 
   3196     return g_strdup(spapr->kvm_type);
   3197 }
   3198 
   3199 static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp)
   3200 {
   3201     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
   3202 
   3203     g_free(spapr->kvm_type);
   3204     spapr->kvm_type = g_strdup(value);
   3205 }
   3206 
   3207 static bool spapr_get_modern_hotplug_events(Object *obj, Error **errp)
   3208 {
   3209     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
   3210 
   3211     return spapr->use_hotplug_event_source;
   3212 }
   3213 
   3214 static void spapr_set_modern_hotplug_events(Object *obj, bool value,
   3215                                             Error **errp)
   3216 {
   3217     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
   3218 
   3219     spapr->use_hotplug_event_source = value;
   3220 }
   3221 
   3222 static bool spapr_get_msix_emulation(Object *obj, Error **errp)
   3223 {
   3224     return true;
   3225 }
   3226 
   3227 static char *spapr_get_resize_hpt(Object *obj, Error **errp)
   3228 {
   3229     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
   3230 
   3231     switch (spapr->resize_hpt) {
   3232     case SPAPR_RESIZE_HPT_DEFAULT:
   3233         return g_strdup("default");
   3234     case SPAPR_RESIZE_HPT_DISABLED:
   3235         return g_strdup("disabled");
   3236     case SPAPR_RESIZE_HPT_ENABLED:
   3237         return g_strdup("enabled");
   3238     case SPAPR_RESIZE_HPT_REQUIRED:
   3239         return g_strdup("required");
   3240     }
   3241     g_assert_not_reached();
   3242 }
   3243 
   3244 static void spapr_set_resize_hpt(Object *obj, const char *value, Error **errp)
   3245 {
   3246     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
   3247 
   3248     if (strcmp(value, "default") == 0) {
   3249         spapr->resize_hpt = SPAPR_RESIZE_HPT_DEFAULT;
   3250     } else if (strcmp(value, "disabled") == 0) {
   3251         spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED;
   3252     } else if (strcmp(value, "enabled") == 0) {
   3253         spapr->resize_hpt = SPAPR_RESIZE_HPT_ENABLED;
   3254     } else if (strcmp(value, "required") == 0) {
   3255         spapr->resize_hpt = SPAPR_RESIZE_HPT_REQUIRED;
   3256     } else {
   3257         error_setg(errp, "Bad value for \"resize-hpt\" property");
   3258     }
   3259 }
   3260 
   3261 static bool spapr_get_vof(Object *obj, Error **errp)
   3262 {
   3263     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
   3264 
   3265     return spapr->vof != NULL;
   3266 }
   3267 
   3268 static void spapr_set_vof(Object *obj, bool value, Error **errp)
   3269 {
   3270     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
   3271 
   3272     if (spapr->vof) {
   3273         vof_cleanup(spapr->vof);
   3274         g_free(spapr->vof);
   3275         spapr->vof = NULL;
   3276     }
   3277     if (!value) {
   3278         return;
   3279     }
   3280     spapr->vof = g_malloc0(sizeof(*spapr->vof));
   3281 }
   3282 
   3283 static char *spapr_get_ic_mode(Object *obj, Error **errp)
   3284 {
   3285     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
   3286 
   3287     if (spapr->irq == &spapr_irq_xics_legacy) {
   3288         return g_strdup("legacy");
   3289     } else if (spapr->irq == &spapr_irq_xics) {
   3290         return g_strdup("xics");
   3291     } else if (spapr->irq == &spapr_irq_xive) {
   3292         return g_strdup("xive");
   3293     } else if (spapr->irq == &spapr_irq_dual) {
   3294         return g_strdup("dual");
   3295     }
   3296     g_assert_not_reached();
   3297 }
   3298 
   3299 static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp)
   3300 {
   3301     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
   3302 
   3303     if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
   3304         error_setg(errp, "This machine only uses the legacy XICS backend, don't pass ic-mode");
   3305         return;
   3306     }
   3307 
   3308     /* The legacy IRQ backend can not be set */
   3309     if (strcmp(value, "xics") == 0) {
   3310         spapr->irq = &spapr_irq_xics;
   3311     } else if (strcmp(value, "xive") == 0) {
   3312         spapr->irq = &spapr_irq_xive;
   3313     } else if (strcmp(value, "dual") == 0) {
   3314         spapr->irq = &spapr_irq_dual;
   3315     } else {
   3316         error_setg(errp, "Bad value for \"ic-mode\" property");
   3317     }
   3318 }
   3319 
   3320 static char *spapr_get_host_model(Object *obj, Error **errp)
   3321 {
   3322     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
   3323 
   3324     return g_strdup(spapr->host_model);
   3325 }
   3326 
   3327 static void spapr_set_host_model(Object *obj, const char *value, Error **errp)
   3328 {
   3329     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
   3330 
   3331     g_free(spapr->host_model);
   3332     spapr->host_model = g_strdup(value);
   3333 }
   3334 
   3335 static char *spapr_get_host_serial(Object *obj, Error **errp)
   3336 {
   3337     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
   3338 
   3339     return g_strdup(spapr->host_serial);
   3340 }
   3341 
   3342 static void spapr_set_host_serial(Object *obj, const char *value, Error **errp)
   3343 {
   3344     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
   3345 
   3346     g_free(spapr->host_serial);
   3347     spapr->host_serial = g_strdup(value);
   3348 }
   3349 
/*
 * Per-instance initializer for the sPAPR machine object.
 *
 * Sets the instance defaults (htab_fd, hotplug event source, kvm_type,
 * kernel_addr, irq backend) and registers the machine's user-visible
 * properties together with their help text:
 *   kvm-type, modern-hotplug-events, max-cpu-compat, resize-hpt, vsmt,
 *   vfio-no-msix-emulation (getter only), kernel-addr, x-vof, ic-mode,
 *   host-model and host-serial.
 */
static void spapr_instance_init(Object *obj)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
    MachineState *ms = MACHINE(spapr);
    MachineClass *mc = MACHINE_GET_CLASS(ms);

    /*
     * NVDIMM support went live in 5.1 without considering that, in
     * other archs, the user needs to enable NVDIMM support with the
     * 'nvdimm' machine option and the default behavior is NVDIMM
     * support disabled. It is too late to roll back to the standard
     * behavior without breaking 5.1 guests.
     */
    if (mc->nvdimm_supported) {
        ms->nvdimms_state->is_enabled = true;
    }

    /* Instance defaults; kvm_type is heap-allocated and owned by spapr */
    spapr->htab_fd = -1;
    spapr->use_hotplug_event_source = true;
    spapr->kvm_type = g_strdup(DEFAULT_KVM_TYPE);
    object_property_add_str(obj, "kvm-type",
                            spapr_get_kvm_type, spapr_set_kvm_type);
    object_property_set_description(obj, "kvm-type",
                                    "Specifies the KVM virtualization mode (auto,"
                                    " hv, pr). Defaults to 'auto'. This mode will use"
                                    " any available KVM module loaded in the host,"
                                    " where kvm_hv takes precedence if both kvm_hv and"
                                    " kvm_pr are loaded.");
    object_property_add_bool(obj, "modern-hotplug-events",
                            spapr_get_modern_hotplug_events,
                            spapr_set_modern_hotplug_events);
    object_property_set_description(obj, "modern-hotplug-events",
                                    "Use dedicated hotplug event mechanism in"
                                    " place of standard EPOW events when possible"
                                    " (required for memory hot-unplug support)");
    ppc_compat_add_property(obj, "max-cpu-compat", &spapr->max_compat_pvr,
                            "Maximum permitted CPU compatibility mode");

    object_property_add_str(obj, "resize-hpt",
                            spapr_get_resize_hpt, spapr_set_resize_hpt);
    object_property_set_description(obj, "resize-hpt",
                                    "Resizing of the Hash Page Table (enabled, disabled, required)");
    /* vsmt is exposed directly through a pointer to the field */
    object_property_add_uint32_ptr(obj, "vsmt",
                                   &spapr->vsmt, OBJ_PROP_FLAG_READWRITE);
    object_property_set_description(obj, "vsmt",
                                    "Virtual SMT: KVM behaves as if this were"
                                    " the host's SMT mode");

    /* Read-only property: no setter is registered */
    object_property_add_bool(obj, "vfio-no-msix-emulation",
                             spapr_get_msix_emulation, NULL);

    object_property_add_uint64_ptr(obj, "kernel-addr",
                                   &spapr->kernel_addr, OBJ_PROP_FLAG_READWRITE);
    object_property_set_description(obj, "kernel-addr",
                                    stringify(KERNEL_LOAD_ADDR)
                                    " for -kernel is the default");
    spapr->kernel_addr = KERNEL_LOAD_ADDR;

    object_property_add_bool(obj, "x-vof", spapr_get_vof, spapr_set_vof);
    object_property_set_description(obj, "x-vof",
                                    "Enable Virtual Open Firmware (experimental)");

    /* The machine class defines the default interrupt controller mode */
    spapr->irq = smc->irq;
    object_property_add_str(obj, "ic-mode", spapr_get_ic_mode,
                            spapr_set_ic_mode);
    object_property_set_description(obj, "ic-mode",
                 "Specifies the interrupt controller mode (xics, xive, dual)");

    object_property_add_str(obj, "host-model",
        spapr_get_host_model, spapr_set_host_model);
    object_property_set_description(obj, "host-model",
        "Host model to advertise in guest device tree");
    object_property_add_str(obj, "host-serial",
        spapr_get_host_serial, spapr_set_host_serial);
    object_property_set_description(obj, "host-serial",
        "Host serial number to advertise in guest device tree");
}
   3429 
   3430 static void spapr_machine_finalizefn(Object *obj)
   3431 {
   3432     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
   3433 
   3434     g_free(spapr->kvm_type);
   3435 }
   3436 
/*
 * Deliver a system reset to the CPU @cs (signature matches the
 * run_on_cpu callbacks; @arg is unused).
 *
 * When the machine's fwnmi_system_reset_addr is not -1, the current r3
 * is first saved into a per-CPU, two-doubleword slot located in guest
 * memory after the RTAS error log area, r3 is pointed at that slot, and
 * after the reset the CPU's nip is redirected to the registered address.
 * If the RTAS address cannot be obtained, the guest is reported as
 * panicked and no reset is delivered.
 */
void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
    PowerPCCPU *cpu = POWERPC_CPU(cs);
    CPUPPCState *env = &cpu->env;

    cpu_synchronize_state(cs);
    /* If FWNMI is inactive, addr will be -1, which will deliver to 0x100 */
    if (spapr->fwnmi_system_reset_addr != -1) {
        uint64_t rtas_addr, addr;

        /* get rtas addr from fdt */
        rtas_addr = spapr_get_rtas_addr();
        if (!rtas_addr) {
            qemu_system_guest_panicked(NULL);
            return;
        }

        /* Slot for this CPU: 16 bytes each, indexed by cpu_index */
        addr = rtas_addr + RTAS_ERROR_LOG_MAX + cs->cpu_index * sizeof(uint64_t)*2;
        /* First doubleword holds the original r3, second one is zeroed */
        stq_be_phys(&address_space_memory, addr, env->gpr[3]);
        stq_be_phys(&address_space_memory, addr + sizeof(uint64_t), 0);
        env->gpr[3] = addr;
    }
    ppc_cpu_do_system_reset(cs);
    /* Override the reset vector with the address registered for FWNMI */
    if (spapr->fwnmi_system_reset_addr != -1) {
        env->nip = spapr->fwnmi_system_reset_addr;
    }
}
   3465 
   3466 static void spapr_nmi(NMIState *n, int cpu_index, Error **errp)
   3467 {
   3468     CPUState *cs;
   3469 
   3470     CPU_FOREACH(cs) {
   3471         async_run_on_cpu(cs, spapr_do_system_reset_on_cpu, RUN_ON_CPU_NULL);
   3472     }
   3473 }
   3474 
   3475 int spapr_lmb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
   3476                           void *fdt, int *fdt_start_offset, Error **errp)
   3477 {
   3478     uint64_t addr;
   3479     uint32_t node;
   3480 
   3481     addr = spapr_drc_index(drc) * SPAPR_MEMORY_BLOCK_SIZE;
   3482     node = object_property_get_uint(OBJECT(drc->dev), PC_DIMM_NODE_PROP,
   3483                                     &error_abort);
   3484     *fdt_start_offset = spapr_dt_memory_node(spapr, fdt, node, addr,
   3485                                              SPAPR_MEMORY_BLOCK_SIZE);
   3486     return 0;
   3487 }
   3488 
   3489 static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
   3490                            bool dedicated_hp_event_source)
   3491 {
   3492     SpaprDrc *drc;
   3493     uint32_t nr_lmbs = size/SPAPR_MEMORY_BLOCK_SIZE;
   3494     int i;
   3495     uint64_t addr = addr_start;
   3496     bool hotplugged = spapr_drc_hotplugged(dev);
   3497 
   3498     for (i = 0; i < nr_lmbs; i++) {
   3499         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
   3500                               addr / SPAPR_MEMORY_BLOCK_SIZE);
   3501         g_assert(drc);
   3502 
   3503         /*
   3504          * memory_device_get_free_addr() provided a range of free addresses
   3505          * that doesn't overlap with any existing mapping at pre-plug. The
   3506          * corresponding LMB DRCs are thus assumed to be all attachable.
   3507          */
   3508         spapr_drc_attach(drc, dev);
   3509         if (!hotplugged) {
   3510             spapr_drc_reset(drc);
   3511         }
   3512         addr += SPAPR_MEMORY_BLOCK_SIZE;
   3513     }
   3514     /* send hotplug notification to the
   3515      * guest only in case of hotplugged memory
   3516      */
   3517     if (hotplugged) {
   3518         if (dedicated_hp_event_source) {
   3519             drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
   3520                                   addr_start / SPAPR_MEMORY_BLOCK_SIZE);
   3521             g_assert(drc);
   3522             spapr_hotplug_req_add_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
   3523                                                    nr_lmbs,
   3524                                                    spapr_drc_index(drc));
   3525         } else {
   3526             spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB,
   3527                                            nr_lmbs);
   3528         }
   3529     }
   3530 }
   3531 
   3532 static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
   3533 {
   3534     SpaprMachineState *ms = SPAPR_MACHINE(hotplug_dev);
   3535     PCDIMMDevice *dimm = PC_DIMM(dev);
   3536     uint64_t size, addr;
   3537     int64_t slot;
   3538     bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
   3539 
   3540     size = memory_device_get_region_size(MEMORY_DEVICE(dev), &error_abort);
   3541 
   3542     pc_dimm_plug(dimm, MACHINE(ms));
   3543 
   3544     if (!is_nvdimm) {
   3545         addr = object_property_get_uint(OBJECT(dimm),
   3546                                         PC_DIMM_ADDR_PROP, &error_abort);
   3547         spapr_add_lmbs(dev, addr, size,
   3548                        spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT));
   3549     } else {
   3550         slot = object_property_get_int(OBJECT(dimm),
   3551                                        PC_DIMM_SLOT_PROP, &error_abort);
   3552         /* We should have valid slot number at this point */
   3553         g_assert(slot >= 0);
   3554         spapr_add_nvdimm(dev, slot);
   3555     }
   3556 }
   3557 
   3558 static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
   3559                                   Error **errp)
   3560 {
   3561     const SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(hotplug_dev);
   3562     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
   3563     bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
   3564     PCDIMMDevice *dimm = PC_DIMM(dev);
   3565     Error *local_err = NULL;
   3566     uint64_t size;
   3567     Object *memdev;
   3568     hwaddr pagesize;
   3569 
   3570     if (!smc->dr_lmb_enabled) {
   3571         error_setg(errp, "Memory hotplug not supported for this machine");
   3572         return;
   3573     }
   3574 
   3575     size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &local_err);
   3576     if (local_err) {
   3577         error_propagate(errp, local_err);
   3578         return;
   3579     }
   3580 
   3581     if (is_nvdimm) {
   3582         if (!spapr_nvdimm_validate(hotplug_dev, NVDIMM(dev), size, errp)) {
   3583             return;
   3584         }
   3585     } else if (size % SPAPR_MEMORY_BLOCK_SIZE) {
   3586         error_setg(errp, "Hotplugged memory size must be a multiple of "
   3587                    "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
   3588         return;
   3589     }
   3590 
   3591     memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP,
   3592                                       &error_abort);
   3593     pagesize = host_memory_backend_pagesize(MEMORY_BACKEND(memdev));
   3594     if (!spapr_check_pagesize(spapr, pagesize, errp)) {
   3595         return;
   3596     }
   3597 
   3598     pc_dimm_pre_plug(dimm, MACHINE(hotplug_dev), NULL, errp);
   3599 }
   3600 
/*
 * Bookkeeping entry for a DIMM with an unplug in progress.  Entries are
 * linked into the machine's pending_dimm_unplugs list.
 */
struct SpaprDimmState {
    PCDIMMDevice *dimm;   /* DIMM being unplugged; used as the lookup key */
    uint32_t nr_lmbs;     /* LMBs still to be released (decremented per release) */
    QTAILQ_ENTRY(SpaprDimmState) next;   /* list linkage */
};
   3606 
   3607 static SpaprDimmState *spapr_pending_dimm_unplugs_find(SpaprMachineState *s,
   3608                                                        PCDIMMDevice *dimm)
   3609 {
   3610     SpaprDimmState *dimm_state = NULL;
   3611 
   3612     QTAILQ_FOREACH(dimm_state, &s->pending_dimm_unplugs, next) {
   3613         if (dimm_state->dimm == dimm) {
   3614             break;
   3615         }
   3616     }
   3617     return dimm_state;
   3618 }
   3619 
   3620 static SpaprDimmState *spapr_pending_dimm_unplugs_add(SpaprMachineState *spapr,
   3621                                                       uint32_t nr_lmbs,
   3622                                                       PCDIMMDevice *dimm)
   3623 {
   3624     SpaprDimmState *ds = NULL;
   3625 
   3626     /*
   3627      * If this request is for a DIMM whose removal had failed earlier
   3628      * (due to guest's refusal to remove the LMBs), we would have this
   3629      * dimm already in the pending_dimm_unplugs list. In that
   3630      * case don't add again.
   3631      */
   3632     ds = spapr_pending_dimm_unplugs_find(spapr, dimm);
   3633     if (!ds) {
   3634         ds = g_new0(SpaprDimmState, 1);
   3635         ds->nr_lmbs = nr_lmbs;
   3636         ds->dimm = dimm;
   3637         QTAILQ_INSERT_HEAD(&spapr->pending_dimm_unplugs, ds, next);
   3638     }
   3639     return ds;
   3640 }
   3641 
   3642 static void spapr_pending_dimm_unplugs_remove(SpaprMachineState *spapr,
   3643                                               SpaprDimmState *dimm_state)
   3644 {
   3645     QTAILQ_REMOVE(&spapr->pending_dimm_unplugs, dimm_state, next);
   3646     g_free(dimm_state);
   3647 }
   3648 
   3649 static SpaprDimmState *spapr_recover_pending_dimm_state(SpaprMachineState *ms,
   3650                                                         PCDIMMDevice *dimm)
   3651 {
   3652     SpaprDrc *drc;
   3653     uint64_t size = memory_device_get_region_size(MEMORY_DEVICE(dimm),
   3654                                                   &error_abort);
   3655     uint32_t nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
   3656     uint32_t avail_lmbs = 0;
   3657     uint64_t addr_start, addr;
   3658     int i;
   3659 
   3660     addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
   3661                                           &error_abort);
   3662 
   3663     addr = addr_start;
   3664     for (i = 0; i < nr_lmbs; i++) {
   3665         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
   3666                               addr / SPAPR_MEMORY_BLOCK_SIZE);
   3667         g_assert(drc);
   3668         if (drc->dev) {
   3669             avail_lmbs++;
   3670         }
   3671         addr += SPAPR_MEMORY_BLOCK_SIZE;
   3672     }
   3673 
   3674     return spapr_pending_dimm_unplugs_add(ms, avail_lmbs, dimm);
   3675 }
   3676 
   3677 void spapr_memory_unplug_rollback(SpaprMachineState *spapr, DeviceState *dev)
   3678 {
   3679     SpaprDimmState *ds;
   3680     PCDIMMDevice *dimm;
   3681     SpaprDrc *drc;
   3682     uint32_t nr_lmbs;
   3683     uint64_t size, addr_start, addr;
   3684     g_autofree char *qapi_error = NULL;
   3685     int i;
   3686 
   3687     if (!dev) {
   3688         return;
   3689     }
   3690 
   3691     dimm = PC_DIMM(dev);
   3692     ds = spapr_pending_dimm_unplugs_find(spapr, dimm);
   3693 
   3694     /*
   3695      * 'ds == NULL' would mean that the DIMM doesn't have a pending
   3696      * unplug state, but one of its DRC is marked as unplug_requested.
   3697      * This is bad and weird enough to g_assert() out.
   3698      */
   3699     g_assert(ds);
   3700 
   3701     spapr_pending_dimm_unplugs_remove(spapr, ds);
   3702 
   3703     size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &error_abort);
   3704     nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
   3705 
   3706     addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
   3707                                           &error_abort);
   3708 
   3709     addr = addr_start;
   3710     for (i = 0; i < nr_lmbs; i++) {
   3711         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
   3712                               addr / SPAPR_MEMORY_BLOCK_SIZE);
   3713         g_assert(drc);
   3714 
   3715         drc->unplug_requested = false;
   3716         addr += SPAPR_MEMORY_BLOCK_SIZE;
   3717     }
   3718 
   3719     /*
   3720      * Tell QAPI that something happened and the memory
   3721      * hotunplug wasn't successful. Keep sending
   3722      * MEM_UNPLUG_ERROR even while sending
   3723      * DEVICE_UNPLUG_GUEST_ERROR until the deprecation of
   3724      * MEM_UNPLUG_ERROR is due.
   3725      */
   3726     qapi_error = g_strdup_printf("Memory hotunplug rejected by the guest "
   3727                                  "for device %s", dev->id);
   3728 
   3729     qapi_event_send_mem_unplug_error(dev->id ? : "", qapi_error);
   3730 
   3731     qapi_event_send_device_unplug_guest_error(!!dev->id, dev->id,
   3732                                               dev->canonical_path);
   3733 }
   3734 
   3735 /* Callback to be called during DRC release. */
   3736 void spapr_lmb_release(DeviceState *dev)
   3737 {
   3738     HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
   3739     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_ctrl);
   3740     SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev));
   3741 
   3742     /* This information will get lost if a migration occurs
   3743      * during the unplug process. In this case recover it. */
   3744     if (ds == NULL) {
   3745         ds = spapr_recover_pending_dimm_state(spapr, PC_DIMM(dev));
   3746         g_assert(ds);
   3747         /* The DRC being examined by the caller at least must be counted */
   3748         g_assert(ds->nr_lmbs);
   3749     }
   3750 
   3751     if (--ds->nr_lmbs) {
   3752         return;
   3753     }
   3754 
   3755     /*
   3756      * Now that all the LMBs have been removed by the guest, call the
   3757      * unplug handler chain. This can never fail.
   3758      */
   3759     hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
   3760     object_unparent(OBJECT(dev));
   3761 }
   3762 
   3763 static void spapr_memory_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
   3764 {
   3765     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
   3766     SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev));
   3767 
   3768     /* We really shouldn't get this far without anything to unplug */
   3769     g_assert(ds);
   3770 
   3771     pc_dimm_unplug(PC_DIMM(dev), MACHINE(hotplug_dev));
   3772     qdev_unrealize(dev);
   3773     spapr_pending_dimm_unplugs_remove(spapr, ds);
   3774 }
   3775 
   3776 static void spapr_memory_unplug_request(HotplugHandler *hotplug_dev,
   3777                                         DeviceState *dev, Error **errp)
   3778 {
   3779     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
   3780     PCDIMMDevice *dimm = PC_DIMM(dev);
   3781     uint32_t nr_lmbs;
   3782     uint64_t size, addr_start, addr;
   3783     int i;
   3784     SpaprDrc *drc;
   3785 
   3786     if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) {
   3787         error_setg(errp, "nvdimm device hot unplug is not supported yet.");
   3788         return;
   3789     }
   3790 
   3791     size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &error_abort);
   3792     nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
   3793 
   3794     addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
   3795                                           &error_abort);
   3796 
   3797     /*
   3798      * An existing pending dimm state for this DIMM means that there is an
   3799      * unplug operation in progress, waiting for the spapr_lmb_release
   3800      * callback to complete the job (BQL can't cover that far). In this case,
   3801      * bail out to avoid detaching DRCs that were already released.
   3802      */
   3803     if (spapr_pending_dimm_unplugs_find(spapr, dimm)) {
   3804         error_setg(errp, "Memory unplug already in progress for device %s",
   3805                    dev->id);
   3806         return;
   3807     }
   3808 
   3809     spapr_pending_dimm_unplugs_add(spapr, nr_lmbs, dimm);
   3810 
   3811     addr = addr_start;
   3812     for (i = 0; i < nr_lmbs; i++) {
   3813         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
   3814                               addr / SPAPR_MEMORY_BLOCK_SIZE);
   3815         g_assert(drc);
   3816 
   3817         spapr_drc_unplug_request(drc);
   3818         addr += SPAPR_MEMORY_BLOCK_SIZE;
   3819     }
   3820 
   3821     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
   3822                           addr_start / SPAPR_MEMORY_BLOCK_SIZE);
   3823     spapr_hotplug_req_remove_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
   3824                                               nr_lmbs, spapr_drc_index(drc));
   3825 }
   3826 
   3827 /* Callback to be called during DRC release. */
   3828 void spapr_core_release(DeviceState *dev)
   3829 {
   3830     HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
   3831 
   3832     /* Call the unplug handler chain. This can never fail. */
   3833     hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
   3834     object_unparent(OBJECT(dev));
   3835 }
   3836 
   3837 static void spapr_core_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
   3838 {
   3839     MachineState *ms = MACHINE(hotplug_dev);
   3840     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(ms);
   3841     CPUCore *cc = CPU_CORE(dev);
   3842     CPUArchId *core_slot = spapr_find_cpu_slot(ms, cc->core_id, NULL);
   3843 
   3844     if (smc->pre_2_10_has_unused_icps) {
   3845         SpaprCpuCore *sc = SPAPR_CPU_CORE(OBJECT(dev));
   3846         int i;
   3847 
   3848         for (i = 0; i < cc->nr_threads; i++) {
   3849             CPUState *cs = CPU(sc->threads[i]);
   3850 
   3851             pre_2_10_vmstate_register_dummy_icp(cs->cpu_index);
   3852         }
   3853     }
   3854 
   3855     assert(core_slot);
   3856     core_slot->cpu = NULL;
   3857     qdev_unrealize(dev);
   3858 }
   3859 
   3860 static
   3861 void spapr_core_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev,
   3862                                Error **errp)
   3863 {
   3864     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
   3865     int index;
   3866     SpaprDrc *drc;
   3867     CPUCore *cc = CPU_CORE(dev);
   3868 
   3869     if (!spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index)) {
   3870         error_setg(errp, "Unable to find CPU core with core-id: %d",
   3871                    cc->core_id);
   3872         return;
   3873     }
   3874     if (index == 0) {
   3875         error_setg(errp, "Boot CPU core may not be unplugged");
   3876         return;
   3877     }
   3878 
   3879     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
   3880                           spapr_vcpu_id(spapr, cc->core_id));
   3881     g_assert(drc);
   3882 
   3883     if (!spapr_drc_unplug_requested(drc)) {
   3884         spapr_drc_unplug_request(drc);
   3885     }
   3886 
   3887     /*
   3888      * spapr_hotplug_req_remove_by_index is left unguarded, out of the
   3889      * "!spapr_drc_unplug_requested" check, to allow for multiple IRQ
   3890      * pulses removing the same CPU. Otherwise, in an failed hotunplug
   3891      * attempt (e.g. the kernel will refuse to remove the last online
   3892      * CPU), we will never attempt it again because unplug_requested
   3893      * will still be 'true' in that case.
   3894      */
   3895     spapr_hotplug_req_remove_by_index(drc);
   3896 }
   3897 
   3898 int spapr_core_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
   3899                            void *fdt, int *fdt_start_offset, Error **errp)
   3900 {
   3901     SpaprCpuCore *core = SPAPR_CPU_CORE(drc->dev);
   3902     CPUState *cs = CPU(core->threads[0]);
   3903     PowerPCCPU *cpu = POWERPC_CPU(cs);
   3904     DeviceClass *dc = DEVICE_GET_CLASS(cs);
   3905     int id = spapr_get_vcpu_id(cpu);
   3906     g_autofree char *nodename = NULL;
   3907     int offset;
   3908 
   3909     nodename = g_strdup_printf("%s@%x", dc->fw_name, id);
   3910     offset = fdt_add_subnode(fdt, 0, nodename);
   3911 
   3912     spapr_dt_cpu(cs, fdt, offset, spapr);
   3913 
   3914     /*
   3915      * spapr_dt_cpu() does not fill the 'name' property in the
   3916      * CPU node. The function is called during boot process, before
   3917      * and after CAS, and overwriting the 'name' property written
   3918      * by SLOF is not allowed.
   3919      *
   3920      * Write it manually after spapr_dt_cpu(). This makes the hotplug
   3921      * CPUs more compatible with the coldplugged ones, which have
   3922      * the 'name' property. Linux Kernel also relies on this
   3923      * property to identify CPU nodes.
   3924      */
   3925     _FDT((fdt_setprop_string(fdt, offset, "name", nodename)));
   3926 
   3927     *fdt_start_offset = offset;
   3928     return 0;
   3929 }
   3930 
   3931 static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
   3932 {
   3933     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
   3934     MachineClass *mc = MACHINE_GET_CLASS(spapr);
   3935     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
   3936     SpaprCpuCore *core = SPAPR_CPU_CORE(OBJECT(dev));
   3937     CPUCore *cc = CPU_CORE(dev);
   3938     CPUState *cs;
   3939     SpaprDrc *drc;
   3940     CPUArchId *core_slot;
   3941     int index;
   3942     bool hotplugged = spapr_drc_hotplugged(dev);
   3943     int i;
   3944 
   3945     core_slot = spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index);
   3946     g_assert(core_slot); /* Already checked in spapr_core_pre_plug() */
   3947 
   3948     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
   3949                           spapr_vcpu_id(spapr, cc->core_id));
   3950 
   3951     g_assert(drc || !mc->has_hotpluggable_cpus);
   3952 
   3953     if (drc) {
   3954         /*
   3955          * spapr_core_pre_plug() already buys us this is a brand new
   3956          * core being plugged into a free slot. Nothing should already
   3957          * be attached to the corresponding DRC.
   3958          */
   3959         spapr_drc_attach(drc, dev);
   3960 
   3961         if (hotplugged) {
   3962             /*
   3963              * Send hotplug notification interrupt to the guest only
   3964              * in case of hotplugged CPUs.
   3965              */
   3966             spapr_hotplug_req_add_by_index(drc);
   3967         } else {
   3968             spapr_drc_reset(drc);
   3969         }
   3970     }
   3971 
   3972     core_slot->cpu = OBJECT(dev);
   3973 
   3974     /*
   3975      * Set compatibility mode to match the boot CPU, which was either set
   3976      * by the machine reset code or by CAS. This really shouldn't fail at
   3977      * this point.
   3978      */
   3979     if (hotplugged) {
   3980         for (i = 0; i < cc->nr_threads; i++) {
   3981             ppc_set_compat(core->threads[i], POWERPC_CPU(first_cpu)->compat_pvr,
   3982                            &error_abort);
   3983         }
   3984     }
   3985 
   3986     if (smc->pre_2_10_has_unused_icps) {
   3987         for (i = 0; i < cc->nr_threads; i++) {
   3988             cs = CPU(core->threads[i]);
   3989             pre_2_10_vmstate_unregister_dummy_icp(cs->cpu_index);
   3990         }
   3991     }
   3992 }
   3993 
   3994 static void spapr_core_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
   3995                                 Error **errp)
   3996 {
   3997     MachineState *machine = MACHINE(OBJECT(hotplug_dev));
   3998     MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev);
   3999     CPUCore *cc = CPU_CORE(dev);
   4000     const char *base_core_type = spapr_get_cpu_core_type(machine->cpu_type);
   4001     const char *type = object_get_typename(OBJECT(dev));
   4002     CPUArchId *core_slot;
   4003     int index;
   4004     unsigned int smp_threads = machine->smp.threads;
   4005 
   4006     if (dev->hotplugged && !mc->has_hotpluggable_cpus) {
   4007         error_setg(errp, "CPU hotplug not supported for this machine");
   4008         return;
   4009     }
   4010 
   4011     if (strcmp(base_core_type, type)) {
   4012         error_setg(errp, "CPU core type should be %s", base_core_type);
   4013         return;
   4014     }
   4015 
   4016     if (cc->core_id % smp_threads) {
   4017         error_setg(errp, "invalid core id %d", cc->core_id);
   4018         return;
   4019     }
   4020 
   4021     /*
   4022      * In general we should have homogeneous threads-per-core, but old
   4023      * (pre hotplug support) machine types allow the last core to have
   4024      * reduced threads as a compatibility hack for when we allowed
   4025      * total vcpus not a multiple of threads-per-core.
   4026      */
   4027     if (mc->has_hotpluggable_cpus && (cc->nr_threads != smp_threads)) {
   4028         error_setg(errp, "invalid nr-threads %d, must be %d", cc->nr_threads,
   4029                    smp_threads);
   4030         return;
   4031     }
   4032 
   4033     core_slot = spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index);
   4034     if (!core_slot) {
   4035         error_setg(errp, "core id %d out of range", cc->core_id);
   4036         return;
   4037     }
   4038 
   4039     if (core_slot->cpu) {
   4040         error_setg(errp, "core %d already populated", cc->core_id);
   4041         return;
   4042     }
   4043 
   4044     numa_cpu_pre_plug(core_slot, dev, errp);
   4045 }
   4046 
   4047 int spapr_phb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
   4048                           void *fdt, int *fdt_start_offset, Error **errp)
   4049 {
   4050     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(drc->dev);
   4051     int intc_phandle;
   4052 
   4053     intc_phandle = spapr_irq_get_phandle(spapr, spapr->fdt_blob, errp);
   4054     if (intc_phandle <= 0) {
   4055         return -1;
   4056     }
   4057 
   4058     if (spapr_dt_phb(spapr, sphb, intc_phandle, fdt, fdt_start_offset)) {
   4059         error_setg(errp, "unable to create FDT node for PHB %d", sphb->index);
   4060         return -1;
   4061     }
   4062 
   4063     /* generally SLOF creates these, for hotplug it's up to QEMU */
   4064     _FDT(fdt_setprop_string(fdt, *fdt_start_offset, "name", "pci"));
   4065 
   4066     return 0;
   4067 }
   4068 
   4069 static bool spapr_phb_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
   4070                                Error **errp)
   4071 {
   4072     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
   4073     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
   4074     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
   4075     const unsigned windows_supported = spapr_phb_windows_supported(sphb);
   4076     SpaprDrc *drc;
   4077 
   4078     if (dev->hotplugged && !smc->dr_phb_enabled) {
   4079         error_setg(errp, "PHB hotplug not supported for this machine");
   4080         return false;
   4081     }
   4082 
   4083     if (sphb->index == (uint32_t)-1) {
   4084         error_setg(errp, "\"index\" for PAPR PHB is mandatory");
   4085         return false;
   4086     }
   4087 
   4088     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
   4089     if (drc && drc->dev) {
   4090         error_setg(errp, "PHB %d already attached", sphb->index);
   4091         return false;
   4092     }
   4093 
   4094     /*
   4095      * This will check that sphb->index doesn't exceed the maximum number of
   4096      * PHBs for the current machine type.
   4097      */
   4098     return
   4099         smc->phb_placement(spapr, sphb->index,
   4100                            &sphb->buid, &sphb->io_win_addr,
   4101                            &sphb->mem_win_addr, &sphb->mem64_win_addr,
   4102                            windows_supported, sphb->dma_liobn,
   4103                            &sphb->nv2_gpa_win_addr, &sphb->nv2_atsd_win_addr,
   4104                            errp);
   4105 }
   4106 
   4107 static void spapr_phb_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
   4108 {
   4109     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
   4110     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
   4111     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
   4112     SpaprDrc *drc;
   4113     bool hotplugged = spapr_drc_hotplugged(dev);
   4114 
   4115     if (!smc->dr_phb_enabled) {
   4116         return;
   4117     }
   4118 
   4119     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
   4120     /* hotplug hooks should check it's enabled before getting this far */
   4121     assert(drc);
   4122 
   4123     /* spapr_phb_pre_plug() already checked the DRC is attachable */
   4124     spapr_drc_attach(drc, dev);
   4125 
   4126     if (hotplugged) {
   4127         spapr_hotplug_req_add_by_index(drc);
   4128     } else {
   4129         spapr_drc_reset(drc);
   4130     }
   4131 }
   4132 
   4133 void spapr_phb_release(DeviceState *dev)
   4134 {
   4135     HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
   4136 
   4137     hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
   4138     object_unparent(OBJECT(dev));
   4139 }
   4140 
   4141 static void spapr_phb_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
   4142 {
   4143     qdev_unrealize(dev);
   4144 }
   4145 
   4146 static void spapr_phb_unplug_request(HotplugHandler *hotplug_dev,
   4147                                      DeviceState *dev, Error **errp)
   4148 {
   4149     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
   4150     SpaprDrc *drc;
   4151 
   4152     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
   4153     assert(drc);
   4154 
   4155     if (!spapr_drc_unplug_requested(drc)) {
   4156         spapr_drc_unplug_request(drc);
   4157         spapr_hotplug_req_remove_by_index(drc);
   4158     } else {
   4159         error_setg(errp,
   4160                    "PCI Host Bridge unplug already in progress for device %s",
   4161                    dev->id);
   4162     }
   4163 }
   4164 
   4165 static
   4166 bool spapr_tpm_proxy_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
   4167                               Error **errp)
   4168 {
   4169     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
   4170 
   4171     if (spapr->tpm_proxy != NULL) {
   4172         error_setg(errp, "Only one TPM proxy can be specified for this machine");
   4173         return false;
   4174     }
   4175 
   4176     return true;
   4177 }
   4178 
   4179 static void spapr_tpm_proxy_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
   4180 {
   4181     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
   4182     SpaprTpmProxy *tpm_proxy = SPAPR_TPM_PROXY(dev);
   4183 
   4184     /* Already checked in spapr_tpm_proxy_pre_plug() */
   4185     g_assert(spapr->tpm_proxy == NULL);
   4186 
   4187     spapr->tpm_proxy = tpm_proxy;
   4188 }
   4189 
   4190 static void spapr_tpm_proxy_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
   4191 {
   4192     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
   4193 
   4194     qdev_unrealize(dev);
   4195     object_unparent(OBJECT(dev));
   4196     spapr->tpm_proxy = NULL;
   4197 }
   4198 
   4199 static void spapr_machine_device_plug(HotplugHandler *hotplug_dev,
   4200                                       DeviceState *dev, Error **errp)
   4201 {
   4202     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
   4203         spapr_memory_plug(hotplug_dev, dev);
   4204     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
   4205         spapr_core_plug(hotplug_dev, dev);
   4206     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
   4207         spapr_phb_plug(hotplug_dev, dev);
   4208     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
   4209         spapr_tpm_proxy_plug(hotplug_dev, dev);
   4210     }
   4211 }
   4212 
   4213 static void spapr_machine_device_unplug(HotplugHandler *hotplug_dev,
   4214                                         DeviceState *dev, Error **errp)
   4215 {
   4216     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
   4217         spapr_memory_unplug(hotplug_dev, dev);
   4218     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
   4219         spapr_core_unplug(hotplug_dev, dev);
   4220     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
   4221         spapr_phb_unplug(hotplug_dev, dev);
   4222     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
   4223         spapr_tpm_proxy_unplug(hotplug_dev, dev);
   4224     }
   4225 }
   4226 
   4227 bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr)
   4228 {
   4229     return spapr_ovec_test(spapr->ov5_cas, OV5_HP_EVT) ||
   4230         /*
   4231          * CAS will process all pending unplug requests.
   4232          *
   4233          * HACK: a guest could theoretically have cleared all bits in OV5,
   4234          * but none of the guests we care for do.
   4235          */
   4236         spapr_ovec_empty(spapr->ov5_cas);
   4237 }
   4238 
   4239 static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev,
   4240                                                 DeviceState *dev, Error **errp)
   4241 {
   4242     SpaprMachineState *sms = SPAPR_MACHINE(OBJECT(hotplug_dev));
   4243     MachineClass *mc = MACHINE_GET_CLASS(sms);
   4244     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
   4245 
   4246     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
   4247         if (spapr_memory_hot_unplug_supported(sms)) {
   4248             spapr_memory_unplug_request(hotplug_dev, dev, errp);
   4249         } else {
   4250             error_setg(errp, "Memory hot unplug not supported for this guest");
   4251         }
   4252     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
   4253         if (!mc->has_hotpluggable_cpus) {
   4254             error_setg(errp, "CPU hot unplug not supported on this machine");
   4255             return;
   4256         }
   4257         spapr_core_unplug_request(hotplug_dev, dev, errp);
   4258     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
   4259         if (!smc->dr_phb_enabled) {
   4260             error_setg(errp, "PHB hot unplug not supported on this machine");
   4261             return;
   4262         }
   4263         spapr_phb_unplug_request(hotplug_dev, dev, errp);
   4264     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
   4265         spapr_tpm_proxy_unplug(hotplug_dev, dev);
   4266     }
   4267 }
   4268 
   4269 static void spapr_machine_device_pre_plug(HotplugHandler *hotplug_dev,
   4270                                           DeviceState *dev, Error **errp)
   4271 {
   4272     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
   4273         spapr_memory_pre_plug(hotplug_dev, dev, errp);
   4274     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
   4275         spapr_core_pre_plug(hotplug_dev, dev, errp);
   4276     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
   4277         spapr_phb_pre_plug(hotplug_dev, dev, errp);
   4278     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
   4279         spapr_tpm_proxy_pre_plug(hotplug_dev, dev, errp);
   4280     }
   4281 }
   4282 
   4283 static HotplugHandler *spapr_get_hotplug_handler(MachineState *machine,
   4284                                                  DeviceState *dev)
   4285 {
   4286     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) ||
   4287         object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE) ||
   4288         object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE) ||
   4289         object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
   4290         return HOTPLUG_HANDLER(machine);
   4291     }
   4292     if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
   4293         PCIDevice *pcidev = PCI_DEVICE(dev);
   4294         PCIBus *root = pci_device_root_bus(pcidev);
   4295         SpaprPhbState *phb =
   4296             (SpaprPhbState *)object_dynamic_cast(OBJECT(BUS(root)->parent),
   4297                                                  TYPE_SPAPR_PCI_HOST_BRIDGE);
   4298 
   4299         if (phb) {
   4300             return HOTPLUG_HANDLER(phb);
   4301         }
   4302     }
   4303     return NULL;
   4304 }
   4305 
   4306 static CpuInstanceProperties
   4307 spapr_cpu_index_to_props(MachineState *machine, unsigned cpu_index)
   4308 {
   4309     CPUArchId *core_slot;
   4310     MachineClass *mc = MACHINE_GET_CLASS(machine);
   4311 
   4312     /* make sure possible_cpu are intialized */
   4313     mc->possible_cpu_arch_ids(machine);
   4314     /* get CPU core slot containing thread that matches cpu_index */
   4315     core_slot = spapr_find_cpu_slot(machine, cpu_index, NULL);
   4316     assert(core_slot);
   4317     return core_slot->props;
   4318 }
   4319 
   4320 static int64_t spapr_get_default_cpu_node_id(const MachineState *ms, int idx)
   4321 {
   4322     return idx / ms->smp.cores % ms->numa_state->num_nodes;
   4323 }
   4324 
   4325 static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine)
   4326 {
   4327     int i;
   4328     unsigned int smp_threads = machine->smp.threads;
   4329     unsigned int smp_cpus = machine->smp.cpus;
   4330     const char *core_type;
   4331     int spapr_max_cores = machine->smp.max_cpus / smp_threads;
   4332     MachineClass *mc = MACHINE_GET_CLASS(machine);
   4333 
   4334     if (!mc->has_hotpluggable_cpus) {
   4335         spapr_max_cores = QEMU_ALIGN_UP(smp_cpus, smp_threads) / smp_threads;
   4336     }
   4337     if (machine->possible_cpus) {
   4338         assert(machine->possible_cpus->len == spapr_max_cores);
   4339         return machine->possible_cpus;
   4340     }
   4341 
   4342     core_type = spapr_get_cpu_core_type(machine->cpu_type);
   4343     if (!core_type) {
   4344         error_report("Unable to find sPAPR CPU Core definition");
   4345         exit(1);
   4346     }
   4347 
   4348     machine->possible_cpus = g_malloc0(sizeof(CPUArchIdList) +
   4349                              sizeof(CPUArchId) * spapr_max_cores);
   4350     machine->possible_cpus->len = spapr_max_cores;
   4351     for (i = 0; i < machine->possible_cpus->len; i++) {
   4352         int core_id = i * smp_threads;
   4353 
   4354         machine->possible_cpus->cpus[i].type = core_type;
   4355         machine->possible_cpus->cpus[i].vcpus_count = smp_threads;
   4356         machine->possible_cpus->cpus[i].arch_id = core_id;
   4357         machine->possible_cpus->cpus[i].props.has_core_id = true;
   4358         machine->possible_cpus->cpus[i].props.core_id = core_id;
   4359     }
   4360     return machine->possible_cpus;
   4361 }
   4362 
   4363 static bool spapr_phb_placement(SpaprMachineState *spapr, uint32_t index,
   4364                                 uint64_t *buid, hwaddr *pio,
   4365                                 hwaddr *mmio32, hwaddr *mmio64,
   4366                                 unsigned n_dma, uint32_t *liobns,
   4367                                 hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
   4368 {
   4369     /*
   4370      * New-style PHB window placement.
   4371      *
   4372      * Goals: Gives large (1TiB), naturally aligned 64-bit MMIO window
   4373      * for each PHB, in addition to 2GiB 32-bit MMIO and 64kiB PIO
   4374      * windows.
   4375      *
   4376      * Some guest kernels can't work with MMIO windows above 1<<46
   4377      * (64TiB), so we place up to 31 PHBs in the area 32TiB..64TiB
   4378      *
   4379      * 32TiB..(33TiB+1984kiB) contains the 64kiB PIO windows for each
   4380      * PHB stacked together.  (32TiB+2GiB)..(32TiB+64GiB) contains the
   4381      * 2GiB 32-bit MMIO windows for each PHB.  Then 33..64TiB has the
   4382      * 1TiB 64-bit MMIO windows for each PHB.
   4383      */
   4384     const uint64_t base_buid = 0x800000020000000ULL;
   4385     int i;
   4386 
   4387     /* Sanity check natural alignments */
   4388     QEMU_BUILD_BUG_ON((SPAPR_PCI_BASE % SPAPR_PCI_MEM64_WIN_SIZE) != 0);
   4389     QEMU_BUILD_BUG_ON((SPAPR_PCI_LIMIT % SPAPR_PCI_MEM64_WIN_SIZE) != 0);
   4390     QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM64_WIN_SIZE % SPAPR_PCI_MEM32_WIN_SIZE) != 0);
   4391     QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM32_WIN_SIZE % SPAPR_PCI_IO_WIN_SIZE) != 0);
   4392     /* Sanity check bounds */
   4393     QEMU_BUILD_BUG_ON((SPAPR_MAX_PHBS * SPAPR_PCI_IO_WIN_SIZE) >
   4394                       SPAPR_PCI_MEM32_WIN_SIZE);
   4395     QEMU_BUILD_BUG_ON((SPAPR_MAX_PHBS * SPAPR_PCI_MEM32_WIN_SIZE) >
   4396                       SPAPR_PCI_MEM64_WIN_SIZE);
   4397 
   4398     if (index >= SPAPR_MAX_PHBS) {
   4399         error_setg(errp, "\"index\" for PAPR PHB is too large (max %llu)",
   4400                    SPAPR_MAX_PHBS - 1);
   4401         return false;
   4402     }
   4403 
   4404     *buid = base_buid + index;
   4405     for (i = 0; i < n_dma; ++i) {
   4406         liobns[i] = SPAPR_PCI_LIOBN(index, i);
   4407     }
   4408 
   4409     *pio = SPAPR_PCI_BASE + index * SPAPR_PCI_IO_WIN_SIZE;
   4410     *mmio32 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM32_WIN_SIZE;
   4411     *mmio64 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM64_WIN_SIZE;
   4412 
   4413     *nv2gpa = SPAPR_PCI_NV2RAM64_WIN_BASE + index * SPAPR_PCI_NV2RAM64_WIN_SIZE;
   4414     *nv2atsd = SPAPR_PCI_NV2ATSD_WIN_BASE + index * SPAPR_PCI_NV2ATSD_WIN_SIZE;
   4415     return true;
   4416 }
   4417 
   4418 static ICSState *spapr_ics_get(XICSFabric *dev, int irq)
   4419 {
   4420     SpaprMachineState *spapr = SPAPR_MACHINE(dev);
   4421 
   4422     return ics_valid_irq(spapr->ics, irq) ? spapr->ics : NULL;
   4423 }
   4424 
   4425 static void spapr_ics_resend(XICSFabric *dev)
   4426 {
   4427     SpaprMachineState *spapr = SPAPR_MACHINE(dev);
   4428 
   4429     ics_resend(spapr->ics);
   4430 }
   4431 
   4432 static ICPState *spapr_icp_get(XICSFabric *xi, int vcpu_id)
   4433 {
   4434     PowerPCCPU *cpu = spapr_find_cpu(vcpu_id);
   4435 
   4436     return cpu ? spapr_cpu_state(cpu)->icp : NULL;
   4437 }
   4438 
   4439 static void spapr_pic_print_info(InterruptStatsProvider *obj,
   4440                                  Monitor *mon)
   4441 {
   4442     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
   4443 
   4444     spapr_irq_print_info(spapr, mon);
   4445     monitor_printf(mon, "irqchip: %s\n",
   4446                    kvm_irqchip_in_kernel() ? "in-kernel" : "emulated");
   4447 }
   4448 
   4449 /*
   4450  * This is a XIVE only operation
   4451  */
   4452 static int spapr_match_nvt(XiveFabric *xfb, uint8_t format,
   4453                            uint8_t nvt_blk, uint32_t nvt_idx,
   4454                            bool cam_ignore, uint8_t priority,
   4455                            uint32_t logic_serv, XiveTCTXMatch *match)
   4456 {
   4457     SpaprMachineState *spapr = SPAPR_MACHINE(xfb);
   4458     XivePresenter *xptr = XIVE_PRESENTER(spapr->active_intc);
   4459     XivePresenterClass *xpc = XIVE_PRESENTER_GET_CLASS(xptr);
   4460     int count;
   4461 
   4462     count = xpc->match_nvt(xptr, format, nvt_blk, nvt_idx, cam_ignore,
   4463                            priority, logic_serv, match);
   4464     if (count < 0) {
   4465         return count;
   4466     }
   4467 
   4468     /*
   4469      * When we implement the save and restore of the thread interrupt
   4470      * contexts in the enter/exit CPU handlers of the machine and the
   4471      * escalations in QEMU, we should be able to handle non dispatched
   4472      * vCPUs.
   4473      *
   4474      * Until this is done, the sPAPR machine should find at least one
   4475      * matching context always.
   4476      */
   4477     if (count == 0) {
   4478         qemu_log_mask(LOG_GUEST_ERROR, "XIVE: NVT %x/%x is not dispatched\n",
   4479                       nvt_blk, nvt_idx);
   4480     }
   4481 
   4482     return count;
   4483 }
   4484 
   4485 int spapr_get_vcpu_id(PowerPCCPU *cpu)
   4486 {
   4487     return cpu->vcpu_id;
   4488 }
   4489 
   4490 bool spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp)
   4491 {
   4492     SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
   4493     MachineState *ms = MACHINE(spapr);
   4494     int vcpu_id;
   4495 
   4496     vcpu_id = spapr_vcpu_id(spapr, cpu_index);
   4497 
   4498     if (kvm_enabled() && !kvm_vcpu_id_is_valid(vcpu_id)) {
   4499         error_setg(errp, "Can't create CPU with id %d in KVM", vcpu_id);
   4500         error_append_hint(errp, "Adjust the number of cpus to %d "
   4501                           "or try to raise the number of threads per core\n",
   4502                           vcpu_id * ms->smp.threads / spapr->vsmt);
   4503         return false;
   4504     }
   4505 
   4506     cpu->vcpu_id = vcpu_id;
   4507     return true;
   4508 }
   4509 
   4510 PowerPCCPU *spapr_find_cpu(int vcpu_id)
   4511 {
   4512     CPUState *cs;
   4513 
   4514     CPU_FOREACH(cs) {
   4515         PowerPCCPU *cpu = POWERPC_CPU(cs);
   4516 
   4517         if (spapr_get_vcpu_id(cpu) == vcpu_id) {
   4518             return cpu;
   4519         }
   4520     }
   4521 
   4522     return NULL;
   4523 }
   4524 
   4525 static bool spapr_cpu_in_nested(PowerPCCPU *cpu)
   4526 {
   4527     SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
   4528 
   4529     return spapr_cpu->in_nested;
   4530 }
   4531 
   4532 static void spapr_cpu_exec_enter(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
   4533 {
   4534     SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
   4535 
   4536     /* These are only called by TCG, KVM maintains dispatch state */
   4537 
   4538     spapr_cpu->prod = false;
   4539     if (spapr_cpu->vpa_addr) {
   4540         CPUState *cs = CPU(cpu);
   4541         uint32_t dispatch;
   4542 
   4543         dispatch = ldl_be_phys(cs->as,
   4544                                spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER);
   4545         dispatch++;
   4546         if ((dispatch & 1) != 0) {
   4547             qemu_log_mask(LOG_GUEST_ERROR,
   4548                           "VPA: incorrect dispatch counter value for "
   4549                           "dispatched partition %u, correcting.\n", dispatch);
   4550             dispatch++;
   4551         }
   4552         stl_be_phys(cs->as,
   4553                     spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER, dispatch);
   4554     }
   4555 }
   4556 
   4557 static void spapr_cpu_exec_exit(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
   4558 {
   4559     SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
   4560 
   4561     if (spapr_cpu->vpa_addr) {
   4562         CPUState *cs = CPU(cpu);
   4563         uint32_t dispatch;
   4564 
   4565         dispatch = ldl_be_phys(cs->as,
   4566                                spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER);
   4567         dispatch++;
   4568         if ((dispatch & 1) != 1) {
   4569             qemu_log_mask(LOG_GUEST_ERROR,
   4570                           "VPA: incorrect dispatch counter value for "
   4571                           "preempted partition %u, correcting.\n", dispatch);
   4572             dispatch++;
   4573         }
   4574         stl_be_phys(cs->as,
   4575                     spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER, dispatch);
   4576     }
   4577 }
   4578 
   4579 static void spapr_machine_class_init(ObjectClass *oc, void *data)
   4580 {
   4581     MachineClass *mc = MACHINE_CLASS(oc);
   4582     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(oc);
   4583     FWPathProviderClass *fwc = FW_PATH_PROVIDER_CLASS(oc);
   4584     NMIClass *nc = NMI_CLASS(oc);
   4585     HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
   4586     PPCVirtualHypervisorClass *vhc = PPC_VIRTUAL_HYPERVISOR_CLASS(oc);
   4587     XICSFabricClass *xic = XICS_FABRIC_CLASS(oc);
   4588     InterruptStatsProviderClass *ispc = INTERRUPT_STATS_PROVIDER_CLASS(oc);
   4589     XiveFabricClass *xfc = XIVE_FABRIC_CLASS(oc);
   4590     VofMachineIfClass *vmc = VOF_MACHINE_CLASS(oc);
   4591 
   4592     mc->desc = "pSeries Logical Partition (PAPR compliant)";
   4593     mc->ignore_boot_device_suffixes = true;
   4594 
   4595     /*
   4596      * We set up the default / latest behaviour here.  The class_init
   4597      * functions for the specific versioned machine types can override
   4598      * these details for backwards compatibility
   4599      */
   4600     mc->init = spapr_machine_init;
   4601     mc->reset = spapr_machine_reset;
   4602     mc->block_default_type = IF_SCSI;
   4603 
   4604     /*
   4605      * Setting max_cpus to INT32_MAX. Both KVM and TCG max_cpus values
   4606      * should be limited by the host capability instead of hardcoded.
   4607      * max_cpus for KVM guests will be checked in kvm_init(), and TCG
   4608      * guests are welcome to have as many CPUs as the host are capable
   4609      * of emulate.
   4610      */
   4611     mc->max_cpus = INT32_MAX;
   4612 
   4613     mc->no_parallel = 1;
   4614     mc->default_boot_order = "";
   4615     mc->default_ram_size = 512 * MiB;
   4616     mc->default_ram_id = "ppc_spapr.ram";
   4617     mc->default_display = "std";
   4618     mc->kvm_type = spapr_kvm_type;
   4619     machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SPAPR_PCI_HOST_BRIDGE);
   4620     mc->pci_allow_0_address = true;
   4621     assert(!mc->get_hotplug_handler);
   4622     mc->get_hotplug_handler = spapr_get_hotplug_handler;
   4623     hc->pre_plug = spapr_machine_device_pre_plug;
   4624     hc->plug = spapr_machine_device_plug;
   4625     mc->cpu_index_to_instance_props = spapr_cpu_index_to_props;
   4626     mc->get_default_cpu_node_id = spapr_get_default_cpu_node_id;
   4627     mc->possible_cpu_arch_ids = spapr_possible_cpu_arch_ids;
   4628     hc->unplug_request = spapr_machine_device_unplug_request;
   4629     hc->unplug = spapr_machine_device_unplug;
   4630 
   4631     smc->dr_lmb_enabled = true;
   4632     smc->update_dt_enabled = true;
   4633     mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power9_v2.0");
   4634     mc->has_hotpluggable_cpus = true;
   4635     mc->nvdimm_supported = true;
   4636     smc->resize_hpt_default = SPAPR_RESIZE_HPT_ENABLED;
   4637     fwc->get_dev_path = spapr_get_fw_dev_path;
   4638     nc->nmi_monitor_handler = spapr_nmi;
   4639     smc->phb_placement = spapr_phb_placement;
   4640     vhc->cpu_in_nested = spapr_cpu_in_nested;
   4641     vhc->deliver_hv_excp = spapr_exit_nested;
   4642     vhc->hypercall = emulate_spapr_hypercall;
   4643     vhc->hpt_mask = spapr_hpt_mask;
   4644     vhc->map_hptes = spapr_map_hptes;
   4645     vhc->unmap_hptes = spapr_unmap_hptes;
   4646     vhc->hpte_set_c = spapr_hpte_set_c;
   4647     vhc->hpte_set_r = spapr_hpte_set_r;
   4648     vhc->get_pate = spapr_get_pate;
   4649     vhc->encode_hpt_for_kvm_pr = spapr_encode_hpt_for_kvm_pr;
   4650     vhc->cpu_exec_enter = spapr_cpu_exec_enter;
   4651     vhc->cpu_exec_exit = spapr_cpu_exec_exit;
   4652     xic->ics_get = spapr_ics_get;
   4653     xic->ics_resend = spapr_ics_resend;
   4654     xic->icp_get = spapr_icp_get;
   4655     ispc->print_info = spapr_pic_print_info;
   4656     /* Force NUMA node memory size to be a multiple of
   4657      * SPAPR_MEMORY_BLOCK_SIZE (256M) since that's the granularity
   4658      * in which LMBs are represented and hot-added
   4659      */
   4660     mc->numa_mem_align_shift = 28;
   4661     mc->auto_enable_numa = true;
   4662 
   4663     smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_OFF;
   4664     smc->default_caps.caps[SPAPR_CAP_VSX] = SPAPR_CAP_ON;
   4665     smc->default_caps.caps[SPAPR_CAP_DFP] = SPAPR_CAP_ON;
   4666     smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND;
   4667     smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND;
   4668     smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_WORKAROUND;
   4669     smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 16; /* 64kiB */
   4670     smc->default_caps.caps[SPAPR_CAP_NESTED_KVM_HV] = SPAPR_CAP_OFF;
   4671     smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_ON;
   4672     smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON;
   4673     smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_ON;
   4674     smc->default_caps.caps[SPAPR_CAP_RPT_INVALIDATE] = SPAPR_CAP_OFF;
   4675     spapr_caps_add_properties(smc);
   4676     smc->irq = &spapr_irq_dual;
   4677     smc->dr_phb_enabled = true;
   4678     smc->linux_pci_probe = true;
   4679     smc->smp_threads_vsmt = true;
   4680     smc->nr_xirqs = SPAPR_NR_XIRQS;
   4681     xfc->match_nvt = spapr_match_nvt;
   4682     vmc->client_architecture_support = spapr_vof_client_architecture_support;
   4683     vmc->quiesce = spapr_vof_quiesce;
   4684     vmc->setprop = spapr_vof_setprop;
   4685 }
   4686 
   4687 static const TypeInfo spapr_machine_info = {
   4688     .name          = TYPE_SPAPR_MACHINE,
   4689     .parent        = TYPE_MACHINE,
   4690     .abstract      = true,
   4691     .instance_size = sizeof(SpaprMachineState),
   4692     .instance_init = spapr_instance_init,
   4693     .instance_finalize = spapr_machine_finalizefn,
   4694     .class_size    = sizeof(SpaprMachineClass),
   4695     .class_init    = spapr_machine_class_init,
   4696     .interfaces = (InterfaceInfo[]) {
   4697         { TYPE_FW_PATH_PROVIDER },
   4698         { TYPE_NMI },
   4699         { TYPE_HOTPLUG_HANDLER },
   4700         { TYPE_PPC_VIRTUAL_HYPERVISOR },
   4701         { TYPE_XICS_FABRIC },
   4702         { TYPE_INTERRUPT_STATS_PROVIDER },
   4703         { TYPE_XIVE_FABRIC },
   4704         { TYPE_VOF_MACHINE_IF },
   4705         { }
   4706     },
   4707 };
   4708 
   4709 static void spapr_machine_latest_class_options(MachineClass *mc)
   4710 {
   4711     mc->alias = "pseries";
   4712     mc->is_default = true;
   4713 }
   4714 
/*
 * DEFINE_SPAPR_MACHINE(suffix, verstr, latest)
 *
 * Declares and registers one versioned pseries machine type.
 *
 *   suffix - identifier fragment pasted into the generated symbol names; a
 *            function spapr_machine_<suffix>_class_options() must already
 *            be defined
 *   verstr - string literal appended to "pseries-" to form the type name
 *   latest - when true, spapr_machine_latest_class_options() is applied as
 *            well
 *
 * The expansion consists of a class_init function, a TypeInfo whose parent
 * is TYPE_SPAPR_MACHINE, a registration function calling type_register(),
 * and a type_init() hook for that function.  The last line has no trailing
 * semicolon, so the invocation supplies it.
 */
#define DEFINE_SPAPR_MACHINE(suffix, verstr, latest)                 \
    static void spapr_machine_##suffix##_class_init(ObjectClass *oc, \
                                                    void *data)      \
    {                                                                \
        MachineClass *mc = MACHINE_CLASS(oc);                        \
        spapr_machine_##suffix##_class_options(mc);                  \
        if (latest) {                                                \
            spapr_machine_latest_class_options(mc);                  \
        }                                                            \
    }                                                                \
    static const TypeInfo spapr_machine_##suffix##_info = {          \
        .name = MACHINE_TYPE_NAME("pseries-" verstr),                \
        .parent = TYPE_SPAPR_MACHINE,                                \
        .class_init = spapr_machine_##suffix##_class_init,           \
    };                                                               \
    static void spapr_machine_register_##suffix(void)                \
    {                                                                \
        type_register(&spapr_machine_##suffix##_info);               \
    }                                                                \
    type_init(spapr_machine_register_##suffix)
   4735 
/*
 * pseries-7.2
 *
 * Declared with latest == true, so DEFINE_SPAPR_MACHINE() additionally
 * applies spapr_machine_latest_class_options() to this type.  The options
 * function itself is intentionally empty; older versions chain down to it.
 */
static void spapr_machine_7_2_class_options(MachineClass *mc)
{
    /* Defaults for the latest behaviour inherited from the base class */
}

DEFINE_SPAPR_MACHINE(7_2, "7.2", true);
   4745 
/*
 * pseries-7.1
 *
 * Applies the 7.2 class options first, then appends the hw_compat_7_1
 * entries to the machine class's compat property list.
 */
static void spapr_machine_7_1_class_options(MachineClass *mc)
{
    /* Start from the next newer version's settings */
    spapr_machine_7_2_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_7_1, hw_compat_7_1_len);
}

DEFINE_SPAPR_MACHINE(7_1, "7.1", false);
   4756 
/*
 * pseries-7.0
 *
 * Applies the 7.1 class options first, then appends the hw_compat_7_0
 * entries to the machine class's compat property list.
 */
static void spapr_machine_7_0_class_options(MachineClass *mc)
{
    /* Start from the next newer version's settings */
    spapr_machine_7_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_7_0, hw_compat_7_0_len);
}

DEFINE_SPAPR_MACHINE(7_0, "7.0", false);
   4767 
/*
 * pseries-6.2
 *
 * Applies the 7.0 class options first, then appends the hw_compat_6_2
 * entries to the machine class's compat property list.
 */
static void spapr_machine_6_2_class_options(MachineClass *mc)
{
    /* Start from the next newer version's settings */
    spapr_machine_7_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_6_2, hw_compat_6_2_len);
}

DEFINE_SPAPR_MACHINE(6_2, "6.2", false);
   4778 
   4779 /*
   4780  * pseries-6.1
   4781  */
   4782 static void spapr_machine_6_1_class_options(MachineClass *mc)
   4783 {
   4784     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
   4785 
   4786     spapr_machine_6_2_class_options(mc);
   4787     compat_props_add(mc->compat_props, hw_compat_6_1, hw_compat_6_1_len);
   4788     smc->pre_6_2_numa_affinity = true;
   4789     mc->smp_props.prefer_sockets = true;
   4790 }
   4791 
   4792 DEFINE_SPAPR_MACHINE(6_1, "6.1", false);
   4793 
/*
 * pseries-6.0
 *
 * Applies the 6.1 class options first, then appends the hw_compat_6_0
 * entries to the machine class's compat property list.
 */
static void spapr_machine_6_0_class_options(MachineClass *mc)
{
    /* Start from the next newer version's settings */
    spapr_machine_6_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_6_0, hw_compat_6_0_len);
}

DEFINE_SPAPR_MACHINE(6_0, "6.0", false);
   4804 
/*
 * pseries-5.2
 *
 * Applies the 6.0 class options first, then appends the hw_compat_5_2
 * entries to the machine class's compat property list.
 */
static void spapr_machine_5_2_class_options(MachineClass *mc)
{
    /* Start from the next newer version's settings */
    spapr_machine_6_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_5_2, hw_compat_5_2_len);
}

DEFINE_SPAPR_MACHINE(5_2, "5.2", false);
   4815 
   4816 /*
   4817  * pseries-5.1
   4818  */
   4819 static void spapr_machine_5_1_class_options(MachineClass *mc)
   4820 {
   4821     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
   4822 
   4823     spapr_machine_5_2_class_options(mc);
   4824     compat_props_add(mc->compat_props, hw_compat_5_1, hw_compat_5_1_len);
   4825     smc->pre_5_2_numa_associativity = true;
   4826 }
   4827 
   4828 DEFINE_SPAPR_MACHINE(5_1, "5.1", false);
   4829 
   4830 /*
   4831  * pseries-5.0
   4832  */
   4833 static void spapr_machine_5_0_class_options(MachineClass *mc)
   4834 {
   4835     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
   4836     static GlobalProperty compat[] = {
   4837         { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-5.1-associativity", "on" },
   4838     };
   4839 
   4840     spapr_machine_5_1_class_options(mc);
   4841     compat_props_add(mc->compat_props, hw_compat_5_0, hw_compat_5_0_len);
   4842     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
   4843     mc->numa_mem_supported = true;
   4844     smc->pre_5_1_assoc_refpoints = true;
   4845 }
   4846 
   4847 DEFINE_SPAPR_MACHINE(5_0, "5.0", false);
   4848 
   4849 /*
   4850  * pseries-4.2
   4851  */
   4852 static void spapr_machine_4_2_class_options(MachineClass *mc)
   4853 {
   4854     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
   4855 
   4856     spapr_machine_5_0_class_options(mc);
   4857     compat_props_add(mc->compat_props, hw_compat_4_2, hw_compat_4_2_len);
   4858     smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_OFF;
   4859     smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_OFF;
   4860     smc->rma_limit = 16 * GiB;
   4861     mc->nvdimm_supported = false;
   4862 }
   4863 
   4864 DEFINE_SPAPR_MACHINE(4_2, "4.2", false);
   4865 
   4866 /*
   4867  * pseries-4.1
   4868  */
   4869 static void spapr_machine_4_1_class_options(MachineClass *mc)
   4870 {
   4871     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
   4872     static GlobalProperty compat[] = {
   4873         /* Only allow 4kiB and 64kiB IOMMU pagesizes */
   4874         { TYPE_SPAPR_PCI_HOST_BRIDGE, "pgsz", "0x11000" },
   4875     };
   4876 
   4877     spapr_machine_4_2_class_options(mc);
   4878     smc->linux_pci_probe = false;
   4879     smc->smp_threads_vsmt = false;
   4880     compat_props_add(mc->compat_props, hw_compat_4_1, hw_compat_4_1_len);
   4881     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
   4882 }
   4883 
   4884 DEFINE_SPAPR_MACHINE(4_1, "4.1", false);
   4885 
   4886 /*
   4887  * pseries-4.0
   4888  */
   4889 static bool phb_placement_4_0(SpaprMachineState *spapr, uint32_t index,
   4890                               uint64_t *buid, hwaddr *pio,
   4891                               hwaddr *mmio32, hwaddr *mmio64,
   4892                               unsigned n_dma, uint32_t *liobns,
   4893                               hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
   4894 {
   4895     if (!spapr_phb_placement(spapr, index, buid, pio, mmio32, mmio64, n_dma,
   4896                              liobns, nv2gpa, nv2atsd, errp)) {
   4897         return false;
   4898     }
   4899 
   4900     *nv2gpa = 0;
   4901     *nv2atsd = 0;
   4902     return true;
   4903 }
   4904 static void spapr_machine_4_0_class_options(MachineClass *mc)
   4905 {
   4906     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
   4907 
   4908     spapr_machine_4_1_class_options(mc);
   4909     compat_props_add(mc->compat_props, hw_compat_4_0, hw_compat_4_0_len);
   4910     smc->phb_placement = phb_placement_4_0;
   4911     smc->irq = &spapr_irq_xics;
   4912     smc->pre_4_1_migration = true;
   4913 }
   4914 
   4915 DEFINE_SPAPR_MACHINE(4_0, "4.0", false);
   4916 
   4917 /*
   4918  * pseries-3.1
   4919  */
   4920 static void spapr_machine_3_1_class_options(MachineClass *mc)
   4921 {
   4922     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
   4923 
   4924     spapr_machine_4_0_class_options(mc);
   4925     compat_props_add(mc->compat_props, hw_compat_3_1, hw_compat_3_1_len);
   4926 
   4927     mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power8_v2.0");
   4928     smc->update_dt_enabled = false;
   4929     smc->dr_phb_enabled = false;
   4930     smc->broken_host_serial_model = true;
   4931     smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_BROKEN;
   4932     smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_BROKEN;
   4933     smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_BROKEN;
   4934     smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_OFF;
   4935 }
   4936 
   4937 DEFINE_SPAPR_MACHINE(3_1, "3.1", false);
   4938 
   4939 /*
   4940  * pseries-3.0
   4941  */
   4942 
   4943 static void spapr_machine_3_0_class_options(MachineClass *mc)
   4944 {
   4945     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
   4946 
   4947     spapr_machine_3_1_class_options(mc);
   4948     compat_props_add(mc->compat_props, hw_compat_3_0, hw_compat_3_0_len);
   4949 
   4950     smc->legacy_irq_allocation = true;
   4951     smc->nr_xirqs = 0x400;
   4952     smc->irq = &spapr_irq_xics_legacy;
   4953 }
   4954 
   4955 DEFINE_SPAPR_MACHINE(3_0, "3.0", false);
   4956 
   4957 /*
   4958  * pseries-2.12
   4959  */
   4960 static void spapr_machine_2_12_class_options(MachineClass *mc)
   4961 {
   4962     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
   4963     static GlobalProperty compat[] = {
   4964         { TYPE_POWERPC_CPU, "pre-3.0-migration", "on" },
   4965         { TYPE_SPAPR_CPU_CORE, "pre-3.0-migration", "on" },
   4966     };
   4967 
   4968     spapr_machine_3_0_class_options(mc);
   4969     compat_props_add(mc->compat_props, hw_compat_2_12, hw_compat_2_12_len);
   4970     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
   4971 
   4972     /* We depend on kvm_enabled() to choose a default value for the
   4973      * hpt-max-page-size capability. Of course we can't do it here
   4974      * because this is too early and the HW accelerator isn't initialzed
   4975      * yet. Postpone this to machine init (see default_caps_with_cpu()).
   4976      */
   4977     smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 0;
   4978 }
   4979 
   4980 DEFINE_SPAPR_MACHINE(2_12, "2.12", false);
   4981 
   4982 static void spapr_machine_2_12_sxxm_class_options(MachineClass *mc)
   4983 {
   4984     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
   4985 
   4986     spapr_machine_2_12_class_options(mc);
   4987     smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND;
   4988     smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND;
   4989     smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_FIXED_CCD;
   4990 }
   4991 
   4992 DEFINE_SPAPR_MACHINE(2_12_sxxm, "2.12-sxxm", false);
   4993 
   4994 /*
   4995  * pseries-2.11
   4996  */
   4997 
   4998 static void spapr_machine_2_11_class_options(MachineClass *mc)
   4999 {
   5000     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
   5001 
   5002     spapr_machine_2_12_class_options(mc);
   5003     smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_ON;
   5004     compat_props_add(mc->compat_props, hw_compat_2_11, hw_compat_2_11_len);
   5005 }
   5006 
   5007 DEFINE_SPAPR_MACHINE(2_11, "2.11", false);
   5008 
/*
 * pseries-2.10
 *
 * Applies the 2.11 class options first, then appends the hw_compat_2_10
 * entries to the machine class's compat property list.
 */

static void spapr_machine_2_10_class_options(MachineClass *mc)
{
    /* Start from the next newer version's settings */
    spapr_machine_2_11_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_10, hw_compat_2_10_len);
}

DEFINE_SPAPR_MACHINE(2_10, "2.10", false);
   5020 
   5021 /*
   5022  * pseries-2.9
   5023  */
   5024 
   5025 static void spapr_machine_2_9_class_options(MachineClass *mc)
   5026 {
   5027     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
   5028     static GlobalProperty compat[] = {
   5029         { TYPE_POWERPC_CPU, "pre-2.10-migration", "on" },
   5030     };
   5031 
   5032     spapr_machine_2_10_class_options(mc);
   5033     compat_props_add(mc->compat_props, hw_compat_2_9, hw_compat_2_9_len);
   5034     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
   5035     smc->pre_2_10_has_unused_icps = true;
   5036     smc->resize_hpt_default = SPAPR_RESIZE_HPT_DISABLED;
   5037 }
   5038 
   5039 DEFINE_SPAPR_MACHINE(2_9, "2.9", false);
   5040 
   5041 /*
   5042  * pseries-2.8
   5043  */
   5044 
   5045 static void spapr_machine_2_8_class_options(MachineClass *mc)
   5046 {
   5047     static GlobalProperty compat[] = {
   5048         { TYPE_SPAPR_PCI_HOST_BRIDGE, "pcie-extended-configuration-space", "off" },
   5049     };
   5050 
   5051     spapr_machine_2_9_class_options(mc);
   5052     compat_props_add(mc->compat_props, hw_compat_2_8, hw_compat_2_8_len);
   5053     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
   5054     mc->numa_mem_align_shift = 23;
   5055 }
   5056 
   5057 DEFINE_SPAPR_MACHINE(2_8, "2.8", false);
   5058 
   5059 /*
   5060  * pseries-2.7
   5061  */
   5062 
   5063 static bool phb_placement_2_7(SpaprMachineState *spapr, uint32_t index,
   5064                               uint64_t *buid, hwaddr *pio,
   5065                               hwaddr *mmio32, hwaddr *mmio64,
   5066                               unsigned n_dma, uint32_t *liobns,
   5067                               hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
   5068 {
   5069     /* Legacy PHB placement for pseries-2.7 and earlier machine types */
   5070     const uint64_t base_buid = 0x800000020000000ULL;
   5071     const hwaddr phb_spacing = 0x1000000000ULL; /* 64 GiB */
   5072     const hwaddr mmio_offset = 0xa0000000; /* 2 GiB + 512 MiB */
   5073     const hwaddr pio_offset = 0x80000000; /* 2 GiB */
   5074     const uint32_t max_index = 255;
   5075     const hwaddr phb0_alignment = 0x10000000000ULL; /* 1 TiB */
   5076 
   5077     uint64_t ram_top = MACHINE(spapr)->ram_size;
   5078     hwaddr phb0_base, phb_base;
   5079     int i;
   5080 
   5081     /* Do we have device memory? */
   5082     if (MACHINE(spapr)->maxram_size > ram_top) {
   5083         /* Can't just use maxram_size, because there may be an
   5084          * alignment gap between normal and device memory regions
   5085          */
   5086         ram_top = MACHINE(spapr)->device_memory->base +
   5087             memory_region_size(&MACHINE(spapr)->device_memory->mr);
   5088     }
   5089 
   5090     phb0_base = QEMU_ALIGN_UP(ram_top, phb0_alignment);
   5091 
   5092     if (index > max_index) {
   5093         error_setg(errp, "\"index\" for PAPR PHB is too large (max %u)",
   5094                    max_index);
   5095         return false;
   5096     }
   5097 
   5098     *buid = base_buid + index;
   5099     for (i = 0; i < n_dma; ++i) {
   5100         liobns[i] = SPAPR_PCI_LIOBN(index, i);
   5101     }
   5102 
   5103     phb_base = phb0_base + index * phb_spacing;
   5104     *pio = phb_base + pio_offset;
   5105     *mmio32 = phb_base + mmio_offset;
   5106     /*
   5107      * We don't set the 64-bit MMIO window, relying on the PHB's
   5108      * fallback behaviour of automatically splitting a large "32-bit"
   5109      * window into contiguous 32-bit and 64-bit windows
   5110      */
   5111 
   5112     *nv2gpa = 0;
   5113     *nv2atsd = 0;
   5114     return true;
   5115 }
   5116 
   5117 static void spapr_machine_2_7_class_options(MachineClass *mc)
   5118 {
   5119     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
   5120     static GlobalProperty compat[] = {
   5121         { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem_win_size", "0xf80000000", },
   5122         { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem64_win_size", "0", },
   5123         { TYPE_POWERPC_CPU, "pre-2.8-migration", "on", },
   5124         { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-2.8-migration", "on", },
   5125     };
   5126 
   5127     spapr_machine_2_8_class_options(mc);
   5128     mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power7_v2.3");
   5129     mc->default_machine_opts = "modern-hotplug-events=off";
   5130     compat_props_add(mc->compat_props, hw_compat_2_7, hw_compat_2_7_len);
   5131     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
   5132     smc->phb_placement = phb_placement_2_7;
   5133 }
   5134 
   5135 DEFINE_SPAPR_MACHINE(2_7, "2.7", false);
   5136 
   5137 /*
   5138  * pseries-2.6
   5139  */
   5140 
   5141 static void spapr_machine_2_6_class_options(MachineClass *mc)
   5142 {
   5143     static GlobalProperty compat[] = {
   5144         { TYPE_SPAPR_PCI_HOST_BRIDGE, "ddw", "off" },
   5145     };
   5146 
   5147     spapr_machine_2_7_class_options(mc);
   5148     mc->has_hotpluggable_cpus = false;
   5149     compat_props_add(mc->compat_props, hw_compat_2_6, hw_compat_2_6_len);
   5150     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
   5151 }
   5152 
   5153 DEFINE_SPAPR_MACHINE(2_6, "2.6", false);
   5154 
   5155 /*
   5156  * pseries-2.5
   5157  */
   5158 
   5159 static void spapr_machine_2_5_class_options(MachineClass *mc)
   5160 {
   5161     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
   5162     static GlobalProperty compat[] = {
   5163         { "spapr-vlan", "use-rx-buffer-pools", "off" },
   5164     };
   5165 
   5166     spapr_machine_2_6_class_options(mc);
   5167     smc->use_ohci_by_default = true;
   5168     compat_props_add(mc->compat_props, hw_compat_2_5, hw_compat_2_5_len);
   5169     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
   5170 }
   5171 
   5172 DEFINE_SPAPR_MACHINE(2_5, "2.5", false);
   5173 
   5174 /*
   5175  * pseries-2.4
   5176  */
   5177 
   5178 static void spapr_machine_2_4_class_options(MachineClass *mc)
   5179 {
   5180     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
   5181 
   5182     spapr_machine_2_5_class_options(mc);
   5183     smc->dr_lmb_enabled = false;
   5184     compat_props_add(mc->compat_props, hw_compat_2_4, hw_compat_2_4_len);
   5185 }
   5186 
   5187 DEFINE_SPAPR_MACHINE(2_4, "2.4", false);
   5188 
   5189 /*
   5190  * pseries-2.3
   5191  */
   5192 
   5193 static void spapr_machine_2_3_class_options(MachineClass *mc)
   5194 {
   5195     static GlobalProperty compat[] = {
   5196         { "spapr-pci-host-bridge", "dynamic-reconfiguration", "off" },
   5197     };
   5198     spapr_machine_2_4_class_options(mc);
   5199     compat_props_add(mc->compat_props, hw_compat_2_3, hw_compat_2_3_len);
   5200     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
   5201 }
   5202 DEFINE_SPAPR_MACHINE(2_3, "2.3", false);
   5203 
   5204 /*
   5205  * pseries-2.2
   5206  */
   5207 
   5208 static void spapr_machine_2_2_class_options(MachineClass *mc)
   5209 {
   5210     static GlobalProperty compat[] = {
   5211         { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem_win_size", "0x20000000" },
   5212     };
   5213 
   5214     spapr_machine_2_3_class_options(mc);
   5215     compat_props_add(mc->compat_props, hw_compat_2_2, hw_compat_2_2_len);
   5216     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
   5217     mc->default_machine_opts = "modern-hotplug-events=off,suppress-vmdesc=on";
   5218 }
   5219 DEFINE_SPAPR_MACHINE(2_2, "2.2", false);
   5220 
/*
 * pseries-2.1
 *
 * Applies the 2.2 class options first, then appends the hw_compat_2_1
 * entries to the machine class's compat property list.
 */

static void spapr_machine_2_1_class_options(MachineClass *mc)
{
    /* Start from the next newer version's settings */
    spapr_machine_2_2_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_1, hw_compat_2_1_len);
}
DEFINE_SPAPR_MACHINE(2_1, "2.1", false);
   5231 
/*
 * Registers the abstract sPAPR machine base type described by
 * spapr_machine_info.  The versioned machine types register themselves
 * through the functions generated by DEFINE_SPAPR_MACHINE().
 */
static void spapr_machine_register_types(void)
{
    type_register_static(&spapr_machine_info);
}

/* Hook the registration function into type initialisation */
type_init(spapr_machine_register_types)