qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

x86.c (49990B)


      1 /*
      2  * Copyright (c) 2003-2004 Fabrice Bellard
      3  * Copyright (c) 2019 Red Hat, Inc.
      4  *
      5  * Permission is hereby granted, free of charge, to any person obtaining a copy
      6  * of this software and associated documentation files (the "Software"), to deal
      7  * in the Software without restriction, including without limitation the rights
      8  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      9  * copies of the Software, and to permit persons to whom the Software is
     10  * furnished to do so, subject to the following conditions:
     11  *
     12  * The above copyright notice and this permission notice shall be included in
     13  * all copies or substantial portions of the Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     21  * THE SOFTWARE.
     22  */
     23 #include "qemu/osdep.h"
     24 #include "qemu/error-report.h"
     25 #include "qemu/option.h"
     26 #include "qemu/cutils.h"
     27 #include "qemu/units.h"
     28 #include "qemu/datadir.h"
     29 #include "qemu/guest-random.h"
     30 #include "qapi/error.h"
     31 #include "qapi/qmp/qerror.h"
     32 #include "qapi/qapi-visit-common.h"
     33 #include "qapi/clone-visitor.h"
     34 #include "qapi/qapi-visit-machine.h"
     35 #include "qapi/visitor.h"
     36 #include "sysemu/qtest.h"
     37 #include "sysemu/whpx.h"
     38 #include "sysemu/numa.h"
     39 #include "sysemu/replay.h"
     40 #include "sysemu/reset.h"
     41 #include "sysemu/sysemu.h"
     42 #include "sysemu/cpu-timers.h"
     43 #include "sysemu/xen.h"
     44 #include "trace.h"
     45 
     46 #include "hw/i386/x86.h"
     47 #include "target/i386/cpu.h"
     48 #include "hw/i386/topology.h"
     49 #include "hw/i386/fw_cfg.h"
     50 #include "hw/intc/i8259.h"
     51 #include "hw/rtc/mc146818rtc.h"
     52 #include "target/i386/sev.h"
     53 
     54 #include "hw/acpi/cpu_hotplug.h"
     55 #include "hw/irq.h"
     56 #include "hw/nmi.h"
     57 #include "hw/loader.h"
     58 #include "multiboot.h"
     59 #include "elf.h"
     60 #include "standard-headers/asm-x86/bootparam.h"
     61 #include CONFIG_DEVICES
     62 #include "kvm/kvm_i386.h"
     63 
     64 /* Physical Address of PVH entry point read from kernel ELF NOTE */
     65 static size_t pvh_start_addr;
     66 
     67 inline void init_topo_info(X86CPUTopoInfo *topo_info,
     68                            const X86MachineState *x86ms)
     69 {
     70     MachineState *ms = MACHINE(x86ms);
     71 
     72     topo_info->dies_per_pkg = ms->smp.dies;
     73     topo_info->cores_per_die = ms->smp.cores;
     74     topo_info->threads_per_core = ms->smp.threads;
     75 }
     76 
     77 /*
     78  * Calculates initial APIC ID for a specific CPU index
     79  *
     80  * Currently we need to be able to calculate the APIC ID from the CPU index
     81  * alone (without requiring a CPU object), as the QEMU<->Seabios interfaces have
     82  * no concept of "CPU index", and the NUMA tables on fw_cfg need the APIC ID of
     83  * all CPUs up to max_cpus.
     84  */
     85 uint32_t x86_cpu_apic_id_from_index(X86MachineState *x86ms,
     86                                     unsigned int cpu_index)
     87 {
     88     X86CPUTopoInfo topo_info;
     89 
     90     init_topo_info(&topo_info, x86ms);
     91 
     92     return x86_apicid_from_cpu_idx(&topo_info, cpu_index);
     93 }
     94 
     95 
     96 void x86_cpu_new(X86MachineState *x86ms, int64_t apic_id, Error **errp)
     97 {
     98     Object *cpu = object_new(MACHINE(x86ms)->cpu_type);
     99 
    100     if (!object_property_set_uint(cpu, "apic-id", apic_id, errp)) {
    101         goto out;
    102     }
    103     qdev_realize(DEVICE(cpu), NULL, errp);
    104 
    105 out:
    106     object_unref(cpu);
    107 }
    108 
    109 void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version)
    110 {
    111     int i;
    112     const CPUArchIdList *possible_cpus;
    113     MachineState *ms = MACHINE(x86ms);
    114     MachineClass *mc = MACHINE_GET_CLASS(x86ms);
    115 
    116     x86_cpu_set_default_version(default_cpu_version);
    117 
    118     /*
    119      * Calculates the limit to CPU APIC ID values
    120      *
    121      * Limit for the APIC ID value, so that all
    122      * CPU APIC IDs are < x86ms->apic_id_limit.
    123      *
    124      * This is used for FW_CFG_MAX_CPUS. See comments on fw_cfg_arch_create().
    125      */
    126     x86ms->apic_id_limit = x86_cpu_apic_id_from_index(x86ms,
    127                                                       ms->smp.max_cpus - 1) + 1;
    128 
    129     /*
    130      * Can we support APIC ID 255 or higher?
    131      *
    132      * Under Xen: yes.
    133      * With userspace emulated lapic: no
    134      * With KVM's in-kernel lapic: only if X2APIC API is enabled.
    135      */
    136     if (x86ms->apic_id_limit > 255 && !xen_enabled() &&
    137         (!kvm_irqchip_in_kernel() || !kvm_enable_x2apic())) {
    138         error_report("current -smp configuration requires kernel "
    139                      "irqchip and X2APIC API support.");
    140         exit(EXIT_FAILURE);
    141     }
    142 
    143     if (kvm_enabled()) {
    144         kvm_set_max_apic_id(x86ms->apic_id_limit);
    145     }
    146 
    147     possible_cpus = mc->possible_cpu_arch_ids(ms);
    148     for (i = 0; i < ms->smp.cpus; i++) {
    149         x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal);
    150     }
    151 }
    152 
    153 void x86_rtc_set_cpus_count(ISADevice *rtc, uint16_t cpus_count)
    154 {
    155     if (cpus_count > 0xff) {
    156         /*
    157          * If the number of CPUs can't be represented in 8 bits, the
    158          * BIOS must use "FW_CFG_NB_CPUS". Set RTC field to 0 just
    159          * to make old BIOSes fail more predictably.
    160          */
    161         rtc_set_memory(rtc, 0x5f, 0);
    162     } else {
    163         rtc_set_memory(rtc, 0x5f, cpus_count - 1);
    164     }
    165 }
    166 
    167 static int x86_apic_cmp(const void *a, const void *b)
    168 {
    169    CPUArchId *apic_a = (CPUArchId *)a;
    170    CPUArchId *apic_b = (CPUArchId *)b;
    171 
    172    return apic_a->arch_id - apic_b->arch_id;
    173 }
    174 
    175 /*
    176  * returns pointer to CPUArchId descriptor that matches CPU's apic_id
    177  * in ms->possible_cpus->cpus, if ms->possible_cpus->cpus has no
    178  * entry corresponding to CPU's apic_id returns NULL.
    179  */
    180 CPUArchId *x86_find_cpu_slot(MachineState *ms, uint32_t id, int *idx)
    181 {
    182     CPUArchId apic_id, *found_cpu;
    183 
    184     apic_id.arch_id = id;
    185     found_cpu = bsearch(&apic_id, ms->possible_cpus->cpus,
    186         ms->possible_cpus->len, sizeof(*ms->possible_cpus->cpus),
    187         x86_apic_cmp);
    188     if (found_cpu && idx) {
    189         *idx = found_cpu - ms->possible_cpus->cpus;
    190     }
    191     return found_cpu;
    192 }
    193 
    194 void x86_cpu_plug(HotplugHandler *hotplug_dev,
    195                   DeviceState *dev, Error **errp)
    196 {
    197     CPUArchId *found_cpu;
    198     Error *local_err = NULL;
    199     X86CPU *cpu = X86_CPU(dev);
    200     X86MachineState *x86ms = X86_MACHINE(hotplug_dev);
    201 
    202     if (x86ms->acpi_dev) {
    203         hotplug_handler_plug(x86ms->acpi_dev, dev, &local_err);
    204         if (local_err) {
    205             goto out;
    206         }
    207     }
    208 
    209     /* increment the number of CPUs */
    210     x86ms->boot_cpus++;
    211     if (x86ms->rtc) {
    212         x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus);
    213     }
    214     if (x86ms->fw_cfg) {
    215         fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
    216     }
    217 
    218     found_cpu = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, NULL);
    219     found_cpu->cpu = OBJECT(dev);
    220 out:
    221     error_propagate(errp, local_err);
    222 }
    223 
    224 void x86_cpu_unplug_request_cb(HotplugHandler *hotplug_dev,
    225                                DeviceState *dev, Error **errp)
    226 {
    227     int idx = -1;
    228     X86CPU *cpu = X86_CPU(dev);
    229     X86MachineState *x86ms = X86_MACHINE(hotplug_dev);
    230 
    231     if (!x86ms->acpi_dev) {
    232         error_setg(errp, "CPU hot unplug not supported without ACPI");
    233         return;
    234     }
    235 
    236     x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, &idx);
    237     assert(idx != -1);
    238     if (idx == 0) {
    239         error_setg(errp, "Boot CPU is unpluggable");
    240         return;
    241     }
    242 
    243     hotplug_handler_unplug_request(x86ms->acpi_dev, dev,
    244                                    errp);
    245 }
    246 
    247 void x86_cpu_unplug_cb(HotplugHandler *hotplug_dev,
    248                        DeviceState *dev, Error **errp)
    249 {
    250     CPUArchId *found_cpu;
    251     Error *local_err = NULL;
    252     X86CPU *cpu = X86_CPU(dev);
    253     X86MachineState *x86ms = X86_MACHINE(hotplug_dev);
    254 
    255     hotplug_handler_unplug(x86ms->acpi_dev, dev, &local_err);
    256     if (local_err) {
    257         goto out;
    258     }
    259 
    260     found_cpu = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, NULL);
    261     found_cpu->cpu = NULL;
    262     qdev_unrealize(dev);
    263 
    264     /* decrement the number of CPUs */
    265     x86ms->boot_cpus--;
    266     /* Update the number of CPUs in CMOS */
    267     x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus);
    268     fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
    269  out:
    270     error_propagate(errp, local_err);
    271 }
    272 
    273 void x86_cpu_pre_plug(HotplugHandler *hotplug_dev,
    274                       DeviceState *dev, Error **errp)
    275 {
    276     int idx;
    277     CPUState *cs;
    278     CPUArchId *cpu_slot;
    279     X86CPUTopoIDs topo_ids;
    280     X86CPU *cpu = X86_CPU(dev);
    281     CPUX86State *env = &cpu->env;
    282     MachineState *ms = MACHINE(hotplug_dev);
    283     X86MachineState *x86ms = X86_MACHINE(hotplug_dev);
    284     unsigned int smp_cores = ms->smp.cores;
    285     unsigned int smp_threads = ms->smp.threads;
    286     X86CPUTopoInfo topo_info;
    287 
    288     if (!object_dynamic_cast(OBJECT(cpu), ms->cpu_type)) {
    289         error_setg(errp, "Invalid CPU type, expected cpu type: '%s'",
    290                    ms->cpu_type);
    291         return;
    292     }
    293 
    294     if (x86ms->acpi_dev) {
    295         Error *local_err = NULL;
    296 
    297         hotplug_handler_pre_plug(HOTPLUG_HANDLER(x86ms->acpi_dev), dev,
    298                                  &local_err);
    299         if (local_err) {
    300             error_propagate(errp, local_err);
    301             return;
    302         }
    303     }
    304 
    305     init_topo_info(&topo_info, x86ms);
    306 
    307     env->nr_dies = ms->smp.dies;
    308 
    309     /*
    310      * If APIC ID is not set,
    311      * set it based on socket/die/core/thread properties.
    312      */
    313     if (cpu->apic_id == UNASSIGNED_APIC_ID) {
    314         int max_socket = (ms->smp.max_cpus - 1) /
    315                                 smp_threads / smp_cores / ms->smp.dies;
    316 
    317         /*
    318          * die-id was optional in QEMU 4.0 and older, so keep it optional
    319          * if there's only one die per socket.
    320          */
    321         if (cpu->die_id < 0 && ms->smp.dies == 1) {
    322             cpu->die_id = 0;
    323         }
    324 
    325         if (cpu->socket_id < 0) {
    326             error_setg(errp, "CPU socket-id is not set");
    327             return;
    328         } else if (cpu->socket_id > max_socket) {
    329             error_setg(errp, "Invalid CPU socket-id: %u must be in range 0:%u",
    330                        cpu->socket_id, max_socket);
    331             return;
    332         }
    333         if (cpu->die_id < 0) {
    334             error_setg(errp, "CPU die-id is not set");
    335             return;
    336         } else if (cpu->die_id > ms->smp.dies - 1) {
    337             error_setg(errp, "Invalid CPU die-id: %u must be in range 0:%u",
    338                        cpu->die_id, ms->smp.dies - 1);
    339             return;
    340         }
    341         if (cpu->core_id < 0) {
    342             error_setg(errp, "CPU core-id is not set");
    343             return;
    344         } else if (cpu->core_id > (smp_cores - 1)) {
    345             error_setg(errp, "Invalid CPU core-id: %u must be in range 0:%u",
    346                        cpu->core_id, smp_cores - 1);
    347             return;
    348         }
    349         if (cpu->thread_id < 0) {
    350             error_setg(errp, "CPU thread-id is not set");
    351             return;
    352         } else if (cpu->thread_id > (smp_threads - 1)) {
    353             error_setg(errp, "Invalid CPU thread-id: %u must be in range 0:%u",
    354                        cpu->thread_id, smp_threads - 1);
    355             return;
    356         }
    357 
    358         topo_ids.pkg_id = cpu->socket_id;
    359         topo_ids.die_id = cpu->die_id;
    360         topo_ids.core_id = cpu->core_id;
    361         topo_ids.smt_id = cpu->thread_id;
    362         cpu->apic_id = x86_apicid_from_topo_ids(&topo_info, &topo_ids);
    363     }
    364 
    365     cpu_slot = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, &idx);
    366     if (!cpu_slot) {
    367         MachineState *ms = MACHINE(x86ms);
    368 
    369         x86_topo_ids_from_apicid(cpu->apic_id, &topo_info, &topo_ids);
    370         error_setg(errp,
    371             "Invalid CPU [socket: %u, die: %u, core: %u, thread: %u] with"
    372             " APIC ID %" PRIu32 ", valid index range 0:%d",
    373             topo_ids.pkg_id, topo_ids.die_id, topo_ids.core_id, topo_ids.smt_id,
    374             cpu->apic_id, ms->possible_cpus->len - 1);
    375         return;
    376     }
    377 
    378     if (cpu_slot->cpu) {
    379         error_setg(errp, "CPU[%d] with APIC ID %" PRIu32 " exists",
    380                    idx, cpu->apic_id);
    381         return;
    382     }
    383 
    384     /* if 'address' properties socket-id/core-id/thread-id are not set, set them
    385      * so that machine_query_hotpluggable_cpus would show correct values
    386      */
    387     /* TODO: move socket_id/core_id/thread_id checks into x86_cpu_realizefn()
    388      * once -smp refactoring is complete and there will be CPU private
    389      * CPUState::nr_cores and CPUState::nr_threads fields instead of globals */
    390     x86_topo_ids_from_apicid(cpu->apic_id, &topo_info, &topo_ids);
    391     if (cpu->socket_id != -1 && cpu->socket_id != topo_ids.pkg_id) {
    392         error_setg(errp, "property socket-id: %u doesn't match set apic-id:"
    393             " 0x%x (socket-id: %u)", cpu->socket_id, cpu->apic_id,
    394             topo_ids.pkg_id);
    395         return;
    396     }
    397     cpu->socket_id = topo_ids.pkg_id;
    398 
    399     if (cpu->die_id != -1 && cpu->die_id != topo_ids.die_id) {
    400         error_setg(errp, "property die-id: %u doesn't match set apic-id:"
    401             " 0x%x (die-id: %u)", cpu->die_id, cpu->apic_id, topo_ids.die_id);
    402         return;
    403     }
    404     cpu->die_id = topo_ids.die_id;
    405 
    406     if (cpu->core_id != -1 && cpu->core_id != topo_ids.core_id) {
    407         error_setg(errp, "property core-id: %u doesn't match set apic-id:"
    408             " 0x%x (core-id: %u)", cpu->core_id, cpu->apic_id,
    409             topo_ids.core_id);
    410         return;
    411     }
    412     cpu->core_id = topo_ids.core_id;
    413 
    414     if (cpu->thread_id != -1 && cpu->thread_id != topo_ids.smt_id) {
    415         error_setg(errp, "property thread-id: %u doesn't match set apic-id:"
    416             " 0x%x (thread-id: %u)", cpu->thread_id, cpu->apic_id,
    417             topo_ids.smt_id);
    418         return;
    419     }
    420     cpu->thread_id = topo_ids.smt_id;
    421 
    422     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) &&
    423         !kvm_hv_vpindex_settable()) {
    424         error_setg(errp, "kernel doesn't allow setting HyperV VP_INDEX");
    425         return;
    426     }
    427 
    428     cs = CPU(cpu);
    429     cs->cpu_index = idx;
    430 
    431     numa_cpu_pre_plug(cpu_slot, dev, errp);
    432 }
    433 
    434 CpuInstanceProperties
    435 x86_cpu_index_to_props(MachineState *ms, unsigned cpu_index)
    436 {
    437     MachineClass *mc = MACHINE_GET_CLASS(ms);
    438     const CPUArchIdList *possible_cpus = mc->possible_cpu_arch_ids(ms);
    439 
    440     assert(cpu_index < possible_cpus->len);
    441     return possible_cpus->cpus[cpu_index].props;
    442 }
    443 
    444 int64_t x86_get_default_cpu_node_id(const MachineState *ms, int idx)
    445 {
    446    X86CPUTopoIDs topo_ids;
    447    X86MachineState *x86ms = X86_MACHINE(ms);
    448    X86CPUTopoInfo topo_info;
    449 
    450    init_topo_info(&topo_info, x86ms);
    451 
    452    assert(idx < ms->possible_cpus->len);
    453    x86_topo_ids_from_apicid(ms->possible_cpus->cpus[idx].arch_id,
    454                             &topo_info, &topo_ids);
    455    return topo_ids.pkg_id % ms->numa_state->num_nodes;
    456 }
    457 
    458 const CPUArchIdList *x86_possible_cpu_arch_ids(MachineState *ms)
    459 {
    460     X86MachineState *x86ms = X86_MACHINE(ms);
    461     unsigned int max_cpus = ms->smp.max_cpus;
    462     X86CPUTopoInfo topo_info;
    463     int i;
    464 
    465     if (ms->possible_cpus) {
    466         /*
    467          * make sure that max_cpus hasn't changed since the first use, i.e.
    468          * -smp hasn't been parsed after it
    469          */
    470         assert(ms->possible_cpus->len == max_cpus);
    471         return ms->possible_cpus;
    472     }
    473 
    474     ms->possible_cpus = g_malloc0(sizeof(CPUArchIdList) +
    475                                   sizeof(CPUArchId) * max_cpus);
    476     ms->possible_cpus->len = max_cpus;
    477 
    478     init_topo_info(&topo_info, x86ms);
    479 
    480     for (i = 0; i < ms->possible_cpus->len; i++) {
    481         X86CPUTopoIDs topo_ids;
    482 
    483         ms->possible_cpus->cpus[i].type = ms->cpu_type;
    484         ms->possible_cpus->cpus[i].vcpus_count = 1;
    485         ms->possible_cpus->cpus[i].arch_id =
    486             x86_cpu_apic_id_from_index(x86ms, i);
    487         x86_topo_ids_from_apicid(ms->possible_cpus->cpus[i].arch_id,
    488                                  &topo_info, &topo_ids);
    489         ms->possible_cpus->cpus[i].props.has_socket_id = true;
    490         ms->possible_cpus->cpus[i].props.socket_id = topo_ids.pkg_id;
    491         if (ms->smp.dies > 1) {
    492             ms->possible_cpus->cpus[i].props.has_die_id = true;
    493             ms->possible_cpus->cpus[i].props.die_id = topo_ids.die_id;
    494         }
    495         ms->possible_cpus->cpus[i].props.has_core_id = true;
    496         ms->possible_cpus->cpus[i].props.core_id = topo_ids.core_id;
    497         ms->possible_cpus->cpus[i].props.has_thread_id = true;
    498         ms->possible_cpus->cpus[i].props.thread_id = topo_ids.smt_id;
    499     }
    500     return ms->possible_cpus;
    501 }
    502 
    503 static void x86_nmi(NMIState *n, int cpu_index, Error **errp)
    504 {
    505     /* cpu index isn't used */
    506     CPUState *cs;
    507 
    508     CPU_FOREACH(cs) {
    509         X86CPU *cpu = X86_CPU(cs);
    510 
    511         if (!cpu->apic_state) {
    512             cpu_interrupt(cs, CPU_INTERRUPT_NMI);
    513         } else {
    514             apic_deliver_nmi(cpu->apic_state);
    515         }
    516     }
    517 }
    518 
    519 static long get_file_size(FILE *f)
    520 {
    521     long where, size;
    522 
    523     /* XXX: on Unix systems, using fstat() probably makes more sense */
    524 
    525     where = ftell(f);
    526     fseek(f, 0, SEEK_END);
    527     size = ftell(f);
    528     fseek(f, where, SEEK_SET);
    529 
    530     return size;
    531 }
    532 
    533 /* TSC handling */
    534 uint64_t cpu_get_tsc(CPUX86State *env)
    535 {
    536     return cpus_get_elapsed_ticks();
    537 }
    538 
    539 /* IRQ handling */
    540 static void pic_irq_request(void *opaque, int irq, int level)
    541 {
    542     CPUState *cs = first_cpu;
    543     X86CPU *cpu = X86_CPU(cs);
    544 
    545     trace_x86_pic_interrupt(irq, level);
    546     if (cpu->apic_state && !kvm_irqchip_in_kernel() &&
    547         !whpx_apic_in_platform()) {
    548         CPU_FOREACH(cs) {
    549             cpu = X86_CPU(cs);
    550             if (apic_accept_pic_intr(cpu->apic_state)) {
    551                 apic_deliver_pic_intr(cpu->apic_state, level);
    552             }
    553         }
    554     } else {
    555         if (level) {
    556             cpu_interrupt(cs, CPU_INTERRUPT_HARD);
    557         } else {
    558             cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD);
    559         }
    560     }
    561 }
    562 
    563 qemu_irq x86_allocate_cpu_irq(void)
    564 {
    565     return qemu_allocate_irq(pic_irq_request, NULL, 0);
    566 }
    567 
    568 int cpu_get_pic_interrupt(CPUX86State *env)
    569 {
    570     X86CPU *cpu = env_archcpu(env);
    571     int intno;
    572 
    573     if (!kvm_irqchip_in_kernel() && !whpx_apic_in_platform()) {
    574         intno = apic_get_interrupt(cpu->apic_state);
    575         if (intno >= 0) {
    576             return intno;
    577         }
    578         /* read the irq from the PIC */
    579         if (!apic_accept_pic_intr(cpu->apic_state)) {
    580             return -1;
    581         }
    582     }
    583 
    584     intno = pic_read_irq(isa_pic);
    585     return intno;
    586 }
    587 
    588 DeviceState *cpu_get_current_apic(void)
    589 {
    590     if (current_cpu) {
    591         X86CPU *cpu = X86_CPU(current_cpu);
    592         return cpu->apic_state;
    593     } else {
    594         return NULL;
    595     }
    596 }
    597 
/*
 * Route GSI @n at @level to the interrupt controllers held in the GSIState
 * passed as @opaque.
 *
 *  - n below ISA_NUM_IRQS: forwarded to the i8259 line (if one is set) and
 *    then, through the fallthrough, to the IOAPIC pin with the same number;
 *  - n from ISA_NUM_IRQS up to IOAPIC_NUM_PINS - 1: forwarded to the IOAPIC
 *    pin only;
 *  - n within IOAPIC_NUM_PINS of IO_APIC_SECONDARY_IRQBASE: forwarded to the
 *    secondary IOAPIC, using the offset from that base as pin number.
 *
 * There is no default case: any other @n is traced and otherwise ignored.
 */
void gsi_handler(void *opaque, int n, int level)
{
    GSIState *s = opaque;

    trace_x86_gsi_interrupt(n, level);
    switch (n) {
    case 0 ... ISA_NUM_IRQS - 1:
        if (s->i8259_irq[n]) {
            /* Under KVM, Kernel will forward to both PIC and IOAPIC */
            qemu_set_irq(s->i8259_irq[n], level);
        }
        /* fall through */
    case ISA_NUM_IRQS ... IOAPIC_NUM_PINS - 1:
        qemu_set_irq(s->ioapic_irq[n], level);
        break;
    case IO_APIC_SECONDARY_IRQBASE
        ... IO_APIC_SECONDARY_IRQBASE + IOAPIC_NUM_PINS - 1:
        /* Pin index on the secondary IOAPIC is relative to its IRQ base. */
        qemu_set_irq(s->ioapic2_irq[n - IO_APIC_SECONDARY_IRQBASE], level);
        break;
    }
}
    619 
    620 void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name)
    621 {
    622     DeviceState *dev;
    623     SysBusDevice *d;
    624     unsigned int i;
    625 
    626     assert(parent_name);
    627     if (kvm_ioapic_in_kernel()) {
    628         dev = qdev_new(TYPE_KVM_IOAPIC);
    629     } else {
    630         dev = qdev_new(TYPE_IOAPIC);
    631     }
    632     object_property_add_child(object_resolve_path(parent_name, NULL),
    633                               "ioapic", OBJECT(dev));
    634     d = SYS_BUS_DEVICE(dev);
    635     sysbus_realize_and_unref(d, &error_fatal);
    636     sysbus_mmio_map(d, 0, IO_APIC_DEFAULT_ADDRESS);
    637 
    638     for (i = 0; i < IOAPIC_NUM_PINS; i++) {
    639         gsi_state->ioapic_irq[i] = qdev_get_gpio_in(dev, i);
    640     }
    641 }
    642 
    643 DeviceState *ioapic_init_secondary(GSIState *gsi_state)
    644 {
    645     DeviceState *dev;
    646     SysBusDevice *d;
    647     unsigned int i;
    648 
    649     dev = qdev_new(TYPE_IOAPIC);
    650     d = SYS_BUS_DEVICE(dev);
    651     sysbus_realize_and_unref(d, &error_fatal);
    652     sysbus_mmio_map(d, 0, IO_APIC_SECONDARY_ADDRESS);
    653 
    654     for (i = 0; i < IOAPIC_NUM_PINS; i++) {
    655         gsi_state->ioapic2_irq[i] = qdev_get_gpio_in(dev, i);
    656     }
    657     return dev;
    658 }
    659 
/*
 * Packed header followed by a variable-length payload.
 *
 * NOTE(review): presumably mirrors 'struct setup_data' of the Linux x86 boot
 * protocol (see bootparam.h) - confirm before changing the layout.
 */
typedef struct SetupData {
    uint64_t next;   /* presumably the address of the next entry - confirm */
    uint32_t type;   /* entry type tag */
    uint32_t len;    /* payload length; read with le32_to_cpu() in this file */
    uint8_t data[];  /* payload, 'len' bytes */
} __attribute__((packed)) SetupData;
    666 
    667 
/*
 * The entry point into the kernel for PVH boot is different from
 * the native entry point.  The PVH entry is defined by the x86/HVM
 * direct boot ABI and is available in an ELFNOTE in the kernel binary.
 *
 * This function is passed to load_elf() when it is called from
 * load_elfboot() which then additionally checks for an ELF Note of
 * type XEN_ELFNOTE_PHYS32_ENTRY and passes it to this function to
 * parse the PVH entry address from the ELF Note.
 *
 * Due to trickery in elf_ops.h, load_elf() is actually available as
 * load_elf32() or load_elf64() and this routine needs to be able
 * to deal with being called as 32 or 64 bit.
 *
 * The address of the PVH entry point is saved to the 'pvh_start_addr'
 * global variable.  (although the entry point is 32-bit, the kernel
 * binary can be either 32-bit or 64-bit).
 *
 * @arg1: pointer to the ELF note header (struct elf64_note when @is64,
 *        struct elf32_note otherwise); NULL makes the function return 0
 *        without touching 'pvh_start_addr'.
 * @arg2: pointer to the alignment applied to the note name size, read as
 *        uint64_t when @is64 and as uint32_t otherwise.  It is dereferenced
 *        without a NULL check.
 * @is64: selects the 64-bit or 32-bit note layout.
 *
 * Returns the value stored in 'pvh_start_addr'.
 */
static uint64_t read_pvh_start_addr(void *arg1, void *arg2, bool is64)
{
    size_t *elf_note_data_addr;

    /* Check if ELF Note header passed in is valid */
    if (arg1 == NULL) {
        return 0;
    }

    if (is64) {
        struct elf64_note *nhdr64 = (struct elf64_note *)arg1;
        uint64_t nhdr_size64 = sizeof(struct elf64_note);
        uint64_t phdr_align = *(uint64_t *)arg2;
        uint64_t nhdr_namesz = nhdr64->n_namesz;

        /* Note data follows the header and the alignment-padded name. */
        elf_note_data_addr =
            ((void *)nhdr64) + nhdr_size64 +
            QEMU_ALIGN_UP(nhdr_namesz, phdr_align);

        /* Reads a host size_t from the note data. */
        pvh_start_addr = *elf_note_data_addr;
    } else {
        struct elf32_note *nhdr32 = (struct elf32_note *)arg1;
        uint32_t nhdr_size32 = sizeof(struct elf32_note);
        uint32_t phdr_align = *(uint32_t *)arg2;
        uint32_t nhdr_namesz = nhdr32->n_namesz;

        /* Note data follows the header and the alignment-padded name. */
        elf_note_data_addr =
            ((void *)nhdr32) + nhdr_size32 +
            QEMU_ALIGN_UP(nhdr_namesz, phdr_align);

        /* Only the first 32 bits of the note data are read here. */
        pvh_start_addr = *(uint32_t *)elf_note_data_addr;
    }

    return pvh_start_addr;
}
    721 
    722 static bool load_elfboot(const char *kernel_filename,
    723                          int kernel_file_size,
    724                          uint8_t *header,
    725                          size_t pvh_xen_start_addr,
    726                          FWCfgState *fw_cfg)
    727 {
    728     uint32_t flags = 0;
    729     uint32_t mh_load_addr = 0;
    730     uint32_t elf_kernel_size = 0;
    731     uint64_t elf_entry;
    732     uint64_t elf_low, elf_high;
    733     int kernel_size;
    734 
    735     if (ldl_p(header) != 0x464c457f) {
    736         return false; /* no elfboot */
    737     }
    738 
    739     bool elf_is64 = header[EI_CLASS] == ELFCLASS64;
    740     flags = elf_is64 ?
    741         ((Elf64_Ehdr *)header)->e_flags : ((Elf32_Ehdr *)header)->e_flags;
    742 
    743     if (flags & 0x00010004) { /* LOAD_ELF_HEADER_HAS_ADDR */
    744         error_report("elfboot unsupported flags = %x", flags);
    745         exit(1);
    746     }
    747 
    748     uint64_t elf_note_type = XEN_ELFNOTE_PHYS32_ENTRY;
    749     kernel_size = load_elf(kernel_filename, read_pvh_start_addr,
    750                            NULL, &elf_note_type, &elf_entry,
    751                            &elf_low, &elf_high, NULL, 0, I386_ELF_MACHINE,
    752                            0, 0);
    753 
    754     if (kernel_size < 0) {
    755         error_report("Error while loading elf kernel");
    756         exit(1);
    757     }
    758     mh_load_addr = elf_low;
    759     elf_kernel_size = elf_high - elf_low;
    760 
    761     if (pvh_start_addr == 0) {
    762         error_report("Error loading uncompressed kernel without PVH ELF Note");
    763         exit(1);
    764     }
    765     fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ENTRY, pvh_start_addr);
    766     fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, mh_load_addr);
    767     fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, elf_kernel_size);
    768 
    769     return true;
    770 }
    771 
/*
 * State for patching one 64-bit value in place and restoring it later; see
 * fixup_setup_data() and reset_setup_data().
 */
typedef struct SetupDataFixup {
    void *pos;                  /* location the 64-bit value is stored to */
    hwaddr orig_val, new_val;   /* value written on reset / on fixup */
    uint32_t addr;              /* not used by the helpers above - TODO confirm purpose */
} SetupDataFixup;
    777 
    778 static void fixup_setup_data(void *opaque)
    779 {
    780     SetupDataFixup *fixup = opaque;
    781     stq_p(fixup->pos, fixup->new_val);
    782 }
    783 
    784 static void reset_setup_data(void *opaque)
    785 {
    786     SetupDataFixup *fixup = opaque;
    787     stq_p(fixup->pos, fixup->orig_val);
    788 }
    789 
    790 static void reset_rng_seed(void *opaque)
    791 {
    792     SetupData *setup_data = opaque;
    793     qemu_guest_getrandom_nofail(setup_data->data, le32_to_cpu(setup_data->len));
    794 }
    795 
    796 void x86_load_linux(X86MachineState *x86ms,
    797                     FWCfgState *fw_cfg,
    798                     int acpi_data_size,
    799                     bool pvh_enabled,
    800                     bool legacy_no_rng_seed)
    801 {
    802     bool linuxboot_dma_enabled = X86_MACHINE_GET_CLASS(x86ms)->fwcfg_dma_enabled;
    803     uint16_t protocol;
    804     int setup_size, kernel_size, cmdline_size;
    805     int dtb_size, setup_data_offset;
    806     uint32_t initrd_max;
    807     uint8_t header[8192], *setup, *kernel;
    808     hwaddr real_addr, prot_addr, cmdline_addr, initrd_addr = 0, first_setup_data = 0;
    809     FILE *f;
    810     char *vmode;
    811     MachineState *machine = MACHINE(x86ms);
    812     SetupData *setup_data;
    813     const char *kernel_filename = machine->kernel_filename;
    814     const char *initrd_filename = machine->initrd_filename;
    815     const char *dtb_filename = machine->dtb;
    816     const char *kernel_cmdline = machine->kernel_cmdline;
    817     SevKernelLoaderContext sev_load_ctx = {};
    818     enum { RNG_SEED_LENGTH = 32 };
    819 
    820     /* Align to 16 bytes as a paranoia measure */
    821     cmdline_size = (strlen(kernel_cmdline) + 16) & ~15;
    822 
    823     /* load the kernel header */
    824     f = fopen(kernel_filename, "rb");
    825     if (!f) {
    826         fprintf(stderr, "qemu: could not open kernel file '%s': %s\n",
    827                 kernel_filename, strerror(errno));
    828         exit(1);
    829     }
    830 
    831     kernel_size = get_file_size(f);
    832     if (!kernel_size ||
    833         fread(header, 1, MIN(ARRAY_SIZE(header), kernel_size), f) !=
    834         MIN(ARRAY_SIZE(header), kernel_size)) {
    835         fprintf(stderr, "qemu: could not load kernel '%s': %s\n",
    836                 kernel_filename, strerror(errno));
    837         exit(1);
    838     }
    839 
    840     /* kernel protocol version */
    841     if (ldl_p(header + 0x202) == 0x53726448) {
    842         protocol = lduw_p(header + 0x206);
    843     } else {
    844         /*
    845          * This could be a multiboot kernel. If it is, let's stop treating it
    846          * like a Linux kernel.
    847          * Note: some multiboot images could be in the ELF format (the same of
    848          * PVH), so we try multiboot first since we check the multiboot magic
    849          * header before to load it.
    850          */
    851         if (load_multiboot(x86ms, fw_cfg, f, kernel_filename, initrd_filename,
    852                            kernel_cmdline, kernel_size, header)) {
    853             return;
    854         }
    855         /*
    856          * Check if the file is an uncompressed kernel file (ELF) and load it,
    857          * saving the PVH entry point used by the x86/HVM direct boot ABI.
    858          * If load_elfboot() is successful, populate the fw_cfg info.
    859          */
    860         if (pvh_enabled &&
    861             load_elfboot(kernel_filename, kernel_size,
    862                          header, pvh_start_addr, fw_cfg)) {
    863             fclose(f);
    864 
    865             fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE,
    866                 strlen(kernel_cmdline) + 1);
    867             fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline);
    868 
    869             fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, sizeof(header));
    870             fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA,
    871                              header, sizeof(header));
    872 
    873             /* load initrd */
    874             if (initrd_filename) {
    875                 GMappedFile *mapped_file;
    876                 gsize initrd_size;
    877                 gchar *initrd_data;
    878                 GError *gerr = NULL;
    879 
    880                 mapped_file = g_mapped_file_new(initrd_filename, false, &gerr);
    881                 if (!mapped_file) {
    882                     fprintf(stderr, "qemu: error reading initrd %s: %s\n",
    883                             initrd_filename, gerr->message);
    884                     exit(1);
    885                 }
    886                 x86ms->initrd_mapped_file = mapped_file;
    887 
    888                 initrd_data = g_mapped_file_get_contents(mapped_file);
    889                 initrd_size = g_mapped_file_get_length(mapped_file);
    890                 initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1;
    891                 if (initrd_size >= initrd_max) {
    892                     fprintf(stderr, "qemu: initrd is too large, cannot support."
    893                             "(max: %"PRIu32", need %"PRId64")\n",
    894                             initrd_max, (uint64_t)initrd_size);
    895                     exit(1);
    896                 }
    897 
    898                 initrd_addr = (initrd_max - initrd_size) & ~4095;
    899 
    900                 fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr);
    901                 fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size);
    902                 fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data,
    903                                  initrd_size);
    904             }
    905 
    906             option_rom[nb_option_roms].bootindex = 0;
    907             option_rom[nb_option_roms].name = "pvh.bin";
    908             nb_option_roms++;
    909 
    910             return;
    911         }
    912         protocol = 0;
    913     }
    914 
    915     if (protocol < 0x200 || !(header[0x211] & 0x01)) {
    916         /* Low kernel */
    917         real_addr    = 0x90000;
    918         cmdline_addr = 0x9a000 - cmdline_size;
    919         prot_addr    = 0x10000;
    920     } else if (protocol < 0x202) {
    921         /* High but ancient kernel */
    922         real_addr    = 0x90000;
    923         cmdline_addr = 0x9a000 - cmdline_size;
    924         prot_addr    = 0x100000;
    925     } else {
    926         /* High and recent kernel */
    927         real_addr    = 0x10000;
    928         cmdline_addr = 0x20000;
    929         prot_addr    = 0x100000;
    930     }
    931 
    932     /* highest address for loading the initrd */
    933     if (protocol >= 0x20c &&
    934         lduw_p(header + 0x236) & XLF_CAN_BE_LOADED_ABOVE_4G) {
    935         /*
    936          * Linux has supported initrd up to 4 GB for a very long time (2007,
    937          * long before XLF_CAN_BE_LOADED_ABOVE_4G which was added in 2013),
    938          * though it only sets initrd_max to 2 GB to "work around bootloader
    939          * bugs". Luckily, QEMU firmware(which does something like bootloader)
    940          * has supported this.
    941          *
    942          * It's believed that if XLF_CAN_BE_LOADED_ABOVE_4G is set, initrd can
    943          * be loaded into any address.
    944          *
    945          * In addition, initrd_max is uint32_t simply because QEMU doesn't
    946          * support the 64-bit boot protocol (specifically the ext_ramdisk_image
    947          * field).
    948          *
    949          * Therefore here just limit initrd_max to UINT32_MAX simply as well.
    950          */
    951         initrd_max = UINT32_MAX;
    952     } else if (protocol >= 0x203) {
    953         initrd_max = ldl_p(header + 0x22c);
    954     } else {
    955         initrd_max = 0x37ffffff;
    956     }
    957 
    958     if (initrd_max >= x86ms->below_4g_mem_size - acpi_data_size) {
    959         initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1;
    960     }
    961 
    962     fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_ADDR, cmdline_addr);
    963     fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, strlen(kernel_cmdline) + 1);
    964     fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline);
    965     sev_load_ctx.cmdline_data = (char *)kernel_cmdline;
    966     sev_load_ctx.cmdline_size = strlen(kernel_cmdline) + 1;
    967 
    968     if (protocol >= 0x202) {
    969         stl_p(header + 0x228, cmdline_addr);
    970     } else {
    971         stw_p(header + 0x20, 0xA33F);
    972         stw_p(header + 0x22, cmdline_addr - real_addr);
    973     }
    974 
    975     /* handle vga= parameter */
    976     vmode = strstr(kernel_cmdline, "vga=");
    977     if (vmode) {
    978         unsigned int video_mode;
    979         const char *end;
    980         int ret;
    981         /* skip "vga=" */
    982         vmode += 4;
    983         if (!strncmp(vmode, "normal", 6)) {
    984             video_mode = 0xffff;
    985         } else if (!strncmp(vmode, "ext", 3)) {
    986             video_mode = 0xfffe;
    987         } else if (!strncmp(vmode, "ask", 3)) {
    988             video_mode = 0xfffd;
    989         } else {
    990             ret = qemu_strtoui(vmode, &end, 0, &video_mode);
    991             if (ret != 0 || (*end && *end != ' ')) {
    992                 fprintf(stderr, "qemu: invalid 'vga=' kernel parameter.\n");
    993                 exit(1);
    994             }
    995         }
    996         stw_p(header + 0x1fa, video_mode);
    997     }
    998 
    999     /* loader type */
   1000     /*
   1001      * High nybble = B reserved for QEMU; low nybble is revision number.
   1002      * If this code is substantially changed, you may want to consider
   1003      * incrementing the revision.
   1004      */
   1005     if (protocol >= 0x200) {
   1006         header[0x210] = 0xB0;
   1007     }
   1008     /* heap */
   1009     if (protocol >= 0x201) {
   1010         header[0x211] |= 0x80; /* CAN_USE_HEAP */
   1011         stw_p(header + 0x224, cmdline_addr - real_addr - 0x200);
   1012     }
   1013 
   1014     /* load initrd */
   1015     if (initrd_filename) {
   1016         GMappedFile *mapped_file;
   1017         gsize initrd_size;
   1018         gchar *initrd_data;
   1019         GError *gerr = NULL;
   1020 
   1021         if (protocol < 0x200) {
   1022             fprintf(stderr, "qemu: linux kernel too old to load a ram disk\n");
   1023             exit(1);
   1024         }
   1025 
   1026         mapped_file = g_mapped_file_new(initrd_filename, false, &gerr);
   1027         if (!mapped_file) {
   1028             fprintf(stderr, "qemu: error reading initrd %s: %s\n",
   1029                     initrd_filename, gerr->message);
   1030             exit(1);
   1031         }
   1032         x86ms->initrd_mapped_file = mapped_file;
   1033 
   1034         initrd_data = g_mapped_file_get_contents(mapped_file);
   1035         initrd_size = g_mapped_file_get_length(mapped_file);
   1036         if (initrd_size >= initrd_max) {
   1037             fprintf(stderr, "qemu: initrd is too large, cannot support."
   1038                     "(max: %"PRIu32", need %"PRId64")\n",
   1039                     initrd_max, (uint64_t)initrd_size);
   1040             exit(1);
   1041         }
   1042 
   1043         initrd_addr = (initrd_max - initrd_size) & ~4095;
   1044 
   1045         fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr);
   1046         fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size);
   1047         fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data, initrd_size);
   1048         sev_load_ctx.initrd_data = initrd_data;
   1049         sev_load_ctx.initrd_size = initrd_size;
   1050 
   1051         stl_p(header + 0x218, initrd_addr);
   1052         stl_p(header + 0x21c, initrd_size);
   1053     }
   1054 
   1055     /* load kernel and setup */
   1056     setup_size = header[0x1f1];
   1057     if (setup_size == 0) {
   1058         setup_size = 4;
   1059     }
   1060     setup_size = (setup_size + 1) * 512;
   1061     if (setup_size > kernel_size) {
   1062         fprintf(stderr, "qemu: invalid kernel header\n");
   1063         exit(1);
   1064     }
   1065     kernel_size -= setup_size;
   1066 
   1067     setup  = g_malloc(setup_size);
   1068     kernel = g_malloc(kernel_size);
   1069     fseek(f, 0, SEEK_SET);
   1070     if (fread(setup, 1, setup_size, f) != setup_size) {
   1071         fprintf(stderr, "fread() failed\n");
   1072         exit(1);
   1073     }
   1074     if (fread(kernel, 1, kernel_size, f) != kernel_size) {
   1075         fprintf(stderr, "fread() failed\n");
   1076         exit(1);
   1077     }
   1078     fclose(f);
   1079 
   1080     /* append dtb to kernel */
   1081     if (dtb_filename) {
   1082         if (protocol < 0x209) {
   1083             fprintf(stderr, "qemu: Linux kernel too old to load a dtb\n");
   1084             exit(1);
   1085         }
   1086 
   1087         dtb_size = get_image_size(dtb_filename);
   1088         if (dtb_size <= 0) {
   1089             fprintf(stderr, "qemu: error reading dtb %s: %s\n",
   1090                     dtb_filename, strerror(errno));
   1091             exit(1);
   1092         }
   1093 
   1094         setup_data_offset = QEMU_ALIGN_UP(kernel_size, 16);
   1095         kernel_size = setup_data_offset + sizeof(SetupData) + dtb_size;
   1096         kernel = g_realloc(kernel, kernel_size);
   1097 
   1098 
   1099         setup_data = (SetupData *)(kernel + setup_data_offset);
   1100         setup_data->next = cpu_to_le64(first_setup_data);
   1101         first_setup_data = prot_addr + setup_data_offset;
   1102         setup_data->type = cpu_to_le32(SETUP_DTB);
   1103         setup_data->len = cpu_to_le32(dtb_size);
   1104 
   1105         load_image_size(dtb_filename, setup_data->data, dtb_size);
   1106     }
   1107 
   1108     if (!legacy_no_rng_seed) {
   1109         setup_data_offset = QEMU_ALIGN_UP(kernel_size, 16);
   1110         kernel_size = setup_data_offset + sizeof(SetupData) + RNG_SEED_LENGTH;
   1111         kernel = g_realloc(kernel, kernel_size);
   1112         setup_data = (SetupData *)(kernel + setup_data_offset);
   1113         setup_data->next = cpu_to_le64(first_setup_data);
   1114         first_setup_data = prot_addr + setup_data_offset;
   1115         setup_data->type = cpu_to_le32(SETUP_RNG_SEED);
   1116         setup_data->len = cpu_to_le32(RNG_SEED_LENGTH);
   1117         qemu_guest_getrandom_nofail(setup_data->data, RNG_SEED_LENGTH);
   1118         qemu_register_reset_nosnapshotload(reset_rng_seed, setup_data);
   1119         fw_cfg_add_bytes_callback(fw_cfg, FW_CFG_KERNEL_DATA, reset_rng_seed, NULL,
   1120                                   setup_data, kernel, kernel_size, true);
   1121     } else {
   1122         fw_cfg_add_bytes(fw_cfg, FW_CFG_KERNEL_DATA, kernel, kernel_size);
   1123     }
   1124 
   1125     fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, prot_addr);
   1126     fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, kernel_size);
   1127     sev_load_ctx.kernel_data = (char *)kernel;
   1128     sev_load_ctx.kernel_size = kernel_size;
   1129 
   1130     /*
   1131      * If we're starting an encrypted VM, it will be OVMF based, which uses the
   1132      * efi stub for booting and doesn't require any values to be placed in the
   1133      * kernel header.  We therefore don't update the header so the hash of the
   1134      * kernel on the other side of the fw_cfg interface matches the hash of the
   1135      * file the user passed in.
   1136      */
   1137     if (!sev_enabled()) {
   1138         SetupDataFixup *fixup = g_malloc(sizeof(*fixup));
   1139 
   1140         memcpy(setup, header, MIN(sizeof(header), setup_size));
   1141         /* Offset 0x250 is a pointer to the first setup_data link. */
   1142         fixup->pos = setup + 0x250;
   1143         fixup->orig_val = ldq_p(fixup->pos);
   1144         fixup->new_val = first_setup_data;
   1145         fixup->addr = cpu_to_le32(real_addr);
   1146         fw_cfg_add_bytes_callback(fw_cfg, FW_CFG_SETUP_ADDR, fixup_setup_data, NULL,
   1147                                   fixup, &fixup->addr, sizeof(fixup->addr), true);
   1148         qemu_register_reset(reset_setup_data, fixup);
   1149     } else {
   1150         fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_ADDR, real_addr);
   1151     }
   1152     fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, setup_size);
   1153     fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA, setup, setup_size);
   1154     sev_load_ctx.setup_data = (char *)setup;
   1155     sev_load_ctx.setup_size = setup_size;
   1156 
   1157     if (sev_enabled()) {
   1158         sev_add_kernel_loader_hashes(&sev_load_ctx, &error_fatal);
   1159     }
   1160 
   1161     option_rom[nb_option_roms].bootindex = 0;
   1162     option_rom[nb_option_roms].name = "linuxboot.bin";
   1163     if (linuxboot_dma_enabled && fw_cfg_dma_enabled(fw_cfg)) {
   1164         option_rom[nb_option_roms].name = "linuxboot_dma.bin";
   1165     }
   1166     nb_option_roms++;
   1167 }
   1168 
   1169 void x86_bios_rom_init(MachineState *ms, const char *default_firmware,
   1170                        MemoryRegion *rom_memory, bool isapc_ram_fw)
   1171 {
   1172     const char *bios_name;
   1173     char *filename;
   1174     MemoryRegion *bios, *isa_bios;
   1175     int bios_size, isa_bios_size;
   1176     ssize_t ret;
   1177 
   1178     /* BIOS load */
   1179     bios_name = ms->firmware ?: default_firmware;
   1180     filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
   1181     if (filename) {
   1182         bios_size = get_image_size(filename);
   1183     } else {
   1184         bios_size = -1;
   1185     }
   1186     if (bios_size <= 0 ||
   1187         (bios_size % 65536) != 0) {
   1188         goto bios_error;
   1189     }
   1190     bios = g_malloc(sizeof(*bios));
   1191     memory_region_init_ram(bios, NULL, "pc.bios", bios_size, &error_fatal);
   1192     if (sev_enabled()) {
   1193         /*
   1194          * The concept of a "reset" simply doesn't exist for
   1195          * confidential computing guests, we have to destroy and
   1196          * re-launch them instead.  So there is no need to register
   1197          * the firmware as rom to properly re-initialize on reset.
   1198          * Just go for a straight file load instead.
   1199          */
   1200         void *ptr = memory_region_get_ram_ptr(bios);
   1201         load_image_size(filename, ptr, bios_size);
   1202         x86_firmware_configure(ptr, bios_size);
   1203     } else {
   1204         if (!isapc_ram_fw) {
   1205             memory_region_set_readonly(bios, true);
   1206         }
   1207         ret = rom_add_file_fixed(bios_name, (uint32_t)(-bios_size), -1);
   1208         if (ret != 0) {
   1209             goto bios_error;
   1210         }
   1211     }
   1212     g_free(filename);
   1213 
   1214     /* map the last 128KB of the BIOS in ISA space */
   1215     isa_bios_size = MIN(bios_size, 128 * KiB);
   1216     isa_bios = g_malloc(sizeof(*isa_bios));
   1217     memory_region_init_alias(isa_bios, NULL, "isa-bios", bios,
   1218                              bios_size - isa_bios_size, isa_bios_size);
   1219     memory_region_add_subregion_overlap(rom_memory,
   1220                                         0x100000 - isa_bios_size,
   1221                                         isa_bios,
   1222                                         1);
   1223     if (!isapc_ram_fw) {
   1224         memory_region_set_readonly(isa_bios, true);
   1225     }
   1226 
   1227     /* map all the bios at the top of memory */
   1228     memory_region_add_subregion(rom_memory,
   1229                                 (uint32_t)(-bios_size),
   1230                                 bios);
   1231     return;
   1232 
   1233 bios_error:
   1234     fprintf(stderr, "qemu: could not load PC BIOS '%s'\n", bios_name);
   1235     exit(1);
   1236 }
   1237 
   1238 bool x86_machine_is_smm_enabled(const X86MachineState *x86ms)
   1239 {
   1240     bool smm_available = false;
   1241 
   1242     if (x86ms->smm == ON_OFF_AUTO_OFF) {
   1243         return false;
   1244     }
   1245 
   1246     if (tcg_enabled() || qtest_enabled()) {
   1247         smm_available = true;
   1248     } else if (kvm_enabled()) {
   1249         smm_available = kvm_has_smm();
   1250     }
   1251 
   1252     if (smm_available) {
   1253         return true;
   1254     }
   1255 
   1256     if (x86ms->smm == ON_OFF_AUTO_ON) {
   1257         error_report("System Management Mode not supported by this hypervisor.");
   1258         exit(1);
   1259     }
   1260     return false;
   1261 }
   1262 
   1263 static void x86_machine_get_smm(Object *obj, Visitor *v, const char *name,
   1264                                void *opaque, Error **errp)
   1265 {
   1266     X86MachineState *x86ms = X86_MACHINE(obj);
   1267     OnOffAuto smm = x86ms->smm;
   1268 
   1269     visit_type_OnOffAuto(v, name, &smm, errp);
   1270 }
   1271 
   1272 static void x86_machine_set_smm(Object *obj, Visitor *v, const char *name,
   1273                                void *opaque, Error **errp)
   1274 {
   1275     X86MachineState *x86ms = X86_MACHINE(obj);
   1276 
   1277     visit_type_OnOffAuto(v, name, &x86ms->smm, errp);
   1278 }
   1279 
   1280 bool x86_machine_is_acpi_enabled(const X86MachineState *x86ms)
   1281 {
   1282     if (x86ms->acpi == ON_OFF_AUTO_OFF) {
   1283         return false;
   1284     }
   1285     return true;
   1286 }
   1287 
   1288 static void x86_machine_get_acpi(Object *obj, Visitor *v, const char *name,
   1289                                  void *opaque, Error **errp)
   1290 {
   1291     X86MachineState *x86ms = X86_MACHINE(obj);
   1292     OnOffAuto acpi = x86ms->acpi;
   1293 
   1294     visit_type_OnOffAuto(v, name, &acpi, errp);
   1295 }
   1296 
   1297 static void x86_machine_set_acpi(Object *obj, Visitor *v, const char *name,
   1298                                  void *opaque, Error **errp)
   1299 {
   1300     X86MachineState *x86ms = X86_MACHINE(obj);
   1301 
   1302     visit_type_OnOffAuto(v, name, &x86ms->acpi, errp);
   1303 }
   1304 
   1305 static void x86_machine_get_pit(Object *obj, Visitor *v, const char *name,
   1306                                     void *opaque, Error **errp)
   1307 {
   1308     X86MachineState *x86ms = X86_MACHINE(obj);
   1309     OnOffAuto pit = x86ms->pit;
   1310 
   1311     visit_type_OnOffAuto(v, name, &pit, errp);
   1312 }
   1313 
   1314 static void x86_machine_set_pit(Object *obj, Visitor *v, const char *name,
   1315                                     void *opaque, Error **errp)
   1316 {
   1317     X86MachineState *x86ms = X86_MACHINE(obj);;
   1318 
   1319     visit_type_OnOffAuto(v, name, &x86ms->pit, errp);
   1320 }
   1321 
   1322 static void x86_machine_get_pic(Object *obj, Visitor *v, const char *name,
   1323                                 void *opaque, Error **errp)
   1324 {
   1325     X86MachineState *x86ms = X86_MACHINE(obj);
   1326     OnOffAuto pic = x86ms->pic;
   1327 
   1328     visit_type_OnOffAuto(v, name, &pic, errp);
   1329 }
   1330 
   1331 static void x86_machine_set_pic(Object *obj, Visitor *v, const char *name,
   1332                                 void *opaque, Error **errp)
   1333 {
   1334     X86MachineState *x86ms = X86_MACHINE(obj);
   1335 
   1336     visit_type_OnOffAuto(v, name, &x86ms->pic, errp);
   1337 }
   1338 
   1339 static char *x86_machine_get_oem_id(Object *obj, Error **errp)
   1340 {
   1341     X86MachineState *x86ms = X86_MACHINE(obj);
   1342 
   1343     return g_strdup(x86ms->oem_id);
   1344 }
   1345 
   1346 static void x86_machine_set_oem_id(Object *obj, const char *value, Error **errp)
   1347 {
   1348     X86MachineState *x86ms = X86_MACHINE(obj);
   1349     size_t len = strlen(value);
   1350 
   1351     if (len > 6) {
   1352         error_setg(errp,
   1353                    "User specified "X86_MACHINE_OEM_ID" value is bigger than "
   1354                    "6 bytes in size");
   1355         return;
   1356     }
   1357 
   1358     strncpy(x86ms->oem_id, value, 6);
   1359 }
   1360 
   1361 static char *x86_machine_get_oem_table_id(Object *obj, Error **errp)
   1362 {
   1363     X86MachineState *x86ms = X86_MACHINE(obj);
   1364 
   1365     return g_strdup(x86ms->oem_table_id);
   1366 }
   1367 
   1368 static void x86_machine_set_oem_table_id(Object *obj, const char *value,
   1369                                          Error **errp)
   1370 {
   1371     X86MachineState *x86ms = X86_MACHINE(obj);
   1372     size_t len = strlen(value);
   1373 
   1374     if (len > 8) {
   1375         error_setg(errp,
   1376                    "User specified "X86_MACHINE_OEM_TABLE_ID
   1377                    " value is bigger than "
   1378                    "8 bytes in size");
   1379         return;
   1380     }
   1381     strncpy(x86ms->oem_table_id, value, 8);
   1382 }
   1383 
   1384 static void x86_machine_get_bus_lock_ratelimit(Object *obj, Visitor *v,
   1385                                 const char *name, void *opaque, Error **errp)
   1386 {
   1387     X86MachineState *x86ms = X86_MACHINE(obj);
   1388     uint64_t bus_lock_ratelimit = x86ms->bus_lock_ratelimit;
   1389 
   1390     visit_type_uint64(v, name, &bus_lock_ratelimit, errp);
   1391 }
   1392 
   1393 static void x86_machine_set_bus_lock_ratelimit(Object *obj, Visitor *v,
   1394                                const char *name, void *opaque, Error **errp)
   1395 {
   1396     X86MachineState *x86ms = X86_MACHINE(obj);
   1397 
   1398     visit_type_uint64(v, name, &x86ms->bus_lock_ratelimit, errp);
   1399 }
   1400 
   1401 static void machine_get_sgx_epc(Object *obj, Visitor *v, const char *name,
   1402                                 void *opaque, Error **errp)
   1403 {
   1404     X86MachineState *x86ms = X86_MACHINE(obj);
   1405     SgxEPCList *list = x86ms->sgx_epc_list;
   1406 
   1407     visit_type_SgxEPCList(v, name, &list, errp);
   1408 }
   1409 
   1410 static void machine_set_sgx_epc(Object *obj, Visitor *v, const char *name,
   1411                                 void *opaque, Error **errp)
   1412 {
   1413     X86MachineState *x86ms = X86_MACHINE(obj);
   1414     SgxEPCList *list;
   1415 
   1416     list = x86ms->sgx_epc_list;
   1417     visit_type_SgxEPCList(v, name, &x86ms->sgx_epc_list, errp);
   1418 
   1419     qapi_free_SgxEPCList(list);
   1420 }
   1421 
   1422 static void x86_machine_initfn(Object *obj)
   1423 {
   1424     X86MachineState *x86ms = X86_MACHINE(obj);
   1425 
   1426     x86ms->smm = ON_OFF_AUTO_AUTO;
   1427     x86ms->acpi = ON_OFF_AUTO_AUTO;
   1428     x86ms->pit = ON_OFF_AUTO_AUTO;
   1429     x86ms->pic = ON_OFF_AUTO_AUTO;
   1430     x86ms->pci_irq_mask = ACPI_BUILD_PCI_IRQS;
   1431     x86ms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6);
   1432     x86ms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8);
   1433     x86ms->bus_lock_ratelimit = 0;
   1434     x86ms->above_4g_mem_start = 4 * GiB;
   1435 }
   1436 
   1437 static void x86_machine_class_init(ObjectClass *oc, void *data)
   1438 {
   1439     MachineClass *mc = MACHINE_CLASS(oc);
   1440     X86MachineClass *x86mc = X86_MACHINE_CLASS(oc);
   1441     NMIClass *nc = NMI_CLASS(oc);
   1442 
   1443     mc->cpu_index_to_instance_props = x86_cpu_index_to_props;
   1444     mc->get_default_cpu_node_id = x86_get_default_cpu_node_id;
   1445     mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids;
   1446     x86mc->save_tsc_khz = true;
   1447     x86mc->fwcfg_dma_enabled = true;
   1448     nc->nmi_monitor_handler = x86_nmi;
   1449 
   1450     object_class_property_add(oc, X86_MACHINE_SMM, "OnOffAuto",
   1451         x86_machine_get_smm, x86_machine_set_smm,
   1452         NULL, NULL);
   1453     object_class_property_set_description(oc, X86_MACHINE_SMM,
   1454         "Enable SMM");
   1455 
   1456     object_class_property_add(oc, X86_MACHINE_ACPI, "OnOffAuto",
   1457         x86_machine_get_acpi, x86_machine_set_acpi,
   1458         NULL, NULL);
   1459     object_class_property_set_description(oc, X86_MACHINE_ACPI,
   1460         "Enable ACPI");
   1461 
   1462     object_class_property_add(oc, X86_MACHINE_PIT, "OnOffAuto",
   1463                               x86_machine_get_pit,
   1464                               x86_machine_set_pit,
   1465                               NULL, NULL);
   1466     object_class_property_set_description(oc, X86_MACHINE_PIT,
   1467         "Enable i8254 PIT");
   1468 
   1469     object_class_property_add(oc, X86_MACHINE_PIC, "OnOffAuto",
   1470                               x86_machine_get_pic,
   1471                               x86_machine_set_pic,
   1472                               NULL, NULL);
   1473     object_class_property_set_description(oc, X86_MACHINE_PIC,
   1474         "Enable i8259 PIC");
   1475 
   1476     object_class_property_add_str(oc, X86_MACHINE_OEM_ID,
   1477                                   x86_machine_get_oem_id,
   1478                                   x86_machine_set_oem_id);
   1479     object_class_property_set_description(oc, X86_MACHINE_OEM_ID,
   1480                                           "Override the default value of field OEMID "
   1481                                           "in ACPI table header."
   1482                                           "The string may be up to 6 bytes in size");
   1483 
   1484 
   1485     object_class_property_add_str(oc, X86_MACHINE_OEM_TABLE_ID,
   1486                                   x86_machine_get_oem_table_id,
   1487                                   x86_machine_set_oem_table_id);
   1488     object_class_property_set_description(oc, X86_MACHINE_OEM_TABLE_ID,
   1489                                           "Override the default value of field OEM Table ID "
   1490                                           "in ACPI table header."
   1491                                           "The string may be up to 8 bytes in size");
   1492 
   1493     object_class_property_add(oc, X86_MACHINE_BUS_LOCK_RATELIMIT, "uint64_t",
   1494                                 x86_machine_get_bus_lock_ratelimit,
   1495                                 x86_machine_set_bus_lock_ratelimit, NULL, NULL);
   1496     object_class_property_set_description(oc, X86_MACHINE_BUS_LOCK_RATELIMIT,
   1497             "Set the ratelimit for the bus locks acquired in VMs");
   1498 
   1499     object_class_property_add(oc, "sgx-epc", "SgxEPC",
   1500         machine_get_sgx_epc, machine_set_sgx_epc,
   1501         NULL, NULL);
   1502     object_class_property_set_description(oc, "sgx-epc",
   1503         "SGX EPC device");
   1504 }
   1505 
   1506 static const TypeInfo x86_machine_info = {
   1507     .name = TYPE_X86_MACHINE,
   1508     .parent = TYPE_MACHINE,
   1509     .abstract = true,
   1510     .instance_size = sizeof(X86MachineState),
   1511     .instance_init = x86_machine_initfn,
   1512     .class_size = sizeof(X86MachineClass),
   1513     .class_init = x86_machine_class_init,
   1514     .interfaces = (InterfaceInfo[]) {
   1515          { TYPE_NMI },
   1516          { }
   1517     },
   1518 };
   1519 
/* Register the abstract x86 machine type described by x86_machine_info. */
static void x86_machine_register_types(void)
{
    type_register_static(&x86_machine_info);
}

/* Hand the registration function to QEMU's type_init() hook. */
type_init(x86_machine_register_types)