qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

spapr_numa.c (24181B)


      1 /*
      2  * QEMU PowerPC pSeries Logical Partition NUMA associativity handling
      3  *
      4  * Copyright IBM Corp. 2020
      5  *
      6  * Authors:
      7  *  Daniel Henrique Barboza      <danielhb413@gmail.com>
      8  *
      9  * This work is licensed under the terms of the GNU GPL, version 2 or later.
     10  * See the COPYING file in the top-level directory.
     11  */
     12 
     13 #include "qemu/osdep.h"
     14 #include "hw/ppc/spapr_numa.h"
     15 #include "hw/pci-host/spapr.h"
     16 #include "hw/ppc/fdt.h"
     17 
/*
 * Associativity domain value (already converted with cpu_to_be32()) that
 * is written into the NVLink GPU associativity arrays when the machine
 * class sets pre_5_1_assoc_refpoints.
 *
 * Moved from hw/ppc/spapr_pci_nvlink2.c
 */
#define SPAPR_GPU_NUMA_ID           (cpu_to_be32(1))
     20 
     21 /*
     22  * Retrieves max_dist_ref_points of the current NUMA affinity.
     23  */
     24 static int get_max_dist_ref_points(SpaprMachineState *spapr)
     25 {
     26     if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) {
     27         return FORM2_DIST_REF_POINTS;
     28     }
     29 
     30     return FORM1_DIST_REF_POINTS;
     31 }
     32 
     33 /*
     34  * Retrieves numa_assoc_size of the current NUMA affinity.
     35  */
     36 static int get_numa_assoc_size(SpaprMachineState *spapr)
     37 {
     38     if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) {
     39         return FORM2_NUMA_ASSOC_SIZE;
     40     }
     41 
     42     return FORM1_NUMA_ASSOC_SIZE;
     43 }
     44 
     45 /*
     46  * Retrieves vcpu_assoc_size of the current NUMA affinity.
     47  *
     48  * vcpu_assoc_size is the size of ibm,associativity array
     49  * for CPUs, which has an extra element (vcpu_id) in the end.
     50  */
     51 static int get_vcpu_assoc_size(SpaprMachineState *spapr)
     52 {
     53     return get_numa_assoc_size(spapr) + 1;
     54 }
     55 
     56 /*
     57  * Retrieves the ibm,associativity array of NUMA node 'node_id'
     58  * for the current NUMA affinity.
     59  */
     60 static const uint32_t *get_associativity(SpaprMachineState *spapr, int node_id)
     61 {
     62     if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) {
     63         return spapr->FORM2_assoc_array[node_id];
     64     }
     65     return spapr->FORM1_assoc_array[node_id];
     66 }
     67 
     68 /*
     69  * Wrapper that returns node distance from ms->numa_state->nodes
     70  * after handling edge cases where the distance might be absent.
     71  */
     72 static int get_numa_distance(MachineState *ms, int src, int dst)
     73 {
     74     NodeInfo *numa_info = ms->numa_state->nodes;
     75     int ret = numa_info[src].distance[dst];
     76 
     77     if (ret != 0) {
     78         return ret;
     79     }
     80 
     81     /*
     82      * In case QEMU adds a default NUMA single node when the user
     83      * did not add any, or where the user did not supply distances,
     84      * the distance will be absent (zero). Return local/remote
     85      * distance in this case.
     86      */
     87     if (src == dst) {
     88         return NUMA_DISTANCE_MIN;
     89     }
     90 
     91     return NUMA_DISTANCE_DEFAULT;
     92 }
     93 
     94 static bool spapr_numa_is_symmetrical(MachineState *ms)
     95 {
     96     int nb_numa_nodes = ms->numa_state->num_nodes;
     97     int src, dst;
     98 
     99     for (src = 0; src < nb_numa_nodes; src++) {
    100         for (dst = src; dst < nb_numa_nodes; dst++) {
    101             if (get_numa_distance(ms, src, dst) !=
    102                 get_numa_distance(ms, dst, src)) {
    103                 return false;
    104             }
    105         }
    106     }
    107 
    108     return true;
    109 }
    110 
    111 /*
    112  * NVLink2-connected GPU RAM needs to be placed on a separate NUMA node.
    113  * We assign a new numa ID per GPU in spapr_pci_collect_nvgpu() which is
    114  * called from vPHB reset handler so we initialize the counter here.
    115  * If no NUMA is configured from the QEMU side, we start from 1 as GPU RAM
    116  * must be equally distant from any other node.
    117  * The final value of spapr->gpu_numa_id is going to be written to
    118  * max-associativity-domains in spapr_build_fdt().
    119  */
    120 unsigned int spapr_numa_initial_nvgpu_numa_id(MachineState *machine)
    121 {
    122     return MAX(1, machine->numa_state->num_nodes);
    123 }
    124 
    125 /*
    126  * This function will translate the user distances into
    127  * what the kernel understand as possible values: 10
    128  * (local distance), 20, 40, 80 and 160, and return the equivalent
    129  * NUMA level for each. Current heuristic is:
    130  *  - local distance (10) returns numa_level = 0x4, meaning there is
    131  *    no rounding for local distance
    132  *  - distances between 11 and 30 inclusive -> rounded to 20,
    133  *    numa_level = 0x3
    134  *  - distances between 31 and 60 inclusive -> rounded to 40,
    135  *    numa_level = 0x2
    136  *  - distances between 61 and 120 inclusive -> rounded to 80,
    137  *    numa_level = 0x1
    138  *  - everything above 120 returns numa_level = 0 to indicate that
    139  *    there is no match. This will be calculated as disntace = 160
    140  *    by the kernel (as of v5.9)
    141  */
    142 static uint8_t spapr_numa_get_numa_level(uint8_t distance)
    143 {
    144     if (distance == 10) {
    145         return 0x4;
    146     } else if (distance > 11 && distance <= 30) {
    147         return 0x3;
    148     } else if (distance > 31 && distance <= 60) {
    149         return 0x2;
    150     } else if (distance > 61 && distance <= 120) {
    151         return 0x1;
    152     }
    153 
    154     return 0;
    155 }
    156 
    157 static void spapr_numa_define_FORM1_domains(SpaprMachineState *spapr)
    158 {
    159     MachineState *ms = MACHINE(spapr);
    160     int nb_numa_nodes = ms->numa_state->num_nodes;
    161     int src, dst, i, j;
    162 
    163     /*
    164      * Fill all associativity domains of non-zero NUMA nodes with
    165      * node_id. This is required because the default value (0) is
    166      * considered a match with associativity domains of node 0.
    167      */
    168     for (i = 1; i < nb_numa_nodes; i++) {
    169         for (j = 1; j < FORM1_DIST_REF_POINTS; j++) {
    170             spapr->FORM1_assoc_array[i][j] = cpu_to_be32(i);
    171         }
    172     }
    173 
    174     for (src = 0; src < nb_numa_nodes; src++) {
    175         for (dst = src; dst < nb_numa_nodes; dst++) {
    176             /*
    177              * This is how the associativity domain between A and B
    178              * is calculated:
    179              *
    180              * - get the distance D between them
    181              * - get the correspondent NUMA level 'n_level' for D
    182              * - all associativity arrays were initialized with their own
    183              * numa_ids, and we're calculating the distance in node_id
    184              * ascending order, starting from node id 0 (the first node
    185              * retrieved by numa_state). This will have a cascade effect in
    186              * the algorithm because the associativity domains that node 0
    187              * defines will be carried over to other nodes, and node 1
    188              * associativities will be carried over after taking node 0
    189              * associativities into account, and so on. This happens because
    190              * we'll assign assoc_src as the associativity domain of dst
    191              * as well, for all NUMA levels beyond and including n_level.
    192              *
    193              * The PPC kernel expects the associativity domains of node 0 to
    194              * be always 0, and this algorithm will grant that by default.
    195              */
    196             uint8_t distance = get_numa_distance(ms, src, dst);
    197             uint8_t n_level = spapr_numa_get_numa_level(distance);
    198             uint32_t assoc_src;
    199 
    200             /*
    201              * n_level = 0 means that the distance is greater than our last
    202              * rounded value (120). In this case there is no NUMA level match
    203              * between src and dst and we can skip the remaining of the loop.
    204              *
    205              * The Linux kernel will assume that the distance between src and
    206              * dst, in this case of no match, is 10 (local distance) doubled
    207              * for each NUMA it didn't match. We have FORM1_DIST_REF_POINTS
    208              * levels (4), so this gives us 10*2*2*2*2 = 160.
    209              *
    210              * This logic can be seen in the Linux kernel source code, as of
    211              * v5.9, in arch/powerpc/mm/numa.c, function __node_distance().
    212              */
    213             if (n_level == 0) {
    214                 continue;
    215             }
    216 
    217             /*
    218              * We must assign all assoc_src to dst, starting from n_level
    219              * and going up to 0x1.
    220              */
    221             for (i = n_level; i > 0; i--) {
    222                 assoc_src = spapr->FORM1_assoc_array[src][i];
    223                 spapr->FORM1_assoc_array[dst][i] = assoc_src;
    224             }
    225         }
    226     }
    227 
    228 }
    229 
    230 static void spapr_numa_FORM1_affinity_check(MachineState *machine)
    231 {
    232     int i;
    233 
    234     /*
    235      * Check we don't have a memory-less/cpu-less NUMA node
    236      * Firmware relies on the existing memory/cpu topology to provide the
    237      * NUMA topology to the kernel.
    238      * And the linux kernel needs to know the NUMA topology at start
    239      * to be able to hotplug CPUs later.
    240      */
    241     if (machine->numa_state->num_nodes) {
    242         for (i = 0; i < machine->numa_state->num_nodes; ++i) {
    243             /* check for memory-less node */
    244             if (machine->numa_state->nodes[i].node_mem == 0) {
    245                 CPUState *cs;
    246                 int found = 0;
    247                 /* check for cpu-less node */
    248                 CPU_FOREACH(cs) {
    249                     PowerPCCPU *cpu = POWERPC_CPU(cs);
    250                     if (cpu->node_id == i) {
    251                         found = 1;
    252                         break;
    253                     }
    254                 }
    255                 /* memory-less and cpu-less node */
    256                 if (!found) {
    257                     error_report(
    258 "Memory-less/cpu-less nodes are not supported with FORM1 NUMA (node %d)", i);
    259                     exit(EXIT_FAILURE);
    260                 }
    261             }
    262         }
    263     }
    264 
    265     if (!spapr_numa_is_symmetrical(machine)) {
    266         error_report(
    267 "Asymmetrical NUMA topologies aren't supported in the pSeries machine using FORM1 NUMA");
    268         exit(EXIT_FAILURE);
    269     }
    270 }
    271 
    272 /*
    273  * Set NUMA machine state data based on FORM1 affinity semantics.
    274  */
    275 static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr,
    276                                            MachineState *machine)
    277 {
    278     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
    279     int nb_numa_nodes = machine->numa_state->num_nodes;
    280     int i, j, max_nodes_with_gpus;
    281 
    282     /*
    283      * For all associativity arrays: first position is the size,
    284      * position FORM1_DIST_REF_POINTS is always the numa_id,
    285      * represented by the index 'i'.
    286      *
    287      * This will break on sparse NUMA setups, when/if QEMU starts
    288      * to support it, because there will be no more guarantee that
    289      * 'i' will be a valid node_id set by the user.
    290      */
    291     for (i = 0; i < nb_numa_nodes; i++) {
    292         spapr->FORM1_assoc_array[i][0] = cpu_to_be32(FORM1_DIST_REF_POINTS);
    293         spapr->FORM1_assoc_array[i][FORM1_DIST_REF_POINTS] = cpu_to_be32(i);
    294     }
    295 
    296     /*
    297      * Initialize NVLink GPU associativity arrays. We know that
    298      * the first GPU will take the first available NUMA id, and
    299      * we'll have a maximum of NVGPU_MAX_NUM GPUs in the machine.
    300      * At this point we're not sure if there are GPUs or not, but
    301      * let's initialize the associativity arrays and allow NVLink
    302      * GPUs to be handled like regular NUMA nodes later on.
    303      */
    304     max_nodes_with_gpus = nb_numa_nodes + NVGPU_MAX_NUM;
    305 
    306     for (i = nb_numa_nodes; i < max_nodes_with_gpus; i++) {
    307         spapr->FORM1_assoc_array[i][0] = cpu_to_be32(FORM1_DIST_REF_POINTS);
    308 
    309         for (j = 1; j < FORM1_DIST_REF_POINTS; j++) {
    310             uint32_t gpu_assoc = smc->pre_5_1_assoc_refpoints ?
    311                                  SPAPR_GPU_NUMA_ID : cpu_to_be32(i);
    312             spapr->FORM1_assoc_array[i][j] = gpu_assoc;
    313         }
    314 
    315         spapr->FORM1_assoc_array[i][FORM1_DIST_REF_POINTS] = cpu_to_be32(i);
    316     }
    317 
    318     /*
    319      * Guests pseries-5.1 and older uses zeroed associativity domains,
    320      * i.e. no domain definition based on NUMA distance input.
    321      *
    322      * Same thing with guests that have only one NUMA node.
    323      */
    324     if (smc->pre_5_2_numa_associativity ||
    325         machine->numa_state->num_nodes <= 1) {
    326         return;
    327     }
    328 
    329     spapr_numa_define_FORM1_domains(spapr);
    330 }
    331 
    332 /*
    333  * Init NUMA FORM2 machine state data
    334  */
    335 static void spapr_numa_FORM2_affinity_init(SpaprMachineState *spapr)
    336 {
    337     int i;
    338 
    339     /*
    340      * For all resources but CPUs, FORM2 associativity arrays will
    341      * be a size 2 array with the following format:
    342      *
    343      * ibm,associativity = {1, numa_id}
    344      *
    345      * CPUs will write an additional 'vcpu_id' on top of the arrays
    346      * being initialized here. 'numa_id' is represented by the
    347      * index 'i' of the loop.
    348      *
    349      * Given that this initialization is also valid for GPU associativity
    350      * arrays, handle everything in one single step by populating the
    351      * arrays up to NUMA_NODES_MAX_NUM.
    352      */
    353     for (i = 0; i < NUMA_NODES_MAX_NUM; i++) {
    354         spapr->FORM2_assoc_array[i][0] = cpu_to_be32(1);
    355         spapr->FORM2_assoc_array[i][1] = cpu_to_be32(i);
    356     }
    357 }
    358 
/*
 * Initializes the NUMA associativity data of @spapr for both affinity
 * forms: the FORM1 arrays first (which needs @machine for the NUMA
 * configuration), then the FORM2 arrays.
 */
void spapr_numa_associativity_init(SpaprMachineState *spapr,
                                   MachineState *machine)
{
    spapr_numa_FORM1_affinity_init(spapr, machine);
    spapr_numa_FORM2_affinity_init(spapr);
}
    365 
    366 void spapr_numa_associativity_check(SpaprMachineState *spapr)
    367 {
    368     /*
    369      * FORM2 does not have any restrictions we need to handle
    370      * at CAS time, for now.
    371      */
    372     if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) {
    373         return;
    374     }
    375 
    376     spapr_numa_FORM1_affinity_check(MACHINE(spapr));
    377 }
    378 
    379 void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt,
    380                                        int offset, int nodeid)
    381 {
    382     const uint32_t *associativity = get_associativity(spapr, nodeid);
    383 
    384     _FDT((fdt_setprop(fdt, offset, "ibm,associativity",
    385                       associativity,
    386                       get_numa_assoc_size(spapr) * sizeof(uint32_t))));
    387 }
    388 
    389 static uint32_t *spapr_numa_get_vcpu_assoc(SpaprMachineState *spapr,
    390                                            PowerPCCPU *cpu)
    391 {
    392     const uint32_t *associativity = get_associativity(spapr, cpu->node_id);
    393     int max_distance_ref_points = get_max_dist_ref_points(spapr);
    394     int vcpu_assoc_size = get_vcpu_assoc_size(spapr);
    395     uint32_t *vcpu_assoc = g_new(uint32_t, vcpu_assoc_size);
    396     int index = spapr_get_vcpu_id(cpu);
    397 
    398     /*
    399      * VCPUs have an extra 'cpu_id' value in ibm,associativity
    400      * compared to other resources. Increment the size at index
    401      * 0, put cpu_id last, then copy the remaining associativity
    402      * domains.
    403      */
    404     vcpu_assoc[0] = cpu_to_be32(max_distance_ref_points + 1);
    405     vcpu_assoc[vcpu_assoc_size - 1] = cpu_to_be32(index);
    406     memcpy(vcpu_assoc + 1, associativity + 1,
    407            (vcpu_assoc_size - 2) * sizeof(uint32_t));
    408 
    409     return vcpu_assoc;
    410 }
    411 
    412 int spapr_numa_fixup_cpu_dt(SpaprMachineState *spapr, void *fdt,
    413                             int offset, PowerPCCPU *cpu)
    414 {
    415     g_autofree uint32_t *vcpu_assoc = NULL;
    416     int vcpu_assoc_size = get_vcpu_assoc_size(spapr);
    417 
    418     vcpu_assoc = spapr_numa_get_vcpu_assoc(spapr, cpu);
    419 
    420     /* Advertise NUMA via ibm,associativity */
    421     return fdt_setprop(fdt, offset, "ibm,associativity", vcpu_assoc,
    422                        vcpu_assoc_size * sizeof(uint32_t));
    423 }
    424 
    425 
    426 int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt,
    427                                          int offset)
    428 {
    429     MachineState *machine = MACHINE(spapr);
    430     int max_distance_ref_points = get_max_dist_ref_points(spapr);
    431     int nb_numa_nodes = machine->numa_state->num_nodes;
    432     int nr_nodes = nb_numa_nodes ? nb_numa_nodes : 1;
    433     g_autofree uint32_t *int_buf = NULL;
    434     uint32_t *cur_index;
    435     int i;
    436 
    437     /* ibm,associativity-lookup-arrays */
    438     int_buf = g_new0(uint32_t, nr_nodes * max_distance_ref_points + 2);
    439     cur_index = int_buf;
    440     int_buf[0] = cpu_to_be32(nr_nodes);
    441      /* Number of entries per associativity list */
    442     int_buf[1] = cpu_to_be32(max_distance_ref_points);
    443     cur_index += 2;
    444     for (i = 0; i < nr_nodes; i++) {
    445         /*
    446          * For the lookup-array we use the ibm,associativity array of the
    447          * current NUMA affinity, without the first element (size).
    448          */
    449         const uint32_t *associativity = get_associativity(spapr, i);
    450         memcpy(cur_index, ++associativity,
    451                sizeof(uint32_t) * max_distance_ref_points);
    452         cur_index += max_distance_ref_points;
    453     }
    454 
    455     return fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays",
    456                        int_buf, (cur_index - int_buf) * sizeof(uint32_t));
    457 }
    458 
    459 static void spapr_numa_FORM1_write_rtas_dt(SpaprMachineState *spapr,
    460                                            void *fdt, int rtas)
    461 {
    462     MachineState *ms = MACHINE(spapr);
    463     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
    464     uint32_t number_nvgpus_nodes = spapr->gpu_numa_id -
    465                                    spapr_numa_initial_nvgpu_numa_id(ms);
    466     uint32_t refpoints[] = {
    467         cpu_to_be32(0x4),
    468         cpu_to_be32(0x3),
    469         cpu_to_be32(0x2),
    470         cpu_to_be32(0x1),
    471     };
    472     uint32_t nr_refpoints = ARRAY_SIZE(refpoints);
    473     uint32_t maxdomain = ms->numa_state->num_nodes + number_nvgpus_nodes;
    474     uint32_t maxdomains[] = {
    475         cpu_to_be32(4),
    476         cpu_to_be32(maxdomain),
    477         cpu_to_be32(maxdomain),
    478         cpu_to_be32(maxdomain),
    479         cpu_to_be32(maxdomain)
    480     };
    481 
    482     if (smc->pre_5_2_numa_associativity ||
    483         ms->numa_state->num_nodes <= 1) {
    484         uint32_t legacy_refpoints[] = {
    485             cpu_to_be32(0x4),
    486             cpu_to_be32(0x4),
    487             cpu_to_be32(0x2),
    488         };
    489         uint32_t legacy_maxdomain = spapr->gpu_numa_id > 1 ? 1 : 0;
    490         uint32_t legacy_maxdomains[] = {
    491             cpu_to_be32(4),
    492             cpu_to_be32(legacy_maxdomain),
    493             cpu_to_be32(legacy_maxdomain),
    494             cpu_to_be32(legacy_maxdomain),
    495             cpu_to_be32(spapr->gpu_numa_id),
    496         };
    497 
    498         G_STATIC_ASSERT(sizeof(legacy_refpoints) <= sizeof(refpoints));
    499         G_STATIC_ASSERT(sizeof(legacy_maxdomains) <= sizeof(maxdomains));
    500 
    501         nr_refpoints = 3;
    502 
    503         memcpy(refpoints, legacy_refpoints, sizeof(legacy_refpoints));
    504         memcpy(maxdomains, legacy_maxdomains, sizeof(legacy_maxdomains));
    505 
    506         /* pseries-5.0 and older reference-points array is {0x4, 0x4} */
    507         if (smc->pre_5_1_assoc_refpoints) {
    508             nr_refpoints = 2;
    509         }
    510     }
    511 
    512     _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points",
    513                      refpoints, nr_refpoints * sizeof(refpoints[0])));
    514 
    515     _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains",
    516                      maxdomains, sizeof(maxdomains)));
    517 }
    518 
    519 static void spapr_numa_FORM2_write_rtas_tables(SpaprMachineState *spapr,
    520                                                void *fdt, int rtas)
    521 {
    522     MachineState *ms = MACHINE(spapr);
    523     int nb_numa_nodes = ms->numa_state->num_nodes;
    524     int distance_table_entries = nb_numa_nodes * nb_numa_nodes;
    525     g_autofree uint32_t *lookup_index_table = NULL;
    526     g_autofree uint8_t *distance_table = NULL;
    527     int src, dst, i, distance_table_size;
    528 
    529     /*
    530      * ibm,numa-lookup-index-table: array with length and a
    531      * list of NUMA ids present in the guest.
    532      */
    533     lookup_index_table = g_new0(uint32_t, nb_numa_nodes + 1);
    534     lookup_index_table[0] = cpu_to_be32(nb_numa_nodes);
    535 
    536     for (i = 0; i < nb_numa_nodes; i++) {
    537         lookup_index_table[i + 1] = cpu_to_be32(i);
    538     }
    539 
    540     _FDT(fdt_setprop(fdt, rtas, "ibm,numa-lookup-index-table",
    541                      lookup_index_table,
    542                      (nb_numa_nodes + 1) * sizeof(uint32_t)));
    543 
    544     /*
    545      * ibm,numa-distance-table: contains all node distances. First
    546      * element is the size of the table as uint32, followed up
    547      * by all the uint8 distances from the first NUMA node, then all
    548      * distances from the second NUMA node and so on.
    549      *
    550      * ibm,numa-lookup-index-table is used by guest to navigate this
    551      * array because NUMA ids can be sparse (node 0 is the first,
    552      * node 8 is the second ...).
    553      */
    554     distance_table_size = distance_table_entries * sizeof(uint8_t) +
    555                           sizeof(uint32_t);
    556     distance_table = g_new0(uint8_t, distance_table_size);
    557     stl_be_p(distance_table, distance_table_entries);
    558 
    559     /* Skip the uint32_t array length at the start */
    560     i = sizeof(uint32_t);
    561 
    562     for (src = 0; src < nb_numa_nodes; src++) {
    563         for (dst = 0; dst < nb_numa_nodes; dst++) {
    564             distance_table[i++] = get_numa_distance(ms, src, dst);
    565         }
    566     }
    567 
    568     _FDT(fdt_setprop(fdt, rtas, "ibm,numa-distance-table",
    569                      distance_table, distance_table_size));
    570 }
    571 
    572 /*
    573  * This helper could be compressed in a single function with
    574  * FORM1 logic since we're setting the same DT values, with the
    575  * difference being a call to spapr_numa_FORM2_write_rtas_tables()
    576  * in the end. The separation was made to avoid clogging FORM1 code
    577  * which already has to deal with compat modes from previous
    578  * QEMU machine types.
    579  */
    580 static void spapr_numa_FORM2_write_rtas_dt(SpaprMachineState *spapr,
    581                                            void *fdt, int rtas)
    582 {
    583     MachineState *ms = MACHINE(spapr);
    584     uint32_t number_nvgpus_nodes = spapr->gpu_numa_id -
    585                                    spapr_numa_initial_nvgpu_numa_id(ms);
    586 
    587     /*
    588      * In FORM2, ibm,associativity-reference-points will point to
    589      * the element in the ibm,associativity array that contains the
    590      * primary domain index (for FORM2, the first element).
    591      *
    592      * This value (in our case, the numa-id) is then used as an index
    593      * to retrieve all other attributes of the node (distance,
    594      * bandwidth, latency) via ibm,numa-lookup-index-table and other
    595      * ibm,numa-*-table properties.
    596      */
    597     uint32_t refpoints[] = { cpu_to_be32(1) };
    598 
    599     uint32_t maxdomain = ms->numa_state->num_nodes + number_nvgpus_nodes;
    600     uint32_t maxdomains[] = { cpu_to_be32(1), cpu_to_be32(maxdomain) };
    601 
    602     _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points",
    603                      refpoints, sizeof(refpoints)));
    604 
    605     _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains",
    606                      maxdomains, sizeof(maxdomains)));
    607 
    608     spapr_numa_FORM2_write_rtas_tables(spapr, fdt, rtas);
    609 }
    610 
    611 /*
    612  * Helper that writes ibm,associativity-reference-points and
    613  * max-associativity-domains in the RTAS pointed by @rtas
    614  * in the DT @fdt.
    615  */
    616 void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas)
    617 {
    618     if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) {
    619         spapr_numa_FORM2_write_rtas_dt(spapr, fdt, rtas);
    620         return;
    621     }
    622 
    623     spapr_numa_FORM1_write_rtas_dt(spapr, fdt, rtas);
    624 }
    625 
    626 static target_ulong h_home_node_associativity(PowerPCCPU *cpu,
    627                                               SpaprMachineState *spapr,
    628                                               target_ulong opcode,
    629                                               target_ulong *args)
    630 {
    631     g_autofree uint32_t *vcpu_assoc = NULL;
    632     target_ulong flags = args[0];
    633     target_ulong procno = args[1];
    634     PowerPCCPU *tcpu;
    635     int idx, assoc_idx;
    636     int vcpu_assoc_size = get_vcpu_assoc_size(spapr);
    637 
    638     /* only support procno from H_REGISTER_VPA */
    639     if (flags != 0x1) {
    640         return H_FUNCTION;
    641     }
    642 
    643     tcpu = spapr_find_cpu(procno);
    644     if (tcpu == NULL) {
    645         return H_P2;
    646     }
    647 
    648     /*
    649      * Given that we want to be flexible with the sizes and indexes,
    650      * we must consider that there is a hard limit of how many
    651      * associativities domain we can fit in R4 up to R9, which would be
    652      * 12 associativity domains for vcpus. Assert and bail if that's
    653      * not the case.
    654      */
    655     g_assert((vcpu_assoc_size - 1) <= 12);
    656 
    657     vcpu_assoc = spapr_numa_get_vcpu_assoc(spapr, tcpu);
    658     /* assoc_idx starts at 1 to skip associativity size */
    659     assoc_idx = 1;
    660 
    661 #define ASSOCIATIVITY(a, b) (((uint64_t)(a) << 32) | \
    662                              ((uint64_t)(b) & 0xffffffff))
    663 
    664     for (idx = 0; idx < 6; idx++) {
    665         int32_t a, b;
    666 
    667         /*
    668          * vcpu_assoc[] will contain the associativity domains for tcpu,
    669          * including tcpu->node_id and procno, meaning that we don't
    670          * need to use these variables here.
    671          *
    672          * We'll read 2 values at a time to fill up the ASSOCIATIVITY()
    673          * macro. The ternary will fill the remaining registers with -1
    674          * after we went through vcpu_assoc[].
    675          */
    676         a = assoc_idx < vcpu_assoc_size ?
    677             be32_to_cpu(vcpu_assoc[assoc_idx++]) : -1;
    678         b = assoc_idx < vcpu_assoc_size ?
    679             be32_to_cpu(vcpu_assoc[assoc_idx++]) : -1;
    680 
    681         args[idx] = ASSOCIATIVITY(a, b);
    682     }
    683 #undef ASSOCIATIVITY
    684 
    685     return H_SUCCESS;
    686 }
    687 
/*
 * Registers h_home_node_associativity() as the handler of the
 * H_HOME_NODE_ASSOCIATIVITY hypercall. Hooked up through type_init()
 * below.
 */
static void spapr_numa_register_types(void)
{
    /* Virtual Processor Home Node */
    spapr_register_hypercall(H_HOME_NODE_ASSOCIATIVITY,
                             h_home_node_associativity);
}

type_init(spapr_numa_register_types)