spapr_pci_nvlink2.c (15550B)
/*
 * QEMU sPAPR PCI for NVLink2 pass through
 *
 * Copyright (c) 2019 Alexey Kardashevskiy, IBM Corporation.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/pci/pci.h"
#include "hw/pci-host/spapr.h"
#include "hw/ppc/spapr_numa.h"
#include "qemu/error-report.h"
#include "hw/ppc/fdt.h"
#include "hw/pci/pci_bridge.h"

#define PHANDLE_PCIDEV(phb, pdev)   (0x12000000 | \
                                     (((phb)->index) << 16) | ((pdev)->devfn))
#define PHANDLE_GPURAM(phb, n)      (0x110000FF | ((n) << 8) | \
                                     (((phb)->index) << 16))
#define PHANDLE_NVLINK(phb, gn, nn) (0x00130000 | (((phb)->index) << 8) | \
                                     ((gn) << 4) | (nn))

typedef struct SpaprPhbPciNvGpuSlot {
    uint64_t tgt;
    uint64_t gpa;
    unsigned numa_id;
    PCIDevice *gpdev;
    int linknum;
    struct {
        uint64_t atsd_gpa;
        PCIDevice *npdev;
        uint32_t link_speed;
    } links[NVGPU_MAX_LINKS];
} SpaprPhbPciNvGpuSlot;

struct SpaprPhbPciNvGpuConfig {
    uint64_t nv2_ram_current;
    uint64_t nv2_atsd_current;
    int num; /* number of non empty (i.e. tgt!=0) entries in slots[] */
    SpaprPhbPciNvGpuSlot slots[NVGPU_MAX_NUM];
    Error *err;
};
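/*
 * Return the slot tracking the GPU/NPU group identified by @tgt (the value
 * of the device's "nvlink2-tgt" property), allocating a new slot if this
 * target has not been seen yet.  Returns NULL when slots[] is already full.
 */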
static SpaprPhbPciNvGpuSlot *
spapr_nvgpu_get_slot(SpaprPhbPciNvGpuConfig *nvgpus, uint64_t tgt)
{
    int i;

    /* Search for partially collected "slot" */
    for (i = 0; i < nvgpus->num; ++i) {
        if (nvgpus->slots[i].tgt == tgt) {
            return &nvgpus->slots[i];
        }
    }

    if (nvgpus->num == ARRAY_SIZE(nvgpus->slots)) {
        return NULL;
    }

    i = nvgpus->num;
    nvgpus->slots[i].tgt = tgt;
    ++nvgpus->num;

    return &nvgpus->slots[i];
}

static void spapr_pci_collect_nvgpu(SpaprPhbPciNvGpuConfig *nvgpus,
                                    PCIDevice *pdev, uint64_t tgt,
                                    MemoryRegion *mr, Error **errp)
{
    MachineState *machine = MACHINE(qdev_get_machine());
    SpaprMachineState *spapr = SPAPR_MACHINE(machine);
    SpaprPhbPciNvGpuSlot *nvslot = spapr_nvgpu_get_slot(nvgpus, tgt);

    if (!nvslot) {
        error_setg(errp, "Found too many GPUs per vPHB");
        return;
    }
    g_assert(!nvslot->gpdev);
    nvslot->gpdev = pdev;

    nvslot->gpa = nvgpus->nv2_ram_current;
    nvgpus->nv2_ram_current += memory_region_size(mr);
    nvslot->numa_id = spapr->gpu_numa_id;
    ++spapr->gpu_numa_id;
}

static void spapr_pci_collect_nvnpu(SpaprPhbPciNvGpuConfig *nvgpus,
                                    PCIDevice *pdev, uint64_t tgt,
                                    MemoryRegion *mr, Error **errp)
{
    SpaprPhbPciNvGpuSlot *nvslot = spapr_nvgpu_get_slot(nvgpus, tgt);
    int j;

    if (!nvslot) {
        error_setg(errp, "Found too many NVLink bridges per vPHB");
        return;
    }

    j = nvslot->linknum;
    if (j == ARRAY_SIZE(nvslot->links)) {
        error_setg(errp, "Found too many NVLink bridges per GPU");
        return;
    }
    ++nvslot->linknum;

    g_assert(!nvslot->links[j].npdev);
    nvslot->links[j].npdev = pdev;
    nvslot->links[j].atsd_gpa = nvgpus->nv2_atsd_current;
    nvgpus->nv2_atsd_current += memory_region_size(mr);
    nvslot->links[j].link_speed =
        object_property_get_uint(OBJECT(pdev), "nvlink2-link-speed", NULL);
}

static void spapr_phb_pci_collect_nvgpu(PCIBus *bus, PCIDevice *pdev,
                                        void *opaque)
{
    PCIBus *sec_bus;
    Object *po = OBJECT(pdev);
    uint64_t tgt = object_property_get_uint(po, "nvlink2-tgt", NULL);

    if (tgt) {
        Error *local_err = NULL;
        SpaprPhbPciNvGpuConfig *nvgpus = opaque;
        Object *mr_gpu = object_property_get_link(po, "nvlink2-mr[0]", NULL);
        Object *mr_npu = object_property_get_link(po, "nvlink2-atsd-mr[0]",
                                                  NULL);

        g_assert(mr_gpu || mr_npu);
        if (mr_gpu) {
            spapr_pci_collect_nvgpu(nvgpus, pdev, tgt, MEMORY_REGION(mr_gpu),
                                    &local_err);
        } else {
            spapr_pci_collect_nvnpu(nvgpus, pdev, tgt, MEMORY_REGION(mr_npu),
                                    &local_err);
        }
        error_propagate(&nvgpus->err, local_err);
    }
    if ((pci_default_read_config(pdev, PCI_HEADER_TYPE, 1) !=
         PCI_HEADER_TYPE_BRIDGE)) {
        return;
    }

    sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev));
    if (!sec_bus) {
        return;
    }

    pci_for_each_device_under_bus(sec_bus, spapr_phb_pci_collect_nvgpu, opaque);
}
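/*
 * Walk all PCI devices under the vPHB, collect NVLink2 GPUs and NPU bridges
 * into sphb->nvgpus, and map the GPU RAM and ATSD memory regions into the
 * guest address space at the windows reserved for them.  On error, or if no
 * GPU with a RAM region was found, the config is freed again.
 */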
void spapr_phb_nvgpu_setup(SpaprPhbState *sphb, Error **errp)
{
    int i, j, valid_gpu_num;
    PCIBus *bus;

    /* Search for GPUs and NPUs */
    if (!sphb->nv2_gpa_win_addr || !sphb->nv2_atsd_win_addr) {
        return;
    }

    sphb->nvgpus = g_new0(SpaprPhbPciNvGpuConfig, 1);
    sphb->nvgpus->nv2_ram_current = sphb->nv2_gpa_win_addr;
    sphb->nvgpus->nv2_atsd_current = sphb->nv2_atsd_win_addr;

    bus = PCI_HOST_BRIDGE(sphb)->bus;
    pci_for_each_device_under_bus(bus, spapr_phb_pci_collect_nvgpu,
                                  sphb->nvgpus);

    if (sphb->nvgpus->err) {
        error_propagate(errp, sphb->nvgpus->err);
        sphb->nvgpus->err = NULL;
        goto cleanup_exit;
    }

    /* Add found GPU RAM and ATSD MRs if found */
    for (i = 0, valid_gpu_num = 0; i < sphb->nvgpus->num; ++i) {
        Object *nvmrobj;
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];

        if (!nvslot->gpdev) {
            continue;
        }
        nvmrobj = object_property_get_link(OBJECT(nvslot->gpdev),
                                           "nvlink2-mr[0]", NULL);
        /* ATSD is pointless without GPU RAM MR so skip those */
        if (!nvmrobj) {
            continue;
        }

        ++valid_gpu_num;
        memory_region_add_subregion(get_system_memory(), nvslot->gpa,
                                    MEMORY_REGION(nvmrobj));

        for (j = 0; j < nvslot->linknum; ++j) {
            Object *atsdmrobj;

            atsdmrobj = object_property_get_link(OBJECT(nvslot->links[j].npdev),
                                                 "nvlink2-atsd-mr[0]", NULL);
            if (!atsdmrobj) {
                continue;
            }
            memory_region_add_subregion(get_system_memory(),
                                        nvslot->links[j].atsd_gpa,
                                        MEMORY_REGION(atsdmrobj));
        }
    }

    if (valid_gpu_num) {
        return;
    }
    /* We did not find any interesting GPU */
cleanup_exit:
    g_free(sphb->nvgpus);
    sphb->nvgpus = NULL;
}

void spapr_phb_nvgpu_free(SpaprPhbState *sphb)
{
    int i, j;

    if (!sphb->nvgpus) {
        return;
    }

    for (i = 0; i < sphb->nvgpus->num; ++i) {
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];
        Object *nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev),
                                                    "nvlink2-mr[0]", NULL);

        if (nv_mrobj) {
            memory_region_del_subregion(get_system_memory(),
                                        MEMORY_REGION(nv_mrobj));
        }
        for (j = 0; j < nvslot->linknum; ++j) {
            PCIDevice *npdev = nvslot->links[j].npdev;
            Object *atsd_mrobj;
            atsd_mrobj = object_property_get_link(OBJECT(npdev),
                                                  "nvlink2-atsd-mr[0]", NULL);
            if (atsd_mrobj) {
                memory_region_del_subregion(get_system_memory(),
                                            MEMORY_REGION(atsd_mrobj));
            }
        }
    }
    g_free(sphb->nvgpus);
    sphb->nvgpus = NULL;
}
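/*
 * Add the "ibm,mmio-atsd" property (the guest physical addresses of the
 * collected ATSD registers) to the vPHB node in the device tree.
 */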
void spapr_phb_nvgpu_populate_dt(SpaprPhbState *sphb, void *fdt, int bus_off,
                                 Error **errp)
{
    int i, j, atsdnum = 0;
    uint64_t atsd[8]; /* The existing limitation of known guests */

    if (!sphb->nvgpus) {
        return;
    }

    for (i = 0; (i < sphb->nvgpus->num) && (atsdnum < ARRAY_SIZE(atsd)); ++i) {
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];

        if (!nvslot->gpdev) {
            continue;
        }
        for (j = 0; j < nvslot->linknum; ++j) {
            if (!nvslot->links[j].atsd_gpa) {
                continue;
            }

            if (atsdnum == ARRAY_SIZE(atsd)) {
                error_report("Only %"PRIuPTR" ATSD registers supported",
                             ARRAY_SIZE(atsd));
                break;
            }
            atsd[atsdnum] = cpu_to_be64(nvslot->links[j].atsd_gpa);
            ++atsdnum;
        }
    }

    if (!atsdnum) {
        error_setg(errp, "No ATSD registers found");
        return;
    }

    if (!spapr_phb_eeh_available(sphb)) {
        /*
         * ibm,mmio-atsd contains ATSD registers; these belong to an NPU PHB
         * which we do not emulate as a separate device. Instead we put
         * ibm,mmio-atsd to the vPHB with GPU and make sure that we do not
         * put GPUs from different IOMMU groups to the same vPHB to ensure
         * that the guest will use ATSDs from the corresponding NPU.
         */
        error_setg(errp, "ATSD requires separate vPHB per GPU IOMMU group");
        return;
    }

    _FDT((fdt_setprop(fdt, bus_off, "ibm,mmio-atsd", atsd,
                      atsdnum * sizeof(atsd[0]))));
}

void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
{
    int i, j, linkidx, npuoff;
    g_autofree char *npuname = NULL;

    if (!sphb->nvgpus) {
        return;
    }

    npuname = g_strdup_printf("npuphb%d", sphb->index);
    npuoff = fdt_add_subnode(fdt, 0, npuname);
    _FDT(npuoff);
    _FDT(fdt_setprop_cell(fdt, npuoff, "#address-cells", 1));
    _FDT(fdt_setprop_cell(fdt, npuoff, "#size-cells", 0));
    /* Advertise NPU as POWER9 so the guest can enable NPU2 contexts */
    _FDT((fdt_setprop_string(fdt, npuoff, "compatible", "ibm,power9-npu")));

    for (i = 0, linkidx = 0; i < sphb->nvgpus->num; ++i) {
        for (j = 0; j < sphb->nvgpus->slots[i].linknum; ++j) {
            g_autofree char *linkname = g_strdup_printf("link@%d", linkidx);
            int off = fdt_add_subnode(fdt, npuoff, linkname);

            _FDT(off);
            /* _FDT((fdt_setprop_cell(fdt, off, "reg", linkidx))); */
            _FDT((fdt_setprop_string(fdt, off, "compatible",
                                     "ibm,npu-link")));
            _FDT((fdt_setprop_cell(fdt, off, "phandle",
                                   PHANDLE_NVLINK(sphb, i, j))));
            _FDT((fdt_setprop_cell(fdt, off, "ibm,npu-link-index", linkidx)));
            ++linkidx;
        }
    }

    /* Add memory nodes for GPU RAM and mark them unusable */
    for (i = 0; i < sphb->nvgpus->num; ++i) {
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];
        Object *nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev),
                                                    "nvlink2-mr[0]",
                                                    &error_abort);
        uint64_t size = object_property_get_uint(nv_mrobj, "size", NULL);
        uint64_t mem_reg[2] = { cpu_to_be64(nvslot->gpa), cpu_to_be64(size) };
        g_autofree char *mem_name = g_strdup_printf("memory@%"PRIx64,
                                                    nvslot->gpa);
        int off = fdt_add_subnode(fdt, 0, mem_name);

        _FDT(off);
        _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
        _FDT((fdt_setprop(fdt, off, "reg", mem_reg, sizeof(mem_reg))));

        spapr_numa_write_associativity_dt(SPAPR_MACHINE(qdev_get_machine()),
                                          fdt, off, nvslot->numa_id);

        _FDT((fdt_setprop_string(fdt, off, "compatible",
                                 "ibm,coherent-device-memory")));

        mem_reg[1] = cpu_to_be64(0);
        _FDT((fdt_setprop(fdt, off, "linux,usable-memory", mem_reg,
                          sizeof(mem_reg))));
        _FDT((fdt_setprop_cell(fdt, off, "phandle",
                               PHANDLE_GPURAM(sphb, i))));
    }
}
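/*
 * Add NVLink2-specific properties to the device tree node of a GPU or NPU
 * bridge: cross-reference phandles ("ibm,npu", "ibm,gpu", "ibm,nvlink"),
 * the GPU RAM memory node ("memory-region"), the target address and the
 * link speed.
 */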
void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, int offset,
                                        SpaprPhbState *sphb)
{
    int i, j;

    if (!sphb->nvgpus) {
        return;
    }

    for (i = 0; i < sphb->nvgpus->num; ++i) {
        SpaprPhbPciNvGpuSlot *nvslot = &sphb->nvgpus->slots[i];

        /* Skip "slot" without attached GPU */
        if (!nvslot->gpdev) {
            continue;
        }
        if (dev == nvslot->gpdev) {
            g_autofree uint32_t *npus = g_new(uint32_t, nvslot->linknum);

            for (j = 0; j < nvslot->linknum; ++j) {
                PCIDevice *npdev = nvslot->links[j].npdev;

                npus[j] = cpu_to_be32(PHANDLE_PCIDEV(sphb, npdev));
            }
            _FDT(fdt_setprop(fdt, offset, "ibm,npu", npus,
                             j * sizeof(npus[0])));
            _FDT((fdt_setprop_cell(fdt, offset, "phandle",
                                   PHANDLE_PCIDEV(sphb, dev))));
            continue;
        }

        for (j = 0; j < nvslot->linknum; ++j) {
            if (dev != nvslot->links[j].npdev) {
                continue;
            }

            _FDT((fdt_setprop_cell(fdt, offset, "phandle",
                                   PHANDLE_PCIDEV(sphb, dev))));
            _FDT(fdt_setprop_cell(fdt, offset, "ibm,gpu",
                                  PHANDLE_PCIDEV(sphb, nvslot->gpdev)));
            _FDT((fdt_setprop_cell(fdt, offset, "ibm,nvlink",
                                   PHANDLE_NVLINK(sphb, i, j))));
            /*
             * If we ever want to emulate GPU RAM at the same location as on
             * the host - here is the encoding GPA->TGT:
             *
             * gta  = ((sphb->nv2_gpa >> 42) & 0x1) << 42;
             * gta |= ((sphb->nv2_gpa >> 45) & 0x3) << 43;
             * gta |= ((sphb->nv2_gpa >> 49) & 0x3) << 45;
             * gta |= sphb->nv2_gpa & ((1UL << 43) - 1);
             */
            _FDT(fdt_setprop_cell(fdt, offset, "memory-region",
                                  PHANDLE_GPURAM(sphb, i)));
            _FDT(fdt_setprop_u64(fdt, offset, "ibm,device-tgt-addr",
                                 nvslot->tgt));
            _FDT(fdt_setprop_cell(fdt, offset, "ibm,nvlink-speed",
                                  nvslot->links[j].link_speed));
        }
    }
}