qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

spapr_nvdimm.c (30182B)


      1 /*
      2  * QEMU PAPR Storage Class Memory Interfaces
      3  *
      4  * Copyright (c) 2019-2020, IBM Corporation.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a copy
      7  * of this software and associated documentation files (the "Software"), to deal
      8  * in the Software without restriction, including without limitation the rights
      9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     10  * copies of the Software, and to permit persons to whom the Software is
     11  * furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice shall be included in
     14  * all copies or substantial portions of the Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     22  * THE SOFTWARE.
     23  */
     24 #include "qemu/osdep.h"
     25 #include "qemu/cutils.h"
     26 #include "qapi/error.h"
     27 #include "hw/ppc/spapr_drc.h"
     28 #include "hw/ppc/spapr_nvdimm.h"
     29 #include "hw/mem/nvdimm.h"
     30 #include "qemu/nvdimm-utils.h"
     31 #include "hw/ppc/fdt.h"
     32 #include "qemu/range.h"
     33 #include "hw/ppc/spapr_numa.h"
     34 #include "block/thread-pool.h"
     35 #include "migration/vmstate.h"
     36 #include "qemu/pmem.h"
     37 #include "hw/qdev-properties.h"
     38 
/* DIMM health bitmap indicators. Taken from the kernel's papr_scm.c */
/* SCM device is unable to persist memory contents */
#define PAPR_PMEM_UNARMED PPC_BIT(0)

/*
 * The nvdimm size should be aligned to SCM block size.
 * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE
 * in order to have SCM regions not to overlap with dimm memory regions.
 * The SCM devices can have variable block sizes. For now, fixing the
 * block size to the minimum value.
 */
#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE

/* Have an explicit check for alignment */
QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE);

#define TYPE_SPAPR_NVDIMM "spapr-nvdimm"
OBJECT_DECLARE_TYPE(SpaprNVDIMMDevice, SPAPRNVDIMMClass, SPAPR_NVDIMM)

/* Class for the spapr-nvdimm device: specializes the generic NVDIMM */
struct SPAPRNVDIMMClass {
    /* private */
    NVDIMMClass parent_class;

    /* public */

    /* realize/unrealize hooks for the spapr-specific device lifecycle */
    void (*realize)(NVDIMMDevice *dimm, Error **errp);
    void (*unrealize)(NVDIMMDevice *dimm, Error **errp);
};
     66 
     67 bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
     68                            uint64_t size, Error **errp)
     69 {
     70     const MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev);
     71     const MachineState *ms = MACHINE(hotplug_dev);
     72     PCDIMMDevice *dimm = PC_DIMM(nvdimm);
     73     MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
     74     g_autofree char *uuidstr = NULL;
     75     QemuUUID uuid;
     76     int ret;
     77 
     78     if (!mc->nvdimm_supported) {
     79         error_setg(errp, "NVDIMM hotplug not supported for this machine");
     80         return false;
     81     }
     82 
     83     if (!ms->nvdimms_state->is_enabled) {
     84         error_setg(errp, "nvdimm device found but 'nvdimm=off' was set");
     85         return false;
     86     }
     87 
     88     if (object_property_get_int(OBJECT(nvdimm), NVDIMM_LABEL_SIZE_PROP,
     89                                 &error_abort) == 0) {
     90         error_setg(errp, "PAPR requires NVDIMM devices to have label-size set");
     91         return false;
     92     }
     93 
     94     if (size % SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
     95         error_setg(errp, "PAPR requires NVDIMM memory size (excluding label)"
     96                    " to be a multiple of %" PRIu64 "MB",
     97                    SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
     98         return false;
     99     }
    100 
    101     uuidstr = object_property_get_str(OBJECT(nvdimm), NVDIMM_UUID_PROP,
    102                                       &error_abort);
    103     ret = qemu_uuid_parse(uuidstr, &uuid);
    104     g_assert(!ret);
    105 
    106     if (qemu_uuid_is_null(&uuid)) {
    107         error_setg(errp, "NVDIMM device requires the uuid to be set");
    108         return false;
    109     }
    110 
    111     if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM) &&
    112         (memory_region_get_fd(mr) < 0)) {
    113         error_setg(errp, "spapr-nvdimm device requires the "
    114                    "memdev %s to be of memory-backend-file type",
    115                    object_get_canonical_path_component(OBJECT(dimm->hostmem)));
    116         return false;
    117     }
    118 
    119     return true;
    120 }
    121 
    122 
    123 void spapr_add_nvdimm(DeviceState *dev, uint64_t slot)
    124 {
    125     SpaprDrc *drc;
    126     bool hotplugged = spapr_drc_hotplugged(dev);
    127 
    128     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, slot);
    129     g_assert(drc);
    130 
    131     /*
    132      * pc_dimm_get_free_slot() provided a free slot at pre-plug. The
    133      * corresponding DRC is thus assumed to be attachable.
    134      */
    135     spapr_drc_attach(drc, dev);
    136 
    137     if (hotplugged) {
    138         spapr_hotplug_req_add_by_index(drc);
    139     }
    140 }
    141 
/*
 * Create the "ibm,pmemory@<drc-index>" device-tree node for @nvdimm
 * under @parent_offset and populate the PAPR-mandated properties
 * (identity, NUMA associativity, block geometry, label size, flush
 * requirements). Returns the fdt offset of the new child node.
 */
static int spapr_dt_nvdimm(SpaprMachineState *spapr, void *fdt,
                           int parent_offset, NVDIMMDevice *nvdimm)
{
    int child_offset;
    char *buf;
    SpaprDrc *drc;
    uint32_t drc_idx;
    uint32_t node = object_property_get_uint(OBJECT(nvdimm), PC_DIMM_NODE_PROP,
                                             &error_abort);
    uint64_t slot = object_property_get_uint(OBJECT(nvdimm), PC_DIMM_SLOT_PROP,
                                             &error_abort);
    uint64_t lsize = nvdimm->label_size;
    uint64_t size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
                                            NULL);

    /* The PMEM DRC is looked up by the same slot the device was plugged in */
    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, slot);
    g_assert(drc);

    drc_idx = spapr_drc_index(drc);

    /* The node is named after the DRC index, which also serves as "reg" */
    buf = g_strdup_printf("ibm,pmemory@%x", drc_idx);
    child_offset = fdt_add_subnode(fdt, parent_offset, buf);
    g_free(buf);

    _FDT(child_offset);

    _FDT((fdt_setprop_cell(fdt, child_offset, "reg", drc_idx)));
    _FDT((fdt_setprop_string(fdt, child_offset, "compatible", "ibm,pmemory")));
    _FDT((fdt_setprop_string(fdt, child_offset, "device_type", "ibm,pmemory")));

    /* NUMA associativity for the device's configured node */
    spapr_numa_write_associativity_dt(spapr, fdt, child_offset, node);

    buf = qemu_uuid_unparse_strdup(&nvdimm->uuid);
    _FDT((fdt_setprop_string(fdt, child_offset, "ibm,unit-guid", buf)));
    g_free(buf);

    _FDT((fdt_setprop_cell(fdt, child_offset, "ibm,my-drc-index", drc_idx)));

    /* Capacity is advertised as a count of fixed-size SCM blocks */
    _FDT((fdt_setprop_u64(fdt, child_offset, "ibm,block-size",
                          SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
    _FDT((fdt_setprop_u64(fdt, child_offset, "ibm,number-of-blocks",
                          size / SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
    _FDT((fdt_setprop_cell(fdt, child_offset, "ibm,metadata-size", lsize)));

    _FDT((fdt_setprop_string(fdt, child_offset, "ibm,pmem-application",
                             "operating-system")));
    _FDT(fdt_setprop(fdt, child_offset, "ibm,cache-flush-required", NULL, 0));

    if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) {
        bool is_pmem = false, pmem_override = false;
        PCDIMMDevice *dimm = PC_DIMM(nvdimm);
        HostMemoryBackend *hostmem = dimm->hostmem;

        is_pmem = object_property_get_bool(OBJECT(hostmem), "pmem", NULL);
        pmem_override = object_property_get_bool(OBJECT(nvdimm),
                                                 "pmem-override", NULL);
        /*
         * Advertise the flush hcall when the backend is not real pmem,
         * or when the user forced hcall flush via pmem-override=on.
         */
        if (!is_pmem || pmem_override) {
            _FDT(fdt_setprop(fdt, child_offset, "ibm,hcall-flush-required",
                             NULL, 0));
        }
    }

    return child_offset;
}
    206 
    207 int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
    208                            void *fdt, int *fdt_start_offset, Error **errp)
    209 {
    210     NVDIMMDevice *nvdimm = NVDIMM(drc->dev);
    211 
    212     *fdt_start_offset = spapr_dt_nvdimm(spapr, fdt, 0, nvdimm);
    213 
    214     return 0;
    215 }
    216 
    217 void spapr_dt_persistent_memory(SpaprMachineState *spapr, void *fdt)
    218 {
    219     int offset = fdt_subnode_offset(fdt, 0, "ibm,persistent-memory");
    220     GSList *iter, *nvdimms = nvdimm_get_device_list();
    221 
    222     if (offset < 0) {
    223         offset = fdt_add_subnode(fdt, 0, "ibm,persistent-memory");
    224         _FDT(offset);
    225         _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 0x1)));
    226         _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 0x0)));
    227         _FDT((fdt_setprop_string(fdt, offset, "device_type",
    228                                  "ibm,persistent-memory")));
    229     }
    230 
    231     /* Create DT entries for cold plugged NVDIMM devices */
    232     for (iter = nvdimms; iter; iter = iter->next) {
    233         NVDIMMDevice *nvdimm = iter->data;
    234 
    235         spapr_dt_nvdimm(spapr, fdt, offset, nvdimm);
    236     }
    237     g_slist_free(nvdimms);
    238 
    239     return;
    240 }
    241 
/*
 * H_SCM_READ_METADATA hcall: read 1, 2, 4 or 8 bytes from the label
 * (metadata) area of the NVDIMM identified by args[0] (DRC index) at
 * offset args[1]. The value is decoded big-endian and returned to the
 * guest in args[0]. Returns H_SUCCESS or an H_* error code.
 */
static target_ulong h_scm_read_metadata(PowerPCCPU *cpu,
                                        SpaprMachineState *spapr,
                                        target_ulong opcode,
                                        target_ulong *args)
{
    uint32_t drc_index = args[0];
    uint64_t offset = args[1];
    uint64_t len = args[2];
    SpaprDrc *drc = spapr_drc_by_index(drc_index);
    NVDIMMDevice *nvdimm;
    NVDIMMClass *ddc;
    uint64_t data = 0;
    uint8_t buf[8] = { 0 };

    /* The DRC must exist, be a PMEM connector, and have a device plugged */
    if (!drc || !drc->dev ||
        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
        return H_PARAMETER;
    }

    /* Only naturally-sized accesses are permitted */
    if (len != 1 && len != 2 &&
        len != 4 && len != 8) {
        return H_P3;
    }

    nvdimm = NVDIMM(drc->dev);
    /* Reject offset+len wrap-around and reads beyond the label area */
    if ((offset + len < offset) ||
        (nvdimm->label_size < len + offset)) {
        return H_P2;
    }

    ddc = NVDIMM_GET_CLASS(nvdimm);
    ddc->read_label_data(nvdimm, buf, len, offset);

    /* Decode the bytes big-endian according to the access size */
    switch (len) {
    case 1:
        data = ldub_p(buf);
        break;
    case 2:
        data = lduw_be_p(buf);
        break;
    case 4:
        data = ldl_be_p(buf);
        break;
    case 8:
        data = ldq_be_p(buf);
        break;
    default:
        g_assert_not_reached();
    }

    args[0] = data;

    return H_SUCCESS;
}
    296 
/*
 * H_SCM_WRITE_METADATA hcall: write 1, 2, 4 or 8 bytes (args[3]) of
 * value args[2] to the label (metadata) area of the NVDIMM identified
 * by args[0] (DRC index) at offset args[1]. The value is stored
 * big-endian. Returns H_SUCCESS or an H_* error code.
 */
static target_ulong h_scm_write_metadata(PowerPCCPU *cpu,
                                         SpaprMachineState *spapr,
                                         target_ulong opcode,
                                         target_ulong *args)
{
    uint32_t drc_index = args[0];
    uint64_t offset = args[1];
    uint64_t data = args[2];
    uint64_t len = args[3];
    SpaprDrc *drc = spapr_drc_by_index(drc_index);
    NVDIMMDevice *nvdimm;
    NVDIMMClass *ddc;
    uint8_t buf[8] = { 0 };

    /* The DRC must exist, be a PMEM connector, and have a device plugged */
    if (!drc || !drc->dev ||
        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
        return H_PARAMETER;
    }

    /* Only naturally-sized accesses are permitted */
    if (len != 1 && len != 2 &&
        len != 4 && len != 8) {
        return H_P4;
    }

    nvdimm = NVDIMM(drc->dev);
    /* Reject offset+len wrap-around and writes beyond the label area */
    if ((offset + len < offset) ||
        (nvdimm->label_size < len + offset)) {
        return H_P2;
    }

    /* Reject data values that do not fit in the requested width */
    switch (len) {
    case 1:
        if (data & 0xffffffffffffff00) {
            return H_P2;
        }
        stb_p(buf, data);
        break;
    case 2:
        if (data & 0xffffffffffff0000) {
            return H_P2;
        }
        stw_be_p(buf, data);
        break;
    case 4:
        if (data & 0xffffffff00000000) {
            return H_P2;
        }
        stl_be_p(buf, data);
        break;
    case 8:
        stq_be_p(buf, data);
        break;
    default:
        g_assert_not_reached();
    }

    ddc = NVDIMM_GET_CLASS(nvdimm);
    ddc->write_label_data(nvdimm, buf, len, offset);

    return H_SUCCESS;
}
    358 
/*
 * H_SCM_BIND_MEM hcall: "bind" a range of SCM blocks of the NVDIMM at
 * args[0] (DRC index) into guest memory. QEMU maps the device memory
 * itself, so this only validates the arguments and returns the guest
 * physical address of the requested block range in args[1] and the
 * bound block count in args[2].
 */
static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                   target_ulong opcode, target_ulong *args)
{
    uint32_t drc_index = args[0];
    uint64_t starting_idx = args[1];
    uint64_t no_of_scm_blocks_to_bind = args[2];
    uint64_t target_logical_mem_addr = args[3];
    uint64_t continue_token = args[4];
    uint64_t size;
    uint64_t total_no_of_scm_blocks;
    SpaprDrc *drc = spapr_drc_by_index(drc_index);
    hwaddr addr;
    NVDIMMDevice *nvdimm;

    /* The DRC must exist, be a PMEM connector, and have a device plugged */
    if (!drc || !drc->dev ||
        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
        return H_PARAMETER;
    }

    /*
     * Currently continue token should be zero: QEMU has already bound
     * everything and this hcall doesn't return H_BUSY.
     */
    if (continue_token > 0) {
        return H_P5;
    }

    /* Currently qemu assigns the address. */
    if (target_logical_mem_addr != 0xffffffffffffffff) {
        return H_OVERLAP;
    }

    nvdimm = NVDIMM(drc->dev);

    size = object_property_get_uint(OBJECT(nvdimm),
                                    PC_DIMM_SIZE_PROP, &error_abort);

    total_no_of_scm_blocks = size / SPAPR_MINIMUM_SCM_BLOCK_SIZE;

    if (starting_idx > total_no_of_scm_blocks) {
        return H_P2;
    }

    /* Reject index+count wrap-around and ranges beyond the device */
    if (((starting_idx + no_of_scm_blocks_to_bind) < starting_idx) ||
        ((starting_idx + no_of_scm_blocks_to_bind) > total_no_of_scm_blocks)) {
        return H_P3;
    }

    addr = object_property_get_uint(OBJECT(nvdimm),
                                    PC_DIMM_ADDR_PROP, &error_abort);

    addr += starting_idx * SPAPR_MINIMUM_SCM_BLOCK_SIZE;

    /* Already bound, Return target logical address in R5 */
    args[1] = addr;
    args[2] = no_of_scm_blocks_to_bind;

    return H_SUCCESS;
}
    418 
/*
 * Tracks one H_SCM_FLUSH request. A state sits on the owning device's
 * pending list while the flush worker runs and moves to the completed
 * list (with hcall_ret filled in) once the result is known.
 */
typedef struct SpaprNVDIMMDeviceFlushState {
    uint64_t continue_token;    /* non-zero token identifying this request */
    int64_t hcall_ret;          /* H_* result recorded on completion */
    uint32_t drcidx;            /* DRC index of the NVDIMM being flushed */

    QLIST_ENTRY(SpaprNVDIMMDeviceFlushState) node;
} SpaprNVDIMMDeviceFlushState;
    426 
typedef struct SpaprNVDIMMDevice SpaprNVDIMMDevice;
struct SpaprNVDIMMDevice {
    /* private */
    NVDIMMDevice parent_obj;

    /* whether the guest was told to use the H_SCM_FLUSH hcall */
    bool hcall_flush_required;
    /* monotonically increasing token counter; zero means "no job" */
    uint64_t nvdimm_flush_token;
    QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) pending_nvdimm_flush_states;
    QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) completed_nvdimm_flush_states;

    /* public */

    /*
     * The 'on' value for this property forces QEMU to enable the hcall
     * flush for the nvdimm device even if the backend is a pmem
     */
    bool pmem_override;
};
    445 
/*
 * Thread-pool worker: flush the backing store of the NVDIMM identified
 * by state->drcidx. Real pmem backends are persisted with
 * pmem_persist(); other file backends are fdatasync()'d via the
 * backing fd. Returns H_SUCCESS or H_HARDWARE for the guest.
 */
static int flush_worker_cb(void *opaque)
{
    SpaprNVDIMMDeviceFlushState *state = opaque;
    SpaprDrc *drc = spapr_drc_by_index(state->drcidx);
    PCDIMMDevice *dimm;
    HostMemoryBackend *backend;
    int backend_fd;

    g_assert(drc != NULL);

    dimm = PC_DIMM(drc->dev);
    backend = MEMORY_BACKEND(dimm->hostmem);
    /* spapr_nvdimm_validate() required a file-backed memdev, so fd >= 0 */
    backend_fd = memory_region_get_fd(&backend->mr);

    if (object_property_get_bool(OBJECT(backend), "pmem", NULL)) {
        MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
        void *ptr = memory_region_get_ram_ptr(mr);
        size_t size = object_property_get_uint(OBJECT(dimm), PC_DIMM_SIZE_PROP,
                                               NULL);

        /* flush pmem backend */
        pmem_persist(ptr, size);
    } else {
        /* flush raw backing image */
        if (qemu_fdatasync(backend_fd) < 0) {
            error_report("papr_scm: Could not sync nvdimm to backend file: %s",
                         strerror(errno));
            return H_HARDWARE;
        }
    }

    return H_SUCCESS;
}
    479 
    480 static void spapr_nvdimm_flush_completion_cb(void *opaque, int hcall_ret)
    481 {
    482     SpaprNVDIMMDeviceFlushState *state = opaque;
    483     SpaprDrc *drc = spapr_drc_by_index(state->drcidx);
    484     SpaprNVDIMMDevice *s_nvdimm;
    485 
    486     g_assert(drc != NULL);
    487 
    488     s_nvdimm = SPAPR_NVDIMM(drc->dev);
    489 
    490     state->hcall_ret = hcall_ret;
    491     QLIST_REMOVE(state, node);
    492     QLIST_INSERT_HEAD(&s_nvdimm->completed_nvdimm_flush_states, state, node);
    493 }
    494 
    495 static int spapr_nvdimm_flush_post_load(void *opaque, int version_id)
    496 {
    497     SpaprNVDIMMDevice *s_nvdimm = (SpaprNVDIMMDevice *)opaque;
    498     SpaprNVDIMMDeviceFlushState *state;
    499     ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
    500     HostMemoryBackend *backend = MEMORY_BACKEND(PC_DIMM(s_nvdimm)->hostmem);
    501     bool is_pmem = object_property_get_bool(OBJECT(backend), "pmem", NULL);
    502     bool pmem_override = object_property_get_bool(OBJECT(s_nvdimm),
    503                                                   "pmem-override", NULL);
    504     bool dest_hcall_flush_required = pmem_override || !is_pmem;
    505 
    506     if (!s_nvdimm->hcall_flush_required && dest_hcall_flush_required) {
    507         error_report("The file backend for the spapr-nvdimm device %s at "
    508                      "source is a pmem, use pmem=on and pmem-override=off to "
    509                      "continue.", DEVICE(s_nvdimm)->id);
    510         return -EINVAL;
    511     }
    512     if (s_nvdimm->hcall_flush_required && !dest_hcall_flush_required) {
    513         error_report("The guest expects hcall-flush support for the "
    514                      "spapr-nvdimm device %s, use pmem_override=on to "
    515                      "continue.", DEVICE(s_nvdimm)->id);
    516         return -EINVAL;
    517     }
    518 
    519     QLIST_FOREACH(state, &s_nvdimm->pending_nvdimm_flush_states, node) {
    520         thread_pool_submit_aio(pool, flush_worker_cb, state,
    521                                spapr_nvdimm_flush_completion_cb, state);
    522     }
    523 
    524     return 0;
    525 }
    526 
/* Migration description of one in-flight/completed flush request */
static const VMStateDescription vmstate_spapr_nvdimm_flush_state = {
     .name = "spapr_nvdimm_flush_state",
     .version_id = 1,
     .minimum_version_id = 1,
     .fields = (VMStateField[]) {
         VMSTATE_UINT64(continue_token, SpaprNVDIMMDeviceFlushState),
         VMSTATE_INT64(hcall_ret, SpaprNVDIMMDeviceFlushState),
         VMSTATE_UINT32(drcidx, SpaprNVDIMMDeviceFlushState),
         VMSTATE_END_OF_LIST()
     },
};
    538 
/*
 * Migration description of a spapr-nvdimm device's flush bookkeeping:
 * the flush-required flag, the token counter, and both flush-state
 * lists. post_load revalidates the backend and requeues pending work.
 */
const VMStateDescription vmstate_spapr_nvdimm_states = {
    .name = "spapr_nvdimm_states",
    .version_id = 1,
    .minimum_version_id = 1,
    .post_load = spapr_nvdimm_flush_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_BOOL(hcall_flush_required, SpaprNVDIMMDevice),
        VMSTATE_UINT64(nvdimm_flush_token, SpaprNVDIMMDevice),
        VMSTATE_QLIST_V(completed_nvdimm_flush_states, SpaprNVDIMMDevice, 1,
                        vmstate_spapr_nvdimm_flush_state,
                        SpaprNVDIMMDeviceFlushState, node),
        VMSTATE_QLIST_V(pending_nvdimm_flush_states, SpaprNVDIMMDevice, 1,
                        vmstate_spapr_nvdimm_flush_state,
                        SpaprNVDIMMDeviceFlushState, node),
        VMSTATE_END_OF_LIST()
    },
};
    556 
    557 /*
    558  * Assign a token and reserve it for the new flush state.
    559  */
    560 static SpaprNVDIMMDeviceFlushState *spapr_nvdimm_init_new_flush_state(
    561                                                 SpaprNVDIMMDevice *spapr_nvdimm)
    562 {
    563     SpaprNVDIMMDeviceFlushState *state;
    564 
    565     state = g_malloc0(sizeof(*state));
    566 
    567     spapr_nvdimm->nvdimm_flush_token++;
    568     /* Token zero is presumed as no job pending. Assert on overflow to zero */
    569     g_assert(spapr_nvdimm->nvdimm_flush_token != 0);
    570 
    571     state->continue_token = spapr_nvdimm->nvdimm_flush_token;
    572 
    573     QLIST_INSERT_HEAD(&spapr_nvdimm->pending_nvdimm_flush_states, state, node);
    574 
    575     return state;
    576 }
    577 
/*
 * spapr_nvdimm_finish_flushes
 *      Waits for all pending flush requests to complete
 *      their execution and frees the states
 */
void spapr_nvdimm_finish_flushes(void)
{
    SpaprNVDIMMDeviceFlushState *state, *next;
    GSList *list, *nvdimms;

    /*
     * Called on the reset path: the main loop thread, which runs the
     * pending BHs, is itself executing the reset and has reached here,
     * so polling the AIO context below makes progress. The other caller
     * is h_client_architecture_support, i.e. early boot.
     */
    nvdimms = nvdimm_get_device_list();
    for (list = nvdimms; list; list = list->next) {
        NVDIMMDevice *nvdimm = list->data;
        if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) {
            SpaprNVDIMMDevice *s_nvdimm = SPAPR_NVDIMM(nvdimm);
            /* Drive the event loop until every pending flush completes */
            while (!QLIST_EMPTY(&s_nvdimm->pending_nvdimm_flush_states)) {
                aio_poll(qemu_get_aio_context(), true);
            }

            /* Discard completed states: their results are no longer wanted */
            QLIST_FOREACH_SAFE(state, &s_nvdimm->completed_nvdimm_flush_states,
                               node, next) {
                QLIST_REMOVE(state, node);
                g_free(state);
            }
        }
    }
    g_slist_free(nvdimms);
}
    612 
    613 /*
    614  * spapr_nvdimm_get_flush_status
    615  *      Fetches the status of the hcall worker and returns
    616  *      H_LONG_BUSY_ORDER_10_MSEC if the worker is still running.
    617  */
    618 static int spapr_nvdimm_get_flush_status(SpaprNVDIMMDevice *s_nvdimm,
    619                                          uint64_t token)
    620 {
    621     SpaprNVDIMMDeviceFlushState *state, *node;
    622 
    623     QLIST_FOREACH(state, &s_nvdimm->pending_nvdimm_flush_states, node) {
    624         if (state->continue_token == token) {
    625             return H_LONG_BUSY_ORDER_10_MSEC;
    626         }
    627     }
    628 
    629     QLIST_FOREACH_SAFE(state, &s_nvdimm->completed_nvdimm_flush_states,
    630                        node, node) {
    631         if (state->continue_token == token) {
    632             int ret = state->hcall_ret;
    633             QLIST_REMOVE(state, node);
    634             g_free(state);
    635             return ret;
    636         }
    637     }
    638 
    639     /* If not found in complete list too, invalid token */
    640     return H_P2;
    641 }
    642 
/*
 * H_SCM_FLUSH
 * Input: drc_index, continue-token
 * Out: continue-token
 * Return Value: H_SUCCESS, H_Parameter, H_P2, H_LONG_BUSY_ORDER_10_MSEC,
 *               H_UNSUPPORTED
 *
 * Given a DRC Index Flush the data to backend NVDIMM device. The hcall returns
 * H_LONG_BUSY_ORDER_10_MSEC when the flush takes longer time and the hcall
 * needs to be issued multiple times in order to be completely serviced. The
 * continue-token from the output to be passed in the argument list of
 * subsequent hcalls until the hcall is completely serviced at which point
 * H_SUCCESS or other error is returned.
 */
static target_ulong h_scm_flush(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                target_ulong opcode, target_ulong *args)
{
    int ret;
    uint32_t drc_index = args[0];
    uint64_t continue_token = args[1];
    SpaprDrc *drc = spapr_drc_by_index(drc_index);
    PCDIMMDevice *dimm;
    HostMemoryBackend *backend = NULL;
    SpaprNVDIMMDeviceFlushState *state;
    ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
    int fd;

    /* The DRC must exist, be a PMEM connector, and have a device plugged */
    if (!drc || !drc->dev ||
        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
        return H_PARAMETER;
    }

    /* Only the spapr-nvdimm device type supports the flush hcall */
    dimm = PC_DIMM(drc->dev);
    if (!object_dynamic_cast(OBJECT(dimm), TYPE_SPAPR_NVDIMM)) {
        return H_PARAMETER;
    }
    /* Token zero starts a new flush; non-zero polls an existing one */
    if (continue_token == 0) {
        bool is_pmem = false, pmem_override = false;
        backend = MEMORY_BACKEND(dimm->hostmem);
        fd = memory_region_get_fd(&backend->mr);

        if (fd < 0) {
            return H_UNSUPPORTED;
        }

        /* A real pmem backend (without override) needs no hcall flush */
        is_pmem = object_property_get_bool(OBJECT(backend), "pmem", NULL);
        pmem_override = object_property_get_bool(OBJECT(dimm),
                                                "pmem-override", NULL);
        if (is_pmem && !pmem_override) {
            return H_UNSUPPORTED;
        }

        state = spapr_nvdimm_init_new_flush_state(SPAPR_NVDIMM(dimm));
        if (!state) {
            return H_HARDWARE;
        }

        state->drcidx = drc_index;

        /* Run the flush off the vCPU thread; completion_cb records result */
        thread_pool_submit_aio(pool, flush_worker_cb, state,
                               spapr_nvdimm_flush_completion_cb, state);

        continue_token = state->continue_token;
    }

    ret = spapr_nvdimm_get_flush_status(SPAPR_NVDIMM(dimm), continue_token);
    /* Still busy: hand the token back so the guest can poll again */
    if (H_IS_LONG_BUSY(ret)) {
        args[0] = continue_token;
    }

    return ret;
}
    715 
/*
 * H_SCM_UNBIND_MEM hcall: validate a request to unbind a range of SCM
 * blocks of the NVDIMM at args[0] (DRC index). Only argument checking
 * is done here — the actual unbind is handled at device unplug. The
 * unbound block count is returned in args[1].
 */
static target_ulong h_scm_unbind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                     target_ulong opcode, target_ulong *args)
{
    uint32_t drc_index = args[0];
    uint64_t starting_scm_logical_addr = args[1];
    uint64_t no_of_scm_blocks_to_unbind = args[2];
    uint64_t continue_token = args[3];
    uint64_t size_to_unbind;
    Range blockrange = range_empty;
    Range nvdimmrange = range_empty;
    SpaprDrc *drc = spapr_drc_by_index(drc_index);
    NVDIMMDevice *nvdimm;
    uint64_t size, addr;

    /* The DRC must exist, be a PMEM connector, and have a device plugged */
    if (!drc || !drc->dev ||
        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
        return H_PARAMETER;
    }

    /* continue_token should be zero as this hcall doesn't return H_BUSY. */
    if (continue_token > 0) {
        return H_P4;
    }

    /* Check if starting_scm_logical_addr is block aligned */
    if (!QEMU_IS_ALIGNED(starting_scm_logical_addr,
                         SPAPR_MINIMUM_SCM_BLOCK_SIZE)) {
        return H_P2;
    }

    /* A zero count or a multiplication overflow invalidates the request */
    size_to_unbind = no_of_scm_blocks_to_unbind * SPAPR_MINIMUM_SCM_BLOCK_SIZE;
    if (no_of_scm_blocks_to_unbind == 0 || no_of_scm_blocks_to_unbind !=
                               size_to_unbind / SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
        return H_P3;
    }

    nvdimm = NVDIMM(drc->dev);
    size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
                                   &error_abort);
    addr = object_property_get_int(OBJECT(nvdimm), PC_DIMM_ADDR_PROP,
                                   &error_abort);

    /* The requested range must lie entirely within the device's memory */
    range_init_nofail(&nvdimmrange, addr, size);
    range_init_nofail(&blockrange, starting_scm_logical_addr, size_to_unbind);

    if (!range_contains_range(&nvdimmrange, &blockrange)) {
        return H_P3;
    }

    args[1] = no_of_scm_blocks_to_unbind;

    /* let unplug take care of actual unbind */
    return H_SUCCESS;
}
    770 
/* Scope selectors for H_SCM_UNBIND_ALL */
#define H_UNBIND_SCOPE_ALL 0x1
#define H_UNBIND_SCOPE_DRC 0x2

/*
 * H_SCM_UNBIND_ALL hcall: report the number of SCM blocks that would be
 * unbound for either a single NVDIMM (scope DRC, identified by args[1])
 * or every NVDIMM in the machine (scope ALL). Only counting is done
 * here — the actual unbind is handled at device unplug. The block count
 * is returned in args[1].
 */
static target_ulong h_scm_unbind_all(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                     target_ulong opcode, target_ulong *args)
{
    uint64_t target_scope = args[0];
    uint32_t drc_index = args[1];
    uint64_t continue_token = args[2];
    NVDIMMDevice *nvdimm;
    uint64_t size;
    uint64_t no_of_scm_blocks_unbound = 0;

    /* continue_token should be zero as this hcall doesn't return H_BUSY. */
    if (continue_token > 0) {
        return H_P4;
    }

    if (target_scope == H_UNBIND_SCOPE_DRC) {
        /* Single device: the DRC must be a plugged PMEM connector */
        SpaprDrc *drc = spapr_drc_by_index(drc_index);

        if (!drc || !drc->dev ||
            spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
            return H_P2;
        }

        nvdimm = NVDIMM(drc->dev);
        size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
                                       &error_abort);

        no_of_scm_blocks_unbound = size / SPAPR_MINIMUM_SCM_BLOCK_SIZE;
    } else if (target_scope ==  H_UNBIND_SCOPE_ALL) {
        /* All devices: sum the block counts of every NVDIMM */
        GSList *list, *nvdimms;

        nvdimms = nvdimm_get_device_list();
        for (list = nvdimms; list; list = list->next) {
            nvdimm = list->data;
            size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
                                           &error_abort);

            no_of_scm_blocks_unbound += size / SPAPR_MINIMUM_SCM_BLOCK_SIZE;
        }
        g_slist_free(nvdimms);
    } else {
        return H_PARAMETER;
    }

    args[1] = no_of_scm_blocks_unbound;

    /* let unplug take care of actual unbind */
    return H_SUCCESS;
}
    823 
    824 static target_ulong h_scm_health(PowerPCCPU *cpu, SpaprMachineState *spapr,
    825                                  target_ulong opcode, target_ulong *args)
    826 {
    827 
    828     NVDIMMDevice *nvdimm;
    829     uint64_t hbitmap = 0;
    830     uint32_t drc_index = args[0];
    831     SpaprDrc *drc = spapr_drc_by_index(drc_index);
    832     const uint64_t hbitmap_mask = PAPR_PMEM_UNARMED;
    833 
    834 
    835     /* Ensure that the drc is valid & is valid PMEM dimm and is plugged in */
    836     if (!drc || !drc->dev ||
    837         spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
    838         return H_PARAMETER;
    839     }
    840 
    841     nvdimm = NVDIMM(drc->dev);
    842 
    843     /* Update if the nvdimm is unarmed and send its status via health bitmaps */
    844     if (object_property_get_bool(OBJECT(nvdimm), NVDIMM_UNARMED_PROP, NULL)) {
    845         hbitmap |= PAPR_PMEM_UNARMED;
    846     }
    847 
    848     /* Update the out args with health bitmap/mask */
    849     args[0] = hbitmap;
    850     args[1] = hbitmap_mask;
    851 
    852     return H_SUCCESS;
    853 }
    854 
    855 static void spapr_scm_register_types(void)
    856 {
    857     /* qemu/scm specific hcalls */
    858     spapr_register_hypercall(H_SCM_READ_METADATA, h_scm_read_metadata);
    859     spapr_register_hypercall(H_SCM_WRITE_METADATA, h_scm_write_metadata);
    860     spapr_register_hypercall(H_SCM_BIND_MEM, h_scm_bind_mem);
    861     spapr_register_hypercall(H_SCM_UNBIND_MEM, h_scm_unbind_mem);
    862     spapr_register_hypercall(H_SCM_UNBIND_ALL, h_scm_unbind_all);
    863     spapr_register_hypercall(H_SCM_HEALTH, h_scm_health);
    864     spapr_register_hypercall(H_SCM_FLUSH, h_scm_flush);
    865 }
    866 
    867 type_init(spapr_scm_register_types)
    868 
    869 static void spapr_nvdimm_realize(NVDIMMDevice *dimm, Error **errp)
    870 {
    871     SpaprNVDIMMDevice *s_nvdimm = SPAPR_NVDIMM(dimm);
    872     HostMemoryBackend *backend = MEMORY_BACKEND(PC_DIMM(dimm)->hostmem);
    873     bool is_pmem = object_property_get_bool(OBJECT(backend),  "pmem", NULL);
    874     bool pmem_override = object_property_get_bool(OBJECT(dimm), "pmem-override",
    875                                              NULL);
    876     if (!is_pmem || pmem_override) {
    877         s_nvdimm->hcall_flush_required = true;
    878     }
    879 
    880     vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY,
    881                      &vmstate_spapr_nvdimm_states, dimm);
    882 }
    883 
/* Unrealize hook: drop the vmstate handler registered at realize time. */
static void spapr_nvdimm_unrealize(NVDIMMDevice *dimm)
{
    vmstate_unregister(NULL, &vmstate_spapr_nvdimm_states, dimm);
}
    888 
static Property spapr_nvdimm_properties[] = {
#ifdef CONFIG_LIBPMEM
    /*
     * When set, the device requires the H_SCM_FLUSH hcall even if the
     * backend is real pmem.  Only available in builds with libpmem.
     */
    DEFINE_PROP_BOOL("pmem-override", SpaprNVDIMMDevice, pmem_override, false),
#endif
    DEFINE_PROP_END_OF_LIST(),
};
    895 
    896 static void spapr_nvdimm_class_init(ObjectClass *oc, void *data)
    897 {
    898     DeviceClass *dc = DEVICE_CLASS(oc);
    899     NVDIMMClass *nvc = NVDIMM_CLASS(oc);
    900 
    901     nvc->realize = spapr_nvdimm_realize;
    902     nvc->unrealize = spapr_nvdimm_unrealize;
    903 
    904     device_class_set_props(dc, spapr_nvdimm_properties);
    905 }
    906 
    907 static void spapr_nvdimm_init(Object *obj)
    908 {
    909     SpaprNVDIMMDevice *s_nvdimm = SPAPR_NVDIMM(obj);
    910 
    911     s_nvdimm->hcall_flush_required = false;
    912     QLIST_INIT(&s_nvdimm->pending_nvdimm_flush_states);
    913     QLIST_INIT(&s_nvdimm->completed_nvdimm_flush_states);
    914 }
    915 
/* QOM type description: the sPAPR flavour of the generic NVDIMM device. */
static TypeInfo spapr_nvdimm_info = {
    .name          = TYPE_SPAPR_NVDIMM,
    .parent        = TYPE_NVDIMM,
    .class_init    = spapr_nvdimm_class_init,
    .class_size    = sizeof(SPAPRNVDIMMClass),
    .instance_size = sizeof(SpaprNVDIMMDevice),
    .instance_init = spapr_nvdimm_init,
};
    924 
/* Register TYPE_SPAPR_NVDIMM with the QOM type system. */
static void spapr_nvdimm_register_types(void)
{
    type_register_static(&spapr_nvdimm_info);
}
    929 
    930 type_init(spapr_nvdimm_register_types)