qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git

physmem.c (119014B)


      1 /*
      2  * RAM allocation and memory access
      3  *
      4  *  Copyright (c) 2003 Fabrice Bellard
      5  *
      6  * This library is free software; you can redistribute it and/or
      7  * modify it under the terms of the GNU Lesser General Public
      8  * License as published by the Free Software Foundation; either
      9  * version 2.1 of the License, or (at your option) any later version.
     10  *
     11  * This library is distributed in the hope that it will be useful,
     12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14  * Lesser General Public License for more details.
     15  *
     16  * You should have received a copy of the GNU Lesser General Public
     17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
     18  */
     19 
     20 #include "qemu/osdep.h"
     21 #include "exec/page-vary.h"
     22 #include "qapi/error.h"
     23 
     24 #include "qemu/cutils.h"
     25 #include "qemu/cacheflush.h"
     26 #include "qemu/madvise.h"
     27 
     28 #ifdef CONFIG_TCG
     29 #include "hw/core/tcg-cpu-ops.h"
     30 #endif /* CONFIG_TCG */
     31 
     32 #include "exec/exec-all.h"
     33 #include "exec/target_page.h"
     34 #include "hw/qdev-core.h"
     35 #include "hw/qdev-properties.h"
     36 #include "hw/boards.h"
     37 #include "hw/xen/xen.h"
     38 #include "sysemu/kvm.h"
     39 #include "sysemu/tcg.h"
     40 #include "sysemu/qtest.h"
     41 #include "qemu/timer.h"
     42 #include "qemu/config-file.h"
     43 #include "qemu/error-report.h"
     44 #include "qemu/qemu-print.h"
     45 #include "qemu/log.h"
     46 #include "qemu/memalign.h"
     47 #include "exec/memory.h"
     48 #include "exec/ioport.h"
     49 #include "sysemu/dma.h"
     50 #include "sysemu/hostmem.h"
     51 #include "sysemu/hw_accel.h"
     52 #include "sysemu/xen-mapcache.h"
     53 #include "trace/trace-root.h"
     54 
     55 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
     56 #include <linux/falloc.h>
     57 #endif
     58 
     59 #include "qemu/rcu_queue.h"
     60 #include "qemu/main-loop.h"
     61 #include "exec/translate-all.h"
     62 #include "sysemu/replay.h"
     63 
     64 #include "exec/memory-internal.h"
     65 #include "exec/ram_addr.h"
     66 
     67 #include "qemu/pmem.h"
     68 
     69 #include "migration/vmstate.h"
     70 
     71 #include "qemu/range.h"
     72 #ifndef _WIN32
     73 #include "qemu/mmap-alloc.h"
     74 #endif
     75 
     76 #include "monitor/monitor.h"
     77 
     78 #ifdef CONFIG_LIBDAXCTL
     79 #include <daxctl/libdaxctl.h>
     80 #endif
     81 
     82 //#define DEBUG_SUBPAGE
     83 
     84 /* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
     85  * are protected by the ramlist lock.
     86  */
     87 RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
     88 
     89 static MemoryRegion *system_memory;
     90 static MemoryRegion *system_io;
     91 
     92 AddressSpace address_space_io;
     93 AddressSpace address_space_memory;
     94 
     95 static MemoryRegion io_mem_unassigned;
     96 
     97 typedef struct PhysPageEntry PhysPageEntry;
     98 
     99 struct PhysPageEntry {
     100     /* How many bits to skip to the next level (in units of L2_SIZE). 0 for a leaf. */
    101     uint32_t skip : 6;
     102     /* index into phys_sections (!skip) or phys_map_nodes (skip) */
    103     uint32_t ptr : 26;
    104 };
    105 
    106 #define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
    107 
    108 /* Size of the L2 (and L3, etc) page tables.  */
    109 #define ADDR_SPACE_BITS 64
    110 
    111 #define P_L2_BITS 9
    112 #define P_L2_SIZE (1 << P_L2_BITS)
    113 
    114 #define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
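         /*
          * Worked example, assuming a 4 KiB target page (TARGET_PAGE_BITS == 12):
          * P_L2_LEVELS = ((64 - 12 - 1) / 9) + 1 = 6, i.e. six radix-tree levels of
          * 9 bits each (54 bits), enough to index the 52-bit page number.
          */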
    115 
    116 typedef PhysPageEntry Node[P_L2_SIZE];
    117 
    118 typedef struct PhysPageMap {
    119     struct rcu_head rcu;
    120 
    121     unsigned sections_nb;
    122     unsigned sections_nb_alloc;
    123     unsigned nodes_nb;
    124     unsigned nodes_nb_alloc;
    125     Node *nodes;
    126     MemoryRegionSection *sections;
    127 } PhysPageMap;
    128 
    129 struct AddressSpaceDispatch {
    130     MemoryRegionSection *mru_section;
    131     /* This is a multi-level map on the physical address space.
    132      * The bottom level has pointers to MemoryRegionSections.
    133      */
    134     PhysPageEntry phys_map;
    135     PhysPageMap map;
    136 };
    137 
    138 #define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
    139 typedef struct subpage_t {
    140     MemoryRegion iomem;
    141     FlatView *fv;
    142     hwaddr base;
    143     uint16_t sub_section[];
    144 } subpage_t;
    145 
    146 #define PHYS_SECTION_UNASSIGNED 0
    147 
    148 static void io_mem_init(void);
    149 static void memory_map_init(void);
    150 static void tcg_log_global_after_sync(MemoryListener *listener);
    151 static void tcg_commit(MemoryListener *listener);
    152 
    153 /**
    154  * CPUAddressSpace: all the information a CPU needs about an AddressSpace
    155  * @cpu: the CPU whose AddressSpace this is
    156  * @as: the AddressSpace itself
    157  * @memory_dispatch: its dispatch pointer (cached, RCU protected)
    158  * @tcg_as_listener: listener for tracking changes to the AddressSpace
    159  */
    160 struct CPUAddressSpace {
    161     CPUState *cpu;
    162     AddressSpace *as;
    163     struct AddressSpaceDispatch *memory_dispatch;
    164     MemoryListener tcg_as_listener;
    165 };
    166 
    167 struct DirtyBitmapSnapshot {
    168     ram_addr_t start;
    169     ram_addr_t end;
    170     unsigned long dirty[];
    171 };
    172 
    173 static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
    174 {
    175     static unsigned alloc_hint = 16;
    176     if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
    177         map->nodes_nb_alloc = MAX(alloc_hint, map->nodes_nb + nodes);
    178         map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
    179         alloc_hint = map->nodes_nb_alloc;
    180     }
    181 }
    182 
    183 static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
    184 {
    185     unsigned i;
    186     uint32_t ret;
    187     PhysPageEntry e;
    188     PhysPageEntry *p;
    189 
    190     ret = map->nodes_nb++;
    191     p = map->nodes[ret];
    192     assert(ret != PHYS_MAP_NODE_NIL);
    193     assert(ret != map->nodes_nb_alloc);
    194 
    195     e.skip = leaf ? 0 : 1;
    196     e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
    197     for (i = 0; i < P_L2_SIZE; ++i) {
    198         memcpy(&p[i], &e, sizeof(e));
    199     }
    200     return ret;
    201 }
    202 
    203 static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
    204                                 hwaddr *index, uint64_t *nb, uint16_t leaf,
    205                                 int level)
    206 {
    207     PhysPageEntry *p;
    208     hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
    209 
    210     if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
    211         lp->ptr = phys_map_node_alloc(map, level == 0);
    212     }
    213     p = map->nodes[lp->ptr];
    214     lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
    215 
    216     while (*nb && lp < &p[P_L2_SIZE]) {
    217         if ((*index & (step - 1)) == 0 && *nb >= step) {
    218             lp->skip = 0;
    219             lp->ptr = leaf;
    220             *index += step;
    221             *nb -= step;
    222         } else {
    223             phys_page_set_level(map, lp, index, nb, leaf, level - 1);
    224         }
    225         ++lp;
    226     }
    227 }
    228 
    229 static void phys_page_set(AddressSpaceDispatch *d,
    230                           hwaddr index, uint64_t nb,
    231                           uint16_t leaf)
    232 {
    233     /* Wildly overreserve - it doesn't matter much. */
    234     phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
    235 
    236     phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
    237 }
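         /*
          * How this is used below: phys_page_set() points @nb consecutive leaves,
          * starting at page @index, at section number @leaf.  register_subpage()
          * calls it for a single page, register_multipage() for a run of whole pages.
          */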
    238 
     239 /* Compact a non-leaf page entry. Simply detect that the entry has a single child,
    240  * and update our entry so we can skip it and go directly to the destination.
    241  */
    242 static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
    243 {
    244     unsigned valid_ptr = P_L2_SIZE;
    245     int valid = 0;
    246     PhysPageEntry *p;
    247     int i;
    248 
    249     if (lp->ptr == PHYS_MAP_NODE_NIL) {
    250         return;
    251     }
    252 
    253     p = nodes[lp->ptr];
    254     for (i = 0; i < P_L2_SIZE; i++) {
    255         if (p[i].ptr == PHYS_MAP_NODE_NIL) {
    256             continue;
    257         }
    258 
    259         valid_ptr = i;
    260         valid++;
    261         if (p[i].skip) {
    262             phys_page_compact(&p[i], nodes);
    263         }
    264     }
    265 
    266     /* We can only compress if there's only one child. */
    267     if (valid != 1) {
    268         return;
    269     }
    270 
    271     assert(valid_ptr < P_L2_SIZE);
    272 
    273     /* Don't compress if it won't fit in the # of bits we have. */
    274     if (P_L2_LEVELS >= (1 << 6) &&
    275         lp->skip + p[valid_ptr].skip >= (1 << 6)) {
    276         return;
    277     }
    278 
    279     lp->ptr = p[valid_ptr].ptr;
    280     if (!p[valid_ptr].skip) {
    281         /* If our only child is a leaf, make this a leaf. */
    282         /* By design, we should have made this node a leaf to begin with so we
    283          * should never reach here.
    284          * But since it's so simple to handle this, let's do it just in case we
    285          * change this rule.
    286          */
    287         lp->skip = 0;
    288     } else {
    289         lp->skip += p[valid_ptr].skip;
    290     }
    291 }
    292 
    293 void address_space_dispatch_compact(AddressSpaceDispatch *d)
    294 {
    295     if (d->phys_map.skip) {
    296         phys_page_compact(&d->phys_map, d->map.nodes);
    297     }
    298 }
    299 
    300 static inline bool section_covers_addr(const MemoryRegionSection *section,
    301                                        hwaddr addr)
    302 {
    303     /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
    304      * the section must cover the entire address space.
    305      */
    306     return int128_gethi(section->size) ||
    307            range_covers_byte(section->offset_within_address_space,
    308                              int128_getlo(section->size), addr);
    309 }
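         /*
          * Example: a section at offset_within_address_space 0x1000 with a 64-bit
          * size of 0x2000 covers 0x1000..0x2fff; a section whose Int128 size has a
          * non-zero high word is treated as covering the whole address space.
          */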
    310 
    311 static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr)
    312 {
    313     PhysPageEntry lp = d->phys_map, *p;
    314     Node *nodes = d->map.nodes;
    315     MemoryRegionSection *sections = d->map.sections;
    316     hwaddr index = addr >> TARGET_PAGE_BITS;
    317     int i;
    318 
    319     for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
    320         if (lp.ptr == PHYS_MAP_NODE_NIL) {
    321             return &sections[PHYS_SECTION_UNASSIGNED];
    322         }
    323         p = nodes[lp.ptr];
    324         lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
    325     }
    326 
    327     if (section_covers_addr(&sections[lp.ptr], addr)) {
    328         return &sections[lp.ptr];
    329     } else {
    330         return &sections[PHYS_SECTION_UNASSIGNED];
    331     }
    332 }
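         /*
          * Sketch of the walk above: each iteration descends lp.skip levels at once
          * (more than one only after compaction) and picks the child slot from the
          * 9 page-index bits of that level; it stops at a leaf (skip == 0) or at an
          * unassigned subtree.
          */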
    333 
    334 /* Called from RCU critical section */
    335 static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
    336                                                         hwaddr addr,
    337                                                         bool resolve_subpage)
    338 {
    339     MemoryRegionSection *section = qatomic_read(&d->mru_section);
    340     subpage_t *subpage;
    341 
    342     if (!section || section == &d->map.sections[PHYS_SECTION_UNASSIGNED] ||
    343         !section_covers_addr(section, addr)) {
    344         section = phys_page_find(d, addr);
    345         qatomic_set(&d->mru_section, section);
    346     }
    347     if (resolve_subpage && section->mr->subpage) {
    348         subpage = container_of(section->mr, subpage_t, iomem);
    349         section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
    350     }
    351     return section;
    352 }
    353 
    354 /* Called from RCU critical section */
    355 static MemoryRegionSection *
    356 address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
    357                                  hwaddr *plen, bool resolve_subpage)
    358 {
    359     MemoryRegionSection *section;
    360     MemoryRegion *mr;
    361     Int128 diff;
    362 
    363     section = address_space_lookup_region(d, addr, resolve_subpage);
    364     /* Compute offset within MemoryRegionSection */
    365     addr -= section->offset_within_address_space;
    366 
    367     /* Compute offset within MemoryRegion */
    368     *xlat = addr + section->offset_within_region;
    369 
    370     mr = section->mr;
    371 
    372     /* MMIO registers can be expected to perform full-width accesses based only
    373      * on their address, without considering adjacent registers that could
    374      * decode to completely different MemoryRegions.  When such registers
    375      * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
    376      * regions overlap wildly.  For this reason we cannot clamp the accesses
    377      * here.
    378      *
    379      * If the length is small (as is the case for address_space_ldl/stl),
    380      * everything works fine.  If the incoming length is large, however,
    381      * the caller really has to do the clamping through memory_access_size.
    382      */
    383     if (memory_region_is_ram(mr)) {
    384         diff = int128_sub(section->size, int128_make64(addr));
    385         *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
    386     }
    387     return section;
    388 }
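         /*
          * Clamping example: if @addr lands 0x100 bytes before the end of a RAM
          * section and *plen comes in as 0x1000, *plen is reduced to 0x100.  For
          * non-RAM (MMIO) sections *plen is left untouched, as explained above.
          */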
    389 
    390 /**
    391  * address_space_translate_iommu - translate an address through an IOMMU
    392  * memory region and then through the target address space.
    393  *
    394  * @iommu_mr: the IOMMU memory region that we start the translation from
    395  * @addr: the address to be translated through the MMU
    396  * @xlat: the translated address offset within the destination memory region.
    397  *        It cannot be %NULL.
    398  * @plen_out: valid read/write length of the translated address. It
    399  *            cannot be %NULL.
     400  * @page_mask_out: page mask for the translated address. This is
     401  *            only meaningful for IOMMU-translated addresses, since
     402  *            huge pages may be in use and only the page mask can
     403  *            report their true size. It can be %NULL if we don't care about it.
    404  * @is_write: whether the translation operation is for write
    405  * @is_mmio: whether this can be MMIO, set true if it can
    406  * @target_as: the address space targeted by the IOMMU
    407  * @attrs: transaction attributes
    408  *
    409  * This function is called from RCU critical section.  It is the common
    410  * part of flatview_do_translate and address_space_translate_cached.
    411  */
    412 static MemoryRegionSection address_space_translate_iommu(IOMMUMemoryRegion *iommu_mr,
    413                                                          hwaddr *xlat,
    414                                                          hwaddr *plen_out,
    415                                                          hwaddr *page_mask_out,
    416                                                          bool is_write,
    417                                                          bool is_mmio,
    418                                                          AddressSpace **target_as,
    419                                                          MemTxAttrs attrs)
    420 {
    421     MemoryRegionSection *section;
    422     hwaddr page_mask = (hwaddr)-1;
    423 
    424     do {
    425         hwaddr addr = *xlat;
    426         IOMMUMemoryRegionClass *imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
    427         int iommu_idx = 0;
    428         IOMMUTLBEntry iotlb;
    429 
    430         if (imrc->attrs_to_index) {
    431             iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
    432         }
    433 
    434         iotlb = imrc->translate(iommu_mr, addr, is_write ?
    435                                 IOMMU_WO : IOMMU_RO, iommu_idx);
    436 
    437         if (!(iotlb.perm & (1 << is_write))) {
    438             goto unassigned;
    439         }
    440 
    441         addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
    442                 | (addr & iotlb.addr_mask));
    443         page_mask &= iotlb.addr_mask;
    444         *plen_out = MIN(*plen_out, (addr | iotlb.addr_mask) - addr + 1);
    445         *target_as = iotlb.target_as;
    446 
    447         section = address_space_translate_internal(
    448                 address_space_to_dispatch(iotlb.target_as), addr, xlat,
    449                 plen_out, is_mmio);
    450 
    451         iommu_mr = memory_region_get_iommu(section->mr);
    452     } while (unlikely(iommu_mr));
    453 
    454     if (page_mask_out) {
    455         *page_mask_out = page_mask;
    456     }
    457     return *section;
    458 
    459 unassigned:
    460     return (MemoryRegionSection) { .mr = &io_mem_unassigned };
    461 }
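         /*
          * Illustration of the address combining in the loop above, assuming a
          * 4 KiB IOMMU mapping (addr_mask == 0xfff): with translated_addr ==
          * 0x80000000, an input address of 0x1234 becomes 0x80000234.
          */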
    462 
    463 /**
    464  * flatview_do_translate - translate an address in FlatView
    465  *
    466  * @fv: the flat view that we want to translate on
    467  * @addr: the address to be translated in above address space
     468  * @xlat: the translated address offset within memory region. It
     469  *        cannot be %NULL.
     470  * @plen_out: valid read/write length of the translated address. It
     471  *            can be %NULL when we don't care about it.
     472  * @page_mask_out: page mask for the translated address. This is
     473  *            only meaningful for IOMMU-translated addresses, since
     474  *            huge pages may be in use and only the page mask can
     475  *            report their true size. It can be %NULL if we don't care about it.
    476  * @is_write: whether the translation operation is for write
    477  * @is_mmio: whether this can be MMIO, set true if it can
    478  * @target_as: the address space targeted by the IOMMU
    479  * @attrs: memory transaction attributes
    480  *
    481  * This function is called from RCU critical section
    482  */
    483 static MemoryRegionSection flatview_do_translate(FlatView *fv,
    484                                                  hwaddr addr,
    485                                                  hwaddr *xlat,
    486                                                  hwaddr *plen_out,
    487                                                  hwaddr *page_mask_out,
    488                                                  bool is_write,
    489                                                  bool is_mmio,
    490                                                  AddressSpace **target_as,
    491                                                  MemTxAttrs attrs)
    492 {
    493     MemoryRegionSection *section;
    494     IOMMUMemoryRegion *iommu_mr;
    495     hwaddr plen = (hwaddr)(-1);
    496 
    497     if (!plen_out) {
    498         plen_out = &plen;
    499     }
    500 
    501     section = address_space_translate_internal(
    502             flatview_to_dispatch(fv), addr, xlat,
    503             plen_out, is_mmio);
    504 
    505     iommu_mr = memory_region_get_iommu(section->mr);
    506     if (unlikely(iommu_mr)) {
    507         return address_space_translate_iommu(iommu_mr, xlat,
    508                                              plen_out, page_mask_out,
    509                                              is_write, is_mmio,
    510                                              target_as, attrs);
    511     }
    512     if (page_mask_out) {
    513         /* Not behind an IOMMU, use default page size. */
    514         *page_mask_out = ~TARGET_PAGE_MASK;
    515     }
    516 
    517     return *section;
    518 }
    519 
    520 /* Called from RCU critical section */
    521 IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
    522                                             bool is_write, MemTxAttrs attrs)
    523 {
    524     MemoryRegionSection section;
    525     hwaddr xlat, page_mask;
    526 
    527     /*
    528      * This can never be MMIO, and we don't really care about plen,
     529      * but only about the page mask.
    530      */
    531     section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
    532                                     NULL, &page_mask, is_write, false, &as,
    533                                     attrs);
    534 
    535     /* Illegal translation */
    536     if (section.mr == &io_mem_unassigned) {
    537         goto iotlb_fail;
    538     }
    539 
    540     /* Convert memory region offset into address space offset */
    541     xlat += section.offset_within_address_space -
    542         section.offset_within_region;
    543 
    544     return (IOMMUTLBEntry) {
    545         .target_as = as,
    546         .iova = addr & ~page_mask,
    547         .translated_addr = xlat & ~page_mask,
    548         .addr_mask = page_mask,
     549         /* IOTLBs are for DMA, and DMA is only allowed to RAM. */
    550         .perm = IOMMU_RW,
    551     };
    552 
    553 iotlb_fail:
    554     return (IOMMUTLBEntry) {0};
    555 }
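         /*
          * Example of the returned entry, assuming 4 KiB pages and no IOMMU on
          * the path (page_mask == 0xfff): addr 0x1234 yields .iova == 0x1000,
          * .addr_mask == 0xfff and a page-aligned .translated_addr.
          */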
    556 
    557 /* Called from RCU critical section */
    558 MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat,
    559                                  hwaddr *plen, bool is_write,
    560                                  MemTxAttrs attrs)
    561 {
    562     MemoryRegion *mr;
    563     MemoryRegionSection section;
    564     AddressSpace *as = NULL;
    565 
    566     /* This can be MMIO, so setup MMIO bit. */
    567     section = flatview_do_translate(fv, addr, xlat, plen, NULL,
    568                                     is_write, true, &as, attrs);
    569     mr = section.mr;
    570 
    571     if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
    572         hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
    573         *plen = MIN(page, *plen);
    574     }
    575 
    576     return mr;
    577 }
    578 
    579 typedef struct TCGIOMMUNotifier {
    580     IOMMUNotifier n;
    581     MemoryRegion *mr;
    582     CPUState *cpu;
    583     int iommu_idx;
    584     bool active;
    585 } TCGIOMMUNotifier;
    586 
    587 static void tcg_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
    588 {
    589     TCGIOMMUNotifier *notifier = container_of(n, TCGIOMMUNotifier, n);
    590 
    591     if (!notifier->active) {
    592         return;
    593     }
    594     tlb_flush(notifier->cpu);
    595     notifier->active = false;
    596     /* We leave the notifier struct on the list to avoid reallocating it later.
    597      * Generally the number of IOMMUs a CPU deals with will be small.
    598      * In any case we can't unregister the iommu notifier from a notify
    599      * callback.
    600      */
    601 }
    602 
    603 static void tcg_register_iommu_notifier(CPUState *cpu,
    604                                         IOMMUMemoryRegion *iommu_mr,
    605                                         int iommu_idx)
    606 {
    607     /* Make sure this CPU has an IOMMU notifier registered for this
    608      * IOMMU/IOMMU index combination, so that we can flush its TLB
    609      * when the IOMMU tells us the mappings we've cached have changed.
    610      */
    611     MemoryRegion *mr = MEMORY_REGION(iommu_mr);
    612     TCGIOMMUNotifier *notifier = NULL;
    613     int i;
    614 
    615     for (i = 0; i < cpu->iommu_notifiers->len; i++) {
    616         notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
    617         if (notifier->mr == mr && notifier->iommu_idx == iommu_idx) {
    618             break;
    619         }
    620     }
    621     if (i == cpu->iommu_notifiers->len) {
    622         /* Not found, add a new entry at the end of the array */
    623         cpu->iommu_notifiers = g_array_set_size(cpu->iommu_notifiers, i + 1);
    624         notifier = g_new0(TCGIOMMUNotifier, 1);
    625         g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i) = notifier;
    626 
    627         notifier->mr = mr;
    628         notifier->iommu_idx = iommu_idx;
    629         notifier->cpu = cpu;
    630         /* Rather than trying to register interest in the specific part
    631          * of the iommu's address space that we've accessed and then
    632          * expand it later as subsequent accesses touch more of it, we
    633          * just register interest in the whole thing, on the assumption
    634          * that iommu reconfiguration will be rare.
    635          */
    636         iommu_notifier_init(&notifier->n,
    637                             tcg_iommu_unmap_notify,
    638                             IOMMU_NOTIFIER_UNMAP,
    639                             0,
    640                             HWADDR_MAX,
    641                             iommu_idx);
    642         memory_region_register_iommu_notifier(notifier->mr, &notifier->n,
    643                                               &error_fatal);
    644     }
    645 
    646     if (!notifier->active) {
    647         notifier->active = true;
    648     }
    649 }
    650 
    651 void tcg_iommu_free_notifier_list(CPUState *cpu)
    652 {
    653     /* Destroy the CPU's notifier list */
    654     int i;
    655     TCGIOMMUNotifier *notifier;
    656 
    657     for (i = 0; i < cpu->iommu_notifiers->len; i++) {
    658         notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
    659         memory_region_unregister_iommu_notifier(notifier->mr, &notifier->n);
    660         g_free(notifier);
    661     }
    662     g_array_free(cpu->iommu_notifiers, true);
    663 }
    664 
    665 void tcg_iommu_init_notifier_list(CPUState *cpu)
    666 {
    667     cpu->iommu_notifiers = g_array_new(false, true, sizeof(TCGIOMMUNotifier *));
    668 }
    669 
    670 /* Called from RCU critical section */
    671 MemoryRegionSection *
    672 address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr orig_addr,
    673                                   hwaddr *xlat, hwaddr *plen,
    674                                   MemTxAttrs attrs, int *prot)
    675 {
    676     MemoryRegionSection *section;
    677     IOMMUMemoryRegion *iommu_mr;
    678     IOMMUMemoryRegionClass *imrc;
    679     IOMMUTLBEntry iotlb;
    680     int iommu_idx;
    681     hwaddr addr = orig_addr;
    682     AddressSpaceDispatch *d =
    683         qatomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
    684 
    685     for (;;) {
    686         section = address_space_translate_internal(d, addr, &addr, plen, false);
    687 
    688         iommu_mr = memory_region_get_iommu(section->mr);
    689         if (!iommu_mr) {
    690             break;
    691         }
    692 
    693         imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
    694 
    695         iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
    696         tcg_register_iommu_notifier(cpu, iommu_mr, iommu_idx);
    697         /* We need all the permissions, so pass IOMMU_NONE so the IOMMU
    698          * doesn't short-cut its translation table walk.
    699          */
    700         iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, iommu_idx);
    701         addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
    702                 | (addr & iotlb.addr_mask));
    703         /* Update the caller's prot bits to remove permissions the IOMMU
    704          * is giving us a failure response for. If we get down to no
    705          * permissions left at all we can give up now.
    706          */
    707         if (!(iotlb.perm & IOMMU_RO)) {
    708             *prot &= ~(PAGE_READ | PAGE_EXEC);
    709         }
    710         if (!(iotlb.perm & IOMMU_WO)) {
    711             *prot &= ~PAGE_WRITE;
    712         }
    713 
    714         if (!*prot) {
    715             goto translate_fail;
    716         }
    717 
    718         d = flatview_to_dispatch(address_space_to_flatview(iotlb.target_as));
    719     }
    720 
    721     assert(!memory_region_is_iommu(section->mr));
    722     *xlat = addr;
    723     return section;
    724 
    725 translate_fail:
    726     /*
    727      * We should be given a page-aligned address -- certainly
    728      * tlb_set_page_with_attrs() does so.  The page offset of xlat
    729      * is used to index sections[], and PHYS_SECTION_UNASSIGNED = 0.
    730      * The page portion of xlat will be logged by memory_region_access_valid()
    731      * when this memory access is rejected, so use the original untranslated
    732      * physical address.
    733      */
    734     assert((orig_addr & ~TARGET_PAGE_MASK) == 0);
    735     *xlat = orig_addr;
    736     return &d->map.sections[PHYS_SECTION_UNASSIGNED];
    737 }
    738 
    739 void cpu_address_space_init(CPUState *cpu, int asidx,
    740                             const char *prefix, MemoryRegion *mr)
    741 {
    742     CPUAddressSpace *newas;
    743     AddressSpace *as = g_new0(AddressSpace, 1);
    744     char *as_name;
    745 
    746     assert(mr);
    747     as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
    748     address_space_init(as, mr, as_name);
    749     g_free(as_name);
    750 
    751     /* Target code should have set num_ases before calling us */
    752     assert(asidx < cpu->num_ases);
    753 
    754     if (asidx == 0) {
    755         /* address space 0 gets the convenience alias */
    756         cpu->as = as;
    757     }
    758 
    759     /* KVM cannot currently support multiple address spaces. */
    760     assert(asidx == 0 || !kvm_enabled());
    761 
    762     if (!cpu->cpu_ases) {
    763         cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
    764     }
    765 
    766     newas = &cpu->cpu_ases[asidx];
    767     newas->cpu = cpu;
    768     newas->as = as;
    769     if (tcg_enabled()) {
    770         newas->tcg_as_listener.log_global_after_sync = tcg_log_global_after_sync;
    771         newas->tcg_as_listener.commit = tcg_commit;
    772         newas->tcg_as_listener.name = "tcg";
    773         memory_listener_register(&newas->tcg_as_listener, as);
    774     }
    775 }
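         /*
          * Illustrative call from a target's realize path (exact call sites and
          * address-space indexes vary per target; cpu->num_ases must be set first):
          *
          *     cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
          */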
    776 
    777 AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
    778 {
    779     /* Return the AddressSpace corresponding to the specified index */
    780     return cpu->cpu_ases[asidx].as;
    781 }
    782 
    783 /* Add a watchpoint.  */
    784 int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
    785                           int flags, CPUWatchpoint **watchpoint)
    786 {
    787     CPUWatchpoint *wp;
    788     vaddr in_page;
    789 
    790     /* forbid ranges which are empty or run off the end of the address space */
    791     if (len == 0 || (addr + len - 1) < addr) {
    792         error_report("tried to set invalid watchpoint at %"
    793                      VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
    794         return -EINVAL;
    795     }
    796     wp = g_malloc(sizeof(*wp));
    797 
    798     wp->vaddr = addr;
    799     wp->len = len;
    800     wp->flags = flags;
    801 
    802     /* keep all GDB-injected watchpoints in front */
    803     if (flags & BP_GDB) {
    804         QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
    805     } else {
    806         QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
    807     }
    808 
    809     in_page = -(addr | TARGET_PAGE_MASK);
    810     if (len <= in_page) {
    811         tlb_flush_page(cpu, addr);
    812     } else {
    813         tlb_flush(cpu);
    814     }
    815 
    816     if (watchpoint)
    817         *watchpoint = wp;
    818     return 0;
    819 }
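         /*
          * Illustrative use, e.g. from the gdbstub: watch 4 bytes for writes with
          *
          *     cpu_watchpoint_insert(cpu, addr, 4, BP_GDB | BP_MEM_WRITE, NULL);
          *
          * Pass a CPUWatchpoint ** instead of NULL to get the new entry back.
          */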
    820 
    821 /* Remove a specific watchpoint.  */
    822 int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
    823                           int flags)
    824 {
    825     CPUWatchpoint *wp;
    826 
    827     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
    828         if (addr == wp->vaddr && len == wp->len
    829                 && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
    830             cpu_watchpoint_remove_by_ref(cpu, wp);
    831             return 0;
    832         }
    833     }
    834     return -ENOENT;
    835 }
    836 
    837 /* Remove a specific watchpoint by reference.  */
    838 void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
    839 {
    840     QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
    841 
    842     tlb_flush_page(cpu, watchpoint->vaddr);
    843 
    844     g_free(watchpoint);
    845 }
    846 
    847 /* Remove all matching watchpoints.  */
    848 void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
    849 {
    850     CPUWatchpoint *wp, *next;
    851 
    852     QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
    853         if (wp->flags & mask) {
    854             cpu_watchpoint_remove_by_ref(cpu, wp);
    855         }
    856     }
    857 }
    858 
    859 #ifdef CONFIG_TCG
    860 /* Return true if this watchpoint address matches the specified
     861  * access (i.e. the address range covered by the watchpoint overlaps
    862  * partially or completely with the address range covered by the
    863  * access).
    864  */
    865 static inline bool watchpoint_address_matches(CPUWatchpoint *wp,
    866                                               vaddr addr, vaddr len)
    867 {
    868     /* We know the lengths are non-zero, but a little caution is
    869      * required to avoid errors in the case where the range ends
    870      * exactly at the top of the address space and so addr + len
    871      * wraps round to zero.
    872      */
    873     vaddr wpend = wp->vaddr + wp->len - 1;
    874     vaddr addrend = addr + len - 1;
    875 
    876     return !(addr > wpend || wp->vaddr > addrend);
    877 }
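         /*
          * Worked example: a 4-byte watchpoint at 0x1000 (wpend == 0x1003) matches
          * an 8-byte access at 0xffe (addrend == 0x1005), since neither range
          * starts past the end of the other.
          */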
    878 
     879 /* Return the combined flags of all watchpoints that overlap [addr, addr + len).  */
    880 int cpu_watchpoint_address_matches(CPUState *cpu, vaddr addr, vaddr len)
    881 {
    882     CPUWatchpoint *wp;
    883     int ret = 0;
    884 
    885     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
    886         if (watchpoint_address_matches(wp, addr, len)) {
    887             ret |= wp->flags;
    888         }
    889     }
    890     return ret;
    891 }
    892 
    893 /* Generate a debug exception if a watchpoint has been hit.  */
    894 void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
    895                           MemTxAttrs attrs, int flags, uintptr_t ra)
    896 {
    897     CPUClass *cc = CPU_GET_CLASS(cpu);
    898     CPUWatchpoint *wp;
    899 
    900     assert(tcg_enabled());
    901     if (cpu->watchpoint_hit) {
    902         /*
    903          * We re-entered the check after replacing the TB.
    904          * Now raise the debug interrupt so that it will
    905          * trigger after the current instruction.
    906          */
    907         qemu_mutex_lock_iothread();
    908         cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
    909         qemu_mutex_unlock_iothread();
    910         return;
    911     }
    912 
    913     if (cc->tcg_ops->adjust_watchpoint_address) {
    914         /* this is currently used only by ARM BE32 */
    915         addr = cc->tcg_ops->adjust_watchpoint_address(cpu, addr, len);
    916     }
    917     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
    918         if (watchpoint_address_matches(wp, addr, len)
    919             && (wp->flags & flags)) {
    920             if (replay_running_debug()) {
    921                 /*
    922                  * replay_breakpoint reads icount.
    923                  * Force recompile to succeed, because icount may
    924                  * be read only at the end of the block.
    925                  */
    926                 if (!cpu->can_do_io) {
    927                     /* Force execution of one insn next time.  */
    928                     cpu->cflags_next_tb = 1 | CF_LAST_IO | CF_NOIRQ | curr_cflags(cpu);
    929                     cpu_loop_exit_restore(cpu, ra);
    930                 }
    931                 /*
    932                  * Don't process the watchpoints when we are
    933                  * in a reverse debugging operation.
    934                  */
    935                 replay_breakpoint();
    936                 return;
    937             }
    938             if (flags == BP_MEM_READ) {
    939                 wp->flags |= BP_WATCHPOINT_HIT_READ;
    940             } else {
    941                 wp->flags |= BP_WATCHPOINT_HIT_WRITE;
    942             }
    943             wp->hitaddr = MAX(addr, wp->vaddr);
    944             wp->hitattrs = attrs;
    945 
    946             if (wp->flags & BP_CPU && cc->tcg_ops->debug_check_watchpoint &&
    947                 !cc->tcg_ops->debug_check_watchpoint(cpu, wp)) {
    948                 wp->flags &= ~BP_WATCHPOINT_HIT;
    949                 continue;
    950             }
    951             cpu->watchpoint_hit = wp;
    952 
    953             mmap_lock();
    954             /* This call also restores vCPU state */
    955             tb_check_watchpoint(cpu, ra);
    956             if (wp->flags & BP_STOP_BEFORE_ACCESS) {
    957                 cpu->exception_index = EXCP_DEBUG;
    958                 mmap_unlock();
    959                 cpu_loop_exit(cpu);
    960             } else {
    961                 /* Force execution of one insn next time.  */
    962                 cpu->cflags_next_tb = 1 | CF_LAST_IO | CF_NOIRQ | curr_cflags(cpu);
    963                 mmap_unlock();
    964                 cpu_loop_exit_noexc(cpu);
    965             }
    966         } else {
    967             wp->flags &= ~BP_WATCHPOINT_HIT;
    968         }
    969     }
    970 }
    971 
    972 #endif /* CONFIG_TCG */
    973 
    974 /* Called from RCU critical section */
    975 static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
    976 {
    977     RAMBlock *block;
    978 
    979     block = qatomic_rcu_read(&ram_list.mru_block);
    980     if (block && addr - block->offset < block->max_length) {
    981         return block;
    982     }
    983     RAMBLOCK_FOREACH(block) {
    984         if (addr - block->offset < block->max_length) {
    985             goto found;
    986         }
    987     }
    988 
    989     fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
    990     abort();
    991 
    992 found:
    993     /* It is safe to write mru_block outside the iothread lock.  This
    994      * is what happens:
    995      *
    996      *     mru_block = xxx
    997      *     rcu_read_unlock()
    998      *                                        xxx removed from list
    999      *                  rcu_read_lock()
   1000      *                  read mru_block
   1001      *                                        mru_block = NULL;
   1002      *                                        call_rcu(reclaim_ramblock, xxx);
   1003      *                  rcu_read_unlock()
   1004      *
   1005      * qatomic_rcu_set is not needed here.  The block was already published
   1006      * when it was placed into the list.  Here we're just making an extra
   1007      * copy of the pointer.
   1008      */
   1009     ram_list.mru_block = block;
   1010     return block;
   1011 }
   1012 
   1013 static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
   1014 {
   1015     CPUState *cpu;
   1016     ram_addr_t start1;
   1017     RAMBlock *block;
   1018     ram_addr_t end;
   1019 
   1020     assert(tcg_enabled());
   1021     end = TARGET_PAGE_ALIGN(start + length);
   1022     start &= TARGET_PAGE_MASK;
   1023 
   1024     RCU_READ_LOCK_GUARD();
   1025     block = qemu_get_ram_block(start);
   1026     assert(block == qemu_get_ram_block(end - 1));
   1027     start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
   1028     CPU_FOREACH(cpu) {
   1029         tlb_reset_dirty(cpu, start1, length);
   1030     }
   1031 }
   1032 
   1033 /* Note: start and end must be within the same ram block.  */
   1034 bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
   1035                                               ram_addr_t length,
   1036                                               unsigned client)
   1037 {
   1038     DirtyMemoryBlocks *blocks;
   1039     unsigned long end, page, start_page;
   1040     bool dirty = false;
   1041     RAMBlock *ramblock;
   1042     uint64_t mr_offset, mr_size;
   1043 
   1044     if (length == 0) {
   1045         return false;
   1046     }
   1047 
   1048     end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
   1049     start_page = start >> TARGET_PAGE_BITS;
   1050     page = start_page;
   1051 
   1052     WITH_RCU_READ_LOCK_GUARD() {
   1053         blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);
   1054         ramblock = qemu_get_ram_block(start);
   1055         /* Range sanity check on the ramblock */
   1056         assert(start >= ramblock->offset &&
   1057                start + length <= ramblock->offset + ramblock->used_length);
   1058 
   1059         while (page < end) {
   1060             unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
   1061             unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
   1062             unsigned long num = MIN(end - page,
   1063                                     DIRTY_MEMORY_BLOCK_SIZE - offset);
   1064 
   1065             dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
   1066                                                   offset, num);
   1067             page += num;
   1068         }
   1069 
   1070         mr_offset = (ram_addr_t)(start_page << TARGET_PAGE_BITS) - ramblock->offset;
   1071         mr_size = (end - start_page) << TARGET_PAGE_BITS;
   1072         memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size);
   1073     }
   1074 
   1075     if (dirty && tcg_enabled()) {
   1076         tlb_reset_dirty_range_all(start, length);
   1077     }
   1078 
   1079     return dirty;
   1080 }
   1081 
   1082 DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
   1083     (MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client)
   1084 {
   1085     DirtyMemoryBlocks *blocks;
   1086     ram_addr_t start = memory_region_get_ram_addr(mr) + offset;
   1087     unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
   1088     ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
   1089     ram_addr_t last  = QEMU_ALIGN_UP(start + length, align);
   1090     DirtyBitmapSnapshot *snap;
   1091     unsigned long page, end, dest;
   1092 
   1093     snap = g_malloc0(sizeof(*snap) +
   1094                      ((last - first) >> (TARGET_PAGE_BITS + 3)));
   1095     snap->start = first;
   1096     snap->end   = last;
   1097 
   1098     page = first >> TARGET_PAGE_BITS;
   1099     end  = last  >> TARGET_PAGE_BITS;
   1100     dest = 0;
   1101 
   1102     WITH_RCU_READ_LOCK_GUARD() {
   1103         blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);
   1104 
   1105         while (page < end) {
   1106             unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
   1107             unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
   1108             unsigned long num = MIN(end - page,
   1109                                     DIRTY_MEMORY_BLOCK_SIZE - offset);
   1110 
   1111             assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL)));
   1112             assert(QEMU_IS_ALIGNED(num,    (1 << BITS_PER_LEVEL)));
   1113             offset >>= BITS_PER_LEVEL;
   1114 
   1115             bitmap_copy_and_clear_atomic(snap->dirty + dest,
   1116                                          blocks->blocks[idx] + offset,
   1117                                          num);
   1118             page += num;
   1119             dest += num >> BITS_PER_LEVEL;
   1120         }
   1121     }
   1122 
   1123     if (tcg_enabled()) {
   1124         tlb_reset_dirty_range_all(start, length);
   1125     }
   1126 
   1127     memory_region_clear_dirty_bitmap(mr, offset, length);
   1128 
   1129     return snap;
   1130 }
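         /*
          * Sizing example for the snapshot above, assuming 4 KiB target pages:
          * an aligned 1 MiB range holds 256 pages, so the dirty[] bitmap needs
          * 0x100000 >> (12 + 3) == 32 bytes (one bit per page).
          */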
   1131 
   1132 bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
   1133                                             ram_addr_t start,
   1134                                             ram_addr_t length)
   1135 {
   1136     unsigned long page, end;
   1137 
   1138     assert(start >= snap->start);
   1139     assert(start + length <= snap->end);
   1140 
   1141     end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
   1142     page = (start - snap->start) >> TARGET_PAGE_BITS;
   1143 
   1144     while (page < end) {
   1145         if (test_bit(page, snap->dirty)) {
   1146             return true;
   1147         }
   1148         page++;
   1149     }
   1150     return false;
   1151 }
   1152 
   1153 /* Called from RCU critical section */
   1154 hwaddr memory_region_section_get_iotlb(CPUState *cpu,
   1155                                        MemoryRegionSection *section)
   1156 {
   1157     AddressSpaceDispatch *d = flatview_to_dispatch(section->fv);
   1158     return section - d->map.sections;
   1159 }
   1160 
   1161 static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
   1162                             uint16_t section);
   1163 static subpage_t *subpage_init(FlatView *fv, hwaddr base);
   1164 
   1165 static uint16_t phys_section_add(PhysPageMap *map,
   1166                                  MemoryRegionSection *section)
   1167 {
   1168     /* The physical section number is ORed with a page-aligned
   1169      * pointer to produce the iotlb entries.  Thus it should
   1170      * never overflow into the page-aligned value.
   1171      */
   1172     assert(map->sections_nb < TARGET_PAGE_SIZE);
   1173 
   1174     if (map->sections_nb == map->sections_nb_alloc) {
   1175         map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
   1176         map->sections = g_renew(MemoryRegionSection, map->sections,
   1177                                 map->sections_nb_alloc);
   1178     }
   1179     map->sections[map->sections_nb] = *section;
   1180     memory_region_ref(section->mr);
   1181     return map->sections_nb++;
   1182 }
   1183 
   1184 static void phys_section_destroy(MemoryRegion *mr)
   1185 {
   1186     bool have_sub_page = mr->subpage;
   1187 
   1188     memory_region_unref(mr);
   1189 
   1190     if (have_sub_page) {
   1191         subpage_t *subpage = container_of(mr, subpage_t, iomem);
   1192         object_unref(OBJECT(&subpage->iomem));
   1193         g_free(subpage);
   1194     }
   1195 }
   1196 
   1197 static void phys_sections_free(PhysPageMap *map)
   1198 {
   1199     while (map->sections_nb > 0) {
   1200         MemoryRegionSection *section = &map->sections[--map->sections_nb];
   1201         phys_section_destroy(section->mr);
   1202     }
   1203     g_free(map->sections);
   1204     g_free(map->nodes);
   1205 }
   1206 
   1207 static void register_subpage(FlatView *fv, MemoryRegionSection *section)
   1208 {
   1209     AddressSpaceDispatch *d = flatview_to_dispatch(fv);
   1210     subpage_t *subpage;
   1211     hwaddr base = section->offset_within_address_space
   1212         & TARGET_PAGE_MASK;
   1213     MemoryRegionSection *existing = phys_page_find(d, base);
   1214     MemoryRegionSection subsection = {
   1215         .offset_within_address_space = base,
   1216         .size = int128_make64(TARGET_PAGE_SIZE),
   1217     };
   1218     hwaddr start, end;
   1219 
   1220     assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);
   1221 
   1222     if (!(existing->mr->subpage)) {
   1223         subpage = subpage_init(fv, base);
   1224         subsection.fv = fv;
   1225         subsection.mr = &subpage->iomem;
   1226         phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
   1227                       phys_section_add(&d->map, &subsection));
   1228     } else {
   1229         subpage = container_of(existing->mr, subpage_t, iomem);
   1230     }
   1231     start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
   1232     end = start + int128_get64(section->size) - 1;
   1233     subpage_register(subpage, start, end,
   1234                      phys_section_add(&d->map, section));
   1235 }
   1236 
   1237 
   1238 static void register_multipage(FlatView *fv,
   1239                                MemoryRegionSection *section)
   1240 {
   1241     AddressSpaceDispatch *d = flatview_to_dispatch(fv);
   1242     hwaddr start_addr = section->offset_within_address_space;
   1243     uint16_t section_index = phys_section_add(&d->map, section);
   1244     uint64_t num_pages = int128_get64(int128_rshift(section->size,
   1245                                                     TARGET_PAGE_BITS));
   1246 
   1247     assert(num_pages);
   1248     phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
   1249 }
   1250 
   1251 /*
   1252  * The range in *section* may look like this:
   1253  *
   1254  *      |s|PPPPPPP|s|
   1255  *
   1256  * where s stands for subpage and P for page.
   1257  */
   1258 void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section)
   1259 {
   1260     MemoryRegionSection remain = *section;
   1261     Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
   1262 
   1263     /* register first subpage */
   1264     if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
   1265         uint64_t left = TARGET_PAGE_ALIGN(remain.offset_within_address_space)
   1266                         - remain.offset_within_address_space;
   1267 
   1268         MemoryRegionSection now = remain;
   1269         now.size = int128_min(int128_make64(left), now.size);
   1270         register_subpage(fv, &now);
   1271         if (int128_eq(remain.size, now.size)) {
   1272             return;
   1273         }
   1274         remain.size = int128_sub(remain.size, now.size);
   1275         remain.offset_within_address_space += int128_get64(now.size);
   1276         remain.offset_within_region += int128_get64(now.size);
   1277     }
   1278 
   1279     /* register whole pages */
   1280     if (int128_ge(remain.size, page_size)) {
   1281         MemoryRegionSection now = remain;
   1282         now.size = int128_and(now.size, int128_neg(page_size));
   1283         register_multipage(fv, &now);
   1284         if (int128_eq(remain.size, now.size)) {
   1285             return;
   1286         }
   1287         remain.size = int128_sub(remain.size, now.size);
   1288         remain.offset_within_address_space += int128_get64(now.size);
   1289         remain.offset_within_region += int128_get64(now.size);
   1290     }
   1291 
   1292     /* register last subpage */
   1293     register_subpage(fv, &remain);
   1294 }
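         /*
          * Worked example, assuming 4 KiB target pages: a section covering
          * 0x1800..0x47ff is split into a head subpage (0x1800..0x1fff), whole
          * pages handled by register_multipage() (0x2000..0x3fff) and a tail
          * subpage (0x4000..0x47ff).
          */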
   1295 
   1296 void qemu_flush_coalesced_mmio_buffer(void)
   1297 {
   1298     if (kvm_enabled())
   1299         kvm_flush_coalesced_mmio_buffer();
   1300 }
   1301 
   1302 void qemu_mutex_lock_ramlist(void)
   1303 {
   1304     qemu_mutex_lock(&ram_list.mutex);
   1305 }
   1306 
   1307 void qemu_mutex_unlock_ramlist(void)
   1308 {
   1309     qemu_mutex_unlock(&ram_list.mutex);
   1310 }
   1311 
   1312 GString *ram_block_format(void)
   1313 {
   1314     RAMBlock *block;
   1315     char *psize;
   1316     GString *buf = g_string_new("");
   1317 
   1318     RCU_READ_LOCK_GUARD();
   1319     g_string_append_printf(buf, "%24s %8s  %18s %18s %18s\n",
   1320                            "Block Name", "PSize", "Offset", "Used", "Total");
   1321     RAMBLOCK_FOREACH(block) {
   1322         psize = size_to_str(block->page_size);
   1323         g_string_append_printf(buf, "%24s %8s  0x%016" PRIx64 " 0x%016" PRIx64
   1324                                " 0x%016" PRIx64 "\n", block->idstr, psize,
   1325                                (uint64_t)block->offset,
   1326                                (uint64_t)block->used_length,
   1327                                (uint64_t)block->max_length);
   1328         g_free(psize);
   1329     }
   1330 
   1331     return buf;
   1332 }
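         /*
          * Illustrative row of the table built above (values hypothetical), as
          * shown by the HMP "info ramblock" command:
          *
          *     pc.ram  4 KiB  0x0000000000000000 0x0000000080000000 0x0000000080000000
          */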
   1333 
   1334 static int find_min_backend_pagesize(Object *obj, void *opaque)
   1335 {
   1336     long *hpsize_min = opaque;
   1337 
   1338     if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
   1339         HostMemoryBackend *backend = MEMORY_BACKEND(obj);
   1340         long hpsize = host_memory_backend_pagesize(backend);
   1341 
   1342         if (host_memory_backend_is_mapped(backend) && (hpsize < *hpsize_min)) {
   1343             *hpsize_min = hpsize;
   1344         }
   1345     }
   1346 
   1347     return 0;
   1348 }
   1349 
   1350 static int find_max_backend_pagesize(Object *obj, void *opaque)
   1351 {
   1352     long *hpsize_max = opaque;
   1353 
   1354     if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
   1355         HostMemoryBackend *backend = MEMORY_BACKEND(obj);
   1356         long hpsize = host_memory_backend_pagesize(backend);
   1357 
   1358         if (host_memory_backend_is_mapped(backend) && (hpsize > *hpsize_max)) {
   1359             *hpsize_max = hpsize;
   1360         }
   1361     }
   1362 
   1363     return 0;
   1364 }
   1365 
   1366 /*
   1367  * TODO: We assume right now that all mapped host memory backends are
   1368  * used as RAM, however some might be used for different purposes.
   1369  */
   1370 long qemu_minrampagesize(void)
   1371 {
   1372     long hpsize = LONG_MAX;
   1373     Object *memdev_root = object_resolve_path("/objects", NULL);
   1374 
   1375     object_child_foreach(memdev_root, find_min_backend_pagesize, &hpsize);
   1376     return hpsize;
   1377 }
   1378 
   1379 long qemu_maxrampagesize(void)
   1380 {
   1381     long pagesize = 0;
   1382     Object *memdev_root = object_resolve_path("/objects", NULL);
   1383 
   1384     object_child_foreach(memdev_root, find_max_backend_pagesize, &pagesize);
   1385     return pagesize;
   1386 }
   1387 
   1388 #ifdef CONFIG_POSIX
   1389 static int64_t get_file_size(int fd)
   1390 {
   1391     int64_t size;
   1392 #if defined(__linux__)
   1393     struct stat st;
   1394 
   1395     if (fstat(fd, &st) < 0) {
   1396         return -errno;
   1397     }
   1398 
   1399     /* Special handling for devdax character devices */
   1400     if (S_ISCHR(st.st_mode)) {
   1401         g_autofree char *subsystem_path = NULL;
   1402         g_autofree char *subsystem = NULL;
   1403 
   1404         subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
   1405                                          major(st.st_rdev), minor(st.st_rdev));
   1406         subsystem = g_file_read_link(subsystem_path, NULL);
   1407 
   1408         if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
   1409             g_autofree char *size_path = NULL;
   1410             g_autofree char *size_str = NULL;
   1411 
   1412             size_path = g_strdup_printf("/sys/dev/char/%d:%d/size",
   1413                                     major(st.st_rdev), minor(st.st_rdev));
   1414 
   1415             if (g_file_get_contents(size_path, &size_str, NULL, NULL)) {
   1416                 return g_ascii_strtoll(size_str, NULL, 0);
   1417             }
   1418         }
   1419     }
   1420 #endif /* defined(__linux__) */
   1421 
   1422     /* st.st_size may be zero for special files yet lseek(2) works */
   1423     size = lseek(fd, 0, SEEK_END);
   1424     if (size < 0) {
   1425         return -errno;
   1426     }
   1427     return size;
   1428 }
   1429 
   1430 static int64_t get_file_align(int fd)
   1431 {
   1432     int64_t align = -1;
   1433 #if defined(__linux__) && defined(CONFIG_LIBDAXCTL)
   1434     struct stat st;
   1435 
   1436     if (fstat(fd, &st) < 0) {
   1437         return -errno;
   1438     }
   1439 
   1440     /* Special handling for devdax character devices */
   1441     if (S_ISCHR(st.st_mode)) {
   1442         g_autofree char *path = NULL;
   1443         g_autofree char *rpath = NULL;
   1444         struct daxctl_ctx *ctx;
   1445         struct daxctl_region *region;
   1446         int rc = 0;
   1447 
   1448         path = g_strdup_printf("/sys/dev/char/%d:%d",
   1449                     major(st.st_rdev), minor(st.st_rdev));
   1450         rpath = realpath(path, NULL);
   1451         if (!rpath) {
   1452             return -errno;
   1453         }
   1454 
   1455         rc = daxctl_new(&ctx);
   1456         if (rc) {
   1457             return -1;
   1458         }
   1459 
   1460         daxctl_region_foreach(ctx, region) {
   1461             if (strstr(rpath, daxctl_region_get_path(region))) {
   1462                 align = daxctl_region_get_align(region);
   1463                 break;
   1464             }
   1465         }
   1466         daxctl_unref(ctx);
   1467     }
   1468 #endif /* defined(__linux__) && defined(CONFIG_LIBDAXCTL) */
   1469 
   1470     return align;
   1471 }
   1472 
   1473 static int file_ram_open(const char *path,
   1474                          const char *region_name,
   1475                          bool readonly,
   1476                          bool *created,
   1477                          Error **errp)
   1478 {
   1479     char *filename;
   1480     char *sanitized_name;
   1481     char *c;
   1482     int fd = -1;
   1483 
   1484     *created = false;
   1485     for (;;) {
   1486         fd = open(path, readonly ? O_RDONLY : O_RDWR);
   1487         if (fd >= 0) {
   1488             /* @path names an existing file, use it */
   1489             break;
   1490         }
   1491         if (errno == ENOENT) {
   1492             /* @path names a file that doesn't exist, create it */
   1493             fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
   1494             if (fd >= 0) {
   1495                 *created = true;
   1496                 break;
   1497             }
   1498         } else if (errno == EISDIR) {
   1499             /* @path names a directory, create a file there */
   1500             /* Make name safe to use with mkstemp by replacing '/' with '_'. */
   1501             sanitized_name = g_strdup(region_name);
   1502             for (c = sanitized_name; *c != '\0'; c++) {
   1503                 if (*c == '/') {
   1504                     *c = '_';
   1505                 }
   1506             }
   1507 
   1508             filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
   1509                                        sanitized_name);
   1510             g_free(sanitized_name);
   1511 
   1512             fd = mkstemp(filename);
   1513             if (fd >= 0) {
   1514                 unlink(filename);
   1515                 g_free(filename);
   1516                 break;
   1517             }
   1518             g_free(filename);
   1519         }
   1520         if (errno != EEXIST && errno != EINTR) {
   1521             error_setg_errno(errp, errno,
   1522                              "can't open backing store %s for guest RAM",
   1523                              path);
   1524             return -1;
   1525         }
   1526         /*
   1527          * Try again on EINTR and EEXIST.  The latter happens when
   1528          * something else creates the file between our two open().
   1529          */
   1530     }
   1531 
   1532     return fd;
   1533 }
   1534 
   1535 static void *file_ram_alloc(RAMBlock *block,
   1536                             ram_addr_t memory,
   1537                             int fd,
   1538                             bool readonly,
   1539                             bool truncate,
   1540                             off_t offset,
   1541                             Error **errp)
   1542 {
   1543     uint32_t qemu_map_flags;
   1544     void *area;
   1545 
   1546     block->page_size = qemu_fd_getpagesize(fd);
   1547     if (block->mr->align % block->page_size) {
   1548         error_setg(errp, "alignment 0x%" PRIx64
   1549                    " must be a multiple of page size 0x%zx",
   1550                    block->mr->align, block->page_size);
   1551         return NULL;
   1552     } else if (block->mr->align && !is_power_of_2(block->mr->align)) {
   1553         error_setg(errp, "alignment 0x%" PRIx64
   1554                    " must be a power of two", block->mr->align);
   1555         return NULL;
   1556     }
   1557     block->mr->align = MAX(block->page_size, block->mr->align);
   1558 #if defined(__s390x__)
   1559     if (kvm_enabled()) {
   1560         block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
   1561     }
   1562 #endif
   1563 
   1564     if (memory < block->page_size) {
   1565         error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
   1566                    "or larger than page size 0x%zx",
   1567                    memory, block->page_size);
   1568         return NULL;
   1569     }
   1570 
   1571     memory = ROUND_UP(memory, block->page_size);
   1572 
   1573     /*
   1574      * ftruncate is not supported by hugetlbfs in older
   1575      * hosts, so don't bother bailing out on errors.
   1576      * If anything goes wrong with it under other filesystems,
   1577      * mmap will fail.
   1578      *
   1579      * Do not truncate the non-empty backend file to avoid corrupting
   1580      * the existing data in the file. Disabling shrinking is not
   1581      * enough. For example, the current vNVDIMM implementation stores
   1582      * the guest NVDIMM labels at the end of the backend file. If the
   1583      * backend file is later extended, QEMU will not be able to find
   1584      * those labels. Therefore, extending the non-empty backend file
   1585      * is disabled as well.
   1586      */
   1587     if (truncate && ftruncate(fd, memory)) {
   1588         perror("ftruncate");
   1589     }
   1590 
   1591     qemu_map_flags = readonly ? QEMU_MAP_READONLY : 0;
   1592     qemu_map_flags |= (block->flags & RAM_SHARED) ? QEMU_MAP_SHARED : 0;
   1593     qemu_map_flags |= (block->flags & RAM_PMEM) ? QEMU_MAP_SYNC : 0;
   1594     qemu_map_flags |= (block->flags & RAM_NORESERVE) ? QEMU_MAP_NORESERVE : 0;
   1595     area = qemu_ram_mmap(fd, memory, block->mr->align, qemu_map_flags, offset);
   1596     if (area == MAP_FAILED) {
   1597         error_setg_errno(errp, errno,
   1598                          "unable to map backing store for guest RAM");
   1599         return NULL;
   1600     }
   1601 
   1602     block->fd = fd;
   1603     return area;
   1604 }
   1605 #endif
   1606 
   1607 /* Allocate space within the ram_addr_t space that governs the
   1608  * dirty bitmaps.
   1609  * Called with the ramlist lock held.
   1610  */
   1611 static ram_addr_t find_ram_offset(ram_addr_t size)
   1612 {
   1613     RAMBlock *block, *next_block;
   1614     ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
   1615 
   1616     assert(size != 0); /* it would hand out same offset multiple times */
   1617 
   1618     if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
   1619         return 0;
   1620     }
   1621 
   1622     RAMBLOCK_FOREACH(block) {
   1623         ram_addr_t candidate, next = RAM_ADDR_MAX;
   1624 
   1625         /* Align blocks to start on a 'long' in the bitmap
   1626          * which makes the bitmap sync'ing take the fast path.
   1627          */
   1628         candidate = block->offset + block->max_length;
   1629         candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS);
   1630 
   1631         /* Search for the closest following block
   1632          * and find the gap.
   1633          */
   1634         RAMBLOCK_FOREACH(next_block) {
   1635             if (next_block->offset >= candidate) {
   1636                 next = MIN(next, next_block->offset);
   1637             }
   1638         }
   1639 
   1640         /* If it fits, remember our place and the size of the gap,
   1641          * but keep going so that we might find a smaller gap to
   1642          * fill, thus avoiding fragmentation.
   1643          */
   1644         if (next - candidate >= size && next - candidate < mingap) {
   1645             offset = candidate;
   1646             mingap = next - candidate;
   1647         }
   1648 
   1649         trace_find_ram_offset_loop(size, candidate, offset, next, mingap);
   1650     }
   1651 
   1652     if (offset == RAM_ADDR_MAX) {
   1653         fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
   1654                 (uint64_t)size);
   1655         abort();
   1656     }
   1657 
   1658     trace_find_ram_offset(size, offset);
   1659 
   1660     return offset;
   1661 }
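
        /*
         * Worked example (illustrative only, assuming 64-bit longs and a
         * 4 KiB target page): candidates are rounded up to
         * BITS_PER_LONG << TARGET_PAGE_BITS = 64 * 4 KiB = 256 KiB, so a
         * block ending at offset 0x41000 produces a candidate of 0x80000.
         */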
   1662 
   1663 static unsigned long last_ram_page(void)
   1664 {
   1665     RAMBlock *block;
   1666     ram_addr_t last = 0;
   1667 
   1668     RCU_READ_LOCK_GUARD();
   1669     RAMBLOCK_FOREACH(block) {
   1670         last = MAX(last, block->offset + block->max_length);
   1671     }
   1672     return last >> TARGET_PAGE_BITS;
   1673 }
   1674 
   1675 static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
   1676 {
   1677     int ret;
   1678 
   1679     /* Use MADV_DONTDUMP if the user doesn't want the guest memory in the core dump */
   1680     if (!machine_dump_guest_core(current_machine)) {
   1681         ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
   1682         if (ret) {
   1683             perror("qemu_madvise");
   1684             fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
   1685                             "but dump_guest_core=off specified\n");
   1686         }
   1687     }
   1688 }
   1689 
   1690 const char *qemu_ram_get_idstr(RAMBlock *rb)
   1691 {
   1692     return rb->idstr;
   1693 }
   1694 
   1695 void *qemu_ram_get_host_addr(RAMBlock *rb)
   1696 {
   1697     return rb->host;
   1698 }
   1699 
   1700 ram_addr_t qemu_ram_get_offset(RAMBlock *rb)
   1701 {
   1702     return rb->offset;
   1703 }
   1704 
   1705 ram_addr_t qemu_ram_get_used_length(RAMBlock *rb)
   1706 {
   1707     return rb->used_length;
   1708 }
   1709 
   1710 ram_addr_t qemu_ram_get_max_length(RAMBlock *rb)
   1711 {
   1712     return rb->max_length;
   1713 }
   1714 
   1715 bool qemu_ram_is_shared(RAMBlock *rb)
   1716 {
   1717     return rb->flags & RAM_SHARED;
   1718 }
   1719 
   1720 bool qemu_ram_is_noreserve(RAMBlock *rb)
   1721 {
   1722     return rb->flags & RAM_NORESERVE;
   1723 }
   1724 
   1725 /* Note: Only set at the start of postcopy */
   1726 bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
   1727 {
   1728     return rb->flags & RAM_UF_ZEROPAGE;
   1729 }
   1730 
   1731 void qemu_ram_set_uf_zeroable(RAMBlock *rb)
   1732 {
   1733     rb->flags |= RAM_UF_ZEROPAGE;
   1734 }
   1735 
   1736 bool qemu_ram_is_migratable(RAMBlock *rb)
   1737 {
   1738     return rb->flags & RAM_MIGRATABLE;
   1739 }
   1740 
   1741 void qemu_ram_set_migratable(RAMBlock *rb)
   1742 {
   1743     rb->flags |= RAM_MIGRATABLE;
   1744 }
   1745 
   1746 void qemu_ram_unset_migratable(RAMBlock *rb)
   1747 {
   1748     rb->flags &= ~RAM_MIGRATABLE;
   1749 }
   1750 
   1751 int qemu_ram_get_fd(RAMBlock *rb)
   1752 {
   1753     return rb->fd;
   1754 }
   1755 
   1756 /* Called with iothread lock held.  */
   1757 void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
   1758 {
   1759     RAMBlock *block;
   1760 
   1761     assert(new_block);
   1762     assert(!new_block->idstr[0]);
   1763 
   1764     if (dev) {
   1765         char *id = qdev_get_dev_path(dev);
   1766         if (id) {
   1767             snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
   1768             g_free(id);
   1769         }
   1770     }
   1771     pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
   1772 
   1773     RCU_READ_LOCK_GUARD();
   1774     RAMBLOCK_FOREACH(block) {
   1775         if (block != new_block &&
   1776             !strcmp(block->idstr, new_block->idstr)) {
   1777             fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
   1778                     new_block->idstr);
   1779             abort();
   1780         }
   1781     }
   1782 }
   1783 
   1784 /* Called with iothread lock held.  */
   1785 void qemu_ram_unset_idstr(RAMBlock *block)
   1786 {
   1787     /* FIXME: arch_init.c assumes that this is not called during
   1788      * migration.  Ignore the problem since hot-unplug during migration
   1789      * does not work anyway.
   1790      */
   1791     if (block) {
   1792         memset(block->idstr, 0, sizeof(block->idstr));
   1793     }
   1794 }
   1795 
   1796 size_t qemu_ram_pagesize(RAMBlock *rb)
   1797 {
   1798     return rb->page_size;
   1799 }
   1800 
   1801 /* Returns the largest page size in use */
   1802 size_t qemu_ram_pagesize_largest(void)
   1803 {
   1804     RAMBlock *block;
   1805     size_t largest = 0;
   1806 
   1807     RAMBLOCK_FOREACH(block) {
   1808         largest = MAX(largest, qemu_ram_pagesize(block));
   1809     }
   1810 
   1811     return largest;
   1812 }
   1813 
   1814 static int memory_try_enable_merging(void *addr, size_t len)
   1815 {
   1816     if (!machine_mem_merge(current_machine)) {
   1817         /* disabled by the user */
   1818         return 0;
   1819     }
   1820 
   1821     return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
   1822 }
   1823 
   1824 /*
   1825  * Resizing RAM while migrating can result in the migration being canceled.
   1826  * Care has to be taken if the guest might have already detected the memory.
   1827  *
   1828  * As the memory core doesn't know how the memory is accessed, it is up
   1829  * to the resize callback to update device state and/or add assertions
   1830  * to detect misuse, if necessary.
   1831  */
   1832 int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
   1833 {
   1834     const ram_addr_t oldsize = block->used_length;
   1835     const ram_addr_t unaligned_size = newsize;
   1836 
   1837     assert(block);
   1838 
   1839     newsize = HOST_PAGE_ALIGN(newsize);
   1840 
   1841     if (block->used_length == newsize) {
   1842         /*
   1843          * We don't have to resize the ram block (which only knows aligned
   1844          * sizes), however, we have to notify if the unaligned size changed.
   1845          */
   1846         if (unaligned_size != memory_region_size(block->mr)) {
   1847             memory_region_set_size(block->mr, unaligned_size);
   1848             if (block->resized) {
   1849                 block->resized(block->idstr, unaligned_size, block->host);
   1850             }
   1851         }
   1852         return 0;
   1853     }
   1854 
   1855     if (!(block->flags & RAM_RESIZEABLE)) {
   1856         error_setg_errno(errp, EINVAL,
   1857                          "Size mismatch: %s: 0x" RAM_ADDR_FMT
   1858                          " != 0x" RAM_ADDR_FMT, block->idstr,
   1859                          newsize, block->used_length);
   1860         return -EINVAL;
   1861     }
   1862 
   1863     if (block->max_length < newsize) {
   1864         error_setg_errno(errp, EINVAL,
   1865                          "Size too large: %s: 0x" RAM_ADDR_FMT
   1866                          " > 0x" RAM_ADDR_FMT, block->idstr,
   1867                          newsize, block->max_length);
   1868         return -EINVAL;
   1869     }
   1870 
   1871     /* Notify before modifying the ram block and touching the bitmaps. */
   1872     if (block->host) {
   1873         ram_block_notify_resize(block->host, oldsize, newsize);
   1874     }
   1875 
   1876     cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
   1877     block->used_length = newsize;
   1878     cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
   1879                                         DIRTY_CLIENTS_ALL);
   1880     memory_region_set_size(block->mr, unaligned_size);
   1881     if (block->resized) {
   1882         block->resized(block->idstr, unaligned_size, block->host);
   1883     }
   1884     return 0;
   1885 }
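
        /*
         * Illustrative sketch (not part of this file): a caller growing a
         * resizeable block might look like the following; "rb" and
         * "new_size" are made-up names for the example.
         *
         *     Error *err = NULL;
         *     if (qemu_ram_resize(rb, new_size, &err) < 0) {
         *         error_report_err(err);
         *     }
         */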
   1886 
   1887 /*
   1888  * Trigger sync on the given ram block for range [start, start + length]
   1889  * with the backing store if one is available.
   1890  * Otherwise this is a no-op.
   1891  * Note: this is supposed to be a synchronous operation.
   1892  */
   1893 void qemu_ram_msync(RAMBlock *block, ram_addr_t start, ram_addr_t length)
   1894 {
   1895     /* The requested range should fit within the block range */
   1896     g_assert((start + length) <= block->used_length);
   1897 
   1898 #ifdef CONFIG_LIBPMEM
   1899     /* The lack of support for pmem should not block the sync */
   1900     if (ramblock_is_pmem(block)) {
   1901         void *addr = ramblock_ptr(block, start);
   1902         pmem_persist(addr, length);
   1903         return;
   1904     }
   1905 #endif
   1906     if (block->fd >= 0) {
   1907         /**
   1908          * In case there is no support for PMEM, or the memory has not
   1909          * been specified as persistent (or is not persistent memory),
   1910          * use msync.  Less optimal, but it still achieves the same goal.
   1911          */
   1912         void *addr = ramblock_ptr(block, start);
   1913         if (qemu_msync(addr, length, block->fd)) {
   1914             warn_report("%s: failed to sync memory range: start: "
   1915                     RAM_ADDR_FMT " length: " RAM_ADDR_FMT,
   1916                     __func__, start, length);
   1917         }
   1918     }
   1919 }
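
        /*
         * Illustrative sketch (not part of this file): flushing the first
         * page of a file-backed block to its backing store; "rb" is a
         * made-up RAMBlock pointer.
         *
         *     qemu_ram_msync(rb, 0, qemu_ram_pagesize(rb));
         */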
   1920 
   1921 /* Called with ram_list.mutex held */
   1922 static void dirty_memory_extend(ram_addr_t old_ram_size,
   1923                                 ram_addr_t new_ram_size)
   1924 {
   1925     ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
   1926                                              DIRTY_MEMORY_BLOCK_SIZE);
   1927     ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
   1928                                              DIRTY_MEMORY_BLOCK_SIZE);
   1929     int i;
   1930 
   1931     /* Only need to extend if block count increased */
   1932     if (new_num_blocks <= old_num_blocks) {
   1933         return;
   1934     }
   1935 
   1936     for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
   1937         DirtyMemoryBlocks *old_blocks;
   1938         DirtyMemoryBlocks *new_blocks;
   1939         int j;
   1940 
   1941         old_blocks = qatomic_rcu_read(&ram_list.dirty_memory[i]);
   1942         new_blocks = g_malloc(sizeof(*new_blocks) +
   1943                               sizeof(new_blocks->blocks[0]) * new_num_blocks);
   1944 
   1945         if (old_num_blocks) {
   1946             memcpy(new_blocks->blocks, old_blocks->blocks,
   1947                    old_num_blocks * sizeof(old_blocks->blocks[0]));
   1948         }
   1949 
   1950         for (j = old_num_blocks; j < new_num_blocks; j++) {
   1951             new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
   1952         }
   1953 
   1954         qatomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
   1955 
   1956         if (old_blocks) {
   1957             g_free_rcu(old_blocks, rcu);
   1958         }
   1959     }
   1960 }
   1961 
   1962 static void ram_block_add(RAMBlock *new_block, Error **errp)
   1963 {
   1964     const bool noreserve = qemu_ram_is_noreserve(new_block);
   1965     const bool shared = qemu_ram_is_shared(new_block);
   1966     RAMBlock *block;
   1967     RAMBlock *last_block = NULL;
   1968     ram_addr_t old_ram_size, new_ram_size;
   1969     Error *err = NULL;
   1970 
   1971     old_ram_size = last_ram_page();
   1972 
   1973     qemu_mutex_lock_ramlist();
   1974     new_block->offset = find_ram_offset(new_block->max_length);
   1975 
   1976     if (!new_block->host) {
   1977         if (xen_enabled()) {
   1978             xen_ram_alloc(new_block->offset, new_block->max_length,
   1979                           new_block->mr, &err);
   1980             if (err) {
   1981                 error_propagate(errp, err);
   1982                 qemu_mutex_unlock_ramlist();
   1983                 return;
   1984             }
   1985         } else {
   1986             new_block->host = qemu_anon_ram_alloc(new_block->max_length,
   1987                                                   &new_block->mr->align,
   1988                                                   shared, noreserve);
   1989             if (!new_block->host) {
   1990                 error_setg_errno(errp, errno,
   1991                                  "cannot set up guest memory '%s'",
   1992                                  memory_region_name(new_block->mr));
   1993                 qemu_mutex_unlock_ramlist();
   1994                 return;
   1995             }
   1996             memory_try_enable_merging(new_block->host, new_block->max_length);
   1997         }
   1998     }
   1999 
   2000     new_ram_size = MAX(old_ram_size,
   2001               (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
   2002     if (new_ram_size > old_ram_size) {
   2003         dirty_memory_extend(old_ram_size, new_ram_size);
   2004     }
   2005     /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
   2006      * QLIST (which has an RCU-friendly variant) does not have insertion at
   2007      * tail, so save the last element in last_block.
   2008      */
   2009     RAMBLOCK_FOREACH(block) {
   2010         last_block = block;
   2011         if (block->max_length < new_block->max_length) {
   2012             break;
   2013         }
   2014     }
   2015     if (block) {
   2016         QLIST_INSERT_BEFORE_RCU(block, new_block, next);
   2017     } else if (last_block) {
   2018         QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
   2019     } else { /* list is empty */
   2020         QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
   2021     }
   2022     ram_list.mru_block = NULL;
   2023 
   2024     /* Write list before version */
   2025     smp_wmb();
   2026     ram_list.version++;
   2027     qemu_mutex_unlock_ramlist();
   2028 
   2029     cpu_physical_memory_set_dirty_range(new_block->offset,
   2030                                         new_block->used_length,
   2031                                         DIRTY_CLIENTS_ALL);
   2032 
   2033     if (new_block->host) {
   2034         qemu_ram_setup_dump(new_block->host, new_block->max_length);
   2035         qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
   2036         /*
   2037          * MADV_DONTFORK is also needed by KVM in the absence of a synchronous
   2038          * MMU.  Configure it unless the machine is a qtest server, in which
   2039          * case KVM is not used and it may be forked (e.g. for fuzzing purposes).
   2040          */
   2041         if (!qtest_enabled()) {
   2042             qemu_madvise(new_block->host, new_block->max_length,
   2043                          QEMU_MADV_DONTFORK);
   2044         }
   2045         ram_block_notify_add(new_block->host, new_block->used_length,
   2046                              new_block->max_length);
   2047     }
   2048 }
   2049 
   2050 #ifdef CONFIG_POSIX
   2051 RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
   2052                                  uint32_t ram_flags, int fd, off_t offset,
   2053                                  bool readonly, Error **errp)
   2054 {
   2055     RAMBlock *new_block;
   2056     Error *local_err = NULL;
   2057     int64_t file_size, file_align;
   2058 
   2059     /* Only these ram flags are supported for now. */
   2060     assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE |
   2061                           RAM_PROTECTED)) == 0);
   2062 
   2063     if (xen_enabled()) {
   2064         error_setg(errp, "-mem-path not supported with Xen");
   2065         return NULL;
   2066     }
   2067 
   2068     if (kvm_enabled() && !kvm_has_sync_mmu()) {
   2069         error_setg(errp,
   2070                    "host lacks kvm mmu notifiers, -mem-path unsupported");
   2071         return NULL;
   2072     }
   2073 
   2074     size = HOST_PAGE_ALIGN(size);
   2075     file_size = get_file_size(fd);
   2076     if (file_size > 0 && file_size < size) {
   2077         error_setg(errp, "backing store size 0x%" PRIx64
   2078                    " is smaller than 'size' option 0x" RAM_ADDR_FMT,
   2079                    file_size, size);
   2080         return NULL;
   2081     }
   2082 
   2083     file_align = get_file_align(fd);
   2084     if (file_align > 0 && file_align > mr->align) {
   2085         error_setg(errp, "backing store align 0x%" PRIx64
   2086                    " is larger than 'align' option 0x%" PRIx64,
   2087                    file_align, mr->align);
   2088         return NULL;
   2089     }
   2090 
   2091     new_block = g_malloc0(sizeof(*new_block));
   2092     new_block->mr = mr;
   2093     new_block->used_length = size;
   2094     new_block->max_length = size;
   2095     new_block->flags = ram_flags;
   2096     new_block->host = file_ram_alloc(new_block, size, fd, readonly,
   2097                                      !file_size, offset, errp);
   2098     if (!new_block->host) {
   2099         g_free(new_block);
   2100         return NULL;
   2101     }
   2102 
   2103     ram_block_add(new_block, &local_err);
   2104     if (local_err) {
   2105         g_free(new_block);
   2106         error_propagate(errp, local_err);
   2107         return NULL;
   2108     }
   2109     return new_block;
   2110 
   2111 }
   2112 
   2113 
   2114 RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
   2115                                    uint32_t ram_flags, const char *mem_path,
   2116                                    bool readonly, Error **errp)
   2117 {
   2118     int fd;
   2119     bool created;
   2120     RAMBlock *block;
   2121 
   2122     fd = file_ram_open(mem_path, memory_region_name(mr), readonly, &created,
   2123                        errp);
   2124     if (fd < 0) {
   2125         return NULL;
   2126     }
   2127 
   2128     block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, 0, readonly, errp);
   2129     if (!block) {
   2130         if (created) {
   2131             unlink(mem_path);
   2132         }
   2133         close(fd);
   2134         return NULL;
   2135     }
   2136 
   2137     return block;
   2138 }
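
        /*
         * Illustrative sketch (not part of this file): backing a memory
         * region with a file on a hugetlbfs mount.  The path is made up,
         * "mr" is assumed to be an initialized MemoryRegion, and GiB comes
         * from "qemu/units.h".
         *
         *     Error *err = NULL;
         *     RAMBlock *rb = qemu_ram_alloc_from_file(1 * GiB, mr, RAM_SHARED,
         *                                             "/dev/hugepages/guest",
         *                                             false, &err);
         *     if (!rb) {
         *         error_report_err(err);
         *     }
         */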
   2139 #endif
   2140 
   2141 static
   2142 RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
   2143                                   void (*resized)(const char*,
   2144                                                   uint64_t length,
   2145                                                   void *host),
   2146                                   void *host, uint32_t ram_flags,
   2147                                   MemoryRegion *mr, Error **errp)
   2148 {
   2149     RAMBlock *new_block;
   2150     Error *local_err = NULL;
   2151 
   2152     assert((ram_flags & ~(RAM_SHARED | RAM_RESIZEABLE | RAM_PREALLOC |
   2153                           RAM_NORESERVE)) == 0);
   2154     assert(!host ^ (ram_flags & RAM_PREALLOC));
   2155 
   2156     size = HOST_PAGE_ALIGN(size);
   2157     max_size = HOST_PAGE_ALIGN(max_size);
   2158     new_block = g_malloc0(sizeof(*new_block));
   2159     new_block->mr = mr;
   2160     new_block->resized = resized;
   2161     new_block->used_length = size;
   2162     new_block->max_length = max_size;
   2163     assert(max_size >= size);
   2164     new_block->fd = -1;
   2165     new_block->page_size = qemu_real_host_page_size();
   2166     new_block->host = host;
   2167     new_block->flags = ram_flags;
   2168     ram_block_add(new_block, &local_err);
   2169     if (local_err) {
   2170         g_free(new_block);
   2171         error_propagate(errp, local_err);
   2172         return NULL;
   2173     }
   2174     return new_block;
   2175 }
   2176 
   2177 RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
   2178                                    MemoryRegion *mr, Error **errp)
   2179 {
   2180     return qemu_ram_alloc_internal(size, size, NULL, host, RAM_PREALLOC, mr,
   2181                                    errp);
   2182 }
   2183 
   2184 RAMBlock *qemu_ram_alloc(ram_addr_t size, uint32_t ram_flags,
   2185                          MemoryRegion *mr, Error **errp)
   2186 {
   2187     assert((ram_flags & ~(RAM_SHARED | RAM_NORESERVE)) == 0);
   2188     return qemu_ram_alloc_internal(size, size, NULL, NULL, ram_flags, mr, errp);
   2189 }
   2190 
   2191 RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
   2192                                      void (*resized)(const char*,
   2193                                                      uint64_t length,
   2194                                                      void *host),
   2195                                      MemoryRegion *mr, Error **errp)
   2196 {
   2197     return qemu_ram_alloc_internal(size, maxsz, resized, NULL,
   2198                                    RAM_RESIZEABLE, mr, errp);
   2199 }
   2200 
   2201 static void reclaim_ramblock(RAMBlock *block)
   2202 {
   2203     if (block->flags & RAM_PREALLOC) {
   2204         ;
   2205     } else if (xen_enabled()) {
   2206         xen_invalidate_map_cache_entry(block->host);
   2207 #ifndef _WIN32
   2208     } else if (block->fd >= 0) {
   2209         qemu_ram_munmap(block->fd, block->host, block->max_length);
   2210         close(block->fd);
   2211 #endif
   2212     } else {
   2213         qemu_anon_ram_free(block->host, block->max_length);
   2214     }
   2215     g_free(block);
   2216 }
   2217 
   2218 void qemu_ram_free(RAMBlock *block)
   2219 {
   2220     if (!block) {
   2221         return;
   2222     }
   2223 
   2224     if (block->host) {
   2225         ram_block_notify_remove(block->host, block->used_length,
   2226                                 block->max_length);
   2227     }
   2228 
   2229     qemu_mutex_lock_ramlist();
   2230     QLIST_REMOVE_RCU(block, next);
   2231     ram_list.mru_block = NULL;
   2232     /* Write list before version */
   2233     smp_wmb();
   2234     ram_list.version++;
   2235     call_rcu(block, reclaim_ramblock, rcu);
   2236     qemu_mutex_unlock_ramlist();
   2237 }
   2238 
   2239 #ifndef _WIN32
   2240 void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
   2241 {
   2242     RAMBlock *block;
   2243     ram_addr_t offset;
   2244     int flags;
   2245     void *area, *vaddr;
   2246 
   2247     RAMBLOCK_FOREACH(block) {
   2248         offset = addr - block->offset;
   2249         if (offset < block->max_length) {
   2250             vaddr = ramblock_ptr(block, offset);
   2251             if (block->flags & RAM_PREALLOC) {
   2252                 ;
   2253             } else if (xen_enabled()) {
   2254                 abort();
   2255             } else {
   2256                 flags = MAP_FIXED;
   2257                 flags |= block->flags & RAM_SHARED ?
   2258                          MAP_SHARED : MAP_PRIVATE;
   2259                 flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0;
   2260                 if (block->fd >= 0) {
   2261                     area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
   2262                                 flags, block->fd, offset);
   2263                 } else {
   2264                     flags |= MAP_ANONYMOUS;
   2265                     area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
   2266                                 flags, -1, 0);
   2267                 }
   2268                 if (area != vaddr) {
   2269                     error_report("Could not remap addr: "
   2270                                  RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
   2271                                  length, addr);
   2272                     exit(1);
   2273                 }
   2274                 memory_try_enable_merging(vaddr, length);
   2275                 qemu_ram_setup_dump(vaddr, length);
   2276             }
   2277         }
   2278     }
   2279 }
   2280 #endif /* !_WIN32 */
   2281 
   2282 /* Return a host pointer to ram allocated with qemu_ram_alloc.
   2283  * This should not be used for general purpose DMA.  Use address_space_map
   2284  * or address_space_rw instead. For local memory (e.g. video ram) that the
   2285  * device owns, use memory_region_get_ram_ptr.
   2286  *
   2287  * Called within RCU critical section.
   2288  */
   2289 void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
   2290 {
   2291     RAMBlock *block = ram_block;
   2292 
   2293     if (block == NULL) {
   2294         block = qemu_get_ram_block(addr);
   2295         addr -= block->offset;
   2296     }
   2297 
   2298     if (xen_enabled() && block->host == NULL) {
   2299         /* We need to check if the requested address is in the RAM
   2300          * because we don't want to map the entire memory in QEMU.
   2301          * In that case just map until the end of the page.
   2302          */
   2303         if (block->offset == 0) {
   2304             return xen_map_cache(addr, 0, 0, false);
   2305         }
   2306 
   2307         block->host = xen_map_cache(block->offset, block->max_length, 1, false);
   2308     }
   2309     return ramblock_ptr(block, addr);
   2310 }
   2311 
   2312 /* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
   2313  * but takes a size argument.
   2314  *
   2315  * Called within RCU critical section.
   2316  */
   2317 static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
   2318                                  hwaddr *size, bool lock)
   2319 {
   2320     RAMBlock *block = ram_block;
   2321     if (*size == 0) {
   2322         return NULL;
   2323     }
   2324 
   2325     if (block == NULL) {
   2326         block = qemu_get_ram_block(addr);
   2327         addr -= block->offset;
   2328     }
   2329     *size = MIN(*size, block->max_length - addr);
   2330 
   2331     if (xen_enabled() && block->host == NULL) {
   2332         /* We need to check if the requested address is in the RAM
   2333          * because we don't want to map the entire memory in QEMU.
   2334          * In that case just map the requested area.
   2335          */
   2336         if (block->offset == 0) {
   2337             return xen_map_cache(addr, *size, lock, lock);
   2338         }
   2339 
   2340         block->host = xen_map_cache(block->offset, block->max_length, 1, lock);
   2341     }
   2342 
   2343     return ramblock_ptr(block, addr);
   2344 }
   2345 
   2346 /* Return the offset of a host pointer within a RAMBlock */
   2347 ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host)
   2348 {
   2349     ram_addr_t res = (uint8_t *)host - (uint8_t *)rb->host;
   2350     assert((uintptr_t)host >= (uintptr_t)rb->host);
   2351     assert(res < rb->max_length);
   2352 
   2353     return res;
   2354 }
   2355 
   2356 /*
   2357  * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
   2358  * in that RAMBlock.
   2359  *
   2360  * ptr: Host pointer to look up
   2361  * round_offset: If true round the result offset down to a page boundary
   2362  * *ram_addr: set to result ram_addr
   2363  * *offset: set to result offset within the RAMBlock
   2364  *
   2365  * Returns: RAMBlock (or NULL if not found)
   2366  *
   2367  * By the time this function returns, the returned pointer is not protected
   2368  * by RCU anymore.  If the caller is not within an RCU critical section and
   2369  * does not hold the iothread lock, it must have other means of protecting the
   2370  * pointer, such as a reference to the region that includes the incoming
   2371  * ram_addr_t.
   2372  */
   2373 RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
   2374                                    ram_addr_t *offset)
   2375 {
   2376     RAMBlock *block;
   2377     uint8_t *host = ptr;
   2378 
   2379     if (xen_enabled()) {
   2380         ram_addr_t ram_addr;
   2381         RCU_READ_LOCK_GUARD();
   2382         ram_addr = xen_ram_addr_from_mapcache(ptr);
   2383         block = qemu_get_ram_block(ram_addr);
   2384         if (block) {
   2385             *offset = ram_addr - block->offset;
   2386         }
   2387         return block;
   2388     }
   2389 
   2390     RCU_READ_LOCK_GUARD();
   2391     block = qatomic_rcu_read(&ram_list.mru_block);
   2392     if (block && block->host && host - block->host < block->max_length) {
   2393         goto found;
   2394     }
   2395 
   2396     RAMBLOCK_FOREACH(block) {
   2397         /* This case happens when the block is not mapped. */
   2398         if (block->host == NULL) {
   2399             continue;
   2400         }
   2401         if (host - block->host < block->max_length) {
   2402             goto found;
   2403         }
   2404     }
   2405 
   2406     return NULL;
   2407 
   2408 found:
   2409     *offset = (host - block->host);
   2410     if (round_offset) {
   2411         *offset &= TARGET_PAGE_MASK;
   2412     }
   2413     return block;
   2414 }
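
        /*
         * Illustrative sketch (not part of this file): translating a host
         * pointer that is known to point into guest RAM back to its block
         * and ram_addr_t; "host_ptr" is a made-up name.
         *
         *     ram_addr_t offset;
         *     RAMBlock *rb = qemu_ram_block_from_host(host_ptr, false, &offset);
         *     if (rb) {
         *         ram_addr_t ram_addr = qemu_ram_get_offset(rb) + offset;
         *     }
         */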
   2415 
   2416 /*
   2417  * Finds the named RAMBlock
   2418  *
   2419  * name: The name of RAMBlock to find
   2420  *
   2421  * Returns: RAMBlock (or NULL if not found)
   2422  */
   2423 RAMBlock *qemu_ram_block_by_name(const char *name)
   2424 {
   2425     RAMBlock *block;
   2426 
   2427     RAMBLOCK_FOREACH(block) {
   2428         if (!strcmp(name, block->idstr)) {
   2429             return block;
   2430         }
   2431     }
   2432 
   2433     return NULL;
   2434 }
   2435 
   2436 /* Some of the softmmu routines need to translate from a host pointer
   2437    (typically a TLB entry) back to a ram offset.  */
   2438 ram_addr_t qemu_ram_addr_from_host(void *ptr)
   2439 {
   2440     RAMBlock *block;
   2441     ram_addr_t offset;
   2442 
   2443     block = qemu_ram_block_from_host(ptr, false, &offset);
   2444     if (!block) {
   2445         return RAM_ADDR_INVALID;
   2446     }
   2447 
   2448     return block->offset + offset;
   2449 }
   2450 
   2451 ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
   2452 {
   2453     ram_addr_t ram_addr;
   2454 
   2455     ram_addr = qemu_ram_addr_from_host(ptr);
   2456     if (ram_addr == RAM_ADDR_INVALID) {
   2457         error_report("Bad ram pointer %p", ptr);
   2458         abort();
   2459     }
   2460     return ram_addr;
   2461 }
   2462 
   2463 static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
   2464                                  MemTxAttrs attrs, void *buf, hwaddr len);
   2465 static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
   2466                                   const void *buf, hwaddr len);
   2467 static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
   2468                                   bool is_write, MemTxAttrs attrs);
   2469 
   2470 static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
   2471                                 unsigned len, MemTxAttrs attrs)
   2472 {
   2473     subpage_t *subpage = opaque;
   2474     uint8_t buf[8];
   2475     MemTxResult res;
   2476 
   2477 #if defined(DEBUG_SUBPAGE)
   2478     printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
   2479            subpage, len, addr);
   2480 #endif
   2481     res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len);
   2482     if (res) {
   2483         return res;
   2484     }
   2485     *data = ldn_p(buf, len);
   2486     return MEMTX_OK;
   2487 }
   2488 
   2489 static MemTxResult subpage_write(void *opaque, hwaddr addr,
   2490                                  uint64_t value, unsigned len, MemTxAttrs attrs)
   2491 {
   2492     subpage_t *subpage = opaque;
   2493     uint8_t buf[8];
   2494 
   2495 #if defined(DEBUG_SUBPAGE)
   2496     printf("%s: subpage %p len %u addr " TARGET_FMT_plx
   2497            " value %"PRIx64"\n",
   2498            __func__, subpage, len, addr, value);
   2499 #endif
   2500     stn_p(buf, len, value);
   2501     return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len);
   2502 }
   2503 
   2504 static bool subpage_accepts(void *opaque, hwaddr addr,
   2505                             unsigned len, bool is_write,
   2506                             MemTxAttrs attrs)
   2507 {
   2508     subpage_t *subpage = opaque;
   2509 #if defined(DEBUG_SUBPAGE)
   2510     printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
   2511            __func__, subpage, is_write ? 'w' : 'r', len, addr);
   2512 #endif
   2513 
   2514     return flatview_access_valid(subpage->fv, addr + subpage->base,
   2515                                  len, is_write, attrs);
   2516 }
   2517 
   2518 static const MemoryRegionOps subpage_ops = {
   2519     .read_with_attrs = subpage_read,
   2520     .write_with_attrs = subpage_write,
   2521     .impl.min_access_size = 1,
   2522     .impl.max_access_size = 8,
   2523     .valid.min_access_size = 1,
   2524     .valid.max_access_size = 8,
   2525     .valid.accepts = subpage_accepts,
   2526     .endianness = DEVICE_NATIVE_ENDIAN,
   2527 };
   2528 
   2529 static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
   2530                             uint16_t section)
   2531 {
   2532     int idx, eidx;
   2533 
   2534     if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
   2535         return -1;
   2536     idx = SUBPAGE_IDX(start);
   2537     eidx = SUBPAGE_IDX(end);
   2538 #if defined(DEBUG_SUBPAGE)
   2539     printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
   2540            __func__, mmio, start, end, idx, eidx, section);
   2541 #endif
   2542     for (; idx <= eidx; idx++) {
   2543         mmio->sub_section[idx] = section;
   2544     }
   2545 
   2546     return 0;
   2547 }
   2548 
   2549 static subpage_t *subpage_init(FlatView *fv, hwaddr base)
   2550 {
   2551     subpage_t *mmio;
   2552 
   2553     /* mmio->sub_section is set to PHYS_SECTION_UNASSIGNED with g_malloc0 */
   2554     mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
   2555     mmio->fv = fv;
   2556     mmio->base = base;
   2557     memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
   2558                           NULL, TARGET_PAGE_SIZE);
   2559     mmio->iomem.subpage = true;
   2560 #if defined(DEBUG_SUBPAGE)
   2561     printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
   2562            mmio, base, TARGET_PAGE_SIZE);
   2563 #endif
   2564 
   2565     return mmio;
   2566 }
   2567 
   2568 static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr)
   2569 {
   2570     assert(fv);
   2571     MemoryRegionSection section = {
   2572         .fv = fv,
   2573         .mr = mr,
   2574         .offset_within_address_space = 0,
   2575         .offset_within_region = 0,
   2576         .size = int128_2_64(),
   2577     };
   2578 
   2579     return phys_section_add(map, &section);
   2580 }
   2581 
   2582 MemoryRegionSection *iotlb_to_section(CPUState *cpu,
   2583                                       hwaddr index, MemTxAttrs attrs)
   2584 {
   2585     int asidx = cpu_asidx_from_attrs(cpu, attrs);
   2586     CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
   2587     AddressSpaceDispatch *d = qatomic_rcu_read(&cpuas->memory_dispatch);
   2588     MemoryRegionSection *sections = d->map.sections;
   2589 
   2590     return &sections[index & ~TARGET_PAGE_MASK];
   2591 }
   2592 
   2593 static void io_mem_init(void)
   2594 {
   2595     memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
   2596                           NULL, UINT64_MAX);
   2597 }
   2598 
   2599 AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
   2600 {
   2601     AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
   2602     uint16_t n;
   2603 
   2604     n = dummy_section(&d->map, fv, &io_mem_unassigned);
   2605     assert(n == PHYS_SECTION_UNASSIGNED);
   2606 
   2607     d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
   2608 
   2609     return d;
   2610 }
   2611 
   2612 void address_space_dispatch_free(AddressSpaceDispatch *d)
   2613 {
   2614     phys_sections_free(&d->map);
   2615     g_free(d);
   2616 }
   2617 
   2618 static void do_nothing(CPUState *cpu, run_on_cpu_data d)
   2619 {
   2620 }
   2621 
   2622 static void tcg_log_global_after_sync(MemoryListener *listener)
   2623 {
   2624     CPUAddressSpace *cpuas;
   2625 
   2626     /* Wait for the CPU to end the current TB.  This avoids the following
   2627      * incorrect race:
   2628      *
   2629      *      vCPU                         migration
   2630      *      ----------------------       -------------------------
   2631      *      TLB check -> slow path
   2632      *        notdirty_mem_write
   2633      *          write to RAM
   2634      *          mark dirty
   2635      *                                   clear dirty flag
   2636      *      TLB check -> fast path
   2637      *                                   read memory
   2638      *        write to RAM
   2639      *
   2640      * by pushing the migration thread's memory read after the vCPU thread has
   2641      * written the memory.
   2642      */
   2643     if (replay_mode == REPLAY_MODE_NONE) {
   2644         /*
   2645          * VGA can make calls to this function while updating the screen.
   2646          * In record/replay mode this causes a deadlock, because
   2647          * run_on_cpu waits for the rr mutex.  Therefore no races are
   2648          * possible in this case, and there is no need to call run_on_cpu
   2649          * when record/replay is enabled.
   2650          */
   2651         cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
   2652         run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL);
   2653     }
   2654 }
   2655 
   2656 static void tcg_commit(MemoryListener *listener)
   2657 {
   2658     CPUAddressSpace *cpuas;
   2659     AddressSpaceDispatch *d;
   2660 
   2661     assert(tcg_enabled());
   2662     /* since each CPU stores ram addresses in its TLB cache, we must
   2663        reset the modified entries */
   2664     cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
   2665     cpu_reloading_memory_map();
   2666     /* The CPU and TLB are protected by the iothread lock.
   2667      * We reload the dispatch pointer now because cpu_reloading_memory_map()
   2668      * may have split the RCU critical section.
   2669      */
   2670     d = address_space_to_dispatch(cpuas->as);
   2671     qatomic_rcu_set(&cpuas->memory_dispatch, d);
   2672     tlb_flush(cpuas->cpu);
   2673 }
   2674 
   2675 static void memory_map_init(void)
   2676 {
   2677     system_memory = g_malloc(sizeof(*system_memory));
   2678 
   2679     memory_region_init(system_memory, NULL, "system", UINT64_MAX);
   2680     address_space_init(&address_space_memory, system_memory, "memory");
   2681 
   2682     system_io = g_malloc(sizeof(*system_io));
   2683     memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
   2684                           65536);
   2685     address_space_init(&address_space_io, system_io, "I/O");
   2686 }
   2687 
   2688 MemoryRegion *get_system_memory(void)
   2689 {
   2690     return system_memory;
   2691 }
   2692 
   2693 MemoryRegion *get_system_io(void)
   2694 {
   2695     return system_io;
   2696 }
   2697 
   2698 static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
   2699                                      hwaddr length)
   2700 {
   2701     uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
   2702     addr += memory_region_get_ram_addr(mr);
   2703 
   2704     /* No early return if dirty_log_mask is or becomes 0, because
   2705      * cpu_physical_memory_set_dirty_range will still call
   2706      * xen_modified_memory.
   2707      */
   2708     if (dirty_log_mask) {
   2709         dirty_log_mask =
   2710             cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
   2711     }
   2712     if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
   2713         assert(tcg_enabled());
   2714         tb_invalidate_phys_range(addr, addr + length);
   2715         dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
   2716     }
   2717     cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
   2718 }
   2719 
   2720 void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size)
   2721 {
   2722     /*
   2723      * In principle this function would work on other memory region types too,
   2724      * but the ROM device use case is the only one where this operation is
   2725      * necessary.  Other memory regions should use the
   2726      * address_space_read/write() APIs.
   2727      */
   2728     assert(memory_region_is_romd(mr));
   2729 
   2730     invalidate_and_set_dirty(mr, addr, size);
   2731 }
   2732 
   2733 int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
   2734 {
   2735     unsigned access_size_max = mr->ops->valid.max_access_size;
   2736 
   2737     /* Regions are assumed to support 1-4 byte accesses unless
   2738        otherwise specified.  */
   2739     if (access_size_max == 0) {
   2740         access_size_max = 4;
   2741     }
   2742 
   2743     /* Bound the maximum access by the alignment of the address.  */
   2744     if (!mr->ops->impl.unaligned) {
   2745         unsigned align_size_max = addr & -addr;
   2746         if (align_size_max != 0 && align_size_max < access_size_max) {
   2747             access_size_max = align_size_max;
   2748         }
   2749     }
   2750 
   2751     /* Don't attempt accesses larger than the maximum.  */
   2752     if (l > access_size_max) {
   2753         l = access_size_max;
   2754     }
   2755     l = pow2floor(l);
   2756 
   2757     return l;
   2758 }
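
        /*
         * Worked example (illustrative only): for an 8-byte access at
         * address 0x1004, addr & -addr == 0x4, so the access is clamped to
         * 4 bytes (pow2floor() then leaves it at 4).  At address 0x1000 the
         * alignment is 0x1000, which does not reduce the access size.
         */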
   2759 
   2760 bool prepare_mmio_access(MemoryRegion *mr)
   2761 {
   2762     bool release_lock = false;
   2763 
   2764     if (!qemu_mutex_iothread_locked()) {
   2765         qemu_mutex_lock_iothread();
   2766         release_lock = true;
   2767     }
   2768     if (mr->flush_coalesced_mmio) {
   2769         qemu_flush_coalesced_mmio_buffer();
   2770     }
   2771 
   2772     return release_lock;
   2773 }
   2774 
   2775 /**
   2776  * flatview_access_allowed
   2777  * @mr: #MemoryRegion to be accessed
   2778  * @attrs: memory transaction attributes
   2779  * @addr: address within that memory region
   2780  * @len: the number of bytes to access
   2781  *
   2782  * Check if a memory transaction is allowed.
   2783  *
   2784  * Returns: true if transaction is allowed, false if denied.
   2785  */
   2786 static bool flatview_access_allowed(MemoryRegion *mr, MemTxAttrs attrs,
   2787                                     hwaddr addr, hwaddr len)
   2788 {
   2789     if (likely(!attrs.memory)) {
   2790         return true;
   2791     }
   2792     if (memory_region_is_ram(mr)) {
   2793         return true;
   2794     }
   2795     qemu_log_mask(LOG_GUEST_ERROR,
   2796                   "Invalid access to non-RAM device at "
   2797                   "addr 0x%" HWADDR_PRIX ", size %" HWADDR_PRIu ", "
   2798                   "region '%s'\n", addr, len, memory_region_name(mr));
   2799     return false;
   2800 }
   2801 
   2802 /* Called within RCU critical section.  */
   2803 static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
   2804                                            MemTxAttrs attrs,
   2805                                            const void *ptr,
   2806                                            hwaddr len, hwaddr addr1,
   2807                                            hwaddr l, MemoryRegion *mr)
   2808 {
   2809     uint8_t *ram_ptr;
   2810     uint64_t val;
   2811     MemTxResult result = MEMTX_OK;
   2812     bool release_lock = false;
   2813     const uint8_t *buf = ptr;
   2814 
   2815     for (;;) {
   2816         if (!flatview_access_allowed(mr, attrs, addr1, l)) {
   2817             result |= MEMTX_ACCESS_ERROR;
   2818             /* Keep going. */
   2819         } else if (!memory_access_is_direct(mr, true)) {
   2820             release_lock |= prepare_mmio_access(mr);
   2821             l = memory_access_size(mr, l, addr1);
   2822             /* XXX: could force current_cpu to NULL to avoid
   2823                potential bugs */
   2824             val = ldn_he_p(buf, l);
   2825             result |= memory_region_dispatch_write(mr, addr1, val,
   2826                                                    size_memop(l), attrs);
   2827         } else {
   2828             /* RAM case */
   2829             ram_ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
   2830             memcpy(ram_ptr, buf, l);
   2831             invalidate_and_set_dirty(mr, addr1, l);
   2832         }
   2833 
   2834         if (release_lock) {
   2835             qemu_mutex_unlock_iothread();
   2836             release_lock = false;
   2837         }
   2838 
   2839         len -= l;
   2840         buf += l;
   2841         addr += l;
   2842 
   2843         if (!len) {
   2844             break;
   2845         }
   2846 
   2847         l = len;
   2848         mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
   2849     }
   2850 
   2851     return result;
   2852 }
   2853 
   2854 /* Called from RCU critical section.  */
   2855 static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
   2856                                   const void *buf, hwaddr len)
   2857 {
   2858     hwaddr l;
   2859     hwaddr addr1;
   2860     MemoryRegion *mr;
   2861 
   2862     l = len;
   2863     mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
   2864     if (!flatview_access_allowed(mr, attrs, addr, len)) {
   2865         return MEMTX_ACCESS_ERROR;
   2866     }
   2867     return flatview_write_continue(fv, addr, attrs, buf, len,
   2868                                    addr1, l, mr);
   2869 }
   2870 
   2871 /* Called within RCU critical section.  */
   2872 MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
   2873                                    MemTxAttrs attrs, void *ptr,
   2874                                    hwaddr len, hwaddr addr1, hwaddr l,
   2875                                    MemoryRegion *mr)
   2876 {
   2877     uint8_t *ram_ptr;
   2878     uint64_t val;
   2879     MemTxResult result = MEMTX_OK;
   2880     bool release_lock = false;
   2881     uint8_t *buf = ptr;
   2882 
   2883     fuzz_dma_read_cb(addr, len, mr);
   2884     for (;;) {
   2885         if (!flatview_access_allowed(mr, attrs, addr1, l)) {
   2886             result |= MEMTX_ACCESS_ERROR;
   2887             /* Keep going. */
   2888         } else if (!memory_access_is_direct(mr, false)) {
   2889             /* I/O case */
   2890             release_lock |= prepare_mmio_access(mr);
   2891             l = memory_access_size(mr, l, addr1);
   2892             result |= memory_region_dispatch_read(mr, addr1, &val,
   2893                                                   size_memop(l), attrs);
   2894             stn_he_p(buf, l, val);
   2895         } else {
   2896             /* RAM case */
   2897             ram_ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
   2898             memcpy(buf, ram_ptr, l);
   2899         }
   2900 
   2901         if (release_lock) {
   2902             qemu_mutex_unlock_iothread();
   2903             release_lock = false;
   2904         }
   2905 
   2906         len -= l;
   2907         buf += l;
   2908         addr += l;
   2909 
   2910         if (!len) {
   2911             break;
   2912         }
   2913 
   2914         l = len;
   2915         mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
   2916     }
   2917 
   2918     return result;
   2919 }
   2920 
   2921 /* Called from RCU critical section.  */
   2922 static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
   2923                                  MemTxAttrs attrs, void *buf, hwaddr len)
   2924 {
   2925     hwaddr l;
   2926     hwaddr addr1;
   2927     MemoryRegion *mr;
   2928 
   2929     l = len;
   2930     mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
   2931     if (!flatview_access_allowed(mr, attrs, addr, len)) {
   2932         return MEMTX_ACCESS_ERROR;
   2933     }
   2934     return flatview_read_continue(fv, addr, attrs, buf, len,
   2935                                   addr1, l, mr);
   2936 }
   2937 
   2938 MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
   2939                                     MemTxAttrs attrs, void *buf, hwaddr len)
   2940 {
   2941     MemTxResult result = MEMTX_OK;
   2942     FlatView *fv;
   2943 
   2944     if (len > 0) {
   2945         RCU_READ_LOCK_GUARD();
   2946         fv = address_space_to_flatview(as);
   2947         result = flatview_read(fv, addr, attrs, buf, len);
   2948     }
   2949 
   2950     return result;
   2951 }
   2952 
   2953 MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
   2954                                 MemTxAttrs attrs,
   2955                                 const void *buf, hwaddr len)
   2956 {
   2957     MemTxResult result = MEMTX_OK;
   2958     FlatView *fv;
   2959 
   2960     if (len > 0) {
   2961         RCU_READ_LOCK_GUARD();
   2962         fv = address_space_to_flatview(as);
   2963         result = flatview_write(fv, addr, attrs, buf, len);
   2964     }
   2965 
   2966     return result;
   2967 }
   2968 
   2969 MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
   2970                              void *buf, hwaddr len, bool is_write)
   2971 {
   2972     if (is_write) {
   2973         return address_space_write(as, addr, attrs, buf, len);
   2974     } else {
   2975         return address_space_read_full(as, addr, attrs, buf, len);
   2976     }
   2977 }
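/*
 * Illustrative sketch, not part of the original file: how a device model
 * typically drives address_space_read()/address_space_write() as dispatched
 * by address_space_rw() above.  The guest address EXAMPLE_DESC_ADDR and the
 * descriptor layout are hypothetical; the APIs, &address_space_memory and
 * MEMTXATTRS_UNSPECIFIED are the real ones declared in the headers this file
 * already includes.  Endianness handling is omitted for brevity.
 */
#define EXAMPLE_DESC_ADDR 0x40000000ULL   /* hypothetical guest-physical address */

static bool example_copy_descriptor(void)
{
    uint32_t desc[4];
    MemTxResult res;

    /* Read 16 bytes of guest memory; RAM and MMIO are handled alike. */
    res = address_space_read(&address_space_memory, EXAMPLE_DESC_ADDR,
                             MEMTXATTRS_UNSPECIFIED, desc, sizeof(desc));
    if (res != MEMTX_OK) {
        return false;                     /* decode or access error */
    }

    desc[0] |= 1;                         /* e.g. set a "done" flag */

    /* Write the updated descriptor back through the same path. */
    res = address_space_write(&address_space_memory, EXAMPLE_DESC_ADDR,
                              MEMTXATTRS_UNSPECIFIED, desc, sizeof(desc));
    return res == MEMTX_OK;
}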
   2978 
   2979 MemTxResult address_space_set(AddressSpace *as, hwaddr addr,
   2980                               uint8_t c, hwaddr len, MemTxAttrs attrs)
   2981 {
   2982 #define FILLBUF_SIZE 512
   2983     uint8_t fillbuf[FILLBUF_SIZE];
   2984     int l;
   2985     MemTxResult error = MEMTX_OK;
   2986 
   2987     memset(fillbuf, c, FILLBUF_SIZE);
   2988     while (len > 0) {
   2989         l = len < FILLBUF_SIZE ? len : FILLBUF_SIZE;
   2990         error |= address_space_write(as, addr, attrs, fillbuf, l);
   2991         len -= l;
   2992         addr += l;
   2993     }
   2994 
   2995     return error;
   2996 }
   2997 
   2998 void cpu_physical_memory_rw(hwaddr addr, void *buf,
   2999                             hwaddr len, bool is_write)
   3000 {
   3001     address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
   3002                      buf, len, is_write);
   3003 }
   3004 
   3005 enum write_rom_type {
   3006     WRITE_DATA,
   3007     FLUSH_CACHE,
   3008 };
   3009 
   3010 static inline MemTxResult address_space_write_rom_internal(AddressSpace *as,
   3011                                                            hwaddr addr,
   3012                                                            MemTxAttrs attrs,
   3013                                                            const void *ptr,
   3014                                                            hwaddr len,
   3015                                                            enum write_rom_type type)
   3016 {
   3017     hwaddr l;
   3018     uint8_t *ram_ptr;
   3019     hwaddr addr1;
   3020     MemoryRegion *mr;
   3021     const uint8_t *buf = ptr;
   3022 
   3023     RCU_READ_LOCK_GUARD();
   3024     while (len > 0) {
   3025         l = len;
   3026         mr = address_space_translate(as, addr, &addr1, &l, true, attrs);
   3027 
   3028         if (!(memory_region_is_ram(mr) ||
   3029               memory_region_is_romd(mr))) {
   3030             l = memory_access_size(mr, l, addr1);
   3031         } else {
   3032             /* ROM/RAM case */
   3033             ram_ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
   3034             switch (type) {
   3035             case WRITE_DATA:
   3036                 memcpy(ram_ptr, buf, l);
   3037                 invalidate_and_set_dirty(mr, addr1, l);
   3038                 break;
   3039             case FLUSH_CACHE:
   3040                 flush_idcache_range((uintptr_t)ram_ptr, (uintptr_t)ram_ptr, l);
   3041                 break;
   3042             }
   3043         }
   3044         len -= l;
   3045         buf += l;
   3046         addr += l;
   3047     }
   3048     return MEMTX_OK;
   3049 }
   3050 
    3051 /* used for ROM loading: can write in RAM and ROM */
   3052 MemTxResult address_space_write_rom(AddressSpace *as, hwaddr addr,
   3053                                     MemTxAttrs attrs,
   3054                                     const void *buf, hwaddr len)
   3055 {
   3056     return address_space_write_rom_internal(as, addr, attrs,
   3057                                             buf, len, WRITE_DATA);
   3058 }
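/*
 * Illustrative sketch, not part of the original file: why ROM loaders go
 * through address_space_write_rom() instead of address_space_write() - a
 * read-only region would refuse a normal write, while this helper copies
 * straight into the backing RAM block.  The reset-vector address and blob
 * are hypothetical.
 */
static void example_install_boot_blob(const uint8_t *blob, size_t size)
{
    MemTxResult res;

    res = address_space_write_rom(&address_space_memory, 0xfffc0000,
                                  MEMTXATTRS_UNSPECIFIED, blob, size);
    if (res != MEMTX_OK) {
        error_report("example: failed to install boot blob");
    }
}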
   3059 
   3060 void cpu_flush_icache_range(hwaddr start, hwaddr len)
   3061 {
   3062     /*
   3063      * This function should do the same thing as an icache flush that was
   3064      * triggered from within the guest. For TCG we are always cache coherent,
   3065      * so there is no need to flush anything. For KVM / Xen we need to flush
   3066      * the host's instruction cache at least.
   3067      */
   3068     if (tcg_enabled()) {
   3069         return;
   3070     }
   3071 
   3072     address_space_write_rom_internal(&address_space_memory,
   3073                                      start, MEMTXATTRS_UNSPECIFIED,
   3074                                      NULL, len, FLUSH_CACHE);
   3075 }
   3076 
   3077 typedef struct {
   3078     MemoryRegion *mr;
   3079     void *buffer;
   3080     hwaddr addr;
   3081     hwaddr len;
   3082     bool in_use;
   3083 } BounceBuffer;
   3084 
   3085 static BounceBuffer bounce;
   3086 
   3087 typedef struct MapClient {
   3088     QEMUBH *bh;
   3089     QLIST_ENTRY(MapClient) link;
   3090 } MapClient;
   3091 
   3092 QemuMutex map_client_list_lock;
   3093 static QLIST_HEAD(, MapClient) map_client_list
   3094     = QLIST_HEAD_INITIALIZER(map_client_list);
   3095 
   3096 static void cpu_unregister_map_client_do(MapClient *client)
   3097 {
   3098     QLIST_REMOVE(client, link);
   3099     g_free(client);
   3100 }
   3101 
   3102 static void cpu_notify_map_clients_locked(void)
   3103 {
   3104     MapClient *client;
   3105 
   3106     while (!QLIST_EMPTY(&map_client_list)) {
   3107         client = QLIST_FIRST(&map_client_list);
   3108         qemu_bh_schedule(client->bh);
   3109         cpu_unregister_map_client_do(client);
   3110     }
   3111 }
   3112 
   3113 void cpu_register_map_client(QEMUBH *bh)
   3114 {
   3115     MapClient *client = g_malloc(sizeof(*client));
   3116 
   3117     qemu_mutex_lock(&map_client_list_lock);
   3118     client->bh = bh;
   3119     QLIST_INSERT_HEAD(&map_client_list, client, link);
   3120     if (!qatomic_read(&bounce.in_use)) {
   3121         cpu_notify_map_clients_locked();
   3122     }
   3123     qemu_mutex_unlock(&map_client_list_lock);
   3124 }
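/*
 * Illustrative sketch, not part of the original file: a caller that lost the
 * race for the single bounce buffer in address_space_map() can register a
 * bottom half here and retry once the buffer is released.  The callback and
 * its opaque state are hypothetical; qemu_bh_new() and
 * cpu_register_map_client() are the real APIs.
 */
static void example_retry_dma(void *opaque)
{
    /* Re-issue the address_space_map() that previously returned NULL. */
}

static void example_wait_for_bounce_buffer(void *opaque)
{
    QEMUBH *bh = qemu_bh_new(example_retry_dma, opaque);

    /* The bottom half is scheduled (once) when the bounce buffer frees up. */
    cpu_register_map_client(bh);
}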
   3125 
   3126 void cpu_exec_init_all(void)
   3127 {
   3128     qemu_mutex_init(&ram_list.mutex);
   3129     /* The data structures we set up here depend on knowing the page size,
   3130      * so no more changes can be made after this point.
   3131      * In an ideal world, nothing we did before we had finished the
   3132      * machine setup would care about the target page size, and we could
   3133      * do this much later, rather than requiring board models to state
   3134      * up front what their requirements are.
   3135      */
   3136     finalize_target_page_bits();
   3137     io_mem_init();
   3138     memory_map_init();
   3139     qemu_mutex_init(&map_client_list_lock);
   3140 }
   3141 
   3142 void cpu_unregister_map_client(QEMUBH *bh)
   3143 {
   3144     MapClient *client;
   3145 
   3146     qemu_mutex_lock(&map_client_list_lock);
   3147     QLIST_FOREACH(client, &map_client_list, link) {
   3148         if (client->bh == bh) {
   3149             cpu_unregister_map_client_do(client);
   3150             break;
   3151         }
   3152     }
   3153     qemu_mutex_unlock(&map_client_list_lock);
   3154 }
   3155 
   3156 static void cpu_notify_map_clients(void)
   3157 {
   3158     qemu_mutex_lock(&map_client_list_lock);
   3159     cpu_notify_map_clients_locked();
   3160     qemu_mutex_unlock(&map_client_list_lock);
   3161 }
   3162 
   3163 static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
   3164                                   bool is_write, MemTxAttrs attrs)
   3165 {
   3166     MemoryRegion *mr;
   3167     hwaddr l, xlat;
   3168 
   3169     while (len > 0) {
   3170         l = len;
   3171         mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
   3172         if (!memory_access_is_direct(mr, is_write)) {
   3173             l = memory_access_size(mr, l, addr);
   3174             if (!memory_region_access_valid(mr, xlat, l, is_write, attrs)) {
   3175                 return false;
   3176             }
   3177         }
   3178 
   3179         len -= l;
   3180         addr += l;
   3181     }
   3182     return true;
   3183 }
   3184 
   3185 bool address_space_access_valid(AddressSpace *as, hwaddr addr,
   3186                                 hwaddr len, bool is_write,
   3187                                 MemTxAttrs attrs)
   3188 {
   3189     FlatView *fv;
   3190 
   3191     RCU_READ_LOCK_GUARD();
   3192     fv = address_space_to_flatview(as);
   3193     return flatview_access_valid(fv, addr, len, is_write, attrs);
   3194 }
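/*
 * Illustrative sketch, not part of the original file: probing whether a
 * guest-supplied DMA window will accept accesses before starting a long
 * transfer.  The base/size parameters stand in for values a device model
 * would read from a hypothetical descriptor.
 */
static bool example_dma_window_ok(hwaddr base, hwaddr size)
{
    return address_space_access_valid(&address_space_memory, base, size,
                                      true /* is_write */,
                                      MEMTXATTRS_UNSPECIFIED);
}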
   3195 
   3196 static hwaddr
   3197 flatview_extend_translation(FlatView *fv, hwaddr addr,
   3198                             hwaddr target_len,
   3199                             MemoryRegion *mr, hwaddr base, hwaddr len,
   3200                             bool is_write, MemTxAttrs attrs)
   3201 {
   3202     hwaddr done = 0;
   3203     hwaddr xlat;
   3204     MemoryRegion *this_mr;
   3205 
   3206     for (;;) {
   3207         target_len -= len;
   3208         addr += len;
   3209         done += len;
   3210         if (target_len == 0) {
   3211             return done;
   3212         }
   3213 
   3214         len = target_len;
   3215         this_mr = flatview_translate(fv, addr, &xlat,
   3216                                      &len, is_write, attrs);
   3217         if (this_mr != mr || xlat != base + done) {
   3218             return done;
   3219         }
   3220     }
   3221 }
   3222 
   3223 /* Map a physical memory region into a host virtual address.
   3224  * May map a subset of the requested range, given by and returned in *plen.
   3225  * May return NULL if resources needed to perform the mapping are exhausted.
   3226  * Use only for reads OR writes - not for read-modify-write operations.
   3227  * Use cpu_register_map_client() to know when retrying the map operation is
   3228  * likely to succeed.
   3229  */
   3230 void *address_space_map(AddressSpace *as,
   3231                         hwaddr addr,
   3232                         hwaddr *plen,
   3233                         bool is_write,
   3234                         MemTxAttrs attrs)
   3235 {
   3236     hwaddr len = *plen;
   3237     hwaddr l, xlat;
   3238     MemoryRegion *mr;
   3239     void *ptr;
   3240     FlatView *fv;
   3241 
   3242     if (len == 0) {
   3243         return NULL;
   3244     }
   3245 
   3246     l = len;
   3247     RCU_READ_LOCK_GUARD();
   3248     fv = address_space_to_flatview(as);
   3249     mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
   3250 
   3251     if (!memory_access_is_direct(mr, is_write)) {
   3252         if (qatomic_xchg(&bounce.in_use, true)) {
   3253             *plen = 0;
   3254             return NULL;
   3255         }
   3256         /* Avoid unbounded allocations */
   3257         l = MIN(l, TARGET_PAGE_SIZE);
   3258         bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
   3259         bounce.addr = addr;
   3260         bounce.len = l;
   3261 
   3262         memory_region_ref(mr);
   3263         bounce.mr = mr;
   3264         if (!is_write) {
   3265             flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
   3266                                bounce.buffer, l);
   3267         }
   3268 
   3269         *plen = l;
   3270         return bounce.buffer;
   3271     }
   3272 
   3273 
   3274     memory_region_ref(mr);
   3275     *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
   3276                                         l, is_write, attrs);
   3277     fuzz_dma_read_cb(addr, *plen, mr);
   3278     ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
   3279 
   3280     return ptr;
   3281 }
   3282 
   3283 /* Unmaps a memory region previously mapped by address_space_map().
   3284  * Will also mark the memory as dirty if is_write is true.  access_len gives
   3285  * the amount of memory that was actually read or written by the caller.
   3286  */
   3287 void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
   3288                          bool is_write, hwaddr access_len)
   3289 {
   3290     if (buffer != bounce.buffer) {
   3291         MemoryRegion *mr;
   3292         ram_addr_t addr1;
   3293 
   3294         mr = memory_region_from_host(buffer, &addr1);
   3295         assert(mr != NULL);
   3296         if (is_write) {
   3297             invalidate_and_set_dirty(mr, addr1, access_len);
   3298         }
   3299         if (xen_enabled()) {
   3300             xen_invalidate_map_cache_entry(buffer);
   3301         }
   3302         memory_region_unref(mr);
   3303         return;
   3304     }
   3305     if (is_write) {
   3306         address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
   3307                             bounce.buffer, access_len);
   3308     }
   3309     qemu_vfree(bounce.buffer);
   3310     bounce.buffer = NULL;
   3311     memory_region_unref(bounce.mr);
   3312     qatomic_mb_set(&bounce.in_use, false);
   3313     cpu_notify_map_clients();
   3314 }
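/*
 * Illustrative sketch, not part of the original file: the map/access/unmap
 * pattern the two functions above are designed for.  On RAM the device works
 * on guest memory in place; on MMIO the call may fall back to the single
 * bounce buffer and return a shorter mapping (or NULL), so the caller must
 * honour the returned length.  Names prefixed "example_" are hypothetical.
 */
static bool example_dma_write(hwaddr guest_addr, const void *data, hwaddr size)
{
    hwaddr plen = size;
    void *host = address_space_map(&address_space_memory, guest_addr,
                                   &plen, true, MEMTXATTRS_UNSPECIFIED);

    if (!host) {
        return false;       /* retry later via cpu_register_map_client() */
    }
    if (plen < size) {
        /* Partial mapping: give it back without marking anything dirty. */
        address_space_unmap(&address_space_memory, host, plen, true, 0);
        return false;
    }

    memcpy(host, data, size);
    /* access_len == size marks exactly the written bytes dirty. */
    address_space_unmap(&address_space_memory, host, plen, true, size);
    return true;
}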
   3315 
   3316 void *cpu_physical_memory_map(hwaddr addr,
   3317                               hwaddr *plen,
   3318                               bool is_write)
   3319 {
   3320     return address_space_map(&address_space_memory, addr, plen, is_write,
   3321                              MEMTXATTRS_UNSPECIFIED);
   3322 }
   3323 
   3324 void cpu_physical_memory_unmap(void *buffer, hwaddr len,
   3325                                bool is_write, hwaddr access_len)
   3326 {
   3327     return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
   3328 }
   3329 
   3330 #define ARG1_DECL                AddressSpace *as
   3331 #define ARG1                     as
   3332 #define SUFFIX
   3333 #define TRANSLATE(...)           address_space_translate(as, __VA_ARGS__)
   3334 #define RCU_READ_LOCK(...)       rcu_read_lock()
   3335 #define RCU_READ_UNLOCK(...)     rcu_read_unlock()
   3336 #include "memory_ldst.c.inc"
   3337 
   3338 int64_t address_space_cache_init(MemoryRegionCache *cache,
   3339                                  AddressSpace *as,
   3340                                  hwaddr addr,
   3341                                  hwaddr len,
   3342                                  bool is_write)
   3343 {
   3344     AddressSpaceDispatch *d;
   3345     hwaddr l;
   3346     MemoryRegion *mr;
   3347     Int128 diff;
   3348 
   3349     assert(len > 0);
   3350 
   3351     l = len;
   3352     cache->fv = address_space_get_flatview(as);
   3353     d = flatview_to_dispatch(cache->fv);
   3354     cache->mrs = *address_space_translate_internal(d, addr, &cache->xlat, &l, true);
   3355 
   3356     /*
   3357      * cache->xlat is now relative to cache->mrs.mr, not to the section itself.
   3358      * Take that into account to compute how many bytes are there between
   3359      * cache->xlat and the end of the section.
   3360      */
   3361     diff = int128_sub(cache->mrs.size,
    3362                       int128_make64(cache->xlat - cache->mrs.offset_within_region));
   3363     l = int128_get64(int128_min(diff, int128_make64(l)));
   3364 
   3365     mr = cache->mrs.mr;
   3366     memory_region_ref(mr);
   3367     if (memory_access_is_direct(mr, is_write)) {
   3368         /* We don't care about the memory attributes here as we're only
   3369          * doing this if we found actual RAM, which behaves the same
   3370          * regardless of attributes; so UNSPECIFIED is fine.
   3371          */
   3372         l = flatview_extend_translation(cache->fv, addr, len, mr,
   3373                                         cache->xlat, l, is_write,
   3374                                         MEMTXATTRS_UNSPECIFIED);
   3375         cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true);
   3376     } else {
   3377         cache->ptr = NULL;
   3378     }
   3379 
   3380     cache->len = l;
   3381     cache->is_write = is_write;
   3382     return l;
   3383 }
   3384 
   3385 void address_space_cache_invalidate(MemoryRegionCache *cache,
   3386                                     hwaddr addr,
   3387                                     hwaddr access_len)
   3388 {
   3389     assert(cache->is_write);
   3390     if (likely(cache->ptr)) {
   3391         invalidate_and_set_dirty(cache->mrs.mr, addr + cache->xlat, access_len);
   3392     }
   3393 }
   3394 
   3395 void address_space_cache_destroy(MemoryRegionCache *cache)
   3396 {
   3397     if (!cache->mrs.mr) {
   3398         return;
   3399     }
   3400 
   3401     if (xen_enabled()) {
   3402         xen_invalidate_map_cache_entry(cache->ptr);
   3403     }
   3404     memory_region_unref(cache->mrs.mr);
   3405     flatview_unref(cache->fv);
   3406     cache->mrs.mr = NULL;
   3407     cache->fv = NULL;
   3408 }
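/*
 * Illustrative sketch, not part of the original file: the lifecycle of a
 * MemoryRegionCache for a structure that is accessed repeatedly (virtio
 * rings use this pattern).  The ring address, offsets and sizes are
 * hypothetical; when the cached region is not direct RAM the _cached
 * accessors fall back to the slow paths defined below.
 */
static void example_cached_ring_update(AddressSpace *as, hwaddr ring_addr)
{
    MemoryRegionCache cache;
    uint16_t idx = 42;

    /* Translate once; later accesses reuse the cached host pointer. */
    if (address_space_cache_init(&cache, as, ring_addr, 64, true) < 64) {
        address_space_cache_destroy(&cache);
        return;
    }

    address_space_write_cached(&cache, 2, &idx, sizeof(idx));
    address_space_cache_invalidate(&cache, 2, sizeof(idx));
    address_space_cache_destroy(&cache);
}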
   3409 
   3410 /* Called from RCU critical section.  This function has the same
   3411  * semantics as address_space_translate, but it only works on a
   3412  * predefined range of a MemoryRegion that was mapped with
   3413  * address_space_cache_init.
   3414  */
   3415 static inline MemoryRegion *address_space_translate_cached(
   3416     MemoryRegionCache *cache, hwaddr addr, hwaddr *xlat,
   3417     hwaddr *plen, bool is_write, MemTxAttrs attrs)
   3418 {
   3419     MemoryRegionSection section;
   3420     MemoryRegion *mr;
   3421     IOMMUMemoryRegion *iommu_mr;
   3422     AddressSpace *target_as;
   3423 
   3424     assert(!cache->ptr);
   3425     *xlat = addr + cache->xlat;
   3426 
   3427     mr = cache->mrs.mr;
   3428     iommu_mr = memory_region_get_iommu(mr);
   3429     if (!iommu_mr) {
   3430         /* MMIO region.  */
   3431         return mr;
   3432     }
   3433 
   3434     section = address_space_translate_iommu(iommu_mr, xlat, plen,
   3435                                             NULL, is_write, true,
   3436                                             &target_as, attrs);
   3437     return section.mr;
   3438 }
   3439 
   3440 /* Called from RCU critical section. address_space_read_cached uses this
   3441  * out of line function when the target is an MMIO or IOMMU region.
   3442  */
   3443 MemTxResult
   3444 address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr,
   3445                                    void *buf, hwaddr len)
   3446 {
   3447     hwaddr addr1, l;
   3448     MemoryRegion *mr;
   3449 
   3450     l = len;
   3451     mr = address_space_translate_cached(cache, addr, &addr1, &l, false,
   3452                                         MEMTXATTRS_UNSPECIFIED);
   3453     return flatview_read_continue(cache->fv,
   3454                                   addr, MEMTXATTRS_UNSPECIFIED, buf, len,
   3455                                   addr1, l, mr);
   3456 }
   3457 
   3458 /* Called from RCU critical section. address_space_write_cached uses this
   3459  * out of line function when the target is an MMIO or IOMMU region.
   3460  */
   3461 MemTxResult
   3462 address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr,
   3463                                     const void *buf, hwaddr len)
   3464 {
   3465     hwaddr addr1, l;
   3466     MemoryRegion *mr;
   3467 
   3468     l = len;
   3469     mr = address_space_translate_cached(cache, addr, &addr1, &l, true,
   3470                                         MEMTXATTRS_UNSPECIFIED);
   3471     return flatview_write_continue(cache->fv,
   3472                                    addr, MEMTXATTRS_UNSPECIFIED, buf, len,
   3473                                    addr1, l, mr);
   3474 }
   3475 
   3476 #define ARG1_DECL                MemoryRegionCache *cache
   3477 #define ARG1                     cache
   3478 #define SUFFIX                   _cached_slow
   3479 #define TRANSLATE(...)           address_space_translate_cached(cache, __VA_ARGS__)
   3480 #define RCU_READ_LOCK()          ((void)0)
   3481 #define RCU_READ_UNLOCK()        ((void)0)
   3482 #include "memory_ldst.c.inc"
   3483 
   3484 /* virtual memory access for debug (includes writing to ROM) */
   3485 int cpu_memory_rw_debug(CPUState *cpu, vaddr addr,
   3486                         void *ptr, size_t len, bool is_write)
   3487 {
   3488     hwaddr phys_addr;
   3489     vaddr l, page;
   3490     uint8_t *buf = ptr;
   3491 
   3492     cpu_synchronize_state(cpu);
   3493     while (len > 0) {
   3494         int asidx;
   3495         MemTxAttrs attrs;
   3496         MemTxResult res;
   3497 
   3498         page = addr & TARGET_PAGE_MASK;
   3499         phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
   3500         asidx = cpu_asidx_from_attrs(cpu, attrs);
   3501         /* if no physical page mapped, return an error */
   3502         if (phys_addr == -1)
   3503             return -1;
   3504         l = (page + TARGET_PAGE_SIZE) - addr;
   3505         if (l > len)
   3506             l = len;
   3507         phys_addr += (addr & ~TARGET_PAGE_MASK);
   3508         if (is_write) {
   3509             res = address_space_write_rom(cpu->cpu_ases[asidx].as, phys_addr,
   3510                                           attrs, buf, l);
   3511         } else {
   3512             res = address_space_read(cpu->cpu_ases[asidx].as, phys_addr,
   3513                                      attrs, buf, l);
   3514         }
   3515         if (res != MEMTX_OK) {
   3516             return -1;
   3517         }
   3518         len -= l;
   3519         buf += l;
   3520         addr += l;
   3521     }
   3522     return 0;
   3523 }
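/*
 * Illustrative sketch, not part of the original file: how a debugger front
 * end (gdbstub-style) reads guest *virtual* memory through the function
 * above.  It assumes "first_cpu" (declared in hw/core/cpu.h) is visible to
 * the caller; the guest virtual address is hypothetical.
 */
static void example_dump_guest_word(vaddr guest_va)
{
    uint32_t word;

    if (cpu_memory_rw_debug(first_cpu, guest_va, &word, sizeof(word),
                            false /* read */) == 0) {
        qemu_printf("0x%08" PRIx32 "\n", word);
    } else {
        qemu_printf("<unmapped>\n");
    }
}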
   3524 
   3525 /*
    3526  * Allows code that needs to deal with migration bitmaps etc. to still be built
    3527  * target-independent.
   3528  */
   3529 size_t qemu_target_page_size(void)
   3530 {
   3531     return TARGET_PAGE_SIZE;
   3532 }
   3533 
   3534 int qemu_target_page_bits(void)
   3535 {
   3536     return TARGET_PAGE_BITS;
   3537 }
   3538 
   3539 int qemu_target_page_bits_min(void)
   3540 {
   3541     return TARGET_PAGE_BITS_MIN;
   3542 }
   3543 
   3544 bool cpu_physical_memory_is_io(hwaddr phys_addr)
   3545 {
    3546     MemoryRegion *mr;
   3547     hwaddr l = 1;
   3548     bool res;
   3549 
   3550     RCU_READ_LOCK_GUARD();
   3551     mr = address_space_translate(&address_space_memory,
   3552                                  phys_addr, &phys_addr, &l, false,
   3553                                  MEMTXATTRS_UNSPECIFIED);
   3554 
   3555     res = !(memory_region_is_ram(mr) || memory_region_is_romd(mr));
   3556     return res;
   3557 }
   3558 
   3559 int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
   3560 {
   3561     RAMBlock *block;
   3562     int ret = 0;
   3563 
   3564     RCU_READ_LOCK_GUARD();
   3565     RAMBLOCK_FOREACH(block) {
   3566         ret = func(block, opaque);
   3567         if (ret) {
   3568             break;
   3569         }
   3570     }
   3571     return ret;
   3572 }
   3573 
   3574 /*
   3575  * Unmap pages of memory from start to start+length such that
    3576  * they a) read as 0, b) trigger whatever fault mechanism
   3577  * the OS provides for postcopy.
   3578  * The pages must be unmapped by the end of the function.
    3579  * Returns: 0 on success, non-0 on failure
   3580  *
   3581  */
   3582 int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
   3583 {
   3584     int ret = -1;
   3585 
   3586     uint8_t *host_startaddr = rb->host + start;
   3587 
   3588     if (!QEMU_PTR_IS_ALIGNED(host_startaddr, rb->page_size)) {
   3589         error_report("ram_block_discard_range: Unaligned start address: %p",
   3590                      host_startaddr);
   3591         goto err;
   3592     }
   3593 
   3594     if ((start + length) <= rb->max_length) {
   3595         bool need_madvise, need_fallocate;
   3596         if (!QEMU_IS_ALIGNED(length, rb->page_size)) {
   3597             error_report("ram_block_discard_range: Unaligned length: %zx",
   3598                          length);
   3599             goto err;
   3600         }
   3601 
   3602         errno = ENOTSUP; /* If we are missing MADVISE etc */
   3603 
   3604         /* The logic here is messy;
   3605          *    madvise DONTNEED fails for hugepages
   3606          *    fallocate works on hugepages and shmem
   3607          *    shared anonymous memory requires madvise REMOVE
   3608          */
   3609         need_madvise = (rb->page_size == qemu_host_page_size);
   3610         need_fallocate = rb->fd != -1;
   3611         if (need_fallocate) {
   3612             /* For a file, this causes the area of the file to be zero'd
   3613              * if read, and for hugetlbfs also causes it to be unmapped
   3614              * so a userfault will trigger.
   3615              */
   3616 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
   3617             ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
   3618                             start, length);
   3619             if (ret) {
   3620                 ret = -errno;
   3621                 error_report("ram_block_discard_range: Failed to fallocate "
   3622                              "%s:%" PRIx64 " +%zx (%d)",
   3623                              rb->idstr, start, length, ret);
   3624                 goto err;
   3625             }
   3626 #else
   3627             ret = -ENOSYS;
    3628             error_report("ram_block_discard_range: fallocate not available for file "
   3629                          "%s:%" PRIx64 " +%zx (%d)",
   3630                          rb->idstr, start, length, ret);
   3631             goto err;
   3632 #endif
   3633         }
   3634         if (need_madvise) {
   3635             /* For normal RAM this causes it to be unmapped,
   3636              * for shared memory it causes the local mapping to disappear
   3637              * and to fall back on the file contents (which we just
   3638              * fallocate'd away).
   3639              */
   3640 #if defined(CONFIG_MADVISE)
   3641             if (qemu_ram_is_shared(rb) && rb->fd < 0) {
   3642                 ret = madvise(host_startaddr, length, QEMU_MADV_REMOVE);
   3643             } else {
   3644                 ret = madvise(host_startaddr, length, QEMU_MADV_DONTNEED);
   3645             }
   3646             if (ret) {
   3647                 ret = -errno;
   3648                 error_report("ram_block_discard_range: Failed to discard range "
   3649                              "%s:%" PRIx64 " +%zx (%d)",
   3650                              rb->idstr, start, length, ret);
   3651                 goto err;
   3652             }
   3653 #else
   3654             ret = -ENOSYS;
    3655             error_report("ram_block_discard_range: MADVISE not available "
   3656                          "%s:%" PRIx64 " +%zx (%d)",
   3657                          rb->idstr, start, length, ret);
   3658             goto err;
   3659 #endif
   3660         }
   3661         trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
   3662                                       need_madvise, need_fallocate, ret);
   3663     } else {
   3664         error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
   3665                      "/%zx/" RAM_ADDR_FMT")",
   3666                      rb->idstr, start, length, rb->max_length);
   3667     }
   3668 
   3669 err:
   3670     return ret;
   3671 }
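/*
 * Illustrative sketch, not part of the original file: callers (postcopy,
 * balloon-style code) must pass offsets and lengths aligned to the
 * RAMBlock's page size or the checks above reject the request.
 * qemu_ram_pagesize() and QEMU_ALIGN_DOWN() are real helpers; the wrapper
 * itself is hypothetical.
 */
static int example_discard_one_page(RAMBlock *rb, uint64_t offset)
{
    size_t page = qemu_ram_pagesize(rb);

    /* Round down so both start and length satisfy the alignment checks. */
    return ram_block_discard_range(rb, QEMU_ALIGN_DOWN(offset, page), page);
}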
   3672 
   3673 bool ramblock_is_pmem(RAMBlock *rb)
   3674 {
   3675     return rb->flags & RAM_PMEM;
   3676 }
   3677 
   3678 static void mtree_print_phys_entries(int start, int end, int skip, int ptr)
   3679 {
   3680     if (start == end - 1) {
   3681         qemu_printf("\t%3d      ", start);
   3682     } else {
   3683         qemu_printf("\t%3d..%-3d ", start, end - 1);
   3684     }
   3685     qemu_printf(" skip=%d ", skip);
   3686     if (ptr == PHYS_MAP_NODE_NIL) {
   3687         qemu_printf(" ptr=NIL");
   3688     } else if (!skip) {
   3689         qemu_printf(" ptr=#%d", ptr);
   3690     } else {
   3691         qemu_printf(" ptr=[%d]", ptr);
   3692     }
   3693     qemu_printf("\n");
   3694 }
   3695 
   3696 #define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \
   3697                            int128_sub((size), int128_one())) : 0)
   3698 
   3699 void mtree_print_dispatch(AddressSpaceDispatch *d, MemoryRegion *root)
   3700 {
   3701     int i;
   3702 
   3703     qemu_printf("  Dispatch\n");
   3704     qemu_printf("    Physical sections\n");
   3705 
   3706     for (i = 0; i < d->map.sections_nb; ++i) {
   3707         MemoryRegionSection *s = d->map.sections + i;
   3708         const char *names[] = { " [unassigned]", " [not dirty]",
   3709                                 " [ROM]", " [watch]" };
   3710 
   3711         qemu_printf("      #%d @" TARGET_FMT_plx ".." TARGET_FMT_plx
   3712                     " %s%s%s%s%s",
   3713             i,
   3714             s->offset_within_address_space,
   3715             s->offset_within_address_space + MR_SIZE(s->size),
   3716             s->mr->name ? s->mr->name : "(noname)",
   3717             i < ARRAY_SIZE(names) ? names[i] : "",
   3718             s->mr == root ? " [ROOT]" : "",
   3719             s == d->mru_section ? " [MRU]" : "",
   3720             s->mr->is_iommu ? " [iommu]" : "");
   3721 
   3722         if (s->mr->alias) {
   3723             qemu_printf(" alias=%s", s->mr->alias->name ?
   3724                     s->mr->alias->name : "noname");
   3725         }
   3726         qemu_printf("\n");
   3727     }
   3728 
   3729     qemu_printf("    Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n",
   3730                P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip);
   3731     for (i = 0; i < d->map.nodes_nb; ++i) {
   3732         int j, jprev;
   3733         PhysPageEntry prev;
   3734         Node *n = d->map.nodes + i;
   3735 
   3736         qemu_printf("      [%d]\n", i);
   3737 
   3738         for (j = 0, jprev = 0, prev = *n[0]; j < ARRAY_SIZE(*n); ++j) {
   3739             PhysPageEntry *pe = *n + j;
   3740 
   3741             if (pe->ptr == prev.ptr && pe->skip == prev.skip) {
   3742                 continue;
   3743             }
   3744 
   3745             mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
   3746 
   3747             jprev = j;
   3748             prev = *pe;
   3749         }
   3750 
   3751         if (jprev != ARRAY_SIZE(*n)) {
   3752             mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
   3753         }
   3754     }
   3755 }
   3756 
   3757 /* Require any discards to work. */
   3758 static unsigned int ram_block_discard_required_cnt;
   3759 /* Require only coordinated discards to work. */
   3760 static unsigned int ram_block_coordinated_discard_required_cnt;
   3761 /* Disable any discards. */
   3762 static unsigned int ram_block_discard_disabled_cnt;
   3763 /* Disable only uncoordinated discards. */
   3764 static unsigned int ram_block_uncoordinated_discard_disabled_cnt;
   3765 static QemuMutex ram_block_discard_disable_mutex;
   3766 
   3767 static void ram_block_discard_disable_mutex_lock(void)
   3768 {
   3769     static gsize initialized;
   3770 
   3771     if (g_once_init_enter(&initialized)) {
   3772         qemu_mutex_init(&ram_block_discard_disable_mutex);
   3773         g_once_init_leave(&initialized, 1);
   3774     }
   3775     qemu_mutex_lock(&ram_block_discard_disable_mutex);
   3776 }
   3777 
   3778 static void ram_block_discard_disable_mutex_unlock(void)
   3779 {
   3780     qemu_mutex_unlock(&ram_block_discard_disable_mutex);
   3781 }
   3782 
   3783 int ram_block_discard_disable(bool state)
   3784 {
   3785     int ret = 0;
   3786 
   3787     ram_block_discard_disable_mutex_lock();
   3788     if (!state) {
   3789         ram_block_discard_disabled_cnt--;
   3790     } else if (ram_block_discard_required_cnt ||
   3791                ram_block_coordinated_discard_required_cnt) {
   3792         ret = -EBUSY;
   3793     } else {
   3794         ram_block_discard_disabled_cnt++;
   3795     }
   3796     ram_block_discard_disable_mutex_unlock();
   3797     return ret;
   3798 }
   3799 
   3800 int ram_block_uncoordinated_discard_disable(bool state)
   3801 {
   3802     int ret = 0;
   3803 
   3804     ram_block_discard_disable_mutex_lock();
   3805     if (!state) {
   3806         ram_block_uncoordinated_discard_disabled_cnt--;
   3807     } else if (ram_block_discard_required_cnt) {
   3808         ret = -EBUSY;
   3809     } else {
   3810         ram_block_uncoordinated_discard_disabled_cnt++;
   3811     }
   3812     ram_block_discard_disable_mutex_unlock();
   3813     return ret;
   3814 }
   3815 
   3816 int ram_block_discard_require(bool state)
   3817 {
   3818     int ret = 0;
   3819 
   3820     ram_block_discard_disable_mutex_lock();
   3821     if (!state) {
   3822         ram_block_discard_required_cnt--;
   3823     } else if (ram_block_discard_disabled_cnt ||
   3824                ram_block_uncoordinated_discard_disabled_cnt) {
   3825         ret = -EBUSY;
   3826     } else {
   3827         ram_block_discard_required_cnt++;
   3828     }
   3829     ram_block_discard_disable_mutex_unlock();
   3830     return ret;
   3831 }
   3832 
   3833 int ram_block_coordinated_discard_require(bool state)
   3834 {
   3835     int ret = 0;
   3836 
   3837     ram_block_discard_disable_mutex_lock();
   3838     if (!state) {
   3839         ram_block_coordinated_discard_required_cnt--;
   3840     } else if (ram_block_discard_disabled_cnt) {
   3841         ret = -EBUSY;
   3842     } else {
   3843         ram_block_coordinated_discard_required_cnt++;
   3844     }
   3845     ram_block_discard_disable_mutex_unlock();
   3846     return ret;
   3847 }
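/*
 * Illustrative sketch, not part of the original file: how the require/disable
 * counters above are meant to be paired.  A user whose correctness depends on
 * discards (virtio-mem-like) takes a "require" reference; a user that would
 * lose track of discarded pages (e.g. long-term pinning for device
 * passthrough) takes a "disable" reference.  Whichever side arrives second
 * gets -EBUSY.  The plug/unplug wrapper is hypothetical.
 */
static int example_plug_pinning_device(void)
{
    int ret = ram_block_discard_disable(true);

    if (ret) {
        error_report("example: RAM discards are required by another user");
        return ret;                         /* -EBUSY */
    }
    /* ... device setup; the matching unplug calls ram_block_discard_disable(false). */
    return 0;
}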
   3848 
   3849 bool ram_block_discard_is_disabled(void)
   3850 {
   3851     return qatomic_read(&ram_block_discard_disabled_cnt) ||
   3852            qatomic_read(&ram_block_uncoordinated_discard_disabled_cnt);
   3853 }
   3854 
   3855 bool ram_block_discard_is_required(void)
   3856 {
   3857     return qatomic_read(&ram_block_discard_required_cnt) ||
   3858            qatomic_read(&ram_block_coordinated_discard_required_cnt);
   3859 }