qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git

mmap-alloc.c (8575B)


/*
 * Support for RAM backed by mmapped host memory.
 *
 * Copyright (c) 2015 Red Hat, Inc.
 *
 * Authors:
 *  Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 */

#ifdef CONFIG_LINUX
#include <linux/mman.h>
#else  /* !CONFIG_LINUX */
#define MAP_SYNC              0x0
#define MAP_SHARED_VALIDATE   0x0
#endif /* CONFIG_LINUX */

#include "qemu/osdep.h"
#include "qemu/mmap-alloc.h"
#include "qemu/host-utils.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"

#define HUGETLBFS_MAGIC       0x958458f6

#ifdef CONFIG_LINUX
#include <sys/vfs.h>
#endif

size_t qemu_fd_getpagesize(int fd)
{
#ifdef CONFIG_LINUX
    struct statfs fs;
    int ret;

    if (fd != -1) {
        do {
            ret = fstatfs(fd, &fs);
        } while (ret != 0 && errno == EINTR);

        if (ret == 0 && fs.f_type == HUGETLBFS_MAGIC) {
            return fs.f_bsize;
        }
    }
#ifdef __sparc__
    /* SPARC Linux needs greater alignment than the pagesize */
    return QEMU_VMALLOC_ALIGN;
#endif
#endif

    return qemu_real_host_page_size();
}
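
/*
 * Illustrative sketch only, not part of the original file: how a caller
 * might use qemu_fd_getpagesize() to size a mapping backed by a hugetlbfs
 * file. The path and function name are hypothetical; error handling and
 * fd cleanup are omitted for brevity.
 */
static void * G_GNUC_UNUSED example_map_hugetlb_file(size_t size)
{
    int fd = open("/dev/hugepages/guest-ram", O_RDWR); /* hypothetical path */

    if (fd < 0) {
        return MAP_FAILED;
    }
    /* Returns the hugetlbfs block size (e.g. 2 MiB), else the host page size. */
    size_t pagesize = qemu_fd_getpagesize(fd);

    /* hugetlbfs requires the mapping length to be a multiple of its page size. */
    return mmap(NULL, ROUND_UP(size, pagesize),
                PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}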

#define OVERCOMMIT_MEMORY_PATH "/proc/sys/vm/overcommit_memory"
static bool map_noreserve_effective(int fd, uint32_t qemu_map_flags)
{
#if defined(__linux__)
    const bool readonly = qemu_map_flags & QEMU_MAP_READONLY;
    const bool shared = qemu_map_flags & QEMU_MAP_SHARED;
    gchar *content = NULL;
    const char *endptr;
    unsigned int tmp;

    /*
     * hugetlb accounting differs from ordinary swap reservation:
     * a) Hugetlb pages from the pool are reserved for both private and
     *    shared mappings. For shared mappings, all mappers have to specify
     *    MAP_NORESERVE.
     * b) MAP_NORESERVE is not affected by /proc/sys/vm/overcommit_memory.
     */
    if (qemu_fd_getpagesize(fd) != qemu_real_host_page_size()) {
        return true;
    }

    /*
     * Accountable mappings in the kernel that can be affected by
     * MAP_NORESERVE are private writable mappings (see
     * mm/mmap.c:accountable_mapping() in Linux). For all shared or readonly
     * mappings, MAP_NORESERVE is always implicitly active -- no reservation;
     * this includes shmem. The only exception is shared anonymous memory,
     * which is accounted like private anonymous memory.
     */
    if (readonly || (shared && fd >= 0)) {
        return true;
    }

    /*
     * MAP_NORESERVE is globally ignored for applicable !hugetlb mappings when
     * memory overcommit is set to "never". Sparse memory regions aren't really
     * possible in this system configuration.
     *
     * Bail out now instead of silently committing way more memory than
     * currently desired by the user.
     */
    if (g_file_get_contents(OVERCOMMIT_MEMORY_PATH, &content, NULL, NULL) &&
        !qemu_strtoui(content, &endptr, 0, &tmp) &&
        (!endptr || *endptr == '\n')) {
        if (tmp == 2) {
            error_report("Skipping reservation of swap space is not supported:"
                         " \"" OVERCOMMIT_MEMORY_PATH "\" is \"2\"");
            return false;
        }
        return true;
    }
    /* this interface has been around since Linux 2.6 */
    error_report("Skipping reservation of swap space is not supported:"
                 " Could not read: \"" OVERCOMMIT_MEMORY_PATH "\"");
    return false;
#endif
    /*
     * E.g., FreeBSD used to define MAP_NORESERVE, never implemented it,
     * and removed it a while ago.
     */
    error_report("Skipping reservation of swap space is not supported");
    return false;
}
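
/*
 * Illustrative sketch only, not part of the original file: the overcommit
 * check above distinguishes the three /proc/sys/vm/overcommit_memory modes.
 * A standalone probe might look like this; the function name is hypothetical.
 */
static bool G_GNUC_UNUSED example_overcommit_is_never(void)
{
    gchar *content = NULL;
    const char *endptr;
    unsigned int mode;
    bool never = false;

    /* 0 = heuristic overcommit, 1 = always overcommit, 2 = never (strict) */
    if (g_file_get_contents(OVERCOMMIT_MEMORY_PATH, &content, NULL, NULL) &&
        !qemu_strtoui(content, &endptr, 0, &mode) && *endptr == '\n') {
        never = (mode == 2);
    }
    g_free(content);
    return never;
}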

/*
 * Reserve a new memory region of the requested size to be used for mapping
 * from the given fd (if any).
 */
static void *mmap_reserve(size_t size, int fd)
{
    int flags = MAP_PRIVATE;

#if defined(__powerpc64__) && defined(__linux__)
    /*
     * On ppc64, mappings in the same segment (aka slice) must share the same
     * page size. Since we will be re-allocating part of this segment
     * from the supplied fd, we should make sure to use the same page size;
     * to this end, we mmap the supplied fd. In this case, set MAP_NORESERVE
     * to avoid allocating backing store memory.
     * We do this unless we are using the system page size, in which case
     * anonymous memory is OK.
     */
    if (fd == -1 || qemu_fd_getpagesize(fd) == qemu_real_host_page_size()) {
        fd = -1;
        flags |= MAP_ANONYMOUS;
    } else {
        flags |= MAP_NORESERVE;
    }
#else
    fd = -1;
    flags |= MAP_ANONYMOUS;
#endif

    return mmap(0, size, PROT_NONE, flags, fd, 0);
}

/*
 * Activate memory in a reserved region from the given fd (if any), to make
 * it accessible.
 */
static void *mmap_activate(void *ptr, size_t size, int fd,
                           uint32_t qemu_map_flags, off_t map_offset)
{
    const bool noreserve = qemu_map_flags & QEMU_MAP_NORESERVE;
    const bool readonly = qemu_map_flags & QEMU_MAP_READONLY;
    const bool shared = qemu_map_flags & QEMU_MAP_SHARED;
    const bool sync = qemu_map_flags & QEMU_MAP_SYNC;
    const int prot = PROT_READ | (readonly ? 0 : PROT_WRITE);
    int map_sync_flags = 0;
    int flags = MAP_FIXED;
    void *activated_ptr;

    if (noreserve && !map_noreserve_effective(fd, qemu_map_flags)) {
        return MAP_FAILED;
    }

    flags |= fd == -1 ? MAP_ANONYMOUS : 0;
    flags |= shared ? MAP_SHARED : MAP_PRIVATE;
    flags |= noreserve ? MAP_NORESERVE : 0;
    if (shared && sync) {
        map_sync_flags = MAP_SYNC | MAP_SHARED_VALIDATE;
    }

    activated_ptr = mmap(ptr, size, prot, flags | map_sync_flags, fd,
                         map_offset);
    if (activated_ptr == MAP_FAILED && map_sync_flags) {
        if (errno == ENOTSUP) {
            char *proc_link = g_strdup_printf("/proc/self/fd/%d", fd);
            char *file_name = g_malloc0(PATH_MAX);
            int len = readlink(proc_link, file_name, PATH_MAX - 1);

            if (len < 0) {
                len = 0;
            }
            file_name[len] = '\0';
            fprintf(stderr, "Warning: requesting persistence across crashes "
                    "for backend file %s failed. Proceeding without "
                    "persistence; data might become corrupted in case of host "
                    "crash.\n", file_name);
            g_free(proc_link);
            g_free(file_name);
            warn_report("Using a non-DAX backing file with the 'pmem=on'"
                        " option is deprecated");
        }
        /*
         * If mmap failed with MAP_SHARED_VALIDATE | MAP_SYNC, try again
         * without these flags for backwards compatibility.
         */
        activated_ptr = mmap(ptr, size, prot, flags, fd, map_offset);
    }
    return activated_ptr;
}
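
/*
 * Illustrative sketch only, not part of the original file: the
 * reserve-then-activate pattern used above, in miniature. A PROT_NONE
 * reservation pins down an address range; mmap(MAP_FIXED) then replaces it
 * in place with an accessible mapping. The function name is hypothetical.
 */
static void * G_GNUC_UNUSED example_reserve_then_activate(size_t size)
{
    /* Step 1: reserve address space only; nothing is accessible yet. */
    void *reserved = mmap(NULL, size, PROT_NONE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (reserved == MAP_FAILED) {
        return MAP_FAILED;
    }

    /*
     * Step 2: activate in place. MAP_FIXED atomically replaces the
     * PROT_NONE reservation, with no window where the range is unmapped.
     */
    return mmap(reserved, size, PROT_READ | PROT_WRITE,
                MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
}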

static inline size_t mmap_guard_pagesize(int fd)
{
#if defined(__powerpc64__) && defined(__linux__)
    /* Mappings in the same segment must share the same page size */
    return qemu_fd_getpagesize(fd);
#else
    return qemu_real_host_page_size();
#endif
}

void *qemu_ram_mmap(int fd,
                    size_t size,
                    size_t align,
                    uint32_t qemu_map_flags,
                    off_t map_offset)
{
    const size_t guard_pagesize = mmap_guard_pagesize(fd);
    size_t offset, total;
    void *ptr, *guardptr;

    /*
     * Note: this always allocates at least one extra page of virtual address
     * space, even if size is already aligned.
     */
    total = size + align;

    guardptr = mmap_reserve(total, fd);
    if (guardptr == MAP_FAILED) {
        return MAP_FAILED;
    }

    assert(is_power_of_2(align));
    /* Always align to host page size */
    assert(align >= guard_pagesize);

    offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;

    ptr = mmap_activate(guardptr + offset, size, fd, qemu_map_flags,
                        map_offset);
    if (ptr == MAP_FAILED) {
        munmap(guardptr, total);
        return MAP_FAILED;
    }

    if (offset > 0) {
        munmap(guardptr, offset);
    }

    /*
     * Leave a single PROT_NONE page allocated after the RAM block, to serve
     * as a guard page against potential buffer overflows.
     */
    total -= offset;
    if (total > size + guard_pagesize) {
        munmap(ptr + size + guard_pagesize, total - size - guard_pagesize);
    }

    return ptr;
}
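
/*
 * Worked example, illustrative only with assumed numbers: suppose size is
 * 5 * 4 KiB = 20 KiB, align is 2 MiB, the host page size is 4 KiB, and
 * mmap_reserve() returns guardptr = 0x7f0000100000. Then:
 *
 *   total  = 20 KiB + 2 MiB                    (reservation = size + align)
 *   offset = 0x7f0000200000 - 0x7f0000100000   (round up to 2 MiB boundary)
 *          = 1 MiB                             -> unmapped as the head
 *   ptr    = guardptr + 1 MiB                  (the aligned RAM block)
 *
 * After trimming, [ptr, ptr + 20 KiB) is accessible, the 4 KiB page after it
 * stays PROT_NONE as the guard page, and the remaining ~1 MiB tail of the
 * reservation is unmapped again.
 */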

void qemu_ram_munmap(int fd, void *ptr, size_t size)
{
    if (ptr) {
        /* Unmap both the RAM block and the guard page */
        munmap(ptr, size + mmap_guard_pagesize(fd));
    }
}
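
/*
 * Illustrative sketch only, not part of the original file: allocating and
 * freeing an anonymous, 2 MiB-aligned RAM block with the two functions
 * above. The function name is hypothetical.
 */
static void G_GNUC_UNUSED example_ram_mmap_usage(void)
{
    const size_t size = 16 * 1024 * 1024;      /* 16 MiB of guest RAM */
    const size_t align = 2 * 1024 * 1024;      /* power of 2, >= guard page */

    /* fd == -1 selects anonymous memory; a shared mapping would need an fd. */
    void *ram = qemu_ram_mmap(-1, size, align, 0, 0);

    if (ram == MAP_FAILED) {
        return;
    }

    /* ... use [ram, ram + size) as guest RAM ... */

    /* Unmaps the block and its trailing guard page in one call. */
    qemu_ram_munmap(-1, ram, size);
}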