qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

virtio-balloon.c (34649B)


      1 /*
      2  * Virtio Balloon Device
      3  *
      4  * Copyright IBM, Corp. 2008
      5  * Copyright (C) 2011 Red Hat, Inc.
      6  * Copyright (C) 2011 Amit Shah <amit.shah@redhat.com>
      7  *
      8  * Authors:
      9  *  Anthony Liguori   <aliguori@us.ibm.com>
     10  *
     11  * This work is licensed under the terms of the GNU GPL, version 2.  See
     12  * the COPYING file in the top-level directory.
     13  *
     14  */
     15 
     16 #include "qemu/osdep.h"
     17 #include "qemu/iov.h"
     18 #include "qemu/module.h"
     19 #include "qemu/timer.h"
     20 #include "qemu/madvise.h"
     21 #include "hw/virtio/virtio.h"
     22 #include "hw/mem/pc-dimm.h"
     23 #include "hw/qdev-properties.h"
     24 #include "hw/boards.h"
     25 #include "sysemu/balloon.h"
     26 #include "hw/virtio/virtio-balloon.h"
     27 #include "exec/address-spaces.h"
     28 #include "qapi/error.h"
     29 #include "qapi/qapi-events-machine.h"
     30 #include "qapi/visitor.h"
     31 #include "trace.h"
     32 #include "qemu/error-report.h"
     33 #include "migration/misc.h"
     34 #include "migration/migration.h"
     35 
     36 #include "hw/virtio/virtio-bus.h"
     37 #include "hw/virtio/virtio-access.h"
     38 
     39 #define BALLOON_PAGE_SIZE  (1 << VIRTIO_BALLOON_PFN_SHIFT)
     40 
/*
 * Bookkeeping for a host page that is larger than BALLOON_PAGE_SIZE and
 * has only been partially ballooned so far.
 */
typedef struct PartiallyBalloonedPage {
    /* Address of the start of the host page being tracked. */
    ram_addr_t base_gpa;
    /* One bit per balloon-sized subpage; NULL when nothing is tracked. */
    unsigned long *bitmap;
} PartiallyBalloonedPage;
     45 
     46 static void virtio_balloon_pbp_free(PartiallyBalloonedPage *pbp)
     47 {
     48     if (!pbp->bitmap) {
     49         return;
     50     }
     51     g_free(pbp->bitmap);
     52     pbp->bitmap = NULL;
     53 }
     54 
     55 static void virtio_balloon_pbp_alloc(PartiallyBalloonedPage *pbp,
     56                                      ram_addr_t base_gpa,
     57                                      long subpages)
     58 {
     59     pbp->base_gpa = base_gpa;
     60     pbp->bitmap = bitmap_new(subpages);
     61 }
     62 
     63 static bool virtio_balloon_pbp_matches(PartiallyBalloonedPage *pbp,
     64                                        ram_addr_t base_gpa)
     65 {
     66     return pbp->base_gpa == base_gpa;
     67 }
     68 
     69 static bool virtio_balloon_inhibited(void)
     70 {
     71     /*
     72      * Postcopy cannot deal with concurrent discards,
     73      * so it's special, as well as background snapshots.
     74      */
     75     return ram_block_discard_is_disabled() || migration_in_incoming_postcopy() ||
     76             migration_in_bg_snapshot();
     77 }
     78 
     79 static void balloon_inflate_page(VirtIOBalloon *balloon,
     80                                  MemoryRegion *mr, hwaddr mr_offset,
     81                                  PartiallyBalloonedPage *pbp)
     82 {
     83     void *addr = memory_region_get_ram_ptr(mr) + mr_offset;
     84     ram_addr_t rb_offset, rb_aligned_offset, base_gpa;
     85     RAMBlock *rb;
     86     size_t rb_page_size;
     87     int subpages;
     88 
     89     /* XXX is there a better way to get to the RAMBlock than via a
     90      * host address? */
     91     rb = qemu_ram_block_from_host(addr, false, &rb_offset);
     92     rb_page_size = qemu_ram_pagesize(rb);
     93 
     94     if (rb_page_size == BALLOON_PAGE_SIZE) {
     95         /* Easy case */
     96 
     97         ram_block_discard_range(rb, rb_offset, rb_page_size);
     98         /* We ignore errors from ram_block_discard_range(), because it
     99          * has already reported them, and failing to discard a balloon
    100          * page is not fatal */
    101         return;
    102     }
    103 
    104     /* Hard case
    105      *
    106      * We've put a piece of a larger host page into the balloon - we
    107      * need to keep track until we have a whole host page to
    108      * discard
    109      */
    110     warn_report_once(
    111 "Balloon used with backing page size > 4kiB, this may not be reliable");
    112 
    113     rb_aligned_offset = QEMU_ALIGN_DOWN(rb_offset, rb_page_size);
    114     subpages = rb_page_size / BALLOON_PAGE_SIZE;
    115     base_gpa = memory_region_get_ram_addr(mr) + mr_offset -
    116                (rb_offset - rb_aligned_offset);
    117 
    118     if (pbp->bitmap && !virtio_balloon_pbp_matches(pbp, base_gpa)) {
    119         /* We've partially ballooned part of a host page, but now
    120          * we're trying to balloon part of a different one.  Too hard,
    121          * give up on the old partial page */
    122         virtio_balloon_pbp_free(pbp);
    123     }
    124 
    125     if (!pbp->bitmap) {
    126         virtio_balloon_pbp_alloc(pbp, base_gpa, subpages);
    127     }
    128 
    129     set_bit((rb_offset - rb_aligned_offset) / BALLOON_PAGE_SIZE,
    130             pbp->bitmap);
    131 
    132     if (bitmap_full(pbp->bitmap, subpages)) {
    133         /* We've accumulated a full host page, we can actually discard
    134          * it now */
    135 
    136         ram_block_discard_range(rb, rb_aligned_offset, rb_page_size);
    137         /* We ignore errors from ram_block_discard_range(), because it
    138          * has already reported them, and failing to discard a balloon
    139          * page is not fatal */
    140         virtio_balloon_pbp_free(pbp);
    141     }
    142 }
    143 
    144 static void balloon_deflate_page(VirtIOBalloon *balloon,
    145                                  MemoryRegion *mr, hwaddr mr_offset)
    146 {
    147     void *addr = memory_region_get_ram_ptr(mr) + mr_offset;
    148     ram_addr_t rb_offset;
    149     RAMBlock *rb;
    150     size_t rb_page_size;
    151     void *host_addr;
    152     int ret;
    153 
    154     /* XXX is there a better way to get to the RAMBlock than via a
    155      * host address? */
    156     rb = qemu_ram_block_from_host(addr, false, &rb_offset);
    157     rb_page_size = qemu_ram_pagesize(rb);
    158 
    159     host_addr = (void *)((uintptr_t)addr & ~(rb_page_size - 1));
    160 
    161     /* When a page is deflated, we hint the whole host page it lives
    162      * on, since we can't do anything smaller */
    163     ret = qemu_madvise(host_addr, rb_page_size, QEMU_MADV_WILLNEED);
    164     if (ret != 0) {
    165         warn_report("Couldn't MADV_WILLNEED on balloon deflate: %s",
    166                     strerror(errno));
    167         /* Otherwise ignore, failing to page hint shouldn't be fatal */
    168     }
    169 }
    170 
    171 static const char *balloon_stat_names[] = {
    172    [VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in",
    173    [VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out",
    174    [VIRTIO_BALLOON_S_MAJFLT] = "stat-major-faults",
    175    [VIRTIO_BALLOON_S_MINFLT] = "stat-minor-faults",
    176    [VIRTIO_BALLOON_S_MEMFREE] = "stat-free-memory",
    177    [VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory",
    178    [VIRTIO_BALLOON_S_AVAIL] = "stat-available-memory",
    179    [VIRTIO_BALLOON_S_CACHES] = "stat-disk-caches",
    180    [VIRTIO_BALLOON_S_HTLB_PGALLOC] = "stat-htlb-pgalloc",
    181    [VIRTIO_BALLOON_S_HTLB_PGFAIL] = "stat-htlb-pgfail",
    182    [VIRTIO_BALLOON_S_NR] = NULL
    183 };
    184 
    185 /*
    186  * reset_stats - Mark all items in the stats array as unset
    187  *
    188  * This function needs to be called at device initialization and before
    189  * updating to a set of newly-generated stats.  This will ensure that no
    190  * stale values stick around in case the guest reports a subset of the supported
    191  * statistics.
    192  */
    193 static inline void reset_stats(VirtIOBalloon *dev)
    194 {
    195     int i;
    196     for (i = 0; i < VIRTIO_BALLOON_S_NR; dev->stats[i++] = -1);
    197 }
    198 
    199 static bool balloon_stats_supported(const VirtIOBalloon *s)
    200 {
    201     VirtIODevice *vdev = VIRTIO_DEVICE(s);
    202     return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ);
    203 }
    204 
    205 static bool balloon_stats_enabled(const VirtIOBalloon *s)
    206 {
    207     return s->stats_poll_interval > 0;
    208 }
    209 
    210 static void balloon_stats_destroy_timer(VirtIOBalloon *s)
    211 {
    212     if (balloon_stats_enabled(s)) {
    213         timer_free(s->stats_timer);
    214         s->stats_timer = NULL;
    215         s->stats_poll_interval = 0;
    216     }
    217 }
    218 
    219 static void balloon_stats_change_timer(VirtIOBalloon *s, int64_t secs)
    220 {
    221     timer_mod(s->stats_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + secs * 1000);
    222 }
    223 
    224 static void balloon_stats_poll_cb(void *opaque)
    225 {
    226     VirtIOBalloon *s = opaque;
    227     VirtIODevice *vdev = VIRTIO_DEVICE(s);
    228 
    229     if (s->stats_vq_elem == NULL || !balloon_stats_supported(s)) {
    230         /* re-schedule */
    231         balloon_stats_change_timer(s, s->stats_poll_interval);
    232         return;
    233     }
    234 
    235     virtqueue_push(s->svq, s->stats_vq_elem, 0);
    236     virtio_notify(vdev, s->svq);
    237     g_free(s->stats_vq_elem);
    238     s->stats_vq_elem = NULL;
    239 }
    240 
    241 static void balloon_stats_get_all(Object *obj, Visitor *v, const char *name,
    242                                   void *opaque, Error **errp)
    243 {
    244     Error *err = NULL;
    245     VirtIOBalloon *s = VIRTIO_BALLOON(obj);
    246     int i;
    247 
    248     if (!visit_start_struct(v, name, NULL, 0, &err)) {
    249         goto out;
    250     }
    251     if (!visit_type_int(v, "last-update", &s->stats_last_update, &err)) {
    252         goto out_end;
    253     }
    254 
    255     if (!visit_start_struct(v, "stats", NULL, 0, &err)) {
    256         goto out_end;
    257     }
    258     for (i = 0; i < VIRTIO_BALLOON_S_NR; i++) {
    259         if (!visit_type_uint64(v, balloon_stat_names[i], &s->stats[i], &err)) {
    260             goto out_nested;
    261         }
    262     }
    263     visit_check_struct(v, &err);
    264 out_nested:
    265     visit_end_struct(v, NULL);
    266 
    267     if (!err) {
    268         visit_check_struct(v, &err);
    269     }
    270 out_end:
    271     visit_end_struct(v, NULL);
    272 out:
    273     error_propagate(errp, err);
    274 }
    275 
    276 static void balloon_stats_get_poll_interval(Object *obj, Visitor *v,
    277                                             const char *name, void *opaque,
    278                                             Error **errp)
    279 {
    280     VirtIOBalloon *s = VIRTIO_BALLOON(obj);
    281     visit_type_int(v, name, &s->stats_poll_interval, errp);
    282 }
    283 
    284 static void balloon_stats_set_poll_interval(Object *obj, Visitor *v,
    285                                             const char *name, void *opaque,
    286                                             Error **errp)
    287 {
    288     VirtIOBalloon *s = VIRTIO_BALLOON(obj);
    289     int64_t value;
    290 
    291     if (!visit_type_int(v, name, &value, errp)) {
    292         return;
    293     }
    294 
    295     if (value < 0) {
    296         error_setg(errp, "timer value must be greater than zero");
    297         return;
    298     }
    299 
    300     if (value > UINT32_MAX) {
    301         error_setg(errp, "timer value is too big");
    302         return;
    303     }
    304 
    305     if (value == s->stats_poll_interval) {
    306         return;
    307     }
    308 
    309     if (value == 0) {
    310         /* timer=0 disables the timer */
    311         balloon_stats_destroy_timer(s);
    312         return;
    313     }
    314 
    315     if (balloon_stats_enabled(s)) {
    316         /* timer interval change */
    317         s->stats_poll_interval = value;
    318         balloon_stats_change_timer(s, value);
    319         return;
    320     }
    321 
    322     /* create a new timer */
    323     g_assert(s->stats_timer == NULL);
    324     s->stats_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, balloon_stats_poll_cb, s);
    325     s->stats_poll_interval = value;
    326     balloon_stats_change_timer(s, 0);
    327 }
    328 
    329 static void virtio_balloon_handle_report(VirtIODevice *vdev, VirtQueue *vq)
    330 {
    331     VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
    332     VirtQueueElement *elem;
    333 
    334     while ((elem = virtqueue_pop(vq, sizeof(VirtQueueElement)))) {
    335         unsigned int i;
    336 
    337         /*
    338          * When we discard the page it has the effect of removing the page
    339          * from the hypervisor itself and causing it to be zeroed when it
    340          * is returned to us. So we must not discard the page if it is
    341          * accessible by another device or process, or if the guest is
    342          * expecting it to retain a non-zero value.
    343          */
    344         if (virtio_balloon_inhibited() || dev->poison_val) {
    345             goto skip_element;
    346         }
    347 
    348         for (i = 0; i < elem->in_num; i++) {
    349             void *addr = elem->in_sg[i].iov_base;
    350             size_t size = elem->in_sg[i].iov_len;
    351             ram_addr_t ram_offset;
    352             RAMBlock *rb;
    353 
    354             /*
    355              * There is no need to check the memory section to see if
    356              * it is ram/readonly/romd like there is for handle_output
    357              * below. If the region is not meant to be written to then
    358              * address_space_map will have allocated a bounce buffer
    359              * and it will be freed in address_space_unmap and trigger
    360              * and unassigned_mem_write before failing to copy over the
    361              * buffer. If more than one bad descriptor is provided it
    362              * will return NULL after the first bounce buffer and fail
    363              * to map any resources.
    364              */
    365             rb = qemu_ram_block_from_host(addr, false, &ram_offset);
    366             if (!rb) {
    367                 trace_virtio_balloon_bad_addr(elem->in_addr[i]);
    368                 continue;
    369             }
    370 
    371             /*
    372              * For now we will simply ignore unaligned memory regions, or
    373              * regions that overrun the end of the RAMBlock.
    374              */
    375             if (!QEMU_IS_ALIGNED(ram_offset | size, qemu_ram_pagesize(rb)) ||
    376                 (ram_offset + size) > qemu_ram_get_used_length(rb)) {
    377                 continue;
    378             }
    379 
    380             ram_block_discard_range(rb, ram_offset, size);
    381         }
    382 
    383 skip_element:
    384         virtqueue_push(vq, elem, 0);
    385         virtio_notify(vdev, vq);
    386         g_free(elem);
    387     }
    388 }
    389 
    390 static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
    391 {
    392     VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
    393     VirtQueueElement *elem;
    394     MemoryRegionSection section;
    395 
    396     for (;;) {
    397         PartiallyBalloonedPage pbp = {};
    398         size_t offset = 0;
    399         uint32_t pfn;
    400 
    401         elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
    402         if (!elem) {
    403             break;
    404         }
    405 
    406         while (iov_to_buf(elem->out_sg, elem->out_num, offset, &pfn, 4) == 4) {
    407             unsigned int p = virtio_ldl_p(vdev, &pfn);
    408             hwaddr pa;
    409 
    410             pa = (hwaddr) p << VIRTIO_BALLOON_PFN_SHIFT;
    411             offset += 4;
    412 
    413             section = memory_region_find(get_system_memory(), pa,
    414                                          BALLOON_PAGE_SIZE);
    415             if (!section.mr) {
    416                 trace_virtio_balloon_bad_addr(pa);
    417                 continue;
    418             }
    419             if (!memory_region_is_ram(section.mr) ||
    420                 memory_region_is_rom(section.mr) ||
    421                 memory_region_is_romd(section.mr)) {
    422                 trace_virtio_balloon_bad_addr(pa);
    423                 memory_region_unref(section.mr);
    424                 continue;
    425             }
    426 
    427             trace_virtio_balloon_handle_output(memory_region_name(section.mr),
    428                                                pa);
    429             if (!virtio_balloon_inhibited()) {
    430                 if (vq == s->ivq) {
    431                     balloon_inflate_page(s, section.mr,
    432                                          section.offset_within_region, &pbp);
    433                 } else if (vq == s->dvq) {
    434                     balloon_deflate_page(s, section.mr, section.offset_within_region);
    435                 } else {
    436                     g_assert_not_reached();
    437                 }
    438             }
    439             memory_region_unref(section.mr);
    440         }
    441 
    442         virtqueue_push(vq, elem, 0);
    443         virtio_notify(vdev, vq);
    444         g_free(elem);
    445         virtio_balloon_pbp_free(&pbp);
    446     }
    447 }
    448 
    449 static void virtio_balloon_receive_stats(VirtIODevice *vdev, VirtQueue *vq)
    450 {
    451     VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
    452     VirtQueueElement *elem;
    453     VirtIOBalloonStat stat;
    454     size_t offset = 0;
    455 
    456     elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
    457     if (!elem) {
    458         goto out;
    459     }
    460 
    461     if (s->stats_vq_elem != NULL) {
    462         /* This should never happen if the driver follows the spec. */
    463         virtqueue_push(vq, s->stats_vq_elem, 0);
    464         virtio_notify(vdev, vq);
    465         g_free(s->stats_vq_elem);
    466     }
    467 
    468     s->stats_vq_elem = elem;
    469 
    470     /* Initialize the stats to get rid of any stale values.  This is only
    471      * needed to handle the case where a guest supports fewer stats than it
    472      * used to (ie. it has booted into an old kernel).
    473      */
    474     reset_stats(s);
    475 
    476     while (iov_to_buf(elem->out_sg, elem->out_num, offset, &stat, sizeof(stat))
    477            == sizeof(stat)) {
    478         uint16_t tag = virtio_tswap16(vdev, stat.tag);
    479         uint64_t val = virtio_tswap64(vdev, stat.val);
    480 
    481         offset += sizeof(stat);
    482         if (tag < VIRTIO_BALLOON_S_NR)
    483             s->stats[tag] = val;
    484     }
    485     s->stats_vq_offset = offset;
    486     s->stats_last_update = g_get_real_time() / G_USEC_PER_SEC;
    487 
    488 out:
    489     if (balloon_stats_enabled(s)) {
    490         balloon_stats_change_timer(s, s->stats_poll_interval);
    491     }
    492 }
    493 
    494 static void virtio_balloon_handle_free_page_vq(VirtIODevice *vdev,
    495                                                VirtQueue *vq)
    496 {
    497     VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
    498     qemu_bh_schedule(s->free_page_bh);
    499 }
    500 
    501 static bool get_free_page_hints(VirtIOBalloon *dev)
    502 {
    503     VirtQueueElement *elem;
    504     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    505     VirtQueue *vq = dev->free_page_vq;
    506     bool ret = true;
    507     int i;
    508 
    509     while (dev->block_iothread) {
    510         qemu_cond_wait(&dev->free_page_cond, &dev->free_page_lock);
    511     }
    512 
    513     elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
    514     if (!elem) {
    515         return false;
    516     }
    517 
    518     if (elem->out_num) {
    519         uint32_t id;
    520         size_t size = iov_to_buf(elem->out_sg, elem->out_num, 0,
    521                                  &id, sizeof(id));
    522 
    523         virtio_tswap32s(vdev, &id);
    524         if (unlikely(size != sizeof(id))) {
    525             virtio_error(vdev, "received an incorrect cmd id");
    526             ret = false;
    527             goto out;
    528         }
    529         if (dev->free_page_hint_status == FREE_PAGE_HINT_S_REQUESTED &&
    530             id == dev->free_page_hint_cmd_id) {
    531             dev->free_page_hint_status = FREE_PAGE_HINT_S_START;
    532         } else if (dev->free_page_hint_status == FREE_PAGE_HINT_S_START) {
    533             /*
    534              * Stop the optimization only when it has started. This
    535              * avoids a stale stop sign for the previous command.
    536              */
    537             dev->free_page_hint_status = FREE_PAGE_HINT_S_STOP;
    538         }
    539     }
    540 
    541     if (elem->in_num && dev->free_page_hint_status == FREE_PAGE_HINT_S_START) {
    542         for (i = 0; i < elem->in_num; i++) {
    543             qemu_guest_free_page_hint(elem->in_sg[i].iov_base,
    544                                       elem->in_sg[i].iov_len);
    545         }
    546     }
    547 
    548 out:
    549     virtqueue_push(vq, elem, 0);
    550     g_free(elem);
    551     return ret;
    552 }
    553 
    554 static void virtio_ballloon_get_free_page_hints(void *opaque)
    555 {
    556     VirtIOBalloon *dev = opaque;
    557     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    558     VirtQueue *vq = dev->free_page_vq;
    559     bool continue_to_get_hints;
    560 
    561     do {
    562         qemu_mutex_lock(&dev->free_page_lock);
    563         virtio_queue_set_notification(vq, 0);
    564         continue_to_get_hints = get_free_page_hints(dev);
    565         qemu_mutex_unlock(&dev->free_page_lock);
    566         virtio_notify(vdev, vq);
    567       /*
    568        * Start to poll the vq once the hinting started. Otherwise, continue
    569        * only when there are entries on the vq, which need to be given back.
    570        */
    571     } while (continue_to_get_hints ||
    572              dev->free_page_hint_status == FREE_PAGE_HINT_S_START);
    573     virtio_queue_set_notification(vq, 1);
    574 }
    575 
    576 static bool virtio_balloon_free_page_support(void *opaque)
    577 {
    578     VirtIOBalloon *s = opaque;
    579     VirtIODevice *vdev = VIRTIO_DEVICE(s);
    580 
    581     return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT);
    582 }
    583 
    584 static void virtio_balloon_free_page_start(VirtIOBalloon *s)
    585 {
    586     VirtIODevice *vdev = VIRTIO_DEVICE(s);
    587 
    588     qemu_mutex_lock(&s->free_page_lock);
    589 
    590     if (s->free_page_hint_cmd_id == UINT_MAX) {
    591         s->free_page_hint_cmd_id = VIRTIO_BALLOON_FREE_PAGE_HINT_CMD_ID_MIN;
    592     } else {
    593         s->free_page_hint_cmd_id++;
    594     }
    595 
    596     s->free_page_hint_status = FREE_PAGE_HINT_S_REQUESTED;
    597     qemu_mutex_unlock(&s->free_page_lock);
    598 
    599     virtio_notify_config(vdev);
    600 }
    601 
    602 static void virtio_balloon_free_page_stop(VirtIOBalloon *s)
    603 {
    604     VirtIODevice *vdev = VIRTIO_DEVICE(s);
    605 
    606     if (s->free_page_hint_status != FREE_PAGE_HINT_S_STOP) {
    607         /*
    608          * The lock also guarantees us that the
    609          * virtio_ballloon_get_free_page_hints exits after the
    610          * free_page_hint_status is set to S_STOP.
    611          */
    612         qemu_mutex_lock(&s->free_page_lock);
    613         /*
    614          * The guest isn't done hinting, so send a notification
    615          * to the guest to actively stop the hinting.
    616          */
    617         s->free_page_hint_status = FREE_PAGE_HINT_S_STOP;
    618         qemu_mutex_unlock(&s->free_page_lock);
    619         virtio_notify_config(vdev);
    620     }
    621 }
    622 
    623 static void virtio_balloon_free_page_done(VirtIOBalloon *s)
    624 {
    625     VirtIODevice *vdev = VIRTIO_DEVICE(s);
    626 
    627     if (s->free_page_hint_status != FREE_PAGE_HINT_S_DONE) {
    628         /* See virtio_balloon_free_page_stop() */
    629         qemu_mutex_lock(&s->free_page_lock);
    630         s->free_page_hint_status = FREE_PAGE_HINT_S_DONE;
    631         qemu_mutex_unlock(&s->free_page_lock);
    632         virtio_notify_config(vdev);
    633     }
    634 }
    635 
    636 static int
    637 virtio_balloon_free_page_hint_notify(NotifierWithReturn *n, void *data)
    638 {
    639     VirtIOBalloon *dev = container_of(n, VirtIOBalloon, free_page_hint_notify);
    640     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    641     PrecopyNotifyData *pnd = data;
    642 
    643     if (!virtio_balloon_free_page_support(dev)) {
    644         /*
    645          * This is an optimization provided to migration, so just return 0 to
    646          * have the normal migration process not affected when this feature is
    647          * not supported.
    648          */
    649         return 0;
    650     }
    651 
    652     /*
    653      * Pages hinted via qemu_guest_free_page_hint() are cleared from the dirty
    654      * bitmap and will not get migrated, especially also not when the postcopy
    655      * destination starts using them and requests migration from the source; the
    656      * faulting thread will stall until postcopy migration finishes and
    657      * all threads are woken up. Let's not start free page hinting if postcopy
    658      * is possible.
    659      */
    660     if (migrate_postcopy_ram()) {
    661         return 0;
    662     }
    663 
    664     switch (pnd->reason) {
    665     case PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC:
    666         virtio_balloon_free_page_stop(dev);
    667         break;
    668     case PRECOPY_NOTIFY_AFTER_BITMAP_SYNC:
    669         if (vdev->vm_running) {
    670             virtio_balloon_free_page_start(dev);
    671             break;
    672         }
    673         /*
    674          * Set S_DONE before migrating the vmstate, so the guest will reuse
    675          * all hinted pages once running on the destination. Fall through.
    676          */
    677     case PRECOPY_NOTIFY_CLEANUP:
    678         /*
    679          * Especially, if something goes wrong during precopy or if migration
    680          * is canceled, we have to properly communicate S_DONE to the VM.
    681          */
    682         virtio_balloon_free_page_done(dev);
    683         break;
    684     case PRECOPY_NOTIFY_SETUP:
    685     case PRECOPY_NOTIFY_COMPLETE:
    686         break;
    687     default:
    688         virtio_error(vdev, "%s: %d reason unknown", __func__, pnd->reason);
    689     }
    690 
    691     return 0;
    692 }
    693 
    694 static size_t virtio_balloon_config_size(VirtIOBalloon *s)
    695 {
    696     uint64_t features = s->host_features;
    697 
    698     if (s->qemu_4_0_config_size) {
    699         return sizeof(struct virtio_balloon_config);
    700     }
    701     if (virtio_has_feature(features, VIRTIO_BALLOON_F_PAGE_POISON)) {
    702         return sizeof(struct virtio_balloon_config);
    703     }
    704     if (virtio_has_feature(features, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
    705         return offsetof(struct virtio_balloon_config, poison_val);
    706     }
    707     return offsetof(struct virtio_balloon_config, free_page_hint_cmd_id);
    708 }
    709 
    710 static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
    711 {
    712     VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
    713     struct virtio_balloon_config config = {};
    714 
    715     config.num_pages = cpu_to_le32(dev->num_pages);
    716     config.actual = cpu_to_le32(dev->actual);
    717     config.poison_val = cpu_to_le32(dev->poison_val);
    718 
    719     if (dev->free_page_hint_status == FREE_PAGE_HINT_S_REQUESTED) {
    720         config.free_page_hint_cmd_id =
    721                        cpu_to_le32(dev->free_page_hint_cmd_id);
    722     } else if (dev->free_page_hint_status == FREE_PAGE_HINT_S_STOP) {
    723         config.free_page_hint_cmd_id =
    724                        cpu_to_le32(VIRTIO_BALLOON_CMD_ID_STOP);
    725     } else if (dev->free_page_hint_status == FREE_PAGE_HINT_S_DONE) {
    726         config.free_page_hint_cmd_id =
    727                        cpu_to_le32(VIRTIO_BALLOON_CMD_ID_DONE);
    728     }
    729 
    730     trace_virtio_balloon_get_config(config.num_pages, config.actual);
    731     memcpy(config_data, &config, virtio_balloon_config_size(dev));
    732 }
    733 
    734 static int build_dimm_list(Object *obj, void *opaque)
    735 {
    736     GSList **list = opaque;
    737 
    738     if (object_dynamic_cast(obj, TYPE_PC_DIMM)) {
    739         DeviceState *dev = DEVICE(obj);
    740         if (dev->realized) { /* only realized DIMMs matter */
    741             *list = g_slist_prepend(*list, dev);
    742         }
    743     }
    744 
    745     object_child_foreach(obj, build_dimm_list, opaque);
    746     return 0;
    747 }
    748 
    749 static ram_addr_t get_current_ram_size(void)
    750 {
    751     GSList *list = NULL, *item;
    752     ram_addr_t size = current_machine->ram_size;
    753 
    754     build_dimm_list(qdev_get_machine(), &list);
    755     for (item = list; item; item = g_slist_next(item)) {
    756         Object *obj = OBJECT(item->data);
    757         if (!strcmp(object_get_typename(obj), TYPE_PC_DIMM)) {
    758             size += object_property_get_int(obj, PC_DIMM_SIZE_PROP,
    759                                             &error_abort);
    760         }
    761     }
    762     g_slist_free(list);
    763 
    764     return size;
    765 }
    766 
    767 static bool virtio_balloon_page_poison_support(void *opaque)
    768 {
    769     VirtIOBalloon *s = opaque;
    770     VirtIODevice *vdev = VIRTIO_DEVICE(s);
    771 
    772     return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON);
    773 }
    774 
    775 static void virtio_balloon_set_config(VirtIODevice *vdev,
    776                                       const uint8_t *config_data)
    777 {
    778     VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
    779     struct virtio_balloon_config config;
    780     uint32_t oldactual = dev->actual;
    781     ram_addr_t vm_ram_size = get_current_ram_size();
    782 
    783     memcpy(&config, config_data, virtio_balloon_config_size(dev));
    784     dev->actual = le32_to_cpu(config.actual);
    785     if (dev->actual != oldactual) {
    786         qapi_event_send_balloon_change(vm_ram_size -
    787                         ((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT));
    788     }
    789     dev->poison_val = 0;
    790     if (virtio_balloon_page_poison_support(dev)) {
    791         dev->poison_val = le32_to_cpu(config.poison_val);
    792     }
    793     trace_virtio_balloon_set_config(dev->actual, oldactual);
    794 }
    795 
    796 static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f,
    797                                             Error **errp)
    798 {
    799     VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
    800     f |= dev->host_features;
    801     virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ);
    802 
    803     return f;
    804 }
    805 
    806 static void virtio_balloon_stat(void *opaque, BalloonInfo *info)
    807 {
    808     VirtIOBalloon *dev = opaque;
    809     info->actual = get_current_ram_size() - ((uint64_t) dev->actual <<
    810                                              VIRTIO_BALLOON_PFN_SHIFT);
    811 }
    812 
    813 static void virtio_balloon_to_target(void *opaque, ram_addr_t target)
    814 {
    815     VirtIOBalloon *dev = VIRTIO_BALLOON(opaque);
    816     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    817     ram_addr_t vm_ram_size = get_current_ram_size();
    818 
    819     if (target > vm_ram_size) {
    820         target = vm_ram_size;
    821     }
    822     if (target) {
    823         dev->num_pages = (vm_ram_size - target) >> VIRTIO_BALLOON_PFN_SHIFT;
    824         virtio_notify_config(vdev);
    825     }
    826     trace_virtio_balloon_to_target(target, dev->num_pages);
    827 }
    828 
    829 static int virtio_balloon_post_load_device(void *opaque, int version_id)
    830 {
    831     VirtIOBalloon *s = VIRTIO_BALLOON(opaque);
    832 
    833     if (balloon_stats_enabled(s)) {
    834         balloon_stats_change_timer(s, s->stats_poll_interval);
    835     }
    836     return 0;
    837 }
    838 
    839 static const VMStateDescription vmstate_virtio_balloon_free_page_hint = {
    840     .name = "virtio-balloon-device/free-page-report",
    841     .version_id = 1,
    842     .minimum_version_id = 1,
    843     .needed = virtio_balloon_free_page_support,
    844     .fields = (VMStateField[]) {
    845         VMSTATE_UINT32(free_page_hint_cmd_id, VirtIOBalloon),
    846         VMSTATE_UINT32(free_page_hint_status, VirtIOBalloon),
    847         VMSTATE_END_OF_LIST()
    848     }
    849 };
    850 
    851 static const VMStateDescription vmstate_virtio_balloon_page_poison = {
    852     .name = "virtio-balloon-device/page-poison",
    853     .version_id = 1,
    854     .minimum_version_id = 1,
    855     .needed = virtio_balloon_page_poison_support,
    856     .fields = (VMStateField[]) {
    857         VMSTATE_UINT32(poison_val, VirtIOBalloon),
    858         VMSTATE_END_OF_LIST()
    859     }
    860 };
    861 
    862 static const VMStateDescription vmstate_virtio_balloon_device = {
    863     .name = "virtio-balloon-device",
    864     .version_id = 1,
    865     .minimum_version_id = 1,
    866     .post_load = virtio_balloon_post_load_device,
    867     .fields = (VMStateField[]) {
    868         VMSTATE_UINT32(num_pages, VirtIOBalloon),
    869         VMSTATE_UINT32(actual, VirtIOBalloon),
    870         VMSTATE_END_OF_LIST()
    871     },
    872     .subsections = (const VMStateDescription * []) {
    873         &vmstate_virtio_balloon_free_page_hint,
    874         &vmstate_virtio_balloon_page_poison,
    875         NULL
    876     }
    877 };
    878 
    879 static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
    880 {
    881     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    882     VirtIOBalloon *s = VIRTIO_BALLOON(dev);
    883     int ret;
    884 
    885     virtio_init(vdev, VIRTIO_ID_BALLOON, virtio_balloon_config_size(s));
    886 
    887     ret = qemu_add_balloon_handler(virtio_balloon_to_target,
    888                                    virtio_balloon_stat, s);
    889 
    890     if (ret < 0) {
    891         error_setg(errp, "Only one balloon device is supported");
    892         virtio_cleanup(vdev);
    893         return;
    894     }
    895 
    896     if (virtio_has_feature(s->host_features, VIRTIO_BALLOON_F_FREE_PAGE_HINT) &&
    897         !s->iothread) {
    898         error_setg(errp, "'free-page-hint' requires 'iothread' to be set");
    899         virtio_cleanup(vdev);
    900         return;
    901     }
    902 
    903     s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
    904     s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
    905     s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats);
    906 
    907     if (virtio_has_feature(s->host_features, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
    908         s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE,
    909                                            virtio_balloon_handle_free_page_vq);
    910         precopy_add_notifier(&s->free_page_hint_notify);
    911 
    912         object_ref(OBJECT(s->iothread));
    913         s->free_page_bh = aio_bh_new(iothread_get_aio_context(s->iothread),
    914                                      virtio_ballloon_get_free_page_hints, s);
    915     }
    916 
    917     if (virtio_has_feature(s->host_features, VIRTIO_BALLOON_F_REPORTING)) {
    918         s->reporting_vq = virtio_add_queue(vdev, 32,
    919                                            virtio_balloon_handle_report);
    920     }
    921 
    922     reset_stats(s);
    923 }
    924 
    925 static void virtio_balloon_device_unrealize(DeviceState *dev)
    926 {
    927     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    928     VirtIOBalloon *s = VIRTIO_BALLOON(dev);
    929 
    930     if (s->free_page_bh) {
    931         qemu_bh_delete(s->free_page_bh);
    932         object_unref(OBJECT(s->iothread));
    933         virtio_balloon_free_page_stop(s);
    934         precopy_remove_notifier(&s->free_page_hint_notify);
    935     }
    936     balloon_stats_destroy_timer(s);
    937     qemu_remove_balloon_handler(s);
    938 
    939     virtio_delete_queue(s->ivq);
    940     virtio_delete_queue(s->dvq);
    941     virtio_delete_queue(s->svq);
    942     if (s->free_page_vq) {
    943         virtio_delete_queue(s->free_page_vq);
    944     }
    945     if (s->reporting_vq) {
    946         virtio_delete_queue(s->reporting_vq);
    947     }
    948     virtio_cleanup(vdev);
    949 }
    950 
    951 static void virtio_balloon_device_reset(VirtIODevice *vdev)
    952 {
    953     VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
    954 
    955     if (virtio_balloon_free_page_support(s)) {
    956         virtio_balloon_free_page_stop(s);
    957     }
    958 
    959     if (s->stats_vq_elem != NULL) {
    960         virtqueue_unpop(s->svq, s->stats_vq_elem, 0);
    961         g_free(s->stats_vq_elem);
    962         s->stats_vq_elem = NULL;
    963     }
    964 
    965     s->poison_val = 0;
    966 }
    967 
    968 static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status)
    969 {
    970     VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
    971 
    972     if (!s->stats_vq_elem && vdev->vm_running &&
    973         (status & VIRTIO_CONFIG_S_DRIVER_OK) && virtqueue_rewind(s->svq, 1)) {
    974         /* poll stats queue for the element we have discarded when the VM
    975          * was stopped */
    976         virtio_balloon_receive_stats(vdev, s->svq);
    977     }
    978 
    979     if (virtio_balloon_free_page_support(s)) {
    980         /*
    981          * The VM is woken up and the iothread was blocked, so signal it to
    982          * continue.
    983          */
    984         if (vdev->vm_running && s->block_iothread) {
    985             qemu_mutex_lock(&s->free_page_lock);
    986             s->block_iothread = false;
    987             qemu_cond_signal(&s->free_page_cond);
    988             qemu_mutex_unlock(&s->free_page_lock);
    989         }
    990 
    991         /* The VM is stopped, block the iothread. */
    992         if (!vdev->vm_running) {
    993             qemu_mutex_lock(&s->free_page_lock);
    994             s->block_iothread = true;
    995             qemu_mutex_unlock(&s->free_page_lock);
    996         }
    997     }
    998 }
    999 
   1000 static void virtio_balloon_instance_init(Object *obj)
   1001 {
   1002     VirtIOBalloon *s = VIRTIO_BALLOON(obj);
   1003 
   1004     qemu_mutex_init(&s->free_page_lock);
   1005     qemu_cond_init(&s->free_page_cond);
   1006     s->free_page_hint_cmd_id = VIRTIO_BALLOON_FREE_PAGE_HINT_CMD_ID_MIN;
   1007     s->free_page_hint_notify.notify = virtio_balloon_free_page_hint_notify;
   1008 
   1009     object_property_add(obj, "guest-stats", "guest statistics",
   1010                         balloon_stats_get_all, NULL, NULL, NULL);
   1011 
   1012     object_property_add(obj, "guest-stats-polling-interval", "int",
   1013                         balloon_stats_get_poll_interval,
   1014                         balloon_stats_set_poll_interval,
   1015                         NULL, NULL);
   1016 }
   1017 
   1018 static const VMStateDescription vmstate_virtio_balloon = {
   1019     .name = "virtio-balloon",
   1020     .minimum_version_id = 1,
   1021     .version_id = 1,
   1022     .fields = (VMStateField[]) {
   1023         VMSTATE_VIRTIO_DEVICE,
   1024         VMSTATE_END_OF_LIST()
   1025     },
   1026 };
   1027 
   1028 static Property virtio_balloon_properties[] = {
   1029     DEFINE_PROP_BIT("deflate-on-oom", VirtIOBalloon, host_features,
   1030                     VIRTIO_BALLOON_F_DEFLATE_ON_OOM, false),
   1031     DEFINE_PROP_BIT("free-page-hint", VirtIOBalloon, host_features,
   1032                     VIRTIO_BALLOON_F_FREE_PAGE_HINT, false),
   1033     DEFINE_PROP_BIT("page-poison", VirtIOBalloon, host_features,
   1034                     VIRTIO_BALLOON_F_PAGE_POISON, true),
   1035     DEFINE_PROP_BIT("free-page-reporting", VirtIOBalloon, host_features,
   1036                     VIRTIO_BALLOON_F_REPORTING, false),
   1037     /* QEMU 4.0 accidentally changed the config size even when free-page-hint
   1038      * is disabled, resulting in QEMU 3.1 migration incompatibility.  This
   1039      * property retains this quirk for QEMU 4.1 machine types.
   1040      */
   1041     DEFINE_PROP_BOOL("qemu-4-0-config-size", VirtIOBalloon,
   1042                      qemu_4_0_config_size, false),
   1043     DEFINE_PROP_LINK("iothread", VirtIOBalloon, iothread, TYPE_IOTHREAD,
   1044                      IOThread *),
   1045     DEFINE_PROP_END_OF_LIST(),
   1046 };
   1047 
   1048 static void virtio_balloon_class_init(ObjectClass *klass, void *data)
   1049 {
   1050     DeviceClass *dc = DEVICE_CLASS(klass);
   1051     VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
   1052 
   1053     device_class_set_props(dc, virtio_balloon_properties);
   1054     dc->vmsd = &vmstate_virtio_balloon;
   1055     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
   1056     vdc->realize = virtio_balloon_device_realize;
   1057     vdc->unrealize = virtio_balloon_device_unrealize;
   1058     vdc->reset = virtio_balloon_device_reset;
   1059     vdc->get_config = virtio_balloon_get_config;
   1060     vdc->set_config = virtio_balloon_set_config;
   1061     vdc->get_features = virtio_balloon_get_features;
   1062     vdc->set_status = virtio_balloon_set_status;
   1063     vdc->vmsd = &vmstate_virtio_balloon_device;
   1064 }
   1065 
   1066 static const TypeInfo virtio_balloon_info = {
   1067     .name = TYPE_VIRTIO_BALLOON,
   1068     .parent = TYPE_VIRTIO_DEVICE,
   1069     .instance_size = sizeof(VirtIOBalloon),
   1070     .instance_init = virtio_balloon_instance_init,
   1071     .class_init = virtio_balloon_class_init,
   1072 };
   1073 
   1074 static void virtio_register_types(void)
   1075 {
   1076     type_register_static(&virtio_balloon_info);
   1077 }
   1078 
   1079 type_init(virtio_register_types)