qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git

ram.c (145264B)


      1 /*
      2  * QEMU System Emulator
      3  *
      4  * Copyright (c) 2003-2008 Fabrice Bellard
      5  * Copyright (c) 2011-2015 Red Hat Inc
      6  *
      7  * Authors:
      8  *  Juan Quintela <quintela@redhat.com>
      9  *
     10  * Permission is hereby granted, free of charge, to any person obtaining a copy
     11  * of this software and associated documentation files (the "Software"), to deal
     12  * in the Software without restriction, including without limitation the rights
     13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     14  * copies of the Software, and to permit persons to whom the Software is
     15  * furnished to do so, subject to the following conditions:
     16  *
     17  * The above copyright notice and this permission notice shall be included in
     18  * all copies or substantial portions of the Software.
     19  *
     20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     26  * THE SOFTWARE.
     27  */
     28 
     29 #include "qemu/osdep.h"
     30 #include "qemu/cutils.h"
     31 #include "qemu/bitops.h"
     32 #include "qemu/bitmap.h"
     33 #include "qemu/madvise.h"
     34 #include "qemu/main-loop.h"
     35 #include "io/channel-null.h"
     36 #include "xbzrle.h"
     37 #include "ram.h"
     38 #include "migration.h"
     39 #include "migration/register.h"
     40 #include "migration/misc.h"
     41 #include "qemu-file.h"
     42 #include "postcopy-ram.h"
     43 #include "page_cache.h"
     44 #include "qemu/error-report.h"
     45 #include "qapi/error.h"
     46 #include "qapi/qapi-types-migration.h"
     47 #include "qapi/qapi-events-migration.h"
     48 #include "qapi/qmp/qerror.h"
     49 #include "trace.h"
     50 #include "exec/ram_addr.h"
     51 #include "exec/target_page.h"
     52 #include "qemu/rcu_queue.h"
     53 #include "migration/colo.h"
     54 #include "block.h"
     55 #include "sysemu/cpu-throttle.h"
     56 #include "savevm.h"
     57 #include "qemu/iov.h"
     58 #include "multifd.h"
     59 #include "sysemu/runstate.h"
     60 
     61 #include "hw/boards.h" /* for machine_dump_guest_core() */
     62 
     63 #if defined(__linux__)
     64 #include "qemu/userfaultfd.h"
     65 #endif /* defined(__linux__) */
     66 
     67 /***********************************************************/
     68 /* ram save/restore */
     69 
     70 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, which
     71  * worked for pages that were filled with the same char.  We switched
     72  * it to only search for the zero value, and renamed it to avoid
     73  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
     74  */
     75 
     76 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
     77 #define RAM_SAVE_FLAG_ZERO     0x02
     78 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
     79 #define RAM_SAVE_FLAG_PAGE     0x08
     80 #define RAM_SAVE_FLAG_EOS      0x10
     81 #define RAM_SAVE_FLAG_CONTINUE 0x20
     82 #define RAM_SAVE_FLAG_XBZRLE   0x40
     83 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
     84 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
     85 
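These flags are carried in the low bits of the 64-bit offset word that save_page_header() below writes for every page; because page offsets are page aligned, those low bits are otherwise unused. A minimal standalone sketch of that packing, assuming 4 KiB target pages (the sketch_* names are illustrative, not QEMU identifiers):

#include <stdint.h>

#define SKETCH_PAGE_MASK  (~0xfffULL)   /* page-aligned bits, assuming 4 KiB pages */
#define SKETCH_FLAG_ZERO  0x02ULL       /* mirrors RAM_SAVE_FLAG_ZERO above        */

/* Combine a page-aligned offset with its save flags, as done before qemu_put_be64(). */
static inline uint64_t sketch_pack(uint64_t page_offset, uint64_t flags)
{
    return (page_offset & SKETCH_PAGE_MASK) | flags;
}

static inline uint64_t sketch_offset(uint64_t word) { return word & SKETCH_PAGE_MASK; }
static inline uint64_t sketch_flags(uint64_t word)  { return word & ~SKETCH_PAGE_MASK; }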
     86 XBZRLECacheStats xbzrle_counters;
     87 
     88 /* struct containing the XBZRLE cache and a static page
     89    used by the compression */
     90 static struct {
     91     /* buffer used for XBZRLE encoding */
     92     uint8_t *encoded_buf;
     93     /* buffer for storing page content */
     94     uint8_t *current_buf;
     95     /* Cache for XBZRLE, Protected by lock. */
     96     PageCache *cache;
     97     QemuMutex lock;
     98     /* it will store a page full of zeros */
     99     uint8_t *zero_target_page;
    100     /* buffer used for XBZRLE decoding */
    101     uint8_t *decoded_buf;
    102 } XBZRLE;
    103 
    104 static void XBZRLE_cache_lock(void)
    105 {
    106     if (migrate_use_xbzrle()) {
    107         qemu_mutex_lock(&XBZRLE.lock);
    108     }
    109 }
    110 
    111 static void XBZRLE_cache_unlock(void)
    112 {
    113     if (migrate_use_xbzrle()) {
    114         qemu_mutex_unlock(&XBZRLE.lock);
    115     }
    116 }
    117 
    118 /**
    119  * xbzrle_cache_resize: resize the xbzrle cache
    120  *
    121  * This function is called from migrate_params_apply in main
    122  * thread, possibly while a migration is in progress.  A running
    123  * migration may be using the cache and might finish during this call,
    124  * hence changes to the cache are protected by the XBZRLE.lock mutex.
    125  *
    126  * Returns 0 for success or -1 for error
    127  *
    128  * @new_size: new cache size
    129  * @errp: set *errp if the check failed, with reason
    130  */
    131 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
    132 {
    133     PageCache *new_cache;
    134     int64_t ret = 0;
    135 
    136     /* Check for truncation */
    137     if (new_size != (size_t)new_size) {
    138         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
    139                    "exceeding address space");
    140         return -1;
    141     }
    142 
    143     if (new_size == migrate_xbzrle_cache_size()) {
    144         /* nothing to do */
    145         return 0;
    146     }
    147 
    148     XBZRLE_cache_lock();
    149 
    150     if (XBZRLE.cache != NULL) {
    151         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
    152         if (!new_cache) {
    153             ret = -1;
    154             goto out;
    155         }
    156 
    157         cache_fini(XBZRLE.cache);
    158         XBZRLE.cache = new_cache;
    159     }
    160 out:
    161     XBZRLE_cache_unlock();
    162     return ret;
    163 }
    164 
    165 bool ramblock_is_ignored(RAMBlock *block)
    166 {
    167     return !qemu_ram_is_migratable(block) ||
    168            (migrate_ignore_shared() && qemu_ram_is_shared(block));
    169 }
    170 
    171 #undef RAMBLOCK_FOREACH
    172 
    173 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
    174 {
    175     RAMBlock *block;
    176     int ret = 0;
    177 
    178     RCU_READ_LOCK_GUARD();
    179 
    180     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
    181         ret = func(block, opaque);
    182         if (ret) {
    183             break;
    184         }
    185     }
    186     return ret;
    187 }
    188 
    189 static void ramblock_recv_map_init(void)
    190 {
    191     RAMBlock *rb;
    192 
    193     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
    194         assert(!rb->receivedmap);
    195         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    196     }
    197 }
    198 
    199 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
    200 {
    201     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
    202                     rb->receivedmap);
    203 }
    204 
    205 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
    206 {
    207     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
    208 }
    209 
    210 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
    211 {
    212     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
    213 }
    214 
    215 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
    216                                     size_t nr)
    217 {
    218     bitmap_set_atomic(rb->receivedmap,
    219                       ramblock_recv_bitmap_offset(host_addr, rb),
    220                       nr);
    221 }
    222 
    223 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
    224 
    225 /*
    226  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
    227  *
    228  * Returns >0 (the number of bytes sent) on success, or <0 on error.
    229  */
    230 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
    231                                   const char *block_name)
    232 {
    233     RAMBlock *block = qemu_ram_block_by_name(block_name);
    234     unsigned long *le_bitmap, nbits;
    235     uint64_t size;
    236 
    237     if (!block) {
    238         error_report("%s: invalid block name: %s", __func__, block_name);
    239         return -1;
    240     }
    241 
    242     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
    243 
    244     /*
    245      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
    246      * machines we may need 4 more bytes for padding (see below
    247      * comment). So extend it a bit beforehand.
    248      */
    249     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
    250 
    251     /*
    252      * Always use little endian when sending the bitmap. This is
    253      * required because the source and destination VMs may not be using the
    254      * same endianness. (Note: big endian won't work.)
    255      */
    256     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
    257 
    258     /* Size of the bitmap, in bytes */
    259     size = DIV_ROUND_UP(nbits, 8);
    260 
    261     /*
    262      * size is always aligned to 8 bytes for 64bit machines, but it
    263      * may not be true for 32bit machines. We need this padding to
    264      * make sure the migration can survive even between 32bit and
    265      * 64bit machines.
    266      */
    267     size = ROUND_UP(size, 8);
    268 
    269     qemu_put_be64(file, size);
    270     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    271     /*
    272      * Mark as an end, in case the middle part is screwed up due to
    273      * some "mysterious" reason.
    274      */
    275     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    276     qemu_fflush(file);
    277 
    278     g_free(le_bitmap);
    279 
    280     if (qemu_file_get_error(file)) {
    281         return qemu_file_get_error(file);
    282     }
    283 
    284     return size + sizeof(size);
    285 }
    286 
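The stream therefore carries an 8-byte big-endian size, the little-endian bitmap padded to a multiple of 8 bytes, and the 8-byte RAMBLOCK_RECV_BITMAP_ENDING marker. A standalone sketch of the sizes involved, with local stand-ins for QEMU's DIV_ROUND_UP/ROUND_UP macros:

#include <stdint.h>

/* Padded bitmap payload for a block of 'nbits' target pages. */
static uint64_t sketch_recv_bitmap_payload(uint64_t nbits)
{
    uint64_t bytes = (nbits + 7) / 8;   /* DIV_ROUND_UP(nbits, 8)                      */
    return (bytes + 7) & ~7ULL;         /* ROUND_UP(bytes, 8): 32/64-bit host friendly */
}

/* Total bytes written to the stream: size field + payload + ending marker. */
static uint64_t sketch_recv_bitmap_wire_total(uint64_t nbits)
{
    return 8 + sketch_recv_bitmap_payload(nbits) + 8;
}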
    287 /*
    288  * An outstanding page request, on the source, having been received
    289  * and queued
    290  */
    291 struct RAMSrcPageRequest {
    292     RAMBlock *rb;
    293     hwaddr    offset;
    294     hwaddr    len;
    295 
    296     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
    297 };
    298 
    299 typedef struct {
    300     /*
    301      * Cached ramblock/offset values if preempted.  They're only meaningful if
    302      * preempted==true below.
    303      */
    304     RAMBlock *ram_block;
    305     unsigned long ram_page;
    306     /*
    307      * Whether a postcopy preemption just happened.  Will be reset after
    308      * precopy recovered to background migration.
    309      */
    310     bool preempted;
    311 } PostcopyPreemptState;
    312 
    313 /* State of RAM for migration */
    314 struct RAMState {
    315     /* QEMUFile used for this migration */
    316     QEMUFile *f;
    317     /* UFFD file descriptor, used in 'write-tracking' migration */
    318     int uffdio_fd;
    319     /* Last block that we have visited searching for dirty pages */
    320     RAMBlock *last_seen_block;
    321     /* Last block from where we have sent data */
    322     RAMBlock *last_sent_block;
    323     /* Last dirty target page we have sent */
    324     ram_addr_t last_page;
    325     /* last ram version we have seen */
    326     uint32_t last_version;
    327     /* How many times we have dirtied too many pages */
    328     int dirty_rate_high_cnt;
    329     /* these variables are used for bitmap sync */
    330     /* last time we did a full bitmap_sync */
    331     int64_t time_last_bitmap_sync;
    332     /* bytes transferred at start_time */
    333     uint64_t bytes_xfer_prev;
    334     /* number of dirty pages since start_time */
    335     uint64_t num_dirty_pages_period;
    336     /* xbzrle misses since the beginning of the period */
    337     uint64_t xbzrle_cache_miss_prev;
    338     /* Amount of xbzrle pages since the beginning of the period */
    339     uint64_t xbzrle_pages_prev;
    340     /* Amount of xbzrle encoded bytes since the beginning of the period */
    341     uint64_t xbzrle_bytes_prev;
    342     /* Start using XBZRLE (e.g., after the first round). */
    343     bool xbzrle_enabled;
    344     /* Are we on the last stage of migration */
    345     bool last_stage;
    346     /* compression statistics since the beginning of the period */
    347     /* number of times there was no free thread to compress data */
    348     uint64_t compress_thread_busy_prev;
    349     /* number of bytes after compression */
    350     uint64_t compressed_size_prev;
    351     /* number of compressed pages */
    352     uint64_t compress_pages_prev;
    353 
    354     /* total handled target pages at the beginning of period */
    355     uint64_t target_page_count_prev;
    356     /* total handled target pages since start */
    357     uint64_t target_page_count;
    358     /* number of dirty bits in the bitmap */
    359     uint64_t migration_dirty_pages;
    360     /* Protects modification of the bitmap and migration dirty pages */
    361     QemuMutex bitmap_mutex;
    362     /* The RAMBlock used in the last src_page_requests */
    363     RAMBlock *last_req_rb;
    364     /* Queue of outstanding page requests from the destination */
    365     QemuMutex src_page_req_mutex;
    366     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
    367 
    368     /* Postcopy preemption information */
    369     PostcopyPreemptState postcopy_preempt_state;
    370     /*
    371      * Current channel we're using on src VM.  Only valid if postcopy-preempt
    372      * is enabled.
    373      */
    374     unsigned int postcopy_channel;
    375 };
    376 typedef struct RAMState RAMState;
    377 
    378 static RAMState *ram_state;
    379 
    380 static NotifierWithReturnList precopy_notifier_list;
    381 
    382 static void postcopy_preempt_reset(RAMState *rs)
    383 {
    384     memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState));
    385 }
    386 
    387 /* Whether postcopy has queued requests */
    388 static bool postcopy_has_request(RAMState *rs)
    389 {
    390     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
    391 }
    392 
    393 void precopy_infrastructure_init(void)
    394 {
    395     notifier_with_return_list_init(&precopy_notifier_list);
    396 }
    397 
    398 void precopy_add_notifier(NotifierWithReturn *n)
    399 {
    400     notifier_with_return_list_add(&precopy_notifier_list, n);
    401 }
    402 
    403 void precopy_remove_notifier(NotifierWithReturn *n)
    404 {
    405     notifier_with_return_remove(n);
    406 }
    407 
    408 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
    409 {
    410     PrecopyNotifyData pnd;
    411     pnd.reason = reason;
    412     pnd.errp = errp;
    413 
    414     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
    415 }
    416 
    417 uint64_t ram_bytes_remaining(void)
    418 {
    419     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
    420                        0;
    421 }
    422 
    423 MigrationStats ram_counters;
    424 
    425 static void ram_transferred_add(uint64_t bytes)
    426 {
    427     if (runstate_is_running()) {
    428         ram_counters.precopy_bytes += bytes;
    429     } else if (migration_in_postcopy()) {
    430         ram_counters.postcopy_bytes += bytes;
    431     } else {
    432         ram_counters.downtime_bytes += bytes;
    433     }
    434     ram_counters.transferred += bytes;
    435 }
    436 
    437 void dirty_sync_missed_zero_copy(void)
    438 {
    439     ram_counters.dirty_sync_missed_zero_copy++;
    440 }
    441 
    442 /* used by the search for pages to send */
    443 struct PageSearchStatus {
    444     /* Current block being searched */
    445     RAMBlock    *block;
    446     /* Current page to search from */
    447     unsigned long page;
    448     /* Set once we wrap around */
    449     bool         complete_round;
    450     /*
    451      * [POSTCOPY-ONLY] Whether current page is explicitly requested by
    452      * postcopy.  When set, the request is "urgent" because the dest QEMU
    453      * threads are waiting for us.
    454      */
    455     bool         postcopy_requested;
    456     /*
    457      * [POSTCOPY-ONLY] The target channel to use to send current page.
    458      *
    459      * Note: This may _not_ match the value of postcopy_requested
    460      * above. Let's imagine the case where the postcopy request is exactly
    461      * the page that we're sending in progress during precopy. In this case
    462      * we'll have postcopy_requested set to true but the target channel
    463      * will be the precopy channel (so that we don't split-brain on that
    464      * specific page, since the precopy channel already contains part of
    465      * that page's data).
    466      *
    467      * Besides that specific use case, postcopy_target_channel should
    468      * always be equal to postcopy_requested, because by default we send
    469      * postcopy pages via postcopy preempt channel.
    470      */
    471     bool         postcopy_target_channel;
    472 };
    473 typedef struct PageSearchStatus PageSearchStatus;
    474 
    475 CompressionStats compression_counters;
    476 
    477 struct CompressParam {
    478     bool done;
    479     bool quit;
    480     bool zero_page;
    481     QEMUFile *file;
    482     QemuMutex mutex;
    483     QemuCond cond;
    484     RAMBlock *block;
    485     ram_addr_t offset;
    486 
    487     /* internally used fields */
    488     z_stream stream;
    489     uint8_t *originbuf;
    490 };
    491 typedef struct CompressParam CompressParam;
    492 
    493 struct DecompressParam {
    494     bool done;
    495     bool quit;
    496     QemuMutex mutex;
    497     QemuCond cond;
    498     void *des;
    499     uint8_t *compbuf;
    500     int len;
    501     z_stream stream;
    502 };
    503 typedef struct DecompressParam DecompressParam;
    504 
    505 static CompressParam *comp_param;
    506 static QemuThread *compress_threads;
    507 /* comp_done_cond is used to wake up the migration thread when
    508  * one of the compression threads has finished the compression.
    509  * comp_done_lock is used together with comp_done_cond.
    510  */
    511 static QemuMutex comp_done_lock;
    512 static QemuCond comp_done_cond;
    513 
    514 static QEMUFile *decomp_file;
    515 static DecompressParam *decomp_param;
    516 static QemuThread *decompress_threads;
    517 static QemuMutex decomp_done_lock;
    518 static QemuCond decomp_done_cond;
    519 
    520 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
    521                                  ram_addr_t offset, uint8_t *source_buf);
    522 
    523 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
    524                                      bool postcopy_requested);
    525 
    526 static void *do_data_compress(void *opaque)
    527 {
    528     CompressParam *param = opaque;
    529     RAMBlock *block;
    530     ram_addr_t offset;
    531     bool zero_page;
    532 
    533     qemu_mutex_lock(&param->mutex);
    534     while (!param->quit) {
    535         if (param->block) {
    536             block = param->block;
    537             offset = param->offset;
    538             param->block = NULL;
    539             qemu_mutex_unlock(&param->mutex);
    540 
    541             zero_page = do_compress_ram_page(param->file, &param->stream,
    542                                              block, offset, param->originbuf);
    543 
    544             qemu_mutex_lock(&comp_done_lock);
    545             param->done = true;
    546             param->zero_page = zero_page;
    547             qemu_cond_signal(&comp_done_cond);
    548             qemu_mutex_unlock(&comp_done_lock);
    549 
    550             qemu_mutex_lock(&param->mutex);
    551         } else {
    552             qemu_cond_wait(&param->cond, &param->mutex);
    553         }
    554     }
    555     qemu_mutex_unlock(&param->mutex);
    556 
    557     return NULL;
    558 }
    559 
    560 static void compress_threads_save_cleanup(void)
    561 {
    562     int i, thread_count;
    563 
    564     if (!migrate_use_compression() || !comp_param) {
    565         return;
    566     }
    567 
    568     thread_count = migrate_compress_threads();
    569     for (i = 0; i < thread_count; i++) {
    570         /*
    571          * we use it as an indicator of whether the thread has been
    572          * properly initialized or not
    573          */
    574         if (!comp_param[i].file) {
    575             break;
    576         }
    577 
    578         qemu_mutex_lock(&comp_param[i].mutex);
    579         comp_param[i].quit = true;
    580         qemu_cond_signal(&comp_param[i].cond);
    581         qemu_mutex_unlock(&comp_param[i].mutex);
    582 
    583         qemu_thread_join(compress_threads + i);
    584         qemu_mutex_destroy(&comp_param[i].mutex);
    585         qemu_cond_destroy(&comp_param[i].cond);
    586         deflateEnd(&comp_param[i].stream);
    587         g_free(comp_param[i].originbuf);
    588         qemu_fclose(comp_param[i].file);
    589         comp_param[i].file = NULL;
    590     }
    591     qemu_mutex_destroy(&comp_done_lock);
    592     qemu_cond_destroy(&comp_done_cond);
    593     g_free(compress_threads);
    594     g_free(comp_param);
    595     compress_threads = NULL;
    596     comp_param = NULL;
    597 }
    598 
    599 static int compress_threads_save_setup(void)
    600 {
    601     int i, thread_count;
    602 
    603     if (!migrate_use_compression()) {
    604         return 0;
    605     }
    606     thread_count = migrate_compress_threads();
    607     compress_threads = g_new0(QemuThread, thread_count);
    608     comp_param = g_new0(CompressParam, thread_count);
    609     qemu_cond_init(&comp_done_cond);
    610     qemu_mutex_init(&comp_done_lock);
    611     for (i = 0; i < thread_count; i++) {
    612         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
    613         if (!comp_param[i].originbuf) {
    614             goto exit;
    615         }
    616 
    617         if (deflateInit(&comp_param[i].stream,
    618                         migrate_compress_level()) != Z_OK) {
    619             g_free(comp_param[i].originbuf);
    620             goto exit;
    621         }
    622 
    623         /* comp_param[i].file is just used as a dummy buffer to save data;
    624          * back it with a null channel so nothing is actually sent.
    625          */
    626         comp_param[i].file = qemu_file_new_output(
    627             QIO_CHANNEL(qio_channel_null_new()));
    628         comp_param[i].done = true;
    629         comp_param[i].quit = false;
    630         qemu_mutex_init(&comp_param[i].mutex);
    631         qemu_cond_init(&comp_param[i].cond);
    632         qemu_thread_create(compress_threads + i, "compress",
    633                            do_data_compress, comp_param + i,
    634                            QEMU_THREAD_JOINABLE);
    635     }
    636     return 0;
    637 
    638 exit:
    639     compress_threads_save_cleanup();
    640     return -1;
    641 }
    642 
    643 /**
    644  * save_page_header: write page header to wire
    645  *
    646  * If the block is not the same as the last block sent, it also writes the block identification
    647  *
    648  * Returns the number of bytes written
    649  *
    650  * @f: QEMUFile where to send the data
    651  * @block: block that contains the page we want to send
    652  * @offset: offset inside the block for the page
    653  *          in the lower bits, it contains flags
    654  */
    655 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
    656                                ram_addr_t offset)
    657 {
    658     size_t size, len;
    659 
    660     if (block == rs->last_sent_block) {
    661         offset |= RAM_SAVE_FLAG_CONTINUE;
    662     }
    663     qemu_put_be64(f, offset);
    664     size = 8;
    665 
    666     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
    667         len = strlen(block->idstr);
    668         qemu_put_byte(f, len);
    669         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
    670         size += 1 + len;
    671         rs->last_sent_block = block;
    672     }
    673     return size;
    674 }
    675 
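On the wire the header is thus a big-endian 64-bit word holding the page offset with flags in its low bits, followed by a one-byte idstr length and the RAMBlock idstr only when RAM_SAVE_FLAG_CONTINUE is not set. A small sketch of the resulting header size (illustrative helper, not a QEMU function):

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* Bytes produced by save_page_header() for one page. */
static size_t sketch_page_header_size(bool same_block_as_last, const char *idstr)
{
    size_t size = 8;                    /* be64: offset | flags            */

    if (!same_block_as_last) {
        size += 1 + strlen(idstr);      /* length byte + block idstr bytes */
    }
    return size;
}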
    676 /**
    677  * mig_throttle_guest_down: throttle down the guest
    678  *
    679  * Reduce amount of guest cpu execution to hopefully slow down memory
    680  * writes. If guest dirty memory rate is reduced below the rate at
    681  * which we can transfer pages to the destination then we should be
    682  * able to complete migration. Some workloads dirty memory way too
    683  * fast and will not effectively converge, even with auto-converge.
    684  */
    685 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
    686                                     uint64_t bytes_dirty_threshold)
    687 {
    688     MigrationState *s = migrate_get_current();
    689     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    690     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
    691     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
    692     int pct_max = s->parameters.max_cpu_throttle;
    693 
    694     uint64_t throttle_now = cpu_throttle_get_percentage();
    695     uint64_t cpu_now, cpu_ideal, throttle_inc;
    696 
    697     /* We have not started throttling yet. Let's start it. */
    698     if (!cpu_throttle_active()) {
    699         cpu_throttle_set(pct_initial);
    700     } else {
    701         /* Throttling already on, just increase the rate */
    702         if (!pct_tailslow) {
    703             throttle_inc = pct_increment;
    704         } else {
    705             /* Compute the ideal CPU percentage used by the guest, which may
    706              * make the dirty rate match the dirty rate threshold. */
    707             cpu_now = 100 - throttle_now;
    708             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
    709                         bytes_dirty_period);
    710             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
    711         }
    712         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    713     }
    714 }
    715 
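With cpu_throttle_tailslow set, the increment is derived from how far the guest's dirty rate is above the threshold instead of using the fixed step. A standalone sketch of that arithmetic (hypothetical helper, mirroring the tailslow branch above):

#include <stdint.h>

static uint64_t sketch_tailslow_increment(uint64_t throttle_now_pct,
                                          uint64_t bytes_dirty_period,
                                          uint64_t bytes_dirty_threshold,
                                          uint64_t pct_increment)
{
    double cpu_now   = 100.0 - throttle_now_pct;
    /* CPU share that would bring the dirty rate down to the threshold */
    double cpu_ideal = cpu_now * ((double)bytes_dirty_threshold / bytes_dirty_period);
    double inc       = cpu_now - cpu_ideal;

    return inc < pct_increment ? (uint64_t)inc : pct_increment;
}

For example, at 20% throttle with 400 MB dirtied against a 100 MB threshold, cpu_now is 80 and cpu_ideal is 20, so the step stays capped at pct_increment rather than jumping by 60 points at once.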
    716 void mig_throttle_counter_reset(void)
    717 {
    718     RAMState *rs = ram_state;
    719 
    720     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    721     rs->num_dirty_pages_period = 0;
    722     rs->bytes_xfer_prev = ram_counters.transferred;
    723 }
    724 
    725 /**
    726  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
    727  *
    728  * @rs: current RAM state
    729  * @current_addr: address for the zero page
    730  *
    731  * Update the xbzrle cache to reflect a page that's been sent as all 0.
    732  * The important thing is that a stale (not-yet-0'd) page be replaced
    733  * by the new data.
    734  * As a bonus, if the page wasn't in the cache it gets added so that
    735  * when a small write is made into the 0'd page it gets XBZRLE sent.
    736  */
    737 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
    738 {
    739     if (!rs->xbzrle_enabled) {
    740         return;
    741     }
    742 
    743     /* We don't care if this fails to allocate a new cache page
    744      * as long as it updated an old one */
    745     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
    746                  ram_counters.dirty_sync_count);
    747 }
    748 
    749 #define ENCODING_FLAG_XBZRLE 0x1
    750 
    751 /**
    752  * save_xbzrle_page: compress and send current page
    753  *
    754  * Returns: 1 means that we wrote the page
    755  *          0 means that page is identical to the one already sent
    756  *          -1 means that xbzrle would be longer than normal
    757  *
    758  * @rs: current RAM state
    759  * @current_data: pointer to the address of the page contents
    760  * @current_addr: addr of the page
    761  * @block: block that contains the page we want to send
    762  * @offset: offset inside the block for the page
    763  */
    764 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
    765                             ram_addr_t current_addr, RAMBlock *block,
    766                             ram_addr_t offset)
    767 {
    768     int encoded_len = 0, bytes_xbzrle;
    769     uint8_t *prev_cached_page;
    770 
    771     if (!cache_is_cached(XBZRLE.cache, current_addr,
    772                          ram_counters.dirty_sync_count)) {
    773         xbzrle_counters.cache_miss++;
    774         if (!rs->last_stage) {
    775             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
    776                              ram_counters.dirty_sync_count) == -1) {
    777                 return -1;
    778             } else {
    779                 /* update *current_data when the page has been
    780                    inserted into cache */
    781                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
    782             }
    783         }
    784         return -1;
    785     }
    786 
    787     /*
    788      * Reaching here means the page has hit the xbzrle cache, no matter what
    789      * encoding result it is (normal encoding, overflow or skipping the page),
    790      * count the page as encoded. This is used to calculate the encoding rate.
    791      *
    792      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
    793      * 2nd page turns out to be skipped (i.e. no new bytes written to the
    794      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
    795      * skipped page included. In this way, the encoding rate can tell if the
    796      * guest page is good for xbzrle encoding.
    797      */
    798     xbzrle_counters.pages++;
    799     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
    800 
    801     /* save current buffer into memory */
    802     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
    803 
    804     /* XBZRLE encoding (if there is no overflow) */
    805     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
    806                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
    807                                        TARGET_PAGE_SIZE);
    808 
    809     /*
    810      * Update the cache contents, so that it corresponds to the data
    811      * sent, in all cases except where we skip the page.
    812      */
    813     if (!rs->last_stage && encoded_len != 0) {
    814         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    815         /*
    816          * In the case where we couldn't compress, ensure that the caller
    817          * sends the data from the cache, since the guest might have
    818          * changed the RAM since we copied it.
    819          */
    820         *current_data = prev_cached_page;
    821     }
    822 
    823     if (encoded_len == 0) {
    824         trace_save_xbzrle_page_skipping();
    825         return 0;
    826     } else if (encoded_len == -1) {
    827         trace_save_xbzrle_page_overflow();
    828         xbzrle_counters.overflow++;
    829         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
    830         return -1;
    831     }
    832 
    833     /* Send XBZRLE based compressed page */
    834     bytes_xbzrle = save_page_header(rs, rs->f, block,
    835                                     offset | RAM_SAVE_FLAG_XBZRLE);
    836     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    837     qemu_put_be16(rs->f, encoded_len);
    838     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    839     bytes_xbzrle += encoded_len + 1 + 2;
    840     /*
    841      * Like compressed_size (please see update_compress_thread_counts),
    842      * the xbzrle encoded bytes don't count the 8 byte header with
    843      * RAM_SAVE_FLAG_CONTINUE.
    844      */
    845     xbzrle_counters.bytes += bytes_xbzrle - 8;
    846     ram_transferred_add(bytes_xbzrle);
    847 
    848     return 1;
    849 }
    850 
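An XBZRLE page is framed as the usual page header with RAM_SAVE_FLAG_XBZRLE set, the ENCODING_FLAG_XBZRLE byte, a big-endian 16-bit length and the encoded bytes; everything except the 8-byte offset word is charged to xbzrle_counters.bytes. A sketch of that accounting (illustrative helper only):

#include <stddef.h>
#include <stdint.h>

static void sketch_xbzrle_accounting(size_t header_bytes, uint16_t encoded_len,
                                     size_t *wire_bytes, size_t *counted_bytes)
{
    /* header + 1 flag byte + be16 length + encoded payload */
    *wire_bytes    = header_bytes + 1 + 2 + encoded_len;
    /* xbzrle_counters.bytes excludes the 8-byte offset word of the header */
    *counted_bytes = *wire_bytes - 8;
}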
    851 /**
    852  * migration_bitmap_find_dirty: find the next dirty page from start
    853  *
    854  * Returns the page offset within memory region of the start of a dirty page
    855  *
    856  * @rs: current RAM state
    857  * @rb: RAMBlock where to search for dirty pages
    858  * @start: page where we start the search
    859  */
    860 static inline
    861 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
    862                                           unsigned long start)
    863 {
    864     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    865     unsigned long *bitmap = rb->bmap;
    866 
    867     if (ramblock_is_ignored(rb)) {
    868         return size;
    869     }
    870 
    871     return find_next_bit(bitmap, size, start);
    872 }
    873 
    874 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
    875                                                        unsigned long page)
    876 {
    877     uint8_t shift;
    878     hwaddr size, start;
    879 
    880     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
    881         return;
    882     }
    883 
    884     shift = rb->clear_bmap_shift;
    885     /*
    886      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
    887      * can make things easier sometimes since then the start address
    888      * of the small chunk will always be 64-page aligned so the
    889      * bitmap will always be aligned to unsigned long. We should
    890      * even be able to remove this restriction but I'm simply
    891      * keeping it.
    892      */
    893     assert(shift >= 6);
    894 
    895     size = 1ULL << (TARGET_PAGE_BITS + shift);
    896     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    897     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    898     memory_region_clear_dirty_bitmap(rb->mr, start, size);
    899 }
    900 
    901 static void
    902 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
    903                                                  unsigned long start,
    904                                                  unsigned long npages)
    905 {
    906     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    907     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    908     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
    909 
    910     /*
    911      * Clear pages from start to start + npages - 1, so the end boundary is
    912      * exclusive.
    913      */
    914     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
    915         migration_clear_memory_region_dirty_bitmap(rb, i);
    916     }
    917 }
    918 
    919 /*
    920  * colo_bitmap_find_dirty: find contiguous dirty pages from start
    921  *
    922  * Returns the page offset within memory region of the start of the contiguous
    923  * dirty pages
    924  *
    925  * @rs: current RAM state
    926  * @rb: RAMBlock where to search for dirty pages
    927  * @start: page where we start the search
    928  * @num: the number of contiguous dirty pages
    929  */
    930 static inline
    931 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
    932                                      unsigned long start, unsigned long *num)
    933 {
    934     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    935     unsigned long *bitmap = rb->bmap;
    936     unsigned long first, next;
    937 
    938     *num = 0;
    939 
    940     if (ramblock_is_ignored(rb)) {
    941         return size;
    942     }
    943 
    944     first = find_next_bit(bitmap, size, start);
    945     if (first >= size) {
    946         return first;
    947     }
    948     next = find_next_zero_bit(bitmap, size, first + 1);
    949     assert(next >= first);
    950     *num = next - first;
    951     return first;
    952 }
    953 
    954 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
    955                                                 RAMBlock *rb,
    956                                                 unsigned long page)
    957 {
    958     bool ret;
    959 
    960     /*
    961      * Clear dirty bitmap if needed.  This _must_ be called before we
    962      * send any of the pages in the chunk because we need to make sure
    963      * we can capture further page content changes when we sync the dirty
    964      * log the next time.  So as long as we are going to send any of
    965      * the pages in the chunk we clear the remote dirty bitmap for all.
    966      * Clearing it earlier won't be a problem, but too late will.
    967      */
    968     migration_clear_memory_region_dirty_bitmap(rb, page);
    969 
    970     ret = test_and_clear_bit(page, rb->bmap);
    971     if (ret) {
    972         rs->migration_dirty_pages--;
    973     }
    974 
    975     return ret;
    976 }
    977 
    978 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
    979                                        void *opaque)
    980 {
    981     const hwaddr offset = section->offset_within_region;
    982     const hwaddr size = int128_get64(section->size);
    983     const unsigned long start = offset >> TARGET_PAGE_BITS;
    984     const unsigned long npages = size >> TARGET_PAGE_BITS;
    985     RAMBlock *rb = section->mr->ram_block;
    986     uint64_t *cleared_bits = opaque;
    987 
    988     /*
    989      * We don't grab ram_state->bitmap_mutex because we expect to run
    990      * only when starting migration or during postcopy recovery where
    991      * we don't have concurrent access.
    992      */
    993     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
    994         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    995     }
    996     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    997     bitmap_clear(rb->bmap, start, npages);
    998 }
    999 
   1000 /*
   1001  * Exclude all dirty pages from migration that fall into a discarded range as
   1002  * managed by a RamDiscardManager responsible for the mapped memory region of
   1003  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
   1004  *
   1005  * Discarded pages ("logically unplugged") have undefined content and must
   1006  * not get migrated, because even reading these pages for migration might
   1007  * result in undesired behavior.
   1008  *
   1009  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
   1010  *
   1011  * Note: The result is only stable while migrating (precopy/postcopy).
   1012  */
   1013 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
   1014 {
   1015     uint64_t cleared_bits = 0;
   1016 
   1017     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
   1018         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
   1019         MemoryRegionSection section = {
   1020             .mr = rb->mr,
   1021             .offset_within_region = 0,
   1022             .size = int128_make64(qemu_ram_get_used_length(rb)),
   1023         };
   1024 
   1025         ram_discard_manager_replay_discarded(rdm, &section,
   1026                                              dirty_bitmap_clear_section,
   1027                                              &cleared_bits);
   1028     }
   1029     return cleared_bits;
   1030 }
   1031 
   1032 /*
   1033  * Check if a host-page aligned page falls into a discarded range as managed by
   1034  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
   1035  *
   1036  * Note: The result is only stable while migrating (precopy/postcopy).
   1037  */
   1038 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
   1039 {
   1040     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
   1041         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
   1042         MemoryRegionSection section = {
   1043             .mr = rb->mr,
   1044             .offset_within_region = start,
   1045             .size = int128_make64(qemu_ram_pagesize(rb)),
   1046         };
   1047 
   1048         return !ram_discard_manager_is_populated(rdm, &section);
   1049     }
   1050     return false;
   1051 }
   1052 
   1053 /* Called with RCU critical section */
   1054 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
   1055 {
   1056     uint64_t new_dirty_pages =
   1057         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
   1058 
   1059     rs->migration_dirty_pages += new_dirty_pages;
   1060     rs->num_dirty_pages_period += new_dirty_pages;
   1061 }
   1062 
   1063 /**
   1064  * ram_pagesize_summary: calculate all the pagesizes of a VM
   1065  *
   1066  * Returns a summary bitmap of the page sizes of all RAMBlocks
   1067  *
   1068  * For VMs with just normal pages this is equivalent to the host page
   1069  * size. If it's got some huge pages then it's the OR of all the
   1070  * different page sizes.
   1071  */
   1072 uint64_t ram_pagesize_summary(void)
   1073 {
   1074     RAMBlock *block;
   1075     uint64_t summary = 0;
   1076 
   1077     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   1078         summary |= block->page_size;
   1079     }
   1080 
   1081     return summary;
   1082 }
   1083 
   1084 uint64_t ram_get_total_transferred_pages(void)
   1085 {
   1086     return  ram_counters.normal + ram_counters.duplicate +
   1087                 compression_counters.pages + xbzrle_counters.pages;
   1088 }
   1089 
   1090 static void migration_update_rates(RAMState *rs, int64_t end_time)
   1091 {
   1092     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
   1093     double compressed_size;
   1094 
   1095     /* calculate period counters */
   1096     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
   1097                 / (end_time - rs->time_last_bitmap_sync);
   1098 
   1099     if (!page_count) {
   1100         return;
   1101     }
   1102 
   1103     if (migrate_use_xbzrle()) {
   1104         double encoded_size, unencoded_size;
   1105 
   1106         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
   1107             rs->xbzrle_cache_miss_prev) / page_count;
   1108         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
   1109         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
   1110                          TARGET_PAGE_SIZE;
   1111         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
   1112         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
   1113             xbzrle_counters.encoding_rate = 0;
   1114         } else {
   1115             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
   1116         }
   1117         rs->xbzrle_pages_prev = xbzrle_counters.pages;
   1118         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
   1119     }
   1120 
   1121     if (migrate_use_compression()) {
   1122         compression_counters.busy_rate = (double)(compression_counters.busy -
   1123             rs->compress_thread_busy_prev) / page_count;
   1124         rs->compress_thread_busy_prev = compression_counters.busy;
   1125 
   1126         compressed_size = compression_counters.compressed_size -
   1127                           rs->compressed_size_prev;
   1128         if (compressed_size) {
   1129             double uncompressed_size = (compression_counters.pages -
   1130                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
   1131 
   1132             /* Compression-Ratio = Uncompressed-size / Compressed-size */
   1133             compression_counters.compression_rate =
   1134                                         uncompressed_size / compressed_size;
   1135 
   1136             rs->compress_pages_prev = compression_counters.pages;
   1137             rs->compressed_size_prev = compression_counters.compressed_size;
   1138         }
   1139     }
   1140 }
   1141 
   1142 static void migration_trigger_throttle(RAMState *rs)
   1143 {
   1144     MigrationState *s = migrate_get_current();
   1145     uint64_t threshold = s->parameters.throttle_trigger_threshold;
   1146 
   1147     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
   1148     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
   1149     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
   1150 
   1151     /* During block migration the auto-converge logic incorrectly detects
   1152      * that ram migration makes no progress. Avoid this by disabling the
   1153      * throttling logic during the bulk phase of block migration. */
   1154     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
   1155         /* The following detection logic can be refined later. For now:
   1156            Check to see if the ratio between dirtied bytes and the approx.
   1157            amount of bytes that just got transferred since the last time
   1158            we were in this routine reaches the threshold. If that happens
   1159            twice, start or increase throttling. */
   1160 
   1161         if ((bytes_dirty_period > bytes_dirty_threshold) &&
   1162             (++rs->dirty_rate_high_cnt >= 2)) {
   1163             trace_migration_throttle();
   1164             rs->dirty_rate_high_cnt = 0;
   1165             mig_throttle_guest_down(bytes_dirty_period,
   1166                                     bytes_dirty_threshold);
   1167         }
   1168     }
   1169 }
   1170 
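The trigger compares bytes dirtied in the period against a percentage of the bytes actually transferred, and fires only once that holds for two consecutive syncs. A compact sketch of the predicate (standalone helper mirroring the check above):

#include <stdbool.h>
#include <stdint.h>

static bool sketch_should_throttle(uint64_t bytes_dirty_period,
                                   uint64_t bytes_xfer_period,
                                   uint64_t threshold_pct,  /* throttle_trigger_threshold */
                                   int *dirty_rate_high_cnt)
{
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold_pct / 100;

    if (bytes_dirty_period > bytes_dirty_threshold &&
        ++(*dirty_rate_high_cnt) >= 2) {
        *dirty_rate_high_cnt = 0;
        return true;    /* caller would then invoke mig_throttle_guest_down() */
    }
    return false;
}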
   1171 static void migration_bitmap_sync(RAMState *rs)
   1172 {
   1173     RAMBlock *block;
   1174     int64_t end_time;
   1175 
   1176     ram_counters.dirty_sync_count++;
   1177 
   1178     if (!rs->time_last_bitmap_sync) {
   1179         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
   1180     }
   1181 
   1182     trace_migration_bitmap_sync_start();
   1183     memory_global_dirty_log_sync();
   1184 
   1185     qemu_mutex_lock(&rs->bitmap_mutex);
   1186     WITH_RCU_READ_LOCK_GUARD() {
   1187         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   1188             ramblock_sync_dirty_bitmap(rs, block);
   1189         }
   1190         ram_counters.remaining = ram_bytes_remaining();
   1191     }
   1192     qemu_mutex_unlock(&rs->bitmap_mutex);
   1193 
   1194     memory_global_after_dirty_log_sync();
   1195     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
   1196 
   1197     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
   1198 
   1199     /* more than 1 second = 1000 milliseconds */
   1200     if (end_time > rs->time_last_bitmap_sync + 1000) {
   1201         migration_trigger_throttle(rs);
   1202 
   1203         migration_update_rates(rs, end_time);
   1204 
   1205         rs->target_page_count_prev = rs->target_page_count;
   1206 
   1207         /* reset period counters */
   1208         rs->time_last_bitmap_sync = end_time;
   1209         rs->num_dirty_pages_period = 0;
   1210         rs->bytes_xfer_prev = ram_counters.transferred;
   1211     }
   1212     if (migrate_use_events()) {
   1213         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
   1214     }
   1215 }
   1216 
   1217 static void migration_bitmap_sync_precopy(RAMState *rs)
   1218 {
   1219     Error *local_err = NULL;
   1220 
   1221     /*
   1222      * The current notifier usage is just an optimization for migration, so we
   1223      * don't stop the normal migration process in the error case.
   1224      */
   1225     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
   1226         error_report_err(local_err);
   1227         local_err = NULL;
   1228     }
   1229 
   1230     migration_bitmap_sync(rs);
   1231 
   1232     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
   1233         error_report_err(local_err);
   1234     }
   1235 }
   1236 
   1237 static void ram_release_page(const char *rbname, uint64_t offset)
   1238 {
   1239     if (!migrate_release_ram() || !migration_in_postcopy()) {
   1240         return;
   1241     }
   1242 
   1243     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
   1244 }
   1245 
   1246 /**
   1247  * save_zero_page_to_file: send the zero page to the file
   1248  *
   1249  * Returns the size of data written to the file, 0 means the page is not
   1250  * a zero page
   1251  *
   1252  * @rs: current RAM state
   1253  * @file: the file where the data is saved
   1254  * @block: block that contains the page we want to send
   1255  * @offset: offset inside the block for the page
   1256  */
   1257 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
   1258                                   RAMBlock *block, ram_addr_t offset)
   1259 {
   1260     uint8_t *p = block->host + offset;
   1261     int len = 0;
   1262 
   1263     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
   1264         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
   1265         qemu_put_byte(file, 0);
   1266         len += 1;
   1267         ram_release_page(block->idstr, offset);
   1268     }
   1269     return len;
   1270 }
   1271 
   1272 /**
   1273  * save_zero_page: send the zero page to the stream
   1274  *
   1275  * Returns the number of pages written.
   1276  *
   1277  * @rs: current RAM state
   1278  * @block: block that contains the page we want to send
   1279  * @offset: offset inside the block for the page
   1280  */
   1281 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
   1282 {
   1283     int len = save_zero_page_to_file(rs, rs->f, block, offset);
   1284 
   1285     if (len) {
   1286         ram_counters.duplicate++;
   1287         ram_transferred_add(len);
   1288         return 1;
   1289     }
   1290     return -1;
   1291 }
   1292 
   1293 /*
   1294  * @pages: the number of pages written by the control path,
   1295  *        < 0 - error
   1296  *        > 0 - number of pages written
   1297  *
   1298  * Return true if the page has been saved, otherwise false is returned.
   1299  */
   1300 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
   1301                               int *pages)
   1302 {
   1303     uint64_t bytes_xmit = 0;
   1304     int ret;
   1305 
   1306     *pages = -1;
   1307     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
   1308                                 &bytes_xmit);
   1309     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
   1310         return false;
   1311     }
   1312 
   1313     if (bytes_xmit) {
   1314         ram_transferred_add(bytes_xmit);
   1315         *pages = 1;
   1316     }
   1317 
   1318     if (ret == RAM_SAVE_CONTROL_DELAYED) {
   1319         return true;
   1320     }
   1321 
   1322     if (bytes_xmit > 0) {
   1323         ram_counters.normal++;
   1324     } else if (bytes_xmit == 0) {
   1325         ram_counters.duplicate++;
   1326     }
   1327 
   1328     return true;
   1329 }
   1330 
   1331 /*
   1332  * directly send the page to the stream
   1333  *
   1334  * Returns the number of pages written.
   1335  *
   1336  * @rs: current RAM state
   1337  * @block: block that contains the page we want to send
   1338  * @offset: offset inside the block for the page
   1339  * @buf: the page to be sent
   1340  * @async: send the page asynchronously
   1341  */
   1342 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
   1343                             uint8_t *buf, bool async)
   1344 {
   1345     ram_transferred_add(save_page_header(rs, rs->f, block,
   1346                                          offset | RAM_SAVE_FLAG_PAGE));
   1347     if (async) {
   1348         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
   1349                               migrate_release_ram() &&
   1350                               migration_in_postcopy());
   1351     } else {
   1352         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
   1353     }
   1354     ram_transferred_add(TARGET_PAGE_SIZE);
   1355     ram_counters.normal++;
   1356     return 1;
   1357 }
   1358 
   1359 /**
   1360  * ram_save_page: send the given page to the stream
   1361  *
   1362  * Returns the number of pages written.
   1363  *          < 0 - error
   1364  *          >=0 - Number of pages written - this might legally be 0
   1365  *                if xbzrle noticed the page was the same.
   1366  *
   1367  * @rs: current RAM state
   1368  * @block: block that contains the page we want to send
   1369  * @offset: offset inside the block for the page
   1370  */
   1371 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
   1372 {
   1373     int pages = -1;
   1374     uint8_t *p;
   1375     bool send_async = true;
   1376     RAMBlock *block = pss->block;
   1377     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
   1378     ram_addr_t current_addr = block->offset + offset;
   1379 
   1380     p = block->host + offset;
   1381     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
   1382 
   1383     XBZRLE_cache_lock();
   1384     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
   1385         pages = save_xbzrle_page(rs, &p, current_addr, block,
   1386                                  offset);
   1387         if (!rs->last_stage) {
   1388             /* Can't send this cached data async, since the cache page
   1389              * might get updated before it gets to the wire
   1390              */
   1391             send_async = false;
   1392         }
   1393     }
   1394 
   1395     /* XBZRLE overflow or normal page */
   1396     if (pages == -1) {
   1397         pages = save_normal_page(rs, block, offset, p, send_async);
   1398     }
   1399 
   1400     XBZRLE_cache_unlock();
   1401 
   1402     return pages;
   1403 }
   1404 
   1405 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
   1406                                  ram_addr_t offset)
   1407 {
   1408     if (multifd_queue_page(rs->f, block, offset) < 0) {
   1409         return -1;
   1410     }
   1411     ram_counters.normal++;
   1412 
   1413     return 1;
   1414 }
   1415 
   1416 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
   1417                                  ram_addr_t offset, uint8_t *source_buf)
   1418 {
   1419     RAMState *rs = ram_state;
   1420     uint8_t *p = block->host + offset;
   1421     int ret;
   1422 
   1423     if (save_zero_page_to_file(rs, f, block, offset)) {
   1424         return true;
   1425     }
   1426 
   1427     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
   1428 
   1429     /*
   1430      * copy it to an internal buffer to avoid it being modified by the VM,
   1431      * so that we can catch any error during compression and
   1432      * decompression
   1433      */
   1434     memcpy(source_buf, p, TARGET_PAGE_SIZE);
   1435     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
   1436     if (ret < 0) {
   1437         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
   1438         error_report("compressed data failed!");
   1439     }
   1440     return false;
   1441 }
   1442 
   1443 static void
   1444 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
   1445 {
   1446     ram_transferred_add(bytes_xmit);
   1447 
   1448     if (param->zero_page) {
   1449         ram_counters.duplicate++;
   1450         return;
   1451     }
   1452 
   1453     /* 8 is the size of the page header when RAM_SAVE_FLAG_CONTINUE is set. */
   1454     compression_counters.compressed_size += bytes_xmit - 8;
   1455     compression_counters.pages++;
   1456 }
   1457 
   1458 static bool save_page_use_compression(RAMState *rs);
   1459 
   1460 static void flush_compressed_data(RAMState *rs)
   1461 {
   1462     int idx, len, thread_count;
   1463 
   1464     if (!save_page_use_compression(rs)) {
   1465         return;
   1466     }
   1467     thread_count = migrate_compress_threads();
   1468 
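            /* First wait until every compression thread has finished its current page */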
   1469     qemu_mutex_lock(&comp_done_lock);
   1470     for (idx = 0; idx < thread_count; idx++) {
   1471         while (!comp_param[idx].done) {
   1472             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
   1473         }
   1474     }
   1475     qemu_mutex_unlock(&comp_done_lock);
   1476 
   1477     for (idx = 0; idx < thread_count; idx++) {
   1478         qemu_mutex_lock(&comp_param[idx].mutex);
   1479         if (!comp_param[idx].quit) {
   1480             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
   1481             /*
   1482              * it's safe to fetch zero_page without holding comp_done_lock
   1483              * as there is no further request submitted to the thread,
   1484              * i.e., the thread should be waiting for a request at this point.
   1485              */
   1486             update_compress_thread_counts(&comp_param[idx], len);
   1487         }
   1488         qemu_mutex_unlock(&comp_param[idx].mutex);
   1489     }
   1490 }
   1491 
   1492 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
   1493                                        ram_addr_t offset)
   1494 {
   1495     param->block = block;
   1496     param->offset = offset;
   1497 }
   1498 
   1499 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
   1500                                            ram_addr_t offset)
   1501 {
   1502     int idx, thread_count, bytes_xmit = -1, pages = -1;
   1503     bool wait = migrate_compress_wait_thread();
   1504 
   1505     thread_count = migrate_compress_threads();
   1506     qemu_mutex_lock(&comp_done_lock);
   1507 retry:
   1508     for (idx = 0; idx < thread_count; idx++) {
   1509         if (comp_param[idx].done) {
   1510             comp_param[idx].done = false;
   1511             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
   1512             qemu_mutex_lock(&comp_param[idx].mutex);
   1513             set_compress_params(&comp_param[idx], block, offset);
   1514             qemu_cond_signal(&comp_param[idx].cond);
   1515             qemu_mutex_unlock(&comp_param[idx].mutex);
   1516             pages = 1;
   1517             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
   1518             break;
   1519         }
   1520     }
   1521 
   1522     /*
   1523      * wait for a free thread if the user specifies 'compress-wait-thread',
   1524      * otherwise we will post the page out in the main thread as a normal page.
   1525      */
   1526     if (pages < 0 && wait) {
   1527         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
   1528         goto retry;
   1529     }
   1530     qemu_mutex_unlock(&comp_done_lock);
   1531 
   1532     return pages;
   1533 }
   1534 
   1535 /**
   1536  * find_dirty_block: find the next dirty page and update any state
   1537  * associated with the search process.
   1538  *
   1539  * Returns true if a page is found
   1540  *
   1541  * @rs: current RAM state
   1542  * @pss: data about the state of the current dirty page scan
   1543  * @again: set to false if the search has scanned the whole of RAM
   1544  */
   1545 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
   1546 {
   1547     /*
   1548      * This is not a postcopy requested page; mark it "not urgent", and use
   1549      * the precopy channel to send it.
   1550      */
   1551     pss->postcopy_requested = false;
   1552     pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
   1553 
   1554     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
   1555     if (pss->complete_round && pss->block == rs->last_seen_block &&
   1556         pss->page >= rs->last_page) {
   1557         /*
   1558          * We've been once around the RAM and haven't found anything.
   1559          * Give up.
   1560          */
   1561         *again = false;
   1562         return false;
   1563     }
   1564     if (!offset_in_ramblock(pss->block,
   1565                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
   1566         /* Didn't find anything in this RAM Block */
   1567         pss->page = 0;
   1568         pss->block = QLIST_NEXT_RCU(pss->block, next);
   1569         if (!pss->block) {
   1570             /*
   1571              * If memory migration starts over, we will meet a dirtied page
   1572              * which may still exist in the compression threads' ring, so we
   1573              * should flush the compressed data to make sure the new page
   1574              * is not overwritten by the old one in the destination.
   1575              *
   1576              * Also, if xbzrle is on, stop using data compression at this
   1577              * point. In theory, xbzrle can do better than compression.
   1578              */
   1579             flush_compressed_data(rs);
   1580 
   1581             /* Hit the end of the list */
   1582             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
   1583             /* Flag that we've looped */
   1584             pss->complete_round = true;
   1585             /* After the first round, enable XBZRLE. */
   1586             if (migrate_use_xbzrle()) {
   1587                 rs->xbzrle_enabled = true;
   1588             }
   1589         }
   1590         /* Didn't find anything this time, but try again on the new block */
   1591         *again = true;
   1592         return false;
   1593     } else {
   1594         /* Can go around again, but... */
   1595         *again = true;
   1596         /* We've found something so probably don't need to */
   1597         return true;
   1598     }
   1599 }
   1600 
   1601 /**
   1602  * unqueue_page: gets a page off the queue
   1603  *
   1604  * Helper for 'get_queued_page' - gets a page off the queue
   1605  *
   1606  * Returns the block of the page (or NULL if none available)
   1607  *
   1608  * @rs: current RAM state
   1609  * @offset: used to return the offset within the RAMBlock
   1610  */
   1611 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
   1612 {
   1613     struct RAMSrcPageRequest *entry;
   1614     RAMBlock *block = NULL;
   1615 
   1616     if (!postcopy_has_request(rs)) {
   1617         return NULL;
   1618     }
   1619 
   1620     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
   1621 
   1622     /*
   1623      * This should _never_ change even after we take the lock, because no one
   1624      * should be taking anything off the request list other than us.
   1625      */
   1626     assert(postcopy_has_request(rs));
   1627 
   1628     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
   1629     block = entry->rb;
   1630     *offset = entry->offset;
   1631 
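            /*
             * Consume the request one target page at a time: shrink it by one
             * page, or drop it entirely once this was its last page.
             */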
   1632     if (entry->len > TARGET_PAGE_SIZE) {
   1633         entry->len -= TARGET_PAGE_SIZE;
   1634         entry->offset += TARGET_PAGE_SIZE;
   1635     } else {
   1636         memory_region_unref(block->mr);
   1637         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
   1638         g_free(entry);
   1639         migration_consume_urgent_request();
   1640     }
   1641 
   1642     return block;
   1643 }
   1644 
   1645 #if defined(__linux__)
   1646 /**
   1647  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
   1648  *   is found, return RAM block pointer and page offset
   1649  *
   1650  * Returns pointer to the RAMBlock containing faulting page,
   1651  *   NULL if no write faults are pending
   1652  *
   1653  * @rs: current RAM state
   1654  * @offset: page offset from the beginning of the block
   1655  */
   1656 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
   1657 {
   1658     struct uffd_msg uffd_msg;
   1659     void *page_address;
   1660     RAMBlock *block;
   1661     int res;
   1662 
   1663     if (!migrate_background_snapshot()) {
   1664         return NULL;
   1665     }
   1666 
   1667     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
   1668     if (res <= 0) {
   1669         return NULL;
   1670     }
   1671 
   1672     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
   1673     block = qemu_ram_block_from_host(page_address, false, offset);
   1674     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
   1675     return block;
   1676 }
   1677 
   1678 /**
   1679  * ram_save_release_protection: release UFFD write protection after
   1680  *   a range of pages has been saved
   1681  *
   1682  * @rs: current RAM state
   1683  * @pss: page-search-status structure
   1684  * @start_page: index of the first page in the range relative to pss->block
   1685  *
   1686  * Returns 0 on success, negative value in case of an error
   1687  */
   1688 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
   1689         unsigned long start_page)
   1690 {
   1691     int res = 0;
   1692 
   1693     /* Check if page is from UFFD-managed region. */
   1694     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
   1695         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
   1696         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
   1697 
   1698         /* Flush async buffers before un-protect. */
   1699         qemu_fflush(rs->f);
   1700         /* Un-protect memory range. */
   1701         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
   1702                 false, false);
   1703     }
   1704 
   1705     return res;
   1706 }
   1707 
   1708 /* ram_write_tracking_available: check if the kernel supports the required UFFD features
   1709  *
   1710  * Returns true if supported, false otherwise
   1711  */
   1712 bool ram_write_tracking_available(void)
   1713 {
   1714     uint64_t uffd_features;
   1715     int res;
   1716 
   1717     res = uffd_query_features(&uffd_features);
   1718     return (res == 0 &&
   1719             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
   1720 }
   1721 
   1722 /* ram_write_tracking_compatible: check if guest configuration is
   1723  *   compatible with 'write-tracking'
   1724  *
   1725  * Returns true if compatible, false otherwise
   1726  */
   1727 bool ram_write_tracking_compatible(void)
   1728 {
   1729     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
   1730     int uffd_fd;
   1731     RAMBlock *block;
   1732     bool ret = false;
   1733 
   1734     /* Open UFFD file descriptor */
   1735     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
   1736     if (uffd_fd < 0) {
   1737         return false;
   1738     }
   1739 
   1740     RCU_READ_LOCK_GUARD();
   1741 
   1742     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   1743         uint64_t uffd_ioctls;
   1744 
   1745         /* Nothing to do for read-only and MMIO-writable regions */
   1746         if (block->mr->readonly || block->mr->rom_device) {
   1747             continue;
   1748         }
   1749         /* Try to register block memory via UFFD-IO to track writes */
   1750         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
   1751                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
   1752             goto out;
   1753         }
   1754         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
   1755             goto out;
   1756         }
   1757     }
   1758     ret = true;
   1759 
   1760 out:
   1761     uffd_close_fd(uffd_fd);
   1762     return ret;
   1763 }
   1764 
   1765 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
   1766                                        ram_addr_t size)
   1767 {
   1768     /*
   1769      * We read one byte of each page; this will preallocate page tables if
   1770      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
   1771      * where no page was populated yet. This might require adaptation when
   1772      * supporting other mappings, like shmem.
   1773      */
   1774     for (; offset < size; offset += block->page_size) {
   1775         char tmp = *((char *)block->host + offset);
   1776 
   1777         /* Don't optimize the read out */
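                /* (the "+r" constraint makes the compiler treat tmp as used) */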
   1778         asm volatile("" : "+r" (tmp));
   1779     }
   1780 }
   1781 
   1782 static inline int populate_read_section(MemoryRegionSection *section,
   1783                                         void *opaque)
   1784 {
   1785     const hwaddr size = int128_get64(section->size);
   1786     hwaddr offset = section->offset_within_region;
   1787     RAMBlock *block = section->mr->ram_block;
   1788 
   1789     populate_read_range(block, offset, size);
   1790     return 0;
   1791 }
   1792 
   1793 /*
   1794  * ram_block_populate_read: preallocate page tables and populate pages in the
   1795  *   RAM block by reading a byte of each page.
   1796  *
   1797  * Since it's solely used for the userfault_fd WP feature, pages are
   1798  *   touched at the RAM block's page size granularity.
   1799  *
   1800  * @rb: RAM block to populate
   1801  */
   1802 static void ram_block_populate_read(RAMBlock *rb)
   1803 {
   1804     /*
   1805      * Skip populating all pages that fall into a discarded range as managed by
   1806      * a RamDiscardManager responsible for the mapped memory region of the
   1807      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
   1808      * must not get populated automatically. We don't have to track
   1809      * modifications via userfaultfd WP reliably, because these pages will
   1810      * not be part of the migration stream either way -- see
   1811      * ramblock_dirty_bitmap_exclude_discarded_pages().
   1812      *
   1813      * Note: The result is only stable while migrating (precopy/postcopy).
   1814      */
   1815     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
   1816         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
   1817         MemoryRegionSection section = {
   1818             .mr = rb->mr,
   1819             .offset_within_region = 0,
   1820             .size = rb->mr->size,
   1821         };
   1822 
   1823         ram_discard_manager_replay_populated(rdm, &section,
   1824                                              populate_read_section, NULL);
   1825     } else {
   1826         populate_read_range(rb, 0, rb->used_length);
   1827     }
   1828 }
   1829 
   1830 /*
   1831  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
   1832  */
   1833 void ram_write_tracking_prepare(void)
   1834 {
   1835     RAMBlock *block;
   1836 
   1837     RCU_READ_LOCK_GUARD();
   1838 
   1839     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   1840         /* Nothing to do for read-only and MMIO-writable regions */
   1841         if (block->mr->readonly || block->mr->rom_device) {
   1842             continue;
   1843         }
   1844 
   1845         /*
   1846          * Populate pages of the RAM block before enabling userfault_fd
   1847          * write protection.
   1848          *
   1849          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
   1850          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
   1851          * pages with pte_none() entries in the page table.
   1852          */
   1853         ram_block_populate_read(block);
   1854     }
   1855 }
   1856 
   1857 /*
   1858  * ram_write_tracking_start: start UFFD-WP memory tracking
   1859  *
   1860  * Returns 0 for success or negative value in case of error
   1861  */
   1862 int ram_write_tracking_start(void)
   1863 {
   1864     int uffd_fd;
   1865     RAMState *rs = ram_state;
   1866     RAMBlock *block;
   1867 
   1868     /* Open UFFD file descriptor */
   1869     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
   1870     if (uffd_fd < 0) {
   1871         return uffd_fd;
   1872     }
   1873     rs->uffdio_fd = uffd_fd;
   1874 
   1875     RCU_READ_LOCK_GUARD();
   1876 
   1877     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   1878         /* Nothing to do for read-only and MMIO-writable regions */
   1879         if (block->mr->readonly || block->mr->rom_device) {
   1880             continue;
   1881         }
   1882 
   1883         /* Register block memory with UFFD to track writes */
   1884         if (uffd_register_memory(rs->uffdio_fd, block->host,
   1885                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
   1886             goto fail;
   1887         }
   1888         /* Apply UFFD write protection to the block memory range */
   1889         if (uffd_change_protection(rs->uffdio_fd, block->host,
   1890                 block->max_length, true, false)) {
   1891             goto fail;
   1892         }
   1893         block->flags |= RAM_UF_WRITEPROTECT;
   1894         memory_region_ref(block->mr);
   1895 
   1896         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
   1897                 block->host, block->max_length);
   1898     }
   1899 
   1900     return 0;
   1901 
   1902 fail:
   1903     error_report("ram_write_tracking_start() failed: restoring initial memory state");
   1904 
   1905     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   1906         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
   1907             continue;
   1908         }
   1909         /*
   1910          * Since some memory block failed to be write-protected, remove
   1911          * protection from and unregister all previously registered RAM blocks
   1912          */
   1913         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
   1914                 false, false);
   1915         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
   1916         /* Cleanup flags and remove reference */
   1917         block->flags &= ~RAM_UF_WRITEPROTECT;
   1918         memory_region_unref(block->mr);
   1919     }
   1920 
   1921     uffd_close_fd(uffd_fd);
   1922     rs->uffdio_fd = -1;
   1923     return -1;
   1924 }
   1925 
   1926 /**
   1927  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
   1928  */
   1929 void ram_write_tracking_stop(void)
   1930 {
   1931     RAMState *rs = ram_state;
   1932     RAMBlock *block;
   1933 
   1934     RCU_READ_LOCK_GUARD();
   1935 
   1936     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   1937         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
   1938             continue;
   1939         }
   1940         /* Remove protection and unregister all affected RAM blocks */
   1941         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
   1942                 false, false);
   1943         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
   1944 
   1945         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
   1946                 block->host, block->max_length);
   1947 
   1948         /* Cleanup flags and remove reference */
   1949         block->flags &= ~RAM_UF_WRITEPROTECT;
   1950         memory_region_unref(block->mr);
   1951     }
   1952 
   1953     /* Finally close UFFD file descriptor */
   1954     uffd_close_fd(rs->uffdio_fd);
   1955     rs->uffdio_fd = -1;
   1956 }
   1957 
   1958 #else
   1959 /* No target OS support, stubs just fail or ignore */
   1960 
   1961 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
   1962 {
   1963     (void) rs;
   1964     (void) offset;
   1965 
   1966     return NULL;
   1967 }
   1968 
   1969 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
   1970         unsigned long start_page)
   1971 {
   1972     (void) rs;
   1973     (void) pss;
   1974     (void) start_page;
   1975 
   1976     return 0;
   1977 }
   1978 
   1979 bool ram_write_tracking_available(void)
   1980 {
   1981     return false;
   1982 }
   1983 
   1984 bool ram_write_tracking_compatible(void)
   1985 {
   1986     assert(0);
   1987     return false;
   1988 }
   1989 
   1990 int ram_write_tracking_start(void)
   1991 {
   1992     assert(0);
   1993     return -1;
   1994 }
   1995 
   1996 void ram_write_tracking_stop(void)
   1997 {
   1998     assert(0);
   1999 }
   2000 #endif /* defined(__linux__) */
   2001 
   2002 /*
   2003  * Check whether two addresses/offsets of the ramblock fall onto the same
   2004  * host huge page.  Returns true if so, false otherwise.
   2005  */
   2006 static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1,
   2007                                      uint64_t addr2)
   2008 {
   2009     size_t page_size = qemu_ram_pagesize(rb);
   2010 
   2011     addr1 = ROUND_DOWN(addr1, page_size);
   2012     addr2 = ROUND_DOWN(addr2, page_size);
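            /*
             * Worked example, assuming a 2 MiB huge page: 0x200123 and 0x3ff000
             * both round down to 0x200000 and so are on the same huge page,
             * while 0x400000 is not.
             */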
   2013 
   2014     return addr1 == addr2;
   2015 }
   2016 
   2017 /*
   2018  * Does a previously preempted precopy huge page contain the currently
   2019  * requested page?  Returns true if so, false otherwise.
   2020  *
   2021  * This should happen very rarely: it means that while we were sending pages
   2022  * in the background for postcopy, we were sending exactly the page that some
   2023  * vcpu faulted on on the destination node.  When it happens, we probably
   2024  * don't need to do much but drop the request, because we know it'll be
   2025  * serviced right after we restore the precopy stream.  It'll slightly affect
   2026  * the order in which postcopy requests are serviced (it's the same as moving
   2027  * the current request to the end of the queue), but it shouldn't be a big
   2028  * deal.  The most important thing is that we can _never_ try to send a
   2029  * partially sent huge page on the POSTCOPY channel again, otherwise that
   2030  * huge page would get "split brain" on two channels (PRECOPY, POSTCOPY).
   2031  */
   2032 static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block,
   2033                                         ram_addr_t offset)
   2034 {
   2035     PostcopyPreemptState *state = &rs->postcopy_preempt_state;
   2036 
   2037     /* No preemption at all? */
   2038     if (!state->preempted) {
   2039         return false;
   2040     }
   2041 
   2042     /* Not even the same ramblock? */
   2043     if (state->ram_block != block) {
   2044         return false;
   2045     }
   2046 
   2047     return offset_on_same_huge_page(block, offset,
   2048                                     state->ram_page << TARGET_PAGE_BITS);
   2049 }
   2050 
   2051 /**
   2052  * get_queued_page: unqueue a page from the postcopy requests
   2053  *
   2054  * Skips pages that are already sent (!dirty)
   2055  *
   2056  * Returns true if a queued page is found
   2057  *
   2058  * @rs: current RAM state
   2059  * @pss: data about the state of the current dirty page scan
   2060  */
   2061 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
   2062 {
   2063     RAMBlock  *block;
   2064     ram_addr_t offset;
   2065     bool dirty;
   2066 
   2067     do {
   2068         block = unqueue_page(rs, &offset);
   2069         /*
   2070          * We're sending this page, and since it's postcopy nothing else
   2071          * will dirty it, and we must make sure it doesn't get sent again
   2072          * even if this queue request was received after the background
   2073          * search already sent it.
   2074          */
   2075         if (block) {
   2076             unsigned long page;
   2077 
   2078             page = offset >> TARGET_PAGE_BITS;
   2079             dirty = test_bit(page, block->bmap);
   2080             if (!dirty) {
   2081                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
   2082                                                 page);
   2083             } else {
   2084                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
   2085             }
   2086         }
   2087 
   2088     } while (block && !dirty);
   2089 
   2090     if (block) {
   2091         /* See comment above postcopy_preempted_contains() */
   2092         if (postcopy_preempted_contains(rs, block, offset)) {
   2093             trace_postcopy_preempt_hit(block->idstr, offset);
   2094             /*
   2095              * If what we preempted previously was exactly what we're
   2096              * requesting right now, restore the preempted precopy
   2097              * immediately, boosting its priority as it's requested by
   2098              * postcopy.
   2099              */
   2100             postcopy_preempt_restore(rs, pss, true);
   2101             return true;
   2102         }
   2103     } else {
   2104         /*
   2105          * Poll write faults too if background snapshot is enabled; that's
   2106          * when vcpus may get blocked by write-protected pages.
   2107          */
   2108         block = poll_fault_page(rs, &offset);
   2109     }
   2110 
   2111     if (block) {
   2112         /*
   2113          * We want the background search to continue from the queued page
   2114          * since the guest is likely to want other pages near to the page
   2115          * it just requested.
   2116          */
   2117         pss->block = block;
   2118         pss->page = offset >> TARGET_PAGE_BITS;
   2119 
   2120         /*
   2121          * This unqueued page would break the "one round" check, even if
   2122          * it is really rare.
   2123          */
   2124         pss->complete_round = false;
   2125         /* Mark it an urgent request, meanwhile using POSTCOPY channel */
   2126         pss->postcopy_requested = true;
   2127         pss->postcopy_target_channel = RAM_CHANNEL_POSTCOPY;
   2128     }
   2129 
   2130     return !!block;
   2131 }
   2132 
   2133 /**
   2134  * migration_page_queue_free: drop any remaining pages in the ram
   2135  * request queue
   2136  *
   2137  * It should be empty at the end anyway, but in error cases there may
   2138  * be some left.  In case any pages are left, we drop them.
   2139  *
   2140  */
   2141 static void migration_page_queue_free(RAMState *rs)
   2142 {
   2143     struct RAMSrcPageRequest *mspr, *next_mspr;
   2144     /* This queue generally should be empty - but in the case of a failed
   2145      * migration it might have some droppings in it.
   2146      */
   2147     RCU_READ_LOCK_GUARD();
   2148     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
   2149         memory_region_unref(mspr->rb->mr);
   2150         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
   2151         g_free(mspr);
   2152     }
   2153 }
   2154 
   2155 /**
   2156  * ram_save_queue_pages: queue the page for transmission
   2157  *
   2158  * A request from postcopy destination for example.
   2159  *
   2160  * Returns zero on success or negative on error
   2161  *
   2162  * @rbname: Name of the RAMBlock of the request. NULL means the
   2163  *          same as the last one.
   2164  * @start: starting address from the start of the RAMBlock
   2165  * @len: length (in bytes) to send
   2166  */
   2167 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
   2168 {
   2169     RAMBlock *ramblock;
   2170     RAMState *rs = ram_state;
   2171 
   2172     ram_counters.postcopy_requests++;
   2173     RCU_READ_LOCK_GUARD();
   2174 
   2175     if (!rbname) {
   2176         /* Reuse last RAMBlock */
   2177         ramblock = rs->last_req_rb;
   2178 
   2179         if (!ramblock) {
   2180             /*
   2181              * Shouldn't happen, we can't reuse the last RAMBlock if
   2182              * it's the 1st request.
   2183              */
   2184             error_report("ram_save_queue_pages no previous block");
   2185             return -1;
   2186         }
   2187     } else {
   2188         ramblock = qemu_ram_block_by_name(rbname);
   2189 
   2190         if (!ramblock) {
   2191             /* We shouldn't be asked for a non-existent RAMBlock */
   2192             error_report("ram_save_queue_pages no block '%s'", rbname);
   2193             return -1;
   2194         }
   2195         rs->last_req_rb = ramblock;
   2196     }
   2197     trace_ram_save_queue_pages(ramblock->idstr, start, len);
   2198     if (!offset_in_ramblock(ramblock, start + len - 1)) {
   2199         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
   2200                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
   2201                      __func__, start, len, ramblock->used_length);
   2202         return -1;
   2203     }
   2204 
   2205     struct RAMSrcPageRequest *new_entry =
   2206         g_new0(struct RAMSrcPageRequest, 1);
   2207     new_entry->rb = ramblock;
   2208     new_entry->offset = start;
   2209     new_entry->len = len;
   2210 
   2211     memory_region_ref(ramblock->mr);
   2212     qemu_mutex_lock(&rs->src_page_req_mutex);
   2213     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
   2214     migration_make_urgent_request();
   2215     qemu_mutex_unlock(&rs->src_page_req_mutex);
   2216 
   2217     return 0;
   2218 }
   2219 
   2220 static bool save_page_use_compression(RAMState *rs)
   2221 {
   2222     if (!migrate_use_compression()) {
   2223         return false;
   2224     }
   2225 
   2226     /*
   2227      * If xbzrle is enabled (e.g., after the first round of migration), stop
   2228      * using data compression. In theory, xbzrle can do better than
   2229      * compression.
   2230      */
   2231     if (rs->xbzrle_enabled) {
   2232         return false;
   2233     }
   2234 
   2235     return true;
   2236 }
   2237 
   2238 /*
   2239  * Try to compress the page before posting it out; return true if the page
   2240  * has been properly handled by compression, otherwise it needs other
   2241  * paths to handle it.
   2242  */
   2243 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
   2244 {
   2245     if (!save_page_use_compression(rs)) {
   2246         return false;
   2247     }
   2248 
   2249     /*
   2250      * When starting the process of a new block, the first page of
   2251      * the block should be sent out before other pages in the same
   2252      * block, and all the pages in the last block should have been
   2253      * sent out.  Keeping this order is important, because the 'cont'
   2254      * flag is used to avoid resending the block name.
   2255      *
   2256      * We post the first page as a normal page because compression
   2257      * takes a lot of CPU resources.
   2258      */
   2259     if (block != rs->last_sent_block) {
   2260         flush_compressed_data(rs);
   2261         return false;
   2262     }
   2263 
   2264     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
   2265         return true;
   2266     }
   2267 
   2268     compression_counters.busy++;
   2269     return false;
   2270 }
   2271 
   2272 /**
   2273  * ram_save_target_page: save one target page
   2274  *
   2275  * Returns the number of pages written
   2276  *
   2277  * @rs: current RAM state
   2278  * @pss: data about the page we want to send
   2279  */
   2280 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
   2281 {
   2282     RAMBlock *block = pss->block;
   2283     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
   2284     int res;
   2285 
   2286     if (control_save_page(rs, block, offset, &res)) {
   2287         return res;
   2288     }
   2289 
   2290     if (save_compress_page(rs, block, offset)) {
   2291         return 1;
   2292     }
   2293 
   2294     res = save_zero_page(rs, block, offset);
   2295     if (res > 0) {
   2296         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
   2297          * page would be stale
   2298          */
   2299         if (!save_page_use_compression(rs)) {
   2300             XBZRLE_cache_lock();
   2301             xbzrle_cache_zero_page(rs, block->offset + offset);
   2302             XBZRLE_cache_unlock();
   2303         }
   2304         return res;
   2305     }
   2306 
   2307     /*
   2308      * Do not use multifd in postcopy as one whole host page should be
   2309      * placed atomically.  Postcopy also requires atomic updates of pages,
   2310      * so even if host page size == guest page size, the running destination
   2311      * guest may still see partially copied pages, which is data corruption.
   2312      */
   2313     if (migrate_use_multifd() && !migration_in_postcopy()) {
   2314         return ram_save_multifd_page(rs, block, offset);
   2315     }
   2316 
   2317     return ram_save_page(rs, pss);
   2318 }
   2319 
   2320 static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss)
   2321 {
   2322     MigrationState *ms = migrate_get_current();
   2323 
   2324     /* If postcopy preempt is not enabled, never preempt. */
   2325     if (!migrate_postcopy_preempt()) {
   2326         return false;
   2327     }
   2328 
   2329     /* If the user explicitly disabled breaking of huge page, skip */
   2330     if (!ms->postcopy_preempt_break_huge) {
   2331         return false;
   2332     }
   2333 
   2334     /* If the ramblock we're sending only uses small pages, never bother. */
   2335     if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) {
   2336         return false;
   2337     }
   2338 
   2339     /* Not in postcopy at all? */
   2340     if (!migration_in_postcopy()) {
   2341         return false;
   2342     }
   2343 
   2344     /*
   2345      * If we're already handling a postcopy request, don't preempt as this page
   2346      * has got the same high priority.
   2347      */
   2348     if (pss->postcopy_requested) {
   2349         return false;
   2350     }
   2351 
   2352     /* Finally, preempt only if there are postcopy requests pending. */
   2353     return postcopy_has_request(rs);
   2354 }
   2355 
   2356 /* Preempt precopy, remembering the current page so that it can be resumed later */
   2357 static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss)
   2358 {
   2359     PostcopyPreemptState *p_state = &rs->postcopy_preempt_state;
   2360 
   2361     trace_postcopy_preempt_triggered(pss->block->idstr, pss->page);
   2362 
   2363     /*
   2364      * Time to preempt precopy. Cache the current PSS into the preempt state,
   2365      * so that after handling the postcopy pages we can return to it.  We need
   2366      * to do so because the dest VM will have part of the precopy huge page
   2367      * kept in its tmp huge page cache; better to move on with it when we can.
   2368      */
   2369     p_state->ram_block = pss->block;
   2370     p_state->ram_page = pss->page;
   2371     p_state->preempted = true;
   2372 }
   2373 
   2374 /* Whether we're preempted by a postcopy request during sending a huge page */
   2375 static bool postcopy_preempt_triggered(RAMState *rs)
   2376 {
   2377     return rs->postcopy_preempt_state.preempted;
   2378 }
   2379 
   2380 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
   2381                                      bool postcopy_requested)
   2382 {
   2383     PostcopyPreemptState *state = &rs->postcopy_preempt_state;
   2384 
   2385     assert(state->preempted);
   2386 
   2387     pss->block = state->ram_block;
   2388     pss->page = state->ram_page;
   2389 
   2390     /* Record whether this is a postcopy request */
   2391     pss->postcopy_requested = postcopy_requested;
   2392     /*
   2393      * When restoring a preempted page, the old data resides in PRECOPY
   2394      * slow channel, even if postcopy_requested is set.  So always use
   2395      * PRECOPY channel here.
   2396      */
   2397     pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
   2398 
   2399     trace_postcopy_preempt_restored(pss->block->idstr, pss->page);
   2400 
   2401     /* Reset preempt state, most importantly, set preempted==false */
   2402     postcopy_preempt_reset(rs);
   2403 }
   2404 
   2405 static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss)
   2406 {
   2407     MigrationState *s = migrate_get_current();
   2408     unsigned int channel = pss->postcopy_target_channel;
   2409     QEMUFile *next;
   2410 
   2411     if (channel != rs->postcopy_channel) {
   2412         if (channel == RAM_CHANNEL_PRECOPY) {
   2413             next = s->to_dst_file;
   2414         } else {
   2415             next = s->postcopy_qemufile_src;
   2416         }
   2417         /* Update and cache the current channel */
   2418         rs->f = next;
   2419         rs->postcopy_channel = channel;
   2420 
   2421         /*
   2422          * If channel switched, reset last_sent_block since the old sent block
   2423          * may not be on the same channel.
   2424          */
   2425         rs->last_sent_block = NULL;
   2426 
   2427         trace_postcopy_preempt_switch_channel(channel);
   2428     }
   2429 
   2430     trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
   2431 }
   2432 
   2433 /* We need to make sure rs->f always points to the default channel elsewhere */
   2434 static void postcopy_preempt_reset_channel(RAMState *rs)
   2435 {
   2436     if (migrate_postcopy_preempt() && migration_in_postcopy()) {
   2437         rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
   2438         rs->f = migrate_get_current()->to_dst_file;
   2439         trace_postcopy_preempt_reset_channel();
   2440     }
   2441 }
   2442 
   2443 /**
   2444  * ram_save_host_page: save a whole host page
   2445  *
   2446  * Starting at the page indicated by @pss, send pages up to the end of the
   2447  * current host page. It's valid for the initial page to point into the middle
   2448  * of a host page, in which case the remainder of the host page is sent.
   2449  * Only dirty target pages are sent. Note that the host page size may
   2450  * be a huge page for this block.
   2451  * The saving stops at the boundary of the used_length of the block
   2452  * if the RAMBlock isn't a multiple of the host page size.
   2453  *
   2454  * Returns the number of pages written or negative on error
   2455  *
   2456  * @rs: current RAM state
   2457  * @pss: data about the page we want to send
   2458  */
   2459 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
   2460 {
   2461     int tmppages, pages = 0;
   2462     size_t pagesize_bits =
   2463         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
   2464     unsigned long hostpage_boundary =
   2465         QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
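            /*
             * E.g. with 2 MiB host pages and 4 KiB target pages, pagesize_bits is
             * 512, so for pss->page == 520 the boundary is QEMU_ALIGN_UP(521, 512),
             * i.e. page 1024.
             */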
   2466     unsigned long start_page = pss->page;
   2467     int res;
   2468 
   2469     if (ramblock_is_ignored(pss->block)) {
   2470         error_report("block %s should not be migrated !", pss->block->idstr);
   2471         return 0;
   2472     }
   2473 
   2474     if (migrate_postcopy_preempt() && migration_in_postcopy()) {
   2475         postcopy_preempt_choose_channel(rs, pss);
   2476     }
   2477 
   2478     do {
   2479         if (postcopy_needs_preempt(rs, pss)) {
   2480             postcopy_do_preempt(rs, pss);
   2481             break;
   2482         }
   2483 
   2484         /* Check if the page is dirty and, if it is, send it */
   2485         if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
   2486             tmppages = ram_save_target_page(rs, pss);
   2487             if (tmppages < 0) {
   2488                 return tmppages;
   2489             }
   2490 
   2491             pages += tmppages;
   2492             /*
   2493              * Allow rate limiting to happen in the middle of huge pages if
   2494              * something is sent in the current iteration.
   2495              */
   2496             if (pagesize_bits > 1 && tmppages > 0) {
   2497                 migration_rate_limit();
   2498             }
   2499         }
   2500         pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
   2501     } while ((pss->page < hostpage_boundary) &&
   2502              offset_in_ramblock(pss->block,
   2503                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
   2504     /* The offset we leave with is the min boundary of host page and block */
   2505     pss->page = MIN(pss->page, hostpage_boundary);
   2506 
   2507     /*
   2508      * With postcopy preempt mode, flush the data as soon as possible for
   2509      * postcopy requests: we've already sent a whole huge page, so the
   2510      * dst node should already have enough resources to atomically fill in
   2511      * the current missing page.
   2512      *
   2513      * More importantly, when using a separate postcopy channel, we must do
   2514      * an explicit flush or it won't flush until the buffer is full.
   2515      */
   2516     if (migrate_postcopy_preempt() && pss->postcopy_requested) {
   2517         qemu_fflush(rs->f);
   2518     }
   2519 
   2520     res = ram_save_release_protection(rs, pss, start_page);
   2521     return (res < 0 ? res : pages);
   2522 }
   2523 
   2524 /**
   2525  * ram_find_and_save_block: finds a dirty page and sends it to f
   2526  *
   2527  * Called within an RCU critical section.
   2528  *
   2529  * Returns the number of pages written where zero means no dirty pages,
   2530  * or negative on error
   2531  *
   2532  * @rs: current RAM state
   2533  *
   2534  * On systems where host-page-size > target-page-size it will send all the
   2535  * pages in a host page that are dirty.
   2536  */
   2537 static int ram_find_and_save_block(RAMState *rs)
   2538 {
   2539     PageSearchStatus pss;
   2540     int pages = 0;
   2541     bool again, found;
   2542 
   2543     /* No dirty page as there is zero RAM */
   2544     if (!ram_bytes_total()) {
   2545         return pages;
   2546     }
   2547 
   2548     /*
   2549      * Always keep last_seen_block/last_page valid during this procedure,
   2550      * because find_dirty_block() relies on these values (e.g., we compare
   2551      * last_seen_block with pss.block to see whether we searched all the
   2552      * ramblocks) to detect the completion of migration.  Having a NULL value
   2553      * of last_seen_block can conditionally cause the loop below to run forever.
   2554      */
   2555     if (!rs->last_seen_block) {
   2556         rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
   2557         rs->last_page = 0;
   2558     }
   2559 
   2560     pss.block = rs->last_seen_block;
   2561     pss.page = rs->last_page;
   2562     pss.complete_round = false;
   2563 
   2564     do {
   2565         again = true;
   2566         found = get_queued_page(rs, &pss);
   2567 
   2568         if (!found) {
   2569             /*
   2570              * Recover previous precopy ramblock/offset if postcopy has
   2571              * preempted precopy.  Otherwise find the next dirty bit.
   2572              */
   2573             if (postcopy_preempt_triggered(rs)) {
   2574                 postcopy_preempt_restore(rs, &pss, false);
   2575                 found = true;
   2576             } else {
   2577                 /* priority queue empty, so just search for something dirty */
   2578                 found = find_dirty_block(rs, &pss, &again);
   2579             }
   2580         }
   2581 
   2582         if (found) {
   2583             pages = ram_save_host_page(rs, &pss);
   2584         }
   2585     } while (!pages && again);
   2586 
   2587     rs->last_seen_block = pss.block;
   2588     rs->last_page = pss.page;
   2589 
   2590     return pages;
   2591 }
   2592 
   2593 void acct_update_position(QEMUFile *f, size_t size, bool zero)
   2594 {
   2595     uint64_t pages = size / TARGET_PAGE_SIZE;
   2596 
   2597     if (zero) {
   2598         ram_counters.duplicate += pages;
   2599     } else {
   2600         ram_counters.normal += pages;
   2601         ram_transferred_add(size);
   2602         qemu_file_credit_transfer(f, size);
   2603     }
   2604 }
   2605 
   2606 static uint64_t ram_bytes_total_common(bool count_ignored)
   2607 {
   2608     RAMBlock *block;
   2609     uint64_t total = 0;
   2610 
   2611     RCU_READ_LOCK_GUARD();
   2612 
   2613     if (count_ignored) {
   2614         RAMBLOCK_FOREACH_MIGRATABLE(block) {
   2615             total += block->used_length;
   2616         }
   2617     } else {
   2618         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   2619             total += block->used_length;
   2620         }
   2621     }
   2622     return total;
   2623 }
   2624 
   2625 uint64_t ram_bytes_total(void)
   2626 {
   2627     return ram_bytes_total_common(false);
   2628 }
   2629 
   2630 static void xbzrle_load_setup(void)
   2631 {
   2632     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
   2633 }
   2634 
   2635 static void xbzrle_load_cleanup(void)
   2636 {
   2637     g_free(XBZRLE.decoded_buf);
   2638     XBZRLE.decoded_buf = NULL;
   2639 }
   2640 
   2641 static void ram_state_cleanup(RAMState **rsp)
   2642 {
   2643     if (*rsp) {
   2644         migration_page_queue_free(*rsp);
   2645         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
   2646         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
   2647         g_free(*rsp);
   2648         *rsp = NULL;
   2649     }
   2650 }
   2651 
   2652 static void xbzrle_cleanup(void)
   2653 {
   2654     XBZRLE_cache_lock();
   2655     if (XBZRLE.cache) {
   2656         cache_fini(XBZRLE.cache);
   2657         g_free(XBZRLE.encoded_buf);
   2658         g_free(XBZRLE.current_buf);
   2659         g_free(XBZRLE.zero_target_page);
   2660         XBZRLE.cache = NULL;
   2661         XBZRLE.encoded_buf = NULL;
   2662         XBZRLE.current_buf = NULL;
   2663         XBZRLE.zero_target_page = NULL;
   2664     }
   2665     XBZRLE_cache_unlock();
   2666 }
   2667 
   2668 static void ram_save_cleanup(void *opaque)
   2669 {
   2670     RAMState **rsp = opaque;
   2671     RAMBlock *block;
   2672 
   2673     /* We don't use dirty log with background snapshots */
   2674     if (!migrate_background_snapshot()) {
   2675         /* The caller must hold the iothread lock or be in a bh, so there is
   2676          * no write race against the migration bitmap
   2677          */
   2678         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
   2679             /*
   2680              * do not stop dirty log without starting it, since
   2681              * memory_global_dirty_log_stop will assert that
   2682              * memory_global_dirty_log_start/stop are used in pairs
   2683              */
   2684             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
   2685         }
   2686     }
   2687 
   2688     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   2689         g_free(block->clear_bmap);
   2690         block->clear_bmap = NULL;
   2691         g_free(block->bmap);
   2692         block->bmap = NULL;
   2693     }
   2694 
   2695     xbzrle_cleanup();
   2696     compress_threads_save_cleanup();
   2697     ram_state_cleanup(rsp);
   2698 }
   2699 
   2700 static void ram_state_reset(RAMState *rs)
   2701 {
   2702     rs->last_seen_block = NULL;
   2703     rs->last_sent_block = NULL;
   2704     rs->last_page = 0;
   2705     rs->last_version = ram_list.version;
   2706     rs->xbzrle_enabled = false;
   2707     postcopy_preempt_reset(rs);
   2708     rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
   2709 }
   2710 
   2711 #define MAX_WAIT 50 /* ms, half buffered_file limit */
   2712 
   2713 /* **** functions for postcopy ***** */
   2714 
   2715 void ram_postcopy_migrated_memory_release(MigrationState *ms)
   2716 {
   2717     struct RAMBlock *block;
   2718 
   2719     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   2720         unsigned long *bitmap = block->bmap;
   2721         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
   2722         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
   2723 
   2724         while (run_start < range) {
   2725             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
   2726             ram_discard_range(block->idstr,
   2727                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
   2728                               ((ram_addr_t)(run_end - run_start))
   2729                                 << TARGET_PAGE_BITS);
   2730             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
   2731         }
   2732     }
   2733 }
   2734 
   2735 /**
   2736  * postcopy_send_discard_bm_ram: discard a RAMBlock
   2737  *
   2738  * Callback from postcopy_each_ram_send_discard for each RAMBlock
   2739  *
   2740  * @ms: current migration state
   2741  * @block: RAMBlock to discard
   2742  */
   2743 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
   2744 {
   2745     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
   2746     unsigned long current;
   2747     unsigned long *bitmap = block->bmap;
   2748 
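            /*
             * Walk the dirty bitmap as runs of set bits; e.g. a bitmap of
             * 0b00111100 yields one discard range of 4 pages starting at page 2.
             */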
   2749     for (current = 0; current < end; ) {
   2750         unsigned long one = find_next_bit(bitmap, end, current);
   2751         unsigned long zero, discard_length;
   2752 
   2753         if (one >= end) {
   2754             break;
   2755         }
   2756 
   2757         zero = find_next_zero_bit(bitmap, end, one + 1);
   2758 
   2759         if (zero >= end) {
   2760             discard_length = end - one;
   2761         } else {
   2762             discard_length = zero - one;
   2763         }
   2764         postcopy_discard_send_range(ms, one, discard_length);
   2765         current = one + discard_length;
   2766     }
   2767 }
   2768 
   2769 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
   2770 
   2771 /**
   2772  * postcopy_each_ram_send_discard: discard all RAMBlocks
   2773  *
   2774  * Utility for the outgoing postcopy code.
   2775  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
   2776  *   passing it bitmap indexes and name.
   2777  * (qemu_ram_foreach_block ends up passing unscaled lengths
   2778  *  which would mean postcopy code would have to deal with target page)
   2779  *
   2780  * @ms: current migration state
   2781  */
   2782 static void postcopy_each_ram_send_discard(MigrationState *ms)
   2783 {
   2784     struct RAMBlock *block;
   2785 
   2786     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   2787         postcopy_discard_send_init(ms, block->idstr);
   2788 
   2789         /*
   2790          * Deal with TPS != HPS and huge pages.  It discards any partially sent
   2791          * host-page size chunks and marks any partially dirty host-page size
   2792          * chunks as all dirty.  In this case the host-page is the host-page
   2793          * for the particular RAMBlock, i.e. it might be a huge page.
   2794          */
   2795         postcopy_chunk_hostpages_pass(ms, block);
   2796 
   2797         /*
   2798          * Postcopy sends chunks of bitmap over the wire, but it
   2799          * just needs indexes at this point, which avoids it having
   2800          * target page specific code.
   2801          */
   2802         postcopy_send_discard_bm_ram(ms, block);
   2803         postcopy_discard_send_finish(ms);
   2804     }
   2805 }
   2806 
   2807 /**
   2808  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
   2809  *
   2810  * Helper for postcopy_each_ram_send_discard; it canonicalizes the dirty
   2811  * bitmap at host-page granularity: any host page that is partially dirty
   2812  * is marked as fully dirty.
   2813  *
   2814  * Postcopy requires that all target pages in a hostpage are dirty or
   2815  * clean, not a mix.  This function canonicalizes the bitmap accordingly.
   2816  *
   2817  * @ms: current migration state
   2818  * @block: block that contains the page we want to canonicalize
   2819  */
   2820 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
   2821 {
   2822     RAMState *rs = ram_state;
   2823     unsigned long *bitmap = block->bmap;
   2824     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
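            /* e.g. 512 for a 2 MiB huge page block with 4 KiB target pages */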
   2825     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
   2826     unsigned long run_start;
   2827 
   2828     if (block->page_size == TARGET_PAGE_SIZE) {
   2829         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
   2830         return;
   2831     }
   2832 
   2833     /* Find a dirty page */
   2834     run_start = find_next_bit(bitmap, pages, 0);
   2835 
   2836     while (run_start < pages) {
   2837 
   2838         /*
   2839          * If this run starts on a host page boundary, skip to the end of the
   2840          * run; the fixup below dirties the host page around a misaligned boundary.
   2841          */
   2842         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
   2843             /* Find the end of this run */
   2844             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
   2845             /*
   2846              * If the end isn't at the start of a host page, then the
   2847              * run doesn't finish at the end of a host page
   2848              * and we need to discard.
   2849              */
   2850         }
   2851 
   2852         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
   2853             unsigned long page;
   2854             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
   2855                                                              host_ratio);
   2856             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
   2857 
   2858             /* Clean up the bitmap */
   2859             for (page = fixup_start_addr;
   2860                  page < fixup_start_addr + host_ratio; page++) {
   2861                 /*
   2862                  * Remark them as dirty, updating the count for any pages
   2863                  * that weren't previously dirty.
   2864                  */
   2865                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
   2866             }
   2867         }
   2868 
   2869         /* Find the next dirty page for the next iteration */
   2870         run_start = find_next_bit(bitmap, pages, run_start);
   2871     }
   2872 }
   2873 
   2874 /**
   2875  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
   2876  *
   2877  * Transmit the set of pages to be discarded after precopy to the target;
   2878  * these are pages that:
   2879  *     a) Have been previously transmitted but are now dirty again
   2880  *     b) Have never been transmitted; this ensures that
   2881  *        any pages on the destination that have been mapped by background
   2882  *        tasks get discarded (transparent huge pages are the specific concern)
   2883  * Hopefully this is pretty sparse
   2884  *
   2885  * @ms: current migration state
   2886  */
   2887 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
   2888 {
   2889     RAMState *rs = ram_state;
   2890 
   2891     RCU_READ_LOCK_GUARD();
   2892 
   2893     /* This should be our last sync, the src is now paused */
   2894     migration_bitmap_sync(rs);
   2895 
   2896     /* Easiest way to make sure we don't resume in the middle of a host-page */
   2897     rs->last_seen_block = NULL;
   2898     rs->last_sent_block = NULL;
   2899     rs->last_page = 0;
   2900 
   2901     postcopy_each_ram_send_discard(ms);
   2902 
   2903     trace_ram_postcopy_send_discard_bitmap();
   2904 }
   2905 
   2906 /**
   2907  * ram_discard_range: discard dirtied pages at the beginning of postcopy
   2908  *
   2909  * Returns zero on success
   2910  *
   2911  * @rbname: name of the RAMBlock of the request. NULL means the
   2912  *          same as the last one.
   2913  * @start: byte offset within the RAMBlock at which to start discarding
   2914  * @length: number of bytes to discard
   2915  */
   2916 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
   2917 {
   2918     trace_ram_discard_range(rbname, start, length);
   2919 
   2920     RCU_READ_LOCK_GUARD();
   2921     RAMBlock *rb = qemu_ram_block_by_name(rbname);
   2922 
   2923     if (!rb) {
   2924         error_report("ram_discard_range: Failed to find block '%s'", rbname);
   2925         return -1;
   2926     }
   2927 
   2928     /*
   2929      * On source VM, we don't need to update the received bitmap since
   2930      * we don't even have one.
   2931      */
   2932     if (rb->receivedmap) {
   2933         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
   2934                      length >> qemu_target_page_bits());
   2935     }
   2936 
   2937     return ram_block_discard_range(rb, start, length);
   2938 }
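
/*
 * Usage sketch (illustrative only; the block name and sizes are assumptions):
 * discarding the first two 2 MiB host pages of a block named "pc.ram" would
 * look like
 *
 *     ram_discard_range("pc.ram", 0, 2 * 2 * 1024 * 1024);
 *
 * start/length are byte offsets into the RAMBlock, while receivedmap is kept
 * at target-page granularity, hence the >> qemu_target_page_bits()
 * conversion above.
 */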
   2939 
   2940 /*
    2941  * For every allocation, we try not to crash the VM if the
    2942  * allocation fails.
   2943  */
   2944 static int xbzrle_init(void)
   2945 {
   2946     Error *local_err = NULL;
   2947 
   2948     if (!migrate_use_xbzrle()) {
   2949         return 0;
   2950     }
   2951 
   2952     XBZRLE_cache_lock();
   2953 
   2954     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
   2955     if (!XBZRLE.zero_target_page) {
   2956         error_report("%s: Error allocating zero page", __func__);
   2957         goto err_out;
   2958     }
   2959 
   2960     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
   2961                               TARGET_PAGE_SIZE, &local_err);
   2962     if (!XBZRLE.cache) {
   2963         error_report_err(local_err);
   2964         goto free_zero_page;
   2965     }
   2966 
   2967     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
   2968     if (!XBZRLE.encoded_buf) {
   2969         error_report("%s: Error allocating encoded_buf", __func__);
   2970         goto free_cache;
   2971     }
   2972 
   2973     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
   2974     if (!XBZRLE.current_buf) {
   2975         error_report("%s: Error allocating current_buf", __func__);
   2976         goto free_encoded_buf;
   2977     }
   2978 
   2979     /* We are all good */
   2980     XBZRLE_cache_unlock();
   2981     return 0;
   2982 
   2983 free_encoded_buf:
   2984     g_free(XBZRLE.encoded_buf);
   2985     XBZRLE.encoded_buf = NULL;
   2986 free_cache:
   2987     cache_fini(XBZRLE.cache);
   2988     XBZRLE.cache = NULL;
   2989 free_zero_page:
   2990     g_free(XBZRLE.zero_target_page);
   2991     XBZRLE.zero_target_page = NULL;
   2992 err_out:
   2993     XBZRLE_cache_unlock();
   2994     return -ENOMEM;
   2995 }
   2996 
   2997 static int ram_state_init(RAMState **rsp)
   2998 {
   2999     *rsp = g_try_new0(RAMState, 1);
   3000 
   3001     if (!*rsp) {
   3002         error_report("%s: Init ramstate fail", __func__);
   3003         return -1;
   3004     }
   3005 
   3006     qemu_mutex_init(&(*rsp)->bitmap_mutex);
   3007     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
   3008     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
   3009 
   3010     /*
    3011      * Count the total number of pages used by ram blocks, not including any
    3012      * gaps due to alignment or unplug.
    3013      * This must match the initial values of the dirty bitmap.
   3014      */
   3015     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
   3016     ram_state_reset(*rsp);
   3017 
   3018     return 0;
   3019 }
   3020 
   3021 static void ram_list_init_bitmaps(void)
   3022 {
   3023     MigrationState *ms = migrate_get_current();
   3024     RAMBlock *block;
   3025     unsigned long pages;
   3026     uint8_t shift;
   3027 
   3028     /* Skip setting bitmap if there is no RAM */
   3029     if (ram_bytes_total()) {
   3030         shift = ms->clear_bitmap_shift;
   3031         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
   3032             error_report("clear_bitmap_shift (%u) too big, using "
   3033                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
   3034             shift = CLEAR_BITMAP_SHIFT_MAX;
   3035         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
   3036             error_report("clear_bitmap_shift (%u) too small, using "
   3037                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
   3038             shift = CLEAR_BITMAP_SHIFT_MIN;
   3039         }
   3040 
   3041         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   3042             pages = block->max_length >> TARGET_PAGE_BITS;
   3043             /*
    3044              * The initial dirty bitmap for migration must be set with all
    3045              * ones to make sure we'll migrate every guest RAM page to the
    3046              * destination.
    3047              * Here we set RAMBlock.bmap all to 1 because when restarting a
    3048              * new migration after a failed one, ram_list.
    3049              * dirty_memory[DIRTY_MEMORY_MIGRATION] does not cover the whole
    3050              * guest memory.
   3051              */
   3052             block->bmap = bitmap_new(pages);
   3053             bitmap_set(block->bmap, 0, pages);
   3054             block->clear_bmap_shift = shift;
   3055             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
   3056         }
   3057     }
   3058 }
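
/*
 * Illustrative note on clear_bmap granularity: each clear_bmap bit covers
 * (1 << shift) target pages, so clear_bmap_size(pages, shift) is effectively
 * DIV_ROUND_UP(pages, 1UL << shift) bits.  Assuming 4 KiB target pages and
 * the default shift of 18, one clear_bmap bit tracks
 *
 *     (1 << 18) * 4 KiB = 1 GiB
 *
 * of guest memory.
 */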
   3059 
   3060 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
   3061 {
   3062     unsigned long pages;
   3063     RAMBlock *rb;
   3064 
   3065     RCU_READ_LOCK_GUARD();
   3066 
   3067     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
    3068         pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
    3069         rs->migration_dirty_pages -= pages;
   3070     }
   3071 }
   3072 
   3073 static void ram_init_bitmaps(RAMState *rs)
   3074 {
   3075     /* For memory_global_dirty_log_start below.  */
   3076     qemu_mutex_lock_iothread();
   3077     qemu_mutex_lock_ramlist();
   3078 
   3079     WITH_RCU_READ_LOCK_GUARD() {
   3080         ram_list_init_bitmaps();
   3081         /* We don't use dirty log with background snapshots */
   3082         if (!migrate_background_snapshot()) {
   3083             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
   3084             migration_bitmap_sync_precopy(rs);
   3085         }
   3086     }
   3087     qemu_mutex_unlock_ramlist();
   3088     qemu_mutex_unlock_iothread();
   3089 
   3090     /*
    3091      * After a possible first bitmap sync, fix up the initial bitmap,
    3092      * which contains all 1s, to exclude any discarded pages from migration.
   3093      */
   3094     migration_bitmap_clear_discarded_pages(rs);
   3095 }
   3096 
   3097 static int ram_init_all(RAMState **rsp)
   3098 {
   3099     if (ram_state_init(rsp)) {
   3100         return -1;
   3101     }
   3102 
   3103     if (xbzrle_init()) {
   3104         ram_state_cleanup(rsp);
   3105         return -1;
   3106     }
   3107 
   3108     ram_init_bitmaps(*rsp);
   3109 
   3110     return 0;
   3111 }
   3112 
   3113 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
   3114 {
   3115     RAMBlock *block;
   3116     uint64_t pages = 0;
   3117 
   3118     /*
    3119      * Postcopy is not using xbzrle/compression, so no need for that.
    3120      * Also, since the source is already halted, we don't need to care
    3121      * about dirty page logging either.
   3122      */
   3123 
   3124     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   3125         pages += bitmap_count_one(block->bmap,
   3126                                   block->used_length >> TARGET_PAGE_BITS);
   3127     }
   3128 
   3129     /* This may not be aligned with current bitmaps. Recalculate. */
   3130     rs->migration_dirty_pages = pages;
   3131 
   3132     ram_state_reset(rs);
   3133 
   3134     /* Update RAMState cache of output QEMUFile */
   3135     rs->f = out;
   3136 
   3137     trace_ram_state_resume_prepare(pages);
   3138 }
   3139 
   3140 /*
   3141  * This function clears bits of the free pages reported by the caller from the
   3142  * migration dirty bitmap. @addr is the host address corresponding to the
   3143  * start of the continuous guest free pages, and @len is the total bytes of
   3144  * those pages.
   3145  */
   3146 void qemu_guest_free_page_hint(void *addr, size_t len)
   3147 {
   3148     RAMBlock *block;
   3149     ram_addr_t offset;
   3150     size_t used_len, start, npages;
   3151     MigrationState *s = migrate_get_current();
   3152 
   3153     /* This function is currently expected to be used during live migration */
   3154     if (!migration_is_setup_or_active(s->state)) {
   3155         return;
   3156     }
   3157 
   3158     for (; len > 0; len -= used_len, addr += used_len) {
   3159         block = qemu_ram_block_from_host(addr, false, &offset);
   3160         if (unlikely(!block || offset >= block->used_length)) {
   3161             /*
   3162              * The implementation might not support RAMBlock resize during
   3163              * live migration, but it could happen in theory with future
   3164              * updates. So we add a check here to capture that case.
   3165              */
   3166             error_report_once("%s unexpected error", __func__);
   3167             return;
   3168         }
   3169 
   3170         if (len <= block->used_length - offset) {
   3171             used_len = len;
   3172         } else {
   3173             used_len = block->used_length - offset;
   3174         }
   3175 
   3176         start = offset >> TARGET_PAGE_BITS;
   3177         npages = used_len >> TARGET_PAGE_BITS;
   3178 
   3179         qemu_mutex_lock(&ram_state->bitmap_mutex);
   3180         /*
    3181          * The skipped free pages are equivalent to having been sent from
    3182          * clear_bmap's perspective, so clear the bits from the memory region
    3183          * bitmap which are initially set. Otherwise those skipped pages will
    3184          * be sent in the next round after syncing from the memory region bitmap.
   3185          */
   3186         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
   3187         ram_state->migration_dirty_pages -=
   3188                       bitmap_count_one_with_offset(block->bmap, start, npages);
   3189         bitmap_clear(block->bmap, start, npages);
   3190         qemu_mutex_unlock(&ram_state->bitmap_mutex);
   3191     }
   3192 }
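
/*
 * Worked example (the values are assumptions for illustration): a free-page
 * hint of len = 2 MiB whose host address maps to offset 0x400000 inside a
 * RAMBlock clears, with 4 KiB target pages,
 *
 *     start  = 0x400000 >> 12 = 1024
 *     npages = 0x200000 >> 12 = 512
 *
 * i.e. bitmap bits [1024, 1536) in block->bmap, and migration_dirty_pages
 * is reduced by however many of those bits were still set.
 */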
   3193 
   3194 /*
    3195  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
    3196  * long-running RCU critical section.  When RCU reclaims in the code
    3197  * start to become numerous, it will be necessary to reduce the
   3198  * granularity of these critical sections.
   3199  */
   3200 
   3201 /**
   3202  * ram_save_setup: Setup RAM for migration
   3203  *
   3204  * Returns zero to indicate success and negative for error
   3205  *
   3206  * @f: QEMUFile where to send the data
   3207  * @opaque: RAMState pointer
   3208  */
   3209 static int ram_save_setup(QEMUFile *f, void *opaque)
   3210 {
   3211     RAMState **rsp = opaque;
   3212     RAMBlock *block;
   3213     int ret;
   3214 
   3215     if (compress_threads_save_setup()) {
   3216         return -1;
   3217     }
   3218 
   3219     /* migration has already setup the bitmap, reuse it. */
   3220     if (!migration_in_colo_state()) {
   3221         if (ram_init_all(rsp) != 0) {
   3222             compress_threads_save_cleanup();
   3223             return -1;
   3224         }
   3225     }
   3226     (*rsp)->f = f;
   3227 
   3228     WITH_RCU_READ_LOCK_GUARD() {
   3229         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
   3230 
   3231         RAMBLOCK_FOREACH_MIGRATABLE(block) {
   3232             qemu_put_byte(f, strlen(block->idstr));
   3233             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
   3234             qemu_put_be64(f, block->used_length);
   3235             if (migrate_postcopy_ram() && block->page_size !=
   3236                                           qemu_host_page_size) {
   3237                 qemu_put_be64(f, block->page_size);
   3238             }
   3239             if (migrate_ignore_shared()) {
   3240                 qemu_put_be64(f, block->mr->addr);
   3241             }
   3242         }
   3243     }
   3244 
   3245     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
   3246     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
   3247 
    3248     ret = multifd_send_sync_main(f);
   3249     if (ret < 0) {
   3250         return ret;
   3251     }
   3252 
   3253     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
   3254     qemu_fflush(f);
   3255 
   3256     return 0;
   3257 }
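
/*
 * Rough sketch of the setup stream emitted above (derived from the code, not
 * a normative format description):
 *
 *     be64  ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE
 *     for each migratable RAMBlock:
 *         u8    strlen(idstr)
 *         bytes idstr
 *         be64  used_length
 *         be64  page_size   (only if postcopy-ram and page_size != host size)
 *         be64  mr->addr    (only if the ignore-shared capability is on)
 *     be64  RAM_SAVE_FLAG_EOS
 *
 * The RAM_SAVE_FLAG_MEM_SIZE case in ram_load_precopy() below parses this.
 */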
   3258 
   3259 /**
   3260  * ram_save_iterate: iterative stage for migration
   3261  *
   3262  * Returns zero to indicate success and negative for error
   3263  *
   3264  * @f: QEMUFile where to send the data
   3265  * @opaque: RAMState pointer
   3266  */
   3267 static int ram_save_iterate(QEMUFile *f, void *opaque)
   3268 {
   3269     RAMState **temp = opaque;
   3270     RAMState *rs = *temp;
   3271     int ret = 0;
   3272     int i;
   3273     int64_t t0;
   3274     int done = 0;
   3275 
   3276     if (blk_mig_bulk_active()) {
   3277         /* Avoid transferring ram during bulk phase of block migration as
   3278          * the bulk phase will usually take a long time and transferring
   3279          * ram updates during that time is pointless. */
   3280         goto out;
   3281     }
   3282 
   3283     /*
    3284      * We'll hold this lock for a little while, but that's okay for two reasons.
    3285      * Firstly, the only other thread that may take it is the one calling
    3286      * qemu_guest_free_page_hint(), which should be rare; secondly, see
    3287      * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
    3288      * guarantees that we release it on a regular basis.
   3289      */
   3290     qemu_mutex_lock(&rs->bitmap_mutex);
   3291     WITH_RCU_READ_LOCK_GUARD() {
   3292         if (ram_list.version != rs->last_version) {
   3293             ram_state_reset(rs);
   3294         }
   3295 
   3296         /* Read version before ram_list.blocks */
   3297         smp_rmb();
   3298 
   3299         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
   3300 
   3301         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
   3302         i = 0;
   3303         while ((ret = qemu_file_rate_limit(f)) == 0 ||
   3304                postcopy_has_request(rs)) {
   3305             int pages;
   3306 
   3307             if (qemu_file_get_error(f)) {
   3308                 break;
   3309             }
   3310 
   3311             pages = ram_find_and_save_block(rs);
    3312             /* no more pages to send */
   3313             if (pages == 0) {
   3314                 done = 1;
   3315                 break;
   3316             }
   3317 
   3318             if (pages < 0) {
   3319                 qemu_file_set_error(f, pages);
   3320                 break;
   3321             }
   3322 
   3323             rs->target_page_count += pages;
   3324 
   3325             /*
   3326              * During postcopy, it is necessary to make sure one whole host
   3327              * page is sent in one chunk.
   3328              */
   3329             if (migrate_postcopy_ram()) {
   3330                 flush_compressed_data(rs);
   3331             }
   3332 
   3333             /*
    3334              * We want to check in the first iteration, just in case it was
    3335              * the first time and we had to sync the dirty bitmap.
    3336              * qemu_clock_get_ns() is a bit expensive, so we only check every
    3337              * few iterations.
   3338              */
   3339             if ((i & 63) == 0) {
   3340                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
   3341                               1000000;
   3342                 if (t1 > MAX_WAIT) {
   3343                     trace_ram_save_iterate_big_wait(t1, i);
   3344                     break;
   3345                 }
   3346             }
   3347             i++;
   3348         }
   3349     }
   3350     qemu_mutex_unlock(&rs->bitmap_mutex);
   3351 
   3352     postcopy_preempt_reset_channel(rs);
   3353 
   3354     /*
   3355      * Must occur before EOS (or any QEMUFile operation)
   3356      * because of RDMA protocol.
   3357      */
   3358     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
   3359 
   3360 out:
   3361     if (ret >= 0
   3362         && migration_is_setup_or_active(migrate_get_current()->state)) {
   3363         ret = multifd_send_sync_main(rs->f);
   3364         if (ret < 0) {
   3365             return ret;
   3366         }
   3367 
   3368         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
   3369         qemu_fflush(f);
   3370         ram_transferred_add(8);
   3371 
   3372         ret = qemu_file_get_error(f);
   3373     }
   3374     if (ret < 0) {
   3375         return ret;
   3376     }
   3377 
   3378     return done;
   3379 }
   3380 
   3381 /**
   3382  * ram_save_complete: function called to send the remaining amount of ram
   3383  *
   3384  * Returns zero to indicate success or negative on error
   3385  *
   3386  * Called with iothread lock
   3387  *
   3388  * @f: QEMUFile where to send the data
   3389  * @opaque: RAMState pointer
   3390  */
   3391 static int ram_save_complete(QEMUFile *f, void *opaque)
   3392 {
   3393     RAMState **temp = opaque;
   3394     RAMState *rs = *temp;
   3395     int ret = 0;
   3396 
   3397     rs->last_stage = !migration_in_colo_state();
   3398 
   3399     WITH_RCU_READ_LOCK_GUARD() {
   3400         if (!migration_in_postcopy()) {
   3401             migration_bitmap_sync_precopy(rs);
   3402         }
   3403 
   3404         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
   3405 
   3406         /* try transferring iterative blocks of memory */
   3407 
   3408         /* flush all remaining blocks regardless of rate limiting */
   3409         while (true) {
   3410             int pages;
   3411 
   3412             pages = ram_find_and_save_block(rs);
    3413             /* no more blocks to send */
   3414             if (pages == 0) {
   3415                 break;
   3416             }
   3417             if (pages < 0) {
   3418                 ret = pages;
   3419                 break;
   3420             }
   3421         }
   3422 
   3423         flush_compressed_data(rs);
   3424         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
   3425     }
   3426 
   3427     if (ret < 0) {
   3428         return ret;
   3429     }
   3430 
   3431     postcopy_preempt_reset_channel(rs);
   3432 
   3433     ret = multifd_send_sync_main(rs->f);
   3434     if (ret < 0) {
   3435         return ret;
   3436     }
   3437 
   3438     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
   3439     qemu_fflush(f);
   3440 
   3441     return 0;
   3442 }
   3443 
   3444 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
   3445                              uint64_t *res_precopy_only,
   3446                              uint64_t *res_compatible,
   3447                              uint64_t *res_postcopy_only)
   3448 {
   3449     RAMState **temp = opaque;
   3450     RAMState *rs = *temp;
   3451     uint64_t remaining_size;
   3452 
   3453     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
   3454 
   3455     if (!migration_in_postcopy() &&
   3456         remaining_size < max_size) {
   3457         qemu_mutex_lock_iothread();
   3458         WITH_RCU_READ_LOCK_GUARD() {
   3459             migration_bitmap_sync_precopy(rs);
   3460         }
   3461         qemu_mutex_unlock_iothread();
   3462         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
   3463     }
   3464 
   3465     if (migrate_postcopy_ram()) {
   3466         /* We can do postcopy, and all the data is postcopiable */
   3467         *res_compatible += remaining_size;
   3468     } else {
   3469         *res_precopy_only += remaining_size;
   3470     }
   3471 }
   3472 
   3473 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
   3474 {
   3475     unsigned int xh_len;
   3476     int xh_flags;
   3477     uint8_t *loaded_data;
   3478 
   3479     /* extract RLE header */
   3480     xh_flags = qemu_get_byte(f);
   3481     xh_len = qemu_get_be16(f);
   3482 
   3483     if (xh_flags != ENCODING_FLAG_XBZRLE) {
   3484         error_report("Failed to load XBZRLE page - wrong compression!");
   3485         return -1;
   3486     }
   3487 
   3488     if (xh_len > TARGET_PAGE_SIZE) {
   3489         error_report("Failed to load XBZRLE page - len overflow!");
   3490         return -1;
   3491     }
   3492     loaded_data = XBZRLE.decoded_buf;
   3493     /* load data and decode */
   3494     /* it can change loaded_data to point to an internal buffer */
   3495     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
   3496 
   3497     /* decode RLE */
   3498     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
   3499                              TARGET_PAGE_SIZE) == -1) {
   3500         error_report("Failed to load XBZRLE page - decode error!");
   3501         return -1;
   3502     }
   3503 
   3504     return 0;
   3505 }
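
/*
 * Sketch of the XBZRLE page record parsed above (derived from the code):
 *
 *     u8    ENCODING_FLAG_XBZRLE
 *     be16  encoded length, at most TARGET_PAGE_SIZE
 *     bytes xbzrle-encoded delta applied on top of the current page content
 *
 * The sending side produces the matching record from its page cache; here we
 * only validate the header and let xbzrle_decode_buffer() patch @host in
 * place.
 */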
   3506 
   3507 /**
   3508  * ram_block_from_stream: read a RAMBlock id from the migration stream
   3509  *
   3510  * Must be called from within a rcu critical section.
   3511  *
   3512  * Returns a pointer from within the RCU-protected ram_list.
   3513  *
   3514  * @mis: the migration incoming state pointer
   3515  * @f: QEMUFile where to read the data from
   3516  * @flags: Page flags (mostly to see if it's a continuation of previous block)
   3517  * @channel: the channel we're using
   3518  */
   3519 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
   3520                                               QEMUFile *f, int flags,
   3521                                               int channel)
   3522 {
   3523     RAMBlock *block = mis->last_recv_block[channel];
   3524     char id[256];
   3525     uint8_t len;
   3526 
   3527     if (flags & RAM_SAVE_FLAG_CONTINUE) {
   3528         if (!block) {
   3529             error_report("Ack, bad migration stream!");
   3530             return NULL;
   3531         }
   3532         return block;
   3533     }
   3534 
   3535     len = qemu_get_byte(f);
   3536     qemu_get_buffer(f, (uint8_t *)id, len);
   3537     id[len] = 0;
   3538 
   3539     block = qemu_ram_block_by_name(id);
   3540     if (!block) {
   3541         error_report("Can't find block %s", id);
   3542         return NULL;
   3543     }
   3544 
   3545     if (ramblock_is_ignored(block)) {
   3546         error_report("block %s should not be migrated !", id);
   3547         return NULL;
   3548     }
   3549 
   3550     mis->last_recv_block[channel] = block;
   3551 
   3552     return block;
   3553 }
   3554 
   3555 static inline void *host_from_ram_block_offset(RAMBlock *block,
   3556                                                ram_addr_t offset)
   3557 {
   3558     if (!offset_in_ramblock(block, offset)) {
   3559         return NULL;
   3560     }
   3561 
   3562     return block->host + offset;
   3563 }
   3564 
   3565 static void *host_page_from_ram_block_offset(RAMBlock *block,
   3566                                              ram_addr_t offset)
   3567 {
   3568     /* Note: Explicitly no check against offset_in_ramblock(). */
   3569     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
   3570                                    block->page_size);
   3571 }
   3572 
   3573 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
   3574                                                          ram_addr_t offset)
   3575 {
   3576     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
   3577 }
   3578 
   3579 static inline void *colo_cache_from_block_offset(RAMBlock *block,
   3580                              ram_addr_t offset, bool record_bitmap)
   3581 {
   3582     if (!offset_in_ramblock(block, offset)) {
   3583         return NULL;
   3584     }
   3585     if (!block->colo_cache) {
   3586         error_report("%s: colo_cache is NULL in block :%s",
   3587                      __func__, block->idstr);
   3588         return NULL;
   3589     }
   3590 
   3591     /*
    3592      * During a COLO checkpoint, we need the bitmap of these migrated pages.
    3593      * It helps us decide which pages in the RAM cache should be flushed
    3594      * into the VM's RAM later.
    3595      */
   3596     if (record_bitmap &&
   3597         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
   3598         ram_state->migration_dirty_pages++;
   3599     }
   3600     return block->colo_cache + offset;
   3601 }
   3602 
   3603 /**
   3604  * ram_handle_compressed: handle the zero page case
   3605  *
   3606  * If a page (or a whole RDMA chunk) has been
   3607  * determined to be zero, then zap it.
   3608  *
   3609  * @host: host address for the zero page
   3610  * @ch: what the page is filled from.  We only support zero
   3611  * @size: size of the zero page
   3612  */
   3613 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
   3614 {
   3615     if (ch != 0 || !buffer_is_zero(host, size)) {
   3616         memset(host, ch, size);
   3617     }
   3618 }
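
/*
 * Note (added for clarity): the buffer_is_zero() check means a zero page
 * that is already zero on the destination is left untouched, so the write
 * does not needlessly allocate or dirty it.  For example,
 *
 *     ram_handle_compressed(host, 0, TARGET_PAGE_SIZE);
 *
 * is a no-op for a still-untouched anonymous page.
 */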
   3619 
   3620 /* return the size after decompression, or negative value on error */
   3621 static int
   3622 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
   3623                      const uint8_t *source, size_t source_len)
   3624 {
   3625     int err;
   3626 
   3627     err = inflateReset(stream);
   3628     if (err != Z_OK) {
   3629         return -1;
   3630     }
   3631 
   3632     stream->avail_in = source_len;
   3633     stream->next_in = (uint8_t *)source;
   3634     stream->avail_out = dest_len;
   3635     stream->next_out = dest;
   3636 
   3637     err = inflate(stream, Z_NO_FLUSH);
   3638     if (err != Z_STREAM_END) {
   3639         return -1;
   3640     }
   3641 
   3642     return stream->total_out;
   3643 }
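
/*
 * For reference, a minimal sketch of the matching compression step.  This is
 * an illustration only (the real sending side lives earlier in this file and
 * goes through qemu_put_compression_data()); @stream is assumed to have been
 * set up with deflateInit() already.
 */
#if 0
static ssize_t example_compress_page(z_stream *stream, uint8_t *dest,
                                     size_t dest_len, const uint8_t *source,
                                     size_t source_len)
{
    if (deflateReset(stream) != Z_OK) {
        return -1;
    }

    stream->avail_in = source_len;
    stream->next_in = (uint8_t *)source;
    stream->avail_out = dest_len;
    stream->next_out = dest;

    /* Z_FINISH: emit a complete, self-contained deflate stream per page */
    if (deflate(stream, Z_FINISH) != Z_STREAM_END) {
        return -1;
    }
    return stream->total_out;
}
#endif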
   3644 
   3645 static void *do_data_decompress(void *opaque)
   3646 {
   3647     DecompressParam *param = opaque;
   3648     unsigned long pagesize;
   3649     uint8_t *des;
   3650     int len, ret;
   3651 
   3652     qemu_mutex_lock(&param->mutex);
   3653     while (!param->quit) {
   3654         if (param->des) {
   3655             des = param->des;
   3656             len = param->len;
   3657             param->des = 0;
   3658             qemu_mutex_unlock(&param->mutex);
   3659 
   3660             pagesize = TARGET_PAGE_SIZE;
   3661 
   3662             ret = qemu_uncompress_data(&param->stream, des, pagesize,
   3663                                        param->compbuf, len);
   3664             if (ret < 0 && migrate_get_current()->decompress_error_check) {
   3665                 error_report("decompress data failed");
   3666                 qemu_file_set_error(decomp_file, ret);
   3667             }
   3668 
   3669             qemu_mutex_lock(&decomp_done_lock);
   3670             param->done = true;
   3671             qemu_cond_signal(&decomp_done_cond);
   3672             qemu_mutex_unlock(&decomp_done_lock);
   3673 
   3674             qemu_mutex_lock(&param->mutex);
   3675         } else {
   3676             qemu_cond_wait(&param->cond, &param->mutex);
   3677         }
   3678     }
   3679     qemu_mutex_unlock(&param->mutex);
   3680 
   3681     return NULL;
   3682 }
   3683 
   3684 static int wait_for_decompress_done(void)
   3685 {
   3686     int idx, thread_count;
   3687 
   3688     if (!migrate_use_compression()) {
   3689         return 0;
   3690     }
   3691 
   3692     thread_count = migrate_decompress_threads();
   3693     qemu_mutex_lock(&decomp_done_lock);
   3694     for (idx = 0; idx < thread_count; idx++) {
   3695         while (!decomp_param[idx].done) {
   3696             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
   3697         }
   3698     }
   3699     qemu_mutex_unlock(&decomp_done_lock);
   3700     return qemu_file_get_error(decomp_file);
   3701 }
   3702 
   3703 static void compress_threads_load_cleanup(void)
   3704 {
   3705     int i, thread_count;
   3706 
   3707     if (!migrate_use_compression()) {
   3708         return;
   3709     }
   3710     thread_count = migrate_decompress_threads();
   3711     for (i = 0; i < thread_count; i++) {
   3712         /*
    3713          * We use it as an indicator of whether the thread has been
    3714          * properly initialized or not.
   3715          */
   3716         if (!decomp_param[i].compbuf) {
   3717             break;
   3718         }
   3719 
   3720         qemu_mutex_lock(&decomp_param[i].mutex);
   3721         decomp_param[i].quit = true;
   3722         qemu_cond_signal(&decomp_param[i].cond);
   3723         qemu_mutex_unlock(&decomp_param[i].mutex);
   3724     }
   3725     for (i = 0; i < thread_count; i++) {
   3726         if (!decomp_param[i].compbuf) {
   3727             break;
   3728         }
   3729 
   3730         qemu_thread_join(decompress_threads + i);
   3731         qemu_mutex_destroy(&decomp_param[i].mutex);
   3732         qemu_cond_destroy(&decomp_param[i].cond);
   3733         inflateEnd(&decomp_param[i].stream);
   3734         g_free(decomp_param[i].compbuf);
   3735         decomp_param[i].compbuf = NULL;
   3736     }
   3737     g_free(decompress_threads);
   3738     g_free(decomp_param);
   3739     decompress_threads = NULL;
   3740     decomp_param = NULL;
   3741     decomp_file = NULL;
   3742 }
   3743 
   3744 static int compress_threads_load_setup(QEMUFile *f)
   3745 {
   3746     int i, thread_count;
   3747 
   3748     if (!migrate_use_compression()) {
   3749         return 0;
   3750     }
   3751 
   3752     thread_count = migrate_decompress_threads();
   3753     decompress_threads = g_new0(QemuThread, thread_count);
   3754     decomp_param = g_new0(DecompressParam, thread_count);
   3755     qemu_mutex_init(&decomp_done_lock);
   3756     qemu_cond_init(&decomp_done_cond);
   3757     decomp_file = f;
   3758     for (i = 0; i < thread_count; i++) {
   3759         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
   3760             goto exit;
   3761         }
   3762 
   3763         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
   3764         qemu_mutex_init(&decomp_param[i].mutex);
   3765         qemu_cond_init(&decomp_param[i].cond);
   3766         decomp_param[i].done = true;
   3767         decomp_param[i].quit = false;
   3768         qemu_thread_create(decompress_threads + i, "decompress",
   3769                            do_data_decompress, decomp_param + i,
   3770                            QEMU_THREAD_JOINABLE);
   3771     }
   3772     return 0;
   3773 exit:
   3774     compress_threads_load_cleanup();
   3775     return -1;
   3776 }
   3777 
   3778 static void decompress_data_with_multi_threads(QEMUFile *f,
   3779                                                void *host, int len)
   3780 {
   3781     int idx, thread_count;
   3782 
   3783     thread_count = migrate_decompress_threads();
   3784     QEMU_LOCK_GUARD(&decomp_done_lock);
   3785     while (true) {
   3786         for (idx = 0; idx < thread_count; idx++) {
   3787             if (decomp_param[idx].done) {
   3788                 decomp_param[idx].done = false;
   3789                 qemu_mutex_lock(&decomp_param[idx].mutex);
   3790                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
   3791                 decomp_param[idx].des = host;
   3792                 decomp_param[idx].len = len;
   3793                 qemu_cond_signal(&decomp_param[idx].cond);
   3794                 qemu_mutex_unlock(&decomp_param[idx].mutex);
   3795                 break;
   3796             }
   3797         }
   3798         if (idx < thread_count) {
   3799             break;
   3800         } else {
   3801             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
   3802         }
   3803     }
   3804 }
   3805 
   3806 static void colo_init_ram_state(void)
   3807 {
   3808     ram_state_init(&ram_state);
   3809 }
   3810 
   3811 /*
    3812  * COLO cache: this is for the secondary VM. We cache the whole
    3813  * memory of the secondary VM; the global lock needs to be held
    3814  * to call this helper.
   3815  */
   3816 int colo_init_ram_cache(void)
   3817 {
   3818     RAMBlock *block;
   3819 
   3820     WITH_RCU_READ_LOCK_GUARD() {
   3821         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   3822             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
   3823                                                     NULL, false, false);
   3824             if (!block->colo_cache) {
   3825                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
   3826                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
   3827                              block->used_length);
   3828                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   3829                     if (block->colo_cache) {
   3830                         qemu_anon_ram_free(block->colo_cache, block->used_length);
   3831                         block->colo_cache = NULL;
   3832                     }
   3833                 }
   3834                 return -errno;
   3835             }
   3836             if (!machine_dump_guest_core(current_machine)) {
   3837                 qemu_madvise(block->colo_cache, block->used_length,
   3838                              QEMU_MADV_DONTDUMP);
   3839             }
   3840         }
   3841     }
   3842 
   3843     /*
    3844      * Record the dirty pages sent by the PVM; we use this dirty bitmap to
    3845      * decide which pages in the cache should be flushed into the SVM's RAM.
    3846      * Here we use the same name 'ram_bitmap' as for migration.
    3847      */
   3848     if (ram_bytes_total()) {
   3849         RAMBlock *block;
   3850 
   3851         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   3852             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
   3853             block->bmap = bitmap_new(pages);
   3854         }
   3855     }
   3856 
   3857     colo_init_ram_state();
   3858     return 0;
   3859 }
   3860 
   3861 /* TODO: duplicated with ram_init_bitmaps */
   3862 void colo_incoming_start_dirty_log(void)
   3863 {
   3864     RAMBlock *block = NULL;
   3865     /* For memory_global_dirty_log_start below. */
   3866     qemu_mutex_lock_iothread();
   3867     qemu_mutex_lock_ramlist();
   3868 
   3869     memory_global_dirty_log_sync();
   3870     WITH_RCU_READ_LOCK_GUARD() {
   3871         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   3872             ramblock_sync_dirty_bitmap(ram_state, block);
   3873             /* Discard this dirty bitmap record */
   3874             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
   3875         }
   3876         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
   3877     }
   3878     ram_state->migration_dirty_pages = 0;
   3879     qemu_mutex_unlock_ramlist();
   3880     qemu_mutex_unlock_iothread();
   3881 }
   3882 
    3883 /* The global lock needs to be held to call this helper */
   3884 void colo_release_ram_cache(void)
   3885 {
   3886     RAMBlock *block;
   3887 
   3888     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
   3889     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   3890         g_free(block->bmap);
   3891         block->bmap = NULL;
   3892     }
   3893 
   3894     WITH_RCU_READ_LOCK_GUARD() {
   3895         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   3896             if (block->colo_cache) {
   3897                 qemu_anon_ram_free(block->colo_cache, block->used_length);
   3898                 block->colo_cache = NULL;
   3899             }
   3900         }
   3901     }
   3902     ram_state_cleanup(&ram_state);
   3903 }
   3904 
   3905 /**
   3906  * ram_load_setup: Setup RAM for migration incoming side
   3907  *
   3908  * Returns zero to indicate success and negative for error
   3909  *
   3910  * @f: QEMUFile where to receive the data
   3911  * @opaque: RAMState pointer
   3912  */
   3913 static int ram_load_setup(QEMUFile *f, void *opaque)
   3914 {
   3915     if (compress_threads_load_setup(f)) {
   3916         return -1;
   3917     }
   3918 
   3919     xbzrle_load_setup();
   3920     ramblock_recv_map_init();
   3921 
   3922     return 0;
   3923 }
   3924 
   3925 static int ram_load_cleanup(void *opaque)
   3926 {
   3927     RAMBlock *rb;
   3928 
   3929     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
   3930         qemu_ram_block_writeback(rb);
   3931     }
   3932 
   3933     xbzrle_load_cleanup();
   3934     compress_threads_load_cleanup();
   3935 
   3936     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
   3937         g_free(rb->receivedmap);
   3938         rb->receivedmap = NULL;
   3939     }
   3940 
   3941     return 0;
   3942 }
   3943 
   3944 /**
   3945  * ram_postcopy_incoming_init: allocate postcopy data structures
   3946  *
   3947  * Returns 0 for success and negative if there was one error
   3948  *
   3949  * @mis: current migration incoming state
   3950  *
    3951  * Allocate data structures etc. needed by incoming migration with
    3952  * postcopy-ram. postcopy-ram's similarly named
    3953  * postcopy_ram_incoming_init() does the work.
   3954  */
   3955 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
   3956 {
   3957     return postcopy_ram_incoming_init(mis);
   3958 }
   3959 
   3960 /**
   3961  * ram_load_postcopy: load a page in postcopy case
   3962  *
   3963  * Returns 0 for success or -errno in case of error
   3964  *
   3965  * Called in postcopy mode by ram_load().
   3966  * rcu_read_lock is taken prior to this being called.
   3967  *
   3968  * @f: QEMUFile where to send the data
   3969  * @channel: the channel to use for loading
   3970  */
   3971 int ram_load_postcopy(QEMUFile *f, int channel)
   3972 {
   3973     int flags = 0, ret = 0;
   3974     bool place_needed = false;
   3975     bool matches_target_page_size = false;
   3976     MigrationIncomingState *mis = migration_incoming_get_current();
   3977     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
   3978 
   3979     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
   3980         ram_addr_t addr;
   3981         void *page_buffer = NULL;
   3982         void *place_source = NULL;
   3983         RAMBlock *block = NULL;
   3984         uint8_t ch;
   3985         int len;
   3986 
   3987         addr = qemu_get_be64(f);
   3988 
   3989         /*
    3990          * If there is a QEMU file error, we should stop here, since "addr"
    3991          * may then be invalid.
   3992          */
   3993         ret = qemu_file_get_error(f);
   3994         if (ret) {
   3995             break;
   3996         }
   3997 
   3998         flags = addr & ~TARGET_PAGE_MASK;
   3999         addr &= TARGET_PAGE_MASK;
   4000 
   4001         trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
   4002         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
   4003                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
   4004             block = ram_block_from_stream(mis, f, flags, channel);
   4005             if (!block) {
   4006                 ret = -EINVAL;
   4007                 break;
   4008             }
   4009 
   4010             /*
   4011              * Relying on used_length is racy and can result in false positives.
   4012              * We might place pages beyond used_length in case RAM was shrunk
   4013              * while in postcopy, which is fine - trying to place via
   4014              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
   4015              */
   4016             if (!block->host || addr >= block->postcopy_length) {
   4017                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
   4018                 ret = -EINVAL;
   4019                 break;
   4020             }
   4021             tmp_page->target_pages++;
   4022             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
   4023             /*
   4024              * Postcopy requires that we place whole host pages atomically;
   4025              * these may be huge pages for RAMBlocks that are backed by
   4026              * hugetlbfs.
   4027              * To make it atomic, the data is read into a temporary page
   4028              * that's moved into place later.
    4029              * The migration protocol uses (possibly smaller) target pages;
    4030              * however, the source ensures it always sends all the components
   4031              * of a host page in one chunk.
   4032              */
   4033             page_buffer = tmp_page->tmp_huge_page +
   4034                           host_page_offset_from_ram_block_offset(block, addr);
    4035             /* If all target pages are zero then we can optimise the placement */
   4036             if (tmp_page->target_pages == 1) {
   4037                 tmp_page->host_addr =
   4038                     host_page_from_ram_block_offset(block, addr);
   4039             } else if (tmp_page->host_addr !=
   4040                        host_page_from_ram_block_offset(block, addr)) {
   4041                 /* not the 1st TP within the HP */
   4042                 error_report("Non-same host page detected on channel %d: "
   4043                              "Target host page %p, received host page %p "
   4044                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
   4045                              channel, tmp_page->host_addr,
   4046                              host_page_from_ram_block_offset(block, addr),
   4047                              block->idstr, addr, tmp_page->target_pages);
   4048                 ret = -EINVAL;
   4049                 break;
   4050             }
   4051 
   4052             /*
   4053              * If it's the last part of a host page then we place the host
   4054              * page
   4055              */
   4056             if (tmp_page->target_pages ==
   4057                 (block->page_size / TARGET_PAGE_SIZE)) {
   4058                 place_needed = true;
   4059             }
   4060             place_source = tmp_page->tmp_huge_page;
   4061         }
   4062 
   4063         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
   4064         case RAM_SAVE_FLAG_ZERO:
   4065             ch = qemu_get_byte(f);
   4066             /*
    4067              * We can skip setting page_buffer when this is a zero page
    4068              * and (block->page_size == TARGET_PAGE_SIZE).
   4069              */
   4070             if (ch || !matches_target_page_size) {
   4071                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
   4072             }
   4073             if (ch) {
   4074                 tmp_page->all_zero = false;
   4075             }
   4076             break;
   4077 
   4078         case RAM_SAVE_FLAG_PAGE:
   4079             tmp_page->all_zero = false;
   4080             if (!matches_target_page_size) {
   4081                 /* For huge pages, we always use temporary buffer */
   4082                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
   4083             } else {
   4084                 /*
    4085                  * For small pages that match the target page size, we
   4086                  * avoid the qemu_file copy.  Instead we directly use
   4087                  * the buffer of QEMUFile to place the page.  Note: we
   4088                  * cannot do any QEMUFile operation before using that
   4089                  * buffer to make sure the buffer is valid when
   4090                  * placing the page.
   4091                  */
   4092                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
   4093                                          TARGET_PAGE_SIZE);
   4094             }
   4095             break;
   4096         case RAM_SAVE_FLAG_COMPRESS_PAGE:
   4097             tmp_page->all_zero = false;
   4098             len = qemu_get_be32(f);
   4099             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
   4100                 error_report("Invalid compressed data length: %d", len);
   4101                 ret = -EINVAL;
   4102                 break;
   4103             }
   4104             decompress_data_with_multi_threads(f, page_buffer, len);
   4105             break;
   4106 
   4107         case RAM_SAVE_FLAG_EOS:
   4108             /* normal exit */
   4109             multifd_recv_sync_main();
   4110             break;
   4111         default:
   4112             error_report("Unknown combination of migration flags: 0x%x"
   4113                          " (postcopy mode)", flags);
   4114             ret = -EINVAL;
   4115             break;
   4116         }
   4117 
   4118         /* Got the whole host page, wait for decompress before placing. */
   4119         if (place_needed) {
   4120             ret |= wait_for_decompress_done();
   4121         }
   4122 
   4123         /* Detect for any possible file errors */
   4124         if (!ret && qemu_file_get_error(f)) {
   4125             ret = qemu_file_get_error(f);
   4126         }
   4127 
   4128         if (!ret && place_needed) {
   4129             if (tmp_page->all_zero) {
   4130                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
   4131             } else {
   4132                 ret = postcopy_place_page(mis, tmp_page->host_addr,
   4133                                           place_source, block);
   4134             }
   4135             place_needed = false;
   4136             postcopy_temp_page_reset(tmp_page);
   4137         }
   4138     }
   4139 
   4140     return ret;
   4141 }
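
/*
 * Worked example of the page header decoded at the top of the loop above
 * (the concrete numbers assume a 4 KiB TARGET_PAGE_SIZE): the low
 * TARGET_PAGE_BITS of the be64 header carry the flags, the rest is the
 * page-aligned offset.  A header of 0x12c028 therefore decodes to
 *
 *     flags = 0x12c028 & ~TARGET_PAGE_MASK = 0x28
 *           = RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE
 *     addr  = 0x12c028 &  TARGET_PAGE_MASK = 0x12c000
 *
 * ram_load_precopy() uses exactly the same encoding.
 */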
   4142 
   4143 static bool postcopy_is_advised(void)
   4144 {
   4145     PostcopyState ps = postcopy_state_get();
   4146     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
   4147 }
   4148 
   4149 static bool postcopy_is_running(void)
   4150 {
   4151     PostcopyState ps = postcopy_state_get();
   4152     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
   4153 }
   4154 
   4155 /*
    4156  * Flush the content of the RAM cache into the SVM's memory.
    4157  * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
   4158  */
   4159 void colo_flush_ram_cache(void)
   4160 {
   4161     RAMBlock *block = NULL;
   4162     void *dst_host;
   4163     void *src_host;
   4164     unsigned long offset = 0;
   4165 
   4166     memory_global_dirty_log_sync();
   4167     WITH_RCU_READ_LOCK_GUARD() {
   4168         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   4169             ramblock_sync_dirty_bitmap(ram_state, block);
   4170         }
   4171     }
   4172 
   4173     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
   4174     WITH_RCU_READ_LOCK_GUARD() {
   4175         block = QLIST_FIRST_RCU(&ram_list.blocks);
   4176 
   4177         while (block) {
   4178             unsigned long num = 0;
   4179 
   4180             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
   4181             if (!offset_in_ramblock(block,
   4182                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
   4183                 offset = 0;
   4184                 num = 0;
   4185                 block = QLIST_NEXT_RCU(block, next);
   4186             } else {
   4187                 unsigned long i = 0;
   4188 
   4189                 for (i = 0; i < num; i++) {
   4190                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
   4191                 }
   4192                 dst_host = block->host
   4193                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
   4194                 src_host = block->colo_cache
   4195                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
   4196                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
   4197                 offset += num;
   4198             }
   4199         }
   4200     }
   4201     trace_colo_flush_ram_cache_end();
   4202 }
   4203 
   4204 /**
   4205  * ram_load_precopy: load pages in precopy case
   4206  *
   4207  * Returns 0 for success or -errno in case of error
   4208  *
   4209  * Called in precopy mode by ram_load().
   4210  * rcu_read_lock is taken prior to this being called.
   4211  *
   4212  * @f: QEMUFile where to send the data
   4213  */
   4214 static int ram_load_precopy(QEMUFile *f)
   4215 {
   4216     MigrationIncomingState *mis = migration_incoming_get_current();
   4217     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
   4218     /* ADVISE is earlier, it shows the source has the postcopy capability on */
   4219     bool postcopy_advised = postcopy_is_advised();
   4220     if (!migrate_use_compression()) {
   4221         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
   4222     }
   4223 
   4224     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
   4225         ram_addr_t addr, total_ram_bytes;
   4226         void *host = NULL, *host_bak = NULL;
   4227         uint8_t ch;
   4228 
   4229         /*
    4230          * Yield periodically to let the main loop run, but an iteration of
    4231          * the main loop is expensive, so only do it every so many iterations.
   4232          */
   4233         if ((i & 32767) == 0 && qemu_in_coroutine()) {
   4234             aio_co_schedule(qemu_get_current_aio_context(),
   4235                             qemu_coroutine_self());
   4236             qemu_coroutine_yield();
   4237         }
   4238         i++;
   4239 
   4240         addr = qemu_get_be64(f);
   4241         flags = addr & ~TARGET_PAGE_MASK;
   4242         addr &= TARGET_PAGE_MASK;
   4243 
   4244         if (flags & invalid_flags) {
   4245             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
   4246                 error_report("Received an unexpected compressed page");
   4247             }
   4248 
   4249             ret = -EINVAL;
   4250             break;
   4251         }
   4252 
   4253         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
   4254                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
   4255             RAMBlock *block = ram_block_from_stream(mis, f, flags,
   4256                                                     RAM_CHANNEL_PRECOPY);
   4257 
   4258             host = host_from_ram_block_offset(block, addr);
   4259             /*
    4260              * After entering the COLO stage, we should not load pages
    4261              * into the SVM's memory directly; we put them into colo_cache first.
    4262              * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
    4263              * Previously, we copied all this memory in the COLO preparation
    4264              * stage while the VM had to be stopped, which is time-consuming.
    4265              * Here we optimize it with a trick: back up every page during the
    4266              * migration process while COLO is enabled. Although this affects
    4267              * migration speed, it clearly reduces the downtime of backing up
    4268              * all of the SVM's memory in the COLO preparation stage.
   4269              */
   4270             if (migration_incoming_colo_enabled()) {
   4271                 if (migration_incoming_in_colo_state()) {
   4272                     /* In COLO stage, put all pages into cache temporarily */
   4273                     host = colo_cache_from_block_offset(block, addr, true);
   4274                 } else {
    4275                     /*
    4276                      * In the migration stage but before the COLO stage,
    4277                      * put all pages into both the cache and the SVM's memory.
    4278                      */
   4279                     host_bak = colo_cache_from_block_offset(block, addr, false);
   4280                 }
   4281             }
   4282             if (!host) {
   4283                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
   4284                 ret = -EINVAL;
   4285                 break;
   4286             }
   4287             if (!migration_incoming_in_colo_state()) {
   4288                 ramblock_recv_bitmap_set(block, host);
   4289             }
   4290 
   4291             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
   4292         }
   4293 
   4294         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
   4295         case RAM_SAVE_FLAG_MEM_SIZE:
   4296             /* Synchronize RAM block list */
   4297             total_ram_bytes = addr;
   4298             while (!ret && total_ram_bytes) {
   4299                 RAMBlock *block;
   4300                 char id[256];
   4301                 ram_addr_t length;
   4302 
   4303                 len = qemu_get_byte(f);
   4304                 qemu_get_buffer(f, (uint8_t *)id, len);
   4305                 id[len] = 0;
   4306                 length = qemu_get_be64(f);
   4307 
   4308                 block = qemu_ram_block_by_name(id);
   4309                 if (block && !qemu_ram_is_migratable(block)) {
   4310                     error_report("block %s should not be migrated !", id);
   4311                     ret = -EINVAL;
   4312                 } else if (block) {
   4313                     if (length != block->used_length) {
   4314                         Error *local_err = NULL;
   4315 
   4316                         ret = qemu_ram_resize(block, length,
   4317                                               &local_err);
   4318                         if (local_err) {
   4319                             error_report_err(local_err);
   4320                         }
   4321                     }
   4322                     /* For postcopy we need to check hugepage sizes match */
   4323                     if (postcopy_advised && migrate_postcopy_ram() &&
   4324                         block->page_size != qemu_host_page_size) {
   4325                         uint64_t remote_page_size = qemu_get_be64(f);
   4326                         if (remote_page_size != block->page_size) {
   4327                             error_report("Mismatched RAM page size %s "
   4328                                          "(local) %zd != %" PRId64,
   4329                                          id, block->page_size,
   4330                                          remote_page_size);
   4331                             ret = -EINVAL;
   4332                         }
   4333                     }
   4334                     if (migrate_ignore_shared()) {
   4335                         hwaddr addr = qemu_get_be64(f);
   4336                         if (ramblock_is_ignored(block) &&
   4337                             block->mr->addr != addr) {
   4338                             error_report("Mismatched GPAs for block %s "
   4339                                          "%" PRId64 "!= %" PRId64,
   4340                                          id, (uint64_t)addr,
   4341                                          (uint64_t)block->mr->addr);
   4342                             ret = -EINVAL;
   4343                         }
   4344                     }
   4345                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
   4346                                           block->idstr);
   4347                 } else {
   4348                     error_report("Unknown ramblock \"%s\", cannot "
   4349                                  "accept migration", id);
   4350                     ret = -EINVAL;
   4351                 }
   4352 
   4353                 total_ram_bytes -= length;
   4354             }
   4355             break;
   4356 
   4357         case RAM_SAVE_FLAG_ZERO:
   4358             ch = qemu_get_byte(f);
   4359             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
   4360             break;
   4361 
   4362         case RAM_SAVE_FLAG_PAGE:
   4363             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
   4364             break;
   4365 
   4366         case RAM_SAVE_FLAG_COMPRESS_PAGE:
   4367             len = qemu_get_be32(f);
   4368             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
   4369                 error_report("Invalid compressed data length: %d", len);
   4370                 ret = -EINVAL;
   4371                 break;
   4372             }
   4373             decompress_data_with_multi_threads(f, host, len);
   4374             break;
   4375 
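                 /*
                  * An XBZRLE-encoded page: the delta is applied on top of
                  * the page's current contents, so a decode failure aborts
                  * the load.
                  */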
   4376         case RAM_SAVE_FLAG_XBZRLE:
   4377             if (load_xbzrle(f, addr, host) < 0) {
   4378                 error_report("Failed to decompress XBZRLE page at "
   4379                              RAM_ADDR_FMT, addr);
   4380                 ret = -EINVAL;
   4381                 break;
   4382             }
   4383             break;
   4384         case RAM_SAVE_FLAG_EOS:
   4385             /* normal exit */
   4386             multifd_recv_sync_main();
   4387             break;
   4388         default:
   4389             if (flags & RAM_SAVE_FLAG_HOOK) {
   4390                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
   4391             } else {
   4392                 error_report("Unknown combination of migration flags: 0x%x",
   4393                              flags);
   4394                 ret = -EINVAL;
   4395             }
   4396         }
   4397         if (!ret) {
   4398             ret = qemu_file_get_error(f);
   4399         }
   4400         if (!ret && host_bak) {
   4401             memcpy(host_bak, host, TARGET_PAGE_SIZE);
   4402         }
   4403     }
   4404 
   4405     ret |= wait_for_decompress_done();
   4406     return ret;
   4407 }
   4408 
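        /*
         * Incoming side entry point for the "ram" section: dispatch to the
         * postcopy or precopy load path under the RCU read lock.  Only
         * stream version 4 is accepted.
         */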
   4409 static int ram_load(QEMUFile *f, void *opaque, int version_id)
   4410 {
   4411     int ret = 0;
   4412     static uint64_t seq_iter;
   4413     /*
   4414      * If the system is running in postcopy mode, page inserts into host
   4415      * memory must be atomic.
   4416      */
   4417     bool postcopy_running = postcopy_is_running();
   4418 
   4419     seq_iter++;
   4420 
   4421     if (version_id != 4) {
   4422         return -EINVAL;
   4423     }
   4424 
   4425     /*
   4426      * This RCU critical section can be very long running.
   4427      * If RCU reclaims in this code path start to become numerous,
   4428      * it will be necessary to reduce the granularity of this
   4429      * critical section.
   4430      */
   4431     WITH_RCU_READ_LOCK_GUARD() {
   4432         if (postcopy_running) {
   4433             /*
   4434              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
   4435              * postcopy migration; there is a separate RAM_CHANNEL_POSTCOPY
   4436              * to service fast page faults.
   4437              */
   4438             ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
   4439         } else {
   4440             ret = ram_load_precopy(f);
   4441         }
   4442     }
   4443     trace_ram_load_complete(ret, seq_iter);
   4444 
   4445     return ret;
   4446 }
   4447 
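        /*
         * Whether RAM can be migrated with postcopy: any non-ignored block
         * backed by persistent memory (pmem) makes us refuse; otherwise
         * defer to the migrate_postcopy_ram() capability.
         */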
   4448 static bool ram_has_postcopy(void *opaque)
   4449 {
   4450     RAMBlock *rb;
   4451     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
   4452         if (ramblock_is_pmem(rb)) {
   4453             info_report("Block: %s, host: %p is NVDIMM memory, postcopy "
   4454                         "is not supported yet!", rb->idstr, rb->host);
   4455             return false;
   4456         }
   4457     }
   4458 
   4459     return migrate_postcopy_ram();
   4460 }
   4461 
   4462 /* Sync all the dirty bitmaps with the destination VM.  */
   4463 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
   4464 {
   4465     RAMBlock *block;
   4466     QEMUFile *file = s->to_dst_file;
   4467     int ramblock_count = 0;
   4468 
   4469     trace_ram_dirty_bitmap_sync_start();
   4470 
   4471     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
   4472         qemu_savevm_send_recv_bitmap(file, block->idstr);
   4473         trace_ram_dirty_bitmap_request(block->idstr);
   4474         ramblock_count++;
   4475     }
   4476 
   4477     trace_ram_dirty_bitmap_sync_wait();
   4478 
   4479     /* Wait until all the ramblocks' dirty bitmaps have been synced */
   4480     while (ramblock_count--) {
   4481         qemu_sem_wait(&s->rp_state.rp_sem);
   4482     }
   4483 
   4484     trace_ram_dirty_bitmap_sync_complete();
   4485 
   4486     return 0;
   4487 }
   4488 
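        /*
         * Wake up ram_dirty_bitmap_sync_all(): one more ramblock's received
         * bitmap has been reloaded on the source side.
         */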
   4489 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
   4490 {
   4491     qemu_sem_post(&s->rp_state.rp_sem);
   4492 }
   4493 
   4494 /*
   4495  * Read the received bitmap and invert it to use as the initial dirty
   4496  * bitmap.  This is only used when a paused postcopy migration wants to
   4497  * resume from a middle point.
   4498  */
   4499 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
   4500 {
   4501     int ret = -EINVAL;
   4502     /* from_dst_file is always valid because we're within rp_thread */
   4503     QEMUFile *file = s->rp_state.from_dst_file;
   4504     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
   4505     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
   4506     uint64_t size, end_mark;
   4507 
   4508     trace_ram_dirty_bitmap_reload_begin(block->idstr);
   4509 
   4510     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
   4511         error_report("%s: incorrect state %s", __func__,
   4512                      MigrationStatus_str(s->state));
   4513         return -EINVAL;
   4514     }
   4515 
   4516     /*
   4517      * Note: see comments in ramblock_recv_bitmap_send() on why we
   4518      * need the endianness conversion and the padding.
   4519      */
   4520     local_size = ROUND_UP(local_size, 8);
   4521 
   4522     /* Add padding */
   4523     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
   4524 
   4525     size = qemu_get_be64(file);
   4526 
   4527     /* The size of the bitmap should match our ramblock's */
   4528     if (size != local_size) {
   4529         error_report("%s: ramblock '%s' bitmap size mismatch "
   4530                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
   4531                      block->idstr, size, local_size);
   4532         ret = -EINVAL;
   4533         goto out;
   4534     }
   4535 
   4536     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
   4537     end_mark = qemu_get_be64(file);
   4538 
   4539     ret = qemu_file_get_error(file);
   4540     if (ret || size != local_size) {
   4541         error_report("%s: read bitmap failed for ramblock '%s': %d"
   4542                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
   4543                      __func__, block->idstr, ret, local_size, size);
   4544         ret = -EIO;
   4545         goto out;
   4546     }
   4547 
   4548     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
   4549         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
   4550                      __func__, block->idstr, end_mark);
   4551         ret = -EINVAL;
   4552         goto out;
   4553     }
   4554 
   4555     /*
   4556      * Endianness conversion.  We are in postcopy (though paused), so the
   4557      * dirty bitmap won't change and we can modify it directly.
   4558      */
   4559     bitmap_from_le(block->bmap, le_bitmap, nbits);
   4560 
   4561     /*
   4562      * What we received is the "received bitmap".  Invert it to get the
   4563      * initial dirty bitmap for this ramblock.
   4564      */
   4565     bitmap_complement(block->bmap, block->bmap, nbits);
   4566 
   4567     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
   4568     ramblock_dirty_bitmap_clear_discarded_pages(block);
   4569 
   4570     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
   4571     trace_ram_dirty_bitmap_reload_complete(block->idstr);
   4572 
   4573     /*
   4574      * We succeeded in syncing the bitmap for the current ramblock.  If
   4575      * this is the last one to sync, we need to notify the main send thread.
   4576      */
   4577     ram_dirty_bitmap_reload_notify(s);
   4578 
   4579     ret = 0;
   4580 out:
   4581     g_free(le_bitmap);
   4582     return ret;
   4583 }
   4584 
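        /*
         * Postcopy recovery: re-fetch the received bitmaps from the
         * destination for every ramblock, then rebuild the RAMState
         * bookkeeping before the migration resumes.
         */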
   4585 static int ram_resume_prepare(MigrationState *s, void *opaque)
   4586 {
   4587     RAMState *rs = *(RAMState **)opaque;
   4588     int ret;
   4589 
   4590     ret = ram_dirty_bitmap_sync_all(s, rs);
   4591     if (ret) {
   4592         return ret;
   4593     }
   4594 
   4595     ram_state_resume_prepare(rs, s->to_dst_file);
   4596 
   4597     return 0;
   4598 }
   4599 
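        /*
         * Put an EOS marker on the postcopy preempt channel and flush it so
         * the receiving side of that channel can terminate cleanly.
         */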
   4600 void postcopy_preempt_shutdown_file(MigrationState *s)
   4601 {
   4602     qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
   4603     qemu_fflush(s->postcopy_qemufile_src);
   4604 }
   4605 
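        /*
         * SaveVMHandlers for the "ram" live migration section; registered
         * by ram_mig_init() below.
         */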
   4606 static SaveVMHandlers savevm_ram_handlers = {
   4607     .save_setup = ram_save_setup,
   4608     .save_live_iterate = ram_save_iterate,
   4609     .save_live_complete_postcopy = ram_save_complete,
   4610     .save_live_complete_precopy = ram_save_complete,
   4611     .has_postcopy = ram_has_postcopy,
   4612     .save_live_pending = ram_save_pending,
   4613     .load_state = ram_load,
   4614     .save_cleanup = ram_save_cleanup,
   4615     .load_setup = ram_load_setup,
   4616     .load_cleanup = ram_load_cleanup,
   4617     .resume_prepare = ram_resume_prepare,
   4618 };
   4619 
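        /*
         * RAMBlockNotifier callback for a resizable RAM block changing size:
         * a resize during an active precopy cancels the migration, while an
         * incoming postcopy that has only been advised records the new size
         * (discarding any grown range) so it stays consistent with the source.
         */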
   4620 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
   4621                                       size_t old_size, size_t new_size)
   4622 {
   4623     PostcopyState ps = postcopy_state_get();
   4624     ram_addr_t offset;
   4625     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
   4626     Error *err = NULL;
   4627 
   4628     if (ramblock_is_ignored(rb)) {
   4629         return;
   4630     }
   4631 
   4632     if (!migration_is_idle()) {
   4633         /*
   4634          * Precopy code on the source cannot deal with the size of RAM blocks
   4635          * changing at random points in time - especially after sending the
   4636          * RAM block sizes in the migration stream, they must no longer change.
   4637          * Abort and indicate a proper reason.
   4638          */
   4639         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
   4640         migration_cancel(err);
   4641         error_free(err);
   4642     }
   4643 
   4644     switch (ps) {
   4645     case POSTCOPY_INCOMING_ADVISE:
   4646         /*
   4647          * Update what ram_postcopy_incoming_init()->init_range() does at the
   4648          * time postcopy was advised. Syncing RAM blocks with the source will
   4649          * result in RAM resizes.
   4650          */
   4651         if (old_size < new_size) {
   4652             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
   4653                 error_report("RAM block '%s' discard of resized RAM failed",
   4654                              rb->idstr);
   4655             }
   4656         }
   4657         rb->postcopy_length = new_size;
   4658         break;
   4659     case POSTCOPY_INCOMING_NONE:
   4660     case POSTCOPY_INCOMING_RUNNING:
   4661     case POSTCOPY_INCOMING_END:
   4662         /*
   4663          * Once our guest is running, postcopy no longer cares about
   4664          * resizes.  When growing, the new memory was not available on the
   4665          * source, so no handling is needed.
   4666          */
   4667         break;
   4668     default:
   4669         error_report("RAM block '%s' resized during postcopy state: %d",
   4670                      rb->idstr, ps);
   4671         exit(-1);
   4672     }
   4673 }
   4674 
   4675 static RAMBlockNotifier ram_mig_ram_notifier = {
   4676     .ram_block_resized = ram_mig_ram_block_resized,
   4677 };
   4678 
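        /*
         * Called once at startup: initialize the XBZRLE lock, register the
         * "ram" section handlers above (savevm version 4) and the RAM block
         * resize notifier.
         */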
   4679 void ram_mig_init(void)
   4680 {
   4681     qemu_mutex_init(&XBZRLE.lock);
   4682     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
   4683     ram_block_notifier_add(&ram_mig_ram_notifier);
   4684 }