qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git

block.c (29665B)


      1 /*
      2  * QEMU live block migration
      3  *
      4  * Copyright IBM, Corp. 2009
      5  *
      6  * Authors:
      7  *  Liran Schour   <lirans@il.ibm.com>
      8  *
      9  * This work is licensed under the terms of the GNU GPL, version 2.  See
     10  * the COPYING file in the top-level directory.
     11  *
     12  * Contributions after 2012-01-13 are licensed under the terms of the
     13  * GNU GPL, version 2 or (at your option) any later version.
     14  */
     15 
     16 #include "qemu/osdep.h"
     17 #include "qapi/error.h"
     18 #include "qemu/error-report.h"
     19 #include "qemu/main-loop.h"
     20 #include "qemu/cutils.h"
     21 #include "qemu/queue.h"
     22 #include "block.h"
     23 #include "migration/misc.h"
     24 #include "migration.h"
     25 #include "migration/register.h"
     26 #include "qemu-file.h"
     27 #include "migration/vmstate.h"
     28 #include "sysemu/block-backend.h"
     29 #include "trace.h"
     30 
     31 #define BLK_MIG_BLOCK_SIZE           (1ULL << 20)
     32 #define BDRV_SECTORS_PER_DIRTY_CHUNK (BLK_MIG_BLOCK_SIZE >> BDRV_SECTOR_BITS)
     33 
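
Aside (not part of block.c): a minimal standalone check of the chunk geometry defined above, assuming QEMU's 512-byte sectors (BDRV_SECTOR_BITS == 9), so one 1 MiB migration block covers 2048 sectors.

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        const uint64_t blk_mig_block_size = 1ULL << 20;  /* BLK_MIG_BLOCK_SIZE */
        const unsigned bdrv_sector_bits = 9;             /* BDRV_SECTOR_BITS, 512-byte sectors */

        /* BDRV_SECTORS_PER_DIRTY_CHUNK = BLK_MIG_BLOCK_SIZE >> BDRV_SECTOR_BITS */
        assert((blk_mig_block_size >> bdrv_sector_bits) == 2048);
        return 0;
    }
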
     34 #define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
     35 #define BLK_MIG_FLAG_EOS                0x02
     36 #define BLK_MIG_FLAG_PROGRESS           0x04
     37 #define BLK_MIG_FLAG_ZERO_BLOCK         0x08
     38 
     39 #define MAX_IS_ALLOCATED_SEARCH (65536 * BDRV_SECTOR_SIZE)
     40 
     41 #define MAX_IO_BUFFERS 512
     42 #define MAX_PARALLEL_IO 16
     43 
     44 /* #define DEBUG_BLK_MIGRATION */
     45 
     46 #ifdef DEBUG_BLK_MIGRATION
     47 #define DPRINTF(fmt, ...) \
     48     do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
     49 #else
     50 #define DPRINTF(fmt, ...) \
     51     do { } while (0)
     52 #endif
     53 
     54 typedef struct BlkMigDevState {
     55     /* Written during setup phase.  Can be read without a lock.  */
     56     BlockBackend *blk;
     57     char *blk_name;
     58     int shared_base;
     59     int64_t total_sectors;
     60     QSIMPLEQ_ENTRY(BlkMigDevState) entry;
     61     Error *blocker;
     62 
     63     /* Only used by migration thread.  Does not need a lock.  */
     64     int bulk_completed;
     65     int64_t cur_sector;
     66     int64_t cur_dirty;
     67 
     68     /* Data in the aio_bitmap is protected by block migration lock.
     69      * Allocation and free happen during setup and cleanup respectively.
     70      */
     71     unsigned long *aio_bitmap;
     72 
     73     /* Protected by block migration lock.  */
     74     int64_t completed_sectors;
     75 
     76     /* During migration this is protected by iothread lock / AioContext.
     77      * Allocation and free happen during setup and cleanup respectively.
     78      */
     79     BdrvDirtyBitmap *dirty_bitmap;
     80 } BlkMigDevState;
     81 
     82 typedef struct BlkMigBlock {
     83     /* Only used by migration thread.  */
     84     uint8_t *buf;
     85     BlkMigDevState *bmds;
     86     int64_t sector;
     87     int nr_sectors;
     88     QEMUIOVector qiov;
     89     BlockAIOCB *aiocb;
     90 
     91     /* Protected by block migration lock.  */
     92     int ret;
     93     QSIMPLEQ_ENTRY(BlkMigBlock) entry;
     94 } BlkMigBlock;
     95 
     96 typedef struct BlkMigState {
     97     QSIMPLEQ_HEAD(, BlkMigDevState) bmds_list;
     98     int64_t total_sector_sum;
     99     bool zero_blocks;
    100 
    101     /* Protected by lock.  */
    102     QSIMPLEQ_HEAD(, BlkMigBlock) blk_list;
    103     int submitted;
    104     int read_done;
    105 
    106     /* Only used by migration thread.  Does not need a lock.  */
    107     int transferred;
    108     int prev_progress;
    109     int bulk_completed;
    110 
    111     /* Lock must be taken _inside_ the iothread lock and any AioContexts.  */
    112     QemuMutex lock;
    113 } BlkMigState;
    114 
    115 static BlkMigState block_mig_state;
    116 
    117 static void blk_mig_lock(void)
    118 {
    119     qemu_mutex_lock(&block_mig_state.lock);
    120 }
    121 
    122 static void blk_mig_unlock(void)
    123 {
    124     qemu_mutex_unlock(&block_mig_state.lock);
    125 }
    126 
    127 /* Must run outside of the iothread lock during the bulk phase,
    128  * or the VM will stall.
    129  */
    130 
    131 static void blk_send(QEMUFile *f, BlkMigBlock * blk)
    132 {
    133     int len;
    134     uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;
    135 
    136     if (block_mig_state.zero_blocks &&
    137         buffer_is_zero(blk->buf, BLK_MIG_BLOCK_SIZE)) {
    138         flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    139     }
    140 
    141     /* sector number and flags */
    142     qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
    143                      | flags);
    144 
    145     /* device name */
    146     len = strlen(blk->bmds->blk_name);
    147     qemu_put_byte(f, len);
    148     qemu_put_buffer(f, (uint8_t *) blk->bmds->blk_name, len);
    149 
    150     /* If a block is zero we need to flush here, since the network
    151      * bandwidth is now a lot higher than the storage device bandwidth;
    152      * thus, queueing zero blocks would slow down the migration. */
    153     if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
    154         qemu_fflush(f);
    155         return;
    156     }
    157 
    158     qemu_put_buffer(f, blk->buf, BLK_MIG_BLOCK_SIZE);
    159 }
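
Aside (not part of block.c): blk_send() packs the byte offset (sector << BDRV_SECTOR_BITS) and the flag bits into one big-endian 64-bit header, then sends a length-prefixed device name and, unless BLK_MIG_FLAG_ZERO_BLOCK is set, a BLK_MIG_BLOCK_SIZE payload. The standalone sketch below mirrors how block_load() further down splits that header apart; the helper name is invented for illustration.

    #include <assert.h>
    #include <stdint.h>

    /* Split the combined header written by blk_send(): the flag bits live in
     * the low 9 bits (below BDRV_SECTOR_SIZE), the sector number in the rest. */
    static void decode_blk_header(uint64_t hdr, uint64_t *sector, unsigned *flags)
    {
        *flags  = hdr & (512 - 1);  /* addr & (BDRV_SECTOR_SIZE - 1) in block_load() */
        *sector = hdr >> 9;         /* addr >>= BDRV_SECTOR_BITS */
    }

    int main(void)
    {
        uint64_t sector;
        unsigned flags;

        decode_blk_header((4096ULL << 9) | 0x01 /* BLK_MIG_FLAG_DEVICE_BLOCK */,
                          &sector, &flags);
        assert(sector == 4096 && flags == 0x01);
        return 0;
    }
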
    160 
    161 int blk_mig_active(void)
    162 {
    163     return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
    164 }
    165 
    166 int blk_mig_bulk_active(void)
    167 {
    168     return blk_mig_active() && !block_mig_state.bulk_completed;
    169 }
    170 
    171 uint64_t blk_mig_bytes_transferred(void)
    172 {
    173     BlkMigDevState *bmds;
    174     uint64_t sum = 0;
    175 
    176     blk_mig_lock();
    177     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
    178         sum += bmds->completed_sectors;
    179     }
    180     blk_mig_unlock();
    181     return sum << BDRV_SECTOR_BITS;
    182 }
    183 
    184 uint64_t blk_mig_bytes_remaining(void)
    185 {
    186     return blk_mig_bytes_total() - blk_mig_bytes_transferred();
    187 }
    188 
    189 uint64_t blk_mig_bytes_total(void)
    190 {
    191     BlkMigDevState *bmds;
    192     uint64_t sum = 0;
    193 
    194     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
    195         sum += bmds->total_sectors;
    196     }
    197     return sum << BDRV_SECTOR_BITS;
    198 }
    199 
    200 
    201 /* Called with migration lock held.  */
    202 
    203 static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
    204 {
    205     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
    206 
    207     if (sector < blk_nb_sectors(bmds->blk)) {
    208         return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
    209             (1UL << (chunk % (sizeof(unsigned long) * 8))));
    210     } else {
    211         return 0;
    212     }
    213 }
    214 
    215 /* Called with migration lock held.  */
    216 
    217 static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
    218                              int nb_sectors, int set)
    219 {
    220     int64_t start, end;
    221     unsigned long val, idx, bit;
    222 
    223     start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    224     end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
    225 
    226     for (; start <= end; start++) {
    227         idx = start / (sizeof(unsigned long) * 8);
    228         bit = start % (sizeof(unsigned long) * 8);
    229         val = bmds->aio_bitmap[idx];
    230         if (set) {
    231             val |= 1UL << bit;
    232         } else {
    233             val &= ~(1UL << bit);
    234         }
    235         bmds->aio_bitmap[idx] = val;
    236     }
    237 }
    238 
    239 static void alloc_aio_bitmap(BlkMigDevState *bmds)
    240 {
    241     BlockBackend *bb = bmds->blk;
    242     int64_t bitmap_size;
    243 
    244     bitmap_size = blk_nb_sectors(bb) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    245     bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
    246 
    247     bmds->aio_bitmap = g_malloc0(bitmap_size);
    248 }
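
Aside (not part of block.c): the aio_bitmap helpers above keep one bit per dirty chunk, addressed as a word index plus a bit index into an array of unsigned long. A standalone illustration of that mapping for an arbitrary chunk number:

    #include <assert.h>

    int main(void)
    {
        const unsigned long bits_per_long = sizeof(unsigned long) * 8;
        unsigned long chunk = 130;                  /* arbitrary example chunk number */
        unsigned long idx = chunk / bits_per_long;  /* which word of aio_bitmap */
        unsigned long bit = chunk % bits_per_long;  /* which bit inside that word */

        if (bits_per_long == 64) {                  /* typical 64-bit host */
            assert(idx == 2 && bit == 2);
        }
        return 0;
    }
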
    249 
    250 /* Never hold migration lock when yielding to the main loop!  */
    251 
    252 static void blk_mig_read_cb(void *opaque, int ret)
    253 {
    254     BlkMigBlock *blk = opaque;
    255 
    256     blk_mig_lock();
    257     blk->ret = ret;
    258 
    259     QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    260     bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);
    261 
    262     block_mig_state.submitted--;
    263     block_mig_state.read_done++;
    264     assert(block_mig_state.submitted >= 0);
    265     blk_mig_unlock();
    266 }
    267 
    268 /* Called with no lock taken.  */
    269 
    270 static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
    271 {
    272     int64_t total_sectors = bmds->total_sectors;
    273     int64_t cur_sector = bmds->cur_sector;
    274     BlockBackend *bb = bmds->blk;
    275     BlkMigBlock *blk;
    276     int nr_sectors;
    277     int64_t count;
    278 
    279     if (bmds->shared_base) {
    280         qemu_mutex_lock_iothread();
    281         aio_context_acquire(blk_get_aio_context(bb));
    282         /* Skip unallocated sectors; intentionally treats failure or
    283          * partial sector as an allocated sector */
    284         while (cur_sector < total_sectors &&
    285                !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE,
    286                                   MAX_IS_ALLOCATED_SEARCH, &count)) {
    287             if (count < BDRV_SECTOR_SIZE) {
    288                 break;
    289             }
    290             cur_sector += count >> BDRV_SECTOR_BITS;
    291         }
    292         aio_context_release(blk_get_aio_context(bb));
    293         qemu_mutex_unlock_iothread();
    294     }
    295 
    296     if (cur_sector >= total_sectors) {
    297         bmds->cur_sector = bmds->completed_sectors = total_sectors;
    298         return 1;
    299     }
    300 
    301     bmds->completed_sectors = cur_sector;
    302 
    303     cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);
    304 
    305     /* we are going to transfer a full block even if it is not allocated */
    306     nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
    307 
    308     if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
    309         nr_sectors = total_sectors - cur_sector;
    310     }
    311 
    312     blk = g_new(BlkMigBlock, 1);
    313     blk->buf = g_malloc(BLK_MIG_BLOCK_SIZE);
    314     blk->bmds = bmds;
    315     blk->sector = cur_sector;
    316     blk->nr_sectors = nr_sectors;
    317 
    318     qemu_iovec_init_buf(&blk->qiov, blk->buf, nr_sectors * BDRV_SECTOR_SIZE);
    319 
    320     blk_mig_lock();
    321     block_mig_state.submitted++;
    322     blk_mig_unlock();
    323 
    324     /* We do not know if bs is under the main thread (and thus does
    325      * not acquire the AioContext when doing AIO) or rather under
    326      * dataplane.  Thus acquire both the iothread mutex and the
    327      * AioContext.
    328      *
    329      * This is ugly and will disappear when we make bdrv_* thread-safe,
    330      * without the need to acquire the AioContext.
    331      */
    332     qemu_mutex_lock_iothread();
    333     aio_context_acquire(blk_get_aio_context(bmds->blk));
    334     bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector * BDRV_SECTOR_SIZE,
    335                             nr_sectors * BDRV_SECTOR_SIZE);
    336     blk->aiocb = blk_aio_preadv(bb, cur_sector * BDRV_SECTOR_SIZE, &blk->qiov,
    337                                 0, blk_mig_read_cb, blk);
    338     aio_context_release(blk_get_aio_context(bmds->blk));
    339     qemu_mutex_unlock_iothread();
    340 
    341     bmds->cur_sector = cur_sector + nr_sectors;
    342     return (bmds->cur_sector >= total_sectors);
    343 }
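
Aside (not part of block.c): the comment above spells out the lock ordering this file relies on. A sketch of that pattern, assuming the surrounding QEMU tree (the helper name with_device_locks is invented): take the iothread lock (BQL) first, then the backend's AioContext, and release in reverse order; block_mig_state.lock is only ever taken innermost.

    #include "qemu/osdep.h"
    #include "qemu/main-loop.h"
    #include "sysemu/block-backend.h"

    /* Hypothetical helper: BQL first, then the backend's AioContext,
     * released in reverse order, as mig_save_device_bulk() does. */
    static void with_device_locks(BlockBackend *blk, void (*fn)(BlockBackend *))
    {
        qemu_mutex_lock_iothread();
        aio_context_acquire(blk_get_aio_context(blk));
        fn(blk);
        aio_context_release(blk_get_aio_context(blk));
        qemu_mutex_unlock_iothread();
    }
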
    344 
    345 /* Called with iothread lock taken.  */
    346 
    347 static int set_dirty_tracking(void)
    348 {
    349     BlkMigDevState *bmds;
    350     int ret;
    351 
    352     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
    353         bmds->dirty_bitmap = bdrv_create_dirty_bitmap(blk_bs(bmds->blk),
    354                                                       BLK_MIG_BLOCK_SIZE,
    355                                                       NULL, NULL);
    356         if (!bmds->dirty_bitmap) {
    357             ret = -errno;
    358             goto fail;
    359         }
    360     }
    361     return 0;
    362 
    363 fail:
    364     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
    365         if (bmds->dirty_bitmap) {
    366             bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
    367         }
    368     }
    369     return ret;
    370 }
    371 
    372 /* Called with iothread lock taken.  */
    373 
    374 static void unset_dirty_tracking(void)
    375 {
    376     BlkMigDevState *bmds;
    377 
    378     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
    379         bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
    380     }
    381 }
    382 
    383 static int init_blk_migration(QEMUFile *f)
    384 {
    385     BlockDriverState *bs;
    386     BlkMigDevState *bmds;
    387     int64_t sectors;
    388     BdrvNextIterator it;
    389     int i, num_bs = 0;
    390     struct {
    391         BlkMigDevState *bmds;
    392         BlockDriverState *bs;
    393     } *bmds_bs;
    394     Error *local_err = NULL;
    395     int ret;
    396 
    397     block_mig_state.submitted = 0;
    398     block_mig_state.read_done = 0;
    399     block_mig_state.transferred = 0;
    400     block_mig_state.total_sector_sum = 0;
    401     block_mig_state.prev_progress = -1;
    402     block_mig_state.bulk_completed = 0;
    403     block_mig_state.zero_blocks = migrate_zero_blocks();
    404 
    405     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
    406         num_bs++;
    407     }
    408     bmds_bs = g_malloc0(num_bs * sizeof(*bmds_bs));
    409 
    410     for (i = 0, bs = bdrv_first(&it); bs; bs = bdrv_next(&it), i++) {
    411         if (bdrv_is_read_only(bs)) {
    412             continue;
    413         }
    414 
    415         sectors = bdrv_nb_sectors(bs);
    416         if (sectors <= 0) {
    417             ret = sectors;
    418             bdrv_next_cleanup(&it);
    419             goto out;
    420         }
    421 
    422         bmds = g_new0(BlkMigDevState, 1);
    423         bmds->blk = blk_new(qemu_get_aio_context(),
    424                             BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
    425         bmds->blk_name = g_strdup(bdrv_get_device_name(bs));
    426         bmds->bulk_completed = 0;
    427         bmds->total_sectors = sectors;
    428         bmds->completed_sectors = 0;
    429         bmds->shared_base = migrate_use_block_incremental();
    430 
    431         assert(i < num_bs);
    432         bmds_bs[i].bmds = bmds;
    433         bmds_bs[i].bs = bs;
    434 
    435         block_mig_state.total_sector_sum += sectors;
    436 
    437         if (bmds->shared_base) {
    438             trace_migration_block_init_shared(bdrv_get_device_name(bs));
    439         } else {
    440             trace_migration_block_init_full(bdrv_get_device_name(bs));
    441         }
    442 
    443         QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    444     }
    445 
    446     /* Can only insert new BDSes now because doing so while iterating block
    447      * devices may end up in a deadlock (iterating the new BDSes, too). */
    448     for (i = 0; i < num_bs; i++) {
    449         BlkMigDevState *bmds = bmds_bs[i].bmds;
    450         BlockDriverState *bs = bmds_bs[i].bs;
    451 
    452         if (bmds) {
    453             ret = blk_insert_bs(bmds->blk, bs, &local_err);
    454             if (ret < 0) {
    455                 error_report_err(local_err);
    456                 goto out;
    457             }
    458 
    459             alloc_aio_bitmap(bmds);
    460             error_setg(&bmds->blocker, "block device is in use by migration");
    461             bdrv_op_block_all(bs, bmds->blocker);
    462         }
    463     }
    464 
    465     ret = 0;
    466 out:
    467     g_free(bmds_bs);
    468     return ret;
    469 }
    470 
    471 /* Called with no lock taken.  */
    472 
    473 static int blk_mig_save_bulked_block(QEMUFile *f)
    474 {
    475     int64_t completed_sector_sum = 0;
    476     BlkMigDevState *bmds;
    477     int progress;
    478     int ret = 0;
    479 
    480     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
    481         if (bmds->bulk_completed == 0) {
    482             if (mig_save_device_bulk(f, bmds) == 1) {
    483                 /* completed bulk section for this device */
    484                 bmds->bulk_completed = 1;
    485             }
    486             completed_sector_sum += bmds->completed_sectors;
    487             ret = 1;
    488             break;
    489         } else {
    490             completed_sector_sum += bmds->completed_sectors;
    491         }
    492     }
    493 
    494     if (block_mig_state.total_sector_sum != 0) {
    495         progress = completed_sector_sum * 100 /
    496                    block_mig_state.total_sector_sum;
    497     } else {
    498         progress = 100;
    499     }
    500     if (progress != block_mig_state.prev_progress) {
    501         block_mig_state.prev_progress = progress;
    502         qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
    503                          | BLK_MIG_FLAG_PROGRESS);
    504         DPRINTF("Completed %d %%\r", progress);
    505     }
    506 
    507     return ret;
    508 }
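
Aside (not part of block.c): progress records reuse the same 64-bit header slot as device blocks, with the percentage shifted into the position a sector number would occupy and BLK_MIG_FLAG_PROGRESS in the low bits; block_load() simply shifts it back and prints it. A standalone check of that encoding:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t hdr = ((uint64_t)42 << 9) | 0x04;  /* 42 %, BLK_MIG_FLAG_PROGRESS */

        assert((hdr & (512 - 1)) == 0x04);  /* flag bits survive below BDRV_SECTOR_SIZE */
        assert((hdr >> 9) == 42);           /* receiver reports "Completed 42 %" */
        return 0;
    }
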
    509 
    510 static void blk_mig_reset_dirty_cursor(void)
    511 {
    512     BlkMigDevState *bmds;
    513 
    514     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
    515         bmds->cur_dirty = 0;
    516     }
    517 }
    518 
    519 /* Called with iothread lock and AioContext taken.  */
    520 
    521 static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
    522                                  int is_async)
    523 {
    524     BlkMigBlock *blk;
    525     int64_t total_sectors = bmds->total_sectors;
    526     int64_t sector;
    527     int nr_sectors;
    528     int ret = -EIO;
    529 
    530     for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
    531         blk_mig_lock();
    532         if (bmds_aio_inflight(bmds, sector)) {
    533             blk_mig_unlock();
    534             blk_drain(bmds->blk);
    535         } else {
    536             blk_mig_unlock();
    537         }
    538         bdrv_dirty_bitmap_lock(bmds->dirty_bitmap);
    539         if (bdrv_dirty_bitmap_get_locked(bmds->dirty_bitmap,
    540                                          sector * BDRV_SECTOR_SIZE)) {
    541             if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
    542                 nr_sectors = total_sectors - sector;
    543             } else {
    544                 nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
    545             }
    546             bdrv_reset_dirty_bitmap_locked(bmds->dirty_bitmap,
    547                                            sector * BDRV_SECTOR_SIZE,
    548                                            nr_sectors * BDRV_SECTOR_SIZE);
    549             bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);
    550 
    551             blk = g_new(BlkMigBlock, 1);
    552             blk->buf = g_malloc(BLK_MIG_BLOCK_SIZE);
    553             blk->bmds = bmds;
    554             blk->sector = sector;
    555             blk->nr_sectors = nr_sectors;
    556 
    557             if (is_async) {
    558                 qemu_iovec_init_buf(&blk->qiov, blk->buf,
    559                                     nr_sectors * BDRV_SECTOR_SIZE);
    560 
    561                 blk->aiocb = blk_aio_preadv(bmds->blk,
    562                                             sector * BDRV_SECTOR_SIZE,
    563                                             &blk->qiov, 0, blk_mig_read_cb,
    564                                             blk);
    565 
    566                 blk_mig_lock();
    567                 block_mig_state.submitted++;
    568                 bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
    569                 blk_mig_unlock();
    570             } else {
    571                 ret = blk_pread(bmds->blk, sector * BDRV_SECTOR_SIZE,
    572                                 nr_sectors * BDRV_SECTOR_SIZE, blk->buf, 0);
    573                 if (ret < 0) {
    574                     goto error;
    575                 }
    576                 blk_send(f, blk);
    577 
    578                 g_free(blk->buf);
    579                 g_free(blk);
    580             }
    581 
    582             sector += nr_sectors;
    583             bmds->cur_dirty = sector;
    584             break;
    585         }
    586 
    587         bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);
    588         sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
    589         bmds->cur_dirty = sector;
    590     }
    591 
    592     return (bmds->cur_dirty >= bmds->total_sectors);
    593 
    594 error:
    595     trace_migration_block_save_device_dirty(sector);
    596     g_free(blk->buf);
    597     g_free(blk);
    598     return ret;
    599 }
    600 
    601 /* Called with iothread lock taken.
    602  *
    603  * return value:
    604  * 0: too much data for max_downtime
    605  * 1: little enough data for max_downtime
    606  */
    607 static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
    608 {
    609     BlkMigDevState *bmds;
    610     int ret = 1;
    611 
    612     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
    613         aio_context_acquire(blk_get_aio_context(bmds->blk));
    614         ret = mig_save_device_dirty(f, bmds, is_async);
    615         aio_context_release(blk_get_aio_context(bmds->blk));
    616         if (ret <= 0) {
    617             break;
    618         }
    619     }
    620 
    621     return ret;
    622 }
    623 
    624 /* Called with no locks taken.  */
    625 
    626 static int flush_blks(QEMUFile *f)
    627 {
    628     BlkMigBlock *blk;
    629     int ret = 0;
    630 
    631     trace_migration_block_flush_blks("Enter", block_mig_state.submitted,
    632                                      block_mig_state.read_done,
    633                                      block_mig_state.transferred);
    634 
    635     blk_mig_lock();
    636     while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
    637         if (qemu_file_rate_limit(f)) {
    638             break;
    639         }
    640         if (blk->ret < 0) {
    641             ret = blk->ret;
    642             break;
    643         }
    644 
    645         QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
    646         blk_mig_unlock();
    647         blk_send(f, blk);
    648         blk_mig_lock();
    649 
    650         g_free(blk->buf);
    651         g_free(blk);
    652 
    653         block_mig_state.read_done--;
    654         block_mig_state.transferred++;
    655         assert(block_mig_state.read_done >= 0);
    656     }
    657     blk_mig_unlock();
    658 
    659     trace_migration_block_flush_blks("Exit", block_mig_state.submitted,
    660                                      block_mig_state.read_done,
    661                                      block_mig_state.transferred);
    662     return ret;
    663 }
    664 
    665 /* Called with iothread lock taken.  */
    666 
    667 static int64_t get_remaining_dirty(void)
    668 {
    669     BlkMigDevState *bmds;
    670     int64_t dirty = 0;
    671 
    672     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
    673         aio_context_acquire(blk_get_aio_context(bmds->blk));
    674         dirty += bdrv_get_dirty_count(bmds->dirty_bitmap);
    675         aio_context_release(blk_get_aio_context(bmds->blk));
    676     }
    677 
    678     return dirty;
    679 }
    680 
    681 
    682 
    683 /* Called with iothread lock taken.  */
    684 static void block_migration_cleanup_bmds(void)
    685 {
    686     BlkMigDevState *bmds;
    687     AioContext *ctx;
    688 
    689     unset_dirty_tracking();
    690 
    691     while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
    692         QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
    693         bdrv_op_unblock_all(blk_bs(bmds->blk), bmds->blocker);
    694         error_free(bmds->blocker);
    695 
    696         /* Save ctx, because bmds->blk can disappear during blk_unref.  */
    697         ctx = blk_get_aio_context(bmds->blk);
    698         aio_context_acquire(ctx);
    699         blk_unref(bmds->blk);
    700         aio_context_release(ctx);
    701 
    702         g_free(bmds->blk_name);
    703         g_free(bmds->aio_bitmap);
    704         g_free(bmds);
    705     }
    706 }
    707 
    708 /* Called with iothread lock taken.  */
    709 static void block_migration_cleanup(void *opaque)
    710 {
    711     BlkMigBlock *blk;
    712 
    713     bdrv_drain_all();
    714 
    715     block_migration_cleanup_bmds();
    716 
    717     blk_mig_lock();
    718     while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
    719         QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
    720         g_free(blk->buf);
    721         g_free(blk);
    722     }
    723     blk_mig_unlock();
    724 }
    725 
    726 static int block_save_setup(QEMUFile *f, void *opaque)
    727 {
    728     int ret;
    729 
    730     trace_migration_block_save("setup", block_mig_state.submitted,
    731                                block_mig_state.transferred);
    732 
    733     qemu_mutex_lock_iothread();
    734     ret = init_blk_migration(f);
    735     if (ret < 0) {
    736         qemu_mutex_unlock_iothread();
    737         return ret;
    738     }
    739 
    740     /* start tracking dirty blocks */
    741     ret = set_dirty_tracking();
    742 
    743     qemu_mutex_unlock_iothread();
    744 
    745     if (ret) {
    746         return ret;
    747     }
    748 
    749     ret = flush_blks(f);
    750     blk_mig_reset_dirty_cursor();
    751     qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    752 
    753     return ret;
    754 }
    755 
    756 static int block_save_iterate(QEMUFile *f, void *opaque)
    757 {
    758     int ret;
    759     int64_t last_bytes = qemu_file_total_transferred(f);
    760     int64_t delta_bytes;
    761 
    762     trace_migration_block_save("iterate", block_mig_state.submitted,
    763                                block_mig_state.transferred);
    764 
    765     ret = flush_blks(f);
    766     if (ret) {
    767         return ret;
    768     }
    769 
    770     blk_mig_reset_dirty_cursor();
    771 
    772     /* control the rate of transfer */
    773     blk_mig_lock();
    774     while (block_mig_state.read_done * BLK_MIG_BLOCK_SIZE <
    775            qemu_file_get_rate_limit(f) &&
    776            block_mig_state.submitted < MAX_PARALLEL_IO &&
    777            (block_mig_state.submitted + block_mig_state.read_done) <
    778            MAX_IO_BUFFERS) {
    779         blk_mig_unlock();
    780         if (block_mig_state.bulk_completed == 0) {
    781             /* first finish the bulk phase */
    782             if (blk_mig_save_bulked_block(f) == 0) {
    783                 /* finished saving bulk on all devices */
    784                 block_mig_state.bulk_completed = 1;
    785             }
    786             ret = 0;
    787         } else {
    788             /* Always called with iothread lock taken for
    789              * simplicity, block_save_complete also calls it.
    790              */
    791             qemu_mutex_lock_iothread();
    792             ret = blk_mig_save_dirty_block(f, 1);
    793             qemu_mutex_unlock_iothread();
    794         }
    795         if (ret < 0) {
    796             return ret;
    797         }
    798         blk_mig_lock();
    799         if (ret != 0) {
    800             /* no more dirty blocks */
    801             break;
    802         }
    803     }
    804     blk_mig_unlock();
    805 
    806     ret = flush_blks(f);
    807     if (ret) {
    808         return ret;
    809     }
    810 
    811     qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    812     delta_bytes = qemu_file_total_transferred(f) - last_bytes;
    813     if (delta_bytes > 0) {
    814         return 1;
    815     } else if (delta_bytes < 0) {
    816         return -1;
    817     } else {
    818         return 0;
    819     }
    820 }
    821 
    822 /* Called with iothread lock taken.  */
    823 
    824 static int block_save_complete(QEMUFile *f, void *opaque)
    825 {
    826     int ret;
    827 
    828     trace_migration_block_save("complete", block_mig_state.submitted,
    829                                block_mig_state.transferred);
    830 
    831     ret = flush_blks(f);
    832     if (ret) {
    833         return ret;
    834     }
    835 
    836     blk_mig_reset_dirty_cursor();
    837 
    838     /* we know for sure that the bulk save has completed and
    839        all async reads have completed */
    840     blk_mig_lock();
    841     assert(block_mig_state.submitted == 0);
    842     blk_mig_unlock();
    843 
    844     do {
    845         ret = blk_mig_save_dirty_block(f, 0);
    846         if (ret < 0) {
    847             return ret;
    848         }
    849     } while (ret == 0);
    850 
    851     /* report completion */
    852     qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);
    853 
    854     trace_migration_block_save_complete();
    855 
    856     qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    857 
    858     /* Make sure that our BlockBackends are gone, so that the block driver
    859      * nodes can be inactivated. */
    860     block_migration_cleanup_bmds();
    861 
    862     return 0;
    863 }
    864 
    865 static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
    866                                uint64_t *res_precopy_only,
    867                                uint64_t *res_compatible,
    868                                uint64_t *res_postcopy_only)
    869 {
    870     /* Estimate the number of pending bytes to send */
    871     uint64_t pending;
    872 
    873     qemu_mutex_lock_iothread();
    874     pending = get_remaining_dirty();
    875     qemu_mutex_unlock_iothread();
    876 
    877     blk_mig_lock();
    878     pending += block_mig_state.submitted * BLK_MIG_BLOCK_SIZE +
    879                block_mig_state.read_done * BLK_MIG_BLOCK_SIZE;
    880     blk_mig_unlock();
    881 
    882     /* Report at least one block pending during bulk phase */
    883     if (!pending && !block_mig_state.bulk_completed) {
    884         pending = BLK_MIG_BLOCK_SIZE;
    885     }
    886 
    887     trace_migration_block_save_pending(pending);
    888     /* We don't do postcopy */
    889     *res_precopy_only += pending;
    890 }
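
Aside (not part of block.c): the pending estimate above is whatever get_remaining_dirty() reports plus one BLK_MIG_BLOCK_SIZE buffer for each request that is still submitted or already read but not yet sent. A small worked example with made-up counter values:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        const uint64_t block = 1ULL << 20;   /* BLK_MIG_BLOCK_SIZE */
        uint64_t dirty = 3 * block;          /* e.g. get_remaining_dirty() */
        int submitted = 2, read_done = 5;    /* e.g. block_mig_state counters */

        uint64_t pending = dirty + (uint64_t)submitted * block
                                 + (uint64_t)read_done * block;
        assert(pending == 10 * block);       /* 10 MiB still to send */
        return 0;
    }
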
    891 
    892 static int block_load(QEMUFile *f, void *opaque, int version_id)
    893 {
    894     static int banner_printed;
    895     int len, flags;
    896     char device_name[256];
    897     int64_t addr;
    898     BlockBackend *blk, *blk_prev = NULL;
    899     Error *local_err = NULL;
    900     uint8_t *buf;
    901     int64_t total_sectors = 0;
    902     int nr_sectors;
    903     int ret;
    904     BlockDriverInfo bdi;
    905     int cluster_size = BLK_MIG_BLOCK_SIZE;
    906 
    907     do {
    908         addr = qemu_get_be64(f);
    909 
    910         flags = addr & (BDRV_SECTOR_SIZE - 1);
    911         addr >>= BDRV_SECTOR_BITS;
    912 
    913         if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
    914             /* get device name */
    915             len = qemu_get_byte(f);
    916             qemu_get_buffer(f, (uint8_t *)device_name, len);
    917             device_name[len] = '\0';
    918 
    919             blk = blk_by_name(device_name);
    920             if (!blk) {
    921                 fprintf(stderr, "Error unknown block device %s\n",
    922                         device_name);
    923                 return -EINVAL;
    924             }
    925 
    926             if (blk != blk_prev) {
    927                 blk_prev = blk;
    928                 total_sectors = blk_nb_sectors(blk);
    929                 if (total_sectors <= 0) {
    930                     error_report("Error getting length of block device %s",
    931                                  device_name);
    932                     return -EINVAL;
    933                 }
    934 
    935                 blk_activate(blk, &local_err);
    936                 if (local_err) {
    937                     error_report_err(local_err);
    938                     return -EINVAL;
    939                 }
    940 
    941                 ret = bdrv_get_info(blk_bs(blk), &bdi);
    942                 if (ret == 0 && bdi.cluster_size > 0 &&
    943                     bdi.cluster_size <= BLK_MIG_BLOCK_SIZE &&
    944                     BLK_MIG_BLOCK_SIZE % bdi.cluster_size == 0) {
    945                     cluster_size = bdi.cluster_size;
    946                 } else {
    947                     cluster_size = BLK_MIG_BLOCK_SIZE;
    948                 }
    949             }
    950 
    951             if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
    952                 nr_sectors = total_sectors - addr;
    953             } else {
    954                 nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
    955             }
    956 
    957             if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
    958                 ret = blk_pwrite_zeroes(blk, addr * BDRV_SECTOR_SIZE,
    959                                         nr_sectors * BDRV_SECTOR_SIZE,
    960                                         BDRV_REQ_MAY_UNMAP);
    961             } else {
    962                 int i;
    963                 int64_t cur_addr;
    964                 uint8_t *cur_buf;
    965 
    966                 buf = g_malloc(BLK_MIG_BLOCK_SIZE);
    967                 qemu_get_buffer(f, buf, BLK_MIG_BLOCK_SIZE);
    968                 for (i = 0; i < BLK_MIG_BLOCK_SIZE / cluster_size; i++) {
    969                     cur_addr = addr * BDRV_SECTOR_SIZE + i * cluster_size;
    970                     cur_buf = buf + i * cluster_size;
    971 
    972                     if ((!block_mig_state.zero_blocks ||
    973                         cluster_size < BLK_MIG_BLOCK_SIZE) &&
    974                         buffer_is_zero(cur_buf, cluster_size)) {
    975                         ret = blk_pwrite_zeroes(blk, cur_addr,
    976                                                 cluster_size,
    977                                                 BDRV_REQ_MAY_UNMAP);
    978                     } else {
    979                         ret = blk_pwrite(blk, cur_addr, cluster_size, cur_buf,
    980                                          0);
    981                     }
    982                     if (ret < 0) {
    983                         break;
    984                     }
    985                 }
    986                 g_free(buf);
    987             }
    988 
    989             if (ret < 0) {
    990                 return ret;
    991             }
    992         } else if (flags & BLK_MIG_FLAG_PROGRESS) {
    993             if (!banner_printed) {
    994                 printf("Receiving block device images\n");
    995                 banner_printed = 1;
    996             }
    997             printf("Completed %d %%%c", (int)addr,
    998                    (addr == 100) ? '\n' : '\r');
    999             fflush(stdout);
   1000         } else if (!(flags & BLK_MIG_FLAG_EOS)) {
   1001             fprintf(stderr, "Unknown block migration flags: 0x%x\n", flags);
   1002             return -EINVAL;
   1003         }
   1004         ret = qemu_file_get_error(f);
   1005         if (ret != 0) {
   1006             return ret;
   1007         }
   1008     } while (!(flags & BLK_MIG_FLAG_EOS));
   1009 
   1010     return 0;
   1011 }
   1012 
   1013 static bool block_is_active(void *opaque)
   1014 {
   1015     return migrate_use_block();
   1016 }
   1017 
   1018 static SaveVMHandlers savevm_block_handlers = {
   1019     .save_setup = block_save_setup,
   1020     .save_live_iterate = block_save_iterate,
   1021     .save_live_complete_precopy = block_save_complete,
   1022     .save_live_pending = block_save_pending,
   1023     .load_state = block_load,
   1024     .save_cleanup = block_migration_cleanup,
   1025     .is_active = block_is_active,
   1026 };
   1027 
   1028 void blk_mig_init(void)
   1029 {
   1030     QSIMPLEQ_INIT(&block_mig_state.bmds_list);
   1031     QSIMPLEQ_INIT(&block_mig_state.blk_list);
   1032     qemu_mutex_init(&block_mig_state.lock);
   1033 
   1034     register_savevm_live("block", 0, 1, &savevm_block_handlers,
   1035                          &block_mig_state);
   1036 }