qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

vdi.c (35448B)


      1 /*
      2  * Block driver for the Virtual Disk Image (VDI) format
      3  *
      4  * Copyright (c) 2009, 2012 Stefan Weil
      5  *
      6  * This program is free software: you can redistribute it and/or modify
      7  * it under the terms of the GNU General Public License as published by
      8  * the Free Software Foundation, either version 2 of the License, or
      9  * (at your option) version 3 or any later version.
     10  *
     11  * This program is distributed in the hope that it will be useful,
     12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14  * GNU General Public License for more details.
     15  *
     16  * You should have received a copy of the GNU General Public License
     17  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
     18  *
     19  * Reference:
     20  * http://forums.virtualbox.org/viewtopic.php?t=8046
     21  *
     22  * This driver supports create / read / write operations on VDI images.
     23  *
     24  * Todo (see also TODO in code):
     25  *
     26  * Some features like snapshots are still missing.
     27  *
     28  * Deallocation of zero-filled blocks and shrinking images are missing, too
     29  * (might be added to common block layer).
     30  *
     31  * Allocation of blocks could be optimized (less writes to block map and
     32  * header).
     33  *
     34  * Read and write of adjacent blocks could be done in one operation
     35  * (current code uses one operation per block (1 MiB)).
     36  *
     37  * The code is not thread safe (missing locks for changes in header and
     38  * block table, no problem with current QEMU).
     39  *
     40  * Hints:
     41  *
     42  * Blocks (VDI documentation) correspond to clusters (QEMU).
     43  * QEMU's backing files could be implemented using VDI snapshot files (TODO).
     44  * VDI snapshot files may also contain the complete machine state.
     45  * Maybe this machine state can be converted to QEMU PC machine snapshot data.
     46  *
     47  * The driver keeps a block cache (little endian entries) in memory.
     48  * For the standard block size (1 MiB), a 1 TiB disk will use 4 MiB RAM,
     49  * so this seems to be reasonable.
     50  */
     51 
     52 #include "qemu/osdep.h"
     53 #include "qemu/units.h"
     54 #include "qapi/error.h"
     55 #include "qapi/qobject-input-visitor.h"
     56 #include "qapi/qapi-visit-block-core.h"
     57 #include "block/block_int.h"
     58 #include "block/qdict.h"
     59 #include "sysemu/block-backend.h"
     60 #include "qemu/module.h"
     61 #include "qemu/option.h"
     62 #include "qemu/bswap.h"
     63 #include "migration/blocker.h"
     64 #include "qemu/coroutine.h"
     65 #include "qemu/cutils.h"
     66 #include "qemu/uuid.h"
     67 #include "qemu/memalign.h"
     68 
/* Code configuration options. */

/* Enable debug messages (selects VDI_DEBUG below; disabled by default). */
//~ #define CONFIG_VDI_DEBUG

/* Support write operations on VDI images. */
#define CONFIG_VDI_WRITE

/* Support non-standard block (cluster) size. This is untested.
 * Maybe it will be needed for very large images.
 * Disabled by default.
 */
//~ #define CONFIG_VDI_BLOCK_SIZE

/* Support static (fixed, pre-allocated) images. */
#define CONFIG_VDI_STATIC_IMAGE

/* Command line option for static images. */
#define BLOCK_OPT_STATIC "static"

/* Sector size in bytes; the only value accepted in an image header. */
#define SECTOR_SIZE 512
/* Block (cluster) size in bytes, i.e. 1 MiB; the only value accepted
 * when an image is opened. */
#define DEFAULT_CLUSTER_SIZE 1048576
/* Note: can't use 1 * MiB, because it's passed to stringify() */

/* VDI_DEBUG is always defined (0 or 1) so it can be tested with a plain
 * C "if"; the debug code is then always seen by the compiler. */
#if defined(CONFIG_VDI_DEBUG)
#define VDI_DEBUG 1
#else
#define VDI_DEBUG 0
#endif

/* Debug trace helper: when VDI_DEBUG is non-zero, prints "vdi", a tab,
 * the calling function's name (left-aligned in 24 columns) and the
 * printf-style message to stderr.  Does nothing when VDI_DEBUG is 0. */
#define logout(fmt, ...) \
    do {                                                                \
        if (VDI_DEBUG) {                                                \
            fprintf(stderr, "vdi\t%-24s" fmt, __func__, ##__VA_ARGS__); \
        }                                                               \
    } while (0)
    104 
/* Image signature (compared against the header's signature field after
 * conversion from little endian). */
#define VDI_SIGNATURE 0xbeda107f

/* Image version: major number in the upper 16 bits, minor number in the
 * lower 16 bits.  This is the only version accepted on open. */
#define VDI_VERSION_1_1 0x00010001

/* Image type. */
#define VDI_TYPE_DYNAMIC 1
#define VDI_TYPE_STATIC  2

/* Innotek / SUN images use these strings in header.text:
 * "<<< innotek VirtualBox Disk Image >>>\n"
 * "<<< Sun xVM VirtualBox Disk Image >>>\n"
 * "<<< Sun VirtualBox Disk Image >>>\n"
 * The value does not matter, so QEMU created images use a different text.
 */
#define VDI_TEXT "<<< QEMU VM Virtual Disk Image >>>\n"

/* Block map entry of a never-allocated block; semantically arbitrary
 * content. */
#define VDI_UNALLOCATED 0xffffffffU

/* Block map entry of a discarded (no longer allocated) block; semantically
 * zero-filled. */
#define VDI_DISCARDED   0xfffffffeU

/* True if block map entry X (host byte order) refers to a data block,
 * i.e. is neither VDI_DISCARDED nor VDI_UNALLOCATED. */
#define VDI_IS_ALLOCATED(X) ((X) < VDI_DISCARDED)

/* The bmap will take up VDI_BLOCKS_IN_IMAGE_MAX * sizeof(uint32_t) bytes; since
 * the bmap is read and written in a single operation, its size needs to be
 * limited to INT_MAX; furthermore, when opening an image, the bmap size is
 * rounded up to be aligned on BDRV_SECTOR_SIZE.
 * Therefore this should satisfy the following:
 * VDI_BLOCKS_IN_IMAGE_MAX * sizeof(uint32_t) + BDRV_SECTOR_SIZE == INT_MAX + 1
 * (INT_MAX + 1 is the first value not representable as an int)
 * This guarantees that any value below or equal to the constant will, when
 * multiplied by sizeof(uint32_t) and rounded up to a BDRV_SECTOR_SIZE boundary,
 * still be below or equal to INT_MAX. */
#define VDI_BLOCKS_IN_IMAGE_MAX \
    ((unsigned)((INT_MAX + 1u - BDRV_SECTOR_SIZE) / sizeof(uint32_t)))
/* Largest supported virtual disk size in bytes: the maximum number of
 * blocks, each of the default block size. */
#define VDI_DISK_SIZE_MAX        ((uint64_t)VDI_BLOCKS_IN_IMAGE_MAX * \
                                  (uint64_t)DEFAULT_CLUSTER_SIZE)

/* Option list for image creation; only declared here.
 * NOTE(review): presumably defined with its initializer further down the
 * file -- confirm. */
static QemuOptsList vdi_create_opts;
    147 
/*
 * VDI image header.  It is read from offset 0 of the image file and its
 * multi-byte fields are little endian there; vdi_header_to_cpu() and
 * vdi_header_to_le() convert between that and host byte order.
 * The struct is packed and must be exactly 512 bytes (checked below).
 */
typedef struct {
    char text[0x40];            /* free-form text, see VDI_TEXT */
    uint32_t signature;         /* must be VDI_SIGNATURE */
    uint32_t version;           /* must be VDI_VERSION_1_1 */
    uint32_t header_size;
    uint32_t image_type;        /* VDI_TYPE_DYNAMIC or VDI_TYPE_STATIC */
    uint32_t image_flags;
    char description[256];
    uint32_t offset_bmap;       /* byte offset of block map, sector aligned */
    uint32_t offset_data;       /* byte offset of first data block, sector
                                   aligned */
    uint32_t cylinders;         /* disk geometry, unused here */
    uint32_t heads;             /* disk geometry, unused here */
    uint32_t sectors;           /* disk geometry, unused here */
    uint32_t sector_size;       /* must be SECTOR_SIZE */
    uint32_t unused1;
    uint64_t disk_size;         /* virtual disk size in bytes */
    uint32_t block_size;        /* must be DEFAULT_CLUSTER_SIZE */
    uint32_t block_extra;       /* unused here */
    uint32_t blocks_in_image;   /* number of block map entries */
    uint32_t blocks_allocated;  /* number of data blocks in use */
    QemuUUID uuid_image;
    QemuUUID uuid_last_snap;
    QemuUUID uuid_link;         /* must be the null UUID to open the image */
    QemuUUID uuid_parent;       /* must be the null UUID to open the image */
    uint64_t unused2[7];
} QEMU_PACKED VdiHeader;

/* Fail the build if the header layout is not exactly 512 bytes. */
QEMU_BUILD_BUG_ON(sizeof(VdiHeader) != 512);
    176 
/* Per-image driver state, stored in BlockDriverState.opaque. */
typedef struct {
    /* The block map entries are little endian (even in memory). */
    uint32_t *bmap;
    /* Size of block (bytes). */
    uint32_t block_size;
    /* First sector of block map. */
    uint32_t bmap_sector;
    /* VDI header (converted to host endianness). */
    VdiHeader header;

    /* Taken by the read and write paths around accesses to bmap;
     * upgraded to a write lock while a new block is being allocated. */
    CoRwlock bmap_lock;

    /* Blocker registered with migrate_add_blocker() when the image is
     * opened. */
    Error *migration_blocker;
} BDRVVdiState;
    191 
    192 static void vdi_header_to_cpu(VdiHeader *header)
    193 {
    194     header->signature = le32_to_cpu(header->signature);
    195     header->version = le32_to_cpu(header->version);
    196     header->header_size = le32_to_cpu(header->header_size);
    197     header->image_type = le32_to_cpu(header->image_type);
    198     header->image_flags = le32_to_cpu(header->image_flags);
    199     header->offset_bmap = le32_to_cpu(header->offset_bmap);
    200     header->offset_data = le32_to_cpu(header->offset_data);
    201     header->cylinders = le32_to_cpu(header->cylinders);
    202     header->heads = le32_to_cpu(header->heads);
    203     header->sectors = le32_to_cpu(header->sectors);
    204     header->sector_size = le32_to_cpu(header->sector_size);
    205     header->disk_size = le64_to_cpu(header->disk_size);
    206     header->block_size = le32_to_cpu(header->block_size);
    207     header->block_extra = le32_to_cpu(header->block_extra);
    208     header->blocks_in_image = le32_to_cpu(header->blocks_in_image);
    209     header->blocks_allocated = le32_to_cpu(header->blocks_allocated);
    210     header->uuid_image = qemu_uuid_bswap(header->uuid_image);
    211     header->uuid_last_snap = qemu_uuid_bswap(header->uuid_last_snap);
    212     header->uuid_link = qemu_uuid_bswap(header->uuid_link);
    213     header->uuid_parent = qemu_uuid_bswap(header->uuid_parent);
    214 }
    215 
    216 static void vdi_header_to_le(VdiHeader *header)
    217 {
    218     header->signature = cpu_to_le32(header->signature);
    219     header->version = cpu_to_le32(header->version);
    220     header->header_size = cpu_to_le32(header->header_size);
    221     header->image_type = cpu_to_le32(header->image_type);
    222     header->image_flags = cpu_to_le32(header->image_flags);
    223     header->offset_bmap = cpu_to_le32(header->offset_bmap);
    224     header->offset_data = cpu_to_le32(header->offset_data);
    225     header->cylinders = cpu_to_le32(header->cylinders);
    226     header->heads = cpu_to_le32(header->heads);
    227     header->sectors = cpu_to_le32(header->sectors);
    228     header->sector_size = cpu_to_le32(header->sector_size);
    229     header->disk_size = cpu_to_le64(header->disk_size);
    230     header->block_size = cpu_to_le32(header->block_size);
    231     header->block_extra = cpu_to_le32(header->block_extra);
    232     header->blocks_in_image = cpu_to_le32(header->blocks_in_image);
    233     header->blocks_allocated = cpu_to_le32(header->blocks_allocated);
    234     header->uuid_image = qemu_uuid_bswap(header->uuid_image);
    235     header->uuid_last_snap = qemu_uuid_bswap(header->uuid_last_snap);
    236     header->uuid_link = qemu_uuid_bswap(header->uuid_link);
    237     header->uuid_parent = qemu_uuid_bswap(header->uuid_parent);
    238 }
    239 
    240 static void vdi_header_print(VdiHeader *header)
    241 {
    242     char uuidstr[37];
    243     QemuUUID uuid;
    244     logout("text        %s", header->text);
    245     logout("signature   0x%08x\n", header->signature);
    246     logout("header size 0x%04x\n", header->header_size);
    247     logout("image type  0x%04x\n", header->image_type);
    248     logout("image flags 0x%04x\n", header->image_flags);
    249     logout("description %s\n", header->description);
    250     logout("offset bmap 0x%04x\n", header->offset_bmap);
    251     logout("offset data 0x%04x\n", header->offset_data);
    252     logout("cylinders   0x%04x\n", header->cylinders);
    253     logout("heads       0x%04x\n", header->heads);
    254     logout("sectors     0x%04x\n", header->sectors);
    255     logout("sector size 0x%04x\n", header->sector_size);
    256     logout("image size  0x%" PRIx64 " B (%" PRIu64 " MiB)\n",
    257            header->disk_size, header->disk_size / MiB);
    258     logout("block size  0x%04x\n", header->block_size);
    259     logout("block extra 0x%04x\n", header->block_extra);
    260     logout("blocks tot. 0x%04x\n", header->blocks_in_image);
    261     logout("blocks all. 0x%04x\n", header->blocks_allocated);
    262     uuid = header->uuid_image;
    263     qemu_uuid_unparse(&uuid, uuidstr);
    264     logout("uuid image  %s\n", uuidstr);
    265     uuid = header->uuid_last_snap;
    266     qemu_uuid_unparse(&uuid, uuidstr);
    267     logout("uuid snap   %s\n", uuidstr);
    268     uuid = header->uuid_link;
    269     qemu_uuid_unparse(&uuid, uuidstr);
    270     logout("uuid link   %s\n", uuidstr);
    271     uuid = header->uuid_parent;
    272     qemu_uuid_unparse(&uuid, uuidstr);
    273     logout("uuid parent %s\n", uuidstr);
    274 }
    275 
    276 static int coroutine_fn vdi_co_check(BlockDriverState *bs, BdrvCheckResult *res,
    277                                      BdrvCheckMode fix)
    278 {
    279     /* TODO: additional checks possible. */
    280     BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
    281     uint32_t blocks_allocated = 0;
    282     uint32_t block;
    283     uint32_t *bmap;
    284     logout("\n");
    285 
    286     if (fix) {
    287         return -ENOTSUP;
    288     }
    289 
    290     bmap = g_try_new(uint32_t, s->header.blocks_in_image);
    291     if (s->header.blocks_in_image && bmap == NULL) {
    292         res->check_errors++;
    293         return -ENOMEM;
    294     }
    295 
    296     memset(bmap, 0xff, s->header.blocks_in_image * sizeof(uint32_t));
    297 
    298     /* Check block map and value of blocks_allocated. */
    299     for (block = 0; block < s->header.blocks_in_image; block++) {
    300         uint32_t bmap_entry = le32_to_cpu(s->bmap[block]);
    301         if (VDI_IS_ALLOCATED(bmap_entry)) {
    302             if (bmap_entry < s->header.blocks_in_image) {
    303                 blocks_allocated++;
    304                 if (!VDI_IS_ALLOCATED(bmap[bmap_entry])) {
    305                     bmap[bmap_entry] = bmap_entry;
    306                 } else {
    307                     fprintf(stderr, "ERROR: block index %" PRIu32
    308                             " also used by %" PRIu32 "\n", bmap[bmap_entry], bmap_entry);
    309                     res->corruptions++;
    310                 }
    311             } else {
    312                 fprintf(stderr, "ERROR: block index %" PRIu32
    313                         " too large, is %" PRIu32 "\n", block, bmap_entry);
    314                 res->corruptions++;
    315             }
    316         }
    317     }
    318     if (blocks_allocated != s->header.blocks_allocated) {
    319         fprintf(stderr, "ERROR: allocated blocks mismatch, is %" PRIu32
    320                ", should be %" PRIu32 "\n",
    321                blocks_allocated, s->header.blocks_allocated);
    322         res->corruptions++;
    323     }
    324 
    325     g_free(bmap);
    326 
    327     return 0;
    328 }
    329 
    330 static int vdi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    331 {
    332     /* TODO: vdi_get_info would be needed for machine snapshots.
    333        vm_state_offset is still missing. */
    334     BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
    335     logout("\n");
    336     bdi->cluster_size = s->block_size;
    337     bdi->vm_state_offset = 0;
    338     return 0;
    339 }
    340 
/*
 * Placeholder for emptying an image: nothing is changed and success (0)
 * is reported unconditionally.
 */
static int vdi_make_empty(BlockDriverState *bs)
{
    /* TODO: missing code. */
    logout("\n");
    /* The return value for missing code must be 0, see block.c. */
    return 0;
}
    348 
    349 static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename)
    350 {
    351     const VdiHeader *header = (const VdiHeader *)buf;
    352     int ret = 0;
    353 
    354     logout("\n");
    355 
    356     if (buf_size < sizeof(*header)) {
    357         /* Header too small, no VDI. */
    358     } else if (le32_to_cpu(header->signature) == VDI_SIGNATURE) {
    359         ret = 100;
    360     }
    361 
    362     if (ret == 0) {
    363         logout("no vdi image\n");
    364     } else {
    365         logout("%s", header->text);
    366     }
    367 
    368     return ret;
    369 }
    370 
    371 static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
    372                     Error **errp)
    373 {
    374     BDRVVdiState *s = bs->opaque;
    375     VdiHeader header;
    376     size_t bmap_size;
    377     int ret;
    378     QemuUUID uuid_link, uuid_parent;
    379 
    380     ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
    381     if (ret < 0) {
    382         return ret;
    383     }
    384 
    385     logout("\n");
    386 
    387     ret = bdrv_pread(bs->file, 0, sizeof(header), &header, 0);
    388     if (ret < 0) {
    389         goto fail;
    390     }
    391 
    392     vdi_header_to_cpu(&header);
    393     if (VDI_DEBUG) {
    394         vdi_header_print(&header);
    395     }
    396 
    397     if (header.disk_size > VDI_DISK_SIZE_MAX) {
    398         error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64
    399                           ", max supported is 0x%" PRIx64 ")",
    400                           header.disk_size, VDI_DISK_SIZE_MAX);
    401         ret = -ENOTSUP;
    402         goto fail;
    403     }
    404 
    405     uuid_link = header.uuid_link;
    406     uuid_parent = header.uuid_parent;
    407 
    408     if (header.disk_size % SECTOR_SIZE != 0) {
    409         /* 'VBoxManage convertfromraw' can create images with odd disk sizes.
    410            We accept them but round the disk size to the next multiple of
    411            SECTOR_SIZE. */
    412         logout("odd disk size %" PRIu64 " B, round up\n", header.disk_size);
    413         header.disk_size = ROUND_UP(header.disk_size, SECTOR_SIZE);
    414     }
    415 
    416     if (header.signature != VDI_SIGNATURE) {
    417         error_setg(errp, "Image not in VDI format (bad signature %08" PRIx32
    418                    ")", header.signature);
    419         ret = -EINVAL;
    420         goto fail;
    421     } else if (header.version != VDI_VERSION_1_1) {
    422         error_setg(errp, "unsupported VDI image (version %" PRIu32 ".%" PRIu32
    423                    ")", header.version >> 16, header.version & 0xffff);
    424         ret = -ENOTSUP;
    425         goto fail;
    426     } else if (header.offset_bmap % SECTOR_SIZE != 0) {
    427         /* We only support block maps which start on a sector boundary. */
    428         error_setg(errp, "unsupported VDI image (unaligned block map offset "
    429                    "0x%" PRIx32 ")", header.offset_bmap);
    430         ret = -ENOTSUP;
    431         goto fail;
    432     } else if (header.offset_data % SECTOR_SIZE != 0) {
    433         /* We only support data blocks which start on a sector boundary. */
    434         error_setg(errp, "unsupported VDI image (unaligned data offset 0x%"
    435                    PRIx32 ")", header.offset_data);
    436         ret = -ENOTSUP;
    437         goto fail;
    438     } else if (header.sector_size != SECTOR_SIZE) {
    439         error_setg(errp, "unsupported VDI image (sector size %" PRIu32
    440                    " is not %u)", header.sector_size, SECTOR_SIZE);
    441         ret = -ENOTSUP;
    442         goto fail;
    443     } else if (header.block_size != DEFAULT_CLUSTER_SIZE) {
    444         error_setg(errp, "unsupported VDI image (block size %" PRIu32
    445                          " is not %" PRIu32 ")",
    446                    header.block_size, DEFAULT_CLUSTER_SIZE);
    447         ret = -ENOTSUP;
    448         goto fail;
    449     } else if (header.disk_size >
    450                (uint64_t)header.blocks_in_image * header.block_size) {
    451         error_setg(errp, "unsupported VDI image (disk size %" PRIu64 ", "
    452                    "image bitmap has room for %" PRIu64 ")",
    453                    header.disk_size,
    454                    (uint64_t)header.blocks_in_image * header.block_size);
    455         ret = -ENOTSUP;
    456         goto fail;
    457     } else if (!qemu_uuid_is_null(&uuid_link)) {
    458         error_setg(errp, "unsupported VDI image (non-NULL link UUID)");
    459         ret = -ENOTSUP;
    460         goto fail;
    461     } else if (!qemu_uuid_is_null(&uuid_parent)) {
    462         error_setg(errp, "unsupported VDI image (non-NULL parent UUID)");
    463         ret = -ENOTSUP;
    464         goto fail;
    465     } else if (header.blocks_in_image > VDI_BLOCKS_IN_IMAGE_MAX) {
    466         error_setg(errp, "unsupported VDI image "
    467                          "(too many blocks %u, max is %u)",
    468                           header.blocks_in_image, VDI_BLOCKS_IN_IMAGE_MAX);
    469         ret = -ENOTSUP;
    470         goto fail;
    471     }
    472 
    473     bs->total_sectors = header.disk_size / SECTOR_SIZE;
    474 
    475     s->block_size = header.block_size;
    476     s->bmap_sector = header.offset_bmap / SECTOR_SIZE;
    477     s->header = header;
    478 
    479     bmap_size = header.blocks_in_image * sizeof(uint32_t);
    480     bmap_size = DIV_ROUND_UP(bmap_size, SECTOR_SIZE);
    481     s->bmap = qemu_try_blockalign(bs->file->bs, bmap_size * SECTOR_SIZE);
    482     if (s->bmap == NULL) {
    483         ret = -ENOMEM;
    484         goto fail;
    485     }
    486 
    487     ret = bdrv_pread(bs->file, header.offset_bmap, bmap_size * SECTOR_SIZE,
    488                      s->bmap, 0);
    489     if (ret < 0) {
    490         goto fail_free_bmap;
    491     }
    492 
    493     /* Disable migration when vdi images are used */
    494     error_setg(&s->migration_blocker, "The vdi format used by node '%s' "
    495                "does not support live migration",
    496                bdrv_get_device_or_node_name(bs));
    497     ret = migrate_add_blocker(s->migration_blocker, errp);
    498     if (ret < 0) {
    499         error_free(s->migration_blocker);
    500         goto fail_free_bmap;
    501     }
    502 
    503     qemu_co_rwlock_init(&s->bmap_lock);
    504 
    505     return 0;
    506 
    507  fail_free_bmap:
    508     qemu_vfree(s->bmap);
    509 
    510  fail:
    511     return ret;
    512 }
    513 
/*
 * Reopen preparation hook: nothing needs to be prepared, so every request
 * is accepted.  None of the arguments is inspected.  Always returns 0.
 */
static int vdi_reopen_prepare(BDRVReopenState *state,
                              BlockReopenQueue *queue, Error **errp)
{
    return 0;
}
    519 
    520 static int coroutine_fn vdi_co_block_status(BlockDriverState *bs,
    521                                             bool want_zero,
    522                                             int64_t offset, int64_t bytes,
    523                                             int64_t *pnum, int64_t *map,
    524                                             BlockDriverState **file)
    525 {
    526     BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
    527     size_t bmap_index = offset / s->block_size;
    528     size_t index_in_block = offset % s->block_size;
    529     uint32_t bmap_entry = le32_to_cpu(s->bmap[bmap_index]);
    530     int result;
    531 
    532     logout("%p, %" PRId64 ", %" PRId64 ", %p\n", bs, offset, bytes, pnum);
    533     *pnum = MIN(s->block_size - index_in_block, bytes);
    534     result = VDI_IS_ALLOCATED(bmap_entry);
    535     if (!result) {
    536         return BDRV_BLOCK_ZERO;
    537     }
    538 
    539     *map = s->header.offset_data + (uint64_t)bmap_entry * s->block_size +
    540         index_in_block;
    541     *file = bs->file->bs;
    542     return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID |
    543         (s->header.image_type == VDI_TYPE_STATIC ? BDRV_BLOCK_RECURSE : 0);
    544 }
    545 
    546 static int coroutine_fn
    547 vdi_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
    548               QEMUIOVector *qiov, BdrvRequestFlags flags)
    549 {
    550     BDRVVdiState *s = bs->opaque;
    551     QEMUIOVector local_qiov;
    552     uint32_t bmap_entry;
    553     uint32_t block_index;
    554     uint32_t offset_in_block;
    555     uint32_t n_bytes;
    556     uint64_t bytes_done = 0;
    557     int ret = 0;
    558 
    559     logout("\n");
    560 
    561     qemu_iovec_init(&local_qiov, qiov->niov);
    562 
    563     while (ret >= 0 && bytes > 0) {
    564         block_index = offset / s->block_size;
    565         offset_in_block = offset % s->block_size;
    566         n_bytes = MIN(bytes, s->block_size - offset_in_block);
    567 
    568         logout("will read %u bytes starting at offset %" PRIu64 "\n",
    569                n_bytes, offset);
    570 
    571         /* prepare next AIO request */
    572         qemu_co_rwlock_rdlock(&s->bmap_lock);
    573         bmap_entry = le32_to_cpu(s->bmap[block_index]);
    574         qemu_co_rwlock_unlock(&s->bmap_lock);
    575         if (!VDI_IS_ALLOCATED(bmap_entry)) {
    576             /* Block not allocated, return zeros, no need to wait. */
    577             qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
    578             ret = 0;
    579         } else {
    580             uint64_t data_offset = s->header.offset_data +
    581                                    (uint64_t)bmap_entry * s->block_size +
    582                                    offset_in_block;
    583 
    584             qemu_iovec_reset(&local_qiov);
    585             qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
    586 
    587             ret = bdrv_co_preadv(bs->file, data_offset, n_bytes,
    588                                  &local_qiov, 0);
    589         }
    590         logout("%u bytes read\n", n_bytes);
    591 
    592         bytes -= n_bytes;
    593         offset += n_bytes;
    594         bytes_done += n_bytes;
    595     }
    596 
    597     qemu_iovec_destroy(&local_qiov);
    598 
    599     return ret;
    600 }
    601 
    602 static int coroutine_fn
    603 vdi_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
    604                QEMUIOVector *qiov, BdrvRequestFlags flags)
    605 {
    606     BDRVVdiState *s = bs->opaque;
    607     QEMUIOVector local_qiov;
    608     uint32_t bmap_entry;
    609     uint32_t block_index;
    610     uint32_t offset_in_block;
    611     uint32_t n_bytes;
    612     uint64_t data_offset;
    613     uint32_t bmap_first = VDI_UNALLOCATED;
    614     uint32_t bmap_last = VDI_UNALLOCATED;
    615     uint8_t *block = NULL;
    616     uint64_t bytes_done = 0;
    617     int ret = 0;
    618 
    619     logout("\n");
    620 
    621     qemu_iovec_init(&local_qiov, qiov->niov);
    622 
    623     while (ret >= 0 && bytes > 0) {
    624         block_index = offset / s->block_size;
    625         offset_in_block = offset % s->block_size;
    626         n_bytes = MIN(bytes, s->block_size - offset_in_block);
    627 
    628         logout("will write %u bytes starting at offset %" PRIu64 "\n",
    629                n_bytes, offset);
    630 
    631         /* prepare next AIO request */
    632         qemu_co_rwlock_rdlock(&s->bmap_lock);
    633         bmap_entry = le32_to_cpu(s->bmap[block_index]);
    634         if (!VDI_IS_ALLOCATED(bmap_entry)) {
    635             /* Allocate new block and write to it. */
    636             uint64_t data_offset;
    637             qemu_co_rwlock_upgrade(&s->bmap_lock);
    638             bmap_entry = le32_to_cpu(s->bmap[block_index]);
    639             if (VDI_IS_ALLOCATED(bmap_entry)) {
    640                 /* A concurrent allocation did the work for us.  */
    641                 qemu_co_rwlock_downgrade(&s->bmap_lock);
    642                 goto nonallocating_write;
    643             }
    644 
    645             bmap_entry = s->header.blocks_allocated;
    646             s->bmap[block_index] = cpu_to_le32(bmap_entry);
    647             s->header.blocks_allocated++;
    648             data_offset = s->header.offset_data +
    649                           (uint64_t)bmap_entry * s->block_size;
    650             if (block == NULL) {
    651                 block = g_malloc(s->block_size);
    652                 bmap_first = block_index;
    653             }
    654             bmap_last = block_index;
    655             /* Copy data to be written to new block and zero unused parts. */
    656             memset(block, 0, offset_in_block);
    657             qemu_iovec_to_buf(qiov, bytes_done, block + offset_in_block,
    658                               n_bytes);
    659             memset(block + offset_in_block + n_bytes, 0,
    660                    s->block_size - n_bytes - offset_in_block);
    661 
    662             /* Write the new block under CoRwLock write-side protection,
    663              * so this full-cluster write does not overlap a partial write
    664              * of the same cluster, issued from the "else" branch.
    665              */
    666             ret = bdrv_co_pwrite(bs->file, data_offset, s->block_size, block,
    667                                  0);
    668             qemu_co_rwlock_unlock(&s->bmap_lock);
    669         } else {
    670 nonallocating_write:
    671             data_offset = s->header.offset_data +
    672                            (uint64_t)bmap_entry * s->block_size +
    673                            offset_in_block;
    674             qemu_co_rwlock_unlock(&s->bmap_lock);
    675 
    676             qemu_iovec_reset(&local_qiov);
    677             qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
    678 
    679             ret = bdrv_co_pwritev(bs->file, data_offset, n_bytes,
    680                                   &local_qiov, 0);
    681         }
    682 
    683         bytes -= n_bytes;
    684         offset += n_bytes;
    685         bytes_done += n_bytes;
    686 
    687         logout("%u bytes written\n", n_bytes);
    688     }
    689 
    690     qemu_iovec_destroy(&local_qiov);
    691 
    692     logout("finished data write\n");
    693     if (ret < 0) {
    694         g_free(block);
    695         return ret;
    696     }
    697 
    698     if (block) {
    699         /* One or more new blocks were allocated. */
    700         VdiHeader *header;
    701         uint8_t *base;
    702         uint64_t offset;
    703         uint32_t n_sectors;
    704 
    705         g_free(block);
    706         header = g_malloc(sizeof(*header));
    707 
    708         logout("now writing modified header\n");
    709         assert(VDI_IS_ALLOCATED(bmap_first));
    710         *header = s->header;
    711         vdi_header_to_le(header);
    712         ret = bdrv_co_pwrite(bs->file, 0, sizeof(*header), header, 0);
    713         g_free(header);
    714 
    715         if (ret < 0) {
    716             return ret;
    717         }
    718 
    719         logout("now writing modified block map entry %u...%u\n",
    720                bmap_first, bmap_last);
    721         /* Write modified sectors from block map. */
    722         bmap_first /= (SECTOR_SIZE / sizeof(uint32_t));
    723         bmap_last /= (SECTOR_SIZE / sizeof(uint32_t));
    724         n_sectors = bmap_last - bmap_first + 1;
    725         offset = s->bmap_sector + bmap_first;
    726         base = ((uint8_t *)&s->bmap[0]) + bmap_first * SECTOR_SIZE;
    727         logout("will write %u block map sectors starting from entry %u\n",
    728                n_sectors, bmap_first);
    729         ret = bdrv_co_pwrite(bs->file, offset * SECTOR_SIZE,
    730                              n_sectors * SECTOR_SIZE, base, 0);
    731     }
    732 
    733     return ret;
    734 }
    735 
    736 static int coroutine_fn vdi_co_do_create(BlockdevCreateOptions *create_options,
    737                                          size_t block_size, Error **errp)
    738 {
    739     BlockdevCreateOptionsVdi *vdi_opts;
    740     int ret = 0;
    741     uint64_t bytes = 0;
    742     uint32_t blocks;
    743     uint32_t image_type;
    744     VdiHeader header;
    745     size_t i;
    746     size_t bmap_size;
    747     int64_t offset = 0;
    748     BlockDriverState *bs_file = NULL;
    749     BlockBackend *blk = NULL;
    750     uint32_t *bmap = NULL;
    751     QemuUUID uuid;
    752 
    753     assert(create_options->driver == BLOCKDEV_DRIVER_VDI);
    754     vdi_opts = &create_options->u.vdi;
    755 
    756     logout("\n");
    757 
    758     /* Validate options and set default values */
    759     bytes = vdi_opts->size;
    760 
    761     if (!vdi_opts->has_preallocation) {
    762         vdi_opts->preallocation = PREALLOC_MODE_OFF;
    763     }
    764     switch (vdi_opts->preallocation) {
    765     case PREALLOC_MODE_OFF:
    766         image_type = VDI_TYPE_DYNAMIC;
    767         break;
    768     case PREALLOC_MODE_METADATA:
    769         image_type = VDI_TYPE_STATIC;
    770         break;
    771     default:
    772         error_setg(errp, "Preallocation mode not supported for vdi");
    773         return -EINVAL;
    774     }
    775 
    776 #ifndef CONFIG_VDI_STATIC_IMAGE
    777     if (image_type == VDI_TYPE_STATIC) {
    778         ret = -ENOTSUP;
    779         error_setg(errp, "Statically allocated images cannot be created in "
    780                    "this build");
    781         goto exit;
    782     }
    783 #endif
    784 #ifndef CONFIG_VDI_BLOCK_SIZE
    785     if (block_size != DEFAULT_CLUSTER_SIZE) {
    786         ret = -ENOTSUP;
    787         error_setg(errp,
    788                    "A non-default cluster size is not supported in this build");
    789         goto exit;
    790     }
    791 #endif
    792 
    793     if (bytes > VDI_DISK_SIZE_MAX) {
    794         ret = -ENOTSUP;
    795         error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64
    796                           ", max supported is 0x%" PRIx64 ")",
    797                           bytes, VDI_DISK_SIZE_MAX);
    798         goto exit;
    799     }
    800 
    801     /* Create BlockBackend to write to the image */
    802     bs_file = bdrv_open_blockdev_ref(vdi_opts->file, errp);
    803     if (!bs_file) {
    804         ret = -EIO;
    805         goto exit;
    806     }
    807 
    808     blk = blk_new_with_bs(bs_file, BLK_PERM_WRITE | BLK_PERM_RESIZE,
    809                           BLK_PERM_ALL, errp);
    810     if (!blk) {
    811         ret = -EPERM;
    812         goto exit;
    813     }
    814 
    815     blk_set_allow_write_beyond_eof(blk, true);
    816 
    817     /* We need enough blocks to store the given disk size,
    818        so always round up. */
    819     blocks = DIV_ROUND_UP(bytes, block_size);
    820 
    821     bmap_size = blocks * sizeof(uint32_t);
    822     bmap_size = ROUND_UP(bmap_size, SECTOR_SIZE);
    823 
    824     memset(&header, 0, sizeof(header));
    825     pstrcpy(header.text, sizeof(header.text), VDI_TEXT);
    826     header.signature = VDI_SIGNATURE;
    827     header.version = VDI_VERSION_1_1;
    828     header.header_size = 0x180;
    829     header.image_type = image_type;
    830     header.offset_bmap = 0x200;
    831     header.offset_data = 0x200 + bmap_size;
    832     header.sector_size = SECTOR_SIZE;
    833     header.disk_size = bytes;
    834     header.block_size = block_size;
    835     header.blocks_in_image = blocks;
    836     if (image_type == VDI_TYPE_STATIC) {
    837         header.blocks_allocated = blocks;
    838     }
    839     qemu_uuid_generate(&uuid);
    840     header.uuid_image = uuid;
    841     qemu_uuid_generate(&uuid);
    842     header.uuid_last_snap = uuid;
    843     /* There is no need to set header.uuid_link or header.uuid_parent here. */
    844     if (VDI_DEBUG) {
    845         vdi_header_print(&header);
    846     }
    847     vdi_header_to_le(&header);
    848     ret = blk_co_pwrite(blk, offset, sizeof(header), &header, 0);
    849     if (ret < 0) {
    850         error_setg(errp, "Error writing header");
    851         goto exit;
    852     }
    853     offset += sizeof(header);
    854 
    855     if (bmap_size > 0) {
    856         bmap = g_try_malloc0(bmap_size);
    857         if (bmap == NULL) {
    858             ret = -ENOMEM;
    859             error_setg(errp, "Could not allocate bmap");
    860             goto exit;
    861         }
    862         for (i = 0; i < blocks; i++) {
    863             if (image_type == VDI_TYPE_STATIC) {
    864                 bmap[i] = i;
    865             } else {
    866                 bmap[i] = VDI_UNALLOCATED;
    867             }
    868         }
    869         ret = blk_co_pwrite(blk, offset, bmap_size, bmap, 0);
    870         if (ret < 0) {
    871             error_setg(errp, "Error writing bmap");
    872             goto exit;
    873         }
    874         offset += bmap_size;
    875     }
    876 
    877     if (image_type == VDI_TYPE_STATIC) {
    878         ret = blk_co_truncate(blk, offset + blocks * block_size, false,
    879                               PREALLOC_MODE_OFF, 0, errp);
    880         if (ret < 0) {
    881             error_prepend(errp, "Failed to statically allocate file");
    882             goto exit;
    883         }
    884     }
    885 
    886     ret = 0;
    887 exit:
    888     blk_unref(blk);
    889     bdrv_unref(bs_file);
    890     g_free(bmap);
    891     return ret;
    892 }
    893 
    894 static int coroutine_fn vdi_co_create(BlockdevCreateOptions *create_options,
    895                                       Error **errp)
    896 {
    897     return vdi_co_do_create(create_options, DEFAULT_CLUSTER_SIZE, errp);
    898 }
    899 
    900 static int coroutine_fn vdi_co_create_opts(BlockDriver *drv,
    901                                            const char *filename,
    902                                            QemuOpts *opts,
    903                                            Error **errp)
    904 {
    905     QDict *qdict = NULL;
    906     BlockdevCreateOptions *create_options = NULL;
    907     BlockDriverState *bs_file = NULL;
    908     uint64_t block_size = DEFAULT_CLUSTER_SIZE;
    909     bool is_static = false;
    910     Visitor *v;
    911     int ret;
    912 
    913     /* Parse options and convert legacy syntax.
    914      *
    915      * Since CONFIG_VDI_BLOCK_SIZE is disabled by default,
    916      * cluster-size is not part of the QAPI schema; therefore we have
    917      * to parse it before creating the QAPI object. */
    918 #if defined(CONFIG_VDI_BLOCK_SIZE)
    919     block_size = qemu_opt_get_size_del(opts,
    920                                        BLOCK_OPT_CLUSTER_SIZE,
    921                                        DEFAULT_CLUSTER_SIZE);
    922     if (block_size < BDRV_SECTOR_SIZE || block_size > UINT32_MAX ||
    923         !is_power_of_2(block_size))
    924     {
    925         error_setg(errp, "Invalid cluster size");
    926         ret = -EINVAL;
    927         goto done;
    928     }
    929 #endif
    930     if (qemu_opt_get_bool_del(opts, BLOCK_OPT_STATIC, false)) {
    931         is_static = true;
    932     }
    933 
    934     qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vdi_create_opts, true);
    935 
    936     /* Create and open the file (protocol layer) */
    937     ret = bdrv_create_file(filename, opts, errp);
    938     if (ret < 0) {
    939         goto done;
    940     }
    941 
    942     bs_file = bdrv_open(filename, NULL, NULL,
    943                         BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
    944     if (!bs_file) {
    945         ret = -EIO;
    946         goto done;
    947     }
    948 
    949     qdict_put_str(qdict, "driver", "vdi");
    950     qdict_put_str(qdict, "file", bs_file->node_name);
    951     if (is_static) {
    952         qdict_put_str(qdict, "preallocation", "metadata");
    953     }
    954 
    955     /* Get the QAPI object */
    956     v = qobject_input_visitor_new_flat_confused(qdict, errp);
    957     if (!v) {
    958         ret = -EINVAL;
    959         goto done;
    960     }
    961     visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
    962     visit_free(v);
    963     if (!create_options) {
    964         ret = -EINVAL;
    965         goto done;
    966     }
    967 
    968     /* Silently round up size */
    969     assert(create_options->driver == BLOCKDEV_DRIVER_VDI);
    970     create_options->u.vdi.size = ROUND_UP(create_options->u.vdi.size,
    971                                           BDRV_SECTOR_SIZE);
    972 
    973     /* Create the vdi image (format layer) */
    974     ret = vdi_co_do_create(create_options, block_size, errp);
    975 done:
    976     qobject_unref(qdict);
    977     qapi_free_BlockdevCreateOptions(create_options);
    978     bdrv_unref(bs_file);
    979     return ret;
    980 }
    981 
    982 static void vdi_close(BlockDriverState *bs)
    983 {
    984     BDRVVdiState *s = bs->opaque;
    985 
    986     qemu_vfree(s->bmap);
    987 
    988     migrate_del_blocker(s->migration_blocker);
    989     error_free(s->migration_blocker);
    990 }
    991 
    992 static int vdi_has_zero_init(BlockDriverState *bs)
    993 {
    994     BDRVVdiState *s = bs->opaque;
    995 
    996     if (s->header.image_type == VDI_TYPE_STATIC) {
    997         return bdrv_has_zero_init(bs->file->bs);
    998     } else {
    999         return 1;
   1000     }
   1001 }
   1002 
   1003 static QemuOptsList vdi_create_opts = {
   1004     .name = "vdi-create-opts",
   1005     .head = QTAILQ_HEAD_INITIALIZER(vdi_create_opts.head),
   1006     .desc = {
   1007         {
   1008             .name = BLOCK_OPT_SIZE,
   1009             .type = QEMU_OPT_SIZE,
   1010             .help = "Virtual disk size"
   1011         },
   1012 #if defined(CONFIG_VDI_BLOCK_SIZE)
   1013         {
   1014             .name = BLOCK_OPT_CLUSTER_SIZE,
   1015             .type = QEMU_OPT_SIZE,
   1016             .help = "VDI cluster (block) size",
   1017             .def_value_str = stringify(DEFAULT_CLUSTER_SIZE)
   1018         },
   1019 #endif
   1020 #if defined(CONFIG_VDI_STATIC_IMAGE)
   1021         {
   1022             .name = BLOCK_OPT_STATIC,
   1023             .type = QEMU_OPT_BOOL,
   1024             .help = "VDI static (pre-allocated) image",
   1025             .def_value_str = "off"
   1026         },
   1027 #endif
   1028         /* TODO: An additional option to set UUID values might be useful. */
   1029         { /* end of list */ }
   1030     }
   1031 };
   1032 
   1033 static BlockDriver bdrv_vdi = {
   1034     .format_name = "vdi",
   1035     .instance_size = sizeof(BDRVVdiState),
   1036     .bdrv_probe = vdi_probe,
   1037     .bdrv_open = vdi_open,
   1038     .bdrv_close = vdi_close,
   1039     .bdrv_reopen_prepare = vdi_reopen_prepare,
   1040     .bdrv_child_perm          = bdrv_default_perms,
   1041     .bdrv_co_create      = vdi_co_create,
   1042     .bdrv_co_create_opts = vdi_co_create_opts,
   1043     .bdrv_has_zero_init  = vdi_has_zero_init,
   1044     .bdrv_co_block_status = vdi_co_block_status,
   1045     .bdrv_make_empty = vdi_make_empty,
   1046 
   1047     .bdrv_co_preadv     = vdi_co_preadv,
   1048 #if defined(CONFIG_VDI_WRITE)
   1049     .bdrv_co_pwritev    = vdi_co_pwritev,
   1050 #endif
   1051 
   1052     .bdrv_get_info = vdi_get_info,
   1053 
   1054     .is_format = true,
   1055     .create_opts = &vdi_create_opts,
   1056     .bdrv_co_check = vdi_co_check,
   1057 };
   1058 
/* Register the VDI driver description (bdrv_vdi) with bdrv_register(). */
static void bdrv_vdi_init(void)
{
    /* Emits only a newline through the driver's logout() debug helper. */
    logout("\n");
    bdrv_register(&bdrv_vdi);
}

/*
 * NOTE(review): block_init() presumably arranges for bdrv_vdi_init() to run
 * during block layer initialization -- confirm against its definition.
 */
block_init(bdrv_vdi_init);