qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

vhdx.c (75621B)


      1 /*
      2  * Block driver for Hyper-V VHDX Images
      3  *
      4  * Copyright (c) 2013 Red Hat, Inc.,
      5  *
      6  * Authors:
      7  *  Jeff Cody <jcody@redhat.com>
      8  *
      9  *  This is based on the "VHDX Format Specification v1.00", published 8/25/2012
     10  *  by Microsoft:
     11  *      https://www.microsoft.com/en-us/download/details.aspx?id=34750
     12  *
     13  * This work is licensed under the terms of the GNU LGPL, version 2 or later.
     14  * See the COPYING.LIB file in the top-level directory.
     15  *
     16  */
     17 
     18 #include "qemu/osdep.h"
     19 #include "qapi/error.h"
     20 #include "block/block_int.h"
     21 #include "block/qdict.h"
     22 #include "sysemu/block-backend.h"
     23 #include "qemu/module.h"
     24 #include "qemu/option.h"
     25 #include "qemu/crc32c.h"
     26 #include "qemu/bswap.h"
     27 #include "qemu/error-report.h"
     28 #include "qemu/memalign.h"
     29 #include "vhdx.h"
     30 #include "migration/blocker.h"
     31 #include "qemu/uuid.h"
     32 #include "qapi/qmp/qdict.h"
     33 #include "qapi/qobject-input-visitor.h"
     34 #include "qapi/qapi-visit-block-core.h"
     35 
     36 /* Options for VHDX creation */
     37 
     38 #define VHDX_BLOCK_OPT_LOG_SIZE   "log_size"
     39 #define VHDX_BLOCK_OPT_BLOCK_SIZE "block_size"
     40 #define VHDX_BLOCK_OPT_ZERO "block_state_zero"
     41 
     42 typedef enum VHDXImageType {
     43     VHDX_TYPE_DYNAMIC = 0,
     44     VHDX_TYPE_FIXED,
     45     VHDX_TYPE_DIFFERENCING,   /* Currently unsupported */
     46 } VHDXImageType;
     47 
     48 static QemuOptsList vhdx_create_opts;
     49 
/* Several metadata and region table data entries are identified by
 * GUIDs in an MS-specific GUID format. */
     52 
     53 
/* ------- Known Region Table GUIDs ---------------------- */
/* Region table entry GUID matched for the BAT region
 * (string form: 2DC27766-F623-4200-9D64-115E9BFD4A08). */
static const MSGUID bat_guid =      { .data1 = 0x2dc27766,
                                      .data2 = 0xf623,
                                      .data3 = 0x4200,
                                      .data4 = { 0x9d, 0x64, 0x11, 0x5e,
                                                 0x9b, 0xfd, 0x4a, 0x08} };

/* Region table entry GUID matched for the metadata region
 * (string form: 8B7CA206-4790-4B9A-B8FE-575F050F886E). */
static const MSGUID metadata_guid = { .data1 = 0x8b7ca206,
                                      .data2 = 0x4790,
                                      .data3 = 0x4b9a,
                                      .data4 = { 0xb8, 0xfe, 0x57, 0x5f,
                                                 0x05, 0x0f, 0x88, 0x6e} };
     66 
     67 
     68 
/* ------- Known Metadata Entry GUIDs ---------------------- */
/* File Parameters item (CAA16737-FA36-4D43-B3B6-33F0AA44E76B) */
static const MSGUID file_param_guid =   { .data1 = 0xcaa16737,
                                          .data2 = 0xfa36,
                                          .data3 = 0x4d43,
                                          .data4 = { 0xb3, 0xb6, 0x33, 0xf0,
                                                     0xaa, 0x44, 0xe7, 0x6b} };

/* Virtual Disk Size item (2FA54224-CD1B-4876-B211-5DBED83BF4B8) */
static const MSGUID virtual_size_guid = { .data1 = 0x2FA54224,
                                          .data2 = 0xcd1b,
                                          .data3 = 0x4876,
                                          .data4 = { 0xb2, 0x11, 0x5d, 0xbe,
                                                     0xd8, 0x3b, 0xf4, 0xb8} };

/* Page 83 Data item (BECA12AB-B2E6-4523-93EF-C309E000C746) */
static const MSGUID page83_guid =       { .data1 = 0xbeca12ab,
                                          .data2 = 0xb2e6,
                                          .data3 = 0x4523,
                                          .data4 = { 0x93, 0xef, 0xc3, 0x09,
                                                     0xe0, 0x00, 0xc7, 0x46} };


/* Physical Sector Size item (CDA348C7-445D-4471-9CC9-E9885251C556) */
static const MSGUID phys_sector_guid =  { .data1 = 0xcda348c7,
                                          .data2 = 0x445d,
                                          .data3 = 0x4471,
                                          .data4 = { 0x9c, 0xc9, 0xe9, 0x88,
                                                     0x52, 0x51, 0xc5, 0x56} };

/* Parent Locator item (A8D35F2D-B30B-454D-ABF7-D3D84834AB0C) */
static const MSGUID parent_locator_guid = { .data1 = 0xa8d35f2d,
                                            .data2 = 0xb30b,
                                            .data3 = 0x454d,
                                            .data4 = { 0xab, 0xf7, 0xd3,
                                                       0xd8, 0x48, 0x34,
                                                       0xab, 0x0c} };

/* Logical Sector Size item (8141BF1D-A96F-4709-BA47-F233A8FAAB5F) */
static const MSGUID logical_sector_guid = { .data1 = 0x8141bf1d,
                                            .data2 = 0xa96f,
                                            .data3 = 0x4709,
                                            .data4 = { 0xba, 0x47, 0xf2,
                                                       0x33, 0xa8, 0xfa,
                                                       0xab, 0x5f} };
    108 
/* Each parent type must have a valid GUID; this is for parent images
 * of type 'VHDX'.  If we were to allow e.g. a QCOW2 parent, we would
 * need to make up our own QCOW2 GUID type.
 *
 * String form: B04AEFB7-D19E-4A81-B789-25B8E9445913.  Marked unused so
 * the definition does not trigger a warning while nothing references it. */
static const MSGUID parent_vhdx_guid __attribute__((unused))
                                     = { .data1 = 0xb04aefb7,
                                         .data2 = 0xd19e,
                                         .data3 = 0x4a81,
                                         .data4 = { 0xb7, 0x89, 0x25, 0xb8,
                                                    0xe9, 0x44, 0x59, 0x13} };
    118 
    119 
    120 #define META_FILE_PARAMETER_PRESENT      0x01
    121 #define META_VIRTUAL_DISK_SIZE_PRESENT   0x02
    122 #define META_PAGE_83_PRESENT             0x04
    123 #define META_LOGICAL_SECTOR_SIZE_PRESENT 0x08
    124 #define META_PHYS_SECTOR_SIZE_PRESENT    0x10
    125 #define META_PARENT_LOCATOR_PRESENT      0x20
    126 
    127 #define META_ALL_PRESENT    \
    128     (META_FILE_PARAMETER_PRESENT | META_VIRTUAL_DISK_SIZE_PRESENT | \
    129      META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \
    130      META_PHYS_SECTOR_SIZE_PRESENT)
    131 
    132 
/* Location and extent bookkeeping for a sector range within a payload
 * block.  The code that fills this structure in is outside this section of
 * the file; the field meanings are those given in the per-field comments. */
typedef struct VHDXSectorInfo {
    uint32_t bat_idx;       /* BAT entry index */
    uint32_t sectors_avail; /* sectors available in payload block */
    uint32_t bytes_left;    /* bytes left in the block after data to r/w */
    uint32_t bytes_avail;   /* bytes available in payload block */
    uint64_t file_offset;   /* absolute offset in bytes, in file */
    uint64_t block_offset;  /* block offset, in bytes */
} VHDXSectorInfo;
    141 
    142 /* Calculates new checksum.
    143  *
    144  * Zero is substituted during crc calculation for the original crc field
    145  * crc_offset: byte offset in buf of the buffer crc
    146  * buf: buffer pointer
    147  * size: size of buffer (must be > crc_offset+4)
    148  *
    149  * Note: The buffer should have all multi-byte data in little-endian format,
    150  *       and the resulting checksum is in little endian format.
    151  */
    152 uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset)
    153 {
    154     uint32_t crc;
    155 
    156     assert(buf != NULL);
    157     assert(size > (crc_offset + sizeof(crc)));
    158 
    159     memset(buf + crc_offset, 0, sizeof(crc));
    160     crc =  crc32c(0xffffffff, buf, size);
    161     crc = cpu_to_le32(crc);
    162     memcpy(buf + crc_offset, &crc, sizeof(crc));
    163 
    164     return crc;
    165 }
    166 
    167 uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size,
    168                             int crc_offset)
    169 {
    170     uint32_t crc_new;
    171     uint32_t crc_orig;
    172     assert(buf != NULL);
    173 
    174     if (crc_offset > 0) {
    175         memcpy(&crc_orig, buf + crc_offset, sizeof(crc_orig));
    176         memset(buf + crc_offset, 0, sizeof(crc_orig));
    177     }
    178 
    179     crc_new = crc32c(crc, buf, size);
    180     if (crc_offset > 0) {
    181         memcpy(buf + crc_offset, &crc_orig, sizeof(crc_orig));
    182     }
    183 
    184     return crc_new;
    185 }
    186 
    187 /* Validates the checksum of the buffer, with an in-place CRC.
    188  *
    189  * Zero is substituted during crc calculation for the original crc field,
    190  * and the crc field is restored afterwards.  But the buffer will be modified
    191  * during the calculation, so this may not be not suitable for multi-threaded
    192  * use.
    193  *
    194  * crc_offset: byte offset in buf of the buffer crc
    195  * buf: buffer pointer
    196  * size: size of buffer (must be > crc_offset+4)
    197  *
    198  * returns true if checksum is valid, false otherwise
    199  */
    200 bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset)
    201 {
    202     uint32_t crc_orig;
    203     uint32_t crc;
    204 
    205     assert(buf != NULL);
    206     assert(size > (crc_offset + 4));
    207 
    208     memcpy(&crc_orig, buf + crc_offset, sizeof(crc_orig));
    209     crc_orig = le32_to_cpu(crc_orig);
    210 
    211     crc = vhdx_checksum_calc(0xffffffff, buf, size, crc_offset);
    212 
    213     return crc == crc_orig;
    214 }
    215 
    216 
    217 /*
    218  * This generates a UUID that is compliant with the MS GUIDs used
    219  * in the VHDX spec (and elsewhere).
    220  */
    221 void vhdx_guid_generate(MSGUID *guid)
    222 {
    223     QemuUUID uuid;
    224     assert(guid != NULL);
    225 
    226     qemu_uuid_generate(&uuid);
    227     memcpy(guid, &uuid, sizeof(MSGUID));
    228 }
    229 
    230 /* Check for region overlaps inside the VHDX image */
    231 static int vhdx_region_check(BDRVVHDXState *s, uint64_t start, uint64_t length)
    232 {
    233     int ret = 0;
    234     uint64_t end;
    235     VHDXRegionEntry *r;
    236 
    237     end = start + length;
    238     QLIST_FOREACH(r, &s->regions, entries) {
    239         if (!((start >= r->end) || (end <= r->start))) {
    240             error_report("VHDX region %" PRIu64 "-%" PRIu64 " overlaps with "
    241                          "region %" PRIu64 "-%." PRIu64, start, end, r->start,
    242                          r->end);
    243             ret = -EINVAL;
    244             goto exit;
    245         }
    246     }
    247 
    248 exit:
    249     return ret;
    250 }
    251 
    252 /* Register a region for future checks */
    253 static void vhdx_region_register(BDRVVHDXState *s,
    254                                  uint64_t start, uint64_t length)
    255 {
    256     VHDXRegionEntry *r;
    257 
    258     r = g_malloc0(sizeof(*r));
    259 
    260     r->start = start;
    261     r->end = start + length;
    262 
    263     QLIST_INSERT_HEAD(&s->regions, r, entries);
    264 }
    265 
    266 /* Free all registered regions */
    267 static void vhdx_region_unregister_all(BDRVVHDXState *s)
    268 {
    269     VHDXRegionEntry *r, *r_next;
    270 
    271     QLIST_FOREACH_SAFE(r, &s->regions, entries, r_next) {
    272         QLIST_REMOVE(r, entries);
    273         g_free(r);
    274     }
    275 }
    276 
    277 static void vhdx_set_shift_bits(BDRVVHDXState *s)
    278 {
    279     s->logical_sector_size_bits = ctz32(s->logical_sector_size);
    280     s->sectors_per_block_bits =   ctz32(s->sectors_per_block);
    281     s->chunk_ratio_bits =         ctz64(s->chunk_ratio);
    282     s->block_size_bits =          ctz32(s->block_size);
    283 }
    284 
    285 /*
    286  * Per the MS VHDX Specification, for every VHDX file:
    287  *      - The header section is fixed size - 1 MB
    288  *      - The header section is always the first "object"
    289  *      - The first 64KB of the header is the File Identifier
    290  *      - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile")
    291  *      - The following 512 bytes constitute a UTF-16 string identifiying the
    292  *        software that created the file, and is optional and diagnostic only.
    293  *
    294  *  Therefore, we probe by looking for the vhdxfile signature "vhdxfile"
    295  */
    296 static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename)
    297 {
    298     if (buf_size >= 8 && !memcmp(buf, "vhdxfile", 8)) {
    299         return 100;
    300     }
    301     return 0;
    302 }
    303 
    304 /*
    305  * Writes the header to the specified offset.
    306  *
    307  * This will optionally read in buffer data from disk (otherwise zero-fill),
    308  * and then update the header checksum.  Header is converted to proper
    309  * endianness before being written to the specified file offset
    310  */
    311 static int vhdx_write_header(BdrvChild *file, VHDXHeader *hdr,
    312                              uint64_t offset, bool read)
    313 {
    314     BlockDriverState *bs_file = file->bs;
    315     uint8_t *buffer = NULL;
    316     int ret;
    317     VHDXHeader *header_le;
    318 
    319     assert(bs_file != NULL);
    320     assert(hdr != NULL);
    321 
    322     /* the header checksum is not over just the packed size of VHDXHeader,
    323      * but rather over the entire 'reserved' range for the header, which is
    324      * 4KB (VHDX_HEADER_SIZE). */
    325 
    326     buffer = qemu_blockalign(bs_file, VHDX_HEADER_SIZE);
    327     if (read) {
    328         /* if true, we can't assume the extra reserved bytes are 0 */
    329         ret = bdrv_pread(file, offset, VHDX_HEADER_SIZE, buffer, 0);
    330         if (ret < 0) {
    331             goto exit;
    332         }
    333     } else {
    334         memset(buffer, 0, VHDX_HEADER_SIZE);
    335     }
    336 
    337     /* overwrite the actual VHDXHeader portion */
    338     header_le = (VHDXHeader *)buffer;
    339     memcpy(header_le, hdr, sizeof(VHDXHeader));
    340     vhdx_header_le_export(hdr, header_le);
    341     vhdx_update_checksum(buffer, VHDX_HEADER_SIZE,
    342                          offsetof(VHDXHeader, checksum));
    343     ret = bdrv_pwrite_sync(file, offset, sizeof(VHDXHeader), header_le, 0);
    344 
    345 exit:
    346     qemu_vfree(buffer);
    347     return ret;
    348 }
    349 
    350 /* Update the VHDX headers
    351  *
    352  * This follows the VHDX spec procedures for header updates.
    353  *
    354  *  - non-current header is updated with largest sequence number
    355  */
    356 static int vhdx_update_header(BlockDriverState *bs, BDRVVHDXState *s,
    357                               bool generate_data_write_guid, MSGUID *log_guid)
    358 {
    359     int ret = 0;
    360     int hdr_idx = 0;
    361     uint64_t header_offset = VHDX_HEADER1_OFFSET;
    362 
    363     VHDXHeader *active_header;
    364     VHDXHeader *inactive_header;
    365 
    366     /* operate on the non-current header */
    367     if (s->curr_header == 0) {
    368         hdr_idx = 1;
    369         header_offset = VHDX_HEADER2_OFFSET;
    370     }
    371 
    372     active_header   = s->headers[s->curr_header];
    373     inactive_header = s->headers[hdr_idx];
    374 
    375     inactive_header->sequence_number = active_header->sequence_number + 1;
    376 
    377     /* a new file guid must be generated before any file write, including
    378      * headers */
    379     inactive_header->file_write_guid = s->session_guid;
    380 
    381     /* a new data guid only needs to be generated before any guest-visible
    382      * writes (i.e. something observable via virtual disk read) */
    383     if (generate_data_write_guid) {
    384         vhdx_guid_generate(&inactive_header->data_write_guid);
    385     }
    386 
    387     /* update the log guid if present */
    388     if (log_guid) {
    389         inactive_header->log_guid = *log_guid;
    390     }
    391 
    392     ret = vhdx_write_header(bs->file, inactive_header, header_offset, true);
    393     if (ret < 0) {
    394         goto exit;
    395     }
    396     s->curr_header = hdr_idx;
    397 
    398 exit:
    399     return ret;
    400 }
    401 
    402 /*
    403  * The VHDX spec calls for header updates to be performed twice, so that both
    404  * the current and non-current header have valid info
    405  */
    406 int vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s,
    407                         bool generate_data_write_guid, MSGUID *log_guid)
    408 {
    409     int ret;
    410 
    411     ret = vhdx_update_header(bs, s, generate_data_write_guid, log_guid);
    412     if (ret < 0) {
    413         return ret;
    414     }
    415     return vhdx_update_header(bs, s, generate_data_write_guid, log_guid);
    416 }
    417 
    418 /* opens the specified header block from the VHDX file header section */
    419 static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s,
    420                               Error **errp)
    421 {
    422     int ret;
    423     VHDXHeader *header1;
    424     VHDXHeader *header2;
    425     bool h1_valid = false;
    426     bool h2_valid = false;
    427     uint64_t h1_seq = 0;
    428     uint64_t h2_seq = 0;
    429     uint8_t *buffer;
    430 
    431     /* header1 & header2 are freed in vhdx_close() */
    432     header1 = qemu_blockalign(bs, sizeof(VHDXHeader));
    433     header2 = qemu_blockalign(bs, sizeof(VHDXHeader));
    434 
    435     buffer = qemu_blockalign(bs, VHDX_HEADER_SIZE);
    436 
    437     s->headers[0] = header1;
    438     s->headers[1] = header2;
    439 
    440     /* We have to read the whole VHDX_HEADER_SIZE instead of
    441      * sizeof(VHDXHeader), because the checksum is over the whole
    442      * region */
    443     ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, VHDX_HEADER_SIZE, buffer,
    444                      0);
    445     if (ret < 0) {
    446         goto fail;
    447     }
    448     /* copy over just the relevant portion that we need */
    449     memcpy(header1, buffer, sizeof(VHDXHeader));
    450 
    451     if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4)) {
    452         vhdx_header_le_import(header1);
    453         if (header1->signature == VHDX_HEADER_SIGNATURE &&
    454             header1->version == 1) {
    455             h1_seq = header1->sequence_number;
    456             h1_valid = true;
    457         }
    458     }
    459 
    460     ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, VHDX_HEADER_SIZE, buffer,
    461                      0);
    462     if (ret < 0) {
    463         goto fail;
    464     }
    465     /* copy over just the relevant portion that we need */
    466     memcpy(header2, buffer, sizeof(VHDXHeader));
    467 
    468     if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4)) {
    469         vhdx_header_le_import(header2);
    470         if (header2->signature == VHDX_HEADER_SIGNATURE &&
    471             header2->version == 1) {
    472             h2_seq = header2->sequence_number;
    473             h2_valid = true;
    474         }
    475     }
    476 
    477     /* If there is only 1 valid header (or no valid headers), we
    478      * don't care what the sequence numbers are */
    479     if (h1_valid && !h2_valid) {
    480         s->curr_header = 0;
    481     } else if (!h1_valid && h2_valid) {
    482         s->curr_header = 1;
    483     } else if (!h1_valid && !h2_valid) {
    484         goto fail;
    485     } else {
    486         /* If both headers are valid, then we choose the active one by the
    487          * highest sequence number.  If the sequence numbers are equal, that is
    488          * invalid */
    489         if (h1_seq > h2_seq) {
    490             s->curr_header = 0;
    491         } else if (h2_seq > h1_seq) {
    492             s->curr_header = 1;
    493         } else {
    494             /* The Microsoft Disk2VHD tool will create 2 identical
    495              * headers, with identical sequence numbers.  If the headers are
    496              * identical, don't consider the file corrupt */
    497             if (!memcmp(header1, header2, sizeof(VHDXHeader))) {
    498                 s->curr_header = 0;
    499             } else {
    500                 goto fail;
    501             }
    502         }
    503     }
    504 
    505     vhdx_region_register(s, s->headers[s->curr_header]->log_offset,
    506                             s->headers[s->curr_header]->log_length);
    507     goto exit;
    508 
    509 fail:
    510     error_setg_errno(errp, -ret, "No valid VHDX header found");
    511     qemu_vfree(header1);
    512     qemu_vfree(header2);
    513     s->headers[0] = NULL;
    514     s->headers[1] = NULL;
    515 exit:
    516     qemu_vfree(buffer);
    517 }
    518 
    519 
    520 static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
    521 {
    522     int ret = 0;
    523     uint8_t *buffer;
    524     int offset = 0;
    525     VHDXRegionTableEntry rt_entry;
    526     uint32_t i;
    527     bool bat_rt_found = false;
    528     bool metadata_rt_found = false;
    529 
    530     /* We have to read the whole 64KB block, because the crc32 is over the
    531      * whole block */
    532     buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE);
    533 
    534     ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET,
    535                      VHDX_HEADER_BLOCK_SIZE, buffer, 0);
    536     if (ret < 0) {
    537         goto fail;
    538     }
    539     memcpy(&s->rt, buffer, sizeof(s->rt));
    540     offset += sizeof(s->rt);
    541 
    542     if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4)) {
    543         ret = -EINVAL;
    544         goto fail;
    545     }
    546 
    547     vhdx_region_header_le_import(&s->rt);
    548 
    549     if (s->rt.signature != VHDX_REGION_SIGNATURE) {
    550         ret = -EINVAL;
    551         goto fail;
    552     }
    553 
    554 
    555     /* Per spec, maximum region table entry count is 2047 */
    556     if (s->rt.entry_count > 2047) {
    557         ret = -EINVAL;
    558         goto fail;
    559     }
    560 
    561     for (i = 0; i < s->rt.entry_count; i++) {
    562         memcpy(&rt_entry, buffer + offset, sizeof(rt_entry));
    563         offset += sizeof(rt_entry);
    564 
    565         vhdx_region_entry_le_import(&rt_entry);
    566 
    567         /* check for region overlap between these entries, and any
    568          * other memory regions in the file */
    569         ret = vhdx_region_check(s, rt_entry.file_offset, rt_entry.length);
    570         if (ret < 0) {
    571             goto fail;
    572         }
    573 
    574         vhdx_region_register(s, rt_entry.file_offset, rt_entry.length);
    575 
    576         /* see if we recognize the entry */
    577         if (guid_eq(rt_entry.guid, bat_guid)) {
    578             /* must be unique; if we have already found it this is invalid */
    579             if (bat_rt_found) {
    580                 ret = -EINVAL;
    581                 goto fail;
    582             }
    583             bat_rt_found = true;
    584             s->bat_rt = rt_entry;
    585             continue;
    586         }
    587 
    588         if (guid_eq(rt_entry.guid, metadata_guid)) {
    589             /* must be unique; if we have already found it this is invalid */
    590             if (metadata_rt_found) {
    591                 ret = -EINVAL;
    592                 goto fail;
    593             }
    594             metadata_rt_found = true;
    595             s->metadata_rt = rt_entry;
    596             continue;
    597         }
    598 
    599         if (rt_entry.data_bits & VHDX_REGION_ENTRY_REQUIRED) {
    600             /* cannot read vhdx file - required region table entry that
    601              * we do not understand.  per spec, we must fail to open */
    602             ret = -ENOTSUP;
    603             goto fail;
    604         }
    605     }
    606 
    607     if (!bat_rt_found || !metadata_rt_found) {
    608         ret = -EINVAL;
    609         goto fail;
    610     }
    611 
    612     ret = 0;
    613 
    614 fail:
    615     qemu_vfree(buffer);
    616     return ret;
    617 }
    618 
    619 
    620 
    621 /* Metadata initial parser
    622  *
    623  * This loads all the metadata entry fields.  This may cause additional
    624  * fields to be processed (e.g. parent locator, etc..).
    625  *
    626  * There are 5 Metadata items that are always required:
    627  *      - File Parameters (block size, has a parent)
    628  *      - Virtual Disk Size (size, in bytes, of the virtual drive)
    629  *      - Page 83 Data (scsi page 83 guid)
    630  *      - Logical Sector Size (logical sector size in bytes, either 512 or
    631  *                             4096.  We only support 512 currently)
    632  *      - Physical Sector Size (512 or 4096)
    633  *
    634  * Also, if the File Parameters indicate this is a differencing file,
    635  * we must also look for the Parent Locator metadata item.
    636  */
    637 static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
    638 {
    639     int ret = 0;
    640     uint8_t *buffer;
    641     int offset = 0;
    642     uint32_t i = 0;
    643     VHDXMetadataTableEntry md_entry;
    644 
    645     buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE);
    646 
    647     ret = bdrv_pread(bs->file, s->metadata_rt.file_offset,
    648                      VHDX_METADATA_TABLE_MAX_SIZE, buffer, 0);
    649     if (ret < 0) {
    650         goto exit;
    651     }
    652     memcpy(&s->metadata_hdr, buffer, sizeof(s->metadata_hdr));
    653     offset += sizeof(s->metadata_hdr);
    654 
    655     vhdx_metadata_header_le_import(&s->metadata_hdr);
    656 
    657     if (s->metadata_hdr.signature != VHDX_METADATA_SIGNATURE) {
    658         ret = -EINVAL;
    659         goto exit;
    660     }
    661 
    662     s->metadata_entries.present = 0;
    663 
    664     if ((s->metadata_hdr.entry_count * sizeof(md_entry)) >
    665         (VHDX_METADATA_TABLE_MAX_SIZE - offset)) {
    666         ret = -EINVAL;
    667         goto exit;
    668     }
    669 
    670     for (i = 0; i < s->metadata_hdr.entry_count; i++) {
    671         memcpy(&md_entry, buffer + offset, sizeof(md_entry));
    672         offset += sizeof(md_entry);
    673 
    674         vhdx_metadata_entry_le_import(&md_entry);
    675 
    676         if (guid_eq(md_entry.item_id, file_param_guid)) {
    677             if (s->metadata_entries.present & META_FILE_PARAMETER_PRESENT) {
    678                 ret = -EINVAL;
    679                 goto exit;
    680             }
    681             s->metadata_entries.file_parameters_entry = md_entry;
    682             s->metadata_entries.present |= META_FILE_PARAMETER_PRESENT;
    683             continue;
    684         }
    685 
    686         if (guid_eq(md_entry.item_id, virtual_size_guid)) {
    687             if (s->metadata_entries.present & META_VIRTUAL_DISK_SIZE_PRESENT) {
    688                 ret = -EINVAL;
    689                 goto exit;
    690             }
    691             s->metadata_entries.virtual_disk_size_entry = md_entry;
    692             s->metadata_entries.present |= META_VIRTUAL_DISK_SIZE_PRESENT;
    693             continue;
    694         }
    695 
    696         if (guid_eq(md_entry.item_id, page83_guid)) {
    697             if (s->metadata_entries.present & META_PAGE_83_PRESENT) {
    698                 ret = -EINVAL;
    699                 goto exit;
    700             }
    701             s->metadata_entries.page83_data_entry = md_entry;
    702             s->metadata_entries.present |= META_PAGE_83_PRESENT;
    703             continue;
    704         }
    705 
    706         if (guid_eq(md_entry.item_id, logical_sector_guid)) {
    707             if (s->metadata_entries.present &
    708                 META_LOGICAL_SECTOR_SIZE_PRESENT) {
    709                 ret = -EINVAL;
    710                 goto exit;
    711             }
    712             s->metadata_entries.logical_sector_size_entry = md_entry;
    713             s->metadata_entries.present |= META_LOGICAL_SECTOR_SIZE_PRESENT;
    714             continue;
    715         }
    716 
    717         if (guid_eq(md_entry.item_id, phys_sector_guid)) {
    718             if (s->metadata_entries.present & META_PHYS_SECTOR_SIZE_PRESENT) {
    719                 ret = -EINVAL;
    720                 goto exit;
    721             }
    722             s->metadata_entries.phys_sector_size_entry = md_entry;
    723             s->metadata_entries.present |= META_PHYS_SECTOR_SIZE_PRESENT;
    724             continue;
    725         }
    726 
    727         if (guid_eq(md_entry.item_id, parent_locator_guid)) {
    728             if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) {
    729                 ret = -EINVAL;
    730                 goto exit;
    731             }
    732             s->metadata_entries.parent_locator_entry = md_entry;
    733             s->metadata_entries.present |= META_PARENT_LOCATOR_PRESENT;
    734             continue;
    735         }
    736 
    737         if (md_entry.data_bits & VHDX_META_FLAGS_IS_REQUIRED) {
    738             /* cannot read vhdx file - required region table entry that
    739              * we do not understand.  per spec, we must fail to open */
    740             ret = -ENOTSUP;
    741             goto exit;
    742         }
    743     }
    744 
    745     if (s->metadata_entries.present != META_ALL_PRESENT) {
    746         ret = -ENOTSUP;
    747         goto exit;
    748     }
    749 
    750     ret = bdrv_pread(bs->file,
    751                      s->metadata_entries.file_parameters_entry.offset
    752                                          + s->metadata_rt.file_offset,
    753                      sizeof(s->params),
    754                      &s->params,
    755                      0);
    756 
    757     if (ret < 0) {
    758         goto exit;
    759     }
    760 
    761     s->params.block_size = le32_to_cpu(s->params.block_size);
    762     s->params.data_bits = le32_to_cpu(s->params.data_bits);
    763 
    764 
    765     /* We now have the file parameters, so we can tell if this is a
    766      * differencing file (i.e.. has_parent), is dynamic or fixed
    767      * sized (leave_blocks_allocated), and the block size */
    768 
    769     /* The parent locator required iff the file parameters has_parent set */
    770     if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
    771         if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) {
    772             /* TODO: parse  parent locator fields */
    773             ret = -ENOTSUP; /* temp, until differencing files are supported */
    774             goto exit;
    775         } else {
    776             /* if has_parent is set, but there is not parent locator present,
    777              * then that is an invalid combination */
    778             ret = -EINVAL;
    779             goto exit;
    780         }
    781     }
    782 
    783     /* determine virtual disk size, logical sector size,
    784      * and phys sector size */
    785 
    786     ret = bdrv_pread(bs->file,
    787                      s->metadata_entries.virtual_disk_size_entry.offset
    788                                            + s->metadata_rt.file_offset,
    789                      sizeof(uint64_t),
    790                      &s->virtual_disk_size,
    791                      0);
    792     if (ret < 0) {
    793         goto exit;
    794     }
    795     ret = bdrv_pread(bs->file,
    796                      s->metadata_entries.logical_sector_size_entry.offset
    797                                              + s->metadata_rt.file_offset,
    798                      sizeof(uint32_t),
    799                      &s->logical_sector_size,
    800                      0);
    801     if (ret < 0) {
    802         goto exit;
    803     }
    804     ret = bdrv_pread(bs->file,
    805                      s->metadata_entries.phys_sector_size_entry.offset
    806                                           + s->metadata_rt.file_offset,
    807                      sizeof(uint32_t),
    808                      &s->physical_sector_size,
    809                      0);
    810     if (ret < 0) {
    811         goto exit;
    812     }
    813 
    814     s->virtual_disk_size = le64_to_cpu(s->virtual_disk_size);
    815     s->logical_sector_size = le32_to_cpu(s->logical_sector_size);
    816     s->physical_sector_size = le32_to_cpu(s->physical_sector_size);
    817 
    818     if (s->params.block_size < VHDX_BLOCK_SIZE_MIN ||
    819         s->params.block_size > VHDX_BLOCK_SIZE_MAX) {
    820         ret = -EINVAL;
    821         goto exit;
    822     }
    823 
    824     /* Currently we only support 512 */
    825     if (s->logical_sector_size != 512) {
    826         ret = -ENOTSUP;
    827         goto exit;
    828     }
    829 
    830     /* Both block_size and sector_size are guaranteed powers of 2, below.
    831        Due to range checks above, s->sectors_per_block can never be < 256 */
    832     s->sectors_per_block = s->params.block_size / s->logical_sector_size;
    833     s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
    834                      (uint64_t)s->logical_sector_size /
    835                      (uint64_t)s->params.block_size;
    836 
    837     /* These values are ones we will want to use for division / multiplication
    838      * later on, and they are all guaranteed (per the spec) to be powers of 2,
    839      * so we can take advantage of that for shift operations during
    840      * reads/writes */
    841     if (s->logical_sector_size & (s->logical_sector_size - 1)) {
    842         ret = -EINVAL;
    843         goto exit;
    844     }
    845     if (s->sectors_per_block & (s->sectors_per_block - 1)) {
    846         ret = -EINVAL;
    847         goto exit;
    848     }
    849     if (s->chunk_ratio & (s->chunk_ratio - 1)) {
    850         ret = -EINVAL;
    851         goto exit;
    852     }
    853     s->block_size = s->params.block_size;
    854     if (s->block_size & (s->block_size - 1)) {
    855         ret = -EINVAL;
    856         goto exit;
    857     }
    858 
    859     vhdx_set_shift_bits(s);
    860 
    861     ret = 0;
    862 
    863 exit:
    864     qemu_vfree(buffer);
    865     return ret;
    866 }
    867 
    868 /*
    869  * Calculate the number of BAT entries, including sector
    870  * bitmap entries.
    871  */
    872 static void vhdx_calc_bat_entries(BDRVVHDXState *s)
    873 {
    874     uint32_t data_blocks_cnt, bitmap_blocks_cnt;
    875 
    876     data_blocks_cnt = DIV_ROUND_UP(s->virtual_disk_size, s->block_size);
    877     bitmap_blocks_cnt = DIV_ROUND_UP(data_blocks_cnt, s->chunk_ratio);
    878 
    879     if (s->parent_entries) {
    880         s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1);
    881     } else {
    882         s->bat_entries = data_blocks_cnt +
    883                          ((data_blocks_cnt - 1) >> s->chunk_ratio_bits);
    884     }
    885 
    886 }
    887 
    888 static int vhdx_check_bat_entries(BlockDriverState *bs, int *errcnt)
    889 {
    890     BDRVVHDXState *s = bs->opaque;
    891     int64_t image_file_size = bdrv_getlength(bs->file->bs);
    892     uint64_t payblocks = s->chunk_ratio;
    893     uint64_t i;
    894     int ret = 0;
    895 
    896     if (image_file_size < 0) {
    897         error_report("Could not determinate VHDX image file size.");
    898         return image_file_size;
    899     }
    900 
    901     for (i = 0; i < s->bat_entries; i++) {
    902         if ((s->bat[i] & VHDX_BAT_STATE_BIT_MASK) ==
    903             PAYLOAD_BLOCK_FULLY_PRESENT) {
    904             uint64_t offset = s->bat[i] & VHDX_BAT_FILE_OFF_MASK;
    905             /*
    906              * Allow that the last block exists only partially. The VHDX spec
    907              * states that the image file can only grow in blocksize increments,
    908              * but QEMU created images with partial last blocks in the past.
    909              */
    910             uint32_t block_length = MIN(s->block_size,
    911                 bs->total_sectors * BDRV_SECTOR_SIZE - i * s->block_size);
    912             /*
    913              * Check for BAT entry overflow.
    914              */
    915             if (offset > INT64_MAX - s->block_size) {
    916                 error_report("VHDX BAT entry %" PRIu64 " offset overflow.", i);
    917                 ret = -EINVAL;
    918                 if (!errcnt) {
    919                     break;
    920                 }
    921                 (*errcnt)++;
    922             }
    923             /*
    924              * Check if fully allocated BAT entries do not reside after
    925              * end of the image file.
    926              */
    927             if (offset >= image_file_size) {
    928                 error_report("VHDX BAT entry %" PRIu64 " start offset %" PRIu64
    929                              " points after end of file (%" PRIi64 "). Image"
    930                              " has probably been truncated.",
    931                              i, offset, image_file_size);
    932                 ret = -EINVAL;
    933                 if (!errcnt) {
    934                     break;
    935                 }
    936                 (*errcnt)++;
    937             } else if (offset + block_length > image_file_size) {
    938                 error_report("VHDX BAT entry %" PRIu64 " end offset %" PRIu64
    939                              " points after end of file (%" PRIi64 "). Image"
    940                              " has probably been truncated.",
    941                              i, offset + block_length - 1, image_file_size);
    942                 ret = -EINVAL;
    943                 if (!errcnt) {
    944                     break;
    945                 }
    946                 (*errcnt)++;
    947             }
    948 
    949             /*
    950              * verify populated BAT field file offsets against
    951              * region table and log entries
    952              */
    953             if (payblocks--) {
    954                 /* payload bat entries */
    955                 int ret2;
    956                 ret2 = vhdx_region_check(s, offset, s->block_size);
    957                 if (ret2 < 0) {
    958                     ret = -EINVAL;
    959                     if (!errcnt) {
    960                         break;
    961                     }
    962                     (*errcnt)++;
    963                 }
    964             } else {
    965                 payblocks = s->chunk_ratio;
    966                 /*
    967                  * Once differencing files are supported, verify sector bitmap
    968                  * blocks here
    969                  */
    970             }
    971         }
    972     }
    973 
    974     return ret;
    975 }
    976 
    977 static void vhdx_close(BlockDriverState *bs)
    978 {
    979     BDRVVHDXState *s = bs->opaque;
    980     qemu_vfree(s->headers[0]);
    981     s->headers[0] = NULL;
    982     qemu_vfree(s->headers[1]);
    983     s->headers[1] = NULL;
    984     qemu_vfree(s->bat);
    985     s->bat = NULL;
    986     qemu_vfree(s->parent_entries);
    987     s->parent_entries = NULL;
    988     migrate_del_blocker(s->migration_blocker);
    989     error_free(s->migration_blocker);
    990     qemu_vfree(s->log.hdr);
    991     s->log.hdr = NULL;
    992     vhdx_region_unregister_all(s);
    993 }
    994 
    995 static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
    996                      Error **errp)
    997 {
    998     BDRVVHDXState *s = bs->opaque;
    999     int ret = 0;
   1000     uint32_t i;
   1001     uint64_t signature;
   1002     Error *local_err = NULL;
   1003 
   1004     ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
   1005     if (ret < 0) {
   1006         return ret;
   1007     }
   1008 
   1009     s->bat = NULL;
   1010     s->first_visible_write = true;
   1011 
   1012     qemu_co_mutex_init(&s->lock);
   1013     QLIST_INIT(&s->regions);
   1014 
   1015     /* validate the file signature */
   1016     ret = bdrv_pread(bs->file, 0, sizeof(uint64_t), &signature, 0);
   1017     if (ret < 0) {
   1018         goto fail;
   1019     }
   1020     if (memcmp(&signature, "vhdxfile", 8)) {
   1021         ret = -EINVAL;
   1022         goto fail;
   1023     }
   1024 
   1025     /* This is used for any header updates, for the file_write_guid.
   1026      * The spec dictates that a new value should be used for the first
   1027      * header update */
   1028     vhdx_guid_generate(&s->session_guid);
   1029 
   1030     vhdx_parse_header(bs, s, &local_err);
   1031     if (local_err != NULL) {
   1032         error_propagate(errp, local_err);
   1033         ret = -EINVAL;
   1034         goto fail;
   1035     }
   1036 
   1037     ret = vhdx_parse_log(bs, s, &s->log_replayed_on_open, errp);
   1038     if (ret < 0) {
   1039         goto fail;
   1040     }
   1041 
   1042     ret = vhdx_open_region_tables(bs, s);
   1043     if (ret < 0) {
   1044         goto fail;
   1045     }
   1046 
   1047     ret = vhdx_parse_metadata(bs, s);
   1048     if (ret < 0) {
   1049         goto fail;
   1050     }
   1051 
   1052     s->block_size = s->params.block_size;
   1053 
   1054     /* the VHDX spec dictates that virtual_disk_size is always a multiple of
   1055      * logical_sector_size */
   1056     bs->total_sectors = s->virtual_disk_size >> s->logical_sector_size_bits;
   1057 
   1058     vhdx_calc_bat_entries(s);
   1059 
   1060     s->bat_offset = s->bat_rt.file_offset;
   1061 
   1062     if (s->bat_entries > s->bat_rt.length / sizeof(VHDXBatEntry)) {
   1063         /* BAT allocation is not large enough for all entries */
   1064         ret = -EINVAL;
   1065         goto fail;
   1066     }
   1067 
   1068     /* s->bat is freed in vhdx_close() */
   1069     s->bat = qemu_try_blockalign(bs->file->bs, s->bat_rt.length);
   1070     if (s->bat == NULL) {
   1071         ret = -ENOMEM;
   1072         goto fail;
   1073     }
   1074 
   1075     ret = bdrv_pread(bs->file, s->bat_offset, s->bat_rt.length, s->bat, 0);
   1076     if (ret < 0) {
   1077         goto fail;
   1078     }
   1079 
   1080     /* endian convert populated BAT field entires */
   1081     for (i = 0; i < s->bat_entries; i++) {
   1082         s->bat[i] = le64_to_cpu(s->bat[i]);
   1083     }
   1084 
   1085     if (!(flags & BDRV_O_CHECK)) {
   1086         ret = vhdx_check_bat_entries(bs, NULL);
   1087         if (ret < 0) {
   1088             goto fail;
   1089         }
   1090     }
   1091 
   1092     /* Disable migration when VHDX images are used */
   1093     error_setg(&s->migration_blocker, "The vhdx format used by node '%s' "
   1094                "does not support live migration",
   1095                bdrv_get_device_or_node_name(bs));
   1096     ret = migrate_add_blocker(s->migration_blocker, errp);
   1097     if (ret < 0) {
   1098         error_free(s->migration_blocker);
   1099         goto fail;
   1100     }
   1101 
   1102     /* TODO: differencing files */
   1103 
   1104     return 0;
   1105 fail:
   1106     vhdx_close(bs);
   1107     return ret;
   1108 }
   1109 
   1110 static int vhdx_reopen_prepare(BDRVReopenState *state,
   1111                                BlockReopenQueue *queue, Error **errp)
   1112 {
   1113     return 0;
   1114 }
   1115 
   1116 
   1117 /*
   1118  * Perform sector to block offset translations, to get various
   1119  * sector and file offsets into the image.  See VHDXSectorInfo
   1120  */
   1121 static void vhdx_block_translate(BDRVVHDXState *s, int64_t sector_num,
   1122                                  int nb_sectors, VHDXSectorInfo *sinfo)
   1123 {
   1124     uint32_t block_offset;
   1125 
   1126     sinfo->bat_idx = sector_num >> s->sectors_per_block_bits;
   1127     /* effectively a modulo - this gives us the offset into the block
   1128      * (in sector sizes) for our sector number */
   1129     block_offset = sector_num - (sinfo->bat_idx << s->sectors_per_block_bits);
   1130     /* the chunk ratio gives us the interleaving of the sector
   1131      * bitmaps, so we need to advance our page block index by the
   1132      * sector bitmaps entry number */
   1133     sinfo->bat_idx += sinfo->bat_idx >> s->chunk_ratio_bits;
   1134 
   1135     /* the number of sectors we can read/write in this cycle */
   1136     sinfo->sectors_avail = s->sectors_per_block - block_offset;
   1137 
   1138     sinfo->bytes_left = sinfo->sectors_avail << s->logical_sector_size_bits;
   1139 
   1140     if (sinfo->sectors_avail > nb_sectors) {
   1141         sinfo->sectors_avail = nb_sectors;
   1142     }
   1143 
   1144     sinfo->bytes_avail = sinfo->sectors_avail << s->logical_sector_size_bits;
   1145 
   1146     sinfo->file_offset = s->bat[sinfo->bat_idx] & VHDX_BAT_FILE_OFF_MASK;
   1147 
   1148     sinfo->block_offset = block_offset << s->logical_sector_size_bits;
   1149 
   1150     /* The file offset must be past the header section, so must be > 0 */
   1151     if (sinfo->file_offset == 0) {
   1152         return;
   1153     }
   1154 
   1155     /* block offset is the offset in vhdx logical sectors, in
   1156      * the payload data block. Convert that to a byte offset
   1157      * in the block, and add in the payload data block offset
   1158      * in the file, in bytes, to get the final read address */
   1159 
   1160     sinfo->file_offset += sinfo->block_offset;
   1161 }
   1162 
   1163 
   1164 static int vhdx_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
   1165 {
   1166     BDRVVHDXState *s = bs->opaque;
   1167 
   1168     bdi->cluster_size = s->block_size;
   1169 
   1170     return 0;
   1171 }
   1172 
   1173 
   1174 static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num,
   1175                                       int nb_sectors, QEMUIOVector *qiov)
   1176 {
   1177     BDRVVHDXState *s = bs->opaque;
   1178     int ret = 0;
   1179     VHDXSectorInfo sinfo;
   1180     uint64_t bytes_done = 0;
   1181     QEMUIOVector hd_qiov;
   1182 
   1183     qemu_iovec_init(&hd_qiov, qiov->niov);
   1184 
   1185     qemu_co_mutex_lock(&s->lock);
   1186 
   1187     while (nb_sectors > 0) {
   1188         /* We are a differencing file, so we need to inspect the sector bitmap
   1189          * to see if we have the data or not */
   1190         if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
   1191             /* not supported yet */
   1192             ret = -ENOTSUP;
   1193             goto exit;
   1194         } else {
   1195             vhdx_block_translate(s, sector_num, nb_sectors, &sinfo);
   1196 
   1197             qemu_iovec_reset(&hd_qiov);
   1198             qemu_iovec_concat(&hd_qiov, qiov,  bytes_done, sinfo.bytes_avail);
   1199 
   1200             /* check the payload block state */
   1201             switch (s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK) {
   1202             case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */
   1203             case PAYLOAD_BLOCK_UNDEFINED:
   1204             case PAYLOAD_BLOCK_UNMAPPED:
   1205             case PAYLOAD_BLOCK_UNMAPPED_v095:
   1206             case PAYLOAD_BLOCK_ZERO:
   1207                 /* return zero */
   1208                 qemu_iovec_memset(&hd_qiov, 0, 0, sinfo.bytes_avail);
   1209                 break;
   1210             case PAYLOAD_BLOCK_FULLY_PRESENT:
   1211                 qemu_co_mutex_unlock(&s->lock);
   1212                 ret = bdrv_co_preadv(bs->file, sinfo.file_offset,
   1213                                      sinfo.sectors_avail * BDRV_SECTOR_SIZE,
   1214                                      &hd_qiov, 0);
   1215                 qemu_co_mutex_lock(&s->lock);
   1216                 if (ret < 0) {
   1217                     goto exit;
   1218                 }
   1219                 break;
   1220             case PAYLOAD_BLOCK_PARTIALLY_PRESENT:
   1221                 /* we don't yet support difference files, fall through
   1222                  * to error */
   1223             default:
   1224                 ret = -EIO;
   1225                 goto exit;
   1226                 break;
   1227             }
   1228             nb_sectors -= sinfo.sectors_avail;
   1229             sector_num += sinfo.sectors_avail;
   1230             bytes_done += sinfo.bytes_avail;
   1231         }
   1232     }
   1233     ret = 0;
   1234 exit:
   1235     qemu_co_mutex_unlock(&s->lock);
   1236     qemu_iovec_destroy(&hd_qiov);
   1237     return ret;
   1238 }
   1239 
   1240 /*
   1241  * Allocate a new payload block at the end of the file.
   1242  *
   1243  * Allocation will happen at 1MB alignment inside the file.
   1244  *
   1245  * If @need_zero is set on entry but not cleared on return, then truncation
   1246  * could not guarantee that the new portion reads as zero, and the caller
   1247  * will take care of it instead.
   1248  *
   1249  * Returns the file offset start of the new payload block
   1250  */
   1251 static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s,
   1252                                uint64_t *new_offset, bool *need_zero)
   1253 {
   1254     int64_t current_len;
   1255 
   1256     current_len = bdrv_getlength(bs->file->bs);
   1257     if (current_len < 0) {
   1258         return current_len;
   1259     }
   1260 
   1261     *new_offset = current_len;
   1262 
   1263     /* per the spec, the address for a block is in units of 1MB */
   1264     *new_offset = ROUND_UP(*new_offset, 1 * MiB);
   1265     if (*new_offset > INT64_MAX) {
   1266         return -EINVAL;
   1267     }
   1268 
   1269     if (*need_zero) {
   1270         int ret;
   1271 
   1272         ret = bdrv_truncate(bs->file, *new_offset + s->block_size, false,
   1273                             PREALLOC_MODE_OFF, BDRV_REQ_ZERO_WRITE, NULL);
   1274         if (ret != -ENOTSUP) {
   1275             *need_zero = false;
   1276             return ret;
   1277         }
   1278     }
   1279 
   1280     return bdrv_truncate(bs->file, *new_offset + s->block_size, false,
   1281                          PREALLOC_MODE_OFF, 0, NULL);
   1282 }
   1283 
   1284 /*
   1285  * Update the BAT table entry with the new file offset, and the new entry
   1286  * state */
   1287 static void vhdx_update_bat_table_entry(BlockDriverState *bs, BDRVVHDXState *s,
   1288                                        VHDXSectorInfo *sinfo,
   1289                                        uint64_t *bat_entry_le,
   1290                                        uint64_t *bat_offset, int state)
   1291 {
   1292     /* The BAT entry is a uint64, with 44 bits for the file offset in units of
   1293      * 1MB, and 3 bits for the block state. */
   1294     if ((state == PAYLOAD_BLOCK_ZERO)        ||
   1295         (state == PAYLOAD_BLOCK_UNDEFINED)   ||
   1296         (state == PAYLOAD_BLOCK_NOT_PRESENT) ||
   1297         (state == PAYLOAD_BLOCK_UNMAPPED)) {
   1298         s->bat[sinfo->bat_idx]  = 0;  /* For PAYLOAD_BLOCK_ZERO, the
   1299                                          FileOffsetMB field is denoted as
   1300                                          'reserved' in the v1.0 spec.  If it is
   1301                                          non-zero, MS Hyper-V will fail to read
   1302                                          the disk image */
   1303     } else {
   1304         s->bat[sinfo->bat_idx]  = sinfo->file_offset;
   1305     }
   1306 
   1307     s->bat[sinfo->bat_idx] |= state & VHDX_BAT_STATE_BIT_MASK;
   1308 
   1309     *bat_entry_le = cpu_to_le64(s->bat[sinfo->bat_idx]);
   1310     *bat_offset = s->bat_offset + sinfo->bat_idx * sizeof(VHDXBatEntry);
   1311 
   1312 }
   1313 
   1314 /* Per the spec, on the first write of guest-visible data to the file the
   1315  * data write guid must be updated in the header */
   1316 int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s)
   1317 {
   1318     int ret = 0;
   1319     if (s->first_visible_write) {
   1320         s->first_visible_write = false;
   1321         ret = vhdx_update_headers(bs, s, true, NULL);
   1322     }
   1323     return ret;
   1324 }
   1325 
   1326 static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
   1327                                        int nb_sectors, QEMUIOVector *qiov,
   1328                                        int flags)
   1329 {
   1330     int ret = -ENOTSUP;
   1331     BDRVVHDXState *s = bs->opaque;
   1332     VHDXSectorInfo sinfo;
   1333     uint64_t bytes_done = 0;
   1334     uint64_t bat_entry = 0;
   1335     uint64_t bat_entry_offset = 0;
   1336     QEMUIOVector hd_qiov;
   1337     struct iovec iov1 = { 0 };
   1338     struct iovec iov2 = { 0 };
   1339     int sectors_to_write;
   1340     int bat_state;
   1341     uint64_t bat_prior_offset = 0;
   1342     bool bat_update = false;
   1343 
   1344     qemu_iovec_init(&hd_qiov, qiov->niov);
   1345 
   1346     qemu_co_mutex_lock(&s->lock);
   1347 
   1348     ret = vhdx_user_visible_write(bs, s);
   1349     if (ret < 0) {
   1350         goto exit;
   1351     }
   1352 
   1353     while (nb_sectors > 0) {
   1354         bool use_zero_buffers = false;
   1355         bat_update = false;
   1356         if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
   1357             /* not supported yet */
   1358             ret = -ENOTSUP;
   1359             goto exit;
   1360         } else {
   1361             vhdx_block_translate(s, sector_num, nb_sectors, &sinfo);
   1362             sectors_to_write = sinfo.sectors_avail;
   1363 
   1364             qemu_iovec_reset(&hd_qiov);
   1365             /* check the payload block state */
   1366             bat_state = s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK;
   1367             switch (bat_state) {
   1368             case PAYLOAD_BLOCK_ZERO:
   1369                 /* in this case, we need to preserve zero writes for
   1370                  * data that is not part of this write, so we must pad
   1371                  * the rest of the buffer to zeroes */
   1372                 use_zero_buffers = true;
   1373                 /* fall through */
   1374             case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */
   1375             case PAYLOAD_BLOCK_UNMAPPED:
   1376             case PAYLOAD_BLOCK_UNMAPPED_v095:
   1377             case PAYLOAD_BLOCK_UNDEFINED:
   1378                 bat_prior_offset = sinfo.file_offset;
   1379                 ret = vhdx_allocate_block(bs, s, &sinfo.file_offset,
   1380                                           &use_zero_buffers);
   1381                 if (ret < 0) {
   1382                     goto exit;
   1383                 }
   1384                 /*
   1385                  * once we support differencing files, this may also be
   1386                  * partially present
   1387                  */
   1388                 /* update block state to the newly specified state */
   1389                 vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry,
   1390                                             &bat_entry_offset,
   1391                                             PAYLOAD_BLOCK_FULLY_PRESENT);
   1392                 bat_update = true;
   1393                 /*
   1394                  * Since we just allocated a block, file_offset is the
   1395                  * beginning of the payload block. It needs to be the
   1396                  * write address, which includes the offset into the
   1397                  * block, unless the entire block needs to read as
   1398                  * zeroes but truncation was not able to provide them,
   1399                  * in which case we need to fill in the rest.
   1400                  */
   1401                 if (!use_zero_buffers) {
   1402                     sinfo.file_offset += sinfo.block_offset;
   1403                 } else {
   1404                     /* zero fill the front, if any */
   1405                     if (sinfo.block_offset) {
   1406                         iov1.iov_len = sinfo.block_offset;
   1407                         iov1.iov_base = qemu_blockalign(bs, iov1.iov_len);
   1408                         memset(iov1.iov_base, 0, iov1.iov_len);
   1409                         qemu_iovec_concat_iov(&hd_qiov, &iov1, 1, 0,
   1410                                               iov1.iov_len);
   1411                         sectors_to_write += iov1.iov_len >> BDRV_SECTOR_BITS;
   1412                     }
   1413 
   1414                     /* our actual data */
   1415                     qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
   1416                                       sinfo.bytes_avail);
   1417 
   1418                     /* zero fill the back, if any */
   1419                     if ((sinfo.bytes_avail - sinfo.block_offset) <
   1420                          s->block_size) {
   1421                         iov2.iov_len = s->block_size -
   1422                                       (sinfo.bytes_avail + sinfo.block_offset);
   1423                         iov2.iov_base = qemu_blockalign(bs, iov2.iov_len);
   1424                         memset(iov2.iov_base, 0, iov2.iov_len);
   1425                         qemu_iovec_concat_iov(&hd_qiov, &iov2, 1, 0,
   1426                                               iov2.iov_len);
   1427                         sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS;
   1428                     }
   1429                 }
   1430 
   1431                 /* fall through */
   1432             case PAYLOAD_BLOCK_FULLY_PRESENT:
   1433                 /* if the file offset address is in the header zone,
   1434                  * there is a problem */
   1435                 if (sinfo.file_offset < (1 * MiB)) {
   1436                     ret = -EFAULT;
   1437                     goto error_bat_restore;
   1438                 }
   1439 
   1440                 if (!use_zero_buffers) {
   1441                     qemu_iovec_concat(&hd_qiov, qiov,  bytes_done,
   1442                                       sinfo.bytes_avail);
   1443                 }
   1444                 /* block exists, so we can just overwrite it */
   1445                 qemu_co_mutex_unlock(&s->lock);
   1446                 ret = bdrv_co_pwritev(bs->file, sinfo.file_offset,
   1447                                       sectors_to_write * BDRV_SECTOR_SIZE,
   1448                                       &hd_qiov, 0);
   1449                 qemu_co_mutex_lock(&s->lock);
   1450                 if (ret < 0) {
   1451                     goto error_bat_restore;
   1452                 }
   1453                 break;
   1454             case PAYLOAD_BLOCK_PARTIALLY_PRESENT:
   1455                 /* we don't yet support difference files, fall through
   1456                  * to error */
   1457             default:
   1458                 ret = -EIO;
   1459                 goto exit;
   1460                 break;
   1461             }
   1462 
   1463             if (bat_update) {
   1464                 /* this will update the BAT entry into the log journal, and
   1465                  * then flush the log journal out to disk */
   1466                 ret =  vhdx_log_write_and_flush(bs, s, &bat_entry,
   1467                                                 sizeof(VHDXBatEntry),
   1468                                                 bat_entry_offset);
   1469                 if (ret < 0) {
   1470                     goto exit;
   1471                 }
   1472             }
   1473 
   1474             nb_sectors -= sinfo.sectors_avail;
   1475             sector_num += sinfo.sectors_avail;
   1476             bytes_done += sinfo.bytes_avail;
   1477 
   1478         }
   1479     }
   1480 
   1481     goto exit;
   1482 
   1483 error_bat_restore:
   1484     if (bat_update) {
   1485         /* keep metadata in sync, and restore the bat entry state
   1486          * if error. */
   1487         sinfo.file_offset = bat_prior_offset;
   1488         vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry,
   1489                                     &bat_entry_offset, bat_state);
   1490     }
   1491 exit:
   1492     qemu_vfree(iov1.iov_base);
   1493     qemu_vfree(iov2.iov_base);
   1494     qemu_co_mutex_unlock(&s->lock);
   1495     qemu_iovec_destroy(&hd_qiov);
   1496     return ret;
   1497 }
   1498 
   1499 
   1500 
   1501 /*
   1502  * Create VHDX Headers
   1503  *
   1504  * There are 2 headers, and the highest sequence number will represent
   1505  * the active header
   1506  */
   1507 static int vhdx_create_new_headers(BlockBackend *blk, uint64_t image_size,
   1508                                    uint32_t log_size)
   1509 {
   1510     BlockDriverState *bs = blk_bs(blk);
   1511     BdrvChild *child;
   1512     int ret = 0;
   1513     VHDXHeader *hdr = NULL;
   1514 
   1515     hdr = g_new0(VHDXHeader, 1);
   1516 
   1517     hdr->signature       = VHDX_HEADER_SIGNATURE;
   1518     hdr->sequence_number = g_random_int();
   1519     hdr->log_version     = 0;
   1520     hdr->version         = 1;
   1521     hdr->log_length      = log_size;
   1522     hdr->log_offset      = VHDX_HEADER_SECTION_END;
   1523     vhdx_guid_generate(&hdr->file_write_guid);
   1524     vhdx_guid_generate(&hdr->data_write_guid);
   1525 
   1526     /* XXX Ugly way to get blk->root, but that's a feature, not a bug. This
   1527      * hack makes it obvious that vhdx_write_header() bypasses the BlockBackend
   1528      * here, which it really shouldn't be doing. */
   1529     child = QLIST_FIRST(&bs->parents);
   1530     assert(!QLIST_NEXT(child, next_parent));
   1531 
   1532     ret = vhdx_write_header(child, hdr, VHDX_HEADER1_OFFSET, false);
   1533     if (ret < 0) {
   1534         goto exit;
   1535     }
   1536     hdr->sequence_number++;
   1537     ret = vhdx_write_header(child, hdr, VHDX_HEADER2_OFFSET, false);
   1538     if (ret < 0) {
   1539         goto exit;
   1540     }
   1541 
   1542 exit:
   1543     g_free(hdr);
   1544     return ret;
   1545 }
   1546 
   1547 #define VHDX_METADATA_ENTRY_BUFFER_SIZE \
   1548                                     (sizeof(VHDXFileParameters)               +\
   1549                                      sizeof(VHDXVirtualDiskSize)              +\
   1550                                      sizeof(VHDXPage83Data)                   +\
   1551                                      sizeof(VHDXVirtualDiskLogicalSectorSize) +\
   1552                                      sizeof(VHDXVirtualDiskPhysicalSectorSize))
   1553 
   1554 /*
   1555  * Create the Metadata entries.
   1556  *
   1557  * For more details on the entries, see section 3.5 (pg 29) in the
   1558  * VHDX 1.00 specification.
   1559  *
   1560  * We support 5 metadata entries (all required by spec):
   1561  *          File Parameters,
   1562  *          Virtual Disk Size,
   1563  *          Page 83 Data,
   1564  *          Logical Sector Size,
   1565  *          Physical Sector Size
   1566  *
   1567  * The first 64KB of the Metadata section is reserved for the metadata
   1568  * header and entries; beyond that, the metadata items themselves reside.
   1569  */
   1570 static int vhdx_create_new_metadata(BlockBackend *blk,
   1571                                     uint64_t image_size,
   1572                                     uint32_t block_size,
   1573                                     uint32_t sector_size,
   1574                                     uint64_t metadata_offset,
   1575                                     VHDXImageType type)
   1576 {
   1577     int ret = 0;
   1578     uint32_t offset = 0;
   1579     void *buffer = NULL;
   1580     void *entry_buffer;
   1581     VHDXMetadataTableHeader *md_table;
   1582     VHDXMetadataTableEntry  *md_table_entry;
   1583 
   1584     /* Metadata entries */
   1585     VHDXFileParameters     *mt_file_params;
   1586     VHDXVirtualDiskSize    *mt_virtual_size;
   1587     VHDXPage83Data         *mt_page83;
   1588     VHDXVirtualDiskLogicalSectorSize  *mt_log_sector_size;
   1589     VHDXVirtualDiskPhysicalSectorSize *mt_phys_sector_size;
   1590 
   1591     entry_buffer = g_malloc0(VHDX_METADATA_ENTRY_BUFFER_SIZE);
   1592 
   1593     mt_file_params = entry_buffer;
   1594     offset += sizeof(VHDXFileParameters);
   1595     mt_virtual_size = entry_buffer + offset;
   1596     offset += sizeof(VHDXVirtualDiskSize);
   1597     mt_page83 = entry_buffer + offset;
   1598     offset += sizeof(VHDXPage83Data);
   1599     mt_log_sector_size = entry_buffer + offset;
   1600     offset += sizeof(VHDXVirtualDiskLogicalSectorSize);
   1601     mt_phys_sector_size = entry_buffer + offset;
   1602 
   1603     mt_file_params->block_size = cpu_to_le32(block_size);
   1604     if (type == VHDX_TYPE_FIXED) {
   1605         mt_file_params->data_bits |= VHDX_PARAMS_LEAVE_BLOCKS_ALLOCED;
   1606         mt_file_params->data_bits = cpu_to_le32(mt_file_params->data_bits);
   1607     }
   1608 
   1609     vhdx_guid_generate(&mt_page83->page_83_data);
   1610     cpu_to_leguids(&mt_page83->page_83_data);
   1611     mt_virtual_size->virtual_disk_size        = cpu_to_le64(image_size);
   1612     mt_log_sector_size->logical_sector_size   = cpu_to_le32(sector_size);
   1613     mt_phys_sector_size->physical_sector_size = cpu_to_le32(sector_size);
   1614 
   1615     buffer = g_malloc0(VHDX_HEADER_BLOCK_SIZE);
   1616     md_table = buffer;
   1617 
   1618     md_table->signature   = VHDX_METADATA_SIGNATURE;
   1619     md_table->entry_count = 5;
   1620     vhdx_metadata_header_le_export(md_table);
   1621 
   1622 
   1623     /* This will reference beyond the reserved table portion */
   1624     offset = 64 * KiB;
   1625 
   1626     md_table_entry = buffer + sizeof(VHDXMetadataTableHeader);
   1627 
   1628     md_table_entry[0].item_id = file_param_guid;
   1629     md_table_entry[0].offset  = offset;
   1630     md_table_entry[0].length  = sizeof(VHDXFileParameters);
   1631     md_table_entry[0].data_bits |= VHDX_META_FLAGS_IS_REQUIRED;
   1632     offset += md_table_entry[0].length;
   1633     vhdx_metadata_entry_le_export(&md_table_entry[0]);
   1634 
   1635     md_table_entry[1].item_id = virtual_size_guid;
   1636     md_table_entry[1].offset  = offset;
   1637     md_table_entry[1].length  = sizeof(VHDXVirtualDiskSize);
   1638     md_table_entry[1].data_bits |= VHDX_META_FLAGS_IS_REQUIRED |
   1639                                    VHDX_META_FLAGS_IS_VIRTUAL_DISK;
   1640     offset += md_table_entry[1].length;
   1641     vhdx_metadata_entry_le_export(&md_table_entry[1]);
   1642 
   1643     md_table_entry[2].item_id = page83_guid;
   1644     md_table_entry[2].offset  = offset;
   1645     md_table_entry[2].length  = sizeof(VHDXPage83Data);
   1646     md_table_entry[2].data_bits |= VHDX_META_FLAGS_IS_REQUIRED |
   1647                                    VHDX_META_FLAGS_IS_VIRTUAL_DISK;
   1648     offset += md_table_entry[2].length;
   1649     vhdx_metadata_entry_le_export(&md_table_entry[2]);
   1650 
   1651     md_table_entry[3].item_id = logical_sector_guid;
   1652     md_table_entry[3].offset  = offset;
   1653     md_table_entry[3].length  = sizeof(VHDXVirtualDiskLogicalSectorSize);
   1654     md_table_entry[3].data_bits |= VHDX_META_FLAGS_IS_REQUIRED |
   1655                                    VHDX_META_FLAGS_IS_VIRTUAL_DISK;
   1656     offset += md_table_entry[3].length;
   1657     vhdx_metadata_entry_le_export(&md_table_entry[3]);
   1658 
   1659     md_table_entry[4].item_id = phys_sector_guid;
   1660     md_table_entry[4].offset  = offset;
   1661     md_table_entry[4].length  = sizeof(VHDXVirtualDiskPhysicalSectorSize);
   1662     md_table_entry[4].data_bits |= VHDX_META_FLAGS_IS_REQUIRED |
   1663                                    VHDX_META_FLAGS_IS_VIRTUAL_DISK;
   1664     vhdx_metadata_entry_le_export(&md_table_entry[4]);
   1665 
   1666     ret = blk_pwrite(blk, metadata_offset, VHDX_HEADER_BLOCK_SIZE, buffer, 0);
   1667     if (ret < 0) {
   1668         goto exit;
   1669     }
   1670 
   1671     ret = blk_pwrite(blk, metadata_offset + (64 * KiB),
   1672                      VHDX_METADATA_ENTRY_BUFFER_SIZE, entry_buffer, 0);
   1673     if (ret < 0) {
   1674         goto exit;
   1675     }
   1676 
   1677 
   1678 exit:
   1679     g_free(buffer);
   1680     g_free(entry_buffer);
   1681     return ret;
   1682 }
   1683 
   1684 /* This create the actual BAT itself.  We currently only support
   1685  * 'Dynamic' and 'Fixed' image types.
   1686  *
   1687  *  Dynamic images: default state of the BAT is all zeroes.
   1688  *
   1689  *  Fixed images: default state of the BAT is fully populated, with
   1690  *                file offsets and state PAYLOAD_BLOCK_FULLY_PRESENT.
   1691  */
   1692 static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s,
   1693                            uint64_t image_size, VHDXImageType type,
   1694                            bool use_zero_blocks, uint64_t file_offset,
   1695                            uint32_t length, Error **errp)
   1696 {
   1697     int ret = 0;
   1698     uint64_t data_file_offset;
   1699     uint64_t total_sectors = 0;
   1700     uint64_t sector_num = 0;
   1701     uint64_t unused;
   1702     int block_state;
   1703     VHDXSectorInfo sinfo;
   1704 
   1705     assert(s->bat == NULL);
   1706 
   1707     /* this gives a data start after BAT/bitmap entries, and well
   1708      * past any metadata entries (with a 4 MB buffer for future
   1709      * expansion */
   1710     data_file_offset = file_offset + length + 5 * MiB;
   1711     total_sectors = image_size >> s->logical_sector_size_bits;
   1712 
   1713     if (type == VHDX_TYPE_DYNAMIC) {
   1714         /* All zeroes, so we can just extend the file - the end of the BAT
   1715          * is the furthest thing we have written yet */
   1716         ret = blk_truncate(blk, data_file_offset, false, PREALLOC_MODE_OFF,
   1717                            0, errp);
   1718         if (ret < 0) {
   1719             goto exit;
   1720         }
   1721     } else if (type == VHDX_TYPE_FIXED) {
   1722         ret = blk_truncate(blk, data_file_offset + image_size, false,
   1723                            PREALLOC_MODE_OFF, 0, errp);
   1724         if (ret < 0) {
   1725             goto exit;
   1726         }
   1727     } else {
   1728         error_setg(errp, "Unsupported image type");
   1729         ret = -ENOTSUP;
   1730         goto exit;
   1731     }
   1732 
   1733     if (type == VHDX_TYPE_FIXED ||
   1734                 use_zero_blocks ||
   1735                 bdrv_has_zero_init(blk_bs(blk)) == 0) {
   1736         /* for a fixed file, the default BAT entry is not zero */
   1737         s->bat = g_try_malloc0(length);
   1738         if (length && s->bat == NULL) {
   1739             error_setg(errp, "Failed to allocate memory for the BAT");
   1740             ret = -ENOMEM;
   1741             goto exit;
   1742         }
   1743         block_state = type == VHDX_TYPE_FIXED ? PAYLOAD_BLOCK_FULLY_PRESENT :
   1744                                                 PAYLOAD_BLOCK_NOT_PRESENT;
   1745         block_state = use_zero_blocks ? PAYLOAD_BLOCK_ZERO : block_state;
   1746         /* fill the BAT by emulating sector writes of sectors_per_block size */
   1747         while (sector_num < total_sectors) {
   1748             vhdx_block_translate(s, sector_num, s->sectors_per_block, &sinfo);
   1749             sinfo.file_offset = data_file_offset +
   1750                                 (sector_num << s->logical_sector_size_bits);
   1751             sinfo.file_offset = ROUND_UP(sinfo.file_offset, MiB);
   1752             vhdx_update_bat_table_entry(blk_bs(blk), s, &sinfo, &unused, &unused,
   1753                                         block_state);
   1754             s->bat[sinfo.bat_idx] = cpu_to_le64(s->bat[sinfo.bat_idx]);
   1755             sector_num += s->sectors_per_block;
   1756         }
   1757         ret = blk_pwrite(blk, file_offset, length, s->bat, 0);
   1758         if (ret < 0) {
   1759             error_setg_errno(errp, -ret, "Failed to write the BAT");
   1760             goto exit;
   1761         }
   1762     }
   1763 
   1764 
   1765 
   1766 exit:
   1767     g_free(s->bat);
   1768     return ret;
   1769 }
   1770 
   1771 /* Creates the region table header, and region table entries.
   1772  * There are 2 supported region table entries: BAT, and Metadata/
   1773  *
   1774  * As the calculations for the BAT region table are also needed
   1775  * to create the BAT itself, we will also cause the BAT to be
   1776  * created.
   1777  */
   1778 static int vhdx_create_new_region_table(BlockBackend *blk,
   1779                                         uint64_t image_size,
   1780                                         uint32_t block_size,
   1781                                         uint32_t sector_size,
   1782                                         uint32_t log_size,
   1783                                         bool use_zero_blocks,
   1784                                         VHDXImageType type,
   1785                                         uint64_t *metadata_offset,
   1786                                         Error **errp)
   1787 {
   1788     int ret = 0;
   1789     uint32_t offset = 0;
   1790     void *buffer = NULL;
   1791     uint64_t bat_file_offset;
   1792     uint32_t bat_length;
   1793     BDRVVHDXState *s = NULL;
   1794     VHDXRegionTableHeader *region_table;
   1795     VHDXRegionTableEntry *rt_bat;
   1796     VHDXRegionTableEntry *rt_metadata;
   1797 
   1798     assert(metadata_offset != NULL);
   1799 
   1800     /* Populate enough of the BDRVVHDXState to be able to use the
   1801      * pre-existing BAT calculation, translation, and update functions */
   1802     s = g_new0(BDRVVHDXState, 1);
   1803 
   1804     s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
   1805                      (uint64_t) sector_size / (uint64_t) block_size;
   1806 
   1807     s->sectors_per_block = block_size / sector_size;
   1808     s->virtual_disk_size = image_size;
   1809     s->block_size = block_size;
   1810     s->logical_sector_size = sector_size;
   1811 
   1812     vhdx_set_shift_bits(s);
   1813 
   1814     vhdx_calc_bat_entries(s);
   1815 
   1816     /* At this point the VHDX state is populated enough for creation */
   1817 
   1818     /* a single buffer is used so we can calculate the checksum over the
   1819      * entire 64KB block */
   1820     buffer = g_malloc0(VHDX_HEADER_BLOCK_SIZE);
   1821     region_table = buffer;
   1822     offset += sizeof(VHDXRegionTableHeader);
   1823     rt_bat = buffer + offset;
   1824     offset += sizeof(VHDXRegionTableEntry);
   1825     rt_metadata  = buffer + offset;
   1826 
   1827     region_table->signature = VHDX_REGION_SIGNATURE;
   1828     region_table->entry_count = 2;   /* BAT and Metadata */
   1829 
   1830     rt_bat->guid        = bat_guid;
   1831     rt_bat->length      = ROUND_UP(s->bat_entries * sizeof(VHDXBatEntry), MiB);
   1832     rt_bat->file_offset = ROUND_UP(VHDX_HEADER_SECTION_END + log_size, MiB);
   1833     s->bat_offset = rt_bat->file_offset;
   1834 
   1835     rt_metadata->guid        = metadata_guid;
   1836     rt_metadata->file_offset = ROUND_UP(rt_bat->file_offset + rt_bat->length,
   1837                                         MiB);
   1838     rt_metadata->length      = 1 * MiB; /* min size, and more than enough */
   1839     *metadata_offset = rt_metadata->file_offset;
   1840 
   1841     bat_file_offset = rt_bat->file_offset;
   1842     bat_length = rt_bat->length;
   1843 
   1844     vhdx_region_header_le_export(region_table);
   1845     vhdx_region_entry_le_export(rt_bat);
   1846     vhdx_region_entry_le_export(rt_metadata);
   1847 
   1848     vhdx_update_checksum(buffer, VHDX_HEADER_BLOCK_SIZE,
   1849                          offsetof(VHDXRegionTableHeader, checksum));
   1850 
   1851 
   1852     /* The region table gives us the data we need to create the BAT,
   1853      * so do that now */
   1854     ret = vhdx_create_bat(blk, s, image_size, type, use_zero_blocks,
   1855                           bat_file_offset, bat_length, errp);
   1856     if (ret < 0) {
   1857         goto exit;
   1858     }
   1859 
   1860     /* Now write out the region headers to disk */
   1861     ret = blk_pwrite(blk, VHDX_REGION_TABLE_OFFSET, VHDX_HEADER_BLOCK_SIZE,
   1862                      buffer, 0);
   1863     if (ret < 0) {
   1864         error_setg_errno(errp, -ret, "Failed to write first region table");
   1865         goto exit;
   1866     }
   1867 
   1868     ret = blk_pwrite(blk, VHDX_REGION_TABLE2_OFFSET, VHDX_HEADER_BLOCK_SIZE,
   1869                      buffer, 0);
   1870     if (ret < 0) {
   1871         error_setg_errno(errp, -ret, "Failed to write second region table");
   1872         goto exit;
   1873     }
   1874 
   1875 exit:
   1876     g_free(s);
   1877     g_free(buffer);
   1878     return ret;
   1879 }
   1880 
   1881 /* We need to create the following elements:
   1882  *
   1883  *    .-----------------------------------------------------------------.
   1884  *    |   (A)    |   (B)    |    (C)    |     (D)       |     (E)       |
   1885  *    |  File ID |  Header1 |  Header 2 |  Region Tbl 1 |  Region Tbl 2 |
   1886  *    |          |          |           |               |               |
   1887  *    .-----------------------------------------------------------------.
   1888  *    0         64KB      128KB       192KB           256KB           320KB
   1889  *
   1890  *
   1891  *    .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------.
   1892  *    |     (F)     |     (G)       |    (H)    |                        |
   1893  *    | Journal Log |  BAT / Bitmap |  Metadata |  .... data ......      |
   1894  *    |             |               |           |                        |
   1895  *    .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------.
   1896  *   1MB
   1897  */
   1898 static int coroutine_fn vhdx_co_create(BlockdevCreateOptions *opts,
   1899                                        Error **errp)
   1900 {
   1901     BlockdevCreateOptionsVhdx *vhdx_opts;
   1902     BlockBackend *blk = NULL;
   1903     BlockDriverState *bs = NULL;
   1904 
   1905     int ret = 0;
   1906     uint64_t image_size;
   1907     uint32_t log_size;
   1908     uint32_t block_size;
   1909     uint64_t signature;
   1910     uint64_t metadata_offset;
   1911     bool use_zero_blocks = false;
   1912 
   1913     gunichar2 *creator = NULL;
   1914     glong creator_items;
   1915     VHDXImageType image_type;
   1916 
   1917     assert(opts->driver == BLOCKDEV_DRIVER_VHDX);
   1918     vhdx_opts = &opts->u.vhdx;
   1919 
   1920     /* Validate options and set default values */
   1921     image_size = vhdx_opts->size;
   1922     if (image_size > VHDX_MAX_IMAGE_SIZE) {
   1923         error_setg(errp, "Image size too large; max of 64TB");
   1924         return -EINVAL;
   1925     }
   1926 
   1927     if (!vhdx_opts->has_log_size) {
   1928         log_size = DEFAULT_LOG_SIZE;
   1929     } else {
   1930         if (vhdx_opts->log_size > UINT32_MAX) {
   1931             error_setg(errp, "Log size must be smaller than 4 GB");
   1932             return -EINVAL;
   1933         }
   1934         log_size = vhdx_opts->log_size;
   1935     }
   1936     if (log_size < MiB || (log_size % MiB) != 0) {
   1937         error_setg(errp, "Log size must be a multiple of 1 MB");
   1938         return -EINVAL;
   1939     }
   1940 
   1941     if (!vhdx_opts->has_block_state_zero) {
   1942         use_zero_blocks = true;
   1943     } else {
   1944         use_zero_blocks = vhdx_opts->block_state_zero;
   1945     }
   1946 
   1947     if (!vhdx_opts->has_subformat) {
   1948         vhdx_opts->subformat = BLOCKDEV_VHDX_SUBFORMAT_DYNAMIC;
   1949     }
   1950 
   1951     switch (vhdx_opts->subformat) {
   1952     case BLOCKDEV_VHDX_SUBFORMAT_DYNAMIC:
   1953         image_type = VHDX_TYPE_DYNAMIC;
   1954         break;
   1955     case BLOCKDEV_VHDX_SUBFORMAT_FIXED:
   1956         image_type = VHDX_TYPE_FIXED;
   1957         break;
   1958     default:
   1959         g_assert_not_reached();
   1960     }
   1961 
   1962     /* These are pretty arbitrary, and mainly designed to keep the BAT
   1963      * size reasonable to load into RAM */
   1964     if (vhdx_opts->has_block_size) {
   1965         block_size = vhdx_opts->block_size;
   1966     } else {
   1967         if (image_size > 32 * TiB) {
   1968             block_size = 64 * MiB;
   1969         } else if (image_size > (uint64_t) 100 * GiB) {
   1970             block_size = 32 * MiB;
   1971         } else if (image_size > 1 * GiB) {
   1972             block_size = 16 * MiB;
   1973         } else {
   1974             block_size = 8 * MiB;
   1975         }
   1976     }
   1977 
   1978     if (block_size < MiB || (block_size % MiB) != 0) {
   1979         error_setg(errp, "Block size must be a multiple of 1 MB");
   1980         return -EINVAL;
   1981     }
   1982     if (!is_power_of_2(block_size)) {
   1983         error_setg(errp, "Block size must be a power of two");
   1984         return -EINVAL;
   1985     }
   1986     if (block_size > VHDX_BLOCK_SIZE_MAX) {
   1987         error_setg(errp, "Block size must not exceed %" PRId64,
   1988                    VHDX_BLOCK_SIZE_MAX);
   1989         return -EINVAL;
   1990     }
   1991 
   1992     /* Create BlockBackend to write to the image */
   1993     bs = bdrv_open_blockdev_ref(vhdx_opts->file, errp);
   1994     if (bs == NULL) {
   1995         return -EIO;
   1996     }
   1997 
   1998     blk = blk_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
   1999                           errp);
   2000     if (!blk) {
   2001         ret = -EPERM;
   2002         goto delete_and_exit;
   2003     }
   2004     blk_set_allow_write_beyond_eof(blk, true);
   2005 
   2006     /* Create (A) */
   2007 
   2008     /* The creator field is optional, but may be useful for
   2009      * debugging / diagnostics */
   2010     creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL,
   2011                               &creator_items, NULL);
   2012     signature = cpu_to_le64(VHDX_FILE_SIGNATURE);
   2013     ret = blk_co_pwrite(blk, VHDX_FILE_ID_OFFSET, sizeof(signature), &signature,
   2014                         0);
   2015     if (ret < 0) {
   2016         error_setg_errno(errp, -ret, "Failed to write file signature");
   2017         goto delete_and_exit;
   2018     }
   2019     if (creator) {
   2020         ret = blk_co_pwrite(blk, VHDX_FILE_ID_OFFSET + sizeof(signature),
   2021                             creator_items * sizeof(gunichar2), creator, 0);
   2022         if (ret < 0) {
   2023             error_setg_errno(errp, -ret, "Failed to write creator field");
   2024             goto delete_and_exit;
   2025         }
   2026     }
   2027 
   2028 
   2029     /* Creates (B),(C) */
   2030     ret = vhdx_create_new_headers(blk, image_size, log_size);
   2031     if (ret < 0) {
   2032         error_setg_errno(errp, -ret, "Failed to write image headers");
   2033         goto delete_and_exit;
   2034     }
   2035 
   2036     /* Creates (D),(E),(G) explicitly. (F) created as by-product */
   2037     ret = vhdx_create_new_region_table(blk, image_size, block_size, 512,
   2038                                        log_size, use_zero_blocks, image_type,
   2039                                        &metadata_offset, errp);
   2040     if (ret < 0) {
   2041         goto delete_and_exit;
   2042     }
   2043 
   2044     /* Creates (H) */
   2045     ret = vhdx_create_new_metadata(blk, image_size, block_size, 512,
   2046                                    metadata_offset, image_type);
   2047     if (ret < 0) {
   2048         error_setg_errno(errp, -ret, "Failed to initialize metadata");
   2049         goto delete_and_exit;
   2050     }
   2051 
   2052     ret = 0;
   2053 delete_and_exit:
   2054     blk_unref(blk);
   2055     bdrv_unref(bs);
   2056     g_free(creator);
   2057     return ret;
   2058 }
   2059 
   2060 static int coroutine_fn vhdx_co_create_opts(BlockDriver *drv,
   2061                                             const char *filename,
   2062                                             QemuOpts *opts,
   2063                                             Error **errp)
   2064 {
   2065     BlockdevCreateOptions *create_options = NULL;
   2066     QDict *qdict;
   2067     Visitor *v;
   2068     BlockDriverState *bs = NULL;
   2069     int ret;
   2070 
   2071     static const QDictRenames opt_renames[] = {
   2072         { VHDX_BLOCK_OPT_LOG_SIZE,      "log-size" },
   2073         { VHDX_BLOCK_OPT_BLOCK_SIZE,    "block-size" },
   2074         { VHDX_BLOCK_OPT_ZERO,          "block-state-zero" },
   2075         { NULL, NULL },
   2076     };
   2077 
   2078     /* Parse options and convert legacy syntax */
   2079     qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vhdx_create_opts, true);
   2080 
   2081     if (!qdict_rename_keys(qdict, opt_renames, errp)) {
   2082         ret = -EINVAL;
   2083         goto fail;
   2084     }
   2085 
   2086     /* Create and open the file (protocol layer) */
   2087     ret = bdrv_create_file(filename, opts, errp);
   2088     if (ret < 0) {
   2089         goto fail;
   2090     }
   2091 
   2092     bs = bdrv_open(filename, NULL, NULL,
   2093                    BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
   2094     if (bs == NULL) {
   2095         ret = -EIO;
   2096         goto fail;
   2097     }
   2098 
   2099     /* Now get the QAPI type BlockdevCreateOptions */
   2100     qdict_put_str(qdict, "driver", "vhdx");
   2101     qdict_put_str(qdict, "file", bs->node_name);
   2102 
   2103     v = qobject_input_visitor_new_flat_confused(qdict, errp);
   2104     if (!v) {
   2105         ret = -EINVAL;
   2106         goto fail;
   2107     }
   2108 
   2109     visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
   2110     visit_free(v);
   2111     if (!create_options) {
   2112         ret = -EINVAL;
   2113         goto fail;
   2114     }
   2115 
   2116     /* Silently round up sizes:
   2117      * The image size is rounded to 512 bytes. Make the block and log size
   2118      * close to what was specified, but must be at least 1MB, and a multiple of
   2119      * 1 MB. Also respect VHDX_BLOCK_SIZE_MAX for block sizes. block_size = 0
   2120      * means auto, which is represented by a missing key in QAPI. */
   2121     assert(create_options->driver == BLOCKDEV_DRIVER_VHDX);
   2122     create_options->u.vhdx.size =
   2123         ROUND_UP(create_options->u.vhdx.size, BDRV_SECTOR_SIZE);
   2124 
   2125     if (create_options->u.vhdx.has_log_size) {
   2126         create_options->u.vhdx.log_size =
   2127             ROUND_UP(create_options->u.vhdx.log_size, MiB);
   2128     }
   2129     if (create_options->u.vhdx.has_block_size) {
   2130         create_options->u.vhdx.block_size =
   2131             ROUND_UP(create_options->u.vhdx.block_size, MiB);
   2132 
   2133         if (create_options->u.vhdx.block_size == 0) {
   2134             create_options->u.vhdx.has_block_size = false;
   2135         }
   2136         if (create_options->u.vhdx.block_size > VHDX_BLOCK_SIZE_MAX) {
   2137             create_options->u.vhdx.block_size = VHDX_BLOCK_SIZE_MAX;
   2138         }
   2139     }
   2140 
   2141     /* Create the vhdx image (format layer) */
   2142     ret = vhdx_co_create(create_options, errp);
   2143 
   2144 fail:
   2145     qobject_unref(qdict);
   2146     bdrv_unref(bs);
   2147     qapi_free_BlockdevCreateOptions(create_options);
   2148     return ret;
   2149 }
   2150 
   2151 /* If opened r/w, the VHDX driver will automatically replay the log,
   2152  * if one is present, inside the vhdx_open() call.
   2153  *
   2154  * If qemu-img check -r all is called, the image is automatically opened
   2155  * r/w and any log has already been replayed, so there is nothing (currently)
   2156  * for us to do here
   2157  */
   2158 static int coroutine_fn vhdx_co_check(BlockDriverState *bs,
   2159                                       BdrvCheckResult *result,
   2160                                       BdrvCheckMode fix)
   2161 {
   2162     BDRVVHDXState *s = bs->opaque;
   2163 
   2164     if (s->log_replayed_on_open) {
   2165         result->corruptions_fixed++;
   2166     }
   2167 
   2168     vhdx_check_bat_entries(bs, &result->corruptions);
   2169 
   2170     return 0;
   2171 }
   2172 
   2173 static int vhdx_has_zero_init(BlockDriverState *bs)
   2174 {
   2175     BDRVVHDXState *s = bs->opaque;
   2176     int state;
   2177 
   2178     /*
   2179      * Check the subformat: Fixed images have all BAT entries present,
   2180      * dynamic images have none (right after creation).  It is
   2181      * therefore enough to check the first BAT entry.
   2182      */
   2183     if (!s->bat_entries) {
   2184         return 1;
   2185     }
   2186 
   2187     state = s->bat[0] & VHDX_BAT_STATE_BIT_MASK;
   2188     if (state == PAYLOAD_BLOCK_FULLY_PRESENT) {
   2189         /* Fixed subformat */
   2190         return bdrv_has_zero_init(bs->file->bs);
   2191     }
   2192 
   2193     /* Dynamic subformat */
   2194     return 1;
   2195 }
   2196 
   2197 static QemuOptsList vhdx_create_opts = {
   2198     .name = "vhdx-create-opts",
   2199     .head = QTAILQ_HEAD_INITIALIZER(vhdx_create_opts.head),
   2200     .desc = {
   2201         {
   2202            .name = BLOCK_OPT_SIZE,
   2203            .type = QEMU_OPT_SIZE,
   2204            .help = "Virtual disk size; max of 64TB."
   2205        },
   2206        {
   2207            .name = VHDX_BLOCK_OPT_LOG_SIZE,
   2208            .type = QEMU_OPT_SIZE,
   2209            .def_value_str = stringify(DEFAULT_LOG_SIZE),
   2210            .help = "Log size; min 1MB."
   2211        },
   2212        {
   2213            .name = VHDX_BLOCK_OPT_BLOCK_SIZE,
   2214            .type = QEMU_OPT_SIZE,
   2215            .def_value_str = stringify(0),
   2216            .help = "Block Size; min 1MB, max 256MB. "
   2217                    "0 means auto-calculate based on image size."
   2218        },
   2219        {
   2220            .name = BLOCK_OPT_SUBFMT,
   2221            .type = QEMU_OPT_STRING,
   2222            .help = "VHDX format type, can be either 'dynamic' or 'fixed'. "
   2223                    "Default is 'dynamic'."
   2224        },
   2225        {
   2226            .name = VHDX_BLOCK_OPT_ZERO,
   2227            .type = QEMU_OPT_BOOL,
   2228            .help = "Force use of payload blocks of type 'ZERO'. "
   2229                    "Non-standard, but default.  Do not set to 'off' when "
   2230                    "using 'qemu-img convert' with subformat=dynamic."
   2231        },
   2232        { NULL }
   2233     }
   2234 };
   2235 
   2236 static BlockDriver bdrv_vhdx = {
   2237     .format_name            = "vhdx",
   2238     .instance_size          = sizeof(BDRVVHDXState),
   2239     .bdrv_probe             = vhdx_probe,
   2240     .bdrv_open              = vhdx_open,
   2241     .bdrv_close             = vhdx_close,
   2242     .bdrv_reopen_prepare    = vhdx_reopen_prepare,
   2243     .bdrv_child_perm        = bdrv_default_perms,
   2244     .bdrv_co_readv          = vhdx_co_readv,
   2245     .bdrv_co_writev         = vhdx_co_writev,
   2246     .bdrv_co_create         = vhdx_co_create,
   2247     .bdrv_co_create_opts    = vhdx_co_create_opts,
   2248     .bdrv_get_info          = vhdx_get_info,
   2249     .bdrv_co_check          = vhdx_co_check,
   2250     .bdrv_has_zero_init     = vhdx_has_zero_init,
   2251 
   2252     .is_format              = true,
   2253     .create_opts            = &vhdx_create_opts,
   2254 };
   2255 
/* Register the VHDX block driver description (bdrv_vhdx) */
static void bdrv_vhdx_init(void)
{
    bdrv_register(&bdrv_vhdx);
}

/* NOTE(review): block_init() presumably arranges for bdrv_vhdx_init() to be
 * run during block module initialisation - confirm against qemu/module.h */
block_init(bdrv_vhdx_init);