qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

qcow2-refcount.c (132286B)


      1 /*
      2  * Block driver for the QCOW version 2 format
      3  *
      4  * Copyright (c) 2004-2006 Fabrice Bellard
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a copy
      7  * of this software and associated documentation files (the "Software"), to deal
      8  * in the Software without restriction, including without limitation the rights
      9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     10  * copies of the Software, and to permit persons to whom the Software is
     11  * furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice shall be included in
     14  * all copies or substantial portions of the Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     22  * THE SOFTWARE.
     23  */
     24 
     25 #include "qemu/osdep.h"
     26 #include "qapi/error.h"
     27 #include "qcow2.h"
     28 #include "qemu/range.h"
     29 #include "qemu/bswap.h"
     30 #include "qemu/cutils.h"
     31 #include "qemu/memalign.h"
     32 #include "trace.h"
     33 
     34 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
     35                                     uint64_t max);
     36 
     37 G_GNUC_WARN_UNUSED_RESULT
     38 static int update_refcount(BlockDriverState *bs,
     39                            int64_t offset, int64_t length, uint64_t addend,
     40                            bool decrease, enum qcow2_discard_type type);
     41 
     42 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
     43 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
     44 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
     45 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
     46 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
     47 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
     48 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);
     49 
     50 static void set_refcount_ro0(void *refcount_array, uint64_t index,
     51                              uint64_t value);
     52 static void set_refcount_ro1(void *refcount_array, uint64_t index,
     53                              uint64_t value);
     54 static void set_refcount_ro2(void *refcount_array, uint64_t index,
     55                              uint64_t value);
     56 static void set_refcount_ro3(void *refcount_array, uint64_t index,
     57                              uint64_t value);
     58 static void set_refcount_ro4(void *refcount_array, uint64_t index,
     59                              uint64_t value);
     60 static void set_refcount_ro5(void *refcount_array, uint64_t index,
     61                              uint64_t value);
     62 static void set_refcount_ro6(void *refcount_array, uint64_t index,
     63                              uint64_t value);
     64 
     65 
     66 static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
     67     &get_refcount_ro0,
     68     &get_refcount_ro1,
     69     &get_refcount_ro2,
     70     &get_refcount_ro3,
     71     &get_refcount_ro4,
     72     &get_refcount_ro5,
     73     &get_refcount_ro6
     74 };
     75 
     76 static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
     77     &set_refcount_ro0,
     78     &set_refcount_ro1,
     79     &set_refcount_ro2,
     80     &set_refcount_ro3,
     81     &set_refcount_ro4,
     82     &set_refcount_ro5,
     83     &set_refcount_ro6
     84 };
     85 
     86 
     87 /*********************************************************/
     88 /* refcount handling */
     89 
     90 static void update_max_refcount_table_index(BDRVQcow2State *s)
     91 {
     92     unsigned i = s->refcount_table_size - 1;
     93     while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
     94         i--;
     95     }
     96     /* Set s->max_refcount_table_index to the index of the last used entry */
     97     s->max_refcount_table_index = i;
     98 }
     99 
    100 int coroutine_fn qcow2_refcount_init(BlockDriverState *bs)
    101 {
    102     BDRVQcow2State *s = bs->opaque;
    103     unsigned int refcount_table_size2, i;
    104     int ret;
    105 
    106     assert(s->refcount_order >= 0 && s->refcount_order <= 6);
    107 
    108     s->get_refcount = get_refcount_funcs[s->refcount_order];
    109     s->set_refcount = set_refcount_funcs[s->refcount_order];
    110 
    111     assert(s->refcount_table_size <= INT_MAX / REFTABLE_ENTRY_SIZE);
    112     refcount_table_size2 = s->refcount_table_size * REFTABLE_ENTRY_SIZE;
    113     s->refcount_table = g_try_malloc(refcount_table_size2);
    114 
    115     if (s->refcount_table_size > 0) {
    116         if (s->refcount_table == NULL) {
    117             ret = -ENOMEM;
    118             goto fail;
    119         }
    120         BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
    121         ret = bdrv_co_pread(bs->file, s->refcount_table_offset,
    122                             refcount_table_size2, s->refcount_table, 0);
    123         if (ret < 0) {
    124             goto fail;
    125         }
    126         for(i = 0; i < s->refcount_table_size; i++)
    127             be64_to_cpus(&s->refcount_table[i]);
    128         update_max_refcount_table_index(s);
    129     }
    130     return 0;
    131  fail:
    132     return ret;
    133 }
    134 
    135 void qcow2_refcount_close(BlockDriverState *bs)
    136 {
    137     BDRVQcow2State *s = bs->opaque;
    138     g_free(s->refcount_table);
    139 }
    140 
    141 
    142 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
    143 {
    144     return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
    145 }
    146 
    147 static void set_refcount_ro0(void *refcount_array, uint64_t index,
    148                              uint64_t value)
    149 {
    150     assert(!(value >> 1));
    151     ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
    152     ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
    153 }
    154 
    155 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
    156 {
    157     return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
    158            & 0x3;
    159 }
    160 
    161 static void set_refcount_ro1(void *refcount_array, uint64_t index,
    162                              uint64_t value)
    163 {
    164     assert(!(value >> 2));
    165     ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
    166     ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
    167 }
    168 
    169 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
    170 {
    171     return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
    172            & 0xf;
    173 }
    174 
    175 static void set_refcount_ro2(void *refcount_array, uint64_t index,
    176                              uint64_t value)
    177 {
    178     assert(!(value >> 4));
    179     ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
    180     ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
    181 }
    182 
    183 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
    184 {
    185     return ((const uint8_t *)refcount_array)[index];
    186 }
    187 
    188 static void set_refcount_ro3(void *refcount_array, uint64_t index,
    189                              uint64_t value)
    190 {
    191     assert(!(value >> 8));
    192     ((uint8_t *)refcount_array)[index] = value;
    193 }
    194 
    195 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
    196 {
    197     return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
    198 }
    199 
    200 static void set_refcount_ro4(void *refcount_array, uint64_t index,
    201                              uint64_t value)
    202 {
    203     assert(!(value >> 16));
    204     ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
    205 }
    206 
    207 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
    208 {
    209     return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
    210 }
    211 
    212 static void set_refcount_ro5(void *refcount_array, uint64_t index,
    213                              uint64_t value)
    214 {
    215     assert(!(value >> 32));
    216     ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
    217 }
    218 
    219 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
    220 {
    221     return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
    222 }
    223 
    224 static void set_refcount_ro6(void *refcount_array, uint64_t index,
    225                              uint64_t value)
    226 {
    227     ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
    228 }
    229 
    230 
    231 static int load_refcount_block(BlockDriverState *bs,
    232                                int64_t refcount_block_offset,
    233                                void **refcount_block)
    234 {
    235     BDRVQcow2State *s = bs->opaque;
    236 
    237     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    238     return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
    239                            refcount_block);
    240 }
    241 
    242 /*
    243  * Retrieves the refcount of the cluster given by its index and stores it in
    244  * *refcount. Returns 0 on success and -errno on failure.
    245  */
    246 int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
    247                        uint64_t *refcount)
    248 {
    249     BDRVQcow2State *s = bs->opaque;
    250     uint64_t refcount_table_index, block_index;
    251     int64_t refcount_block_offset;
    252     int ret;
    253     void *refcount_block;
    254 
    255     refcount_table_index = cluster_index >> s->refcount_block_bits;
    256     if (refcount_table_index >= s->refcount_table_size) {
    257         *refcount = 0;
    258         return 0;
    259     }
    260     refcount_block_offset =
    261         s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
    262     if (!refcount_block_offset) {
    263         *refcount = 0;
    264         return 0;
    265     }
    266 
    267     if (offset_into_cluster(s, refcount_block_offset)) {
    268         qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
    269                                 " unaligned (reftable index: %#" PRIx64 ")",
    270                                 refcount_block_offset, refcount_table_index);
    271         return -EIO;
    272     }
    273 
    274     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
    275                           &refcount_block);
    276     if (ret < 0) {
    277         return ret;
    278     }
    279 
    280     block_index = cluster_index & (s->refcount_block_size - 1);
    281     *refcount = s->get_refcount(refcount_block, block_index);
    282 
    283     qcow2_cache_put(s->refcount_block_cache, &refcount_block);
    284 
    285     return 0;
    286 }
    287 
    288 /* Checks if two offsets are described by the same refcount block */
    289 static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
    290     uint64_t offset_b)
    291 {
    292     uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
    293     uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
    294 
    295     return (block_a == block_b);
    296 }
    297 
    298 /*
    299  * Loads a refcount block. If it doesn't exist yet, it is allocated first
    300  * (including growing the refcount table if needed).
    301  *
    302  * Returns 0 on success or -errno in error case
    303  */
    304 static int alloc_refcount_block(BlockDriverState *bs,
    305                                 int64_t cluster_index, void **refcount_block)
    306 {
    307     BDRVQcow2State *s = bs->opaque;
    308     unsigned int refcount_table_index;
    309     int64_t ret;
    310 
    311     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
    312 
    313     /* Find the refcount block for the given cluster */
    314     refcount_table_index = cluster_index >> s->refcount_block_bits;
    315 
    316     if (refcount_table_index < s->refcount_table_size) {
    317 
    318         uint64_t refcount_block_offset =
    319             s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
    320 
    321         /* If it's already there, we're done */
    322         if (refcount_block_offset) {
    323             if (offset_into_cluster(s, refcount_block_offset)) {
    324                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
    325                                         PRIx64 " unaligned (reftable index: "
    326                                         "%#x)", refcount_block_offset,
    327                                         refcount_table_index);
    328                 return -EIO;
    329             }
    330 
    331              return load_refcount_block(bs, refcount_block_offset,
    332                                         refcount_block);
    333         }
    334     }
    335 
    336     /*
    337      * If we came here, we need to allocate something. Something is at least
    338      * a cluster for the new refcount block. It may also include a new refcount
    339      * table if the old refcount table is too small.
    340      *
    341      * Note that allocating clusters here needs some special care:
    342      *
    343      * - We can't use the normal qcow2_alloc_clusters(), it would try to
    344      *   increase the refcount and very likely we would end up with an endless
    345      *   recursion. Instead we must place the refcount blocks in a way that
    346      *   they can describe them themselves.
    347      *
    348      * - We need to consider that at this point we are inside update_refcounts
    349      *   and potentially doing an initial refcount increase. This means that
    350      *   some clusters have already been allocated by the caller, but their
    351      *   refcount isn't accurate yet. If we allocate clusters for metadata, we
    352      *   need to return -EAGAIN to signal the caller that it needs to restart
    353      *   the search for free clusters.
    354      *
    355      * - alloc_clusters_noref and qcow2_free_clusters may load a different
    356      *   refcount block into the cache
    357      */
    358 
    359     *refcount_block = NULL;
    360 
    361     /* We write to the refcount table, so we might depend on L2 tables */
    362     ret = qcow2_cache_flush(bs, s->l2_table_cache);
    363     if (ret < 0) {
    364         return ret;
    365     }
    366 
    367     /* Allocate the refcount block itself and mark it as used */
    368     int64_t new_block = alloc_clusters_noref(bs, s->cluster_size, INT64_MAX);
    369     if (new_block < 0) {
    370         return new_block;
    371     }
    372 
    373     /* The offset must fit in the offset field of the refcount table entry */
    374     assert((new_block & REFT_OFFSET_MASK) == new_block);
    375 
    376     /* If we're allocating the block at offset 0 then something is wrong */
    377     if (new_block == 0) {
    378         qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
    379                                 "allocation of refcount block at offset 0");
    380         return -EIO;
    381     }
    382 
    383 #ifdef DEBUG_ALLOC2
    384     fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
    385         " at %" PRIx64 "\n",
    386         refcount_table_index, cluster_index << s->cluster_bits, new_block);
    387 #endif
    388 
    389     if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
    390         /* Zero the new refcount block before updating it */
    391         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
    392                                     refcount_block);
    393         if (ret < 0) {
    394             goto fail;
    395         }
    396 
    397         memset(*refcount_block, 0, s->cluster_size);
    398 
    399         /* The block describes itself, need to update the cache */
    400         int block_index = (new_block >> s->cluster_bits) &
    401             (s->refcount_block_size - 1);
    402         s->set_refcount(*refcount_block, block_index, 1);
    403     } else {
    404         /* Described somewhere else. This can recurse at most twice before we
    405          * arrive at a block that describes itself. */
    406         ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
    407                               QCOW2_DISCARD_NEVER);
    408         if (ret < 0) {
    409             goto fail;
    410         }
    411 
    412         ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    413         if (ret < 0) {
    414             goto fail;
    415         }
    416 
    417         /* Initialize the new refcount block only after updating its refcount,
    418          * update_refcount uses the refcount cache itself */
    419         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
    420                                     refcount_block);
    421         if (ret < 0) {
    422             goto fail;
    423         }
    424 
    425         memset(*refcount_block, 0, s->cluster_size);
    426     }
    427 
    428     /* Now the new refcount block needs to be written to disk */
    429     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    430     qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
    431     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    432     if (ret < 0) {
    433         goto fail;
    434     }
    435 
    436     /* If the refcount table is big enough, just hook the block up there */
    437     if (refcount_table_index < s->refcount_table_size) {
    438         uint64_t data64 = cpu_to_be64(new_block);
    439         BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
    440         ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
    441                                refcount_table_index * REFTABLE_ENTRY_SIZE,
    442             sizeof(data64), &data64, 0);
    443         if (ret < 0) {
    444             goto fail;
    445         }
    446 
    447         s->refcount_table[refcount_table_index] = new_block;
    448         /* If there's a hole in s->refcount_table then it can happen
    449          * that refcount_table_index < s->max_refcount_table_index */
    450         s->max_refcount_table_index =
    451             MAX(s->max_refcount_table_index, refcount_table_index);
    452 
    453         /* The new refcount block may be where the caller intended to put its
    454          * data, so let it restart the search. */
    455         return -EAGAIN;
    456     }
    457 
    458     qcow2_cache_put(s->refcount_block_cache, refcount_block);
    459 
    460     /*
    461      * If we come here, we need to grow the refcount table. Again, a new
    462      * refcount table needs some space and we can't simply allocate to avoid
    463      * endless recursion.
    464      *
    465      * Therefore let's grab new refcount blocks at the end of the image, which
    466      * will describe themselves and the new refcount table. This way we can
    467      * reference them only in the new table and do the switch to the new
    468      * refcount table at once without producing an inconsistent state in
    469      * between.
    470      */
    471     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
    472 
    473     /* Calculate the number of refcount blocks needed so far; this will be the
    474      * basis for calculating the index of the first cluster used for the
    475      * self-describing refcount structures which we are about to create.
    476      *
    477      * Because we reached this point, there cannot be any refcount entries for
    478      * cluster_index or higher indices yet. However, because new_block has been
    479      * allocated to describe that cluster (and it will assume this role later
    480      * on), we cannot use that index; also, new_block may actually have a higher
    481      * cluster index than cluster_index, so it needs to be taken into account
    482      * here (and 1 needs to be added to its value because that cluster is used).
    483      */
    484     uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
    485                                             (new_block >> s->cluster_bits) + 1),
    486                                         s->refcount_block_size);
    487 
    488     /* Create the new refcount table and blocks */
    489     uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
    490         s->cluster_size;
    491 
    492     ret = qcow2_refcount_area(bs, meta_offset, 0, false,
    493                               refcount_table_index, new_block);
    494     if (ret < 0) {
    495         return ret;
    496     }
    497 
    498     ret = load_refcount_block(bs, new_block, refcount_block);
    499     if (ret < 0) {
    500         return ret;
    501     }
    502 
    503     /* If we were trying to do the initial refcount update for some cluster
    504      * allocation, we might have used the same clusters to store newly
    505      * allocated metadata. Make the caller search some new space. */
    506     return -EAGAIN;
    507 
    508 fail:
    509     if (*refcount_block != NULL) {
    510         qcow2_cache_put(s->refcount_block_cache, refcount_block);
    511     }
    512     return ret;
    513 }
    514 
    515 /*
    516  * Starting at @start_offset, this function creates new self-covering refcount
    517  * structures: A new refcount table and refcount blocks which cover all of
    518  * themselves, and a number of @additional_clusters beyond their end.
    519  * @start_offset must be at the end of the image file, that is, there must be
    520  * only empty space beyond it.
    521  * If @exact_size is false, the refcount table will have 50 % more entries than
    522  * necessary so it will not need to grow again soon.
    523  * If @new_refblock_offset is not zero, it contains the offset of a refcount
    524  * block that should be entered into the new refcount table at index
    525  * @new_refblock_index.
    526  *
    527  * Returns: The offset after the new refcount structures (i.e. where the
    528  *          @additional_clusters may be placed) on success, -errno on error.
    529  */
    530 int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
    531                             uint64_t additional_clusters, bool exact_size,
    532                             int new_refblock_index,
    533                             uint64_t new_refblock_offset)
    534 {
    535     BDRVQcow2State *s = bs->opaque;
    536     uint64_t total_refblock_count_u64, additional_refblock_count;
    537     int total_refblock_count, table_size, area_reftable_index, table_clusters;
    538     int i;
    539     uint64_t table_offset, block_offset, end_offset;
    540     int ret;
    541     uint64_t *new_table;
    542 
    543     assert(!(start_offset % s->cluster_size));
    544 
    545     qcow2_refcount_metadata_size(start_offset / s->cluster_size +
    546                                  additional_clusters,
    547                                  s->cluster_size, s->refcount_order,
    548                                  !exact_size, &total_refblock_count_u64);
    549     if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
    550         return -EFBIG;
    551     }
    552     total_refblock_count = total_refblock_count_u64;
    553 
    554     /* Index in the refcount table of the first refcount block to cover the area
    555      * of refcount structures we are about to create; we know that
    556      * @total_refblock_count can cover @start_offset, so this will definitely
    557      * fit into an int. */
    558     area_reftable_index = (start_offset / s->cluster_size) /
    559                           s->refcount_block_size;
    560 
    561     if (exact_size) {
    562         table_size = total_refblock_count;
    563     } else {
    564         table_size = total_refblock_count +
    565                      DIV_ROUND_UP(total_refblock_count, 2);
    566     }
    567     /* The qcow2 file can only store the reftable size in number of clusters */
    568     table_size = ROUND_UP(table_size, s->cluster_size / REFTABLE_ENTRY_SIZE);
    569     table_clusters = (table_size * REFTABLE_ENTRY_SIZE) / s->cluster_size;
    570 
    571     if (table_size > QCOW_MAX_REFTABLE_SIZE) {
    572         return -EFBIG;
    573     }
    574 
    575     new_table = g_try_new0(uint64_t, table_size);
    576 
    577     assert(table_size > 0);
    578     if (new_table == NULL) {
    579         ret = -ENOMEM;
    580         goto fail;
    581     }
    582 
    583     /* Fill the new refcount table */
    584     if (table_size > s->max_refcount_table_index) {
    585         /* We're actually growing the reftable */
    586         memcpy(new_table, s->refcount_table,
    587                (s->max_refcount_table_index + 1) * REFTABLE_ENTRY_SIZE);
    588     } else {
    589         /* Improbable case: We're shrinking the reftable. However, the caller
    590          * has assured us that there is only empty space beyond @start_offset,
    591          * so we can simply drop all of the refblocks that won't fit into the
    592          * new reftable. */
    593         memcpy(new_table, s->refcount_table, table_size * REFTABLE_ENTRY_SIZE);
    594     }
    595 
    596     if (new_refblock_offset) {
    597         assert(new_refblock_index < total_refblock_count);
    598         new_table[new_refblock_index] = new_refblock_offset;
    599     }
    600 
    601     /* Count how many new refblocks we have to create */
    602     additional_refblock_count = 0;
    603     for (i = area_reftable_index; i < total_refblock_count; i++) {
    604         if (!new_table[i]) {
    605             additional_refblock_count++;
    606         }
    607     }
    608 
    609     table_offset = start_offset + additional_refblock_count * s->cluster_size;
    610     end_offset = table_offset + table_clusters * s->cluster_size;
    611 
    612     /* Fill the refcount blocks, and create new ones, if necessary */
    613     block_offset = start_offset;
    614     for (i = area_reftable_index; i < total_refblock_count; i++) {
    615         void *refblock_data;
    616         uint64_t first_offset_covered;
    617 
    618         /* Reuse an existing refblock if possible, create a new one otherwise */
    619         if (new_table[i]) {
    620             ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
    621                                   &refblock_data);
    622             if (ret < 0) {
    623                 goto fail;
    624             }
    625         } else {
    626             ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
    627                                         block_offset, &refblock_data);
    628             if (ret < 0) {
    629                 goto fail;
    630             }
    631             memset(refblock_data, 0, s->cluster_size);
    632             qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
    633                                          refblock_data);
    634 
    635             new_table[i] = block_offset;
    636             block_offset += s->cluster_size;
    637         }
    638 
    639         /* First host offset covered by this refblock */
    640         first_offset_covered = (uint64_t)i * s->refcount_block_size *
    641                                s->cluster_size;
    642         if (first_offset_covered < end_offset) {
    643             int j, end_index;
    644 
    645             /* Set the refcount of all of the new refcount structures to 1 */
    646 
    647             if (first_offset_covered < start_offset) {
    648                 assert(i == area_reftable_index);
    649                 j = (start_offset - first_offset_covered) / s->cluster_size;
    650                 assert(j < s->refcount_block_size);
    651             } else {
    652                 j = 0;
    653             }
    654 
    655             end_index = MIN((end_offset - first_offset_covered) /
    656                             s->cluster_size,
    657                             s->refcount_block_size);
    658 
    659             for (; j < end_index; j++) {
    660                 /* The caller guaranteed us this space would be empty */
    661                 assert(s->get_refcount(refblock_data, j) == 0);
    662                 s->set_refcount(refblock_data, j, 1);
    663             }
    664 
    665             qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
    666                                          refblock_data);
    667         }
    668 
    669         qcow2_cache_put(s->refcount_block_cache, &refblock_data);
    670     }
    671 
    672     assert(block_offset == table_offset);
    673 
    674     /* Write refcount blocks to disk */
    675     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    676     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    677     if (ret < 0) {
    678         goto fail;
    679     }
    680 
    681     /* Write refcount table to disk */
    682     for (i = 0; i < total_refblock_count; i++) {
    683         cpu_to_be64s(&new_table[i]);
    684     }
    685 
    686     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    687     ret = bdrv_pwrite_sync(bs->file, table_offset,
    688                            table_size * REFTABLE_ENTRY_SIZE, new_table, 0);
    689     if (ret < 0) {
    690         goto fail;
    691     }
    692 
    693     for (i = 0; i < total_refblock_count; i++) {
    694         be64_to_cpus(&new_table[i]);
    695     }
    696 
    697     /* Hook up the new refcount table in the qcow2 header */
    698     struct QEMU_PACKED {
    699         uint64_t d64;
    700         uint32_t d32;
    701     } data;
    702     data.d64 = cpu_to_be64(table_offset);
    703     data.d32 = cpu_to_be32(table_clusters);
    704     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    705     ret = bdrv_pwrite_sync(bs->file,
    706                            offsetof(QCowHeader, refcount_table_offset),
    707                            sizeof(data), &data, 0);
    708     if (ret < 0) {
    709         goto fail;
    710     }
    711 
    712     /* And switch it in memory */
    713     uint64_t old_table_offset = s->refcount_table_offset;
    714     uint64_t old_table_size = s->refcount_table_size;
    715 
    716     g_free(s->refcount_table);
    717     s->refcount_table = new_table;
    718     s->refcount_table_size = table_size;
    719     s->refcount_table_offset = table_offset;
    720     update_max_refcount_table_index(s);
    721 
    722     /* Free old table. */
    723     qcow2_free_clusters(bs, old_table_offset,
    724                         old_table_size * REFTABLE_ENTRY_SIZE,
    725                         QCOW2_DISCARD_OTHER);
    726 
    727     return end_offset;
    728 
    729 fail:
    730     g_free(new_table);
    731     return ret;
    732 }
    733 
    734 void qcow2_process_discards(BlockDriverState *bs, int ret)
    735 {
    736     BDRVQcow2State *s = bs->opaque;
    737     Qcow2DiscardRegion *d, *next;
    738 
    739     QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
    740         QTAILQ_REMOVE(&s->discards, d, next);
    741 
    742         /* Discard is optional, ignore the return value */
    743         if (ret >= 0) {
    744             int r2 = bdrv_pdiscard(bs->file, d->offset, d->bytes);
    745             if (r2 < 0) {
    746                 trace_qcow2_process_discards_failed_region(d->offset, d->bytes,
    747                                                            r2);
    748             }
    749         }
    750 
    751         g_free(d);
    752     }
    753 }
    754 
    755 static void update_refcount_discard(BlockDriverState *bs,
    756                                     uint64_t offset, uint64_t length)
    757 {
    758     BDRVQcow2State *s = bs->opaque;
    759     Qcow2DiscardRegion *d, *p, *next;
    760 
    761     QTAILQ_FOREACH(d, &s->discards, next) {
    762         uint64_t new_start = MIN(offset, d->offset);
    763         uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
    764 
    765         if (new_end - new_start <= length + d->bytes) {
    766             /* There can't be any overlap, areas ending up here have no
    767              * references any more and therefore shouldn't get freed another
    768              * time. */
    769             assert(d->bytes + length == new_end - new_start);
    770             d->offset = new_start;
    771             d->bytes = new_end - new_start;
    772             goto found;
    773         }
    774     }
    775 
    776     d = g_malloc(sizeof(*d));
    777     *d = (Qcow2DiscardRegion) {
    778         .bs     = bs,
    779         .offset = offset,
    780         .bytes  = length,
    781     };
    782     QTAILQ_INSERT_TAIL(&s->discards, d, next);
    783 
    784 found:
    785     /* Merge discard requests if they are adjacent now */
    786     QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
    787         if (p == d
    788             || p->offset > d->offset + d->bytes
    789             || d->offset > p->offset + p->bytes)
    790         {
    791             continue;
    792         }
    793 
    794         /* Still no overlap possible */
    795         assert(p->offset == d->offset + d->bytes
    796             || d->offset == p->offset + p->bytes);
    797 
    798         QTAILQ_REMOVE(&s->discards, p, next);
    799         d->offset = MIN(d->offset, p->offset);
    800         d->bytes += p->bytes;
    801         g_free(p);
    802     }
    803 }
    804 
    805 /* XXX: cache several refcount block clusters ? */
    806 /* @addend is the absolute value of the addend; if @decrease is set, @addend
    807  * will be subtracted from the current refcount, otherwise it will be added */
    808 static int update_refcount(BlockDriverState *bs,
    809                            int64_t offset,
    810                            int64_t length,
    811                            uint64_t addend,
    812                            bool decrease,
    813                            enum qcow2_discard_type type)
    814 {
    815     BDRVQcow2State *s = bs->opaque;
    816     int64_t start, last, cluster_offset;
    817     void *refcount_block = NULL;
    818     int64_t old_table_index = -1;
    819     int ret;
    820 
    821 #ifdef DEBUG_ALLOC2
    822     fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
    823             " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
    824             addend);
    825 #endif
    826     if (length < 0) {
    827         return -EINVAL;
    828     } else if (length == 0) {
    829         return 0;
    830     }
    831 
    832     if (decrease) {
    833         qcow2_cache_set_dependency(bs, s->refcount_block_cache,
    834             s->l2_table_cache);
    835     }
    836 
    837     start = start_of_cluster(s, offset);
    838     last = start_of_cluster(s, offset + length - 1);
    839     for(cluster_offset = start; cluster_offset <= last;
    840         cluster_offset += s->cluster_size)
    841     {
    842         int block_index;
    843         uint64_t refcount;
    844         int64_t cluster_index = cluster_offset >> s->cluster_bits;
    845         int64_t table_index = cluster_index >> s->refcount_block_bits;
    846 
    847         /* Load the refcount block and allocate it if needed */
    848         if (table_index != old_table_index) {
    849             if (refcount_block) {
    850                 qcow2_cache_put(s->refcount_block_cache, &refcount_block);
    851             }
    852             ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
    853             /* If the caller needs to restart the search for free clusters,
    854              * try the same ones first to see if they're still free. */
    855             if (ret == -EAGAIN) {
    856                 if (s->free_cluster_index > (start >> s->cluster_bits)) {
    857                     s->free_cluster_index = (start >> s->cluster_bits);
    858                 }
    859             }
    860             if (ret < 0) {
    861                 goto fail;
    862             }
    863         }
    864         old_table_index = table_index;
    865 
    866         qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
    867 
    868         /* we can update the count and save it */
    869         block_index = cluster_index & (s->refcount_block_size - 1);
    870 
    871         refcount = s->get_refcount(refcount_block, block_index);
    872         if (decrease ? (refcount - addend > refcount)
    873                      : (refcount + addend < refcount ||
    874                         refcount + addend > s->refcount_max))
    875         {
    876             ret = -EINVAL;
    877             goto fail;
    878         }
    879         if (decrease) {
    880             refcount -= addend;
    881         } else {
    882             refcount += addend;
    883         }
    884         if (refcount == 0 && cluster_index < s->free_cluster_index) {
    885             s->free_cluster_index = cluster_index;
    886         }
    887         s->set_refcount(refcount_block, block_index, refcount);
    888 
    889         if (refcount == 0) {
    890             void *table;
    891 
    892             table = qcow2_cache_is_table_offset(s->refcount_block_cache,
    893                                                 offset);
    894             if (table != NULL) {
    895                 qcow2_cache_put(s->refcount_block_cache, &refcount_block);
    896                 old_table_index = -1;
    897                 qcow2_cache_discard(s->refcount_block_cache, table);
    898             }
    899 
    900             table = qcow2_cache_is_table_offset(s->l2_table_cache, offset);
    901             if (table != NULL) {
    902                 qcow2_cache_discard(s->l2_table_cache, table);
    903             }
    904 
    905             if (s->discard_passthrough[type]) {
    906                 update_refcount_discard(bs, cluster_offset, s->cluster_size);
    907             }
    908         }
    909     }
    910 
    911     ret = 0;
    912 fail:
    913     if (!s->cache_discards) {
    914         qcow2_process_discards(bs, ret);
    915     }
    916 
    917     /* Write last changed block to disk */
    918     if (refcount_block) {
    919         qcow2_cache_put(s->refcount_block_cache, &refcount_block);
    920     }
    921 
    922     /*
    923      * Try do undo any updates if an error is returned (This may succeed in
    924      * some cases like ENOSPC for allocating a new refcount block)
    925      */
    926     if (ret < 0) {
    927         int dummy;
    928         dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
    929                                 !decrease, QCOW2_DISCARD_NEVER);
    930         (void)dummy;
    931     }
    932 
    933     return ret;
    934 }
    935 
    936 /*
    937  * Increases or decreases the refcount of a given cluster.
    938  *
    939  * @addend is the absolute value of the addend; if @decrease is set, @addend
    940  * will be subtracted from the current refcount, otherwise it will be added.
    941  *
    942  * On success 0 is returned; on failure -errno is returned.
    943  */
    944 int qcow2_update_cluster_refcount(BlockDriverState *bs,
    945                                   int64_t cluster_index,
    946                                   uint64_t addend, bool decrease,
    947                                   enum qcow2_discard_type type)
    948 {
    949     BDRVQcow2State *s = bs->opaque;
    950     int ret;
    951 
    952     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
    953                           decrease, type);
    954     if (ret < 0) {
    955         return ret;
    956     }
    957 
    958     return 0;
    959 }
    960 
    961 
    962 
    963 /*********************************************************/
    964 /* cluster allocation functions */
    965 
    966 
    967 
    968 /* return < 0 if error */
    969 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
    970                                     uint64_t max)
    971 {
    972     BDRVQcow2State *s = bs->opaque;
    973     uint64_t i, nb_clusters, refcount;
    974     int ret;
    975 
    976     /* We can't allocate clusters if they may still be queued for discard. */
    977     if (s->cache_discards) {
    978         qcow2_process_discards(bs, 0);
    979     }
    980 
    981     nb_clusters = size_to_clusters(s, size);
    982 retry:
    983     for(i = 0; i < nb_clusters; i++) {
    984         uint64_t next_cluster_index = s->free_cluster_index++;
    985         ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);
    986 
    987         if (ret < 0) {
    988             return ret;
    989         } else if (refcount != 0) {
    990             goto retry;
    991         }
    992     }
    993 
    994     /* Make sure that all offsets in the "allocated" range are representable
    995      * in the requested max */
    996     if (s->free_cluster_index > 0 &&
    997         s->free_cluster_index - 1 > (max >> s->cluster_bits))
    998     {
    999         return -EFBIG;
   1000     }
   1001 
   1002 #ifdef DEBUG_ALLOC2
   1003     fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
   1004             size,
   1005             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
   1006 #endif
   1007     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
   1008 }
   1009 
   1010 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
   1011 {
   1012     int64_t offset;
   1013     int ret;
   1014 
   1015     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
   1016     do {
   1017         offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET);
   1018         if (offset < 0) {
   1019             return offset;
   1020         }
   1021 
   1022         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
   1023     } while (ret == -EAGAIN);
   1024 
   1025     if (ret < 0) {
   1026         return ret;
   1027     }
   1028 
   1029     return offset;
   1030 }
   1031 
   1032 int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
   1033                                 int64_t nb_clusters)
   1034 {
   1035     BDRVQcow2State *s = bs->opaque;
   1036     uint64_t cluster_index, refcount;
   1037     uint64_t i;
   1038     int ret;
   1039 
   1040     assert(nb_clusters >= 0);
   1041     if (nb_clusters == 0) {
   1042         return 0;
   1043     }
   1044 
   1045     do {
   1046         /* Check how many clusters there are free */
   1047         cluster_index = offset >> s->cluster_bits;
   1048         for(i = 0; i < nb_clusters; i++) {
   1049             ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
   1050             if (ret < 0) {
   1051                 return ret;
   1052             } else if (refcount != 0) {
   1053                 break;
   1054             }
   1055         }
   1056 
   1057         /* And then allocate them */
   1058         ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
   1059                               QCOW2_DISCARD_NEVER);
   1060     } while (ret == -EAGAIN);
   1061 
   1062     if (ret < 0) {
   1063         return ret;
   1064     }
   1065 
   1066     return i;
   1067 }
   1068 
   1069 /* only used to allocate compressed sectors. We try to allocate
   1070    contiguous sectors. size must be <= cluster_size */
   1071 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
   1072 {
   1073     BDRVQcow2State *s = bs->opaque;
   1074     int64_t offset;
   1075     size_t free_in_cluster;
   1076     int ret;
   1077 
   1078     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
   1079     assert(size > 0 && size <= s->cluster_size);
   1080     assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));
   1081 
   1082     offset = s->free_byte_offset;
   1083 
   1084     if (offset) {
   1085         uint64_t refcount;
   1086         ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
   1087         if (ret < 0) {
   1088             return ret;
   1089         }
   1090 
   1091         if (refcount == s->refcount_max) {
   1092             offset = 0;
   1093         }
   1094     }
   1095 
   1096     free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
   1097     do {
   1098         if (!offset || free_in_cluster < size) {
   1099             int64_t new_cluster;
   1100 
   1101             new_cluster = alloc_clusters_noref(bs, s->cluster_size,
   1102                                                MIN(s->cluster_offset_mask,
   1103                                                    QCOW_MAX_CLUSTER_OFFSET));
   1104             if (new_cluster < 0) {
   1105                 return new_cluster;
   1106             }
   1107 
   1108             if (new_cluster == 0) {
   1109                 qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
   1110                                         "allocation of compressed cluster "
   1111                                         "at offset 0");
   1112                 return -EIO;
   1113             }
   1114 
   1115             if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
   1116                 offset = new_cluster;
   1117                 free_in_cluster = s->cluster_size;
   1118             } else {
   1119                 free_in_cluster += s->cluster_size;
   1120             }
   1121         }
   1122 
   1123         assert(offset);
   1124         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
   1125         if (ret < 0) {
   1126             offset = 0;
   1127         }
   1128     } while (ret == -EAGAIN);
   1129     if (ret < 0) {
   1130         return ret;
   1131     }
   1132 
   1133     /* The cluster refcount was incremented; refcount blocks must be flushed
   1134      * before the caller's L2 table updates. */
   1135     qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
   1136 
   1137     s->free_byte_offset = offset + size;
   1138     if (!offset_into_cluster(s, s->free_byte_offset)) {
   1139         s->free_byte_offset = 0;
   1140     }
   1141 
   1142     return offset;
   1143 }
   1144 
   1145 void qcow2_free_clusters(BlockDriverState *bs,
   1146                           int64_t offset, int64_t size,
   1147                           enum qcow2_discard_type type)
   1148 {
   1149     int ret;
   1150 
   1151     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
   1152     ret = update_refcount(bs, offset, size, 1, true, type);
   1153     if (ret < 0) {
   1154         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
   1155         /* TODO Remember the clusters to free them later and avoid leaking */
   1156     }
   1157 }
   1158 
   1159 /*
   1160  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
   1161  * normal cluster, compressed cluster, etc.)
   1162  */
   1163 void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry,
   1164                             enum qcow2_discard_type type)
   1165 {
   1166     BDRVQcow2State *s = bs->opaque;
   1167     QCow2ClusterType ctype = qcow2_get_cluster_type(bs, l2_entry);
   1168 
   1169     if (has_data_file(bs)) {
   1170         if (s->discard_passthrough[type] &&
   1171             (ctype == QCOW2_CLUSTER_NORMAL ||
   1172              ctype == QCOW2_CLUSTER_ZERO_ALLOC))
   1173         {
   1174             bdrv_pdiscard(s->data_file, l2_entry & L2E_OFFSET_MASK,
   1175                           s->cluster_size);
   1176         }
   1177         return;
   1178     }
   1179 
   1180     switch (ctype) {
   1181     case QCOW2_CLUSTER_COMPRESSED:
   1182         {
   1183             uint64_t coffset;
   1184             int csize;
   1185 
   1186             qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
   1187             qcow2_free_clusters(bs, coffset, csize, type);
   1188         }
   1189         break;
   1190     case QCOW2_CLUSTER_NORMAL:
   1191     case QCOW2_CLUSTER_ZERO_ALLOC:
   1192         if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
   1193             qcow2_signal_corruption(bs, false, -1, -1,
   1194                                     "Cannot free unaligned cluster %#llx",
   1195                                     l2_entry & L2E_OFFSET_MASK);
   1196         } else {
   1197             qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
   1198                                 s->cluster_size, type);
   1199         }
   1200         break;
   1201     case QCOW2_CLUSTER_ZERO_PLAIN:
   1202     case QCOW2_CLUSTER_UNALLOCATED:
   1203         break;
   1204     default:
   1205         abort();
   1206     }
   1207 }
   1208 
   1209 int qcow2_write_caches(BlockDriverState *bs)
   1210 {
   1211     BDRVQcow2State *s = bs->opaque;
   1212     int ret;
   1213 
   1214     ret = qcow2_cache_write(bs, s->l2_table_cache);
   1215     if (ret < 0) {
   1216         return ret;
   1217     }
   1218 
   1219     if (qcow2_need_accurate_refcounts(s)) {
   1220         ret = qcow2_cache_write(bs, s->refcount_block_cache);
   1221         if (ret < 0) {
   1222             return ret;
   1223         }
   1224     }
   1225 
   1226     return 0;
   1227 }
   1228 
   1229 int qcow2_flush_caches(BlockDriverState *bs)
   1230 {
   1231     int ret = qcow2_write_caches(bs);
   1232     if (ret < 0) {
   1233         return ret;
   1234     }
   1235 
   1236     return bdrv_flush(bs->file->bs);
   1237 }
   1238 
   1239 /*********************************************************/
   1240 /* snapshots and image creation */
   1241 
   1242 
   1243 
   1244 /* update the refcounts of snapshots and the copied flag */
   1245 int qcow2_update_snapshot_refcount(BlockDriverState *bs,
   1246     int64_t l1_table_offset, int l1_size, int addend)
   1247 {
   1248     BDRVQcow2State *s = bs->opaque;
   1249     uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
   1250     bool l1_allocated = false;
   1251     int64_t old_entry, old_l2_offset;
   1252     unsigned slice, slice_size2, n_slices;
   1253     int i, j, l1_modified = 0;
   1254     int ret;
   1255 
   1256     assert(addend >= -1 && addend <= 1);
   1257 
   1258     l2_slice = NULL;
   1259     l1_table = NULL;
   1260     l1_size2 = l1_size * L1E_SIZE;
   1261     slice_size2 = s->l2_slice_size * l2_entry_size(s);
   1262     n_slices = s->cluster_size / slice_size2;
   1263 
   1264     s->cache_discards = true;
   1265 
   1266     /* WARNING: qcow2_snapshot_goto relies on this function not using the
   1267      * l1_table_offset when it is the current s->l1_table_offset! Be careful
   1268      * when changing this! */
   1269     if (l1_table_offset != s->l1_table_offset) {
   1270         l1_table = g_try_malloc0(l1_size2);
   1271         if (l1_size2 && l1_table == NULL) {
   1272             ret = -ENOMEM;
   1273             goto fail;
   1274         }
   1275         l1_allocated = true;
   1276 
   1277         ret = bdrv_pread(bs->file, l1_table_offset, l1_size2, l1_table, 0);
   1278         if (ret < 0) {
   1279             goto fail;
   1280         }
   1281 
   1282         for (i = 0; i < l1_size; i++) {
   1283             be64_to_cpus(&l1_table[i]);
   1284         }
   1285     } else {
   1286         assert(l1_size == s->l1_size);
   1287         l1_table = s->l1_table;
   1288         l1_allocated = false;
   1289     }
   1290 
   1291     for (i = 0; i < l1_size; i++) {
   1292         l2_offset = l1_table[i];
   1293         if (l2_offset) {
   1294             old_l2_offset = l2_offset;
   1295             l2_offset &= L1E_OFFSET_MASK;
   1296 
   1297             if (offset_into_cluster(s, l2_offset)) {
   1298                 qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
   1299                                         PRIx64 " unaligned (L1 index: %#x)",
   1300                                         l2_offset, i);
   1301                 ret = -EIO;
   1302                 goto fail;
   1303             }
   1304 
   1305             for (slice = 0; slice < n_slices; slice++) {
   1306                 ret = qcow2_cache_get(bs, s->l2_table_cache,
   1307                                       l2_offset + slice * slice_size2,
   1308                                       (void **) &l2_slice);
   1309                 if (ret < 0) {
   1310                     goto fail;
   1311                 }
   1312 
   1313                 for (j = 0; j < s->l2_slice_size; j++) {
   1314                     uint64_t cluster_index;
   1315                     uint64_t offset;
   1316 
   1317                     entry = get_l2_entry(s, l2_slice, j);
   1318                     old_entry = entry;
   1319                     entry &= ~QCOW_OFLAG_COPIED;
   1320                     offset = entry & L2E_OFFSET_MASK;
   1321 
   1322                     switch (qcow2_get_cluster_type(bs, entry)) {
   1323                     case QCOW2_CLUSTER_COMPRESSED:
   1324                         if (addend != 0) {
   1325                             uint64_t coffset;
   1326                             int csize;
   1327 
   1328                             qcow2_parse_compressed_l2_entry(bs, entry,
   1329                                                             &coffset, &csize);
   1330                             ret = update_refcount(
   1331                                 bs, coffset, csize,
   1332                                 abs(addend), addend < 0,
   1333                                 QCOW2_DISCARD_SNAPSHOT);
   1334                             if (ret < 0) {
   1335                                 goto fail;
   1336                             }
   1337                         }
   1338                         /* compressed clusters are never modified */
   1339                         refcount = 2;
   1340                         break;
   1341 
   1342                     case QCOW2_CLUSTER_NORMAL:
   1343                     case QCOW2_CLUSTER_ZERO_ALLOC:
   1344                         if (offset_into_cluster(s, offset)) {
   1345                             /* Here l2_index means table (not slice) index */
   1346                             int l2_index = slice * s->l2_slice_size + j;
   1347                             qcow2_signal_corruption(
   1348                                 bs, true, -1, -1, "Cluster "
   1349                                 "allocation offset %#" PRIx64
   1350                                 " unaligned (L2 offset: %#"
   1351                                 PRIx64 ", L2 index: %#x)",
   1352                                 offset, l2_offset, l2_index);
   1353                             ret = -EIO;
   1354                             goto fail;
   1355                         }
   1356 
   1357                         cluster_index = offset >> s->cluster_bits;
   1358                         assert(cluster_index);
   1359                         if (addend != 0) {
   1360                             ret = qcow2_update_cluster_refcount(
   1361                                 bs, cluster_index, abs(addend), addend < 0,
   1362                                 QCOW2_DISCARD_SNAPSHOT);
   1363                             if (ret < 0) {
   1364                                 goto fail;
   1365                             }
   1366                         }
   1367 
   1368                         ret = qcow2_get_refcount(bs, cluster_index, &refcount);
   1369                         if (ret < 0) {
   1370                             goto fail;
   1371                         }
   1372                         break;
   1373 
   1374                     case QCOW2_CLUSTER_ZERO_PLAIN:
   1375                     case QCOW2_CLUSTER_UNALLOCATED:
   1376                         refcount = 0;
   1377                         break;
   1378 
   1379                     default:
   1380                         abort();
   1381                     }
   1382 
   1383                     if (refcount == 1) {
   1384                         entry |= QCOW_OFLAG_COPIED;
   1385                     }
   1386                     if (entry != old_entry) {
   1387                         if (addend > 0) {
   1388                             qcow2_cache_set_dependency(bs, s->l2_table_cache,
   1389                                                        s->refcount_block_cache);
   1390                         }
   1391                         set_l2_entry(s, l2_slice, j, entry);
   1392                         qcow2_cache_entry_mark_dirty(s->l2_table_cache,
   1393                                                      l2_slice);
   1394                     }
   1395                 }
   1396 
   1397                 qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
   1398             }
   1399 
   1400             if (addend != 0) {
   1401                 ret = qcow2_update_cluster_refcount(bs, l2_offset >>
   1402                                                         s->cluster_bits,
   1403                                                     abs(addend), addend < 0,
   1404                                                     QCOW2_DISCARD_SNAPSHOT);
   1405                 if (ret < 0) {
   1406                     goto fail;
   1407                 }
   1408             }
   1409             ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
   1410                                      &refcount);
   1411             if (ret < 0) {
   1412                 goto fail;
   1413             } else if (refcount == 1) {
   1414                 l2_offset |= QCOW_OFLAG_COPIED;
   1415             }
   1416             if (l2_offset != old_l2_offset) {
   1417                 l1_table[i] = l2_offset;
   1418                 l1_modified = 1;
   1419             }
   1420         }
   1421     }
   1422 
   1423     ret = bdrv_flush(bs);
   1424 fail:
   1425     if (l2_slice) {
   1426         qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
   1427     }
   1428 
   1429     s->cache_discards = false;
   1430     qcow2_process_discards(bs, ret);
   1431 
   1432     /* Update L1 only if it isn't deleted anyway (addend = -1) */
   1433     if (ret == 0 && addend >= 0 && l1_modified) {
   1434         for (i = 0; i < l1_size; i++) {
   1435             cpu_to_be64s(&l1_table[i]);
   1436         }
   1437 
   1438         ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_size2, l1_table,
   1439                                0);
   1440 
   1441         for (i = 0; i < l1_size; i++) {
   1442             be64_to_cpus(&l1_table[i]);
   1443         }
   1444     }
   1445     if (l1_allocated)
   1446         g_free(l1_table);
   1447     return ret;
   1448 }
   1449 
   1450 
   1451 
   1452 
   1453 /*********************************************************/
   1454 /* refcount checking functions */
   1455 
   1456 
   1457 static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
   1458 {
   1459     /* This assertion holds because there is no way we can address more than
   1460      * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
   1461      * offsets have to be representable in bytes); due to every cluster
   1462      * corresponding to one refcount entry, we are well below that limit */
   1463     assert(entries < (UINT64_C(1) << (64 - 9)));
   1464 
   1465     /* Thanks to the assertion this will not overflow, because
   1466      * s->refcount_order < 7.
   1467      * (note: x << s->refcount_order == x * s->refcount_bits) */
   1468     return DIV_ROUND_UP(entries << s->refcount_order, 8);
   1469 }
   1470 
   1471 /**
   1472  * Reallocates *array so that it can hold new_size entries. *size must contain
   1473  * the current number of entries in *array. If the reallocation fails, *array
   1474  * and *size will not be modified and -errno will be returned. If the
   1475  * reallocation is successful, *array will be set to the new buffer, *size
   1476  * will be set to new_size and 0 will be returned. The size of the reallocated
   1477  * refcount array buffer will be aligned to a cluster boundary, and the newly
   1478  * allocated area will be zeroed.
   1479  */
   1480 static int realloc_refcount_array(BDRVQcow2State *s, void **array,
   1481                                   int64_t *size, int64_t new_size)
   1482 {
   1483     int64_t old_byte_size, new_byte_size;
   1484     void *new_ptr;
   1485 
   1486     /* Round to clusters so the array can be directly written to disk */
   1487     old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
   1488                     * s->cluster_size;
   1489     new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
   1490                     * s->cluster_size;
   1491 
   1492     if (new_byte_size == old_byte_size) {
   1493         *size = new_size;
   1494         return 0;
   1495     }
   1496 
   1497     assert(new_byte_size > 0);
   1498 
   1499     if (new_byte_size > SIZE_MAX) {
   1500         return -ENOMEM;
   1501     }
   1502 
   1503     new_ptr = g_try_realloc(*array, new_byte_size);
   1504     if (!new_ptr) {
   1505         return -ENOMEM;
   1506     }
   1507 
   1508     if (new_byte_size > old_byte_size) {
   1509         memset((char *)new_ptr + old_byte_size, 0,
   1510                new_byte_size - old_byte_size);
   1511     }
   1512 
   1513     *array = new_ptr;
   1514     *size  = new_size;
   1515 
   1516     return 0;
   1517 }
   1518 
   1519 /*
   1520  * Increases the refcount for a range of clusters in a given refcount table.
   1521  * This is used to construct a temporary refcount table out of L1 and L2 tables
   1522  * which can be compared to the refcount table saved in the image.
   1523  *
   1524  * Modifies the number of errors in res.
   1525  */
   1526 int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
   1527                              void **refcount_table,
   1528                              int64_t *refcount_table_size,
   1529                              int64_t offset, int64_t size)
   1530 {
   1531     BDRVQcow2State *s = bs->opaque;
   1532     uint64_t start, last, cluster_offset, k, refcount;
   1533     int64_t file_len;
   1534     int ret;
   1535 
   1536     if (size <= 0) {
   1537         return 0;
   1538     }
   1539 
   1540     file_len = bdrv_getlength(bs->file->bs);
   1541     if (file_len < 0) {
   1542         return file_len;
   1543     }
   1544 
   1545     /*
   1546      * Last cluster of qcow2 image may be semi-allocated, so it may be OK to
   1547      * reference some space after file end but it should be less than one
   1548      * cluster.
   1549      */
   1550     if (offset + size - file_len >= s->cluster_size) {
   1551         fprintf(stderr, "ERROR: counting reference for region exceeding the "
   1552                 "end of the file by one cluster or more: offset 0x%" PRIx64
   1553                 " size 0x%" PRIx64 "\n", offset, size);
   1554         res->corruptions++;
   1555         return 0;
   1556     }
   1557 
   1558     start = start_of_cluster(s, offset);
   1559     last = start_of_cluster(s, offset + size - 1);
   1560     for(cluster_offset = start; cluster_offset <= last;
   1561         cluster_offset += s->cluster_size) {
   1562         k = cluster_offset >> s->cluster_bits;
   1563         if (k >= *refcount_table_size) {
   1564             ret = realloc_refcount_array(s, refcount_table,
   1565                                          refcount_table_size, k + 1);
   1566             if (ret < 0) {
   1567                 res->check_errors++;
   1568                 return ret;
   1569             }
   1570         }
   1571 
   1572         refcount = s->get_refcount(*refcount_table, k);
   1573         if (refcount == s->refcount_max) {
   1574             fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
   1575                     "\n", cluster_offset);
   1576             fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
   1577                     "width or qemu-img convert to create a clean copy if the "
   1578                     "image cannot be opened for writing\n");
   1579             res->corruptions++;
   1580             continue;
   1581         }
   1582         s->set_refcount(*refcount_table, k, refcount + 1);
   1583     }
   1584 
   1585     return 0;
   1586 }
   1587 
   1588 /* Flags for check_refcounts_l1() and check_refcounts_l2() */
   1589 enum {
   1590     CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
   1591 };
   1592 
   1593 /*
   1594  * Fix L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN (or making all its present
   1595  * subclusters QCOW2_SUBCLUSTER_ZERO_PLAIN).
   1596  *
   1597  * This function decrements res->corruptions on success, so the caller is
   1598  * responsible to increment res->corruptions prior to the call.
   1599  *
   1600  * On failure in-memory @l2_table may be modified.
   1601  */
   1602 static int fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res,
   1603                                 uint64_t l2_offset,
   1604                                 uint64_t *l2_table, int l2_index, bool active,
   1605                                 bool *metadata_overlap)
   1606 {
   1607     BDRVQcow2State *s = bs->opaque;
   1608     int ret;
   1609     int idx = l2_index * (l2_entry_size(s) / sizeof(uint64_t));
   1610     uint64_t l2e_offset = l2_offset + (uint64_t)l2_index * l2_entry_size(s);
   1611     int ign = active ? QCOW2_OL_ACTIVE_L2 : QCOW2_OL_INACTIVE_L2;
   1612 
   1613     if (has_subclusters(s)) {
   1614         uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, l2_index);
   1615 
   1616         /* Allocated subclusters become zero */
   1617         l2_bitmap |= l2_bitmap << 32;
   1618         l2_bitmap &= QCOW_L2_BITMAP_ALL_ZEROES;
   1619 
   1620         set_l2_bitmap(s, l2_table, l2_index, l2_bitmap);
   1621         set_l2_entry(s, l2_table, l2_index, 0);
   1622     } else {
   1623         set_l2_entry(s, l2_table, l2_index, QCOW_OFLAG_ZERO);
   1624     }
   1625 
   1626     ret = qcow2_pre_write_overlap_check(bs, ign, l2e_offset, l2_entry_size(s),
   1627                                         false);
   1628     if (metadata_overlap) {
   1629         *metadata_overlap = ret < 0;
   1630     }
   1631     if (ret < 0) {
   1632         fprintf(stderr, "ERROR: Overlap check failed\n");
   1633         goto fail;
   1634     }
   1635 
   1636     ret = bdrv_pwrite_sync(bs->file, l2e_offset, l2_entry_size(s),
   1637                            &l2_table[idx], 0);
   1638     if (ret < 0) {
   1639         fprintf(stderr, "ERROR: Failed to overwrite L2 "
   1640                 "table entry: %s\n", strerror(-ret));
   1641         goto fail;
   1642     }
   1643 
   1644     res->corruptions--;
   1645     res->corruptions_fixed++;
   1646     return 0;
   1647 
   1648 fail:
   1649     res->check_errors++;
   1650     return ret;
   1651 }
   1652 
   1653 /*
   1654  * Increases the refcount in the given refcount table for the all clusters
   1655  * referenced in the L2 table. While doing so, performs some checks on L2
   1656  * entries.
   1657  *
   1658  * Returns the number of errors found by the checks or -errno if an internal
   1659  * error occurred.
   1660  */
   1661 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
   1662                               void **refcount_table,
   1663                               int64_t *refcount_table_size, int64_t l2_offset,
   1664                               int flags, BdrvCheckMode fix, bool active)
   1665 {
   1666     BDRVQcow2State *s = bs->opaque;
   1667     uint64_t l2_entry, l2_bitmap;
   1668     uint64_t next_contiguous_offset = 0;
   1669     int i, ret;
   1670     size_t l2_size_bytes = s->l2_size * l2_entry_size(s);
   1671     g_autofree uint64_t *l2_table = g_malloc(l2_size_bytes);
   1672     bool metadata_overlap;
   1673 
   1674     /* Read L2 table from disk */
   1675     ret = bdrv_pread(bs->file, l2_offset, l2_size_bytes, l2_table, 0);
   1676     if (ret < 0) {
   1677         fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
   1678         res->check_errors++;
   1679         return ret;
   1680     }
   1681 
   1682     /* Do the actual checks */
   1683     for (i = 0; i < s->l2_size; i++) {
   1684         uint64_t coffset;
   1685         int csize;
   1686         QCow2ClusterType type;
   1687 
   1688         l2_entry = get_l2_entry(s, l2_table, i);
   1689         l2_bitmap = get_l2_bitmap(s, l2_table, i);
   1690         type = qcow2_get_cluster_type(bs, l2_entry);
   1691 
   1692         if (type != QCOW2_CLUSTER_COMPRESSED) {
   1693             /* Check reserved bits of Standard Cluster Descriptor */
   1694             if (l2_entry & L2E_STD_RESERVED_MASK) {
   1695                 fprintf(stderr, "ERROR found l2 entry with reserved bits set: "
   1696                         "%" PRIx64 "\n", l2_entry);
   1697                 res->corruptions++;
   1698             }
   1699         }
   1700 
   1701         switch (type) {
   1702         case QCOW2_CLUSTER_COMPRESSED:
   1703             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
   1704             if (l2_entry & QCOW_OFLAG_COPIED) {
   1705                 fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
   1706                     "copied flag must never be set for compressed "
   1707                     "clusters\n", l2_entry & s->cluster_offset_mask);
   1708                 l2_entry &= ~QCOW_OFLAG_COPIED;
   1709                 res->corruptions++;
   1710             }
   1711 
   1712             if (has_data_file(bs)) {
   1713                 fprintf(stderr, "ERROR compressed cluster %d with data file, "
   1714                         "entry=0x%" PRIx64 "\n", i, l2_entry);
   1715                 res->corruptions++;
   1716                 break;
   1717             }
   1718 
   1719             if (l2_bitmap) {
   1720                 fprintf(stderr, "ERROR compressed cluster %d with non-zero "
   1721                         "subcluster allocation bitmap, entry=0x%" PRIx64 "\n",
   1722                         i, l2_entry);
   1723                 res->corruptions++;
   1724                 break;
   1725             }
   1726 
   1727             /* Mark cluster as used */
   1728             qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
   1729             ret = qcow2_inc_refcounts_imrt(
   1730                 bs, res, refcount_table, refcount_table_size, coffset, csize);
   1731             if (ret < 0) {
   1732                 return ret;
   1733             }
   1734 
   1735             if (flags & CHECK_FRAG_INFO) {
   1736                 res->bfi.allocated_clusters++;
   1737                 res->bfi.compressed_clusters++;
   1738 
   1739                 /*
   1740                  * Compressed clusters are fragmented by nature.  Since they
   1741                  * take up sub-sector space but we only have sector granularity
   1742                  * I/O we need to re-read the same sectors even for adjacent
   1743                  * compressed clusters.
   1744                  */
   1745                 res->bfi.fragmented_clusters++;
   1746             }
   1747             break;
   1748 
   1749         case QCOW2_CLUSTER_ZERO_ALLOC:
   1750         case QCOW2_CLUSTER_NORMAL:
   1751         {
   1752             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
   1753 
   1754             if ((l2_bitmap >> 32) & l2_bitmap) {
   1755                 res->corruptions++;
   1756                 fprintf(stderr, "ERROR offset=%" PRIx64 ": Allocated "
   1757                         "cluster has corrupted subcluster allocation bitmap\n",
   1758                         offset);
   1759             }
   1760 
   1761             /* Correct offsets are cluster aligned */
   1762             if (offset_into_cluster(s, offset)) {
   1763                 bool contains_data;
   1764                 res->corruptions++;
   1765 
   1766                 if (has_subclusters(s)) {
   1767                     contains_data = (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC);
   1768                 } else {
   1769                     contains_data = !(l2_entry & QCOW_OFLAG_ZERO);
   1770                 }
   1771 
   1772                 if (!contains_data) {
   1773                     fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated "
   1774                             "cluster is not properly aligned; L2 entry "
   1775                             "corrupted.\n",
   1776                             fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
   1777                             offset);
   1778                     if (fix & BDRV_FIX_ERRORS) {
   1779                         ret = fix_l2_entry_by_zero(bs, res, l2_offset,
   1780                                                    l2_table, i, active,
   1781                                                    &metadata_overlap);
   1782                         if (metadata_overlap) {
   1783                             /*
   1784                              * Something is seriously wrong, so abort checking
   1785                              * this L2 table.
   1786                              */
   1787                             return ret;
   1788                         }
   1789 
   1790                         if (ret == 0) {
   1791                             /*
   1792                              * Skip marking the cluster as used
   1793                              * (it is unused now).
   1794                              */
   1795                             continue;
   1796                         }
   1797 
   1798                         /*
   1799                          * Failed to fix.
   1800                          * Do not abort, continue checking the rest of this
   1801                          * L2 table's entries.
   1802                          */
   1803                     }
   1804                 } else {
   1805                     fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
   1806                         "not properly aligned; L2 entry corrupted.\n", offset);
   1807                 }
   1808             }
   1809 
   1810             if (flags & CHECK_FRAG_INFO) {
   1811                 res->bfi.allocated_clusters++;
   1812                 if (next_contiguous_offset &&
   1813                     offset != next_contiguous_offset) {
   1814                     res->bfi.fragmented_clusters++;
   1815                 }
   1816                 next_contiguous_offset = offset + s->cluster_size;
   1817             }
   1818 
   1819             /* Mark cluster as used */
   1820             if (!has_data_file(bs)) {
   1821                 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table,
   1822                                                refcount_table_size,
   1823                                                offset, s->cluster_size);
   1824                 if (ret < 0) {
   1825                     return ret;
   1826                 }
   1827             }
   1828             break;
   1829         }
   1830 
   1831         case QCOW2_CLUSTER_ZERO_PLAIN:
   1832             /* Impossible when image has subclusters */
   1833             assert(!l2_bitmap);
   1834             break;
   1835 
   1836         case QCOW2_CLUSTER_UNALLOCATED:
   1837             if (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC) {
   1838                 res->corruptions++;
   1839                 fprintf(stderr, "ERROR: Unallocated "
   1840                         "cluster has non-zero subcluster allocation map\n");
   1841             }
   1842             break;
   1843 
   1844         default:
   1845             abort();
   1846         }
   1847     }
   1848 
   1849     return 0;
   1850 }
   1851 
   1852 /*
   1853  * Increases the refcount for the L1 table, its L2 tables and all referenced
   1854  * clusters in the given refcount table. While doing so, performs some checks
   1855  * on L1 and L2 entries.
   1856  *
   1857  * Returns the number of errors found by the checks or -errno if an internal
   1858  * error occurred.
   1859  */
   1860 static int check_refcounts_l1(BlockDriverState *bs,
   1861                               BdrvCheckResult *res,
   1862                               void **refcount_table,
   1863                               int64_t *refcount_table_size,
   1864                               int64_t l1_table_offset, int l1_size,
   1865                               int flags, BdrvCheckMode fix, bool active)
   1866 {
   1867     BDRVQcow2State *s = bs->opaque;
   1868     size_t l1_size_bytes = l1_size * L1E_SIZE;
   1869     g_autofree uint64_t *l1_table = NULL;
   1870     uint64_t l2_offset;
   1871     int i, ret;
   1872 
   1873     if (!l1_size) {
   1874         return 0;
   1875     }
   1876 
   1877     /* Mark L1 table as used */
   1878     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
   1879                                    l1_table_offset, l1_size_bytes);
   1880     if (ret < 0) {
   1881         return ret;
   1882     }
   1883 
   1884     l1_table = g_try_malloc(l1_size_bytes);
   1885     if (l1_table == NULL) {
   1886         res->check_errors++;
   1887         return -ENOMEM;
   1888     }
   1889 
   1890     /* Read L1 table entries from disk */
   1891     ret = bdrv_pread(bs->file, l1_table_offset, l1_size_bytes, l1_table, 0);
   1892     if (ret < 0) {
   1893         fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
   1894         res->check_errors++;
   1895         return ret;
   1896     }
   1897 
   1898     for (i = 0; i < l1_size; i++) {
   1899         be64_to_cpus(&l1_table[i]);
   1900     }
   1901 
   1902     /* Do the actual checks */
   1903     for (i = 0; i < l1_size; i++) {
   1904         if (!l1_table[i]) {
   1905             continue;
   1906         }
   1907 
   1908         if (l1_table[i] & L1E_RESERVED_MASK) {
   1909             fprintf(stderr, "ERROR found L1 entry with reserved bits set: "
   1910                     "%" PRIx64 "\n", l1_table[i]);
   1911             res->corruptions++;
   1912         }
   1913 
   1914         l2_offset = l1_table[i] & L1E_OFFSET_MASK;
   1915 
   1916         /* Mark L2 table as used */
   1917         ret = qcow2_inc_refcounts_imrt(bs, res,
   1918                                        refcount_table, refcount_table_size,
   1919                                        l2_offset, s->cluster_size);
   1920         if (ret < 0) {
   1921             return ret;
   1922         }
   1923 
   1924         /* L2 tables are cluster aligned */
   1925         if (offset_into_cluster(s, l2_offset)) {
   1926             fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
   1927                 "cluster aligned; L1 entry corrupted\n", l2_offset);
   1928             res->corruptions++;
   1929         }
   1930 
   1931         /* Process and check L2 entries */
   1932         ret = check_refcounts_l2(bs, res, refcount_table,
   1933                                  refcount_table_size, l2_offset, flags,
   1934                                  fix, active);
   1935         if (ret < 0) {
   1936             return ret;
   1937         }
   1938     }
   1939 
   1940     return 0;
   1941 }
   1942 
   1943 /*
   1944  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
   1945  *
   1946  * This function does not print an error message nor does it increment
   1947  * check_errors if qcow2_get_refcount fails (this is because such an error will
   1948  * have been already detected and sufficiently signaled by the calling function
   1949  * (qcow2_check_refcounts) by the time this function is called).
   1950  */
   1951 static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
   1952                               BdrvCheckMode fix)
   1953 {
   1954     BDRVQcow2State *s = bs->opaque;
   1955     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
   1956     int ret;
   1957     uint64_t refcount;
   1958     int i, j;
   1959     bool repair;
   1960 
   1961     if (fix & BDRV_FIX_ERRORS) {
   1962         /* Always repair */
   1963         repair = true;
   1964     } else if (fix & BDRV_FIX_LEAKS) {
   1965         /* Repair only if that seems safe: This function is always
   1966          * called after the refcounts have been fixed, so the refcount
   1967          * is accurate if that repair was successful */
   1968         repair = !res->check_errors && !res->corruptions && !res->leaks;
   1969     } else {
   1970         repair = false;
   1971     }
   1972 
   1973     for (i = 0; i < s->l1_size; i++) {
   1974         uint64_t l1_entry = s->l1_table[i];
   1975         uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
   1976         int l2_dirty = 0;
   1977 
   1978         if (!l2_offset) {
   1979             continue;
   1980         }
   1981 
   1982         ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
   1983                                  &refcount);
   1984         if (ret < 0) {
   1985             /* don't print message nor increment check_errors */
   1986             continue;
   1987         }
   1988         if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
   1989             res->corruptions++;
   1990             fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
   1991                     "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
   1992                     repair ? "Repairing" : "ERROR", i, l1_entry, refcount);
   1993             if (repair) {
   1994                 s->l1_table[i] = refcount == 1
   1995                                ? l1_entry |  QCOW_OFLAG_COPIED
   1996                                : l1_entry & ~QCOW_OFLAG_COPIED;
   1997                 ret = qcow2_write_l1_entry(bs, i);
   1998                 if (ret < 0) {
   1999                     res->check_errors++;
   2000                     goto fail;
   2001                 }
   2002                 res->corruptions--;
   2003                 res->corruptions_fixed++;
   2004             }
   2005         }
   2006 
   2007         ret = bdrv_pread(bs->file, l2_offset, s->l2_size * l2_entry_size(s),
   2008                          l2_table, 0);
   2009         if (ret < 0) {
   2010             fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
   2011                     strerror(-ret));
   2012             res->check_errors++;
   2013             goto fail;
   2014         }
   2015 
   2016         for (j = 0; j < s->l2_size; j++) {
   2017             uint64_t l2_entry = get_l2_entry(s, l2_table, j);
   2018             uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
   2019             QCow2ClusterType cluster_type = qcow2_get_cluster_type(bs, l2_entry);
   2020 
   2021             if (cluster_type == QCOW2_CLUSTER_NORMAL ||
   2022                 cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
   2023                 if (has_data_file(bs)) {
   2024                     refcount = 1;
   2025                 } else {
   2026                     ret = qcow2_get_refcount(bs,
   2027                                              data_offset >> s->cluster_bits,
   2028                                              &refcount);
   2029                     if (ret < 0) {
   2030                         /* don't print message nor increment check_errors */
   2031                         continue;
   2032                     }
   2033                 }
   2034                 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
   2035                     res->corruptions++;
   2036                     fprintf(stderr, "%s OFLAG_COPIED data cluster: "
   2037                             "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
   2038                             repair ? "Repairing" : "ERROR", l2_entry, refcount);
   2039                     if (repair) {
   2040                         set_l2_entry(s, l2_table, j,
   2041                                      refcount == 1 ?
   2042                                      l2_entry |  QCOW_OFLAG_COPIED :
   2043                                      l2_entry & ~QCOW_OFLAG_COPIED);
   2044                         l2_dirty++;
   2045                     }
   2046                 }
   2047             }
   2048         }
   2049 
   2050         if (l2_dirty > 0) {
   2051             ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
   2052                                                 l2_offset, s->cluster_size,
   2053                                                 false);
   2054             if (ret < 0) {
   2055                 fprintf(stderr, "ERROR: Could not write L2 table; metadata "
   2056                         "overlap check failed: %s\n", strerror(-ret));
   2057                 res->check_errors++;
   2058                 goto fail;
   2059             }
   2060 
   2061             ret = bdrv_pwrite(bs->file, l2_offset, s->cluster_size, l2_table,
   2062                               0);
   2063             if (ret < 0) {
   2064                 fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
   2065                         strerror(-ret));
   2066                 res->check_errors++;
   2067                 goto fail;
   2068             }
   2069             res->corruptions -= l2_dirty;
   2070             res->corruptions_fixed += l2_dirty;
   2071         }
   2072     }
   2073 
   2074     ret = 0;
   2075 
   2076 fail:
   2077     qemu_vfree(l2_table);
   2078     return ret;
   2079 }
   2080 
   2081 /*
   2082  * Checks consistency of refblocks and accounts for each refblock in
   2083  * *refcount_table.
   2084  */
   2085 static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
   2086                            BdrvCheckMode fix, bool *rebuild,
   2087                            void **refcount_table, int64_t *nb_clusters)
   2088 {
   2089     BDRVQcow2State *s = bs->opaque;
   2090     int64_t i, size;
   2091     int ret;
   2092 
   2093     for(i = 0; i < s->refcount_table_size; i++) {
   2094         uint64_t offset, cluster;
   2095         offset = s->refcount_table[i] & REFT_OFFSET_MASK;
   2096         cluster = offset >> s->cluster_bits;
   2097 
   2098         if (s->refcount_table[i] & REFT_RESERVED_MASK) {
   2099             fprintf(stderr, "ERROR refcount table entry %" PRId64 " has "
   2100                     "reserved bits set\n", i);
   2101             res->corruptions++;
   2102             *rebuild = true;
   2103             continue;
   2104         }
   2105 
   2106         /* Refcount blocks are cluster aligned */
   2107         if (offset_into_cluster(s, offset)) {
   2108             fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
   2109                 "cluster aligned; refcount table entry corrupted\n", i);
   2110             res->corruptions++;
   2111             *rebuild = true;
   2112             continue;
   2113         }
   2114 
   2115         if (cluster >= *nb_clusters) {
   2116             res->corruptions++;
   2117             fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
   2118                     fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
   2119 
   2120             if (fix & BDRV_FIX_ERRORS) {
   2121                 int64_t new_nb_clusters;
   2122                 Error *local_err = NULL;
   2123 
   2124                 if (offset > INT64_MAX - s->cluster_size) {
   2125                     ret = -EINVAL;
   2126                     goto resize_fail;
   2127                 }
   2128 
   2129                 ret = bdrv_truncate(bs->file, offset + s->cluster_size, false,
   2130                                     PREALLOC_MODE_OFF, 0, &local_err);
   2131                 if (ret < 0) {
   2132                     error_report_err(local_err);
   2133                     goto resize_fail;
   2134                 }
   2135                 size = bdrv_getlength(bs->file->bs);
   2136                 if (size < 0) {
   2137                     ret = size;
   2138                     goto resize_fail;
   2139                 }
   2140 
   2141                 new_nb_clusters = size_to_clusters(s, size);
   2142                 assert(new_nb_clusters >= *nb_clusters);
   2143 
   2144                 ret = realloc_refcount_array(s, refcount_table,
   2145                                              nb_clusters, new_nb_clusters);
   2146                 if (ret < 0) {
   2147                     res->check_errors++;
   2148                     return ret;
   2149                 }
   2150 
   2151                 if (cluster >= *nb_clusters) {
   2152                     ret = -EINVAL;
   2153                     goto resize_fail;
   2154                 }
   2155 
   2156                 res->corruptions--;
   2157                 res->corruptions_fixed++;
   2158                 ret = qcow2_inc_refcounts_imrt(bs, res,
   2159                                                refcount_table, nb_clusters,
   2160                                                offset, s->cluster_size);
   2161                 if (ret < 0) {
   2162                     return ret;
   2163                 }
   2164                 /* No need to check whether the refcount is now greater than 1:
   2165                  * This area was just allocated and zeroed, so it can only be
   2166                  * exactly 1 after qcow2_inc_refcounts_imrt() */
   2167                 continue;
   2168 
   2169 resize_fail:
   2170                 *rebuild = true;
   2171                 fprintf(stderr, "ERROR could not resize image: %s\n",
   2172                         strerror(-ret));
   2173             }
   2174             continue;
   2175         }
   2176 
   2177         if (offset != 0) {
   2178             ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
   2179                                            offset, s->cluster_size);
   2180             if (ret < 0) {
   2181                 return ret;
   2182             }
   2183             if (s->get_refcount(*refcount_table, cluster) != 1) {
   2184                 fprintf(stderr, "ERROR refcount block %" PRId64
   2185                         " refcount=%" PRIu64 "\n", i,
   2186                         s->get_refcount(*refcount_table, cluster));
   2187                 res->corruptions++;
   2188                 *rebuild = true;
   2189             }
   2190         }
   2191     }
   2192 
   2193     return 0;
   2194 }
   2195 
   2196 /*
   2197  * Calculates an in-memory refcount table.
   2198  */
   2199 static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
   2200                                BdrvCheckMode fix, bool *rebuild,
   2201                                void **refcount_table, int64_t *nb_clusters)
   2202 {
   2203     BDRVQcow2State *s = bs->opaque;
   2204     int64_t i;
   2205     QCowSnapshot *sn;
   2206     int ret;
   2207 
   2208     if (!*refcount_table) {
   2209         int64_t old_size = 0;
   2210         ret = realloc_refcount_array(s, refcount_table,
   2211                                      &old_size, *nb_clusters);
   2212         if (ret < 0) {
   2213             res->check_errors++;
   2214             return ret;
   2215         }
   2216     }
   2217 
   2218     /* header */
   2219     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
   2220                                    0, s->cluster_size);
   2221     if (ret < 0) {
   2222         return ret;
   2223     }
   2224 
   2225     /* current L1 table */
   2226     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
   2227                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
   2228                              fix, true);
   2229     if (ret < 0) {
   2230         return ret;
   2231     }
   2232 
   2233     /* snapshots */
   2234     if (has_data_file(bs) && s->nb_snapshots) {
   2235         fprintf(stderr, "ERROR %d snapshots in image with data file\n",
   2236                 s->nb_snapshots);
   2237         res->corruptions++;
   2238     }
   2239 
   2240     for (i = 0; i < s->nb_snapshots; i++) {
   2241         sn = s->snapshots + i;
   2242         if (offset_into_cluster(s, sn->l1_table_offset)) {
   2243             fprintf(stderr, "ERROR snapshot %s (%s) l1_offset=%#" PRIx64 ": "
   2244                     "L1 table is not cluster aligned; snapshot table entry "
   2245                     "corrupted\n", sn->id_str, sn->name, sn->l1_table_offset);
   2246             res->corruptions++;
   2247             continue;
   2248         }
   2249         if (sn->l1_size > QCOW_MAX_L1_SIZE / L1E_SIZE) {
   2250             fprintf(stderr, "ERROR snapshot %s (%s) l1_size=%#" PRIx32 ": "
   2251                     "L1 table is too large; snapshot table entry corrupted\n",
   2252                     sn->id_str, sn->name, sn->l1_size);
   2253             res->corruptions++;
   2254             continue;
   2255         }
   2256         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
   2257                                  sn->l1_table_offset, sn->l1_size, 0, fix,
   2258                                  false);
   2259         if (ret < 0) {
   2260             return ret;
   2261         }
   2262     }
   2263     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
   2264                                    s->snapshots_offset, s->snapshots_size);
   2265     if (ret < 0) {
   2266         return ret;
   2267     }
   2268 
   2269     /* refcount data */
   2270     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
   2271                                    s->refcount_table_offset,
   2272                                    s->refcount_table_size *
   2273                                    REFTABLE_ENTRY_SIZE);
   2274     if (ret < 0) {
   2275         return ret;
   2276     }
   2277 
   2278     /* encryption */
   2279     if (s->crypto_header.length) {
   2280         ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
   2281                                        s->crypto_header.offset,
   2282                                        s->crypto_header.length);
   2283         if (ret < 0) {
   2284             return ret;
   2285         }
   2286     }
   2287 
   2288     /* bitmaps */
   2289     ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
   2290     if (ret < 0) {
   2291         return ret;
   2292     }
   2293 
   2294     return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
   2295 }
   2296 
   2297 /*
   2298  * Compares the actual reference count for each cluster in the image against the
   2299  * refcount as reported by the refcount structures on-disk.
   2300  */
   2301 static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
   2302                               BdrvCheckMode fix, bool *rebuild,
   2303                               int64_t *highest_cluster,
   2304                               void *refcount_table, int64_t nb_clusters)
   2305 {
   2306     BDRVQcow2State *s = bs->opaque;
   2307     int64_t i;
   2308     uint64_t refcount1, refcount2;
   2309     int ret;
   2310 
   2311     for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
   2312         ret = qcow2_get_refcount(bs, i, &refcount1);
   2313         if (ret < 0) {
   2314             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
   2315                     i, strerror(-ret));
   2316             res->check_errors++;
   2317             continue;
   2318         }
   2319 
   2320         refcount2 = s->get_refcount(refcount_table, i);
   2321 
   2322         if (refcount1 > 0 || refcount2 > 0) {
   2323             *highest_cluster = i;
   2324         }
   2325 
   2326         if (refcount1 != refcount2) {
   2327             /* Check if we're allowed to fix the mismatch */
   2328             int *num_fixed = NULL;
   2329             if (refcount1 == 0) {
   2330                 *rebuild = true;
   2331             } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
   2332                 num_fixed = &res->leaks_fixed;
   2333             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
   2334                 num_fixed = &res->corruptions_fixed;
   2335             }
   2336 
   2337             fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
   2338                     " reference=%" PRIu64 "\n",
   2339                    num_fixed != NULL     ? "Repairing" :
   2340                    refcount1 < refcount2 ? "ERROR" :
   2341                                            "Leaked",
   2342                    i, refcount1, refcount2);
   2343 
   2344             if (num_fixed) {
   2345                 ret = update_refcount(bs, i << s->cluster_bits, 1,
   2346                                       refcount_diff(refcount1, refcount2),
   2347                                       refcount1 > refcount2,
   2348                                       QCOW2_DISCARD_ALWAYS);
   2349                 if (ret >= 0) {
   2350                     (*num_fixed)++;
   2351                     continue;
   2352                 }
   2353             }
   2354 
   2355             /* And if we couldn't, print an error */
   2356             if (refcount1 < refcount2) {
   2357                 res->corruptions++;
   2358             } else {
   2359                 res->leaks++;
   2360             }
   2361         }
   2362     }
   2363 }
   2364 
   2365 /*
   2366  * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
   2367  * the on-disk refcount structures.
   2368  *
   2369  * On input, *first_free_cluster tells where to start looking, and need not
   2370  * actually be a free cluster; the returned offset will not be before that
   2371  * cluster.  On output, *first_free_cluster points to the first gap found, even
   2372  * if that gap was too small to be used as the returned offset.
   2373  *
   2374  * Note that *first_free_cluster is a cluster index whereas the return value is
   2375  * an offset.
   2376  */
   2377 static int64_t alloc_clusters_imrt(BlockDriverState *bs,
   2378                                    int cluster_count,
   2379                                    void **refcount_table,
   2380                                    int64_t *imrt_nb_clusters,
   2381                                    int64_t *first_free_cluster)
   2382 {
   2383     BDRVQcow2State *s = bs->opaque;
   2384     int64_t cluster = *first_free_cluster, i;
   2385     bool first_gap = true;
   2386     int contiguous_free_clusters;
   2387     int ret;
   2388 
   2389     /* Starting at *first_free_cluster, find a range of at least cluster_count
   2390      * continuously free clusters */
   2391     for (contiguous_free_clusters = 0;
   2392          cluster < *imrt_nb_clusters &&
   2393          contiguous_free_clusters < cluster_count;
   2394          cluster++)
   2395     {
   2396         if (!s->get_refcount(*refcount_table, cluster)) {
   2397             contiguous_free_clusters++;
   2398             if (first_gap) {
   2399                 /* If this is the first free cluster found, update
   2400                  * *first_free_cluster accordingly */
   2401                 *first_free_cluster = cluster;
   2402                 first_gap = false;
   2403             }
   2404         } else if (contiguous_free_clusters) {
   2405             contiguous_free_clusters = 0;
   2406         }
   2407     }
   2408 
   2409     /* If contiguous_free_clusters is greater than zero, it contains the number
   2410      * of continuously free clusters until the current cluster; the first free
   2411      * cluster in the current "gap" is therefore
   2412      * cluster - contiguous_free_clusters */
   2413 
   2414     /* If no such range could be found, grow the in-memory refcount table
   2415      * accordingly to append free clusters at the end of the image */
   2416     if (contiguous_free_clusters < cluster_count) {
   2417         /* contiguous_free_clusters clusters are already empty at the image end;
   2418          * we need cluster_count clusters; therefore, we have to allocate
   2419          * cluster_count - contiguous_free_clusters new clusters at the end of
   2420          * the image (which is the current value of cluster; note that cluster
   2421          * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
   2422          * the image end) */
   2423         ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
   2424                                      cluster + cluster_count
   2425                                      - contiguous_free_clusters);
   2426         if (ret < 0) {
   2427             return ret;
   2428         }
   2429     }
   2430 
   2431     /* Go back to the first free cluster */
   2432     cluster -= contiguous_free_clusters;
   2433     for (i = 0; i < cluster_count; i++) {
   2434         s->set_refcount(*refcount_table, cluster + i, 1);
   2435     }
   2436 
   2437     return cluster << s->cluster_bits;
   2438 }
   2439 
   2440 /*
   2441  * Helper function for rebuild_refcount_structure().
   2442  *
   2443  * Scan the range of clusters [first_cluster, end_cluster) for allocated
   2444  * clusters and write all corresponding refblocks to disk.  The refblock
   2445  * and allocation data is taken from the in-memory refcount table
   2446  * *refcount_table[] (of size *nb_clusters), which is basically one big
   2447  * (unlimited size) refblock for the whole image.
   2448  *
   2449  * For these refblocks, clusters are allocated using said in-memory
   2450  * refcount table.  Care is taken that these allocations are reflected
   2451  * in the refblocks written to disk.
   2452  *
   2453  * The refblocks' offsets are written into a reftable, which is
   2454  * *on_disk_reftable_ptr[] (of size *on_disk_reftable_entries_ptr).  If
   2455  * that reftable is of insufficient size, it will be resized to fit.
   2456  * This reftable is not written to disk.
   2457  *
   2458  * (If *on_disk_reftable_ptr is not NULL, the entries within are assumed
   2459  * to point to existing valid refblocks that do not need to be allocated
   2460  * again.)
   2461  *
   2462  * Return whether the on-disk reftable array was resized (true/false),
   2463  * or -errno on error.
   2464  */
   2465 static int rebuild_refcounts_write_refblocks(
   2466         BlockDriverState *bs, void **refcount_table, int64_t *nb_clusters,
   2467         int64_t first_cluster, int64_t end_cluster,
   2468         uint64_t **on_disk_reftable_ptr, uint32_t *on_disk_reftable_entries_ptr,
   2469         Error **errp
   2470     )
   2471 {
   2472     BDRVQcow2State *s = bs->opaque;
   2473     int64_t cluster;
   2474     int64_t refblock_offset, refblock_start, refblock_index;
   2475     int64_t first_free_cluster = 0;
   2476     uint64_t *on_disk_reftable = *on_disk_reftable_ptr;
   2477     uint32_t on_disk_reftable_entries = *on_disk_reftable_entries_ptr;
   2478     void *on_disk_refblock;
   2479     bool reftable_grown = false;
   2480     int ret;
   2481 
   2482     for (cluster = first_cluster; cluster < end_cluster; cluster++) {
   2483         /* Check all clusters to find refblocks that contain non-zero entries */
   2484         if (!s->get_refcount(*refcount_table, cluster)) {
   2485             continue;
   2486         }
   2487 
   2488         /*
   2489          * This cluster is allocated, so we need to create a refblock
   2490          * for it.  The data we will write to disk is just the
   2491          * respective slice from *refcount_table, so it will contain
   2492          * accurate refcounts for all clusters belonging to this
   2493          * refblock.  After we have written it, we will therefore skip
   2494          * all remaining clusters in this refblock.
   2495          */
   2496 
   2497         refblock_index = cluster >> s->refcount_block_bits;
   2498         refblock_start = refblock_index << s->refcount_block_bits;
   2499 
   2500         if (on_disk_reftable_entries > refblock_index &&
   2501             on_disk_reftable[refblock_index])
   2502         {
   2503             /*
   2504              * We can get here after a `goto write_refblocks`: We have a
   2505              * reftable from a previous run, and the refblock is already
   2506              * allocated.  No need to allocate it again.
   2507              */
   2508             refblock_offset = on_disk_reftable[refblock_index];
   2509         } else {
   2510             int64_t refblock_cluster_index;
   2511 
   2512             /* Don't allocate a cluster in a refblock already written to disk */
   2513             if (first_free_cluster < refblock_start) {
   2514                 first_free_cluster = refblock_start;
   2515             }
   2516             refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
   2517                                                   nb_clusters,
   2518                                                   &first_free_cluster);
   2519             if (refblock_offset < 0) {
   2520                 error_setg_errno(errp, -refblock_offset,
   2521                                  "ERROR allocating refblock");
   2522                 return refblock_offset;
   2523             }
   2524 
   2525             refblock_cluster_index = refblock_offset / s->cluster_size;
   2526             if (refblock_cluster_index >= end_cluster) {
   2527                 /*
   2528                  * We must write the refblock that holds this refblock's
   2529                  * refcount
   2530                  */
   2531                 end_cluster = refblock_cluster_index + 1;
   2532             }
   2533 
   2534             if (on_disk_reftable_entries <= refblock_index) {
   2535                 on_disk_reftable_entries =
   2536                     ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE,
   2537                              s->cluster_size) / REFTABLE_ENTRY_SIZE;
   2538                 on_disk_reftable =
   2539                     g_try_realloc(on_disk_reftable,
   2540                                   on_disk_reftable_entries *
   2541                                   REFTABLE_ENTRY_SIZE);
   2542                 if (!on_disk_reftable) {
   2543                     error_setg(errp, "ERROR allocating reftable memory");
   2544                     return -ENOMEM;
   2545                 }
   2546 
   2547                 memset(on_disk_reftable + *on_disk_reftable_entries_ptr, 0,
   2548                        (on_disk_reftable_entries -
   2549                         *on_disk_reftable_entries_ptr) *
   2550                        REFTABLE_ENTRY_SIZE);
   2551 
   2552                 *on_disk_reftable_ptr = on_disk_reftable;
   2553                 *on_disk_reftable_entries_ptr = on_disk_reftable_entries;
   2554 
   2555                 reftable_grown = true;
   2556             } else {
   2557                 assert(on_disk_reftable);
   2558             }
   2559             on_disk_reftable[refblock_index] = refblock_offset;
   2560         }
   2561 
   2562         /* Refblock is allocated, write it to disk */
   2563 
   2564         ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
   2565                                             s->cluster_size, false);
   2566         if (ret < 0) {
   2567             error_setg_errno(errp, -ret, "ERROR writing refblock");
   2568             return ret;
   2569         }
   2570 
   2571         /*
   2572          * The refblock is simply a slice of *refcount_table.
   2573          * Note that the size of *refcount_table is always aligned to
   2574          * whole clusters, so the write operation will not result in
   2575          * out-of-bounds accesses.
   2576          */
   2577         on_disk_refblock = (void *)((char *) *refcount_table +
   2578                                     refblock_index * s->cluster_size);
   2579 
   2580         ret = bdrv_pwrite(bs->file, refblock_offset, s->cluster_size,
   2581                           on_disk_refblock, 0);
   2582         if (ret < 0) {
   2583             error_setg_errno(errp, -ret, "ERROR writing refblock");
   2584             return ret;
   2585         }
   2586 
   2587         /* This refblock is done, skip to its end */
   2588         cluster = refblock_start + s->refcount_block_size - 1;
   2589     }
   2590 
   2591     return reftable_grown;
   2592 }
   2593 
   2594 /*
   2595  * Creates a new refcount structure based solely on the in-memory information
   2596  * given through *refcount_table (this in-memory information is basically just
   2597  * the concatenation of all refblocks).  All necessary allocations will be
   2598  * reflected in that array.
   2599  *
   2600  * On success, the old refcount structure is leaked (it will be covered by the
   2601  * new refcount structure).
   2602  */
   2603 static int rebuild_refcount_structure(BlockDriverState *bs,
   2604                                       BdrvCheckResult *res,
   2605                                       void **refcount_table,
   2606                                       int64_t *nb_clusters,
   2607                                       Error **errp)
   2608 {
   2609     BDRVQcow2State *s = bs->opaque;
   2610     int64_t reftable_offset = -1;
   2611     int64_t reftable_length = 0;
   2612     int64_t reftable_clusters;
   2613     int64_t refblock_index;
   2614     uint32_t on_disk_reftable_entries = 0;
   2615     uint64_t *on_disk_reftable = NULL;
   2616     int ret = 0;
   2617     int reftable_size_changed = 0;
   2618     struct {
   2619         uint64_t reftable_offset;
   2620         uint32_t reftable_clusters;
   2621     } QEMU_PACKED reftable_offset_and_clusters;
   2622 
   2623     qcow2_cache_empty(bs, s->refcount_block_cache);
   2624 
   2625     /*
   2626      * For each refblock containing entries, we try to allocate a
   2627      * cluster (in the in-memory refcount table) and write its offset
   2628      * into on_disk_reftable[].  We then write the whole refblock to
   2629      * disk (as a slice of the in-memory refcount table).
   2630      * This is done by rebuild_refcounts_write_refblocks().
   2631      *
   2632      * Once we have scanned all clusters, we try to find space for the
   2633      * reftable.  This will dirty the in-memory refcount table (i.e.
   2634      * make it differ from the refblocks we have already written), so we
   2635      * need to run rebuild_refcounts_write_refblocks() again for the
   2636      * range of clusters where the reftable has been allocated.
   2637      *
   2638      * This second run might make the reftable grow again, in which case
   2639      * we will need to allocate another space for it, which is why we
   2640      * repeat all this until the reftable stops growing.
   2641      *
   2642      * (This loop will terminate, because with every cluster the
   2643      * reftable grows, it can accomodate a multitude of more refcounts,
   2644      * so that at some point this must be able to cover the reftable
   2645      * and all refblocks describing it.)
   2646      *
   2647      * We then convert the reftable to big-endian and write it to disk.
   2648      *
   2649      * Note that we never free any reftable allocations.  Doing so would
   2650      * needlessly complicate the algorithm: The eventual second check
   2651      * run we do will clean up all leaks we have caused.
   2652      */
   2653 
   2654     reftable_size_changed =
   2655         rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
   2656                                           0, *nb_clusters,
   2657                                           &on_disk_reftable,
   2658                                           &on_disk_reftable_entries, errp);
   2659     if (reftable_size_changed < 0) {
   2660         res->check_errors++;
   2661         ret = reftable_size_changed;
   2662         goto fail;
   2663     }
   2664 
   2665     /*
   2666      * There was no reftable before, so rebuild_refcounts_write_refblocks()
   2667      * must have increased its size (from 0 to something).
   2668      */
   2669     assert(reftable_size_changed);
   2670 
   2671     do {
   2672         int64_t reftable_start_cluster, reftable_end_cluster;
   2673         int64_t first_free_cluster = 0;
   2674 
   2675         reftable_length = on_disk_reftable_entries * REFTABLE_ENTRY_SIZE;
   2676         reftable_clusters = size_to_clusters(s, reftable_length);
   2677 
   2678         reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
   2679                                               refcount_table, nb_clusters,
   2680                                               &first_free_cluster);
   2681         if (reftable_offset < 0) {
   2682             error_setg_errno(errp, -reftable_offset,
   2683                              "ERROR allocating reftable");
   2684             res->check_errors++;
   2685             ret = reftable_offset;
   2686             goto fail;
   2687         }
   2688 
   2689         /*
   2690          * We need to update the affected refblocks, so re-run the
   2691          * write_refblocks loop for the reftable's range of clusters.
   2692          */
   2693         assert(offset_into_cluster(s, reftable_offset) == 0);
   2694         reftable_start_cluster = reftable_offset / s->cluster_size;
   2695         reftable_end_cluster = reftable_start_cluster + reftable_clusters;
   2696         reftable_size_changed =
   2697             rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
   2698                                               reftable_start_cluster,
   2699                                               reftable_end_cluster,
   2700                                               &on_disk_reftable,
   2701                                               &on_disk_reftable_entries, errp);
   2702         if (reftable_size_changed < 0) {
   2703             res->check_errors++;
   2704             ret = reftable_size_changed;
   2705             goto fail;
   2706         }
   2707 
   2708         /*
   2709          * If the reftable size has changed, we will need to find a new
   2710          * allocation, repeating the loop.
   2711          */
   2712     } while (reftable_size_changed);
   2713 
   2714     /* The above loop must have run at least once */
   2715     assert(reftable_offset >= 0);
   2716 
   2717     /*
   2718      * All allocations are done, all refblocks are written, convert the
   2719      * reftable to big-endian and write it to disk.
   2720      */
   2721 
   2722     for (refblock_index = 0; refblock_index < on_disk_reftable_entries;
   2723          refblock_index++)
   2724     {
   2725         cpu_to_be64s(&on_disk_reftable[refblock_index]);
   2726     }
   2727 
   2728     ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, reftable_length,
   2729                                         false);
   2730     if (ret < 0) {
   2731         error_setg_errno(errp, -ret, "ERROR writing reftable");
   2732         goto fail;
   2733     }
   2734 
   2735     assert(reftable_length < INT_MAX);
   2736     ret = bdrv_pwrite(bs->file, reftable_offset, reftable_length,
   2737                       on_disk_reftable, 0);
   2738     if (ret < 0) {
   2739         error_setg_errno(errp, -ret, "ERROR writing reftable");
   2740         goto fail;
   2741     }
   2742 
   2743     /* Enter new reftable into the image header */
   2744     reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
   2745     reftable_offset_and_clusters.reftable_clusters =
   2746         cpu_to_be32(reftable_clusters);
   2747     ret = bdrv_pwrite_sync(bs->file,
   2748                            offsetof(QCowHeader, refcount_table_offset),
   2749                            sizeof(reftable_offset_and_clusters),
   2750                            &reftable_offset_and_clusters, 0);
   2751     if (ret < 0) {
   2752         error_setg_errno(errp, -ret, "ERROR setting reftable");
   2753         goto fail;
   2754     }
   2755 
   2756     for (refblock_index = 0; refblock_index < on_disk_reftable_entries;
   2757          refblock_index++)
   2758     {
   2759         be64_to_cpus(&on_disk_reftable[refblock_index]);
   2760     }
   2761     s->refcount_table = on_disk_reftable;
   2762     s->refcount_table_offset = reftable_offset;
   2763     s->refcount_table_size = on_disk_reftable_entries;
   2764     update_max_refcount_table_index(s);
   2765 
   2766     return 0;
   2767 
   2768 fail:
   2769     g_free(on_disk_reftable);
   2770     return ret;
   2771 }
   2772 
   2773 /*
   2774  * Checks an image for refcount consistency.
   2775  *
   2776  * Returns 0 if no errors are found, the number of errors in case the image is
   2777  * detected as corrupted, and -errno when an internal error occurred.
   2778  */
   2779 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
   2780                           BdrvCheckMode fix)
   2781 {
   2782     BDRVQcow2State *s = bs->opaque;
   2783     BdrvCheckResult pre_compare_res;
   2784     int64_t size, highest_cluster, nb_clusters;
   2785     void *refcount_table = NULL;
   2786     bool rebuild = false;
   2787     int ret;
   2788 
   2789     size = bdrv_getlength(bs->file->bs);
   2790     if (size < 0) {
   2791         res->check_errors++;
   2792         return size;
   2793     }
   2794 
   2795     nb_clusters = size_to_clusters(s, size);
   2796     if (nb_clusters > INT_MAX) {
   2797         res->check_errors++;
   2798         return -EFBIG;
   2799     }
   2800 
   2801     res->bfi.total_clusters =
   2802         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
   2803 
   2804     ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
   2805                               &nb_clusters);
   2806     if (ret < 0) {
   2807         goto fail;
   2808     }
   2809 
   2810     /* In case we don't need to rebuild the refcount structure (but want to fix
   2811      * something), this function is immediately called again, in which case the
   2812      * result should be ignored */
   2813     pre_compare_res = *res;
   2814     compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
   2815                       nb_clusters);
   2816 
   2817     if (rebuild && (fix & BDRV_FIX_ERRORS)) {
   2818         BdrvCheckResult old_res = *res;
   2819         int fresh_leaks = 0;
   2820         Error *local_err = NULL;
   2821 
   2822         fprintf(stderr, "Rebuilding refcount structure\n");
   2823         ret = rebuild_refcount_structure(bs, res, &refcount_table,
   2824                                          &nb_clusters, &local_err);
   2825         if (ret < 0) {
   2826             error_report_err(local_err);
   2827             goto fail;
   2828         }
   2829 
   2830         res->corruptions = 0;
   2831         res->leaks = 0;
   2832 
   2833         /* Because the old reftable has been exchanged for a new one the
   2834          * references have to be recalculated */
   2835         rebuild = false;
   2836         memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
   2837         ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
   2838                                   &nb_clusters);
   2839         if (ret < 0) {
   2840             goto fail;
   2841         }
   2842 
   2843         if (fix & BDRV_FIX_LEAKS) {
   2844             /* The old refcount structures are now leaked, fix it; the result
   2845              * can be ignored, aside from leaks which were introduced by
   2846              * rebuild_refcount_structure() that could not be fixed */
   2847             BdrvCheckResult saved_res = *res;
   2848             *res = (BdrvCheckResult){ 0 };
   2849 
   2850             compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
   2851                               &highest_cluster, refcount_table, nb_clusters);
   2852             if (rebuild) {
   2853                 fprintf(stderr, "ERROR rebuilt refcount structure is still "
   2854                         "broken\n");
   2855             }
   2856 
   2857             /* Any leaks accounted for here were introduced by
   2858              * rebuild_refcount_structure() because that function has created a
   2859              * new refcount structure from scratch */
   2860             fresh_leaks = res->leaks;
   2861             *res = saved_res;
   2862         }
   2863 
   2864         if (res->corruptions < old_res.corruptions) {
   2865             res->corruptions_fixed += old_res.corruptions - res->corruptions;
   2866         }
   2867         if (res->leaks < old_res.leaks) {
   2868             res->leaks_fixed += old_res.leaks - res->leaks;
   2869         }
   2870         res->leaks += fresh_leaks;
   2871     } else if (fix) {
   2872         if (rebuild) {
   2873             fprintf(stderr, "ERROR need to rebuild refcount structures\n");
   2874             res->check_errors++;
   2875             ret = -EIO;
   2876             goto fail;
   2877         }
   2878 
   2879         if (res->leaks || res->corruptions) {
   2880             *res = pre_compare_res;
   2881             compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
   2882                               refcount_table, nb_clusters);
   2883         }
   2884     }
   2885 
   2886     /* check OFLAG_COPIED */
   2887     ret = check_oflag_copied(bs, res, fix);
   2888     if (ret < 0) {
   2889         goto fail;
   2890     }
   2891 
   2892     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
   2893     ret = 0;
   2894 
   2895 fail:
   2896     g_free(refcount_table);
   2897 
   2898     return ret;
   2899 }
   2900 
   2901 #define overlaps_with(ofs, sz) \
   2902     ranges_overlap(offset, size, ofs, sz)
   2903 
   2904 /*
   2905  * Checks if the given offset into the image file is actually free to use by
   2906  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
   2907  * i.e. a sanity check without relying on the refcount tables.
   2908  *
   2909  * The ign parameter specifies what checks not to perform (being a bitmask of
   2910  * QCow2MetadataOverlap values), i.e., what sections to ignore.
   2911  *
   2912  * Returns:
   2913  * - 0 if writing to this offset will not affect the mentioned metadata
   2914  * - a positive QCow2MetadataOverlap value indicating one overlapping section
   2915  * - a negative value (-errno) indicating an error while performing a check,
   2916  *   e.g. when bdrv_pread failed on QCOW2_OL_INACTIVE_L2
   2917  */
   2918 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
   2919                                  int64_t size)
   2920 {
   2921     BDRVQcow2State *s = bs->opaque;
   2922     int chk = s->overlap_check & ~ign;
   2923     int i, j;
   2924 
   2925     if (!size) {
   2926         return 0;
   2927     }
   2928 
   2929     if (chk & QCOW2_OL_MAIN_HEADER) {
   2930         if (offset < s->cluster_size) {
   2931             return QCOW2_OL_MAIN_HEADER;
   2932         }
   2933     }
   2934 
   2935     /* align range to test to cluster boundaries */
   2936     size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size);
   2937     offset = start_of_cluster(s, offset);
   2938 
   2939     if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
   2940         if (overlaps_with(s->l1_table_offset, s->l1_size * L1E_SIZE)) {
   2941             return QCOW2_OL_ACTIVE_L1;
   2942         }
   2943     }
   2944 
   2945     if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
   2946         if (overlaps_with(s->refcount_table_offset,
   2947             s->refcount_table_size * REFTABLE_ENTRY_SIZE)) {
   2948             return QCOW2_OL_REFCOUNT_TABLE;
   2949         }
   2950     }
   2951 
   2952     if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
   2953         if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
   2954             return QCOW2_OL_SNAPSHOT_TABLE;
   2955         }
   2956     }
   2957 
   2958     if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
   2959         for (i = 0; i < s->nb_snapshots; i++) {
   2960             if (s->snapshots[i].l1_size &&
   2961                 overlaps_with(s->snapshots[i].l1_table_offset,
   2962                 s->snapshots[i].l1_size * L1E_SIZE)) {
   2963                 return QCOW2_OL_INACTIVE_L1;
   2964             }
   2965         }
   2966     }
   2967 
   2968     if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
   2969         for (i = 0; i < s->l1_size; i++) {
   2970             if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
   2971                 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
   2972                 s->cluster_size)) {
   2973                 return QCOW2_OL_ACTIVE_L2;
   2974             }
   2975         }
   2976     }
   2977 
   2978     if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
   2979         unsigned last_entry = s->max_refcount_table_index;
   2980         assert(last_entry < s->refcount_table_size);
   2981         assert(last_entry + 1 == s->refcount_table_size ||
   2982                (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
   2983         for (i = 0; i <= last_entry; i++) {
   2984             if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
   2985                 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
   2986                 s->cluster_size)) {
   2987                 return QCOW2_OL_REFCOUNT_BLOCK;
   2988             }
   2989         }
   2990     }
   2991 
   2992     if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
   2993         for (i = 0; i < s->nb_snapshots; i++) {
   2994             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
   2995             uint32_t l1_sz  = s->snapshots[i].l1_size;
   2996             uint64_t l1_sz2 = l1_sz * L1E_SIZE;
   2997             uint64_t *l1;
   2998             int ret;
   2999 
   3000             ret = qcow2_validate_table(bs, l1_ofs, l1_sz, L1E_SIZE,
   3001                                        QCOW_MAX_L1_SIZE, "", NULL);
   3002             if (ret < 0) {
   3003                 return ret;
   3004             }
   3005 
   3006             l1 = g_try_malloc(l1_sz2);
   3007 
   3008             if (l1_sz2 && l1 == NULL) {
   3009                 return -ENOMEM;
   3010             }
   3011 
   3012             ret = bdrv_pread(bs->file, l1_ofs, l1_sz2, l1, 0);
   3013             if (ret < 0) {
   3014                 g_free(l1);
   3015                 return ret;
   3016             }
   3017 
   3018             for (j = 0; j < l1_sz; j++) {
   3019                 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
   3020                 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
   3021                     g_free(l1);
   3022                     return QCOW2_OL_INACTIVE_L2;
   3023                 }
   3024             }
   3025 
   3026             g_free(l1);
   3027         }
   3028     }
   3029 
   3030     if ((chk & QCOW2_OL_BITMAP_DIRECTORY) &&
   3031         (s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS))
   3032     {
   3033         if (overlaps_with(s->bitmap_directory_offset,
   3034                           s->bitmap_directory_size))
   3035         {
   3036             return QCOW2_OL_BITMAP_DIRECTORY;
   3037         }
   3038     }
   3039 
   3040     return 0;
   3041 }
   3042 
   3043 static const char *metadata_ol_names[] = {
   3044     [QCOW2_OL_MAIN_HEADER_BITNR]        = "qcow2_header",
   3045     [QCOW2_OL_ACTIVE_L1_BITNR]          = "active L1 table",
   3046     [QCOW2_OL_ACTIVE_L2_BITNR]          = "active L2 table",
   3047     [QCOW2_OL_REFCOUNT_TABLE_BITNR]     = "refcount table",
   3048     [QCOW2_OL_REFCOUNT_BLOCK_BITNR]     = "refcount block",
   3049     [QCOW2_OL_SNAPSHOT_TABLE_BITNR]     = "snapshot table",
   3050     [QCOW2_OL_INACTIVE_L1_BITNR]        = "inactive L1 table",
   3051     [QCOW2_OL_INACTIVE_L2_BITNR]        = "inactive L2 table",
   3052     [QCOW2_OL_BITMAP_DIRECTORY_BITNR]   = "bitmap directory",
   3053 };
   3054 QEMU_BUILD_BUG_ON(QCOW2_OL_MAX_BITNR != ARRAY_SIZE(metadata_ol_names));
   3055 
   3056 /*
   3057  * First performs a check for metadata overlaps (through
   3058  * qcow2_check_metadata_overlap); if that fails with a negative value (error
   3059  * while performing a check), that value is returned. If an impending overlap
   3060  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
   3061  * and -EIO returned.
   3062  *
   3063  * Returns 0 if there were neither overlaps nor errors while checking for
   3064  * overlaps; or a negative value (-errno) on error.
   3065  */
   3066 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
   3067                                   int64_t size, bool data_file)
   3068 {
   3069     int ret;
   3070 
   3071     if (data_file && has_data_file(bs)) {
   3072         return 0;
   3073     }
   3074 
   3075     ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
   3076     if (ret < 0) {
   3077         return ret;
   3078     } else if (ret > 0) {
   3079         int metadata_ol_bitnr = ctz32(ret);
   3080         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
   3081 
   3082         qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
   3083                                 "write on metadata (overlaps with %s)",
   3084                                 metadata_ol_names[metadata_ol_bitnr]);
   3085         return -EIO;
   3086     }
   3087 
   3088     return 0;
   3089 }
   3090 
   3091 /* A pointer to a function of this type is given to walk_over_reftable(). That
   3092  * function will create refblocks and pass them to a RefblockFinishOp once they
   3093  * are completed (@refblock). @refblock_empty is set if the refblock is
   3094  * completely empty.
   3095  *
   3096  * Along with the refblock, a corresponding reftable entry is passed, in the
   3097  * reftable @reftable (which may be reallocated) at @reftable_index.
   3098  *
   3099  * @allocated should be set to true if a new cluster has been allocated.
   3100  */
   3101 typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
   3102                                uint64_t reftable_index, uint64_t *reftable_size,
   3103                                void *refblock, bool refblock_empty,
   3104                                bool *allocated, Error **errp);
   3105 
   3106 /**
   3107  * This "operation" for walk_over_reftable() allocates the refblock on disk (if
   3108  * it is not empty) and inserts its offset into the new reftable. The size of
   3109  * this new reftable is increased as required.
   3110  */
   3111 static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
   3112                           uint64_t reftable_index, uint64_t *reftable_size,
   3113                           void *refblock, bool refblock_empty, bool *allocated,
   3114                           Error **errp)
   3115 {
   3116     BDRVQcow2State *s = bs->opaque;
   3117     int64_t offset;
   3118 
   3119     if (!refblock_empty && reftable_index >= *reftable_size) {
   3120         uint64_t *new_reftable;
   3121         uint64_t new_reftable_size;
   3122 
   3123         new_reftable_size = ROUND_UP(reftable_index + 1,
   3124                                      s->cluster_size / REFTABLE_ENTRY_SIZE);
   3125         if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / REFTABLE_ENTRY_SIZE) {
   3126             error_setg(errp,
   3127                        "This operation would make the refcount table grow "
   3128                        "beyond the maximum size supported by QEMU, aborting");
   3129             return -ENOTSUP;
   3130         }
   3131 
   3132         new_reftable = g_try_realloc(*reftable, new_reftable_size *
   3133                                                 REFTABLE_ENTRY_SIZE);
   3134         if (!new_reftable) {
   3135             error_setg(errp, "Failed to increase reftable buffer size");
   3136             return -ENOMEM;
   3137         }
   3138 
   3139         memset(new_reftable + *reftable_size, 0,
   3140                (new_reftable_size - *reftable_size) * REFTABLE_ENTRY_SIZE);
   3141 
   3142         *reftable      = new_reftable;
   3143         *reftable_size = new_reftable_size;
   3144     }
   3145 
   3146     if (!refblock_empty && !(*reftable)[reftable_index]) {
   3147         offset = qcow2_alloc_clusters(bs, s->cluster_size);
   3148         if (offset < 0) {
   3149             error_setg_errno(errp, -offset, "Failed to allocate refblock");
   3150             return offset;
   3151         }
   3152         (*reftable)[reftable_index] = offset;
   3153         *allocated = true;
   3154     }
   3155 
   3156     return 0;
   3157 }
   3158 
   3159 /**
   3160  * This "operation" for walk_over_reftable() writes the refblock to disk at the
   3161  * offset specified by the new reftable's entry. It does not modify the new
   3162  * reftable or change any refcounts.
   3163  */
   3164 static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
   3165                           uint64_t reftable_index, uint64_t *reftable_size,
   3166                           void *refblock, bool refblock_empty, bool *allocated,
   3167                           Error **errp)
   3168 {
   3169     BDRVQcow2State *s = bs->opaque;
   3170     int64_t offset;
   3171     int ret;
   3172 
   3173     if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
   3174         offset = (*reftable)[reftable_index];
   3175 
   3176         ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size,
   3177                                             false);
   3178         if (ret < 0) {
   3179             error_setg_errno(errp, -ret, "Overlap check failed");
   3180             return ret;
   3181         }
   3182 
   3183         ret = bdrv_pwrite(bs->file, offset, s->cluster_size, refblock, 0);
   3184         if (ret < 0) {
   3185             error_setg_errno(errp, -ret, "Failed to write refblock");
   3186             return ret;
   3187         }
   3188     } else {
   3189         assert(refblock_empty);
   3190     }
   3191 
   3192     return 0;
   3193 }
   3194 
   3195 /**
   3196  * This function walks over the existing reftable and every referenced refblock;
   3197  * if @new_set_refcount is non-NULL, it is called for every refcount entry to
   3198  * create an equal new entry in the passed @new_refblock. Once that
   3199  * @new_refblock is completely filled, @operation will be called.
   3200  *
   3201  * @status_cb and @cb_opaque are used for the amend operation's status callback.
   3202  * @index is the index of the walk_over_reftable() calls and @total is the total
   3203  * number of walk_over_reftable() calls per amend operation. Both are used for
   3204  * calculating the parameters for the status callback.
   3205  *
   3206  * @allocated is set to true if a new cluster has been allocated.
   3207  */
   3208 static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
   3209                               uint64_t *new_reftable_index,
   3210                               uint64_t *new_reftable_size,
   3211                               void *new_refblock, int new_refblock_size,
   3212                               int new_refcount_bits,
   3213                               RefblockFinishOp *operation, bool *allocated,
   3214                               Qcow2SetRefcountFunc *new_set_refcount,
   3215                               BlockDriverAmendStatusCB *status_cb,
   3216                               void *cb_opaque, int index, int total,
   3217                               Error **errp)
   3218 {
   3219     BDRVQcow2State *s = bs->opaque;
   3220     uint64_t reftable_index;
   3221     bool new_refblock_empty = true;
   3222     int refblock_index;
   3223     int new_refblock_index = 0;
   3224     int ret;
   3225 
   3226     for (reftable_index = 0; reftable_index < s->refcount_table_size;
   3227          reftable_index++)
   3228     {
   3229         uint64_t refblock_offset = s->refcount_table[reftable_index]
   3230                                  & REFT_OFFSET_MASK;
   3231 
   3232         status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
   3233                   (uint64_t)total * s->refcount_table_size, cb_opaque);
   3234 
   3235         if (refblock_offset) {
   3236             void *refblock;
   3237 
   3238             if (offset_into_cluster(s, refblock_offset)) {
   3239                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
   3240                                         PRIx64 " unaligned (reftable index: %#"
   3241                                         PRIx64 ")", refblock_offset,
   3242                                         reftable_index);
   3243                 error_setg(errp,
   3244                            "Image is corrupt (unaligned refblock offset)");
   3245                 return -EIO;
   3246             }
   3247 
   3248             ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
   3249                                   &refblock);
   3250             if (ret < 0) {
   3251                 error_setg_errno(errp, -ret, "Failed to retrieve refblock");
   3252                 return ret;
   3253             }
   3254 
   3255             for (refblock_index = 0; refblock_index < s->refcount_block_size;
   3256                  refblock_index++)
   3257             {
   3258                 uint64_t refcount;
   3259 
   3260                 if (new_refblock_index >= new_refblock_size) {
   3261                     /* new_refblock is now complete */
   3262                     ret = operation(bs, new_reftable, *new_reftable_index,
   3263                                     new_reftable_size, new_refblock,
   3264                                     new_refblock_empty, allocated, errp);
   3265                     if (ret < 0) {
   3266                         qcow2_cache_put(s->refcount_block_cache, &refblock);
   3267                         return ret;
   3268                     }
   3269 
   3270                     (*new_reftable_index)++;
   3271                     new_refblock_index = 0;
   3272                     new_refblock_empty = true;
   3273                 }
   3274 
   3275                 refcount = s->get_refcount(refblock, refblock_index);
   3276                 if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
   3277                     uint64_t offset;
   3278 
   3279                     qcow2_cache_put(s->refcount_block_cache, &refblock);
   3280 
   3281                     offset = ((reftable_index << s->refcount_block_bits)
   3282                               + refblock_index) << s->cluster_bits;
   3283 
   3284                     error_setg(errp, "Cannot decrease refcount entry width to "
   3285                                "%i bits: Cluster at offset %#" PRIx64 " has a "
   3286                                "refcount of %" PRIu64, new_refcount_bits,
   3287                                offset, refcount);
   3288                     return -EINVAL;
   3289                 }
   3290 
   3291                 if (new_set_refcount) {
   3292                     new_set_refcount(new_refblock, new_refblock_index++,
   3293                                      refcount);
   3294                 } else {
   3295                     new_refblock_index++;
   3296                 }
   3297                 new_refblock_empty = new_refblock_empty && refcount == 0;
   3298             }
   3299 
   3300             qcow2_cache_put(s->refcount_block_cache, &refblock);
   3301         } else {
   3302             /* No refblock means every refcount is 0 */
   3303             for (refblock_index = 0; refblock_index < s->refcount_block_size;
   3304                  refblock_index++)
   3305             {
   3306                 if (new_refblock_index >= new_refblock_size) {
   3307                     /* new_refblock is now complete */
   3308                     ret = operation(bs, new_reftable, *new_reftable_index,
   3309                                     new_reftable_size, new_refblock,
   3310                                     new_refblock_empty, allocated, errp);
   3311                     if (ret < 0) {
   3312                         return ret;
   3313                     }
   3314 
   3315                     (*new_reftable_index)++;
   3316                     new_refblock_index = 0;
   3317                     new_refblock_empty = true;
   3318                 }
   3319 
   3320                 if (new_set_refcount) {
   3321                     new_set_refcount(new_refblock, new_refblock_index++, 0);
   3322                 } else {
   3323                     new_refblock_index++;
   3324                 }
   3325             }
   3326         }
   3327     }
   3328 
   3329     if (new_refblock_index > 0) {
   3330         /* Complete the potentially existing partially filled final refblock */
   3331         if (new_set_refcount) {
   3332             for (; new_refblock_index < new_refblock_size;
   3333                  new_refblock_index++)
   3334             {
   3335                 new_set_refcount(new_refblock, new_refblock_index, 0);
   3336             }
   3337         }
   3338 
   3339         ret = operation(bs, new_reftable, *new_reftable_index,
   3340                         new_reftable_size, new_refblock, new_refblock_empty,
   3341                         allocated, errp);
   3342         if (ret < 0) {
   3343             return ret;
   3344         }
   3345 
   3346         (*new_reftable_index)++;
   3347     }
   3348 
   3349     status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
   3350               (uint64_t)total * s->refcount_table_size, cb_opaque);
   3351 
   3352     return 0;
   3353 }
   3354 
   3355 int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
   3356                                 BlockDriverAmendStatusCB *status_cb,
   3357                                 void *cb_opaque, Error **errp)
   3358 {
   3359     BDRVQcow2State *s = bs->opaque;
   3360     Qcow2GetRefcountFunc *new_get_refcount;
   3361     Qcow2SetRefcountFunc *new_set_refcount;
   3362     void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
   3363     uint64_t *new_reftable = NULL, new_reftable_size = 0;
   3364     uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
   3365     uint64_t new_reftable_index = 0;
   3366     uint64_t i;
   3367     int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
   3368     int new_refblock_size, new_refcount_bits = 1 << refcount_order;
   3369     int old_refcount_order;
   3370     int walk_index = 0;
   3371     int ret;
   3372     bool new_allocation;
   3373 
   3374     assert(s->qcow_version >= 3);
   3375     assert(refcount_order >= 0 && refcount_order <= 6);
   3376 
   3377     /* see qcow2_open() */
   3378     new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
   3379 
   3380     new_get_refcount = get_refcount_funcs[refcount_order];
   3381     new_set_refcount = set_refcount_funcs[refcount_order];
   3382 
   3383 
   3384     do {
   3385         int total_walks;
   3386 
   3387         new_allocation = false;
   3388 
   3389         /* At least we have to do this walk and the one which writes the
   3390          * refblocks; also, at least we have to do this loop here at least
   3391          * twice (normally), first to do the allocations, and second to
   3392          * determine that everything is correctly allocated, this then makes
   3393          * three walks in total */
   3394         total_walks = MAX(walk_index + 2, 3);
   3395 
   3396         /* First, allocate the structures so they are present in the refcount
   3397          * structures */
   3398         ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
   3399                                  &new_reftable_size, NULL, new_refblock_size,
   3400                                  new_refcount_bits, &alloc_refblock,
   3401                                  &new_allocation, NULL, status_cb, cb_opaque,
   3402                                  walk_index++, total_walks, errp);
   3403         if (ret < 0) {
   3404             goto done;
   3405         }
   3406 
   3407         new_reftable_index = 0;
   3408 
   3409         if (new_allocation) {
   3410             if (new_reftable_offset) {
   3411                 qcow2_free_clusters(
   3412                     bs, new_reftable_offset,
   3413                     allocated_reftable_size * REFTABLE_ENTRY_SIZE,
   3414                     QCOW2_DISCARD_NEVER);
   3415             }
   3416 
   3417             new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
   3418                                                            REFTABLE_ENTRY_SIZE);
   3419             if (new_reftable_offset < 0) {
   3420                 error_setg_errno(errp, -new_reftable_offset,
   3421                                  "Failed to allocate the new reftable");
   3422                 ret = new_reftable_offset;
   3423                 goto done;
   3424             }
   3425             allocated_reftable_size = new_reftable_size;
   3426         }
   3427     } while (new_allocation);
   3428 
   3429     /* Second, write the new refblocks */
   3430     ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
   3431                              &new_reftable_size, new_refblock,
   3432                              new_refblock_size, new_refcount_bits,
   3433                              &flush_refblock, &new_allocation, new_set_refcount,
   3434                              status_cb, cb_opaque, walk_index, walk_index + 1,
   3435                              errp);
   3436     if (ret < 0) {
   3437         goto done;
   3438     }
   3439     assert(!new_allocation);
   3440 
   3441 
   3442     /* Write the new reftable */
   3443     ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
   3444                                         new_reftable_size * REFTABLE_ENTRY_SIZE,
   3445                                         false);
   3446     if (ret < 0) {
   3447         error_setg_errno(errp, -ret, "Overlap check failed");
   3448         goto done;
   3449     }
   3450 
   3451     for (i = 0; i < new_reftable_size; i++) {
   3452         cpu_to_be64s(&new_reftable[i]);
   3453     }
   3454 
   3455     ret = bdrv_pwrite(bs->file, new_reftable_offset,
   3456                       new_reftable_size * REFTABLE_ENTRY_SIZE, new_reftable,
   3457                       0);
   3458 
   3459     for (i = 0; i < new_reftable_size; i++) {
   3460         be64_to_cpus(&new_reftable[i]);
   3461     }
   3462 
   3463     if (ret < 0) {
   3464         error_setg_errno(errp, -ret, "Failed to write the new reftable");
   3465         goto done;
   3466     }
   3467 
   3468 
   3469     /* Empty the refcount cache */
   3470     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
   3471     if (ret < 0) {
   3472         error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
   3473         goto done;
   3474     }
   3475 
   3476     /* Update the image header to point to the new reftable; this only updates
   3477      * the fields which are relevant to qcow2_update_header(); other fields
   3478      * such as s->refcount_table or s->refcount_bits stay stale for now
   3479      * (because we have to restore everything if qcow2_update_header() fails) */
   3480     old_refcount_order  = s->refcount_order;
   3481     old_reftable_size   = s->refcount_table_size;
   3482     old_reftable_offset = s->refcount_table_offset;
   3483 
   3484     s->refcount_order        = refcount_order;
   3485     s->refcount_table_size   = new_reftable_size;
   3486     s->refcount_table_offset = new_reftable_offset;
   3487 
   3488     ret = qcow2_update_header(bs);
   3489     if (ret < 0) {
   3490         s->refcount_order        = old_refcount_order;
   3491         s->refcount_table_size   = old_reftable_size;
   3492         s->refcount_table_offset = old_reftable_offset;
   3493         error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
   3494         goto done;
   3495     }
   3496 
   3497     /* Now update the rest of the in-memory information */
   3498     old_reftable = s->refcount_table;
   3499     s->refcount_table = new_reftable;
   3500     update_max_refcount_table_index(s);
   3501 
   3502     s->refcount_bits = 1 << refcount_order;
   3503     s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
   3504     s->refcount_max += s->refcount_max - 1;
   3505 
   3506     s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
   3507     s->refcount_block_size = 1 << s->refcount_block_bits;
   3508 
   3509     s->get_refcount = new_get_refcount;
   3510     s->set_refcount = new_set_refcount;
   3511 
   3512     /* For cleaning up all old refblocks and the old reftable below the "done"
   3513      * label */
   3514     new_reftable        = old_reftable;
   3515     new_reftable_size   = old_reftable_size;
   3516     new_reftable_offset = old_reftable_offset;
   3517 
   3518 done:
   3519     if (new_reftable) {
   3520         /* On success, new_reftable actually points to the old reftable (and
   3521          * new_reftable_size is the old reftable's size); but that is just
   3522          * fine */
   3523         for (i = 0; i < new_reftable_size; i++) {
   3524             uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
   3525             if (offset) {
   3526                 qcow2_free_clusters(bs, offset, s->cluster_size,
   3527                                     QCOW2_DISCARD_OTHER);
   3528             }
   3529         }
   3530         g_free(new_reftable);
   3531 
   3532         if (new_reftable_offset > 0) {
   3533             qcow2_free_clusters(bs, new_reftable_offset,
   3534                                 new_reftable_size * REFTABLE_ENTRY_SIZE,
   3535                                 QCOW2_DISCARD_OTHER);
   3536         }
   3537     }
   3538 
   3539     qemu_vfree(new_refblock);
   3540     return ret;
   3541 }
   3542 
   3543 static int64_t get_refblock_offset(BlockDriverState *bs, uint64_t offset)
   3544 {
   3545     BDRVQcow2State *s = bs->opaque;
   3546     uint32_t index = offset_to_reftable_index(s, offset);
   3547     int64_t covering_refblock_offset = 0;
   3548 
   3549     if (index < s->refcount_table_size) {
   3550         covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK;
   3551     }
   3552     if (!covering_refblock_offset) {
   3553         qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is "
   3554                                 "not covered by the refcount structures",
   3555                                 offset);
   3556         return -EIO;
   3557     }
   3558 
   3559     return covering_refblock_offset;
   3560 }
   3561 
   3562 static int coroutine_fn
   3563 qcow2_discard_refcount_block(BlockDriverState *bs, uint64_t discard_block_offs)
   3564 {
   3565     BDRVQcow2State *s = bs->opaque;
   3566     int64_t refblock_offs;
   3567     uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
   3568     uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
   3569     void *refblock;
   3570     int ret;
   3571 
   3572     refblock_offs = get_refblock_offset(bs, discard_block_offs);
   3573     if (refblock_offs < 0) {
   3574         return refblock_offs;
   3575     }
   3576 
   3577     assert(discard_block_offs != 0);
   3578 
   3579     ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
   3580                           &refblock);
   3581     if (ret < 0) {
   3582         return ret;
   3583     }
   3584 
   3585     if (s->get_refcount(refblock, block_index) != 1) {
   3586         qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
   3587                                 " refblock offset %#" PRIx64
   3588                                 ", reftable index %u"
   3589                                 ", block offset %#" PRIx64
   3590                                 ", refcount %#" PRIx64,
   3591                                 refblock_offs,
   3592                                 offset_to_reftable_index(s, discard_block_offs),
   3593                                 discard_block_offs,
   3594                                 s->get_refcount(refblock, block_index));
   3595         qcow2_cache_put(s->refcount_block_cache, &refblock);
   3596         return -EINVAL;
   3597     }
   3598     s->set_refcount(refblock, block_index, 0);
   3599 
   3600     qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);
   3601 
   3602     qcow2_cache_put(s->refcount_block_cache, &refblock);
   3603 
   3604     if (cluster_index < s->free_cluster_index) {
   3605         s->free_cluster_index = cluster_index;
   3606     }
   3607 
   3608     refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
   3609                                            discard_block_offs);
   3610     if (refblock) {
   3611         /* discard refblock from the cache if refblock is cached */
   3612         qcow2_cache_discard(s->refcount_block_cache, refblock);
   3613     }
   3614     update_refcount_discard(bs, discard_block_offs, s->cluster_size);
   3615 
   3616     return 0;
   3617 }
   3618 
   3619 int coroutine_fn qcow2_shrink_reftable(BlockDriverState *bs)
   3620 {
   3621     BDRVQcow2State *s = bs->opaque;
   3622     uint64_t *reftable_tmp =
   3623         g_malloc(s->refcount_table_size * REFTABLE_ENTRY_SIZE);
   3624     int i, ret;
   3625 
   3626     for (i = 0; i < s->refcount_table_size; i++) {
   3627         int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
   3628         void *refblock;
   3629         bool unused_block;
   3630 
   3631         if (refblock_offs == 0) {
   3632             reftable_tmp[i] = 0;
   3633             continue;
   3634         }
   3635         ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
   3636                               &refblock);
   3637         if (ret < 0) {
   3638             goto out;
   3639         }
   3640 
   3641         /* the refblock has own reference */
   3642         if (i == offset_to_reftable_index(s, refblock_offs)) {
   3643             uint64_t block_index = (refblock_offs >> s->cluster_bits) &
   3644                                    (s->refcount_block_size - 1);
   3645             uint64_t refcount = s->get_refcount(refblock, block_index);
   3646 
   3647             s->set_refcount(refblock, block_index, 0);
   3648 
   3649             unused_block = buffer_is_zero(refblock, s->cluster_size);
   3650 
   3651             s->set_refcount(refblock, block_index, refcount);
   3652         } else {
   3653             unused_block = buffer_is_zero(refblock, s->cluster_size);
   3654         }
   3655         qcow2_cache_put(s->refcount_block_cache, &refblock);
   3656 
   3657         reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
   3658     }
   3659 
   3660     ret = bdrv_co_pwrite_sync(bs->file, s->refcount_table_offset,
   3661                               s->refcount_table_size * REFTABLE_ENTRY_SIZE,
   3662                               reftable_tmp, 0);
   3663     /*
   3664      * If the write in the reftable failed the image may contain a partially
   3665      * overwritten reftable. In this case it would be better to clear the
   3666      * reftable in memory to avoid possible image corruption.
   3667      */
   3668     for (i = 0; i < s->refcount_table_size; i++) {
   3669         if (s->refcount_table[i] && !reftable_tmp[i]) {
   3670             if (ret == 0) {
   3671                 ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
   3672                                                        REFT_OFFSET_MASK);
   3673             }
   3674             s->refcount_table[i] = 0;
   3675         }
   3676     }
   3677 
   3678     if (!s->cache_discards) {
   3679         qcow2_process_discards(bs, ret);
   3680     }
   3681 
   3682 out:
   3683     g_free(reftable_tmp);
   3684     return ret;
   3685 }
   3686 
   3687 int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
   3688 {
   3689     BDRVQcow2State *s = bs->opaque;
   3690     int64_t i;
   3691 
   3692     for (i = size_to_clusters(s, size) - 1; i >= 0; i--) {
   3693         uint64_t refcount;
   3694         int ret = qcow2_get_refcount(bs, i, &refcount);
   3695         if (ret < 0) {
   3696             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
   3697                     i, strerror(-ret));
   3698             return ret;
   3699         }
   3700         if (refcount > 0) {
   3701             return i;
   3702         }
   3703     }
   3704     qcow2_signal_corruption(bs, true, -1, -1,
   3705                             "There are no references in the refcount table.");
   3706     return -EIO;
   3707 }
   3708 
   3709 int coroutine_fn qcow2_detect_metadata_preallocation(BlockDriverState *bs)
   3710 {
   3711     BDRVQcow2State *s = bs->opaque;
   3712     int64_t i, end_cluster, cluster_count = 0, threshold;
   3713     int64_t file_length, real_allocation, real_clusters;
   3714 
   3715     qemu_co_mutex_assert_locked(&s->lock);
   3716 
   3717     file_length = bdrv_getlength(bs->file->bs);
   3718     if (file_length < 0) {
   3719         return file_length;
   3720     }
   3721 
   3722     real_allocation = bdrv_get_allocated_file_size(bs->file->bs);
   3723     if (real_allocation < 0) {
   3724         return real_allocation;
   3725     }
   3726 
   3727     real_clusters = real_allocation / s->cluster_size;
   3728     threshold = MAX(real_clusters * 10 / 9, real_clusters + 2);
   3729 
   3730     end_cluster = size_to_clusters(s, file_length);
   3731     for (i = 0; i < end_cluster && cluster_count < threshold; i++) {
   3732         uint64_t refcount;
   3733         int ret = qcow2_get_refcount(bs, i, &refcount);
   3734         if (ret < 0) {
   3735             return ret;
   3736         }
   3737         cluster_count += !!refcount;
   3738     }
   3739 
   3740     return cluster_count >= threshold;
   3741 }