qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

preallocate.c (17480B)


      1 /*
      2  * preallocate filter driver
      3  *
      4  * The driver performs preallocate operation: it is injected above
      5  * some node, and before each write over EOF it does additional preallocating
      6  * write-zeroes request.
      7  *
      8  * Copyright (c) 2020 Virtuozzo International GmbH.
      9  *
     10  * Author:
     11  *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
     12  *
     13  * This program is free software; you can redistribute it and/or modify
     14  * it under the terms of the GNU General Public License as published by
     15  * the Free Software Foundation; either version 2 of the License, or
     16  * (at your option) any later version.
     17  *
     18  * This program is distributed in the hope that it will be useful,
     19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     21  * GNU General Public License for more details.
     22  *
     23  * You should have received a copy of the GNU General Public License
     24  * along with this program. If not, see <http://www.gnu.org/licenses/>.
     25  */
     26 
     27 #include "qemu/osdep.h"
     28 
     29 #include "qapi/error.h"
     30 #include "qemu/module.h"
     31 #include "qemu/option.h"
     32 #include "qemu/units.h"
     33 #include "block/block_int.h"
     34 
     35 
/* User-configurable parameters of the preallocate filter. */
typedef struct PreallocateOpts {
    /* Value of the "prealloc-size" option (128 MiB if not given). */
    int64_t prealloc_size;
    /* Value of the "prealloc-align" option (1 MiB if not given). */
    int64_t prealloc_align;
} PreallocateOpts;
     40 
/*
 * Per-node state of the preallocate filter (kept in bs->opaque): the current
 * options plus three cached offsets describing the file child.
 */
typedef struct BDRVPreallocateState {
    PreallocateOpts opts;

    /*
     * Track real data end, to crop preallocation on close. If < 0 the status is
     * unknown.
     *
     * @data_end is the maximum of the file size on open (or when we get
     * write/resize permissions) and the ends of all write requests after
     * that. So it's safe to truncate to data_end if it is valid.
     */
    int64_t data_end;

    /*
     * Start of trailing preallocated area which reads as zero. May be smaller
     * than data_end, if the user does an over-EOF write-zeroes operation.
     * If < 0 the status is unknown.
     *
     * If both @zero_start and @file_end are valid, the region
     * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end
     * is not valid, @zero_start doesn't make much sense.
     */
    int64_t zero_start;

    /*
     * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs),
     * to avoid extra lseek() calls on each write operation. If < 0 the status
     * is unknown.
     */
    int64_t file_end;

    /*
     * All three states @data_end, @zero_start and @file_end are guaranteed to
     * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and
     * BLK_PERM_WRITE permissions on file child.
     */
} BDRVPreallocateState;
     78 
/* Names of the runtime options accepted by the filter. */
#define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
#define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"
/*
 * Option descriptions used by preallocate_absorb_opts() to extract the
 * filter's options from the options QDict. Both options are sizes.
 */
static QemuOptsList runtime_opts = {
    .name = "preallocate",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            .name = PREALLOCATE_OPT_PREALLOC_ALIGN,
            .type = QEMU_OPT_SIZE,
            .help = "on preallocation, align file length to this number, "
                "default 1M",
        },
        {
            .name = PREALLOCATE_OPT_PREALLOC_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "how much to preallocate, default 128M",
        },
        { /* end of list */ }
    },
};
     99 
    100 static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options,
    101                                     BlockDriverState *child_bs, Error **errp)
    102 {
    103     QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
    104 
    105     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
    106         return false;
    107     }
    108 
    109     dest->prealloc_align =
    110         qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB);
    111     dest->prealloc_size =
    112         qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB);
    113 
    114     qemu_opts_del(opts);
    115 
    116     if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) {
    117         error_setg(errp, "prealloc-align parameter of preallocate filter "
    118                    "is not aligned to %llu", BDRV_SECTOR_SIZE);
    119         return false;
    120     }
    121 
    122     if (!QEMU_IS_ALIGNED(dest->prealloc_align,
    123                          child_bs->bl.request_alignment)) {
    124         error_setg(errp, "prealloc-align parameter of preallocate filter "
    125                    "is not aligned to underlying node request alignment "
    126                    "(%" PRIi32 ")", child_bs->bl.request_alignment);
    127         return false;
    128     }
    129 
    130     return true;
    131 }
    132 
    133 static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
    134                             Error **errp)
    135 {
    136     BDRVPreallocateState *s = bs->opaque;
    137     int ret;
    138 
    139     /*
    140      * s->data_end and friends should be initialized on permission update.
    141      * For this to work, mark them invalid.
    142      */
    143     s->file_end = s->zero_start = s->data_end = -EINVAL;
    144 
    145     ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
    146     if (ret < 0) {
    147         return ret;
    148     }
    149 
    150     if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) {
    151         return -EINVAL;
    152     }
    153 
    154     bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
    155         (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
    156 
    157     bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
    158         ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
    159             bs->file->bs->supported_zero_flags);
    160 
    161     return 0;
    162 }
    163 
    164 static void preallocate_close(BlockDriverState *bs)
    165 {
    166     int ret;
    167     BDRVPreallocateState *s = bs->opaque;
    168 
    169     if (s->data_end < 0) {
    170         return;
    171     }
    172 
    173     if (s->file_end < 0) {
    174         s->file_end = bdrv_getlength(bs->file->bs);
    175         if (s->file_end < 0) {
    176             return;
    177         }
    178     }
    179 
    180     if (s->data_end < s->file_end) {
    181         ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0,
    182                             NULL);
    183         s->file_end = ret < 0 ? ret : s->data_end;
    184     }
    185 }
    186 
    187 
    188 /*
    189  * Handle reopen.
    190  *
    191  * We must implement reopen handlers, otherwise reopen just don't work. Handle
    192  * new options and don't care about preallocation state, as it is handled in
    193  * set/check permission handlers.
    194  */
    195 
    196 static int preallocate_reopen_prepare(BDRVReopenState *reopen_state,
    197                                       BlockReopenQueue *queue, Error **errp)
    198 {
    199     PreallocateOpts *opts = g_new0(PreallocateOpts, 1);
    200 
    201     if (!preallocate_absorb_opts(opts, reopen_state->options,
    202                                  reopen_state->bs->file->bs, errp)) {
    203         g_free(opts);
    204         return -EINVAL;
    205     }
    206 
    207     reopen_state->opaque = opts;
    208 
    209     return 0;
    210 }
    211 
    212 static void preallocate_reopen_commit(BDRVReopenState *state)
    213 {
    214     BDRVPreallocateState *s = state->bs->opaque;
    215 
    216     s->opts = *(PreallocateOpts *)state->opaque;
    217 
    218     g_free(state->opaque);
    219     state->opaque = NULL;
    220 }
    221 
    222 static void preallocate_reopen_abort(BDRVReopenState *state)
    223 {
    224     g_free(state->opaque);
    225     state->opaque = NULL;
    226 }
    227 
    228 static coroutine_fn int preallocate_co_preadv_part(
    229         BlockDriverState *bs, int64_t offset, int64_t bytes,
    230         QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags)
    231 {
    232     return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
    233                                flags);
    234 }
    235 
    236 static int coroutine_fn preallocate_co_pdiscard(BlockDriverState *bs,
    237                                                int64_t offset, int64_t bytes)
    238 {
    239     return bdrv_co_pdiscard(bs->file, offset, bytes);
    240 }
    241 
    242 static bool can_write_resize(uint64_t perm)
    243 {
    244     return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE);
    245 }
    246 
    247 static bool has_prealloc_perms(BlockDriverState *bs)
    248 {
    249     BDRVPreallocateState *s = bs->opaque;
    250 
    251     if (can_write_resize(bs->file->perm)) {
    252         assert(!(bs->file->shared_perm & BLK_PERM_WRITE));
    253         assert(!(bs->file->shared_perm & BLK_PERM_RESIZE));
    254         return true;
    255     }
    256 
    257     assert(s->data_end < 0);
    258     assert(s->zero_start < 0);
    259     assert(s->file_end < 0);
    260     return false;
    261 }
    262 
    263 /*
    264  * Call on each write. Returns true if @want_merge_zero is true and the region
    265  * [offset, offset + bytes) is zeroed (as a result of this call or earlier
    266  * preallocation).
    267  *
    268  * want_merge_zero is used to merge write-zero request with preallocation in
    269  * one bdrv_co_pwrite_zeroes() call.
    270  */
    271 static bool coroutine_fn handle_write(BlockDriverState *bs, int64_t offset,
    272                                       int64_t bytes, bool want_merge_zero)
    273 {
    274     BDRVPreallocateState *s = bs->opaque;
    275     int64_t end = offset + bytes;
    276     int64_t prealloc_start, prealloc_end;
    277     int ret;
    278     uint32_t file_align = bs->file->bs->bl.request_alignment;
    279     uint32_t prealloc_align = MAX(s->opts.prealloc_align, file_align);
    280 
    281     assert(QEMU_IS_ALIGNED(prealloc_align, file_align));
    282 
    283     if (!has_prealloc_perms(bs)) {
    284         /* We don't have state neither should try to recover it */
    285         return false;
    286     }
    287 
    288     if (s->data_end < 0) {
    289         s->data_end = bdrv_getlength(bs->file->bs);
    290         if (s->data_end < 0) {
    291             return false;
    292         }
    293 
    294         if (s->file_end < 0) {
    295             s->file_end = s->data_end;
    296         }
    297     }
    298 
    299     if (end <= s->data_end) {
    300         return false;
    301     }
    302 
    303     /* We have valid s->data_end, and request writes beyond it. */
    304 
    305     s->data_end = end;
    306     if (s->zero_start < 0 || !want_merge_zero) {
    307         s->zero_start = end;
    308     }
    309 
    310     if (s->file_end < 0) {
    311         s->file_end = bdrv_getlength(bs->file->bs);
    312         if (s->file_end < 0) {
    313             return false;
    314         }
    315     }
    316 
    317     /* Now s->data_end, s->zero_start and s->file_end are valid. */
    318 
    319     if (end <= s->file_end) {
    320         /* No preallocation needed. */
    321         return want_merge_zero && offset >= s->zero_start;
    322     }
    323 
    324     /* Now we want new preallocation, as request writes beyond s->file_end. */
    325 
    326     prealloc_start = QEMU_ALIGN_UP(
    327             want_merge_zero ? MIN(offset, s->file_end) : s->file_end,
    328             file_align);
    329     prealloc_end = QEMU_ALIGN_UP(
    330             MAX(prealloc_start, end) + s->opts.prealloc_size,
    331             prealloc_align);
    332 
    333     want_merge_zero = want_merge_zero && (prealloc_start <= offset);
    334 
    335     ret = bdrv_co_pwrite_zeroes(
    336             bs->file, prealloc_start, prealloc_end - prealloc_start,
    337             BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT);
    338     if (ret < 0) {
    339         s->file_end = ret;
    340         return false;
    341     }
    342 
    343     s->file_end = prealloc_end;
    344     return want_merge_zero;
    345 }
    346 
    347 static int coroutine_fn preallocate_co_pwrite_zeroes(BlockDriverState *bs,
    348         int64_t offset, int64_t bytes, BdrvRequestFlags flags)
    349 {
    350     bool want_merge_zero =
    351         !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK));
    352     if (handle_write(bs, offset, bytes, want_merge_zero)) {
    353         return 0;
    354     }
    355 
    356     return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
    357 }
    358 
    359 static coroutine_fn int preallocate_co_pwritev_part(BlockDriverState *bs,
    360                                                     int64_t offset,
    361                                                     int64_t bytes,
    362                                                     QEMUIOVector *qiov,
    363                                                     size_t qiov_offset,
    364                                                     BdrvRequestFlags flags)
    365 {
    366     handle_write(bs, offset, bytes, false);
    367 
    368     return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
    369                                 flags);
    370 }
    371 
/*
 * Truncate the node to @offset.
 *
 * When the filter tracks a valid data_end and the node grows beyond it, the
 * filter's own preallocation is either reused (PREALLOC_MODE_FALLOC and
 * @offset within the cached file end) or dropped first; the request is then
 * forwarded to the file child.
 *
 * Returns 0 on success, a negative errno value on failure (with @errp set).
 * If the forwarded truncate fails, all cached offsets are invalidated.
 */
static int coroutine_fn
preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
                        bool exact, PreallocMode prealloc,
                        BdrvRequestFlags flags, Error **errp)
{
    ERRP_GUARD();
    BDRVPreallocateState *s = bs->opaque;
    int ret;

    if (s->data_end >= 0 && offset > s->data_end) {
        /* Growing past tracked data: the real file end is needed below. */
        if (s->file_end < 0) {
            s->file_end = bdrv_getlength(bs->file->bs);
            if (s->file_end < 0) {
                error_setg(errp, "failed to get file length");
                return s->file_end;
            }
        }

        if (prealloc == PREALLOC_MODE_FALLOC) {
            /*
             * If offset <= s->file_end, the task is already done, just
             * update s->data_end, to move part of "filter preallocation"
             * to "preallocation requested by user".
             * Otherwise just proceed to preallocate missing part.
             */
            if (offset <= s->file_end) {
                s->data_end = offset;
                return 0;
            }
        } else {
            /*
             * We have to drop our preallocation, to
             * - avoid "Cannot use preallocation for shrinking files" in
             *   case of offset < file_end
             * - give PREALLOC_MODE_OFF a chance to keep small disk
             *   usage
             * - give PREALLOC_MODE_FULL a chance to actually write the
             *   whole region as user expects
             */
            if (s->file_end > s->data_end) {
                ret = bdrv_co_truncate(bs->file, s->data_end, true,
                                       PREALLOC_MODE_OFF, 0, errp);
                if (ret < 0) {
                    /* The file length is unknown after a failed truncate. */
                    s->file_end = ret;
                    error_prepend(errp, "preallocate-filter: failed to drop "
                                  "write-zero preallocation: ");
                    return ret;
                }
                s->file_end = s->data_end;
            }
        }

        s->data_end = offset;
    }

    /* Forward the actual request to the file child. */
    ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
    if (ret < 0) {
        /* Nothing can be assumed about the file any more. */
        s->file_end = s->zero_start = s->data_end = ret;
        return ret;
    }

    /* Only cache the new length while we hold write+resize permissions. */
    if (has_prealloc_perms(bs)) {
        s->file_end = s->zero_start = s->data_end = offset;
    }
    return 0;
}
    438 
    439 static int coroutine_fn preallocate_co_flush(BlockDriverState *bs)
    440 {
    441     return bdrv_co_flush(bs->file->bs);
    442 }
    443 
    444 static int64_t preallocate_getlength(BlockDriverState *bs)
    445 {
    446     int64_t ret;
    447     BDRVPreallocateState *s = bs->opaque;
    448 
    449     if (s->data_end >= 0) {
    450         return s->data_end;
    451     }
    452 
    453     ret = bdrv_getlength(bs->file->bs);
    454 
    455     if (has_prealloc_perms(bs)) {
    456         s->file_end = s->zero_start = s->data_end = ret;
    457     }
    458 
    459     return ret;
    460 }
    461 
    462 static int preallocate_check_perm(BlockDriverState *bs,
    463                                   uint64_t perm, uint64_t shared, Error **errp)
    464 {
    465     BDRVPreallocateState *s = bs->opaque;
    466 
    467     if (s->data_end >= 0 && !can_write_resize(perm)) {
    468         /*
    469          * Lose permissions.
    470          * We should truncate in check_perm, as in set_perm bs->file->perm will
    471          * be already changed, and we should not violate it.
    472          */
    473         if (s->file_end < 0) {
    474             s->file_end = bdrv_getlength(bs->file->bs);
    475             if (s->file_end < 0) {
    476                 error_setg(errp, "Failed to get file length");
    477                 return s->file_end;
    478             }
    479         }
    480 
    481         if (s->data_end < s->file_end) {
    482             int ret = bdrv_truncate(bs->file, s->data_end, true,
    483                                     PREALLOC_MODE_OFF, 0, NULL);
    484             if (ret < 0) {
    485                 error_setg(errp, "Failed to drop preallocation");
    486                 s->file_end = ret;
    487                 return ret;
    488             }
    489             s->file_end = s->data_end;
    490         }
    491     }
    492 
    493     return 0;
    494 }
    495 
    496 static void preallocate_set_perm(BlockDriverState *bs,
    497                                  uint64_t perm, uint64_t shared)
    498 {
    499     BDRVPreallocateState *s = bs->opaque;
    500 
    501     if (can_write_resize(perm)) {
    502         if (s->data_end < 0) {
    503             s->data_end = s->file_end = s->zero_start =
    504                 bdrv_getlength(bs->file->bs);
    505         }
    506     } else {
    507         /*
    508          * We drop our permissions, as well as allow shared
    509          * permissions (see preallocate_child_perm), anyone will be able to
    510          * change the child, so mark all states invalid. We'll regain control if
    511          * get good permissions back.
    512          */
    513         s->data_end = s->file_end = s->zero_start = -EINVAL;
    514     }
    515 }
    516 
    517 static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
    518     BdrvChildRole role, BlockReopenQueue *reopen_queue,
    519     uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared)
    520 {
    521     bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared);
    522 
    523     if (can_write_resize(perm)) {
    524         /* This should come by default, but let's enforce: */
    525         *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
    526 
    527         /*
    528          * Don't share, to keep our states s->file_end, s->data_end and
    529          * s->zero_start valid.
    530          */
    531         *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
    532     }
    533 }
    534 
    535 BlockDriver bdrv_preallocate_filter = {
    536     .format_name = "preallocate",
    537     .instance_size = sizeof(BDRVPreallocateState),
    538 
    539     .bdrv_getlength = preallocate_getlength,
    540     .bdrv_open = preallocate_open,
    541     .bdrv_close = preallocate_close,
    542 
    543     .bdrv_reopen_prepare  = preallocate_reopen_prepare,
    544     .bdrv_reopen_commit   = preallocate_reopen_commit,
    545     .bdrv_reopen_abort    = preallocate_reopen_abort,
    546 
    547     .bdrv_co_preadv_part = preallocate_co_preadv_part,
    548     .bdrv_co_pwritev_part = preallocate_co_pwritev_part,
    549     .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes,
    550     .bdrv_co_pdiscard = preallocate_co_pdiscard,
    551     .bdrv_co_flush = preallocate_co_flush,
    552     .bdrv_co_truncate = preallocate_co_truncate,
    553 
    554     .bdrv_check_perm = preallocate_check_perm,
    555     .bdrv_set_perm = preallocate_set_perm,
    556     .bdrv_child_perm = preallocate_child_perm,
    557 
    558     .has_variable_length = true,
    559     .is_filter = true,
    560 };
    561 
/* Register the preallocate filter driver with the block layer. */
static void bdrv_preallocate_init(void)
{
    bdrv_register(&bdrv_preallocate_filter);
}

/* Hook the registration function into block-module initialization. */
block_init(bdrv_preallocate_init);