qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

block-common.h (17271B)


      1 /*
      2  * QEMU System Emulator block driver
      3  *
      4  * Copyright (c) 2003 Fabrice Bellard
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a copy
      7  * of this software and associated documentation files (the "Software"), to deal
      8  * in the Software without restriction, including without limitation the rights
      9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     10  * copies of the Software, and to permit persons to whom the Software is
     11  * furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice shall be included in
     14  * all copies or substantial portions of the Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     22  * THE SOFTWARE.
     23  */
     24 #ifndef BLOCK_COMMON_H
     25 #define BLOCK_COMMON_H
     26 
     27 #include "block/aio.h"
     28 #include "block/aio-wait.h"
     29 #include "qemu/iov.h"
     30 #include "qemu/coroutine.h"
     31 #include "block/accounting.h"
     32 #include "block/dirty-bitmap.h"
     33 #include "block/blockjob.h"
     34 #include "qemu/hbitmap.h"
     35 #include "qemu/transactions.h"
     36 
     37 /*
     38  * generated_co_wrapper
     39  *
     40  * Function specifier, which does nothing but mark functions to be
     41  * generated by scripts/block-coroutine-wrapper.py
     42  *
     43  * Read more in docs/devel/block-coroutine-wrapper.rst
     44  */
     45 #define generated_co_wrapper
     46 
     47 /* block.c */
     48 typedef struct BlockDriver BlockDriver;
     49 typedef struct BdrvChild BdrvChild;
     50 typedef struct BdrvChildClass BdrvChildClass;
     51 
     52 typedef struct BlockDriverInfo {
     53     /* in bytes, 0 if irrelevant */
     54     int cluster_size;
     55     /* offset at which the VM state can be saved (0 if not possible) */
     56     int64_t vm_state_offset;
     57     bool is_dirty;
     58     /*
     59      * True if this block driver only supports compressed writes
     60      */
     61     bool needs_compressed_writes;
     62 } BlockDriverInfo;
     63 
     64 typedef struct BlockFragInfo {
     65     uint64_t allocated_clusters;
     66     uint64_t total_clusters;
     67     uint64_t fragmented_clusters;
     68     uint64_t compressed_clusters;
     69 } BlockFragInfo;
     70 
     71 typedef enum {
     72     BDRV_REQ_COPY_ON_READ       = 0x1,
     73     BDRV_REQ_ZERO_WRITE         = 0x2,
     74 
     75     /*
     76      * The BDRV_REQ_MAY_UNMAP flag is used in write_zeroes requests to indicate
     77      * that the block driver should unmap (discard) blocks if it is guaranteed
     78      * that the result will read back as zeroes. The flag is only passed to the
     79      * driver if the block device is opened with BDRV_O_UNMAP.
     80      */
     81     BDRV_REQ_MAY_UNMAP          = 0x4,
     82 
     83     /*
     84      * An optimization hint when all QEMUIOVector elements are within
     85      * previously registered bdrv_register_buf() memory ranges.
     86      *
     87      * Code that replaces the user's QEMUIOVector elements with bounce buffers
     88      * must take care to clear this flag.
     89      */
     90     BDRV_REQ_REGISTERED_BUF     = 0x8,
     91 
     92     BDRV_REQ_FUA                = 0x10,
     93     BDRV_REQ_WRITE_COMPRESSED   = 0x20,
     94 
     95     /*
     96      * Signifies that this write request will not change the visible disk
     97      * content.
     98      */
     99     BDRV_REQ_WRITE_UNCHANGED    = 0x40,
    100 
    101     /*
    102      * Forces request serialisation. Use only with write requests.
    103      */
    104     BDRV_REQ_SERIALISING        = 0x80,
    105 
    106     /*
    107      * Execute the request only if the operation can be offloaded or otherwise
    108      * be executed efficiently, but return an error instead of using a slow
    109      * fallback.
    110      */
    111     BDRV_REQ_NO_FALLBACK        = 0x100,
    112 
    113     /*
    114      * BDRV_REQ_PREFETCH makes sense only in the context of copy-on-read
    115      * (i.e., together with the BDRV_REQ_COPY_ON_READ flag or when a COR
    116      * filter is involved), in which case it signals that the COR operation
    117      * need not read the data into memory (qiov) but only ensure they are
    118      * copied to the top layer (i.e., that COR operation is done).
    119      */
    120     BDRV_REQ_PREFETCH  = 0x200,
    121 
    122     /*
    123      * If we need to wait for other requests, just fail immediately. Used
    124      * only together with BDRV_REQ_SERIALISING. Used only with requests aligned
    125      * to request_alignment (corresponding assertions are in block/io.c).
    126      */
    127     BDRV_REQ_NO_WAIT = 0x400,
    128 
    129     /* Mask of valid flags */
    130     BDRV_REQ_MASK               = 0x7ff,
    131 } BdrvRequestFlags;
    132 
    133 #define BDRV_O_NO_SHARE    0x0001 /* don't share permissions */
    134 #define BDRV_O_RDWR        0x0002
    135 #define BDRV_O_RESIZE      0x0004 /* request permission for resizing the node */
    136 #define BDRV_O_SNAPSHOT    0x0008 /* open the file read only and save
    137                                      writes in a snapshot */
    138 #define BDRV_O_TEMPORARY   0x0010 /* delete the file after use */
    139 #define BDRV_O_NOCACHE     0x0020 /* do not use the host page cache */
    140 #define BDRV_O_NATIVE_AIO  0x0080 /* use native AIO instead of the
    141                                      thread pool */
    142 #define BDRV_O_NO_BACKING  0x0100 /* don't open the backing file */
    143 #define BDRV_O_NO_FLUSH    0x0200 /* disable flushing on this disk */
    144 #define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */
    145 #define BDRV_O_INACTIVE    0x0800  /* consistency hint for migration handoff */
    146 #define BDRV_O_CHECK       0x1000  /* open solely for consistency check */
    147 #define BDRV_O_ALLOW_RDWR  0x2000  /* allow reopen to change from r/o to r/w */
    148 #define BDRV_O_UNMAP       0x4000  /* execute guest UNMAP/TRIM operations */
    149 #define BDRV_O_PROTOCOL    0x8000  /* if no block driver is explicitly given:
    150                                       select an appropriate protocol driver,
    151                                       ignoring the format layer */
    152 #define BDRV_O_NO_IO       0x10000 /* don't initialize for I/O */
    153 #define BDRV_O_AUTO_RDONLY 0x20000 /* degrade to read-only if opening
    154                                       read-write fails */
    155 #define BDRV_O_IO_URING    0x40000 /* use io_uring instead of the thread pool */
    156 
    157 #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
    158 
    159 
    160 /* Option names of options parsed by the block layer */
    161 
    162 #define BDRV_OPT_CACHE_WB       "cache.writeback"
    163 #define BDRV_OPT_CACHE_DIRECT   "cache.direct"
    164 #define BDRV_OPT_CACHE_NO_FLUSH "cache.no-flush"
    165 #define BDRV_OPT_READ_ONLY      "read-only"
    166 #define BDRV_OPT_AUTO_READ_ONLY "auto-read-only"
    167 #define BDRV_OPT_DISCARD        "discard"
    168 #define BDRV_OPT_FORCE_SHARE    "force-share"
    169 
    170 
    171 #define BDRV_SECTOR_BITS   9
    172 #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
    173 
    174 #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
    175                                            INT_MAX >> BDRV_SECTOR_BITS)
    176 #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
    177 
    178 /*
    179  * We want allow aligning requests and disk length up to any 32bit alignment
    180  * and don't afraid of overflow.
    181  * To achieve it, and in the same time use some pretty number as maximum disk
    182  * size, let's define maximum "length" (a limit for any offset/bytes request and
    183  * for disk size) to be the greatest power of 2 less than INT64_MAX.
    184  */
    185 #define BDRV_MAX_ALIGNMENT (1L << 30)
    186 #define BDRV_MAX_LENGTH (QEMU_ALIGN_DOWN(INT64_MAX, BDRV_MAX_ALIGNMENT))
    187 
    188 /*
    189  * Allocation status flags for bdrv_block_status() and friends.
    190  *
    191  * Public flags:
    192  * BDRV_BLOCK_DATA: allocation for data at offset is tied to this layer
    193  * BDRV_BLOCK_ZERO: offset reads as zero
    194  * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data
    195  * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this
    196  *                       layer rather than any backing, set by block layer
    197  * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this
    198  *                 layer, set by block layer
    199  *
    200  * Internal flags:
    201  * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request
    202  *                 that the block layer recompute the answer from the returned
    203  *                 BDS; must be accompanied by just BDRV_BLOCK_OFFSET_VALID.
    204  * BDRV_BLOCK_RECURSE: request that the block layer will recursively search for
    205  *                     zeroes in file child of current block node inside
    206  *                     returned region. Only valid together with both
    207  *                     BDRV_BLOCK_DATA and BDRV_BLOCK_OFFSET_VALID. Should not
    208  *                     appear with BDRV_BLOCK_ZERO.
    209  *
    210  * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the
    211  * host offset within the returned BDS that is allocated for the
    212  * corresponding raw guest data.  However, whether that offset
    213  * actually contains data also depends on BDRV_BLOCK_DATA, as follows:
    214  *
    215  * DATA ZERO OFFSET_VALID
    216  *  t    t        t       sectors read as zero, returned file is zero at offset
    217  *  t    f        t       sectors read as valid from file at offset
    218  *  f    t        t       sectors preallocated, read as zero, returned file not
    219  *                        necessarily zero at offset
    220  *  f    f        t       sectors preallocated but read from backing_hd,
    221  *                        returned file contains garbage at offset
    222  *  t    t        f       sectors preallocated, read as zero, unknown offset
    223  *  t    f        f       sectors read from unknown file or offset
    224  *  f    t        f       not allocated or unknown offset, read as zero
    225  *  f    f        f       not allocated or unknown offset, read from backing_hd
    226  */
    227 #define BDRV_BLOCK_DATA         0x01
    228 #define BDRV_BLOCK_ZERO         0x02
    229 #define BDRV_BLOCK_OFFSET_VALID 0x04
    230 #define BDRV_BLOCK_RAW          0x08
    231 #define BDRV_BLOCK_ALLOCATED    0x10
    232 #define BDRV_BLOCK_EOF          0x20
    233 #define BDRV_BLOCK_RECURSE      0x40
    234 
    235 typedef QTAILQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;
    236 
    237 typedef struct BDRVReopenState {
    238     BlockDriverState *bs;
    239     int flags;
    240     BlockdevDetectZeroesOptions detect_zeroes;
    241     bool backing_missing;
    242     BlockDriverState *old_backing_bs; /* keep pointer for permissions update */
    243     BlockDriverState *old_file_bs; /* keep pointer for permissions update */
    244     QDict *options;
    245     QDict *explicit_options;
    246     void *opaque;
    247 } BDRVReopenState;
    248 
    249 /*
    250  * Block operation types
    251  */
    252 typedef enum BlockOpType {
    253     BLOCK_OP_TYPE_BACKUP_SOURCE,
    254     BLOCK_OP_TYPE_BACKUP_TARGET,
    255     BLOCK_OP_TYPE_CHANGE,
    256     BLOCK_OP_TYPE_COMMIT_SOURCE,
    257     BLOCK_OP_TYPE_COMMIT_TARGET,
    258     BLOCK_OP_TYPE_DATAPLANE,
    259     BLOCK_OP_TYPE_DRIVE_DEL,
    260     BLOCK_OP_TYPE_EJECT,
    261     BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT,
    262     BLOCK_OP_TYPE_INTERNAL_SNAPSHOT,
    263     BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE,
    264     BLOCK_OP_TYPE_MIRROR_SOURCE,
    265     BLOCK_OP_TYPE_MIRROR_TARGET,
    266     BLOCK_OP_TYPE_RESIZE,
    267     BLOCK_OP_TYPE_STREAM,
    268     BLOCK_OP_TYPE_REPLACE,
    269     BLOCK_OP_TYPE_MAX,
    270 } BlockOpType;
    271 
    272 /* Block node permission constants */
    273 enum {
    274     /**
    275      * A user that has the "permission" of consistent reads is guaranteed that
    276      * their view of the contents of the block device is complete and
    277      * self-consistent, representing the contents of a disk at a specific
    278      * point.
    279      *
    280      * For most block devices (including their backing files) this is true, but
    281      * the property cannot be maintained in a few situations like for
    282      * intermediate nodes of a commit block job.
    283      */
    284     BLK_PERM_CONSISTENT_READ    = 0x01,
    285 
    286     /** This permission is required to change the visible disk contents. */
    287     BLK_PERM_WRITE              = 0x02,
    288 
    289     /**
    290      * This permission (which is weaker than BLK_PERM_WRITE) is both enough and
    291      * required for writes to the block node when the caller promises that
    292      * the visible disk content doesn't change.
    293      *
    294      * As the BLK_PERM_WRITE permission is strictly stronger, either is
    295      * sufficient to perform an unchanging write.
    296      */
    297     BLK_PERM_WRITE_UNCHANGED    = 0x04,
    298 
    299     /** This permission is required to change the size of a block node. */
    300     BLK_PERM_RESIZE             = 0x08,
    301 
    302     /**
    303      * There was a now-removed bit BLK_PERM_GRAPH_MOD, with value of 0x10. QEMU
    304      * 6.1 and earlier may still lock the corresponding byte in block/file-posix
    305      * locking.  So, implementing some new permission should be very careful to
    306      * not interfere with this old unused thing.
    307      */
    308 
    309     BLK_PERM_ALL                = 0x0f,
    310 
    311     DEFAULT_PERM_PASSTHROUGH    = BLK_PERM_CONSISTENT_READ
    312                                  | BLK_PERM_WRITE
    313                                  | BLK_PERM_WRITE_UNCHANGED
    314                                  | BLK_PERM_RESIZE,
    315 
    316     DEFAULT_PERM_UNCHANGED      = BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH,
    317 };
    318 
    319 /*
    320  * Flags that parent nodes assign to child nodes to specify what kind of
    321  * role(s) they take.
    322  *
    323  * At least one of DATA, METADATA, FILTERED, or COW must be set for
    324  * every child.
    325  *
    326  *
    327  * = Connection with bs->children, bs->file and bs->backing fields =
    328  *
    329  * 1. Filters
    330  *
    331  * Filter drivers have drv->is_filter = true.
    332  *
    333  * Filter node has exactly one FILTERED|PRIMARY child, and may have other
    334  * children which must not have these bits (one example is the
    335  * copy-before-write filter, which also has its target DATA child).
    336  *
    337  * Filter nodes never have COW children.
    338  *
    339  * For most filters, the filtered child is linked in bs->file, bs->backing is
    340  * NULL.  For some filters (as an exception), it is the other way around; those
    341  * drivers will have drv->filtered_child_is_backing set to true (see that
    342  * field’s documentation for what drivers this concerns)
    343  *
    344  * 2. "raw" driver (block/raw-format.c)
    345  *
    346  * Formally it's not a filter (drv->is_filter = false)
    347  *
    348  * bs->backing is always NULL
    349  *
    350  * Only has one child, linked in bs->file. Its role is either FILTERED|PRIMARY
    351  * (like filter) or DATA|PRIMARY depending on options.
    352  *
    353  * 3. Other drivers
    354  *
    355  * Don't have any FILTERED children.
    356  *
    357  * May have at most one COW child. In this case it's linked in bs->backing.
    358  * Otherwise bs->backing is NULL. COW child is never PRIMARY.
    359  *
    360  * May have at most one PRIMARY child. In this case it's linked in bs->file.
    361  * Otherwise bs->file is NULL.
    362  *
    363  * May also have some other children that don't have the PRIMARY or COW bit set.
    364  */
    365 enum BdrvChildRoleBits {
    366     /*
    367      * This child stores data.
    368      * Any node may have an arbitrary number of such children.
    369      */
    370     BDRV_CHILD_DATA         = (1 << 0),
    371 
    372     /*
    373      * This child stores metadata.
    374      * Any node may have an arbitrary number of metadata-storing
    375      * children.
    376      */
    377     BDRV_CHILD_METADATA     = (1 << 1),
    378 
    379     /*
    380      * A child that always presents exactly the same visible data as
    381      * the parent, e.g. by virtue of the parent forwarding all reads
    382      * and writes.
    383      * This flag is mutually exclusive with DATA, METADATA, and COW.
    384      * Any node may have at most one filtered child at a time.
    385      */
    386     BDRV_CHILD_FILTERED     = (1 << 2),
    387 
    388     /*
    389      * Child from which to read all data that isn't allocated in the
    390      * parent (i.e., the backing child); such data is copied to the
    391      * parent through COW (and optionally COR).
    392      * This field is mutually exclusive with DATA, METADATA, and
    393      * FILTERED.
    394      * Any node may have at most one such backing child at a time.
    395      */
    396     BDRV_CHILD_COW          = (1 << 3),
    397 
    398     /*
    399      * The primary child.  For most drivers, this is the child whose
    400      * filename applies best to the parent node.
    401      * Any node may have at most one primary child at a time.
    402      */
    403     BDRV_CHILD_PRIMARY      = (1 << 4),
    404 
    405     /* Useful combination of flags */
    406     BDRV_CHILD_IMAGE        = BDRV_CHILD_DATA
    407                               | BDRV_CHILD_METADATA
    408                               | BDRV_CHILD_PRIMARY,
    409 };
    410 
    411 /* Mask of BdrvChildRoleBits values */
    412 typedef unsigned int BdrvChildRole;
    413 
    414 typedef struct BdrvCheckResult {
    415     int corruptions;
    416     int leaks;
    417     int check_errors;
    418     int corruptions_fixed;
    419     int leaks_fixed;
    420     int64_t image_end_offset;
    421     BlockFragInfo bfi;
    422 } BdrvCheckResult;
    423 
    424 typedef enum {
    425     BDRV_FIX_LEAKS    = 1,
    426     BDRV_FIX_ERRORS   = 2,
    427 } BdrvCheckMode;
    428 
    429 typedef struct BlockSizes {
    430     uint32_t phys;
    431     uint32_t log;
    432 } BlockSizes;
    433 
    434 typedef struct HDGeometry {
    435     uint32_t heads;
    436     uint32_t sectors;
    437     uint32_t cylinders;
    438 } HDGeometry;
    439 
    440 /*
    441  * Common functions that are neither I/O nor Global State.
    442  *
    443  * These functions must never call any function from other categories
    444  * (I/O, "I/O or GS", Global State) except this one, but can be invoked by
    445  * all of them.
    446  */
    447 
    448 char *bdrv_perm_names(uint64_t perm);
    449 uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm);
    450 
    451 void bdrv_init_with_whitelist(void);
    452 bool bdrv_uses_whitelist(void);
    453 int bdrv_is_whitelisted(BlockDriver *drv, bool read_only);
    454 
    455 int bdrv_parse_aio(const char *mode, int *flags);
    456 int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough);
    457 int bdrv_parse_discard_flags(const char *mode, int *flags);
    458 
    459 int path_has_protocol(const char *path);
    460 int path_is_absolute(const char *path);
    461 char *path_combine(const char *base_path, const char *filename);
    462 
    463 char *bdrv_get_full_backing_filename_from_filename(const char *backed,
    464                                                    const char *backing,
    465                                                    Error **errp);
    466 
    467 #endif /* BLOCK_COMMON_H */