qemu

FORK: QEMU emulator
git clone https://git.neptards.moe/neptards/qemu.git
Log | Files | Refs | Submodules | LICENSE

file-posix.c (117217B)


      1 /*
      2  * Block driver for RAW files (posix)
      3  *
      4  * Copyright (c) 2006 Fabrice Bellard
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a copy
      7  * of this software and associated documentation files (the "Software"), to deal
      8  * in the Software without restriction, including without limitation the rights
      9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     10  * copies of the Software, and to permit persons to whom the Software is
     11  * furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice shall be included in
     14  * all copies or substantial portions of the Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     22  * THE SOFTWARE.
     23  */
     24 
     25 #include "qemu/osdep.h"
     26 #include "qapi/error.h"
     27 #include "qemu/cutils.h"
     28 #include "qemu/error-report.h"
     29 #include "block/block_int.h"
     30 #include "qemu/module.h"
     31 #include "qemu/option.h"
     32 #include "qemu/units.h"
     33 #include "qemu/memalign.h"
     34 #include "trace.h"
     35 #include "block/thread-pool.h"
     36 #include "qemu/iov.h"
     37 #include "block/raw-aio.h"
     38 #include "qapi/qmp/qdict.h"
     39 #include "qapi/qmp/qstring.h"
     40 
     41 #include "scsi/pr-manager.h"
     42 #include "scsi/constants.h"
     43 
     44 #if defined(__APPLE__) && (__MACH__)
     45 #include <sys/ioctl.h>
     46 #if defined(HAVE_HOST_BLOCK_DEVICE)
     47 #include <paths.h>
     48 #include <sys/param.h>
     49 #include <sys/mount.h>
     50 #include <IOKit/IOKitLib.h>
     51 #include <IOKit/IOBSD.h>
     52 #include <IOKit/storage/IOMediaBSDClient.h>
     53 #include <IOKit/storage/IOMedia.h>
     54 #include <IOKit/storage/IOCDMedia.h>
     55 //#include <IOKit/storage/IOCDTypes.h>
     56 #include <IOKit/storage/IODVDMedia.h>
     57 #include <CoreFoundation/CoreFoundation.h>
     58 #endif /* defined(HAVE_HOST_BLOCK_DEVICE) */
     59 #endif
     60 
     61 #ifdef __sun__
     62 #define _POSIX_PTHREAD_SEMANTICS 1
     63 #include <sys/dkio.h>
     64 #endif
     65 #ifdef __linux__
     66 #include <sys/ioctl.h>
     67 #include <sys/param.h>
     68 #include <sys/syscall.h>
     69 #include <sys/vfs.h>
     70 #include <linux/cdrom.h>
     71 #include <linux/fd.h>
     72 #include <linux/fs.h>
     73 #include <linux/hdreg.h>
     74 #include <linux/magic.h>
     75 #include <scsi/sg.h>
     76 #ifdef __s390__
     77 #include <asm/dasd.h>
     78 #endif
     79 #ifndef FS_NOCOW_FL
     80 #define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
     81 #endif
     82 #endif
     83 #if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
     84 #include <linux/falloc.h>
     85 #endif
     86 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
     87 #include <sys/disk.h>
     88 #include <sys/cdio.h>
     89 #endif
     90 
     91 #ifdef __OpenBSD__
     92 #include <sys/ioctl.h>
     93 #include <sys/disklabel.h>
     94 #include <sys/dkio.h>
     95 #endif
     96 
     97 #ifdef __NetBSD__
     98 #include <sys/ioctl.h>
     99 #include <sys/disklabel.h>
    100 #include <sys/dkio.h>
    101 #include <sys/disk.h>
    102 #endif
    103 
    104 #ifdef __DragonFly__
    105 #include <sys/ioctl.h>
    106 #include <sys/diskslice.h>
    107 #endif
    108 
    109 /* OS X does not have O_DSYNC */
    110 #ifndef O_DSYNC
    111 #ifdef O_SYNC
    112 #define O_DSYNC O_SYNC
    113 #elif defined(O_FSYNC)
    114 #define O_DSYNC O_FSYNC
    115 #endif
    116 #endif
    117 
    118 /* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
    119 #ifndef O_DIRECT
    120 #define O_DIRECT O_DSYNC
    121 #endif
    122 
/*
 * Values stored in BDRVRawState.type.  raw_open() uses FTYPE_FILE.
 * NOTE(review): FTYPE_CD is presumably set by the host CD-ROM driver
 * elsewhere in this file -- confirm.
 */
#define FTYPE_FILE   0
#define FTYPE_CD     1

/*
 * Lower bound for the maximum alignment used when probing O_DIRECT
 * alignment (the probe uses the larger of this and the host page size).
 */
#define MAX_BLOCKSIZE	4096

/* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes,
 * leaving a few more bytes for its future use.
 *
 * Permission bit i is represented by the byte at RAW_LOCK_PERM_BASE + i,
 * and the matching "not shared" bit by the byte at RAW_LOCK_SHARED_BASE + i.
 */
#define RAW_LOCK_PERM_BASE             100
#define RAW_LOCK_SHARED_BASE           200
    132 
/*
 * Per-BlockDriverState private data of the POSIX raw file driver
 * (reached through bs->opaque).
 */
typedef struct BDRVRawState {
    int fd;                 /* open file descriptor; negative when unusable */
    bool use_lock;          /* derived from the "locking" runtime option */
    int type;               /* one of the FTYPE_* values */
    int open_flags;         /* flags handed to qemu_open() */
    size_t buf_align;       /* memory buffer alignment, see raw_probe_alignment() */

    /* The current permissions. */
    uint64_t perm;
    uint64_t shared_perm;

    /* The perms bits whose corresponding bytes are already locked in
     * s->fd. */
    uint64_t locked_perm;
    uint64_t locked_shared_perm;

    uint64_t aio_max_batch; /* value of the "aio-max-batch" runtime option */

    /*
     * NOTE(review): the three fields below are presumably used while a
     * permission change or reopen is in flight -- confirm at the use sites.
     */
    int perm_change_fd;
    int perm_change_flags;
    BDRVReopenState *reopen_state;

    bool has_discard:1;
    /* Cleared on Linux for block devices opened without BDRV_O_NOCACHE */
    bool has_write_zeroes:1;
    bool use_linux_aio:1;       /* aio=native selected */
    bool use_linux_io_uring:1;  /* aio=io_uring selected */
    int page_cache_inconsistent; /* errno from fdatasync failure */
    bool has_fallocate;         /* set when the image is a regular file */
    bool needs_alignment;       /* result of raw_needs_alignment() at open */
    bool force_alignment;       /* set for character devices on FreeBSD */
    bool drop_cache;            /* "drop-cache" runtime option */
    bool check_cache_dropped;   /* "x-check-cache-dropped" runtime option */
    /* Discard counters.  NOTE(review): updated outside this excerpt. */
    struct {
        uint64_t discard_nb_ok;
        uint64_t discard_nb_failed;
        uint64_t discard_bytes_ok;
    } stats;

    PRManager *pr_mgr;          /* looked up from the "pr-manager" option */
} BDRVRawState;
    173 
/*
 * Values mirroring the like-named BDRVRawState fields.
 * NOTE(review): presumably staged here during a reopen and copied into
 * BDRVRawState on commit -- confirm against the reopen callbacks.
 */
typedef struct BDRVRawReopenState {
    int open_flags;
    bool drop_cache;
    bool check_cache_dropped;
} BDRVRawReopenState;
    179 
    180 static int fd_open(BlockDriverState *bs)
    181 {
    182     BDRVRawState *s = bs->opaque;
    183 
    184     /* this is just to ensure s->fd is sane (its called by io ops) */
    185     if (s->fd >= 0) {
    186         return 0;
    187     }
    188     return -EIO;
    189 }
    190 
    191 static int64_t raw_getlength(BlockDriverState *bs);
    192 
/*
 * Description of a single request issued by this driver.
 *
 * The anonymous union carries the parameters that are specific to one kind
 * of request.  NOTE(review): which member is valid presumably follows from
 * aio_type -- confirm at the call sites.
 */
typedef struct RawPosixAIOData {
    BlockDriverState *bs;
    int aio_type;
    int aio_fildes;         /* file descriptor the request operates on */

    off_t aio_offset;
    uint64_t aio_nbytes;

    union {
        /* scatter/gather I/O vector */
        struct {
            struct iovec *iov;
            int niov;
        } io;
        /* ioctl request number and its argument buffer */
        struct {
            uint64_t cmd;
            void *buf;
        } ioctl;
        /* second descriptor and offset for a range copy */
        struct {
            int aio_fd2;
            off_t aio_offset2;
        } copy_range;
        /* preallocation mode and error sink for truncation */
        struct {
            PreallocMode prealloc;
            Error **errp;
        } truncate;
    };
} RawPosixAIOData;
    220 
    221 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
    222 static int cdrom_reopen(BlockDriverState *bs);
    223 #endif
    224 
/*
 * Report a locking failure through @errp.
 *
 * Elide EAGAIN and EACCES details when failing to lock, as this
 * indicates that the specified file region is already locked by
 * another process, which is considered a common scenario.  For these two
 * values plain error_setg() is used; any other @err is reported with
 * error_setg_errno().
 *
 * @err is a positive errno value and is evaluated more than once, so it
 * must not have side effects.
 */
#define raw_lock_error_setg_errno(errp, err, fmt, ...)                  \
    do {                                                                \
        if ((err) == EAGAIN || (err) == EACCES) {                       \
            error_setg((errp), (fmt), ## __VA_ARGS__);                  \
        } else {                                                        \
            error_setg_errno((errp), (err), (fmt), ## __VA_ARGS__);     \
        }                                                               \
    } while (0)
    238 
    239 #if defined(__NetBSD__)
    240 static int raw_normalize_devicepath(const char **filename, Error **errp)
    241 {
    242     static char namebuf[PATH_MAX];
    243     const char *dp, *fname;
    244     struct stat sb;
    245 
    246     fname = *filename;
    247     dp = strrchr(fname, '/');
    248     if (lstat(fname, &sb) < 0) {
    249         error_setg_file_open(errp, errno, fname);
    250         return -errno;
    251     }
    252 
    253     if (!S_ISBLK(sb.st_mode)) {
    254         return 0;
    255     }
    256 
    257     if (dp == NULL) {
    258         snprintf(namebuf, PATH_MAX, "r%s", fname);
    259     } else {
    260         snprintf(namebuf, PATH_MAX, "%.*s/r%s",
    261             (int)(dp - fname), fname, dp + 1);
    262     }
    263     *filename = namebuf;
    264     warn_report("%s is a block device, using %s", fname, *filename);
    265 
    266     return 0;
    267 }
    268 #else
    269 static int raw_normalize_devicepath(const char **filename, Error **errp)
    270 {
    271     return 0;
    272 }
    273 #endif
    274 
    275 /*
    276  * Get logical block size via ioctl. On success store it in @sector_size_p.
    277  */
    278 static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
    279 {
    280     unsigned int sector_size;
    281     bool success = false;
    282     int i;
    283 
    284     errno = ENOTSUP;
    285     static const unsigned long ioctl_list[] = {
    286 #ifdef BLKSSZGET
    287         BLKSSZGET,
    288 #endif
    289 #ifdef DKIOCGETBLOCKSIZE
    290         DKIOCGETBLOCKSIZE,
    291 #endif
    292 #ifdef DIOCGSECTORSIZE
    293         DIOCGSECTORSIZE,
    294 #endif
    295     };
    296 
    297     /* Try a few ioctls to get the right size */
    298     for (i = 0; i < (int)ARRAY_SIZE(ioctl_list); i++) {
    299         if (ioctl(fd, ioctl_list[i], &sector_size) >= 0) {
    300             *sector_size_p = sector_size;
    301             success = true;
    302         }
    303     }
    304 
    305     return success ? 0 : -errno;
    306 }
    307 
    308 /**
    309  * Get physical block size of @fd.
    310  * On success, store it in @blk_size and return 0.
    311  * On failure, return -errno.
    312  */
    313 static int probe_physical_blocksize(int fd, unsigned int *blk_size)
    314 {
    315 #ifdef BLKPBSZGET
    316     if (ioctl(fd, BLKPBSZGET, blk_size) < 0) {
    317         return -errno;
    318     }
    319     return 0;
    320 #else
    321     return -ENOTSUP;
    322 #endif
    323 }
    324 
    325 /*
    326  * Returns true if no alignment restrictions are necessary even for files
    327  * opened with O_DIRECT.
    328  *
    329  * raw_probe_alignment() probes the required alignment and assume that 1 means
    330  * the probing failed, so it falls back to a safe default of 4k. This can be
    331  * avoided if we know that byte alignment is okay for the file.
    332  */
    333 static bool dio_byte_aligned(int fd)
    334 {
    335 #ifdef __linux__
    336     struct statfs buf;
    337     int ret;
    338 
    339     ret = fstatfs(fd, &buf);
    340     if (ret == 0 && buf.f_type == NFS_SUPER_MAGIC) {
    341         return true;
    342     }
    343 #endif
    344     return false;
    345 }
    346 
    347 static bool raw_needs_alignment(BlockDriverState *bs)
    348 {
    349     BDRVRawState *s = bs->opaque;
    350 
    351     if ((bs->open_flags & BDRV_O_NOCACHE) != 0 && !dio_byte_aligned(s->fd)) {
    352         return true;
    353     }
    354 
    355     return s->force_alignment;
    356 }
    357 
    358 /* Check if read is allowed with given memory buffer and length.
    359  *
    360  * This function is used to check O_DIRECT memory buffer and request alignment.
    361  */
    362 static bool raw_is_io_aligned(int fd, void *buf, size_t len)
    363 {
    364     ssize_t ret = pread(fd, buf, len, 0);
    365 
    366     if (ret >= 0) {
    367         return true;
    368     }
    369 
    370 #ifdef __linux__
    371     /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads.  Ignore
    372      * other errors (e.g. real I/O error), which could happen on a failed
    373      * drive, since we only care about probing alignment.
    374      */
    375     if (errno != EINVAL) {
    376         return true;
    377     }
    378 #endif
    379 
    380     return false;
    381 }
    382 
    383 static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
    384 {
    385     BDRVRawState *s = bs->opaque;
    386     char *buf;
    387     size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size());
    388     size_t alignments[] = {1, 512, 1024, 2048, 4096};
    389 
    390     /* For SCSI generic devices the alignment is not really used.
    391        With buffered I/O, we don't have any restrictions. */
    392     if (bdrv_is_sg(bs) || !s->needs_alignment) {
    393         bs->bl.request_alignment = 1;
    394         s->buf_align = 1;
    395         return;
    396     }
    397 
    398     bs->bl.request_alignment = 0;
    399     s->buf_align = 0;
    400     /* Let's try to use the logical blocksize for the alignment. */
    401     if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
    402         bs->bl.request_alignment = 0;
    403     }
    404 
    405 #ifdef __linux__
    406     /*
    407      * The XFS ioctl definitions are shipped in extra packages that might
    408      * not always be available. Since we just need the XFS_IOC_DIOINFO ioctl
    409      * here, we simply use our own definition instead:
    410      */
    411     struct xfs_dioattr {
    412         uint32_t d_mem;
    413         uint32_t d_miniosz;
    414         uint32_t d_maxiosz;
    415     } da;
    416     if (ioctl(fd, _IOR('X', 30, struct xfs_dioattr), &da) >= 0) {
    417         bs->bl.request_alignment = da.d_miniosz;
    418         /* The kernel returns wrong information for d_mem */
    419         /* s->buf_align = da.d_mem; */
    420     }
    421 #endif
    422 
    423     /*
    424      * If we could not get the sizes so far, we can only guess them. First try
    425      * to detect request alignment, since it is more likely to succeed. Then
    426      * try to detect buf_align, which cannot be detected in some cases (e.g.
    427      * Gluster). If buf_align cannot be detected, we fallback to the value of
    428      * request_alignment.
    429      */
    430 
    431     if (!bs->bl.request_alignment) {
    432         int i;
    433         size_t align;
    434         buf = qemu_memalign(max_align, max_align);
    435         for (i = 0; i < ARRAY_SIZE(alignments); i++) {
    436             align = alignments[i];
    437             if (raw_is_io_aligned(fd, buf, align)) {
    438                 /* Fallback to safe value. */
    439                 bs->bl.request_alignment = (align != 1) ? align : max_align;
    440                 break;
    441             }
    442         }
    443         qemu_vfree(buf);
    444     }
    445 
    446     if (!s->buf_align) {
    447         int i;
    448         size_t align;
    449         buf = qemu_memalign(max_align, 2 * max_align);
    450         for (i = 0; i < ARRAY_SIZE(alignments); i++) {
    451             align = alignments[i];
    452             if (raw_is_io_aligned(fd, buf + align, max_align)) {
    453                 /* Fallback to request_alignment. */
    454                 s->buf_align = (align != 1) ? align : bs->bl.request_alignment;
    455                 break;
    456             }
    457         }
    458         qemu_vfree(buf);
    459     }
    460 
    461     if (!s->buf_align || !bs->bl.request_alignment) {
    462         error_setg(errp, "Could not find working O_DIRECT alignment");
    463         error_append_hint(errp, "Try cache.direct=off\n");
    464     }
    465 }
    466 
    467 static int check_hdev_writable(int fd)
    468 {
    469 #if defined(BLKROGET)
    470     /* Linux block devices can be configured "read-only" using blockdev(8).
    471      * This is independent of device node permissions and therefore open(2)
    472      * with O_RDWR succeeds.  Actual writes fail with EPERM.
    473      *
    474      * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
    475      * check for read-only block devices so that Linux block devices behave
    476      * properly.
    477      */
    478     struct stat st;
    479     int readonly = 0;
    480 
    481     if (fstat(fd, &st)) {
    482         return -errno;
    483     }
    484 
    485     if (!S_ISBLK(st.st_mode)) {
    486         return 0;
    487     }
    488 
    489     if (ioctl(fd, BLKROGET, &readonly) < 0) {
    490         return -errno;
    491     }
    492 
    493     if (readonly) {
    494         return -EACCES;
    495     }
    496 #endif /* defined(BLKROGET) */
    497     return 0;
    498 }
    499 
    500 static void raw_parse_flags(int bdrv_flags, int *open_flags, bool has_writers)
    501 {
    502     bool read_write = false;
    503     assert(open_flags != NULL);
    504 
    505     *open_flags |= O_BINARY;
    506     *open_flags &= ~O_ACCMODE;
    507 
    508     if (bdrv_flags & BDRV_O_AUTO_RDONLY) {
    509         read_write = has_writers;
    510     } else if (bdrv_flags & BDRV_O_RDWR) {
    511         read_write = true;
    512     }
    513 
    514     if (read_write) {
    515         *open_flags |= O_RDWR;
    516     } else {
    517         *open_flags |= O_RDONLY;
    518     }
    519 
    520     /* Use O_DSYNC for write-through caching, no flags for write-back caching,
    521      * and O_DIRECT for no caching. */
    522     if ((bdrv_flags & BDRV_O_NOCACHE)) {
    523         *open_flags |= O_DIRECT;
    524     }
    525 }
    526 
    527 static void raw_parse_filename(const char *filename, QDict *options,
    528                                Error **errp)
    529 {
    530     bdrv_parse_filename_strip_prefix(filename, "file:", options);
    531 }
    532 
/*
 * Runtime options understood by raw_open_common().  The options are
 * absorbed from the caller's QDict when an image is opened.
 */
static QemuOptsList raw_runtime_opts = {
    .name = "raw",
    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
    .desc = {
        {
            .name = "filename",
            .type = QEMU_OPT_STRING,
            .help = "File name of the image",
        },
        {
            .name = "aio",
            .type = QEMU_OPT_STRING,
            .help = "host AIO implementation (threads, native, io_uring)",
        },
        {
            .name = "aio-max-batch",
            .type = QEMU_OPT_NUMBER,
            .help = "AIO max batch size (0 = auto handled by AIO backend, default: 0)",
        },
        {
            .name = "locking",
            .type = QEMU_OPT_STRING,
            .help = "file locking mode (on/off/auto, default: auto)",
        },
        {
            .name = "pr-manager",
            .type = QEMU_OPT_STRING,
            .help = "id of persistent reservation manager object (default: none)",
        },
        /* "drop-cache" is only offered on Linux builds */
#if defined(__linux__)
        {
            .name = "drop-cache",
            .type = QEMU_OPT_BOOL,
            .help = "invalidate page cache during live migration (default: on)",
        },
#endif
        {
            .name = "x-check-cache-dropped",
            .type = QEMU_OPT_BOOL,
            .help = "check that page cache was dropped on live migration (default: off)"
        },
        { /* end of list */ }
    },
};
    577 
/*
 * NULL-terminated list of option names.
 * NOTE(review): presumably the runtime options whose value may change
 * across a reopen -- confirm where the driver definition references it.
 */
static const char *const mutable_opts[] = { "x-check-cache-dropped", NULL };
    579 
    580 static int raw_open_common(BlockDriverState *bs, QDict *options,
    581                            int bdrv_flags, int open_flags,
    582                            bool device, Error **errp)
    583 {
    584     BDRVRawState *s = bs->opaque;
    585     QemuOpts *opts;
    586     Error *local_err = NULL;
    587     const char *filename = NULL;
    588     const char *str;
    589     BlockdevAioOptions aio, aio_default;
    590     int fd, ret;
    591     struct stat st;
    592     OnOffAuto locking;
    593 
    594     opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
    595     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
    596         ret = -EINVAL;
    597         goto fail;
    598     }
    599 
    600     filename = qemu_opt_get(opts, "filename");
    601 
    602     ret = raw_normalize_devicepath(&filename, errp);
    603     if (ret != 0) {
    604         goto fail;
    605     }
    606 
    607     if (bdrv_flags & BDRV_O_NATIVE_AIO) {
    608         aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE;
    609 #ifdef CONFIG_LINUX_IO_URING
    610     } else if (bdrv_flags & BDRV_O_IO_URING) {
    611         aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING;
    612 #endif
    613     } else {
    614         aio_default = BLOCKDEV_AIO_OPTIONS_THREADS;
    615     }
    616 
    617     aio = qapi_enum_parse(&BlockdevAioOptions_lookup,
    618                           qemu_opt_get(opts, "aio"),
    619                           aio_default, &local_err);
    620     if (local_err) {
    621         error_propagate(errp, local_err);
    622         ret = -EINVAL;
    623         goto fail;
    624     }
    625 
    626     s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
    627 #ifdef CONFIG_LINUX_IO_URING
    628     s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING);
    629 #endif
    630 
    631     s->aio_max_batch = qemu_opt_get_number(opts, "aio-max-batch", 0);
    632 
    633     locking = qapi_enum_parse(&OnOffAuto_lookup,
    634                               qemu_opt_get(opts, "locking"),
    635                               ON_OFF_AUTO_AUTO, &local_err);
    636     if (local_err) {
    637         error_propagate(errp, local_err);
    638         ret = -EINVAL;
    639         goto fail;
    640     }
    641     switch (locking) {
    642     case ON_OFF_AUTO_ON:
    643         s->use_lock = true;
    644         if (!qemu_has_ofd_lock()) {
    645             warn_report("File lock requested but OFD locking syscall is "
    646                         "unavailable, falling back to POSIX file locks");
    647             error_printf("Due to the implementation, locks can be lost "
    648                          "unexpectedly.\n");
    649         }
    650         break;
    651     case ON_OFF_AUTO_OFF:
    652         s->use_lock = false;
    653         break;
    654     case ON_OFF_AUTO_AUTO:
    655         s->use_lock = qemu_has_ofd_lock();
    656         break;
    657     default:
    658         abort();
    659     }
    660 
    661     str = qemu_opt_get(opts, "pr-manager");
    662     if (str) {
    663         s->pr_mgr = pr_manager_lookup(str, &local_err);
    664         if (local_err) {
    665             error_propagate(errp, local_err);
    666             ret = -EINVAL;
    667             goto fail;
    668         }
    669     }
    670 
    671     s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true);
    672     s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
    673                                                false);
    674 
    675     s->open_flags = open_flags;
    676     raw_parse_flags(bdrv_flags, &s->open_flags, false);
    677 
    678     s->fd = -1;
    679     fd = qemu_open(filename, s->open_flags, errp);
    680     ret = fd < 0 ? -errno : 0;
    681 
    682     if (ret < 0) {
    683         if (ret == -EROFS) {
    684             ret = -EACCES;
    685         }
    686         goto fail;
    687     }
    688     s->fd = fd;
    689 
    690     /* Check s->open_flags rather than bdrv_flags due to auto-read-only */
    691     if (s->open_flags & O_RDWR) {
    692         ret = check_hdev_writable(s->fd);
    693         if (ret < 0) {
    694             error_setg_errno(errp, -ret, "The device is not writable");
    695             goto fail;
    696         }
    697     }
    698 
    699     s->perm = 0;
    700     s->shared_perm = BLK_PERM_ALL;
    701 
    702 #ifdef CONFIG_LINUX_AIO
    703      /* Currently Linux does AIO only for files opened with O_DIRECT */
    704     if (s->use_linux_aio) {
    705         if (!(s->open_flags & O_DIRECT)) {
    706             error_setg(errp, "aio=native was specified, but it requires "
    707                              "cache.direct=on, which was not specified.");
    708             ret = -EINVAL;
    709             goto fail;
    710         }
    711         if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) {
    712             error_prepend(errp, "Unable to use native AIO: ");
    713             goto fail;
    714         }
    715     }
    716 #else
    717     if (s->use_linux_aio) {
    718         error_setg(errp, "aio=native was specified, but is not supported "
    719                          "in this build.");
    720         ret = -EINVAL;
    721         goto fail;
    722     }
    723 #endif /* !defined(CONFIG_LINUX_AIO) */
    724 
    725 #ifdef CONFIG_LINUX_IO_URING
    726     if (s->use_linux_io_uring) {
    727         if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) {
    728             error_prepend(errp, "Unable to use io_uring: ");
    729             goto fail;
    730         }
    731     }
    732 #else
    733     if (s->use_linux_io_uring) {
    734         error_setg(errp, "aio=io_uring was specified, but is not supported "
    735                          "in this build.");
    736         ret = -EINVAL;
    737         goto fail;
    738     }
    739 #endif /* !defined(CONFIG_LINUX_IO_URING) */
    740 
    741     s->has_discard = true;
    742     s->has_write_zeroes = true;
    743 
    744     if (fstat(s->fd, &st) < 0) {
    745         ret = -errno;
    746         error_setg_errno(errp, errno, "Could not stat file");
    747         goto fail;
    748     }
    749 
    750     if (!device) {
    751         if (!S_ISREG(st.st_mode)) {
    752             error_setg(errp, "'%s' driver requires '%s' to be a regular file",
    753                        bs->drv->format_name, bs->filename);
    754             ret = -EINVAL;
    755             goto fail;
    756         } else {
    757             s->has_fallocate = true;
    758         }
    759     } else {
    760         if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
    761             error_setg(errp, "'%s' driver requires '%s' to be either "
    762                        "a character or block device",
    763                        bs->drv->format_name, bs->filename);
    764             ret = -EINVAL;
    765             goto fail;
    766         }
    767     }
    768 
    769     if (S_ISBLK(st.st_mode)) {
    770 #ifdef __linux__
    771         /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
    772          * not rely on the contents of discarded blocks unless using O_DIRECT.
    773          * Same for BLKZEROOUT.
    774          */
    775         if (!(bs->open_flags & BDRV_O_NOCACHE)) {
    776             s->has_write_zeroes = false;
    777         }
    778 #endif
    779     }
    780 #ifdef __FreeBSD__
    781     if (S_ISCHR(st.st_mode)) {
    782         /*
    783          * The file is a char device (disk), which on FreeBSD isn't behind
    784          * a pager, so force all requests to be aligned. This is needed
    785          * so QEMU makes sure all IO operations on the device are aligned
    786          * to sector size, or else FreeBSD will reject them with EINVAL.
    787          */
    788         s->force_alignment = true;
    789     }
    790 #endif
    791     s->needs_alignment = raw_needs_alignment(bs);
    792 
    793     bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
    794     if (S_ISREG(st.st_mode)) {
    795         /* When extending regular files, we get zeros from the OS */
    796         bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
    797     }
    798     ret = 0;
    799 fail:
    800     if (ret < 0 && s->fd != -1) {
    801         qemu_close(s->fd);
    802     }
    803     if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
    804         unlink(filename);
    805     }
    806     qemu_opts_del(opts);
    807     return ret;
    808 }
    809 
    810 static int raw_open(BlockDriverState *bs, QDict *options, int flags,
    811                     Error **errp)
    812 {
    813     BDRVRawState *s = bs->opaque;
    814 
    815     s->type = FTYPE_FILE;
    816     return raw_open_common(bs, options, flags, 0, false, errp);
    817 }
    818 
/*
 * Stage of a permission lock update.
 * NOTE(review): presumably mirrors the prepare/commit/abort steps of a
 * permission change transaction -- confirm at the use sites.
 */
typedef enum {
    RAW_PL_PREPARE,
    RAW_PL_COMMIT,
    RAW_PL_ABORT,
} RawPermLockOp;
    824 
    825 #define PERM_FOREACH(i) \
    826     for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; i++)
    827 
    828 /* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the
    829  * file; if @unlock == true, also unlock the unneeded bytes.
    830  * @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
    831  */
    832 static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
    833                                 uint64_t perm_lock_bits,
    834                                 uint64_t shared_perm_lock_bits,
    835                                 bool unlock, Error **errp)
    836 {
    837     int ret;
    838     int i;
    839     uint64_t locked_perm, locked_shared_perm;
    840 
    841     if (s) {
    842         locked_perm = s->locked_perm;
    843         locked_shared_perm = s->locked_shared_perm;
    844     } else {
    845         /*
    846          * We don't have the previous bits, just lock/unlock for each of the
    847          * requested bits.
    848          */
    849         if (unlock) {
    850             locked_perm = BLK_PERM_ALL;
    851             locked_shared_perm = BLK_PERM_ALL;
    852         } else {
    853             locked_perm = 0;
    854             locked_shared_perm = 0;
    855         }
    856     }
    857 
    858     PERM_FOREACH(i) {
    859         int off = RAW_LOCK_PERM_BASE + i;
    860         uint64_t bit = (1ULL << i);
    861         if ((perm_lock_bits & bit) && !(locked_perm & bit)) {
    862             ret = qemu_lock_fd(fd, off, 1, false);
    863             if (ret) {
    864                 raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d",
    865                                           off);
    866                 return ret;
    867             } else if (s) {
    868                 s->locked_perm |= bit;
    869             }
    870         } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) {
    871             ret = qemu_unlock_fd(fd, off, 1);
    872             if (ret) {
    873                 error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
    874                 return ret;
    875             } else if (s) {
    876                 s->locked_perm &= ~bit;
    877             }
    878         }
    879     }
    880     PERM_FOREACH(i) {
    881         int off = RAW_LOCK_SHARED_BASE + i;
    882         uint64_t bit = (1ULL << i);
    883         if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) {
    884             ret = qemu_lock_fd(fd, off, 1, false);
    885             if (ret) {
    886                 raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d",
    887                                           off);
    888                 return ret;
    889             } else if (s) {
    890                 s->locked_shared_perm |= bit;
    891             }
    892         } else if (unlock && (locked_shared_perm & bit) &&
    893                    !(shared_perm_lock_bits & bit)) {
    894             ret = qemu_unlock_fd(fd, off, 1);
    895             if (ret) {
    896                 error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
    897                 return ret;
    898             } else if (s) {
    899                 s->locked_shared_perm &= ~bit;
    900             }
    901         }
    902     }
    903     return 0;
    904 }
    905 
    906 /* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */
    907 static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
    908                                 Error **errp)
    909 {
    910     int ret;
    911     int i;
    912 
    913     PERM_FOREACH(i) {
    914         int off = RAW_LOCK_SHARED_BASE + i;
    915         uint64_t p = 1ULL << i;
    916         if (perm & p) {
    917             ret = qemu_lock_fd_test(fd, off, 1, true);
    918             if (ret) {
    919                 char *perm_name = bdrv_perm_names(p);
    920 
    921                 raw_lock_error_setg_errno(errp, -ret,
    922                                           "Failed to get \"%s\" lock",
    923                                           perm_name);
    924                 g_free(perm_name);
    925                 return ret;
    926             }
    927         }
    928     }
    929     PERM_FOREACH(i) {
    930         int off = RAW_LOCK_PERM_BASE + i;
    931         uint64_t p = 1ULL << i;
    932         if (!(shared_perm & p)) {
    933             ret = qemu_lock_fd_test(fd, off, 1, true);
    934             if (ret) {
    935                 char *perm_name = bdrv_perm_names(p);
    936 
    937                 raw_lock_error_setg_errno(errp, -ret,
    938                                           "Failed to get shared \"%s\" lock",
    939                                           perm_name);
    940                 g_free(perm_name);
    941                 return ret;
    942             }
    943         }
    944     }
    945     return 0;
    946 }
    947 
    948 static int raw_handle_perm_lock(BlockDriverState *bs,
    949                                 RawPermLockOp op,
    950                                 uint64_t new_perm, uint64_t new_shared,
    951                                 Error **errp)
    952 {
    953     BDRVRawState *s = bs->opaque;
    954     int ret = 0;
    955     Error *local_err = NULL;
    956 
    957     if (!s->use_lock) {
    958         return 0;
    959     }
    960 
    961     if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {
    962         return 0;
    963     }
    964 
    965     switch (op) {
    966     case RAW_PL_PREPARE:
    967         if ((s->perm | new_perm) == s->perm &&
    968             (s->shared_perm & new_shared) == s->shared_perm)
    969         {
    970             /*
    971              * We are going to unlock bytes, it should not fail. If it fail due
    972              * to some fs-dependent permission-unrelated reasons (which occurs
    973              * sometimes on NFS and leads to abort in bdrv_replace_child) we
    974              * can't prevent such errors by any check here. And we ignore them
    975              * anyway in ABORT and COMMIT.
    976              */
    977             return 0;
    978         }
    979         ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
    980                                    ~s->shared_perm | ~new_shared,
    981                                    false, errp);
    982         if (!ret) {
    983             ret = raw_check_lock_bytes(s->fd, new_perm, new_shared, errp);
    984             if (!ret) {
    985                 return 0;
    986             }
    987             error_append_hint(errp,
    988                               "Is another process using the image [%s]?\n",
    989                               bs->filename);
    990         }
    991         /* fall through to unlock bytes. */
    992     case RAW_PL_ABORT:
    993         raw_apply_lock_bytes(s, s->fd, s->perm, ~s->shared_perm,
    994                              true, &local_err);
    995         if (local_err) {
    996             /* Theoretically the above call only unlocks bytes and it cannot
    997              * fail. Something weird happened, report it.
    998              */
    999             warn_report_err(local_err);
   1000         }
   1001         break;
   1002     case RAW_PL_COMMIT:
   1003         raw_apply_lock_bytes(s, s->fd, new_perm, ~new_shared,
   1004                              true, &local_err);
   1005         if (local_err) {
   1006             /* Theoretically the above call only unlocks bytes and it cannot
   1007              * fail. Something weird happened, report it.
   1008              */
   1009             warn_report_err(local_err);
   1010         }
   1011         break;
   1012     }
   1013     return ret;
   1014 }
   1015 
   1016 /* Sets a specific flag */
   1017 static int fcntl_setfl(int fd, int flag)
   1018 {
   1019     int flags;
   1020 
   1021     flags = fcntl(fd, F_GETFL);
   1022     if (flags == -1) {
   1023         return -errno;
   1024     }
   1025     if (fcntl(fd, F_SETFL, flags | flag) == -1) {
   1026         return -errno;
   1027     }
   1028     return 0;
   1029 }
   1030 
   1031 static int raw_reconfigure_getfd(BlockDriverState *bs, int flags,
   1032                                  int *open_flags, uint64_t perm, bool force_dup,
   1033                                  Error **errp)
   1034 {
   1035     BDRVRawState *s = bs->opaque;
   1036     int fd = -1;
   1037     int ret;
   1038     bool has_writers = perm &
   1039         (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED | BLK_PERM_RESIZE);
   1040     int fcntl_flags = O_APPEND | O_NONBLOCK;
   1041 #ifdef O_NOATIME
   1042     fcntl_flags |= O_NOATIME;
   1043 #endif
   1044 
   1045     *open_flags = 0;
   1046     if (s->type == FTYPE_CD) {
   1047         *open_flags |= O_NONBLOCK;
   1048     }
   1049 
   1050     raw_parse_flags(flags, open_flags, has_writers);
   1051 
   1052 #ifdef O_ASYNC
   1053     /* Not all operating systems have O_ASYNC, and those that don't
   1054      * will not let us track the state into rs->open_flags (typically
   1055      * you achieve the same effect with an ioctl, for example I_SETSIG
   1056      * on Solaris). But we do not use O_ASYNC, so that's fine.
   1057      */
   1058     assert((s->open_flags & O_ASYNC) == 0);
   1059 #endif
   1060 
   1061     if (!force_dup && *open_flags == s->open_flags) {
   1062         /* We're lucky, the existing fd is fine */
   1063         return s->fd;
   1064     }
   1065 
   1066     if ((*open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
   1067         /* dup the original fd */
   1068         fd = qemu_dup(s->fd);
   1069         if (fd >= 0) {
   1070             ret = fcntl_setfl(fd, *open_flags);
   1071             if (ret) {
   1072                 qemu_close(fd);
   1073                 fd = -1;
   1074             }
   1075         }
   1076     }
   1077 
   1078     /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
   1079     if (fd == -1) {
   1080         const char *normalized_filename = bs->filename;
   1081         ret = raw_normalize_devicepath(&normalized_filename, errp);
   1082         if (ret >= 0) {
   1083             fd = qemu_open(normalized_filename, *open_flags, errp);
   1084             if (fd == -1) {
   1085                 return -1;
   1086             }
   1087         }
   1088     }
   1089 
   1090     if (fd != -1 && (*open_flags & O_RDWR)) {
   1091         ret = check_hdev_writable(fd);
   1092         if (ret < 0) {
   1093             qemu_close(fd);
   1094             error_setg_errno(errp, -ret, "The device is not writable");
   1095             return -1;
   1096         }
   1097     }
   1098 
   1099     return fd;
   1100 }
   1101 
   1102 static int raw_reopen_prepare(BDRVReopenState *state,
   1103                               BlockReopenQueue *queue, Error **errp)
   1104 {
   1105     BDRVRawState *s;
   1106     BDRVRawReopenState *rs;
   1107     QemuOpts *opts;
   1108     int ret;
   1109 
   1110     assert(state != NULL);
   1111     assert(state->bs != NULL);
   1112 
   1113     s = state->bs->opaque;
   1114 
   1115     state->opaque = g_new0(BDRVRawReopenState, 1);
   1116     rs = state->opaque;
   1117 
   1118     /* Handle options changes */
   1119     opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
   1120     if (!qemu_opts_absorb_qdict(opts, state->options, errp)) {
   1121         ret = -EINVAL;
   1122         goto out;
   1123     }
   1124 
   1125     rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true);
   1126     rs->check_cache_dropped =
   1127         qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false);
   1128 
   1129     /* This driver's reopen function doesn't currently allow changing
   1130      * other options, so let's put them back in the original QDict and
   1131      * bdrv_reopen_prepare() will detect changes and complain. */
   1132     qemu_opts_to_qdict(opts, state->options);
   1133 
   1134     /*
   1135      * As part of reopen prepare we also want to create new fd by
   1136      * raw_reconfigure_getfd(). But it wants updated "perm", when in
   1137      * bdrv_reopen_multiple() .bdrv_reopen_prepare() callback called prior to
   1138      * permission update. Happily, permission update is always a part (a seprate
   1139      * stage) of bdrv_reopen_multiple() so we can rely on this fact and
   1140      * reconfigure fd in raw_check_perm().
   1141      */
   1142 
   1143     s->reopen_state = state;
   1144     ret = 0;
   1145 
   1146 out:
   1147     qemu_opts_del(opts);
   1148     return ret;
   1149 }
   1150 
   1151 static void raw_reopen_commit(BDRVReopenState *state)
   1152 {
   1153     BDRVRawReopenState *rs = state->opaque;
   1154     BDRVRawState *s = state->bs->opaque;
   1155 
   1156     s->drop_cache = rs->drop_cache;
   1157     s->check_cache_dropped = rs->check_cache_dropped;
   1158     s->open_flags = rs->open_flags;
   1159     g_free(state->opaque);
   1160     state->opaque = NULL;
   1161 
   1162     assert(s->reopen_state == state);
   1163     s->reopen_state = NULL;
   1164 }
   1165 
   1166 
   1167 static void raw_reopen_abort(BDRVReopenState *state)
   1168 {
   1169     BDRVRawReopenState *rs = state->opaque;
   1170     BDRVRawState *s = state->bs->opaque;
   1171 
   1172      /* nothing to do if NULL, we didn't get far enough */
   1173     if (rs == NULL) {
   1174         return;
   1175     }
   1176 
   1177     g_free(state->opaque);
   1178     state->opaque = NULL;
   1179 
   1180     assert(s->reopen_state == state);
   1181     s->reopen_state = NULL;
   1182 }
   1183 
   1184 static int hdev_get_max_hw_transfer(int fd, struct stat *st)
   1185 {
   1186 #ifdef BLKSECTGET
   1187     if (S_ISBLK(st->st_mode)) {
   1188         unsigned short max_sectors = 0;
   1189         if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
   1190             return max_sectors * 512;
   1191         }
   1192     } else {
   1193         int max_bytes = 0;
   1194         if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
   1195             return max_bytes;
   1196         }
   1197     }
   1198     return -errno;
   1199 #else
   1200     return -ENOSYS;
   1201 #endif
   1202 }
   1203 
   1204 static int hdev_get_max_segments(int fd, struct stat *st)
   1205 {
   1206 #ifdef CONFIG_LINUX
   1207     char buf[32];
   1208     const char *end;
   1209     char *sysfspath = NULL;
   1210     int ret;
   1211     int sysfd = -1;
   1212     long max_segments;
   1213 
   1214     if (S_ISCHR(st->st_mode)) {
   1215         if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
   1216             return ret;
   1217         }
   1218         return -ENOTSUP;
   1219     }
   1220 
   1221     if (!S_ISBLK(st->st_mode)) {
   1222         return -ENOTSUP;
   1223     }
   1224 
   1225     sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
   1226                                 major(st->st_rdev), minor(st->st_rdev));
   1227     sysfd = open(sysfspath, O_RDONLY);
   1228     if (sysfd == -1) {
   1229         ret = -errno;
   1230         goto out;
   1231     }
   1232     do {
   1233         ret = read(sysfd, buf, sizeof(buf) - 1);
   1234     } while (ret == -1 && errno == EINTR);
   1235     if (ret < 0) {
   1236         ret = -errno;
   1237         goto out;
   1238     } else if (ret == 0) {
   1239         ret = -EIO;
   1240         goto out;
   1241     }
   1242     buf[ret] = 0;
   1243     /* The file is ended with '\n', pass 'end' to accept that. */
   1244     ret = qemu_strtol(buf, &end, 10, &max_segments);
   1245     if (ret == 0 && end && *end == '\n') {
   1246         ret = max_segments;
   1247     }
   1248 
   1249 out:
   1250     if (sysfd != -1) {
   1251         close(sysfd);
   1252     }
   1253     g_free(sysfspath);
   1254     return ret;
   1255 #else
   1256     return -ENOTSUP;
   1257 #endif
   1258 }
   1259 
   1260 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
   1261 {
   1262     BDRVRawState *s = bs->opaque;
   1263     struct stat st;
   1264 
   1265     s->needs_alignment = raw_needs_alignment(bs);
   1266     raw_probe_alignment(bs, s->fd, errp);
   1267 
   1268     bs->bl.min_mem_alignment = s->buf_align;
   1269     bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size());
   1270 
   1271     /*
   1272      * Maximum transfers are best effort, so it is okay to ignore any
   1273      * errors.  That said, based on the man page errors in fstat would be
   1274      * very much unexpected; the only possible case seems to be ENOMEM.
   1275      */
   1276     if (fstat(s->fd, &st)) {
   1277         return;
   1278     }
   1279 
   1280 #if defined(__APPLE__) && (__MACH__)
   1281     struct statfs buf;
   1282 
   1283     if (!fstatfs(s->fd, &buf)) {
   1284         bs->bl.opt_transfer = buf.f_iosize;
   1285         bs->bl.pdiscard_alignment = buf.f_bsize;
   1286     }
   1287 #endif
   1288 
   1289     if (bdrv_is_sg(bs) || S_ISBLK(st.st_mode)) {
   1290         int ret = hdev_get_max_hw_transfer(s->fd, &st);
   1291 
   1292         if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
   1293             bs->bl.max_hw_transfer = ret;
   1294         }
   1295 
   1296         ret = hdev_get_max_segments(s->fd, &st);
   1297         if (ret > 0) {
   1298             bs->bl.max_hw_iov = ret;
   1299         }
   1300     }
   1301 }
   1302 
   1303 static int check_for_dasd(int fd)
   1304 {
   1305 #ifdef BIODASDINFO2
   1306     struct dasd_information2_t info = {0};
   1307 
   1308     return ioctl(fd, BIODASDINFO2, &info);
   1309 #else
   1310     return -1;
   1311 #endif
   1312 }
   1313 
   1314 /**
   1315  * Try to get @bs's logical and physical block size.
   1316  * On success, store them in @bsz and return zero.
   1317  * On failure, return negative errno.
   1318  */
   1319 static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
   1320 {
   1321     BDRVRawState *s = bs->opaque;
   1322     int ret;
   1323 
   1324     /* If DASD, get blocksizes */
   1325     if (check_for_dasd(s->fd) < 0) {
   1326         return -ENOTSUP;
   1327     }
   1328     ret = probe_logical_blocksize(s->fd, &bsz->log);
   1329     if (ret < 0) {
   1330         return ret;
   1331     }
   1332     return probe_physical_blocksize(s->fd, &bsz->phys);
   1333 }
   1334 
   1335 /**
   1336  * Try to get @bs's geometry: cyls, heads, sectors.
   1337  * On success, store them in @geo and return 0.
   1338  * On failure return -errno.
   1339  * (Allows block driver to assign default geometry values that guest sees)
   1340  */
   1341 #ifdef __linux__
   1342 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
   1343 {
   1344     BDRVRawState *s = bs->opaque;
   1345     struct hd_geometry ioctl_geo = {0};
   1346 
   1347     /* If DASD, get its geometry */
   1348     if (check_for_dasd(s->fd) < 0) {
   1349         return -ENOTSUP;
   1350     }
   1351     if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
   1352         return -errno;
   1353     }
   1354     /* HDIO_GETGEO may return success even though geo contains zeros
   1355        (e.g. certain multipath setups) */
   1356     if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
   1357         return -ENOTSUP;
   1358     }
   1359     /* Do not return a geometry for partition */
   1360     if (ioctl_geo.start != 0) {
   1361         return -ENOTSUP;
   1362     }
   1363     geo->heads = ioctl_geo.heads;
   1364     geo->sectors = ioctl_geo.sectors;
   1365     geo->cylinders = ioctl_geo.cylinders;
   1366 
   1367     return 0;
   1368 }
   1369 #else /* __linux__ */
   1370 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
   1371 {
   1372     return -ENOTSUP;
   1373 }
   1374 #endif
   1375 
   1376 #if defined(__linux__)
   1377 static int handle_aiocb_ioctl(void *opaque)
   1378 {
   1379     RawPosixAIOData *aiocb = opaque;
   1380     int ret;
   1381 
   1382     do {
   1383         ret = ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf);
   1384     } while (ret == -1 && errno == EINTR);
   1385     if (ret == -1) {
   1386         return -errno;
   1387     }
   1388 
   1389     return 0;
   1390 }
   1391 #endif /* linux */
   1392 
   1393 static int handle_aiocb_flush(void *opaque)
   1394 {
   1395     RawPosixAIOData *aiocb = opaque;
   1396     BDRVRawState *s = aiocb->bs->opaque;
   1397     int ret;
   1398 
   1399     if (s->page_cache_inconsistent) {
   1400         return -s->page_cache_inconsistent;
   1401     }
   1402 
   1403     ret = qemu_fdatasync(aiocb->aio_fildes);
   1404     if (ret == -1) {
   1405         trace_file_flush_fdatasync_failed(errno);
   1406 
   1407         /* There is no clear definition of the semantics of a failing fsync(),
   1408          * so we may have to assume the worst. The sad truth is that this
   1409          * assumption is correct for Linux. Some pages are now probably marked
   1410          * clean in the page cache even though they are inconsistent with the
   1411          * on-disk contents. The next fdatasync() call would succeed, but no
   1412          * further writeback attempt will be made. We can't get back to a state
   1413          * in which we know what is on disk (we would have to rewrite
   1414          * everything that was touched since the last fdatasync() at least), so
   1415          * make bdrv_flush() fail permanently. Given that the behaviour isn't
   1416          * really defined, I have little hope that other OSes are doing better.
   1417          *
   1418          * Obviously, this doesn't affect O_DIRECT, which bypasses the page
   1419          * cache. */
   1420         if ((s->open_flags & O_DIRECT) == 0) {
   1421             s->page_cache_inconsistent = errno;
   1422         }
   1423         return -errno;
   1424     }
   1425     return 0;
   1426 }
   1427 
   1428 #ifdef CONFIG_PREADV
   1429 
   1430 static bool preadv_present = true;
   1431 
   1432 static ssize_t
   1433 qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
   1434 {
   1435     return preadv(fd, iov, nr_iov, offset);
   1436 }
   1437 
   1438 static ssize_t
   1439 qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
   1440 {
   1441     return pwritev(fd, iov, nr_iov, offset);
   1442 }
   1443 
   1444 #else
   1445 
   1446 static bool preadv_present = false;
   1447 
   1448 static ssize_t
   1449 qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
   1450 {
   1451     return -ENOSYS;
   1452 }
   1453 
   1454 static ssize_t
   1455 qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
   1456 {
   1457     return -ENOSYS;
   1458 }
   1459 
   1460 #endif
   1461 
   1462 static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
   1463 {
   1464     ssize_t len;
   1465 
   1466     do {
   1467         if (aiocb->aio_type & QEMU_AIO_WRITE)
   1468             len = qemu_pwritev(aiocb->aio_fildes,
   1469                                aiocb->io.iov,
   1470                                aiocb->io.niov,
   1471                                aiocb->aio_offset);
   1472          else
   1473             len = qemu_preadv(aiocb->aio_fildes,
   1474                               aiocb->io.iov,
   1475                               aiocb->io.niov,
   1476                               aiocb->aio_offset);
   1477     } while (len == -1 && errno == EINTR);
   1478 
   1479     if (len == -1) {
   1480         return -errno;
   1481     }
   1482     return len;
   1483 }
   1484 
   1485 /*
   1486  * Read/writes the data to/from a given linear buffer.
   1487  *
   1488  * Returns the number of bytes handles or -errno in case of an error. Short
   1489  * reads are only returned if the end of the file is reached.
   1490  */
   1491 static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
   1492 {
   1493     ssize_t offset = 0;
   1494     ssize_t len;
   1495 
   1496     while (offset < aiocb->aio_nbytes) {
   1497         if (aiocb->aio_type & QEMU_AIO_WRITE) {
   1498             len = pwrite(aiocb->aio_fildes,
   1499                          (const char *)buf + offset,
   1500                          aiocb->aio_nbytes - offset,
   1501                          aiocb->aio_offset + offset);
   1502         } else {
   1503             len = pread(aiocb->aio_fildes,
   1504                         buf + offset,
   1505                         aiocb->aio_nbytes - offset,
   1506                         aiocb->aio_offset + offset);
   1507         }
   1508         if (len == -1 && errno == EINTR) {
   1509             continue;
   1510         } else if (len == -1 && errno == EINVAL &&
   1511                    (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
   1512                    !(aiocb->aio_type & QEMU_AIO_WRITE) &&
   1513                    offset > 0) {
   1514             /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
   1515              * after a short read.  Assume that O_DIRECT short reads only occur
   1516              * at EOF.  Therefore this is a short read, not an I/O error.
   1517              */
   1518             break;
   1519         } else if (len == -1) {
   1520             offset = -errno;
   1521             break;
   1522         } else if (len == 0) {
   1523             break;
   1524         }
   1525         offset += len;
   1526     }
   1527 
   1528     return offset;
   1529 }
   1530 
   1531 static int handle_aiocb_rw(void *opaque)
   1532 {
   1533     RawPosixAIOData *aiocb = opaque;
   1534     ssize_t nbytes;
   1535     char *buf;
   1536 
   1537     if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
   1538         /*
   1539          * If there is just a single buffer, and it is properly aligned
   1540          * we can just use plain pread/pwrite without any problems.
   1541          */
   1542         if (aiocb->io.niov == 1) {
   1543             nbytes = handle_aiocb_rw_linear(aiocb, aiocb->io.iov->iov_base);
   1544             goto out;
   1545         }
   1546         /*
   1547          * We have more than one iovec, and all are properly aligned.
   1548          *
   1549          * Try preadv/pwritev first and fall back to linearizing the
   1550          * buffer if it's not supported.
   1551          */
   1552         if (preadv_present) {
   1553             nbytes = handle_aiocb_rw_vector(aiocb);
   1554             if (nbytes == aiocb->aio_nbytes ||
   1555                 (nbytes < 0 && nbytes != -ENOSYS)) {
   1556                 goto out;
   1557             }
   1558             preadv_present = false;
   1559         }
   1560 
   1561         /*
   1562          * XXX(hch): short read/write.  no easy way to handle the reminder
   1563          * using these interfaces.  For now retry using plain
   1564          * pread/pwrite?
   1565          */
   1566     }
   1567 
   1568     /*
   1569      * Ok, we have to do it the hard way, copy all segments into
   1570      * a single aligned buffer.
   1571      */
   1572     buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
   1573     if (buf == NULL) {
   1574         nbytes = -ENOMEM;
   1575         goto out;
   1576     }
   1577 
   1578     if (aiocb->aio_type & QEMU_AIO_WRITE) {
   1579         char *p = buf;
   1580         int i;
   1581 
   1582         for (i = 0; i < aiocb->io.niov; ++i) {
   1583             memcpy(p, aiocb->io.iov[i].iov_base, aiocb->io.iov[i].iov_len);
   1584             p += aiocb->io.iov[i].iov_len;
   1585         }
   1586         assert(p - buf == aiocb->aio_nbytes);
   1587     }
   1588 
   1589     nbytes = handle_aiocb_rw_linear(aiocb, buf);
   1590     if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
   1591         char *p = buf;
   1592         size_t count = aiocb->aio_nbytes, copy;
   1593         int i;
   1594 
   1595         for (i = 0; i < aiocb->io.niov && count; ++i) {
   1596             copy = count;
   1597             if (copy > aiocb->io.iov[i].iov_len) {
   1598                 copy = aiocb->io.iov[i].iov_len;
   1599             }
   1600             memcpy(aiocb->io.iov[i].iov_base, p, copy);
   1601             assert(count >= copy);
   1602             p     += copy;
   1603             count -= copy;
   1604         }
   1605         assert(count == 0);
   1606     }
   1607     qemu_vfree(buf);
   1608 
   1609 out:
   1610     if (nbytes == aiocb->aio_nbytes) {
   1611         return 0;
   1612     } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
   1613         if (aiocb->aio_type & QEMU_AIO_WRITE) {
   1614             return -EINVAL;
   1615         } else {
   1616             iov_memset(aiocb->io.iov, aiocb->io.niov, nbytes,
   1617                       0, aiocb->aio_nbytes - nbytes);
   1618             return 0;
   1619         }
   1620     } else {
   1621         assert(nbytes < 0);
   1622         return nbytes;
   1623     }
   1624 }
   1625 
   1626 #if defined(CONFIG_FALLOCATE) || defined(BLKZEROOUT) || defined(BLKDISCARD)
   1627 static int translate_err(int err)
   1628 {
   1629     if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP ||
   1630         err == -ENOTTY) {
   1631         err = -ENOTSUP;
   1632     }
   1633     return err;
   1634 }
   1635 #endif
   1636 
   1637 #ifdef CONFIG_FALLOCATE
   1638 static int do_fallocate(int fd, int mode, off_t offset, off_t len)
   1639 {
   1640     do {
   1641         if (fallocate(fd, mode, offset, len) == 0) {
   1642             return 0;
   1643         }
   1644     } while (errno == EINTR);
   1645     return translate_err(-errno);
   1646 }
   1647 #endif
   1648 
   1649 static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
   1650 {
   1651     int ret = -ENOTSUP;
   1652     BDRVRawState *s = aiocb->bs->opaque;
   1653 
   1654     if (!s->has_write_zeroes) {
   1655         return -ENOTSUP;
   1656     }
   1657 
   1658 #ifdef BLKZEROOUT
   1659     /* The BLKZEROOUT implementation in the kernel doesn't set
   1660      * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow
   1661      * fallbacks. */
   1662     if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) {
   1663         do {
   1664             uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
   1665             if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
   1666                 return 0;
   1667             }
   1668         } while (errno == EINTR);
   1669 
   1670         ret = translate_err(-errno);
   1671         if (ret == -ENOTSUP) {
   1672             s->has_write_zeroes = false;
   1673         }
   1674     }
   1675 #endif
   1676 
   1677     return ret;
   1678 }
   1679 
   1680 static int handle_aiocb_write_zeroes(void *opaque)
   1681 {
   1682     RawPosixAIOData *aiocb = opaque;
   1683 #ifdef CONFIG_FALLOCATE
   1684     BDRVRawState *s = aiocb->bs->opaque;
   1685     int64_t len;
   1686 #endif
   1687 
   1688     if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
   1689         return handle_aiocb_write_zeroes_block(aiocb);
   1690     }
   1691 
   1692 #ifdef CONFIG_FALLOCATE_ZERO_RANGE
   1693     if (s->has_write_zeroes) {
   1694         int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
   1695                                aiocb->aio_offset, aiocb->aio_nbytes);
   1696         if (ret == -ENOTSUP) {
   1697             s->has_write_zeroes = false;
   1698         } else if (ret == 0 || ret != -EINVAL) {
   1699             return ret;
   1700         }
   1701         /*
   1702          * Note: Some file systems do not like unaligned byte ranges, and
   1703          * return EINVAL in such a case, though they should not do it according
   1704          * to the man-page of fallocate(). Thus we simply ignore this return
   1705          * value and try the other fallbacks instead.
   1706          */
   1707     }
   1708 #endif
   1709 
   1710 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
   1711     if (s->has_discard && s->has_fallocate) {
   1712         int ret = do_fallocate(s->fd,
   1713                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
   1714                                aiocb->aio_offset, aiocb->aio_nbytes);
   1715         if (ret == 0) {
   1716             ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
   1717             if (ret == 0 || ret != -ENOTSUP) {
   1718                 return ret;
   1719             }
   1720             s->has_fallocate = false;
   1721         } else if (ret == -EINVAL) {
   1722             /*
   1723              * Some file systems like older versions of GPFS do not like un-
   1724              * aligned byte ranges, and return EINVAL in such a case, though
   1725              * they should not do it according to the man-page of fallocate().
   1726              * Warn about the bad filesystem and try the final fallback instead.
   1727              */
   1728             warn_report_once("Your file system is misbehaving: "
   1729                              "fallocate(FALLOC_FL_PUNCH_HOLE) returned EINVAL. "
   1730                              "Please report this bug to your file system "
   1731                              "vendor.");
   1732         } else if (ret != -ENOTSUP) {
   1733             return ret;
   1734         } else {
   1735             s->has_discard = false;
   1736         }
   1737     }
   1738 #endif
   1739 
   1740 #ifdef CONFIG_FALLOCATE
   1741     /* Last resort: we are trying to extend the file with zeroed data. This
   1742      * can be done via fallocate(fd, 0) */
   1743     len = bdrv_getlength(aiocb->bs);
   1744     if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) {
   1745         int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
   1746         if (ret == 0 || ret != -ENOTSUP) {
   1747             return ret;
   1748         }
   1749         s->has_fallocate = false;
   1750     }
   1751 #endif
   1752 
   1753     return -ENOTSUP;
   1754 }
   1755 
   1756 static int handle_aiocb_write_zeroes_unmap(void *opaque)
   1757 {
   1758     RawPosixAIOData *aiocb = opaque;
   1759     BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
   1760 
   1761     /* First try to write zeros and unmap at the same time */
   1762 
   1763 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
   1764     int ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
   1765                            aiocb->aio_offset, aiocb->aio_nbytes);
   1766     switch (ret) {
   1767     case -ENOTSUP:
   1768     case -EINVAL:
   1769     case -EBUSY:
   1770         break;
   1771     default:
   1772         return ret;
   1773     }
   1774 #endif
   1775 
   1776     /* If we couldn't manage to unmap while guaranteed that the area reads as
   1777      * all-zero afterwards, just write zeroes without unmapping */
   1778     return handle_aiocb_write_zeroes(aiocb);
   1779 }
   1780 
   1781 #ifndef HAVE_COPY_FILE_RANGE
   1782 static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
   1783                              off_t *out_off, size_t len, unsigned int flags)
   1784 {
   1785 #ifdef __NR_copy_file_range
   1786     return syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
   1787                    out_off, len, flags);
   1788 #else
   1789     errno = ENOSYS;
   1790     return -1;
   1791 #endif
   1792 }
   1793 #endif
   1794 
   1795 static int handle_aiocb_copy_range(void *opaque)
   1796 {
   1797     RawPosixAIOData *aiocb = opaque;
   1798     uint64_t bytes = aiocb->aio_nbytes;
   1799     off_t in_off = aiocb->aio_offset;
   1800     off_t out_off = aiocb->copy_range.aio_offset2;
   1801 
   1802     while (bytes) {
   1803         ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
   1804                                       aiocb->copy_range.aio_fd2, &out_off,
   1805                                       bytes, 0);
   1806         trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off,
   1807                                    aiocb->copy_range.aio_fd2, out_off, bytes,
   1808                                    0, ret);
   1809         if (ret == 0) {
   1810             /* No progress (e.g. when beyond EOF), let the caller fall back to
   1811              * buffer I/O. */
   1812             return -ENOSPC;
   1813         }
   1814         if (ret < 0) {
   1815             switch (errno) {
   1816             case ENOSYS:
   1817                 return -ENOTSUP;
   1818             case EINTR:
   1819                 continue;
   1820             default:
   1821                 return -errno;
   1822             }
   1823         }
   1824         bytes -= ret;
   1825     }
   1826     return 0;
   1827 }
   1828 
   1829 static int handle_aiocb_discard(void *opaque)
   1830 {
   1831     RawPosixAIOData *aiocb = opaque;
   1832     int ret = -ENOTSUP;
   1833     BDRVRawState *s = aiocb->bs->opaque;
   1834 
   1835     if (!s->has_discard) {
   1836         return -ENOTSUP;
   1837     }
   1838 
   1839     if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
   1840 #ifdef BLKDISCARD
   1841         do {
   1842             uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
   1843             if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
   1844                 return 0;
   1845             }
   1846         } while (errno == EINTR);
   1847 
   1848         ret = translate_err(-errno);
   1849 #endif
   1850     } else {
   1851 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
   1852         ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
   1853                            aiocb->aio_offset, aiocb->aio_nbytes);
   1854         ret = translate_err(ret);
   1855 #elif defined(__APPLE__) && (__MACH__)
   1856         fpunchhole_t fpunchhole;
   1857         fpunchhole.fp_flags = 0;
   1858         fpunchhole.reserved = 0;
   1859         fpunchhole.fp_offset = aiocb->aio_offset;
   1860         fpunchhole.fp_length = aiocb->aio_nbytes;
   1861         if (fcntl(s->fd, F_PUNCHHOLE, &fpunchhole) == -1) {
   1862             ret = errno == ENODEV ? -ENOTSUP : -errno;
   1863         } else {
   1864             ret = 0;
   1865         }
   1866 #endif
   1867     }
   1868 
   1869     if (ret == -ENOTSUP) {
   1870         s->has_discard = false;
   1871     }
   1872     return ret;
   1873 }
   1874 
   1875 /*
   1876  * Help alignment probing by allocating the first block.
   1877  *
   1878  * When reading with direct I/O from unallocated area on Gluster backed by XFS,
   1879  * reading succeeds regardless of request length. In this case we fallback to
   1880  * safe alignment which is not optimal. Allocating the first block avoids this
   1881  * fallback.
   1882  *
   1883  * fd may be opened with O_DIRECT, but we don't know the buffer alignment or
   1884  * request alignment, so we use safe values.
   1885  *
   1886  * Returns: 0 on success, -errno on failure. Since this is an optimization,
   1887  * caller may ignore failures.
   1888  */
   1889 static int allocate_first_block(int fd, size_t max_size)
   1890 {
   1891     size_t write_size = (max_size < MAX_BLOCKSIZE)
   1892         ? BDRV_SECTOR_SIZE
   1893         : MAX_BLOCKSIZE;
   1894     size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size());
   1895     void *buf;
   1896     ssize_t n;
   1897     int ret;
   1898 
   1899     buf = qemu_memalign(max_align, write_size);
   1900     memset(buf, 0, write_size);
   1901 
   1902     do {
   1903         n = pwrite(fd, buf, write_size, 0);
   1904     } while (n == -1 && errno == EINTR);
   1905 
   1906     ret = (n == -1) ? -errno : 0;
   1907 
   1908     qemu_vfree(buf);
   1909     return ret;
   1910 }
   1911 
   1912 static int handle_aiocb_truncate(void *opaque)
   1913 {
   1914     RawPosixAIOData *aiocb = opaque;
   1915     int result = 0;
   1916     int64_t current_length = 0;
   1917     char *buf = NULL;
   1918     struct stat st;
   1919     int fd = aiocb->aio_fildes;
   1920     int64_t offset = aiocb->aio_offset;
   1921     PreallocMode prealloc = aiocb->truncate.prealloc;
   1922     Error **errp = aiocb->truncate.errp;
   1923 
   1924     if (fstat(fd, &st) < 0) {
   1925         result = -errno;
   1926         error_setg_errno(errp, -result, "Could not stat file");
   1927         return result;
   1928     }
   1929 
   1930     current_length = st.st_size;
   1931     if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
   1932         error_setg(errp, "Cannot use preallocation for shrinking files");
   1933         return -ENOTSUP;
   1934     }
   1935 
   1936     switch (prealloc) {
   1937 #ifdef CONFIG_POSIX_FALLOCATE
   1938     case PREALLOC_MODE_FALLOC:
   1939         /*
   1940          * Truncating before posix_fallocate() makes it about twice slower on
   1941          * file systems that do not support fallocate(), trying to check if a
   1942          * block is allocated before allocating it, so don't do that here.
   1943          */
   1944         if (offset != current_length) {
   1945             result = -posix_fallocate(fd, current_length,
   1946                                       offset - current_length);
   1947             if (result != 0) {
   1948                 /* posix_fallocate() doesn't set errno. */
   1949                 error_setg_errno(errp, -result,
   1950                                  "Could not preallocate new data");
   1951             } else if (current_length == 0) {
   1952                 /*
   1953                  * posix_fallocate() uses fallocate() if the filesystem
   1954                  * supports it, or fallback to manually writing zeroes. If
   1955                  * fallocate() was used, unaligned reads from the fallocated
   1956                  * area in raw_probe_alignment() will succeed, hence we need to
   1957                  * allocate the first block.
   1958                  *
   1959                  * Optimize future alignment probing; ignore failures.
   1960                  */
   1961                 allocate_first_block(fd, offset);
   1962             }
   1963         } else {
   1964             result = 0;
   1965         }
   1966         goto out;
   1967 #endif
   1968     case PREALLOC_MODE_FULL:
   1969     {
   1970         int64_t num = 0, left = offset - current_length;
   1971         off_t seek_result;
   1972 
   1973         /*
   1974          * Knowing the final size from the beginning could allow the file
   1975          * system driver to do less allocations and possibly avoid
   1976          * fragmentation of the file.
   1977          */
   1978         if (ftruncate(fd, offset) != 0) {
   1979             result = -errno;
   1980             error_setg_errno(errp, -result, "Could not resize file");
   1981             goto out;
   1982         }
   1983 
   1984         buf = g_malloc0(65536);
   1985 
   1986         seek_result = lseek(fd, current_length, SEEK_SET);
   1987         if (seek_result < 0) {
   1988             result = -errno;
   1989             error_setg_errno(errp, -result,
   1990                              "Failed to seek to the old end of file");
   1991             goto out;
   1992         }
   1993 
   1994         while (left > 0) {
   1995             num = MIN(left, 65536);
   1996             result = write(fd, buf, num);
   1997             if (result < 0) {
   1998                 if (errno == EINTR) {
   1999                     continue;
   2000                 }
   2001                 result = -errno;
   2002                 error_setg_errno(errp, -result,
   2003                                  "Could not write zeros for preallocation");
   2004                 goto out;
   2005             }
   2006             left -= result;
   2007         }
   2008         if (result >= 0) {
   2009             result = fsync(fd);
   2010             if (result < 0) {
   2011                 result = -errno;
   2012                 error_setg_errno(errp, -result,
   2013                                  "Could not flush file to disk");
   2014                 goto out;
   2015             }
   2016         }
   2017         goto out;
   2018     }
   2019     case PREALLOC_MODE_OFF:
   2020         if (ftruncate(fd, offset) != 0) {
   2021             result = -errno;
   2022             error_setg_errno(errp, -result, "Could not resize file");
   2023         } else if (current_length == 0 && offset > current_length) {
   2024             /* Optimize future alignment probing; ignore failures. */
   2025             allocate_first_block(fd, offset);
   2026         }
   2027         return result;
   2028     default:
   2029         result = -ENOTSUP;
   2030         error_setg(errp, "Unsupported preallocation mode: %s",
   2031                    PreallocMode_str(prealloc));
   2032         return result;
   2033     }
   2034 
   2035 out:
   2036     if (result < 0) {
   2037         if (ftruncate(fd, current_length) < 0) {
   2038             error_report("Failed to restore old file length: %s",
   2039                          strerror(errno));
   2040         }
   2041     }
   2042 
   2043     g_free(buf);
   2044     return result;
   2045 }
   2046 
   2047 static int coroutine_fn raw_thread_pool_submit(BlockDriverState *bs,
   2048                                                ThreadPoolFunc func, void *arg)
   2049 {
   2050     /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */
   2051     ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
   2052     return thread_pool_submit_co(pool, func, arg);
   2053 }
   2054 
   2055 /*
   2056  * Check if all memory in this vector is sector aligned.
   2057  */
   2058 static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
   2059 {
   2060     int i;
   2061     size_t alignment = bdrv_min_mem_align(bs);
   2062     size_t len = bs->bl.request_alignment;
   2063     IO_CODE();
   2064 
   2065     for (i = 0; i < qiov->niov; i++) {
   2066         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
   2067             return false;
   2068         }
   2069         if (qiov->iov[i].iov_len % len) {
   2070             return false;
   2071         }
   2072     }
   2073 
   2074     return true;
   2075 }
   2076 
   2077 static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
   2078                                    uint64_t bytes, QEMUIOVector *qiov, int type)
   2079 {
   2080     BDRVRawState *s = bs->opaque;
   2081     RawPosixAIOData acb;
   2082 
   2083     if (fd_open(bs) < 0)
   2084         return -EIO;
   2085 
   2086     /*
   2087      * When using O_DIRECT, the request must be aligned to be able to use
   2088      * either libaio or io_uring interface. If not fail back to regular thread
   2089      * pool read/write code which emulates this for us if we
   2090      * set QEMU_AIO_MISALIGNED.
   2091      */
   2092     if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
   2093         type |= QEMU_AIO_MISALIGNED;
   2094 #ifdef CONFIG_LINUX_IO_URING
   2095     } else if (s->use_linux_io_uring) {
   2096         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
   2097         assert(qiov->size == bytes);
   2098         return luring_co_submit(bs, aio, s->fd, offset, qiov, type);
   2099 #endif
   2100 #ifdef CONFIG_LINUX_AIO
   2101     } else if (s->use_linux_aio) {
   2102         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
   2103         assert(qiov->size == bytes);
   2104         return laio_co_submit(bs, aio, s->fd, offset, qiov, type,
   2105                               s->aio_max_batch);
   2106 #endif
   2107     }
   2108 
   2109     acb = (RawPosixAIOData) {
   2110         .bs             = bs,
   2111         .aio_fildes     = s->fd,
   2112         .aio_type       = type,
   2113         .aio_offset     = offset,
   2114         .aio_nbytes     = bytes,
   2115         .io             = {
   2116             .iov            = qiov->iov,
   2117             .niov           = qiov->niov,
   2118         },
   2119     };
   2120 
   2121     assert(qiov->size == bytes);
   2122     return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
   2123 }
   2124 
   2125 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
   2126                                       int64_t bytes, QEMUIOVector *qiov,
   2127                                       BdrvRequestFlags flags)
   2128 {
   2129     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
   2130 }
   2131 
   2132 static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
   2133                                        int64_t bytes, QEMUIOVector *qiov,
   2134                                        BdrvRequestFlags flags)
   2135 {
   2136     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
   2137 }
   2138 
   2139 static void raw_aio_plug(BlockDriverState *bs)
   2140 {
   2141     BDRVRawState __attribute__((unused)) *s = bs->opaque;
   2142 #ifdef CONFIG_LINUX_AIO
   2143     if (s->use_linux_aio) {
   2144         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
   2145         laio_io_plug(bs, aio);
   2146     }
   2147 #endif
   2148 #ifdef CONFIG_LINUX_IO_URING
   2149     if (s->use_linux_io_uring) {
   2150         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
   2151         luring_io_plug(bs, aio);
   2152     }
   2153 #endif
   2154 }
   2155 
   2156 static void raw_aio_unplug(BlockDriverState *bs)
   2157 {
   2158     BDRVRawState __attribute__((unused)) *s = bs->opaque;
   2159 #ifdef CONFIG_LINUX_AIO
   2160     if (s->use_linux_aio) {
   2161         LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
   2162         laio_io_unplug(bs, aio, s->aio_max_batch);
   2163     }
   2164 #endif
   2165 #ifdef CONFIG_LINUX_IO_URING
   2166     if (s->use_linux_io_uring) {
   2167         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
   2168         luring_io_unplug(bs, aio);
   2169     }
   2170 #endif
   2171 }
   2172 
   2173 static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
   2174 {
   2175     BDRVRawState *s = bs->opaque;
   2176     RawPosixAIOData acb;
   2177     int ret;
   2178 
   2179     ret = fd_open(bs);
   2180     if (ret < 0) {
   2181         return ret;
   2182     }
   2183 
   2184     acb = (RawPosixAIOData) {
   2185         .bs             = bs,
   2186         .aio_fildes     = s->fd,
   2187         .aio_type       = QEMU_AIO_FLUSH,
   2188     };
   2189 
   2190 #ifdef CONFIG_LINUX_IO_URING
   2191     if (s->use_linux_io_uring) {
   2192         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
   2193         return luring_co_submit(bs, aio, s->fd, 0, NULL, QEMU_AIO_FLUSH);
   2194     }
   2195 #endif
   2196     return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb);
   2197 }
   2198 
   2199 static void raw_aio_attach_aio_context(BlockDriverState *bs,
   2200                                        AioContext *new_context)
   2201 {
   2202     BDRVRawState __attribute__((unused)) *s = bs->opaque;
   2203 #ifdef CONFIG_LINUX_AIO
   2204     if (s->use_linux_aio) {
   2205         Error *local_err = NULL;
   2206         if (!aio_setup_linux_aio(new_context, &local_err)) {
   2207             error_reportf_err(local_err, "Unable to use native AIO, "
   2208                                          "falling back to thread pool: ");
   2209             s->use_linux_aio = false;
   2210         }
   2211     }
   2212 #endif
   2213 #ifdef CONFIG_LINUX_IO_URING
   2214     if (s->use_linux_io_uring) {
   2215         Error *local_err = NULL;
   2216         if (!aio_setup_linux_io_uring(new_context, &local_err)) {
   2217             error_reportf_err(local_err, "Unable to use linux io_uring, "
   2218                                          "falling back to thread pool: ");
   2219             s->use_linux_io_uring = false;
   2220         }
   2221     }
   2222 #endif
   2223 }
   2224 
   2225 static void raw_close(BlockDriverState *bs)
   2226 {
   2227     BDRVRawState *s = bs->opaque;
   2228 
   2229     if (s->fd >= 0) {
   2230         qemu_close(s->fd);
   2231         s->fd = -1;
   2232     }
   2233 }
   2234 
   2235 /**
   2236  * Truncates the given regular file @fd to @offset and, when growing, fills the
   2237  * new space according to @prealloc.
   2238  *
   2239  * Returns: 0 on success, -errno on failure.
   2240  */
   2241 static int coroutine_fn
   2242 raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
   2243                      PreallocMode prealloc, Error **errp)
   2244 {
   2245     RawPosixAIOData acb;
   2246 
   2247     acb = (RawPosixAIOData) {
   2248         .bs             = bs,
   2249         .aio_fildes     = fd,
   2250         .aio_type       = QEMU_AIO_TRUNCATE,
   2251         .aio_offset     = offset,
   2252         .truncate       = {
   2253             .prealloc       = prealloc,
   2254             .errp           = errp,
   2255         },
   2256     };
   2257 
   2258     return raw_thread_pool_submit(bs, handle_aiocb_truncate, &acb);
   2259 }
   2260 
   2261 static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
   2262                                         bool exact, PreallocMode prealloc,
   2263                                         BdrvRequestFlags flags, Error **errp)
   2264 {
   2265     BDRVRawState *s = bs->opaque;
   2266     struct stat st;
   2267     int ret;
   2268 
   2269     if (fstat(s->fd, &st)) {
   2270         ret = -errno;
   2271         error_setg_errno(errp, -ret, "Failed to fstat() the file");
   2272         return ret;
   2273     }
   2274 
   2275     if (S_ISREG(st.st_mode)) {
   2276         /* Always resizes to the exact @offset */
   2277         return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
   2278     }
   2279 
   2280     if (prealloc != PREALLOC_MODE_OFF) {
   2281         error_setg(errp, "Preallocation mode '%s' unsupported for this "
   2282                    "non-regular file", PreallocMode_str(prealloc));
   2283         return -ENOTSUP;
   2284     }
   2285 
   2286     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
   2287         int64_t cur_length = raw_getlength(bs);
   2288 
   2289         if (offset != cur_length && exact) {
   2290             error_setg(errp, "Cannot resize device files");
   2291             return -ENOTSUP;
   2292         } else if (offset > cur_length) {
   2293             error_setg(errp, "Cannot grow device files");
   2294             return -EINVAL;
   2295         }
   2296     } else {
   2297         error_setg(errp, "Resizing this file is not supported");
   2298         return -ENOTSUP;
   2299     }
   2300 
   2301     return 0;
   2302 }
   2303 
   2304 #ifdef __OpenBSD__
   2305 static int64_t raw_getlength(BlockDriverState *bs)
   2306 {
   2307     BDRVRawState *s = bs->opaque;
   2308     int fd = s->fd;
   2309     struct stat st;
   2310 
   2311     if (fstat(fd, &st))
   2312         return -errno;
   2313     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
   2314         struct disklabel dl;
   2315 
   2316         if (ioctl(fd, DIOCGDINFO, &dl))
   2317             return -errno;
   2318         return (uint64_t)dl.d_secsize *
   2319             dl.d_partitions[DISKPART(st.st_rdev)].p_size;
   2320     } else
   2321         return st.st_size;
   2322 }
   2323 #elif defined(__NetBSD__)
   2324 static int64_t raw_getlength(BlockDriverState *bs)
   2325 {
   2326     BDRVRawState *s = bs->opaque;
   2327     int fd = s->fd;
   2328     struct stat st;
   2329 
   2330     if (fstat(fd, &st))
   2331         return -errno;
   2332     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
   2333         struct dkwedge_info dkw;
   2334 
   2335         if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
   2336             return dkw.dkw_size * 512;
   2337         } else {
   2338             struct disklabel dl;
   2339 
   2340             if (ioctl(fd, DIOCGDINFO, &dl))
   2341                 return -errno;
   2342             return (uint64_t)dl.d_secsize *
   2343                 dl.d_partitions[DISKPART(st.st_rdev)].p_size;
   2344         }
   2345     } else
   2346         return st.st_size;
   2347 }
   2348 #elif defined(__sun__)
   2349 static int64_t raw_getlength(BlockDriverState *bs)
   2350 {
   2351     BDRVRawState *s = bs->opaque;
   2352     struct dk_minfo minfo;
   2353     int ret;
   2354     int64_t size;
   2355 
   2356     ret = fd_open(bs);
   2357     if (ret < 0) {
   2358         return ret;
   2359     }
   2360 
   2361     /*
   2362      * Use the DKIOCGMEDIAINFO ioctl to read the size.
   2363      */
   2364     ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
   2365     if (ret != -1) {
   2366         return minfo.dki_lbsize * minfo.dki_capacity;
   2367     }
   2368 
   2369     /*
   2370      * There are reports that lseek on some devices fails, but
   2371      * irc discussion said that contingency on contingency was overkill.
   2372      */
   2373     size = lseek(s->fd, 0, SEEK_END);
   2374     if (size < 0) {
   2375         return -errno;
   2376     }
   2377     return size;
   2378 }
   2379 #elif defined(CONFIG_BSD)
   2380 static int64_t raw_getlength(BlockDriverState *bs)
   2381 {
   2382     BDRVRawState *s = bs->opaque;
   2383     int fd = s->fd;
   2384     int64_t size;
   2385     struct stat sb;
   2386 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
   2387     int reopened = 0;
   2388 #endif
   2389     int ret;
   2390 
   2391     ret = fd_open(bs);
   2392     if (ret < 0)
   2393         return ret;
   2394 
   2395 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
   2396 again:
   2397 #endif
   2398     if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
   2399         size = 0;
   2400 #ifdef DIOCGMEDIASIZE
   2401         if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) {
   2402             size = 0;
   2403         }
   2404 #endif
   2405 #ifdef DIOCGPART
   2406         if (size == 0) {
   2407             struct partinfo pi;
   2408             if (ioctl(fd, DIOCGPART, &pi) == 0) {
   2409                 size = pi.media_size;
   2410             }
   2411         }
   2412 #endif
   2413 #if defined(DKIOCGETBLOCKCOUNT) && defined(DKIOCGETBLOCKSIZE)
   2414         if (size == 0) {
   2415             uint64_t sectors = 0;
   2416             uint32_t sector_size = 0;
   2417 
   2418             if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
   2419                && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
   2420                 size = sectors * sector_size;
   2421             }
   2422         }
   2423 #endif
   2424         if (size == 0) {
   2425             size = lseek(fd, 0LL, SEEK_END);
   2426         }
   2427         if (size < 0) {
   2428             return -errno;
   2429         }
   2430 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
   2431         switch(s->type) {
   2432         case FTYPE_CD:
   2433             /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
   2434             if (size == 2048LL * (unsigned)-1)
   2435                 size = 0;
   2436             /* XXX no disc?  maybe we need to reopen... */
   2437             if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
   2438                 reopened = 1;
   2439                 goto again;
   2440             }
   2441         }
   2442 #endif
   2443     } else {
   2444         size = lseek(fd, 0, SEEK_END);
   2445         if (size < 0) {
   2446             return -errno;
   2447         }
   2448     }
   2449     return size;
   2450 }
   2451 #else
   2452 static int64_t raw_getlength(BlockDriverState *bs)
   2453 {
   2454     BDRVRawState *s = bs->opaque;
   2455     int ret;
   2456     int64_t size;
   2457 
   2458     ret = fd_open(bs);
   2459     if (ret < 0) {
   2460         return ret;
   2461     }
   2462 
   2463     size = lseek(s->fd, 0, SEEK_END);
   2464     if (size < 0) {
   2465         return -errno;
   2466     }
   2467     return size;
   2468 }
   2469 #endif
   2470 
   2471 static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
   2472 {
   2473     struct stat st;
   2474     BDRVRawState *s = bs->opaque;
   2475 
   2476     if (fstat(s->fd, &st) < 0) {
   2477         return -errno;
   2478     }
   2479     return (int64_t)st.st_blocks * 512;
   2480 }
   2481 
   2482 static int coroutine_fn
   2483 raw_co_create(BlockdevCreateOptions *options, Error **errp)
   2484 {
   2485     BlockdevCreateOptionsFile *file_opts;
   2486     Error *local_err = NULL;
   2487     int fd;
   2488     uint64_t perm, shared;
   2489     int result = 0;
   2490 
   2491     /* Validate options and set default values */
   2492     assert(options->driver == BLOCKDEV_DRIVER_FILE);
   2493     file_opts = &options->u.file;
   2494 
   2495     if (!file_opts->has_nocow) {
   2496         file_opts->nocow = false;
   2497     }
   2498     if (!file_opts->has_preallocation) {
   2499         file_opts->preallocation = PREALLOC_MODE_OFF;
   2500     }
   2501     if (!file_opts->has_extent_size_hint) {
   2502         file_opts->extent_size_hint = 1 * MiB;
   2503     }
   2504     if (file_opts->extent_size_hint > UINT32_MAX) {
   2505         result = -EINVAL;
   2506         error_setg(errp, "Extent size hint is too large");
   2507         goto out;
   2508     }
   2509 
   2510     /* Create file */
   2511     fd = qemu_create(file_opts->filename, O_RDWR | O_BINARY, 0644, errp);
   2512     if (fd < 0) {
   2513         result = -errno;
   2514         goto out;
   2515     }
   2516 
   2517     /* Take permissions: We want to discard everything, so we need
   2518      * BLK_PERM_WRITE; and truncation to the desired size requires
   2519      * BLK_PERM_RESIZE.
   2520      * On the other hand, we cannot share the RESIZE permission
   2521      * because we promise that after this function, the file has the
   2522      * size given in the options.  If someone else were to resize it
   2523      * concurrently, we could not guarantee that.
   2524      * Note that after this function, we can no longer guarantee that
   2525      * the file is not touched by a third party, so it may be resized
   2526      * then. */
   2527     perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
   2528     shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
   2529 
   2530     /* Step one: Take locks */
   2531     result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
   2532     if (result < 0) {
   2533         goto out_close;
   2534     }
   2535 
   2536     /* Step two: Check that nobody else has taken conflicting locks */
   2537     result = raw_check_lock_bytes(fd, perm, shared, errp);
   2538     if (result < 0) {
   2539         error_append_hint(errp,
   2540                           "Is another process using the image [%s]?\n",
   2541                           file_opts->filename);
   2542         goto out_unlock;
   2543     }
   2544 
   2545     /* Clear the file by truncating it to 0 */
   2546     result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp);
   2547     if (result < 0) {
   2548         goto out_unlock;
   2549     }
   2550 
   2551     if (file_opts->nocow) {
   2552 #ifdef __linux__
   2553         /* Set NOCOW flag to solve performance issue on fs like btrfs.
   2554          * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
   2555          * will be ignored since any failure of this operation should not
   2556          * block the left work.
   2557          */
   2558         int attr;
   2559         if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
   2560             attr |= FS_NOCOW_FL;
   2561             ioctl(fd, FS_IOC_SETFLAGS, &attr);
   2562         }
   2563 #endif
   2564     }
   2565 #ifdef FS_IOC_FSSETXATTR
   2566     /*
   2567      * Try to set the extent size hint. Failure is not fatal, and a warning is
   2568      * only printed if the option was explicitly specified.
   2569      */
   2570     {
   2571         struct fsxattr attr;
   2572         result = ioctl(fd, FS_IOC_FSGETXATTR, &attr);
   2573         if (result == 0) {
   2574             attr.fsx_xflags |= FS_XFLAG_EXTSIZE;
   2575             attr.fsx_extsize = file_opts->extent_size_hint;
   2576             result = ioctl(fd, FS_IOC_FSSETXATTR, &attr);
   2577         }
   2578         if (result < 0 && file_opts->has_extent_size_hint &&
   2579             file_opts->extent_size_hint)
   2580         {
   2581             warn_report("Failed to set extent size hint: %s",
   2582                         strerror(errno));
   2583         }
   2584     }
   2585 #endif
   2586 
   2587     /* Resize and potentially preallocate the file to the desired
   2588      * final size */
   2589     result = raw_regular_truncate(NULL, fd, file_opts->size,
   2590                                   file_opts->preallocation, errp);
   2591     if (result < 0) {
   2592         goto out_unlock;
   2593     }
   2594 
   2595 out_unlock:
   2596     raw_apply_lock_bytes(NULL, fd, 0, 0, true, &local_err);
   2597     if (local_err) {
   2598         /* The above call should not fail, and if it does, that does
   2599          * not mean the whole creation operation has failed.  So
   2600          * report it the user for their convenience, but do not report
   2601          * it to the caller. */
   2602         warn_report_err(local_err);
   2603     }
   2604 
   2605 out_close:
   2606     if (qemu_close(fd) != 0 && result == 0) {
   2607         result = -errno;
   2608         error_setg_errno(errp, -result, "Could not close the new file");
   2609     }
   2610 out:
   2611     return result;
   2612 }
   2613 
   2614 static int coroutine_fn raw_co_create_opts(BlockDriver *drv,
   2615                                            const char *filename,
   2616                                            QemuOpts *opts,
   2617                                            Error **errp)
   2618 {
   2619     BlockdevCreateOptions options;
   2620     int64_t total_size = 0;
   2621     int64_t extent_size_hint = 0;
   2622     bool has_extent_size_hint = false;
   2623     bool nocow = false;
   2624     PreallocMode prealloc;
   2625     char *buf = NULL;
   2626     Error *local_err = NULL;
   2627 
   2628     /* Skip file: protocol prefix */
   2629     strstart(filename, "file:", &filename);
   2630 
   2631     /* Read out options */
   2632     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
   2633                           BDRV_SECTOR_SIZE);
   2634     if (qemu_opt_get(opts, BLOCK_OPT_EXTENT_SIZE_HINT)) {
   2635         has_extent_size_hint = true;
   2636         extent_size_hint =
   2637             qemu_opt_get_size_del(opts, BLOCK_OPT_EXTENT_SIZE_HINT, -1);
   2638     }
   2639     nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
   2640     buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
   2641     prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
   2642                                PREALLOC_MODE_OFF, &local_err);
   2643     g_free(buf);
   2644     if (local_err) {
   2645         error_propagate(errp, local_err);
   2646         return -EINVAL;
   2647     }
   2648 
   2649     options = (BlockdevCreateOptions) {
   2650         .driver     = BLOCKDEV_DRIVER_FILE,
   2651         .u.file     = {
   2652             .filename           = (char *) filename,
   2653             .size               = total_size,
   2654             .has_preallocation  = true,
   2655             .preallocation      = prealloc,
   2656             .has_nocow          = true,
   2657             .nocow              = nocow,
   2658             .has_extent_size_hint = has_extent_size_hint,
   2659             .extent_size_hint   = extent_size_hint,
   2660         },
   2661     };
   2662     return raw_co_create(&options, errp);
   2663 }
   2664 
   2665 static int coroutine_fn raw_co_delete_file(BlockDriverState *bs,
   2666                                            Error **errp)
   2667 {
   2668     struct stat st;
   2669     int ret;
   2670 
   2671     if (!(stat(bs->filename, &st) == 0) || !S_ISREG(st.st_mode)) {
   2672         error_setg_errno(errp, ENOENT, "%s is not a regular file",
   2673                          bs->filename);
   2674         return -ENOENT;
   2675     }
   2676 
   2677     ret = unlink(bs->filename);
   2678     if (ret < 0) {
   2679         ret = -errno;
   2680         error_setg_errno(errp, -ret, "Error when deleting file %s",
   2681                          bs->filename);
   2682     }
   2683 
   2684     return ret;
   2685 }
   2686 
   2687 /*
   2688  * Find allocation range in @bs around offset @start.
   2689  * May change underlying file descriptor's file offset.
   2690  * If @start is not in a hole, store @start in @data, and the
   2691  * beginning of the next hole in @hole, and return 0.
   2692  * If @start is in a non-trailing hole, store @start in @hole and the
   2693  * beginning of the next non-hole in @data, and return 0.
   2694  * If @start is in a trailing hole or beyond EOF, return -ENXIO.
   2695  * If we can't find out, return a negative errno other than -ENXIO.
   2696  */
   2697 static int find_allocation(BlockDriverState *bs, off_t start,
   2698                            off_t *data, off_t *hole)
   2699 {
   2700 #if defined SEEK_HOLE && defined SEEK_DATA
   2701     BDRVRawState *s = bs->opaque;
   2702     off_t offs;
   2703 
   2704     /*
   2705      * SEEK_DATA cases:
   2706      * D1. offs == start: start is in data
   2707      * D2. offs > start: start is in a hole, next data at offs
   2708      * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
   2709      *                              or start is beyond EOF
   2710      *     If the latter happens, the file has been truncated behind
   2711      *     our back since we opened it.  All bets are off then.
   2712      *     Treating like a trailing hole is simplest.
   2713      * D4. offs < 0, errno != ENXIO: we learned nothing
   2714      */
   2715     offs = lseek(s->fd, start, SEEK_DATA);
   2716     if (offs < 0) {
   2717         return -errno;          /* D3 or D4 */
   2718     }
   2719 
   2720     if (offs < start) {
   2721         /* This is not a valid return by lseek().  We are safe to just return
   2722          * -EIO in this case, and we'll treat it like D4. */
   2723         return -EIO;
   2724     }
   2725 
   2726     if (offs > start) {
   2727         /* D2: in hole, next data at offs */
   2728         *hole = start;
   2729         *data = offs;
   2730         return 0;
   2731     }
   2732 
   2733     /* D1: in data, end not yet known */
   2734 
   2735     /*
   2736      * SEEK_HOLE cases:
   2737      * H1. offs == start: start is in a hole
   2738      *     If this happens here, a hole has been dug behind our back
   2739      *     since the previous lseek().
   2740      * H2. offs > start: either start is in data, next hole at offs,
   2741      *                   or start is in trailing hole, EOF at offs
   2742      *     Linux treats trailing holes like any other hole: offs ==
   2743      *     start.  Solaris seeks to EOF instead: offs > start (blech).
   2744      *     If that happens here, a hole has been dug behind our back
   2745      *     since the previous lseek().
   2746      * H3. offs < 0, errno = ENXIO: start is beyond EOF
   2747      *     If this happens, the file has been truncated behind our
   2748      *     back since we opened it.  Treat it like a trailing hole.
   2749      * H4. offs < 0, errno != ENXIO: we learned nothing
   2750      *     Pretend we know nothing at all, i.e. "forget" about D1.
   2751      */
   2752     offs = lseek(s->fd, start, SEEK_HOLE);
   2753     if (offs < 0) {
   2754         return -errno;          /* D1 and (H3 or H4) */
   2755     }
   2756 
   2757     if (offs < start) {
   2758         /* This is not a valid return by lseek().  We are safe to just return
   2759          * -EIO in this case, and we'll treat it like H4. */
   2760         return -EIO;
   2761     }
   2762 
   2763     if (offs > start) {
   2764         /*
   2765          * D1 and H2: either in data, next hole at offs, or it was in
   2766          * data but is now in a trailing hole.  In the latter case,
   2767          * all bets are off.  Treating it as if it there was data all
   2768          * the way to EOF is safe, so simply do that.
   2769          */
   2770         *data = start;
   2771         *hole = offs;
   2772         return 0;
   2773     }
   2774 
   2775     /* D1 and H1 */
   2776     return -EBUSY;
   2777 #else
   2778     return -ENOTSUP;
   2779 #endif
   2780 }
   2781 
   2782 /*
   2783  * Returns the allocation status of the specified offset.
   2784  *
   2785  * The block layer guarantees 'offset' and 'bytes' are within bounds.
   2786  *
   2787  * 'pnum' is set to the number of bytes (including and immediately following
   2788  * the specified offset) that are known to be in the same
   2789  * allocated/unallocated state.
   2790  *
   2791  * 'bytes' is a soft cap for 'pnum'.  If the information is free, 'pnum' may
   2792  * well exceed it.
   2793  */
   2794 static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
   2795                                             bool want_zero,
   2796                                             int64_t offset,
   2797                                             int64_t bytes, int64_t *pnum,
   2798                                             int64_t *map,
   2799                                             BlockDriverState **file)
   2800 {
   2801     off_t data = 0, hole = 0;
   2802     int ret;
   2803 
   2804     assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
   2805 
   2806     ret = fd_open(bs);
   2807     if (ret < 0) {
   2808         return ret;
   2809     }
   2810 
   2811     if (!want_zero) {
   2812         *pnum = bytes;
   2813         *map = offset;
   2814         *file = bs;
   2815         return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
   2816     }
   2817 
   2818     ret = find_allocation(bs, offset, &data, &hole);
   2819     if (ret == -ENXIO) {
   2820         /* Trailing hole */
   2821         *pnum = bytes;
   2822         ret = BDRV_BLOCK_ZERO;
   2823     } else if (ret < 0) {
   2824         /* No info available, so pretend there are no holes */
   2825         *pnum = bytes;
   2826         ret = BDRV_BLOCK_DATA;
   2827     } else if (data == offset) {
   2828         /* On a data extent, compute bytes to the end of the extent,
   2829          * possibly including a partial sector at EOF. */
   2830         *pnum = hole - offset;
   2831 
   2832         /*
   2833          * We are not allowed to return partial sectors, though, so
   2834          * round up if necessary.
   2835          */
   2836         if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
   2837             int64_t file_length = raw_getlength(bs);
   2838             if (file_length > 0) {
   2839                 /* Ignore errors, this is just a safeguard */
   2840                 assert(hole == file_length);
   2841             }
   2842             *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
   2843         }
   2844 
   2845         ret = BDRV_BLOCK_DATA;
   2846     } else {
   2847         /* On a hole, compute bytes to the beginning of the next extent.  */
   2848         assert(hole == offset);
   2849         *pnum = data - offset;
   2850         ret = BDRV_BLOCK_ZERO;
   2851     }
   2852     *map = offset;
   2853     *file = bs;
   2854     return ret | BDRV_BLOCK_OFFSET_VALID;
   2855 }
   2856 
   2857 #if defined(__linux__)
   2858 /* Verify that the file is not in the page cache */
   2859 static void check_cache_dropped(BlockDriverState *bs, Error **errp)
   2860 {
   2861     const size_t window_size = 128 * 1024 * 1024;
   2862     BDRVRawState *s = bs->opaque;
   2863     void *window = NULL;
   2864     size_t length = 0;
   2865     unsigned char *vec;
   2866     size_t page_size;
   2867     off_t offset;
   2868     off_t end;
   2869 
   2870     /* mincore(2) page status information requires 1 byte per page */
   2871     page_size = sysconf(_SC_PAGESIZE);
   2872     vec = g_malloc(DIV_ROUND_UP(window_size, page_size));
   2873 
   2874     end = raw_getlength(bs);
   2875 
   2876     for (offset = 0; offset < end; offset += window_size) {
   2877         void *new_window;
   2878         size_t new_length;
   2879         size_t vec_end;
   2880         size_t i;
   2881         int ret;
   2882 
   2883         /* Unmap previous window if size has changed */
   2884         new_length = MIN(end - offset, window_size);
   2885         if (new_length != length) {
   2886             munmap(window, length);
   2887             window = NULL;
   2888             length = 0;
   2889         }
   2890 
   2891         new_window = mmap(window, new_length, PROT_NONE, MAP_PRIVATE,
   2892                           s->fd, offset);
   2893         if (new_window == MAP_FAILED) {
   2894             error_setg_errno(errp, errno, "mmap failed");
   2895             break;
   2896         }
   2897 
   2898         window = new_window;
   2899         length = new_length;
   2900 
   2901         ret = mincore(window, length, vec);
   2902         if (ret < 0) {
   2903             error_setg_errno(errp, errno, "mincore failed");
   2904             break;
   2905         }
   2906 
   2907         vec_end = DIV_ROUND_UP(length, page_size);
   2908         for (i = 0; i < vec_end; i++) {
   2909             if (vec[i] & 0x1) {
   2910                 break;
   2911             }
   2912         }
   2913         if (i < vec_end) {
   2914             error_setg(errp, "page cache still in use!");
   2915             break;
   2916         }
   2917     }
   2918 
   2919     if (window) {
   2920         munmap(window, length);
   2921     }
   2922 
   2923     g_free(vec);
   2924 }
   2925 #endif /* __linux__ */
   2926 
   2927 static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
   2928                                                  Error **errp)
   2929 {
   2930     BDRVRawState *s = bs->opaque;
   2931     int ret;
   2932 
   2933     ret = fd_open(bs);
   2934     if (ret < 0) {
   2935         error_setg_errno(errp, -ret, "The file descriptor is not open");
   2936         return;
   2937     }
   2938 
   2939     if (!s->drop_cache) {
   2940         return;
   2941     }
   2942 
   2943     if (s->open_flags & O_DIRECT) {
   2944         return; /* No host kernel page cache */
   2945     }
   2946 
   2947 #if defined(__linux__)
   2948     /* This sets the scene for the next syscall... */
   2949     ret = bdrv_co_flush(bs);
   2950     if (ret < 0) {
   2951         error_setg_errno(errp, -ret, "flush failed");
   2952         return;
   2953     }
   2954 
   2955     /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
   2956      * process.  These limitations are okay because we just fsynced the file,
   2957      * we don't use mmap, and the file should not be in use by other processes.
   2958      */
   2959     ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
   2960     if (ret != 0) { /* the return value is a positive errno */
   2961         error_setg_errno(errp, ret, "fadvise failed");
   2962         return;
   2963     }
   2964 
   2965     if (s->check_cache_dropped) {
   2966         check_cache_dropped(bs, errp);
   2967     }
   2968 #else /* __linux__ */
   2969     /* Do nothing.  Live migration to a remote host with cache.direct=off is
   2970      * unsupported on other host operating systems.  Cache consistency issues
   2971      * may occur but no error is reported here, partly because that's the
   2972      * historical behavior and partly because it's hard to differentiate valid
   2973      * configurations that should not cause errors.
   2974      */
   2975 #endif /* !__linux__ */
   2976 }
   2977 
   2978 static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
   2979 {
   2980     if (ret) {
   2981         s->stats.discard_nb_failed++;
   2982     } else {
   2983         s->stats.discard_nb_ok++;
   2984         s->stats.discard_bytes_ok += nbytes;
   2985     }
   2986 }
   2987 
   2988 static coroutine_fn int
   2989 raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
   2990                 bool blkdev)
   2991 {
   2992     BDRVRawState *s = bs->opaque;
   2993     RawPosixAIOData acb;
   2994     int ret;
   2995 
   2996     acb = (RawPosixAIOData) {
   2997         .bs             = bs,
   2998         .aio_fildes     = s->fd,
   2999         .aio_type       = QEMU_AIO_DISCARD,
   3000         .aio_offset     = offset,
   3001         .aio_nbytes     = bytes,
   3002     };
   3003 
   3004     if (blkdev) {
   3005         acb.aio_type |= QEMU_AIO_BLKDEV;
   3006     }
   3007 
   3008     ret = raw_thread_pool_submit(bs, handle_aiocb_discard, &acb);
   3009     raw_account_discard(s, bytes, ret);
   3010     return ret;
   3011 }
   3012 
   3013 static coroutine_fn int
   3014 raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
   3015 {
   3016     return raw_do_pdiscard(bs, offset, bytes, false);
   3017 }
   3018 
   3019 static int coroutine_fn
   3020 raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
   3021                      BdrvRequestFlags flags, bool blkdev)
   3022 {
   3023     BDRVRawState *s = bs->opaque;
   3024     RawPosixAIOData acb;
   3025     ThreadPoolFunc *handler;
   3026 
   3027 #ifdef CONFIG_FALLOCATE
   3028     if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
   3029         BdrvTrackedRequest *req;
   3030 
   3031         /*
   3032          * This is a workaround for a bug in the Linux XFS driver,
   3033          * where writes submitted through the AIO interface will be
   3034          * discarded if they happen beyond a concurrently running
   3035          * fallocate() that increases the file length (i.e., both the
   3036          * write and the fallocate() happen beyond the EOF).
   3037          *
   3038          * To work around it, we extend the tracked request for this
   3039          * zero write until INT64_MAX (effectively infinity), and mark
   3040          * it as serializing.
   3041          *
   3042          * We have to enable this workaround for all filesystems and
   3043          * AIO modes (not just XFS with aio=native), because for
   3044          * remote filesystems we do not know the host configuration.
   3045          */
   3046 
   3047         req = bdrv_co_get_self_request(bs);
   3048         assert(req);
   3049         assert(req->type == BDRV_TRACKED_WRITE);
   3050         assert(req->offset <= offset);
   3051         assert(req->offset + req->bytes >= offset + bytes);
   3052 
   3053         req->bytes = BDRV_MAX_LENGTH - req->offset;
   3054 
   3055         bdrv_check_request(req->offset, req->bytes, &error_abort);
   3056 
   3057         bdrv_make_request_serialising(req, bs->bl.request_alignment);
   3058     }
   3059 #endif
   3060 
   3061     acb = (RawPosixAIOData) {
   3062         .bs             = bs,
   3063         .aio_fildes     = s->fd,
   3064         .aio_type       = QEMU_AIO_WRITE_ZEROES,
   3065         .aio_offset     = offset,
   3066         .aio_nbytes     = bytes,
   3067     };
   3068 
   3069     if (blkdev) {
   3070         acb.aio_type |= QEMU_AIO_BLKDEV;
   3071     }
   3072     if (flags & BDRV_REQ_NO_FALLBACK) {
   3073         acb.aio_type |= QEMU_AIO_NO_FALLBACK;
   3074     }
   3075 
   3076     if (flags & BDRV_REQ_MAY_UNMAP) {
   3077         acb.aio_type |= QEMU_AIO_DISCARD;
   3078         handler = handle_aiocb_write_zeroes_unmap;
   3079     } else {
   3080         handler = handle_aiocb_write_zeroes;
   3081     }
   3082 
   3083     return raw_thread_pool_submit(bs, handler, &acb);
   3084 }
   3085 
   3086 static int coroutine_fn raw_co_pwrite_zeroes(
   3087     BlockDriverState *bs, int64_t offset,
   3088     int64_t bytes, BdrvRequestFlags flags)
   3089 {
   3090     return raw_do_pwrite_zeroes(bs, offset, bytes, flags, false);
   3091 }
   3092 
   3093 static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
   3094 {
   3095     return 0;
   3096 }
   3097 
   3098 static BlockStatsSpecificFile get_blockstats_specific_file(BlockDriverState *bs)
   3099 {
   3100     BDRVRawState *s = bs->opaque;
   3101     return (BlockStatsSpecificFile) {
   3102         .discard_nb_ok = s->stats.discard_nb_ok,
   3103         .discard_nb_failed = s->stats.discard_nb_failed,
   3104         .discard_bytes_ok = s->stats.discard_bytes_ok,
   3105     };
   3106 }
   3107 
   3108 static BlockStatsSpecific *raw_get_specific_stats(BlockDriverState *bs)
   3109 {
   3110     BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
   3111 
   3112     stats->driver = BLOCKDEV_DRIVER_FILE;
   3113     stats->u.file = get_blockstats_specific_file(bs);
   3114 
   3115     return stats;
   3116 }
   3117 
   3118 #if defined(HAVE_HOST_BLOCK_DEVICE)
   3119 static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs)
   3120 {
   3121     BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
   3122 
   3123     stats->driver = BLOCKDEV_DRIVER_HOST_DEVICE;
   3124     stats->u.host_device = get_blockstats_specific_file(bs);
   3125 
   3126     return stats;
   3127 }
   3128 #endif /* HAVE_HOST_BLOCK_DEVICE */
   3129 
   3130 static QemuOptsList raw_create_opts = {
   3131     .name = "raw-create-opts",
   3132     .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
   3133     .desc = {
   3134         {
   3135             .name = BLOCK_OPT_SIZE,
   3136             .type = QEMU_OPT_SIZE,
   3137             .help = "Virtual disk size"
   3138         },
   3139         {
   3140             .name = BLOCK_OPT_NOCOW,
   3141             .type = QEMU_OPT_BOOL,
   3142             .help = "Turn off copy-on-write (valid only on btrfs)"
   3143         },
   3144         {
   3145             .name = BLOCK_OPT_PREALLOC,
   3146             .type = QEMU_OPT_STRING,
   3147             .help = "Preallocation mode (allowed values: off"
   3148 #ifdef CONFIG_POSIX_FALLOCATE
   3149                     ", falloc"
   3150 #endif
   3151                     ", full)"
   3152         },
   3153         {
   3154             .name = BLOCK_OPT_EXTENT_SIZE_HINT,
   3155             .type = QEMU_OPT_SIZE,
   3156             .help = "Extent size hint for the image file, 0 to disable"
   3157         },
   3158         { /* end of list */ }
   3159     }
   3160 };
   3161 
   3162 static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
   3163                           Error **errp)
   3164 {
   3165     BDRVRawState *s = bs->opaque;
   3166     int input_flags = s->reopen_state ? s->reopen_state->flags : bs->open_flags;
   3167     int open_flags;
   3168     int ret;
   3169 
   3170     /* We may need a new fd if auto-read-only switches the mode */
   3171     ret = raw_reconfigure_getfd(bs, input_flags, &open_flags, perm,
   3172                                 false, errp);
   3173     if (ret < 0) {
   3174         return ret;
   3175     } else if (ret != s->fd) {
   3176         Error *local_err = NULL;
   3177 
   3178         /*
   3179          * Fail already check_perm() if we can't get a working O_DIRECT
   3180          * alignment with the new fd.
   3181          */
   3182         raw_probe_alignment(bs, ret, &local_err);
   3183         if (local_err) {
   3184             error_propagate(errp, local_err);
   3185             return -EINVAL;
   3186         }
   3187 
   3188         s->perm_change_fd = ret;
   3189         s->perm_change_flags = open_flags;
   3190     }
   3191 
   3192     /* Prepare permissions on old fd to avoid conflicts between old and new,
   3193      * but keep everything locked that new will need. */
   3194     ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
   3195     if (ret < 0) {
   3196         goto fail;
   3197     }
   3198 
   3199     /* Copy locks to the new fd */
   3200     if (s->perm_change_fd && s->use_lock) {
   3201         ret = raw_apply_lock_bytes(NULL, s->perm_change_fd, perm, ~shared,
   3202                                    false, errp);
   3203         if (ret < 0) {
   3204             raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
   3205             goto fail;
   3206         }
   3207     }
   3208     return 0;
   3209 
   3210 fail:
   3211     if (s->perm_change_fd) {
   3212         qemu_close(s->perm_change_fd);
   3213     }
   3214     s->perm_change_fd = 0;
   3215     return ret;
   3216 }
   3217 
   3218 static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
   3219 {
   3220     BDRVRawState *s = bs->opaque;
   3221 
   3222     /* For reopen, we have already switched to the new fd (.bdrv_set_perm is
   3223      * called after .bdrv_reopen_commit) */
   3224     if (s->perm_change_fd && s->fd != s->perm_change_fd) {
   3225         qemu_close(s->fd);
   3226         s->fd = s->perm_change_fd;
   3227         s->open_flags = s->perm_change_flags;
   3228     }
   3229     s->perm_change_fd = 0;
   3230 
   3231     raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);
   3232     s->perm = perm;
   3233     s->shared_perm = shared;
   3234 }
   3235 
   3236 static void raw_abort_perm_update(BlockDriverState *bs)
   3237 {
   3238     BDRVRawState *s = bs->opaque;
   3239 
   3240     /* For reopen, .bdrv_reopen_abort is called afterwards and will close
   3241      * the file descriptor. */
   3242     if (s->perm_change_fd) {
   3243         qemu_close(s->perm_change_fd);
   3244     }
   3245     s->perm_change_fd = 0;
   3246 
   3247     raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
   3248 }
   3249 
   3250 static int coroutine_fn raw_co_copy_range_from(
   3251         BlockDriverState *bs, BdrvChild *src, int64_t src_offset,
   3252         BdrvChild *dst, int64_t dst_offset, int64_t bytes,
   3253         BdrvRequestFlags read_flags, BdrvRequestFlags write_flags)
   3254 {
   3255     return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
   3256                                  read_flags, write_flags);
   3257 }
   3258 
   3259 static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
   3260                                              BdrvChild *src,
   3261                                              int64_t src_offset,
   3262                                              BdrvChild *dst,
   3263                                              int64_t dst_offset,
   3264                                              int64_t bytes,
   3265                                              BdrvRequestFlags read_flags,
   3266                                              BdrvRequestFlags write_flags)
   3267 {
   3268     RawPosixAIOData acb;
   3269     BDRVRawState *s = bs->opaque;
   3270     BDRVRawState *src_s;
   3271 
   3272     assert(dst->bs == bs);
   3273     if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) {
   3274         return -ENOTSUP;
   3275     }
   3276 
   3277     src_s = src->bs->opaque;
   3278     if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) {
   3279         return -EIO;
   3280     }
   3281 
   3282     acb = (RawPosixAIOData) {
   3283         .bs             = bs,
   3284         .aio_type       = QEMU_AIO_COPY_RANGE,
   3285         .aio_fildes     = src_s->fd,
   3286         .aio_offset     = src_offset,
   3287         .aio_nbytes     = bytes,
   3288         .copy_range     = {
   3289             .aio_fd2        = s->fd,
   3290             .aio_offset2    = dst_offset,
   3291         },
   3292     };
   3293 
   3294     return raw_thread_pool_submit(bs, handle_aiocb_copy_range, &acb);
   3295 }
   3296 
   3297 BlockDriver bdrv_file = {
   3298     .format_name = "file",
   3299     .protocol_name = "file",
   3300     .instance_size = sizeof(BDRVRawState),
   3301     .bdrv_needs_filename = true,
   3302     .bdrv_probe = NULL, /* no probe for protocols */
   3303     .bdrv_parse_filename = raw_parse_filename,
   3304     .bdrv_file_open = raw_open,
   3305     .bdrv_reopen_prepare = raw_reopen_prepare,
   3306     .bdrv_reopen_commit = raw_reopen_commit,
   3307     .bdrv_reopen_abort = raw_reopen_abort,
   3308     .bdrv_close = raw_close,
   3309     .bdrv_co_create = raw_co_create,
   3310     .bdrv_co_create_opts = raw_co_create_opts,
   3311     .bdrv_has_zero_init = bdrv_has_zero_init_1,
   3312     .bdrv_co_block_status = raw_co_block_status,
   3313     .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
   3314     .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
   3315     .bdrv_co_delete_file = raw_co_delete_file,
   3316 
   3317     .bdrv_co_preadv         = raw_co_preadv,
   3318     .bdrv_co_pwritev        = raw_co_pwritev,
   3319     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
   3320     .bdrv_co_pdiscard       = raw_co_pdiscard,
   3321     .bdrv_co_copy_range_from = raw_co_copy_range_from,
   3322     .bdrv_co_copy_range_to  = raw_co_copy_range_to,
   3323     .bdrv_refresh_limits = raw_refresh_limits,
   3324     .bdrv_io_plug = raw_aio_plug,
   3325     .bdrv_io_unplug = raw_aio_unplug,
   3326     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
   3327 
   3328     .bdrv_co_truncate = raw_co_truncate,
   3329     .bdrv_getlength = raw_getlength,
   3330     .bdrv_get_info = raw_get_info,
   3331     .bdrv_get_allocated_file_size
   3332                         = raw_get_allocated_file_size,
   3333     .bdrv_get_specific_stats = raw_get_specific_stats,
   3334     .bdrv_check_perm = raw_check_perm,
   3335     .bdrv_set_perm   = raw_set_perm,
   3336     .bdrv_abort_perm_update = raw_abort_perm_update,
   3337     .create_opts = &raw_create_opts,
   3338     .mutable_opts = mutable_opts,
   3339 };
   3340 
   3341 /***********************************************/
   3342 /* host device */
   3343 
   3344 #if defined(HAVE_HOST_BLOCK_DEVICE)
   3345 
   3346 #if defined(__APPLE__) && defined(__MACH__)
   3347 static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
   3348                                 CFIndex maxPathSize, int flags);
   3349 
   3350 #if !defined(MAC_OS_VERSION_12_0) \
   3351     || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_VERSION_12_0)
   3352 #define IOMainPort IOMasterPort
   3353 #endif
   3354 
   3355 static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
   3356 {
   3357     kern_return_t kernResult = KERN_FAILURE;
   3358     mach_port_t mainPort;
   3359     CFMutableDictionaryRef  classesToMatch;
   3360     const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
   3361     char *mediaType = NULL;
   3362 
   3363     kernResult = IOMainPort(MACH_PORT_NULL, &mainPort);
   3364     if ( KERN_SUCCESS != kernResult ) {
   3365         printf("IOMainPort returned %d\n", kernResult);
   3366     }
   3367 
   3368     int index;
   3369     for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
   3370         classesToMatch = IOServiceMatching(matching_array[index]);
   3371         if (classesToMatch == NULL) {
   3372             error_report("IOServiceMatching returned NULL for %s",
   3373                          matching_array[index]);
   3374             continue;
   3375         }
   3376         CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
   3377                              kCFBooleanTrue);
   3378         kernResult = IOServiceGetMatchingServices(mainPort, classesToMatch,
   3379                                                   mediaIterator);
   3380         if (kernResult != KERN_SUCCESS) {
   3381             error_report("Note: IOServiceGetMatchingServices returned %d",
   3382                          kernResult);
   3383             continue;
   3384         }
   3385 
   3386         /* If a match was found, leave the loop */
   3387         if (*mediaIterator != 0) {
   3388             trace_file_FindEjectableOpticalMedia(matching_array[index]);
   3389             mediaType = g_strdup(matching_array[index]);
   3390             break;
   3391         }
   3392     }
   3393     return mediaType;
   3394 }
   3395 
   3396 kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
   3397                          CFIndex maxPathSize, int flags)
   3398 {
   3399     io_object_t     nextMedia;
   3400     kern_return_t   kernResult = KERN_FAILURE;
   3401     *bsdPath = '\0';
   3402     nextMedia = IOIteratorNext( mediaIterator );
   3403     if ( nextMedia )
   3404     {
   3405         CFTypeRef   bsdPathAsCFString;
   3406     bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
   3407         if ( bsdPathAsCFString ) {
   3408             size_t devPathLength;
   3409             strcpy( bsdPath, _PATH_DEV );
   3410             if (flags & BDRV_O_NOCACHE) {
   3411                 strcat(bsdPath, "r");
   3412             }
   3413             devPathLength = strlen( bsdPath );
   3414             if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
   3415                 kernResult = KERN_SUCCESS;
   3416             }
   3417             CFRelease( bsdPathAsCFString );
   3418         }
   3419         IOObjectRelease( nextMedia );
   3420     }
   3421 
   3422     return kernResult;
   3423 }
   3424 
   3425 /* Sets up a real cdrom for use in QEMU */
   3426 static bool setup_cdrom(char *bsd_path, Error **errp)
   3427 {
   3428     int index, num_of_test_partitions = 2, fd;
   3429     char test_partition[MAXPATHLEN];
   3430     bool partition_found = false;
   3431 
   3432     /* look for a working partition */
   3433     for (index = 0; index < num_of_test_partitions; index++) {
   3434         snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
   3435                  index);
   3436         fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE, NULL);
   3437         if (fd >= 0) {
   3438             partition_found = true;
   3439             qemu_close(fd);
   3440             break;
   3441         }
   3442     }
   3443 
   3444     /* if a working partition on the device was not found */
   3445     if (partition_found == false) {
   3446         error_setg(errp, "Failed to find a working partition on disc");
   3447     } else {
   3448         trace_file_setup_cdrom(test_partition);
   3449         pstrcpy(bsd_path, MAXPATHLEN, test_partition);
   3450     }
   3451     return partition_found;
   3452 }
   3453 
   3454 /* Prints directions on mounting and unmounting a device */
   3455 static void print_unmounting_directions(const char *file_name)
   3456 {
   3457     error_report("If device %s is mounted on the desktop, unmount"
   3458                  " it first before using it in QEMU", file_name);
   3459     error_report("Command to unmount device: diskutil unmountDisk %s",
   3460                  file_name);
   3461     error_report("Command to mount device: diskutil mountDisk %s", file_name);
   3462 }
   3463 
   3464 #endif /* defined(__APPLE__) && defined(__MACH__) */
   3465 
   3466 static int hdev_probe_device(const char *filename)
   3467 {
   3468     struct stat st;
   3469 
   3470     /* allow a dedicated CD-ROM driver to match with a higher priority */
   3471     if (strstart(filename, "/dev/cdrom", NULL))
   3472         return 50;
   3473 
   3474     if (stat(filename, &st) >= 0 &&
   3475             (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
   3476         return 100;
   3477     }
   3478 
   3479     return 0;
   3480 }
   3481 
   3482 static void hdev_parse_filename(const char *filename, QDict *options,
   3483                                 Error **errp)
   3484 {
   3485     bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
   3486 }
   3487 
   3488 static bool hdev_is_sg(BlockDriverState *bs)
   3489 {
   3490 
   3491 #if defined(__linux__)
   3492 
   3493     BDRVRawState *s = bs->opaque;
   3494     struct stat st;
   3495     struct sg_scsi_id scsiid;
   3496     int sg_version;
   3497     int ret;
   3498 
   3499     if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) {
   3500         return false;
   3501     }
   3502 
   3503     ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version);
   3504     if (ret < 0) {
   3505         return false;
   3506     }
   3507 
   3508     ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid);
   3509     if (ret >= 0) {
   3510         trace_file_hdev_is_sg(scsiid.scsi_type, sg_version);
   3511         return true;
   3512     }
   3513 
   3514 #endif
   3515 
   3516     return false;
   3517 }
   3518 
   3519 static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
   3520                      Error **errp)
   3521 {
   3522     BDRVRawState *s = bs->opaque;
   3523     int ret;
   3524 
   3525 #if defined(__APPLE__) && defined(__MACH__)
   3526     /*
   3527      * Caution: while qdict_get_str() is fine, getting non-string types
   3528      * would require more care.  When @options come from -blockdev or
   3529      * blockdev_add, its members are typed according to the QAPI
   3530      * schema, but when they come from -drive, they're all QString.
   3531      */
   3532     const char *filename = qdict_get_str(options, "filename");
   3533     char bsd_path[MAXPATHLEN] = "";
   3534     bool error_occurred = false;
   3535 
   3536     /* If using a real cdrom */
   3537     if (strcmp(filename, "/dev/cdrom") == 0) {
   3538         char *mediaType = NULL;
   3539         kern_return_t ret_val;
   3540         io_iterator_t mediaIterator = 0;
   3541 
   3542         mediaType = FindEjectableOpticalMedia(&mediaIterator);
   3543         if (mediaType == NULL) {
   3544             error_setg(errp, "Please make sure your CD/DVD is in the optical"
   3545                        " drive");
   3546             error_occurred = true;
   3547             goto hdev_open_Mac_error;
   3548         }
   3549 
   3550         ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
   3551         if (ret_val != KERN_SUCCESS) {
   3552             error_setg(errp, "Could not get BSD path for optical drive");
   3553             error_occurred = true;
   3554             goto hdev_open_Mac_error;
   3555         }
   3556 
   3557         /* If a real optical drive was not found */
   3558         if (bsd_path[0] == '\0') {
   3559             error_setg(errp, "Failed to obtain bsd path for optical drive");
   3560             error_occurred = true;
   3561             goto hdev_open_Mac_error;
   3562         }
   3563 
   3564         /* If using a cdrom disc and finding a partition on the disc failed */
   3565         if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
   3566             setup_cdrom(bsd_path, errp) == false) {
   3567             print_unmounting_directions(bsd_path);
   3568             error_occurred = true;
   3569             goto hdev_open_Mac_error;
   3570         }
   3571 
   3572         qdict_put_str(options, "filename", bsd_path);
   3573 
   3574 hdev_open_Mac_error:
   3575         g_free(mediaType);
   3576         if (mediaIterator) {
   3577             IOObjectRelease(mediaIterator);
   3578         }
   3579         if (error_occurred) {
   3580             return -ENOENT;
   3581         }
   3582     }
   3583 #endif /* defined(__APPLE__) && defined(__MACH__) */
   3584 
   3585     s->type = FTYPE_FILE;
   3586 
   3587     ret = raw_open_common(bs, options, flags, 0, true, errp);
   3588     if (ret < 0) {
   3589 #if defined(__APPLE__) && defined(__MACH__)
   3590         if (*bsd_path) {
   3591             filename = bsd_path;
   3592         }
   3593         /* if a physical device experienced an error while being opened */
   3594         if (strncmp(filename, "/dev/", 5) == 0) {
   3595             print_unmounting_directions(filename);
   3596         }
   3597 #endif /* defined(__APPLE__) && defined(__MACH__) */
   3598         return ret;
   3599     }
   3600 
   3601     /* Since this does ioctl the device must be already opened */
   3602     bs->sg = hdev_is_sg(bs);
   3603 
   3604     return ret;
   3605 }
   3606 
   3607 #if defined(__linux__)
   3608 static int coroutine_fn
   3609 hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
   3610 {
   3611     BDRVRawState *s = bs->opaque;
   3612     RawPosixAIOData acb;
   3613     int ret;
   3614 
   3615     ret = fd_open(bs);
   3616     if (ret < 0) {
   3617         return ret;
   3618     }
   3619 
   3620     if (req == SG_IO && s->pr_mgr) {
   3621         struct sg_io_hdr *io_hdr = buf;
   3622         if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
   3623             io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
   3624             return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs),
   3625                                       s->fd, io_hdr);
   3626         }
   3627     }
   3628 
   3629     acb = (RawPosixAIOData) {
   3630         .bs         = bs,
   3631         .aio_type   = QEMU_AIO_IOCTL,
   3632         .aio_fildes = s->fd,
   3633         .aio_offset = 0,
   3634         .ioctl      = {
   3635             .buf        = buf,
   3636             .cmd        = req,
   3637         },
   3638     };
   3639 
   3640     return raw_thread_pool_submit(bs, handle_aiocb_ioctl, &acb);
   3641 }
   3642 #endif /* linux */
   3643 
   3644 static coroutine_fn int
   3645 hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
   3646 {
   3647     BDRVRawState *s = bs->opaque;
   3648     int ret;
   3649 
   3650     ret = fd_open(bs);
   3651     if (ret < 0) {
   3652         raw_account_discard(s, bytes, ret);
   3653         return ret;
   3654     }
   3655     return raw_do_pdiscard(bs, offset, bytes, true);
   3656 }
   3657 
   3658 static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
   3659     int64_t offset, int64_t bytes, BdrvRequestFlags flags)
   3660 {
   3661     int rc;
   3662 
   3663     rc = fd_open(bs);
   3664     if (rc < 0) {
   3665         return rc;
   3666     }
   3667 
   3668     return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true);
   3669 }
   3670 
   3671 static BlockDriver bdrv_host_device = {
   3672     .format_name        = "host_device",
   3673     .protocol_name        = "host_device",
   3674     .instance_size      = sizeof(BDRVRawState),
   3675     .bdrv_needs_filename = true,
   3676     .bdrv_probe_device  = hdev_probe_device,
   3677     .bdrv_parse_filename = hdev_parse_filename,
   3678     .bdrv_file_open     = hdev_open,
   3679     .bdrv_close         = raw_close,
   3680     .bdrv_reopen_prepare = raw_reopen_prepare,
   3681     .bdrv_reopen_commit  = raw_reopen_commit,
   3682     .bdrv_reopen_abort   = raw_reopen_abort,
   3683     .bdrv_co_create_opts = bdrv_co_create_opts_simple,
   3684     .create_opts         = &bdrv_create_opts_simple,
   3685     .mutable_opts        = mutable_opts,
   3686     .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
   3687     .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
   3688 
   3689     .bdrv_co_preadv         = raw_co_preadv,
   3690     .bdrv_co_pwritev        = raw_co_pwritev,
   3691     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
   3692     .bdrv_co_pdiscard       = hdev_co_pdiscard,
   3693     .bdrv_co_copy_range_from = raw_co_copy_range_from,
   3694     .bdrv_co_copy_range_to  = raw_co_copy_range_to,
   3695     .bdrv_refresh_limits = raw_refresh_limits,
   3696     .bdrv_io_plug = raw_aio_plug,
   3697     .bdrv_io_unplug = raw_aio_unplug,
   3698     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
   3699 
   3700     .bdrv_co_truncate       = raw_co_truncate,
   3701     .bdrv_getlength	= raw_getlength,
   3702     .bdrv_get_info = raw_get_info,
   3703     .bdrv_get_allocated_file_size
   3704                         = raw_get_allocated_file_size,
   3705     .bdrv_get_specific_stats = hdev_get_specific_stats,
   3706     .bdrv_check_perm = raw_check_perm,
   3707     .bdrv_set_perm   = raw_set_perm,
   3708     .bdrv_abort_perm_update = raw_abort_perm_update,
   3709     .bdrv_probe_blocksizes = hdev_probe_blocksizes,
   3710     .bdrv_probe_geometry = hdev_probe_geometry,
   3711 
   3712     /* generic scsi device */
   3713 #ifdef __linux__
   3714     .bdrv_co_ioctl          = hdev_co_ioctl,
   3715 #endif
   3716 };
   3717 
   3718 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
   3719 static void cdrom_parse_filename(const char *filename, QDict *options,
   3720                                  Error **errp)
   3721 {
   3722     bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options);
   3723 }
   3724 #endif
   3725 
   3726 #ifdef __linux__
   3727 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
   3728                       Error **errp)
   3729 {
   3730     BDRVRawState *s = bs->opaque;
   3731 
   3732     s->type = FTYPE_CD;
   3733 
   3734     /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
   3735     return raw_open_common(bs, options, flags, O_NONBLOCK, true, errp);
   3736 }
   3737 
   3738 static int cdrom_probe_device(const char *filename)
   3739 {
   3740     int fd, ret;
   3741     int prio = 0;
   3742     struct stat st;
   3743 
   3744     fd = qemu_open(filename, O_RDONLY | O_NONBLOCK, NULL);
   3745     if (fd < 0) {
   3746         goto out;
   3747     }
   3748     ret = fstat(fd, &st);
   3749     if (ret == -1 || !S_ISBLK(st.st_mode)) {
   3750         goto outc;
   3751     }
   3752 
   3753     /* Attempt to detect via a CDROM specific ioctl */
   3754     ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
   3755     if (ret >= 0)
   3756         prio = 100;
   3757 
   3758 outc:
   3759     qemu_close(fd);
   3760 out:
   3761     return prio;
   3762 }
   3763 
   3764 static bool cdrom_is_inserted(BlockDriverState *bs)
   3765 {
   3766     BDRVRawState *s = bs->opaque;
   3767     int ret;
   3768 
   3769     ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
   3770     return ret == CDS_DISC_OK;
   3771 }
   3772 
   3773 static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
   3774 {
   3775     BDRVRawState *s = bs->opaque;
   3776 
   3777     if (eject_flag) {
   3778         if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
   3779             perror("CDROMEJECT");
   3780     } else {
   3781         if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
   3782             perror("CDROMEJECT");
   3783     }
   3784 }
   3785 
   3786 static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
   3787 {
   3788     BDRVRawState *s = bs->opaque;
   3789 
   3790     if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
   3791         /*
   3792          * Note: an error can happen if the distribution automatically
   3793          * mounts the CD-ROM
   3794          */
   3795         /* perror("CDROM_LOCKDOOR"); */
   3796     }
   3797 }
   3798 
   3799 static BlockDriver bdrv_host_cdrom = {
   3800     .format_name        = "host_cdrom",
   3801     .protocol_name      = "host_cdrom",
   3802     .instance_size      = sizeof(BDRVRawState),
   3803     .bdrv_needs_filename = true,
   3804     .bdrv_probe_device	= cdrom_probe_device,
   3805     .bdrv_parse_filename = cdrom_parse_filename,
   3806     .bdrv_file_open     = cdrom_open,
   3807     .bdrv_close         = raw_close,
   3808     .bdrv_reopen_prepare = raw_reopen_prepare,
   3809     .bdrv_reopen_commit  = raw_reopen_commit,
   3810     .bdrv_reopen_abort   = raw_reopen_abort,
   3811     .bdrv_co_create_opts = bdrv_co_create_opts_simple,
   3812     .create_opts         = &bdrv_create_opts_simple,
   3813     .mutable_opts        = mutable_opts,
   3814     .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
   3815 
   3816     .bdrv_co_preadv         = raw_co_preadv,
   3817     .bdrv_co_pwritev        = raw_co_pwritev,
   3818     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
   3819     .bdrv_refresh_limits = raw_refresh_limits,
   3820     .bdrv_io_plug = raw_aio_plug,
   3821     .bdrv_io_unplug = raw_aio_unplug,
   3822     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
   3823 
   3824     .bdrv_co_truncate    = raw_co_truncate,
   3825     .bdrv_getlength      = raw_getlength,
   3826     .has_variable_length = true,
   3827     .bdrv_get_allocated_file_size
   3828                         = raw_get_allocated_file_size,
   3829 
   3830     /* removable device support */
   3831     .bdrv_is_inserted   = cdrom_is_inserted,
   3832     .bdrv_eject         = cdrom_eject,
   3833     .bdrv_lock_medium   = cdrom_lock_medium,
   3834 
   3835     /* generic scsi device */
   3836     .bdrv_co_ioctl      = hdev_co_ioctl,
   3837 };
   3838 #endif /* __linux__ */
   3839 
   3840 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
   3841 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
   3842                       Error **errp)
   3843 {
   3844     BDRVRawState *s = bs->opaque;
   3845     int ret;
   3846 
   3847     s->type = FTYPE_CD;
   3848 
   3849     ret = raw_open_common(bs, options, flags, 0, true, errp);
   3850     if (ret) {
   3851         return ret;
   3852     }
   3853 
   3854     /* make sure the door isn't locked at this time */
   3855     ioctl(s->fd, CDIOCALLOW);
   3856     return 0;
   3857 }
   3858 
   3859 static int cdrom_probe_device(const char *filename)
   3860 {
   3861     if (strstart(filename, "/dev/cd", NULL) ||
   3862             strstart(filename, "/dev/acd", NULL))
   3863         return 100;
   3864     return 0;
   3865 }
   3866 
   3867 static int cdrom_reopen(BlockDriverState *bs)
   3868 {
   3869     BDRVRawState *s = bs->opaque;
   3870     int fd;
   3871 
   3872     /*
   3873      * Force reread of possibly changed/newly loaded disc,
   3874      * FreeBSD seems to not notice sometimes...
   3875      */
   3876     if (s->fd >= 0)
   3877         qemu_close(s->fd);
   3878     fd = qemu_open(bs->filename, s->open_flags, NULL);
   3879     if (fd < 0) {
   3880         s->fd = -1;
   3881         return -EIO;
   3882     }
   3883     s->fd = fd;
   3884 
   3885     /* make sure the door isn't locked at this time */
   3886     ioctl(s->fd, CDIOCALLOW);
   3887     return 0;
   3888 }
   3889 
   3890 static bool cdrom_is_inserted(BlockDriverState *bs)
   3891 {
   3892     return raw_getlength(bs) > 0;
   3893 }
   3894 
   3895 static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
   3896 {
   3897     BDRVRawState *s = bs->opaque;
   3898 
   3899     if (s->fd < 0)
   3900         return;
   3901 
   3902     (void) ioctl(s->fd, CDIOCALLOW);
   3903 
   3904     if (eject_flag) {
   3905         if (ioctl(s->fd, CDIOCEJECT) < 0)
   3906             perror("CDIOCEJECT");
   3907     } else {
   3908         if (ioctl(s->fd, CDIOCCLOSE) < 0)
   3909             perror("CDIOCCLOSE");
   3910     }
   3911 
   3912     cdrom_reopen(bs);
   3913 }
   3914 
   3915 static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
   3916 {
   3917     BDRVRawState *s = bs->opaque;
   3918 
   3919     if (s->fd < 0)
   3920         return;
   3921     if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
   3922         /*
   3923          * Note: an error can happen if the distribution automatically
   3924          * mounts the CD-ROM
   3925          */
   3926         /* perror("CDROM_LOCKDOOR"); */
   3927     }
   3928 }
   3929 
   3930 static BlockDriver bdrv_host_cdrom = {
   3931     .format_name        = "host_cdrom",
   3932     .protocol_name      = "host_cdrom",
   3933     .instance_size      = sizeof(BDRVRawState),
   3934     .bdrv_needs_filename = true,
   3935     .bdrv_probe_device	= cdrom_probe_device,
   3936     .bdrv_parse_filename = cdrom_parse_filename,
   3937     .bdrv_file_open     = cdrom_open,
   3938     .bdrv_close         = raw_close,
   3939     .bdrv_reopen_prepare = raw_reopen_prepare,
   3940     .bdrv_reopen_commit  = raw_reopen_commit,
   3941     .bdrv_reopen_abort   = raw_reopen_abort,
   3942     .bdrv_co_create_opts = bdrv_co_create_opts_simple,
   3943     .create_opts         = &bdrv_create_opts_simple,
   3944     .mutable_opts       = mutable_opts,
   3945 
   3946     .bdrv_co_preadv         = raw_co_preadv,
   3947     .bdrv_co_pwritev        = raw_co_pwritev,
   3948     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
   3949     .bdrv_refresh_limits = raw_refresh_limits,
   3950     .bdrv_io_plug = raw_aio_plug,
   3951     .bdrv_io_unplug = raw_aio_unplug,
   3952     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
   3953 
   3954     .bdrv_co_truncate    = raw_co_truncate,
   3955     .bdrv_getlength      = raw_getlength,
   3956     .has_variable_length = true,
   3957     .bdrv_get_allocated_file_size
   3958                         = raw_get_allocated_file_size,
   3959 
   3960     /* removable device support */
   3961     .bdrv_is_inserted   = cdrom_is_inserted,
   3962     .bdrv_eject         = cdrom_eject,
   3963     .bdrv_lock_medium   = cdrom_lock_medium,
   3964 };
   3965 #endif /* __FreeBSD__ */
   3966 
   3967 #endif /* HAVE_HOST_BLOCK_DEVICE */
   3968 
   3969 static void bdrv_file_init(void)
   3970 {
   3971     /*
   3972      * Register all the drivers.  Note that order is important, the driver
   3973      * registered last will get probed first.
   3974      */
   3975     bdrv_register(&bdrv_file);
   3976 #if defined(HAVE_HOST_BLOCK_DEVICE)
   3977     bdrv_register(&bdrv_host_device);
   3978 #ifdef __linux__
   3979     bdrv_register(&bdrv_host_cdrom);
   3980 #endif
   3981 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
   3982     bdrv_register(&bdrv_host_cdrom);
   3983 #endif
   3984 #endif /* HAVE_HOST_BLOCK_DEVICE */
   3985 }
   3986 
   3987 block_init(bdrv_file_init);