mirror of https://gitlab.com/qemu-project/qemu
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
420 lines
12 KiB
C
420 lines
12 KiB
C
/*
 * Export QEMU block device via VDUSE
 *
 * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
 *
 * Author:
 *   Xie Yongji <xieyongji@bytedance.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 */
|
|
|
|
#include "qemu/osdep.h"
|
|
#include <sys/eventfd.h>
|
|
|
|
#include "qapi/error.h"
|
|
#include "block/export.h"
|
|
#include "qemu/error-report.h"
|
|
#include "util/block-helpers.h"
|
|
#include "subprojects/libvduse/libvduse.h"
|
|
#include "virtio-blk-handler.h"
|
|
|
|
#include "standard-headers/linux/virtio_blk.h"
|
|
|
|
/* Number of virtqueues used when the export options do not set num-queues */
#define VDUSE_DEFAULT_NUM_QUEUE 1
/* Virtqueue size used when the export options do not set queue-size */
#define VDUSE_DEFAULT_QUEUE_SIZE 256
|
|
|
|
typedef struct VduseBlkExport {
|
|
BlockExport export;
|
|
VirtioBlkHandler handler;
|
|
VduseDev *dev;
|
|
uint16_t num_queues;
|
|
char *recon_file;
|
|
unsigned int inflight; /* atomic */
|
|
bool vqs_started;
|
|
} VduseBlkExport;
|
|
|
|
typedef struct VduseBlkReq {
|
|
VduseVirtqElement elem;
|
|
VduseVirtq *vq;
|
|
} VduseBlkReq;
|
|
|
|
/*
 * Account for one more request in flight.
 *
 * The transition from zero to one in-flight requests takes a reference on
 * the export, which vduse_blk_inflight_dec() drops again when the counter
 * returns to zero.
 */
static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp)
{
    unsigned int previous = qatomic_fetch_inc(&vblk_exp->inflight);

    if (previous != 0) {
        return;
    }

    /* First in-flight request: keep the export from being deleted */
    blk_exp_ref(&vblk_exp->export);
}
|
|
|
|
/*
 * Account for one request having finished.
 *
 * When the last in-flight request finishes, waiters are kicked and the
 * export reference taken by vduse_blk_inflight_inc() is dropped.
 */
static void vduse_blk_inflight_dec(VduseBlkExport *vblk_exp)
{
    unsigned int previous = qatomic_fetch_dec(&vblk_exp->inflight);

    if (previous != 1) {
        return;
    }

    /* That was the last one: wake AIO_WAIT_WHILE() */
    aio_wait_kick();

    /* From here on the export may be deleted */
    blk_exp_unref(&vblk_exp->export);
}
|
|
|
|
/*
 * Complete @req: push its element to the virtqueue with @in_len as the
 * length, notify the queue, and free the request.
 *
 * @req must not be used by the caller afterwards.
 */
static void vduse_blk_req_complete(VduseBlkReq *req, size_t in_len)
{
    VduseVirtq *queue = req->vq;

    vduse_queue_push(queue, &req->elem, in_len);
    vduse_queue_notify(queue);

    free(req);
}
|
|
|
|
/*
 * Coroutine entry point that processes one virtio-blk request.
 *
 * @opaque is the VduseBlkReq popped by vduse_blk_vq_handler(), which has
 * already accounted for it with vduse_blk_inflight_inc().  This function
 * takes ownership of the request and always frees it.
 *
 * If virtio_blk_process_req() returns a negative value, nothing is pushed
 * back to the virtqueue and the request is simply freed.  Otherwise the
 * request is completed with the returned length.
 *
 * On both paths the in-flight counter is decremented exactly once.  Skipping
 * the decrement on the error path would leave the counter (and the export
 * reference held while it is non-zero) stuck above zero, so
 * vduse_blk_drained_poll() would report pending requests forever.
 */
static void coroutine_fn vduse_blk_virtio_process_req(void *opaque)
{
    VduseBlkReq *req = opaque;
    VduseVirtq *vq = req->vq;
    VduseDev *dev = vduse_queue_get_dev(vq);
    VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
    VirtioBlkHandler *handler = &vblk_exp->handler;
    VduseVirtqElement *elem = &req->elem;
    struct iovec *in_iov = elem->in_sg;
    struct iovec *out_iov = elem->out_sg;
    unsigned in_num = elem->in_num;
    unsigned out_num = elem->out_num;
    int in_len;

    in_len = virtio_blk_process_req(handler, in_iov,
                                    out_iov, in_num, out_num);
    if (in_len < 0) {
        /* Request could not be processed: drop it without completing it */
        free(req);
    } else {
        /* Pushes the element, notifies the queue and frees req */
        vduse_blk_req_complete(req, in_len);
    }

    /* Balance the vduse_blk_inflight_inc() done before entering us */
    vduse_blk_inflight_dec(vblk_exp);
}
|
|
|
|
/*
 * Drain @vq: pop requests until the queue yields none, and run each one in
 * its own coroutine.
 *
 * Every popped request is counted as in flight before its coroutine is
 * entered.
 */
static void vduse_blk_vq_handler(VduseDev *dev, VduseVirtq *vq)
{
    VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);

    for (;;) {
        Coroutine *co;
        VduseBlkReq *req = vduse_queue_pop(vq, sizeof(VduseBlkReq));

        if (req == NULL) {
            /* Nothing left to pop */
            return;
        }
        req->vq = vq;

        co = qemu_coroutine_create(vduse_blk_virtio_process_req, req);

        vduse_blk_inflight_inc(vblk_exp);
        qemu_coroutine_enter(co);
    }
}
|
|
|
|
static void on_vduse_vq_kick(void *opaque)
|
|
{
|
|
VduseVirtq *vq = opaque;
|
|
VduseDev *dev = vduse_queue_get_dev(vq);
|
|
int fd = vduse_queue_get_fd(vq);
|
|
eventfd_t kick_data;
|
|
|
|
if (eventfd_read(fd, &kick_data) == -1) {
|
|
error_report("failed to read data from eventfd");
|
|
return;
|
|
}
|
|
|
|
vduse_blk_vq_handler(dev, vq);
|
|
}
|
|
|
|
/*
 * VduseOps.enable_queue callback: start handling kicks for @vq.
 *
 * Does nothing while the virtqueues are stopped (vqs_started is false);
 * vduse_blk_drained_end() will start the virtqueues later in that case.
 */
static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq)
{
    VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);

    if (vblk_exp->vqs_started) {
        aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
                           on_vduse_vq_kick, NULL, NULL, NULL, vq);

        /*
         * Kick the queue ourselves so that no kick is missed after
         * reconnecting
         */
        eventfd_write(vduse_queue_get_fd(vq), 1);
    }
}
|
|
|
|
/*
 * VduseOps.disable_queue callback: stop handling kicks for @vq by removing
 * the fd handler of its kick fd.
 *
 * A virtqueue whose fd is negative has nothing to remove and is skipped.
 */
static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq)
{
    VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
    int kick_fd = vduse_queue_get_fd(vq);

    if (kick_fd >= 0) {
        aio_set_fd_handler(vblk_exp->export.ctx, kick_fd,
                           NULL, NULL, NULL, NULL, NULL);
    }
}
|
|
|
|
static const VduseOps vduse_blk_ops = {
|
|
.enable_queue = vduse_blk_enable_queue,
|
|
.disable_queue = vduse_blk_disable_queue,
|
|
};
|
|
|
|
static void on_vduse_dev_kick(void *opaque)
|
|
{
|
|
VduseDev *dev = opaque;
|
|
|
|
vduse_dev_handler(dev);
|
|
}
|
|
|
|
/*
 * Install the handler for the VDUSE device fd.
 *
 * Note that the handler is registered in vblk_exp->export.ctx; the @ctx
 * argument itself is not used here.  Virtqueue fd handlers are not touched:
 * they are handled by vduse_blk_drained_end().
 */
static void vduse_blk_attach_ctx(VduseBlkExport *vblk_exp, AioContext *ctx)
{
    VduseDev *device = vblk_exp->dev;

    aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(device),
                       on_vduse_dev_kick, NULL, NULL, NULL, device);
}
|
|
|
|
/*
 * Remove the handler for the VDUSE device fd from vblk_exp->export.ctx.
 *
 * Virtqueue fd handlers are not touched: they are handled by
 * vduse_blk_drained_begin().
 */
static void vduse_blk_detach_ctx(VduseBlkExport *vblk_exp)
{
    int dev_fd = vduse_dev_get_fd(vblk_exp->dev);

    aio_set_fd_handler(vblk_exp->export.ctx, dev_fd,
                       NULL, NULL, NULL, NULL, NULL);
}
|
|
|
|
|
|
/*
 * AioContext notifier: the BlockBackend has been attached to @ctx.
 *
 * @opaque is the VduseBlkExport.  The new context is recorded in the export
 * first, because vduse_blk_attach_ctx() registers the device fd handler in
 * export.ctx.
 */
static void blk_aio_attached(AioContext *ctx, void *opaque)
{
    VduseBlkExport *vblk_exp = opaque;
    BlockExport *exp = &vblk_exp->export;

    exp->ctx = ctx;
    vduse_blk_attach_ctx(vblk_exp, ctx);
}
|
|
|
|
static void blk_aio_detach(void *opaque)
|
|
{
|
|
VduseBlkExport *vblk_exp = opaque;
|
|
|
|
vduse_blk_detach_ctx(vblk_exp);
|
|
vblk_exp->export.ctx = NULL;
|
|
}
|
|
|
|
static void vduse_blk_resize(void *opaque)
|
|
{
|
|
BlockExport *exp = opaque;
|
|
VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
|
|
struct virtio_blk_config config;
|
|
|
|
config.capacity =
|
|
cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS);
|
|
vduse_dev_update_config(vblk_exp->dev, sizeof(config.capacity),
|
|
offsetof(struct virtio_blk_config, capacity),
|
|
(char *)&config.capacity);
|
|
}
|
|
|
|
/*
 * Stop handling kicks on all virtqueues of the export and remember that
 * they are stopped, so that vduse_blk_enable_queue() becomes a no-op.
 */
static void vduse_blk_stop_virtqueues(VduseBlkExport *vblk_exp)
{
    VduseDev *device = vblk_exp->dev;
    uint16_t idx;

    for (idx = 0; idx < vblk_exp->num_queues; idx++) {
        vduse_blk_disable_queue(device, vduse_dev_get_queue(device, idx));
    }

    vblk_exp->vqs_started = false;
}
|
|
|
|
/*
 * Start handling kicks on all virtqueues of the export.
 *
 * vqs_started has to be set before the queues are enabled, because
 * vduse_blk_enable_queue() returns early while it is false.
 */
static void vduse_blk_start_virtqueues(VduseBlkExport *vblk_exp)
{
    VduseDev *device = vblk_exp->dev;
    uint16_t idx;

    vblk_exp->vqs_started = true;

    for (idx = 0; idx < vblk_exp->num_queues; idx++) {
        vduse_blk_enable_queue(device, vduse_dev_get_queue(device, idx));
    }
}
|
|
|
|
static void vduse_blk_drained_begin(void *opaque)
|
|
{
|
|
BlockExport *exp = opaque;
|
|
VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
|
|
|
|
vduse_blk_stop_virtqueues(vblk_exp);
|
|
}
|
|
|
|
static void vduse_blk_drained_end(void *opaque)
|
|
{
|
|
BlockExport *exp = opaque;
|
|
VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
|
|
|
|
vduse_blk_start_virtqueues(vblk_exp);
|
|
}
|
|
|
|
static bool vduse_blk_drained_poll(void *opaque)
|
|
{
|
|
BlockExport *exp = opaque;
|
|
VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
|
|
|
|
return qatomic_read(&vblk_exp->inflight) > 0;
|
|
}
|
|
|
|
static const BlockDevOps vduse_block_ops = {
|
|
.resize_cb = vduse_blk_resize,
|
|
.drained_begin = vduse_blk_drained_begin,
|
|
.drained_end = vduse_blk_drained_end,
|
|
.drained_poll = vduse_blk_drained_poll,
|
|
};
|
|
|
|
static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
|
|
Error **errp)
|
|
{
|
|
VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
|
|
BlockExportOptionsVduseBlk *vblk_opts = &opts->u.vduse_blk;
|
|
uint64_t logical_block_size = VIRTIO_BLK_SECTOR_SIZE;
|
|
uint16_t num_queues = VDUSE_DEFAULT_NUM_QUEUE;
|
|
uint16_t queue_size = VDUSE_DEFAULT_QUEUE_SIZE;
|
|
struct virtio_blk_config config = { 0 };
|
|
uint64_t features;
|
|
int i, ret;
|
|
|
|
if (vblk_opts->has_num_queues) {
|
|
num_queues = vblk_opts->num_queues;
|
|
if (num_queues == 0) {
|
|
error_setg(errp, "num-queues must be greater than 0");
|
|
return -EINVAL;
|
|
}
|
|
}
|
|
|
|
if (vblk_opts->has_queue_size) {
|
|
queue_size = vblk_opts->queue_size;
|
|
if (queue_size <= 2 || !is_power_of_2(queue_size) ||
|
|
queue_size > VIRTQUEUE_MAX_SIZE) {
|
|
error_setg(errp, "queue-size is invalid");
|
|
return -EINVAL;
|
|
}
|
|
}
|
|
|
|
if (vblk_opts->has_logical_block_size) {
|
|
logical_block_size = vblk_opts->logical_block_size;
|
|
if (!check_block_size("logical-block-size", logical_block_size,
|
|
errp)) {
|
|
return -EINVAL;
|
|
}
|
|
}
|
|
vblk_exp->num_queues = num_queues;
|
|
vblk_exp->handler.blk = exp->blk;
|
|
vblk_exp->handler.serial = g_strdup(vblk_opts->serial ?: "");
|
|
vblk_exp->handler.logical_block_size = logical_block_size;
|
|
vblk_exp->handler.writable = opts->writable;
|
|
vblk_exp->vqs_started = true;
|
|
|
|
config.capacity =
|
|
cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS);
|
|
config.seg_max = cpu_to_le32(queue_size - 2);
|
|
config.min_io_size = cpu_to_le16(1);
|
|
config.opt_io_size = cpu_to_le32(1);
|
|
config.num_queues = cpu_to_le16(num_queues);
|
|
config.blk_size = cpu_to_le32(logical_block_size);
|
|
config.max_discard_sectors = cpu_to_le32(VIRTIO_BLK_MAX_DISCARD_SECTORS);
|
|
config.max_discard_seg = cpu_to_le32(1);
|
|
config.discard_sector_alignment =
|
|
cpu_to_le32(logical_block_size >> VIRTIO_BLK_SECTOR_BITS);
|
|
config.max_write_zeroes_sectors =
|
|
cpu_to_le32(VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS);
|
|
config.max_write_zeroes_seg = cpu_to_le32(1);
|
|
|
|
features = vduse_get_virtio_features() |
|
|
(1ULL << VIRTIO_BLK_F_SEG_MAX) |
|
|
(1ULL << VIRTIO_BLK_F_TOPOLOGY) |
|
|
(1ULL << VIRTIO_BLK_F_BLK_SIZE) |
|
|
(1ULL << VIRTIO_BLK_F_FLUSH) |
|
|
(1ULL << VIRTIO_BLK_F_DISCARD) |
|
|
(1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
|
|
|
|
if (num_queues > 1) {
|
|
features |= 1ULL << VIRTIO_BLK_F_MQ;
|
|
}
|
|
if (!opts->writable) {
|
|
features |= 1ULL << VIRTIO_BLK_F_RO;
|
|
}
|
|
|
|
vblk_exp->dev = vduse_dev_create(vblk_opts->name, VIRTIO_ID_BLOCK, 0,
|
|
features, num_queues,
|
|
sizeof(struct virtio_blk_config),
|
|
(char *)&config, &vduse_blk_ops,
|
|
vblk_exp);
|
|
if (!vblk_exp->dev) {
|
|
error_setg(errp, "failed to create vduse device");
|
|
ret = -ENOMEM;
|
|
goto err_dev;
|
|
}
|
|
|
|
vblk_exp->recon_file = g_strdup_printf("%s/vduse-blk-%s",
|
|
g_get_tmp_dir(), vblk_opts->name);
|
|
if (vduse_set_reconnect_log_file(vblk_exp->dev, vblk_exp->recon_file)) {
|
|
error_setg(errp, "failed to set reconnect log file");
|
|
ret = -EINVAL;
|
|
goto err;
|
|
}
|
|
|
|
for (i = 0; i < num_queues; i++) {
|
|
vduse_dev_setup_queue(vblk_exp->dev, i, queue_size);
|
|
}
|
|
|
|
aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev),
|
|
on_vduse_dev_kick, NULL, NULL, NULL, vblk_exp->dev);
|
|
|
|
blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
|
|
vblk_exp);
|
|
blk_set_dev_ops(exp->blk, &vduse_block_ops, exp);
|
|
|
|
/*
|
|
* We handle draining ourselves using an in-flight counter and by disabling
|
|
* virtqueue fd handlers. Do not queue BlockBackend requests, they need to
|
|
* complete so the in-flight counter reaches zero.
|
|
*/
|
|
blk_set_disable_request_queuing(exp->blk, true);
|
|
|
|
return 0;
|
|
err:
|
|
vduse_dev_destroy(vblk_exp->dev);
|
|
g_free(vblk_exp->recon_file);
|
|
err_dev:
|
|
g_free(vblk_exp->handler.serial);
|
|
return ret;
|
|
}
|
|
|
|
/*
 * BlockExportDriver.delete: tear the export down.
 *
 * No request may be in flight any more.  The device fd handler and the
 * AioContext notifier are removed, the VDUSE device is destroyed and the
 * strings owned by the export are freed.  The reconnect log file is removed
 * from the filesystem unless destroying the device failed with -EBUSY.
 */
static void vduse_blk_exp_delete(BlockExport *exp)
{
    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
    int destroy_rc;

    assert(qatomic_read(&vblk_exp->inflight) == 0);

    vduse_blk_detach_ctx(vblk_exp);
    blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
                                    vblk_exp);

    destroy_rc = vduse_dev_destroy(vblk_exp->dev);
    if (destroy_rc != -EBUSY) {
        unlink(vblk_exp->recon_file);
    }

    g_free(vblk_exp->recon_file);
    g_free(vblk_exp->handler.serial);
}
|
|
|
|
/* Called with exp->ctx acquired */
|
|
static void vduse_blk_exp_request_shutdown(BlockExport *exp)
|
|
{
|
|
VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
|
|
|
|
vduse_blk_stop_virtqueues(vblk_exp);
|
|
}
|
|
|
|
const BlockExportDriver blk_exp_vduse_blk = {
|
|
.type = BLOCK_EXPORT_TYPE_VDUSE_BLK,
|
|
.instance_size = sizeof(VduseBlkExport),
|
|
.create = vduse_blk_exp_create,
|
|
.delete = vduse_blk_exp_delete,
|
|
.request_shutdown = vduse_blk_exp_request_shutdown,
|
|
};
|