diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index ba45c46cc0dacf558308833bafa95441c6437d82..ef8e2b1830795a6984509a48ead4bb6d4814ddd1 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -144,6 +144,43 @@ managing and controlling ublk devices with help of several control commands: For retrieving device info via ``ublksrv_ctrl_dev_info``. It is the server's responsibility to save IO target specific info in userspace. +- ``UBLK_CMD_GET_DEV_INFO2`` + Same purpose with ``UBLK_CMD_GET_DEV_INFO``, but ublk server has to + provide path of the char device of ``/dev/ublkc*`` for kernel to run + permission check, and this command is added for supporting unprivileged + ublk device, and introduced with ``UBLK_F_UNPRIVILEGED_DEV`` together. + Only the user owning the requested device can retrieve the device info. + + How to deal with userspace/kernel compatibility: + + 1) if kernel is capable of handling ``UBLK_F_UNPRIVILEGED_DEV`` + + If ublk server supports ``UBLK_F_UNPRIVILEGED_DEV``: + + ublk server should send ``UBLK_CMD_GET_DEV_INFO2``, given anytime + unprivileged application needs to query devices the current user owns, + when the application has no idea if ``UBLK_F_UNPRIVILEGED_DEV`` is set + given the capability info is stateless, and application should always + retrieve it via ``UBLK_CMD_GET_DEV_INFO2`` + + If ublk server doesn't support ``UBLK_F_UNPRIVILEGED_DEV``: + + ``UBLK_CMD_GET_DEV_INFO`` is always sent to kernel, and the feature of + UBLK_F_UNPRIVILEGED_DEV isn't available for user + + 2) if kernel isn't capable of handling ``UBLK_F_UNPRIVILEGED_DEV`` + + If ublk server supports ``UBLK_F_UNPRIVILEGED_DEV``: + + ``UBLK_CMD_GET_DEV_INFO2`` is tried first, and will be failed, then + ``UBLK_CMD_GET_DEV_INFO`` needs to be retried given + ``UBLK_F_UNPRIVILEGED_DEV`` can't be set + + If ublk server doesn't support ``UBLK_F_UNPRIVILEGED_DEV``: + + ``UBLK_CMD_GET_DEV_INFO`` is always sent to kernel, and the feature of + ``UBLK_F_UNPRIVILEGED_DEV`` isn't available for user + - ``UBLK_CMD_START_USER_RECOVERY`` This command is valid if ``UBLK_F_USER_RECOVERY`` feature is enabled. This @@ -180,6 +217,15 @@ managing and controlling ublk devices with help of several control commands: double-write since the driver may issue the same I/O request twice. It might be useful to a read-only FS or a VM backend. +Unprivileged ublk device is supported by passing ``UBLK_F_UNPRIVILEGED_DEV``. +Once the flag is set, all control commands can be sent by unprivileged +user. Except for command of ``UBLK_CMD_ADD_DEV``, permission check on +the specified char device(``/dev/ublkc*``) is done for all other control +commands by ublk driver, for doing that, path of the char device has to +be provided in these commands' payload from ublk server. With this way, +ublk device becomes container-ware, and device created in one container +can be controlled/accessed just inside this container. + Data plane ---------- @@ -254,15 +300,6 @@ with specified IO tag in the command data: Future development ================== -Container-aware ublk deivice ----------------------------- - -ublk driver doesn't handle any IO logic. Its function is well defined -for now and very limited userspace interfaces are needed, which is also -well defined too. It is possible to make ublk devices container-aware block -devices in future as Stefan Hajnoczi suggested [#stefan]_, by removing -ADMIN privilege. - Zero copy --------- @@ -284,6 +321,4 @@ References .. [#userspace_readme] https://github.com/ming1/ubdsrv/blob/master/README -.. [#stefan] https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/ - .. [#xiaoguang] https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/ diff --git a/anolis/configs/L1-RECOMMEND/default/CONFIG_BLKDEV_UBLK_LEGACY_OPCODES b/anolis/configs/L1-RECOMMEND/default/CONFIG_BLKDEV_UBLK_LEGACY_OPCODES new file mode 100644 index 0000000000000000000000000000000000000000..5abaa4d2046e44ef0958b586e3dca7739561167c --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/default/CONFIG_BLKDEV_UBLK_LEGACY_OPCODES @@ -0,0 +1 @@ +CONFIG_BLKDEV_UBLK_LEGACY_OPCODES=y diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 7ad988d0626e0836dad5aefce6503c49d3d798e8..9195938f67ae20726ed9b3b97160fca49d9720e5 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -488,6 +488,29 @@ config BLK_DEV_UBLK definition isn't finalized yet, and might change according to future requirement, so mark is as experimental now. + Say Y if you want to get better performance because task_work_add() + can be used in IO path for replacing io_uring cmd, which will become + shared between IO tasks and ubq daemon, meantime task_work_add() can + can handle batch more effectively, but task_work_add() isn't exported + for module, so ublk has to be built to kernel. + +config BLKDEV_UBLK_LEGACY_OPCODES + bool "Support legacy command opcode" + depends on BLK_DEV_UBLK + default y + help + ublk driver started to take plain command encoding, which turns out + one bad way. The traditional ioctl command opcode encodes more + info and basically defines each code uniquely, so opcode conflict + is avoided, and driver can handle wrong command easily, meantime it + may help security subsystem to audit io_uring command. + + Say Y if your application still uses legacy command opcode. + + Say N if you don't want to support legacy command opcode. It is + suggested to enable N if your application(ublk server) switches to + ioctl command encoding. + source "drivers/block/rnbd/Kconfig" endif # BLK_DEV diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 3fc38937f758f15b502bfdc84dd45185f1a86523..91771c096ebe3a5c6f541c7b294676ab1a088775 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include #define UBLK_MINORS (1U << MINORBITS) @@ -50,13 +52,25 @@ | UBLK_F_URING_CMD_COMP_IN_TASK \ | UBLK_F_NEED_GET_DATA \ | UBLK_F_USER_RECOVERY \ - | UBLK_F_USER_RECOVERY_REISSUE) + | UBLK_F_USER_RECOVERY_REISSUE \ + | UBLK_F_UNPRIVILEGED_DEV \ + | UBLK_F_CMD_IOCTL_ENCODE \ + | UBLK_F_USER_COPY \ + | UBLK_F_ZONED) /* All UBLK_PARAM_TYPE_* should be included here */ -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) +#define UBLK_PARAM_TYPE_ALL \ + (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \ + UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED) + +struct ublk_rq_data { + struct llist_node node; + + struct kref ref; +}; struct ublk_uring_cmd_pdu { - struct request *req; + struct ublk_queue *ubq; }; /* @@ -97,6 +111,9 @@ struct ublk_uring_cmd_pdu { */ #define UBLK_IO_FLAG_NEED_GET_DATA 0x08 +/* atomic RW with ubq->cancel_lock */ +#define UBLK_IO_FLAG_CANCELED 0x80000000 + struct ublk_io { /* userspace buffer address from io cmd */ __u64 addr; @@ -114,10 +131,14 @@ struct ublk_queue { struct task_struct *ubq_daemon; char *io_cmd_buf; + struct llist_head io_cmds; + unsigned long io_addr; /* mapped vm address */ unsigned int max_io_sz; bool force_abort; + bool timeout; unsigned short nr_io_ready; /* how many ios setup */ + spinlock_t cancel_lock; struct ublk_device *dev; struct ublk_io ios[]; }; @@ -140,6 +161,7 @@ struct ublk_device { #define UB_STATE_OPEN 0 #define UB_STATE_USED 1 +#define UB_STATE_DELETED 2 unsigned long state; int ub_number; @@ -152,7 +174,7 @@ struct ublk_device { struct completion completion; unsigned int nr_queues_ready; - atomic_t nr_aborted_queues; + unsigned int nr_privileged_daemon; /* * Our ubq->daemon may be killed without any notification, so @@ -169,8 +191,307 @@ struct ublk_params_header { __u32 types; }; +static inline unsigned int ublk_req_build_flags(struct request *req); +static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, + int tag); + +static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_USER_COPY; +} + +static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_ZONED; +} + +static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_ZONED; +} + +#ifdef CONFIG_BLK_DEV_ZONED + +struct ublk_zoned_report_desc { + __u64 sector; + __u32 operation; + __u32 nr_zones; +}; + +static DEFINE_XARRAY(ublk_zoned_report_descs); + +static int ublk_zoned_insert_report_desc(const struct request *req, + struct ublk_zoned_report_desc *desc) +{ + return xa_insert(&ublk_zoned_report_descs, (unsigned long)req, + desc, GFP_KERNEL); +} + +static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc( + const struct request *req) +{ + return xa_erase(&ublk_zoned_report_descs, (unsigned long)req); +} + +static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc( + const struct request *req) +{ + return xa_load(&ublk_zoned_report_descs, (unsigned long)req); +} + +static int ublk_get_nr_zones(const struct ublk_device *ub) +{ + const struct ublk_param_basic *p = &ub->params.basic; + + /* Zone size is a power of 2 */ + return p->dev_sectors >> ilog2(p->chunk_sectors); +} + +static int ublk_revalidate_disk_zones(struct ublk_device *ub) +{ + return blk_revalidate_disk_zones(ub->ub_disk, NULL); +} + +static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) +{ + const struct ublk_param_zoned *p = &ub->params.zoned; + int nr_zones; + + if (!ublk_dev_is_zoned(ub)) + return -EINVAL; + + if (!p->max_zone_append_sectors) + return -EINVAL; + + nr_zones = ublk_get_nr_zones(ub); + + if (p->max_active_zones > nr_zones) + return -EINVAL; + + if (p->max_open_zones > nr_zones) + return -EINVAL; + + return 0; +} + +static int ublk_dev_param_zoned_apply(struct ublk_device *ub) +{ + const struct ublk_param_zoned *p = &ub->params.zoned; + + blk_queue_set_zoned(ub->ub_disk, BLK_ZONED_HM); + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue); + blk_queue_required_elevator_features(ub->ub_disk->queue, + ELEVATOR_F_ZBD_SEQ_WRITE); + blk_queue_max_active_zones(ub->ub_disk->queue, p->max_active_zones); + blk_queue_max_open_zones(ub->ub_disk->queue, p->max_open_zones); + blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors); + + ub->ub_disk->queue->nr_zones = ublk_get_nr_zones(ub); + + return 0; +} + +/* Based on virtblk_alloc_report_buffer */ +static void *ublk_alloc_report_buffer(struct ublk_device *ublk, + unsigned int nr_zones, size_t *buflen) +{ + struct request_queue *q = ublk->ub_disk->queue; + size_t bufsize; + void *buf; + + nr_zones = min_t(unsigned int, nr_zones, + ublk->ub_disk->queue->nr_zones); + + bufsize = nr_zones * sizeof(struct blk_zone); + bufsize = + min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT); + + while (bufsize >= sizeof(struct blk_zone)) { + buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY); + if (buf) { + *buflen = bufsize; + return buf; + } + bufsize >>= 1; + } + + *buflen = 0; + return NULL; +} + +static int ublk_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct ublk_device *ub = disk->private_data; + unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors; + unsigned int first_zone = sector >> ilog2(zone_size_sectors); + unsigned int done_zones = 0; + unsigned int max_zones_per_request; + int ret; + struct blk_zone *buffer; + size_t buffer_length; + + nr_zones = min_t(unsigned int, ub->ub_disk->queue->nr_zones - first_zone, + nr_zones); + + buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length); + if (!buffer) + return -ENOMEM; + + max_zones_per_request = buffer_length / sizeof(struct blk_zone); + + while (done_zones < nr_zones) { + unsigned int remaining_zones = nr_zones - done_zones; + unsigned int zones_in_request = + min_t(unsigned int, remaining_zones, max_zones_per_request); + struct request *req; + struct ublk_zoned_report_desc desc; + blk_status_t status; + unsigned int i; + + memset(buffer, 0, buffer_length); + + req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + desc.operation = UBLK_IO_OP_REPORT_ZONES; + desc.sector = sector; + desc.nr_zones = zones_in_request; + ret = ublk_zoned_insert_report_desc(req, &desc); + if (ret) + goto free_req; + + ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length, + GFP_KERNEL); + if (ret) + goto erase_desc; + + status = blk_execute_rq(disk, req, 0); + ret = blk_status_to_errno(status); +erase_desc: + ublk_zoned_erase_report_desc(req); +free_req: + blk_mq_free_request(req); + if (ret) + goto out; + + for (i = 0; i < zones_in_request; i++) { + struct blk_zone *zone = buffer + i; + + /* A zero length zone means no more zones in this response */ + if (!zone->len) + break; + + ret = cb(zone, i, data); + if (ret) + goto out; + + done_zones++; + sector += zone_size_sectors; + + } + } + + ret = done_zones; + +out: + kvfree(buffer); + return ret; +} + +static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, + struct request *req) +{ + struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); + struct ublk_io *io = &ubq->ios[req->tag]; + struct ublk_zoned_report_desc *desc; + u32 ublk_op; + + switch (req_op(req)) { + case REQ_OP_ZONE_OPEN: + ublk_op = UBLK_IO_OP_ZONE_OPEN; + break; + case REQ_OP_ZONE_CLOSE: + ublk_op = UBLK_IO_OP_ZONE_CLOSE; + break; + case REQ_OP_ZONE_FINISH: + ublk_op = UBLK_IO_OP_ZONE_FINISH; + break; + case REQ_OP_ZONE_RESET: + ublk_op = UBLK_IO_OP_ZONE_RESET; + break; + case REQ_OP_ZONE_APPEND: + ublk_op = UBLK_IO_OP_ZONE_APPEND; + break; + case REQ_OP_ZONE_RESET_ALL: + ublk_op = UBLK_IO_OP_ZONE_RESET_ALL; + break; + case REQ_OP_DRV_IN: + desc = ublk_zoned_get_report_desc(req); + if (!desc) + return BLK_STS_IOERR; + ublk_op = desc->operation; + switch (ublk_op) { + case UBLK_IO_OP_REPORT_ZONES: + iod->op_flags = ublk_op | ublk_req_build_flags(req); + iod->nr_zones = desc->nr_zones; + iod->start_sector = desc->sector; + return BLK_STS_OK; + default: + return BLK_STS_IOERR; + } + case REQ_OP_DRV_OUT: + /* We do not support drv_out */ + return BLK_STS_NOTSUPP; + default: + return BLK_STS_IOERR; + } + + iod->op_flags = ublk_op | ublk_req_build_flags(req); + iod->nr_sectors = blk_rq_sectors(req); + iod->start_sector = blk_rq_pos(req); + iod->addr = io->addr; + + return BLK_STS_OK; +} + +#else + +#define ublk_report_zones (NULL) + +static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) +{ + return -EOPNOTSUPP; +} + +static int ublk_dev_param_zoned_apply(struct ublk_device *ub) +{ + return -EOPNOTSUPP; +} + +static int ublk_revalidate_disk_zones(struct ublk_device *ub) +{ + return 0; +} + +static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, + struct request *req) +{ + return BLK_STS_NOTSUPP; +} + +#endif + +static inline void __ublk_complete_rq(struct request *req); +static void ublk_complete_rq(struct kref *ref); + static dev_t ublk_chr_devt; -static struct class *ublk_chr_class; +static struct class ublk_chr_class = { + .name = "ublk-char", +}; static DEFINE_IDR(ublk_index_idr); static DEFINE_SPINLOCK(ublk_idr_lock); @@ -178,8 +499,34 @@ static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */ static DEFINE_MUTEX(ublk_ctl_mutex); +/* + * Max ublk devices allowed to add + * + * It can be extended to one per-user limit in future or even controlled + * by cgroup. + */ +static unsigned int ublks_max = 64; +static unsigned int ublks_added; /* protected by ublk_ctl_mutex */ + static struct miscdevice ublk_misc; +static inline unsigned ublk_pos_to_hwq(loff_t pos) +{ + return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) & + UBLK_QID_BITS_MASK; +} + +static inline unsigned ublk_pos_to_buf_off(loff_t pos) +{ + return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK; +} + +static inline unsigned ublk_pos_to_tag(loff_t pos) +{ + return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) & + UBLK_TAG_BITS_MASK; +} + static void ublk_dev_param_basic_apply(struct ublk_device *ub) { struct request_queue *q = ub->ub_disk->queue; @@ -226,7 +573,7 @@ static int ublk_validate_params(const struct ublk_device *ub) if (ub->params.types & UBLK_PARAM_TYPE_BASIC) { const struct ublk_param_basic *p = &ub->params.basic; - if (p->logical_bs_shift > PAGE_SHIFT) + if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9) return -EINVAL; if (p->logical_bs_shift > p->physical_bs_shift) @@ -234,6 +581,9 @@ static int ublk_validate_params(const struct ublk_device *ub) if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9)) return -EINVAL; + + if (ublk_dev_is_zoned(ub) && !p->chunk_sectors) + return -EINVAL; } else return -EINVAL; @@ -248,6 +598,15 @@ static int ublk_validate_params(const struct ublk_device *ub) return -EINVAL; } + /* dev_t is read-only */ + if (ub->params.types & UBLK_PARAM_TYPE_DEVT) + return -EINVAL; + + if (ub->params.types & UBLK_PARAM_TYPE_ZONED) + return ublk_dev_param_zoned_validate(ub); + else if (ublk_dev_is_zoned(ub)) + return -EINVAL; + return 0; } @@ -261,9 +620,60 @@ static int ublk_apply_params(struct ublk_device *ub) if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) ublk_dev_param_discard_apply(ub); + if (ub->params.types & UBLK_PARAM_TYPE_ZONED) + return ublk_dev_param_zoned_apply(ub); + return 0; } +static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_USER_COPY; +} + +static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) +{ + /* + * read()/write() is involved in user copy, so request reference + * has to be grabbed + */ + return ublk_support_user_copy(ubq); +} + +static inline void ublk_init_req_ref(const struct ublk_queue *ubq, + struct request *req) +{ + if (ublk_need_req_ref(ubq)) { + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + + kref_init(&data->ref); + } +} + +static inline bool ublk_get_req_ref(const struct ublk_queue *ubq, + struct request *req) +{ + if (ublk_need_req_ref(ubq)) { + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + + return kref_get_unless_zero(&data->ref); + } + + return true; +} + +static inline void ublk_put_req_ref(const struct ublk_queue *ubq, + struct request *req) +{ + if (ublk_need_req_ref(ubq)) { + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + + kref_put(&data->ref, ublk_complete_rq); + } else { + __ublk_complete_rq(req); + } +} + static inline bool ublk_need_get_data(const struct ublk_queue *ubq) { return ubq->flags & UBLK_F_NEED_GET_DATA; @@ -304,21 +714,28 @@ static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id) return ublk_get_queue(ub, q_id)->io_cmd_buf; } +static inline int __ublk_queue_cmd_buf_size(int depth) +{ + return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE); +} + static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id) { struct ublk_queue *ubq = ublk_get_queue(ub, q_id); - return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc), - PAGE_SIZE); + return __ublk_queue_cmd_buf_size(ubq->q_depth); +} + +static int ublk_max_cmd_buf_size(void) +{ + return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH); } static inline bool ublk_queue_can_use_recovery_reissue( struct ublk_queue *ubq) { - if ((ubq->flags & UBLK_F_USER_RECOVERY) && - (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE)) - return true; - return false; + return (ubq->flags & UBLK_F_USER_RECOVERY) && + (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE); } static inline bool ublk_queue_can_use_recovery( @@ -340,57 +757,89 @@ static void ublk_free_disk(struct gendisk *disk) put_device(&ub->cdev_dev); } +static void ublk_store_owner_uid_gid(unsigned int *owner_uid, + unsigned int *owner_gid) +{ + kuid_t uid; + kgid_t gid; + + current_uid_gid(&uid, &gid); + + *owner_uid = from_kuid(&init_user_ns, uid); + *owner_gid = from_kgid(&init_user_ns, gid); +} + +static int ublk_open(struct block_device *bdev, fmode_t mode) +{ + struct ublk_device *ub = bdev->bd_disk->private_data; + + if (capable(CAP_SYS_ADMIN)) + return 0; + + /* + * If it is one unprivileged device, only owner can open + * the disk. Otherwise it could be one trap made by one + * evil user who grants this disk's privileges to other + * users deliberately. + * + * This way is reasonable too given anyone can create + * unprivileged device, and no need other's grant. + */ + if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) { + unsigned int curr_uid, curr_gid; + + ublk_store_owner_uid_gid(&curr_uid, &curr_gid); + + if (curr_uid != ub->dev_info.owner_uid || curr_gid != + ub->dev_info.owner_gid) + return -EPERM; + } + + return 0; +} + static const struct block_device_operations ub_fops = { .owner = THIS_MODULE, - .free_disk = ublk_free_disk, + .open = ublk_open, + .free_disk = ublk_free_disk, + .report_zones = ublk_report_zones, }; #define UBLK_MAX_PIN_PAGES 32 -struct ublk_map_data { - const struct ublk_queue *ubq; - const struct request *rq; - const struct ublk_io *io; - unsigned max_bytes; -}; - struct ublk_io_iter { struct page *pages[UBLK_MAX_PIN_PAGES]; - unsigned pg_off; /* offset in the 1st page in pages */ - int nr_pages; /* how many page pointers in pages */ struct bio *bio; struct bvec_iter iter; }; -static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data, - unsigned max_bytes, bool to_vm) +/* return how many pages are copied */ +static void ublk_copy_io_pages(struct ublk_io_iter *data, + size_t total, size_t pg_off, int dir) { - const unsigned total = min_t(unsigned, max_bytes, - PAGE_SIZE - data->pg_off + - ((data->nr_pages - 1) << PAGE_SHIFT)); unsigned done = 0; unsigned pg_idx = 0; while (done < total) { struct bio_vec bv = bio_iter_iovec(data->bio, data->iter); - const unsigned int bytes = min3(bv.bv_len, total - done, - (unsigned)(PAGE_SIZE - data->pg_off)); + unsigned int bytes = min3(bv.bv_len, (unsigned)total - done, + (unsigned)(PAGE_SIZE - pg_off)); void *bv_buf = kmap(bv.bv_page) + bv.bv_offset; void *pg_buf = kmap(data->pages[pg_idx]); - if (to_vm) - memcpy(pg_buf + data->pg_off, bv_buf, bytes); + if (dir == READ) + memcpy(pg_buf + pg_off, bv_buf, bytes); else - memcpy(bv_buf, pg_buf + data->pg_off, bytes); + memcpy(bv_buf, pg_buf + pg_off, bytes); kunmap(pg_buf); kunmap(bv_buf); /* advance page array */ - data->pg_off += bytes; - if (data->pg_off == PAGE_SIZE) { + pg_off += bytes; + if (pg_off == PAGE_SIZE) { pg_idx += 1; - data->pg_off = 0; + pg_off = 0; } done += bytes; @@ -404,70 +853,98 @@ static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data, data->iter = data->bio->bi_iter; } } +} - return done; +static bool ublk_advance_io_iter(const struct request *req, + struct ublk_io_iter *iter, unsigned int offset) +{ + struct bio *bio = req->bio; + + for_each_bio(bio) { + if (bio->bi_iter.bi_size > offset) { + iter->bio = bio; + iter->iter = bio->bi_iter; + bio_advance_iter(iter->bio, &iter->iter, offset); + return true; + } + offset -= bio->bi_iter.bi_size; + } + return false; } -static inline int ublk_copy_user_pages(struct ublk_map_data *data, - bool to_vm) -{ - const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0; - const unsigned long start_vm = data->io->addr; - unsigned int done = 0; - struct ublk_io_iter iter = { - .pg_off = start_vm & (PAGE_SIZE - 1), - .bio = data->rq->bio, - .iter = data->rq->bio->bi_iter, - }; - const unsigned int nr_pages = round_up(data->max_bytes + - (start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT; - - while (done < nr_pages) { - const unsigned to_pin = min_t(unsigned, UBLK_MAX_PIN_PAGES, - nr_pages - done); - unsigned i, len; - - iter.nr_pages = get_user_pages_fast(start_vm + - (done << PAGE_SHIFT), to_pin, gup_flags, - iter.pages); - if (iter.nr_pages <= 0) - return done == 0 ? iter.nr_pages : done; - len = ublk_copy_io_pages(&iter, data->max_bytes, to_vm); - for (i = 0; i < iter.nr_pages; i++) { - if (to_vm) +/* + * Copy data between request pages and io_iter, and 'offset' + * is the start point of linear offset of request. + */ +static size_t ublk_copy_user_pages(const struct request *req, + unsigned offset, struct iov_iter *uiter, int dir) +{ + struct ublk_io_iter iter; + size_t done = 0; + + if (!ublk_advance_io_iter(req, &iter, offset)) + return 0; + + while (iov_iter_count(uiter) && iter.bio) { + unsigned nr_pages; + ssize_t len; + size_t off; + int i; + + len = iov_iter_get_pages(uiter, iter.pages, + iov_iter_count(uiter), + UBLK_MAX_PIN_PAGES, &off); + if (len >= 0) + iov_iter_advance(uiter, len); + if (len <= 0) + return done; + + ublk_copy_io_pages(&iter, len, off, dir); + nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE); + for (i = 0; i < nr_pages; i++) { + if (dir == READ) set_page_dirty(iter.pages[i]); put_page(iter.pages[i]); } - data->max_bytes -= len; - done += iter.nr_pages; + done += len; } return done; } +static inline bool ublk_need_map_req(const struct request *req) +{ + return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE; +} + +static inline bool ublk_need_unmap_req(const struct request *req) +{ + return ublk_rq_has_data(req) && + (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN); +} + static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, struct ublk_io *io) { const unsigned int rq_bytes = blk_rq_bytes(req); + + if (ublk_support_user_copy(ubq)) + return rq_bytes; + /* * no zero copy, we delay copy WRITE request data into ublksrv * context and the big benefit is that pinning pages in current * context is pretty fast, see ublk_pin_user_pages */ - if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH) - return rq_bytes; - - if (ublk_rq_has_data(req)) { - struct ublk_map_data data = { - .ubq = ubq, - .rq = req, - .io = io, - .max_bytes = rq_bytes, - }; + if (ublk_need_map_req(req)) { + struct iov_iter iter; + struct iovec iov; + const int dir = READ; - ublk_copy_user_pages(&data, true); + import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes, + &iov, &iter); - return rq_bytes - data.max_bytes; + return ublk_copy_user_pages(req, 0, &iter, dir); } return rq_bytes; } @@ -478,19 +955,19 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, { const unsigned int rq_bytes = blk_rq_bytes(req); - if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) { - struct ublk_map_data data = { - .ubq = ubq, - .rq = req, - .io = io, - .max_bytes = io->res, - }; + if (ublk_support_user_copy(ubq)) + return rq_bytes; - WARN_ON_ONCE(io->res > rq_bytes); + if (ublk_need_unmap_req(req)) { + struct iov_iter iter; + struct iovec iov; + const int dir = WRITE; - ublk_copy_user_pages(&data, false); + WARN_ON_ONCE(io->res > rq_bytes); - return io->res - data.max_bytes; + import_single_range(dir, u64_to_user_ptr(io->addr), io->res, + &iov, &iter); + return ublk_copy_user_pages(req, 0, &iter, dir); } return rq_bytes; } @@ -527,8 +1004,13 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) { struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); struct ublk_io *io = &ubq->ios[req->tag]; + enum req_opf op = req_op(req); u32 ublk_op; + if (!ublk_queue_is_zoned(ubq) && + (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND)) + return BLK_STS_IOERR; + switch (req_op(req)) { case REQ_OP_READ: ublk_op = UBLK_IO_OP_READ; @@ -546,6 +1028,8 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) ublk_op = UBLK_IO_OP_WRITE_ZEROES; break; default: + if (ublk_queue_is_zoned(ubq)) + return ublk_setup_iod_zoned(ubq, req); return BLK_STS_IOERR; } @@ -569,20 +1053,35 @@ static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq) return ubq->ubq_daemon->flags & PF_EXITING; } +static void ublk_end_request(struct request *req, blk_status_t error) +{ + local_bh_disable(); + blk_mq_end_request(req, error); + local_bh_enable(); +} + /* todo: handle partial completion */ -static void ublk_complete_rq(struct request *req) +static inline void __ublk_complete_rq(struct request *req) { struct ublk_queue *ubq = req->mq_hctx->driver_data; struct ublk_io *io = &ubq->ios[req->tag]; unsigned int unmapped_bytes; + blk_status_t res = BLK_STS_OK; + bool requeue; + + /* called from ublk_abort_queue() code path */ + if (io->flags & UBLK_IO_FLAG_ABORTED) { + res = BLK_STS_IOERR; + goto exit; + } /* failed read IO if nothing is read */ if (!io->res && req_op(req) == REQ_OP_READ) io->res = -EIO; if (io->res < 0) { - blk_mq_end_request(req, errno_to_blk_status(io->res)); - return; + res = errno_to_blk_status(io->res); + goto exit; } /* @@ -591,10 +1090,9 @@ static void ublk_complete_rq(struct request *req) * * Both the two needn't unmap. */ - if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) { - blk_mq_end_request(req, BLK_STS_OK); - return; - } + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE && + req_op(req) != REQ_OP_DRV_IN) + goto exit; /* for READ request, writing data in iod->addr to rq buffers */ unmapped_bytes = ublk_unmap_io(ubq, req, io); @@ -607,10 +1105,39 @@ static void ublk_complete_rq(struct request *req) if (unlikely(unmapped_bytes < io->res)) io->res = unmapped_bytes; - if (blk_update_request(req, BLK_STS_OK, io->res)) + /* + * Run bio->bi_end_io() with softirqs disabled. If the final fput + * happens off this path, then that will prevent ublk's blkdev_release() + * from being called on current's task work, see fput() implementation. + * + * Otherwise, ublk server may not provide forward progress in case of + * reading the partition table from bdev_open() with disk->open_mutex + * held, and causes dead lock as we could already be holding + * disk->open_mutex here. + * + * Preferably we would not be doing IO with a mutex held that is also + * used for release, but this work-around will suffice for now. + */ + local_bh_disable(); + requeue = blk_update_request(req, BLK_STS_OK, io->res); + local_bh_enable(); + if (requeue) blk_mq_requeue_request(req, true); else __blk_mq_end_request(req, BLK_STS_OK); + + return; +exit: + ublk_end_request(req, res); +} + +static void ublk_complete_rq(struct kref *ref) +{ + struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data, + ref); + struct request *req = blk_mq_rq_from_pdu(data); + + __ublk_complete_rq(req); } /* @@ -631,7 +1158,7 @@ static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, if (ublk_queue_can_use_recovery_reissue(ubq)) blk_mq_requeue_request(req, false); else - blk_mq_end_request(req, BLK_STS_IOERR); + ublk_put_req_ref(ubq, req); } } @@ -659,7 +1186,7 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, if (ublk_queue_can_use_recovery(ubq)) blk_mq_requeue_request(rq, false); else - blk_mq_end_request(rq, BLK_STS_IOERR); + ublk_end_request(rq, BLK_STS_IOERR); mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); } @@ -689,9 +1216,7 @@ static inline void __ublk_rq_task_work(struct request *req) return; } - if (ublk_need_get_data(ubq) && - (req_op(req) == REQ_OP_WRITE || - req_op(req) == REQ_OP_FLUSH)) { + if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) { /* * We have not handled UBLK_IO_NEED_GET_DATA command yet, * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv @@ -740,14 +1265,83 @@ static inline void __ublk_rq_task_work(struct request *req) mapped_bytes >> 9; } + ublk_init_req_ref(ubq, req); ubq_complete_io_cmd(io, UBLK_IO_RES_OK); } +static inline void ublk_forward_io_cmds(struct ublk_queue *ubq) +{ + struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); + struct ublk_rq_data *data, *tmp; + + io_cmds = llist_reverse_order(io_cmds); + llist_for_each_entry_safe(data, tmp, io_cmds, node) + __ublk_rq_task_work(blk_mq_rq_from_pdu(data)); +} + +static inline void ublk_abort_io_cmds(struct ublk_queue *ubq) +{ + struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); + struct ublk_rq_data *data, *tmp; + + llist_for_each_entry_safe(data, tmp, io_cmds, node) + __ublk_abort_rq(ubq, blk_mq_rq_from_pdu(data)); +} + static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) { struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_queue *ubq = pdu->ubq; + + ublk_forward_io_cmds(ubq); +} + +static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) +{ + struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); + struct ublk_io *io; + + if (!llist_add(&data->node, &ubq->io_cmds)) + return; + + io = &ubq->ios[rq->tag]; + /* + * If the check pass, we know that this is a re-issued request aborted + * previously in monitor_work because the ubq_daemon(cmd's task) is + * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore + * because this ioucmd's io_uring context may be freed now if no inflight + * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work. + * + * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing + * the tag). Then the request is re-started(allocating the tag) and we are here. + * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED + * guarantees that here is a re-issued request aborted previously. + */ + if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) { + ublk_abort_io_cmds(ubq); + } else { + struct io_uring_cmd *cmd = io->cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + + pdu->ubq = ubq; + io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); + } +} + +static enum blk_eh_timer_return ublk_timeout(struct request *rq, bool reserved) +{ + struct ublk_queue *ubq = rq->mq_hctx->driver_data; + + if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) { + if (!ubq->timeout) { + send_sig(SIGKILL, ubq->ubq_daemon, 0); + ubq->timeout = true; + } + + return BLK_EH_DONE; + } - __ublk_rq_task_work(pdu->req); + return BLK_EH_RESET_TIMER; } static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, @@ -755,15 +1349,13 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, { struct ublk_queue *ubq = hctx->driver_data; struct request *rq = bd->rq; - struct ublk_io *io = &ubq->ios[rq->tag]; - struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); blk_status_t res; /* fill iod to slot in io cmd buffer */ res = ublk_setup_iod(ubq, rq); if (unlikely(res != BLK_STS_OK)) return BLK_STS_IOERR; + /* With recovery feature enabled, force_abort is set in * ublk_stop_dev() before calling del_gendisk(). We have to * abort all requeued and new rqs here to let del_gendisk() @@ -779,32 +1371,15 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(bd->rq); if (unlikely(ubq_daemon_is_dying(ubq))) { - fail: __ublk_abort_rq(ubq, rq); return BLK_STS_OK; } - /* - * If the check pass, we know that this is a re-issued request aborted - * previously in monitor_work because the ubq_daemon(cmd's task) is - * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore - * because this ioucmd's io_uring context may be freed now if no inflight - * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work. - * - * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing - * the tag). Then the request is re-started(allocating the tag) and we are here. - * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED - * guarantees that here is a re-issued request aborted previously. - */ - if ((io->flags & UBLK_IO_FLAG_ABORTED)) - goto fail; - pdu->req = rq; - io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); + ublk_queue_cmd(ubq, rq); return BLK_STS_OK; } - static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, unsigned int hctx_idx) { @@ -818,6 +1393,7 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, static const struct blk_mq_ops ublk_mq_ops = { .queue_rq = ublk_queue_rq, .init_hctx = ublk_init_hctx, + .timeout = ublk_timeout, }; static int ublk_ch_open(struct inode *inode, struct file *filp) @@ -844,7 +1420,7 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) { struct ublk_device *ub = filp->private_data; size_t sz = vma->vm_end - vma->vm_start; - unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc); + unsigned max_sz = ublk_max_cmd_buf_size(); unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT; int q_id, ret = 0; @@ -891,9 +1467,14 @@ static void ublk_commit_completion(struct ublk_device *ub, /* find the io request and complete */ req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); + if (WARN_ON_ONCE(unlikely(!req))) + return; - if (req && likely(!blk_should_fake_timeout(req->q))) - ublk_complete_rq(req); + if (req_op(req) == REQ_OP_ZONE_APPEND) + req->__sector = ub_cmd->zone_append_lba; + + if (likely(!blk_should_fake_timeout(req->q))) + ublk_put_req_ref(ubq, req); } /* @@ -967,18 +1548,23 @@ static void ublk_cancel_queue(struct ublk_queue *ubq) { int i; - if (!ublk_queue_ready(ubq)) - return; - for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; - if (io->flags & UBLK_IO_FLAG_ACTIVE) - io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0); - } + if (io->flags & UBLK_IO_FLAG_ACTIVE) { + bool done; - /* all io commands are canceled */ - ubq->nr_io_ready = 0; + spin_lock(&ubq->cancel_lock); + done = !!(io->flags & UBLK_IO_FLAG_CANCELED); + if (!done) + io->flags |= UBLK_IO_FLAG_CANCELED; + spin_unlock(&ubq->cancel_lock); + + if (!done) + io_uring_cmd_done(io->cmd, + UBLK_IO_RES_ABORT, 0); + } + } } /* Cancel all pending commands, must be called after del_gendisk() returns */ @@ -1026,7 +1612,6 @@ static void __ublk_quiesce_dev(struct ublk_device *ub) blk_mq_quiesce_queue(ub->ub_disk->queue); ublk_wait_tagset_rqs_idle(ub); ub->dev_info.state = UBLK_S_DEV_QUIESCED; - ublk_cancel_dev(ub); /* we are going to release task_struct of ubq_daemon and resets * ->ubq_daemon to NULL. So in monitor_work, check on ubq_daemon causes UAF. * Besides, monitor_work is not necessary in QUIESCED state since we have @@ -1049,6 +1634,7 @@ static void ublk_quiesce_work_fn(struct work_struct *work) __ublk_quiesce_dev(ub); unlock: mutex_unlock(&ub->mutex); + ublk_cancel_dev(ub); } static void ublk_unquiesce_dev(struct ublk_device *ub) @@ -1089,8 +1675,8 @@ static void ublk_stop_dev(struct ublk_device *ub) put_disk(ub->ub_disk); ub->ub_disk = NULL; unlock: - ublk_cancel_dev(ub); mutex_unlock(&ub->mutex); + ublk_cancel_dev(ub); cancel_delayed_work_sync(&ub->monitor_work); } @@ -1103,6 +1689,9 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) ubq->ubq_daemon = current; get_task_struct(ubq->ubq_daemon); ub->nr_queues_ready++; + + if (capable(CAP_SYS_ADMIN)) + ub->nr_privileged_daemon++; } if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) complete_all(&ub->completion); @@ -1110,18 +1699,39 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) } static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id, - int tag, struct io_uring_cmd *cmd) + int tag) { + struct ublk_queue *ubq = ublk_get_queue(ub, q_id); struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag); - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - pdu->req = req; - io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); + ublk_queue_cmd(ubq, req); } -static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +static inline int ublk_check_cmd_op(u32 cmd_op) +{ + u32 ioc_type = _IOC_TYPE(cmd_op); + + if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u') + return -EOPNOTSUPP; + + if (ioc_type != 'u' && ioc_type != 0) + return -EOPNOTSUPP; + + return 0; +} + +static inline void ublk_fill_io_cmd(struct ublk_io *io, + struct io_uring_cmd *cmd, unsigned long buf_addr) +{ + io->cmd = cmd; + io->flags |= UBLK_IO_FLAG_ACTIVE; + io->addr = buf_addr; +} + +static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags, + struct ublksrv_io_cmd *ub_cmd) { - struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd; struct ublk_device *ub = cmd->file->private_data; struct ublk_queue *ubq; struct ublk_io *io; @@ -1134,9 +1744,6 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) __func__, cmd->cmd_op, ub_cmd->q_id, tag, ub_cmd->result); - if (!(issue_flags & IO_URING_F_SQE128)) - goto out; - if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues) goto out; @@ -1150,77 +1757,248 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) if (tag >= ubq->q_depth) goto out; - io = &ubq->ios[tag]; + io = &ubq->ios[tag]; + + /* there is pending io cmd, something must be wrong */ + if (io->flags & UBLK_IO_FLAG_ACTIVE) { + ret = -EBUSY; + goto out; + } + + /* + * ensure that the user issues UBLK_IO_NEED_GET_DATA + * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA. + */ + if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) + ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) + goto out; + + ret = ublk_check_cmd_op(cmd_op); + if (ret) + goto out; + + ret = -EINVAL; + switch (_IOC_NR(cmd_op)) { + case UBLK_IO_FETCH_REQ: + /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ + if (ublk_queue_ready(ubq)) { + ret = -EBUSY; + goto out; + } + /* + * The io is being handled by server, so COMMIT_RQ is expected + * instead of FETCH_REQ + */ + if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) + goto out; + + if (!ublk_support_user_copy(ubq)) { + /* + * FETCH_RQ has to provide IO buffer if NEED GET + * DATA is not enabled + */ + if (!ub_cmd->addr && !ublk_need_get_data(ubq)) + goto out; + } else if (ub_cmd->addr) { + /* User copy requires addr to be unset */ + ret = -EINVAL; + goto out; + } + + ublk_fill_io_cmd(io, cmd, ub_cmd->addr); + ublk_mark_io_ready(ub, ubq); + break; + case UBLK_IO_COMMIT_AND_FETCH_REQ: + req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag); + + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + goto out; + + if (!ublk_support_user_copy(ubq)) { + /* + * COMMIT_AND_FETCH_REQ has to provide IO buffer if + * NEED GET DATA is not enabled or it is Read IO. + */ + if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || + req_op(req) == REQ_OP_READ)) + goto out; + } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) { + /* + * User copy requires addr to be unset when command is + * not zone append + */ + ret = -EINVAL; + goto out; + } + + ublk_fill_io_cmd(io, cmd, ub_cmd->addr); + ublk_commit_completion(ub, ub_cmd); + break; + case UBLK_IO_NEED_GET_DATA: + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + goto out; + ublk_fill_io_cmd(io, cmd, ub_cmd->addr); + ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag); + break; + default: + goto out; + } + return -EIOCBQUEUED; + + out: + io_uring_cmd_done(cmd, ret, 0); + pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n", + __func__, cmd_op, tag, ret, io->flags); + return -EIOCBQUEUED; +} + +static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, + struct ublk_queue *ubq, int tag, size_t offset) +{ + struct request *req; + + if (!ublk_need_req_ref(ubq)) + return NULL; + + req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); + if (!req) + return NULL; + + if (!ublk_get_req_ref(ubq, req)) + return NULL; + + if (unlikely(!blk_mq_request_started(req) || req->tag != tag)) + goto fail_put; + + if (!ublk_rq_has_data(req)) + goto fail_put; + + if (offset > blk_rq_bytes(req)) + goto fail_put; + + return req; +fail_put: + ublk_put_req_ref(ubq, req); + return NULL; +} + +static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct ublksrv_io_cmd *ub_src = (struct ublksrv_io_cmd *) cmd->cmd; + struct ublksrv_io_cmd ub_cmd; + + /* + * Not necessary for async retry, but let's keep it simple and always + * copy the values to avoid any potential reuse. + */ + ub_cmd.q_id = READ_ONCE(ub_src->q_id); + ub_cmd.tag = READ_ONCE(ub_src->tag); + ub_cmd.result = READ_ONCE(ub_src->result); + ub_cmd.addr = READ_ONCE(ub_src->addr); + + return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd); +} + +static inline bool ublk_check_ubuf_dir(const struct request *req, + int ubuf_dir) +{ + /* copy ubuf to request pages */ + if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) && + ubuf_dir == WRITE) + return true; + + /* copy request pages to ubuf */ + if ((req_op(req) == REQ_OP_WRITE || + req_op(req) == REQ_OP_ZONE_APPEND) && + ubuf_dir == READ) + return true; + + return false; +} + +static struct request *ublk_check_and_get_req(struct kiocb *iocb, + struct iov_iter *iter, size_t *off, int dir) +{ + struct ublk_device *ub = iocb->ki_filp->private_data; + struct ublk_queue *ubq; + struct request *req; + size_t buf_off; + u16 tag, q_id; + + if (!ub) + return ERR_PTR(-EACCES); + + if (!iter_is_iovec(iter)) + return ERR_PTR(-EACCES); + + if (ub->dev_info.state == UBLK_S_DEV_DEAD) + return ERR_PTR(-EACCES); + + tag = ublk_pos_to_tag(iocb->ki_pos); + q_id = ublk_pos_to_hwq(iocb->ki_pos); + buf_off = ublk_pos_to_buf_off(iocb->ki_pos); + + if (q_id >= ub->dev_info.nr_hw_queues) + return ERR_PTR(-EINVAL); + + ubq = ublk_get_queue(ub, q_id); + if (!ubq) + return ERR_PTR(-EINVAL); + + if (tag >= ubq->q_depth) + return ERR_PTR(-EINVAL); + + req = __ublk_check_and_get_req(ub, ubq, tag, buf_off); + if (!req) + return ERR_PTR(-EINVAL); + + if (!req->mq_hctx || !req->mq_hctx->driver_data) + goto fail; + + if (!ublk_check_ubuf_dir(req, dir)) + goto fail; + + *off = buf_off; + return req; +fail: + ublk_put_req_ref(ubq, req); + return ERR_PTR(-EACCES); +} + +static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct ublk_queue *ubq; + struct request *req; + size_t buf_off; + size_t ret; + + req = ublk_check_and_get_req(iocb, to, &buf_off, READ); + if (IS_ERR(req)) + return PTR_ERR(req); - /* there is pending io cmd, something must be wrong */ - if (io->flags & UBLK_IO_FLAG_ACTIVE) { - ret = -EBUSY; - goto out; - } + ret = ublk_copy_user_pages(req, buf_off, to, READ); + ubq = req->mq_hctx->driver_data; + ublk_put_req_ref(ubq, req); - /* - * ensure that the user issues UBLK_IO_NEED_GET_DATA - * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA. - */ - if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) - ^ (cmd_op == UBLK_IO_NEED_GET_DATA)) - goto out; + return ret; +} - switch (cmd_op) { - case UBLK_IO_FETCH_REQ: - /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ - if (ublk_queue_ready(ubq)) { - ret = -EBUSY; - goto out; - } - /* - * The io is being handled by server, so COMMIT_RQ is expected - * instead of FETCH_REQ - */ - if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) - goto out; - /* FETCH_RQ has to provide IO buffer if NEED GET DATA is not enabled */ - if (!ub_cmd->addr && !ublk_need_get_data(ubq)) - goto out; - io->cmd = cmd; - io->flags |= UBLK_IO_FLAG_ACTIVE; - io->addr = ub_cmd->addr; +static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct ublk_queue *ubq; + struct request *req; + size_t buf_off; + size_t ret; - ublk_mark_io_ready(ub, ubq); - break; - case UBLK_IO_COMMIT_AND_FETCH_REQ: - req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag); - /* - * COMMIT_AND_FETCH_REQ has to provide IO buffer if NEED GET DATA is - * not enabled or it is Read IO. - */ - if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || req_op(req) == REQ_OP_READ)) - goto out; - if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) - goto out; - io->addr = ub_cmd->addr; - io->flags |= UBLK_IO_FLAG_ACTIVE; - io->cmd = cmd; - ublk_commit_completion(ub, ub_cmd); - break; - case UBLK_IO_NEED_GET_DATA: - if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) - goto out; - io->addr = ub_cmd->addr; - io->cmd = cmd; - io->flags |= UBLK_IO_FLAG_ACTIVE; - ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag, cmd); - break; - default: - goto out; - } - return -EIOCBQUEUED; + req = ublk_check_and_get_req(iocb, from, &buf_off, WRITE); + if (IS_ERR(req)) + return PTR_ERR(req); - out: - io_uring_cmd_done(cmd, ret, 0); - pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n", - __func__, cmd_op, tag, ret, io->flags); - return -EIOCBQUEUED; + ret = ublk_copy_user_pages(req, buf_off, from, WRITE); + ubq = req->mq_hctx->driver_data; + ublk_put_req_ref(ubq, req); + + return ret; } static const struct file_operations ublk_ch_fops = { @@ -1228,6 +2006,8 @@ static const struct file_operations ublk_ch_fops = { .open = ublk_ch_open, .release = ublk_ch_release, .llseek = no_llseek, + .read_iter = ublk_ch_read_iter, + .write_iter = ublk_ch_write_iter, .uring_cmd = ublk_ch_uring_cmd, .mmap = ublk_ch_mmap, }; @@ -1250,6 +2030,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) void *ptr; int size; + spin_lock_init(&ubq->cancel_lock); ubq->flags = ub->dev_info.flags; ubq->q_id = q_id; ubq->q_depth = ub->dev_info.queue_depth; @@ -1274,7 +2055,7 @@ static void ublk_deinit_queues(struct ublk_device *ub) for (i = 0; i < nr_queues; i++) ublk_deinit_queue(ub, i); - kfree(ub->__queues); + kvfree(ub->__queues); } static int ublk_init_queues(struct ublk_device *ub) @@ -1285,7 +2066,7 @@ static int ublk_init_queues(struct ublk_device *ub) int i, ret = -ENOMEM; ub->queue_size = ubq_size; - ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL); + ub->__queues = kvcalloc(nr_queues, ubq_size, GFP_KERNEL); if (!ub->__queues) return ret; @@ -1351,7 +2132,7 @@ static int ublk_add_chdev(struct ublk_device *ub) dev->parent = ublk_misc.this_device; dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor); - dev->class = ublk_chr_class; + dev->class = &ublk_chr_class; dev->release = ublk_cdev_rel; device_initialize(dev); @@ -1363,6 +2144,8 @@ static int ublk_add_chdev(struct ublk_device *ub) ret = cdev_device_add(&ub->cdev, dev); if (ret) goto fail; + + ublks_added++; return 0; fail: put_device(dev); @@ -1392,6 +2175,7 @@ static int ublk_add_tag_set(struct ublk_device *ub) ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; ub->tag_set.queue_depth = ub->dev_info.queue_depth; ub->tag_set.numa_node = NUMA_NO_NODE; + ub->tag_set.cmd_size = sizeof(struct ublk_rq_data); ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ub->tag_set.driver_data = ub; @@ -1405,6 +2189,7 @@ static void ublk_remove(struct ublk_device *ub) cancel_work_sync(&ub->quiesce_work); cdev_device_del(&ub->cdev, &ub->cdev_dev); put_device(&ub->cdev_dev); + ublks_added--; } static struct ublk_device *ublk_get_device_from_id(int idx) @@ -1423,22 +2208,18 @@ static struct ublk_device *ublk_get_device_from_id(int idx) return ub; } -static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd) +static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; int ublksrv_pid = (int)header->data[0]; - struct ublk_device *ub; struct gendisk *disk; int ret = -EINVAL; if (ublksrv_pid <= 0) return -EINVAL; - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - - wait_for_completion_interruptible(&ub->completion); + if (wait_for_completion_interruptible(&ub->completion) != 0) + return -EINTR; schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD); @@ -1469,34 +2250,51 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd) ub->ub_disk = disk; ret = ublk_apply_params(ub); - if (ret) { - put_disk(disk); - goto out_cleanup_queue; - } + if (ret) + goto out_put_disk; + + /* don't probe partitions if any one ubq daemon is un-trusted */ + if (ub->nr_privileged_daemon != ub->nr_queues_ready) + disk->flags |= GENHD_FL_NO_PART_SCAN; get_device(&ub->cdev_dev); + ub->dev_info.state = UBLK_S_DEV_LIVE; + + if (ublk_dev_is_zoned(ub)) { + ret = ublk_revalidate_disk_zones(ub); + if (ret) + goto out_put_cdev; + } + add_disk(ub->ub_disk); set_bit(UB_STATE_USED, &ub->state); - ub->dev_info.state = UBLK_S_DEV_LIVE; + +out_put_cdev: + if (ret) { + ub->dev_info.state = UBLK_S_DEV_DEAD; + ublk_put_device(ub); + } +out_put_disk: + if (ret) + put_disk(disk); out_cleanup_queue: if (ret) blk_cleanup_queue(ub->ub_queue); out_unlock: mutex_unlock(&ub->mutex); - ublk_put_device(ub); return ret; } -static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd) +static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub, + struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; void __user *argp = (void __user *)(unsigned long)header->addr; - struct ublk_device *ub; cpumask_var_t cpumask; unsigned long queue; unsigned int retlen; unsigned int i; - int ret = -EINVAL; + int ret; if (header->len * BITS_PER_BYTE < nr_cpu_ids) return -EINVAL; @@ -1505,17 +2303,12 @@ static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd) if (!header->addr) return -EINVAL; - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - queue = header->data[0]; if (queue >= ub->dev_info.nr_hw_queues) - goto out_put_device; + return -EINVAL; - ret = -ENOMEM; if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) - goto out_put_device; + return -ENOMEM; for_each_possible_cpu(i) { if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue) @@ -1533,8 +2326,6 @@ static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd) ret = 0; out_free_cpumask: free_cpumask_var(cpumask); -out_put_device: - ublk_put_device(ub); return ret; } @@ -1561,19 +2352,59 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) __func__, header->queue_id); return -EINVAL; } + if (copy_from_user(&info, argp, sizeof(info))) return -EFAULT; - ublk_dump_dev_info(&info); + + if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth || + info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues) + return -EINVAL; + + if (capable(CAP_SYS_ADMIN)) + info.flags &= ~UBLK_F_UNPRIVILEGED_DEV; + else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV)) + return -EPERM; + + /* + * unprivileged device can't be trusted, but RECOVERY and + * RECOVERY_REISSUE still may hang error handling, so can't + * support recovery features for unprivileged ublk now + * + * TODO: provide forward progress for RECOVERY handler, so that + * unprivileged device can benefit from it + */ + if (info.flags & UBLK_F_UNPRIVILEGED_DEV) { + info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE | + UBLK_F_USER_RECOVERY); + + /* + * For USER_COPY, we depends on userspace to fill request + * buffer by pwrite() to ublk char device, which can't be + * used for unprivileged device + */ + if (info.flags & UBLK_F_USER_COPY) + return -EINVAL; + } + + /* the created device is always owned by current user */ + ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid); + if (header->dev_id != info.dev_id) { pr_warn("%s: dev id not match %u %u\n", __func__, header->dev_id, info.dev_id); return -EINVAL; } + ublk_dump_dev_info(&info); + ret = mutex_lock_killable(&ublk_ctl_mutex); if (ret) return ret; + ret = -EACCES; + if (ublks_added >= ublks_max) + goto out_unlock; + ret = -ENOMEM; ub = kzalloc(sizeof(*ub), GFP_KERNEL); if (!ub) @@ -1603,7 +2434,19 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) */ ub->dev_info.flags &= UBLK_F_ALL; - ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK; + ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | + UBLK_F_URING_CMD_COMP_IN_TASK; + + /* GET_DATA isn't needed any more with USER_COPY */ + if (ublk_dev_is_user_copy(ub)) + ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; + + /* Zoned storage support requires user copy feature */ + if (ublk_dev_is_zoned(ub) && + (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) { + ret = -EINVAL; + goto out_free_dev_number; + } /* We are not ready to support zero copy */ ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; @@ -1656,33 +2499,43 @@ static inline bool ublk_idr_freed(int id) return ptr == NULL; } -static int ublk_ctrl_del_dev(int idx) +static int ublk_ctrl_del_dev(struct ublk_device **p_ub) { - struct ublk_device *ub; + struct ublk_device *ub = *p_ub; + int idx = ub->ub_number; int ret; ret = mutex_lock_killable(&ublk_ctl_mutex); if (ret) return ret; - ub = ublk_get_device_from_id(idx); - if (ub) { + if (!test_bit(UB_STATE_DELETED, &ub->state)) { ublk_remove(ub); - ublk_put_device(ub); - ret = 0; - } else { - ret = -ENODEV; + set_bit(UB_STATE_DELETED, &ub->state); } + /* Mark the reference as consumed */ + *p_ub = NULL; + ublk_put_device(ub); + mutex_unlock(&ublk_ctl_mutex); + /* * Wait until the idr is removed, then it can be reused after * DEL_DEV command is returned. + * + * If we returns because of user interrupt, future delete command + * may come: + * + * - the device number isn't freed, this device won't or needn't + * be deleted again, since UB_STATE_DELETED is set, and device + * will be released after the last reference is dropped + * + * - the device number is freed already, we will not find this + * device via ublk_get_device_from_id() */ - if (!ret) - wait_event(ublk_idr_wq, ublk_idr_freed(idx)); - mutex_unlock(&ublk_ctl_mutex); - - return ret; + if (wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx))) + return -EINTR; + return 0; } static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) @@ -1694,50 +2547,52 @@ static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) header->data[0], header->addr, header->len); } -static int ublk_ctrl_stop_dev(struct io_uring_cmd *cmd) +static int ublk_ctrl_stop_dev(struct ublk_device *ub) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; - struct ublk_device *ub; - - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - ublk_stop_dev(ub); cancel_work_sync(&ub->stop_work); cancel_work_sync(&ub->quiesce_work); - ublk_put_device(ub); return 0; } -static int ublk_ctrl_get_dev_info(struct io_uring_cmd *cmd) +static int ublk_ctrl_get_dev_info(struct ublk_device *ub, + struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; void __user *argp = (void __user *)(unsigned long)header->addr; - struct ublk_device *ub; - int ret = 0; if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr) return -EINVAL; - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info))) - ret = -EFAULT; - ublk_put_device(ub); + return -EFAULT; - return ret; + return 0; +} + +/* TYPE_DEVT is readonly, so fill it up before returning to userspace */ +static void ublk_ctrl_fill_params_devt(struct ublk_device *ub) +{ + ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt); + ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt); + + if (ub->ub_disk) { + ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk)); + ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk)); + } else { + ub->params.devt.disk_major = 0; + ub->params.devt.disk_minor = 0; + } + ub->params.types |= UBLK_PARAM_TYPE_DEVT; } -static int ublk_ctrl_get_params(struct io_uring_cmd *cmd) +static int ublk_ctrl_get_params(struct ublk_device *ub, + struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; void __user *argp = (void __user *)(unsigned long)header->addr; struct ublk_params_header ph; - struct ublk_device *ub; int ret; if (header->len <= sizeof(ph) || !header->addr) @@ -1752,27 +2607,23 @@ static int ublk_ctrl_get_params(struct io_uring_cmd *cmd) if (ph.len > sizeof(struct ublk_params)) ph.len = sizeof(struct ublk_params); - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - mutex_lock(&ub->mutex); + ublk_ctrl_fill_params_devt(ub); if (copy_to_user(argp, &ub->params, ph.len)) ret = -EFAULT; else ret = 0; mutex_unlock(&ub->mutex); - ublk_put_device(ub); return ret; } -static int ublk_ctrl_set_params(struct io_uring_cmd *cmd) +static int ublk_ctrl_set_params(struct ublk_device *ub, + struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; void __user *argp = (void __user *)(unsigned long)header->addr; struct ublk_params_header ph; - struct ublk_device *ub; int ret = -EFAULT; if (header->len <= sizeof(ph) || !header->addr) @@ -1787,13 +2638,12 @@ static int ublk_ctrl_set_params(struct io_uring_cmd *cmd) if (ph.len > sizeof(struct ublk_params)) ph.len = sizeof(struct ublk_params); - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - - /* parameters can only be changed when device isn't live */ mutex_lock(&ub->mutex); - if (ub->dev_info.state == UBLK_S_DEV_LIVE) { + if (test_bit(UB_STATE_USED, &ub->state)) { + /* + * Parameters can only be changed when device hasn't + * been started yet + */ ret = -EACCES; } else if (copy_from_user(&ub->params, argp, ph.len)) { ret = -EFAULT; @@ -1801,9 +2651,10 @@ static int ublk_ctrl_set_params(struct io_uring_cmd *cmd) /* clear all we don't support yet */ ub->params.types &= UBLK_PARAM_TYPE_ALL; ret = ublk_validate_params(ub); + if (ret) + ub->params.types = 0; } mutex_unlock(&ub->mutex); - ublk_put_device(ub); return ret; } @@ -1813,12 +2664,14 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) int i; WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq))); + /* All old ioucmds have to be completed */ - WARN_ON_ONCE(ubq->nr_io_ready); + ubq->nr_io_ready = 0; /* old daemon is PF_EXITING, put it now */ put_task_struct(ubq->ubq_daemon); /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ ubq->ubq_daemon = NULL; + ubq->timeout = false; for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; @@ -1830,17 +2683,13 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) } } -static int ublk_ctrl_start_recovery(struct io_uring_cmd *cmd) +static int ublk_ctrl_start_recovery(struct ublk_device *ub, + struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; - struct ublk_device *ub; int ret = -EINVAL; int i; - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return ret; - mutex_lock(&ub->mutex); if (!ublk_can_use_recovery(ub)) goto out_unlock; @@ -1870,29 +2719,27 @@ static int ublk_ctrl_start_recovery(struct io_uring_cmd *cmd) /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */ ub->mm = NULL; ub->nr_queues_ready = 0; + ub->nr_privileged_daemon = 0; init_completion(&ub->completion); ret = 0; out_unlock: mutex_unlock(&ub->mutex); - ublk_put_device(ub); return ret; } -static int ublk_ctrl_end_recovery(struct io_uring_cmd *cmd) +static int ublk_ctrl_end_recovery(struct ublk_device *ub, + struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; int ublksrv_pid = (int)header->data[0]; - struct ublk_device *ub; int ret = -EINVAL; - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return ret; - pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n", __func__, ub->dev_info.nr_hw_queues, header->dev_id); /* wait until new ubq_daemon sending all FETCH_REQ */ - wait_for_completion_interruptible(&ub->completion); + if (wait_for_completion_interruptible(&ub->completion)) + return -EINTR; + pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n", __func__, ub->dev_info.nr_hw_queues, header->dev_id); @@ -1916,7 +2763,125 @@ static int ublk_ctrl_end_recovery(struct io_uring_cmd *cmd) ret = 0; out_unlock: mutex_unlock(&ub->mutex); - ublk_put_device(ub); + return ret; +} + +static int ublk_ctrl_get_features(struct io_uring_cmd *cmd) +{ + const struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + void __user *argp = (void __user *)(unsigned long)header->addr; + u64 features = UBLK_F_ALL & ~UBLK_F_SUPPORT_ZERO_COPY; + + if (header->len != UBLK_FEATURES_LEN || !header->addr) + return -EINVAL; + + if (copy_to_user(argp, &features, UBLK_FEATURES_LEN)) + return -EFAULT; + + return 0; +} + +/* + * All control commands are sent via /dev/ublk-control, so we have to check + * the destination device's permission + */ +static int ublk_char_dev_permission(struct ublk_device *ub, + const char *dev_path, int mask) +{ + int err; + struct path path; + struct kstat stat; + + err = kern_path(dev_path, LOOKUP_FOLLOW, &path); + if (err) + return err; + + err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); + if (err) + goto exit; + + err = -EPERM; + if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode)) + goto exit; + + err = inode_permission(d_backing_inode(path.dentry), mask); +exit: + path_put(&path); + return err; +} + +static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, + struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; + void __user *argp = (void __user *)(unsigned long)header->addr; + char *dev_path = NULL; + int ret = 0; + int mask; + + if (!unprivileged) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * The new added command of UBLK_CMD_GET_DEV_INFO2 includes + * char_dev_path in payload too, since userspace may not + * know if the specified device is created as unprivileged + * mode. + */ + if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2) + return 0; + } + + /* + * User has to provide the char device path for unprivileged ublk + * + * header->addr always points to the dev path buffer, and + * header->dev_path_len records length of dev path buffer. + */ + if (!header->dev_path_len || header->dev_path_len > PATH_MAX) + return -EINVAL; + + if (header->len < header->dev_path_len) + return -EINVAL; + + dev_path = memdup_user_nul(argp, header->dev_path_len); + if (IS_ERR(dev_path)) + return PTR_ERR(dev_path); + + ret = -EINVAL; + switch (_IOC_NR(cmd->cmd_op)) { + case UBLK_CMD_GET_DEV_INFO: + case UBLK_CMD_GET_DEV_INFO2: + case UBLK_CMD_GET_QUEUE_AFFINITY: + case UBLK_CMD_GET_PARAMS: + case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)): + mask = MAY_READ; + break; + case UBLK_CMD_START_DEV: + case UBLK_CMD_STOP_DEV: + case UBLK_CMD_ADD_DEV: + case UBLK_CMD_DEL_DEV: + case UBLK_CMD_SET_PARAMS: + case UBLK_CMD_START_USER_RECOVERY: + case UBLK_CMD_END_USER_RECOVERY: + mask = MAY_READ | MAY_WRITE; + break; + default: + goto exit; + } + + ret = ublk_char_dev_permission(ub, dev_path, mask); + if (!ret) { + header->len -= header->dev_path_len; + header->addr += header->dev_path_len; + } + pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n", + __func__, ub->ub_number, cmd->cmd_op, + ub->dev_info.owner_uid, ub->dev_info.owner_gid, + dev_path, ret); +exit: + kfree(dev_path); return ret; } @@ -1924,55 +2889,78 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + struct ublk_device *ub = NULL; + u32 cmd_op = cmd->cmd_op; int ret = -EINVAL; if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; + if (!(issue_flags & IO_URING_F_SQE128)) + return -EINVAL; + ublk_ctrl_cmd_dump(cmd); - if (!(issue_flags & IO_URING_F_SQE128)) + ret = ublk_check_cmd_op(cmd_op); + if (ret) goto out; - ret = -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (cmd_op == UBLK_U_CMD_GET_FEATURES) { + ret = ublk_ctrl_get_features(cmd); goto out; + } + + if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) { + ret = -ENODEV; + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + goto out; + + ret = ublk_ctrl_uring_cmd_permission(ub, cmd); + if (ret) + goto put_dev; + } - ret = -ENODEV; - switch (cmd->cmd_op) { + switch (_IOC_NR(cmd_op)) { case UBLK_CMD_START_DEV: - ret = ublk_ctrl_start_dev(cmd); + ret = ublk_ctrl_start_dev(ub, cmd); break; case UBLK_CMD_STOP_DEV: - ret = ublk_ctrl_stop_dev(cmd); + ret = ublk_ctrl_stop_dev(ub); break; case UBLK_CMD_GET_DEV_INFO: - ret = ublk_ctrl_get_dev_info(cmd); + case UBLK_CMD_GET_DEV_INFO2: + ret = ublk_ctrl_get_dev_info(ub, cmd); break; case UBLK_CMD_ADD_DEV: ret = ublk_ctrl_add_dev(cmd); break; case UBLK_CMD_DEL_DEV: - ret = ublk_ctrl_del_dev(header->dev_id); + ret = ublk_ctrl_del_dev(&ub); break; case UBLK_CMD_GET_QUEUE_AFFINITY: - ret = ublk_ctrl_get_queue_affinity(cmd); + ret = ublk_ctrl_get_queue_affinity(ub, cmd); break; case UBLK_CMD_GET_PARAMS: - ret = ublk_ctrl_get_params(cmd); + ret = ublk_ctrl_get_params(ub, cmd); break; case UBLK_CMD_SET_PARAMS: - ret = ublk_ctrl_set_params(cmd); + ret = ublk_ctrl_set_params(ub, cmd); break; case UBLK_CMD_START_USER_RECOVERY: - ret = ublk_ctrl_start_recovery(cmd); + ret = ublk_ctrl_start_recovery(ub, cmd); break; case UBLK_CMD_END_USER_RECOVERY: - ret = ublk_ctrl_end_recovery(cmd); + ret = ublk_ctrl_end_recovery(ub, cmd); break; default: + ret = -EOPNOTSUPP; break; } + + put_dev: + if (ub) + ublk_put_device(ub); out: io_uring_cmd_done(cmd, ret, 0); pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n", @@ -1997,6 +2985,9 @@ static int __init ublk_init(void) { int ret; + BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET + + UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET); + init_waitqueue_head(&ublk_idr_wq); ret = misc_register(&ublk_misc); @@ -2007,11 +2998,10 @@ static int __init ublk_init(void) if (ret) goto unregister_mis; - ublk_chr_class = class_create(THIS_MODULE, "ublk-char"); - if (IS_ERR(ublk_chr_class)) { - ret = PTR_ERR(ublk_chr_class); + ret = class_register(&ublk_chr_class); + if (ret) goto free_chrdev_region; - } + return 0; free_chrdev_region: @@ -2029,7 +3019,7 @@ static void __exit ublk_exit(void) idr_for_each_entry(&ublk_index_idr, ub, id) ublk_remove(ub); - class_destroy(ublk_chr_class); + class_unregister(&ublk_chr_class); misc_deregister(&ublk_misc); idr_destroy(&ublk_index_idr); @@ -2039,5 +3029,8 @@ static void __exit ublk_exit(void) module_init(ublk_init); module_exit(ublk_exit); +module_param(ublks_max, int, 0444); +MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to add(default: 64)"); + MODULE_AUTHOR("Ming Lei "); MODULE_LICENSE("GPL"); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 8f88e3a29998fb6e79672e3122caa88d11331945..3830b428ecf19d5abaa797bac08f1770d8aecc57 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -8,6 +8,9 @@ /* * Admin commands, issued by ublk server, and handled by ublk driver. + * + * Legacy command definition, don't use in new application, and don't + * add new such definition any more */ #define UBLK_CMD_GET_QUEUE_AFFINITY 0x01 #define UBLK_CMD_GET_DEV_INFO 0x02 @@ -19,6 +22,40 @@ #define UBLK_CMD_GET_PARAMS 0x09 #define UBLK_CMD_START_USER_RECOVERY 0x10 #define UBLK_CMD_END_USER_RECOVERY 0x11 +#define UBLK_CMD_GET_DEV_INFO2 0x12 + +/* Any new ctrl command should encode by __IO*() */ +#define UBLK_U_CMD_GET_QUEUE_AFFINITY \ + _IOR('u', UBLK_CMD_GET_QUEUE_AFFINITY, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_GET_DEV_INFO \ + _IOR('u', UBLK_CMD_GET_DEV_INFO, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_ADD_DEV \ + _IOWR('u', UBLK_CMD_ADD_DEV, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_DEL_DEV \ + _IOWR('u', UBLK_CMD_DEL_DEV, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_START_DEV \ + _IOWR('u', UBLK_CMD_START_DEV, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_STOP_DEV \ + _IOWR('u', UBLK_CMD_STOP_DEV, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_SET_PARAMS \ + _IOWR('u', UBLK_CMD_SET_PARAMS, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_GET_PARAMS \ + _IOR('u', UBLK_CMD_GET_PARAMS, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_START_USER_RECOVERY \ + _IOWR('u', UBLK_CMD_START_USER_RECOVERY, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_END_USER_RECOVERY \ + _IOWR('u', UBLK_CMD_END_USER_RECOVERY, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_GET_DEV_INFO2 \ + _IOR('u', UBLK_CMD_GET_DEV_INFO2, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_GET_FEATURES \ + _IOR('u', 0x13, struct ublksrv_ctrl_cmd) + +/* + * 64bits are enough now, and it should be easy to extend in case of + * running out of feature flags + */ +#define UBLK_FEATURES_LEN 8 + /* * IO commands, issued by ublk server, and handled by ublk driver. * @@ -39,10 +76,23 @@ * It is only used if ublksrv set UBLK_F_NEED_GET_DATA flag * while starting a ublk device. */ + +/* + * Legacy IO command definition, don't use in new application, and don't + * add new such definition any more + */ #define UBLK_IO_FETCH_REQ 0x20 #define UBLK_IO_COMMIT_AND_FETCH_REQ 0x21 #define UBLK_IO_NEED_GET_DATA 0x22 +/* Any new IO command should encode by __IOWR() */ +#define UBLK_U_IO_FETCH_REQ \ + _IOWR('u', UBLK_IO_FETCH_REQ, struct ublksrv_io_cmd) +#define UBLK_U_IO_COMMIT_AND_FETCH_REQ \ + _IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd) +#define UBLK_U_IO_NEED_GET_DATA \ + _IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd) + /* only ABORT means that no re-fetch */ #define UBLK_IO_RES_OK 0 #define UBLK_IO_RES_NEED_GET_DATA 1 @@ -51,9 +101,29 @@ #define UBLKSRV_CMD_BUF_OFFSET 0 #define UBLKSRV_IO_BUF_OFFSET 0x80000000 -/* tag bit is 12bit, so at most 4096 IOs for each queue */ +/* tag bit is 16bit, so far limit at most 4096 IOs for each queue */ #define UBLK_MAX_QUEUE_DEPTH 4096 +/* single IO buffer max size is 32MB */ +#define UBLK_IO_BUF_OFF 0 +#define UBLK_IO_BUF_BITS 25 +#define UBLK_IO_BUF_BITS_MASK ((1ULL << UBLK_IO_BUF_BITS) - 1) + +/* so at most 64K IOs for each queue */ +#define UBLK_TAG_OFF UBLK_IO_BUF_BITS +#define UBLK_TAG_BITS 16 +#define UBLK_TAG_BITS_MASK ((1ULL << UBLK_TAG_BITS) - 1) + +/* max 4096 queues */ +#define UBLK_QID_OFF (UBLK_TAG_OFF + UBLK_TAG_BITS) +#define UBLK_QID_BITS 12 +#define UBLK_QID_BITS_MASK ((1ULL << UBLK_QID_BITS) - 1) + +#define UBLK_MAX_NR_QUEUES (1U << UBLK_QID_BITS) + +#define UBLKSRV_IO_BUF_TOTAL_BITS (UBLK_QID_OFF + UBLK_QID_BITS) +#define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS) + /* * zero copy requires 4k block size, and can remap ublk driver's io * request into ublksrv's vm space @@ -79,6 +149,45 @@ #define UBLK_F_USER_RECOVERY_REISSUE (1UL << 4) +/* + * Unprivileged user can create /dev/ublkcN and /dev/ublkbN. + * + * /dev/ublk-control needs to be available for unprivileged user, and it + * can be done via udev rule to make all control commands available to + * unprivileged user. Except for the command of UBLK_CMD_ADD_DEV, all + * other commands are only allowed for the owner of the specified device. + * + * When userspace sends UBLK_CMD_ADD_DEV, the device pair's owner_uid and + * owner_gid are stored to ublksrv_ctrl_dev_info by kernel, so far only + * the current user's uid/gid is stored, that said owner of the created + * device is always the current user. + * + * We still need udev rule to apply OWNER/GROUP with the stored owner_uid + * and owner_gid. + * + * Then ublk server can be run as unprivileged user, and /dev/ublkbN can + * be accessed and managed by its owner represented by owner_uid/owner_gid. + */ +#define UBLK_F_UNPRIVILEGED_DEV (1UL << 5) + +/* use ioctl encoding for uring command */ +#define UBLK_F_CMD_IOCTL_ENCODE (1UL << 6) + +/* + * Copy between request and user buffer by pread()/pwrite() + * + * Not available for UBLK_F_UNPRIVILEGED_DEV, otherwise userspace may + * deceive us by not filling request buffer, then kernel uninitialized + * data may be leaked. + */ +#define UBLK_F_USER_COPY (1UL << 7) + +/* + * User space sets this flag when setting up the device to request zoned storage support. Kernel may + * deny the request by returning an error. + */ +#define UBLK_F_ZONED (1ULL << 8) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 @@ -98,7 +207,15 @@ struct ublksrv_ctrl_cmd { __u64 addr; /* inline data */ - __u64 data[2]; + __u64 data[1]; + + /* + * Used for UBLK_F_UNPRIVILEGED_DEV and UBLK_CMD_GET_DEV_INFO2 + * only, include null char + */ + __u16 dev_path_len; + __u16 pad; + __u32 reserved; }; struct ublksrv_ctrl_dev_info { @@ -118,7 +235,8 @@ struct ublksrv_ctrl_dev_info { /* For ublksrv internal use, invisible to ublk driver */ __u64 ublksrv_flags; - __u64 reserved0; + __u32 owner_uid; /* store by kernel */ + __u32 owner_gid; /* store by kernel */ __u64 reserved1; __u64 reserved2; }; @@ -126,9 +244,27 @@ struct ublksrv_ctrl_dev_info { #define UBLK_IO_OP_READ 0 #define UBLK_IO_OP_WRITE 1 #define UBLK_IO_OP_FLUSH 2 -#define UBLK_IO_OP_DISCARD 3 -#define UBLK_IO_OP_WRITE_SAME 4 -#define UBLK_IO_OP_WRITE_ZEROES 5 +#define UBLK_IO_OP_DISCARD 3 +#define UBLK_IO_OP_WRITE_SAME 4 +#define UBLK_IO_OP_WRITE_ZEROES 5 +#define UBLK_IO_OP_ZONE_OPEN 10 +#define UBLK_IO_OP_ZONE_CLOSE 11 +#define UBLK_IO_OP_ZONE_FINISH 12 +#define UBLK_IO_OP_ZONE_APPEND 13 +#define UBLK_IO_OP_ZONE_RESET_ALL 14 +#define UBLK_IO_OP_ZONE_RESET 15 +/* + * Construct a zone report. The report request is carried in `struct + * ublksrv_io_desc`. The `start_sector` field must be the first sector of a zone + * and shall indicate the first zone of the report. The `nr_zones` shall + * indicate how many zones should be reported at most. The report shall be + * delivered as a `struct blk_zone` array. To report fewer zones than requested, + * zero the last entry of the returned array. + * + * Related definitions(blk_zone, blk_zone_cond, blk_zone_type, ...) in + * include/uapi/linux/blkzoned.h are part of ublk UAPI. + */ +#define UBLK_IO_OP_REPORT_ZONES 18 #define UBLK_IO_F_FAILFAST_DEV (1U << 8) #define UBLK_IO_F_FAILFAST_TRANSPORT (1U << 9) @@ -149,7 +285,10 @@ struct ublksrv_io_desc { /* op: bit 0-7, flags: bit 8-31 */ __u32 op_flags; - __u32 nr_sectors; + union { + __u32 nr_sectors; + __u32 nr_zones; /* for UBLK_IO_OP_REPORT_ZONES */ + }; /* start sector for this io */ __u64 start_sector; @@ -178,11 +317,21 @@ struct ublksrv_io_cmd { /* io result, it is valid for COMMIT* command only */ __s32 result; - /* - * userspace buffer address in ublksrv daemon process, valid for - * FETCH* command only - */ - __u64 addr; + union { + /* + * userspace buffer address in ublksrv daemon process, valid for + * FETCH* command only + * + * `addr` should not be used when UBLK_F_USER_COPY is enabled, + * because userspace handles data copy by pread()/pwrite() over + * /dev/ublkcN. But in case of UBLK_F_ZONED, this union is + * re-used to pass back the allocated LBA for + * UBLK_IO_OP_ZONE_APPEND which actually depends on + * UBLK_F_USER_COPY + */ + __u64 addr; + __u64 zone_append_lba; + }; }; struct ublk_param_basic { @@ -214,6 +363,24 @@ struct ublk_param_discard { __u16 reserved0; }; +/* + * read-only, can't set via UBLK_CMD_SET_PARAMS, disk_devt is available + * after device is started + */ +struct ublk_param_devt { + __u32 char_major; + __u32 char_minor; + __u32 disk_major; + __u32 disk_minor; +}; + +struct ublk_param_zoned { + __u32 max_open_zones; + __u32 max_active_zones; + __u32 max_zone_append_sectors; + __u8 reserved[20]; +}; + struct ublk_params { /* * Total length of parameters, userspace has to set 'len' for both @@ -224,10 +391,14 @@ struct ublk_params { __u32 len; #define UBLK_PARAM_TYPE_BASIC (1 << 0) #define UBLK_PARAM_TYPE_DISCARD (1 << 1) +#define UBLK_PARAM_TYPE_DEVT (1 << 2) +#define UBLK_PARAM_TYPE_ZONED (1 << 3) __u32 types; /* types of parameter included */ struct ublk_param_basic basic; struct ublk_param_discard discard; + struct ublk_param_devt devt; + struct ublk_param_zoned zoned; }; #endif