From 9a15f3aa290ff585e1cb51e9ae3acb02c2d16768 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Apr 2022 17:25:16 -0600 Subject: [PATCH 01/63] task_work: allow TWA_SIGNAL without a rescheduling IPI ANBZ: #32233 commit e788be95a57a9bebe446878ce9bf2750f6fe4974 upstream. Some use cases don't always need an IPI when sending a TWA_SIGNAL notification. Add TWA_SIGNAL_NO_IPI, which is just like TWA_SIGNAL, except it doesn't send an IPI to the target task. It merely sets TIF_NOTIFY_SIGNAL and wakes up the task. This can be useful in avoiding a forceful transition to the kernel if the task is running in userspace. Depending on the task_work in question, it may be quite fine waiting for the next reschedule or kernel enter anyway, or the use case may even have other mechanisms for hinting to the task that a transition may be useful. This can drive more cooperative scheduling of task_work. Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/821f42b6-7d91-8074-8212-d34998097de4@kernel.dk Signed-off-by: Jens Axboe [joe: add __set_notify_signal() into tracehook.h instead] Signed-off-by: Joseph Qi --- include/linux/task_work.h | 1 + include/linux/tracehook.h | 13 +++++++++++-- kernel/task_work.c | 25 +++++++++++++++++++------ 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 5b8a93f288bb..8d976ae84769 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -17,6 +17,7 @@ enum task_work_notify_mode { TWA_NONE, TWA_RESUME, TWA_SIGNAL, + TWA_SIGNAL_NO_IPI, }; int task_work_add(struct task_struct *task, struct callback_head *twork, diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 0e1ad33eec7c..e724a283405d 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -212,13 +212,22 @@ static inline void tracehook_notify_signal(void) task_work_run(); } +/* + * Returns 'true' if kick_process() is needed to force a transition from + * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work. + */ +static inline bool __set_notify_signal(struct task_struct *task) +{ + return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && + !wake_up_state(task, TASK_INTERRUPTIBLE); +} + /* * Called when we have work to process from exit_to_user_mode_loop() */ static inline void set_notify_signal(struct task_struct *task) { - if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && - !wake_up_state(task, TASK_INTERRUPTIBLE)) + if (__set_notify_signal(task)) kick_process(task); } diff --git a/kernel/task_work.c b/kernel/task_work.c index e9316198c64b..c1701eeb6a7b 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -12,12 +12,22 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * @notify: how to notify the targeted task * * Queue @work for task_work_run() below and notify the @task if @notify - * is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the - * it will interrupt the targeted task and run the task_work. @TWA_RESUME - * work is run only when the task exits the kernel and returns to user mode, - * or before entering guest mode. Fails if the @task is exiting/exited and thus - * it can't process this @work. Otherwise @work->func() will be called when the - * @task goes through one of the aforementioned transitions, or exits. + * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI. + * + * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted + * task and run the task_work, regardless of whether the task is currently + * running in the kernel or userspace. + * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a + * reschedule IPI to force the targeted task to reschedule and run task_work. + * This can be advantageous if there's no strict requirement that the + * task_work be run as soon as possible, just whenever the task enters the + * kernel anyway. + * @TWA_RESUME work is run only when the task exits the kernel and returns to + * user mode, or before entering guest mode. + * + * Fails if the @task is exiting/exited and thus it can't process this @work. + * Otherwise @work->func() will be called when the @task goes through one of + * the aforementioned transitions, or exits. * * If the targeted task is exiting, then an error is returned and the work item * is not queued. It's up to the caller to arrange for an alternative mechanism @@ -50,6 +60,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work, case TWA_SIGNAL: set_notify_signal(task); break; + case TWA_SIGNAL_NO_IPI: + __set_notify_signal(task); + break; default: WARN_ON_ONCE(1); break; -- Gitee From c4eaf4cef74f2d25ff58f4c45fe70581215f4659 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 13 Jul 2022 22:07:11 +0800 Subject: [PATCH 02/63] ublk_drv: support to complete io command via task_work_add ANBZ: #32233 commit 0edb3696c1713c42f52acbd8355b545e58f782b1 upstream. Use task_work_add if it is available, since task_work_add can bring up better performance, especially batching signaling ->ubq_daemon can be done. It is observed that task_work_add() can boost iops by +4% on random 4k io test. Also except for completing io command, all other code paths are same with completing io command via io_uring_cmd_complete_in_task. Meantime add one flag of UBLK_F_URING_CMD_COMP_IN_TASK for comparing the mode easily. [Backport Notes] The original backport doesn't pick task_work_add() logic since we don't support __set_notify_signal(). This patch add the missing logic. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220713140711.97356-3-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 63 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 5 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 3fc38937f758..bcb682d2dae7 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include #define UBLK_MINORS (1U << MINORBITS) @@ -55,6 +57,10 @@ /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) +struct ublk_rq_data { + struct callback_head work; +}; + struct ublk_uring_cmd_pdu { struct request *req; }; @@ -264,6 +270,14 @@ static int ublk_apply_params(struct ublk_device *ub) return 0; } +static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq) +{ + if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) && + !(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK)) + return true; + return false; +} + static inline bool ublk_need_get_data(const struct ublk_queue *ubq) { return ubq->flags & UBLK_F_NEED_GET_DATA; @@ -750,14 +764,21 @@ static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) __ublk_rq_task_work(pdu->req); } +static void ublk_rq_task_work_fn(struct callback_head *work) +{ + struct ublk_rq_data *data = container_of(work, + struct ublk_rq_data, work); + struct request *req = blk_mq_rq_from_pdu(data); + + __ublk_rq_task_work(req); +} + static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct ublk_queue *ubq = hctx->driver_data; struct request *rq = bd->rq; struct ublk_io *io = &ubq->ios[rq->tag]; - struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); blk_status_t res; /* fill iod to slot in io cmd buffer */ @@ -798,12 +819,31 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, if ((io->flags & UBLK_IO_FLAG_ABORTED)) goto fail; - pdu->req = rq; - io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); + if (ublk_can_use_task_work(ubq)) { + struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); + enum task_work_notify_mode notify_mode = bd->last ? + TWA_SIGNAL_NO_IPI : TWA_NONE; + + if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode)) + goto fail; + } else { + struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + + pdu->req = rq; + io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); + } return BLK_STS_OK; } +static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct ublk_queue *ubq = hctx->driver_data; + + if (ublk_can_use_task_work(ubq)) + __set_notify_signal(ubq->ubq_daemon); +} static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, unsigned int hctx_idx) @@ -815,9 +855,20 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, return 0; } +static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req, + unsigned int hctx_idx, unsigned int numa_node) +{ + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + + init_task_work(&data->work, ublk_rq_task_work_fn); + return 0; +} + static const struct blk_mq_ops ublk_mq_ops = { .queue_rq = ublk_queue_rq, + .commit_rqs = ublk_commit_rqs, .init_hctx = ublk_init_hctx, + .init_request = ublk_init_rq, }; static int ublk_ch_open(struct inode *inode, struct file *filp) @@ -1392,6 +1443,7 @@ static int ublk_add_tag_set(struct ublk_device *ub) ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; ub->tag_set.queue_depth = ub->dev_info.queue_depth; ub->tag_set.numa_node = NUMA_NO_NODE; + ub->tag_set.cmd_size = sizeof(struct ublk_rq_data); ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ub->tag_set.driver_data = ub; @@ -1603,7 +1655,8 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) */ ub->dev_info.flags &= UBLK_F_ALL; - ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK; + if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK)) + ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK; /* We are not ready to support zero copy */ ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; -- Gitee From c53b797606ad6ac68cb3a79989d6a737693d77ef Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 29 Oct 2022 09:04:30 +0800 Subject: [PATCH 03/63] ublk_drv: comment on ublk_driver entry of Kconfig ANBZ: #32233 commit d57c2c6c1145148bb23d68db73de0b52d482d4ba upstream. Add help info for choosing to build ublk_drv as module or builtin. Signed-off-by: Ming Lei Reviewed-by: ZiyangZhang Link: https://lore.kernel.org/r/20221029010432.598367-3-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 7ad988d0626e..d2d9dfcc34ff 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -488,6 +488,12 @@ config BLK_DEV_UBLK definition isn't finalized yet, and might change according to future requirement, so mark is as experimental now. + Say Y if you want to get better performance because task_work_add() + can be used in IO path for replacing io_uring cmd, which will become + shared between IO tasks and ubq daemon, meantime task_work_add() can + can handle batch more effectively, but task_work_add() isn't exported + for module, so ublk has to be built to kernel. + source "drivers/block/rnbd/Kconfig" endif # BLK_DEV -- Gitee From 8498e221c9f2e2c3d251781784e84d487d565f10 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 29 Oct 2022 09:04:31 +0800 Subject: [PATCH 04/63] ublk_drv: avoid to touch io_uring cmd in blk_mq io path ANBZ: #32233 commit 3ab6e94ca539242247d4f00414a1bde584d001ed upstream. io_uring cmd is supposed to be used in ubq daemon context mainly, and we should try to avoid to touch it in ublk io submission context, otherwise this data could become shared between the two contexts, and performance is hurt. So link request into one per-queue list, and use same batching policy of io_uring command, just avoid to touch ucmd in blk-mq io context. Signed-off-by: Ming Lei Reviewed-by: ZiyangZhang Link: https://lore.kernel.org/r/20221029010432.598367-4-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 89 +++++++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 28 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index bcb682d2dae7..d794c209ae1c 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -58,11 +58,14 @@ #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) struct ublk_rq_data { - struct callback_head work; + union { + struct callback_head work; + struct llist_node node; + }; }; struct ublk_uring_cmd_pdu { - struct request *req; + struct ublk_queue *ubq; }; /* @@ -120,6 +123,8 @@ struct ublk_queue { struct task_struct *ubq_daemon; char *io_cmd_buf; + struct llist_head io_cmds; + unsigned long io_addr; /* mapped vm address */ unsigned int max_io_sz; bool force_abort; @@ -760,8 +765,12 @@ static inline void __ublk_rq_task_work(struct request *req) static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) { struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_queue *ubq = pdu->ubq; + struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); + struct ublk_rq_data *data; - __ublk_rq_task_work(pdu->req); + llist_for_each_entry(data, io_cmds, node) + __ublk_rq_task_work(blk_mq_rq_from_pdu(data)); } static void ublk_rq_task_work_fn(struct callback_head *work) @@ -773,18 +782,50 @@ static void ublk_rq_task_work_fn(struct callback_head *work) __ublk_rq_task_work(req); } +static void ublk_submit_cmd(struct ublk_queue *ubq, const struct request *rq) +{ + struct ublk_io *io = &ubq->ios[rq->tag]; + + /* + * If the check pass, we know that this is a re-issued request aborted + * previously in monitor_work because the ubq_daemon(cmd's task) is + * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore + * because this ioucmd's io_uring context may be freed now if no inflight + * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work. + * + * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing + * the tag). Then the request is re-started(allocating the tag) and we are here. + * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED + * guarantees that here is a re-issued request aborted previously. + */ + if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) { + struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); + struct ublk_rq_data *data; + + llist_for_each_entry(data, io_cmds, node) + __ublk_abort_rq(ubq, blk_mq_rq_from_pdu(data)); + } else { + struct io_uring_cmd *cmd = io->cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + + pdu->ubq = ubq; + io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); + } +} + static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct ublk_queue *ubq = hctx->driver_data; struct request *rq = bd->rq; - struct ublk_io *io = &ubq->ios[rq->tag]; + struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); blk_status_t res; /* fill iod to slot in io cmd buffer */ res = ublk_setup_iod(ubq, rq); if (unlikely(res != BLK_STS_OK)) return BLK_STS_IOERR; + /* With recovery feature enabled, force_abort is set in * ublk_stop_dev() before calling del_gendisk(). We have to * abort all requeued and new rqs here to let del_gendisk() @@ -804,36 +845,17 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, __ublk_abort_rq(ubq, rq); return BLK_STS_OK; } - /* - * If the check pass, we know that this is a re-issued request aborted - * previously in monitor_work because the ubq_daemon(cmd's task) is - * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore - * because this ioucmd's io_uring context may be freed now if no inflight - * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work. - * - * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing - * the tag). Then the request is re-started(allocating the tag) and we are here. - * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED - * guarantees that here is a re-issued request aborted previously. - */ - if ((io->flags & UBLK_IO_FLAG_ABORTED)) - goto fail; if (ublk_can_use_task_work(ubq)) { - struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); enum task_work_notify_mode notify_mode = bd->last ? TWA_SIGNAL_NO_IPI : TWA_NONE; if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode)) goto fail; } else { - struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - - pdu->req = rq; - io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); + if (llist_add(&data->node, &ubq->io_cmds)) + ublk_submit_cmd(ubq, rq); } - return BLK_STS_OK; } @@ -1163,11 +1185,22 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id, int tag, struct io_uring_cmd *cmd) { + struct ublk_queue *ubq = ublk_get_queue(ub, q_id); struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag); - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); - pdu->req = req; - io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); + if (ublk_can_use_task_work(ubq)) { + /* should not fail since we call it just in ubq->ubq_daemon */ + task_work_add(ubq->ubq_daemon, &data->work, TWA_SIGNAL_NO_IPI); + } else { + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + + if (llist_add(&data->node, &ubq->io_cmds)) { + pdu->ubq = ubq; + io_uring_cmd_complete_in_task(cmd, + ublk_rq_task_work_cb); + } + } } static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) -- Gitee From 9b540dc07cee054e47648f249d1b90b28891010c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 29 Oct 2022 09:04:32 +0800 Subject: [PATCH 05/63] ublk_drv: add ublk_queue_cmd() for cleanup ANBZ: #32233 commit fee32f312405726eec6b35b5740c48acda0315e9 upstream. Add helper of ublk_queue_cmd() so that both ublk_queue_rq() and ublk_handle_need_get_data() can reuse this helper. Signed-off-by: Ming Lei Reviewed-by: ZiyangZhang Link: https://lore.kernel.org/r/20221029010432.598367-5-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 47 ++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index d794c209ae1c..08c210ad6361 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -813,12 +813,28 @@ static void ublk_submit_cmd(struct ublk_queue *ubq, const struct request *rq) } } +static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq, + bool last) +{ + struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); + + if (ublk_can_use_task_work(ubq)) { + enum task_work_notify_mode notify_mode = last ? + TWA_SIGNAL_NO_IPI : TWA_NONE; + + if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode)) + __ublk_abort_rq(ubq, rq); + } else { + if (llist_add(&data->node, &ubq->io_cmds)) + ublk_submit_cmd(ubq, rq); + } +} + static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct ublk_queue *ubq = hctx->driver_data; struct request *rq = bd->rq; - struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); blk_status_t res; /* fill iod to slot in io cmd buffer */ @@ -841,21 +857,12 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(bd->rq); if (unlikely(ubq_daemon_is_dying(ubq))) { - fail: __ublk_abort_rq(ubq, rq); return BLK_STS_OK; } - if (ublk_can_use_task_work(ubq)) { - enum task_work_notify_mode notify_mode = bd->last ? - TWA_SIGNAL_NO_IPI : TWA_NONE; + ublk_queue_cmd(ubq, rq, bd->last); - if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode)) - goto fail; - } else { - if (llist_add(&data->node, &ubq->io_cmds)) - ublk_submit_cmd(ubq, rq); - } return BLK_STS_OK; } @@ -1183,24 +1190,12 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) } static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id, - int tag, struct io_uring_cmd *cmd) + int tag) { struct ublk_queue *ubq = ublk_get_queue(ub, q_id); struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag); - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); - if (ublk_can_use_task_work(ubq)) { - /* should not fail since we call it just in ubq->ubq_daemon */ - task_work_add(ubq->ubq_daemon, &data->work, TWA_SIGNAL_NO_IPI); - } else { - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - - if (llist_add(&data->node, &ubq->io_cmds)) { - pdu->ubq = ubq; - io_uring_cmd_complete_in_task(cmd, - ublk_rq_task_work_cb); - } - } + ublk_queue_cmd(ubq, req, true); } static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) @@ -1293,7 +1288,7 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) io->addr = ub_cmd->addr; io->cmd = cmd; io->flags |= UBLK_IO_FLAG_ACTIVE; - ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag, cmd); + ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag); break; default: goto out; -- Gitee From c497873c427ad489e469f9d90ad289c47a8f4910 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 21 Nov 2022 23:56:45 +0800 Subject: [PATCH 06/63] ublk_drv: don't forward io commands in reserve order ANBZ: #32233 commit 7d4a93176e0142ce16d23c849a47d5b00b856296 upstream. Either ublk_can_use_task_work() is true or not, io commands are forwarded to ublk server in reverse order, since llist_add() is always to add one element to the head of the list. Even though block layer doesn't guarantee request dispatch order, requests should be sent to hardware in the sequence order generated from io scheduler, which usually considers the request's LBA, and order is often important for HDD. So forward io commands in the sequence made from io scheduler by aligning task work with current io_uring command's batch handling, and it has been observed that both can get similar performance data if IORING_SETUP_COOP_TASKRUN is set from ublk server. Reported-by: Andreas Hindborg Cc: Damien Le Moal Signed-off-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: ZiyangZhang Link: https://lore.kernel.org/r/20221121155645.396272-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 82 +++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 44 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 08c210ad6361..42c4a537ca1a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -58,10 +58,8 @@ #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) struct ublk_rq_data { - union { - struct callback_head work; - struct llist_node node; - }; + struct llist_node node; + struct callback_head work; }; struct ublk_uring_cmd_pdu { @@ -762,15 +760,31 @@ static inline void __ublk_rq_task_work(struct request *req) ubq_complete_io_cmd(io, UBLK_IO_RES_OK); } +static inline void ublk_forward_io_cmds(struct ublk_queue *ubq) +{ + struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); + struct ublk_rq_data *data, *tmp; + + io_cmds = llist_reverse_order(io_cmds); + llist_for_each_entry_safe(data, tmp, io_cmds, node) + __ublk_rq_task_work(blk_mq_rq_from_pdu(data)); +} + +static inline void ublk_abort_io_cmds(struct ublk_queue *ubq) +{ + struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); + struct ublk_rq_data *data, *tmp; + + llist_for_each_entry_safe(data, tmp, io_cmds, node) + __ublk_abort_rq(ubq, blk_mq_rq_from_pdu(data)); +} + static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) { struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct ublk_queue *ubq = pdu->ubq; - struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); - struct ublk_rq_data *data; - llist_for_each_entry(data, io_cmds, node) - __ublk_rq_task_work(blk_mq_rq_from_pdu(data)); + ublk_forward_io_cmds(ubq); } static void ublk_rq_task_work_fn(struct callback_head *work) @@ -778,14 +792,20 @@ static void ublk_rq_task_work_fn(struct callback_head *work) struct ublk_rq_data *data = container_of(work, struct ublk_rq_data, work); struct request *req = blk_mq_rq_from_pdu(data); + struct ublk_queue *ubq = req->mq_hctx->driver_data; - __ublk_rq_task_work(req); + ublk_forward_io_cmds(ubq); } -static void ublk_submit_cmd(struct ublk_queue *ubq, const struct request *rq) +static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) { - struct ublk_io *io = &ubq->ios[rq->tag]; + struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); + struct ublk_io *io; + if (!llist_add(&data->node, &ubq->io_cmds)) + return; + + io = &ubq->ios[rq->tag]; /* * If the check pass, we know that this is a re-issued request aborted * previously in monitor_work because the ubq_daemon(cmd's task) is @@ -799,11 +819,11 @@ static void ublk_submit_cmd(struct ublk_queue *ubq, const struct request *rq) * guarantees that here is a re-issued request aborted previously. */ if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) { - struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); - struct ublk_rq_data *data; - - llist_for_each_entry(data, io_cmds, node) - __ublk_abort_rq(ubq, blk_mq_rq_from_pdu(data)); + ublk_abort_io_cmds(ubq); + } else if (ublk_can_use_task_work(ubq)) { + if (task_work_add(ubq->ubq_daemon, &data->work, + TWA_SIGNAL_NO_IPI)) + ublk_abort_io_cmds(ubq); } else { struct io_uring_cmd *cmd = io->cmd; struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); @@ -813,23 +833,6 @@ static void ublk_submit_cmd(struct ublk_queue *ubq, const struct request *rq) } } -static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq, - bool last) -{ - struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); - - if (ublk_can_use_task_work(ubq)) { - enum task_work_notify_mode notify_mode = last ? - TWA_SIGNAL_NO_IPI : TWA_NONE; - - if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode)) - __ublk_abort_rq(ubq, rq); - } else { - if (llist_add(&data->node, &ubq->io_cmds)) - ublk_submit_cmd(ubq, rq); - } -} - static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -861,19 +864,11 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } - ublk_queue_cmd(ubq, rq, bd->last); + ublk_queue_cmd(ubq, rq); return BLK_STS_OK; } -static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx) -{ - struct ublk_queue *ubq = hctx->driver_data; - - if (ublk_can_use_task_work(ubq)) - __set_notify_signal(ubq->ubq_daemon); -} - static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, unsigned int hctx_idx) { @@ -895,7 +890,6 @@ static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req, static const struct blk_mq_ops ublk_mq_ops = { .queue_rq = ublk_queue_rq, - .commit_rqs = ublk_commit_rqs, .init_hctx = ublk_init_hctx, .init_request = ublk_init_rq, }; @@ -1195,7 +1189,7 @@ static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id, struct ublk_queue *ubq = ublk_get_queue(ub, q_id); struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag); - ublk_queue_cmd(ubq, req, true); + ublk_queue_cmd(ubq, req); } static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) -- Gitee From 13853779e26917d0db7c8fd9ac8526497cb9d61b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 6 Jan 2023 12:17:06 +0800 Subject: [PATCH 07/63] ublk_drv: remove nr_aborted_queues from ublk_device ANBZ: #32233 commit ed878d1c1c641c4a6bd366658fc8e6bc842b80d1 upstream. No one uses 'nr_aborted_queues' any more, so remove it. Reviewed-by: ZiyangZhang Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230106041711.914434-2-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 42c4a537ca1a..d6cc91f3b0a0 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -161,7 +161,6 @@ struct ublk_device { struct completion completion; unsigned int nr_queues_ready; - atomic_t nr_aborted_queues; /* * Our ubq->daemon may be killed without any notification, so -- Gitee From 8c058411baf7eabd7d06eccba70105096a75f07d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 6 Jan 2023 12:17:07 +0800 Subject: [PATCH 08/63] ublk_drv: don't probe partitions if the ubq daemon isn't trusted ANBZ: #32233 commit 73a166d9749230d598320fdae3b687cdc0e2e205 upstream. If any ubq daemon is unprivileged, the ublk char device is allowed for unprivileged user actually, and we can't trust the current user, so not probe partitions. Fixes: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver") Reviewed-by: ZiyangZhang Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230106041711.914434-3-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index d6cc91f3b0a0..601cbd05ad75 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -161,6 +161,7 @@ struct ublk_device { struct completion completion; unsigned int nr_queues_ready; + unsigned int nr_privileged_daemon; /* * Our ubq->daemon may be killed without any notification, so @@ -1176,6 +1177,9 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) ubq->ubq_daemon = current; get_task_struct(ubq->ubq_daemon); ub->nr_queues_ready++; + + if (capable(CAP_SYS_ADMIN)) + ub->nr_privileged_daemon++; } if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) complete_all(&ub->completion); @@ -1547,6 +1551,10 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd) goto out_cleanup_queue; } + /* don't probe partitions if any one ubq daemon is un-trusted */ + if (ub->nr_privileged_daemon != ub->nr_queues_ready) + disk->flags |= GENHD_FL_NO_PART_SCAN; + get_device(&ub->cdev_dev); add_disk(ub->ub_disk); set_bit(UB_STATE_USED, &ub->state); @@ -1944,6 +1952,7 @@ static int ublk_ctrl_start_recovery(struct io_uring_cmd *cmd) /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */ ub->mm = NULL; ub->nr_queues_ready = 0; + ub->nr_privileged_daemon = 0; init_completion(&ub->completion); ret = 0; out_unlock: -- Gitee From c27dc179fb5aa8930e4ee6a30f0e6cda63385348 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 6 Jan 2023 12:17:08 +0800 Subject: [PATCH 09/63] ublk_drv: move ublk_get_device_from_id into ublk_ctrl_uring_cmd ANBZ: #32233 commit bfbcef036396a73fbf4b3fee385cc670159df5ad upstream. It is annoying for each control command handler to get/put ublk device and deal with failure. Control command handler is simplified a lot by moving ublk_get_device_from_id into ublk_ctrl_uring_cmd(). Reviewed-by: ZiyangZhang Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230106041711.914434-4-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 136 ++++++++++++++------------------------- 1 file changed, 48 insertions(+), 88 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 601cbd05ad75..fbc3a180d4a3 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1500,21 +1500,16 @@ static struct ublk_device *ublk_get_device_from_id(int idx) return ub; } -static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd) +static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; int ublksrv_pid = (int)header->data[0]; - struct ublk_device *ub; struct gendisk *disk; int ret = -EINVAL; if (ublksrv_pid <= 0) return -EINVAL; - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - wait_for_completion_interruptible(&ub->completion); schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD); @@ -1564,20 +1559,19 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd) blk_cleanup_queue(ub->ub_queue); out_unlock: mutex_unlock(&ub->mutex); - ublk_put_device(ub); return ret; } -static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd) +static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub, + struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; void __user *argp = (void __user *)(unsigned long)header->addr; - struct ublk_device *ub; cpumask_var_t cpumask; unsigned long queue; unsigned int retlen; unsigned int i; - int ret = -EINVAL; + int ret; if (header->len * BITS_PER_BYTE < nr_cpu_ids) return -EINVAL; @@ -1586,17 +1580,12 @@ static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd) if (!header->addr) return -EINVAL; - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - queue = header->data[0]; if (queue >= ub->dev_info.nr_hw_queues) - goto out_put_device; + return -EINVAL; - ret = -ENOMEM; if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) - goto out_put_device; + return -ENOMEM; for_each_possible_cpu(i) { if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue) @@ -1614,8 +1603,6 @@ static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd) ret = 0; out_free_cpumask: free_cpumask_var(cpumask); -out_put_device: - ublk_put_device(ub); return ret; } @@ -1738,30 +1725,27 @@ static inline bool ublk_idr_freed(int id) return ptr == NULL; } -static int ublk_ctrl_del_dev(int idx) +static int ublk_ctrl_del_dev(struct ublk_device **p_ub) { - struct ublk_device *ub; + struct ublk_device *ub = *p_ub; + int idx = ub->ub_number; int ret; ret = mutex_lock_killable(&ublk_ctl_mutex); if (ret) return ret; - ub = ublk_get_device_from_id(idx); - if (ub) { - ublk_remove(ub); - ublk_put_device(ub); - ret = 0; - } else { - ret = -ENODEV; - } + ublk_remove(ub); + + /* Mark the reference as consumed */ + *p_ub = NULL; + ublk_put_device(ub); /* * Wait until the idr is removed, then it can be reused after * DEL_DEV command is returned. */ - if (!ret) - wait_event(ublk_idr_wq, ublk_idr_freed(idx)); + wait_event(ublk_idr_wq, ublk_idr_freed(idx)); mutex_unlock(&ublk_ctl_mutex); return ret; @@ -1776,50 +1760,36 @@ static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) header->data[0], header->addr, header->len); } -static int ublk_ctrl_stop_dev(struct io_uring_cmd *cmd) +static int ublk_ctrl_stop_dev(struct ublk_device *ub) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; - struct ublk_device *ub; - - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - ublk_stop_dev(ub); cancel_work_sync(&ub->stop_work); cancel_work_sync(&ub->quiesce_work); - ublk_put_device(ub); return 0; } -static int ublk_ctrl_get_dev_info(struct io_uring_cmd *cmd) +static int ublk_ctrl_get_dev_info(struct ublk_device *ub, + struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; void __user *argp = (void __user *)(unsigned long)header->addr; - struct ublk_device *ub; - int ret = 0; if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr) return -EINVAL; - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info))) - ret = -EFAULT; - ublk_put_device(ub); + return -EFAULT; - return ret; + return 0; } -static int ublk_ctrl_get_params(struct io_uring_cmd *cmd) +static int ublk_ctrl_get_params(struct ublk_device *ub, + struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; void __user *argp = (void __user *)(unsigned long)header->addr; struct ublk_params_header ph; - struct ublk_device *ub; int ret; if (header->len <= sizeof(ph) || !header->addr) @@ -1834,10 +1804,6 @@ static int ublk_ctrl_get_params(struct io_uring_cmd *cmd) if (ph.len > sizeof(struct ublk_params)) ph.len = sizeof(struct ublk_params); - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - mutex_lock(&ub->mutex); if (copy_to_user(argp, &ub->params, ph.len)) ret = -EFAULT; @@ -1845,16 +1811,15 @@ static int ublk_ctrl_get_params(struct io_uring_cmd *cmd) ret = 0; mutex_unlock(&ub->mutex); - ublk_put_device(ub); return ret; } -static int ublk_ctrl_set_params(struct io_uring_cmd *cmd) +static int ublk_ctrl_set_params(struct ublk_device *ub, + struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; void __user *argp = (void __user *)(unsigned long)header->addr; struct ublk_params_header ph; - struct ublk_device *ub; int ret = -EFAULT; if (header->len <= sizeof(ph) || !header->addr) @@ -1869,10 +1834,6 @@ static int ublk_ctrl_set_params(struct io_uring_cmd *cmd) if (ph.len > sizeof(struct ublk_params)) ph.len = sizeof(struct ublk_params); - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - /* parameters can only be changed when device isn't live */ mutex_lock(&ub->mutex); if (ub->dev_info.state == UBLK_S_DEV_LIVE) { @@ -1885,7 +1846,6 @@ static int ublk_ctrl_set_params(struct io_uring_cmd *cmd) ret = ublk_validate_params(ub); } mutex_unlock(&ub->mutex); - ublk_put_device(ub); return ret; } @@ -1912,17 +1872,13 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) } } -static int ublk_ctrl_start_recovery(struct io_uring_cmd *cmd) +static int ublk_ctrl_start_recovery(struct ublk_device *ub, + struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; - struct ublk_device *ub; int ret = -EINVAL; int i; - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return ret; - mutex_lock(&ub->mutex); if (!ublk_can_use_recovery(ub)) goto out_unlock; @@ -1957,21 +1913,16 @@ static int ublk_ctrl_start_recovery(struct io_uring_cmd *cmd) ret = 0; out_unlock: mutex_unlock(&ub->mutex); - ublk_put_device(ub); return ret; } -static int ublk_ctrl_end_recovery(struct io_uring_cmd *cmd) +static int ublk_ctrl_end_recovery(struct ublk_device *ub, + struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; int ublksrv_pid = (int)header->data[0]; - struct ublk_device *ub; int ret = -EINVAL; - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return ret; - pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n", __func__, ub->dev_info.nr_hw_queues, header->dev_id); /* wait until new ubq_daemon sending all FETCH_REQ */ @@ -1999,7 +1950,6 @@ static int ublk_ctrl_end_recovery(struct io_uring_cmd *cmd) ret = 0; out_unlock: mutex_unlock(&ub->mutex); - ublk_put_device(ub); return ret; } @@ -2007,6 +1957,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + struct ublk_device *ub = NULL; int ret = -EINVAL; if (issue_flags & IO_URING_F_NONBLOCK) @@ -2021,41 +1972,50 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, if (!capable(CAP_SYS_ADMIN)) goto out; - ret = -ENODEV; + if (cmd->cmd_op != UBLK_CMD_ADD_DEV) { + ret = -ENODEV; + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + goto out; + } + switch (cmd->cmd_op) { case UBLK_CMD_START_DEV: - ret = ublk_ctrl_start_dev(cmd); + ret = ublk_ctrl_start_dev(ub, cmd); break; case UBLK_CMD_STOP_DEV: - ret = ublk_ctrl_stop_dev(cmd); + ret = ublk_ctrl_stop_dev(ub); break; case UBLK_CMD_GET_DEV_INFO: - ret = ublk_ctrl_get_dev_info(cmd); + ret = ublk_ctrl_get_dev_info(ub, cmd); break; case UBLK_CMD_ADD_DEV: ret = ublk_ctrl_add_dev(cmd); break; case UBLK_CMD_DEL_DEV: - ret = ublk_ctrl_del_dev(header->dev_id); + ret = ublk_ctrl_del_dev(&ub); break; case UBLK_CMD_GET_QUEUE_AFFINITY: - ret = ublk_ctrl_get_queue_affinity(cmd); + ret = ublk_ctrl_get_queue_affinity(ub, cmd); break; case UBLK_CMD_GET_PARAMS: - ret = ublk_ctrl_get_params(cmd); + ret = ublk_ctrl_get_params(ub, cmd); break; case UBLK_CMD_SET_PARAMS: - ret = ublk_ctrl_set_params(cmd); + ret = ublk_ctrl_set_params(ub, cmd); break; case UBLK_CMD_START_USER_RECOVERY: - ret = ublk_ctrl_start_recovery(cmd); + ret = ublk_ctrl_start_recovery(ub, cmd); break; case UBLK_CMD_END_USER_RECOVERY: - ret = ublk_ctrl_end_recovery(cmd); + ret = ublk_ctrl_end_recovery(ub, cmd); break; default: + ret = -ENOTSUPP; break; } + if (ub) + ublk_put_device(ub); out: io_uring_cmd_done(cmd, ret, 0); pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n", -- Gitee From cfff2027880af1e094fc2880f51ce93ac865be94 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 6 Jan 2023 12:17:09 +0800 Subject: [PATCH 10/63] ublk_drv: add device parameter UBLK_PARAM_TYPE_DEVT ANBZ: #32233 commit abb864d380854b5427b6b070beb2ebc291ce4d1e upstream. Userspace side only knows device ID, but the associated path of ublkc* and ublkb* could be changed by udev, and that depends on userspace's policy, so add parameter of UBLK_PARAM_TYPE_DEVT for retrieving major/minor of the ublkc* and ublkb*, then user may figure out major/minor of the ublk disks he/she owns. With major/minor, it is easy to find the device node path. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230106041711.914434-5-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 24 +++++++++++++++++++++++- include/uapi/linux/ublk_cmd.h | 13 +++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index fbc3a180d4a3..9db9c655169b 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -55,7 +55,8 @@ | UBLK_F_USER_RECOVERY_REISSUE) /* All UBLK_PARAM_TYPE_* should be included here */ -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) +#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \ + UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT) struct ublk_rq_data { struct llist_node node; @@ -257,6 +258,10 @@ static int ublk_validate_params(const struct ublk_device *ub) return -EINVAL; } + /* dev_t is read-only */ + if (ub->params.types & UBLK_PARAM_TYPE_DEVT) + return -EINVAL; + return 0; } @@ -1784,6 +1789,22 @@ static int ublk_ctrl_get_dev_info(struct ublk_device *ub, return 0; } +/* TYPE_DEVT is readonly, so fill it up before returning to userspace */ +static void ublk_ctrl_fill_params_devt(struct ublk_device *ub) +{ + ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt); + ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt); + + if (ub->ub_disk) { + ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk)); + ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk)); + } else { + ub->params.devt.disk_major = 0; + ub->params.devt.disk_minor = 0; + } + ub->params.types |= UBLK_PARAM_TYPE_DEVT; +} + static int ublk_ctrl_get_params(struct ublk_device *ub, struct io_uring_cmd *cmd) { @@ -1805,6 +1826,7 @@ static int ublk_ctrl_get_params(struct ublk_device *ub, ph.len = sizeof(struct ublk_params); mutex_lock(&ub->mutex); + ublk_ctrl_fill_params_devt(ub); if (copy_to_user(argp, &ub->params, ph.len)) ret = -EFAULT; else diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 8f88e3a29998..4e38b9aa0293 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -214,6 +214,17 @@ struct ublk_param_discard { __u16 reserved0; }; +/* + * read-only, can't set via UBLK_CMD_SET_PARAMS, disk_devt is available + * after device is started + */ +struct ublk_param_devt { + __u32 char_major; + __u32 char_minor; + __u32 disk_major; + __u32 disk_minor; +}; + struct ublk_params { /* * Total length of parameters, userspace has to set 'len' for both @@ -224,10 +235,12 @@ struct ublk_params { __u32 len; #define UBLK_PARAM_TYPE_BASIC (1 << 0) #define UBLK_PARAM_TYPE_DISCARD (1 << 1) +#define UBLK_PARAM_TYPE_DEVT (1 << 2) __u32 types; /* types of parameter included */ struct ublk_param_basic basic; struct ublk_param_discard discard; + struct ublk_param_devt devt; }; #endif -- Gitee From df2c39390bba7bf3bcacc3b3406246074d145fb5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 6 Jan 2023 12:17:10 +0800 Subject: [PATCH 11/63] ublk_drv: add module parameter of ublks_max for limiting max allowed ublk dev ANBZ: #32233 commit 403ebc877832752da9fc851284fa00ceca7b2fae upstream. Prepare for supporting unprivileged ublk device by limiting max number ublk devices added. Otherwise too many ublk devices could be added by un-trusted user, which can be thought as one DoS. Reviewed-by: ZiyangZhang Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230106041711.914434-6-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 9db9c655169b..144a159b8141 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -188,6 +188,15 @@ static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */ static DEFINE_MUTEX(ublk_ctl_mutex); +/* + * Max ublk devices allowed to add + * + * It can be extended to one per-user limit in future or even controlled + * by cgroup. + */ +static unsigned int ublks_max = 64; +static unsigned int ublks_added; /* protected by ublk_ctl_mutex */ + static struct miscdevice ublk_misc; static void ublk_dev_param_basic_apply(struct ublk_device *ub) @@ -1444,6 +1453,8 @@ static int ublk_add_chdev(struct ublk_device *ub) ret = cdev_device_add(&ub->cdev, dev); if (ret) goto fail; + + ublks_added++; return 0; fail: put_device(dev); @@ -1487,6 +1498,7 @@ static void ublk_remove(struct ublk_device *ub) cancel_work_sync(&ub->quiesce_work); cdev_device_del(&ub->cdev, &ub->cdev_dev); put_device(&ub->cdev_dev); + ublks_added--; } static struct ublk_device *ublk_get_device_from_id(int idx) @@ -1647,6 +1659,10 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) if (ret) return ret; + ret = -EACCES; + if (ublks_added >= ublks_max) + goto out_unlock; + ret = -ENOMEM; ub = kzalloc(sizeof(*ub), GFP_KERNEL); if (!ub) @@ -2104,5 +2120,8 @@ static void __exit ublk_exit(void) module_init(ublk_init); module_exit(ublk_exit); +module_param(ublks_max, int, 0444); +MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to add(default: 64)"); + MODULE_AUTHOR("Ming Lei "); MODULE_LICENSE("GPL"); -- Gitee From 6409ca62d0e076b026c8f341df57ca3e598cc4de Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 6 Jan 2023 12:17:11 +0800 Subject: [PATCH 12/63] ublk_drv: add mechanism for supporting unprivileged ublk device ANBZ: #32233 commit 4093cb5a06343ea3936ae46664d132c82576b153 upstream. unprivileged ublk device is helpful for container use case, such as: ublk device created in one unprivileged container can be controlled and accessed by this container only. Implement this feature by adding flag of UBLK_F_UNPRIVILEGED_DEV, and if this flag isn't set, any control command has been run from privileged user. Otherwise, any control command can be sent from any unprivileged user, but the user has to be permitted to access the ublk char device to be controlled. In case of UBLK_F_UNPRIVILEGED_DEV: 1) for command UBLK_CMD_ADD_DEV, it is always allowed, and user needs to provide owner's uid/gid in this command, so that udev can set correct ownership for the created ublk device, since the device owner uid/gid can be queried via command of UBLK_CMD_GET_DEV_INFO. 2) for other control commands, they can only be run successfully if the current user is allowed to access the specified ublk char device, for running the permission check, path of the ublk char device has to be provided by these commands. Also add one control of command UBLK_CMD_GET_DEV_INFO2 which always include the char dev path in payload since userspace may not have knowledge if this device is created in unprivileged mode. For applying this mechanism, system administrator needs to take the following policies: 1) chmod 0666 /dev/ublk-control 2) change ownership of ublkcN & ublkbN - chown owner_uid:owner_gid /dev/ublkcN - chown owner_uid:owner_gid /dev/ublkbN Both can be done via one simple udev rule. Userspace: https://github.com/ming1/ubdsrv/tree/unprivileged-ublk 'ublk add -t $TYPE --un_privileged=1' is for creating one un-privileged ublk device if the user is un-privileged. Link: https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/ Suggested-by: Stefan Hajnoczi Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230106041711.914434-7-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- Documentation/block/ublk.rst | 49 +++++++++-- drivers/block/ublk_drv.c | 151 ++++++++++++++++++++++++++++++++-- include/uapi/linux/ublk_cmd.h | 36 +++++++- 3 files changed, 219 insertions(+), 17 deletions(-) diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index ba45c46cc0da..2916fcf3ab44 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -144,6 +144,37 @@ managing and controlling ublk devices with help of several control commands: For retrieving device info via ``ublksrv_ctrl_dev_info``. It is the server's responsibility to save IO target specific info in userspace. +- ``UBLK_CMD_GET_DEV_INFO2`` + Same purpose with ``UBLK_CMD_GET_DEV_INFO``, but ublk server has to + provide path of the char device of ``/dev/ublkc*`` for kernel to run + permission check, and this command is added for supporting unprivileged + ublk device, and introduced with ``UBLK_F_UNPRIVILEGED_DEV`` together. + Only the user owning the requested device can retrieve the device info. + + How to deal with userspace/kernel compatibility: + + 1) if kernel is capable of handling ``UBLK_F_UNPRIVILEGED_DEV`` + If ublk server supports ``UBLK_F_UNPRIVILEGED_DEV``: + ublk server should send ``UBLK_CMD_GET_DEV_INFO2``, given anytime + unprivileged application needs to query devices the current user owns, + when the application has no idea if ``UBLK_F_UNPRIVILEGED_DEV`` is set + given the capability info is stateless, and application should always + retrieve it via ``UBLK_CMD_GET_DEV_INFO2`` + + If ublk server doesn't support ``UBLK_F_UNPRIVILEGED_DEV``: + ``UBLK_CMD_GET_DEV_INFO`` is always sent to kernel, and the feature of + UBLK_F_UNPRIVILEGED_DEV isn't available for user + + 2) if kernel isn't capable of handling ``UBLK_F_UNPRIVILEGED_DEV`` + If ublk server supports ``UBLK_F_UNPRIVILEGED_DEV``: + ``UBLK_CMD_GET_DEV_INFO2`` is tried first, and will be failed, then + ``UBLK_CMD_GET_DEV_INFO`` needs to be retried given + ``UBLK_F_UNPRIVILEGED_DEV`` can't be set + + If ublk server doesn't support ``UBLK_F_UNPRIVILEGED_DEV``: + ``UBLK_CMD_GET_DEV_INFO`` is always sent to kernel, and the feature of + ``UBLK_F_UNPRIVILEGED_DEV`` isn't available for user + - ``UBLK_CMD_START_USER_RECOVERY`` This command is valid if ``UBLK_F_USER_RECOVERY`` feature is enabled. This @@ -180,6 +211,15 @@ managing and controlling ublk devices with help of several control commands: double-write since the driver may issue the same I/O request twice. It might be useful to a read-only FS or a VM backend. +Unprivileged ublk device is supported by passing ``UBLK_F_UNPRIVILEGED_DEV``. +Once the flag is set, all control commands can be sent by unprivileged +user. Except for command of ``UBLK_CMD_ADD_DEV``, permission check on +the specified char device(``/dev/ublkc*``) is done for all other control +commands by ublk driver, for doing that, path of the char device has to +be provided in these commands' payload from ublk server. With this way, +ublk device becomes container-ware, and device created in one container +can be controlled/accessed just inside this container. + Data plane ---------- @@ -254,15 +294,6 @@ with specified IO tag in the command data: Future development ================== -Container-aware ublk deivice ----------------------------- - -ublk driver doesn't handle any IO logic. Its function is well defined -for now and very limited userspace interfaces are needed, which is also -well defined too. It is possible to make ublk devices container-aware block -devices in future as Stefan Hajnoczi suggested [#stefan]_, by removing -ADMIN privilege. - Zero copy --------- diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 144a159b8141..acff21f9cc0a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #define UBLK_MINORS (1U << MINORBITS) @@ -52,7 +53,8 @@ | UBLK_F_URING_CMD_COMP_IN_TASK \ | UBLK_F_NEED_GET_DATA \ | UBLK_F_USER_RECOVERY \ - | UBLK_F_USER_RECOVERY_REISSUE) + | UBLK_F_USER_RECOVERY_REISSUE \ + | UBLK_F_UNPRIVILEGED_DEV) /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \ @@ -1623,6 +1625,17 @@ static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub, return ret; } +static void ublk_store_owner_uid_gid(struct ublksrv_ctrl_dev_info *info) +{ + kuid_t uid; + kgid_t gid; + + current_uid_gid(&uid, &gid); + + info->owner_uid = from_kuid(&init_user_ns, uid); + info->owner_gid = from_kgid(&init_user_ns, gid); +} + static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info) { pr_devel("%s: dev id %d flags %llx\n", __func__, @@ -1646,15 +1659,26 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) __func__, header->queue_id); return -EINVAL; } + if (copy_from_user(&info, argp, sizeof(info))) return -EFAULT; - ublk_dump_dev_info(&info); + + if (capable(CAP_SYS_ADMIN)) + info.flags &= ~UBLK_F_UNPRIVILEGED_DEV; + else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV)) + return -EPERM; + + /* the created device is always owned by current user */ + ublk_store_owner_uid_gid(&info); + if (header->dev_id != info.dev_id) { pr_warn("%s: dev id not match %u %u\n", __func__, header->dev_id, info.dev_id); return -EINVAL; } + ublk_dump_dev_info(&info); + ret = mutex_lock_killable(&ublk_ctl_mutex); if (ret) return ret; @@ -1991,6 +2015,114 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, return ret; } +/* + * All control commands are sent via /dev/ublk-control, so we have to check + * the destination device's permission + */ +static int ublk_char_dev_permission(struct ublk_device *ub, + const char *dev_path, int mask) +{ + int err; + struct path path; + struct kstat stat; + + err = kern_path(dev_path, LOOKUP_FOLLOW, &path); + if (err) + return err; + + err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); + if (err) + goto exit; + + err = -EPERM; + if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode)) + goto exit; + + err = inode_permission(d_backing_inode(path.dentry), mask); +exit: + path_put(&path); + return err; +} + +static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, + struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; + void __user *argp = (void __user *)(unsigned long)header->addr; + char *dev_path = NULL; + int ret = 0; + int mask; + + if (!unprivileged) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * The new added command of UBLK_CMD_GET_DEV_INFO2 includes + * char_dev_path in payload too, since userspace may not + * know if the specified device is created as unprivileged + * mode. + */ + if (cmd->cmd_op != UBLK_CMD_GET_DEV_INFO2) + return 0; + } + + /* + * User has to provide the char device path for unprivileged ublk + * + * header->addr always points to the dev path buffer, and + * header->dev_path_len records length of dev path buffer. + */ + if (!header->dev_path_len || header->dev_path_len > PATH_MAX) + return -EINVAL; + + if (header->len < header->dev_path_len) + return -EINVAL; + + dev_path = kmalloc(header->dev_path_len + 1, GFP_KERNEL); + if (!dev_path) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(dev_path, argp, header->dev_path_len)) + goto exit; + dev_path[header->dev_path_len] = 0; + + ret = -EINVAL; + switch (cmd->cmd_op) { + case UBLK_CMD_GET_DEV_INFO: + case UBLK_CMD_GET_DEV_INFO2: + case UBLK_CMD_GET_QUEUE_AFFINITY: + case UBLK_CMD_GET_PARAMS: + mask = MAY_READ; + break; + case UBLK_CMD_START_DEV: + case UBLK_CMD_STOP_DEV: + case UBLK_CMD_ADD_DEV: + case UBLK_CMD_DEL_DEV: + case UBLK_CMD_SET_PARAMS: + case UBLK_CMD_START_USER_RECOVERY: + case UBLK_CMD_END_USER_RECOVERY: + mask = MAY_READ | MAY_WRITE; + break; + default: + goto exit; + } + + ret = ublk_char_dev_permission(ub, dev_path, mask); + if (!ret) { + header->len -= header->dev_path_len; + header->addr += header->dev_path_len; + } + pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n", + __func__, ub->ub_number, cmd->cmd_op, + ub->dev_info.owner_uid, ub->dev_info.owner_gid, + dev_path, ret); +exit: + kfree(dev_path); + return ret; +} + static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -2006,17 +2138,21 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, if (!(issue_flags & IO_URING_F_SQE128)) goto out; - ret = -EPERM; - if (!capable(CAP_SYS_ADMIN)) - goto out; - if (cmd->cmd_op != UBLK_CMD_ADD_DEV) { ret = -ENODEV; ub = ublk_get_device_from_id(header->dev_id); if (!ub) goto out; + + ret = ublk_ctrl_uring_cmd_permission(ub, cmd); + } else { + /* ADD_DEV permission check is done in command handler */ + ret = 0; } + if (ret) + goto put_dev; + switch (cmd->cmd_op) { case UBLK_CMD_START_DEV: ret = ublk_ctrl_start_dev(ub, cmd); @@ -2025,6 +2161,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_ctrl_stop_dev(ub); break; case UBLK_CMD_GET_DEV_INFO: + case UBLK_CMD_GET_DEV_INFO2: ret = ublk_ctrl_get_dev_info(ub, cmd); break; case UBLK_CMD_ADD_DEV: @@ -2052,6 +2189,8 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, ret = -ENOTSUPP; break; } + + put_dev: if (ub) ublk_put_device(ub); out: diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 4e38b9aa0293..f6238ccc7800 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -19,6 +19,8 @@ #define UBLK_CMD_GET_PARAMS 0x09 #define UBLK_CMD_START_USER_RECOVERY 0x10 #define UBLK_CMD_END_USER_RECOVERY 0x11 +#define UBLK_CMD_GET_DEV_INFO2 0x12 + /* * IO commands, issued by ublk server, and handled by ublk driver. * @@ -79,6 +81,27 @@ #define UBLK_F_USER_RECOVERY_REISSUE (1UL << 4) +/* + * Unprivileged user can create /dev/ublkcN and /dev/ublkbN. + * + * /dev/ublk-control needs to be available for unprivileged user, and it + * can be done via udev rule to make all control commands available to + * unprivileged user. Except for the command of UBLK_CMD_ADD_DEV, all + * other commands are only allowed for the owner of the specified device. + * + * When userspace sends UBLK_CMD_ADD_DEV, the device pair's owner_uid and + * owner_gid are stored to ublksrv_ctrl_dev_info by kernel, so far only + * the current user's uid/gid is stored, that said owner of the created + * device is always the current user. + * + * We still need udev rule to apply OWNER/GROUP with the stored owner_uid + * and owner_gid. + * + * Then ublk server can be run as unprivileged user, and /dev/ublkbN can + * be accessed and managed by its owner represented by owner_uid/owner_gid. + */ +#define UBLK_F_UNPRIVILEGED_DEV (1UL << 5) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 @@ -98,7 +121,15 @@ struct ublksrv_ctrl_cmd { __u64 addr; /* inline data */ - __u64 data[2]; + __u64 data[1]; + + /* + * Used for UBLK_F_UNPRIVILEGED_DEV and UBLK_CMD_GET_DEV_INFO2 + * only, include null char + */ + __u16 dev_path_len; + __u16 pad; + __u32 reserved; }; struct ublksrv_ctrl_dev_info { @@ -118,7 +149,8 @@ struct ublksrv_ctrl_dev_info { /* For ublksrv internal use, invisible to ublk driver */ __u64 ublksrv_flags; - __u64 reserved0; + __u32 owner_uid; /* store by kernel */ + __u32 owner_gid; /* store by kernel */ __u64 reserved1; __u64 reserved2; }; -- Gitee From 80eb2108c6a06ea23528b9d92fc5787aad57910f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 18 Jan 2023 12:23:18 +0800 Subject: [PATCH 13/63] block: ublk: fix doc build warning ANBZ: #32233 commit 464544fb93fc7b5ac92b49c3ce60ab95a48aadf7 upstream. Fix the following warning: Documentation/block/ublk.rst:157: WARNING: Enumerated list ends without a blank line; unexpected unindent. Documentation/block/ublk.rst:171: WARNING: Enumerated list ends without a blank line; unexpected unindent. Fixes: 56f5160bc1b8 ("ublk_drv: add mechanism for supporting unprivileged ublk device") Reported-by: Stephen Rothwell Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230118042318.127900-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- Documentation/block/ublk.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index 2916fcf3ab44..1713b2890abb 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -154,7 +154,9 @@ managing and controlling ublk devices with help of several control commands: How to deal with userspace/kernel compatibility: 1) if kernel is capable of handling ``UBLK_F_UNPRIVILEGED_DEV`` + If ublk server supports ``UBLK_F_UNPRIVILEGED_DEV``: + ublk server should send ``UBLK_CMD_GET_DEV_INFO2``, given anytime unprivileged application needs to query devices the current user owns, when the application has no idea if ``UBLK_F_UNPRIVILEGED_DEV`` is set @@ -162,16 +164,20 @@ managing and controlling ublk devices with help of several control commands: retrieve it via ``UBLK_CMD_GET_DEV_INFO2`` If ublk server doesn't support ``UBLK_F_UNPRIVILEGED_DEV``: + ``UBLK_CMD_GET_DEV_INFO`` is always sent to kernel, and the feature of UBLK_F_UNPRIVILEGED_DEV isn't available for user 2) if kernel isn't capable of handling ``UBLK_F_UNPRIVILEGED_DEV`` + If ublk server supports ``UBLK_F_UNPRIVILEGED_DEV``: + ``UBLK_CMD_GET_DEV_INFO2`` is tried first, and will be failed, then ``UBLK_CMD_GET_DEV_INFO`` needs to be retried given ``UBLK_F_UNPRIVILEGED_DEV`` can't be set If ublk server doesn't support ``UBLK_F_UNPRIVILEGED_DEV``: + ``UBLK_CMD_GET_DEV_INFO`` is always sent to kernel, and the feature of ``UBLK_F_UNPRIVILEGED_DEV`` isn't available for user -- Gitee From c562b41886f40482319ab3c677a7fc90434182f7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 31 Jan 2023 12:04:46 +0800 Subject: [PATCH 14/63] ublk_drv: only allow owner to open unprivileged disk ANBZ: #32233 commit 48a9051980242128f844defe44c7e83217f79872 upstream. Owner of one unprivileged ublk device could be one evil user, which can grant this disk's privilege to other users deliberately, and this way could be like making one trap and waiting for other users to be caught. So only owner to open unprivileged disk even though the owner grants disk privilege to other user. This way is reasonable too given anyone can create ublk disk, and no need other's grant. Reported-by: Stefan Hajnoczi Fixes: 4093cb5a0634 ("ublk_drv: add mechanism for supporting unprivileged ublk device") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230131040446.214583-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 57 +++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index acff21f9cc0a..fbe560c7608a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -373,9 +373,51 @@ static void ublk_free_disk(struct gendisk *disk) put_device(&ub->cdev_dev); } +static void ublk_store_owner_uid_gid(unsigned int *owner_uid, + unsigned int *owner_gid) +{ + kuid_t uid; + kgid_t gid; + + current_uid_gid(&uid, &gid); + + *owner_uid = from_kuid(&init_user_ns, uid); + *owner_gid = from_kgid(&init_user_ns, gid); +} + +static int ublk_open(struct block_device *bdev, fmode_t mode) +{ + struct ublk_device *ub = bdev->bd_disk->private_data; + + if (capable(CAP_SYS_ADMIN)) + return 0; + + /* + * If it is one unprivileged device, only owner can open + * the disk. Otherwise it could be one trap made by one + * evil user who grants this disk's privileges to other + * users deliberately. + * + * This way is reasonable too given anyone can create + * unprivileged device, and no need other's grant. + */ + if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) { + unsigned int curr_uid, curr_gid; + + ublk_store_owner_uid_gid(&curr_uid, &curr_gid); + + if (curr_uid != ub->dev_info.owner_uid || curr_gid != + ub->dev_info.owner_gid) + return -EPERM; + } + + return 0; +} + static const struct block_device_operations ub_fops = { .owner = THIS_MODULE, - .free_disk = ublk_free_disk, + .open = ublk_open, + .free_disk = ublk_free_disk, }; #define UBLK_MAX_PIN_PAGES 32 @@ -1625,17 +1667,6 @@ static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub, return ret; } -static void ublk_store_owner_uid_gid(struct ublksrv_ctrl_dev_info *info) -{ - kuid_t uid; - kgid_t gid; - - current_uid_gid(&uid, &gid); - - info->owner_uid = from_kuid(&init_user_ns, uid); - info->owner_gid = from_kgid(&init_user_ns, gid); -} - static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info) { pr_devel("%s: dev id %d flags %llx\n", __func__, @@ -1669,7 +1700,7 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) return -EPERM; /* the created device is always owned by current user */ - ublk_store_owner_uid_gid(&info); + ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid); if (header->dev_id != info.dev_id) { pr_warn("%s: dev id not match %u %u\n", -- Gitee From c25f4501e76f58acb935875b0f940913139d7490 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 7 Feb 2023 23:07:00 +0800 Subject: [PATCH 15/63] block: ublk: improve handling device deletion ANBZ: #32233 commit 0abe39dec065133e3f92a52219c3728fe7d7617f upstream. Inside ublk_ctrl_del_dev(), when the device is removed, we wait until the device number is freed with holding global lock of ublk_ctl_mutex, this way isn't friendly from user viewpoint: 1) if device is in-use, the current delete command hangs in ublk_ctrl_del_dev(), and user can't break from the handling because wait_event() is used 2) global lock is held, so any new device can't be added and other old devices can't be removed. Improve the deleting handling by the following way, suggested by Nadav: 1) wait without holding the global lock 2) replace wait_event() with wait_event_interruptible() Reported-by: Nadav Amit Suggested-by: Nadav Amit Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230207150700.545530-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index fbe560c7608a..73c39bdd2ac3 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -152,6 +152,7 @@ struct ublk_device { #define UB_STATE_OPEN 0 #define UB_STATE_USED 1 +#define UB_STATE_DELETED 2 unsigned long state; int ub_number; @@ -1811,20 +1812,33 @@ static int ublk_ctrl_del_dev(struct ublk_device **p_ub) if (ret) return ret; - ublk_remove(ub); + if (!test_bit(UB_STATE_DELETED, &ub->state)) { + ublk_remove(ub); + set_bit(UB_STATE_DELETED, &ub->state); + } /* Mark the reference as consumed */ *p_ub = NULL; ublk_put_device(ub); + mutex_unlock(&ublk_ctl_mutex); /* * Wait until the idr is removed, then it can be reused after * DEL_DEV command is returned. + * + * If we returns because of user interrupt, future delete command + * may come: + * + * - the device number isn't freed, this device won't or needn't + * be deleted again, since UB_STATE_DELETED is set, and device + * will be released after the last reference is dropped + * + * - the device number is freed already, we will not find this + * device via ublk_get_device_from_id() */ - wait_event(ublk_idr_wq, ublk_idr_freed(idx)); - mutex_unlock(&ublk_ctl_mutex); + wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)); - return ret; + return 0; } static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) -- Gitee From f7e54a399ef61b64c9ac6cdc746a64a4e4533fa4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 20 Feb 2023 12:14:13 +0800 Subject: [PATCH 16/63] ublk: remove check IO_URING_F_SQE128 in ublk_ch_uring_cmd ANBZ: #32233 commit 9c7c4bc986932218fd0df9d2a100509772028fb1 upstream. sizeof(struct ublksrv_io_cmd) is 16bytes, which can be held in 64byte SQE, so not necessary to check IO_URING_F_SQE128. With this change, we get chance to save half SQ ring memory. Fixed: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230220041413.1524335-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 73c39bdd2ac3..e1ac8619318d 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1269,9 +1269,6 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) __func__, cmd->cmd_op, ub_cmd->q_id, tag, ub_cmd->result); - if (!(issue_flags & IO_URING_F_SQE128)) - goto out; - if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues) goto out; -- Gitee From 3aacfd196d5a17753903a62c230aa9ccb61a53df Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 18 Mar 2023 22:12:31 +0800 Subject: [PATCH 17/63] block: ublk_drv: mark device as LIVE before adding disk ANBZ: #32233 commit 4985e7b2c002eb4c5c794a1d3acd91b82c89a0fd upstream. IO can be started before add_disk() returns, such as reading parititon table, then the monitor work should work for making forward progress. So mark device as LIVE before adding disk, meantime change to DEAD if add_disk() fails. Fixed: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver") Reviewed-by: Ziyang Zhang Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230318141231.55562-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index e1ac8619318d..4c2695613c42 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1610,9 +1610,9 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) disk->flags |= GENHD_FL_NO_PART_SCAN; get_device(&ub->cdev_dev); + ub->dev_info.state = UBLK_S_DEV_LIVE; add_disk(ub->ub_disk); set_bit(UB_STATE_USED, &ub->state); - ub->dev_info.state = UBLK_S_DEV_LIVE; out_cleanup_queue: if (ret) blk_cleanup_queue(ub->ub_queue); -- Gitee From c590a54f02702b3abdea6c892a8f31bc184998ba Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 30 Mar 2023 19:36:20 +0800 Subject: [PATCH 18/63] block: ublk_drv: add common exit handling ANBZ: #32233 commit 903f8aeea9fd1b97fba4ab805ddd639f57f117f8 upstream. Simplify exit handling a bit, and prepare for supporting fused command. Reviewed-by: Ziyang Zhang Signed-off-by: Ming Lei Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 4c2695613c42..eede626a7ba7 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -651,14 +651,15 @@ static void ublk_complete_rq(struct request *req) struct ublk_queue *ubq = req->mq_hctx->driver_data; struct ublk_io *io = &ubq->ios[req->tag]; unsigned int unmapped_bytes; + blk_status_t res = BLK_STS_OK; /* failed read IO if nothing is read */ if (!io->res && req_op(req) == REQ_OP_READ) io->res = -EIO; if (io->res < 0) { - blk_mq_end_request(req, errno_to_blk_status(io->res)); - return; + res = errno_to_blk_status(io->res); + goto exit; } /* @@ -667,10 +668,8 @@ static void ublk_complete_rq(struct request *req) * * Both the two needn't unmap. */ - if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) { - blk_mq_end_request(req, BLK_STS_OK); - return; - } + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) + goto exit; /* for READ request, writing data in iod->addr to rq buffers */ unmapped_bytes = ublk_unmap_io(ubq, req, io); @@ -687,6 +686,10 @@ static void ublk_complete_rq(struct request *req) blk_mq_requeue_request(req, true); else __blk_mq_end_request(req, BLK_STS_OK); + + return; +exit: + blk_mq_end_request(req, res); } /* -- Gitee From 39426d08fdf0cf048c950a176f2393ec8c619521 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 30 Mar 2023 19:36:21 +0800 Subject: [PATCH 19/63] block: ublk_drv: don't consider flush request in map/unmap io ANBZ: #32233 commit 23ef8220f287abe5bf741ddfc278e7359742d3b1 upstream. There isn't data in request of REQ_OP_FLUSH always, so don't consider it in both ublk_map_io() and ublk_unmap_io(). Reviewed-by: Ziyang Zhang Signed-off-by: Ming Lei Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index eede626a7ba7..d8eff420bcf9 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -525,15 +525,13 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, struct ublk_io *io) { const unsigned int rq_bytes = blk_rq_bytes(req); + /* * no zero copy, we delay copy WRITE request data into ublksrv * context and the big benefit is that pinning pages in current * context is pretty fast, see ublk_pin_user_pages */ - if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH) - return rq_bytes; - - if (ublk_rq_has_data(req)) { + if (ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE) { struct ublk_map_data data = { .ubq = ubq, .rq = req, @@ -768,9 +766,7 @@ static inline void __ublk_rq_task_work(struct request *req) return; } - if (ublk_need_get_data(ubq) && - (req_op(req) == REQ_OP_WRITE || - req_op(req) == REQ_OP_FLUSH)) { + if (ublk_need_get_data(ubq) && (req_op(req) == REQ_OP_WRITE)) { /* * We have not handled UBLK_IO_NEED_GET_DATA command yet, * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv -- Gitee From cc3ac0efda37691bff4aafe4984fabbda05f9dc9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 30 Mar 2023 19:36:22 +0800 Subject: [PATCH 20/63] block: ublk_drv: add two helpers to clean up map/unmap request ANBZ: #32233 commit 2f3af723447c35c16f3c6a1b4b317c61dc41d6c3 upstream. Add two helpers for checking if map/unmap is needed, since we may have passthrough request which needs map or unmap in future, such as for supporting report zones. Meantime don't mark ublk_copy_user_pages as inline since this function is a bit fat now. Reviewed-by: Ziyang Zhang Signed-off-by: Ming Lei Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index d8eff420bcf9..448c30526dbb 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -484,8 +484,7 @@ static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data, return done; } -static inline int ublk_copy_user_pages(struct ublk_map_data *data, - bool to_vm) +static int ublk_copy_user_pages(struct ublk_map_data *data, bool to_vm) { const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0; const unsigned long start_vm = data->io->addr; @@ -521,6 +520,16 @@ static inline int ublk_copy_user_pages(struct ublk_map_data *data, return done; } +static inline bool ublk_need_map_req(const struct request *req) +{ + return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE; +} + +static inline bool ublk_need_unmap_req(const struct request *req) +{ + return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ; +} + static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, struct ublk_io *io) { @@ -531,7 +540,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, * context and the big benefit is that pinning pages in current * context is pretty fast, see ublk_pin_user_pages */ - if (ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE) { + if (ublk_need_map_req(req)) { struct ublk_map_data data = { .ubq = ubq, .rq = req, @@ -552,7 +561,7 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, { const unsigned int rq_bytes = blk_rq_bytes(req); - if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) { + if (ublk_need_unmap_req(req)) { struct ublk_map_data data = { .ubq = ubq, .rq = req, @@ -766,7 +775,7 @@ static inline void __ublk_rq_task_work(struct request *req) return; } - if (ublk_need_get_data(ubq) && (req_op(req) == REQ_OP_WRITE)) { + if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) { /* * We have not handled UBLK_IO_NEED_GET_DATA command yet, * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv -- Gitee From fe7d15329b7c91c907417c6b49cb0254c23fe9dd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 30 Mar 2023 19:36:23 +0800 Subject: [PATCH 21/63] block: ublk_drv: clean up several helpers ANBZ: #32233 commit 96cf2f5404c8bc979628a2b495852d735a56c5b5 upstream. Convert the following pattern in several helpers if (Z) return true return false into: return Z; Reviewed-by: Ziyang Zhang Signed-off-by: Ming Lei Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 448c30526dbb..052f8e8d0f5a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -349,10 +349,8 @@ static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id) static inline bool ublk_queue_can_use_recovery_reissue( struct ublk_queue *ubq) { - if ((ubq->flags & UBLK_F_USER_RECOVERY) && - (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE)) - return true; - return false; + return (ubq->flags & UBLK_F_USER_RECOVERY) && + (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE); } static inline bool ublk_queue_can_use_recovery( -- Gitee From 0aaf11a36611906827311832dcdfaf488747a54c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 30 Mar 2023 19:36:24 +0800 Subject: [PATCH 22/63] block: ublk_drv: cleanup 'struct ublk_map_data' ANBZ: #32233 commit ae9f5ccea4c268a96763e51239b32d6b5172c18c upstream. 'struct ublk_map_data' is passed to ublk_copy_user_pages() for copying data between userspace buffer and request pages. Here what matters is userspace buffer address/len and 'struct request', so replace ->io field with user buffer address, and rename max_bytes as len. Meantime remove 'ubq' field from ublk_map_data, since it isn't used any more. Then code becomes more readable. Reviewed-by: Ziyang Zhang Signed-off-by: Ming Lei Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 052f8e8d0f5a..3e422389937f 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -422,10 +422,9 @@ static const struct block_device_operations ub_fops = { #define UBLK_MAX_PIN_PAGES 32 struct ublk_map_data { - const struct ublk_queue *ubq; const struct request *rq; - const struct ublk_io *io; - unsigned max_bytes; + unsigned long ubuf; + unsigned int len; }; struct ublk_io_iter { @@ -485,14 +484,14 @@ static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data, static int ublk_copy_user_pages(struct ublk_map_data *data, bool to_vm) { const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0; - const unsigned long start_vm = data->io->addr; + const unsigned long start_vm = data->ubuf; unsigned int done = 0; struct ublk_io_iter iter = { .pg_off = start_vm & (PAGE_SIZE - 1), .bio = data->rq->bio, .iter = data->rq->bio->bi_iter, }; - const unsigned int nr_pages = round_up(data->max_bytes + + const unsigned int nr_pages = round_up(data->len + (start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT; while (done < nr_pages) { @@ -505,13 +504,13 @@ static int ublk_copy_user_pages(struct ublk_map_data *data, bool to_vm) iter.pages); if (iter.nr_pages <= 0) return done == 0 ? iter.nr_pages : done; - len = ublk_copy_io_pages(&iter, data->max_bytes, to_vm); + len = ublk_copy_io_pages(&iter, data->len, to_vm); for (i = 0; i < iter.nr_pages; i++) { if (to_vm) set_page_dirty(iter.pages[i]); put_page(iter.pages[i]); } - data->max_bytes -= len; + data->len -= len; done += iter.nr_pages; } @@ -540,15 +539,14 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, */ if (ublk_need_map_req(req)) { struct ublk_map_data data = { - .ubq = ubq, .rq = req, - .io = io, - .max_bytes = rq_bytes, + .ubuf = io->addr, + .len = rq_bytes, }; ublk_copy_user_pages(&data, true); - return rq_bytes - data.max_bytes; + return rq_bytes - data.len; } return rq_bytes; } @@ -561,17 +559,16 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, if (ublk_need_unmap_req(req)) { struct ublk_map_data data = { - .ubq = ubq, .rq = req, - .io = io, - .max_bytes = io->res, + .ubuf = io->addr, + .len = io->res, }; WARN_ON_ONCE(io->res > rq_bytes); ublk_copy_user_pages(&data, false); - return io->res - data.max_bytes; + return io->res - data.len; } return rq_bytes; } -- Gitee From 9377c87057d671833eb4964e883de1c6e48f34ac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 5 Apr 2023 20:00:46 -0600 Subject: [PATCH 23/63] ublk: read any SQE values upfront ANBZ: #32233 commit 8c68ae3b22fa6fb2dbe83ef955ff10936503d28e upstream. Since SQE memory is shared with userspace, we should only be reading it once. We cannot read it multiple times, particularly when it's read once for validation and then read again for the actual use. ublk_ch_uring_cmd() is safe when called as a retry operation, as the memory backing is stable at that point. But for normal issue, we want to ensure that we only read ublksrv_io_cmd once. Wrap the function in a helper that reads the value into an on-stack copy of the struct. Cc: stable@vger.kernel.org # 6.0+ Reviewed-by: Ming Lei Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 3e422389937f..0e32e33d8e69 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1257,9 +1257,10 @@ static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id, ublk_queue_cmd(ubq, req); } -static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags, + struct ublksrv_io_cmd *ub_cmd) { - struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd; struct ublk_device *ub = cmd->file->private_data; struct ublk_queue *ubq; struct ublk_io *io; @@ -1358,6 +1359,23 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -EIOCBQUEUED; } +static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct ublksrv_io_cmd *ub_src = (struct ublksrv_io_cmd *) cmd->cmd; + struct ublksrv_io_cmd ub_cmd; + + /* + * Not necessary for async retry, but let's keep it simple and always + * copy the values to avoid any potential reuse. + */ + ub_cmd.q_id = READ_ONCE(ub_src->q_id); + ub_cmd.tag = READ_ONCE(ub_src->tag); + ub_cmd.result = READ_ONCE(ub_src->result); + ub_cmd.addr = READ_ONCE(ub_src->addr); + + return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd); +} + static const struct file_operations ublk_ch_fops = { .owner = THIS_MODULE, .open = ublk_ch_open, -- Gitee From 70f5b920ed08b9c88c7d4c6f59cde441aa0c7c0b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 6 Apr 2023 20:40:59 +0800 Subject: [PATCH 24/63] block: ublk: make sure that block size is set correctly ANBZ: #32233 commit 1d1665279a845d16c93687389e364386e3fe0f38 upstream. block size is one very key setting for block layer, and bad block size could panic kernel easily. Make sure that block size is set correctly. Meantime if ublk_validate_params() fails, clear ub->params so that disk is prevented from being added. Fixes: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver") Reported-and-tested-by: Breno Leitao Signed-off-by: Ming Lei Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 0e32e33d8e69..f057f6cebb31 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -248,7 +248,7 @@ static int ublk_validate_params(const struct ublk_device *ub) if (ub->params.types & UBLK_PARAM_TYPE_BASIC) { const struct ublk_param_basic *p = &ub->params.basic; - if (p->logical_bs_shift > PAGE_SHIFT) + if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9) return -EINVAL; if (p->logical_bs_shift > p->physical_bs_shift) @@ -1969,6 +1969,8 @@ static int ublk_ctrl_set_params(struct ublk_device *ub, /* clear all we don't support yet */ ub->params.types &= UBLK_PARAM_TYPE_ALL; ret = ublk_validate_params(ub); + if (ret) + ub->params.types = 0; } mutex_unlock(&ub->mutex); -- Gitee From cb5567adaf95d5c1b343b47fbf6a80448de09ef4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 18 Apr 2023 21:18:10 +0800 Subject: [PATCH 25/63] block: ublk: switch to ioctl command encoding ANBZ: #32233 commit 2d786e66c9662d84cbeab981ce3a371d2fb5a4bb upstream. All ublk commands(control, IO) should have taken ioctl command encoding from the beginning, because ioctl command encoding defines each code uniquely, so driver can figure out wrong command sent from userspace easily; 2) it might help security subsystem for audit uring cmd[1]. Unfortunately we didn't do that way, and it could be one lesson for ublk driver. So switch to ioctl command encoding now, we still support commands encoded in old way, but they become legacy definition. Any new command should take ioctl encoding. See ublksrv code for switching to ioctl command encoding in [2]. [1] https://lore.kernel.org/io-uring/CAHC9VhSVzujW9LOj5Km80AjU0EfAuukoLrxO6BEfnXeK_s6bAg@mail.gmail.com/ [2] https://github.com/ming1/ubdsrv/commits/ioctl_cmd_encoding Cc: Christoph Hellwig Cc: Ken Kurematsu Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230418131810.855959-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/Kconfig | 17 +++++++++++++ drivers/block/ublk_drv.c | 47 +++++++++++++++++++++++++---------- include/uapi/linux/ublk_cmd.h | 43 ++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+), 13 deletions(-) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index d2d9dfcc34ff..9195938f67ae 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -494,6 +494,23 @@ config BLK_DEV_UBLK can handle batch more effectively, but task_work_add() isn't exported for module, so ublk has to be built to kernel. +config BLKDEV_UBLK_LEGACY_OPCODES + bool "Support legacy command opcode" + depends on BLK_DEV_UBLK + default y + help + ublk driver started to take plain command encoding, which turns out + one bad way. The traditional ioctl command opcode encodes more + info and basically defines each code uniquely, so opcode conflict + is avoided, and driver can handle wrong command easily, meantime it + may help security subsystem to audit io_uring command. + + Say Y if your application still uses legacy command opcode. + + Say N if you don't want to support legacy command opcode. It is + suggested to enable N if your application(ublk server) switches to + ioctl command encoding. + source "drivers/block/rnbd/Kconfig" endif # BLK_DEV diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index f057f6cebb31..bb4a03e1cddf 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -54,7 +54,8 @@ | UBLK_F_NEED_GET_DATA \ | UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ - | UBLK_F_UNPRIVILEGED_DEV) + | UBLK_F_UNPRIVILEGED_DEV \ + | UBLK_F_CMD_IOCTL_ENCODE) /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \ @@ -1257,6 +1258,19 @@ static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id, ublk_queue_cmd(ubq, req); } +static inline int ublk_check_cmd_op(u32 cmd_op) +{ + u32 ioc_type = _IOC_TYPE(cmd_op); + + if (IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u') + return -EOPNOTSUPP; + + if (ioc_type != 'u' && ioc_type != 0) + return -EOPNOTSUPP; + + return 0; +} + static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags, struct ublksrv_io_cmd *ub_cmd) @@ -1299,10 +1313,14 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA. */ if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) - ^ (cmd_op == UBLK_IO_NEED_GET_DATA)) + ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) + goto out; + + ret = ublk_check_cmd_op(cmd_op); + if (ret) goto out; - switch (cmd_op) { + switch (_IOC_NR(cmd_op)) { case UBLK_IO_FETCH_REQ: /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ if (ublk_queue_ready(ubq)) { @@ -1769,6 +1787,8 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK)) ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK; + ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE; + /* We are not ready to support zero copy */ ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; @@ -2128,7 +2148,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, * know if the specified device is created as unprivileged * mode. */ - if (cmd->cmd_op != UBLK_CMD_GET_DEV_INFO2) + if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2) return 0; } @@ -2154,7 +2174,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, dev_path[header->dev_path_len] = 0; ret = -EINVAL; - switch (cmd->cmd_op) { + switch (_IOC_NR(cmd->cmd_op)) { case UBLK_CMD_GET_DEV_INFO: case UBLK_CMD_GET_DEV_INFO2: case UBLK_CMD_GET_QUEUE_AFFINITY: @@ -2193,6 +2213,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; struct ublk_device *ub = NULL; + u32 cmd_op = cmd->cmd_op; int ret = -EINVAL; if (issue_flags & IO_URING_F_NONBLOCK) @@ -2203,22 +2224,22 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, if (!(issue_flags & IO_URING_F_SQE128)) goto out; - if (cmd->cmd_op != UBLK_CMD_ADD_DEV) { + ret = ublk_check_cmd_op(cmd_op); + if (ret) + goto out; + + if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) { ret = -ENODEV; ub = ublk_get_device_from_id(header->dev_id); if (!ub) goto out; ret = ublk_ctrl_uring_cmd_permission(ub, cmd); - } else { - /* ADD_DEV permission check is done in command handler */ - ret = 0; + if (ret) + goto put_dev; } - if (ret) - goto put_dev; - - switch (cmd->cmd_op) { + switch (_IOC_NR(cmd_op)) { case UBLK_CMD_START_DEV: ret = ublk_ctrl_start_dev(ub, cmd); break; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index f6238ccc7800..640bf687b94a 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -8,6 +8,9 @@ /* * Admin commands, issued by ublk server, and handled by ublk driver. + * + * Legacy command definition, don't use in new application, and don't + * add new such definition any more */ #define UBLK_CMD_GET_QUEUE_AFFINITY 0x01 #define UBLK_CMD_GET_DEV_INFO 0x02 @@ -21,6 +24,30 @@ #define UBLK_CMD_END_USER_RECOVERY 0x11 #define UBLK_CMD_GET_DEV_INFO2 0x12 +/* Any new ctrl command should encode by __IO*() */ +#define UBLK_U_CMD_GET_QUEUE_AFFINITY \ + _IOR('u', UBLK_CMD_GET_QUEUE_AFFINITY, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_GET_DEV_INFO \ + _IOR('u', UBLK_CMD_GET_DEV_INFO, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_ADD_DEV \ + _IOWR('u', UBLK_CMD_ADD_DEV, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_DEL_DEV \ + _IOWR('u', UBLK_CMD_DEL_DEV, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_START_DEV \ + _IOWR('u', UBLK_CMD_START_DEV, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_STOP_DEV \ + _IOWR('u', UBLK_CMD_STOP_DEV, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_SET_PARAMS \ + _IOWR('u', UBLK_CMD_SET_PARAMS, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_GET_PARAMS \ + _IOR('u', UBLK_CMD_GET_PARAMS, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_START_USER_RECOVERY \ + _IOWR('u', UBLK_CMD_START_USER_RECOVERY, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_END_USER_RECOVERY \ + _IOWR('u', UBLK_CMD_END_USER_RECOVERY, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_GET_DEV_INFO2 \ + _IOR('u', UBLK_CMD_GET_DEV_INFO2, struct ublksrv_ctrl_cmd) + /* * IO commands, issued by ublk server, and handled by ublk driver. * @@ -41,10 +68,23 @@ * It is only used if ublksrv set UBLK_F_NEED_GET_DATA flag * while starting a ublk device. */ + +/* + * Legacy IO command definition, don't use in new application, and don't + * add new such definition any more + */ #define UBLK_IO_FETCH_REQ 0x20 #define UBLK_IO_COMMIT_AND_FETCH_REQ 0x21 #define UBLK_IO_NEED_GET_DATA 0x22 +/* Any new IO command should encode by __IOWR() */ +#define UBLK_U_IO_FETCH_REQ \ + _IOWR('u', UBLK_IO_FETCH_REQ, struct ublksrv_io_cmd) +#define UBLK_U_IO_COMMIT_AND_FETCH_REQ \ + _IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd) +#define UBLK_U_IO_NEED_GET_DATA \ + _IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd) + /* only ABORT means that no re-fetch */ #define UBLK_IO_RES_OK 0 #define UBLK_IO_RES_NEED_GET_DATA 1 @@ -102,6 +142,9 @@ */ #define UBLK_F_UNPRIVILEGED_DEV (1UL << 5) +/* use ioctl encoding for uring command */ +#define UBLK_F_CMD_IOCTL_ENCODE (1UL << 6) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 -- Gitee From 7a0f8e32f7f8e6a12371188e44a7738b2246a135 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 20 Apr 2023 17:11:04 +0800 Subject: [PATCH 26/63] ublk: don't return 0 in case of any failure ANBZ: #32233 commit 7c75661c42a06fe35e1774373194f646b5a9e5c9 upstream. Commit 2d786e66c966 ("block: ublk: switch to ioctl command encoding") starts to reset local variable of 'ret' as zero, then if any failure happens when handling the three IO commands, 0 can be returned to ublk server. Fix it by returning -EINVAL in case of command handling failure. Cc: Christoph Hellwig Fixes: 2d786e66c966 ("block: ublk: switch to ioctl command encoding") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230420091104.1092972-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index bb4a03e1cddf..cb47142cd4d6 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1320,6 +1320,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, if (ret) goto out; + ret = -EINVAL; switch (_IOC_NR(cmd_op)) { case UBLK_IO_FETCH_REQ: /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ -- Gitee From ec4368a5e06efb00e5a09d7b2c49cfba0ba75edf Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 2 May 2023 10:42:31 +0800 Subject: [PATCH 27/63] ublk: add timeout handler ANBZ: #32233 commit c0b79b0ff53be5b05be98e3caaa6a39de1fe9520 upstream. Add timeout handler, so that we can provide forward progress guarantee for unprivileged ublk, which can't be trusted. One thing is that sync() calls sync_bdevs(wait) for all block devices after running sync_bdevs(no_wait), and if one device can't move on, the sync() won't return any more. Add timeout for unprivileged ublk to avoid such affect for other users which call sync() syscall. Meantime clear UBLK_F_USER_RECOVERY_REISSUE for unprivileged ublk since that feature may cause IO hang too. Fixes: 4093cb5a0634 ("ublk_drv: add mechanism for supporting unprivileged ublk device") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230502024231.888498-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index cb47142cd4d6..4be8705cc27e 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -130,6 +130,7 @@ struct ublk_queue { unsigned long io_addr; /* mapped vm address */ unsigned int max_io_sz; bool force_abort; + bool timeout; unsigned short nr_io_ready; /* how many ios setup */ struct ublk_device *dev; struct ublk_io ios[]; @@ -896,6 +897,22 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) } } +static enum blk_eh_timer_return ublk_timeout(struct request *rq, bool reserved) +{ + struct ublk_queue *ubq = rq->mq_hctx->driver_data; + + if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) { + if (!ubq->timeout) { + send_sig(SIGKILL, ubq->ubq_daemon, 0); + ubq->timeout = true; + } + + return BLK_EH_DONE; + } + + return BLK_EH_RESET_TIMER; +} + static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -955,6 +972,7 @@ static const struct blk_mq_ops ublk_mq_ops = { .queue_rq = ublk_queue_rq, .init_hctx = ublk_init_hctx, .init_request = ublk_init_rq, + .timeout = ublk_timeout, }; static int ublk_ch_open(struct inode *inode, struct file *filp) @@ -1737,6 +1755,18 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV)) return -EPERM; + /* + * unprivileged device can't be trusted, but RECOVERY and + * RECOVERY_REISSUE still may hang error handling, so can't + * support recovery features for unprivileged ublk now + * + * TODO: provide forward progress for RECOVERY handler, so that + * unprivileged device can benefit from it + */ + if (info.flags & UBLK_F_UNPRIVILEGED_DEV) + info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE | + UBLK_F_USER_RECOVERY); + /* the created device is always owned by current user */ ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid); @@ -2009,6 +2039,7 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) put_task_struct(ubq->ubq_daemon); /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ ubq->ubq_daemon = NULL; + ubq->timeout = false; for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; -- Gitee From 7ac3f23e38fbf616e3fe76887a5b8c2d21dd9a63 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 5 May 2023 23:31:42 +0800 Subject: [PATCH 28/63] ublk: fix command op code check ANBZ: #32233 commit e485bd9e2c419142430ae6fe3e8f64e3059aef50 upstream. In case of CONFIG_BLKDEV_UBLK_LEGACY_OPCODES, type of cmd opcode could be 0 or 'u'; and type can only be 'u' if CONFIG_BLKDEV_UBLK_LEGACY_OPCODES isn't set. So fix the wrong check. Fixes: 2d786e66c966 ("block: ublk: switch to ioctl command encoding") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230505153142.1258336-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 4be8705cc27e..3885d942583f 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1280,7 +1280,7 @@ static inline int ublk_check_cmd_op(u32 cmd_op) { u32 ioc_type = _IOC_TYPE(cmd_op); - if (IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u') + if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u') return -EOPNOTSUPP; if (ioc_type != 'u' && ioc_type != 0) -- Gitee From 55ea02d51586555026d0f9c1f9b01c3909993827 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 17 May 2023 21:34:08 +0800 Subject: [PATCH 29/63] ublk: fix AB-BA lockdep warning ANBZ: #32233 commit ac5902f84bb546c64aea02c439c2579cbf40318f upstream. When handling UBLK_IO_FETCH_REQ, ctx->uring_lock is grabbed first, then ub->mutex is acquired. When handling UBLK_CMD_STOP_DEV or UBLK_CMD_DEL_DEV, ub->mutex is grabbed first, then calling io_uring_cmd_done() for canceling uring command, in which ctx->uring_lock may be required. Real deadlock only happens when all the above commands are issued from same uring context, and in reality different uring contexts are often used for handing control command and IO command. Fix the issue by using io_uring_cmd_complete_in_task() to cancel command in ublk_cancel_dev(ublk_cancel_queue). Reported-by: Shinichiro Kawasaki Closes: https://lore.kernel.org/linux-block/becol2g7sawl4rsjq2dztsbc7mqypfqko6wzsyoyazqydoasml@rcxarzwidrhk Cc: Ziyang Zhang Signed-off-by: Ming Lei Tested-by: Shinichiro Kawasaki Link: https://lore.kernel.org/r/20230517133408.210944-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 3885d942583f..1dc51480743d 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1118,6 +1118,11 @@ static inline bool ublk_queue_ready(struct ublk_queue *ubq) return ubq->nr_io_ready == ubq->q_depth; } +static void ublk_cmd_cancel_cb(struct io_uring_cmd *cmd) +{ + io_uring_cmd_done(cmd, UBLK_IO_RES_ABORT, 0); +} + static void ublk_cancel_queue(struct ublk_queue *ubq) { int i; @@ -1129,7 +1134,8 @@ static void ublk_cancel_queue(struct ublk_queue *ubq) struct ublk_io *io = &ubq->ios[i]; if (io->flags & UBLK_IO_FLAG_ACTIVE) - io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0); + io_uring_cmd_complete_in_task(io->cmd, + ublk_cmd_cancel_cb); } /* all io commands are canceled */ -- Gitee From 3a7172f602e666f5fb09fbb6c642a0a47c4a6831 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 May 2023 14:50:24 +0800 Subject: [PATCH 30/63] ublk: kill queuing request by task_work_add ANBZ: #32233 commit 29dc5d06613f2438ec20a4ba5e0a5a740584d346 upstream. task_work_add() is used from early ublk development stage for handling request in batch. However, since commit 7d4a93176e01 ("ublk_drv: don't forward io commands in reserve order"), we can get similar batch processing with io_uring_cmd_complete_in_task(), and similar performance data is observed between task_work_add() and io_uring_cmd_complete_in_task(). Meantime we can kill one fast code path, which is actually seldom used given it is common to build ublk driver as module. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230519065030.351216-2-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 41 ++-------------------------------------- 1 file changed, 2 insertions(+), 39 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 1dc51480743d..1fc233bee55d 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -41,8 +41,6 @@ #include #include #include -#include -#include #include #include @@ -63,7 +61,6 @@ struct ublk_rq_data { struct llist_node node; - struct callback_head work; }; struct ublk_uring_cmd_pdu { @@ -292,14 +289,6 @@ static int ublk_apply_params(struct ublk_device *ub) return 0; } -static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq) -{ - if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) && - !(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK)) - return true; - return false; -} - static inline bool ublk_need_get_data(const struct ublk_queue *ubq) { return ubq->flags & UBLK_F_NEED_GET_DATA; @@ -851,16 +840,6 @@ static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) ublk_forward_io_cmds(ubq); } -static void ublk_rq_task_work_fn(struct callback_head *work) -{ - struct ublk_rq_data *data = container_of(work, - struct ublk_rq_data, work); - struct request *req = blk_mq_rq_from_pdu(data); - struct ublk_queue *ubq = req->mq_hctx->driver_data; - - ublk_forward_io_cmds(ubq); -} - static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) { struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); @@ -884,10 +863,6 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) */ if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) { ublk_abort_io_cmds(ubq); - } else if (ublk_can_use_task_work(ubq)) { - if (task_work_add(ubq->ubq_daemon, &data->work, - TWA_SIGNAL_NO_IPI)) - ublk_abort_io_cmds(ubq); } else { struct io_uring_cmd *cmd = io->cmd; struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); @@ -959,19 +934,9 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, return 0; } -static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req, - unsigned int hctx_idx, unsigned int numa_node) -{ - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); - - init_task_work(&data->work, ublk_rq_task_work_fn); - return 0; -} - static const struct blk_mq_ops ublk_mq_ops = { .queue_rq = ublk_queue_rq, .init_hctx = ublk_init_hctx, - .init_request = ublk_init_rq, .timeout = ublk_timeout, }; @@ -1821,10 +1786,8 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) */ ub->dev_info.flags &= UBLK_F_ALL; - if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK)) - ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK; - - ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE; + ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | + UBLK_F_URING_CMD_COMP_IN_TASK; /* We are not ready to support zero copy */ ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; -- Gitee From 096f21b9056df184fbfa461f66b2d0295dac143e Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 29 Jul 2025 16:23:49 +0800 Subject: [PATCH 31/63] anolis: Revert "task_work: allow TWA_SIGNAL without a rescheduling IPI" ANBZ: #32233 This reverts commit 7df38cba443c2d600b8876948bd9635a8c0216dc. Since ublk is now killing queuing request by task_work_add, TWA_SIGNAL_NO_IPI has no user any more. So revert the corresponding introduced logic. Signed-off-by: Joseph Qi --- include/linux/task_work.h | 1 - include/linux/tracehook.h | 13 ++----------- kernel/task_work.c | 25 ++++++------------------- 3 files changed, 8 insertions(+), 31 deletions(-) diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 8d976ae84769..5b8a93f288bb 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -17,7 +17,6 @@ enum task_work_notify_mode { TWA_NONE, TWA_RESUME, TWA_SIGNAL, - TWA_SIGNAL_NO_IPI, }; int task_work_add(struct task_struct *task, struct callback_head *twork, diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index e724a283405d..0e1ad33eec7c 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -212,22 +212,13 @@ static inline void tracehook_notify_signal(void) task_work_run(); } -/* - * Returns 'true' if kick_process() is needed to force a transition from - * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work. - */ -static inline bool __set_notify_signal(struct task_struct *task) -{ - return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && - !wake_up_state(task, TASK_INTERRUPTIBLE); -} - /* * Called when we have work to process from exit_to_user_mode_loop() */ static inline void set_notify_signal(struct task_struct *task) { - if (__set_notify_signal(task)) + if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && + !wake_up_state(task, TASK_INTERRUPTIBLE)) kick_process(task); } diff --git a/kernel/task_work.c b/kernel/task_work.c index c1701eeb6a7b..e9316198c64b 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -12,22 +12,12 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * @notify: how to notify the targeted task * * Queue @work for task_work_run() below and notify the @task if @notify - * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI. - * - * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted - * task and run the task_work, regardless of whether the task is currently - * running in the kernel or userspace. - * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a - * reschedule IPI to force the targeted task to reschedule and run task_work. - * This can be advantageous if there's no strict requirement that the - * task_work be run as soon as possible, just whenever the task enters the - * kernel anyway. - * @TWA_RESUME work is run only when the task exits the kernel and returns to - * user mode, or before entering guest mode. - * - * Fails if the @task is exiting/exited and thus it can't process this @work. - * Otherwise @work->func() will be called when the @task goes through one of - * the aforementioned transitions, or exits. + * is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the + * it will interrupt the targeted task and run the task_work. @TWA_RESUME + * work is run only when the task exits the kernel and returns to user mode, + * or before entering guest mode. Fails if the @task is exiting/exited and thus + * it can't process this @work. Otherwise @work->func() will be called when the + * @task goes through one of the aforementioned transitions, or exits. * * If the targeted task is exiting, then an error is returned and the work item * is not queued. It's up to the caller to arrange for an alternative mechanism @@ -60,9 +50,6 @@ int task_work_add(struct task_struct *task, struct callback_head *work, case TWA_SIGNAL: set_notify_signal(task); break; - case TWA_SIGNAL_NO_IPI: - __set_notify_signal(task); - break; default: WARN_ON_ONCE(1); break; -- Gitee From 9c1d807841d05105e27ad5cd61c568385ff95cf0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 May 2023 14:50:25 +0800 Subject: [PATCH 32/63] ublk: cleanup io cmd code path by adding ublk_fill_io_cmd() ANBZ: #32233 commit f236a21459a5cdd828b93f363946e116773494f6 upstream. Add one small helper to cleanup io command hanlding code path. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230519065030.351216-3-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 1fc233bee55d..524995e979c0 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1260,6 +1260,14 @@ static inline int ublk_check_cmd_op(u32 cmd_op) return 0; } +static inline void ublk_fill_io_cmd(struct ublk_io *io, + struct io_uring_cmd *cmd, unsigned long buf_addr) +{ + io->cmd = cmd; + io->flags |= UBLK_IO_FLAG_ACTIVE; + io->addr = buf_addr; +} + static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags, struct ublksrv_io_cmd *ub_cmd) @@ -1326,10 +1334,8 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, /* FETCH_RQ has to provide IO buffer if NEED GET DATA is not enabled */ if (!ub_cmd->addr && !ublk_need_get_data(ubq)) goto out; - io->cmd = cmd; - io->flags |= UBLK_IO_FLAG_ACTIVE; - io->addr = ub_cmd->addr; + ublk_fill_io_cmd(io, cmd, ub_cmd->addr); ublk_mark_io_ready(ub, ubq); break; case UBLK_IO_COMMIT_AND_FETCH_REQ: @@ -1342,17 +1348,13 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, goto out; if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) goto out; - io->addr = ub_cmd->addr; - io->flags |= UBLK_IO_FLAG_ACTIVE; - io->cmd = cmd; + ublk_fill_io_cmd(io, cmd, ub_cmd->addr); ublk_commit_completion(ub, ub_cmd); break; case UBLK_IO_NEED_GET_DATA: if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) goto out; - io->addr = ub_cmd->addr; - io->cmd = cmd; - io->flags |= UBLK_IO_FLAG_ACTIVE; + ublk_fill_io_cmd(io, cmd, ub_cmd->addr); ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag); break; default: -- Gitee From 511438883cfdfa200aff4f2e2f9503d93f5bcd93 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 May 2023 14:50:26 +0800 Subject: [PATCH 33/63] ublk: cleanup ublk_copy_user_pages ANBZ: #32233 commit 981f95a571e3ca20a496c0b77dbf6b06039c6648 upstream. Clean up ublk_copy_user_pages() by using iov_iter_get_pages2, and code gets simplified a lot and becomes much more readable than before. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230519065030.351216-4-ming.lei@redhat.com Signed-off-by: Jens Axboe [joe: replace iov_iter_get_pages2 with iov_iter_get_pages and iov_iter_advance, and replace ITER_DEST/ITER_SOURCE with READ/WRITE] Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 114 ++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 63 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 524995e979c0..b830ce603237 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -412,49 +412,39 @@ static const struct block_device_operations ub_fops = { #define UBLK_MAX_PIN_PAGES 32 -struct ublk_map_data { - const struct request *rq; - unsigned long ubuf; - unsigned int len; -}; - struct ublk_io_iter { struct page *pages[UBLK_MAX_PIN_PAGES]; - unsigned pg_off; /* offset in the 1st page in pages */ - int nr_pages; /* how many page pointers in pages */ struct bio *bio; struct bvec_iter iter; }; -static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data, - unsigned max_bytes, bool to_vm) +/* return how many pages are copied */ +static void ublk_copy_io_pages(struct ublk_io_iter *data, + size_t total, size_t pg_off, int dir) { - const unsigned total = min_t(unsigned, max_bytes, - PAGE_SIZE - data->pg_off + - ((data->nr_pages - 1) << PAGE_SHIFT)); unsigned done = 0; unsigned pg_idx = 0; while (done < total) { struct bio_vec bv = bio_iter_iovec(data->bio, data->iter); - const unsigned int bytes = min3(bv.bv_len, total - done, - (unsigned)(PAGE_SIZE - data->pg_off)); + unsigned int bytes = min3(bv.bv_len, (unsigned)total - done, + (unsigned)(PAGE_SIZE - pg_off)); void *bv_buf = kmap(bv.bv_page) + bv.bv_offset; void *pg_buf = kmap(data->pages[pg_idx]); - if (to_vm) - memcpy(pg_buf + data->pg_off, bv_buf, bytes); + if (dir == READ) + memcpy(pg_buf + pg_off, bv_buf, bytes); else - memcpy(bv_buf, pg_buf + data->pg_off, bytes); + memcpy(bv_buf, pg_buf + pg_off, bytes); kunmap(pg_buf); kunmap(bv_buf); /* advance page array */ - data->pg_off += bytes; - if (data->pg_off == PAGE_SIZE) { + pg_off += bytes; + if (pg_off == PAGE_SIZE) { pg_idx += 1; - data->pg_off = 0; + pg_off = 0; } done += bytes; @@ -468,41 +458,42 @@ static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data, data->iter = data->bio->bi_iter; } } - - return done; } -static int ublk_copy_user_pages(struct ublk_map_data *data, bool to_vm) +/* + * Copy data between request pages and io_iter, and 'offset' + * is the start point of linear offset of request. + */ +static size_t ublk_copy_user_pages(const struct request *req, + struct iov_iter *uiter, int dir) { - const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0; - const unsigned long start_vm = data->ubuf; - unsigned int done = 0; struct ublk_io_iter iter = { - .pg_off = start_vm & (PAGE_SIZE - 1), - .bio = data->rq->bio, - .iter = data->rq->bio->bi_iter, + .bio = req->bio, + .iter = req->bio->bi_iter, }; - const unsigned int nr_pages = round_up(data->len + - (start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT; - - while (done < nr_pages) { - const unsigned to_pin = min_t(unsigned, UBLK_MAX_PIN_PAGES, - nr_pages - done); - unsigned i, len; - - iter.nr_pages = get_user_pages_fast(start_vm + - (done << PAGE_SHIFT), to_pin, gup_flags, - iter.pages); - if (iter.nr_pages <= 0) - return done == 0 ? iter.nr_pages : done; - len = ublk_copy_io_pages(&iter, data->len, to_vm); - for (i = 0; i < iter.nr_pages; i++) { - if (to_vm) + size_t done = 0; + + while (iov_iter_count(uiter) && iter.bio) { + unsigned nr_pages; + size_t len, off; + int i; + + len = iov_iter_get_pages(uiter, iter.pages, + iov_iter_count(uiter), + UBLK_MAX_PIN_PAGES, &off); + if (len >= 0) + iov_iter_advance(uiter, len); + if (len <= 0) + return done; + + ublk_copy_io_pages(&iter, len, off, dir); + nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE); + for (i = 0; i < nr_pages; i++) { + if (dir == READ) set_page_dirty(iter.pages[i]); put_page(iter.pages[i]); } - data->len -= len; - done += iter.nr_pages; + done += len; } return done; @@ -529,15 +520,14 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, * context is pretty fast, see ublk_pin_user_pages */ if (ublk_need_map_req(req)) { - struct ublk_map_data data = { - .rq = req, - .ubuf = io->addr, - .len = rq_bytes, - }; + struct iov_iter iter; + struct iovec iov; + const int dir = READ; - ublk_copy_user_pages(&data, true); + import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes, + &iov, &iter); - return rq_bytes - data.len; + return ublk_copy_user_pages(req, &iter, dir); } return rq_bytes; } @@ -549,17 +539,15 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, const unsigned int rq_bytes = blk_rq_bytes(req); if (ublk_need_unmap_req(req)) { - struct ublk_map_data data = { - .rq = req, - .ubuf = io->addr, - .len = io->res, - }; + struct iov_iter iter; + struct iovec iov; + const int dir = WRITE; WARN_ON_ONCE(io->res > rq_bytes); - ublk_copy_user_pages(&data, false); - - return io->res - data.len; + import_single_range(dir, u64_to_user_ptr(io->addr), io->res, + &iov, &iter); + return ublk_copy_user_pages(req, &iter, dir); } return rq_bytes; } -- Gitee From e05e9f547a342c78564b030b2a41b8a6749f6809 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 May 2023 14:50:27 +0800 Subject: [PATCH 34/63] ublk: grab request reference when the request is handled by userspace ANBZ: #32233 commit 8284066946e6d9cc979566ce698fe24e7ca0b31e upstream. Add one reference counter into request pdu data, and hold this reference in the request's lifetime. Prepare for supporting to move request data copy into userspace, which needs to copy request data by read()/write() on /dev/ublkcN, so we have to guarantee that read()/write() is done on one valid/active request, and that will be enhanced by holding the io request reference in read()/write(). Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230519065030.351216-5-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 67 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index b830ce603237..34aba3b6e1ae 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #define UBLK_MINORS (1U << MINORBITS) @@ -61,6 +62,8 @@ struct ublk_rq_data { struct llist_node node; + + struct kref ref; }; struct ublk_uring_cmd_pdu { @@ -181,6 +184,9 @@ struct ublk_params_header { __u32 types; }; +static inline void __ublk_complete_rq(struct request *req); +static void ublk_complete_rq(struct kref *ref); + static dev_t ublk_chr_devt; static struct class *ublk_chr_class; @@ -289,6 +295,45 @@ static int ublk_apply_params(struct ublk_device *ub) return 0; } +static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) +{ + return false; +} + +static inline void ublk_init_req_ref(const struct ublk_queue *ubq, + struct request *req) +{ + if (ublk_need_req_ref(ubq)) { + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + + kref_init(&data->ref); + } +} + +static inline bool ublk_get_req_ref(const struct ublk_queue *ubq, + struct request *req) +{ + if (ublk_need_req_ref(ubq)) { + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + + return kref_get_unless_zero(&data->ref); + } + + return true; +} + +static inline void ublk_put_req_ref(const struct ublk_queue *ubq, + struct request *req) +{ + if (ublk_need_req_ref(ubq)) { + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + + kref_put(&data->ref, ublk_complete_rq); + } else { + __ublk_complete_rq(req); + } +} + static inline bool ublk_need_get_data(const struct ublk_queue *ubq) { return ubq->flags & UBLK_F_NEED_GET_DATA; @@ -627,13 +672,19 @@ static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq) } /* todo: handle partial completion */ -static void ublk_complete_rq(struct request *req) +static inline void __ublk_complete_rq(struct request *req) { struct ublk_queue *ubq = req->mq_hctx->driver_data; struct ublk_io *io = &ubq->ios[req->tag]; unsigned int unmapped_bytes; blk_status_t res = BLK_STS_OK; + /* called from ublk_abort_queue() code path */ + if (io->flags & UBLK_IO_FLAG_ABORTED) { + res = BLK_STS_IOERR; + goto exit; + } + /* failed read IO if nothing is read */ if (!io->res && req_op(req) == REQ_OP_READ) io->res = -EIO; @@ -673,6 +724,15 @@ static void ublk_complete_rq(struct request *req) blk_mq_end_request(req, res); } +static void ublk_complete_rq(struct kref *ref) +{ + struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data, + ref); + struct request *req = blk_mq_rq_from_pdu(data); + + __ublk_complete_rq(req); +} + /* * Since __ublk_rq_task_work always fails requests immediately during * exiting, __ublk_fail_req() is only called from abort context during @@ -691,7 +751,7 @@ static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, if (ublk_queue_can_use_recovery_reissue(ubq)) blk_mq_requeue_request(req, false); else - blk_mq_end_request(req, BLK_STS_IOERR); + ublk_put_req_ref(ubq, req); } } @@ -798,6 +858,7 @@ static inline void __ublk_rq_task_work(struct request *req) mapped_bytes >> 9; } + ublk_init_req_ref(ubq, req); ubq_complete_io_cmd(io, UBLK_IO_RES_OK); } @@ -1001,7 +1062,7 @@ static void ublk_commit_completion(struct ublk_device *ub, req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); if (req && likely(!blk_should_fake_timeout(req->q))) - ublk_complete_rq(req); + ublk_put_req_ref(ubq, req); } /* -- Gitee From 3b0ce3ea0d1c6d844d2e85f396b87b67b2a71fe2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 May 2023 14:50:28 +0800 Subject: [PATCH 35/63] ublk: support to copy any part of request pages ANBZ: #32233 commit 38f2dd34410f9070b60969a07ff7d8743b4fd56c upstream. Add 'offset' to 'struct ublk_map_data', so that ublk_copy_user_pages() can be used to copy any sub-buffer(linear mapped) of the request. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230519065030.351216-6-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 34aba3b6e1ae..67cd5c3ee460 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -505,19 +505,36 @@ static void ublk_copy_io_pages(struct ublk_io_iter *data, } } +static bool ublk_advance_io_iter(const struct request *req, + struct ublk_io_iter *iter, unsigned int offset) +{ + struct bio *bio = req->bio; + + for_each_bio(bio) { + if (bio->bi_iter.bi_size > offset) { + iter->bio = bio; + iter->iter = bio->bi_iter; + bio_advance_iter(iter->bio, &iter->iter, offset); + return true; + } + offset -= bio->bi_iter.bi_size; + } + return false; +} + /* * Copy data between request pages and io_iter, and 'offset' * is the start point of linear offset of request. */ static size_t ublk_copy_user_pages(const struct request *req, - struct iov_iter *uiter, int dir) + unsigned offset, struct iov_iter *uiter, int dir) { - struct ublk_io_iter iter = { - .bio = req->bio, - .iter = req->bio->bi_iter, - }; + struct ublk_io_iter iter; size_t done = 0; + if (!ublk_advance_io_iter(req, &iter, offset)) + return 0; + while (iov_iter_count(uiter) && iter.bio) { unsigned nr_pages; size_t len, off; @@ -572,7 +589,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes, &iov, &iter); - return ublk_copy_user_pages(req, &iter, dir); + return ublk_copy_user_pages(req, 0, &iter, dir); } return rq_bytes; } @@ -592,7 +609,7 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, import_single_range(dir, u64_to_user_ptr(io->addr), io->res, &iov, &iter); - return ublk_copy_user_pages(req, &iter, dir); + return ublk_copy_user_pages(req, 0, &iter, dir); } return rq_bytes; } -- Gitee From cc31e2c44ece87db6dc68fe0e0a6edf590104e3a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 May 2023 14:50:29 +0800 Subject: [PATCH 36/63] ublk: add read()/write() support for ublk char device ANBZ: #32233 commit 62fe99cef94a5900cac3bf15fd03ee8baad1a99c upstream. Support pread()/pwrite() on ublk char device for reading/writing request io buffer, so data copy between io request buffer and userspace buffer can be moved to ublk server from ublk driver. Then UBLK_F_NEED_GET_DATA becomes not necessary, so ublk server can allocate buffer without one extra round uring command communication for userspace to provide buffer. IO buffer can be located by iocb->ki_pos which encodes buffer offset, io tag and queue id info, and type of iocb->ki_pos is u64, so it is big enough for holding reasonable queue depth, nr_queues and max io buffer size. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230519065030.351216-7-ming.lei@redhat.com Signed-off-by: Jens Axboe [joe: replace ITER_DEST/ITER_SOURCE with READ/WRITE, and user_backed_iter with iter_is_iovec] Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 151 ++++++++++++++++++++++++++++++++++ include/uapi/linux/ublk_cmd.h | 22 ++++- 2 files changed, 172 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 67cd5c3ee460..4f7902cd7d46 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -207,6 +207,23 @@ static unsigned int ublks_added; /* protected by ublk_ctl_mutex */ static struct miscdevice ublk_misc; +static inline unsigned ublk_pos_to_hwq(loff_t pos) +{ + return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) & + UBLK_QID_BITS_MASK; +} + +static inline unsigned ublk_pos_to_buf_off(loff_t pos) +{ + return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK; +} + +static inline unsigned ublk_pos_to_tag(loff_t pos) +{ + return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) & + UBLK_TAG_BITS_MASK; +} + static void ublk_dev_param_basic_apply(struct ublk_device *ub) { struct request_queue *q = ub->ub_disk->queue; @@ -1435,6 +1452,36 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, return -EIOCBQUEUED; } +static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, + struct ublk_queue *ubq, int tag, size_t offset) +{ + struct request *req; + + if (!ublk_need_req_ref(ubq)) + return NULL; + + req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); + if (!req) + return NULL; + + if (!ublk_get_req_ref(ubq, req)) + return NULL; + + if (unlikely(!blk_mq_request_started(req) || req->tag != tag)) + goto fail_put; + + if (!ublk_rq_has_data(req)) + goto fail_put; + + if (offset > blk_rq_bytes(req)) + goto fail_put; + + return req; +fail_put: + ublk_put_req_ref(ubq, req); + return NULL; +} + static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct ublksrv_io_cmd *ub_src = (struct ublksrv_io_cmd *) cmd->cmd; @@ -1452,11 +1499,112 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd); } +static inline bool ublk_check_ubuf_dir(const struct request *req, + int ubuf_dir) +{ + /* copy ubuf to request pages */ + if (req_op(req) == REQ_OP_READ && ubuf_dir == WRITE) + return true; + + /* copy request pages to ubuf */ + if (req_op(req) == REQ_OP_WRITE && ubuf_dir == READ) + return true; + + return false; +} + +static struct request *ublk_check_and_get_req(struct kiocb *iocb, + struct iov_iter *iter, size_t *off, int dir) +{ + struct ublk_device *ub = iocb->ki_filp->private_data; + struct ublk_queue *ubq; + struct request *req; + size_t buf_off; + u16 tag, q_id; + + if (!ub) + return ERR_PTR(-EACCES); + + if (!iter_is_iovec(iter)) + return ERR_PTR(-EACCES); + + if (ub->dev_info.state == UBLK_S_DEV_DEAD) + return ERR_PTR(-EACCES); + + tag = ublk_pos_to_tag(iocb->ki_pos); + q_id = ublk_pos_to_hwq(iocb->ki_pos); + buf_off = ublk_pos_to_buf_off(iocb->ki_pos); + + if (q_id >= ub->dev_info.nr_hw_queues) + return ERR_PTR(-EINVAL); + + ubq = ublk_get_queue(ub, q_id); + if (!ubq) + return ERR_PTR(-EINVAL); + + if (tag >= ubq->q_depth) + return ERR_PTR(-EINVAL); + + req = __ublk_check_and_get_req(ub, ubq, tag, buf_off); + if (!req) + return ERR_PTR(-EINVAL); + + if (!req->mq_hctx || !req->mq_hctx->driver_data) + goto fail; + + if (!ublk_check_ubuf_dir(req, dir)) + goto fail; + + *off = buf_off; + return req; +fail: + ublk_put_req_ref(ubq, req); + return ERR_PTR(-EACCES); +} + +static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct ublk_queue *ubq; + struct request *req; + size_t buf_off; + size_t ret; + + req = ublk_check_and_get_req(iocb, to, &buf_off, READ); + if (IS_ERR(req)) + return PTR_ERR(req); + + ret = ublk_copy_user_pages(req, buf_off, to, READ); + ubq = req->mq_hctx->driver_data; + ublk_put_req_ref(ubq, req); + + return ret; +} + +static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct ublk_queue *ubq; + struct request *req; + size_t buf_off; + size_t ret; + + req = ublk_check_and_get_req(iocb, from, &buf_off, WRITE); + if (IS_ERR(req)) + return PTR_ERR(req); + + ret = ublk_copy_user_pages(req, buf_off, from, WRITE); + ubq = req->mq_hctx->driver_data; + ublk_put_req_ref(ubq, req); + + return ret; +} + static const struct file_operations ublk_ch_fops = { .owner = THIS_MODULE, .open = ublk_ch_open, .release = ublk_ch_release, .llseek = no_llseek, + .read_iter = ublk_ch_read_iter, + .write_iter = ublk_ch_write_iter, .uring_cmd = ublk_ch_uring_cmd, .mmap = ublk_ch_mmap, }; @@ -2372,6 +2520,9 @@ static int __init ublk_init(void) { int ret; + BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET + + UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET); + init_waitqueue_head(&ublk_idr_wq); ret = misc_register(&ublk_misc); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 640bf687b94a..c0c1632c671e 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -93,9 +93,29 @@ #define UBLKSRV_CMD_BUF_OFFSET 0 #define UBLKSRV_IO_BUF_OFFSET 0x80000000 -/* tag bit is 12bit, so at most 4096 IOs for each queue */ +/* tag bit is 16bit, so far limit at most 4096 IOs for each queue */ #define UBLK_MAX_QUEUE_DEPTH 4096 +/* single IO buffer max size is 32MB */ +#define UBLK_IO_BUF_OFF 0 +#define UBLK_IO_BUF_BITS 25 +#define UBLK_IO_BUF_BITS_MASK ((1ULL << UBLK_IO_BUF_BITS) - 1) + +/* so at most 64K IOs for each queue */ +#define UBLK_TAG_OFF UBLK_IO_BUF_BITS +#define UBLK_TAG_BITS 16 +#define UBLK_TAG_BITS_MASK ((1ULL << UBLK_TAG_BITS) - 1) + +/* max 4096 queues */ +#define UBLK_QID_OFF (UBLK_TAG_OFF + UBLK_TAG_BITS) +#define UBLK_QID_BITS 12 +#define UBLK_QID_BITS_MASK ((1ULL << UBLK_QID_BITS) - 1) + +#define UBLK_MAX_NR_QUEUES (1U << UBLK_QID_BITS) + +#define UBLKSRV_IO_BUF_TOTAL_BITS (UBLK_QID_OFF + UBLK_QID_BITS) +#define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS) + /* * zero copy requires 4k block size, and can remap ublk driver's io * request into ublksrv's vm space -- Gitee From 02c71563faf9ec3688d65c8173c85e8084b10725 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 May 2023 14:50:30 +0800 Subject: [PATCH 37/63] ublk: support user copy ANBZ: #32233 commit 1172d5b8beca6b899deb9f7f2850e7e47ec16198 upstream. Currently copy between io request buffer(pages) and userspace buffer is done inside ublk_map_io() or ublk_unmap_io(). This way performs very well in case of pre-allocated userspace io buffer. For dynamically allocated or external userspace backend io buffer, UBLK_F_NEED_GET_DATA is added for ublk server to provide buffer by one extra command communication for WRITE request. For READ, userspace simply provides buffer, but can't know when the buffer is done[1]. Add UBLK_F_USER_COPY by moving io data copy out of kernel by providing read()/write() on /dev/ublkcN, and simply let ublk server do the io data copy. This way makes both side cleaner, the cost is that one extra syscall for copy io data between request and backend buffer. With UBLK_F_USER_COPY, it actually becomes possible to run per-io zero copy now, such as, only do zero copy for big size IO, so it can be thought as one prep patch for supporting zero copy. Meantime zero copy still needs to expose read()/write() buffer for some corner case, such as passthrough IO. [1] READ buffer in UBLK_F_NEED_GET_DATA https://lore.kernel.org/linux-block/116d8a56-0881-56d3-9bcc-78ff3e1dc4e5@linux.alibaba.com/T/#m23bd4b8634c0a054e6797063167b469949a247bb ublksrv loop usercopy code: https://github.com/ming1/ubdsrv/commits/usercopy Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230519065030.351216-8-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 58 ++++++++++++++++++++++++++++------- include/uapi/linux/ublk_cmd.h | 3 ++ 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 4f7902cd7d46..c3a7e3384a91 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -54,7 +54,8 @@ | UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ | UBLK_F_UNPRIVILEGED_DEV \ - | UBLK_F_CMD_IOCTL_ENCODE) + | UBLK_F_CMD_IOCTL_ENCODE \ + | UBLK_F_USER_COPY) /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \ @@ -312,9 +313,18 @@ static int ublk_apply_params(struct ublk_device *ub) return 0; } +static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_USER_COPY; +} + static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) { - return false; + /* + * read()/write() is involved in user copy, so request reference + * has to be grabbed + */ + return ublk_support_user_copy(ubq); } static inline void ublk_init_req_ref(const struct ublk_queue *ubq, @@ -593,6 +603,9 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, { const unsigned int rq_bytes = blk_rq_bytes(req); + if (ublk_support_user_copy(ubq)) + return rq_bytes; + /* * no zero copy, we delay copy WRITE request data into ublksrv * context and the big benefit is that pinning pages in current @@ -617,6 +630,9 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, { const unsigned int rq_bytes = blk_rq_bytes(req); + if (ublk_support_user_copy(ubq)) + return rq_bytes; + if (ublk_need_unmap_req(req)) { struct iov_iter iter; struct iovec iov; @@ -1396,6 +1412,11 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) goto out; + if (ublk_support_user_copy(ubq) && ub_cmd->addr) { + ret = -EINVAL; + goto out; + } + ret = ublk_check_cmd_op(cmd_op); if (ret) goto out; @@ -1414,23 +1435,34 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, */ if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) goto out; - /* FETCH_RQ has to provide IO buffer if NEED GET DATA is not enabled */ - if (!ub_cmd->addr && !ublk_need_get_data(ubq)) - goto out; + + if (!ublk_support_user_copy(ubq)) { + /* + * FETCH_RQ has to provide IO buffer if NEED GET + * DATA is not enabled + */ + if (!ub_cmd->addr && !ublk_need_get_data(ubq)) + goto out; + } ublk_fill_io_cmd(io, cmd, ub_cmd->addr); ublk_mark_io_ready(ub, ubq); break; case UBLK_IO_COMMIT_AND_FETCH_REQ: req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag); - /* - * COMMIT_AND_FETCH_REQ has to provide IO buffer if NEED GET DATA is - * not enabled or it is Read IO. - */ - if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || req_op(req) == REQ_OP_READ)) - goto out; + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) goto out; + + if (!ublk_support_user_copy(ubq)) { + /* + * COMMIT_AND_FETCH_REQ has to provide IO buffer if + * NEED GET DATA is not enabled or it is Read IO. + */ + if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || + req_op(req) == REQ_OP_READ)) + goto out; + } ublk_fill_io_cmd(io, cmd, ub_cmd->addr); ublk_commit_completion(ub, ub_cmd); break; @@ -2005,6 +2037,10 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | UBLK_F_URING_CMD_COMP_IN_TASK; + /* GET_DATA isn't needed any more with USER_COPY */ + if (ub->dev_info.flags & UBLK_F_USER_COPY) + ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; + /* We are not ready to support zero copy */ ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index c0c1632c671e..54b5b0aeefca 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -165,6 +165,9 @@ /* use ioctl encoding for uring command */ #define UBLK_F_CMD_IOCTL_ENCODE (1UL << 6) +/* Copy between request and user buffer by pread()/pwrite() */ +#define UBLK_F_USER_COPY (1UL << 7) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 -- Gitee From 245996fbd1033f808fa2b0d6ef7fb052e0a4f217 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 20 May 2023 23:11:34 +0800 Subject: [PATCH 38/63] ublk: fix build warning on iov_iter_get_pages2 ANBZ: #32233 commit b8b637d770ef7aa9bc3971670cc8532b1f0d757e upstream. Return type of iov_iter_get_pages2() is ssize_t instead of size_t, so fix it. Fixes: 981f95a571e3 ("ublk: cleanup ublk_copy_user_pages") Reported-by: kernel test robot Reported-by: Julia Lawall Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230520151134.459679-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index c3a7e3384a91..34fba1dc2c06 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -564,7 +564,8 @@ static size_t ublk_copy_user_pages(const struct request *req, while (iov_iter_count(uiter) && iter.bio) { unsigned nr_pages; - size_t len, off; + ssize_t len; + size_t off; int i; len = iov_iter_get_pages(uiter, iter.pages, -- Gitee From c918365920b4d28723bdd1b48a2dc5681428ff6f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 3 Jun 2023 12:06:01 +0800 Subject: [PATCH 39/63] ublk: add control command of UBLK_U_CMD_GET_FEATURES ANBZ: #32233 commit b5bbc52fd01278642773818642288999a0236cb6 upstream. Add control command of UBLK_U_CMD_GET_FEATURES for returning driver's feature set or capability. This way can simplify userspace for maintaining compatibility because userspace doesn't need to send command to one device for querying driver feature set any more. Such as, with the queried feature set, userspace can choose to use: - UBLK_CMD_GET_DEV_INFO2 or UBLK_CMD_GET_DEV_INFO, - UBLK_U_CMD_* or UBLK_CMD_* Userspace code: https://github.com/ming1/ubdsrv/commits/features-cmd Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230603040601.775227-1-ming.lei@redhat.com Signed-off-by: Jens Axboe [joe: get ublksrv_ctrl_cmd directly from io_uring_cmd] Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 21 +++++++++++++++++++++ include/uapi/linux/ublk_cmd.h | 8 ++++++++ 2 files changed, 29 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 34fba1dc2c06..53a1a2b74f9e 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2354,6 +2354,21 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, return ret; } +static int ublk_ctrl_get_features(struct io_uring_cmd *cmd) +{ + const struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + void __user *argp = (void __user *)(unsigned long)header->addr; + u64 features = UBLK_F_ALL & ~UBLK_F_SUPPORT_ZERO_COPY; + + if (header->len != UBLK_FEATURES_LEN || !header->addr) + return -EINVAL; + + if (copy_to_user(argp, &features, UBLK_FEATURES_LEN)) + return -EFAULT; + + return 0; +} + /* * All control commands are sent via /dev/ublk-control, so we have to check * the destination device's permission @@ -2433,6 +2448,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, case UBLK_CMD_GET_DEV_INFO2: case UBLK_CMD_GET_QUEUE_AFFINITY: case UBLK_CMD_GET_PARAMS: + case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)): mask = MAY_READ; break; case UBLK_CMD_START_DEV: @@ -2482,6 +2498,11 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, if (ret) goto out; + if (cmd_op == UBLK_U_CMD_GET_FEATURES) { + ret = ublk_ctrl_get_features(cmd); + goto out; + } + if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) { ret = -ENODEV; ub = ublk_get_device_from_id(header->dev_id); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 54b5b0aeefca..4b8558db90e1 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -47,6 +47,14 @@ _IOWR('u', UBLK_CMD_END_USER_RECOVERY, struct ublksrv_ctrl_cmd) #define UBLK_U_CMD_GET_DEV_INFO2 \ _IOR('u', UBLK_CMD_GET_DEV_INFO2, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_GET_FEATURES \ + _IOR('u', 0x13, struct ublksrv_ctrl_cmd) + +/* + * 64bits are enough now, and it should be easy to extend in case of + * running out of feature flags + */ +#define UBLK_FEATURES_LEN 8 /* * IO commands, issued by ublk server, and handled by ublk driver. -- Gitee From 406e5435c9057e9f136d41dcb044ca9603bcca5e Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 20 Jun 2023 20:01:32 +0200 Subject: [PATCH 40/63] ublk: make ublk_chr_class a static const structure ANBZ: #32233 commit 2eefd399d28a52739fdbeebe84775275f016171c upstream. Now that the driver core allows for struct class to be in read-only memory, move the ublk_chr_class structure to be declared at build time placing it into read-only memory, instead of having to be dynamically allocated at boot time. Cc: Ming Lei Cc: Jens Axboe Cc: linux-block@vger.kernel.org Suggested-by: Greg Kroah-Hartman Signed-off-by: Ivan Orlov Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20230620180129.645646-7-gregkh@linuxfoundation.org Signed-off-by: Jens Axboe [joe: eliminate const since struce device->class is non-const] Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 53a1a2b74f9e..cea56357937c 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -189,7 +189,9 @@ static inline void __ublk_complete_rq(struct request *req); static void ublk_complete_rq(struct kref *ref); static dev_t ublk_chr_devt; -static struct class *ublk_chr_class; +static struct class ublk_chr_class = { + .name = "ublk-char", +}; static DEFINE_IDR(ublk_index_idr); static DEFINE_SPINLOCK(ublk_idr_lock); @@ -1761,7 +1763,7 @@ static int ublk_add_chdev(struct ublk_device *ub) dev->parent = ublk_misc.this_device; dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor); - dev->class = ublk_chr_class; + dev->class = &ublk_chr_class; dev->release = ublk_cdev_rel; device_initialize(dev); @@ -2591,11 +2593,10 @@ static int __init ublk_init(void) if (ret) goto unregister_mis; - ublk_chr_class = class_create(THIS_MODULE, "ublk-char"); - if (IS_ERR(ublk_chr_class)) { - ret = PTR_ERR(ublk_chr_class); + ret = class_register(&ublk_chr_class); + if (ret) goto free_chrdev_region; - } + return 0; free_chrdev_region: @@ -2613,7 +2614,7 @@ static void __exit ublk_exit(void) idr_for_each_entry(&ublk_index_idr, ub, id) ublk_remove(ub); - class_destroy(ublk_chr_class); + class_unregister(&ublk_chr_class); misc_deregister(&ublk_misc); idr_destroy(&ublk_index_idr); -- Gitee From 5795a70d47869673cfae60f7e98be978d794286a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 26 Jul 2023 22:45:00 +0800 Subject: [PATCH 41/63] ublk: fail to start device if queue setup is interrupted ANBZ: #32233 commit 53e7d08f6d6e214c40db1f51291bb2975c789dc2 upstream. In ublk_ctrl_start_dev(), if wait_for_completion_interruptible() is interrupted by signal, queues aren't setup successfully yet, so we have to fail UBLK_CMD_START_DEV, otherwise kernel oops can be triggered. Reported by German when working on qemu-storage-deamon which requires single thread ublk daemon. Fixes: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver") Reported-by: German Maglione Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230726144502.566785-2-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index cea56357937c..5bc4854ae71b 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1849,7 +1849,8 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) if (ublksrv_pid <= 0) return -EINVAL; - wait_for_completion_interruptible(&ub->completion); + if (wait_for_completion_interruptible(&ub->completion) != 0) + return -EINTR; schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD); -- Gitee From 15d586ba336e748ab4a5308b60d3eef8cdc15ae2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 26 Jul 2023 22:45:01 +0800 Subject: [PATCH 42/63] ublk: fail to recover device if queue setup is interrupted ANBZ: #32233 commit 0c0cbd4ebc375ceebc75c89df04b74f215fab23a upstream. In ublk_ctrl_end_recovery(), if wait_for_completion_interruptible() is interrupted by signal, queues aren't setup successfully yet, so we have to fail UBLK_CMD_END_USER_RECOVERY, otherwise kernel oops can be triggered. Fixes: c732a852b419 ("ublk_drv: add START_USER_RECOVERY and END_USER_RECOVERY support") Reported-by: Stefano Garzarella Signed-off-by: Ming Lei Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20230726144502.566785-3-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 5bc4854ae71b..c542e7655d84 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2330,7 +2330,9 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n", __func__, ub->dev_info.nr_hw_queues, header->dev_id); /* wait until new ubq_daemon sending all FETCH_REQ */ - wait_for_completion_interruptible(&ub->completion); + if (wait_for_completion_interruptible(&ub->completion)) + return -EINTR; + pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n", __func__, ub->dev_info.nr_hw_queues, header->dev_id); -- Gitee From 1676bb859fdca931ad42117098de6cbc92710b3e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 26 Jul 2023 22:45:02 +0800 Subject: [PATCH 43/63] ublk: return -EINTR if breaking from waiting for existed users in DEL_DEV ANBZ: #32233 commit 3e9dce80dbf91972aed972c743f539c396a34312 upstream. If user interrupts wait_event_interruptible() in ublk_ctrl_del_dev(), return -EINTR and let user know what happens. Fixes: 0abe39dec065 ("block: ublk: improve handling device deletion") Reported-by: Stefano Garzarella Signed-off-by: Ming Lei Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20230726144502.566785-4-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index c542e7655d84..7c092c551afa 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2130,8 +2130,8 @@ static int ublk_ctrl_del_dev(struct ublk_device **p_ub) * - the device number is freed already, we will not find this * device via ublk_get_device_from_id() */ - wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)); - + if (wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx))) + return -EINTR; return 0; } -- Gitee From bfcdbd615027b2ad77b1b2841deba0574b999b4a Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Fri, 4 Aug 2023 13:46:08 +0200 Subject: [PATCH 44/63] ublk: add helper to check if device supports user copy ANBZ: #32233 commit 9d4ed6d46272bb60036a6539aae74eafcbbb3d2d upstream. This will be used by ublk zoned storage support. Signed-off-by: Andreas Hindborg Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230804114610.179530-2-nmi@metaspace.dk Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 7c092c551afa..9c3257ee5332 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -185,6 +185,11 @@ struct ublk_params_header { __u32 types; }; +static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_USER_COPY; +} + static inline void __ublk_complete_rq(struct request *req); static void ublk_complete_rq(struct kref *ref); @@ -2042,7 +2047,7 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) UBLK_F_URING_CMD_COMP_IN_TASK; /* GET_DATA isn't needed any more with USER_COPY */ - if (ub->dev_info.flags & UBLK_F_USER_COPY) + if (ublk_dev_is_user_copy(ub)) ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; /* We are not ready to support zero copy */ -- Gitee From 2199c7246b900d7c6ab702a9643feb118deee3d6 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Fri, 4 Aug 2023 13:46:09 +0200 Subject: [PATCH 45/63] ublk: move check for empty address field on command submission ANBZ: #32233 commit 1a6e88b9593b63ccdfe1d84e3f99dd91e4f8d490 upstream. In preparation for zoned storage support, move the check for empty `addr` field into the command handler case statement. Note that the check makes no sense for `UBLK_IO_NEED_GET_DATA` because the `addr` field must always be set for this command. Signed-off-by: Andreas Hindborg Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230804114610.179530-3-nmi@metaspace.dk Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 9c3257ee5332..4837d3d58f9c 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1420,11 +1420,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) goto out; - if (ublk_support_user_copy(ubq) && ub_cmd->addr) { - ret = -EINVAL; - goto out; - } - ret = ublk_check_cmd_op(cmd_op); if (ret) goto out; @@ -1451,6 +1446,10 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, */ if (!ub_cmd->addr && !ublk_need_get_data(ubq)) goto out; + } else if (ub_cmd->addr) { + /* User copy requires addr to be unset */ + ret = -EINVAL; + goto out; } ublk_fill_io_cmd(io, cmd, ub_cmd->addr); @@ -1470,7 +1469,12 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || req_op(req) == REQ_OP_READ)) goto out; + } else if (ub_cmd->addr) { + /* User copy requires addr to be unset */ + ret = -EINVAL; + goto out; } + ublk_fill_io_cmd(io, cmd, ub_cmd->addr); ublk_commit_completion(ub, ub_cmd); break; -- Gitee From 4123d2c91b4866bba03509f38e13aff4d1940e0a Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Fri, 4 Aug 2023 13:46:10 +0200 Subject: [PATCH 46/63] ublk: enable zoned storage support ANBZ: #32233 commit 29802d7ca33bc0a75c9da2a143eeed4f9e99fca4 upstream. Add zoned storage support to ublk: report_zones and operations: - REQ_OP_ZONE_OPEN - REQ_OP_ZONE_CLOSE - REQ_OP_ZONE_FINISH - REQ_OP_ZONE_RESET - REQ_OP_ZONE_APPEND The zone append feature uses the `addr` field of `struct ublksrv_io_cmd` to communicate ALBA back to the kernel. Therefore ublk must be used with the user copy feature (UBLK_F_USER_COPY) for zoned storage support to be available. Without this feature, ublk will not allow zoned storage support. Signed-off-by: Andreas Hindborg Reviewed-by: Ming Lei Tested-by: Ming Lei Link: https://lore.kernel.org/r/20230804114610.179530-4-nmi@metaspace.dk Signed-off-by: Jens Axboe [joe: fix conflicts for zone attributes related helpers] Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 335 ++++++++++++++++++++++++++++++++-- include/uapi/linux/ublk_cmd.h | 63 ++++++- 2 files changed, 376 insertions(+), 22 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 4837d3d58f9c..d726388b438c 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -55,16 +55,21 @@ | UBLK_F_USER_RECOVERY_REISSUE \ | UBLK_F_UNPRIVILEGED_DEV \ | UBLK_F_CMD_IOCTL_ENCODE \ - | UBLK_F_USER_COPY) + | UBLK_F_USER_COPY \ + | UBLK_F_ZONED) /* All UBLK_PARAM_TYPE_* should be included here */ -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \ - UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT) +#define UBLK_PARAM_TYPE_ALL \ + (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \ + UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED) struct ublk_rq_data { struct llist_node node; struct kref ref; + __u64 sector; + __u32 operation; + __u32 nr_zones; }; struct ublk_uring_cmd_pdu { @@ -185,11 +190,264 @@ struct ublk_params_header { __u32 types; }; +static inline unsigned int ublk_req_build_flags(struct request *req); +static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, + int tag); + static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) { return ub->dev_info.flags & UBLK_F_USER_COPY; } +static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_ZONED; +} + +static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_ZONED; +} + +#ifdef CONFIG_BLK_DEV_ZONED + +static int ublk_get_nr_zones(const struct ublk_device *ub) +{ + const struct ublk_param_basic *p = &ub->params.basic; + + /* Zone size is a power of 2 */ + return p->dev_sectors >> ilog2(p->chunk_sectors); +} + +static int ublk_revalidate_disk_zones(struct ublk_device *ub) +{ + return blk_revalidate_disk_zones(ub->ub_disk, NULL); +} + +static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) +{ + const struct ublk_param_zoned *p = &ub->params.zoned; + int nr_zones; + + if (!ublk_dev_is_zoned(ub)) + return -EINVAL; + + if (!p->max_zone_append_sectors) + return -EINVAL; + + nr_zones = ublk_get_nr_zones(ub); + + if (p->max_active_zones > nr_zones) + return -EINVAL; + + if (p->max_open_zones > nr_zones) + return -EINVAL; + + return 0; +} + +static int ublk_dev_param_zoned_apply(struct ublk_device *ub) +{ + const struct ublk_param_zoned *p = &ub->params.zoned; + + blk_queue_set_zoned(ub->ub_disk, BLK_ZONED_HM); + blk_queue_required_elevator_features(ub->ub_disk->queue, + ELEVATOR_F_ZBD_SEQ_WRITE); + blk_queue_max_active_zones(ub->ub_disk->queue, p->max_active_zones); + blk_queue_max_open_zones(ub->ub_disk->queue, p->max_open_zones); + blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors); + + ub->ub_disk->queue->nr_zones = ublk_get_nr_zones(ub); + + return 0; +} + +/* Based on virtblk_alloc_report_buffer */ +static void *ublk_alloc_report_buffer(struct ublk_device *ublk, + unsigned int nr_zones, size_t *buflen) +{ + struct request_queue *q = ublk->ub_disk->queue; + size_t bufsize; + void *buf; + + nr_zones = min_t(unsigned int, nr_zones, + ublk->ub_disk->queue->nr_zones); + + bufsize = nr_zones * sizeof(struct blk_zone); + bufsize = + min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT); + + while (bufsize >= sizeof(struct blk_zone)) { + buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY); + if (buf) { + *buflen = bufsize; + return buf; + } + bufsize >>= 1; + } + + *buflen = 0; + return NULL; +} + +static int ublk_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct ublk_device *ub = disk->private_data; + unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors; + unsigned int first_zone = sector >> ilog2(zone_size_sectors); + unsigned int done_zones = 0; + unsigned int max_zones_per_request; + int ret; + struct blk_zone *buffer; + size_t buffer_length; + + nr_zones = min_t(unsigned int, ub->ub_disk->queue->nr_zones - first_zone, + nr_zones); + + buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length); + if (!buffer) + return -ENOMEM; + + max_zones_per_request = buffer_length / sizeof(struct blk_zone); + + while (done_zones < nr_zones) { + unsigned int remaining_zones = nr_zones - done_zones; + unsigned int zones_in_request = + min_t(unsigned int, remaining_zones, max_zones_per_request); + struct request *req; + struct ublk_rq_data *pdu; + blk_status_t status; + unsigned int i; + + memset(buffer, 0, buffer_length); + + req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + pdu = blk_mq_rq_to_pdu(req); + pdu->operation = UBLK_IO_OP_REPORT_ZONES; + pdu->sector = sector; + pdu->nr_zones = zones_in_request; + + ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length, + GFP_KERNEL); + if (ret) { + blk_mq_free_request(req); + goto out; + } + + status = blk_execute_rq(disk, req, 0); + ret = blk_status_to_errno(status); + blk_mq_free_request(req); + if (ret) + goto out; + + for (i = 0; i < zones_in_request; i++) { + struct blk_zone *zone = buffer + i; + + /* A zero length zone means no more zones in this response */ + if (!zone->len) + break; + + ret = cb(zone, i, data); + if (ret) + goto out; + + done_zones++; + sector += zone_size_sectors; + + } + } + + ret = done_zones; + +out: + kvfree(buffer); + return ret; +} + +static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, + struct request *req) +{ + struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); + struct ublk_io *io = &ubq->ios[req->tag]; + struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req); + u32 ublk_op; + + switch (req_op(req)) { + case REQ_OP_ZONE_OPEN: + ublk_op = UBLK_IO_OP_ZONE_OPEN; + break; + case REQ_OP_ZONE_CLOSE: + ublk_op = UBLK_IO_OP_ZONE_CLOSE; + break; + case REQ_OP_ZONE_FINISH: + ublk_op = UBLK_IO_OP_ZONE_FINISH; + break; + case REQ_OP_ZONE_RESET: + ublk_op = UBLK_IO_OP_ZONE_RESET; + break; + case REQ_OP_ZONE_APPEND: + ublk_op = UBLK_IO_OP_ZONE_APPEND; + break; + case REQ_OP_DRV_IN: + ublk_op = pdu->operation; + switch (ublk_op) { + case UBLK_IO_OP_REPORT_ZONES: + iod->op_flags = ublk_op | ublk_req_build_flags(req); + iod->nr_zones = pdu->nr_zones; + iod->start_sector = pdu->sector; + return BLK_STS_OK; + default: + return BLK_STS_IOERR; + } + case REQ_OP_ZONE_RESET_ALL: + case REQ_OP_DRV_OUT: + /* We do not support reset_all and drv_out */ + return BLK_STS_NOTSUPP; + default: + return BLK_STS_IOERR; + } + + iod->op_flags = ublk_op | ublk_req_build_flags(req); + iod->nr_sectors = blk_rq_sectors(req); + iod->start_sector = blk_rq_pos(req); + iod->addr = io->addr; + + return BLK_STS_OK; +} + +#else + +#define ublk_report_zones (NULL) + +static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) +{ + return -EOPNOTSUPP; +} + +static int ublk_dev_param_zoned_apply(struct ublk_device *ub) +{ + return -EOPNOTSUPP; +} + +static int ublk_revalidate_disk_zones(struct ublk_device *ub) +{ + return 0; +} + +static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, + struct request *req) +{ + return -EOPNOTSUPP; +} + +#endif + static inline void __ublk_complete_rq(struct request *req); static void ublk_complete_rq(struct kref *ref); @@ -286,6 +544,9 @@ static int ublk_validate_params(const struct ublk_device *ub) if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9)) return -EINVAL; + + if (ublk_dev_is_zoned(ub) && !p->chunk_sectors) + return -EINVAL; } else return -EINVAL; @@ -304,6 +565,11 @@ static int ublk_validate_params(const struct ublk_device *ub) if (ub->params.types & UBLK_PARAM_TYPE_DEVT) return -EINVAL; + if (ub->params.types & UBLK_PARAM_TYPE_ZONED) + return ublk_dev_param_zoned_validate(ub); + else if (ublk_dev_is_zoned(ub)) + return -EINVAL; + return 0; } @@ -317,6 +583,9 @@ static int ublk_apply_params(struct ublk_device *ub) if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) ublk_dev_param_discard_apply(ub); + if (ub->params.types & UBLK_PARAM_TYPE_ZONED) + return ublk_dev_param_zoned_apply(ub); + return 0; } @@ -487,6 +756,7 @@ static const struct block_device_operations ub_fops = { .owner = THIS_MODULE, .open = ublk_open, .free_disk = ublk_free_disk, + .report_zones = ublk_report_zones, }; #define UBLK_MAX_PIN_PAGES 32 @@ -603,7 +873,8 @@ static inline bool ublk_need_map_req(const struct request *req) static inline bool ublk_need_unmap_req(const struct request *req) { - return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ; + return ublk_rq_has_data(req) && + (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN); } static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, @@ -687,8 +958,13 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) { struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); struct ublk_io *io = &ubq->ios[req->tag]; + enum req_opf op = req_op(req); u32 ublk_op; + if (!ublk_queue_is_zoned(ubq) && + (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND)) + return -EIO; + switch (req_op(req)) { case REQ_OP_READ: ublk_op = UBLK_IO_OP_READ; @@ -706,6 +982,8 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) ublk_op = UBLK_IO_OP_WRITE_ZEROES; break; default: + if (ublk_queue_is_zoned(ubq)) + return ublk_setup_iod_zoned(ubq, req); return BLK_STS_IOERR; } @@ -758,7 +1036,8 @@ static inline void __ublk_complete_rq(struct request *req) * * Both the two needn't unmap. */ - if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE && + req_op(req) != REQ_OP_DRV_IN) goto exit; /* for READ request, writing data in iod->addr to rq buffers */ @@ -1119,6 +1398,9 @@ static void ublk_commit_completion(struct ublk_device *ub, /* find the io request and complete */ req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); + if (req_op(req) == REQ_OP_ZONE_APPEND) + req->__sector = ub_cmd->zone_append_lba; + if (req && likely(!blk_should_fake_timeout(req->q))) ublk_put_req_ref(ubq, req); } @@ -1469,8 +1751,11 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || req_op(req) == REQ_OP_READ)) goto out; - } else if (ub_cmd->addr) { - /* User copy requires addr to be unset */ + } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) { + /* + * User copy requires addr to be unset when command is + * not zone append + */ ret = -EINVAL; goto out; } @@ -1547,11 +1832,14 @@ static inline bool ublk_check_ubuf_dir(const struct request *req, int ubuf_dir) { /* copy ubuf to request pages */ - if (req_op(req) == REQ_OP_READ && ubuf_dir == WRITE) + if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) && + ubuf_dir == WRITE) return true; /* copy request pages to ubuf */ - if (req_op(req) == REQ_OP_WRITE && ubuf_dir == READ) + if ((req_op(req) == REQ_OP_WRITE || + req_op(req) == REQ_OP_ZONE_APPEND) && + ubuf_dir == READ) return true; return false; @@ -1890,10 +2178,8 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) ub->ub_disk = disk; ret = ublk_apply_params(ub); - if (ret) { - put_disk(disk); - goto out_cleanup_queue; - } + if (ret) + goto out_put_disk; /* don't probe partitions if any one ubq daemon is un-trusted */ if (ub->nr_privileged_daemon != ub->nr_queues_ready) @@ -1901,8 +2187,24 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) get_device(&ub->cdev_dev); ub->dev_info.state = UBLK_S_DEV_LIVE; + + if (ublk_dev_is_zoned(ub)) { + ret = ublk_revalidate_disk_zones(ub); + if (ret) + goto out_put_cdev; + } + add_disk(ub->ub_disk); set_bit(UB_STATE_USED, &ub->state); + +out_put_cdev: + if (ret) { + ub->dev_info.state = UBLK_S_DEV_DEAD; + ublk_put_device(ub); + } +out_put_disk: + if (ret) + put_disk(disk); out_cleanup_queue: if (ret) blk_cleanup_queue(ub->ub_queue); @@ -2054,6 +2356,13 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) if (ublk_dev_is_user_copy(ub)) ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; + /* Zoned storage support requires user copy feature */ + if (ublk_dev_is_zoned(ub) && + (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) { + ret = -EINVAL; + goto out_free_dev_number; + } + /* We are not ready to support zero copy */ ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 4b8558db90e1..2685e53e4752 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -176,6 +176,12 @@ /* Copy between request and user buffer by pread()/pwrite() */ #define UBLK_F_USER_COPY (1UL << 7) +/* + * User space sets this flag when setting up the device to request zoned storage support. Kernel may + * deny the request by returning an error. + */ +#define UBLK_F_ZONED (1ULL << 8) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 @@ -232,9 +238,26 @@ struct ublksrv_ctrl_dev_info { #define UBLK_IO_OP_READ 0 #define UBLK_IO_OP_WRITE 1 #define UBLK_IO_OP_FLUSH 2 -#define UBLK_IO_OP_DISCARD 3 -#define UBLK_IO_OP_WRITE_SAME 4 -#define UBLK_IO_OP_WRITE_ZEROES 5 +#define UBLK_IO_OP_DISCARD 3 +#define UBLK_IO_OP_WRITE_SAME 4 +#define UBLK_IO_OP_WRITE_ZEROES 5 +#define UBLK_IO_OP_ZONE_OPEN 10 +#define UBLK_IO_OP_ZONE_CLOSE 11 +#define UBLK_IO_OP_ZONE_FINISH 12 +#define UBLK_IO_OP_ZONE_APPEND 13 +#define UBLK_IO_OP_ZONE_RESET 15 +/* + * Construct a zone report. The report request is carried in `struct + * ublksrv_io_desc`. The `start_sector` field must be the first sector of a zone + * and shall indicate the first zone of the report. The `nr_zones` shall + * indicate how many zones should be reported at most. The report shall be + * delivered as a `struct blk_zone` array. To report fewer zones than requested, + * zero the last entry of the returned array. + * + * Related definitions(blk_zone, blk_zone_cond, blk_zone_type, ...) in + * include/uapi/linux/blkzoned.h are part of ublk UAPI. + */ +#define UBLK_IO_OP_REPORT_ZONES 18 #define UBLK_IO_F_FAILFAST_DEV (1U << 8) #define UBLK_IO_F_FAILFAST_TRANSPORT (1U << 9) @@ -255,7 +278,10 @@ struct ublksrv_io_desc { /* op: bit 0-7, flags: bit 8-31 */ __u32 op_flags; - __u32 nr_sectors; + union { + __u32 nr_sectors; + __u32 nr_zones; /* for UBLK_IO_OP_REPORT_ZONES */ + }; /* start sector for this io */ __u64 start_sector; @@ -284,11 +310,21 @@ struct ublksrv_io_cmd { /* io result, it is valid for COMMIT* command only */ __s32 result; - /* - * userspace buffer address in ublksrv daemon process, valid for - * FETCH* command only - */ - __u64 addr; + union { + /* + * userspace buffer address in ublksrv daemon process, valid for + * FETCH* command only + * + * `addr` should not be used when UBLK_F_USER_COPY is enabled, + * because userspace handles data copy by pread()/pwrite() over + * /dev/ublkcN. But in case of UBLK_F_ZONED, this union is + * re-used to pass back the allocated LBA for + * UBLK_IO_OP_ZONE_APPEND which actually depends on + * UBLK_F_USER_COPY + */ + __u64 addr; + __u64 zone_append_lba; + }; }; struct ublk_param_basic { @@ -331,6 +367,13 @@ struct ublk_param_devt { __u32 disk_minor; }; +struct ublk_param_zoned { + __u32 max_open_zones; + __u32 max_active_zones; + __u32 max_zone_append_sectors; + __u8 reserved[20]; +}; + struct ublk_params { /* * Total length of parameters, userspace has to set 'len' for both @@ -342,11 +385,13 @@ struct ublk_params { #define UBLK_PARAM_TYPE_BASIC (1 << 0) #define UBLK_PARAM_TYPE_DISCARD (1 << 1) #define UBLK_PARAM_TYPE_DEVT (1 << 2) +#define UBLK_PARAM_TYPE_ZONED (1 << 3) __u32 types; /* types of parameter included */ struct ublk_param_basic basic; struct ublk_param_discard discard; struct ublk_param_devt devt; + struct ublk_param_zoned zoned; }; #endif -- Gitee From 7f614340f7e16187b0eeac359ad18993c014411f Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 10 Aug 2023 16:48:36 +0800 Subject: [PATCH 47/63] ublk: Fix signedness bug returning warning ANBZ: #32233 commit c8659bbb15cd42577a9b16a23b527436b028c8b2 upstream. There are two warnings reported by smatch: drivers/block/ublk_drv.c:445 ublk_setup_iod_zoned() warn: signedness bug returning '(-95)' drivers/block/ublk_drv.c:963 ublk_setup_iod() warn: signedness bug returning '(-5)' The type of "blk_status_t" is either be a u32 or u8, but this two functions return a negative value when not supported or failed. Use the error code of the blk module to fix these warnings. Fixes: 29802d7ca33b ("ublk: enable zoned storage support") Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202308100201.TCRhgdvN-lkp@intel.com/ Signed-off-by: Li Zetao Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230810084836.3535322-1-lizetao1@huawei.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index d726388b438c..118c76b030d3 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -443,7 +443,7 @@ static int ublk_revalidate_disk_zones(struct ublk_device *ub) static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, struct request *req) { - return -EOPNOTSUPP; + return BLK_STS_NOTSUPP; } #endif @@ -963,7 +963,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) if (!ublk_queue_is_zoned(ubq) && (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND)) - return -EIO; + return BLK_STS_IOERR; switch (req_op(req)) { case REQ_OP_READ: -- Gitee From b63454b48b78ad780edb07c4f1b8c3ccf9d5af1e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 11 Aug 2023 21:52:16 +0800 Subject: [PATCH 48/63] ublk: fix 'warn: variable dereferenced before check 'req'' from Smatch ANBZ: #32233 commit e24721e441a7c640e4e7b2b63c23c06d9a750880 upstream. The added check of 'req_op(req) == REQ_OP_ZONE_APPEND' should have been done after the request is confirmed as valid. Actually here, the request should always been true, so add one WARN_ON_ONCE(!req), meantime move the zone_append check after checking the request. Cc: Andreas Hindborg Reported-by: Dan Carpenter Fixes: 29802d7ca33b ("ublk: enable zoned storage support") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230811135216.420404-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 118c76b030d3..73a0de84fbb7 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1397,11 +1397,13 @@ static void ublk_commit_completion(struct ublk_device *ub, /* find the io request and complete */ req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); + if (WARN_ON_ONCE(unlikely(!req))) + return; if (req_op(req) == REQ_OP_ZONE_APPEND) req->__sector = ub_cmd->zone_append_lba; - if (req && likely(!blk_should_fake_timeout(req->q))) + if (likely(!blk_should_fake_timeout(req->q))) ublk_put_req_ref(ubq, req); } -- Gitee From dd14e6478a22e35c9c151c3e837f7f85dce7433d Mon Sep 17 00:00:00 2001 From: Ruan Jinjie Date: Tue, 15 Aug 2023 19:48:14 +0800 Subject: [PATCH 49/63] ublk: Switch to memdup_user_nul() helper ANBZ: #32233 commit 66a6a5d0ec852eaced589da066376e69397cd71e upstream. Use memdup_user_nul() helper instead of open-coding to simplify the code. Signed-off-by: Ruan Jinjie Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230815114815.1551171-1-ruanjinjie@huawei.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 73a0de84fbb7..0b0c5f3bbc22 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2758,14 +2758,9 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, if (header->len < header->dev_path_len) return -EINVAL; - dev_path = kmalloc(header->dev_path_len + 1, GFP_KERNEL); - if (!dev_path) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(dev_path, argp, header->dev_path_len)) - goto exit; - dev_path[header->dev_path_len] = 0; + dev_path = memdup_user_nul(argp, header->dev_path_len); + if (IS_ERR(dev_path)) + return PTR_ERR(dev_path); ret = -EINVAL; switch (_IOC_NR(cmd->cmd_op)) { -- Gitee From e6faf45416eb500636c3c64902cab23e13df947f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 10 Aug 2023 20:43:26 +0800 Subject: [PATCH 50/63] ublk: zoned: support REQ_OP_ZONE_RESET_ALL ANBZ: #32233 commit 851e06297f20bbd85c93bbf09469f2150d1db218 upstream. There isn't any reason to not support REQ_OP_ZONE_RESET_ALL given everything is actually handled in userspace, not mention it is pretty easy to support RESET_ALL. So enable REQ_OP_ZONE_RESET_ALL and let userspace handle it. Verified by 'tools/zbc_reset_zone -all /dev/ublkb0' in libzbc[1] with libublk-rs based ublk-zoned target prototype[2], follows command line for creating ublk-zoned: cargo run --example zoned -- add -1 1024 # add $dev_id $DEV_SIZE [1] https://github.com/westerndigitalcorporation/libzbc [2] https://github.com/ming1/libublk-rs/tree/zoned.v2 Cc: Niklas Cassel Cc: Damien Le Moal Cc: Andreas Hindborg Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230810124326.321472-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 7 +++++-- include/uapi/linux/ublk_cmd.h | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 0b0c5f3bbc22..b8037c21b614 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -251,6 +251,7 @@ static int ublk_dev_param_zoned_apply(struct ublk_device *ub) const struct ublk_param_zoned *p = &ub->params.zoned; blk_queue_set_zoned(ub->ub_disk, BLK_ZONED_HM); + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue); blk_queue_required_elevator_features(ub->ub_disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE); blk_queue_max_active_zones(ub->ub_disk->queue, p->max_active_zones); @@ -394,6 +395,9 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, case REQ_OP_ZONE_APPEND: ublk_op = UBLK_IO_OP_ZONE_APPEND; break; + case REQ_OP_ZONE_RESET_ALL: + ublk_op = UBLK_IO_OP_ZONE_RESET_ALL; + break; case REQ_OP_DRV_IN: ublk_op = pdu->operation; switch (ublk_op) { @@ -405,9 +409,8 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, default: return BLK_STS_IOERR; } - case REQ_OP_ZONE_RESET_ALL: case REQ_OP_DRV_OUT: - /* We do not support reset_all and drv_out */ + /* We do not support drv_out */ return BLK_STS_NOTSUPP; default: return BLK_STS_IOERR; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 2685e53e4752..b9cfc5c96268 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -245,6 +245,7 @@ struct ublksrv_ctrl_dev_info { #define UBLK_IO_OP_ZONE_CLOSE 11 #define UBLK_IO_OP_ZONE_FINISH 12 #define UBLK_IO_OP_ZONE_APPEND 13 +#define UBLK_IO_OP_ZONE_RESET_ALL 14 #define UBLK_IO_OP_ZONE_RESET 15 /* * Construct a zone report. The report request is carried in `struct -- Gitee From 31ed9756da35a776a951c2cc2768899413168d74 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 9 Oct 2023 17:33:18 +0800 Subject: [PATCH 51/63] ublk: move ublk_cancel_dev() out of ub->mutex ANBZ: #32233 commit 85248d670b71d9edda9459ee14fdc85c8e9632c0 upstream. ublk_cancel_dev() just calls ublk_cancel_queue() to cancel all pending io commands after ublk request queue is idle. The only protection is just the read & write of ubq->nr_io_ready and avoid duplicated command cancel, so add one per-queue lock with cancel flag for providing this protection, meantime move ublk_cancel_dev() out of ub->mutex. Then we needn't to call io_uring_cmd_complete_in_task() to cancel pending command. And the same cancel logic will be re-used for cancelable uring command. This patch basically reverts commit ac5902f84bb5 ("ublk: fix AB-BA lockdep warning"). Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20231009093324.957829-4-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index b8037c21b614..764bf99ece1a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -114,6 +114,9 @@ struct ublk_uring_cmd_pdu { */ #define UBLK_IO_FLAG_NEED_GET_DATA 0x08 +/* atomic RW with ubq->cancel_lock */ +#define UBLK_IO_FLAG_CANCELED 0x80000000 + struct ublk_io { /* userspace buffer address from io cmd */ __u64 addr; @@ -138,6 +141,7 @@ struct ublk_queue { bool force_abort; bool timeout; unsigned short nr_io_ready; /* how many ios setup */ + spinlock_t cancel_lock; struct ublk_device *dev; struct ublk_io ios[]; }; @@ -1477,28 +1481,27 @@ static inline bool ublk_queue_ready(struct ublk_queue *ubq) return ubq->nr_io_ready == ubq->q_depth; } -static void ublk_cmd_cancel_cb(struct io_uring_cmd *cmd) -{ - io_uring_cmd_done(cmd, UBLK_IO_RES_ABORT, 0); -} - static void ublk_cancel_queue(struct ublk_queue *ubq) { int i; - if (!ublk_queue_ready(ubq)) - return; - for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; - if (io->flags & UBLK_IO_FLAG_ACTIVE) - io_uring_cmd_complete_in_task(io->cmd, - ublk_cmd_cancel_cb); - } + if (io->flags & UBLK_IO_FLAG_ACTIVE) { + bool done; - /* all io commands are canceled */ - ubq->nr_io_ready = 0; + spin_lock(&ubq->cancel_lock); + done = !!(io->flags & UBLK_IO_FLAG_CANCELED); + if (!done) + io->flags |= UBLK_IO_FLAG_CANCELED; + spin_unlock(&ubq->cancel_lock); + + if (!done) + io_uring_cmd_done(io->cmd, + UBLK_IO_RES_ABORT, 0); + } + } } /* Cancel all pending commands, must be called after del_gendisk() returns */ @@ -1546,7 +1549,6 @@ static void __ublk_quiesce_dev(struct ublk_device *ub) blk_mq_quiesce_queue(ub->ub_disk->queue); ublk_wait_tagset_rqs_idle(ub); ub->dev_info.state = UBLK_S_DEV_QUIESCED; - ublk_cancel_dev(ub); /* we are going to release task_struct of ubq_daemon and resets * ->ubq_daemon to NULL. So in monitor_work, check on ubq_daemon causes UAF. * Besides, monitor_work is not necessary in QUIESCED state since we have @@ -1569,6 +1571,7 @@ static void ublk_quiesce_work_fn(struct work_struct *work) __ublk_quiesce_dev(ub); unlock: mutex_unlock(&ub->mutex); + ublk_cancel_dev(ub); } static void ublk_unquiesce_dev(struct ublk_device *ub) @@ -1609,8 +1612,8 @@ static void ublk_stop_dev(struct ublk_device *ub) put_disk(ub->ub_disk); ub->ub_disk = NULL; unlock: - ublk_cancel_dev(ub); mutex_unlock(&ub->mutex); + ublk_cancel_dev(ub); cancel_delayed_work_sync(&ub->monitor_work); } @@ -1964,6 +1967,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) void *ptr; int size; + spin_lock_init(&ubq->cancel_lock); ubq->flags = ub->dev_info.flags; ubq->q_id = q_id; ubq->q_depth = ub->dev_info.queue_depth; @@ -2581,8 +2585,9 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) int i; WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq))); + /* All old ioucmds have to be completed */ - WARN_ON_ONCE(ubq->nr_io_ready); + ubq->nr_io_ready = 0; /* old daemon is PF_EXITING, put it now */ put_task_struct(ubq->ubq_daemon); /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ -- Gitee From 82ec6592cd7e9bdb8841cd478df83aebc4ec18be Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 12 Aug 2024 09:36:24 +0800 Subject: [PATCH 52/63] ublk: move zone report data out of request pdu ANBZ: #32233 commit 9327b51c9a9c864f5177127e09851da9d78b4943 upstream. ublk zoned takes 16 bytes in each request pdu just for handling REPORT_ZONE operation, this way does waste memory since request pdu is allocated statically. Store the transient zone report data into one global xarray, and remove it after the report zone request is completed. This way is reasonable since report zone is run in slow code path. Fixes: 29802d7ca33b ("ublk: enable zoned storage support") Cc: Damien Le Moal Cc: Andreas Hindborg Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20240812013624.587587-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 62 +++++++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 764bf99ece1a..9004ddeb2f00 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -67,9 +67,6 @@ struct ublk_rq_data { struct llist_node node; struct kref ref; - __u64 sector; - __u32 operation; - __u32 nr_zones; }; struct ublk_uring_cmd_pdu { @@ -215,6 +212,33 @@ static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq) #ifdef CONFIG_BLK_DEV_ZONED +struct ublk_zoned_report_desc { + __u64 sector; + __u32 operation; + __u32 nr_zones; +}; + +static DEFINE_XARRAY(ublk_zoned_report_descs); + +static int ublk_zoned_insert_report_desc(const struct request *req, + struct ublk_zoned_report_desc *desc) +{ + return xa_insert(&ublk_zoned_report_descs, (unsigned long)req, + desc, GFP_KERNEL); +} + +static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc( + const struct request *req) +{ + return xa_erase(&ublk_zoned_report_descs, (unsigned long)req); +} + +static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc( + const struct request *req) +{ + return xa_load(&ublk_zoned_report_descs, (unsigned long)req); +} + static int ublk_get_nr_zones(const struct ublk_device *ub) { const struct ublk_param_basic *p = &ub->params.basic; @@ -321,7 +345,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector, unsigned int zones_in_request = min_t(unsigned int, remaining_zones, max_zones_per_request); struct request *req; - struct ublk_rq_data *pdu; + struct ublk_zoned_report_desc desc; blk_status_t status; unsigned int i; @@ -333,20 +357,23 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector, goto out; } - pdu = blk_mq_rq_to_pdu(req); - pdu->operation = UBLK_IO_OP_REPORT_ZONES; - pdu->sector = sector; - pdu->nr_zones = zones_in_request; + desc.operation = UBLK_IO_OP_REPORT_ZONES; + desc.sector = sector; + desc.nr_zones = zones_in_request; + ret = ublk_zoned_insert_report_desc(req, &desc); + if (ret) + goto free_req; ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length, GFP_KERNEL); - if (ret) { - blk_mq_free_request(req); - goto out; - } + if (ret) + goto erase_desc; status = blk_execute_rq(disk, req, 0); ret = blk_status_to_errno(status); +erase_desc: + ublk_zoned_erase_report_desc(req); +free_req: blk_mq_free_request(req); if (ret) goto out; @@ -380,7 +407,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, { struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); struct ublk_io *io = &ubq->ios[req->tag]; - struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req); + struct ublk_zoned_report_desc *desc; u32 ublk_op; switch (req_op(req)) { @@ -403,12 +430,15 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, ublk_op = UBLK_IO_OP_ZONE_RESET_ALL; break; case REQ_OP_DRV_IN: - ublk_op = pdu->operation; + desc = ublk_zoned_get_report_desc(req); + if (!desc) + return BLK_STS_IOERR; + ublk_op = desc->operation; switch (ublk_op) { case UBLK_IO_OP_REPORT_ZONES: iod->op_flags = ublk_op | ublk_req_build_flags(req); - iod->nr_zones = pdu->nr_zones; - iod->start_sector = pdu->sector; + iod->nr_zones = desc->nr_zones; + iod->start_sector = desc->sector; return BLK_STS_OK; default: return BLK_STS_IOERR; -- Gitee From 68d8776e5a7cca5d8e6873d9b21d0602b68bed39 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Oct 2024 21:48:47 +0800 Subject: [PATCH 53/63] ublk: don't allow user copy for unprivileged device ANBZ: #32233 commit 42aafd8b48adac1c3b20fe5892b1b91b80c1a1e6 upstream. UBLK_F_USER_COPY requires userspace to call write() on ublk char device for filling request buffer, and unprivileged device can't be trusted. So don't allow user copy for unprivileged device. Cc: stable@vger.kernel.org Fixes: 1172d5b8beca ("ublk: support user copy") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241016134847.2911721-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 11 ++++++++++- include/uapi/linux/ublk_cmd.h | 8 +++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 9004ddeb2f00..38f2660b4227 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2336,10 +2336,19 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) * TODO: provide forward progress for RECOVERY handler, so that * unprivileged device can benefit from it */ - if (info.flags & UBLK_F_UNPRIVILEGED_DEV) + if (info.flags & UBLK_F_UNPRIVILEGED_DEV) { info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE | UBLK_F_USER_RECOVERY); + /* + * For USER_COPY, we depends on userspace to fill request + * buffer by pwrite() to ublk char device, which can't be + * used for unprivileged device + */ + if (info.flags & UBLK_F_USER_COPY) + return -EINVAL; + } + /* the created device is always owned by current user */ ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index b9cfc5c96268..3830b428ecf1 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -173,7 +173,13 @@ /* use ioctl encoding for uring command */ #define UBLK_F_CMD_IOCTL_ENCODE (1UL << 6) -/* Copy between request and user buffer by pread()/pwrite() */ +/* + * Copy between request and user buffer by pread()/pwrite() + * + * Not available for UBLK_F_UNPRIVILEGED_DEV, otherwise userspace may + * deceive us by not filling request buffer, then kernel uninitialized + * data may be leaked. + */ #define UBLK_F_USER_COPY (1UL << 7) /* -- Gitee From 91e9c71ce4938a6397ff157d59e0d36990f9d661 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 11 Nov 2024 19:07:18 +0800 Subject: [PATCH 54/63] ublk: fix ublk_ch_mmap() for 64K page size ANBZ: #32233 commit d369735e02ef122d19d4c3d093028da0eb400636 upstream. In ublk_ch_mmap(), queue id is calculated in the following way: (vma->vm_pgoff << PAGE_SHIFT) / `max_cmd_buf_size` 'max_cmd_buf_size' is equal to `UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc)` and UBLK_MAX_QUEUE_DEPTH is 4096 and part of UAPI, so 'max_cmd_buf_size' is always page aligned in 4K page size kernel. However, it isn't true in 64K page size kernel. Fixes the issue by always rounding up 'max_cmd_buf_size' with PAGE_SIZE. Cc: stable@vger.kernel.org Fixes: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241111110718.1394001-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 38f2660b4227..01da7f2d7366 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -714,12 +714,21 @@ static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id) return ublk_get_queue(ub, q_id)->io_cmd_buf; } +static inline int __ublk_queue_cmd_buf_size(int depth) +{ + return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE); +} + static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id) { struct ublk_queue *ubq = ublk_get_queue(ub, q_id); - return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc), - PAGE_SIZE); + return __ublk_queue_cmd_buf_size(ubq->q_depth); +} + +static int ublk_max_cmd_buf_size(void) +{ + return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH); } static inline bool ublk_queue_can_use_recovery_reissue( @@ -1387,7 +1396,7 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) { struct ublk_device *ub = filp->private_data; size_t sz = vma->vm_end - vma->vm_start; - unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc); + unsigned max_sz = ublk_max_cmd_buf_size(); unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT; int q_id, ret = 0; -- Gitee From 9ec5c485ef1a0078783b40ba0b8dce5282d6ff58 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 19 Nov 2024 11:06:46 +0800 Subject: [PATCH 55/63] ublk: fix error code for unsupported command ANBZ: #32233 commit 34c1227035b3ab930a1ae6ab6f22fec1af8ab09e upstream. ENOTSUPP is for kernel use only, and shouldn't be sent to userspace. Fix it by replacing it with EOPNOTSUPP. Cc: stable@vger.kernel.org Fixes: bfbcef036396 ("ublk_drv: move ublk_get_device_from_id into ublk_ctrl_uring_cmd") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241119030646.2319030-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 01da7f2d7366..5cb8e53f524f 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2923,7 +2923,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_ctrl_end_recovery(ub, cmd); break; default: - ret = -ENOTSUPP; + ret = -EOPNOTSUPP; break; } -- Gitee From b3f0d1599a9590e62a90445ffd67e9234e5fd6d4 Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Tue, 4 Mar 2025 14:34:26 -0700 Subject: [PATCH 56/63] ublk: set_params: properly check if parameters can be applied ANBZ: #32233 commit 5ac60242b0173be83709603ebaf27a473f16c4e4 upstream. The parameters set by the set_params call are only applied to the block device in the start_dev call. So if a device has already been started, a subsequently issued set_params on that device will not have the desired effect, and should return an error. There is an existing check for this - set_params fails on devices in the LIVE state. But this check is not sufficient to cover the recovery case. In this case, the device will be in the QUIESCED or FAIL_IO states, so set_params will succeed. But this success is misleading, because the parameters will not be applied, since the device has already been started (by a previous ublk server). The bit UB_STATE_USED is set on completion of the start_dev; use it to detect and fail set_params commands which arrive too late to be applied (after start_dev). Signed-off-by: Uday Shankar Fixes: 0aa73170eba5 ("ublk_drv: add SET_PARAMS/GET_PARAMS control command") Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250304-set_params-v1-1-17b5e0887606@purestorage.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 5cb8e53f524f..0e28baf26b20 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2610,9 +2610,12 @@ static int ublk_ctrl_set_params(struct ublk_device *ub, if (ph.len > sizeof(struct ublk_params)) ph.len = sizeof(struct ublk_params); - /* parameters can only be changed when device isn't live */ mutex_lock(&ub->mutex); - if (ub->dev_info.state == UBLK_S_DEV_LIVE) { + if (test_bit(UB_STATE_USED, &ub->state)) { + /* + * Parameters can only be changed when device hasn't + * been started yet + */ ret = -EACCES; } else if (copy_from_user(&ub->params, argp, ph.len)) { ret = -EFAULT; -- Gitee From f6d87df6926b6ac5d753131b9f141105bb72a6b0 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 19 Jun 2025 12:10:31 +1000 Subject: [PATCH 57/63] ublk: santizize the arguments from userspace when adding a device ANBZ: #32233 commit 8c8472855884355caf3d8e0c50adf825f83454b2 upstream. Sanity check the values for queue depth and number of queues we get from userspace when adding a device. Signed-off-by: Ronnie Sahlberg Reviewed-by: Ming Lei Fixes: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver") Fixes: 62fe99cef94a ("ublk: add read()/write() support for ublk char device") Link: https://lore.kernel.org/r/20250619021031.181340-1-ronniesahlberg@gmail.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 0e28baf26b20..76e0e5172637 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2332,6 +2332,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) if (copy_from_user(&info, argp, sizeof(info))) return -EFAULT; + if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || info.nr_hw_queues > UBLK_MAX_NR_QUEUES) + return -EINVAL; + if (capable(CAP_SYS_ADMIN)) info.flags &= ~UBLK_F_UNPRIVILEGED_DEV; else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV)) -- Gitee From 0ab974a22408127a7b71e8d61ace24673b01098f Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 26 Jun 2025 12:20:45 +1000 Subject: [PATCH 58/63] ublk: sanity check add_dev input for underflow ANBZ: #32233 commit 969127bf0783a4ac0c8a27e633a9e8ea1738583f upstream. Add additional checks that queue depth and number of queues are non-zero. Signed-off-by: Ronnie Sahlberg Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250626022046.235018-1-ronniesahlberg@gmail.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 76e0e5172637..b944ac021be5 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2332,7 +2332,8 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) if (copy_from_user(&info, argp, sizeof(info))) return -EFAULT; - if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || info.nr_hw_queues > UBLK_MAX_NR_QUEUES) + if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth || + info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues) return -EINVAL; if (capable(CAP_SYS_ADMIN)) -- Gitee From afdea0deae386388b8686907842e7beca0d2a4d7 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 20 Jun 2025 09:09:55 -0600 Subject: [PATCH 59/63] ublk: use vmalloc for ublk_device's __queues ANBZ: #32233 commit c2f48453b7806d41f5a3270f206a5cd5640ed207 upstream. struct ublk_device's __queues points to an allocation with up to UBLK_MAX_NR_QUEUES (4096) queues, each of which have: - struct ublk_queue (48 bytes) - Tail array of up to UBLK_MAX_QUEUE_DEPTH (4096) struct ublk_io's, 32 bytes each This means the full allocation can exceed 512 MB, which may well be impossible to service with contiguous physical pages. Switch to kvcalloc() and kvfree(), since there is no need for physically contiguous memory. Signed-off-by: Caleb Sander Mateos Fixes: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver") Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250620151008.3976463-2-csander@purestorage.com Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index b944ac021be5..551eaee1a81c 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2031,7 +2031,7 @@ static void ublk_deinit_queues(struct ublk_device *ub) for (i = 0; i < nr_queues; i++) ublk_deinit_queue(ub, i); - kfree(ub->__queues); + kvfree(ub->__queues); } static int ublk_init_queues(struct ublk_device *ub) @@ -2042,7 +2042,7 @@ static int ublk_init_queues(struct ublk_device *ub) int i, ret = -ENOMEM; ub->queue_size = ubq_size; - ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL); + ub->__queues = kvcalloc(nr_queues, ubq_size, GFP_KERNEL); if (!ub->__queues) return ret; -- Gitee From 96a4aaf8a28e8344182e68d5a6916374f09fd8a7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 12 Dec 2025 22:34:15 +0800 Subject: [PATCH 60/63] ublk: fix deadlock when reading partition table ANBZ: #32233 commit c258f5c4502c9667bccf5d76fa731ab9c96687c1 upstream. When one process(such as udev) opens ublk block device (e.g., to read the partition table via bdev_open()), a deadlock[1] can occur: 1. bdev_open() grabs disk->open_mutex 2. The process issues read I/O to ublk backend to read partition table 3. In __ublk_complete_rq(), blk_update_request() or blk_mq_end_request() runs bio->bi_end_io() callbacks 4. If this triggers fput() on file descriptor of ublk block device, the work may be deferred to current task's task work (see fput() implementation) 5. This eventually calls blkdev_release() from the same context 6. blkdev_release() tries to grab disk->open_mutex again 7. Deadlock: same task waiting for a mutex it already holds The fix is to run blk_update_request() and blk_mq_end_request() with bottom halves disabled. This forces blkdev_release() to run in kernel work-queue context instead of current task work context, and allows ublk server to make forward progress, and avoids the deadlock. Fixes: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver") Link: https://github.com/ublk-org/ublksrv/issues/170 [1] Signed-off-by: Ming Lei Reviewed-by: Caleb Sander Mateos [axboe: rewrite comment in ublk] Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 551eaee1a81c..e5f68402f6d0 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1053,6 +1053,13 @@ static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq) return ubq->ubq_daemon->flags & PF_EXITING; } +static void ublk_end_request(struct request *req, blk_status_t error) +{ + local_bh_disable(); + blk_mq_end_request(req, error); + local_bh_enable(); +} + /* todo: handle partial completion */ static inline void __ublk_complete_rq(struct request *req) { @@ -1060,6 +1067,7 @@ static inline void __ublk_complete_rq(struct request *req) struct ublk_io *io = &ubq->ios[req->tag]; unsigned int unmapped_bytes; blk_status_t res = BLK_STS_OK; + bool requeue; /* called from ublk_abort_queue() code path */ if (io->flags & UBLK_IO_FLAG_ABORTED) { @@ -1097,14 +1105,30 @@ static inline void __ublk_complete_rq(struct request *req) if (unlikely(unmapped_bytes < io->res)) io->res = unmapped_bytes; - if (blk_update_request(req, BLK_STS_OK, io->res)) + /* + * Run bio->bi_end_io() with softirqs disabled. If the final fput + * happens off this path, then that will prevent ublk's blkdev_release() + * from being called on current's task work, see fput() implementation. + * + * Otherwise, ublk server may not provide forward progress in case of + * reading the partition table from bdev_open() with disk->open_mutex + * held, and causes dead lock as we could already be holding + * disk->open_mutex here. + * + * Preferably we would not be doing IO with a mutex held that is also + * used for release, but this work-around will suffice for now. + */ + local_bh_disable(); + requeue = blk_update_request(req, BLK_STS_OK, io->res); + local_bh_enable(); + if (requeue) blk_mq_requeue_request(req, true); else __blk_mq_end_request(req, BLK_STS_OK); return; exit: - blk_mq_end_request(req, res); + ublk_end_request(req, res); } static void ublk_complete_rq(struct kref *ref) @@ -1162,7 +1186,7 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, if (ublk_queue_can_use_recovery(ubq)) blk_mq_requeue_request(rq, false); else - blk_mq_end_request(rq, BLK_STS_IOERR); + ublk_end_request(rq, BLK_STS_IOERR); mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); } -- Gitee From 37e1c00b9abfa3473569d65d745ad11534601ff0 Mon Sep 17 00:00:00 2001 From: Govindarajulu Varadarajan Date: Fri, 30 Jan 2026 10:14:12 -0700 Subject: [PATCH 61/63] ublk: Validate SQE128 flag before accessing the cmd ANBZ: #32233 commit da7e4b75e50c087d2031a92f6646eb90f7045a67 upstream. ublk_ctrl_cmd_dump() accesses (header *)sqe->cmd before IO_URING_F_SQE128 flag check. This could cause out of boundary memory access. Move the SQE128 flag check earlier in ublk_ctrl_uring_cmd() to return -EINVAL immediately if the flag is not set. Fixes: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver") Signed-off-by: Govindarajulu Varadarajan Reviewed-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe Signed-off-by: Joseph Qi --- drivers/block/ublk_drv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index e5f68402f6d0..91771c096ebe 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2896,10 +2896,10 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; - ublk_ctrl_cmd_dump(cmd); - if (!(issue_flags & IO_URING_F_SQE128)) - goto out; + return -EINVAL; + + ublk_ctrl_cmd_dump(cmd); ret = ublk_check_cmd_op(cmd_op); if (ret) -- Gitee From 9f780b4b3ff902a2ee0e4c8e4ae27a087a388760 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 22 Jan 2025 21:34:55 +0700 Subject: [PATCH 62/63] Documentation: ublk: Drop Stefan Hajnoczi's message footnote ANBZ: #32233 commit 3fdf2ec7da1c3b2ca13d2d3360f37f017558ed84 upstream. Sphinx reports unreferenced footnote warning pointing to ubd-control message by Stefan Hajnoczi: Documentation/block/ublk.rst:336: WARNING: Footnote [#] is not referenced. [ref.footnote] Drop the footnote to squash above warning. Signed-off-by: Bagas Sanjaya Fixes: 4093cb5a0634 ("ublk_drv: add mechanism for supporting unprivileged ublk device") Reviewed-by: Stefan Hajnoczi Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20250122143456.68867-3-bagasdotme@gmail.com Signed-off-by: Joseph Qi --- Documentation/block/ublk.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index 1713b2890abb..ef8e2b183079 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -321,6 +321,4 @@ References .. [#userspace_readme] https://github.com/ming1/ubdsrv/blob/master/README -.. [#stefan] https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/ - .. [#xiaoguang] https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/ -- Gitee From 67fbad80809c0c2d807d5662066443edc4ab0fd5 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 26 Mar 2026 19:50:00 +0800 Subject: [PATCH 63/63] anolis: ublk support legacy command opcode ANBZ: #32233 Turn on CONFIG_BLKDEV_UBLK_LEGACY_OPCODES by default. Signed-off-by: Joseph Qi --- .../L1-RECOMMEND/default/CONFIG_BLKDEV_UBLK_LEGACY_OPCODES | 1 + 1 file changed, 1 insertion(+) create mode 100644 anolis/configs/L1-RECOMMEND/default/CONFIG_BLKDEV_UBLK_LEGACY_OPCODES diff --git a/anolis/configs/L1-RECOMMEND/default/CONFIG_BLKDEV_UBLK_LEGACY_OPCODES b/anolis/configs/L1-RECOMMEND/default/CONFIG_BLKDEV_UBLK_LEGACY_OPCODES new file mode 100644 index 000000000000..5abaa4d2046e --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/default/CONFIG_BLKDEV_UBLK_LEGACY_OPCODES @@ -0,0 +1 @@ +CONFIG_BLKDEV_UBLK_LEGACY_OPCODES=y -- Gitee