| // SPDX-License-Identifier: GPL-2.0 |
| |
| #include "io_uring.h" |
| #include "napi.h" |
| |
| #ifdef CONFIG_NET_RX_BUSY_POLL |
| |
/* Timeout for cleanup of stale entries. */
| #define NAPI_TIMEOUT (60 * SEC_CONVERSION) |
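
/*
 * Entries that have not been refreshed within NAPI_TIMEOUT are considered
 * stale; dynamic tracking prunes them after each busy poll pass.
 */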
| |
struct io_napi_entry {
	unsigned int napi_id;
	/* member of ctx->napi_list, walked by the busy poll loop */
	struct list_head list;

	/* expiry (jiffies), re-armed each time the napi id is re-added */
	unsigned long timeout;
	/* member of ctx->napi_ht, for lookup by napi id */
	struct hlist_node node;

	struct rcu_head rcu;
};
| |
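/*
 * Lookup runs under RCU: callers hold either rcu_read_lock() or
 * ctx->napi_lock (the update side), matching hlist_for_each_entry_rcu().
 */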
| static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list, |
| unsigned int napi_id) |
| { |
| struct io_napi_entry *e; |
| |
| hlist_for_each_entry_rcu(e, hash_list, node) { |
| if (e->napi_id != napi_id) |
| continue; |
| return e; |
| } |
| |
| return NULL; |
| } |
| |
static inline ktime_t net_to_ktime(unsigned long t)
{
	/*
	 * busy_loop_current_time() approximates usecs as ktime_get_ns() >> 10;
	 * reverse that shift to get back to nanoseconds.
	 */
	return ns_to_ktime(t << 10);
}
| |
| int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id) |
| { |
| struct hlist_head *hash_list; |
| struct io_napi_entry *e; |
| |
	/* IDs below MIN_NAPI_ID are not valid NAPI IDs; reject them. */
| if (napi_id < MIN_NAPI_ID) |
| return -EINVAL; |
| |
| hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; |
| |
| scoped_guard(rcu) { |
| e = io_napi_hash_find(hash_list, napi_id); |
| if (e) { |
| WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT); |
| return -EEXIST; |
| } |
| } |
| |
| e = kmalloc(sizeof(*e), GFP_NOWAIT); |
| if (!e) |
| return -ENOMEM; |
| |
| e->napi_id = napi_id; |
| e->timeout = jiffies + NAPI_TIMEOUT; |
| |
| /* |
| * guard(spinlock) is not used to manually unlock it before calling |
| * kfree() |
| */ |
| spin_lock(&ctx->napi_lock); |
| if (unlikely(io_napi_hash_find(hash_list, napi_id))) { |
| spin_unlock(&ctx->napi_lock); |
| kfree(e); |
| return -EEXIST; |
| } |
| |
| hlist_add_tail_rcu(&e->node, hash_list); |
| list_add_tail_rcu(&e->list, &ctx->napi_list); |
| spin_unlock(&ctx->napi_lock); |
| return 0; |
| } |
| |
| static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id) |
| { |
| struct hlist_head *hash_list; |
| struct io_napi_entry *e; |
| |
	/* IDs below MIN_NAPI_ID are not valid NAPI IDs; reject them. */
| if (napi_id < MIN_NAPI_ID) |
| return -EINVAL; |
| |
| hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; |
| guard(spinlock)(&ctx->napi_lock); |
| e = io_napi_hash_find(hash_list, napi_id); |
| if (!e) |
| return -ENOENT; |
| |
| list_del_rcu(&e->list); |
| hash_del_rcu(&e->node); |
| kfree_rcu(e, rcu); |
| return 0; |
| } |
| |
| static void __io_napi_remove_stale(struct io_ring_ctx *ctx) |
| { |
| struct io_napi_entry *e; |
| |
| guard(spinlock)(&ctx->napi_lock); |
| /* |
| * list_for_each_entry_safe() is not required as long as: |
| * 1. list_del_rcu() does not reset the deleted node next pointer |
| * 2. kfree_rcu() delays the memory freeing until the next quiescent |
| * state |
| */ |
| list_for_each_entry(e, &ctx->napi_list, list) { |
| if (time_after(jiffies, READ_ONCE(e->timeout))) { |
| list_del_rcu(&e->list); |
| hash_del_rcu(&e->node); |
| kfree_rcu(e, rcu); |
| } |
| } |
| } |
| |
| static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale) |
| { |
| if (is_stale) |
| __io_napi_remove_stale(ctx); |
| } |
| |
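/*
 * A zero busy poll timeout ends the loop at the first check; otherwise the
 * loop runs until @bp has elapsed past @start_time.
 */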
| static inline bool io_napi_busy_loop_timeout(ktime_t start_time, |
| ktime_t bp) |
| { |
| if (bp) { |
| ktime_t end_time = ktime_add(start_time, bp); |
| ktime_t now = net_to_ktime(busy_loop_current_time()); |
| |
| return ktime_after(now, end_time); |
| } |
| |
| return true; |
| } |
| |
| static bool io_napi_busy_loop_should_end(void *data, |
| unsigned long start_time) |
| { |
| struct io_wait_queue *iowq = data; |
| |
| if (signal_pending(current)) |
| return true; |
| if (io_should_wake(iowq) || io_has_work(iowq->ctx)) |
| return true; |
| if (io_napi_busy_loop_timeout(net_to_ktime(start_time), |
| iowq->napi_busy_poll_dt)) |
| return true; |
| |
| return false; |
| } |
| |
| /* |
| * never report stale entries |
| */ |
| static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx, |
| bool (*loop_end)(void *, unsigned long), |
| void *loop_end_arg) |
| { |
| struct io_napi_entry *e; |
| |
| list_for_each_entry_rcu(e, &ctx->napi_list, list) |
| napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, |
| ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); |
| return false; |
| } |
| |
| static bool |
| dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx, |
| bool (*loop_end)(void *, unsigned long), |
| void *loop_end_arg) |
| { |
| struct io_napi_entry *e; |
| bool is_stale = false; |
| |
| list_for_each_entry_rcu(e, &ctx->napi_list, list) { |
| napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, |
| ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); |
| |
| if (time_after(jiffies, READ_ONCE(e->timeout))) |
| is_stale = true; |
| } |
| |
| return is_stale; |
| } |
| |
| static inline bool |
| __io_napi_do_busy_loop(struct io_ring_ctx *ctx, |
| bool (*loop_end)(void *, unsigned long), |
| void *loop_end_arg) |
| { |
| if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC) |
| return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); |
| return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); |
| } |
| |
| static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, |
| struct io_wait_queue *iowq) |
| { |
| unsigned long start_time = busy_loop_current_time(); |
| bool (*loop_end)(void *, unsigned long) = NULL; |
| void *loop_end_arg = NULL; |
| bool is_stale = false; |
| |
	/*
	 * A singular list uses a different napi loop end check function,
	 * and the busy loop is executed only once.
	 */
| if (list_is_singular(&ctx->napi_list)) { |
| loop_end = io_napi_busy_loop_should_end; |
| loop_end_arg = iowq; |
| } |
| |
| scoped_guard(rcu) { |
| do { |
| is_stale = __io_napi_do_busy_loop(ctx, loop_end, |
| loop_end_arg); |
| } while (!io_napi_busy_loop_should_end(iowq, start_time) && |
| !loop_end_arg); |
| } |
| |
| io_napi_remove_stale(ctx, is_stale); |
| } |
| |
| /* |
| * io_napi_init() - Init napi settings |
| * @ctx: pointer to io-uring context structure |
| * |
| * Init napi settings in the io-uring context. |
| */ |
| void io_napi_init(struct io_ring_ctx *ctx) |
| { |
| u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC; |
| |
| INIT_LIST_HEAD(&ctx->napi_list); |
| spin_lock_init(&ctx->napi_lock); |
| ctx->napi_prefer_busy_poll = false; |
| ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt); |
| ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE; |
| } |
| |
| /* |
| * io_napi_free() - Deallocate napi |
| * @ctx: pointer to io-uring context structure |
| * |
| * Free the napi list and the hash table in the io-uring context. |
| */ |
| void io_napi_free(struct io_ring_ctx *ctx) |
| { |
| struct io_napi_entry *e; |
| |
| guard(spinlock)(&ctx->napi_lock); |
| list_for_each_entry(e, &ctx->napi_list, list) { |
| hash_del_rcu(&e->node); |
| kfree_rcu(e, rcu); |
| } |
	/* reset the list head so the context can be reused */
	INIT_LIST_HEAD_RCU(&ctx->napi_list);
| } |
| |
| static int io_napi_register_napi(struct io_ring_ctx *ctx, |
| struct io_uring_napi *napi) |
| { |
| switch (napi->op_param) { |
| case IO_URING_NAPI_TRACKING_DYNAMIC: |
| case IO_URING_NAPI_TRACKING_STATIC: |
| break; |
| default: |
| return -EINVAL; |
| } |
| /* clean the napi list for new settings */ |
| io_napi_free(ctx); |
| WRITE_ONCE(ctx->napi_track_mode, napi->op_param); |
| WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC); |
| WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll); |
| return 0; |
| } |
| |
| /* |
| * io_napi_register() - Register napi with io-uring |
| * @ctx: pointer to io-uring context structure |
| * @arg: pointer to io_uring_napi structure |
| * |
| * Register napi in the io-uring context. |
| */ |
| int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) |
| { |
| const struct io_uring_napi curr = { |
| .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), |
| .prefer_busy_poll = ctx->napi_prefer_busy_poll, |
| .op_param = ctx->napi_track_mode |
| }; |
| struct io_uring_napi napi; |
| |
| if (ctx->flags & IORING_SETUP_IOPOLL) |
| return -EINVAL; |
| if (copy_from_user(&napi, arg, sizeof(napi))) |
| return -EFAULT; |
| if (napi.pad[0] || napi.pad[1] || napi.resv) |
| return -EINVAL; |
| |
| if (copy_to_user(arg, &curr, sizeof(curr))) |
| return -EFAULT; |
| |
| switch (napi.opcode) { |
| case IO_URING_NAPI_REGISTER_OP: |
| return io_napi_register_napi(ctx, &napi); |
| case IO_URING_NAPI_STATIC_ADD_ID: |
| if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) |
| return -EINVAL; |
| return __io_napi_add_id(ctx, napi.op_param); |
| case IO_URING_NAPI_STATIC_DEL_ID: |
| if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) |
| return -EINVAL; |
| return __io_napi_del_id(ctx, napi.op_param); |
| default: |
| return -EINVAL; |
| } |
| } |
| |
| /* |
| * io_napi_unregister() - Unregister napi with io-uring |
| * @ctx: pointer to io-uring context structure |
| * @arg: pointer to io_uring_napi structure |
| * |
| * Unregister napi. If arg has been specified copy the busy poll timeout and |
| * prefer busy poll setting to the passed in structure. |
| */ |
| int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) |
| { |
| const struct io_uring_napi curr = { |
| .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), |
| .prefer_busy_poll = ctx->napi_prefer_busy_poll |
| }; |
| |
| if (arg && copy_to_user(arg, &curr, sizeof(curr))) |
| return -EFAULT; |
| |
| WRITE_ONCE(ctx->napi_busy_poll_dt, 0); |
| WRITE_ONCE(ctx->napi_prefer_busy_poll, false); |
| WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE); |
| return 0; |
| } |
| |
| /* |
| * __io_napi_busy_loop() - execute busy poll loop |
| * @ctx: pointer to io-uring context structure |
| * @iowq: pointer to io wait queue |
| * |
| * Execute the busy poll loop and merge the spliced off list. |
| */ |
| void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) |
| { |
	/* SQPOLL rings busy poll from the sq thread, see io_napi_sqpoll_busy_poll() */
	if (ctx->flags & IORING_SETUP_SQPOLL)
		return;

	iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
	if (iowq->timeout != KTIME_MAX) {
		ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx));

		/* never busy poll past the caller's wait timeout */
		iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt);
	}
| |
| iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); |
| io_napi_blocking_busy_loop(ctx, iowq); |
| } |
| |
| /* |
| * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll |
| * @ctx: pointer to io-uring context structure |
| * |
| * Splice of the napi list and execute the napi busy poll loop. |
| */ |
| int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) |
| { |
| bool is_stale = false; |
| |
| if (!READ_ONCE(ctx->napi_busy_poll_dt)) |
| return 0; |
| if (list_empty_careful(&ctx->napi_list)) |
| return 0; |
| |
| scoped_guard(rcu) { |
| is_stale = __io_napi_do_busy_loop(ctx, NULL, NULL); |
| } |
| |
| io_napi_remove_stale(ctx, is_stale); |
| return 1; |
| } |
| |
| #endif |