io_uring: don't split consume out of SQE get

Remove io_consume_sqe() and inline it back into io_get_sqe(). This
requires deallocating the req on error, but in exchange we get a
cleaner io_submit_sqes() and better locality for cached_sq_head.
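
After the change io_get_sqe() fetches and consumes the head in a
single step. A condensed view of the resulting function, pieced
together from the hunks below (the invalid-entry accounting is
elided):

	static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
	{
		unsigned head;

		/* the post-increment consumes the head even if the entry is bad */
		head = READ_ONCE(sq_array[ctx->cached_sq_head++ & ctx->sq_mask]);
		if (likely(head < ctx->sq_entries))
			return &ctx->sq_sqes[head];

		/* invalid entry: the head is already consumed, report failure */
		...
		return NULL;
	}

In io_submit_sqes() the request is now allocated before the SQE is
fetched, so a NULL return means handing the req back to the cache via
kmem_cache_free(); that is the dealloc-on-error mentioned above.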

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 87f2f8e..9c58be0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6762,7 +6762,7 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 	 * 2) allows the kernel side to track the head on its own, even
 	 *    though the application is the one updating it.
 	 */
-	head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
+	head = READ_ONCE(sq_array[ctx->cached_sq_head++ & ctx->sq_mask]);
 	if (likely(head < ctx->sq_entries))
 		return &ctx->sq_sqes[head];
 
@@ -6772,11 +6772,6 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 	return NULL;
 }
 
-static inline void io_consume_sqe(struct io_ring_ctx *ctx)
-{
-	ctx->cached_sq_head++;
-}
-
 /*
  * Check SQE restrictions (opcode and flags).
  *
@@ -6915,18 +6910,17 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 		struct io_kiocb *req;
 		int err;
 
-		sqe = io_get_sqe(ctx);
-		if (unlikely(!sqe)) {
-			io_consume_sqe(ctx);
-			break;
-		}
 		req = io_alloc_req(ctx);
 		if (unlikely(!req)) {
 			if (!submitted)
 				submitted = -EAGAIN;
 			break;
 		}
-		io_consume_sqe(ctx);
+		sqe = io_get_sqe(ctx);
+		if (unlikely(!sqe)) {
+			kmem_cache_free(req_cachep, req);
+			break;
+		}
 		/* will complete beyond this point, count as submitted */
 		submitted++;