io_uring: move uring_lock location

->uring_lock is used predominantly for submission, even though it also
protects many other things: iopoll, registration, selected bufs, and
more. Yet it sits together with ->cq_wait, which is poked from the
completion and CQ-waiting sides. Move them apart: ->uring_lock goes
into the submission data section, and ->cq_wait into the completion
related chunk. The latter requires some reshuffling so that everything
needed by io_cqring_ev_posted*() lands in one cacheline.
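
For illustration, a minimal sketch of the layout trick (a hypothetical
demo_ctx with made-up sq_tail/cq_tail fields, not the real
io_ring_ctx): ____cacheline_aligned_in_smp aligns each anonymous
struct member to a cacheline boundary, so fields dirtied by the
submission path and fields dirtied by the completion path never share
a line, avoiding false sharing between the two sides:

	#include <linux/cache.h>
	#include <linux/mutex.h>
	#include <linux/wait.h>

	struct demo_ctx {
		/* written by the submitter */
		struct {
			struct mutex		lock;
			unsigned		sq_tail;
		} ____cacheline_aligned_in_smp;

		/* written on completion and by CQ waiters */
		struct {
			wait_queue_head_t	cq_wait;
			unsigned		cq_tail;
		} ____cacheline_aligned_in_smp;
	};

On !CONFIG_SMP builds the macro compiles away to nothing, so the
annotation costs nothing where false sharing cannot happen.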

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/dea5e845caee4c98aa0922b46d713154d81f7bd8.1623709150.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e19c9f7..74c8334 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -356,6 +356,8 @@ struct io_ring_ctx {
 
 	/* submission data */
 	struct {
+		struct mutex		uring_lock;
+
 		/*
 		 * Ring buffer of indices into array of io_uring_sqe, which is
 		 * mmapped by the application using the IORING_OFF_SQES offset.
@@ -392,11 +394,6 @@ struct io_ring_ctx {
 		unsigned		sq_thread_idle;
 	} ____cacheline_aligned_in_smp;
 
-	struct {
-		struct mutex		uring_lock;
-		wait_queue_head_t	cq_wait;
-	} ____cacheline_aligned_in_smp;
-
 	/* IRQ completion list, under ->completion_lock */
 	struct list_head	locked_free_list;
 	unsigned int		locked_free_nr;
@@ -412,12 +409,13 @@ struct io_ring_ctx {
 	struct {
 		unsigned		cached_cq_tail;
 		unsigned		cq_entries;
-		atomic_t		cq_timeouts;
-		unsigned		cq_last_tm_flush;
-		unsigned		cq_extra;
-		struct wait_queue_head	poll_wait;
-		struct fasync_struct	*cq_fasync;
 		struct eventfd_ctx	*cq_ev_fd;
+		struct wait_queue_head	poll_wait;
+		struct wait_queue_head	cq_wait;
+		unsigned		cq_extra;
+		atomic_t		cq_timeouts;
+		struct fasync_struct	*cq_fasync;
+		unsigned		cq_last_tm_flush;
 	} ____cacheline_aligned_in_smp;
 
 	struct {