net: sock_def_readable() and friends RCU conversion
sk_callback_lock rwlock actually protects sk->sk_sleep pointer, so we
need two atomic operations (and associated dirtying) per incoming
packet.
RCU conversion is pretty much needed :
1) Add a new structure, called "struct socket_wq" to hold all fields
that will need rcu_read_lock() protection (currently: a
wait_queue_head_t and a struct fasync_struct pointer).
[Future patch will add a list anchor for wakeup coalescing]
2) Attach one of such structure to each "struct socket" created in
sock_alloc_inode().
3) Respect RCU grace period when freeing a "struct socket_wq"
4) Change sk_sleep pointer in "struct sock" by sk_wq, pointer to "struct
socket_wq"
5) Change sk_sleep() function to use new sk->sk_wq instead of
sk->sk_sleep
6) Change sk_has_sleeper() to wq_has_sleeper() that must be used inside
a rcu_read_lock() section.
7) Change all sk_has_sleeper() callers to :
- Use rcu_read_lock() instead of read_lock(&sk->sk_callback_lock)
- Use wq_has_sleeper() to eventually wakeup tasks.
- Use rcu_read_unlock() instead of read_unlock(&sk->sk_callback_lock)
8) sock_wake_async() is modified to use rcu protection as well.
9) Exceptions :
macvtap, drivers/net/tun.c, af_unix use integrated "struct socket_wq"
instead of dynamically allocated ones. They dont need rcu freeing.
Some cleanups or followups are probably needed, (possible
sk_callback_lock conversion to a spinlock for example...).
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index d97e1fd..1c4110d 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -37,6 +37,7 @@
struct macvtap_queue {
struct sock sk;
struct socket sock;
+ struct socket_wq wq;
struct macvlan_dev *vlan;
struct file *file;
unsigned int flags;
@@ -242,12 +243,15 @@
static void macvtap_sock_write_space(struct sock *sk)
{
+ wait_queue_head_t *wqueue;
+
if (!sock_writeable(sk) ||
!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
return;
- if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
- wake_up_interruptible_poll(sk_sleep(sk), POLLOUT | POLLWRNORM | POLLWRBAND);
+ wqueue = sk_sleep(sk);
+ if (wqueue && waitqueue_active(wqueue))
+ wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
}
static int macvtap_open(struct inode *inode, struct file *file)
@@ -272,7 +276,8 @@
if (!q)
goto out;
- init_waitqueue_head(&q->sock.wait);
+ q->sock.wq = &q->wq;
+ init_waitqueue_head(&q->wq.wait);
q->sock.type = SOCK_RAW;
q->sock.state = SS_CONNECTED;
q->sock.file = file;
@@ -308,7 +313,7 @@
goto out;
mask = 0;
- poll_wait(file, &q->sock.wait, wait);
+ poll_wait(file, &q->wq.wait, wait);
if (!skb_queue_empty(&q->sk.sk_receive_queue))
mask |= POLLIN | POLLRDNORM;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 20a1793..e525a6cf 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -109,7 +109,7 @@
struct tap_filter txflt;
struct socket socket;
-
+ struct socket_wq wq;
#ifdef TUN_DEBUG
int debug;
#endif
@@ -323,7 +323,7 @@
/* Inform the methods they need to stop using the dev.
*/
if (tfile) {
- wake_up_all(&tun->socket.wait);
+ wake_up_all(&tun->wq.wait);
if (atomic_dec_and_test(&tfile->count))
__tun_detach(tun);
}
@@ -398,7 +398,7 @@
/* Notify and wake up reader process */
if (tun->flags & TUN_FASYNC)
kill_fasync(&tun->fasync, SIGIO, POLL_IN);
- wake_up_interruptible_poll(&tun->socket.wait, POLLIN |
+ wake_up_interruptible_poll(&tun->wq.wait, POLLIN |
POLLRDNORM | POLLRDBAND);
return NETDEV_TX_OK;
@@ -498,7 +498,7 @@
DBG(KERN_INFO "%s: tun_chr_poll\n", tun->dev->name);
- poll_wait(file, &tun->socket.wait, wait);
+ poll_wait(file, &tun->wq.wait, wait);
if (!skb_queue_empty(&sk->sk_receive_queue))
mask |= POLLIN | POLLRDNORM;
@@ -773,7 +773,7 @@
DBG(KERN_INFO "%s: tun_chr_read\n", tun->dev->name);
- add_wait_queue(&tun->socket.wait, &wait);
+ add_wait_queue(&tun->wq.wait, &wait);
while (len) {
current->state = TASK_INTERRUPTIBLE;
@@ -804,7 +804,7 @@
}
current->state = TASK_RUNNING;
- remove_wait_queue(&tun->socket.wait, &wait);
+ remove_wait_queue(&tun->wq.wait, &wait);
return ret;
}
@@ -861,6 +861,7 @@
static void tun_sock_write_space(struct sock *sk)
{
struct tun_struct *tun;
+ wait_queue_head_t *wqueue;
if (!sock_writeable(sk))
return;
@@ -868,8 +869,9 @@
if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
return;
- if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
- wake_up_interruptible_sync_poll(sk_sleep(sk), POLLOUT |
+ wqueue = sk_sleep(sk);
+ if (wqueue && waitqueue_active(wqueue))
+ wake_up_interruptible_sync_poll(wqueue, POLLOUT |
POLLWRNORM | POLLWRBAND);
tun = tun_sk(sk)->tun;
@@ -1039,7 +1041,8 @@
if (!sk)
goto err_free_dev;
- init_waitqueue_head(&tun->socket.wait);
+ tun->socket.wq = &tun->wq;
+ init_waitqueue_head(&tun->wq.wait);
tun->socket.ops = &tun_socket_ops;
sock_init_data(&tun->socket, sk);
sk->sk_write_space = tun_sock_write_space;