/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

/* A BPF sock_map is used to store sock objects. This is primarily used
 * for doing socket redirect with BPF helper routines.
 *
 * A sock map may have BPF programs attached to it; currently a program
 * used to parse packets and a program to provide a verdict and redirect
 * decision on the packet are supported. Any programs attached to a sock
 * map are inherited by sock objects when they are added to the map. If
 * no BPF programs are attached, the sock object may only be used for sock
 * redirect.
 *
 * A sock object may be in multiple maps, but can only inherit a single
 * parse or verdict program. If adding a sock object to a map would result
 * in having multiple parsing programs, the update will return an EBUSY error.
 *
 * For reference, this code is similar to the devmap used in the XDP context;
 * reviewing them together may be useful. For an example please review
 * ./samples/bpf/sockmap/.
 */
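
/* For orientation, a rough user-space usage sketch (not part of this file;
 * helper and attach-type names are taken from the BPF uapi/libbpf of this
 * era and are illustrative only, details may differ):
 *
 *	map_fd = bpf_create_map(BPF_MAP_TYPE_SOCKMAP, sizeof(int), sizeof(int),
 *				max_entries, 0);
 *	bpf_prog_attach(parse_fd, map_fd, BPF_SK_SKB_STREAM_PARSER, 0);
 *	bpf_prog_attach(verdict_fd, map_fd, BPF_SK_SKB_STREAM_VERDICT, 0);
 *	bpf_prog_attach(msg_fd, map_fd, BPF_SK_MSG_VERDICT, 0);
 *	bpf_map_update_elem(map_fd, &key, &sock_fd, BPF_ANY);
 *
 * The verdict programs then call bpf_sk_redirect_map()/bpf_msg_redirect_map()
 * to steer traffic to another socket held in the map.
 */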
#include <linux/bpf.h>
#include <net/sock.h>
#include <linux/filter.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <net/strparser.h>
#include <net/tcp.h>
#include <linux/ptr_ring.h>
#include <net/inet_common.h>
#include <linux/sched/signal.h>

#define SOCK_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)

struct bpf_sock_progs {
	struct bpf_prog *bpf_tx_msg;
	struct bpf_prog *bpf_parse;
	struct bpf_prog *bpf_verdict;
};

struct bpf_stab {
	struct bpf_map map;
	struct sock **sock_map;
	struct bpf_sock_progs progs;
};

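/* bucket, bpf_htab and htab_elem back the hash-table (sockhash) flavour of
 * the map (BPF_MAP_TYPE_SOCKHASH); like bpf_stab above they embed
 * bpf_sock_progs, so attached parse/verdict/tx_msg programs are handled the
 * same way for both map types.
 */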
struct bucket {
	struct hlist_head head;
	raw_spinlock_t lock;
};

struct bpf_htab {
	struct bpf_map map;
	struct bucket *buckets;
	atomic_t count;
	u32 n_buckets;
	u32 elem_size;
	struct bpf_sock_progs progs;
};

struct htab_elem {
	struct rcu_head rcu;
	struct hlist_node hash_node;
	u32 hash;
	struct sock *sk;
	char key[0];
};

enum smap_psock_state {
	SMAP_TX_RUNNING,
};

struct smap_psock_map_entry {
	struct list_head list;
	struct sock **entry;
	struct htab_elem *hash_link;
	struct bpf_htab *htab;
};

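/* smap_psock is the per-socket state stored in sk_user_data while a sock is
 * in at least one map: saved proto callbacks, strparser state, sendmsg/
 * sendpage (tx_msg) bookkeeping, the ingress queue used for BPF_F_INGRESS
 * redirects, and the list of map entries (psock->maps) that reference it.
 */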
struct smap_psock {
	struct rcu_head rcu;
	refcount_t refcnt;

	/* datapath variables */
	struct sk_buff_head rxqueue;
	bool strp_enabled;

	/* datapath error path cache across tx work invocations */
	int save_rem;
	int save_off;
	struct sk_buff *save_skb;

	/* datapath variables for tx_msg ULP */
	struct sock *sk_redir;
	int apply_bytes;
	int cork_bytes;
	int sg_size;
	int eval;
	struct sk_msg_buff *cork;
	struct list_head ingress;

	struct strparser strp;
	struct bpf_prog *bpf_tx_msg;
	struct bpf_prog *bpf_parse;
	struct bpf_prog *bpf_verdict;
	struct list_head maps;

	/* Back reference used when sock callbacks trigger sockmap operations */
	struct sock *sock;
	unsigned long state;

	struct work_struct tx_work;
	struct work_struct gc_work;

	struct proto *sk_proto;
	void (*save_close)(struct sock *sk, long timeout);
	void (*save_data_ready)(struct sock *sk);
	void (*save_write_space)(struct sock *sk);
};

static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			   int nonblock, int flags, int *addr_len);
static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
			    int offset, size_t size, int flags);

static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
{
	return rcu_dereference_sk_user_data(sk);
}

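/* Wired up below as the stream_memory_read hook so that poll()/select()
 * report the socket readable when redirected msg data is sitting on the
 * psock ingress list even though sk_receive_queue is empty.
 */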
John Fastabend8934ce22018-03-28 12:49:15 -0700149static bool bpf_tcp_stream_read(const struct sock *sk)
150{
151 struct smap_psock *psock;
152 bool empty = true;
153
154 rcu_read_lock();
155 psock = smap_psock_sk(sk);
156 if (unlikely(!psock))
157 goto out;
158 empty = list_empty(&psock->ingress);
159out:
160 rcu_read_unlock();
161 return !empty;
162}
163
John Fastabend1aa12bd2018-02-05 10:17:49 -0800164static struct proto tcp_bpf_proto;
165static int bpf_tcp_init(struct sock *sk)
166{
167 struct smap_psock *psock;
168
169 rcu_read_lock();
170 psock = smap_psock_sk(sk);
171 if (unlikely(!psock)) {
172 rcu_read_unlock();
173 return -EINVAL;
174 }
175
176 if (unlikely(psock->sk_proto)) {
177 rcu_read_unlock();
178 return -EBUSY;
179 }
180
181 psock->save_close = sk->sk_prot->close;
182 psock->sk_proto = sk->sk_prot;
John Fastabend4f738ad2018-03-18 12:57:10 -0700183
184 if (psock->bpf_tx_msg) {
185 tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg;
186 tcp_bpf_proto.sendpage = bpf_tcp_sendpage;
John Fastabend8934ce22018-03-28 12:49:15 -0700187 tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg;
188 tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read;
John Fastabend4f738ad2018-03-18 12:57:10 -0700189 }
190
John Fastabend1aa12bd2018-02-05 10:17:49 -0800191 sk->sk_prot = &tcp_bpf_proto;
192 rcu_read_unlock();
193 return 0;
194}
195
John Fastabend4f738ad2018-03-18 12:57:10 -0700196static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
197static int free_start_sg(struct sock *sk, struct sk_msg_buff *md);
198
John Fastabend1aa12bd2018-02-05 10:17:49 -0800199static void bpf_tcp_release(struct sock *sk)
200{
201 struct smap_psock *psock;
202
203 rcu_read_lock();
204 psock = smap_psock_sk(sk);
John Fastabend4f738ad2018-03-18 12:57:10 -0700205 if (unlikely(!psock))
206 goto out;
John Fastabend1aa12bd2018-02-05 10:17:49 -0800207
John Fastabend4f738ad2018-03-18 12:57:10 -0700208 if (psock->cork) {
209 free_start_sg(psock->sock, psock->cork);
210 kfree(psock->cork);
211 psock->cork = NULL;
John Fastabend1aa12bd2018-02-05 10:17:49 -0800212 }
John Fastabend4f738ad2018-03-18 12:57:10 -0700213
John Fastabend0e94d872018-04-02 12:50:52 -0700214 if (psock->sk_proto) {
215 sk->sk_prot = psock->sk_proto;
216 psock->sk_proto = NULL;
217 }
John Fastabend4f738ad2018-03-18 12:57:10 -0700218out:
John Fastabend1aa12bd2018-02-05 10:17:49 -0800219 rcu_read_unlock();
220}
221
John Fastabend81110382018-05-14 10:00:17 -0700222static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
223{
224 atomic_dec(&htab->count);
225 kfree_rcu(l, rcu);
226}
227
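/* Replacement for sk_prot->close: drop any corked data and queued ingress
 * buffers, unlink the sock from every sockmap slot and sockhash element it
 * is part of, then hand off to the saved close handler.
 */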
John Fastabend1aa12bd2018-02-05 10:17:49 -0800228static void bpf_tcp_close(struct sock *sk, long timeout)
229{
230 void (*close_fun)(struct sock *sk, long timeout);
231 struct smap_psock_map_entry *e, *tmp;
John Fastabend8934ce22018-03-28 12:49:15 -0700232 struct sk_msg_buff *md, *mtmp;
John Fastabend1aa12bd2018-02-05 10:17:49 -0800233 struct smap_psock *psock;
234 struct sock *osk;
235
236 rcu_read_lock();
237 psock = smap_psock_sk(sk);
238 if (unlikely(!psock)) {
239 rcu_read_unlock();
240 return sk->sk_prot->close(sk, timeout);
241 }
242
	/* The psock may be destroyed any time after exiting the RCU critical
	 * section, so by the time we use close_fun the psock may no longer
	 * be valid. However, bpf_tcp_close is called with the sock lock
	 * held so the close hook and sk are still valid.
	 */
248 close_fun = psock->save_close;
249
250 write_lock_bh(&sk->sk_callback_lock);
John Fastabend820ed3f2018-04-02 12:50:46 -0700251 if (psock->cork) {
252 free_start_sg(psock->sock, psock->cork);
253 kfree(psock->cork);
254 psock->cork = NULL;
255 }
256
John Fastabend8934ce22018-03-28 12:49:15 -0700257 list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
258 list_del(&md->list);
259 free_start_sg(psock->sock, md);
260 kfree(md);
261 }
262
John Fastabend1aa12bd2018-02-05 10:17:49 -0800263 list_for_each_entry_safe(e, tmp, &psock->maps, list) {
John Fastabend81110382018-05-14 10:00:17 -0700264 if (e->entry) {
265 osk = cmpxchg(e->entry, sk, NULL);
266 if (osk == sk) {
267 list_del(&e->list);
268 smap_release_sock(psock, sk);
269 }
270 } else {
271 hlist_del_rcu(&e->hash_link->hash_node);
272 smap_release_sock(psock, e->hash_link->sk);
273 free_htab_elem(e->htab, e->hash_link);
John Fastabend1aa12bd2018-02-05 10:17:49 -0800274 }
275 }
276 write_unlock_bh(&sk->sk_callback_lock);
277 rcu_read_unlock();
278 close_fun(sk, timeout);
279}
280
John Fastabend04686ef2017-10-31 19:17:31 -0700281enum __sk_action {
282 __SK_DROP = 0,
283 __SK_PASS,
284 __SK_REDIRECT,
John Fastabend4f738ad2018-03-18 12:57:10 -0700285 __SK_NONE,
John Fastabend04686ef2017-10-31 19:17:31 -0700286};
287
John Fastabend1aa12bd2018-02-05 10:17:49 -0800288static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
289 .name = "bpf_tcp",
290 .uid = TCP_ULP_BPF,
291 .user_visible = false,
292 .owner = NULL,
293 .init = bpf_tcp_init,
294 .release = bpf_tcp_release,
295};
296
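/* Copy user data from the sendmsg iov iterator into the scatterlist,
 * starting at sg_curr and resuming partially filled entries via
 * sg_copybreak. Uses the nocache copy variant when the route supports it.
 */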
John Fastabend4f738ad2018-03-18 12:57:10 -0700297static int memcopy_from_iter(struct sock *sk,
298 struct sk_msg_buff *md,
299 struct iov_iter *from, int bytes)
300{
301 struct scatterlist *sg = md->sg_data;
302 int i = md->sg_curr, rc = -ENOSPC;
303
304 do {
305 int copy;
306 char *to;
307
308 if (md->sg_copybreak >= sg[i].length) {
309 md->sg_copybreak = 0;
310
311 if (++i == MAX_SKB_FRAGS)
312 i = 0;
313
314 if (i == md->sg_end)
315 break;
316 }
317
318 copy = sg[i].length - md->sg_copybreak;
319 to = sg_virt(&sg[i]) + md->sg_copybreak;
320 md->sg_copybreak += copy;
321
322 if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
323 rc = copy_from_iter_nocache(to, copy, from);
324 else
325 rc = copy_from_iter(to, copy, from);
326
327 if (rc != copy) {
328 rc = -EFAULT;
329 goto out;
330 }
331
332 bytes -= copy;
333 if (!bytes)
334 break;
335
336 md->sg_copybreak = 0;
337 if (++i == MAX_SKB_FRAGS)
338 i = 0;
339 } while (i != md->sg_end);
340out:
341 md->sg_curr = i;
342 return rc;
343}
344
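/* Push the scatterlist out through the TCP stack with do_tcp_sendpages(),
 * sending at most apply_bytes when an apply limit is in effect. Partial
 * sends retry from the updated offset; fully sent entries drop their page
 * reference and, when requested, uncharge the socket memory.
 */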
345static int bpf_tcp_push(struct sock *sk, int apply_bytes,
346 struct sk_msg_buff *md,
347 int flags, bool uncharge)
348{
349 bool apply = apply_bytes;
350 struct scatterlist *sg;
351 int offset, ret = 0;
352 struct page *p;
353 size_t size;
354
355 while (1) {
356 sg = md->sg_data + md->sg_start;
357 size = (apply && apply_bytes < sg->length) ?
358 apply_bytes : sg->length;
359 offset = sg->offset;
360
361 tcp_rate_check_app_limited(sk);
362 p = sg_page(sg);
363retry:
364 ret = do_tcp_sendpages(sk, p, offset, size, flags);
365 if (ret != size) {
366 if (ret > 0) {
367 if (apply)
368 apply_bytes -= ret;
John Fastabend3cc9a472018-05-02 13:50:19 -0700369
370 sg->offset += ret;
371 sg->length -= ret;
John Fastabend4f738ad2018-03-18 12:57:10 -0700372 size -= ret;
373 offset += ret;
374 if (uncharge)
375 sk_mem_uncharge(sk, ret);
376 goto retry;
377 }
378
John Fastabend4f738ad2018-03-18 12:57:10 -0700379 return ret;
380 }
381
382 if (apply)
383 apply_bytes -= ret;
384 sg->offset += ret;
385 sg->length -= ret;
386 if (uncharge)
387 sk_mem_uncharge(sk, ret);
388
389 if (!sg->length) {
390 put_page(p);
391 md->sg_start++;
392 if (md->sg_start == MAX_SKB_FRAGS)
393 md->sg_start = 0;
Prashant Bhole6ef6d842018-03-30 09:21:00 +0900394 sg_init_table(sg, 1);
John Fastabend4f738ad2018-03-18 12:57:10 -0700395
396 if (md->sg_start == md->sg_end)
397 break;
398 }
399
400 if (apply && !apply_bytes)
401 break;
402 }
403 return 0;
404}
405
406static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md)
407{
408 struct scatterlist *sg = md->sg_data + md->sg_start;
409
410 if (md->sg_copy[md->sg_start]) {
411 md->data = md->data_end = 0;
412 } else {
413 md->data = sg_virt(sg);
414 md->data_end = md->data + sg->length;
415 }
416}
417
418static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
419{
420 struct scatterlist *sg = md->sg_data;
421 int i = md->sg_start;
422
423 do {
424 int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length;
425
426 sk_mem_uncharge(sk, uncharge);
427 bytes -= uncharge;
428 if (!bytes)
429 break;
430 i++;
431 if (i == MAX_SKB_FRAGS)
432 i = 0;
433 } while (i != md->sg_end);
434}
435
John Fastabendabaeb092018-05-02 13:50:29 -0700436static void free_bytes_sg(struct sock *sk, int bytes,
437 struct sk_msg_buff *md, bool charge)
John Fastabend4f738ad2018-03-18 12:57:10 -0700438{
439 struct scatterlist *sg = md->sg_data;
440 int i = md->sg_start, free;
441
442 while (bytes && sg[i].length) {
443 free = sg[i].length;
444 if (bytes < free) {
445 sg[i].length -= bytes;
446 sg[i].offset += bytes;
John Fastabendabaeb092018-05-02 13:50:29 -0700447 if (charge)
448 sk_mem_uncharge(sk, bytes);
John Fastabend4f738ad2018-03-18 12:57:10 -0700449 break;
450 }
451
John Fastabendabaeb092018-05-02 13:50:29 -0700452 if (charge)
453 sk_mem_uncharge(sk, sg[i].length);
John Fastabend4f738ad2018-03-18 12:57:10 -0700454 put_page(sg_page(&sg[i]));
455 bytes -= sg[i].length;
456 sg[i].length = 0;
457 sg[i].page_link = 0;
458 sg[i].offset = 0;
459 i++;
460
461 if (i == MAX_SKB_FRAGS)
462 i = 0;
463 }
John Fastabendabaeb092018-05-02 13:50:29 -0700464 md->sg_start = i;
John Fastabend4f738ad2018-03-18 12:57:10 -0700465}
466
467static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
468{
469 struct scatterlist *sg = md->sg_data;
470 int i = start, free = 0;
471
472 while (sg[i].length) {
473 free += sg[i].length;
474 sk_mem_uncharge(sk, sg[i].length);
475 put_page(sg_page(&sg[i]));
476 sg[i].length = 0;
477 sg[i].page_link = 0;
478 sg[i].offset = 0;
479 i++;
480
481 if (i == MAX_SKB_FRAGS)
482 i = 0;
483 }
484
485 return free;
486}
487
488static int free_start_sg(struct sock *sk, struct sk_msg_buff *md)
489{
490 int free = free_sg(sk, md->sg_start, md);
491
492 md->sg_start = md->sg_end;
493 return free;
494}
495
496static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md)
497{
498 return free_sg(sk, md->sg_curr, md);
499}
500
501static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
502{
503 return ((_rc == SK_PASS) ?
John Fastabende5cd3ab2018-05-14 10:00:16 -0700504 (md->sk_redir ? __SK_REDIRECT : __SK_PASS) :
John Fastabend4f738ad2018-03-18 12:57:10 -0700505 __SK_DROP);
506}
507
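/* Run the attached SK_MSG program (if any) over the msg buffer and map the
 * UAPI SK_PASS/SK_DROP result onto the internal __SK_* codes. On redirect
 * the target socket is looked up and pinned right away because the map may
 * change before the actual send happens.
 */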
508static unsigned int smap_do_tx_msg(struct sock *sk,
509 struct smap_psock *psock,
510 struct sk_msg_buff *md)
511{
512 struct bpf_prog *prog;
513 unsigned int rc, _rc;
514
515 preempt_disable();
516 rcu_read_lock();
517
518 /* If the policy was removed mid-send then default to 'accept' */
519 prog = READ_ONCE(psock->bpf_tx_msg);
520 if (unlikely(!prog)) {
521 _rc = SK_PASS;
522 goto verdict;
523 }
524
525 bpf_compute_data_pointers_sg(md);
John Fastabend303def32018-05-17 14:16:58 -0700526 md->sk = sk;
John Fastabend4f738ad2018-03-18 12:57:10 -0700527 rc = (*prog->bpf_func)(md, prog->insnsi);
528 psock->apply_bytes = md->apply_bytes;
529
530 /* Moving return codes from UAPI namespace into internal namespace */
531 _rc = bpf_map_msg_verdict(rc, md);
532
	/* The psock has a refcount on the sock but not on the map, and because
	 * we need to drop the rcu read lock here it's possible the map could be
	 * removed between here and when we need it to execute the sock
	 * redirect. So do the map lookup now for future use.
	 */
538 if (_rc == __SK_REDIRECT) {
539 if (psock->sk_redir)
540 sock_put(psock->sk_redir);
541 psock->sk_redir = do_msg_redirect_map(md);
542 if (!psock->sk_redir) {
543 _rc = __SK_DROP;
544 goto verdict;
545 }
546 sock_hold(psock->sk_redir);
547 }
548verdict:
549 rcu_read_unlock();
550 preempt_enable();
551
552 return _rc;
553}
554
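/* BPF_F_INGRESS side of a msg redirect: carve up to apply_bytes off the
 * source scatterlist into a new sk_msg_buff, charge it to the receiving
 * socket, queue it on that psock's ingress list and wake the receiver via
 * sk_data_ready so bpf_tcp_recvmsg() can pick it up.
 */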
John Fastabend8934ce22018-03-28 12:49:15 -0700555static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
556 struct smap_psock *psock,
557 struct sk_msg_buff *md, int flags)
558{
559 bool apply = apply_bytes;
560 size_t size, copied = 0;
561 struct sk_msg_buff *r;
562 int err = 0, i;
563
564 r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL);
565 if (unlikely(!r))
566 return -ENOMEM;
567
568 lock_sock(sk);
569 r->sg_start = md->sg_start;
570 i = md->sg_start;
571
572 do {
John Fastabend8934ce22018-03-28 12:49:15 -0700573 size = (apply && apply_bytes < md->sg_data[i].length) ?
574 apply_bytes : md->sg_data[i].length;
575
576 if (!sk_wmem_schedule(sk, size)) {
577 if (!copied)
578 err = -ENOMEM;
579 break;
580 }
581
582 sk_mem_charge(sk, size);
John Fastabend4fcfdfb2018-04-23 15:39:33 -0700583 r->sg_data[i] = md->sg_data[i];
John Fastabend8934ce22018-03-28 12:49:15 -0700584 r->sg_data[i].length = size;
585 md->sg_data[i].length -= size;
586 md->sg_data[i].offset += size;
587 copied += size;
588
589 if (md->sg_data[i].length) {
590 get_page(sg_page(&r->sg_data[i]));
591 r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1;
592 } else {
593 i++;
594 if (i == MAX_SKB_FRAGS)
595 i = 0;
596 r->sg_end = i;
597 }
598
599 if (apply) {
600 apply_bytes -= size;
601 if (!apply_bytes)
602 break;
603 }
604 } while (i != md->sg_end);
605
606 md->sg_start = i;
607
608 if (!err) {
609 list_add_tail(&r->list, &psock->ingress);
610 sk->sk_data_ready(sk);
611 } else {
612 free_start_sg(sk, r);
613 kfree(r);
614 }
615
616 release_sock(sk);
617 return err;
618}
619
John Fastabend4f738ad2018-03-18 12:57:10 -0700620static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
621 struct sk_msg_buff *md,
622 int flags)
623{
John Fastabendabaeb092018-05-02 13:50:29 -0700624 bool ingress = !!(md->flags & BPF_F_INGRESS);
John Fastabend4f738ad2018-03-18 12:57:10 -0700625 struct smap_psock *psock;
626 struct scatterlist *sg;
John Fastabendabaeb092018-05-02 13:50:29 -0700627 int err = 0;
John Fastabend4f738ad2018-03-18 12:57:10 -0700628
629 sg = md->sg_data;
630
631 rcu_read_lock();
632 psock = smap_psock_sk(sk);
633 if (unlikely(!psock))
634 goto out_rcu;
635
636 if (!refcount_inc_not_zero(&psock->refcnt))
637 goto out_rcu;
638
639 rcu_read_unlock();
John Fastabend8934ce22018-03-28 12:49:15 -0700640
641 if (ingress) {
642 err = bpf_tcp_ingress(sk, send, psock, md, flags);
643 } else {
644 lock_sock(sk);
645 err = bpf_tcp_push(sk, send, md, flags, false);
646 release_sock(sk);
647 }
John Fastabend4f738ad2018-03-18 12:57:10 -0700648 smap_release_sock(psock, sk);
649 if (unlikely(err))
650 goto out;
651 return 0;
652out_rcu:
653 rcu_read_unlock();
654out:
John Fastabendabaeb092018-05-02 13:50:29 -0700655 free_bytes_sg(NULL, send, md, false);
656 return err;
John Fastabend4f738ad2018-03-18 12:57:10 -0700657}
658
659static inline void bpf_md_init(struct smap_psock *psock)
660{
661 if (!psock->apply_bytes) {
662 psock->eval = __SK_NONE;
663 if (psock->sk_redir) {
664 sock_put(psock->sk_redir);
665 psock->sk_redir = NULL;
666 }
667 }
668}
669
670static void apply_bytes_dec(struct smap_psock *psock, int i)
671{
672 if (psock->apply_bytes) {
673 if (psock->apply_bytes < i)
674 psock->apply_bytes = 0;
675 else
676 psock->apply_bytes -= i;
677 }
678}
679
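/* Core sendmsg/sendpage verdict loop. Data may first be corked until
 * cork_bytes have been collected; then __SK_PASS pushes to this socket,
 * __SK_REDIRECT hands the buffer to the target socket (dropping the sock
 * lock around the transfer), and __SK_DROP frees the bytes and returns
 * -EACCES. apply_bytes limits how much data a single verdict covers.
 */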
680static int bpf_exec_tx_verdict(struct smap_psock *psock,
681 struct sk_msg_buff *m,
682 struct sock *sk,
683 int *copied, int flags)
684{
685 bool cork = false, enospc = (m->sg_start == m->sg_end);
686 struct sock *redir;
687 int err = 0;
688 int send;
689
690more_data:
691 if (psock->eval == __SK_NONE)
692 psock->eval = smap_do_tx_msg(sk, psock, m);
693
694 if (m->cork_bytes &&
695 m->cork_bytes > psock->sg_size && !enospc) {
696 psock->cork_bytes = m->cork_bytes - psock->sg_size;
697 if (!psock->cork) {
698 psock->cork = kcalloc(1,
699 sizeof(struct sk_msg_buff),
700 GFP_ATOMIC | __GFP_NOWARN);
701
702 if (!psock->cork) {
703 err = -ENOMEM;
704 goto out_err;
705 }
706 }
707 memcpy(psock->cork, m, sizeof(*m));
708 goto out_err;
709 }
710
711 send = psock->sg_size;
712 if (psock->apply_bytes && psock->apply_bytes < send)
713 send = psock->apply_bytes;
714
715 switch (psock->eval) {
716 case __SK_PASS:
717 err = bpf_tcp_push(sk, send, m, flags, true);
718 if (unlikely(err)) {
719 *copied -= free_start_sg(sk, m);
720 break;
721 }
722
723 apply_bytes_dec(psock, send);
724 psock->sg_size -= send;
725 break;
726 case __SK_REDIRECT:
727 redir = psock->sk_redir;
728 apply_bytes_dec(psock, send);
729
730 if (psock->cork) {
731 cork = true;
732 psock->cork = NULL;
733 }
734
735 return_mem_sg(sk, send, m);
736 release_sock(sk);
737
738 err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags);
739 lock_sock(sk);
740
John Fastabendfec51d42018-05-02 13:50:24 -0700741 if (unlikely(err < 0)) {
742 free_start_sg(sk, m);
743 psock->sg_size = 0;
744 if (!cork)
745 *copied -= send;
746 } else {
747 psock->sg_size -= send;
748 }
749
John Fastabend4f738ad2018-03-18 12:57:10 -0700750 if (cork) {
751 free_start_sg(sk, m);
John Fastabendfec51d42018-05-02 13:50:24 -0700752 psock->sg_size = 0;
John Fastabend4f738ad2018-03-18 12:57:10 -0700753 kfree(m);
754 m = NULL;
John Fastabendfec51d42018-05-02 13:50:24 -0700755 err = 0;
John Fastabend4f738ad2018-03-18 12:57:10 -0700756 }
John Fastabend4f738ad2018-03-18 12:57:10 -0700757 break;
758 case __SK_DROP:
759 default:
John Fastabendabaeb092018-05-02 13:50:29 -0700760 free_bytes_sg(sk, send, m, true);
John Fastabend4f738ad2018-03-18 12:57:10 -0700761 apply_bytes_dec(psock, send);
762 *copied -= send;
763 psock->sg_size -= send;
764 err = -EACCES;
765 break;
766 }
767
768 if (likely(!err)) {
769 bpf_md_init(psock);
770 if (m &&
771 m->sg_data[m->sg_start].page_link &&
772 m->sg_data[m->sg_start].length)
773 goto more_data;
774 }
775
776out_err:
777 return err;
778}
779
John Fastabende20f7332018-04-23 15:39:28 -0700780static int bpf_wait_data(struct sock *sk,
781 struct smap_psock *psk, int flags,
782 long timeo, int *err)
783{
784 int rc;
785
786 DEFINE_WAIT_FUNC(wait, woken_wake_function);
787
788 add_wait_queue(sk_sleep(sk), &wait);
789 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
790 rc = sk_wait_event(sk, &timeo,
791 !list_empty(&psk->ingress) ||
792 !skb_queue_empty(&sk->sk_receive_queue),
793 &wait);
794 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
795 remove_wait_queue(sk_sleep(sk), &wait);
796
797 return rc;
798}
799
John Fastabend8934ce22018-03-28 12:49:15 -0700800static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
801 int nonblock, int flags, int *addr_len)
802{
803 struct iov_iter *iter = &msg->msg_iter;
804 struct smap_psock *psock;
805 int copied = 0;
806
807 if (unlikely(flags & MSG_ERRQUEUE))
808 return inet_recv_error(sk, msg, len, addr_len);
809
810 rcu_read_lock();
811 psock = smap_psock_sk(sk);
812 if (unlikely(!psock))
813 goto out;
814
815 if (unlikely(!refcount_inc_not_zero(&psock->refcnt)))
816 goto out;
817 rcu_read_unlock();
818
819 if (!skb_queue_empty(&sk->sk_receive_queue))
820 return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
821
822 lock_sock(sk);
John Fastabende20f7332018-04-23 15:39:28 -0700823bytes_ready:
John Fastabend8934ce22018-03-28 12:49:15 -0700824 while (copied != len) {
825 struct scatterlist *sg;
826 struct sk_msg_buff *md;
827 int i;
828
829 md = list_first_entry_or_null(&psock->ingress,
830 struct sk_msg_buff, list);
831 if (unlikely(!md))
832 break;
833 i = md->sg_start;
834 do {
835 struct page *page;
836 int n, copy;
837
838 sg = &md->sg_data[i];
839 copy = sg->length;
840 page = sg_page(sg);
841
842 if (copied + copy > len)
843 copy = len - copied;
844
845 n = copy_page_to_iter(page, sg->offset, copy, iter);
846 if (n != copy) {
847 md->sg_start = i;
848 release_sock(sk);
849 smap_release_sock(psock, sk);
850 return -EFAULT;
851 }
852
853 copied += copy;
854 sg->offset += copy;
855 sg->length -= copy;
856 sk_mem_uncharge(sk, copy);
857
858 if (!sg->length) {
859 i++;
860 if (i == MAX_SKB_FRAGS)
861 i = 0;
John Fastabendfa246692018-03-28 12:49:25 -0700862 if (!md->skb)
863 put_page(page);
John Fastabend8934ce22018-03-28 12:49:15 -0700864 }
865 if (copied == len)
866 break;
867 } while (i != md->sg_end);
868 md->sg_start = i;
869
870 if (!sg->length && md->sg_start == md->sg_end) {
871 list_del(&md->list);
John Fastabendfa246692018-03-28 12:49:25 -0700872 if (md->skb)
873 consume_skb(md->skb);
John Fastabend8934ce22018-03-28 12:49:15 -0700874 kfree(md);
875 }
876 }
877
John Fastabende20f7332018-04-23 15:39:28 -0700878 if (!copied) {
879 long timeo;
880 int data;
881 int err = 0;
882
883 timeo = sock_rcvtimeo(sk, nonblock);
884 data = bpf_wait_data(sk, psock, flags, timeo, &err);
885
886 if (data) {
887 if (!skb_queue_empty(&sk->sk_receive_queue)) {
888 release_sock(sk);
889 smap_release_sock(psock, sk);
890 copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
891 return copied;
892 }
893 goto bytes_ready;
894 }
895
896 if (err)
897 copied = err;
898 }
899
John Fastabend8934ce22018-03-28 12:49:15 -0700900 release_sock(sk);
901 smap_release_sock(psock, sk);
902 return copied;
903out:
904 rcu_read_unlock();
905 return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
906}
907
908
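/* sendmsg replacement installed by bpf_tcp_init(). Falls back to
 * tcp_sendmsg() when no psock is attached, otherwise copies the user data
 * into a sk_msg_buff scatterlist (or the cork buffer) and runs
 * bpf_exec_tx_verdict() on it.
 */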
John Fastabend4f738ad2018-03-18 12:57:10 -0700909static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
910{
911 int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
912 struct sk_msg_buff md = {0};
913 unsigned int sg_copy = 0;
914 struct smap_psock *psock;
915 int copied = 0, err = 0;
916 struct scatterlist *sg;
917 long timeo;
918
	/* It's possible a sock event or user removed the psock _but_ the ops
	 * have not been reprogrammed yet, so we get here. In this case fall back
	 * to tcp_sendmsg. Note this only works because we _only_ ever allow
	 * a single ULP; there is no hierarchy here.
	 */
924 rcu_read_lock();
925 psock = smap_psock_sk(sk);
926 if (unlikely(!psock)) {
927 rcu_read_unlock();
928 return tcp_sendmsg(sk, msg, size);
929 }
930
	/* Increment the psock refcnt to ensure it's not released while sending a
	 * message. Required because sk lookup and bpf programs are used in
	 * separate rcu critical sections. It's OK if we lose the map entry
	 * but we can't lose the sock reference.
	 */
936 if (!refcount_inc_not_zero(&psock->refcnt)) {
937 rcu_read_unlock();
938 return tcp_sendmsg(sk, msg, size);
939 }
940
941 sg = md.sg_data;
Prashant Bhole6ef6d842018-03-30 09:21:00 +0900942 sg_init_marker(sg, MAX_SKB_FRAGS);
John Fastabend4f738ad2018-03-18 12:57:10 -0700943 rcu_read_unlock();
944
945 lock_sock(sk);
946 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
947
948 while (msg_data_left(msg)) {
949 struct sk_msg_buff *m;
950 bool enospc = false;
951 int copy;
952
953 if (sk->sk_err) {
954 err = sk->sk_err;
955 goto out_err;
956 }
957
958 copy = msg_data_left(msg);
959 if (!sk_stream_memory_free(sk))
960 goto wait_for_sndbuf;
961
962 m = psock->cork_bytes ? psock->cork : &md;
963 m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end;
964 err = sk_alloc_sg(sk, copy, m->sg_data,
965 m->sg_start, &m->sg_end, &sg_copy,
966 m->sg_end - 1);
967 if (err) {
968 if (err != -ENOSPC)
969 goto wait_for_memory;
970 enospc = true;
971 copy = sg_copy;
972 }
973
974 err = memcopy_from_iter(sk, m, &msg->msg_iter, copy);
975 if (err < 0) {
976 free_curr_sg(sk, m);
977 goto out_err;
978 }
979
980 psock->sg_size += copy;
981 copied += copy;
982 sg_copy = 0;
983
		/* When bytes are being corked, skip running the BPF program and
		 * applying the verdict unless there is no more buffer space. In
		 * the ENOSPC case simply run the BPF program with the currently
		 * accumulated data. We don't have much choice at this point;
		 * we could try extending the page frags or chaining complex
		 * frags, but even in these cases _eventually_ we will hit an
		 * OOM scenario. More complex recovery schemes may be
		 * implemented in the future, but BPF programs must handle
		 * the case where apply_cork requests are not honored. The
		 * canonical method to verify this is to check data length.
		 */
995 if (psock->cork_bytes) {
996 if (copy > psock->cork_bytes)
997 psock->cork_bytes = 0;
998 else
999 psock->cork_bytes -= copy;
1000
1001 if (psock->cork_bytes && !enospc)
1002 goto out_cork;
1003
1004 /* All cork bytes accounted for re-run filter */
1005 psock->eval = __SK_NONE;
1006 psock->cork_bytes = 0;
1007 }
1008
1009 err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
1010 if (unlikely(err < 0))
1011 goto out_err;
1012 continue;
1013wait_for_sndbuf:
1014 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1015wait_for_memory:
1016 err = sk_stream_wait_memory(sk, &timeo);
1017 if (err)
1018 goto out_err;
1019 }
1020out_err:
1021 if (err < 0)
1022 err = sk_stream_error(sk, msg->msg_flags, err);
1023out_cork:
1024 release_sock(sk);
1025 smap_release_sock(psock, sk);
1026 return copied ? copied : err;
1027}
1028
1029static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
1030 int offset, size_t size, int flags)
1031{
1032 struct sk_msg_buff md = {0}, *m = NULL;
1033 int err = 0, copied = 0;
1034 struct smap_psock *psock;
1035 struct scatterlist *sg;
1036 bool enospc = false;
1037
1038 rcu_read_lock();
1039 psock = smap_psock_sk(sk);
1040 if (unlikely(!psock))
1041 goto accept;
1042
1043 if (!refcount_inc_not_zero(&psock->refcnt))
1044 goto accept;
1045 rcu_read_unlock();
1046
1047 lock_sock(sk);
1048
Prashant Bhole6ef6d842018-03-30 09:21:00 +09001049 if (psock->cork_bytes) {
John Fastabend4f738ad2018-03-18 12:57:10 -07001050 m = psock->cork;
Prashant Bhole6ef6d842018-03-30 09:21:00 +09001051 sg = &m->sg_data[m->sg_end];
1052 } else {
John Fastabend4f738ad2018-03-18 12:57:10 -07001053 m = &md;
Prashant Bhole6ef6d842018-03-30 09:21:00 +09001054 sg = m->sg_data;
1055 sg_init_marker(sg, MAX_SKB_FRAGS);
1056 }
John Fastabend4f738ad2018-03-18 12:57:10 -07001057
1058 /* Catch case where ring is full and sendpage is stalled. */
1059 if (unlikely(m->sg_end == m->sg_start &&
1060 m->sg_data[m->sg_end].length))
1061 goto out_err;
1062
1063 psock->sg_size += size;
John Fastabend4f738ad2018-03-18 12:57:10 -07001064 sg_set_page(sg, page, size, offset);
1065 get_page(page);
1066 m->sg_copy[m->sg_end] = true;
1067 sk_mem_charge(sk, size);
1068 m->sg_end++;
1069 copied = size;
1070
1071 if (m->sg_end == MAX_SKB_FRAGS)
1072 m->sg_end = 0;
1073
1074 if (m->sg_end == m->sg_start)
1075 enospc = true;
1076
1077 if (psock->cork_bytes) {
1078 if (size > psock->cork_bytes)
1079 psock->cork_bytes = 0;
1080 else
1081 psock->cork_bytes -= size;
1082
1083 if (psock->cork_bytes && !enospc)
1084 goto out_err;
1085
1086 /* All cork bytes accounted for re-run filter */
1087 psock->eval = __SK_NONE;
1088 psock->cork_bytes = 0;
1089 }
1090
1091 err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
1092out_err:
1093 release_sock(sk);
1094 smap_release_sock(psock, sk);
1095 return copied ? copied : err;
1096accept:
1097 rcu_read_unlock();
1098 return tcp_sendpage(sk, page, offset, size, flags);
1099}
1100
1101static void bpf_tcp_msg_add(struct smap_psock *psock,
1102 struct sock *sk,
1103 struct bpf_prog *tx_msg)
1104{
1105 struct bpf_prog *orig_tx_msg;
1106
1107 orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg);
1108 if (orig_tx_msg)
1109 bpf_prog_put(orig_tx_msg);
1110}
1111
John Fastabend1aa12bd2018-02-05 10:17:49 -08001112static int bpf_tcp_ulp_register(void)
1113{
1114 tcp_bpf_proto = tcp_prot;
1115 tcp_bpf_proto.close = bpf_tcp_close;
	/* Once the BPF TX ULP is registered it is never unregistered. It
	 * will be in the ULP list for the lifetime of the system. Registering
	 * it more than once is not a problem.
	 */
John Fastabend1aa12bd2018-02-05 10:17:49 -08001120 return tcp_register_ulp(&bpf_tcp_ulp_ops);
1121}
1122
John Fastabend174a79f2017-08-15 22:32:47 -07001123static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
1124{
1125 struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
1126 int rc;
1127
1128 if (unlikely(!prog))
John Fastabend04686ef2017-10-31 19:17:31 -07001129 return __SK_DROP;
John Fastabend174a79f2017-08-15 22:32:47 -07001130
1131 skb_orphan(skb);
John Fastabend34f795022017-10-18 07:10:36 -07001132 /* We need to ensure that BPF metadata for maps is also cleared
1133 * when we orphan the skb so that we don't have the possibility
1134 * to reference a stale map.
1135 */
John Fastabende5cd3ab2018-05-14 10:00:16 -07001136 TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
John Fastabend174a79f2017-08-15 22:32:47 -07001137 skb->sk = psock->sock;
Daniel Borkmann6aaae2b2017-09-25 02:25:50 +02001138 bpf_compute_data_pointers(skb);
John Fastabend34f795022017-10-18 07:10:36 -07001139 preempt_disable();
John Fastabend174a79f2017-08-15 22:32:47 -07001140 rc = (*prog->bpf_func)(skb, prog->insnsi);
John Fastabend34f795022017-10-18 07:10:36 -07001141 preempt_enable();
John Fastabend174a79f2017-08-15 22:32:47 -07001142 skb->sk = NULL;
1143
John Fastabend04686ef2017-10-31 19:17:31 -07001144 /* Moving return codes from UAPI namespace into internal namespace */
John Fastabendbfa640752017-10-27 09:45:53 -07001145 return rc == SK_PASS ?
John Fastabende5cd3ab2018-05-14 10:00:16 -07001146 (TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) :
John Fastabend04686ef2017-10-31 19:17:31 -07001147 __SK_DROP;
John Fastabend174a79f2017-08-15 22:32:47 -07001148}
1149
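/* skb counterpart of bpf_tcp_ingress(): used when a stream verdict program
 * redirects an skb with BPF_F_INGRESS. The skb is converted to a
 * scatterlist, charged to the receiving socket and queued on its psock
 * ingress list for bpf_tcp_recvmsg().
 */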
John Fastabendfa246692018-03-28 12:49:25 -07001150static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb)
1151{
1152 struct sock *sk = psock->sock;
1153 int copied = 0, num_sg;
1154 struct sk_msg_buff *r;
1155
1156 r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC);
1157 if (unlikely(!r))
1158 return -EAGAIN;
1159
1160 if (!sk_rmem_schedule(sk, skb, skb->len)) {
1161 kfree(r);
1162 return -EAGAIN;
1163 }
1164
1165 sg_init_table(r->sg_data, MAX_SKB_FRAGS);
1166 num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len);
1167 if (unlikely(num_sg < 0)) {
1168 kfree(r);
1169 return num_sg;
1170 }
1171 sk_mem_charge(sk, skb->len);
1172 copied = skb->len;
1173 r->sg_start = 0;
1174 r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg;
1175 r->skb = skb;
1176 list_add_tail(&r->list, &psock->ingress);
1177 sk->sk_data_ready(sk);
1178 return copied;
1179}
1180
John Fastabend174a79f2017-08-15 22:32:47 -07001181static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
1182{
John Fastabendfa246692018-03-28 12:49:25 -07001183 struct smap_psock *peer;
John Fastabend90a96312017-09-01 11:29:26 -07001184 struct sock *sk;
John Fastabendfa246692018-03-28 12:49:25 -07001185 __u32 in;
John Fastabend174a79f2017-08-15 22:32:47 -07001186 int rc;
1187
John Fastabend174a79f2017-08-15 22:32:47 -07001188 rc = smap_verdict_func(psock, skb);
1189 switch (rc) {
John Fastabend04686ef2017-10-31 19:17:31 -07001190 case __SK_REDIRECT:
John Fastabend34f795022017-10-18 07:10:36 -07001191 sk = do_sk_redirect_map(skb);
John Fastabendfa246692018-03-28 12:49:25 -07001192 if (!sk) {
1193 kfree_skb(skb);
1194 break;
1195 }
John Fastabend174a79f2017-08-15 22:32:47 -07001196
John Fastabendfa246692018-03-28 12:49:25 -07001197 peer = smap_psock_sk(sk);
1198 in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
1199
1200 if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) ||
1201 !test_bit(SMAP_TX_RUNNING, &peer->state))) {
1202 kfree_skb(skb);
1203 break;
1204 }
1205
1206 if (!in && sock_writeable(sk)) {
1207 skb_set_owner_w(skb, sk);
1208 skb_queue_tail(&peer->rxqueue, skb);
1209 schedule_work(&peer->tx_work);
1210 break;
1211 } else if (in &&
1212 atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {
1213 skb_queue_tail(&peer->rxqueue, skb);
1214 schedule_work(&peer->tx_work);
1215 break;
John Fastabend174a79f2017-08-15 22:32:47 -07001216 }
1217 /* Fall through and free skb otherwise */
John Fastabend04686ef2017-10-31 19:17:31 -07001218 case __SK_DROP:
John Fastabend174a79f2017-08-15 22:32:47 -07001219 default:
John Fastabend174a79f2017-08-15 22:32:47 -07001220 kfree_skb(skb);
1221 }
1222}
1223
1224static void smap_report_sk_error(struct smap_psock *psock, int err)
1225{
1226 struct sock *sk = psock->sock;
1227
1228 sk->sk_err = err;
1229 sk->sk_error_report(sk);
1230}
1231
John Fastabend174a79f2017-08-15 22:32:47 -07001232static void smap_read_sock_strparser(struct strparser *strp,
1233 struct sk_buff *skb)
1234{
1235 struct smap_psock *psock;
1236
1237 rcu_read_lock();
1238 psock = container_of(strp, struct smap_psock, strp);
1239 smap_do_verdict(psock, skb);
1240 rcu_read_unlock();
1241}
1242
1243/* Called with lock held on socket */
1244static void smap_data_ready(struct sock *sk)
1245{
1246 struct smap_psock *psock;
1247
John Fastabendd26e597d2017-08-28 07:10:45 -07001248 rcu_read_lock();
John Fastabend174a79f2017-08-15 22:32:47 -07001249 psock = smap_psock_sk(sk);
John Fastabendd26e597d2017-08-28 07:10:45 -07001250 if (likely(psock)) {
1251 write_lock_bh(&sk->sk_callback_lock);
John Fastabend174a79f2017-08-15 22:32:47 -07001252 strp_data_ready(&psock->strp);
John Fastabendd26e597d2017-08-28 07:10:45 -07001253 write_unlock_bh(&sk->sk_callback_lock);
1254 }
1255 rcu_read_unlock();
John Fastabend174a79f2017-08-15 22:32:47 -07001256}
1257
1258static void smap_tx_work(struct work_struct *w)
1259{
1260 struct smap_psock *psock;
1261 struct sk_buff *skb;
1262 int rem, off, n;
1263
1264 psock = container_of(w, struct smap_psock, tx_work);
1265
1266 /* lock sock to avoid losing sk_socket at some point during loop */
1267 lock_sock(psock->sock);
1268 if (psock->save_skb) {
1269 skb = psock->save_skb;
1270 rem = psock->save_rem;
1271 off = psock->save_off;
1272 psock->save_skb = NULL;
1273 goto start;
1274 }
1275
1276 while ((skb = skb_dequeue(&psock->rxqueue))) {
John Fastabendfa246692018-03-28 12:49:25 -07001277 __u32 flags;
1278
John Fastabend174a79f2017-08-15 22:32:47 -07001279 rem = skb->len;
1280 off = 0;
1281start:
John Fastabendfa246692018-03-28 12:49:25 -07001282 flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
John Fastabend174a79f2017-08-15 22:32:47 -07001283 do {
John Fastabendfa246692018-03-28 12:49:25 -07001284 if (likely(psock->sock->sk_socket)) {
1285 if (flags)
1286 n = smap_do_ingress(psock, skb);
1287 else
1288 n = skb_send_sock_locked(psock->sock,
1289 skb, off, rem);
1290 } else {
John Fastabend174a79f2017-08-15 22:32:47 -07001291 n = -EINVAL;
John Fastabendfa246692018-03-28 12:49:25 -07001292 }
1293
John Fastabend174a79f2017-08-15 22:32:47 -07001294 if (n <= 0) {
1295 if (n == -EAGAIN) {
1296 /* Retry when space is available */
1297 psock->save_skb = skb;
1298 psock->save_rem = rem;
1299 psock->save_off = off;
1300 goto out;
1301 }
1302 /* Hard errors break pipe and stop xmit */
1303 smap_report_sk_error(psock, n ? -n : EPIPE);
1304 clear_bit(SMAP_TX_RUNNING, &psock->state);
John Fastabend174a79f2017-08-15 22:32:47 -07001305 kfree_skb(skb);
1306 goto out;
1307 }
1308 rem -= n;
1309 off += n;
1310 } while (rem);
John Fastabendfa246692018-03-28 12:49:25 -07001311
1312 if (!flags)
1313 kfree_skb(skb);
John Fastabend174a79f2017-08-15 22:32:47 -07001314 }
1315out:
1316 release_sock(psock->sock);
1317}
1318
1319static void smap_write_space(struct sock *sk)
1320{
1321 struct smap_psock *psock;
1322
1323 rcu_read_lock();
1324 psock = smap_psock_sk(sk);
1325 if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state)))
1326 schedule_work(&psock->tx_work);
1327 rcu_read_unlock();
1328}
1329
1330static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
1331{
John Fastabend174a79f2017-08-15 22:32:47 -07001332 if (!psock->strp_enabled)
John Fastabend2f857d02017-08-28 07:10:25 -07001333 return;
John Fastabend174a79f2017-08-15 22:32:47 -07001334 sk->sk_data_ready = psock->save_data_ready;
1335 sk->sk_write_space = psock->save_write_space;
John Fastabend174a79f2017-08-15 22:32:47 -07001336 psock->save_data_ready = NULL;
1337 psock->save_write_space = NULL;
John Fastabend174a79f2017-08-15 22:32:47 -07001338 strp_stop(&psock->strp);
1339 psock->strp_enabled = false;
John Fastabend174a79f2017-08-15 22:32:47 -07001340}
1341
1342static void smap_destroy_psock(struct rcu_head *rcu)
1343{
1344 struct smap_psock *psock = container_of(rcu,
1345 struct smap_psock, rcu);
1346
	/* Now that a grace period has passed there is no longer
	 * any reference to this sock in the sockmap, so we can
	 * destroy the psock, strparser, and bpf programs. But,
	 * because we use workqueue sync operations we cannot
	 * do it in rcu context.
	 */
1353 schedule_work(&psock->gc_work);
1354}
1355
John Fastabend2f857d02017-08-28 07:10:25 -07001356static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
John Fastabend174a79f2017-08-15 22:32:47 -07001357{
John Fastabendffa35662018-03-18 12:56:54 -07001358 if (refcount_dec_and_test(&psock->refcnt)) {
1359 tcp_cleanup_ulp(sock);
1360 smap_stop_sock(psock, sock);
1361 clear_bit(SMAP_TX_RUNNING, &psock->state);
1362 rcu_assign_sk_user_data(sock, NULL);
1363 call_rcu_sched(&psock->rcu, smap_destroy_psock);
1364 }
John Fastabend174a79f2017-08-15 22:32:47 -07001365}
1366
1367static int smap_parse_func_strparser(struct strparser *strp,
1368 struct sk_buff *skb)
1369{
1370 struct smap_psock *psock;
1371 struct bpf_prog *prog;
1372 int rc;
1373
1374 rcu_read_lock();
1375 psock = container_of(strp, struct smap_psock, strp);
1376 prog = READ_ONCE(psock->bpf_parse);
1377
1378 if (unlikely(!prog)) {
1379 rcu_read_unlock();
1380 return skb->len;
1381 }
1382
	/* Attach the socket for the bpf program to use if needed. We can do
	 * this because strparser clones the skb before handing it to an upper
	 * layer, meaning skb_orphan has been called. We NULL sk on the
	 * way out to ensure we don't trigger a BUG_ON in skb/sk operations
	 * later, and because we are not charging the memory of this skb to
	 * any socket yet.
	 */
1390 skb->sk = psock->sock;
Daniel Borkmann6aaae2b2017-09-25 02:25:50 +02001391 bpf_compute_data_pointers(skb);
John Fastabend174a79f2017-08-15 22:32:47 -07001392 rc = (*prog->bpf_func)(skb, prog->insnsi);
1393 skb->sk = NULL;
1394 rcu_read_unlock();
1395 return rc;
1396}
1397
John Fastabend174a79f2017-08-15 22:32:47 -07001398static int smap_read_sock_done(struct strparser *strp, int err)
1399{
1400 return err;
1401}
1402
1403static int smap_init_sock(struct smap_psock *psock,
1404 struct sock *sk)
1405{
Eric Biggers3fd87122017-08-24 14:38:51 -07001406 static const struct strp_callbacks cb = {
1407 .rcv_msg = smap_read_sock_strparser,
1408 .parse_msg = smap_parse_func_strparser,
1409 .read_sock_done = smap_read_sock_done,
1410 };
John Fastabend174a79f2017-08-15 22:32:47 -07001411
John Fastabend174a79f2017-08-15 22:32:47 -07001412 return strp_init(&psock->strp, sk, &cb);
1413}
1414
1415static void smap_init_progs(struct smap_psock *psock,
John Fastabend174a79f2017-08-15 22:32:47 -07001416 struct bpf_prog *verdict,
1417 struct bpf_prog *parse)
1418{
1419 struct bpf_prog *orig_parse, *orig_verdict;
1420
1421 orig_parse = xchg(&psock->bpf_parse, parse);
1422 orig_verdict = xchg(&psock->bpf_verdict, verdict);
1423
1424 if (orig_verdict)
1425 bpf_prog_put(orig_verdict);
1426 if (orig_parse)
1427 bpf_prog_put(orig_parse);
1428}
1429
1430static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
1431{
1432 if (sk->sk_data_ready == smap_data_ready)
1433 return;
1434 psock->save_data_ready = sk->sk_data_ready;
1435 psock->save_write_space = sk->sk_write_space;
John Fastabend174a79f2017-08-15 22:32:47 -07001436 sk->sk_data_ready = smap_data_ready;
1437 sk->sk_write_space = smap_write_space;
John Fastabend174a79f2017-08-15 22:32:47 -07001438 psock->strp_enabled = true;
1439}
1440
1441static void sock_map_remove_complete(struct bpf_stab *stab)
1442{
1443 bpf_map_area_free(stab->sock_map);
1444 kfree(stab);
1445}
1446
1447static void smap_gc_work(struct work_struct *w)
1448{
John Fastabend2f857d02017-08-28 07:10:25 -07001449 struct smap_psock_map_entry *e, *tmp;
John Fastabend8934ce22018-03-28 12:49:15 -07001450 struct sk_msg_buff *md, *mtmp;
John Fastabend174a79f2017-08-15 22:32:47 -07001451 struct smap_psock *psock;
1452
1453 psock = container_of(w, struct smap_psock, gc_work);
1454
1455 /* no callback lock needed because we already detached sockmap ops */
1456 if (psock->strp_enabled)
1457 strp_done(&psock->strp);
1458
1459 cancel_work_sync(&psock->tx_work);
1460 __skb_queue_purge(&psock->rxqueue);
1461
1462 /* At this point all strparser and xmit work must be complete */
1463 if (psock->bpf_parse)
1464 bpf_prog_put(psock->bpf_parse);
1465 if (psock->bpf_verdict)
1466 bpf_prog_put(psock->bpf_verdict);
John Fastabend4f738ad2018-03-18 12:57:10 -07001467 if (psock->bpf_tx_msg)
1468 bpf_prog_put(psock->bpf_tx_msg);
1469
1470 if (psock->cork) {
1471 free_start_sg(psock->sock, psock->cork);
1472 kfree(psock->cork);
1473 }
John Fastabend174a79f2017-08-15 22:32:47 -07001474
John Fastabend8934ce22018-03-28 12:49:15 -07001475 list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
1476 list_del(&md->list);
1477 free_start_sg(psock->sock, md);
1478 kfree(md);
1479 }
1480
John Fastabend2f857d02017-08-28 07:10:25 -07001481 list_for_each_entry_safe(e, tmp, &psock->maps, list) {
1482 list_del(&e->list);
1483 kfree(e);
1484 }
John Fastabend174a79f2017-08-15 22:32:47 -07001485
John Fastabend4f738ad2018-03-18 12:57:10 -07001486 if (psock->sk_redir)
1487 sock_put(psock->sk_redir);
1488
John Fastabend174a79f2017-08-15 22:32:47 -07001489 sock_put(psock->sock);
1490 kfree(psock);
1491}
1492
John Fastabende5cd3ab2018-05-14 10:00:16 -07001493static struct smap_psock *smap_init_psock(struct sock *sock, int node)
John Fastabend174a79f2017-08-15 22:32:47 -07001494{
1495 struct smap_psock *psock;
1496
Martin KaFai Lau96eabe72017-08-18 11:28:00 -07001497 psock = kzalloc_node(sizeof(struct smap_psock),
1498 GFP_ATOMIC | __GFP_NOWARN,
John Fastabende5cd3ab2018-05-14 10:00:16 -07001499 node);
John Fastabend174a79f2017-08-15 22:32:47 -07001500 if (!psock)
1501 return ERR_PTR(-ENOMEM);
1502
John Fastabend4f738ad2018-03-18 12:57:10 -07001503 psock->eval = __SK_NONE;
John Fastabend174a79f2017-08-15 22:32:47 -07001504 psock->sock = sock;
1505 skb_queue_head_init(&psock->rxqueue);
1506 INIT_WORK(&psock->tx_work, smap_tx_work);
1507 INIT_WORK(&psock->gc_work, smap_gc_work);
John Fastabend2f857d02017-08-28 07:10:25 -07001508 INIT_LIST_HEAD(&psock->maps);
John Fastabend8934ce22018-03-28 12:49:15 -07001509 INIT_LIST_HEAD(&psock->ingress);
John Fastabendffa35662018-03-18 12:56:54 -07001510 refcount_set(&psock->refcnt, 1);
John Fastabend174a79f2017-08-15 22:32:47 -07001511
1512 rcu_assign_sk_user_data(sock, psock);
1513 sock_hold(sock);
1514 return psock;
1515}
1516
1517static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
1518{
1519 struct bpf_stab *stab;
John Fastabend174a79f2017-08-15 22:32:47 -07001520 u64 cost;
Eric Dumazet952fad82018-02-13 15:33:52 -08001521 int err;
John Fastabend174a79f2017-08-15 22:32:47 -07001522
John Fastabendfb50df82017-10-18 07:11:22 -07001523 if (!capable(CAP_NET_ADMIN))
1524 return ERR_PTR(-EPERM);
1525
John Fastabend174a79f2017-08-15 22:32:47 -07001526 /* check sanity of attributes */
1527 if (attr->max_entries == 0 || attr->key_size != 4 ||
Chenbo Feng6e71b042017-10-18 13:00:22 -07001528 attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
John Fastabend174a79f2017-08-15 22:32:47 -07001529 return ERR_PTR(-EINVAL);
1530
John Fastabend1aa12bd2018-02-05 10:17:49 -08001531 err = bpf_tcp_ulp_register();
1532 if (err && err != -EEXIST)
1533 return ERR_PTR(err);
1534
John Fastabend174a79f2017-08-15 22:32:47 -07001535 stab = kzalloc(sizeof(*stab), GFP_USER);
1536 if (!stab)
1537 return ERR_PTR(-ENOMEM);
1538
Jakub Kicinskibd475642018-01-11 20:29:06 -08001539 bpf_map_init_from_attr(&stab->map, attr);
John Fastabend174a79f2017-08-15 22:32:47 -07001540
1541 /* make sure page count doesn't overflow */
1542 cost = (u64) stab->map.max_entries * sizeof(struct sock *);
Eric Dumazet952fad82018-02-13 15:33:52 -08001543 err = -EINVAL;
John Fastabend174a79f2017-08-15 22:32:47 -07001544 if (cost >= U32_MAX - PAGE_SIZE)
1545 goto free_stab;
1546
1547 stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
1548
1549 /* if map size is larger than memlock limit, reject it early */
1550 err = bpf_map_precharge_memlock(stab->map.pages);
1551 if (err)
1552 goto free_stab;
1553
Dan Carpenterf740c342017-08-25 23:27:14 +03001554 err = -ENOMEM;
John Fastabend174a79f2017-08-15 22:32:47 -07001555 stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
Martin KaFai Lau96eabe72017-08-18 11:28:00 -07001556 sizeof(struct sock *),
1557 stab->map.numa_node);
John Fastabend174a79f2017-08-15 22:32:47 -07001558 if (!stab->sock_map)
1559 goto free_stab;
1560
John Fastabend174a79f2017-08-15 22:32:47 -07001561 return &stab->map;
1562free_stab:
1563 kfree(stab);
1564 return ERR_PTR(err);
1565}
1566
John Fastabend81110382018-05-14 10:00:17 -07001567static void smap_list_remove(struct smap_psock *psock,
1568 struct sock **entry,
1569 struct htab_elem *hash_link)
John Fastabend2f857d02017-08-28 07:10:25 -07001570{
1571 struct smap_psock_map_entry *e, *tmp;
1572
1573 list_for_each_entry_safe(e, tmp, &psock->maps, list) {
John Fastabend81110382018-05-14 10:00:17 -07001574 if (e->entry == entry || e->hash_link == hash_link) {
John Fastabend2f857d02017-08-28 07:10:25 -07001575 list_del(&e->list);
1576 break;
1577 }
1578 }
1579}
1580
John Fastabend174a79f2017-08-15 22:32:47 -07001581static void sock_map_free(struct bpf_map *map)
1582{
1583 struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
1584 int i;
1585
1586 synchronize_rcu();
1587
	/* At this point no update, lookup or delete operations can happen.
	 * However, be aware we can still get socket state event updates
	 * and data-ready callbacks that reference the psock from sk_user_data.
	 * Also, psock worker threads are still in flight. So smap_release_sock
	 * will only free the psock after cancel_sync on the worker threads
	 * and a grace period expires, to ensure the psock is really safe to
	 * remove.
	 */
1595 rcu_read_lock();
1596 for (i = 0; i < stab->map.max_entries; i++) {
John Fastabend2f857d02017-08-28 07:10:25 -07001597 struct smap_psock *psock;
John Fastabend174a79f2017-08-15 22:32:47 -07001598 struct sock *sock;
1599
1600 sock = xchg(&stab->sock_map[i], NULL);
1601 if (!sock)
1602 continue;
1603
John Fastabend2f857d02017-08-28 07:10:25 -07001604 write_lock_bh(&sock->sk_callback_lock);
1605 psock = smap_psock_sk(sock);
John Fastabend5731a872018-01-04 20:02:09 -08001606 /* This check handles a racing sock event that can get the
1607 * sk_callback_lock before this case but after xchg happens
1608 * causing the refcnt to hit zero and sock user data (psock)
1609 * to be null and queued for garbage collection.
1610 */
1611 if (likely(psock)) {
John Fastabend81110382018-05-14 10:00:17 -07001612 smap_list_remove(psock, &stab->sock_map[i], NULL);
John Fastabend5731a872018-01-04 20:02:09 -08001613 smap_release_sock(psock, sock);
1614 }
John Fastabend2f857d02017-08-28 07:10:25 -07001615 write_unlock_bh(&sock->sk_callback_lock);
John Fastabend174a79f2017-08-15 22:32:47 -07001616 }
1617 rcu_read_unlock();
1618
John Fastabend2f857d02017-08-28 07:10:25 -07001619 sock_map_remove_complete(stab);
John Fastabend174a79f2017-08-15 22:32:47 -07001620}
1621
1622static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
1623{
1624 struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
1625 u32 i = key ? *(u32 *)key : U32_MAX;
1626 u32 *next = (u32 *)next_key;
1627
1628 if (i >= stab->map.max_entries) {
1629 *next = 0;
1630 return 0;
1631 }
1632
1633 if (i == stab->map.max_entries - 1)
1634 return -ENOENT;
1635
1636 *next = i + 1;
1637 return 0;
1638}
1639
1640struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
1641{
1642 struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
1643
1644 if (key >= map->max_entries)
1645 return NULL;
1646
1647 return READ_ONCE(stab->sock_map[key]);
1648}
1649
1650static int sock_map_delete_elem(struct bpf_map *map, void *key)
1651{
1652 struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
John Fastabend2f857d02017-08-28 07:10:25 -07001653 struct smap_psock *psock;
John Fastabend174a79f2017-08-15 22:32:47 -07001654 int k = *(u32 *)key;
1655 struct sock *sock;
1656
1657 if (k >= map->max_entries)
1658 return -EINVAL;
1659
1660 sock = xchg(&stab->sock_map[k], NULL);
1661 if (!sock)
1662 return -EINVAL;
1663
John Fastabend2f857d02017-08-28 07:10:25 -07001664 write_lock_bh(&sock->sk_callback_lock);
1665 psock = smap_psock_sk(sock);
1666 if (!psock)
1667 goto out;
1668
1669 if (psock->bpf_parse)
1670 smap_stop_sock(psock, sock);
John Fastabend81110382018-05-14 10:00:17 -07001671 smap_list_remove(psock, &stab->sock_map[k], NULL);
John Fastabend2f857d02017-08-28 07:10:25 -07001672 smap_release_sock(psock, sock);
1673out:
1674 write_unlock_bh(&sock->sk_callback_lock);
John Fastabend174a79f2017-08-15 22:32:47 -07001675 return 0;
1676}
1677
/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
 * done inside rcu critical sections. This ensures on updates that the psock
 * will not be released via smap_release_sock() until concurrent updates/deletes
 * complete. All operations operate on sock_map using cmpxchg and xchg
 * operations to ensure we do not get stale references. Any reads into the
 * map must be done with READ_ONCE() because of this.
 *
 * A psock is destroyed via call_rcu and after any worker threads are cancelled
 * and synced, so we are certain all references from the update/lookup/delete
 * operations as well as references in the data path are no longer in use.
 *
 * A psock may exist in multiple maps, but only a single set of parse/verdict
 * programs may be inherited from the maps it belongs to. A reference count
 * is kept with the total number of references to the psock from all maps. The
 * psock will not be released until this reaches zero. The psock and sock
 * user data use the sk_callback_lock to protect critical data structures
 * from concurrent access. This allows us to avoid two updates from modifying
 * the user data in sock; since the lock is required anyway for modifying
 * callbacks, we simply increase its scope slightly.
 *
 * Rules to follow:
 * - psock must always be read inside RCU critical section
 * - sk_user_data must only be modified inside sk_callback_lock and read
 *   inside RCU critical section.
 * - psock->maps list must only be read & modified inside sk_callback_lock
 * - sock_map must use READ_ONCE and (cmp)xchg operations
 * - BPF verdict/parse programs must use READ_ONCE and xchg operations
 */
John Fastabende5cd3ab2018-05-14 10:00:16 -07001706
1707static int __sock_map_ctx_update_elem(struct bpf_map *map,
1708 struct bpf_sock_progs *progs,
1709 struct sock *sock,
1710 struct sock **map_link,
1711 void *key)
John Fastabend174a79f2017-08-15 22:32:47 -07001712{
John Fastabend4f738ad2018-03-18 12:57:10 -07001713 struct bpf_prog *verdict, *parse, *tx_msg;
John Fastabende5cd3ab2018-05-14 10:00:16 -07001714 struct smap_psock_map_entry *e = NULL;
John Fastabend2f857d02017-08-28 07:10:25 -07001715 struct smap_psock *psock;
John Fastabend4f738ad2018-03-18 12:57:10 -07001716 bool new = false;
Gustavo A. R. Silva0e436452018-05-17 09:08:43 -05001717 int err = 0;
John Fastabend174a79f2017-08-15 22:32:47 -07001718
John Fastabend2f857d02017-08-28 07:10:25 -07001719	/* 1. If the sock map has BPF programs attached, they will be inherited
 1720	 * by the sock being added. If the sock is already attached to BPF
 1721	 * programs this results in an error.
1722 */
John Fastabende5cd3ab2018-05-14 10:00:16 -07001723 verdict = READ_ONCE(progs->bpf_verdict);
1724 parse = READ_ONCE(progs->bpf_parse);
1725 tx_msg = READ_ONCE(progs->bpf_tx_msg);
John Fastabend174a79f2017-08-15 22:32:47 -07001726
John Fastabend2f857d02017-08-28 07:10:25 -07001727 if (parse && verdict) {
John Fastabend174a79f2017-08-15 22:32:47 -07001728 /* bpf prog refcnt may be zero if a concurrent attach operation
1729 * removes the program after the above READ_ONCE() but before
1730 * we increment the refcnt. If this is the case abort with an
1731 * error.
1732 */
John Fastabend96174562018-05-17 14:06:40 -07001733 verdict = bpf_prog_inc_not_zero(verdict);
John Fastabend174a79f2017-08-15 22:32:47 -07001734 if (IS_ERR(verdict))
1735 return PTR_ERR(verdict);
1736
John Fastabend96174562018-05-17 14:06:40 -07001737 parse = bpf_prog_inc_not_zero(parse);
John Fastabend174a79f2017-08-15 22:32:47 -07001738 if (IS_ERR(parse)) {
1739 bpf_prog_put(verdict);
1740 return PTR_ERR(parse);
1741 }
1742 }
1743
John Fastabend4f738ad2018-03-18 12:57:10 -07001744 if (tx_msg) {
John Fastabend96174562018-05-17 14:06:40 -07001745 tx_msg = bpf_prog_inc_not_zero(tx_msg);
John Fastabend4f738ad2018-03-18 12:57:10 -07001746 if (IS_ERR(tx_msg)) {
John Fastabenda593f702018-05-17 14:06:35 -07001747 if (parse && verdict) {
John Fastabend4f738ad2018-03-18 12:57:10 -07001748 bpf_prog_put(parse);
John Fastabenda593f702018-05-17 14:06:35 -07001749 bpf_prog_put(verdict);
1750 }
John Fastabend4f738ad2018-03-18 12:57:10 -07001751 return PTR_ERR(tx_msg);
1752 }
1753 }
1754
John Fastabend2f857d02017-08-28 07:10:25 -07001755 write_lock_bh(&sock->sk_callback_lock);
1756 psock = smap_psock_sk(sock);
1757
1758 /* 2. Do not allow inheriting programs if psock exists and has
1759 * already inherited programs. This would create confusion on
 1760	 * which parser/verdict program is running. If no psock exists,
 1761	 * create one. This is done inside sk_callback_lock so a concurrent
 1762	 * create cannot update the user data.
1763 */
1764 if (psock) {
1765 if (READ_ONCE(psock->bpf_parse) && parse) {
1766 err = -EBUSY;
1767 goto out_progs;
1768 }
John Fastabend4f738ad2018-03-18 12:57:10 -07001769 if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) {
1770 err = -EBUSY;
1771 goto out_progs;
1772 }
1773 if (!refcount_inc_not_zero(&psock->refcnt)) {
1774 err = -EAGAIN;
1775 goto out_progs;
1776 }
John Fastabend2f857d02017-08-28 07:10:25 -07001777 } else {
John Fastabende5cd3ab2018-05-14 10:00:16 -07001778 psock = smap_init_psock(sock, map->numa_node);
John Fastabend174a79f2017-08-15 22:32:47 -07001779 if (IS_ERR(psock)) {
John Fastabend2f857d02017-08-28 07:10:25 -07001780 err = PTR_ERR(psock);
1781 goto out_progs;
John Fastabend174a79f2017-08-15 22:32:47 -07001782 }
John Fastabend2f857d02017-08-28 07:10:25 -07001783
John Fastabend174a79f2017-08-15 22:32:47 -07001784 set_bit(SMAP_TX_RUNNING, &psock->state);
John Fastabend4f738ad2018-03-18 12:57:10 -07001785 new = true;
John Fastabend174a79f2017-08-15 22:32:47 -07001786 }
1787
John Fastabend81110382018-05-14 10:00:17 -07001788 if (map_link) {
1789 e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
1790 if (!e) {
1791 err = -ENOMEM;
1792 goto out_progs;
1793 }
John Fastabend2f857d02017-08-28 07:10:25 -07001794 }
John Fastabend2f857d02017-08-28 07:10:25 -07001795
1796 /* 3. At this point we have a reference to a valid psock that is
1797 * running. Attach any BPF programs needed.
1798 */
John Fastabend4f738ad2018-03-18 12:57:10 -07001799 if (tx_msg)
1800 bpf_tcp_msg_add(psock, sock, tx_msg);
1801 if (new) {
1802 err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
1803 if (err)
1804 goto out_free;
1805 }
1806
John Fastabend2f857d02017-08-28 07:10:25 -07001807 if (parse && verdict && !psock->strp_enabled) {
John Fastabend174a79f2017-08-15 22:32:47 -07001808 err = smap_init_sock(psock, sock);
1809 if (err)
John Fastabend2f857d02017-08-28 07:10:25 -07001810 goto out_free;
John Fastabende5cd3ab2018-05-14 10:00:16 -07001811 smap_init_progs(psock, verdict, parse);
John Fastabend174a79f2017-08-15 22:32:47 -07001812 smap_start_sock(psock, sock);
John Fastabend174a79f2017-08-15 22:32:47 -07001813 }
1814
John Fastabend2f857d02017-08-28 07:10:25 -07001815 /* 4. Place psock in sockmap for use and stop any programs on
 1816	 * the old sock, assuming it's not the same sock we are replacing
 1817	 * it with. Because we can only have a single set of programs, if
1818 * old_sock has a strp we can stop it.
1819 */
John Fastabende5cd3ab2018-05-14 10:00:16 -07001820 if (map_link) {
1821 e->entry = map_link;
1822 list_add_tail(&e->list, &psock->maps);
John Fastabend2f857d02017-08-28 07:10:25 -07001823 }
John Fastabende5cd3ab2018-05-14 10:00:16 -07001824 write_unlock_bh(&sock->sk_callback_lock);
1825 return err;
John Fastabend2f857d02017-08-28 07:10:25 -07001826out_free:
1827 smap_release_sock(psock, sock);
1828out_progs:
John Fastabenda593f702018-05-17 14:06:35 -07001829 if (parse && verdict) {
John Fastabend2f857d02017-08-28 07:10:25 -07001830 bpf_prog_put(parse);
John Fastabenda593f702018-05-17 14:06:35 -07001831 bpf_prog_put(verdict);
1832 }
John Fastabend4f738ad2018-03-18 12:57:10 -07001833 if (tx_msg)
1834 bpf_prog_put(tx_msg);
John Fastabend2f857d02017-08-28 07:10:25 -07001835 write_unlock_bh(&sock->sk_callback_lock);
1836 kfree(e);
John Fastabend174a79f2017-08-15 22:32:47 -07001837 return err;
1838}
1839
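/* Core sockmap update from a sock_ops context: validate @flags against the
 * current slot (BPF_NOEXIST/BPF_EXIST semantics), run the shared psock setup
 * in __sock_map_ctx_update_elem(), then xchg() the new sock into the slot
 * and release any old sock it replaces.
 */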
John Fastabende5cd3ab2018-05-14 10:00:16 -07001840static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
1841 struct bpf_map *map,
1842 void *key, u64 flags)
John Fastabend174a79f2017-08-15 22:32:47 -07001843{
1844 struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
John Fastabende5cd3ab2018-05-14 10:00:16 -07001845 struct bpf_sock_progs *progs = &stab->progs;
1846 struct sock *osock, *sock;
1847 u32 i = *(u32 *)key;
1848 int err;
1849
1850 if (unlikely(flags > BPF_EXIST))
1851 return -EINVAL;
1852
1853 if (unlikely(i >= stab->map.max_entries))
1854 return -E2BIG;
1855
1856 sock = READ_ONCE(stab->sock_map[i]);
1857 if (flags == BPF_EXIST && !sock)
1858 return -ENOENT;
1859 else if (flags == BPF_NOEXIST && sock)
1860 return -EEXIST;
1861
1862 sock = skops->sk;
1863 err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i],
1864 key);
1865 if (err)
1866 goto out;
1867
1868 osock = xchg(&stab->sock_map[i], sock);
1869 if (osock) {
1870 struct smap_psock *opsock = smap_psock_sk(osock);
1871
1872 write_lock_bh(&osock->sk_callback_lock);
John Fastabend81110382018-05-14 10:00:17 -07001873 smap_list_remove(opsock, &stab->sock_map[i], NULL);
John Fastabende5cd3ab2018-05-14 10:00:16 -07001874 smap_release_sock(opsock, osock);
1875 write_unlock_bh(&osock->sk_callback_lock);
1876 }
1877out:
John Fastabende23afe52018-05-16 16:38:14 -07001878 return err;
John Fastabende5cd3ab2018-05-14 10:00:16 -07001879}
1880
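/* Attach a parser, verdict, or tx_msg program to the program set shared by
 * all sockets in the map (sockmap or sockhash). The previous program of that
 * type, if any, is released.
 */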
1881int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
1882{
1883 struct bpf_sock_progs *progs;
John Fastabend464bc0f2017-08-28 07:10:04 -07001884 struct bpf_prog *orig;
John Fastabend174a79f2017-08-15 22:32:47 -07001885
John Fastabende5cd3ab2018-05-14 10:00:16 -07001886 if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
1887 struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
1888
1889 progs = &stab->progs;
John Fastabend81110382018-05-14 10:00:17 -07001890 } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) {
1891 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1892
1893 progs = &htab->progs;
John Fastabende5cd3ab2018-05-14 10:00:16 -07001894 } else {
John Fastabend81374aa2017-08-28 07:11:43 -07001895 return -EINVAL;
John Fastabende5cd3ab2018-05-14 10:00:16 -07001896 }
John Fastabend81374aa2017-08-28 07:11:43 -07001897
John Fastabend464bc0f2017-08-28 07:10:04 -07001898 switch (type) {
John Fastabend4f738ad2018-03-18 12:57:10 -07001899 case BPF_SK_MSG_VERDICT:
John Fastabende5cd3ab2018-05-14 10:00:16 -07001900 orig = xchg(&progs->bpf_tx_msg, prog);
John Fastabend4f738ad2018-03-18 12:57:10 -07001901 break;
John Fastabend464bc0f2017-08-28 07:10:04 -07001902 case BPF_SK_SKB_STREAM_PARSER:
John Fastabende5cd3ab2018-05-14 10:00:16 -07001903 orig = xchg(&progs->bpf_parse, prog);
John Fastabend464bc0f2017-08-28 07:10:04 -07001904 break;
1905 case BPF_SK_SKB_STREAM_VERDICT:
John Fastabende5cd3ab2018-05-14 10:00:16 -07001906 orig = xchg(&progs->bpf_verdict, prog);
John Fastabend464bc0f2017-08-28 07:10:04 -07001907 break;
1908 default:
1909 return -EOPNOTSUPP;
1910 }
John Fastabend174a79f2017-08-15 22:32:47 -07001911
John Fastabend464bc0f2017-08-28 07:10:04 -07001912 if (orig)
1913 bpf_prog_put(orig);
John Fastabend174a79f2017-08-15 22:32:47 -07001914
1915 return 0;
1916}
1917
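/* Plain map lookups never expose the stored sock objects; only the BPF
 * helper path via __sock_map_lookup_elem() can read them.
 */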
1918static void *sock_map_lookup(struct bpf_map *map, void *key)
1919{
1920 return NULL;
1921}
1922
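/* Syscall update path: the value is a socket fd. Resolve it, require a TCP
 * stream socket, and hand the sock off to sock_map_ctx_update_elem().
 */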
1923static int sock_map_update_elem(struct bpf_map *map,
1924 void *key, void *value, u64 flags)
1925{
1926 struct bpf_sock_ops_kern skops;
1927 u32 fd = *(u32 *)value;
1928 struct socket *socket;
1929 int err;
1930
1931 socket = sockfd_lookup(fd, &err);
1932 if (!socket)
1933 return err;
1934
1935 skops.sk = socket->sk;
1936 if (!skops.sk) {
1937 fput(socket->file);
1938 return -EINVAL;
1939 }
1940
John Fastabend435bf0d2017-10-18 07:10:15 -07001941 if (skops.sk->sk_type != SOCK_STREAM ||
1942 skops.sk->sk_protocol != IPPROTO_TCP) {
1943 fput(socket->file);
1944 return -EOPNOTSUPP;
1945 }
1946
John Fastabend2f857d02017-08-28 07:10:25 -07001947 err = sock_map_ctx_update_elem(&skops, map, key, flags);
John Fastabend174a79f2017-08-15 22:32:47 -07001948 fput(socket->file);
1949 return err;
1950}
1951
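/* Teardown of the attached programs for both map types: the parse, verdict,
 * and tx_msg programs are detached and their references dropped.
 */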
John Fastabendba6b8de2018-04-23 15:39:23 -07001952static void sock_map_release(struct bpf_map *map)
John Fastabend3d9e9522018-02-05 10:17:54 -08001953{
John Fastabende5cd3ab2018-05-14 10:00:16 -07001954 struct bpf_sock_progs *progs;
John Fastabend3d9e9522018-02-05 10:17:54 -08001955 struct bpf_prog *orig;
1956
John Fastabend81110382018-05-14 10:00:17 -07001957 if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
1958 struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
1959
1960 progs = &stab->progs;
1961 } else {
1962 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1963
1964 progs = &htab->progs;
1965 }
1966
John Fastabende5cd3ab2018-05-14 10:00:16 -07001967 orig = xchg(&progs->bpf_parse, NULL);
John Fastabend3d9e9522018-02-05 10:17:54 -08001968 if (orig)
1969 bpf_prog_put(orig);
John Fastabende5cd3ab2018-05-14 10:00:16 -07001970 orig = xchg(&progs->bpf_verdict, NULL);
John Fastabend3d9e9522018-02-05 10:17:54 -08001971 if (orig)
1972 bpf_prog_put(orig);
John Fastabend4f738ad2018-03-18 12:57:10 -07001973
John Fastabende5cd3ab2018-05-14 10:00:16 -07001974 orig = xchg(&progs->bpf_tx_msg, NULL);
John Fastabend4f738ad2018-03-18 12:57:10 -07001975 if (orig)
1976 bpf_prog_put(orig);
John Fastabend3d9e9522018-02-05 10:17:54 -08001977}
1978
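/* Allocate a sockhash map: sanity check the attributes, register the BPF TCP
 * ULP, charge the memlock cost of the bucket array plus elements, and
 * initialize each bucket's list head and lock.
 */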
John Fastabend81110382018-05-14 10:00:17 -07001979static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
1980{
1981 struct bpf_htab *htab;
1982 int i, err;
1983 u64 cost;
1984
1985 if (!capable(CAP_NET_ADMIN))
1986 return ERR_PTR(-EPERM);
1987
1988 /* check sanity of attributes */
1989 if (attr->max_entries == 0 || attr->value_size != 4 ||
1990 attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
1991 return ERR_PTR(-EINVAL);
1992
Yonghong Song683d2ac2018-05-16 14:06:26 -07001993 if (attr->key_size > MAX_BPF_STACK)
1994 /* eBPF programs initialize keys on stack, so they cannot be
1995 * larger than max stack size
1996 */
1997 return ERR_PTR(-E2BIG);
1998
John Fastabend81110382018-05-14 10:00:17 -07001999 err = bpf_tcp_ulp_register();
2000 if (err && err != -EEXIST)
2001 return ERR_PTR(err);
2002
2003 htab = kzalloc(sizeof(*htab), GFP_USER);
2004 if (!htab)
2005 return ERR_PTR(-ENOMEM);
2006
2007 bpf_map_init_from_attr(&htab->map, attr);
2008
2009 htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
2010 htab->elem_size = sizeof(struct htab_elem) +
2011 round_up(htab->map.key_size, 8);
2012 err = -EINVAL;
2013 if (htab->n_buckets == 0 ||
2014 htab->n_buckets > U32_MAX / sizeof(struct bucket))
2015 goto free_htab;
2016
2017 cost = (u64) htab->n_buckets * sizeof(struct bucket) +
2018 (u64) htab->elem_size * htab->map.max_entries;
2019
2020 if (cost >= U32_MAX - PAGE_SIZE)
2021 goto free_htab;
2022
2023 htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
2024 err = bpf_map_precharge_memlock(htab->map.pages);
2025 if (err)
2026 goto free_htab;
2027
2028 err = -ENOMEM;
2029 htab->buckets = bpf_map_area_alloc(
2030 htab->n_buckets * sizeof(struct bucket),
2031 htab->map.numa_node);
2032 if (!htab->buckets)
2033 goto free_htab;
2034
2035 for (i = 0; i < htab->n_buckets; i++) {
2036 INIT_HLIST_HEAD(&htab->buckets[i].head);
2037 raw_spin_lock_init(&htab->buckets[i].lock);
2038 }
2039
2040 return &htab->map;
2041free_htab:
2042 kfree(htab);
2043 return ERR_PTR(err);
2044}
2045
2046static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
2047{
2048 return &htab->buckets[hash & (htab->n_buckets - 1)];
2049}
2050
2051static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
2052{
2053 return &__select_bucket(htab, hash)->head;
2054}
2055
2056static void sock_hash_free(struct bpf_map *map)
2057{
2058 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2059 int i;
2060
2061 synchronize_rcu();
2062
2063 /* At this point no update, lookup or delete operations can happen.
 2064	 * However, be aware we can still get socket state event updates
 2065	 * and data ready callbacks that reference the psock from sk_user_data.
 2066	 * Also, psock worker threads are still in-flight. So smap_release_sock
 2067	 * will only free the psock after cancel_sync on the worker threads
 2068	 * and a grace period expires to ensure the psock is really safe to remove.
2069 */
2070 rcu_read_lock();
2071 for (i = 0; i < htab->n_buckets; i++) {
2072 struct hlist_head *head = select_bucket(htab, i);
2073 struct hlist_node *n;
2074 struct htab_elem *l;
2075
2076 hlist_for_each_entry_safe(l, n, head, hash_node) {
2077 struct sock *sock = l->sk;
2078 struct smap_psock *psock;
2079
2080 hlist_del_rcu(&l->hash_node);
2081 write_lock_bh(&sock->sk_callback_lock);
2082 psock = smap_psock_sk(sock);
2083 /* This check handles a racing sock event that can get
2084 * the sk_callback_lock before this case but after xchg
2085 * causing the refcnt to hit zero and sock user data
2086 * (psock) to be null and queued for garbage collection.
2087 */
2088 if (likely(psock)) {
2089 smap_list_remove(psock, NULL, l);
2090 smap_release_sock(psock, sock);
2091 }
2092 write_unlock_bh(&sock->sk_callback_lock);
2093 kfree(l);
2094 }
2095 }
2096 rcu_read_unlock();
2097 bpf_map_area_free(htab->buckets);
2098 kfree(htab);
2099}
2100
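/* Allocate a hash element for @sk. The element count may only exceed
 * max_entries when an existing element (@old_elem) is being replaced.
 */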
2101static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
2102 void *key, u32 key_size, u32 hash,
2103 struct sock *sk,
2104 struct htab_elem *old_elem)
2105{
2106 struct htab_elem *l_new;
2107
2108 if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
2109 if (!old_elem) {
2110 atomic_dec(&htab->count);
2111 return ERR_PTR(-E2BIG);
2112 }
2113 }
2114 l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
2115 htab->map.numa_node);
2116 if (!l_new)
2117 return ERR_PTR(-ENOMEM);
2118
2119 memcpy(l_new->key, key, key_size);
2120 l_new->sk = sk;
2121 l_new->hash = hash;
2122 return l_new;
2123}
2124
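/* Walk one bucket's RCU list looking for an element whose hash and key both
 * match.
 */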
2125static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
2126 u32 hash, void *key, u32 key_size)
2127{
2128 struct htab_elem *l;
2129
2130 hlist_for_each_entry_rcu(l, head, hash_node) {
2131 if (l->hash == hash && !memcmp(&l->key, key, key_size))
2132 return l;
2133 }
2134
2135 return NULL;
2136}
2137
2138static inline u32 htab_map_hash(const void *key, u32 key_len)
2139{
2140 return jhash(key, key_len, 0);
2141}
2142
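/* Iterate keys for the map syscall: return the element after @key in its
 * bucket if there is one, otherwise the first element of a later bucket.
 * -ENOENT means the iteration is complete.
 */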
2143static int sock_hash_get_next_key(struct bpf_map *map,
2144 void *key, void *next_key)
2145{
2146 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2147 struct htab_elem *l, *next_l;
2148 struct hlist_head *h;
2149 u32 hash, key_size;
2150 int i = 0;
2151
2152 WARN_ON_ONCE(!rcu_read_lock_held());
2153
2154 key_size = map->key_size;
2155 if (!key)
2156 goto find_first_elem;
2157 hash = htab_map_hash(key, key_size);
2158 h = select_bucket(htab, hash);
2159
2160 l = lookup_elem_raw(h, hash, key, key_size);
2161 if (!l)
2162 goto find_first_elem;
2163 next_l = hlist_entry_safe(
2164 rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
2165 struct htab_elem, hash_node);
2166 if (next_l) {
2167 memcpy(next_key, next_l->key, key_size);
2168 return 0;
2169 }
2170
2171 /* no more elements in this hash list, go to the next bucket */
2172 i = hash & (htab->n_buckets - 1);
2173 i++;
2174
2175find_first_elem:
2176 /* iterate over buckets */
2177 for (; i < htab->n_buckets; i++) {
2178 h = select_bucket(htab, i);
2179
2180 /* pick first element in the bucket */
2181 next_l = hlist_entry_safe(
2182 rcu_dereference_raw(hlist_first_rcu(h)),
2183 struct htab_elem, hash_node);
2184 if (next_l) {
2185 /* if it's not empty, just return it */
2186 memcpy(next_key, next_l->key, key_size);
2187 return 0;
2188 }
2189 }
2190
2191 /* iterated over all buckets and all elements */
2192 return -ENOENT;
2193}
2194
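/* Sockhash update from a sock_ops context: only TCP stream sockets are
 * accepted. The psock setup is shared with sockmap via
 * __sock_map_ctx_update_elem(); the new element is then linked into its
 * bucket under the bucket lock and any element it replaces is unlinked and
 * released.
 */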
2195static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
2196 struct bpf_map *map,
2197 void *key, u64 map_flags)
2198{
2199 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2200 struct bpf_sock_progs *progs = &htab->progs;
2201 struct htab_elem *l_new = NULL, *l_old;
2202 struct smap_psock_map_entry *e = NULL;
2203 struct hlist_head *head;
2204 struct smap_psock *psock;
2205 u32 key_size, hash;
2206 struct sock *sock;
2207 struct bucket *b;
2208 int err;
2209
2210 sock = skops->sk;
2211
2212 if (sock->sk_type != SOCK_STREAM ||
2213 sock->sk_protocol != IPPROTO_TCP)
2214 return -EOPNOTSUPP;
2215
2216 if (unlikely(map_flags > BPF_EXIST))
2217 return -EINVAL;
2218
2219 e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
2220 if (!e)
2221 return -ENOMEM;
2222
2223 WARN_ON_ONCE(!rcu_read_lock_held());
2224 key_size = map->key_size;
2225 hash = htab_map_hash(key, key_size);
2226 b = __select_bucket(htab, hash);
2227 head = &b->head;
2228
2229 err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key);
2230 if (err)
2231 goto err;
2232
2233 /* bpf_map_update_elem() can be called in_irq() */
2234 raw_spin_lock_bh(&b->lock);
2235 l_old = lookup_elem_raw(head, hash, key, key_size);
2236 if (l_old && map_flags == BPF_NOEXIST) {
2237 err = -EEXIST;
2238 goto bucket_err;
2239 }
2240 if (!l_old && map_flags == BPF_EXIST) {
2241 err = -ENOENT;
2242 goto bucket_err;
2243 }
2244
2245 l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old);
2246 if (IS_ERR(l_new)) {
2247 err = PTR_ERR(l_new);
2248 goto bucket_err;
2249 }
2250
2251 psock = smap_psock_sk(sock);
2252 if (unlikely(!psock)) {
2253 err = -EINVAL;
2254 goto bucket_err;
2255 }
2256
2257 e->hash_link = l_new;
2258 e->htab = container_of(map, struct bpf_htab, map);
2259 list_add_tail(&e->list, &psock->maps);
2260
2261 /* add new element to the head of the list, so that
2262 * concurrent search will find it before old elem
2263 */
2264 hlist_add_head_rcu(&l_new->hash_node, head);
2265 if (l_old) {
2266 psock = smap_psock_sk(l_old->sk);
2267
2268 hlist_del_rcu(&l_old->hash_node);
2269 smap_list_remove(psock, NULL, l_old);
2270 smap_release_sock(psock, l_old->sk);
2271 free_htab_elem(htab, l_old);
2272 }
2273 raw_spin_unlock_bh(&b->lock);
2274 return 0;
2275bucket_err:
2276 raw_spin_unlock_bh(&b->lock);
2277err:
2278 kfree(e);
2279 psock = smap_psock_sk(sock);
2280 if (psock)
2281 smap_release_sock(psock, sock);
2282 return err;
2283}
2284
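/* Syscall update path for sockhash: the value is a socket fd, resolved and
 * passed on to sock_hash_ctx_update_elem().
 */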
2285static int sock_hash_update_elem(struct bpf_map *map,
2286 void *key, void *value, u64 flags)
2287{
2288 struct bpf_sock_ops_kern skops;
2289 u32 fd = *(u32 *)value;
2290 struct socket *socket;
2291 int err;
2292
2293 socket = sockfd_lookup(fd, &err);
2294 if (!socket)
2295 return err;
2296
2297 skops.sk = socket->sk;
2298 if (!skops.sk) {
2299 fput(socket->file);
2300 return -EINVAL;
2301 }
2302
2303 err = sock_hash_ctx_update_elem(&skops, map, key, flags);
2304 fput(socket->file);
2305 return err;
2306}
2307
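/* Delete the element matching @key: unlink it from its bucket under the
 * bucket lock and, if the sock still has a psock, drop the reference held
 * for this entry.
 */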
2308static int sock_hash_delete_elem(struct bpf_map *map, void *key)
2309{
2310 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2311 struct hlist_head *head;
2312 struct bucket *b;
2313 struct htab_elem *l;
2314 u32 hash, key_size;
2315 int ret = -ENOENT;
2316
2317 key_size = map->key_size;
2318 hash = htab_map_hash(key, key_size);
2319 b = __select_bucket(htab, hash);
2320 head = &b->head;
2321
2322 raw_spin_lock_bh(&b->lock);
2323 l = lookup_elem_raw(head, hash, key, key_size);
2324 if (l) {
2325 struct sock *sock = l->sk;
2326 struct smap_psock *psock;
2327
2328 hlist_del_rcu(&l->hash_node);
2329 write_lock_bh(&sock->sk_callback_lock);
2330 psock = smap_psock_sk(sock);
2331 /* This check handles a racing sock event that can get the
2332 * sk_callback_lock before this case but after xchg happens
2333 * causing the refcnt to hit zero and sock user data (psock)
2334 * to be null and queued for garbage collection.
2335 */
2336 if (likely(psock)) {
2337 smap_list_remove(psock, NULL, l);
2338 smap_release_sock(psock, sock);
2339 }
2340 write_unlock_bh(&sock->sk_callback_lock);
2341 free_htab_elem(htab, l);
2342 ret = 0;
2343 }
2344 raw_spin_unlock_bh(&b->lock);
2345 return ret;
2346}
2347
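/* BPF helper path: return the sock stored under @key, or NULL if there is
 * no matching element. The bucket lock is held for the lookup.
 */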
2348struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
2349{
2350 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2351 struct hlist_head *head;
2352 struct htab_elem *l;
2353 u32 key_size, hash;
2354 struct bucket *b;
2355 struct sock *sk;
2356
2357 key_size = map->key_size;
2358 hash = htab_map_hash(key, key_size);
2359 b = __select_bucket(htab, hash);
2360 head = &b->head;
2361
2362 raw_spin_lock_bh(&b->lock);
2363 l = lookup_elem_raw(head, hash, key, key_size);
2364 sk = l ? l->sk : NULL;
2365 raw_spin_unlock_bh(&b->lock);
2366 return sk;
2367}
2368
John Fastabend174a79f2017-08-15 22:32:47 -07002369const struct bpf_map_ops sock_map_ops = {
2370 .map_alloc = sock_map_alloc,
2371 .map_free = sock_map_free,
2372 .map_lookup_elem = sock_map_lookup,
2373 .map_get_next_key = sock_map_get_next_key,
2374 .map_update_elem = sock_map_update_elem,
2375 .map_delete_elem = sock_map_delete_elem,
John Fastabendba6b8de2018-04-23 15:39:23 -07002376 .map_release_uref = sock_map_release,
John Fastabend174a79f2017-08-15 22:32:47 -07002377};
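/* For illustration only (a sketch, not part of the code in this file): from
 * user space a connected TCP socket is typically added to a sockmap with the
 * regular map update call, where the value is the socket fd, e.g.
 *
 *	__u32 key = 0, value = sock_fd;
 *
 *	bpf_map_update_elem(map_fd, &key, &value, BPF_ANY);
 *
 * which reaches sock_map_update_elem() above and resolves the fd with
 * sockfd_lookup(). The sock_fd/map_fd setup is assumed.
 */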
2378
John Fastabend81110382018-05-14 10:00:17 -07002379const struct bpf_map_ops sock_hash_ops = {
2380 .map_alloc = sock_hash_alloc,
2381 .map_free = sock_hash_free,
2382 .map_lookup_elem = sock_map_lookup,
2383 .map_get_next_key = sock_hash_get_next_key,
2384 .map_update_elem = sock_hash_update_elem,
2385 .map_delete_elem = sock_hash_delete_elem,
2386};
2387
John Fastabend2f857d02017-08-28 07:10:25 -07002388BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
2389 struct bpf_map *, map, void *, key, u64, flags)
John Fastabend174a79f2017-08-15 22:32:47 -07002390{
2391 WARN_ON_ONCE(!rcu_read_lock_held());
John Fastabend2f857d02017-08-28 07:10:25 -07002392 return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
John Fastabend174a79f2017-08-15 22:32:47 -07002393}
2394
2395const struct bpf_func_proto bpf_sock_map_update_proto = {
2396 .func = bpf_sock_map_update,
2397 .gpl_only = false,
2398 .pkt_access = true,
2399 .ret_type = RET_INTEGER,
2400 .arg1_type = ARG_PTR_TO_CTX,
2401 .arg2_type = ARG_CONST_MAP_PTR,
2402 .arg3_type = ARG_PTR_TO_MAP_KEY,
2403 .arg4_type = ARG_ANYTHING,
John Fastabend174a79f2017-08-15 22:32:47 -07002404};
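/* For illustration only, a minimal sock_ops program sketch using this helper
 * to add established connections to a sockmap (map and function names below
 * are examples, not defined in this file):
 *
 *	SEC("sockops")
 *	int bpf_sockops(struct bpf_sock_ops *skops)
 *	{
 *		__u32 key = 0;
 *
 *		if (skops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
 *			bpf_sock_map_update(skops, &sock_map, &key, BPF_NOEXIST);
 *		return 0;
 *	}
 */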
John Fastabend81110382018-05-14 10:00:17 -07002405
2406BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock,
2407 struct bpf_map *, map, void *, key, u64, flags)
2408{
2409 WARN_ON_ONCE(!rcu_read_lock_held());
2410 return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
2411}
2412
2413const struct bpf_func_proto bpf_sock_hash_update_proto = {
2414 .func = bpf_sock_hash_update,
2415 .gpl_only = false,
2416 .pkt_access = true,
2417 .ret_type = RET_INTEGER,
2418 .arg1_type = ARG_PTR_TO_CTX,
2419 .arg2_type = ARG_CONST_MAP_PTR,
2420 .arg3_type = ARG_PTR_TO_MAP_KEY,
2421 .arg4_type = ARG_ANYTHING,
2422};