Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 1 | /* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch> |
| 2 | * |
| 3 | * This program is free software; you can redistribute it and/or |
| 4 | * modify it under the terms of version 2 of the GNU General Public |
| 5 | * License as published by the Free Software Foundation. |
| 6 | * |
| 7 | * This program is distributed in the hope that it will be useful, but |
| 8 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 10 | * General Public License for more details. |
| 11 | */ |
| 12 | |
| 13 | #include <linux/kernel.h> |
| 14 | #include <linux/module.h> |
| 15 | #include <linux/skbuff.h> |
| 16 | #include <linux/types.h> |
| 17 | #include <linux/bpf.h> |
| 18 | #include <net/lwtunnel.h> |
Peter Oskolkov | ca78801 | 2019-02-13 11:53:37 -0800 | [diff] [blame] | 19 | #include <net/gre.h> |
Peter Oskolkov | 3bd0b15 | 2019-02-13 11:53:39 -0800 | [diff] [blame] | 20 | #include <net/ip6_route.h> |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 21 | |
/* One BPF program attached to a single LWT hook, together with the name
 * user space supplied for it.
 */
struct bpf_lwt_prog {
	/* Program reference; NULL when no program is attached to the hook. */
	struct bpf_prog *prog;
	/* Heap copy of the LWT_BPF_PROG_NAME attribute, released with kfree(). */
	char *name;
};
| 26 | |
| 27 | struct bpf_lwt { |
| 28 | struct bpf_lwt_prog in; |
| 29 | struct bpf_lwt_prog out; |
| 30 | struct bpf_lwt_prog xmit; |
| 31 | int family; |
| 32 | }; |
| 33 | |
/* Length limit for the LWT_BPF_PROG_NAME attribute; used by bpf_prog_policy
 * and when sizing dump messages in bpf_encap_nlsize().
 */
#define MAX_PROG_NAME 256
| 35 | |
| 36 | static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) |
| 37 | { |
| 38 | return (struct bpf_lwt *)lwt->data; |
| 39 | } |
| 40 | |
| 41 | #define NO_REDIRECT false |
| 42 | #define CAN_REDIRECT true |
| 43 | |
| 44 | static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, |
| 45 | struct dst_entry *dst, bool can_redirect) |
| 46 | { |
| 47 | int ret; |
| 48 | |
| 49 | /* Preempt disable is needed to protect per-cpu redirect_info between |
| 50 | * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and |
| 51 | * access to maps strictly require a rcu_read_lock() for protection, |
| 52 | * mixing with BH RCU lock doesn't work. |
| 53 | */ |
| 54 | preempt_disable(); |
Daniel Borkmann | 6aaae2b | 2017-09-25 02:25:50 +0200 | [diff] [blame] | 55 | bpf_compute_data_pointers(skb); |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 56 | ret = bpf_prog_run_save_cb(lwt->prog, skb); |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 57 | |
| 58 | switch (ret) { |
| 59 | case BPF_OK: |
Peter Oskolkov | 3bd0b15 | 2019-02-13 11:53:39 -0800 | [diff] [blame] | 60 | case BPF_LWT_REROUTE: |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 61 | break; |
| 62 | |
| 63 | case BPF_REDIRECT: |
| 64 | if (unlikely(!can_redirect)) { |
| 65 | pr_warn_once("Illegal redirect return code in prog %s\n", |
| 66 | lwt->name ? : "<unknown>"); |
| 67 | ret = BPF_OK; |
| 68 | } else { |
Willem de Bruijn | e7c87bd | 2019-01-15 20:19:22 -0500 | [diff] [blame] | 69 | skb_reset_mac_header(skb); |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 70 | ret = skb_do_redirect(skb); |
| 71 | if (ret == 0) |
| 72 | ret = BPF_REDIRECT; |
| 73 | } |
| 74 | break; |
| 75 | |
| 76 | case BPF_DROP: |
| 77 | kfree_skb(skb); |
| 78 | ret = -EPERM; |
| 79 | break; |
| 80 | |
| 81 | default: |
| 82 | pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret); |
| 83 | kfree_skb(skb); |
| 84 | ret = -EINVAL; |
| 85 | break; |
| 86 | } |
| 87 | |
| 88 | preempt_enable(); |
| 89 | |
| 90 | return ret; |
| 91 | } |
| 92 | |
Peter Oskolkov | 3bd0b15 | 2019-02-13 11:53:39 -0800 | [diff] [blame] | 93 | static int bpf_lwt_input_reroute(struct sk_buff *skb) |
| 94 | { |
| 95 | int err = -EINVAL; |
| 96 | |
| 97 | if (skb->protocol == htons(ETH_P_IP)) { |
| 98 | struct iphdr *iph = ip_hdr(skb); |
| 99 | |
| 100 | err = ip_route_input_noref(skb, iph->daddr, iph->saddr, |
| 101 | iph->tos, skb_dst(skb)->dev); |
| 102 | } else if (skb->protocol == htons(ETH_P_IPV6)) { |
| 103 | err = ipv6_stub->ipv6_route_input(skb); |
| 104 | } else { |
| 105 | err = -EAFNOSUPPORT; |
| 106 | } |
| 107 | |
| 108 | if (err) |
| 109 | goto err; |
| 110 | return dst_input(skb); |
| 111 | |
| 112 | err: |
| 113 | kfree_skb(skb); |
| 114 | return err; |
| 115 | } |
| 116 | |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 117 | static int bpf_input(struct sk_buff *skb) |
| 118 | { |
| 119 | struct dst_entry *dst = skb_dst(skb); |
| 120 | struct bpf_lwt *bpf; |
| 121 | int ret; |
| 122 | |
| 123 | bpf = bpf_lwt_lwtunnel(dst->lwtstate); |
| 124 | if (bpf->in.prog) { |
| 125 | ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); |
| 126 | if (ret < 0) |
| 127 | return ret; |
Peter Oskolkov | 3bd0b15 | 2019-02-13 11:53:39 -0800 | [diff] [blame] | 128 | if (ret == BPF_LWT_REROUTE) |
| 129 | return bpf_lwt_input_reroute(skb); |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 130 | } |
| 131 | |
| 132 | if (unlikely(!dst->lwtstate->orig_input)) { |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 133 | kfree_skb(skb); |
| 134 | return -EINVAL; |
| 135 | } |
| 136 | |
| 137 | return dst->lwtstate->orig_input(skb); |
| 138 | } |
| 139 | |
| 140 | static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) |
| 141 | { |
| 142 | struct dst_entry *dst = skb_dst(skb); |
| 143 | struct bpf_lwt *bpf; |
| 144 | int ret; |
| 145 | |
| 146 | bpf = bpf_lwt_lwtunnel(dst->lwtstate); |
| 147 | if (bpf->out.prog) { |
| 148 | ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT); |
| 149 | if (ret < 0) |
| 150 | return ret; |
| 151 | } |
| 152 | |
| 153 | if (unlikely(!dst->lwtstate->orig_output)) { |
| 154 | pr_warn_once("orig_output not set on dst for prog %s\n", |
| 155 | bpf->out.name); |
| 156 | kfree_skb(skb); |
| 157 | return -EINVAL; |
| 158 | } |
| 159 | |
| 160 | return dst->lwtstate->orig_output(net, sk, skb); |
| 161 | } |
| 162 | |
| 163 | static int xmit_check_hhlen(struct sk_buff *skb) |
| 164 | { |
| 165 | int hh_len = skb_dst(skb)->dev->hard_header_len; |
| 166 | |
| 167 | if (skb_headroom(skb) < hh_len) { |
| 168 | int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); |
| 169 | |
| 170 | if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC)) |
| 171 | return -ENOMEM; |
| 172 | } |
| 173 | |
| 174 | return 0; |
| 175 | } |
| 176 | |
Peter Oskolkov | 3bd0b15 | 2019-02-13 11:53:39 -0800 | [diff] [blame] | 177 | static int bpf_lwt_xmit_reroute(struct sk_buff *skb) |
| 178 | { |
| 179 | struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev); |
| 180 | int oif = l3mdev ? l3mdev->ifindex : 0; |
| 181 | struct dst_entry *dst = NULL; |
Peter Oskolkov | fb40588 | 2019-02-14 10:39:31 -0800 | [diff] [blame] | 182 | int err = -EAFNOSUPPORT; |
Peter Oskolkov | 3bd0b15 | 2019-02-13 11:53:39 -0800 | [diff] [blame] | 183 | struct sock *sk; |
| 184 | struct net *net; |
| 185 | bool ipv4; |
Peter Oskolkov | 3bd0b15 | 2019-02-13 11:53:39 -0800 | [diff] [blame] | 186 | |
| 187 | if (skb->protocol == htons(ETH_P_IP)) |
| 188 | ipv4 = true; |
| 189 | else if (skb->protocol == htons(ETH_P_IPV6)) |
| 190 | ipv4 = false; |
| 191 | else |
Peter Oskolkov | fb40588 | 2019-02-14 10:39:31 -0800 | [diff] [blame] | 192 | goto err; |
Peter Oskolkov | 3bd0b15 | 2019-02-13 11:53:39 -0800 | [diff] [blame] | 193 | |
| 194 | sk = sk_to_full_sk(skb->sk); |
| 195 | if (sk) { |
| 196 | if (sk->sk_bound_dev_if) |
| 197 | oif = sk->sk_bound_dev_if; |
| 198 | net = sock_net(sk); |
| 199 | } else { |
| 200 | net = dev_net(skb_dst(skb)->dev); |
| 201 | } |
| 202 | |
| 203 | if (ipv4) { |
| 204 | struct iphdr *iph = ip_hdr(skb); |
| 205 | struct flowi4 fl4 = {}; |
| 206 | struct rtable *rt; |
| 207 | |
| 208 | fl4.flowi4_oif = oif; |
| 209 | fl4.flowi4_mark = skb->mark; |
| 210 | fl4.flowi4_uid = sock_net_uid(net, sk); |
| 211 | fl4.flowi4_tos = RT_TOS(iph->tos); |
| 212 | fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; |
| 213 | fl4.flowi4_proto = iph->protocol; |
| 214 | fl4.daddr = iph->daddr; |
| 215 | fl4.saddr = iph->saddr; |
| 216 | |
| 217 | rt = ip_route_output_key(net, &fl4); |
Peter Oskolkov | fb40588 | 2019-02-14 10:39:31 -0800 | [diff] [blame] | 218 | if (IS_ERR(rt)) { |
| 219 | err = PTR_ERR(rt); |
| 220 | goto err; |
| 221 | } |
Peter Oskolkov | 3bd0b15 | 2019-02-13 11:53:39 -0800 | [diff] [blame] | 222 | dst = &rt->dst; |
| 223 | } else { |
| 224 | struct ipv6hdr *iph6 = ipv6_hdr(skb); |
| 225 | struct flowi6 fl6 = {}; |
| 226 | |
| 227 | fl6.flowi6_oif = oif; |
| 228 | fl6.flowi6_mark = skb->mark; |
| 229 | fl6.flowi6_uid = sock_net_uid(net, sk); |
| 230 | fl6.flowlabel = ip6_flowinfo(iph6); |
| 231 | fl6.flowi6_proto = iph6->nexthdr; |
| 232 | fl6.daddr = iph6->daddr; |
| 233 | fl6.saddr = iph6->saddr; |
| 234 | |
| 235 | err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6); |
Peter Oskolkov | fb40588 | 2019-02-14 10:39:31 -0800 | [diff] [blame] | 236 | if (unlikely(err)) |
| 237 | goto err; |
| 238 | if (IS_ERR(dst)) { |
| 239 | err = PTR_ERR(dst); |
| 240 | goto err; |
| 241 | } |
Peter Oskolkov | 3bd0b15 | 2019-02-13 11:53:39 -0800 | [diff] [blame] | 242 | } |
| 243 | if (unlikely(dst->error)) { |
Peter Oskolkov | fb40588 | 2019-02-14 10:39:31 -0800 | [diff] [blame] | 244 | err = dst->error; |
Peter Oskolkov | 3bd0b15 | 2019-02-13 11:53:39 -0800 | [diff] [blame] | 245 | dst_release(dst); |
Peter Oskolkov | fb40588 | 2019-02-14 10:39:31 -0800 | [diff] [blame] | 246 | goto err; |
Peter Oskolkov | 3bd0b15 | 2019-02-13 11:53:39 -0800 | [diff] [blame] | 247 | } |
| 248 | |
| 249 | /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it |
| 250 | * was done for the previous dst, so we are doing it here again, in |
| 251 | * case the new dst needs much more space. The call below is a noop |
| 252 | * if there is enough header space in skb. |
| 253 | */ |
| 254 | err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); |
| 255 | if (unlikely(err)) |
Peter Oskolkov | fb40588 | 2019-02-14 10:39:31 -0800 | [diff] [blame] | 256 | goto err; |
Peter Oskolkov | 3bd0b15 | 2019-02-13 11:53:39 -0800 | [diff] [blame] | 257 | |
| 258 | skb_dst_drop(skb); |
| 259 | skb_dst_set(skb, dst); |
| 260 | |
| 261 | err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); |
| 262 | if (unlikely(err)) |
Peter Oskolkov | bd16693 | 2019-02-23 18:25:01 -0800 | [diff] [blame] | 263 | return err; |
Peter Oskolkov | 3bd0b15 | 2019-02-13 11:53:39 -0800 | [diff] [blame] | 264 | |
| 265 | /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ |
| 266 | return LWTUNNEL_XMIT_DONE; |
Peter Oskolkov | fb40588 | 2019-02-14 10:39:31 -0800 | [diff] [blame] | 267 | |
| 268 | err: |
| 269 | kfree_skb(skb); |
| 270 | return err; |
Peter Oskolkov | 3bd0b15 | 2019-02-13 11:53:39 -0800 | [diff] [blame] | 271 | } |
| 272 | |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 273 | static int bpf_xmit(struct sk_buff *skb) |
| 274 | { |
| 275 | struct dst_entry *dst = skb_dst(skb); |
| 276 | struct bpf_lwt *bpf; |
| 277 | |
| 278 | bpf = bpf_lwt_lwtunnel(dst->lwtstate); |
| 279 | if (bpf->xmit.prog) { |
Peter Oskolkov | 3bd0b15 | 2019-02-13 11:53:39 -0800 | [diff] [blame] | 280 | __be16 proto = skb->protocol; |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 281 | int ret; |
| 282 | |
| 283 | ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT); |
| 284 | switch (ret) { |
| 285 | case BPF_OK: |
Peter Oskolkov | 3bd0b15 | 2019-02-13 11:53:39 -0800 | [diff] [blame] | 286 | /* If the header changed, e.g. via bpf_lwt_push_encap, |
| 287 | * BPF_LWT_REROUTE below should have been used if the |
| 288 | * protocol was also changed. |
| 289 | */ |
| 290 | if (skb->protocol != proto) { |
| 291 | kfree_skb(skb); |
| 292 | return -EINVAL; |
| 293 | } |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 294 | /* If the header was expanded, headroom might be too |
| 295 | * small for L2 header to come, expand as needed. |
| 296 | */ |
| 297 | ret = xmit_check_hhlen(skb); |
| 298 | if (unlikely(ret)) |
| 299 | return ret; |
| 300 | |
| 301 | return LWTUNNEL_XMIT_CONTINUE; |
| 302 | case BPF_REDIRECT: |
| 303 | return LWTUNNEL_XMIT_DONE; |
Peter Oskolkov | 3bd0b15 | 2019-02-13 11:53:39 -0800 | [diff] [blame] | 304 | case BPF_LWT_REROUTE: |
| 305 | return bpf_lwt_xmit_reroute(skb); |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 306 | default: |
| 307 | return ret; |
| 308 | } |
| 309 | } |
| 310 | |
| 311 | return LWTUNNEL_XMIT_CONTINUE; |
| 312 | } |
| 313 | |
| 314 | static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog) |
| 315 | { |
| 316 | if (prog->prog) |
| 317 | bpf_prog_put(prog->prog); |
| 318 | |
| 319 | kfree(prog->name); |
| 320 | } |
| 321 | |
| 322 | static void bpf_destroy_state(struct lwtunnel_state *lwt) |
| 323 | { |
| 324 | struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); |
| 325 | |
| 326 | bpf_lwt_prog_destroy(&bpf->in); |
| 327 | bpf_lwt_prog_destroy(&bpf->out); |
| 328 | bpf_lwt_prog_destroy(&bpf->xmit); |
| 329 | } |
| 330 | |
| 331 | static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { |
| 332 | [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, |
| 333 | [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, |
| 334 | .len = MAX_PROG_NAME }, |
| 335 | }; |
| 336 | |
| 337 | static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, |
| 338 | enum bpf_prog_type type) |
| 339 | { |
| 340 | struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; |
| 341 | struct bpf_prog *p; |
| 342 | int ret; |
| 343 | u32 fd; |
| 344 | |
Johannes Berg | fceb643 | 2017-04-12 14:34:07 +0200 | [diff] [blame] | 345 | ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy, |
| 346 | NULL); |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 347 | if (ret < 0) |
| 348 | return ret; |
| 349 | |
| 350 | if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) |
| 351 | return -EINVAL; |
| 352 | |
Taehee Yoo | 71eb525 | 2018-07-29 00:28:31 +0900 | [diff] [blame] | 353 | prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC); |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 354 | if (!prog->name) |
| 355 | return -ENOMEM; |
| 356 | |
| 357 | fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); |
| 358 | p = bpf_prog_get_type(fd, type); |
| 359 | if (IS_ERR(p)) |
| 360 | return PTR_ERR(p); |
| 361 | |
| 362 | prog->prog = p; |
| 363 | |
| 364 | return 0; |
| 365 | } |
| 366 | |
| 367 | static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { |
| 368 | [LWT_BPF_IN] = { .type = NLA_NESTED, }, |
| 369 | [LWT_BPF_OUT] = { .type = NLA_NESTED, }, |
| 370 | [LWT_BPF_XMIT] = { .type = NLA_NESTED, }, |
| 371 | [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, |
| 372 | }; |
| 373 | |
David Ahern | 30357d7 | 2017-01-30 12:07:37 -0800 | [diff] [blame] | 374 | static int bpf_build_state(struct nlattr *nla, |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 375 | unsigned int family, const void *cfg, |
David Ahern | 9ae2872 | 2017-05-27 16:19:28 -0600 | [diff] [blame] | 376 | struct lwtunnel_state **ts, |
| 377 | struct netlink_ext_ack *extack) |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 378 | { |
| 379 | struct nlattr *tb[LWT_BPF_MAX + 1]; |
| 380 | struct lwtunnel_state *newts; |
| 381 | struct bpf_lwt *bpf; |
| 382 | int ret; |
| 383 | |
| 384 | if (family != AF_INET && family != AF_INET6) |
| 385 | return -EAFNOSUPPORT; |
| 386 | |
David Ahern | 9ae2872 | 2017-05-27 16:19:28 -0600 | [diff] [blame] | 387 | ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack); |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 388 | if (ret < 0) |
| 389 | return ret; |
| 390 | |
| 391 | if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT]) |
| 392 | return -EINVAL; |
| 393 | |
| 394 | newts = lwtunnel_state_alloc(sizeof(*bpf)); |
| 395 | if (!newts) |
| 396 | return -ENOMEM; |
| 397 | |
| 398 | newts->type = LWTUNNEL_ENCAP_BPF; |
| 399 | bpf = bpf_lwt_lwtunnel(newts); |
| 400 | |
| 401 | if (tb[LWT_BPF_IN]) { |
| 402 | newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; |
| 403 | ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in, |
| 404 | BPF_PROG_TYPE_LWT_IN); |
| 405 | if (ret < 0) |
| 406 | goto errout; |
| 407 | } |
| 408 | |
| 409 | if (tb[LWT_BPF_OUT]) { |
| 410 | newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; |
| 411 | ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out, |
| 412 | BPF_PROG_TYPE_LWT_OUT); |
| 413 | if (ret < 0) |
| 414 | goto errout; |
| 415 | } |
| 416 | |
| 417 | if (tb[LWT_BPF_XMIT]) { |
| 418 | newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; |
| 419 | ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit, |
| 420 | BPF_PROG_TYPE_LWT_XMIT); |
| 421 | if (ret < 0) |
| 422 | goto errout; |
| 423 | } |
| 424 | |
| 425 | if (tb[LWT_BPF_XMIT_HEADROOM]) { |
| 426 | u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]); |
| 427 | |
| 428 | if (headroom > LWT_BPF_MAX_HEADROOM) { |
| 429 | ret = -ERANGE; |
| 430 | goto errout; |
| 431 | } |
| 432 | |
| 433 | newts->headroom = headroom; |
| 434 | } |
| 435 | |
| 436 | bpf->family = family; |
| 437 | *ts = newts; |
| 438 | |
| 439 | return 0; |
| 440 | |
| 441 | errout: |
| 442 | bpf_destroy_state(newts); |
| 443 | kfree(newts); |
| 444 | return ret; |
| 445 | } |
| 446 | |
| 447 | static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr, |
| 448 | struct bpf_lwt_prog *prog) |
| 449 | { |
| 450 | struct nlattr *nest; |
| 451 | |
| 452 | if (!prog->prog) |
| 453 | return 0; |
| 454 | |
| 455 | nest = nla_nest_start(skb, attr); |
| 456 | if (!nest) |
| 457 | return -EMSGSIZE; |
| 458 | |
| 459 | if (prog->name && |
| 460 | nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name)) |
| 461 | return -EMSGSIZE; |
| 462 | |
| 463 | return nla_nest_end(skb, nest); |
| 464 | } |
| 465 | |
| 466 | static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) |
| 467 | { |
| 468 | struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); |
| 469 | |
| 470 | if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 || |
| 471 | bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 || |
| 472 | bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0) |
| 473 | return -EMSGSIZE; |
| 474 | |
| 475 | return 0; |
| 476 | } |
| 477 | |
| 478 | static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) |
| 479 | { |
| 480 | int nest_len = nla_total_size(sizeof(struct nlattr)) + |
| 481 | nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */ |
| 482 | 0; |
| 483 | |
| 484 | return nest_len + /* LWT_BPF_IN */ |
| 485 | nest_len + /* LWT_BPF_OUT */ |
| 486 | nest_len + /* LWT_BPF_XMIT */ |
| 487 | 0; |
| 488 | } |
| 489 | |
Wei Yongjun | 79471b1 | 2017-01-12 14:39:28 +0000 | [diff] [blame] | 490 | static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 491 | { |
| 492 | /* FIXME: |
| 493 | * The LWT state is currently rebuilt for delete requests which |
| 494 | * results in a new bpf_prog instance. Comparing names for now. |
| 495 | */ |
| 496 | if (!a->name && !b->name) |
| 497 | return 0; |
| 498 | |
| 499 | if (!a->name || !b->name) |
| 500 | return 1; |
| 501 | |
| 502 | return strcmp(a->name, b->name); |
| 503 | } |
| 504 | |
| 505 | static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) |
| 506 | { |
| 507 | struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a); |
| 508 | struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b); |
| 509 | |
| 510 | return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) || |
| 511 | bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) || |
| 512 | bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit); |
| 513 | } |
| 514 | |
| 515 | static const struct lwtunnel_encap_ops bpf_encap_ops = { |
| 516 | .build_state = bpf_build_state, |
| 517 | .destroy_state = bpf_destroy_state, |
| 518 | .input = bpf_input, |
| 519 | .output = bpf_output, |
| 520 | .xmit = bpf_xmit, |
| 521 | .fill_encap = bpf_fill_encap_info, |
| 522 | .get_encap_size = bpf_encap_nlsize, |
| 523 | .cmp_encap = bpf_encap_cmp, |
Robert Shearman | 88ff733 | 2017-01-24 16:26:47 +0000 | [diff] [blame] | 524 | .owner = THIS_MODULE, |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 525 | }; |
| 526 | |
Peter Oskolkov | ca78801 | 2019-02-13 11:53:37 -0800 | [diff] [blame] | 527 | static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type, |
| 528 | int encap_len) |
| 529 | { |
| 530 | struct skb_shared_info *shinfo = skb_shinfo(skb); |
| 531 | |
| 532 | gso_type |= SKB_GSO_DODGY; |
| 533 | shinfo->gso_type |= gso_type; |
| 534 | skb_decrease_gso_size(shinfo, encap_len); |
| 535 | shinfo->gso_segs = 0; |
| 536 | return 0; |
| 537 | } |
| 538 | |
Peter Oskolkov | 52f2787 | 2019-02-13 11:53:36 -0800 | [diff] [blame] | 539 | static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len) |
| 540 | { |
Peter Oskolkov | ca78801 | 2019-02-13 11:53:37 -0800 | [diff] [blame] | 541 | int next_hdr_offset; |
| 542 | void *next_hdr; |
| 543 | __u8 protocol; |
| 544 | |
| 545 | /* SCTP and UDP_L4 gso need more nuanced handling than what |
| 546 | * handle_gso_type() does above: skb_decrease_gso_size() is not enough. |
| 547 | * So at the moment only TCP GSO packets are let through. |
| 548 | */ |
| 549 | if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) |
| 550 | return -ENOTSUPP; |
| 551 | |
| 552 | if (ipv4) { |
| 553 | protocol = ip_hdr(skb)->protocol; |
| 554 | next_hdr_offset = sizeof(struct iphdr); |
| 555 | next_hdr = skb_network_header(skb) + next_hdr_offset; |
| 556 | } else { |
| 557 | protocol = ipv6_hdr(skb)->nexthdr; |
| 558 | next_hdr_offset = sizeof(struct ipv6hdr); |
| 559 | next_hdr = skb_network_header(skb) + next_hdr_offset; |
| 560 | } |
| 561 | |
| 562 | switch (protocol) { |
| 563 | case IPPROTO_GRE: |
| 564 | next_hdr_offset += sizeof(struct gre_base_hdr); |
| 565 | if (next_hdr_offset > encap_len) |
| 566 | return -EINVAL; |
| 567 | |
| 568 | if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM) |
| 569 | return handle_gso_type(skb, SKB_GSO_GRE_CSUM, |
| 570 | encap_len); |
| 571 | return handle_gso_type(skb, SKB_GSO_GRE, encap_len); |
| 572 | |
| 573 | case IPPROTO_UDP: |
| 574 | next_hdr_offset += sizeof(struct udphdr); |
| 575 | if (next_hdr_offset > encap_len) |
| 576 | return -EINVAL; |
| 577 | |
| 578 | if (((struct udphdr *)next_hdr)->check) |
| 579 | return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM, |
| 580 | encap_len); |
| 581 | return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len); |
| 582 | |
| 583 | case IPPROTO_IP: |
| 584 | case IPPROTO_IPV6: |
| 585 | if (ipv4) |
| 586 | return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len); |
| 587 | else |
| 588 | return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len); |
| 589 | |
| 590 | default: |
| 591 | return -EPROTONOSUPPORT; |
| 592 | } |
Peter Oskolkov | 52f2787 | 2019-02-13 11:53:36 -0800 | [diff] [blame] | 593 | } |
| 594 | |
| 595 | int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) |
| 596 | { |
| 597 | struct iphdr *iph; |
| 598 | bool ipv4; |
| 599 | int err; |
| 600 | |
| 601 | if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM)) |
| 602 | return -EINVAL; |
| 603 | |
| 604 | /* validate protocol and length */ |
| 605 | iph = (struct iphdr *)hdr; |
| 606 | if (iph->version == 4) { |
| 607 | ipv4 = true; |
| 608 | if (unlikely(len < iph->ihl * 4)) |
| 609 | return -EINVAL; |
| 610 | } else if (iph->version == 6) { |
| 611 | ipv4 = false; |
| 612 | if (unlikely(len < sizeof(struct ipv6hdr))) |
| 613 | return -EINVAL; |
| 614 | } else { |
| 615 | return -EINVAL; |
| 616 | } |
| 617 | |
| 618 | if (ingress) |
| 619 | err = skb_cow_head(skb, len + skb->mac_len); |
| 620 | else |
| 621 | err = skb_cow_head(skb, |
| 622 | len + LL_RESERVED_SPACE(skb_dst(skb)->dev)); |
| 623 | if (unlikely(err)) |
| 624 | return err; |
| 625 | |
| 626 | /* push the encap headers and fix pointers */ |
| 627 | skb_reset_inner_headers(skb); |
| 628 | skb->encapsulation = 1; |
| 629 | skb_push(skb, len); |
| 630 | if (ingress) |
| 631 | skb_postpush_rcsum(skb, iph, len); |
| 632 | skb_reset_network_header(skb); |
| 633 | memcpy(skb_network_header(skb), hdr, len); |
| 634 | bpf_compute_data_pointers(skb); |
| 635 | skb_clear_hash(skb); |
| 636 | |
| 637 | if (ipv4) { |
| 638 | skb->protocol = htons(ETH_P_IP); |
| 639 | iph = ip_hdr(skb); |
| 640 | |
| 641 | if (!iph->check) |
| 642 | iph->check = ip_fast_csum((unsigned char *)iph, |
| 643 | iph->ihl); |
| 644 | } else { |
| 645 | skb->protocol = htons(ETH_P_IPV6); |
| 646 | } |
| 647 | |
| 648 | if (skb_is_gso(skb)) |
| 649 | return handle_gso_encap(skb, ipv4, len); |
| 650 | |
| 651 | return 0; |
| 652 | } |
| 653 | |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 654 | static int __init bpf_lwt_init(void) |
| 655 | { |
| 656 | return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); |
| 657 | } |
| 658 | |
| 659 | subsys_initcall(bpf_lwt_init) |