// SPDX-License-Identifier: GPL-2.0-only
/* Unstable Conntrack Helpers for XDP and TC-BPF hook
 *
 * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
 * allowed to break compatibility for these functions since the interface they
 * are exposed through to BPF programs is explicitly unstable.
 */

#include <linux/bpf_verifier.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/filter.h>
#include <linux/mutex.h>
#include <linux/types.h>
#include <linux/btf_ids.h>
#include <linux/net_namespace.h>
#include <net/xdp.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_core.h>

/* bpf_ct_opts - Options for CT lookup helpers
 *
 * Members:
 * @netns_id    - Specify the network namespace for lookup
 *		  Values:
 *		    BPF_F_CURRENT_NETNS (-1)
 *		      Use namespace associated with ctx (xdp_md, __sk_buff)
 *		    [0, S32_MAX]
 *		      Network Namespace ID
 * @error       - Out parameter, set for any errors encountered
 *		  Values:
 *		    -EINVAL - Passed NULL for bpf_tuple pointer
 *		    -EINVAL - opts->reserved is not 0
 *		    -EINVAL - netns_id is less than -1
 *		    -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (16) or 12
 *		    -EINVAL - opts->ct_zone_id set when
 *			      opts__sz isn't NF_BPF_CT_OPTS_SZ (16)
 *		    -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP
 *		    -ENONET - No network namespace found for netns_id
 *		    -ENOENT - Conntrack lookup could not find entry for tuple
 *		    -EAFNOSUPPORT - tuple__sz isn't one of sizeof(tuple->ipv4)
 *				    or sizeof(tuple->ipv6)
 * @l4proto     - Layer 4 protocol
 *		  Values:
 *		    IPPROTO_TCP, IPPROTO_UDP
 * @dir         - Out parameter, set on lookup to the direction of the
 *		  matched tuple (IP_CT_DIR_ORIGINAL or IP_CT_DIR_REPLY)
 * @ct_zone_id  - Connection tracking zone id, honoured only when opts__sz
 *		  is NF_BPF_CT_OPTS_SZ (16)
 * @ct_zone_dir - Connection tracking zone direction; 0 selects
 *		  NF_CT_DEFAULT_ZONE_DIR
 * @reserved    - Reserved member, will be reused for more options in future
 *		  Values:
 *		    0
 */
struct bpf_ct_opts {
	s32 netns_id;
	s32 error;
	u8 l4proto;
	u8 dir;
	u16 ct_zone_id;
	u8 ct_zone_dir;
	u8 reserved[3];
};

enum {
	NF_BPF_CT_OPTS_SZ = 16,
};

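/* Example: how a BPF program might fill these options before calling the
 * lookup/alloc kfuncs defined below. A sketch for illustration only (not
 * compiled here); a real program takes struct bpf_ct_opts from vmlinux.h.
 * Passing opts__sz == 12, i.e. the layout ending at 'dir', stays accepted
 * for programs built before the ct_zone_* fields were added:
 *
 *	struct bpf_ct_opts opts = {
 *		.netns_id   = BPF_F_CURRENT_NETNS,	// netns of the ctx
 *		.l4proto    = IPPROTO_TCP,
 *		.ct_zone_id = 0,			// default zone
 *	};
 */
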
static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
				 u32 tuple_len, u8 protonum, u8 dir,
				 struct nf_conntrack_tuple *tuple)
{
	/* For IP_CT_DIR_REPLY the addresses and ports are mirrored, so the
	 * reply tuple describes the same flow seen from the other side.
	 * tuple->dst.u is a different union type than tuple->src.u, hence
	 * the void casts.
	 */
	union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3;
	union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3;
	union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u
						  : &tuple->src.u;
	union nf_conntrack_man_proto *dport = dir ? &tuple->src.u
						  : (void *)&tuple->dst.u;

	if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
		return -EPROTO;

	memset(tuple, 0, sizeof(*tuple));

	switch (tuple_len) {
	case sizeof(bpf_tuple->ipv4):
		tuple->src.l3num = AF_INET;
		src->ip = bpf_tuple->ipv4.saddr;
		sport->tcp.port = bpf_tuple->ipv4.sport;
		dst->ip = bpf_tuple->ipv4.daddr;
		dport->tcp.port = bpf_tuple->ipv4.dport;
		break;
	case sizeof(bpf_tuple->ipv6):
		tuple->src.l3num = AF_INET6;
		memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
		sport->tcp.port = bpf_tuple->ipv6.sport;
		memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
		dport->tcp.port = bpf_tuple->ipv6.dport;
		break;
	default:
		return -EAFNOSUPPORT;
	}
	tuple->dst.protonum = protonum;
	tuple->dst.dir = dir;

	return 0;
}

static struct nf_conn *
__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
			u32 tuple_len, struct bpf_ct_opts *opts, u32 opts_len,
			u32 timeout)
{
	struct nf_conntrack_tuple otuple, rtuple;
	struct nf_conntrack_zone ct_zone;
	struct nf_conn *ct;
	int err;

	if (!opts || !bpf_tuple)
		return ERR_PTR(-EINVAL);
	/* opts_len == 12 is the legacy layout without the ct_zone fields */
	if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12))
		return ERR_PTR(-EINVAL);
	if (opts_len == NF_BPF_CT_OPTS_SZ) {
		if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2])
			return ERR_PTR(-EINVAL);
	} else {
		if (opts->ct_zone_id)
			return ERR_PTR(-EINVAL);
	}

	if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
		return ERR_PTR(-EINVAL);

	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
				    IP_CT_DIR_ORIGINAL, &otuple);
	if (err < 0)
		return ERR_PTR(err);

	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
				    IP_CT_DIR_REPLY, &rtuple);
	if (err < 0)
		return ERR_PTR(err);

	if (opts->netns_id >= 0) {
		net = get_net_ns_by_id(net, opts->netns_id);
		if (unlikely(!net))
			return ERR_PTR(-ENONET);
	}

	if (opts_len == NF_BPF_CT_OPTS_SZ) {
		if (opts->ct_zone_dir == 0)
			opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR;
		nf_ct_zone_init(&ct_zone,
				opts->ct_zone_id, opts->ct_zone_dir, 0);
	} else {
		ct_zone = nf_ct_zone_dflt;
	}

	ct = nf_conntrack_alloc(net, &ct_zone, &otuple, &rtuple,
				GFP_ATOMIC);
	if (IS_ERR(ct))
		goto out;

	memset(&ct->proto, 0, sizeof(ct->proto));
	/* timeout is in seconds here; callers pass a 10s default */
	__nf_ct_set_timeout(ct, timeout * HZ);

out:
	if (opts->netns_id >= 0)
		put_net(net);

	return ct;
}

static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
					  struct bpf_sock_tuple *bpf_tuple,
					  u32 tuple_len, struct bpf_ct_opts *opts,
					  u32 opts_len)
{
	struct nf_conntrack_tuple_hash *hash;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_zone ct_zone;
	struct nf_conn *ct;
	int err;

	if (!opts || !bpf_tuple)
		return ERR_PTR(-EINVAL);
	/* opts_len == 12 is the legacy layout without the ct_zone fields */
	if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12))
		return ERR_PTR(-EINVAL);
	if (opts_len == NF_BPF_CT_OPTS_SZ) {
		if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2])
			return ERR_PTR(-EINVAL);
	} else {
		if (opts->ct_zone_id)
			return ERR_PTR(-EINVAL);
	}
	if (unlikely(opts->l4proto != IPPROTO_TCP && opts->l4proto != IPPROTO_UDP))
		return ERR_PTR(-EPROTO);
	if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
		return ERR_PTR(-EINVAL);

	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
				    IP_CT_DIR_ORIGINAL, &tuple);
	if (err < 0)
		return ERR_PTR(err);

	if (opts->netns_id >= 0) {
		net = get_net_ns_by_id(net, opts->netns_id);
		if (unlikely(!net))
			return ERR_PTR(-ENONET);
	}

	if (opts_len == NF_BPF_CT_OPTS_SZ) {
		if (opts->ct_zone_dir == 0)
			opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR;
		nf_ct_zone_init(&ct_zone,
				opts->ct_zone_id, opts->ct_zone_dir, 0);
	} else {
		ct_zone = nf_ct_zone_dflt;
	}

	hash = nf_conntrack_find_get(net, &ct_zone, &tuple);
	if (opts->netns_id >= 0)
		put_net(net);
	if (!hash)
		return ERR_PTR(-ENOENT);

	ct = nf_ct_tuplehash_to_ctrack(hash);
	/* Report back which direction of the entry the tuple matched */
	opts->dir = NF_CT_DIRECTION(hash);

	return ct;
}

BTF_ID_LIST(btf_nf_conn_ids)
BTF_ID(struct, nf_conn)
BTF_ID(struct, nf_conn___init)

/* Check writes into `struct nf_conn` */
static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
					   const struct bpf_reg_state *reg,
					   int off, int size)
{
	const struct btf_type *ncit, *nct, *t;
	size_t end;

	ncit = btf_type_by_id(reg->btf, btf_nf_conn_ids[1]);
	nct = btf_type_by_id(reg->btf, btf_nf_conn_ids[0]);
	t = btf_type_by_id(reg->btf, reg->btf_id);
	if (t != nct && t != ncit) {
		bpf_log(log, "only read is supported\n");
		return -EACCES;
	}

	/* `struct nf_conn` and `struct nf_conn___init` have the same layout
	 * so we are safe to simply merge offset checks here
	 */
	switch (off) {
#if defined(CONFIG_NF_CONNTRACK_MARK)
	case offsetof(struct nf_conn, mark):
		end = offsetofend(struct nf_conn, mark);
		break;
#endif
	default:
		bpf_log(log, "no write support to nf_conn at off %d\n", off);
		return -EACCES;
	}

	if (off + size > end) {
		bpf_log(log,
			"write access at off %d with size %d beyond the member of nf_conn ended at %zu\n",
			off, size, end);
		return -EACCES;
	}

	return 0;
}

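/* With this callback registered (see register_nf_conntrack_bpf() below), a
 * BPF program holding a referenced nf_conn pointer may store to ct->mark
 * directly when CONFIG_NF_CONNTRACK_MARK is enabled; a write to any other
 * member fails the load with -EACCES. A sketch of the BPF side (not
 * compiled here):
 *
 *	ct = bpf_skb_ct_lookup(ctx, &tup, sizeof(tup.ipv4),
 *			       &opts, NF_BPF_CT_OPTS_SZ);
 *	if (ct) {
 *		ct->mark = 0x2a;	// lands inside 'mark': allowed
 *		bpf_ct_release(ct);
 *	}
 */
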
__bpf_kfunc_start_defs();

/* bpf_xdp_ct_alloc - Allocate a new CT entry
 *
 * Parameters:
 * @xdp_ctx - Pointer to ctx (xdp_md) in XDP program
 *	      Cannot be NULL
 * @bpf_tuple - Pointer to memory representing the tuple to look up
 *	        Cannot be NULL
 * @tuple__sz - Length of the tuple structure
 *	        Must be one of sizeof(bpf_tuple->ipv4) or
 *	        sizeof(bpf_tuple->ipv6)
 * @opts - Additional options for allocation (documented above)
 *	   Cannot be NULL
 * @opts__sz - Length of the bpf_ct_opts structure
 *	       Must be NF_BPF_CT_OPTS_SZ (16) or 12
 */
__bpf_kfunc struct nf_conn___init *
bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
		 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
	struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
	struct nf_conn *nfct;

	nfct = __bpf_nf_ct_alloc_entry(dev_net(ctx->rxq->dev), bpf_tuple, tuple__sz,
				       opts, opts__sz, 10);
	if (IS_ERR(nfct)) {
		if (opts)
			opts->error = PTR_ERR(nfct);
		return NULL;
	}

	return (struct nf_conn___init *)nfct;
}

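/* Example: creating an entry from an XDP program. A hedged sketch of the
 * BPF side (not compiled here); it assumes the kfuncs are declared __ksym
 * via vmlinux.h, roughly mirroring the BPF conntrack selftests, and elides
 * error handling for brevity:
 *
 *	struct bpf_ct_opts opts = {
 *		.netns_id = BPF_F_CURRENT_NETNS,
 *		.l4proto  = IPPROTO_TCP,
 *	};
 *	struct nf_conn___init *ct;
 *	struct nf_conn *ins;
 *
 *	ct = bpf_xdp_ct_alloc(ctx, &tup, sizeof(tup.ipv4),
 *			      &opts, NF_BPF_CT_OPTS_SZ);
 *	if (!ct)
 *		return XDP_PASS;	// opts.error holds the errno
 *	bpf_ct_set_timeout(ct, 30000);	// msecs; replaces the 10s default
 *	ins = bpf_ct_insert_entry(ct);	// always consumes 'ct'
 *	if (ins)
 *		bpf_ct_release(ins);
 *	return XDP_PASS;
 */
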
/* bpf_xdp_ct_lookup - Lookup CT entry for the given tuple, and acquire a
 *		       reference to it
 *
 * Parameters:
 * @xdp_ctx - Pointer to ctx (xdp_md) in XDP program
 *	      Cannot be NULL
 * @bpf_tuple - Pointer to memory representing the tuple to look up
 *	        Cannot be NULL
 * @tuple__sz - Length of the tuple structure
 *	        Must be one of sizeof(bpf_tuple->ipv4) or
 *	        sizeof(bpf_tuple->ipv6)
 * @opts - Additional options for lookup (documented above)
 *	   Cannot be NULL
 * @opts__sz - Length of the bpf_ct_opts structure
 *	       Must be NF_BPF_CT_OPTS_SZ (16) or 12
 */
__bpf_kfunc struct nf_conn *
bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
		  u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
	struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
	struct net *caller_net;
	struct nf_conn *nfct;

	caller_net = dev_net(ctx->rxq->dev);
	nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
	if (IS_ERR(nfct)) {
		if (opts)
			opts->error = PTR_ERR(nfct);
		return NULL;
	}
	return nfct;
}

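/* Example: looking up an entry from XDP. A sketch of the BPF side (not
 * compiled here), with 'tup' filled from a previously parsed header:
 *
 *	struct bpf_ct_opts opts = {
 *		.netns_id = BPF_F_CURRENT_NETNS,
 *		.l4proto  = IPPROTO_UDP,
 *	};
 *	struct nf_conn *ct;
 *
 *	ct = bpf_xdp_ct_lookup(ctx, &tup, sizeof(tup.ipv4),
 *			       &opts, NF_BPF_CT_OPTS_SZ);
 *	if (!ct)
 *		return XDP_PASS;	// opts.error explains the failure
 *	// opts.dir now holds IP_CT_DIR_ORIGINAL or IP_CT_DIR_REPLY
 *	bpf_ct_release(ct);
 */
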
/* bpf_skb_ct_alloc - Allocate a new CT entry
 *
 * Parameters:
 * @skb_ctx - Pointer to ctx (__sk_buff) in TC program
 *	      Cannot be NULL
 * @bpf_tuple - Pointer to memory representing the tuple to look up
 *	        Cannot be NULL
 * @tuple__sz - Length of the tuple structure
 *	        Must be one of sizeof(bpf_tuple->ipv4) or
 *	        sizeof(bpf_tuple->ipv6)
 * @opts - Additional options for allocation (documented above)
 *	   Cannot be NULL
 * @opts__sz - Length of the bpf_ct_opts structure
 *	       Must be NF_BPF_CT_OPTS_SZ (16) or 12
 */
__bpf_kfunc struct nf_conn___init *
bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
		 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
	struct nf_conn *nfct;
	struct net *net;

	net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
	nfct = __bpf_nf_ct_alloc_entry(net, bpf_tuple, tuple__sz, opts, opts__sz, 10);
	if (IS_ERR(nfct)) {
		if (opts)
			opts->error = PTR_ERR(nfct);
		return NULL;
	}

	return (struct nf_conn___init *)nfct;
}

/* bpf_skb_ct_lookup - Lookup CT entry for the given tuple, and acquire a
 *		       reference to it
 *
 * Parameters:
 * @skb_ctx - Pointer to ctx (__sk_buff) in TC program
 *	      Cannot be NULL
 * @bpf_tuple - Pointer to memory representing the tuple to look up
 *	        Cannot be NULL
 * @tuple__sz - Length of the tuple structure
 *	        Must be one of sizeof(bpf_tuple->ipv4) or
 *	        sizeof(bpf_tuple->ipv6)
 * @opts - Additional options for lookup (documented above)
 *	   Cannot be NULL
 * @opts__sz - Length of the bpf_ct_opts structure
 *	       Must be NF_BPF_CT_OPTS_SZ (16) or 12
 */
__bpf_kfunc struct nf_conn *
bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
		  u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
	struct net *caller_net;
	struct nf_conn *nfct;

	caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
	nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
	if (IS_ERR(nfct)) {
		if (opts)
			opts->error = PTR_ERR(nfct);
		return NULL;
	}
	return nfct;
}

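/* The TC-side lookup mirrors the XDP variant above; the only caller-visible
 * difference is where the default namespace comes from (skb->dev, falling
 * back to the socket's namespace). A sketch, assuming a classifier program:
 *
 *	SEC("tc")
 *	int ct_probe(struct __sk_buff *skb)
 *	{
 *		struct bpf_ct_opts opts = {
 *			.netns_id = BPF_F_CURRENT_NETNS,
 *			.l4proto  = IPPROTO_TCP,
 *		};
 *		struct bpf_sock_tuple tup = {};	// fill tup.ipv6 from the packet
 *		struct nf_conn *ct;
 *
 *		ct = bpf_skb_ct_lookup(skb, &tup, sizeof(tup.ipv6),
 *				       &opts, NF_BPF_CT_OPTS_SZ);
 *		if (ct)
 *			bpf_ct_release(ct);
 *		return TC_ACT_OK;
 *	}
 */
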
/* bpf_ct_insert_entry - Add the provided entry into a CT map
 *
 * This must be invoked for referenced PTR_TO_BTF_ID.
 *
 * @nfct_i - Pointer to referenced nf_conn___init object, obtained
 *	     using bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
 */
__bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
{
	struct nf_conn *nfct = (struct nf_conn *)nfct_i;
	int err;

	/* Turn the relative timeout set before insertion into an absolute
	 * expiry time.
	 */
	if (!nf_ct_is_confirmed(nfct))
		nfct->timeout += nfct_time_stamp;
	nfct->status |= IPS_CONFIRMED;
	err = nf_conntrack_hash_check_insert(nfct);
	if (err < 0) {
		nf_conntrack_free(nfct);
		return NULL;
	}
	return nfct;
}

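/* Ownership note: bpf_ct_insert_entry() is flagged both KF_ACQUIRE and
 * KF_RELEASE in the kfunc set at the bottom of this file, so the passed
 * nf_conn___init reference is always consumed; on failure it is freed
 * right here. A sketch of the calling pattern:
 *
 *	ins = bpf_ct_insert_entry(ct);
 *	// 'ct' must not be touched again, whether or not 'ins' is NULL
 *	if (ins)
 *		bpf_ct_release(ins);
 */
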
/* bpf_ct_release - Release acquired nf_conn object
 *
 * This must be invoked for referenced PTR_TO_BTF_ID; the verifier rejects
 * the program if any reference is still held in any of the explored
 * states.
 *
 * Parameters:
 * @nfct - Pointer to referenced nf_conn object, obtained using
 *	   bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
 */
__bpf_kfunc void bpf_ct_release(struct nf_conn *nfct)
{
	nf_ct_put(nfct);
}

/* bpf_ct_set_timeout - Set timeout of allocated nf_conn
 *
 * Sets the default timeout of newly allocated nf_conn before insertion.
 * This helper must be invoked for refcounted pointer to nf_conn___init.
 *
 * Parameters:
 * @nfct - Pointer to referenced nf_conn___init object, obtained using
 *	   bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
 * @timeout - Timeout in msecs.
 */
__bpf_kfunc void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
{
	__nf_ct_set_timeout((struct nf_conn *)nfct, msecs_to_jiffies(timeout));
}

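/* Both timeout kfuncs take milliseconds, unlike the seconds-based 10s
 * default applied internally at allocation time. A sketch:
 *
 *	bpf_ct_set_timeout(alloc_ct, 60000);	// pre-insert: 60 seconds
 *	bpf_ct_change_timeout(ct, 10000);	// post-insert: 10 seconds
 */
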
/* bpf_ct_change_timeout - Change timeout of inserted nf_conn
 *
 * Change the timeout associated with the inserted or looked-up nf_conn.
 * This helper must be invoked for refcounted pointer to nf_conn.
 *
 * Parameters:
 * @nfct - Pointer to referenced nf_conn object, obtained using
 *	   bpf_ct_insert_entry, bpf_xdp_ct_lookup, or bpf_skb_ct_lookup.
 * @timeout - New timeout in msecs.
 */
__bpf_kfunc int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
{
	return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout));
}

/* bpf_ct_set_status - Set status field of allocated nf_conn
 *
 * Set the status field of the newly allocated nf_conn before insertion.
 * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn___init.
 *
 * Parameters:
 * @nfct - Pointer to referenced nf_conn___init object, obtained using
 *	   bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
 * @status - New status value.
 */
__bpf_kfunc int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
{
	return nf_ct_change_status_common((struct nf_conn *)nfct, status);
}

/* bpf_ct_change_status - Change status of inserted nf_conn
 *
 * Change the status field of the provided connection tracking entry.
 * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn.
 *
 * Parameters:
 * @nfct - Pointer to referenced nf_conn object, obtained using
 *	   bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
 * @status - New status value.
 */
__bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
{
	return nf_ct_change_status_common(nfct, status);
}

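/* The status kfuncs take IPS_* bits from the uapi header
 * linux/netfilter/nf_conntrack_common.h and share ctnetlink's rules for
 * status changes, so protected bits (e.g. IPS_CONFIRMED) cannot be changed
 * this way. A sketch:
 *
 *	bpf_ct_set_status(alloc_ct, IPS_SEEN_REPLY);	// before insertion
 *	bpf_ct_change_status(ct, IPS_SEEN_REPLY | IPS_ASSURED); // afterwards
 */
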
__bpf_kfunc_end_defs();

BTF_KFUNCS_START(nf_ct_kfunc_set)
BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE)
BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(nf_ct_kfunc_set)

static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
	.owner = THIS_MODULE,
	.set = &nf_ct_kfunc_set,
};

int register_nf_conntrack_bpf(void)
{
	int ret;

	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_kfunc_set);
	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set);
	if (!ret) {
		mutex_lock(&nf_conn_btf_access_lock);
		nfct_btf_struct_access = _nf_conntrack_btf_struct_access;
		mutex_unlock(&nf_conn_btf_access_lock);
	}

	return ret;
}

void cleanup_nf_conntrack_bpf(void)
{
	mutex_lock(&nf_conn_btf_access_lock);
	nfct_btf_struct_access = NULL;
	mutex_unlock(&nf_conn_btf_access_lock);
}