// SPDX-License-Identifier: GPL-2.0-only
/****************************************************************************
* Driver for Solarflare network controllers and boards
* Copyright 2023, Advanced Micro Devices, Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation, incorporated herein by reference.
*/
#include "tc_conntrack.h"
#include "tc.h"
#include "mae.h"
static int efx_tc_flow_block(enum tc_setup_type type, void *type_data,
void *cb_priv);
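
/* Both hashtables key on the fields ahead of the rhash_head 'linkage'
 * member (key_offset 0, key_len up to 'linkage'): the zone identity for
 * ct_zone entries, and for CT entries the TC cookie, which is what
 * efx_tc_ct_destroy() and efx_tc_ct_stats() look entries up by.
 */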
static const struct rhashtable_params efx_tc_ct_zone_ht_params = {
.key_len = offsetof(struct efx_tc_ct_zone, linkage),
.key_offset = 0,
.head_offset = offsetof(struct efx_tc_ct_zone, linkage),
};

static const struct rhashtable_params efx_tc_ct_ht_params = {
.key_len = offsetof(struct efx_tc_ct_entry, linkage),
.key_offset = 0,
.head_offset = offsetof(struct efx_tc_ct_entry, linkage),
};
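
/* Teardown callbacks for rhashtable_free_and_destroy(): anything still
 * in the tables at efx_tc_fini_conntrack() time has been leaked, so
 * complain and free what can still be freed.
 */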
static void efx_tc_ct_zone_free(void *ptr, void *arg)
{
struct efx_tc_ct_zone *zone = ptr;
struct efx_nic *efx = zone->efx;
netif_err(efx, drv, efx->net_dev,
"tc ct_zone %u still present at teardown, removing\n",
zone->zone);
nf_flow_table_offload_del_cb(zone->nf_ft, efx_tc_flow_block, zone);
kfree(zone);
}

static void efx_tc_ct_free(void *ptr, void *arg)
{
struct efx_tc_ct_entry *conn = ptr;
struct efx_nic *efx = arg;
netif_err(efx, drv, efx->net_dev,
"tc ct_entry %lx still present at teardown\n",
conn->cookie);
/* We can release the counter, but we can't remove the CT itself
* from hardware because the table meta is already gone.
*/
efx_tc_flower_release_counter(efx, conn->cnt);
kfree(conn);
}
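
/* Set up the zone and connection hashtables, unwinding on failure. */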
int efx_tc_init_conntrack(struct efx_nic *efx)
{
int rc;
rc = rhashtable_init(&efx->tc->ct_zone_ht, &efx_tc_ct_zone_ht_params);
if (rc < 0)
goto fail_ct_zone_ht;
rc = rhashtable_init(&efx->tc->ct_ht, &efx_tc_ct_ht_params);
if (rc < 0)
goto fail_ct_ht;
return 0;
fail_ct_ht:
rhashtable_destroy(&efx->tc->ct_zone_ht);
fail_ct_zone_ht:
return rc;
}

/* Only call this in init failure teardown.
* Normal exit should fini instead as there may be entries in the table.
*/
void efx_tc_destroy_conntrack(struct efx_nic *efx)
{
rhashtable_destroy(&efx->tc->ct_ht);
rhashtable_destroy(&efx->tc->ct_zone_ht);
}
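
/* Normal-exit teardown: flush both tables, warning (via the _free
 * callbacks above) about any entries still present.
 */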
void efx_tc_fini_conntrack(struct efx_nic *efx)
{
rhashtable_free_and_destroy(&efx->tc->ct_zone_ht, efx_tc_ct_zone_free, NULL);
rhashtable_free_and_destroy(&efx->tc->ct_ht, efx_tc_ct_free, efx);
}
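
/* TCP_FLAG_* are __be32 constants covering the whole 32-bit word of the
 * TCP header that holds the flags; the flow dissector carries the flags
 * as __be16, so shift down and convert to match.
 */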
#define EFX_NF_TCP_FLAG(flg) cpu_to_be16(be32_to_cpu(TCP_FLAG_##flg) >> 16)
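
/* Check that the match is one the hardware conntrack table can
 * represent: an exact-match IPv4/IPv6 5-tuple over TCP or UDP, with
 * only limited matching on TCP flags (see inline comments below).
 */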
static int efx_tc_ct_parse_match(struct efx_nic *efx, struct flow_rule *fr,
struct efx_tc_ct_entry *conn)
{
struct flow_dissector *dissector = fr->match.dissector;
unsigned char ipv = 0;
bool tcp = false;
if (flow_rule_match_key(fr, FLOW_DISSECTOR_KEY_CONTROL)) {
struct flow_match_control fm;
flow_rule_match_control(fr, &fm);
if (IS_ALL_ONES(fm.mask->addr_type))
switch (fm.key->addr_type) {
case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
ipv = 4;
break;
case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
ipv = 6;
break;
default:
break;
}
}
if (!ipv) {
netif_dbg(efx, drv, efx->net_dev,
"Conntrack missing ipv specification\n");
return -EOPNOTSUPP;
}
if (dissector->used_keys &
~(BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) |
BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) |
BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS) |
BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS) |
BIT_ULL(FLOW_DISSECTOR_KEY_PORTS) |
BIT_ULL(FLOW_DISSECTOR_KEY_TCP) |
BIT_ULL(FLOW_DISSECTOR_KEY_META))) {
netif_dbg(efx, drv, efx->net_dev,
"Unsupported conntrack keys %#llx\n",
dissector->used_keys);
return -EOPNOTSUPP;
}
if (flow_rule_match_key(fr, FLOW_DISSECTOR_KEY_BASIC)) {
struct flow_match_basic fm;
flow_rule_match_basic(fr, &fm);
if (!IS_ALL_ONES(fm.mask->n_proto)) {
netif_dbg(efx, drv, efx->net_dev,
"Conntrack eth_proto is not exact-match; mask %04x\n",
ntohs(fm.mask->n_proto));
return -EOPNOTSUPP;
}
conn->eth_proto = fm.key->n_proto;
if (conn->eth_proto != (ipv == 4 ? htons(ETH_P_IP)
: htons(ETH_P_IPV6))) {
netif_dbg(efx, drv, efx->net_dev,
"Conntrack eth_proto is not IPv%u, is %04x\n",
ipv, ntohs(conn->eth_proto));
return -EOPNOTSUPP;
}
if (!IS_ALL_ONES(fm.mask->ip_proto)) {
netif_dbg(efx, drv, efx->net_dev,
"Conntrack ip_proto is not exact-match; mask %02x\n",
fm.mask->ip_proto);
return -EOPNOTSUPP;
}
conn->ip_proto = fm.key->ip_proto;
switch (conn->ip_proto) {
case IPPROTO_TCP:
tcp = true;
break;
case IPPROTO_UDP:
break;
default:
netif_dbg(efx, drv, efx->net_dev,
"Conntrack ip_proto not TCP or UDP, is %02x\n",
conn->ip_proto);
return -EOPNOTSUPP;
}
} else {
netif_dbg(efx, drv, efx->net_dev,
"Conntrack missing eth_proto, ip_proto\n");
return -EOPNOTSUPP;
}
if (ipv == 4 && flow_rule_match_key(fr, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
struct flow_match_ipv4_addrs fm;
flow_rule_match_ipv4_addrs(fr, &fm);
if (!IS_ALL_ONES(fm.mask->src)) {
netif_dbg(efx, drv, efx->net_dev,
"Conntrack ipv4.src is not exact-match; mask %08x\n",
ntohl(fm.mask->src));
return -EOPNOTSUPP;
}
conn->src_ip = fm.key->src;
if (!IS_ALL_ONES(fm.mask->dst)) {
netif_dbg(efx, drv, efx->net_dev,
"Conntrack ipv4.dst is not exact-match; mask %08x\n",
ntohl(fm.mask->dst));
return -EOPNOTSUPP;
}
conn->dst_ip = fm.key->dst;
} else if (ipv == 6 && flow_rule_match_key(fr, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
struct flow_match_ipv6_addrs fm;
flow_rule_match_ipv6_addrs(fr, &fm);
if (!efx_ipv6_addr_all_ones(&fm.mask->src)) {
netif_dbg(efx, drv, efx->net_dev,
"Conntrack ipv6.src is not exact-match; mask %pI6\n",
&fm.mask->src);
return -EOPNOTSUPP;
}
conn->src_ip6 = fm.key->src;
if (!efx_ipv6_addr_all_ones(&fm.mask->dst)) {
netif_dbg(efx, drv, efx->net_dev,
"Conntrack ipv6.dst is not exact-match; mask %pI6\n",
&fm.mask->dst);
return -EOPNOTSUPP;
}
conn->dst_ip6 = fm.key->dst;
} else {
netif_dbg(efx, drv, efx->net_dev,
"Conntrack missing IPv%u addrs\n", ipv);
return -EOPNOTSUPP;
}
if (flow_rule_match_key(fr, FLOW_DISSECTOR_KEY_PORTS)) {
struct flow_match_ports fm;
flow_rule_match_ports(fr, &fm);
if (!IS_ALL_ONES(fm.mask->src)) {
netif_dbg(efx, drv, efx->net_dev,
"Conntrack ports.src is not exact-match; mask %04x\n",
ntohs(fm.mask->src));
return -EOPNOTSUPP;
}
conn->l4_sport = fm.key->src;
if (!IS_ALL_ONES(fm.mask->dst)) {
netif_dbg(efx, drv, efx->net_dev,
"Conntrack ports.dst is not exact-match; mask %04x\n",
ntohs(fm.mask->dst));
return -EOPNOTSUPP;
}
conn->l4_dport = fm.key->dst;
} else {
netif_dbg(efx, drv, efx->net_dev, "Conntrack missing L4 ports\n");
return -EOPNOTSUPP;
}
if (flow_rule_match_key(fr, FLOW_DISSECTOR_KEY_TCP)) {
__be16 tcp_interesting_flags;
struct flow_match_tcp fm;
if (!tcp) {
netif_dbg(efx, drv, efx->net_dev,
"Conntrack matching on TCP keys but ipproto is not tcp\n");
return -EOPNOTSUPP;
}
flow_rule_match_tcp(fr, &fm);
tcp_interesting_flags = EFX_NF_TCP_FLAG(SYN) |
EFX_NF_TCP_FLAG(RST) |
EFX_NF_TCP_FLAG(FIN);
/* If any of the tcp_interesting_flags is set, we always
* inhibit CT lookup in LHS (so SW can update CT table).
*/
if (fm.key->flags & tcp_interesting_flags) {
netif_dbg(efx, drv, efx->net_dev,
"Unsupported conntrack tcp.flags %04x/%04x\n",
ntohs(fm.key->flags), ntohs(fm.mask->flags));
return -EOPNOTSUPP;
}
/* Other TCP flags cannot be filtered at CT */
if (fm.mask->flags & ~tcp_interesting_flags) {
netif_dbg(efx, drv, efx->net_dev,
"Unsupported conntrack tcp.flags %04x/%04x\n",
ntohs(fm.key->flags), ntohs(fm.mask->flags));
return -EOPNOTSUPP;
}
}
return 0;
}

/**
* struct efx_tc_ct_mangler_state - tracks which fields have been pedited
*
* @ipv4: IP source or destination addr has been set
* @tcpudp: TCP/UDP source or destination port has been set
*/
struct efx_tc_ct_mangler_state {
u8 ipv4:1;
u8 tcpudp:1;
};
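
/* Fold one FLOW_ACTION_MANGLE (pedit) into @conn's NAT state. Only a
 * full rewrite of the IPv4 source/destination address or of the L4
 * source/destination port is supported, and all mangles on a rule must
 * agree on whether they are SNAT or DNAT.
 */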
static int efx_tc_ct_mangle(struct efx_nic *efx, struct efx_tc_ct_entry *conn,
const struct flow_action_entry *fa,
struct efx_tc_ct_mangler_state *mung)
{
/* Is this the first mangle we've processed for this rule? */
bool first = !(mung->ipv4 || mung->tcpudp);
bool dnat = false;
switch (fa->mangle.htype) {
case FLOW_ACT_MANGLE_HDR_TYPE_IP4:
switch (fa->mangle.offset) {
case offsetof(struct iphdr, daddr):
dnat = true;
fallthrough;
case offsetof(struct iphdr, saddr):
if (fa->mangle.mask)
return -EOPNOTSUPP;
conn->nat_ip = htonl(fa->mangle.val);
mung->ipv4 = 1;
break;
default:
return -EOPNOTSUPP;
}
break;
case FLOW_ACT_MANGLE_HDR_TYPE_TCP:
case FLOW_ACT_MANGLE_HDR_TYPE_UDP:
/* Both struct tcphdr and struct udphdr start with
* __be16 source;
* __be16 dest;
* so we can use the same code for both.
*/
switch (fa->mangle.offset) {
case offsetof(struct tcphdr, dest):
BUILD_BUG_ON(offsetof(struct tcphdr, dest) !=
offsetof(struct udphdr, dest));
dnat = true;
fallthrough;
case offsetof(struct tcphdr, source):
BUILD_BUG_ON(offsetof(struct tcphdr, source) !=
offsetof(struct udphdr, source));
if (~fa->mangle.mask != 0xffff)
return -EOPNOTSUPP;
conn->l4_natport = htons(fa->mangle.val);
mung->tcpudp = 1;
break;
default:
return -EOPNOTSUPP;
}
break;
default:
return -EOPNOTSUPP;
}
/* first mangle tells us whether this is SNAT or DNAT;
* subsequent mangles must match that
*/
if (first)
conn->dnat = dnat;
else if (conn->dnat != dnat)
return -EOPNOTSUPP;
return 0;
}
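
/* Handle FLOW_CLS_REPLACE for a conntrack entry: validate the match and
 * actions, allocate a counter, insert the entry into hardware, and link
 * it on the zone's list so it can be flushed at zone unregister.
 */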
static int efx_tc_ct_replace(struct efx_tc_ct_zone *ct_zone,
struct flow_cls_offload *tc)
{
struct flow_rule *fr = flow_cls_offload_flow_rule(tc);
struct efx_tc_ct_mangler_state mung = {};
struct efx_tc_ct_entry *conn, *old;
struct efx_nic *efx = ct_zone->efx;
const struct flow_action_entry *fa;
struct efx_tc_counter *cnt;
int rc, i;
if (WARN_ON(!efx->tc))
return -ENETDOWN;
if (WARN_ON(!efx->tc->up))
return -ENETDOWN;
conn = kzalloc(sizeof(*conn), GFP_USER);
if (!conn)
return -ENOMEM;
conn->cookie = tc->cookie;
old = rhashtable_lookup_get_insert_fast(&efx->tc->ct_ht,
&conn->linkage,
efx_tc_ct_ht_params);
if (IS_ERR(old)) {
rc = PTR_ERR(old);
goto release;
} else if (old) {
netif_dbg(efx, drv, efx->net_dev,
"Already offloaded conntrack (cookie %lx)\n", tc->cookie);
rc = -EEXIST;
goto release;
}
/* Parse match */
conn->zone = ct_zone;
rc = efx_tc_ct_parse_match(efx, fr, conn);
if (rc)
goto release;
/* Parse actions */
flow_action_for_each(i, fa, &fr->action) {
switch (fa->id) {
case FLOW_ACTION_CT_METADATA:
conn->mark = fa->ct_metadata.mark;
if (memchr_inv(fa->ct_metadata.labels, 0, sizeof(fa->ct_metadata.labels))) {
netif_dbg(efx, drv, efx->net_dev,
"Setting CT label not supported\n");
rc = -EOPNOTSUPP;
goto release;
}
break;
case FLOW_ACTION_MANGLE:
if (conn->eth_proto != htons(ETH_P_IP)) {
netif_dbg(efx, drv, efx->net_dev,
"NAT only supported for IPv4\n");
rc = -EOPNOTSUPP;
goto release;
}
rc = efx_tc_ct_mangle(efx, conn, fa, &mung);
if (rc)
goto release;
break;
default:
netif_dbg(efx, drv, efx->net_dev,
"Unhandled action %u for conntrack\n", fa->id);
rc = -EOPNOTSUPP;
goto release;
}
}
/* fill in defaults for unmangled values */
if (!mung.ipv4)
conn->nat_ip = conn->dnat ? conn->dst_ip : conn->src_ip;
if (!mung.tcpudp)
conn->l4_natport = conn->dnat ? conn->l4_dport : conn->l4_sport;
cnt = efx_tc_flower_allocate_counter(efx, EFX_TC_COUNTER_TYPE_CT);
if (IS_ERR(cnt)) {
rc = PTR_ERR(cnt);
goto release;
}
conn->cnt = cnt;
rc = efx_mae_insert_ct(efx, conn);
if (rc) {
netif_dbg(efx, drv, efx->net_dev,
"Failed to insert conntrack, %d\n", rc);
goto release;
}
mutex_lock(&ct_zone->mutex);
list_add_tail(&conn->list, &ct_zone->cts);
mutex_unlock(&ct_zone->mutex);
return 0;
release:
if (conn->cnt)
efx_tc_flower_release_counter(efx, conn->cnt);
if (!old)
rhashtable_remove_fast(&efx->tc->ct_ht, &conn->linkage,
efx_tc_ct_ht_params);
kfree(conn);
return rc;
}

/* Caller must follow with efx_tc_ct_remove_finish() after RCU grace period! */
static void efx_tc_ct_remove(struct efx_nic *efx, struct efx_tc_ct_entry *conn)
{
int rc;
/* Remove it from HW */
rc = efx_mae_remove_ct(efx, conn);
/* Delete it from SW */
rhashtable_remove_fast(&efx->tc->ct_ht, &conn->linkage,
efx_tc_ct_ht_params);
if (rc) {
netif_err(efx, drv, efx->net_dev,
"Failed to remove conntrack %lx from hw, rc %d\n",
conn->cookie, rc);
} else {
netif_dbg(efx, drv, efx->net_dev, "Removed conntrack %lx\n",
conn->cookie);
}
}

static void efx_tc_ct_remove_finish(struct efx_nic *efx, struct efx_tc_ct_entry *conn)
{
/* Remove related CT counter. This is delayed after the conn object we
* are working with has been successfully removed. This protects the
* counter from being used-after-free inside efx_tc_ct_stats.
*/
efx_tc_flower_release_counter(efx, conn->cnt);
kfree(conn);
}
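
/* Handle FLOW_CLS_DESTROY: look the entry up by cookie, remove it from
 * hardware and software state, and free it only after an RCU grace
 * period so a concurrent efx_tc_ct_stats() cannot touch a freed counter.
 */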
static int efx_tc_ct_destroy(struct efx_tc_ct_zone *ct_zone,
struct flow_cls_offload *tc)
{
struct efx_nic *efx = ct_zone->efx;
struct efx_tc_ct_entry *conn;
conn = rhashtable_lookup_fast(&efx->tc->ct_ht, &tc->cookie,
efx_tc_ct_ht_params);
if (!conn) {
netif_warn(efx, drv, efx->net_dev,
"Conntrack %lx not found to remove\n", tc->cookie);
return -ENOENT;
}
mutex_lock(&ct_zone->mutex);
list_del(&conn->list);
efx_tc_ct_remove(efx, conn);
mutex_unlock(&ct_zone->mutex);
synchronize_rcu();
efx_tc_ct_remove_finish(efx, conn);
return 0;
}
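
/* Handle FLOW_CLS_STATS: report only the last-use time (no byte or
 * packet counts), which is enough for conntrack to keep a still-active
 * flow from ageing out.
 */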
static int efx_tc_ct_stats(struct efx_tc_ct_zone *ct_zone,
struct flow_cls_offload *tc)
{
struct efx_nic *efx = ct_zone->efx;
struct efx_tc_ct_entry *conn;
struct efx_tc_counter *cnt;
rcu_read_lock();
conn = rhashtable_lookup_fast(&efx->tc->ct_ht, &tc->cookie,
efx_tc_ct_ht_params);
if (!conn) {
netif_warn(efx, drv, efx->net_dev,
"Conntrack %lx not found for stats\n", tc->cookie);
rcu_read_unlock();
return -ENOENT;
}
cnt = conn->cnt;
spin_lock_bh(&cnt->lock);
/* Report only last use */
flow_stats_update(&tc->stats, 0, 0, 0, cnt->touched,
FLOW_ACTION_HW_STATS_DELAYED);
spin_unlock_bh(&cnt->lock);
rcu_read_unlock();
return 0;
}
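
/* The flow block callback registered on each zone's nf_flowtable;
 * dispatches conntrack entry add/remove/stats requests to the handlers
 * above.
 */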
static int efx_tc_flow_block(enum tc_setup_type type, void *type_data,
void *cb_priv)
{
struct flow_cls_offload *tcb = type_data;
struct efx_tc_ct_zone *ct_zone = cb_priv;
if (type != TC_SETUP_CLSFLOWER)
return -EOPNOTSUPP;
switch (tcb->command) {
case FLOW_CLS_REPLACE:
return efx_tc_ct_replace(ct_zone, tcb);
case FLOW_CLS_DESTROY:
return efx_tc_ct_destroy(ct_zone, tcb);
case FLOW_CLS_STATS:
return efx_tc_ct_stats(ct_zone, tcb);
default:
break;
}
return -EOPNOTSUPP;
}
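
/* Look up or create the refcounted object tracking offload state for a
 * conntrack zone, registering our flow block callback on the zone's
 * flowtable when the zone is first seen.
 */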
struct efx_tc_ct_zone *efx_tc_ct_register_zone(struct efx_nic *efx, u16 zone,
struct nf_flowtable *ct_ft)
{
struct efx_tc_ct_zone *ct_zone, *old;
int rc;
ct_zone = kzalloc(sizeof(*ct_zone), GFP_USER);
if (!ct_zone)
return ERR_PTR(-ENOMEM);
ct_zone->zone = zone;
old = rhashtable_lookup_get_insert_fast(&efx->tc->ct_zone_ht,
&ct_zone->linkage,
efx_tc_ct_zone_ht_params);
if (old) {
/* don't need our new entry */
kfree(ct_zone);
if (IS_ERR(old)) /* oh dear, it's actually an error */
return ERR_CAST(old);
if (!refcount_inc_not_zero(&old->ref))
return ERR_PTR(-EAGAIN);
/* existing entry found */
WARN_ON_ONCE(old->nf_ft != ct_ft);
netif_dbg(efx, drv, efx->net_dev,
"Found existing ct_zone for %u\n", zone);
return old;
}
ct_zone->nf_ft = ct_ft;
ct_zone->efx = efx;
INIT_LIST_HEAD(&ct_zone->cts);
mutex_init(&ct_zone->mutex);
rc = nf_flow_table_offload_add_cb(ct_ft, efx_tc_flow_block, ct_zone);
netif_dbg(efx, drv, efx->net_dev, "Adding new ct_zone for %u, rc %d\n",
zone, rc);
if (rc < 0)
goto fail;
refcount_set(&ct_zone->ref, 1);
return ct_zone;
fail:
rhashtable_remove_fast(&efx->tc->ct_zone_ht, &ct_zone->linkage,
efx_tc_ct_zone_ht_params);
kfree(ct_zone);
return ERR_PTR(rc);
}
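
/* Release a reference to a zone; on the last reference, detach from the
 * flowtable and tear down any connections still offloaded in the zone,
 * mirroring the two-stage remove (with an RCU grace period in between)
 * used by efx_tc_ct_destroy().
 */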
void efx_tc_ct_unregister_zone(struct efx_nic *efx,
struct efx_tc_ct_zone *ct_zone)
{
struct efx_tc_ct_entry *conn, *next;
if (!refcount_dec_and_test(&ct_zone->ref))
return; /* still in use */
nf_flow_table_offload_del_cb(ct_zone->nf_ft, efx_tc_flow_block, ct_zone);
rhashtable_remove_fast(&efx->tc->ct_zone_ht, &ct_zone->linkage,
efx_tc_ct_zone_ht_params);
mutex_lock(&ct_zone->mutex);
list_for_each_entry(conn, &ct_zone->cts, list)
efx_tc_ct_remove(efx, conn);
synchronize_rcu();
/* need to use _safe because efx_tc_ct_remove_finish() frees conn */
list_for_each_entry_safe(conn, next, &ct_zone->cts, list)
efx_tc_ct_remove_finish(efx, conn);
mutex_unlock(&ct_zone->mutex);
mutex_destroy(&ct_zone->mutex);
netif_dbg(efx, drv, efx->net_dev, "Removed ct_zone for %u\n",
ct_zone->zone);
kfree(ct_zone);
}