xfrm: Namespacify xfrm state/policy locks
By semantics, xfrm layer is fully name space aware,
so will the locks, e.g. xfrm_state/pocliy_lock.
Ensure exclusive access into state/policy link list
for different name space with one global lock is not
right in terms of semantics aspect at first place,
as they are indeed mutually independent with each
other, but also more seriously causes scalability
problem.
One practical scenario is on a Open Network Stack,
more than hundreds of lxc tenants acts as routers
within one host, a global xfrm_state/policy_lock
becomes the bottleneck. But onces those locks are
decoupled in a per-namespace fashion, locks contend
is just with in specific name space scope, without
causing additional SPD/SAD access delay for other
name space.
Also this patch improve scalability while as without
changing original xfrm behavior.
Signed-off-by: Fan Du <fan.du@windriver.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 907fd2f..73b04d3 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -39,12 +39,7 @@
#define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ))
#define XFRM_MAX_QUEUE_LEN 100
-DEFINE_MUTEX(xfrm_cfg_mutex);
-EXPORT_SYMBOL(xfrm_cfg_mutex);
-
-static DEFINE_SPINLOCK(xfrm_policy_sk_bundle_lock);
static struct dst_entry *xfrm_policy_sk_bundles;
-static DEFINE_RWLOCK(xfrm_policy_lock);
static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO]
@@ -438,7 +433,7 @@
if (!ndst)
return;
- write_lock_bh(&xfrm_policy_lock);
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
for (i = hmask; i >= 0; i--)
xfrm_dst_hash_transfer(odst + i, ndst, nhashmask);
@@ -446,7 +441,7 @@
net->xfrm.policy_bydst[dir].table = ndst;
net->xfrm.policy_bydst[dir].hmask = nhashmask;
- write_unlock_bh(&xfrm_policy_lock);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head));
}
@@ -463,7 +458,7 @@
if (!nidx)
return;
- write_lock_bh(&xfrm_policy_lock);
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
for (i = hmask; i >= 0; i--)
xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask);
@@ -471,7 +466,7 @@
net->xfrm.policy_byidx = nidx;
net->xfrm.policy_idx_hmask = nhashmask;
- write_unlock_bh(&xfrm_policy_lock);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head));
}
@@ -504,7 +499,7 @@
void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
{
- read_lock_bh(&xfrm_policy_lock);
+ read_lock_bh(&net->xfrm.xfrm_policy_lock);
si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN];
si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT];
si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD];
@@ -513,7 +508,7 @@
si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX];
si->spdhcnt = net->xfrm.policy_idx_hmask;
si->spdhmcnt = xfrm_policy_hashmax;
- read_unlock_bh(&xfrm_policy_lock);
+ read_unlock_bh(&net->xfrm.xfrm_policy_lock);
}
EXPORT_SYMBOL(xfrm_spd_getinfo);
@@ -636,7 +631,7 @@
struct hlist_head *chain;
struct hlist_node *newpos;
- write_lock_bh(&xfrm_policy_lock);
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
delpol = NULL;
newpos = NULL;
@@ -647,7 +642,7 @@
xfrm_sec_ctx_match(pol->security, policy->security) &&
!WARN_ON(delpol)) {
if (excl) {
- write_unlock_bh(&xfrm_policy_lock);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
return -EEXIST;
}
delpol = pol;
@@ -685,7 +680,7 @@
if (!mod_timer(&policy->timer, jiffies + HZ))
xfrm_pol_hold(policy);
list_add(&policy->walk.all, &net->xfrm.policy_all);
- write_unlock_bh(&xfrm_policy_lock);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
if (delpol)
xfrm_policy_kill(delpol);
@@ -705,7 +700,7 @@
struct hlist_head *chain;
*err = 0;
- write_lock_bh(&xfrm_policy_lock);
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
chain = policy_hash_bysel(net, sel, sel->family, dir);
ret = NULL;
hlist_for_each_entry(pol, chain, bydst) {
@@ -718,7 +713,7 @@
*err = security_xfrm_policy_delete(
pol->security);
if (*err) {
- write_unlock_bh(&xfrm_policy_lock);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
return pol;
}
__xfrm_policy_unlink(pol, dir);
@@ -727,7 +722,7 @@
break;
}
}
- write_unlock_bh(&xfrm_policy_lock);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
if (ret && delete)
xfrm_policy_kill(ret);
@@ -746,7 +741,7 @@
return NULL;
*err = 0;
- write_lock_bh(&xfrm_policy_lock);
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
chain = net->xfrm.policy_byidx + idx_hash(net, id);
ret = NULL;
hlist_for_each_entry(pol, chain, byidx) {
@@ -757,7 +752,7 @@
*err = security_xfrm_policy_delete(
pol->security);
if (*err) {
- write_unlock_bh(&xfrm_policy_lock);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
return pol;
}
__xfrm_policy_unlink(pol, dir);
@@ -766,7 +761,7 @@
break;
}
}
- write_unlock_bh(&xfrm_policy_lock);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
if (ret && delete)
xfrm_policy_kill(ret);
@@ -829,7 +824,7 @@
{
int dir, err = 0, cnt = 0;
- write_lock_bh(&xfrm_policy_lock);
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
err = xfrm_policy_flush_secctx_check(net, type, audit_info);
if (err)
@@ -845,7 +840,7 @@
if (pol->type != type)
continue;
__xfrm_policy_unlink(pol, dir);
- write_unlock_bh(&xfrm_policy_lock);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
cnt++;
xfrm_audit_policy_delete(pol, 1, audit_info->loginuid,
@@ -854,7 +849,7 @@
xfrm_policy_kill(pol);
- write_lock_bh(&xfrm_policy_lock);
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
goto again1;
}
@@ -866,7 +861,7 @@
if (pol->type != type)
continue;
__xfrm_policy_unlink(pol, dir);
- write_unlock_bh(&xfrm_policy_lock);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
cnt++;
xfrm_audit_policy_delete(pol, 1,
@@ -875,7 +870,7 @@
audit_info->secid);
xfrm_policy_kill(pol);
- write_lock_bh(&xfrm_policy_lock);
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
goto again2;
}
}
@@ -884,7 +879,7 @@
if (!cnt)
err = -ESRCH;
out:
- write_unlock_bh(&xfrm_policy_lock);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
return err;
}
EXPORT_SYMBOL(xfrm_policy_flush);
@@ -904,7 +899,7 @@
if (list_empty(&walk->walk.all) && walk->seq != 0)
return 0;
- write_lock_bh(&xfrm_policy_lock);
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
if (list_empty(&walk->walk.all))
x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all);
else
@@ -930,7 +925,7 @@
}
list_del_init(&walk->walk.all);
out:
- write_unlock_bh(&xfrm_policy_lock);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
return error;
}
EXPORT_SYMBOL(xfrm_policy_walk);
@@ -944,14 +939,14 @@
}
EXPORT_SYMBOL(xfrm_policy_walk_init);
-void xfrm_policy_walk_done(struct xfrm_policy_walk *walk)
+void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net)
{
if (list_empty(&walk->walk.all))
return;
- write_lock_bh(&xfrm_policy_lock);
+ write_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */
list_del(&walk->walk.all);
- write_unlock_bh(&xfrm_policy_lock);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
}
EXPORT_SYMBOL(xfrm_policy_walk_done);
@@ -996,7 +991,7 @@
if (unlikely(!daddr || !saddr))
return NULL;
- read_lock_bh(&xfrm_policy_lock);
+ read_lock_bh(&net->xfrm.xfrm_policy_lock);
chain = policy_hash_direct(net, daddr, saddr, family, dir);
ret = NULL;
hlist_for_each_entry(pol, chain, bydst) {
@@ -1032,7 +1027,7 @@
if (ret)
xfrm_pol_hold(ret);
fail:
- read_unlock_bh(&xfrm_policy_lock);
+ read_unlock_bh(&net->xfrm.xfrm_policy_lock);
return ret;
}
@@ -1109,8 +1104,9 @@
const struct flowi *fl)
{
struct xfrm_policy *pol;
+ struct net *net = sock_net(sk);
- read_lock_bh(&xfrm_policy_lock);
+ read_lock_bh(&net->xfrm.xfrm_policy_lock);
if ((pol = sk->sk_policy[dir]) != NULL) {
bool match = xfrm_selector_match(&pol->selector, fl,
sk->sk_family);
@@ -1134,7 +1130,7 @@
pol = NULL;
}
out:
- read_unlock_bh(&xfrm_policy_lock);
+ read_unlock_bh(&net->xfrm.xfrm_policy_lock);
return pol;
}
@@ -1172,9 +1168,11 @@
int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
{
- write_lock_bh(&xfrm_policy_lock);
+ struct net *net = xp_net(pol);
+
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
pol = __xfrm_policy_unlink(pol, dir);
- write_unlock_bh(&xfrm_policy_lock);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
if (pol) {
xfrm_policy_kill(pol);
return 0;
@@ -1193,7 +1191,7 @@
return -EINVAL;
#endif
- write_lock_bh(&xfrm_policy_lock);
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
old_pol = sk->sk_policy[dir];
sk->sk_policy[dir] = pol;
if (pol) {
@@ -1210,7 +1208,7 @@
*/
__xfrm_policy_unlink(old_pol, XFRM_POLICY_MAX+dir);
}
- write_unlock_bh(&xfrm_policy_lock);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
if (old_pol) {
xfrm_policy_kill(old_pol);
@@ -1221,6 +1219,7 @@
static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
{
struct xfrm_policy *newp = xfrm_policy_alloc(xp_net(old), GFP_ATOMIC);
+ struct net *net = xp_net(old);
if (newp) {
newp->selector = old->selector;
@@ -1239,9 +1238,9 @@
newp->type = old->type;
memcpy(newp->xfrm_vec, old->xfrm_vec,
newp->xfrm_nr*sizeof(struct xfrm_tmpl));
- write_lock_bh(&xfrm_policy_lock);
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
__xfrm_policy_link(newp, XFRM_POLICY_MAX+dir);
- write_unlock_bh(&xfrm_policy_lock);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
xfrm_pol_put(newp);
}
return newp;
@@ -2112,10 +2111,10 @@
dst_hold(&xdst->u.dst);
- spin_lock_bh(&xfrm_policy_sk_bundle_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_sk_bundle_lock);
xdst->u.dst.next = xfrm_policy_sk_bundles;
xfrm_policy_sk_bundles = &xdst->u.dst;
- spin_unlock_bh(&xfrm_policy_sk_bundle_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_sk_bundle_lock);
route = xdst->route;
}
@@ -2440,7 +2439,7 @@
}
xfrm_nr = ti;
if (npols > 1) {
- xfrm_tmpl_sort(stp, tpp, xfrm_nr, family);
+ xfrm_tmpl_sort(stp, tpp, xfrm_nr, family, net);
tpp = stp;
}
@@ -2569,10 +2568,10 @@
{
struct dst_entry *head, *next;
- spin_lock_bh(&xfrm_policy_sk_bundle_lock);
+ spin_lock_bh(&net->xfrm.xfrm_policy_sk_bundle_lock);
head = xfrm_policy_sk_bundles;
xfrm_policy_sk_bundles = NULL;
- spin_unlock_bh(&xfrm_policy_sk_bundle_lock);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_sk_bundle_lock);
while (head) {
next = head->next;
@@ -2956,6 +2955,13 @@
rv = xfrm_sysctl_init(net);
if (rv < 0)
goto out_sysctl;
+
+ /* Initialize the per-net locks here */
+ spin_lock_init(&net->xfrm.xfrm_state_lock);
+ rwlock_init(&net->xfrm.xfrm_policy_lock);
+ spin_lock_init(&net->xfrm.xfrm_policy_sk_bundle_lock);
+ mutex_init(&net->xfrm.xfrm_cfg_mutex);
+
return 0;
out_sysctl:
@@ -3082,7 +3088,7 @@
struct hlist_head *chain;
u32 priority = ~0U;
- read_lock_bh(&xfrm_policy_lock);
+ read_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME*/
chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir);
hlist_for_each_entry(pol, chain, bydst) {
if (xfrm_migrate_selector_match(sel, &pol->selector) &&
@@ -3105,7 +3111,7 @@
if (ret)
xfrm_pol_hold(ret);
- read_unlock_bh(&xfrm_policy_lock);
+ read_unlock_bh(&net->xfrm.xfrm_policy_lock);
return ret;
}
@@ -3236,7 +3242,7 @@
/* Stage 2 - find and update state(s) */
for (i = 0, mp = m; i < num_migrate; i++, mp++) {
- if ((x = xfrm_migrate_state_find(mp))) {
+ if ((x = xfrm_migrate_state_find(mp, net))) {
x_cur[nx_cur] = x;
nx_cur++;
if ((xc = xfrm_state_migrate(x, mp))) {