rcu: Grace-period initialization excludes only RCU notifier
Kirill noted the following deadlock cycle on shutdown involving padata:
> With commit 755609a9087fa983f567dc5452b2fa7b089b591f I've got deadlock on
> poweroff.
>
> It guess it happens because of race for cpu_hotplug.lock:
>
> CPU A CPU B
> disable_nonboot_cpus()
> _cpu_down()
> cpu_hotplug_begin()
> mutex_lock(&cpu_hotplug.lock);
> __cpu_notify()
> padata_cpu_callback()
> __padata_remove_cpu()
> padata_replace()
> synchronize_rcu()
> rcu_gp_kthread()
> get_online_cpus();
> mutex_lock(&cpu_hotplug.lock);
It would of course be good to eliminate grace-period delays from
CPU-hotplug notifiers, but that is a separate issue. Deadlock is
not an appropriate diagnostic for excessive CPU-hotplug latency.
Fortunately, grace-period initialization does not actually need to
exclude all of the CPU-hotplug operation, but rather only RCU's own
CPU_UP_PREPARE and CPU_DEAD CPU-hotplug notifiers. This commit therefore
introduces a new per-rcu_state onoff_mutex that provides the required
concurrency control in place of the get_online_cpus() that was previously
in rcu_gp_init().
Reported-by: "Kirill A. Shutemov" <kirill@shutemov.name>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Kirill A. Shutemov <kirill@shutemov.name>
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 4fb2376..74df86b 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -74,6 +74,7 @@
.orphan_nxttail = &sname##_state.orphan_nxtlist, \
.orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+ .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
.name = #sname, \
}
@@ -1197,7 +1198,7 @@
raw_spin_unlock_irq(&rnp->lock);
/* Exclude any concurrent CPU-hotplug operations. */
- get_online_cpus();
+ mutex_lock(&rsp->onoff_mutex);
/*
* Set the quiescent-state-needed bits in all the rcu_node
@@ -1234,7 +1235,7 @@
cond_resched();
}
- put_online_cpus();
+ mutex_unlock(&rsp->onoff_mutex);
return 1;
}
@@ -1700,6 +1701,7 @@
/* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
/* Exclude any attempts to start a new grace period. */
+ mutex_lock(&rsp->onoff_mutex);
raw_spin_lock_irqsave(&rsp->onofflock, flags);
/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
@@ -1744,6 +1746,7 @@
init_callback_list(rdp);
/* Disallow further callbacks on this CPU. */
rdp->nxttail[RCU_NEXT_TAIL] = NULL;
+ mutex_unlock(&rsp->onoff_mutex);
}
#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -2648,6 +2651,9 @@
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
+ /* Exclude new grace periods. */
+ mutex_lock(&rsp->onoff_mutex);
+
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
rdp->beenonline = 1; /* We have now been online. */
@@ -2662,14 +2668,6 @@
rcu_prepare_for_idle_init(cpu);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- /*
- * A new grace period might start here. If so, we won't be part
- * of it, but that is OK, as we are currently in a quiescent state.
- */
-
- /* Exclude any attempts to start a new GP on large systems. */
- raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
-
/* Add CPU to rcu_node bitmasks. */
rnp = rdp->mynode;
mask = rdp->grpmask;
@@ -2693,8 +2691,9 @@
raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
rnp = rnp->parent;
} while (rnp != NULL && !(rnp->qsmaskinit & mask));
+ local_irq_restore(flags);
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
+ mutex_unlock(&rsp->onoff_mutex);
}
static void __cpuinit rcu_prepare_cpu(int cpu)