cxl: Change contexts_lock to a mutex to fix sleep while atomic bug
We had a known sleep while atomic bug if a CXL device was forcefully
unbound while it was in use. This could occur as a result of EEH, or
manually induced with something like this while the device was in use:
echo 0000:01:00.0 > /sys/bus/pci/drivers/cxl-pci/unbind
The issue was that in this code path we iterated over each context and
forcefully detached it in atomic context (inside rcu_read_lock()), however
the detach also needed to take the spa_mutex, and call schedule().
This patch changes the contexts_lock to a mutex so that we are not in
atomic context while doing the detach, thereby avoiding the sleep while
atomic.
Also delete the related TODO comment, which suggested an alternate
solution that turned out not to be workable.
Cc: stable@vger.kernel.org
Signed-off-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index cca4721..4aa31a3 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -82,12 +82,12 @@
* Allocating IDR! We better make sure everything's setup that
* dereferences from it.
*/
+ mutex_lock(&afu->contexts_lock);
idr_preload(GFP_KERNEL);
- spin_lock(&afu->contexts_lock);
i = idr_alloc(&ctx->afu->contexts_idr, ctx, 0,
ctx->afu->num_procs, GFP_NOWAIT);
- spin_unlock(&afu->contexts_lock);
idr_preload_end();
+ mutex_unlock(&afu->contexts_lock);
if (i < 0)
return i;
@@ -168,21 +168,22 @@
struct cxl_context *ctx;
int tmp;
- rcu_read_lock();
- idr_for_each_entry(&afu->contexts_idr, ctx, tmp)
+ mutex_lock(&afu->contexts_lock);
+ idr_for_each_entry(&afu->contexts_idr, ctx, tmp) {
/*
* Anything done in here needs to be setup before the IDR is
* created and torn down after the IDR removed
*/
__detach_context(ctx);
- rcu_read_unlock();
+ }
+ mutex_unlock(&afu->contexts_lock);
}
void cxl_context_free(struct cxl_context *ctx)
{
- spin_lock(&ctx->afu->contexts_lock);
+ mutex_lock(&ctx->afu->contexts_lock);
idr_remove(&ctx->afu->contexts_idr, ctx->pe);
- spin_unlock(&ctx->afu->contexts_lock);
+ mutex_unlock(&ctx->afu->contexts_lock);
synchronize_rcu();
free_page((u64)ctx->sstp);
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index b5b6bda..7c05239 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -351,7 +351,7 @@
struct device *chardev_s, *chardev_m, *chardev_d;
struct idr contexts_idr;
struct dentry *debugfs;
- spinlock_t contexts_lock;
+ struct mutex contexts_lock;
struct mutex spa_mutex;
spinlock_t afu_cntl_lock;
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 9a5a442..1001cf4 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -610,13 +610,6 @@
return 0;
}
-/*
- * TODO: handle case when this is called inside a rcu_read_lock() which may
- * happen when we unbind the driver (ie. cxl_context_detach_all()) . Terminate
- * & remove use a mutex lock and schedule which will not good with lock held.
- * May need to write do_process_element_cmd() that handles outstanding page
- * faults synchronously.
- */
static inline int detach_process_native_afu_directed(struct cxl_context *ctx)
{
if (!ctx->pe_inserted)
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 10c98ab..0f2cc9f8 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -502,7 +502,7 @@
afu->dev.release = cxl_release_afu;
afu->slice = slice;
idr_init(&afu->contexts_idr);
- spin_lock_init(&afu->contexts_lock);
+ mutex_init(&afu->contexts_lock);
spin_lock_init(&afu->afu_cntl_lock);
mutex_init(&afu->spa_mutex);
diff --git a/drivers/misc/cxl/sysfs.c b/drivers/misc/cxl/sysfs.c
index ce7ec06..461bdbd 100644
--- a/drivers/misc/cxl/sysfs.c
+++ b/drivers/misc/cxl/sysfs.c
@@ -121,7 +121,7 @@
int rc;
/* Not safe to reset if it is currently in use */
- spin_lock(&afu->contexts_lock);
+ mutex_lock(&afu->contexts_lock);
if (!idr_is_empty(&afu->contexts_idr)) {
rc = -EBUSY;
goto err;
@@ -132,7 +132,7 @@
rc = count;
err:
- spin_unlock(&afu->contexts_lock);
+ mutex_unlock(&afu->contexts_lock);
return rc;
}
@@ -247,7 +247,7 @@
int rc = -EBUSY;
/* can't change this if we have a user */
- spin_lock(&afu->contexts_lock);
+ mutex_lock(&afu->contexts_lock);
if (!idr_is_empty(&afu->contexts_idr))
goto err;
@@ -271,7 +271,7 @@
afu->current_mode = 0;
afu->num_procs = 0;
- spin_unlock(&afu->contexts_lock);
+ mutex_unlock(&afu->contexts_lock);
if ((rc = _cxl_afu_deactivate_mode(afu, old_mode)))
return rc;
@@ -280,7 +280,7 @@
return count;
err:
- spin_unlock(&afu->contexts_lock);
+ mutex_unlock(&afu->contexts_lock);
return rc;
}