drivers/edac: add new nmi rescan
Provides a way for NMI reported errors on x86 to notify the EDAC
subsystem pending ECC errors by writing to a software state variable.
Here's the reworked patch. I added an EDAC stub to the kernel so we can
have variables that are in the kernel even if EDAC is a module. I also
implemented the idea of using the chip driver to select error detection
mode via module parameter and eliminate the kernel compile option.
Please review/test. Thx!
Also, I only made changes to some of the chipset drivers since I am
unfamiliar with the other ones. We can add similar changes as we go.
Signed-off-by: Dave Jiang <djiang@mvista.com>
Signed-off-by: Douglas Thompson <dougthompson@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index e8c4a2b..3cfd906 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -109,15 +109,4 @@
Support for error detection and correction the Intel
Greekcreek/Blackford chipsets.
-choice
- prompt "Error detecting method"
- default EDAC_POLL
-
-config EDAC_POLL
- bool "Poll for errors"
- help
- Poll the chipset periodically to detect errors.
-
-endchoice
-
endif # EDAC
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 773472c..19d5ac7 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -5,9 +5,9 @@
# This file may be distributed under the terms of the
# GNU General Public License.
#
-# $Id: Makefile,v 1.4.2.3 2005/07/08 22:05:38 dsp_llnl Exp $
+obj-$(CONFIG_EDAC) := edac_stub.o
obj-$(CONFIG_EDAC_MM_EDAC) += edac_core.o
edac_core-objs := edac_mc.o edac_device.o edac_mc_sysfs.o edac_pci_sysfs.o
diff --git a/drivers/edac/e752x_edac.c b/drivers/edac/e752x_edac.c
index 8bcc887..f51e79a 100644
--- a/drivers/edac/e752x_edac.c
+++ b/drivers/edac/e752x_edac.c
@@ -22,6 +22,7 @@
#include <linux/pci.h>
#include <linux/pci_ids.h>
#include <linux/slab.h>
+#include <linux/edac.h>
#include "edac_mc.h"
#define E752X_REVISION " Ver: 2.0.1 " __DATE__
@@ -948,6 +949,16 @@
debugf0("%s(): mci\n", __func__);
debugf0("Starting Probe1\n");
+ /* make sure error reporting method is sane */
+ switch(edac_op_state) {
+ case EDAC_OPSTATE_POLL:
+ case EDAC_OPSTATE_NMI:
+ break;
+ default:
+ edac_op_state = EDAC_OPSTATE_POLL;
+ break;
+ }
+
/* check to see if device 0 function 1 is enabled; if it isn't, we
* assume the BIOS has reserved it for a reason and is expecting
* exclusive access, we take care not to violate that assumption and
@@ -1123,4 +1134,5 @@
module_param(force_function_unhide, int, 0444);
MODULE_PARM_DESC(force_function_unhide, "if BIOS sets Dev0:Fun1 up as hidden:"
" 1=force unhide and hope BIOS doesn't fight driver for Dev0:Fun1 access");
-
+module_param(edac_op_state, int, 0444);
+MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
diff --git a/drivers/edac/e7xxx_edac.c b/drivers/edac/e7xxx_edac.c
index 310d91b..0827b9a 100644
--- a/drivers/edac/e7xxx_edac.c
+++ b/drivers/edac/e7xxx_edac.c
@@ -27,6 +27,7 @@
#include <linux/pci.h>
#include <linux/pci_ids.h>
#include <linux/slab.h>
+#include <linux/edac.h>
#include "edac_mc.h"
#define E7XXX_REVISION " Ver: 2.0.1 " __DATE__
@@ -419,6 +420,17 @@
struct e7xxx_error_info discard;
debugf0("%s(): mci\n", __func__);
+
+ /* make sure error reporting method is sane */
+ switch(edac_op_state) {
+ case EDAC_OPSTATE_POLL:
+ case EDAC_OPSTATE_NMI:
+ break;
+ default:
+ edac_op_state = EDAC_OPSTATE_POLL;
+ break;
+ }
+
pci_read_config_dword(pdev, E7XXX_DRC, &drc);
drc_chan = dual_channel_active(drc, dev_idx);
@@ -565,3 +577,5 @@
MODULE_AUTHOR("Linux Networx (http://lnxi.com) Thayne Harbaugh et al\n"
"Based on.work by Dan Hollis et al");
MODULE_DESCRIPTION("MC support for Intel e7xxx memory controllers");
+module_param(edac_op_state, int, 0444);
+MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index eae1ca1..81a28d6 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -27,6 +27,7 @@
#include <linux/list.h>
#include <linux/sysdev.h>
#include <linux/ctype.h>
+#include <linux/edac.h>
#include <asm/uaccess.h>
#include <asm/page.h>
#include <asm/edac.h>
@@ -241,6 +242,7 @@
}
list_add_tail_rcu(&mci->link, insert_before);
+ atomic_inc(&edac_handlers);
return 0;
fail0:
@@ -267,6 +269,7 @@
static void del_mc_from_global_list(struct mem_ctl_info *mci)
{
+ atomic_dec(&edac_handlers);
list_del_rcu(&mci->link);
init_completion(&mci->complete);
call_rcu(&mci->rcu, complete_mc_list_del);
diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c
index 3cd3a23..89c96ec 100644
--- a/drivers/edac/edac_module.c
+++ b/drivers/edac/edac_module.c
@@ -1,6 +1,7 @@
#include <linux/freezer.h>
#include <linux/kthread.h>
+#include <linux/edac.h>
#include "edac_mc.h"
#include "edac_module.h"
@@ -102,6 +103,25 @@
}
/*
+ * handler for EDAC to check if NMI type handler has asserted interrupt
+ */
+static int edac_assert_error_check_and_clear(void)
+{
+ int vreg;
+
+ if(edac_op_state == EDAC_OPSTATE_POLL)
+ return 1;
+
+ vreg = atomic_read(&edac_err_assert);
+ if(vreg) {
+ atomic_set(&edac_err_assert, 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
* Action thread for EDAC to perform the POLL operations
*/
static int edac_kernel_thread(void *arg)
@@ -109,8 +129,8 @@
int msec;
while (!kthread_should_stop()) {
-
- do_edac_check();
+ if(edac_assert_error_check_and_clear())
+ do_edac_check();
/* goto sleep for the interval */
msec = (HZ * edac_get_poll_msec()) / 1000;
diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c
new file mode 100644
index 0000000..91a038d
--- /dev/null
+++ b/drivers/edac/edac_stub.c
@@ -0,0 +1,42 @@
+/*
+ * common EDAC components that must be in kernel
+ *
+ * Author: Dave Jiang <djiang@mvista.com>
+ *
+ * 2007 (c) MontaVista Software, Inc. This file is licensed under
+ * the terms of the GNU General Public License version 2. This program
+ * is licensed "as is" without any warranty of any kind, whether express
+ * or implied.
+ *
+ */
+#include <linux/module.h>
+#include <linux/edac.h>
+#include <asm/atomic.h>
+#include <asm/edac.h>
+
+int edac_op_state = EDAC_OPSTATE_INVAL;
+EXPORT_SYMBOL(edac_op_state);
+
+atomic_t edac_handlers = ATOMIC_INIT(0);
+EXPORT_SYMBOL(edac_handlers);
+
+atomic_t edac_err_assert = ATOMIC_INIT(0);
+EXPORT_SYMBOL(edac_err_assert);
+
+inline int edac_handler_set(void)
+{
+ if (edac_op_state == EDAC_OPSTATE_POLL)
+ return 0;
+
+ return atomic_read(&edac_handlers);
+}
+EXPORT_SYMBOL(edac_handler_set);
+
+/*
+ * handler for NMI type of interrupts to assert error
+ */
+inline void edac_atomic_assert_error(void)
+{
+ atomic_set(&edac_err_assert, 1);
+}
+EXPORT_SYMBOL(edac_atomic_assert_error);
diff --git a/drivers/edac/i5000_edac.c b/drivers/edac/i5000_edac.c
index 4d7e786..8eb8b6e 100644
--- a/drivers/edac/i5000_edac.c
+++ b/drivers/edac/i5000_edac.c
@@ -19,6 +19,7 @@
#include <linux/pci.h>
#include <linux/pci_ids.h>
#include <linux/slab.h>
+#include <linux/edac.h>
#include <asm/mmzone.h>
#include "edac_mc.h"
@@ -1285,6 +1286,16 @@
if (PCI_FUNC(pdev->devfn) != 0)
return -ENODEV;
+ /* make sure error reporting method is sane */
+ switch(edac_op_state) {
+ case EDAC_OPSTATE_POLL:
+ case EDAC_OPSTATE_NMI:
+ break;
+ default:
+ edac_op_state = EDAC_OPSTATE_POLL;
+ break;
+ }
+
/* Ask the devices for the number of CSROWS and CHANNELS so
* that we can calculate the memory resources, etc
*
@@ -1475,3 +1486,5 @@
("Linux Networx (http://lnxi.com) Doug Thompson <norsk5@xmission.com>");
MODULE_DESCRIPTION("MC Driver for Intel I5000 memory controllers - "
I5000_REVISION);
+module_param(edac_op_state, int, 0444);
+MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");