Merge tag 'libnvdimm-for-4.8' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull libnvdimm updates from Dan Williams:

 - Replace pcommit with ADR / directed-flushing.

   The pcommit instruction, which has not shipped on any product, is
   deprecated.  Instead, the requirement is that platforms implement
   either ADR, or provide one or more flush addresses per nvdimm.

   ADR (Asynchronous DRAM Refresh) flushes data in posted write buffers
   to the memory controller on a power-fail event.

   Flush addresses are defined in ACPI 6.x as an NVDIMM Firmware
   Interface Table (NFIT) sub-structure: "Flush Hint Address Structure".
   A flush hint is an mmio address that, when written and fenced,
   assures that all previous posted writes targeting a given dimm have
   been flushed to media.

 - On-demand ARS (address range scrub).

   Linux uses the results of the ACPI ARS commands to track bad blocks
   in pmem devices.  When latent errors are detected we re-scrub the
   media to refresh the bad block list.  Userspace can also request a
   re-scrub at any time.

 - Support for the Microsoft DSM (device specific method) command
   format.

 - Support for EDK2/OVMF virtual disk device memory ranges.

 - Various fixes and cleanups across the subsystem.

* tag 'libnvdimm-for-4.8' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (41 commits)
  libnvdimm-btt: Delete an unnecessary check before the function call "__nd_device_register"
  nfit: do an ARS scrub on hitting a latent media error
  nfit: move to nfit/ sub-directory
  nfit, libnvdimm: allow an ARS scrub to be triggered on demand
  libnvdimm: register nvdimm_bus devices with an nd_bus driver
  pmem: clarify a debug print in pmem_clear_poison
  x86/insn: remove pcommit
  Revert "KVM: x86: add pcommit support"
  nfit, tools/testing/nvdimm/: unify shutdown paths
  libnvdimm: move ->module to struct nvdimm_bus_descriptor
  nfit: cleanup acpi_nfit_init calling convention
  nfit: fix _FIT evaluation memory leak + use after free
  tools/testing/nvdimm: add manufacturing_{date|location} dimm properties
  tools/testing/nvdimm: add virtual ramdisk range
  acpi, nfit: treat virtual ramdisk SPA as pmem region
  pmem: kill __pmem address space
  pmem: kill wmb_pmem()
  libnvdimm, pmem: use nvdimm_flush() for namespace I/O writes
  fs/dax: remove wmb_pmem()
  libnvdimm, pmem: flush posted-write queues on shutdown
  ...
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index ef46d3a..1b3c39a 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -395,7 +395,7 @@
 	int (*release) (struct gendisk *, fmode_t);
 	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
-	int (*direct_access) (struct block_device *, sector_t, void __pmem **,
+	int (*direct_access) (struct block_device *, sector_t, void **,
 				unsigned long *);
 	int (*media_changed) (struct gendisk *);
 	void (*unlock_native_capacity) (struct gendisk *);
diff --git a/Documentation/nvdimm/btt.txt b/Documentation/nvdimm/btt.txt
index b91443f..e293fb6 100644
--- a/Documentation/nvdimm/btt.txt
+++ b/Documentation/nvdimm/btt.txt
@@ -256,28 +256,18 @@
 only state using a flag in the info block.
 
 
-5. In-kernel usage
-==================
+5. Usage
+========
 
-Any block driver that supports byte granularity IO to the storage may register
-with the BTT. It will have to provide the rw_bytes interface in its
-block_device_operations struct:
+The BTT can be set up on any disk (namespace) exposed by the libnvdimm subsystem
+(pmem, or blk mode). The easiest way to set up such a namespace is using the
+'ndctl' utility [1]:
 
-	int (*rw_bytes)(struct gendisk *, void *, size_t, off_t, int rw);
+For example, the ndctl command line to setup a btt with a 4k sector size is:
 
-It may register with the BTT after it adds its own gendisk, using btt_init:
+    ndctl create-namespace -f -e namespace0.0 -m sector -l 4k
 
-	struct btt *btt_init(struct gendisk *disk, unsigned long long rawsize,
-			u32 lbasize, u8 uuid[], int maxlane);
+See ndctl create-namespace --help for more options.
 
-note that maxlane is the maximum amount of concurrency the driver wishes to
-allow the BTT to use.
-
-The BTT 'disk' appears as a stacked block device that grabs the underlying block
-device in the O_EXCL mode.
-
-When the driver wishes to remove the backing disk, it should similarly call
-btt_fini using the same struct btt* handle that was provided to it by btt_init.
-
-	void btt_fini(struct btt *btt);
+[1]: https://github.com/pmem/ndctl
 
diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index f9af646..9144204 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -143,12 +143,12 @@
  */
 static long
 axon_ram_direct_access(struct block_device *device, sector_t sector,
-		       void __pmem **kaddr, pfn_t *pfn, long size)
+		       void **kaddr, pfn_t *pfn, long size)
 {
 	struct axon_ram_bank *bank = device->bd_disk->private_data;
 	loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
 
-	*kaddr = (void __pmem __force *) bank->io_addr + offset;
+	*kaddr = (void *) bank->io_addr + offset;
 	*pfn = phys_to_pfn_t(bank->ph_addr + offset, PFN_DEV);
 	return bank->size - offset;
 }
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index c64b1e9..d683993 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -225,7 +225,6 @@
 #define X86_FEATURE_RDSEED	( 9*32+18) /* The RDSEED instruction */
 #define X86_FEATURE_ADX		( 9*32+19) /* The ADCX and ADOX instructions */
 #define X86_FEATURE_SMAP	( 9*32+20) /* Supervisor Mode Access Prevention */
-#define X86_FEATURE_PCOMMIT	( 9*32+22) /* PCOMMIT instruction */
 #define X86_FEATURE_CLFLUSHOPT	( 9*32+23) /* CLFLUSHOPT instruction */
 #define X86_FEATURE_CLWB	( 9*32+24) /* CLWB instruction */
 #define X86_FEATURE_AVX512PF	( 9*32+26) /* AVX-512 Prefetch */
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index fbc5e92..643eba4 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -26,13 +26,11 @@
  * @n: length of the copy in bytes
  *
  * Copy data to persistent memory media via non-temporal stores so that
- * a subsequent arch_wmb_pmem() can flush cpu and memory controller
- * write buffers to guarantee durability.
+ * a subsequent pmem driver flush operation will drain posted write queues.
  */
-static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
-		size_t n)
+static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n)
 {
-	int unwritten;
+	int rem;
 
 	/*
 	 * We are copying between two kernel buffers, if
@@ -40,59 +38,36 @@
 	 * fault) we would have already reported a general protection fault
 	 * before the WARN+BUG.
 	 */
-	unwritten = __copy_from_user_inatomic_nocache((void __force *) dst,
-			(void __user *) src, n);
-	if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n",
-				__func__, dst, src, unwritten))
+	rem = __copy_from_user_inatomic_nocache(dst, (void __user *) src, n);
+	if (WARN(rem, "%s: fault copying %p <- %p unwritten: %d\n",
+				__func__, dst, src, rem))
 		BUG();
 }
 
-static inline int arch_memcpy_from_pmem(void *dst, const void __pmem *src,
-		size_t n)
+static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n)
 {
 	if (static_cpu_has(X86_FEATURE_MCE_RECOVERY))
-		return memcpy_mcsafe(dst, (void __force *) src, n);
-	memcpy(dst, (void __force *) src, n);
+		return memcpy_mcsafe(dst, src, n);
+	memcpy(dst, src, n);
 	return 0;
 }
 
 /**
- * arch_wmb_pmem - synchronize writes to persistent memory
- *
- * After a series of arch_memcpy_to_pmem() operations this drains data
- * from cpu write buffers and any platform (memory controller) buffers
- * to ensure that written data is durable on persistent memory media.
- */
-static inline void arch_wmb_pmem(void)
-{
-	/*
-	 * wmb() to 'sfence' all previous writes such that they are
-	 * architecturally visible to 'pcommit'.  Note, that we've
-	 * already arranged for pmem writes to avoid the cache via
-	 * arch_memcpy_to_pmem().
-	 */
-	wmb();
-	pcommit_sfence();
-}
-
-/**
  * arch_wb_cache_pmem - write back a cache range with CLWB
  * @vaddr:	virtual start address
  * @size:	number of bytes to write back
  *
  * Write back a cache range using the CLWB (cache line write back)
- * instruction.  This function requires explicit ordering with an
- * arch_wmb_pmem() call.
+ * instruction.
  */
-static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size)
+static inline void arch_wb_cache_pmem(void *addr, size_t size)
 {
 	u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
 	unsigned long clflush_mask = x86_clflush_size - 1;
-	void *vaddr = (void __force *)addr;
-	void *vend = vaddr + size;
+	void *vend = addr + size;
 	void *p;
 
-	for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+	for (p = (void *)((unsigned long)addr & ~clflush_mask);
 	     p < vend; p += x86_clflush_size)
 		clwb(p);
 }
@@ -113,16 +88,14 @@
  * @i:		iterator with source data
  *
  * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'.
- * This function requires explicit ordering with an arch_wmb_pmem() call.
  */
-static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
+static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes,
 		struct iov_iter *i)
 {
-	void *vaddr = (void __force *)addr;
 	size_t len;
 
 	/* TODO: skip the write-back by always using non-temporal stores */
-	len = copy_from_iter_nocache(vaddr, bytes, i);
+	len = copy_from_iter_nocache(addr, bytes, i);
 
 	if (__iter_needs_pmem_wb(i))
 		arch_wb_cache_pmem(addr, bytes);
@@ -136,28 +109,16 @@
  * @size:	number of bytes to zero
  *
  * Write zeros into the memory range starting at 'addr' for 'size' bytes.
- * This function requires explicit ordering with an arch_wmb_pmem() call.
  */
-static inline void arch_clear_pmem(void __pmem *addr, size_t size)
+static inline void arch_clear_pmem(void *addr, size_t size)
 {
-	void *vaddr = (void __force *)addr;
-
-	memset(vaddr, 0, size);
+	memset(addr, 0, size);
 	arch_wb_cache_pmem(addr, size);
 }
 
-static inline void arch_invalidate_pmem(void __pmem *addr, size_t size)
+static inline void arch_invalidate_pmem(void *addr, size_t size)
 {
-	clflush_cache_range((void __force *) addr, size);
-}
-
-static inline bool __arch_has_wmb_pmem(void)
-{
-	/*
-	 * We require that wmb() be an 'sfence', that is only guaranteed on
-	 * 64-bit builds
-	 */
-	return static_cpu_has(X86_FEATURE_PCOMMIT);
+	clflush_cache_range(addr, size);
 }
 #endif /* CONFIG_ARCH_HAS_PMEM_API */
 #endif /* __ASM_X86_PMEM_H__ */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index d96d043..587d791 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -253,52 +253,6 @@
 		: [pax] "a" (p));
 }
 
-/**
- * pcommit_sfence() - persistent commit and fence
- *
- * The PCOMMIT instruction ensures that data that has been flushed from the
- * processor's cache hierarchy with CLWB, CLFLUSHOPT or CLFLUSH is accepted to
- * memory and is durable on the DIMM.  The primary use case for this is
- * persistent memory.
- *
- * This function shows how to properly use CLWB/CLFLUSHOPT/CLFLUSH and PCOMMIT
- * with appropriate fencing.
- *
- * Example:
- * void flush_and_commit_buffer(void *vaddr, unsigned int size)
- * {
- *         unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
- *         void *vend = vaddr + size;
- *         void *p;
- *
- *         for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
- *              p < vend; p += boot_cpu_data.x86_clflush_size)
- *                 clwb(p);
- *
- *         // SFENCE to order CLWB/CLFLUSHOPT/CLFLUSH cache flushes
- *         // MFENCE via mb() also works
- *         wmb();
- *
- *         // PCOMMIT and the required SFENCE for ordering
- *         pcommit_sfence();
- * }
- *
- * After this function completes the data pointed to by 'vaddr' has been
- * accepted to memory and will be durable if the 'vaddr' points to persistent
- * memory.
- *
- * PCOMMIT must always be ordered by an MFENCE or SFENCE, so to help simplify
- * things we include both the PCOMMIT and the required SFENCE in the
- * alternatives generated by pcommit_sfence().
- */
-static inline void pcommit_sfence(void)
-{
-	alternative(ASM_NOP7,
-		    ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */
-		    "sfence",
-		    X86_FEATURE_PCOMMIT);
-}
-
 #define nop() asm volatile ("nop")
 
 
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 14c63c7..a002b07 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -72,7 +72,6 @@
 #define SECONDARY_EXEC_SHADOW_VMCS              0x00004000
 #define SECONDARY_EXEC_ENABLE_PML               0x00020000
 #define SECONDARY_EXEC_XSAVES			0x00100000
-#define SECONDARY_EXEC_PCOMMIT			0x00200000
 #define SECONDARY_EXEC_TSC_SCALING              0x02000000
 
 #define PIN_BASED_EXT_INTR_MASK                 0x00000001
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 5b15d94..37fee27 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -78,7 +78,6 @@
 #define EXIT_REASON_PML_FULL            62
 #define EXIT_REASON_XSAVES              63
 #define EXIT_REASON_XRSTORS             64
-#define EXIT_REASON_PCOMMIT             65
 
 #define VMX_EXIT_REASONS \
 	{ EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
@@ -127,8 +126,7 @@
 	{ EXIT_REASON_INVVPID,               "INVVPID" }, \
 	{ EXIT_REASON_INVPCID,               "INVPCID" }, \
 	{ EXIT_REASON_XSAVES,                "XSAVES" }, \
-	{ EXIT_REASON_XRSTORS,               "XRSTORS" }, \
-	{ EXIT_REASON_PCOMMIT,               "PCOMMIT" }
+	{ EXIT_REASON_XRSTORS,               "XRSTORS" }
 
 #define VMX_ABORT_SAVE_GUEST_MSR_FAIL        1
 #define VMX_ABORT_LOAD_HOST_MSR_FAIL         4
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 7597b42..6435653 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -366,7 +366,7 @@
 		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
 		F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
 		F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
-		F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT);
+		F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB);
 
 	/* cpuid 0xD.1.eax */
 	const u32 kvm_cpuid_D_1_eax_x86_features =
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index e17a74b..35058c2 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -144,14 +144,6 @@
 	return best && (best->ebx & bit(X86_FEATURE_RTM));
 }
 
-static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-
-	best = kvm_find_cpuid_entry(vcpu, 7, 0);
-	return best && (best->ebx & bit(X86_FEATURE_PCOMMIT));
-}
-
 static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7758680..df07a0a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2707,8 +2707,7 @@
 		SECONDARY_EXEC_APIC_REGISTER_VIRT |
 		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 		SECONDARY_EXEC_WBINVD_EXITING |
-		SECONDARY_EXEC_XSAVES |
-		SECONDARY_EXEC_PCOMMIT;
+		SECONDARY_EXEC_XSAVES;
 
 	if (enable_ept) {
 		/* nested EPT: emulate EPT also to L1 */
@@ -3270,7 +3269,6 @@
 			SECONDARY_EXEC_SHADOW_VMCS |
 			SECONDARY_EXEC_XSAVES |
 			SECONDARY_EXEC_ENABLE_PML |
-			SECONDARY_EXEC_PCOMMIT |
 			SECONDARY_EXEC_TSC_SCALING;
 		if (adjust_vmx_controls(min2, opt2,
 					MSR_IA32_VMX_PROCBASED_CTLS2,
@@ -4858,9 +4856,6 @@
 	if (!enable_pml)
 		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
 
-	/* Currently, we allow L1 guest to directly run pcommit instruction. */
-	exec_control &= ~SECONDARY_EXEC_PCOMMIT;
-
 	return exec_control;
 }
 
@@ -4904,9 +4899,10 @@
 
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 
-	if (cpu_has_secondary_exec_ctrls())
+	if (cpu_has_secondary_exec_ctrls()) {
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
 				vmx_secondary_exec_control(vmx));
+	}
 
 	if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
 		vmcs_write64(EOI_EXIT_BITMAP0, 0);
@@ -7564,13 +7560,6 @@
 	return 1;
 }
 
-static int handle_pcommit(struct kvm_vcpu *vcpu)
-{
-	/* we never catch pcommit instruct for L1 guest. */
-	WARN_ON(1);
-	return 1;
-}
-
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -7621,7 +7610,6 @@
 	[EXIT_REASON_XSAVES]                  = handle_xsaves,
 	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
 	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
-	[EXIT_REASON_PCOMMIT]                 = handle_pcommit,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -7930,8 +7918,6 @@
 		 * the XSS exit bitmap in vmcs12.
 		 */
 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
-	case EXIT_REASON_PCOMMIT:
-		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
 	default:
 		return true;
 	}
@@ -9094,15 +9080,6 @@
 
 	if (cpu_has_secondary_exec_ctrls())
 		vmcs_set_secondary_exec_control(secondary_exec_ctl);
-
-	if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) {
-		if (guest_cpuid_has_pcommit(vcpu))
-			vmx->nested.nested_vmx_secondary_ctls_high |=
-				SECONDARY_EXEC_PCOMMIT;
-		else
-			vmx->nested.nested_vmx_secondary_ctls_high &=
-				~SECONDARY_EXEC_PCOMMIT;
-	}
 }
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -9715,8 +9692,7 @@
 		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 				  SECONDARY_EXEC_RDTSCP |
 				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
-				  SECONDARY_EXEC_PCOMMIT);
+				  SECONDARY_EXEC_APIC_REGISTER_VIRT);
 		if (nested_cpu_has(vmcs12,
 				CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
 			exec_control |= vmcs12->secondary_vm_exec_control;
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index ec378cd..767be7c 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -1012,7 +1012,7 @@
 4: XSAVE
 5: XRSTOR | lfence (11B)
 6: XSAVEOPT | clwb (66) | mfence (11B)
-7: clflush | clflushopt (66) | sfence (11B) | pcommit (66),(11B)
+7: clflush | clflushopt (66) | sfence (11B)
 EndTable
 
 GrpTable: Grp16
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index acad70a..aebd944 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -454,32 +454,7 @@
 
 	  If you are unsure what to do, do not enable this option.
 
-config ACPI_NFIT
-	tristate "ACPI NVDIMM Firmware Interface Table (NFIT)"
-	depends on PHYS_ADDR_T_64BIT
-	depends on BLK_DEV
-	depends on ARCH_HAS_MMIO_FLUSH
-	select LIBNVDIMM
-	help
-	  Infrastructure to probe ACPI 6 compliant platforms for
-	  NVDIMMs (NFIT) and register a libnvdimm device tree.  In
-	  addition to storage devices this also enables libnvdimm to pass
-	  ACPI._DSM messages for platform/dimm configuration.
-
-	  To compile this driver as a module, choose M here:
-	  the module will be called nfit.
-
-config ACPI_NFIT_DEBUG
-	bool "NFIT DSM debug"
-	depends on ACPI_NFIT
-	depends on DYNAMIC_DEBUG
-	default n
-	help
-	  Enabling this option causes the nfit driver to dump the
-	  input and output buffers of _DSM operations on the ACPI0012
-	  device and its children.  This can be very verbose, so leave
-	  it disabled unless you are debugging a hardware / firmware
-	  issue.
+source "drivers/acpi/nfit/Kconfig"
 
 source "drivers/acpi/apei/Kconfig"
 source "drivers/acpi/dptf/Kconfig"
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 88f54f0..35a6ccb 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -69,7 +69,7 @@
 obj-$(CONFIG_ACPI_PROCESSOR)	+= processor.o
 obj-$(CONFIG_ACPI)		+= container.o
 obj-$(CONFIG_ACPI_THERMAL)	+= thermal.o
-obj-$(CONFIG_ACPI_NFIT)		+= nfit.o
+obj-$(CONFIG_ACPI_NFIT)		+= nfit/
 obj-$(CONFIG_ACPI)		+= acpi_memhotplug.o
 obj-$(CONFIG_ACPI_HOTPLUG_IOAPIC) += ioapic.o
 obj-$(CONFIG_ACPI_BATTERY)	+= battery.o
diff --git a/drivers/acpi/nfit/Kconfig b/drivers/acpi/nfit/Kconfig
new file mode 100644
index 0000000..dd0d53c
--- /dev/null
+++ b/drivers/acpi/nfit/Kconfig
@@ -0,0 +1,26 @@
+config ACPI_NFIT
+	tristate "ACPI NVDIMM Firmware Interface Table (NFIT)"
+	depends on PHYS_ADDR_T_64BIT
+	depends on BLK_DEV
+	depends on ARCH_HAS_MMIO_FLUSH
+	select LIBNVDIMM
+	help
+	  Infrastructure to probe ACPI 6 compliant platforms for
+	  NVDIMMs (NFIT) and register a libnvdimm device tree.  In
+	  addition to storage devices this also enables libnvdimm to pass
+	  ACPI._DSM messages for platform/dimm configuration.
+
+	  To compile this driver as a module, choose M here:
+	  the module will be called nfit.
+
+config ACPI_NFIT_DEBUG
+	bool "NFIT DSM debug"
+	depends on ACPI_NFIT
+	depends on DYNAMIC_DEBUG
+	default n
+	help
+	  Enabling this option causes the nfit driver to dump the
+	  input and output buffers of _DSM operations on the ACPI0012
+	  device and its children.  This can be very verbose, so leave
+	  it disabled unless you are debugging a hardware / firmware
+	  issue.
diff --git a/drivers/acpi/nfit/Makefile b/drivers/acpi/nfit/Makefile
new file mode 100644
index 0000000..a407e76
--- /dev/null
+++ b/drivers/acpi/nfit/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_ACPI_NFIT) := nfit.o
+nfit-y := core.o
+nfit-$(CONFIG_X86_MCE) += mce.o
diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit/core.c
similarity index 86%
rename from drivers/acpi/nfit.c
rename to drivers/acpi/nfit/core.c
index 1f0e060..8c234dd 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit/core.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/ndctl.h>
+#include <linux/sysfs.h>
 #include <linux/delay.h>
 #include <linux/list.h>
 #include <linux/acpi.h>
@@ -50,6 +51,9 @@
 MODULE_PARM_DESC(disable_vendor_specific,
 		"Limit commands to the publicly specified set\n");
 
+LIST_HEAD(acpi_descs);
+DEFINE_MUTEX(acpi_desc_lock);
+
 static struct workqueue_struct *nfit_wq;
 
 struct nfit_table_prev {
@@ -360,7 +364,7 @@
 	return to_name[type];
 }
 
-static int nfit_spa_type(struct acpi_nfit_system_address *spa)
+int nfit_spa_type(struct acpi_nfit_system_address *spa)
 {
 	int i;
 
@@ -374,22 +378,25 @@
 		struct nfit_table_prev *prev,
 		struct acpi_nfit_system_address *spa)
 {
-	size_t length = min_t(size_t, sizeof(*spa), spa->header.length);
 	struct device *dev = acpi_desc->dev;
 	struct nfit_spa *nfit_spa;
 
+	if (spa->header.length != sizeof(*spa))
+		return false;
+
 	list_for_each_entry(nfit_spa, &prev->spas, list) {
-		if (memcmp(nfit_spa->spa, spa, length) == 0) {
+		if (memcmp(nfit_spa->spa, spa, sizeof(*spa)) == 0) {
 			list_move_tail(&nfit_spa->list, &acpi_desc->spas);
 			return true;
 		}
 	}
 
-	nfit_spa = devm_kzalloc(dev, sizeof(*nfit_spa), GFP_KERNEL);
+	nfit_spa = devm_kzalloc(dev, sizeof(*nfit_spa) + sizeof(*spa),
+			GFP_KERNEL);
 	if (!nfit_spa)
 		return false;
 	INIT_LIST_HEAD(&nfit_spa->list);
-	nfit_spa->spa = spa;
+	memcpy(nfit_spa->spa, spa, sizeof(*spa));
 	list_add_tail(&nfit_spa->list, &acpi_desc->spas);
 	dev_dbg(dev, "%s: spa index: %d type: %s\n", __func__,
 			spa->range_index,
@@ -401,21 +408,24 @@
 		struct nfit_table_prev *prev,
 		struct acpi_nfit_memory_map *memdev)
 {
-	size_t length = min_t(size_t, sizeof(*memdev), memdev->header.length);
 	struct device *dev = acpi_desc->dev;
 	struct nfit_memdev *nfit_memdev;
 
+	if (memdev->header.length != sizeof(*memdev))
+		return false;
+
 	list_for_each_entry(nfit_memdev, &prev->memdevs, list)
-		if (memcmp(nfit_memdev->memdev, memdev, length) == 0) {
+		if (memcmp(nfit_memdev->memdev, memdev, sizeof(*memdev)) == 0) {
 			list_move_tail(&nfit_memdev->list, &acpi_desc->memdevs);
 			return true;
 		}
 
-	nfit_memdev = devm_kzalloc(dev, sizeof(*nfit_memdev), GFP_KERNEL);
+	nfit_memdev = devm_kzalloc(dev, sizeof(*nfit_memdev) + sizeof(*memdev),
+			GFP_KERNEL);
 	if (!nfit_memdev)
 		return false;
 	INIT_LIST_HEAD(&nfit_memdev->list);
-	nfit_memdev->memdev = memdev;
+	memcpy(nfit_memdev->memdev, memdev, sizeof(*memdev));
 	list_add_tail(&nfit_memdev->list, &acpi_desc->memdevs);
 	dev_dbg(dev, "%s: memdev handle: %#x spa: %d dcr: %d\n",
 			__func__, memdev->device_handle, memdev->range_index,
@@ -423,25 +433,42 @@
 	return true;
 }
 
+/*
+ * An implementation may provide a truncated control region if no block windows
+ * are defined.
+ */
+static size_t sizeof_dcr(struct acpi_nfit_control_region *dcr)
+{
+	if (dcr->header.length < offsetof(struct acpi_nfit_control_region,
+				window_size))
+		return 0;
+	if (dcr->windows)
+		return sizeof(*dcr);
+	return offsetof(struct acpi_nfit_control_region, window_size);
+}
+
 static bool add_dcr(struct acpi_nfit_desc *acpi_desc,
 		struct nfit_table_prev *prev,
 		struct acpi_nfit_control_region *dcr)
 {
-	size_t length = min_t(size_t, sizeof(*dcr), dcr->header.length);
 	struct device *dev = acpi_desc->dev;
 	struct nfit_dcr *nfit_dcr;
 
+	if (!sizeof_dcr(dcr))
+		return false;
+
 	list_for_each_entry(nfit_dcr, &prev->dcrs, list)
-		if (memcmp(nfit_dcr->dcr, dcr, length) == 0) {
+		if (memcmp(nfit_dcr->dcr, dcr, sizeof_dcr(dcr)) == 0) {
 			list_move_tail(&nfit_dcr->list, &acpi_desc->dcrs);
 			return true;
 		}
 
-	nfit_dcr = devm_kzalloc(dev, sizeof(*nfit_dcr), GFP_KERNEL);
+	nfit_dcr = devm_kzalloc(dev, sizeof(*nfit_dcr) + sizeof(*dcr),
+			GFP_KERNEL);
 	if (!nfit_dcr)
 		return false;
 	INIT_LIST_HEAD(&nfit_dcr->list);
-	nfit_dcr->dcr = dcr;
+	memcpy(nfit_dcr->dcr, dcr, sizeof_dcr(dcr));
 	list_add_tail(&nfit_dcr->list, &acpi_desc->dcrs);
 	dev_dbg(dev, "%s: dcr index: %d windows: %d\n", __func__,
 			dcr->region_index, dcr->windows);
@@ -452,71 +479,102 @@
 		struct nfit_table_prev *prev,
 		struct acpi_nfit_data_region *bdw)
 {
-	size_t length = min_t(size_t, sizeof(*bdw), bdw->header.length);
 	struct device *dev = acpi_desc->dev;
 	struct nfit_bdw *nfit_bdw;
 
+	if (bdw->header.length != sizeof(*bdw))
+		return false;
 	list_for_each_entry(nfit_bdw, &prev->bdws, list)
-		if (memcmp(nfit_bdw->bdw, bdw, length) == 0) {
+		if (memcmp(nfit_bdw->bdw, bdw, sizeof(*bdw)) == 0) {
 			list_move_tail(&nfit_bdw->list, &acpi_desc->bdws);
 			return true;
 		}
 
-	nfit_bdw = devm_kzalloc(dev, sizeof(*nfit_bdw), GFP_KERNEL);
+	nfit_bdw = devm_kzalloc(dev, sizeof(*nfit_bdw) + sizeof(*bdw),
+			GFP_KERNEL);
 	if (!nfit_bdw)
 		return false;
 	INIT_LIST_HEAD(&nfit_bdw->list);
-	nfit_bdw->bdw = bdw;
+	memcpy(nfit_bdw->bdw, bdw, sizeof(*bdw));
 	list_add_tail(&nfit_bdw->list, &acpi_desc->bdws);
 	dev_dbg(dev, "%s: bdw dcr: %d windows: %d\n", __func__,
 			bdw->region_index, bdw->windows);
 	return true;
 }
 
+static size_t sizeof_idt(struct acpi_nfit_interleave *idt)
+{
+	if (idt->header.length < sizeof(*idt))
+		return 0;
+	return sizeof(*idt) + sizeof(u32) * (idt->line_count - 1);
+}
+
 static bool add_idt(struct acpi_nfit_desc *acpi_desc,
 		struct nfit_table_prev *prev,
 		struct acpi_nfit_interleave *idt)
 {
-	size_t length = min_t(size_t, sizeof(*idt), idt->header.length);
 	struct device *dev = acpi_desc->dev;
 	struct nfit_idt *nfit_idt;
 
-	list_for_each_entry(nfit_idt, &prev->idts, list)
-		if (memcmp(nfit_idt->idt, idt, length) == 0) {
+	if (!sizeof_idt(idt))
+		return false;
+
+	list_for_each_entry(nfit_idt, &prev->idts, list) {
+		if (sizeof_idt(nfit_idt->idt) != sizeof_idt(idt))
+			continue;
+
+		if (memcmp(nfit_idt->idt, idt, sizeof_idt(idt)) == 0) {
 			list_move_tail(&nfit_idt->list, &acpi_desc->idts);
 			return true;
 		}
+	}
 
-	nfit_idt = devm_kzalloc(dev, sizeof(*nfit_idt), GFP_KERNEL);
+	nfit_idt = devm_kzalloc(dev, sizeof(*nfit_idt) + sizeof_idt(idt),
+			GFP_KERNEL);
 	if (!nfit_idt)
 		return false;
 	INIT_LIST_HEAD(&nfit_idt->list);
-	nfit_idt->idt = idt;
+	memcpy(nfit_idt->idt, idt, sizeof_idt(idt));
 	list_add_tail(&nfit_idt->list, &acpi_desc->idts);
 	dev_dbg(dev, "%s: idt index: %d num_lines: %d\n", __func__,
 			idt->interleave_index, idt->line_count);
 	return true;
 }
 
+static size_t sizeof_flush(struct acpi_nfit_flush_address *flush)
+{
+	if (flush->header.length < sizeof(*flush))
+		return 0;
+	return sizeof(*flush) + sizeof(u64) * (flush->hint_count - 1);
+}
+
 static bool add_flush(struct acpi_nfit_desc *acpi_desc,
 		struct nfit_table_prev *prev,
 		struct acpi_nfit_flush_address *flush)
 {
-	size_t length = min_t(size_t, sizeof(*flush), flush->header.length);
 	struct device *dev = acpi_desc->dev;
 	struct nfit_flush *nfit_flush;
 
-	list_for_each_entry(nfit_flush, &prev->flushes, list)
-		if (memcmp(nfit_flush->flush, flush, length) == 0) {
+	if (!sizeof_flush(flush))
+		return false;
+
+	list_for_each_entry(nfit_flush, &prev->flushes, list) {
+		if (sizeof_flush(nfit_flush->flush) != sizeof_flush(flush))
+			continue;
+
+		if (memcmp(nfit_flush->flush, flush,
+					sizeof_flush(flush)) == 0) {
 			list_move_tail(&nfit_flush->list, &acpi_desc->flushes);
 			return true;
 		}
+	}
 
-	nfit_flush = devm_kzalloc(dev, sizeof(*nfit_flush), GFP_KERNEL);
+	nfit_flush = devm_kzalloc(dev, sizeof(*nfit_flush)
+			+ sizeof_flush(flush), GFP_KERNEL);
 	if (!nfit_flush)
 		return false;
 	INIT_LIST_HEAD(&nfit_flush->list);
-	nfit_flush->flush = flush;
+	memcpy(nfit_flush->flush, flush, sizeof_flush(flush));
 	list_add_tail(&nfit_flush->list, &acpi_desc->flushes);
 	dev_dbg(dev, "%s: nfit_flush handle: %d hint_count: %d\n", __func__,
 			flush->device_handle, flush->hint_count);
@@ -614,7 +672,6 @@
 {
 	u16 dcr = __to_nfit_memdev(nfit_mem)->region_index;
 	struct nfit_memdev *nfit_memdev;
-	struct nfit_flush *nfit_flush;
 	struct nfit_bdw *nfit_bdw;
 	struct nfit_idt *nfit_idt;
 	u16 idt_idx, range_index;
@@ -647,14 +704,6 @@
 			nfit_mem->idt_bdw = nfit_idt->idt;
 			break;
 		}
-
-		list_for_each_entry(nfit_flush, &acpi_desc->flushes, list) {
-			if (nfit_flush->flush->device_handle !=
-					nfit_memdev->memdev->device_handle)
-				continue;
-			nfit_mem->nfit_flush = nfit_flush;
-			break;
-		}
 		break;
 	}
 }
@@ -675,6 +724,7 @@
 	}
 
 	list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
+		struct nfit_flush *nfit_flush;
 		struct nfit_dcr *nfit_dcr;
 		u32 device_handle;
 		u16 dcr;
@@ -721,6 +771,28 @@
 			break;
 		}
 
+		list_for_each_entry(nfit_flush, &acpi_desc->flushes, list) {
+			struct acpi_nfit_flush_address *flush;
+			u16 i;
+
+			if (nfit_flush->flush->device_handle != device_handle)
+				continue;
+			nfit_mem->nfit_flush = nfit_flush;
+			flush = nfit_flush->flush;
+			nfit_mem->flush_wpq = devm_kzalloc(acpi_desc->dev,
+					flush->hint_count
+					* sizeof(struct resource), GFP_KERNEL);
+			if (!nfit_mem->flush_wpq)
+				return -ENOMEM;
+			for (i = 0; i < flush->hint_count; i++) {
+				struct resource *res = &nfit_mem->flush_wpq[i];
+
+				res->start = flush->hint_address[i];
+				res->end = res->start + 8 - 1;
+			}
+			break;
+		}
+
 		if (dcr && !nfit_mem->dcr) {
 			dev_err(acpi_desc->dev, "SPA %d missing DCR %d\n",
 					spa->range_index, dcr);
@@ -806,14 +878,85 @@
 }
 static DEVICE_ATTR_RO(revision);
 
+/*
+ * This shows the number of full Address Range Scrubs that have been
+ * completed since driver load time. Userspace can wait on this using
+ * select/poll etc. A '+' at the end indicates an ARS is in progress
+ */
+static ssize_t scrub_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvdimm_bus_descriptor *nd_desc;
+	ssize_t rc = -ENXIO;
+
+	device_lock(dev);
+	nd_desc = dev_get_drvdata(dev);
+	if (nd_desc) {
+		struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
+
+		rc = sprintf(buf, "%d%s", acpi_desc->scrub_count,
+				(work_busy(&acpi_desc->work)) ? "+\n" : "\n");
+	}
+	device_unlock(dev);
+	return rc;
+}
+
+static ssize_t scrub_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t size)
+{
+	struct nvdimm_bus_descriptor *nd_desc;
+	ssize_t rc;
+	long val;
+
+	rc = kstrtol(buf, 0, &val);
+	if (rc)
+		return rc;
+	if (val != 1)
+		return -EINVAL;
+
+	device_lock(dev);
+	nd_desc = dev_get_drvdata(dev);
+	if (nd_desc) {
+		struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
+
+		rc = acpi_nfit_ars_rescan(acpi_desc);
+	}
+	device_unlock(dev);
+	if (rc)
+		return rc;
+	return size;
+}
+static DEVICE_ATTR_RW(scrub);
+
+static bool ars_supported(struct nvdimm_bus *nvdimm_bus)
+{
+	struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
+	const unsigned long mask = 1 << ND_CMD_ARS_CAP | 1 << ND_CMD_ARS_START
+		| 1 << ND_CMD_ARS_STATUS;
+
+	return (nd_desc->cmd_mask & mask) == mask;
+}
+
+static umode_t nfit_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+
+	if (a == &dev_attr_scrub.attr && !ars_supported(nvdimm_bus))
+		return 0;
+	return a->mode;
+}
+
 static struct attribute *acpi_nfit_attributes[] = {
 	&dev_attr_revision.attr,
+	&dev_attr_scrub.attr,
 	NULL,
 };
 
 static struct attribute_group acpi_nfit_attribute_group = {
 	.name = "nfit",
 	.attrs = acpi_nfit_attributes,
+	.is_visible = nfit_visible,
 };
 
 static const struct attribute_group *acpi_nfit_attribute_groups[] = {
@@ -1130,11 +1273,11 @@
 	}
 
 	/*
-	 * Until standardization materializes we need to consider up to 3
+	 * Until standardization materializes we need to consider 4
 	 * different command sets.  Note, that checking for function0 (bit0)
 	 * tells us if any commands are reachable through this uuid.
 	 */
-	for (i = NVDIMM_FAMILY_INTEL; i <= NVDIMM_FAMILY_HPE2; i++)
+	for (i = NVDIMM_FAMILY_INTEL; i <= NVDIMM_FAMILY_MSFT; i++)
 		if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1))
 			break;
 
@@ -1144,12 +1287,14 @@
 		dsm_mask = 0x3fe;
 		if (disable_vendor_specific)
 			dsm_mask &= ~(1 << ND_CMD_VENDOR);
-	} else if (nfit_mem->family == NVDIMM_FAMILY_HPE1)
+	} else if (nfit_mem->family == NVDIMM_FAMILY_HPE1) {
 		dsm_mask = 0x1c3c76;
-	else if (nfit_mem->family == NVDIMM_FAMILY_HPE2) {
+	} else if (nfit_mem->family == NVDIMM_FAMILY_HPE2) {
 		dsm_mask = 0x1fe;
 		if (disable_vendor_specific)
 			dsm_mask &= ~(1 << 8);
+	} else if (nfit_mem->family == NVDIMM_FAMILY_MSFT) {
+		dsm_mask = 0xffffffff;
 	} else {
 		dev_dbg(dev, "unknown dimm command family\n");
 		nfit_mem->family = -1;
@@ -1171,6 +1316,7 @@
 	int dimm_count = 0;
 
 	list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) {
+		struct acpi_nfit_flush_address *flush;
 		unsigned long flags = 0, cmd_mask;
 		struct nvdimm *nvdimm;
 		u32 device_handle;
@@ -1204,9 +1350,12 @@
 		if (nfit_mem->family == NVDIMM_FAMILY_INTEL)
 			cmd_mask |= nfit_mem->dsm_mask;
 
+		flush = nfit_mem->nfit_flush ? nfit_mem->nfit_flush->flush
+			: NULL;
 		nvdimm = nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem,
 				acpi_nfit_dimm_attribute_groups,
-				flags, cmd_mask);
+				flags, cmd_mask, flush ? flush->hint_count : 0,
+				nfit_mem->flush_wpq);
 		if (!nvdimm)
 			return -ENOMEM;
 
@@ -1374,24 +1523,6 @@
 	return mmio->base_offset + line_offset + table_offset + sub_line_offset;
 }
 
-static void wmb_blk(struct nfit_blk *nfit_blk)
-{
-
-	if (nfit_blk->nvdimm_flush) {
-		/*
-		 * The first wmb() is needed to 'sfence' all previous writes
-		 * such that they are architecturally visible for the platform
-		 * buffer flush.  Note that we've already arranged for pmem
-		 * writes to avoid the cache via arch_memcpy_to_pmem().  The
-		 * final wmb() ensures ordering for the NVDIMM flush write.
-		 */
-		wmb();
-		writeq(1, nfit_blk->nvdimm_flush);
-		wmb();
-	} else
-		wmb_pmem();
-}
-
 static u32 read_blk_stat(struct nfit_blk *nfit_blk, unsigned int bw)
 {
 	struct nfit_blk_mmio *mmio = &nfit_blk->mmio[DCR];
@@ -1426,7 +1557,7 @@
 		offset = to_interleave_offset(offset, mmio);
 
 	writeq(cmd, mmio->addr.base + offset);
-	wmb_blk(nfit_blk);
+	nvdimm_flush(nfit_blk->nd_region);
 
 	if (nfit_blk->dimm_flags & NFIT_BLK_DCR_LATCH)
 		readq(mmio->addr.base + offset);
@@ -1477,7 +1608,7 @@
 	}
 
 	if (rw)
-		wmb_blk(nfit_blk);
+		nvdimm_flush(nfit_blk->nd_region);
 
 	rc = read_blk_stat(nfit_blk, lane) ? -EIO : 0;
 	return rc;
@@ -1509,125 +1640,6 @@
 	return rc;
 }
 
-static void nfit_spa_mapping_release(struct kref *kref)
-{
-	struct nfit_spa_mapping *spa_map = to_spa_map(kref);
-	struct acpi_nfit_system_address *spa = spa_map->spa;
-	struct acpi_nfit_desc *acpi_desc = spa_map->acpi_desc;
-
-	WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex));
-	dev_dbg(acpi_desc->dev, "%s: SPA%d\n", __func__, spa->range_index);
-	if (spa_map->type == SPA_MAP_APERTURE)
-		memunmap((void __force *)spa_map->addr.aperture);
-	else
-		iounmap(spa_map->addr.base);
-	release_mem_region(spa->address, spa->length);
-	list_del(&spa_map->list);
-	kfree(spa_map);
-}
-
-static struct nfit_spa_mapping *find_spa_mapping(
-		struct acpi_nfit_desc *acpi_desc,
-		struct acpi_nfit_system_address *spa)
-{
-	struct nfit_spa_mapping *spa_map;
-
-	WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex));
-	list_for_each_entry(spa_map, &acpi_desc->spa_maps, list)
-		if (spa_map->spa == spa)
-			return spa_map;
-
-	return NULL;
-}
-
-static void nfit_spa_unmap(struct acpi_nfit_desc *acpi_desc,
-		struct acpi_nfit_system_address *spa)
-{
-	struct nfit_spa_mapping *spa_map;
-
-	mutex_lock(&acpi_desc->spa_map_mutex);
-	spa_map = find_spa_mapping(acpi_desc, spa);
-
-	if (spa_map)
-		kref_put(&spa_map->kref, nfit_spa_mapping_release);
-	mutex_unlock(&acpi_desc->spa_map_mutex);
-}
-
-static void __iomem *__nfit_spa_map(struct acpi_nfit_desc *acpi_desc,
-		struct acpi_nfit_system_address *spa, enum spa_map_type type)
-{
-	resource_size_t start = spa->address;
-	resource_size_t n = spa->length;
-	struct nfit_spa_mapping *spa_map;
-	struct resource *res;
-
-	WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex));
-
-	spa_map = find_spa_mapping(acpi_desc, spa);
-	if (spa_map) {
-		kref_get(&spa_map->kref);
-		return spa_map->addr.base;
-	}
-
-	spa_map = kzalloc(sizeof(*spa_map), GFP_KERNEL);
-	if (!spa_map)
-		return NULL;
-
-	INIT_LIST_HEAD(&spa_map->list);
-	spa_map->spa = spa;
-	kref_init(&spa_map->kref);
-	spa_map->acpi_desc = acpi_desc;
-
-	res = request_mem_region(start, n, dev_name(acpi_desc->dev));
-	if (!res)
-		goto err_mem;
-
-	spa_map->type = type;
-	if (type == SPA_MAP_APERTURE)
-		spa_map->addr.aperture = (void __pmem *)memremap(start, n,
-							ARCH_MEMREMAP_PMEM);
-	else
-		spa_map->addr.base = ioremap_nocache(start, n);
-
-
-	if (!spa_map->addr.base)
-		goto err_map;
-
-	list_add_tail(&spa_map->list, &acpi_desc->spa_maps);
-	return spa_map->addr.base;
-
- err_map:
-	release_mem_region(start, n);
- err_mem:
-	kfree(spa_map);
-	return NULL;
-}
-
-/**
- * nfit_spa_map - interleave-aware managed-mappings of acpi_nfit_system_address ranges
- * @nvdimm_bus: NFIT-bus that provided the spa table entry
- * @nfit_spa: spa table to map
- * @type: aperture or control region
- *
- * In the case where block-data-window apertures and
- * dimm-control-regions are interleaved they will end up sharing a
- * single request_mem_region() + ioremap() for the address range.  In
- * the style of devm nfit_spa_map() mappings are automatically dropped
- * when all region devices referencing the same mapping are disabled /
- * unbound.
- */
-static void __iomem *nfit_spa_map(struct acpi_nfit_desc *acpi_desc,
-		struct acpi_nfit_system_address *spa, enum spa_map_type type)
-{
-	void __iomem *iomem;
-
-	mutex_lock(&acpi_desc->spa_map_mutex);
-	iomem = __nfit_spa_map(acpi_desc, spa, type);
-	mutex_unlock(&acpi_desc->spa_map_mutex);
-
-	return iomem;
-}
-
 static int nfit_blk_init_interleave(struct nfit_blk_mmio *mmio,
 		struct acpi_nfit_interleave *idt, u16 interleave_ways)
 {
@@ -1669,9 +1681,7 @@
 		struct device *dev)
 {
 	struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
-	struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
 	struct nd_blk_region *ndbr = to_nd_blk_region(dev);
-	struct nfit_flush *nfit_flush;
 	struct nfit_blk_mmio *mmio;
 	struct nfit_blk *nfit_blk;
 	struct nfit_mem *nfit_mem;
@@ -1697,8 +1707,8 @@
 	/* map block aperture memory */
 	nfit_blk->bdw_offset = nfit_mem->bdw->offset;
 	mmio = &nfit_blk->mmio[BDW];
-	mmio->addr.base = nfit_spa_map(acpi_desc, nfit_mem->spa_bdw,
-			SPA_MAP_APERTURE);
+	mmio->addr.base = devm_nvdimm_memremap(dev, nfit_mem->spa_bdw->address,
+                        nfit_mem->spa_bdw->length, ARCH_MEMREMAP_PMEM);
 	if (!mmio->addr.base) {
 		dev_dbg(dev, "%s: %s failed to map bdw\n", __func__,
 				nvdimm_name(nvdimm));
@@ -1720,8 +1730,8 @@
 	nfit_blk->cmd_offset = nfit_mem->dcr->command_offset;
 	nfit_blk->stat_offset = nfit_mem->dcr->status_offset;
 	mmio = &nfit_blk->mmio[DCR];
-	mmio->addr.base = nfit_spa_map(acpi_desc, nfit_mem->spa_dcr,
-			SPA_MAP_CONTROL);
+	mmio->addr.base = devm_nvdimm_ioremap(dev, nfit_mem->spa_dcr->address,
+			nfit_mem->spa_dcr->length);
 	if (!mmio->addr.base) {
 		dev_dbg(dev, "%s: %s failed to map dcr\n", __func__,
 				nvdimm_name(nvdimm));
@@ -1746,15 +1756,7 @@
 		return rc;
 	}
 
-	nfit_flush = nfit_mem->nfit_flush;
-	if (nfit_flush && nfit_flush->flush->hint_count != 0) {
-		nfit_blk->nvdimm_flush = devm_ioremap_nocache(dev,
-				nfit_flush->flush->hint_address[0], 8);
-		if (!nfit_blk->nvdimm_flush)
-			return -ENOMEM;
-	}
-
-	if (!arch_has_wmb_pmem() && !nfit_blk->nvdimm_flush)
+	if (nvdimm_has_flush(nfit_blk->nd_region) < 0)
 		dev_warn(dev, "unable to guarantee persistence of writes\n");
 
 	if (mmio->line_size == 0)
@@ -1773,29 +1775,6 @@
 	return 0;
 }
 
-static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus,
-		struct device *dev)
-{
-	struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
-	struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
-	struct nd_blk_region *ndbr = to_nd_blk_region(dev);
-	struct nfit_blk *nfit_blk = nd_blk_region_provider_data(ndbr);
-	int i;
-
-	if (!nfit_blk)
-		return; /* never enabled */
-
-	/* auto-free BLK spa mappings */
-	for (i = 0; i < 2; i++) {
-		struct nfit_blk_mmio *mmio = &nfit_blk->mmio[i];
-
-		if (mmio->addr.base)
-			nfit_spa_unmap(acpi_desc, mmio->spa);
-	}
-	nd_blk_region_set_provider_data(ndbr, NULL);
-	/* devm will free nfit_blk */
-}
-
 static int ars_get_cap(struct acpi_nfit_desc *acpi_desc,
 		struct nd_cmd_ars_cap *cmd, struct nfit_spa *nfit_spa)
 {
@@ -1919,11 +1898,11 @@
 	if (ret)
 		return ret;
 
-	ret = devm_add_action(acpi_desc->dev, acpi_nfit_remove_resource, res);
-	if (ret) {
-		remove_resource(res);
+	ret = devm_add_action_or_reset(acpi_desc->dev,
+					acpi_nfit_remove_resource,
+					res);
+	if (ret)
 		return ret;
-	}
 
 	return 0;
 }
@@ -1969,7 +1948,6 @@
 		ndr_desc->num_mappings = blk_valid;
 		ndbr_desc = to_blk_region_desc(ndr_desc);
 		ndbr_desc->enable = acpi_nfit_blk_region_enable;
-		ndbr_desc->disable = acpi_nfit_blk_region_disable;
 		ndbr_desc->do_io = acpi_desc->blk_do_io;
 		nfit_spa->nd_region = nvdimm_blk_region_create(acpi_desc->nvdimm_bus,
 				ndr_desc);
@@ -1981,6 +1959,14 @@
 	return 0;
 }
 
+static bool nfit_spa_is_virtual(struct acpi_nfit_system_address *spa)
+{
+	int type = nfit_spa_type(spa);
+
+	return type == NFIT_SPA_VDISK || type == NFIT_SPA_VCD ||
+		type == NFIT_SPA_PDISK || type == NFIT_SPA_PCD;
+}
+
 static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 		struct nfit_spa *nfit_spa)
 {
@@ -1996,7 +1982,7 @@
 	if (nfit_spa->nd_region)
 		return 0;
 
-	if (spa->range_index == 0) {
+	if (spa->range_index == 0 && !nfit_spa_is_virtual(spa)) {
 		dev_dbg(acpi_desc->dev, "%s: detected invalid spa index\n",
 				__func__);
 		return 0;
@@ -2060,6 +2046,11 @@
 				ndr_desc);
 		if (!nfit_spa->nd_region)
 			rc = -ENOMEM;
+	} else if (nfit_spa_is_virtual(spa)) {
+		nfit_spa->nd_region = nvdimm_pmem_region_create(nvdimm_bus,
+				ndr_desc);
+		if (!nfit_spa->nd_region)
+			rc = -ENOMEM;
 	}
 
  out:
@@ -2139,7 +2130,7 @@
 	unsigned int tmo = scrub_timeout;
 	int rc;
 
-	if (nfit_spa->ars_done || !nfit_spa->nd_region)
+	if (!nfit_spa->ars_required || !nfit_spa->nd_region)
 		return;
 
 	rc = ars_start(acpi_desc, nfit_spa);
@@ -2228,7 +2219,9 @@
 	 * firmware initiated scrubs to complete and then we go search for the
 	 * affected spa regions to mark them scanned.  In the second phase we
 	 * initiate a directed scrub for every range that was not scrubbed in
-	 * phase 1.
+	 * phase 1. If we're called for a 'rescan', we harmlessly pass through
+	 * the first phase, but really only care about running phase 2, where
+	 * regions can be notified of new poison.
 	 */
 
 	/* process platform firmware initiated scrubs */
@@ -2331,14 +2324,17 @@
 		 * Flag all the ranges that still need scrubbing, but
 		 * register them now to make data available.
 		 */
-		if (nfit_spa->nd_region)
-			nfit_spa->ars_done = 1;
-		else
+		if (!nfit_spa->nd_region) {
+			nfit_spa->ars_required = 1;
 			acpi_nfit_register_region(acpi_desc, nfit_spa);
+		}
 	}
 
 	list_for_each_entry(nfit_spa, &acpi_desc->spas, list)
 		acpi_nfit_async_scrub(acpi_desc, nfit_spa);
+	acpi_desc->scrub_count++;
+	if (acpi_desc->scrub_count_state)
+		sysfs_notify_dirent(acpi_desc->scrub_count_state);
 	mutex_unlock(&acpi_desc->init_mutex);
 }
 
@@ -2376,14 +2372,89 @@
 	return 0;
 }
 
-int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz)
+static int acpi_nfit_desc_init_scrub_attr(struct acpi_nfit_desc *acpi_desc)
+{
+	struct device *dev = acpi_desc->dev;
+	struct kernfs_node *nfit;
+	struct device *bus_dev;
+
+	if (!ars_supported(acpi_desc->nvdimm_bus))
+		return 0;
+
+	bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus);
+	nfit = sysfs_get_dirent(bus_dev->kobj.sd, "nfit");
+	if (!nfit) {
+		dev_err(dev, "sysfs_get_dirent 'nfit' failed\n");
+		return -ENODEV;
+	}
+	acpi_desc->scrub_count_state = sysfs_get_dirent(nfit, "scrub");
+	sysfs_put(nfit);
+	if (!acpi_desc->scrub_count_state) {
+		dev_err(dev, "sysfs_get_dirent 'scrub' failed\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+static void acpi_nfit_destruct(void *data)
+{
+	struct acpi_nfit_desc *acpi_desc = data;
+	struct device *bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus);
+
+	/*
+	 * Destruct under acpi_desc_lock so that nfit_handle_mce does not
+	 * race teardown
+	 */
+	mutex_lock(&acpi_desc_lock);
+	acpi_desc->cancel = 1;
+	/*
+	 * Bounce the nvdimm bus lock to make sure any in-flight
+	 * acpi_nfit_ars_rescan() submissions have had a chance to
+	 * either submit or see ->cancel set.
+	 */
+	device_lock(bus_dev);
+	device_unlock(bus_dev);
+
+	flush_workqueue(nfit_wq);
+	if (acpi_desc->scrub_count_state)
+		sysfs_put(acpi_desc->scrub_count_state);
+	nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
+	acpi_desc->nvdimm_bus = NULL;
+	list_del(&acpi_desc->list);
+	mutex_unlock(&acpi_desc_lock);
+}
+
+int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *data, acpi_size sz)
 {
 	struct device *dev = acpi_desc->dev;
 	struct nfit_table_prev prev;
 	const void *end;
-	u8 *data;
 	int rc;
 
+	if (!acpi_desc->nvdimm_bus) {
+		acpi_nfit_init_dsms(acpi_desc);
+
+		acpi_desc->nvdimm_bus = nvdimm_bus_register(dev,
+				&acpi_desc->nd_desc);
+		if (!acpi_desc->nvdimm_bus)
+			return -ENOMEM;
+
+		rc = devm_add_action_or_reset(dev, acpi_nfit_destruct,
+				acpi_desc);
+		if (rc)
+			return rc;
+
+		rc = acpi_nfit_desc_init_scrub_attr(acpi_desc);
+		if (rc)
+			return rc;
+
+		/* register this acpi_desc for mce notifications */
+		mutex_lock(&acpi_desc_lock);
+		list_add_tail(&acpi_desc->list, &acpi_descs);
+		mutex_unlock(&acpi_desc_lock);
+	}
+
 	mutex_lock(&acpi_desc->init_mutex);
 
 	INIT_LIST_HEAD(&prev.spas);
@@ -2406,7 +2477,6 @@
 	list_cut_position(&prev.flushes, &acpi_desc->flushes,
 				acpi_desc->flushes.prev);
 
-	data = (u8 *) acpi_desc->nfit;
 	end = data + sz;
 	while (!IS_ERR_OR_NULL(data))
 		data = add_table(acpi_desc, &prev, data, end);
@@ -2422,12 +2492,9 @@
 	if (rc)
 		goto out_unlock;
 
-	if (nfit_mem_init(acpi_desc) != 0) {
-		rc = -ENOMEM;
+	rc = nfit_mem_init(acpi_desc);
+	if (rc)
 		goto out_unlock;
-	}
-
-	acpi_nfit_init_dsms(acpi_desc);
 
 	rc = acpi_nfit_register_dimms(acpi_desc);
 	if (rc)
@@ -2496,6 +2563,33 @@
 	return 0;
 }
 
+int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc)
+{
+	struct device *dev = acpi_desc->dev;
+	struct nfit_spa *nfit_spa;
+
+	if (work_busy(&acpi_desc->work))
+		return -EBUSY;
+
+	if (acpi_desc->cancel)
+		return 0;
+
+	mutex_lock(&acpi_desc->init_mutex);
+	list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
+		struct acpi_nfit_system_address *spa = nfit_spa->spa;
+
+		if (nfit_spa_type(spa) != NFIT_SPA_PM)
+			continue;
+
+		nfit_spa->ars_required = 1;
+	}
+	queue_work(nfit_wq, &acpi_desc->work);
+	dev_dbg(dev, "%s: ars_scan triggered\n", __func__);
+	mutex_unlock(&acpi_desc->init_mutex);
+
+	return 0;
+}
+
 void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev)
 {
 	struct nvdimm_bus_descriptor *nd_desc;
@@ -2505,12 +2599,12 @@
 	acpi_desc->blk_do_io = acpi_nfit_blk_region_do_io;
 	nd_desc = &acpi_desc->nd_desc;
 	nd_desc->provider_name = "ACPI.NFIT";
+	nd_desc->module = THIS_MODULE;
 	nd_desc->ndctl = acpi_nfit_ctl;
 	nd_desc->flush_probe = acpi_nfit_flush_probe;
 	nd_desc->clear_to_send = acpi_nfit_clear_to_send;
 	nd_desc->attr_groups = acpi_nfit_attribute_groups;
 
-	INIT_LIST_HEAD(&acpi_desc->spa_maps);
 	INIT_LIST_HEAD(&acpi_desc->spas);
 	INIT_LIST_HEAD(&acpi_desc->dcrs);
 	INIT_LIST_HEAD(&acpi_desc->bdws);
@@ -2518,7 +2612,7 @@
 	INIT_LIST_HEAD(&acpi_desc->flushes);
 	INIT_LIST_HEAD(&acpi_desc->memdevs);
 	INIT_LIST_HEAD(&acpi_desc->dimms);
-	mutex_init(&acpi_desc->spa_map_mutex);
+	INIT_LIST_HEAD(&acpi_desc->list);
 	mutex_init(&acpi_desc->init_mutex);
 	INIT_WORK(&acpi_desc->work, acpi_nfit_scrub);
 }
@@ -2532,7 +2626,7 @@
 	struct acpi_table_header *tbl;
 	acpi_status status = AE_OK;
 	acpi_size sz;
-	int rc;
+	int rc = 0;
 
 	status = acpi_get_table_with_size(ACPI_SIG_NFIT, 0, &tbl, &sz);
 	if (ACPI_FAILURE(status)) {
@@ -2545,50 +2639,33 @@
 	if (!acpi_desc)
 		return -ENOMEM;
 	acpi_nfit_desc_init(acpi_desc, &adev->dev);
-	acpi_desc->nvdimm_bus = nvdimm_bus_register(dev, &acpi_desc->nd_desc);
-	if (!acpi_desc->nvdimm_bus)
-		return -ENOMEM;
 
-	/*
-	 * Save the acpi header for later and then skip it,
-	 * making nfit point to the first nfit table header.
-	 */
+	/* Save the acpi header for exporting the revision via sysfs */
 	acpi_desc->acpi_header = *tbl;
-	acpi_desc->nfit = (void *) tbl + sizeof(struct acpi_table_nfit);
-	sz -= sizeof(struct acpi_table_nfit);
 
 	/* Evaluate _FIT and override with that if present */
 	status = acpi_evaluate_object(adev->handle, "_FIT", NULL, &buf);
 	if (ACPI_SUCCESS(status) && buf.length > 0) {
-		union acpi_object *obj;
-		/*
-		 * Adjust for the acpi_object header of the _FIT
-		 */
-		obj = buf.pointer;
-		if (obj->type == ACPI_TYPE_BUFFER) {
-			acpi_desc->nfit =
-				(struct acpi_nfit_header *)obj->buffer.pointer;
-			sz = obj->buffer.length;
-		} else
+		union acpi_object *obj = buf.pointer;
+
+		if (obj->type == ACPI_TYPE_BUFFER)
+			rc = acpi_nfit_init(acpi_desc, obj->buffer.pointer,
+					obj->buffer.length);
+		else
 			dev_dbg(dev, "%s invalid type %d, ignoring _FIT\n",
 				 __func__, (int) obj->type);
-	}
-
-	rc = acpi_nfit_init(acpi_desc, sz);
-	if (rc) {
-		nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
-		return rc;
-	}
-	return 0;
+		kfree(buf.pointer);
+	} else
+		/* skip over the lead-in header table */
+		rc = acpi_nfit_init(acpi_desc, (void *) tbl
+				+ sizeof(struct acpi_table_nfit),
+				sz - sizeof(struct acpi_table_nfit));
+	return rc;
 }
 
 static int acpi_nfit_remove(struct acpi_device *adev)
 {
-	struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(&adev->dev);
-
-	acpi_desc->cancel = 1;
-	flush_workqueue(nfit_wq);
-	nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
+	/* see acpi_nfit_destruct */
 	return 0;
 }
 
@@ -2596,9 +2673,8 @@
 {
 	struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(&adev->dev);
 	struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
-	struct acpi_nfit_header *nfit_saved;
-	union acpi_object *obj;
 	struct device *dev = &adev->dev;
+	union acpi_object *obj;
 	acpi_status status;
 	int ret;
 
@@ -2616,9 +2692,6 @@
 		if (!acpi_desc)
 			goto out_unlock;
 		acpi_nfit_desc_init(acpi_desc, &adev->dev);
-		acpi_desc->nvdimm_bus = nvdimm_bus_register(dev, &acpi_desc->nd_desc);
-		if (!acpi_desc->nvdimm_bus)
-			goto out_unlock;
 	} else {
 		/*
 		 * Finish previous registration before considering new
@@ -2634,21 +2707,14 @@
 		goto out_unlock;
 	}
 
-	nfit_saved = acpi_desc->nfit;
 	obj = buf.pointer;
 	if (obj->type == ACPI_TYPE_BUFFER) {
-		acpi_desc->nfit =
-			(struct acpi_nfit_header *)obj->buffer.pointer;
-		ret = acpi_nfit_init(acpi_desc, obj->buffer.length);
-		if (ret) {
-			/* Merge failed, restore old nfit, and exit */
-			acpi_desc->nfit = nfit_saved;
+		ret = acpi_nfit_init(acpi_desc, obj->buffer.pointer,
+				obj->buffer.length);
+		if (ret)
 			dev_err(dev, "failed to merge updated NFIT\n");
-		}
-	} else {
-		/* Bad _FIT, restore old nfit */
+	} else
 		dev_err(dev, "Invalid _FIT\n");
-	}
 	kfree(buf.pointer);
 
  out_unlock:
@@ -2693,18 +2759,23 @@
 	acpi_str_to_uuid(UUID_NFIT_DIMM, nfit_uuid[NFIT_DEV_DIMM]);
 	acpi_str_to_uuid(UUID_NFIT_DIMM_N_HPE1, nfit_uuid[NFIT_DEV_DIMM_N_HPE1]);
 	acpi_str_to_uuid(UUID_NFIT_DIMM_N_HPE2, nfit_uuid[NFIT_DEV_DIMM_N_HPE2]);
+	acpi_str_to_uuid(UUID_NFIT_DIMM_N_MSFT, nfit_uuid[NFIT_DEV_DIMM_N_MSFT]);
 
 	nfit_wq = create_singlethread_workqueue("nfit");
 	if (!nfit_wq)
 		return -ENOMEM;
 
+	nfit_mce_register();
+
 	return acpi_bus_register_driver(&acpi_nfit_driver);
 }
 
 static __exit void nfit_exit(void)
 {
+	nfit_mce_unregister();
 	acpi_bus_unregister_driver(&acpi_nfit_driver);
 	destroy_workqueue(nfit_wq);
+	WARN_ON(!list_empty(&acpi_descs));
 }
 
 module_init(nfit_init);
diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c
new file mode 100644
index 0000000..4c745bf
--- /dev/null
+++ b/drivers/acpi/nfit/mce.c
@@ -0,0 +1,89 @@
+/*
+ * NFIT - Machine Check Handler
+ *
+ * Copyright(c) 2013-2016 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/notifier.h>
+#include <linux/acpi.h>
+#include <asm/mce.h>
+#include "nfit.h"
+
+static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
+			void *data)
+{
+	struct mce *mce = (struct mce *)data;
+	struct acpi_nfit_desc *acpi_desc;
+	struct nfit_spa *nfit_spa;
+
+	/* We only care about memory errors */
+	if (!(mce->status & MCACOD))
+		return NOTIFY_DONE;
+
+	/*
+	 * mce->addr contains the physical addr accessed that caused the
+	 * machine check. Walk the list of NFITs and start a scrub only if
+	 * one of their persistent memory (PM) ranges covers that address.
+	 */
+	mutex_lock(&acpi_desc_lock);
+	list_for_each_entry(acpi_desc, &acpi_descs, list) {
+		struct device *dev = acpi_desc->dev;
+		int found_match = 0;
+
+		mutex_lock(&acpi_desc->init_mutex);
+		list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
+			struct acpi_nfit_system_address *spa = nfit_spa->spa;
+
+			/* find the PM spa that covers the mce addr */
+			if (nfit_spa_type(spa) != NFIT_SPA_PM)
+				continue;
+			if (spa->address > mce->addr)
+				continue;
+			if ((spa->address + spa->length - 1) < mce->addr)
+				continue;
+			found_match = 1;
+			dev_dbg(dev, "%s: addr in SPA %d (0x%llx, 0x%llx)\n",
+				__func__, spa->range_index, spa->address,
+				spa->length);
+			/*
+			 * We can break at the first match because we're going
+			 * to rescan all the SPA ranges. There shouldn't be any
+			 * aliasing anyway.
+			 */
+			break;
+		}
+		mutex_unlock(&acpi_desc->init_mutex);
+
+		/*
+		 * We can ignore an -EBUSY here because if an ARS is already
+		 * in progress, just let that be the last authoritative one
+		 */
+		if (found_match)
+			acpi_nfit_ars_rescan(acpi_desc);
+	}
+
+	mutex_unlock(&acpi_desc_lock);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block nfit_mce_dec = {
+	.notifier_call	= nfit_handle_mce,
+};
+
+void nfit_mce_register(void)
+{
+	mce_register_decode_chain(&nfit_mce_dec);
+}
+
+void nfit_mce_unregister(void)
+{
+	mce_unregister_decode_chain(&nfit_mce_dec);
+}
diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit/nfit.h
similarity index 82%
rename from drivers/acpi/nfit.h
rename to drivers/acpi/nfit/nfit.h
index 02b9ea1..e894ded 100644
--- a/drivers/acpi/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -16,6 +16,7 @@
 #define __NFIT_H__
 #include <linux/workqueue.h>
 #include <linux/libnvdimm.h>
+#include <linux/ndctl.h>
 #include <linux/types.h>
 #include <linux/uuid.h>
 #include <linux/acpi.h>
@@ -31,6 +32,9 @@
 #define UUID_NFIT_DIMM_N_HPE1 "9002c334-acf3-4c0e-9642-a235f0d53bc6"
 #define UUID_NFIT_DIMM_N_HPE2 "5008664b-b758-41a0-a03c-27c2f2d04f7e"
 
+/* https://msdn.microsoft.com/library/windows/hardware/mt604741 */
+#define UUID_NFIT_DIMM_N_MSFT "1ee68b36-d4bd-4a1a-9a16-4f8e53d46e05"
+
 #define ACPI_NFIT_MEM_FAILED_MASK (ACPI_NFIT_MEM_SAVE_FAILED \
 		| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
 		| ACPI_NFIT_MEM_NOT_ARMED)
@@ -40,6 +44,7 @@
 	NFIT_DEV_DIMM = NVDIMM_FAMILY_INTEL,
 	NFIT_DEV_DIMM_N_HPE1 = NVDIMM_FAMILY_HPE1,
 	NFIT_DEV_DIMM_N_HPE2 = NVDIMM_FAMILY_HPE2,
+	NFIT_DEV_DIMM_N_MSFT = NVDIMM_FAMILY_MSFT,
 	NFIT_SPA_VOLATILE,
 	NFIT_SPA_PM,
 	NFIT_SPA_DCR,
@@ -74,37 +79,37 @@
 };
 
 struct nfit_spa {
-	struct acpi_nfit_system_address *spa;
 	struct list_head list;
 	struct nd_region *nd_region;
-	unsigned int ars_done:1;
+	unsigned int ars_required:1;
 	u32 clear_err_unit;
 	u32 max_ars;
+	struct acpi_nfit_system_address spa[0];
 };
 
 struct nfit_dcr {
-	struct acpi_nfit_control_region *dcr;
 	struct list_head list;
+	struct acpi_nfit_control_region dcr[0];
 };
 
 struct nfit_bdw {
-	struct acpi_nfit_data_region *bdw;
 	struct list_head list;
+	struct acpi_nfit_data_region bdw[0];
 };
 
 struct nfit_idt {
-	struct acpi_nfit_interleave *idt;
 	struct list_head list;
+	struct acpi_nfit_interleave idt[0];
 };
 
 struct nfit_flush {
-	struct acpi_nfit_flush_address *flush;
 	struct list_head list;
+	struct acpi_nfit_flush_address flush[0];
 };
 
 struct nfit_memdev {
-	struct acpi_nfit_memory_map *memdev;
 	struct list_head list;
+	struct acpi_nfit_memory_map memdev[0];
 };
 
 /* assembled tables for a given dimm/memory-device */
@@ -123,6 +128,7 @@
 	struct list_head list;
 	struct acpi_device *adev;
 	struct acpi_nfit_desc *acpi_desc;
+	struct resource *flush_wpq;
 	unsigned long dsm_mask;
 	int family;
 };
@@ -130,10 +136,7 @@
 struct acpi_nfit_desc {
 	struct nvdimm_bus_descriptor nd_desc;
 	struct acpi_table_header acpi_header;
-	struct acpi_nfit_header *nfit;
-	struct mutex spa_map_mutex;
 	struct mutex init_mutex;
-	struct list_head spa_maps;
 	struct list_head memdevs;
 	struct list_head flushes;
 	struct list_head dimms;
@@ -146,6 +149,9 @@
 	struct nd_cmd_ars_status *ars_status;
 	size_t ars_status_size;
 	struct work_struct work;
+	struct list_head list;
+	struct kernfs_node *scrub_count_state;
+	unsigned int scrub_count;
 	unsigned int cancel:1;
 	unsigned long dimm_cmd_force_en;
 	unsigned long bus_cmd_force_en;
@@ -161,7 +167,7 @@
 struct nd_blk_addr {
 	union {
 		void __iomem *base;
-		void __pmem  *aperture;
+		void *aperture;
 	};
 };
 
@@ -180,28 +186,26 @@
 	u64 bdw_offset; /* post interleave offset */
 	u64 stat_offset;
 	u64 cmd_offset;
-	void __iomem *nvdimm_flush;
 	u32 dimm_flags;
 };
 
-enum spa_map_type {
-	SPA_MAP_CONTROL,
-	SPA_MAP_APERTURE,
-};
+extern struct list_head acpi_descs;
+extern struct mutex acpi_desc_lock;
+int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc);
 
-struct nfit_spa_mapping {
-	struct acpi_nfit_desc *acpi_desc;
-	struct acpi_nfit_system_address *spa;
-	struct list_head list;
-	struct kref kref;
-	enum spa_map_type type;
-	struct nd_blk_addr addr;
-};
-
-static inline struct nfit_spa_mapping *to_spa_map(struct kref *kref)
+#ifdef CONFIG_X86_MCE
+void nfit_mce_register(void);
+void nfit_mce_unregister(void);
+#else
+static inline void nfit_mce_register(void)
 {
-	return container_of(kref, struct nfit_spa_mapping, kref);
 }
+static inline void nfit_mce_unregister(void)
+{
+}
+#endif
+
+int nfit_spa_type(struct acpi_nfit_system_address *spa);
 
 static inline struct acpi_nfit_memory_map *__to_nfit_memdev(
 		struct nfit_mem *nfit_mem)
@@ -218,6 +222,6 @@
 }
 
 const u8 *to_nfit_uuid(enum nfit_uuids id);
-int acpi_nfit_init(struct acpi_nfit_desc *nfit, acpi_size sz);
+int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *nfit, acpi_size sz);
 void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev);
 #endif /* __NFIT_H__ */
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index ba5145d..3022dad 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -379,7 +379,7 @@
 
 #ifdef CONFIG_BLK_DEV_RAM_DAX
 static long brd_direct_access(struct block_device *bdev, sector_t sector,
-			void __pmem **kaddr, pfn_t *pfn, long size)
+			void **kaddr, pfn_t *pfn, long size)
 {
 	struct brd_device *brd = bdev->bd_disk->private_data;
 	struct page *page;
@@ -389,7 +389,7 @@
 	page = brd_insert_page(brd, sector);
 	if (!page)
 		return -ENOSPC;
-	*kaddr = (void __pmem *)page_address(page);
+	*kaddr = page_address(page);
 	*pfn = page_to_pfn_t(page);
 
 	return PAGE_SIZE;
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index b891a12..803f395 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -211,11 +211,9 @@
 	}
 	dax_dev->dev = dev;
 
-	rc = devm_add_action(dax_region->dev, unregister_dax_dev, dev);
-	if (rc) {
-		unregister_dax_dev(dev);
+	rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev);
+	if (rc)
 		return rc;
-	}
 
 	return 0;
 
diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
index 55d510e..dfb1685 100644
--- a/drivers/dax/pmem.c
+++ b/drivers/dax/pmem.c
@@ -102,21 +102,19 @@
 	if (rc)
 		return rc;
 
-	rc = devm_add_action(dev, dax_pmem_percpu_exit, &dax_pmem->ref);
-	if (rc) {
-		dax_pmem_percpu_exit(&dax_pmem->ref);
+	rc = devm_add_action_or_reset(dev, dax_pmem_percpu_exit,
+							&dax_pmem->ref);
+	if (rc)
 		return rc;
-	}
 
 	addr = devm_memremap_pages(dev, &res, &dax_pmem->ref, altmap);
 	if (IS_ERR(addr))
 		return PTR_ERR(addr);
 
-	rc = devm_add_action(dev, dax_pmem_percpu_kill, &dax_pmem->ref);
-	if (rc) {
-		dax_pmem_percpu_kill(&dax_pmem->ref);
+	rc = devm_add_action_or_reset(dev, dax_pmem_percpu_kill,
+							&dax_pmem->ref);
+	if (rc)
 		return rc;
-	}
 
 	nd_region = to_nd_region(dev->parent);
 	dax_region = alloc_dax_region(dev, nd_region->id, &res,
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 6d35dd4..4788b0b 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -142,7 +142,7 @@
 }
 
 static long linear_direct_access(struct dm_target *ti, sector_t sector,
-				 void __pmem **kaddr, pfn_t *pfn, long size)
+				 void **kaddr, pfn_t *pfn, long size)
 {
 	struct linear_c *lc = ti->private;
 	struct block_device *bdev = lc->dev->bdev;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 731e1f5..ce2a910 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -2303,7 +2303,7 @@
 }
 
 static long origin_direct_access(struct dm_target *ti, sector_t sector,
-		void __pmem **kaddr, pfn_t *pfn, long size)
+		void **kaddr, pfn_t *pfn, long size)
 {
 	DMWARN("device does not support dax.");
 	return -EIO;
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 01bb9cf..83f1d46 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -309,7 +309,7 @@
 }
 
 static long stripe_direct_access(struct dm_target *ti, sector_t sector,
-				 void __pmem **kaddr, pfn_t *pfn, long size)
+				 void **kaddr, pfn_t *pfn, long size)
 {
 	struct stripe_c *sc = ti->private;
 	uint32_t stripe;
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 6eecd6b..710ae28 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -149,7 +149,7 @@
 }
 
 static long io_err_direct_access(struct dm_target *ti, sector_t sector,
-				 void __pmem **kaddr, pfn_t *pfn, long size)
+				 void **kaddr, pfn_t *pfn, long size)
 {
 	return -EIO;
 }
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ceb69fc..25d1d97 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -906,7 +906,7 @@
 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
 
 static long dm_blk_direct_access(struct block_device *bdev, sector_t sector,
-				 void __pmem **kaddr, pfn_t *pfn, long size)
+				 void **kaddr, pfn_t *pfn, long size)
 {
 	struct mapped_device *md = bdev->bd_disk->private_data;
 	struct dm_table *map;
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
index 7c8a3bf..124c243 100644
--- a/drivers/nvdimm/Kconfig
+++ b/drivers/nvdimm/Kconfig
@@ -1,6 +1,7 @@
 menuconfig LIBNVDIMM
 	tristate "NVDIMM (Non-Volatile Memory Device) Support"
 	depends on PHYS_ADDR_T_64BIT
+	depends on HAS_IOMEM
 	depends on BLK_DEV
 	help
 	  Generic support for non-volatile memory devices including
@@ -19,7 +20,6 @@
 config BLK_DEV_PMEM
 	tristate "PMEM: Persistent memory block device support"
 	default LIBNVDIMM
-	depends on HAS_IOMEM
 	select ND_BTT if BTT
 	select ND_PFN if NVDIMM_PFN
 	help
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 7e262ef..9faaa96 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -267,10 +267,8 @@
 	q = blk_alloc_queue(GFP_KERNEL);
 	if (!q)
 		return -ENOMEM;
-	if (devm_add_action(dev, nd_blk_release_queue, q)) {
-		blk_cleanup_queue(q);
+	if (devm_add_action_or_reset(dev, nd_blk_release_queue, q))
 		return -ENOMEM;
-	}
 
 	blk_queue_make_request(q, nd_blk_make_request);
 	blk_queue_max_hw_sectors(q, UINT_MAX);
@@ -282,10 +280,6 @@
 	disk = alloc_disk(0);
 	if (!disk)
 		return -ENOMEM;
-	if (devm_add_action(dev, nd_blk_release_disk, disk)) {
-		put_disk(disk);
-		return -ENOMEM;
-	}
 
 	disk->first_minor	= 0;
 	disk->fops		= &nd_blk_fops;
@@ -295,6 +289,9 @@
 	set_capacity(disk, 0);
 	device_add_disk(dev, disk);
 
+	if (devm_add_action_or_reset(dev, nd_blk_release_disk, disk))
+		return -ENOMEM;
+
 	if (nsblk_meta_size(nsblk)) {
 		int rc = nd_integrity_init(disk, nsblk_meta_size(nsblk));
 
diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
index 816d0da..3fa7919 100644
--- a/drivers/nvdimm/btt_devs.c
+++ b/drivers/nvdimm/btt_devs.c
@@ -198,8 +198,7 @@
 {
 	struct device *dev = __nd_btt_create(nd_region, 0, NULL, NULL);
 
-	if (dev)
-		__nd_device_register(dev);
+	__nd_device_register(dev);
 	return dev;
 }
 
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 5e4e5c7..458daf9 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -31,6 +31,7 @@
 int nvdimm_major;
 static int nvdimm_bus_major;
 static struct class *nd_class;
+static DEFINE_IDA(nd_ida);
 
 static int to_nd_device_type(struct device *dev)
 {
@@ -60,20 +61,13 @@
 			to_nd_device_type(dev));
 }
 
-static int nvdimm_bus_match(struct device *dev, struct device_driver *drv)
-{
-	struct nd_device_driver *nd_drv = to_nd_device_driver(drv);
-
-	return !!test_bit(to_nd_device_type(dev), &nd_drv->type);
-}
-
 static struct module *to_bus_provider(struct device *dev)
 {
 	/* pin bus providers while regions are enabled */
 	if (is_nd_pmem(dev) || is_nd_blk(dev)) {
 		struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
 
-		return nvdimm_bus->module;
+		return nvdimm_bus->nd_desc->module;
 	}
 	return NULL;
 }
@@ -136,6 +130,21 @@
 	return rc;
 }
 
+static void nvdimm_bus_shutdown(struct device *dev)
+{
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+	struct nd_device_driver *nd_drv = NULL;
+
+	if (dev->driver)
+		nd_drv = to_nd_device_driver(dev->driver);
+
+	if (nd_drv && nd_drv->shutdown) {
+		nd_drv->shutdown(dev);
+		dev_dbg(&nvdimm_bus->dev, "%s.shutdown(%s)\n",
+				dev->driver->name, dev_name(dev));
+	}
+}
+
 void nd_device_notify(struct device *dev, enum nvdimm_event event)
 {
 	device_lock(dev);
@@ -208,14 +217,187 @@
 }
 EXPORT_SYMBOL_GPL(nvdimm_clear_poison);
 
+static int nvdimm_bus_match(struct device *dev, struct device_driver *drv);
+
 static struct bus_type nvdimm_bus_type = {
 	.name = "nd",
 	.uevent = nvdimm_bus_uevent,
 	.match = nvdimm_bus_match,
 	.probe = nvdimm_bus_probe,
 	.remove = nvdimm_bus_remove,
+	.shutdown = nvdimm_bus_shutdown,
 };
 
+static void nvdimm_bus_release(struct device *dev)
+{
+	struct nvdimm_bus *nvdimm_bus;
+
+	nvdimm_bus = container_of(dev, struct nvdimm_bus, dev);
+	ida_simple_remove(&nd_ida, nvdimm_bus->id);
+	kfree(nvdimm_bus);
+}
+
+static bool is_nvdimm_bus(struct device *dev)
+{
+	return dev->release == nvdimm_bus_release;
+}
+
+struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev)
+{
+	struct device *dev;
+
+	for (dev = nd_dev; dev; dev = dev->parent)
+		if (is_nvdimm_bus(dev))
+			break;
+	dev_WARN_ONCE(nd_dev, !dev, "invalid dev, not on nd bus\n");
+	if (dev)
+		return to_nvdimm_bus(dev);
+	return NULL;
+}
+
+struct nvdimm_bus *to_nvdimm_bus(struct device *dev)
+{
+	struct nvdimm_bus *nvdimm_bus;
+
+	nvdimm_bus = container_of(dev, struct nvdimm_bus, dev);
+	WARN_ON(!is_nvdimm_bus(dev));
+	return nvdimm_bus;
+}
+EXPORT_SYMBOL_GPL(to_nvdimm_bus);
+
+struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
+		struct nvdimm_bus_descriptor *nd_desc)
+{
+	struct nvdimm_bus *nvdimm_bus;
+	int rc;
+
+	nvdimm_bus = kzalloc(sizeof(*nvdimm_bus), GFP_KERNEL);
+	if (!nvdimm_bus)
+		return NULL;
+	INIT_LIST_HEAD(&nvdimm_bus->list);
+	INIT_LIST_HEAD(&nvdimm_bus->mapping_list);
+	INIT_LIST_HEAD(&nvdimm_bus->poison_list);
+	init_waitqueue_head(&nvdimm_bus->probe_wait);
+	nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
+	mutex_init(&nvdimm_bus->reconfig_mutex);
+	if (nvdimm_bus->id < 0) {
+		kfree(nvdimm_bus);
+		return NULL;
+	}
+	nvdimm_bus->nd_desc = nd_desc;
+	nvdimm_bus->dev.parent = parent;
+	nvdimm_bus->dev.release = nvdimm_bus_release;
+	nvdimm_bus->dev.groups = nd_desc->attr_groups;
+	nvdimm_bus->dev.bus = &nvdimm_bus_type;
+	dev_set_name(&nvdimm_bus->dev, "ndbus%d", nvdimm_bus->id);
+	rc = device_register(&nvdimm_bus->dev);
+	if (rc) {
+		dev_dbg(&nvdimm_bus->dev, "registration failed: %d\n", rc);
+		goto err;
+	}
+
+	return nvdimm_bus;
+ err:
+	put_device(&nvdimm_bus->dev);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(nvdimm_bus_register);
+
+void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
+{
+	if (!nvdimm_bus)
+		return;
+	device_unregister(&nvdimm_bus->dev);
+}
+EXPORT_SYMBOL_GPL(nvdimm_bus_unregister);
+
+static int child_unregister(struct device *dev, void *data)
+{
+	/*
+	 * the singular ndctl class device per bus needs to be
+	 * "device_destroy"ed, so skip it here
+	 *
+	 * i.e. remove classless children
+	 */
+	if (dev->class)
+		/* pass */;
+	else
+		nd_device_unregister(dev, ND_SYNC);
+	return 0;
+}
+
+static void free_poison_list(struct list_head *poison_list)
+{
+	struct nd_poison *pl, *next;
+
+	list_for_each_entry_safe(pl, next, poison_list, list) {
+		list_del(&pl->list);
+		kfree(pl);
+	}
+	list_del_init(poison_list);
+}
+
+static int nd_bus_remove(struct device *dev)
+{
+	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+
+	mutex_lock(&nvdimm_bus_list_mutex);
+	list_del_init(&nvdimm_bus->list);
+	mutex_unlock(&nvdimm_bus_list_mutex);
+
+	nd_synchronize();
+	device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
+
+	nvdimm_bus_lock(&nvdimm_bus->dev);
+	free_poison_list(&nvdimm_bus->poison_list);
+	nvdimm_bus_unlock(&nvdimm_bus->dev);
+
+	nvdimm_bus_destroy_ndctl(nvdimm_bus);
+
+	return 0;
+}
+
+static int nd_bus_probe(struct device *dev)
+{
+	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+	int rc;
+
+	rc = nvdimm_bus_create_ndctl(nvdimm_bus);
+	if (rc)
+		return rc;
+
+	mutex_lock(&nvdimm_bus_list_mutex);
+	list_add_tail(&nvdimm_bus->list, &nvdimm_bus_list);
+	mutex_unlock(&nvdimm_bus_list_mutex);
+
+	/* enable bus provider attributes to look up their local context */
+	dev_set_drvdata(dev, nvdimm_bus->nd_desc);
+
+	return 0;
+}
+
+static struct nd_device_driver nd_bus_driver = {
+	.probe = nd_bus_probe,
+	.remove = nd_bus_remove,
+	.drv = {
+		.name = "nd_bus",
+		.suppress_bind_attrs = true,
+		.bus = &nvdimm_bus_type,
+		.owner = THIS_MODULE,
+		.mod_name = KBUILD_MODNAME,
+	},
+};
+
+static int nvdimm_bus_match(struct device *dev, struct device_driver *drv)
+{
+	struct nd_device_driver *nd_drv = to_nd_device_driver(drv);
+
+	if (is_nvdimm_bus(dev) && nd_drv == &nd_bus_driver)
+		return true;
+
+	return !!test_bit(to_nd_device_type(dev), &nd_drv->type);
+}
+
 static ASYNC_DOMAIN_EXCLUSIVE(nd_async_domain);
 
 void nd_synchronize(void)
@@ -395,12 +577,10 @@
 	dev = device_create(nd_class, &nvdimm_bus->dev, devt, nvdimm_bus,
 			"ndctl%d", nvdimm_bus->id);
 
-	if (IS_ERR(dev)) {
+	if (IS_ERR(dev))
 		dev_dbg(&nvdimm_bus->dev, "failed to register ndctl%d: %ld\n",
 				nvdimm_bus->id, PTR_ERR(dev));
-		return PTR_ERR(dev);
-	}
-	return 0;
+	return PTR_ERR_OR_ZERO(dev);
 }
 
 void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus)
@@ -850,8 +1030,14 @@
 		goto err_class;
 	}
 
+	rc = driver_register(&nd_bus_driver.drv);
+	if (rc)
+		goto err_nd_bus;
+
 	return 0;
 
+ err_nd_bus:
+	class_destroy(nd_class);
  err_class:
 	unregister_chrdev(nvdimm_major, "dimmctl");
  err_dimm_chrdev:
@@ -864,8 +1050,10 @@
 
 void nvdimm_bus_exit(void)
 {
+	driver_unregister(&nd_bus_driver.drv);
 	class_destroy(nd_class);
 	unregister_chrdev(nvdimm_bus_major, "ndctl");
 	unregister_chrdev(nvdimm_major, "dimmctl");
 	bus_unregister(&nvdimm_bus_type);
+	ida_destroy(&nd_ida);
 }
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 8b2e3c4..d5dc80c 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -240,7 +240,7 @@
 		return memcpy_from_pmem(buf, nsio->addr + offset, size);
 	} else {
 		memcpy_to_pmem(nsio->addr + offset, buf, size);
-		wmb_pmem();
+		nvdimm_flush(to_nd_region(ndns->dev.parent));
 	}
 
 	return 0;
@@ -266,9 +266,8 @@
 
 	nsio->addr = devm_memremap(dev, res->start, resource_size(res),
 			ARCH_MEMREMAP_PMEM);
-	if (IS_ERR(nsio->addr))
-		return PTR_ERR(nsio->addr);
-	return 0;
+
+	return PTR_ERR_OR_ZERO(nsio->addr);
 }
 EXPORT_SYMBOL_GPL(devm_nsio_enable);
 
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index be89764..715583f 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -20,12 +20,12 @@
 #include <linux/ndctl.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
+#include <linux/io.h>
 #include "nd-core.h"
 #include "nd.h"
 
 LIST_HEAD(nvdimm_bus_list);
 DEFINE_MUTEX(nvdimm_bus_list_mutex);
-static DEFINE_IDA(nd_ida);
 
 void nvdimm_bus_lock(struct device *dev)
 {
@@ -57,6 +57,127 @@
 }
 EXPORT_SYMBOL(is_nvdimm_bus_locked);
 
+struct nvdimm_map {
+	struct nvdimm_bus *nvdimm_bus;
+	struct list_head list;
+	resource_size_t offset;
+	unsigned long flags;
+	size_t size;
+	union {
+		void *mem;
+		void __iomem *iomem;
+	};
+	struct kref kref;
+};
+
+static struct nvdimm_map *find_nvdimm_map(struct device *dev,
+		resource_size_t offset)
+{
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+	struct nvdimm_map *nvdimm_map;
+
+	list_for_each_entry(nvdimm_map, &nvdimm_bus->mapping_list, list)
+		if (nvdimm_map->offset == offset)
+			return nvdimm_map;
+	return NULL;
+}
+
+static struct nvdimm_map *alloc_nvdimm_map(struct device *dev,
+		resource_size_t offset, size_t size, unsigned long flags)
+{
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+	struct nvdimm_map *nvdimm_map;
+
+	nvdimm_map = kzalloc(sizeof(*nvdimm_map), GFP_KERNEL);
+	if (!nvdimm_map)
+		return NULL;
+
+	INIT_LIST_HEAD(&nvdimm_map->list);
+	nvdimm_map->nvdimm_bus = nvdimm_bus;
+	nvdimm_map->offset = offset;
+	nvdimm_map->flags = flags;
+	nvdimm_map->size = size;
+	kref_init(&nvdimm_map->kref);
+
+	if (!request_mem_region(offset, size, dev_name(&nvdimm_bus->dev)))
+		goto err_request_region;
+
+	if (flags)
+		nvdimm_map->mem = memremap(offset, size, flags);
+	else
+		nvdimm_map->iomem = ioremap(offset, size);
+
+	if (!nvdimm_map->mem)
+		goto err_map;
+
+	dev_WARN_ONCE(dev, !is_nvdimm_bus_locked(dev), "%s: bus unlocked!",
+			__func__);
+	list_add(&nvdimm_map->list, &nvdimm_bus->mapping_list);
+
+	return nvdimm_map;
+
+ err_map:
+	release_mem_region(offset, size);
+ err_request_region:
+	kfree(nvdimm_map);
+	return NULL;
+}
+
+static void nvdimm_map_release(struct kref *kref)
+{
+	struct nvdimm_bus *nvdimm_bus;
+	struct nvdimm_map *nvdimm_map;
+
+	nvdimm_map = container_of(kref, struct nvdimm_map, kref);
+	nvdimm_bus = nvdimm_map->nvdimm_bus;
+
+	dev_dbg(&nvdimm_bus->dev, "%s: %pa\n", __func__, &nvdimm_map->offset);
+	list_del(&nvdimm_map->list);
+	if (nvdimm_map->flags)
+		memunmap(nvdimm_map->mem);
+	else
+		iounmap(nvdimm_map->iomem);
+	release_mem_region(nvdimm_map->offset, nvdimm_map->size);
+	kfree(nvdimm_map);
+}
+
+static void nvdimm_map_put(void *data)
+{
+	struct nvdimm_map *nvdimm_map = data;
+	struct nvdimm_bus *nvdimm_bus = nvdimm_map->nvdimm_bus;
+
+	nvdimm_bus_lock(&nvdimm_bus->dev);
+	kref_put(&nvdimm_map->kref, nvdimm_map_release);
+	nvdimm_bus_unlock(&nvdimm_bus->dev);
+}
+
+/**
+ * devm_nvdimm_memremap - map a resource that is shared across regions
+ * @dev: device that will own a reference to the shared mapping
+ * @offset: physical base address of the mapping
+ * @size: mapping size
+ * @flags: memremap flags, or, if zero, perform an ioremap instead
+ */
+void *devm_nvdimm_memremap(struct device *dev, resource_size_t offset,
+		size_t size, unsigned long flags)
+{
+	struct nvdimm_map *map;
+
+	nvdimm_bus_lock(dev);
+	map = find_nvdimm_map(dev, offset);
+	if (!map)
+		map = alloc_nvdimm_map(dev, offset, size, flags);
+	else
+		kref_get(&map->kref);
+	nvdimm_bus_unlock(dev);
+
+	/* alloc_nvdimm_map() may fail: never register or deref a NULL map */
+	if (!map || devm_add_action_or_reset(dev, nvdimm_map_put, map))
+		return NULL;
+	return map->mem;
+}
+EXPORT_SYMBOL_GPL(devm_nvdimm_memremap);
+
 u64 nd_fletcher64(void *addr, size_t len, bool le)
 {
 	u32 *buf = addr;
@@ -73,25 +194,6 @@
 }
 EXPORT_SYMBOL_GPL(nd_fletcher64);
 
-static void nvdimm_bus_release(struct device *dev)
-{
-	struct nvdimm_bus *nvdimm_bus;
-
-	nvdimm_bus = container_of(dev, struct nvdimm_bus, dev);
-	ida_simple_remove(&nd_ida, nvdimm_bus->id);
-	kfree(nvdimm_bus);
-}
-
-struct nvdimm_bus *to_nvdimm_bus(struct device *dev)
-{
-	struct nvdimm_bus *nvdimm_bus;
-
-	nvdimm_bus = container_of(dev, struct nvdimm_bus, dev);
-	WARN_ON(nvdimm_bus->dev.release != nvdimm_bus_release);
-	return nvdimm_bus;
-}
-EXPORT_SYMBOL_GPL(to_nvdimm_bus);
-
 struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus)
 {
 	/* struct nvdimm_bus definition is private to libnvdimm */
@@ -99,18 +201,12 @@
 }
 EXPORT_SYMBOL_GPL(to_nd_desc);
 
-struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev)
+struct device *to_nvdimm_bus_dev(struct nvdimm_bus *nvdimm_bus)
 {
-	struct device *dev;
-
-	for (dev = nd_dev; dev; dev = dev->parent)
-		if (dev->release == nvdimm_bus_release)
-			break;
-	dev_WARN_ONCE(nd_dev, !dev, "invalid dev, not on nd bus\n");
-	if (dev)
-		return to_nvdimm_bus(dev);
-	return NULL;
+	/* struct nvdimm_bus definition is private to libnvdimm */
+	return &nvdimm_bus->dev;
 }
+EXPORT_SYMBOL_GPL(to_nvdimm_bus_dev);
 
 static bool is_uuid_sep(char sep)
 {
@@ -325,51 +421,6 @@
 };
 EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group);
 
-struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
-		struct nvdimm_bus_descriptor *nd_desc, struct module *module)
-{
-	struct nvdimm_bus *nvdimm_bus;
-	int rc;
-
-	nvdimm_bus = kzalloc(sizeof(*nvdimm_bus), GFP_KERNEL);
-	if (!nvdimm_bus)
-		return NULL;
-	INIT_LIST_HEAD(&nvdimm_bus->list);
-	INIT_LIST_HEAD(&nvdimm_bus->poison_list);
-	init_waitqueue_head(&nvdimm_bus->probe_wait);
-	nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
-	mutex_init(&nvdimm_bus->reconfig_mutex);
-	if (nvdimm_bus->id < 0) {
-		kfree(nvdimm_bus);
-		return NULL;
-	}
-	nvdimm_bus->nd_desc = nd_desc;
-	nvdimm_bus->module = module;
-	nvdimm_bus->dev.parent = parent;
-	nvdimm_bus->dev.release = nvdimm_bus_release;
-	nvdimm_bus->dev.groups = nd_desc->attr_groups;
-	dev_set_name(&nvdimm_bus->dev, "ndbus%d", nvdimm_bus->id);
-	rc = device_register(&nvdimm_bus->dev);
-	if (rc) {
-		dev_dbg(&nvdimm_bus->dev, "registration failed: %d\n", rc);
-		goto err;
-	}
-
-	rc = nvdimm_bus_create_ndctl(nvdimm_bus);
-	if (rc)
-		goto err;
-
-	mutex_lock(&nvdimm_bus_list_mutex);
-	list_add_tail(&nvdimm_bus->list, &nvdimm_bus_list);
-	mutex_unlock(&nvdimm_bus_list_mutex);
-
-	return nvdimm_bus;
- err:
-	put_device(&nvdimm_bus->dev);
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(__nvdimm_bus_register);
-
 static void set_badblock(struct badblocks *bb, sector_t s, int num)
 {
 	dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n",
@@ -545,54 +596,6 @@
 }
 EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
 
-static void free_poison_list(struct list_head *poison_list)
-{
-	struct nd_poison *pl, *next;
-
-	list_for_each_entry_safe(pl, next, poison_list, list) {
-		list_del(&pl->list);
-		kfree(pl);
-	}
-	list_del_init(poison_list);
-}
-
-static int child_unregister(struct device *dev, void *data)
-{
-	/*
-	 * the singular ndctl class device per bus needs to be
-	 * "device_destroy"ed, so skip it here
-	 *
-	 * i.e. remove classless children
-	 */
-	if (dev->class)
-		/* pass */;
-	else
-		nd_device_unregister(dev, ND_SYNC);
-	return 0;
-}
-
-void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
-{
-	if (!nvdimm_bus)
-		return;
-
-	mutex_lock(&nvdimm_bus_list_mutex);
-	list_del_init(&nvdimm_bus->list);
-	mutex_unlock(&nvdimm_bus_list_mutex);
-
-	nd_synchronize();
-	device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
-
-	nvdimm_bus_lock(&nvdimm_bus->dev);
-	free_poison_list(&nvdimm_bus->poison_list);
-	nvdimm_bus_unlock(&nvdimm_bus->dev);
-
-	nvdimm_bus_destroy_ndctl(nvdimm_bus);
-
-	device_unregister(&nvdimm_bus->dev);
-}
-EXPORT_SYMBOL_GPL(nvdimm_bus_unregister);
-
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 int nd_integrity_init(struct gendisk *disk, unsigned long meta_size)
 {
@@ -601,7 +604,8 @@
 	if (meta_size == 0)
 		return 0;
 
-	bi.profile = NULL;
+	memset(&bi, 0, sizeof(bi));
+
 	bi.tuple_size = meta_size;
 	bi.tag_size = meta_size;
 
@@ -650,7 +654,6 @@
 	nvdimm_bus_exit();
 	nd_region_devs_exit();
 	nvdimm_devs_exit();
-	ida_destroy(&nd_ida);
 }
 
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index bbde28d..d9bba5e 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -346,7 +346,8 @@
 
 struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
 		const struct attribute_group **groups, unsigned long flags,
-		unsigned long cmd_mask)
+		unsigned long cmd_mask, int num_flush,
+		struct resource *flush_wpq)
 {
 	struct nvdimm *nvdimm = kzalloc(sizeof(*nvdimm), GFP_KERNEL);
 	struct device *dev;
@@ -362,6 +363,8 @@
 	nvdimm->provider_data = provider_data;
 	nvdimm->flags = flags;
 	nvdimm->cmd_mask = cmd_mask;
+	nvdimm->num_flush = num_flush;
+	nvdimm->flush_wpq = flush_wpq;
 	atomic_set(&nvdimm->busy, 0);
 	dev = &nvdimm->dev;
 	dev_set_name(dev, "nmem%d", nvdimm->id);
diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c
index 95825b3..11ea901 100644
--- a/drivers/nvdimm/e820.c
+++ b/drivers/nvdimm/e820.c
@@ -47,6 +47,7 @@
 
 	nd_desc.attr_groups = e820_pmem_attribute_groups;
 	nd_desc.provider_name = "e820";
+	nd_desc.module = THIS_MODULE;
 	nvdimm_bus = nvdimm_bus_register(dev, &nd_desc);
 	if (!nvdimm_bus)
 		goto err;
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 284cdaa..38ce6bb 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -26,11 +26,11 @@
 struct nvdimm_bus {
 	struct nvdimm_bus_descriptor *nd_desc;
 	wait_queue_head_t probe_wait;
-	struct module *module;
 	struct list_head list;
 	struct device dev;
 	int id, probe_active;
 	struct list_head poison_list;
+	struct list_head mapping_list;
 	struct mutex reconfig_mutex;
 };
 
@@ -40,7 +40,8 @@
 	unsigned long cmd_mask;
 	struct device dev;
 	atomic_t busy;
-	int id;
+	int id, num_flush;
+	struct resource *flush_wpq;
 };
 
 bool is_nvdimm(struct device *dev);
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index d0ac93c..4047639 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -49,9 +49,11 @@
 	struct kref kref;
 };
 
-struct nd_region_namespaces {
-	int count;
-	int active;
+struct nd_region_data {
+	int ns_count;
+	int ns_active;
+	unsigned int flush_mask;
+	void __iomem *flush_wpq[0][0];
 };
 
 static inline struct nd_namespace_index *to_namespace_index(
@@ -119,7 +121,6 @@
 
 struct nd_blk_region {
 	int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
-	void (*disable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
 	int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
 			void *iobuf, u64 len, int rw);
 	void *blk_provider_data;
@@ -325,6 +326,7 @@
 }
 #endif
 int nd_blk_region_init(struct nd_region *nd_region);
+int nd_region_activate(struct nd_region *nd_region);
 void __nd_iostat_start(struct bio *bio, unsigned long *start);
 static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
 {
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 36cb390..b511099 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -29,27 +29,28 @@
 #include <linux/slab.h>
 #include <linux/pmem.h>
 #include <linux/nd.h>
+#include "pmem.h"
 #include "pfn.h"
 #include "nd.h"
 
-struct pmem_device {
-	/* One contiguous memory region per device */
-	phys_addr_t		phys_addr;
-	/* when non-zero this device is hosting a 'pfn' instance */
-	phys_addr_t		data_offset;
-	u64			pfn_flags;
-	void __pmem		*virt_addr;
-	/* immutable base size of the namespace */
-	size_t			size;
-	/* trim size when namespace capacity has been section aligned */
-	u32			pfn_pad;
-	struct badblocks	bb;
-};
+static struct device *to_dev(struct pmem_device *pmem)
+{
+	/*
+	 * nvdimm bus services need a 'dev' parameter, and we record the device
+	 * at init in bb.dev.
+	 */
+	return pmem->bb.dev;
+}
+
+static struct nd_region *to_region(struct pmem_device *pmem)
+{
+	return to_nd_region(to_dev(pmem)->parent);
+}
 
 static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
 		unsigned int len)
 {
-	struct device *dev = pmem->bb.dev;
+	struct device *dev = to_dev(pmem);
 	sector_t sector;
 	long cleared;
 
@@ -57,7 +58,7 @@
 	cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
 
 	if (cleared > 0 && cleared / 512) {
-		dev_dbg(dev, "%s: %llx clear %ld sector%s\n",
+		dev_dbg(dev, "%s: %#llx clear %ld sector%s\n",
 				__func__, (unsigned long long) sector,
 				cleared / 512, cleared / 512 > 1 ? "s" : "");
 		badblocks_clear(&pmem->bb, sector, cleared / 512);
@@ -73,7 +74,7 @@
 	bool bad_pmem = false;
 	void *mem = kmap_atomic(page);
 	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
-	void __pmem *pmem_addr = pmem->virt_addr + pmem_off;
+	void *pmem_addr = pmem->virt_addr + pmem_off;
 
 	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
 		bad_pmem = true;
@@ -112,6 +113,11 @@
 	return rc;
 }
 
+/* account for REQ_FLUSH rename, replace with REQ_PREFLUSH after v4.8-rc1 */
+#ifndef REQ_FLUSH
+#define REQ_FLUSH REQ_PREFLUSH
+#endif
+
 static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 {
 	int rc = 0;
@@ -120,6 +126,10 @@
 	struct bio_vec bvec;
 	struct bvec_iter iter;
 	struct pmem_device *pmem = q->queuedata;
+	struct nd_region *nd_region = to_region(pmem);
+
+	if (bio->bi_rw & REQ_FLUSH)
+		nvdimm_flush(nd_region);
 
 	do_acct = nd_iostat_start(bio, &start);
 	bio_for_each_segment(bvec, bio, iter) {
@@ -134,8 +144,8 @@
 	if (do_acct)
 		nd_iostat_end(bio, start);
 
-	if (bio_data_dir(bio))
-		wmb_pmem();
+	if (bio->bi_rw & REQ_FUA)
+		nvdimm_flush(nd_region);
 
 	bio_endio(bio);
 	return BLK_QC_T_NONE;
@@ -148,8 +158,6 @@
 	int rc;
 
 	rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, rw, sector);
-	if (rw & WRITE)
-		wmb_pmem();
 
 	/*
 	 * The ->rw_page interface is subtle and tricky.  The core
@@ -163,8 +171,9 @@
 	return rc;
 }
 
-static long pmem_direct_access(struct block_device *bdev, sector_t sector,
-		      void __pmem **kaddr, pfn_t *pfn, long size)
+/* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
+__weak long pmem_direct_access(struct block_device *bdev, sector_t sector,
+		      void **kaddr, pfn_t *pfn, long size)
 {
 	struct pmem_device *pmem = bdev->bd_queue->queuedata;
 	resource_size_t offset = sector * 512 + pmem->data_offset;
@@ -195,7 +204,7 @@
 	blk_cleanup_queue(q);
 }
 
-void pmem_release_disk(void *disk)
+static void pmem_release_disk(void *disk)
 {
 	del_gendisk(disk);
 	put_disk(disk);
@@ -205,6 +214,7 @@
 		struct nd_namespace_common *ndns)
 {
 	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+	struct nd_region *nd_region = to_nd_region(dev->parent);
 	struct vmem_altmap __altmap, *altmap = NULL;
 	struct resource *res = &nsio->res;
 	struct nd_pfn *nd_pfn = NULL;
@@ -234,7 +244,7 @@
 	dev_set_drvdata(dev, pmem);
 	pmem->phys_addr = res->start;
 	pmem->size = resource_size(res);
-	if (!arch_has_wmb_pmem())
+	if (nvdimm_has_flush(nd_region) < 0)
 		dev_warn(dev, "unable to guarantee persistence of writes\n");
 
 	if (!devm_request_mem_region(dev, res->start, resource_size(res),
@@ -269,15 +279,14 @@
 	 * At release time the queue must be dead before
 	 * devm_memremap_pages is unwound
 	 */
-	if (devm_add_action(dev, pmem_release_queue, q)) {
-		blk_cleanup_queue(q);
+	if (devm_add_action_or_reset(dev, pmem_release_queue, q))
 		return -ENOMEM;
-	}
 
 	if (IS_ERR(addr))
 		return PTR_ERR(addr);
-	pmem->virt_addr = (void __pmem *) addr;
+	pmem->virt_addr = addr;
 
+	blk_queue_write_cache(q, true, true);
 	blk_queue_make_request(q, pmem_make_request);
 	blk_queue_physical_block_size(q, PAGE_SIZE);
 	blk_queue_max_hw_sectors(q, UINT_MAX);
@@ -289,10 +298,6 @@
 	disk = alloc_disk_node(0, nid);
 	if (!disk)
 		return -ENOMEM;
-	if (devm_add_action(dev, pmem_release_disk, disk)) {
-		put_disk(disk);
-		return -ENOMEM;
-	}
 
 	disk->fops		= &pmem_fops;
 	disk->queue		= q;
@@ -302,9 +307,13 @@
 			/ 512);
 	if (devm_init_badblocks(dev, &pmem->bb))
 		return -ENOMEM;
-	nvdimm_badblocks_populate(to_nd_region(dev->parent), &pmem->bb, res);
+	nvdimm_badblocks_populate(nd_region, &pmem->bb, res);
 	disk->bb = &pmem->bb;
 	device_add_disk(dev, disk);
+
+	if (devm_add_action_or_reset(dev, pmem_release_disk, disk))
+		return -ENOMEM;
+
 	revalidate_disk(disk);
 
 	return 0;
@@ -340,13 +349,20 @@
 {
 	if (is_nd_btt(dev))
 		nvdimm_namespace_detach_btt(to_nd_btt(dev));
+	nvdimm_flush(to_nd_region(dev->parent));
+
 	return 0;
 }
 
+static void nd_pmem_shutdown(struct device *dev)
+{
+	nvdimm_flush(to_nd_region(dev->parent));
+}
+
 static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
 {
-	struct nd_region *nd_region = to_nd_region(dev->parent);
 	struct pmem_device *pmem = dev_get_drvdata(dev);
+	struct nd_region *nd_region = to_region(pmem);
 	resource_size_t offset = 0, end_trunc = 0;
 	struct nd_namespace_common *ndns;
 	struct nd_namespace_io *nsio;
@@ -382,6 +398,7 @@
 	.probe = nd_pmem_probe,
 	.remove = nd_pmem_remove,
 	.notify = nd_pmem_notify,
+	.shutdown = nd_pmem_shutdown,
 	.drv = {
 		.name = "nd_pmem",
 	},
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
new file mode 100644
index 0000000..b4ee4f71
--- /dev/null
+++ b/drivers/nvdimm/pmem.h
@@ -0,0 +1,24 @@
+#ifndef __NVDIMM_PMEM_H__
+#define __NVDIMM_PMEM_H__
+#include <linux/badblocks.h>
+#include <linux/types.h>
+#include <linux/pfn_t.h>
+#include <linux/fs.h>
+
+long pmem_direct_access(struct block_device *bdev, sector_t sector,
+		      void **kaddr, pfn_t *pfn, long size);
+/* this definition is in its own header for tools/testing/nvdimm to consume */
+struct pmem_device {
+	/* One contiguous memory region per device */
+	phys_addr_t		phys_addr;
+	/* when non-zero this device is hosting a 'pfn' instance */
+	phys_addr_t		data_offset;
+	u64			pfn_flags;
+	void			*virt_addr;
+	/* immutable base size of the namespace */
+	size_t			size;
+	/* trim size when namespace capacity has been section aligned */
+	u32			pfn_pad;
+	struct badblocks	bb;
+};
+#endif /* __NVDIMM_PMEM_H__ */
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index 05a9123..8f24177 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -20,7 +20,7 @@
 {
 	int err, rc;
 	static unsigned long once;
-	struct nd_region_namespaces *num_ns;
+	struct nd_region_data *ndrd;
 	struct nd_region *nd_region = to_nd_region(dev);
 
 	if (nd_region->num_lanes > num_online_cpus()
@@ -33,21 +33,21 @@
 				nd_region->num_lanes);
 	}
 
+	rc = nd_region_activate(nd_region);
+	if (rc)
+		return rc;
+
 	rc = nd_blk_region_init(nd_region);
 	if (rc)
 		return rc;
 
 	rc = nd_region_register_namespaces(nd_region, &err);
-	num_ns = devm_kzalloc(dev, sizeof(*num_ns), GFP_KERNEL);
-	if (!num_ns)
-		return -ENOMEM;
-
 	if (rc < 0)
 		return rc;
 
-	num_ns->active = rc;
-	num_ns->count = rc + err;
-	dev_set_drvdata(dev, num_ns);
+	ndrd = dev_get_drvdata(dev);
+	ndrd->ns_active = rc;
+	ndrd->ns_count = rc + err;
 
 	if (rc && err && rc == err)
 		return -ENODEV;
@@ -82,6 +82,8 @@
 {
 	struct nd_region *nd_region = to_nd_region(dev);
 
+	device_for_each_child(dev, NULL, child_unregister);
+
 	/* flush attribute readers and disable */
 	nvdimm_bus_lock(dev);
 	nd_region->ns_seed = NULL;
@@ -91,7 +93,6 @@
 	dev_set_drvdata(dev, NULL);
 	nvdimm_bus_unlock(dev);
 
-	device_for_each_child(dev, NULL, child_unregister);
 	return 0;
 }
 
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 40fcfea..e8d5ba7 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -14,13 +14,97 @@
 #include <linux/highmem.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/pmem.h>
 #include <linux/sort.h>
 #include <linux/io.h>
 #include <linux/nd.h>
 #include "nd-core.h"
 #include "nd.h"
 
+/*
+ * For readq() and writeq() on 32-bit builds, the hi-lo, lo-hi order is
+ * irrelevant.
+ */
+#include <linux/io-64-nonatomic-hi-lo.h>
+
 static DEFINE_IDA(region_ida);
+static DEFINE_PER_CPU(int, flush_idx);
+
+/*
+ * Map each flush hint of @nvdimm into ndrd->flush_wpq[dimm][], sharing one
+ * mapping between hints in the same page.  Returns 0 or -ENXIO.
+ */
+static int nvdimm_map_flush(struct device *dev, struct nvdimm *nvdimm, int dimm,
+		struct nd_region_data *ndrd)
+{
+	int i, j;
+
+	dev_dbg(dev, "%s: map %d flush address%s\n", nvdimm_name(nvdimm),
+			nvdimm->num_flush, nvdimm->num_flush == 1 ? "" : "es");
+	for (i = 0; i < nvdimm->num_flush; i++) {
+		struct resource *res = &nvdimm->flush_wpq[i];
+		unsigned long pfn = PHYS_PFN(res->start);
+		void __iomem *flush_page;
+
+		/* check if an earlier flush hint already mapped this page */
+		for (j = 0; j < i; j++)
+			if (PHYS_PFN(nvdimm->flush_wpq[j].start) == pfn)
+				break;
+
+		if (j < i)
+			flush_page = (void __iomem *) ((unsigned long)
+					ndrd->flush_wpq[dimm][j] & PAGE_MASK);
+		else	/* ioremap takes a physical address, not a pfn */
+			flush_page = devm_nvdimm_ioremap(dev,
+					PFN_PHYS(pfn), PAGE_SIZE);
+		if (!flush_page)
+			return -ENXIO;
+		ndrd->flush_wpq[dimm][i] = flush_page
+			+ (res->start & ~PAGE_MASK);
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate drvdata with room for every dimm's flush hints and map them.
+ * flush_mask stays 0 when no dimm has hints, since ilog2(0) is undefined.
+ */
+int nd_region_activate(struct nd_region *nd_region)
+{
+	int i, num_flush = 0;
+	struct nd_region_data *ndrd;
+	struct device *dev = &nd_region->dev;
+	size_t flush_data_size = sizeof(void *);
+
+	nvdimm_bus_lock(&nd_region->dev);
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nvdimm *nvdimm = nd_region->mapping[i].nvdimm;
+
+		/* at least one null hint slot per-dimm for the "no-hint" case */
+		flush_data_size += sizeof(void *);
+		num_flush = min_not_zero(num_flush, nvdimm->num_flush);
+		flush_data_size += nvdimm->num_flush * sizeof(void *);
+	}
+	nvdimm_bus_unlock(&nd_region->dev);
+
+	ndrd = devm_kzalloc(dev, sizeof(*ndrd) + flush_data_size, GFP_KERNEL);
+	if (!ndrd)
+		return -ENOMEM;
+	dev_set_drvdata(dev, ndrd);
+
+	ndrd->flush_mask = num_flush ? (1 << ilog2(num_flush)) - 1 : 0;
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nvdimm *nvdimm = nd_region->mapping[i].nvdimm;
+		int rc = nvdimm_map_flush(&nd_region->dev, nvdimm, i, ndrd);
+
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
 
 static void nd_region_release(struct device *dev)
 {
@@ -242,12 +326,12 @@
 static ssize_t init_namespaces_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct nd_region_namespaces *num_ns = dev_get_drvdata(dev);
+	struct nd_region_data *ndrd = dev_get_drvdata(dev);
 	ssize_t rc;
 
 	nvdimm_bus_lock(dev);
-	if (num_ns)
-		rc = sprintf(buf, "%d/%d\n", num_ns->active, num_ns->count);
+	if (ndrd)
+		rc = sprintf(buf, "%d/%d\n", ndrd->ns_active, ndrd->ns_count);
 	else
 		rc = -ENXIO;
 	nvdimm_bus_unlock(dev);
@@ -433,8 +517,6 @@
 
 		if (is_nd_pmem(dev))
 			return;
-
-		to_nd_blk_region(dev)->disable(nvdimm_bus, dev);
 	}
 	if (dev->parent && is_nd_blk(dev->parent) && probe) {
 		nd_region = to_nd_region(dev->parent);
@@ -698,7 +780,6 @@
 		if (ndbr) {
 			nd_region = &ndbr->nd_region;
 			ndbr->enable = ndbr_desc->enable;
-			ndbr->disable = ndbr_desc->disable;
 			ndbr->do_io = ndbr_desc->do_io;
 		}
 		region_buf = ndbr;
@@ -794,6 +875,67 @@
 }
 EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create);
 
+/**
+ * nvdimm_flush - flush any posted write queues between the cpu and pmem media
+ * @nd_region: blk or interleaved pmem region
+ */
+void nvdimm_flush(struct nd_region *nd_region)
+{
+	struct nd_region_data *ndrd = dev_get_drvdata(&nd_region->dev);
+	int i, idx;
+
+	/*
+	 * Spread cpus over the (limited) flush hints: bump a per-cpu index
+	 * by a pid-based hash; 'idx & flush_mask' below selects the hint.
+	 */
+	idx = this_cpu_read(flush_idx);
+	idx = this_cpu_add_return(flush_idx, hash_32(current->pid + idx, 8));
+
+	/*
+	 * The first wmb() is needed to 'sfence' all previous writes
+	 * such that they are architecturally visible for the platform
+	 * buffer flush.  Note that we've already arranged for pmem
+	 * writes to avoid the cache via arch_memcpy_to_pmem().  The
+	 * final wmb() ensures ordering for the NVDIMM flush write.
+	 */
+	wmb();
+	for (i = 0; i < nd_region->ndr_mappings; i++)
+		if (ndrd->flush_wpq[i][0])	/* dimm has >= 1 hint */
+			writeq(1, ndrd->flush_wpq[i][idx & ndrd->flush_mask]);
+	wmb();
+}
+EXPORT_SYMBOL_GPL(nvdimm_flush);
+
+/**
+ * nvdimm_has_flush - determine write flushing requirements
+ * @nd_region: blk or interleaved pmem region
+ *
+ * Returns 1 if any mapped dimm has a flush hint (writes require flushing)
+ * Returns 0 if no mapped dimm has a flush hint (no flushing required)
+ * Returns -ENXIO if the region has no dimm mappings (capability unknown)
+ */
+int nvdimm_has_flush(struct nd_region *nd_region)
+{
+	struct nd_region_data *ndrd = dev_get_drvdata(&nd_region->dev);
+	int i;
+
+	/* no nvdimm == flushing capability unknown */
+	if (nd_region->ndr_mappings == 0)
+		return -ENXIO;
+
+	for (i = 0; i < nd_region->ndr_mappings; i++)
+		/* first slot non-null: flush hints present, flushing required */
+		if (ndrd->flush_wpq[i][0])
+			return 1;
+
+	/*
+	 * The platform defines dimm devices without hints, assume
+	 * platform persistence mechanism like ADR
+	 */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvdimm_has_flush);
+
 void __exit nd_region_devs_exit(void)
 {
 	ida_destroy(&region_ida);
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index fac1b51..9d66b4f 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -31,7 +31,7 @@
 static blk_qc_t dcssblk_make_request(struct request_queue *q,
 						struct bio *bio);
 static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
-			 void __pmem **kaddr, pfn_t *pfn, long size);
+			 void **kaddr, pfn_t *pfn, long size);
 
 static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
 
@@ -884,7 +884,7 @@
 
 static long
 dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
-			void __pmem **kaddr, pfn_t *pfn, long size)
+			void **kaddr, pfn_t *pfn, long size)
 {
 	struct dcssblk_dev_info *dev_info;
 	unsigned long offset, dev_sz;
@@ -894,7 +894,7 @@
 		return -ENODEV;
 	dev_sz = dev_info->end - dev_info->start;
 	offset = secnum * 512;
-	*kaddr = (void __pmem *) (dev_info->start + offset);
+	*kaddr = (void *) dev_info->start + offset;
 	*pfn = __pfn_to_pfn_t(PFN_DOWN(dev_info->start + offset), PFN_DEV);
 
 	return dev_sz - offset;
diff --git a/fs/dax.c b/fs/dax.c
index 432b9e6..993dc6f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -75,13 +75,13 @@
 	struct request_queue *q = bdev->bd_queue;
 	long rc = -EIO;
 
-	dax->addr = (void __pmem *) ERR_PTR(-EIO);
+	dax->addr = ERR_PTR(-EIO);
 	if (blk_queue_enter(q, true) != 0)
 		return rc;
 
 	rc = bdev_direct_access(bdev, dax);
 	if (rc < 0) {
-		dax->addr = (void __pmem *) ERR_PTR(rc);
+		dax->addr = ERR_PTR(rc);
 		blk_queue_exit(q);
 		return rc;
 	}
@@ -147,12 +147,12 @@
 		      struct buffer_head *bh)
 {
 	loff_t pos = start, max = start, bh_max = start;
-	bool hole = false, need_wmb = false;
+	bool hole = false;
 	struct block_device *bdev = NULL;
 	int rw = iov_iter_rw(iter), rc;
 	long map_len = 0;
 	struct blk_dax_ctl dax = {
-		.addr = (void __pmem *) ERR_PTR(-EIO),
+		.addr = ERR_PTR(-EIO),
 	};
 	unsigned blkbits = inode->i_blkbits;
 	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
@@ -218,7 +218,6 @@
 
 		if (iov_iter_rw(iter) == WRITE) {
 			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
-			need_wmb = true;
 		} else if (!hole)
 			len = copy_to_iter((void __force *) dax.addr, max - pos,
 					iter);
@@ -235,8 +234,6 @@
 			dax.addr += len;
 	}
 
-	if (need_wmb)
-		wmb_pmem();
 	dax_unmap_atomic(bdev, &dax);
 
 	return (pos == start) ? rc : pos - start;
@@ -788,7 +785,6 @@
 				return ret;
 		}
 	}
-	wmb_pmem();
 	return 0;
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
@@ -1187,7 +1183,6 @@
 		if (dax_map_atomic(bdev, &dax) < 0)
 			return PTR_ERR(dax.addr);
 		clear_pmem(dax.addr + offset, length);
-		wmb_pmem();
 		dax_unmap_atomic(bdev, &dax);
 	}
 	return 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c96db9c..adf3307 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1665,7 +1665,7 @@
  */
 struct blk_dax_ctl {
 	sector_t sector;
-	void __pmem *addr;
+	void *addr;
 	long size;
 	pfn_t pfn;
 };
@@ -1676,8 +1676,8 @@
 	int (*rw_page)(struct block_device *, sector_t, struct page *, int rw);
 	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
-	long (*direct_access)(struct block_device *, sector_t, void __pmem **,
-			pfn_t *, long);
+	long (*direct_access)(struct block_device *, sector_t, void **, pfn_t *,
+			long);
 	unsigned int (*check_events) (struct gendisk *disk,
 				      unsigned int clearing);
 	/* ->media_changed() is DEPRECATED, use ->check_events() instead */
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 2e853b6..1bb9548 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -17,7 +17,6 @@
 # define __release(x)	__context__(x,-1)
 # define __cond_lock(x,c)	((c) ? ({ __acquire(x); 1; }) : 0)
 # define __percpu	__attribute__((noderef, address_space(3)))
-# define __pmem		__attribute__((noderef, address_space(5)))
 #ifdef CONFIG_SPARSE_RCU_POINTER
 # define __rcu		__attribute__((noderef, address_space(4)))
 #else /* CONFIG_SPARSE_RCU_POINTER */
@@ -45,7 +44,6 @@
 # define __cond_lock(x,c) (c)
 # define __percpu
 # define __rcu
-# define __pmem
 # define __private
 # define ACCESS_PRIVATE(p, member) ((p)->member)
 #endif /* __CHECKER__ */
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index b0db857..91acfce 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -131,7 +131,7 @@
  * >= 0 : the number of bytes accessible at the address
  */
 typedef long (*dm_direct_access_fn) (struct dm_target *ti, sector_t sector,
-				     void __pmem **kaddr, pfn_t *pfn, long size);
+				     void **kaddr, pfn_t *pfn, long size);
 
 void dm_error(const char *message);
 
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 0c3c30c..b519e13 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -52,6 +52,7 @@
 
 struct nd_namespace_label;
 struct nvdimm_drvdata;
+
 struct nd_mapping {
 	struct nvdimm *nvdimm;
 	struct nd_namespace_label **labels;
@@ -69,6 +70,7 @@
 struct nvdimm_bus_descriptor {
 	const struct attribute_group **attr_groups;
 	unsigned long cmd_mask;
+	struct module *module;
 	char *provider_name;
 	ndctl_fn ndctl;
 	int (*flush_probe)(struct nvdimm_bus_descriptor *nd_desc);
@@ -99,13 +101,21 @@
 	unsigned long flags;
 };
 
+struct device;
+void *devm_nvdimm_memremap(struct device *dev, resource_size_t offset,
+		size_t size, unsigned long flags);
+static inline void __iomem *devm_nvdimm_ioremap(struct device *dev,
+		resource_size_t offset, size_t size)
+{
+	return (void __iomem *) devm_nvdimm_memremap(dev, offset, size, 0);
+}
+
 struct nvdimm_bus;
 struct module;
 struct device;
 struct nd_blk_region;
 struct nd_blk_region_desc {
 	int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
-	void (*disable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
 	int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
 			void *iobuf, u64 len, int rw);
 	struct nd_region_desc ndr_desc;
@@ -119,22 +129,22 @@
 }
 
 int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length);
-struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
-		struct nvdimm_bus_descriptor *nfit_desc, struct module *module);
-#define nvdimm_bus_register(parent, desc) \
-	__nvdimm_bus_register(parent, desc, THIS_MODULE)
+struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
+		struct nvdimm_bus_descriptor *nfit_desc);
 void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus);
 struct nvdimm_bus *to_nvdimm_bus(struct device *dev);
 struct nvdimm *to_nvdimm(struct device *dev);
 struct nd_region *to_nd_region(struct device *dev);
 struct nd_blk_region *to_nd_blk_region(struct device *dev);
 struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus);
+struct device *to_nvdimm_bus_dev(struct nvdimm_bus *nvdimm_bus);
 const char *nvdimm_name(struct nvdimm *nvdimm);
 unsigned long nvdimm_cmd_mask(struct nvdimm *nvdimm);
 void *nvdimm_provider_data(struct nvdimm *nvdimm);
 struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
 		const struct attribute_group **groups, unsigned long flags,
-		unsigned long cmd_mask);
+		unsigned long cmd_mask, int num_flush,
+		struct resource *flush_wpq);
 const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd);
 const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd);
 u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd,
@@ -156,4 +166,6 @@
 unsigned int nd_region_acquire_lane(struct nd_region *nd_region);
 void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane);
 u64 nd_fletcher64(void *addr, size_t len, bool le);
+void nvdimm_flush(struct nd_region *nd_region);
+int nvdimm_has_flush(struct nd_region *nd_region);
 #endif /* __LIBNVDIMM_H__ */
diff --git a/include/linux/nd.h b/include/linux/nd.h
index aee2761..f1ea426 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -26,6 +26,7 @@
 	unsigned long type;
 	int (*probe)(struct device *dev);
 	int (*remove)(struct device *dev);
+	void (*shutdown)(struct device *dev);
 	void (*notify)(struct device *dev, enum nvdimm_event event);
 };
 
@@ -67,7 +68,7 @@
 	struct nd_namespace_common common;
 	struct resource res;
 	resource_size_t size;
-	void __pmem *addr;
+	void *addr;
 	struct badblocks bb;
 };
 
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index 9499481..a3d90b9 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -28,7 +28,10 @@
 	return __pfn_to_pfn_t(pfn, 0);
 }
 
-extern pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags);
+static inline pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
+{
+	return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
+}
 
 static inline bool pfn_t_has_page(pfn_t pfn)
 {
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
index 57d146f..e856c2c 100644
--- a/include/linux/pmem.h
+++ b/include/linux/pmem.h
@@ -26,47 +26,35 @@
  * calling these symbols with arch_has_pmem_api() and redirect to the
  * implementation in asm/pmem.h.
  */
-static inline bool __arch_has_wmb_pmem(void)
-{
-	return false;
-}
-
-static inline void arch_wmb_pmem(void)
+static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n)
 {
 	BUG();
 }
 
-static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
-		size_t n)
-{
-	BUG();
-}
-
-static inline int arch_memcpy_from_pmem(void *dst, const void __pmem *src,
-		size_t n)
+static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n)
 {
 	BUG();
 	return -EFAULT;
 }
 
-static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
+static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes,
 		struct iov_iter *i)
 {
 	BUG();
 	return 0;
 }
 
-static inline void arch_clear_pmem(void __pmem *addr, size_t size)
+static inline void arch_clear_pmem(void *addr, size_t size)
 {
 	BUG();
 }
 
-static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size)
+static inline void arch_wb_cache_pmem(void *addr, size_t size)
 {
 	BUG();
 }
 
-static inline void arch_invalidate_pmem(void __pmem *addr, size_t size)
+static inline void arch_invalidate_pmem(void *addr, size_t size)
 {
 	BUG();
 }
@@ -77,13 +65,6 @@
 	return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API);
 }
 
-static inline int default_memcpy_from_pmem(void *dst, void __pmem const *src,
-		size_t size)
-{
-	memcpy(dst, (void __force *) src, size);
-	return 0;
-}
-
 /*
  * memcpy_from_pmem - read from persistent memory with error handling
  * @dst: destination buffer
@@ -92,54 +73,13 @@
  *
  * Returns 0 on success negative error code on failure.
  */
-static inline int memcpy_from_pmem(void *dst, void __pmem const *src,
-		size_t size)
+static inline int memcpy_from_pmem(void *dst, void const *src, size_t size)
 {
 	if (arch_has_pmem_api())
 		return arch_memcpy_from_pmem(dst, src, size);
 	else
-		return default_memcpy_from_pmem(dst, src, size);
-}
-
-/**
- * arch_has_wmb_pmem - true if wmb_pmem() ensures durability
- *
- * For a given cpu implementation within an architecture it is possible
- * that wmb_pmem() resolves to a nop.  In the case this returns
- * false, pmem api users are unable to ensure durability and may want to
- * fall back to a different data consistency model, or otherwise notify
- * the user.
- */
-static inline bool arch_has_wmb_pmem(void)
-{
-	return arch_has_pmem_api() && __arch_has_wmb_pmem();
-}
-
-/*
- * These defaults seek to offer decent performance and minimize the
- * window between i/o completion and writes being durable on media.
- * However, it is undefined / architecture specific whether
- * ARCH_MEMREMAP_PMEM + default_memcpy_to_pmem is sufficient for
- * making data durable relative to i/o completion.
- */
-static inline void default_memcpy_to_pmem(void __pmem *dst, const void *src,
-		size_t size)
-{
-	memcpy((void __force *) dst, src, size);
-}
-
-static inline size_t default_copy_from_iter_pmem(void __pmem *addr,
-		size_t bytes, struct iov_iter *i)
-{
-	return copy_from_iter_nocache((void __force *)addr, bytes, i);
-}
-
-static inline void default_clear_pmem(void __pmem *addr, size_t size)
-{
-	if (size == PAGE_SIZE && ((unsigned long)addr & ~PAGE_MASK) == 0)
-		clear_page((void __force *)addr);
-	else
-		memset((void __force *)addr, 0, size);
+		memcpy(dst, src, size);
+	return 0;
 }
 
 /**
@@ -152,29 +92,14 @@
  * being effectively evicted from, or never written to, the processor
  * cache hierarchy after the copy completes.  After memcpy_to_pmem()
  * data may still reside in cpu or platform buffers, so this operation
- * must be followed by a wmb_pmem().
+ * must be followed by a blkdev_issue_flush() on the pmem block device.
  */
-static inline void memcpy_to_pmem(void __pmem *dst, const void *src, size_t n)
+static inline void memcpy_to_pmem(void *dst, const void *src, size_t n)
 {
 	if (arch_has_pmem_api())
 		arch_memcpy_to_pmem(dst, src, n);
 	else
-		default_memcpy_to_pmem(dst, src, n);
-}
-
-/**
- * wmb_pmem - synchronize writes to persistent memory
- *
- * After a series of memcpy_to_pmem() operations this drains data from
- * cpu write buffers and any platform (memory controller) buffers to
- * ensure that written data is durable on persistent memory media.
- */
-static inline void wmb_pmem(void)
-{
-	if (arch_has_wmb_pmem())
-		arch_wmb_pmem();
-	else
-		wmb();
+		memcpy(dst, src, n);
 }
 
 /**
@@ -184,14 +109,14 @@
  * @i:		iterator with source data
  *
  * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'.
- * This function requires explicit ordering with a wmb_pmem() call.
+ * See blkdev_issue_flush() note for memcpy_to_pmem().
  */
-static inline size_t copy_from_iter_pmem(void __pmem *addr, size_t bytes,
+static inline size_t copy_from_iter_pmem(void *addr, size_t bytes,
 		struct iov_iter *i)
 {
 	if (arch_has_pmem_api())
 		return arch_copy_from_iter_pmem(addr, bytes, i);
-	return default_copy_from_iter_pmem(addr, bytes, i);
+	return copy_from_iter_nocache(addr, bytes, i);
 }
 
 /**
@@ -200,14 +125,14 @@
  * @size:	number of bytes to zero
  *
  * Write zeros into the memory range starting at 'addr' for 'size' bytes.
- * This function requires explicit ordering with a wmb_pmem() call.
+ * See blkdev_issue_flush() note for memcpy_to_pmem().
  */
-static inline void clear_pmem(void __pmem *addr, size_t size)
+static inline void clear_pmem(void *addr, size_t size)
 {
 	if (arch_has_pmem_api())
 		arch_clear_pmem(addr, size);
 	else
-		default_clear_pmem(addr, size);
+		memset(addr, 0, size);
 }
 
 /**
@@ -218,7 +143,7 @@
  * For platforms that support clearing poison this flushes any poisoned
  * ranges out of the cache
  */
-static inline void invalidate_pmem(void __pmem *addr, size_t size)
+static inline void invalidate_pmem(void *addr, size_t size)
 {
 	if (arch_has_pmem_api())
 		arch_invalidate_pmem(addr, size);
@@ -230,9 +155,9 @@
  * @size:	number of bytes to write back
  *
  * Write back the processor cache range starting at 'addr' for 'size' bytes.
- * This function requires explicit ordering with a wmb_pmem() call.
+ * See blkdev_issue_flush() note for memcpy_to_pmem().
  */
-static inline void wb_cache_pmem(void __pmem *addr, size_t size)
+static inline void wb_cache_pmem(void *addr, size_t size)
 {
 	if (arch_has_pmem_api())
 		arch_wb_cache_pmem(addr, size);
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 309915f7..ba5a8c7 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -298,6 +298,7 @@
 #define NVDIMM_FAMILY_INTEL 0
 #define NVDIMM_FAMILY_HPE1 1
 #define NVDIMM_FAMILY_HPE2 2
+#define NVDIMM_FAMILY_MSFT 3
 
 #define ND_IOCTL_CALL			_IOWR(ND_IOCTL, ND_CMD_CALL,\
 					struct nd_cmd_pkg)
diff --git a/kernel/memremap.c b/kernel/memremap.c
index ddb3247a..251d16b 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -169,12 +169,6 @@
 }
 EXPORT_SYMBOL(devm_memunmap);
 
-pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
-{
-	return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
-}
-EXPORT_SYMBOL(phys_to_pfn_t);
-
 #ifdef CONFIG_ZONE_DEVICE
 static DEFINE_MUTEX(pgmap_lock);
 static RADIX_TREE(pgmap_radix, GFP_KERNEL);
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 4904ced..24a0836 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -313,7 +313,6 @@
 			__kernel|
 			__force|
 			__iomem|
-			__pmem|
 			__must_check|
 			__init_refok|
 			__kprobes|
diff --git a/tools/objtool/arch/x86/insn/x86-opcode-map.txt b/tools/objtool/arch/x86/insn/x86-opcode-map.txt
index d388de7..28632ee 100644
--- a/tools/objtool/arch/x86/insn/x86-opcode-map.txt
+++ b/tools/objtool/arch/x86/insn/x86-opcode-map.txt
@@ -947,7 +947,7 @@
 4: XSAVE
 5: XRSTOR | lfence (11B)
 6: XSAVEOPT | clwb (66) | mfence (11B)
-7: clflush | clflushopt (66) | sfence (11B) | pcommit (66),(11B)
+7: clflush | clflushopt (66) | sfence (11B)
 EndTable
 
 GrpTable: Grp16
diff --git a/tools/perf/arch/x86/tests/insn-x86-dat-32.c b/tools/perf/arch/x86/tests/insn-x86-dat-32.c
index 3918dd5..0f196ee 100644
--- a/tools/perf/arch/x86/tests/insn-x86-dat-32.c
+++ b/tools/perf/arch/x86/tests/insn-x86-dat-32.c
@@ -1664,5 +1664,3 @@
 "0f c7 1d 78 56 34 12 \txrstors 0x12345678",},
 {{0x0f, 0xc7, 0x9c, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "",
 "0f c7 9c c8 78 56 34 12 \txrstors 0x12345678(%eax,%ecx,8)",},
-{{0x66, 0x0f, 0xae, 0xf8, }, 4, 0, "", "",
-"66 0f ae f8          \tpcommit ",},
diff --git a/tools/perf/arch/x86/tests/insn-x86-dat-64.c b/tools/perf/arch/x86/tests/insn-x86-dat-64.c
index 9c8c61e..af25bc8 100644
--- a/tools/perf/arch/x86/tests/insn-x86-dat-64.c
+++ b/tools/perf/arch/x86/tests/insn-x86-dat-64.c
@@ -1696,5 +1696,3 @@
 "0f c7 9c c8 78 56 34 12 \txrstors 0x12345678(%rax,%rcx,8)",},
 {{0x41, 0x0f, 0xc7, 0x9c, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "",
 "41 0f c7 9c c8 78 56 34 12 \txrstors 0x12345678(%r8,%rcx,8)",},
-{{0x66, 0x0f, 0xae, 0xf8, }, 4, 0, "", "",
-"66 0f ae f8          \tpcommit ",},
diff --git a/tools/perf/arch/x86/tests/insn-x86-dat-src.c b/tools/perf/arch/x86/tests/insn-x86-dat-src.c
index 76e0ec3..979487d 100644
--- a/tools/perf/arch/x86/tests/insn-x86-dat-src.c
+++ b/tools/perf/arch/x86/tests/insn-x86-dat-src.c
@@ -2655,10 +2655,6 @@
 
 #endif /* #ifndef __x86_64__ */
 
-	/* pcommit */
-
-	asm volatile("pcommit");
-
 	/* Following line is a marker for the awk script - do not change */
 	asm volatile("rdtsc"); /* Stop here */
 
diff --git a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt
index ec378cd..767be7c 100644
--- a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt
+++ b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt
@@ -1012,7 +1012,7 @@
 4: XSAVE
 5: XRSTOR | lfence (11B)
 6: XSAVEOPT | clwb (66) | mfence (11B)
-7: clflush | clflushopt (66) | sfence (11B) | pcommit (66),(11B)
+7: clflush | clflushopt (66) | sfence (11B)
 EndTable
 
 GrpTable: Grp16
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index 7859856..ad6dd05 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -11,12 +11,14 @@
 ldflags-y += --wrap=__request_region
 ldflags-y += --wrap=__release_region
 ldflags-y += --wrap=devm_memremap_pages
-ldflags-y += --wrap=phys_to_pfn_t
+ldflags-y += --wrap=insert_resource
+ldflags-y += --wrap=remove_resource
 
 DRIVERS := ../../../drivers
 NVDIMM_SRC := $(DRIVERS)/nvdimm
-ACPI_SRC := $(DRIVERS)/acpi
+ACPI_SRC := $(DRIVERS)/acpi/nfit
 DAX_SRC := $(DRIVERS)/dax
+ccflags-y := -I$(src)/$(NVDIMM_SRC)/
 
 obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
 obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
@@ -27,10 +29,12 @@
 obj-$(CONFIG_DEV_DAX) += dax.o
 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
 
-nfit-y := $(ACPI_SRC)/nfit.o
+nfit-y := $(ACPI_SRC)/core.o
+nfit-$(CONFIG_X86_MCE) += $(ACPI_SRC)/mce.o
 nfit-y += config_check.o
 
 nd_pmem-y := $(NVDIMM_SRC)/pmem.o
+nd_pmem-y += pmem-dax.o
 nd_pmem-y += config_check.o
 
 nd_btt-y := $(NVDIMM_SRC)/btt.o
diff --git a/tools/testing/nvdimm/config_check.c b/tools/testing/nvdimm/config_check.c
index adf18bf..878daf3 100644
--- a/tools/testing/nvdimm/config_check.c
+++ b/tools/testing/nvdimm/config_check.c
@@ -10,6 +10,7 @@
 	BUILD_BUG_ON(!IS_MODULE(CONFIG_LIBNVDIMM));
 	BUILD_BUG_ON(!IS_MODULE(CONFIG_BLK_DEV_PMEM));
 	BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_BTT));
+	BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_PFN));
 	BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_BLK));
 	BUILD_BUG_ON(!IS_MODULE(CONFIG_ACPI_NFIT));
 	BUILD_BUG_ON(!IS_MODULE(CONFIG_DEV_DAX));
diff --git a/tools/testing/nvdimm/pmem-dax.c b/tools/testing/nvdimm/pmem-dax.c
new file mode 100644
index 0000000..c9b8c48
--- /dev/null
+++ b/tools/testing/nvdimm/pmem-dax.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2014-2016, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include "test/nfit_test.h"
+#include <linux/blkdev.h>
+#include <pmem.h>
+#include <nd.h>
+
+long pmem_direct_access(struct block_device *bdev, sector_t sector,
+		void **kaddr, pfn_t *pfn, long size)
+{
+	struct pmem_device *pmem = bdev->bd_queue->queuedata;
+	resource_size_t offset = sector * 512 + pmem->data_offset;
+
+	if (unlikely(is_bad_pmem(&pmem->bb, sector, size)))
+		return -EIO;	/* requested range overlaps a bad block */
+
+	/*
+	 * Limit dax to a single page at a time, since nfit_test
+	 * resources are vmalloc()-backed.
+	 */
+	if (get_nfit_res(pmem->phys_addr + offset)) {
+		struct page *page;
+
+		*kaddr = pmem->virt_addr + offset;
+		page = vmalloc_to_page(pmem->virt_addr + offset);
+		*pfn = page_to_pfn_t(page);
+		dev_dbg_ratelimited(disk_to_dev(bdev->bd_disk)->parent,
+				"%s: sector: %#llx pfn: %#lx\n", __func__,
+				(unsigned long long) sector, page_to_pfn(page));
+
+		return PAGE_SIZE;
+	}
+
+	*kaddr = pmem->virt_addr + offset;
+	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
+
+	/*
+	 * If badblocks are present, limit known good range to the
+	 * requested range; otherwise report all bytes left past 'offset'.
+	 */
+	if (unlikely(pmem->bb.count))
+		return size;
+	return pmem->size - pmem->pfn_pad - offset;
+}
diff --git a/tools/testing/nvdimm/test/Kbuild b/tools/testing/nvdimm/test/Kbuild
index 9241064..d32f25b 100644
--- a/tools/testing/nvdimm/test/Kbuild
+++ b/tools/testing/nvdimm/test/Kbuild
@@ -1,5 +1,5 @@
 ccflags-y := -I$(src)/../../../../drivers/nvdimm/
-ccflags-y += -I$(src)/../../../../drivers/acpi/
+ccflags-y += -I$(src)/../../../../drivers/acpi/nfit/
 
 obj-m += nfit_test.o
 obj-m += nfit_test_iomap.o
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index c842095..c29f8dc 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -10,11 +10,13 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  */
+#include <linux/memremap.h>
 #include <linux/rculist.h>
 #include <linux/export.h>
 #include <linux/ioport.h>
 #include <linux/module.h>
 #include <linux/types.h>
+#include <linux/pfn_t.h>
 #include <linux/io.h>
 #include <linux/mm.h>
 #include "nfit_test.h"
@@ -52,7 +54,7 @@
 	return NULL;
 }
 
-static struct nfit_test_resource *get_nfit_res(resource_size_t resource)
+struct nfit_test_resource *get_nfit_res(resource_size_t resource)
 {
 	struct nfit_test_resource *res;
 
@@ -62,6 +64,7 @@
 
 	return res;
 }
+EXPORT_SYMBOL(get_nfit_res);
 
 void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size,
 		void __iomem *(*fallback_fn)(resource_size_t, unsigned long))
@@ -97,10 +100,6 @@
 }
 EXPORT_SYMBOL(__wrap_devm_memremap);
 
-#ifdef __HAVE_ARCH_PTE_DEVMAP
-#include <linux/memremap.h>
-#include <linux/pfn_t.h>
-
 void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res,
 		struct percpu_ref *ref, struct vmem_altmap *altmap)
 {
@@ -122,19 +121,6 @@
         return phys_to_pfn_t(addr, flags);
 }
 EXPORT_SYMBOL(__wrap_phys_to_pfn_t);
-#else
-/* to be removed post 4.5-rc1 */
-void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res)
-{
-	resource_size_t offset = res->start;
-	struct nfit_test_resource *nfit_res = get_nfit_res(offset);
-
-	if (nfit_res)
-		return nfit_res->buf + offset - nfit_res->res->start;
-	return devm_memremap_pages(dev, res);
-}
-EXPORT_SYMBOL(__wrap_devm_memremap_pages);
-#endif
 
 void *__wrap_memremap(resource_size_t offset, size_t size,
 		unsigned long flags)
@@ -229,6 +215,22 @@
 }
 EXPORT_SYMBOL(__wrap___request_region);
 
+int __wrap_insert_resource(struct resource *parent, struct resource *res)
+{
+	if (get_nfit_res(res->start))	/* nfit_test range: report success */
+		return 0;
+	return insert_resource(parent, res);
+}
+EXPORT_SYMBOL(__wrap_insert_resource);
+
+int __wrap_remove_resource(struct resource *res)
+{
+	if (get_nfit_res(res->start))	/* nfit_test range: nothing to remove */
+		return 0;
+	return remove_resource(res);
+}
+EXPORT_SYMBOL(__wrap_remove_resource);
+
 struct resource *__wrap___devm_request_region(struct device *dev,
 		struct resource *parent, resource_size_t start,
 		resource_size_t n, const char *name)
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index c919866..5404efa 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -98,11 +98,13 @@
 enum {
 	NUM_PM  = 3,
 	NUM_DCR = 5,
+	NUM_HINTS = 8,
 	NUM_BDW = NUM_DCR,
 	NUM_SPA = NUM_PM + NUM_DCR + NUM_BDW,
 	NUM_MEM = NUM_DCR + NUM_BDW + 2 /* spa0 iset */ + 4 /* spa1 iset */,
 	DIMM_SIZE = SZ_32M,
 	LABEL_SIZE = SZ_128K,
+	SPA_VCD_SIZE = SZ_4M,
 	SPA0_SIZE = DIMM_SIZE,
 	SPA1_SIZE = DIMM_SIZE*2,
 	SPA2_SIZE = DIMM_SIZE,
@@ -470,11 +472,7 @@
 	list_del(&nfit_res->list);
 	spin_unlock(&nfit_test_lock);
 
-	if (is_vmalloc_addr(nfit_res->buf))
-		vfree(nfit_res->buf);
-	else
-		dma_free_coherent(nfit_res->dev, resource_size(res),
-				nfit_res->buf, res->start);
+	vfree(nfit_res->buf);
 	kfree(res);
 	kfree(nfit_res);
 }
@@ -507,9 +505,7 @@
 
 	return nfit_res->buf;
  err:
-	if (buf && !is_vmalloc_addr(buf))
-		dma_free_coherent(dev, size, buf, *dma);
-	else if (buf)
+	if (buf)
 		vfree(buf);
 	kfree(res);
 	kfree(nfit_res);
@@ -524,15 +520,6 @@
 	return __test_alloc(t, size, dma, buf);
 }
 
-static void *test_alloc_coherent(struct nfit_test *t, size_t size,
-		dma_addr_t *dma)
-{
-	struct device *dev = &t->pdev.dev;
-	void *buf = dma_alloc_coherent(dev, size, dma, GFP_KERNEL);
-
-	return __test_alloc(t, size, dma, buf);
-}
-
 static struct nfit_test_resource *nfit_test_lookup(resource_size_t addr)
 {
 	int i;
@@ -584,7 +571,8 @@
 			+ offsetof(struct acpi_nfit_control_region,
 					window_size) * NUM_DCR
 			+ sizeof(struct acpi_nfit_data_region) * NUM_BDW
-			+ sizeof(struct acpi_nfit_flush_address) * NUM_DCR;
+			+ (sizeof(struct acpi_nfit_flush_address)
+					+ sizeof(u64) * NUM_HINTS) * NUM_DCR;
 	int i;
 
 	t->nfit_buf = test_alloc(t, nfit_size, &t->nfit_dma);
@@ -592,15 +580,15 @@
 		return -ENOMEM;
 	t->nfit_size = nfit_size;
 
-	t->spa_set[0] = test_alloc_coherent(t, SPA0_SIZE, &t->spa_set_dma[0]);
+	t->spa_set[0] = test_alloc(t, SPA0_SIZE, &t->spa_set_dma[0]);
 	if (!t->spa_set[0])
 		return -ENOMEM;
 
-	t->spa_set[1] = test_alloc_coherent(t, SPA1_SIZE, &t->spa_set_dma[1]);
+	t->spa_set[1] = test_alloc(t, SPA1_SIZE, &t->spa_set_dma[1]);
 	if (!t->spa_set[1])
 		return -ENOMEM;
 
-	t->spa_set[2] = test_alloc_coherent(t, SPA0_SIZE, &t->spa_set_dma[2]);
+	t->spa_set[2] = test_alloc(t, SPA0_SIZE, &t->spa_set_dma[2]);
 	if (!t->spa_set[2])
 		return -ENOMEM;
 
@@ -614,7 +602,8 @@
 			return -ENOMEM;
 		sprintf(t->label[i], "label%d", i);
 
-		t->flush[i] = test_alloc(t, 8, &t->flush_dma[i]);
+		t->flush[i] = test_alloc(t, sizeof(u64) * NUM_HINTS,
+				&t->flush_dma[i]);
 		if (!t->flush[i])
 			return -ENOMEM;
 	}
@@ -630,7 +619,7 @@
 
 static int nfit_test1_alloc(struct nfit_test *t)
 {
-	size_t nfit_size = sizeof(struct acpi_nfit_system_address)
+	size_t nfit_size = sizeof(struct acpi_nfit_system_address) * 2
 		+ sizeof(struct acpi_nfit_memory_map)
 		+ offsetof(struct acpi_nfit_control_region, window_size);
 
@@ -639,15 +628,31 @@
 		return -ENOMEM;
 	t->nfit_size = nfit_size;
 
-	t->spa_set[0] = test_alloc_coherent(t, SPA2_SIZE, &t->spa_set_dma[0]);
+	t->spa_set[0] = test_alloc(t, SPA2_SIZE, &t->spa_set_dma[0]);
 	if (!t->spa_set[0])
 		return -ENOMEM;
 
+	t->spa_set[1] = test_alloc(t, SPA_VCD_SIZE, &t->spa_set_dma[1]);
+	if (!t->spa_set[1])
+		return -ENOMEM;
+
 	return ars_state_init(&t->pdev.dev, &t->ars_state);
 }
 
+static void dcr_common_init(struct acpi_nfit_control_region *dcr)
+{ /* fills the id fields shared by all callers; serial_number/code/windows are set by the caller */
+	dcr->vendor_id = 0xabcd;
+	dcr->device_id = 0;
+	dcr->revision_id = 1;
+	dcr->valid_fields = 1; /* NOTE(review): presumably marks the manufacturing fields below as valid - confirm */
+	dcr->manufacturing_location = 0xa;
+	dcr->manufacturing_date = cpu_to_be16(2016); /* converted to big-endian 16-bit before storing */
+}
+
 static void nfit_test0_setup(struct nfit_test *t)
 {
+	const int flush_hint_size = sizeof(struct acpi_nfit_flush_address)
+		+ (sizeof(u64) * NUM_HINTS);
 	struct acpi_nfit_desc *acpi_desc;
 	struct acpi_nfit_memory_map *memdev;
 	void *nfit_buf = t->nfit_buf;
@@ -655,7 +660,7 @@
 	struct acpi_nfit_control_region *dcr;
 	struct acpi_nfit_data_region *bdw;
 	struct acpi_nfit_flush_address *flush;
-	unsigned int offset;
+	unsigned int offset, i;
 
 	/*
 	 * spa0 (interleave first half of dimm0 and dimm1, note storage
@@ -972,9 +977,7 @@
 	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
 	dcr->header.length = sizeof(struct acpi_nfit_control_region);
 	dcr->region_index = 0+1;
-	dcr->vendor_id = 0xabcd;
-	dcr->device_id = 0;
-	dcr->revision_id = 1;
+	dcr_common_init(dcr);
 	dcr->serial_number = ~handle[0];
 	dcr->code = NFIT_FIC_BLK;
 	dcr->windows = 1;
@@ -989,9 +992,7 @@
 	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
 	dcr->header.length = sizeof(struct acpi_nfit_control_region);
 	dcr->region_index = 1+1;
-	dcr->vendor_id = 0xabcd;
-	dcr->device_id = 0;
-	dcr->revision_id = 1;
+	dcr_common_init(dcr);
 	dcr->serial_number = ~handle[1];
 	dcr->code = NFIT_FIC_BLK;
 	dcr->windows = 1;
@@ -1006,9 +1007,7 @@
 	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
 	dcr->header.length = sizeof(struct acpi_nfit_control_region);
 	dcr->region_index = 2+1;
-	dcr->vendor_id = 0xabcd;
-	dcr->device_id = 0;
-	dcr->revision_id = 1;
+	dcr_common_init(dcr);
 	dcr->serial_number = ~handle[2];
 	dcr->code = NFIT_FIC_BLK;
 	dcr->windows = 1;
@@ -1023,9 +1022,7 @@
 	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
 	dcr->header.length = sizeof(struct acpi_nfit_control_region);
 	dcr->region_index = 3+1;
-	dcr->vendor_id = 0xabcd;
-	dcr->device_id = 0;
-	dcr->revision_id = 1;
+	dcr_common_init(dcr);
 	dcr->serial_number = ~handle[3];
 	dcr->code = NFIT_FIC_BLK;
 	dcr->windows = 1;
@@ -1042,9 +1039,7 @@
 	dcr->header.length = offsetof(struct acpi_nfit_control_region,
 			window_size);
 	dcr->region_index = 4+1;
-	dcr->vendor_id = 0xabcd;
-	dcr->device_id = 0;
-	dcr->revision_id = 1;
+	dcr_common_init(dcr);
 	dcr->serial_number = ~handle[0];
 	dcr->code = NFIT_FIC_BYTEN;
 	dcr->windows = 0;
@@ -1056,9 +1051,7 @@
 	dcr->header.length = offsetof(struct acpi_nfit_control_region,
 			window_size);
 	dcr->region_index = 5+1;
-	dcr->vendor_id = 0xabcd;
-	dcr->device_id = 0;
-	dcr->revision_id = 1;
+	dcr_common_init(dcr);
 	dcr->serial_number = ~handle[1];
 	dcr->code = NFIT_FIC_BYTEN;
 	dcr->windows = 0;
@@ -1070,9 +1063,7 @@
 	dcr->header.length = offsetof(struct acpi_nfit_control_region,
 			window_size);
 	dcr->region_index = 6+1;
-	dcr->vendor_id = 0xabcd;
-	dcr->device_id = 0;
-	dcr->revision_id = 1;
+	dcr_common_init(dcr);
 	dcr->serial_number = ~handle[2];
 	dcr->code = NFIT_FIC_BYTEN;
 	dcr->windows = 0;
@@ -1084,9 +1075,7 @@
 	dcr->header.length = offsetof(struct acpi_nfit_control_region,
 			window_size);
 	dcr->region_index = 7+1;
-	dcr->vendor_id = 0xabcd;
-	dcr->device_id = 0;
-	dcr->revision_id = 1;
+	dcr_common_init(dcr);
 	dcr->serial_number = ~handle[3];
 	dcr->code = NFIT_FIC_BYTEN;
 	dcr->windows = 0;
@@ -1141,45 +1130,47 @@
 	/* flush0 (dimm0) */
 	flush = nfit_buf + offset;
 	flush->header.type = ACPI_NFIT_TYPE_FLUSH_ADDRESS;
-	flush->header.length = sizeof(struct acpi_nfit_flush_address);
+	flush->header.length = flush_hint_size;
 	flush->device_handle = handle[0];
-	flush->hint_count = 1;
-	flush->hint_address[0] = t->flush_dma[0];
+	flush->hint_count = NUM_HINTS;
+	for (i = 0; i < NUM_HINTS; i++)
+		flush->hint_address[i] = t->flush_dma[0] + i * sizeof(u64);
 
 	/* flush1 (dimm1) */
-	flush = nfit_buf + offset + sizeof(struct acpi_nfit_flush_address) * 1;
+	flush = nfit_buf + offset + flush_hint_size * 1;
 	flush->header.type = ACPI_NFIT_TYPE_FLUSH_ADDRESS;
-	flush->header.length = sizeof(struct acpi_nfit_flush_address);
+	flush->header.length = flush_hint_size;
 	flush->device_handle = handle[1];
-	flush->hint_count = 1;
-	flush->hint_address[0] = t->flush_dma[1];
+	flush->hint_count = NUM_HINTS;
+	for (i = 0; i < NUM_HINTS; i++)
+		flush->hint_address[i] = t->flush_dma[1] + i * sizeof(u64);
 
 	/* flush2 (dimm2) */
-	flush = nfit_buf + offset + sizeof(struct acpi_nfit_flush_address) * 2;
+	flush = nfit_buf + offset + flush_hint_size  * 2;
 	flush->header.type = ACPI_NFIT_TYPE_FLUSH_ADDRESS;
-	flush->header.length = sizeof(struct acpi_nfit_flush_address);
+	flush->header.length = flush_hint_size;
 	flush->device_handle = handle[2];
-	flush->hint_count = 1;
-	flush->hint_address[0] = t->flush_dma[2];
+	flush->hint_count = NUM_HINTS;
+	for (i = 0; i < NUM_HINTS; i++)
+		flush->hint_address[i] = t->flush_dma[2] + i * sizeof(u64);
 
 	/* flush3 (dimm3) */
-	flush = nfit_buf + offset + sizeof(struct acpi_nfit_flush_address) * 3;
+	flush = nfit_buf + offset + flush_hint_size * 3;
 	flush->header.type = ACPI_NFIT_TYPE_FLUSH_ADDRESS;
-	flush->header.length = sizeof(struct acpi_nfit_flush_address);
+	flush->header.length = flush_hint_size;
 	flush->device_handle = handle[3];
-	flush->hint_count = 1;
-	flush->hint_address[0] = t->flush_dma[3];
+	flush->hint_count = NUM_HINTS;
+	for (i = 0; i < NUM_HINTS; i++)
+		flush->hint_address[i] = t->flush_dma[3] + i * sizeof(u64);
 
 	if (t->setup_hotplug) {
-		offset = offset + sizeof(struct acpi_nfit_flush_address) * 4;
+		offset = offset + flush_hint_size * 4;
 		/* dcr-descriptor4: blk */
 		dcr = nfit_buf + offset;
 		dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
 		dcr->header.length = sizeof(struct acpi_nfit_control_region);
 		dcr->region_index = 8+1;
-		dcr->vendor_id = 0xabcd;
-		dcr->device_id = 0;
-		dcr->revision_id = 1;
+		dcr_common_init(dcr);
 		dcr->serial_number = ~handle[4];
 		dcr->code = NFIT_FIC_BLK;
 		dcr->windows = 1;
@@ -1196,9 +1187,7 @@
 		dcr->header.length = offsetof(struct acpi_nfit_control_region,
 				window_size);
 		dcr->region_index = 9+1;
-		dcr->vendor_id = 0xabcd;
-		dcr->device_id = 0;
-		dcr->revision_id = 1;
+		dcr_common_init(dcr);
 		dcr->serial_number = ~handle[4];
 		dcr->code = NFIT_FIC_BYTEN;
 		dcr->windows = 0;
@@ -1300,10 +1289,12 @@
 		/* flush3 (dimm4) */
 		flush = nfit_buf + offset;
 		flush->header.type = ACPI_NFIT_TYPE_FLUSH_ADDRESS;
-		flush->header.length = sizeof(struct acpi_nfit_flush_address);
+		flush->header.length = flush_hint_size;
 		flush->device_handle = handle[4];
-		flush->hint_count = 1;
-		flush->hint_address[0] = t->flush_dma[4];
+		flush->hint_count = NUM_HINTS;
+		for (i = 0; i < NUM_HINTS; i++)
+			flush->hint_address[i] = t->flush_dma[4]
+				+ i * sizeof(u64);
 	}
 
 	post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA0_SIZE);
@@ -1339,7 +1330,16 @@
 	spa->address = t->spa_set_dma[0];
 	spa->length = SPA2_SIZE;
 
-	offset += sizeof(*spa);
+	/* virtual cd region */
+	spa = nfit_buf + sizeof(*spa);
+	spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
+	spa->header.length = sizeof(*spa);
+	memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_VCD), 16);
+	spa->range_index = 0;
+	spa->address = t->spa_set_dma[1];
+	spa->length = SPA_VCD_SIZE;
+
+	offset += sizeof(*spa) * 2;
 	/* mem-region0 (spa0, dimm0) */
 	memdev = nfit_buf + offset;
 	memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
@@ -1365,9 +1365,7 @@
 	dcr->header.length = offsetof(struct acpi_nfit_control_region,
 			window_size);
 	dcr->region_index = 0+1;
-	dcr->vendor_id = 0xabcd;
-	dcr->device_id = 0;
-	dcr->revision_id = 1;
+	dcr_common_init(dcr);
 	dcr->serial_number = ~0;
 	dcr->code = NFIT_FIC_BYTE;
 	dcr->windows = 0;
@@ -1462,20 +1460,16 @@
 	nfit_test->setup(nfit_test);
 	acpi_desc = &nfit_test->acpi_desc;
 	acpi_nfit_desc_init(acpi_desc, &pdev->dev);
-	acpi_desc->nfit = nfit_test->nfit_buf;
 	acpi_desc->blk_do_io = nfit_test_blk_do_io;
 	nd_desc = &acpi_desc->nd_desc;
 	nd_desc->provider_name = NULL;
+	nd_desc->module = THIS_MODULE;
 	nd_desc->ndctl = nfit_test_ctl;
-	acpi_desc->nvdimm_bus = nvdimm_bus_register(&pdev->dev, nd_desc);
-	if (!acpi_desc->nvdimm_bus)
-		return -ENXIO;
 
-	rc = acpi_nfit_init(acpi_desc, nfit_test->nfit_size);
-	if (rc) {
-		nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
+	rc = acpi_nfit_init(acpi_desc, nfit_test->nfit_buf,
+			nfit_test->nfit_size);
+	if (rc)
 		return rc;
-	}
 
 	if (nfit_test->setup != nfit_test0_setup)
 		return 0;
@@ -1483,22 +1477,16 @@
 	nfit_test->setup_hotplug = 1;
 	nfit_test->setup(nfit_test);
 
-	rc = acpi_nfit_init(acpi_desc, nfit_test->nfit_size);
-	if (rc) {
-		nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
+	rc = acpi_nfit_init(acpi_desc, nfit_test->nfit_buf,
+			nfit_test->nfit_size);
+	if (rc)
 		return rc;
-	}
 
 	return 0;
 }
 
 static int nfit_test_remove(struct platform_device *pdev)
 {
-	struct nfit_test *nfit_test = to_nfit_test(&pdev->dev);
-	struct acpi_nfit_desc *acpi_desc = &nfit_test->acpi_desc;
-
-	nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
-
 	return 0;
 }
 
@@ -1523,12 +1511,6 @@
 	.id_table = nfit_test_id,
 };
 
-#ifdef CONFIG_CMA_SIZE_MBYTES
-#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES
-#else
-#define CMA_SIZE_MBYTES 0
-#endif
-
 static __init int nfit_test_init(void)
 {
 	int rc, i;
@@ -1538,7 +1520,6 @@
 	for (i = 0; i < NUM_NFITS; i++) {
 		struct nfit_test *nfit_test;
 		struct platform_device *pdev;
-		static int once;
 
 		nfit_test = kzalloc(sizeof(*nfit_test), GFP_KERNEL);
 		if (!nfit_test) {
@@ -1577,20 +1558,6 @@
 			goto err_register;
 
 		instances[i] = nfit_test;
-
-		if (!once++) {
-			dma_addr_t dma;
-			void *buf;
-
-			buf = dma_alloc_coherent(&pdev->dev, SZ_128M, &dma,
-					GFP_KERNEL);
-			if (!buf) {
-				rc = -ENOMEM;
-				dev_warn(&pdev->dev, "need 128M of free cma\n");
-				goto err_register;
-			}
-			dma_free_coherent(&pdev->dev, SZ_128M, buf, dma);
-		}
 	}
 
 	rc = platform_driver_register(&nfit_test_driver);
diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h
index 96c5e16..9f18e2a 100644
--- a/tools/testing/nvdimm/test/nfit_test.h
+++ b/tools/testing/nvdimm/test/nfit_test.h
@@ -12,6 +12,7 @@
  */
 #ifndef __NFIT_TEST_H__
 #define __NFIT_TEST_H__
+#include <linux/list.h>
 
 struct nfit_test_resource {
 	struct list_head list;
@@ -26,4 +27,5 @@
 void __wrap_iounmap(volatile void __iomem *addr);
 void nfit_test_setup(nfit_test_lookup_fn lookup);
 void nfit_test_teardown(void);
+struct nfit_test_resource *get_nfit_res(resource_size_t resource);
 #endif