Merge pull request #65 from vikramkanigiri/vk/console_init

Ensure a console is initialized before it is used
diff --git a/bl1/aarch64/bl1_arch_setup.c b/bl1/aarch64/bl1_arch_setup.c
index 758b8e8..5725bac 100644
--- a/bl1/aarch64/bl1_arch_setup.c
+++ b/bl1/aarch64/bl1_arch_setup.c
@@ -39,11 +39,11 @@
 {
 	unsigned long tmp_reg = 0;
 
-	/* Enable alignment checks and set the exception endianess to LE */
+	/* Enable alignment checks */
 	tmp_reg = read_sctlr_el3();
 	tmp_reg |= (SCTLR_A_BIT | SCTLR_SA_BIT);
-	tmp_reg &= ~SCTLR_EE_BIT;
 	write_sctlr_el3(tmp_reg);
+	isb();
 
 	/*
 	 * Enable HVCs, route FIQs to EL3, set the next EL to be AArch64, route
diff --git a/bl1/aarch64/bl1_entrypoint.S b/bl1/aarch64/bl1_entrypoint.S
index 012b779..7259601 100644
--- a/bl1/aarch64/bl1_entrypoint.S
+++ b/bl1/aarch64/bl1_entrypoint.S
@@ -43,6 +43,16 @@
 
 func bl1_entrypoint
 	/* ---------------------------------------------
+	 * Set the CPU endianness before doing anything
+	 * that might involve memory reads or writes
+	 * ---------------------------------------------
+	 */
+	mrs	x0, sctlr_el3
+	bic	x0, x0, #SCTLR_EE_BIT
+	msr	sctlr_el3, x0
+	isb
+
+	/* ---------------------------------------------
 	 * Perform any processor specific actions upon
 	 * reset e.g. cache, tlb invalidations etc.
 	 * ---------------------------------------------
@@ -86,7 +96,6 @@
 	mrs	x0, sctlr_el3
 	orr	x0, x0, #SCTLR_I_BIT
 	msr	sctlr_el3, x0
-
 	isb
 
 _wait_for_entrypoint:
@@ -98,10 +107,10 @@
 	 * their turn to be woken up
 	 * ---------------------------------------------
 	 */
-	bl	read_mpidr
+	mrs	x0, mpidr_el1
 	bl	platform_get_entrypoint
 	cbnz	x0, _do_warm_boot
-	bl	read_mpidr
+	mrs	x0, mpidr_el1
 	bl	platform_is_primary_cpu
 	cbnz	x0, _do_cold_boot
 
diff --git a/bl1/aarch64/bl1_exceptions.S b/bl1/aarch64/bl1_exceptions.S
index 68d088b..a87b20f 100644
--- a/bl1/aarch64/bl1_exceptions.S
+++ b/bl1/aarch64/bl1_exceptions.S
@@ -189,7 +189,7 @@
 	mov	x0, #SYNC_EXCEPTION_AARCH64
 	bl	plat_report_exception
 
-	bl	read_esr_el3
+	mrs	x0, esr_el3
 	ubfx	x1, x0, #ESR_EC_SHIFT, #ESR_EC_LENGTH
 	cmp	x1, #EC_AARCH64_SMC
 	b.ne	panic
@@ -201,10 +201,8 @@
 	mov	x2, x3
 	mov	x3, x4
 	bl	display_boot_progress
-	mov	x0, x20
-	bl	write_elr
-	mov	x0, x21
-	bl	write_spsr
+	msr	elr_el3, x20
+	msr	spsr_el3, x21
 	ubfx	x0, x21, #MODE_EL_SHIFT, #2
 	cmp	x0, #MODE_EL3
 	b.ne	skip_mmu_teardown
@@ -212,18 +210,11 @@
 	/* ---------------------------------------------
 	 * If BL31 is to be executed in EL3 as well
 	 * then turn off the MMU so that it can perform
-	 * its own setup. TODO: Assuming flat mapped
-	 * translations here. Also all should go into a
-	 * separate MMU teardown function
+	 * its own setup.
 	 * ---------------------------------------------
 	 */
-	mov	x1, #(SCTLR_M_BIT | SCTLR_C_BIT | SCTLR_I_BIT)
-	bl	read_sctlr_el3
-	bic	x0, x0, x1
-	bl	write_sctlr_el3
-	mov	x0, #DCCISW
-	bl	dcsw_op_all
-	bl	tlbialle3
+	bl	disable_mmu_icache_el3
+	tlbi	alle3
 skip_mmu_teardown:
 	ldp     x6, x7, [sp, #0x30]
 	ldp     x4, x5, [sp, #0x20]
diff --git a/bl2/aarch64/bl2_entrypoint.S b/bl2/aarch64/bl2_entrypoint.S
index b8af9a5..4f7565f 100644
--- a/bl2/aarch64/bl2_entrypoint.S
+++ b/bl2/aarch64/bl2_entrypoint.S
@@ -54,8 +54,7 @@
 	 * So, make sure no secondary has lost its way.
 	 * ---------------------------------------------
 	 */
-	bl	read_mpidr
-	mov	x19, x0
+	mrs	x0, mpidr_el1
 	bl	platform_is_primary_cpu
 	cbz	x0, _panic
 
@@ -73,7 +72,6 @@
 	mrs	x0, sctlr_el1
 	orr	x0, x0, #SCTLR_I_BIT
 	msr	sctlr_el1, x0
-
 	isb
 
 	/* ---------------------------------------------
@@ -103,7 +101,7 @@
 	 * ease the pain of initializing the MMU
 	 * --------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	platform_set_coherent_stack
 
 	/* ---------------------------------------------
@@ -121,7 +119,7 @@
 	 * -IS-WBWA memory
 	 * ---------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	platform_set_stack
 
 	/* ---------------------------------------------
diff --git a/bl31/aarch64/bl31_arch_setup.c b/bl31/aarch64/bl31_arch_setup.c
index acaa6b5..ad73de0 100644
--- a/bl31/aarch64/bl31_arch_setup.c
+++ b/bl31/aarch64/bl31_arch_setup.c
@@ -45,10 +45,9 @@
 	unsigned long tmp_reg = 0;
 	uint64_t counter_freq;
 
-	/* Enable alignment checks and set the exception endianness to LE */
+	/* Enable alignment checks */
 	tmp_reg = read_sctlr_el3();
 	tmp_reg |= (SCTLR_A_BIT | SCTLR_SA_BIT);
-	tmp_reg &= ~SCTLR_EE_BIT;
 	write_sctlr_el3(tmp_reg);
 
 	/*
diff --git a/bl31/aarch64/bl31_entrypoint.S b/bl31/aarch64/bl31_entrypoint.S
index 39fa605..763303b 100644
--- a/bl31/aarch64/bl31_entrypoint.S
+++ b/bl31/aarch64/bl31_entrypoint.S
@@ -89,7 +89,6 @@
 	mrs	x1, sctlr_el3
 	orr	x1, x1, #SCTLR_I_BIT
 	msr	sctlr_el3, x1
-
 	isb
 
 	/* ---------------------------------------------
@@ -108,8 +107,7 @@
 	 * So, make sure no secondary has lost its way.
 	 * ---------------------------------------------
 	 */
-	bl	read_mpidr
-	mov	x19, x0
+	mrs	x0, mpidr_el1
 	bl	platform_is_primary_cpu
 	cbz	x0, _panic
 
@@ -138,7 +136,7 @@
 	 * ease the pain of initializing the MMU
 	 * --------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	platform_set_coherent_stack
 
 	/* ---------------------------------------------
@@ -155,7 +153,7 @@
 	 * -IS-WBWA memory
 	 * ---------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	platform_set_stack
 
 	/* ---------------------------------------------
@@ -164,7 +162,6 @@
 	 */
 	bl	bl31_main
 
-	zero_callee_saved_regs
 	b	el3_exit
 
 _panic:
diff --git a/bl31/aarch64/runtime_exceptions.S b/bl31/aarch64/runtime_exceptions.S
index 53cc176..9c98ad6 100644
--- a/bl31/aarch64/runtime_exceptions.S
+++ b/bl31/aarch64/runtime_exceptions.S
@@ -39,6 +39,17 @@
 	.globl	el3_exit
 	.globl	get_exception_stack
 
+	.macro save_x18_to_x29_sp_el0
+	stp	x18, x19, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X18]	/* x18 is stored first so it can be reused below */
+	stp	x20, x21, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X20]
+	stp	x22, x23, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X22]
+	stp	x24, x25, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X24]
+	stp	x26, x27, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X26]
+	stp	x28, x29, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X28]
+	mrs	x18, sp_el0	/* x18 is already saved, so use it as scratch */
+	str	x18, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_SP_EL0]	/* x18 is left clobbered on exit from the macro */
+	.endm
+
 	.section	.vectors, "ax"; .align 11
 
 	.align	7
@@ -250,6 +261,9 @@
 	stp	x4, x5, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X4]
 	stp	x6, x7, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X6]
 
+	/* Save the rest of the gpregs and sp_el0 */
+	save_x18_to_x29_sp_el0
+
 	mov	x5, xzr
 	mov	x6, sp
 
@@ -264,10 +278,6 @@
 	adr	x14, rt_svc_descs_indices
 	ldrb	w15, [x14, x16]
 
-	/* Save x18 and SP_EL0 */
-	mrs	x17, sp_el0
-	stp	x18, x17, [x6, #CTX_GPREGS_OFFSET + CTX_GPREG_X18]
-
 	/* -----------------------------------------------------
 	 * Restore the saved C runtime stack value which will
 	 * become the new SP_EL0 i.e. EL3 runtime stack. It was
@@ -357,8 +367,8 @@
 	msr	elr_el3, x17
 
 	/* Restore saved general purpose registers and return */
-	bl	restore_scratch_registers
-	ldp	x30, xzr, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_LR]
+	bl	restore_gp_registers
+	ldr	x30, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_LR]
 	eret
 
 smc_unknown:
@@ -369,10 +379,10 @@
 	 * content). Either way, we aren't leaking any secure information
 	 * through them
 	 */
-	bl	restore_scratch_registers_callee
+	bl	restore_gp_registers_callee
 
 smc_prohibited:
-	ldp	x30, xzr, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_LR]
+	ldr	x30, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_LR]
 	mov	w0, #SMC_UNK
 	eret
 
@@ -381,12 +391,16 @@
 
 	/* -----------------------------------------------------
 	 * The following functions are used to saved and restore
-	 * all the caller saved registers as per the aapcs_64.
+	 * all the general purpose registers. Ideally we would
+	 * only save and restore the callee saved registers when
+	 * a world switch occurs but that type of implementation
+	 * is more complex. So currently we will always save and
+	 * restore these registers on entry and exit of EL3.
 	 * These are not macros to ensure their invocation fits
 	 * within the 32 instructions per exception vector.
 	 * -----------------------------------------------------
 	 */
-func save_scratch_registers
+func save_gp_registers
 	stp	x0, x1, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X0]
 	stp	x2, x3, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X2]
 	stp	x4, x5, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X4]
@@ -396,16 +410,15 @@
 	stp	x12, x13, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X12]
 	stp	x14, x15, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X14]
 	stp	x16, x17, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X16]
-	mrs	x17, sp_el0
-	stp	x18, x17, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X18]
+	save_x18_to_x29_sp_el0
 	ret
 
-func restore_scratch_registers
+func restore_gp_registers
 	ldp	x0, x1, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X0]
 	ldp	x2, x3, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X2]
 
-restore_scratch_registers_callee:
-	ldp	x18, x17, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X18]
+restore_gp_registers_callee:
+	ldr	x17, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_SP_EL0]
 
 	ldp	x4, x5, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X4]
 	ldp	x6, x7, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X6]
@@ -413,9 +426,14 @@
 	ldp	x10, x11, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X10]
 	ldp	x12, x13, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X12]
 	ldp	x14, x15, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X14]
-
 	msr	sp_el0, x17
 	ldp	x16, x17, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X16]
+	ldp	x18, x19, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X18]
+	ldp	x20, x21, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X20]
+	ldp	x22, x23, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X22]
+	ldp	x24, x25, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X24]
+	ldp	x26, x27, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X26]
+	ldp	x28, x29, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X28]
 	ret
 
 	/* -----------------------------------------------------
diff --git a/bl31/bl31_main.c b/bl31/bl31_main.c
index 01f00f2..755320d 100644
--- a/bl31/bl31_main.c
+++ b/bl31/bl31_main.c
@@ -100,6 +100,7 @@
 	assert(cm_get_context(mpidr, NON_SECURE));
 	cm_set_next_eret_context(NON_SECURE);
 	write_vbar_el3((uint64_t) runtime_exceptions);
+	isb();
 	next_image_type = NON_SECURE;
 
 	/*
diff --git a/drivers/arm/gic/aarch64/gic_v3_sysregs.S b/drivers/arm/gic/aarch64/gic_v3_sysregs.S
index 2a96da7..ddf85a8 100644
--- a/drivers/arm/gic/aarch64/gic_v3_sysregs.S
+++ b/drivers/arm/gic/aarch64/gic_v3_sysregs.S
@@ -67,23 +67,19 @@
 
 func write_icc_sre_el1
 	msr	ICC_SRE_EL1, x0
-	isb
 	ret
 
 
 func write_icc_sre_el2
 	msr	ICC_SRE_EL2, x0
-	isb
 	ret
 
 
 func write_icc_sre_el3
 	msr	ICC_SRE_EL3, x0
-	isb
 	ret
 
 
 func write_icc_pmr_el1
 	msr	ICC_PMR_EL1, x0
-	isb
 	ret
diff --git a/include/bl31/cm_macros.S b/include/bl31/cm_macros.S
index d264956..e82f3a3 100644
--- a/include/bl31/cm_macros.S
+++ b/include/bl31/cm_macros.S
@@ -27,31 +27,9 @@
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
-
 #include <arch.h>
 #include <context.h>
 
-
-	/* ---------------------------------------------
-	 * Zero out the callee saved register to prevent
-	 * leakage of secure state into the normal world
-	 * during the first ERET after a cold/warm boot.
-	 * ---------------------------------------------
-	 */
-	.macro	zero_callee_saved_regs
-	mov	x19, xzr
-	mov	x20, xzr
-	mov	x21, xzr
-	mov	x22, xzr
-	mov	x23, xzr
-	mov	x24, xzr
-	mov	x25, xzr
-	mov	x26, xzr
-	mov	x27, xzr
-	mov	x28, xzr
-	mov	x29, xzr
-	.endm
-
 	.macro	switch_to_exception_stack reg1 reg2
 	mov     \reg1 , sp
 	ldr	\reg2, [\reg1, #CTX_EL3STATE_OFFSET + CTX_EXCEPTION_SP]
@@ -64,7 +42,7 @@
 	 * -----------------------------------------------------
 	 */
 	.macro	handle_sync_exception
-	stp	x30, xzr, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_LR]
+	str	x30, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_LR]
 	mrs	x30, esr_el3
 	ubfx	x30, x30, #ESR_EC_SHIFT, #ESR_EC_LENGTH
 
@@ -83,7 +61,7 @@
 	 * not expect any such exceptions.
 	 * -----------------------------------------------------
 	 */
-	bl	save_scratch_registers
+	bl	save_gp_registers
 	switch_to_exception_stack x0 x1
 
 	/* Save the core_context pointer for handled faults */
@@ -92,8 +70,8 @@
 	ldp	x0, xzr, [sp], #0x10
 
 	mov	sp, x0
-	bl	restore_scratch_registers
-	ldp	x30, xzr, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_LR]
+	bl	restore_gp_registers
+	ldr	x30, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_LR]
 	eret
 	.endm
 
@@ -103,8 +81,8 @@
 	 * -----------------------------------------------------
 	 */
 	.macro	handle_async_exception type
-	stp	x30, xzr, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_LR]
-	bl	save_scratch_registers
+	str	x30, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_LR]
+	bl	save_gp_registers
 	switch_to_exception_stack x0 x1
 
 	/* Save the core_context pointer */
@@ -114,7 +92,7 @@
 	ldp	x0, xzr, [sp], #0x10
 
 	mov	sp, x0
-	bl	restore_scratch_registers
-	ldp	x30, xzr, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_LR]
+	bl	restore_gp_registers
+	ldr	x30, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_LR]
 	.endm
 
diff --git a/include/bl31/context.h b/include/bl31/context.h
index 989b2e6..549fa21 100644
--- a/include/bl31/context.h
+++ b/include/bl31/context.h
@@ -55,10 +55,20 @@
 #define CTX_GPREG_X16		0x80
 #define CTX_GPREG_X17		0x88
 #define CTX_GPREG_X18		0x90
-#define CTX_GPREG_SP_EL0	0x98
-#define CTX_GPREG_LR		0xa0
-/* Unused space to allow registers to be stored as pairs */
-#define CTX_GPREGS_END		0xb0
+#define CTX_GPREG_X19		0x98
+#define CTX_GPREG_X20		0xa0
+#define CTX_GPREG_X21		0xa8
+#define CTX_GPREG_X22		0xb0
+#define CTX_GPREG_X23		0xb8
+#define CTX_GPREG_X24		0xc0
+#define CTX_GPREG_X25		0xc8
+#define CTX_GPREG_X26		0xd0
+#define CTX_GPREG_X27		0xd8
+#define CTX_GPREG_X28		0xe0
+#define CTX_GPREG_X29		0xe8
+#define CTX_GPREG_LR		0xf0
+#define CTX_GPREG_SP_EL0	0xf8
+#define CTX_GPREGS_END		0x100
 
 /*******************************************************************************
  * Constants that allow assembler code to access members of and the 'el3_state'
@@ -188,10 +198,11 @@
 #define CTX_EL3STATE_ALL	(CTX_EL3STATE_END >> DWORD_SHIFT)
 
 /*
- * AArch64 general purpose register context structure. Only x0-x18, lr
- * are saved as the compiler is expected to preserve the remaining
+ * AArch64 general purpose register context structure. Usually x0-x18,
+ * lr are saved as the compiler is expected to preserve the remaining
  * callee saved registers if used by the C runtime and the assembler
- * does not touch the remaining.
+ * does not touch the remaining. But in case of a world switch during
+ * exception handling, we need to save the callee-saved registers too.
  */
 DEFINE_REG_STRUCT(gp_regs, CTX_GPREG_ALL);
 
diff --git a/include/common/asm_macros.S b/include/common/asm_macros.S
index 6cf1a19..3dbd9f2 100644
--- a/include/common/asm_macros.S
+++ b/include/common/asm_macros.S
@@ -58,20 +58,13 @@
 
 
 	.macro	smc_check  label
-	bl	read_esr
+	mrs	x0, esr_el3
 	ubfx	x0, x0, #ESR_EC_SHIFT, #ESR_EC_LENGTH
 	cmp	x0, #EC_AARCH64_SMC
 	b.ne	$label
 	.endm
 
 
-	.macro	setup_dcsw_op_args  start_level, end_level, clidr, shift, fw, ls
-	mrs	\clidr, clidr_el1
-	mov	\start_level, xzr
-	ubfx	\end_level, \clidr, \shift, \fw
-	lsl	\end_level, \end_level, \ls
-	.endm
-
 	/*
 	 * This macro verifies that the a given vector doesn't exceed the
 	 * architectural limit of 32 instructions. This is meant to be placed
diff --git a/include/drivers/arm/pl011.h b/include/drivers/arm/pl011.h
index 28aef54..1254920 100644
--- a/include/drivers/arm/pl011.h
+++ b/include/drivers/arm/pl011.h
@@ -78,10 +78,6 @@
 #define PL011_UARTCR_LBE          (1 << 7)	/* Loopback enable */
 #define PL011_UARTCR_UARTEN       (1 << 0)	/* UART Enable */
 
-#if !defined(PL011_BASE)
-#error "The PL011_BASE macro must be defined."
-#endif
-
 #if !defined(PL011_BAUDRATE)
 #define PL011_BAUDRATE  115200
 #endif
diff --git a/include/lib/aarch64/arch_helpers.h b/include/lib/aarch64/arch_helpers.h
index 565b1b4..517e25a 100644
--- a/include/lib/aarch64/arch_helpers.h
+++ b/include/lib/aarch64/arch_helpers.h
@@ -78,6 +78,9 @@
 extern void dcsw_op_louis(unsigned int);
 extern void dcsw_op_all(unsigned int);
 
+extern void disable_mmu_el3(void);
+extern void disable_mmu_icache_el3(void);
+
 /*******************************************************************************
  * Misc. accessor prototypes
  ******************************************************************************/
@@ -191,9 +194,7 @@
 extern unsigned long read_ttbr0_el2(void);
 extern unsigned long read_ttbr0_el3(void);
 
-extern unsigned long read_ttbr1(void);
 extern unsigned long read_ttbr1_el1(void);
-extern unsigned long read_ttbr1_el2(void);
 
 extern unsigned long read_cptr_el2(void);
 extern unsigned long read_cptr_el3(void);
@@ -225,12 +226,10 @@
 extern void write_esr_el2(unsigned long);
 extern void write_esr_el3(unsigned long);
 
-extern void write_afsr0(unsigned long);
 extern void write_afsr0_el1(unsigned long);
 extern void write_afsr0_el2(unsigned long);
 extern void write_afsr0_el3(unsigned long);
 
-extern void write_afsr1(unsigned long);
 extern void write_afsr1_el1(unsigned long);
 extern void write_afsr1_el2(unsigned long);
 extern void write_afsr1_el3(unsigned long);
@@ -260,7 +259,6 @@
 extern void write_ttbr0_el3(unsigned long);
 
 extern void write_ttbr1_el1(unsigned long);
-extern void write_ttbr1_el2(unsigned long);
 
 extern void write_cpuectlr(unsigned long);
 extern void write_cptr_el2(unsigned long);
diff --git a/lib/aarch64/cache_helpers.S b/lib/aarch64/cache_helpers.S
index 2649ad0..a5b918c 100644
--- a/lib/aarch64/cache_helpers.S
+++ b/lib/aarch64/cache_helpers.S
@@ -46,57 +46,41 @@
 
 func dcisw
 	dc	isw, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dccisw
 	dc	cisw, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dccsw
 	dc	csw, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dccvac
 	dc	cvac, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dcivac
 	dc	ivac, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dccivac
 	dc	civac, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dccvau
 	dc	cvau, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dczva
 	dc	zva, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -138,94 +122,92 @@
 	ret
 
 
-	/* ------------------------------------------
-	 * Data cache operations by set/way to the
-	 * level specified
-	 * ------------------------------------------
-	 * ----------------------------------
-	 * Call this func with the clidr in
-	 * x0, starting cache level in x10,
-	 * last cache level in x3 & cm op in
-	 * x14
-	 * ----------------------------------
+	/* ---------------------------------------------------------------
+	 * Data cache operations by set/way to the level specified
+	 *
+	 * The main function, do_dcsw_op requires:
+	 * x0: The operation type (0-2), as defined in arch.h
+	 * x3: The last cache level to operate on
+	 * x9: clidr_el1
+	 * and will carry out the operation on each data cache from level 0
+	 * to the level in x3 in sequence
+	 *
+	 * The dcsw_op macro sets up the x3 and x9 parameters based on
+	 * clidr_el1 cache information before invoking the main function
+	 * ---------------------------------------------------------------
 	 */
-func dcsw_op
-all_start_at_level:
-	add	x2, x10, x10, lsr #1            // work out 3x current cache level
-	lsr	x1, x0, x2                      // extract cache type bits from clidr
-	and	x1, x1, #7                      // mask of the bits for current cache only
-	cmp	x1, #2                          // see what cache we have at this level
-	b.lt	skip                            // skip if no cache, or just i-cache
-	msr	csselr_el1, x10                 // select current cache level in csselr
-	isb                                     // isb to sych the new cssr&csidr
-	mrs	x1, ccsidr_el1                  // read the new ccsidr
-	and	x2, x1, #7                      // extract the length of the cache lines
-	add	x2, x2, #4                      // add 4 (line length offset)
-	mov	x4, #0x3ff
-	and	x4, x4, x1, lsr #3              // find maximum number on the way size
-	clz	w5, w4                          // find bit position of way size increment
-	mov	x7, #0x7fff
-	and	x7, x7, x1, lsr #13             // extract max number of the index size
-loop2:
-	mov	x9, x4                          // create working copy of max way size
-loop3:
-	lsl	x6, x9, x5
-	orr	x11, x10, x6                    // factor way and cache number into x11
-	lsl	x6, x7, x2
-	orr	x11, x11, x6                    // factor index number into x11
-	mov	x12, x0
-	mov	x13, x30 // lr
-	mov	x0, x11
-	blr	x14
-	mov	x0, x12
-	mov	x30, x13 // lr
-	subs	x9, x9, #1                      // decrement the way
-	b.ge    loop3
-	subs	x7, x7, #1                      // decrement the index
-	b.ge    loop2
-skip:
-	add	x10, x10, #2                    // increment cache number
-	cmp	x3, x10
-	b.gt    all_start_at_level
-finished:
-	mov	x10, #0                         // swith back to cache level 0
-	msr	csselr_el1, x10                 // select current cache level in csselr
-	dsb	sy
-	isb
-	ret
 
+	.macro	dcsw_op shift, fw, ls
+	mrs	x9, clidr_el1		// x9 = clidr_el1, as do_dcsw_op expects
+	ubfx	x3, x9, \shift, \fw	// x3 = level field extracted from clidr
+	lsl	x3, x3, \ls		// scale x3 for the compare against x10 in do_dcsw_op
+	b	do_dcsw_op		// tail call: do_dcsw_op returns to our caller
+	.endm
 
 func do_dcsw_op
 	cbz	x3, exit
-	cmp	x0, #DCISW
-	b.eq	dc_isw
-	cmp	x0, #DCCISW
-	b.eq	dc_cisw
-	cmp	x0, #DCCSW
-	b.eq	dc_csw
-dc_isw:
+	mov	x10, xzr		// x10 = cache number, starting at 0
+	adr	x14, dcsw_loop_table	// compute inner loop address
+	add	x14, x14, x0, lsl #5	// each inner loop is 8 x 32-bit instructions
 	mov	x0, x9
-	adr	x14, dcisw
-	b	dcsw_op
-dc_cisw:
-	mov	x0, x9
-	adr	x14, dccisw
-	b	dcsw_op
-dc_csw:
-	mov	x0, x9
-	adr	x14, dccsw
-	b	dcsw_op
+	mov	w8, #1			// w8 = 1, shifted below to form the loop decrements
+loop1:
+	add	x2, x10, x10, lsr #1	// work out 3x current cache level
+	lsr	x1, x0, x2		// extract cache type bits from clidr
+	and	x1, x1, #7		// mask the bits for current cache only
+	cmp	x1, #2			// see what cache we have at this level
+	b.lt	level_done		// nothing to do if no cache or icache
+
+	msr	csselr_el1, x10		// select current cache level in csselr
+	isb				// isb to sync the new csselr & ccsidr
+	mrs	x1, ccsidr_el1		// read the new ccsidr
+	and	x2, x1, #7		// extract the length of the cache lines
+	add	x2, x2, #4		// add 4 (line length offset)
+	ubfx	x4, x1, #3, #10		// maximum way number
+	clz	w5, w4			// bit position of way size increment
+	lsl	w9, w4, w5		// w9 = aligned max way number
+	lsl	w16, w8, w5		// w16 = way number loop decrement
+	orr	w9, w10, w9		// w9 = combine way and cache number
+	ubfx	w6, w1, #13, #15	// w6 = max set number
+	lsl	w17, w8, w2		// w17 = set number loop decrement
+	dsb	sy			// barrier before we start this level
+	br	x14			// jump to DC operation specific loop
+
+	.macro	dcsw_loop _op
+loop2_\_op:
+	lsl	w7, w6, w2		// w7 = aligned max set number
+
+loop3_\_op:
+	orr	w11, w9, w7		// combine cache, way and set number
+	dc	\_op, x11		// perform the operation on this set/way
+	subs	w7, w7, w17		// decrement set number
+	b.ge	loop3_\_op
+
+	subs	x9, x9, x16		// decrement way number
+	b.ge	loop2_\_op
+
+	b	level_done		// all ways done: move on to the next level
+	.endm
+
+level_done:
+	add	x10, x10, #2		// increment cache number
+	cmp	x3, x10
+	b.gt    loop1
+	msr	csselr_el1, xzr		// select cache level 0 in csselr
+	dsb	sy			// barrier to complete final cache operation
+	isb
 exit:
 	ret
 
+dcsw_loop_table:			// one 8-instruction loop per operation type, indexed by x0
+	dcsw_loop isw
+	dcsw_loop cisw
+	dcsw_loop csw
+
 
 func dcsw_op_louis
-	dsb	sy
-	setup_dcsw_op_args x10, x3, x9, #LOUIS_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT
-	b	do_dcsw_op
+	dcsw_op #LOUIS_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT
 
 
 func dcsw_op_all
-	dsb	sy
-	setup_dcsw_op_args x10, x3, x9, #LOC_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT
-	b	do_dcsw_op
+	dcsw_op #LOC_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT
diff --git a/lib/aarch64/cpu_helpers.S b/lib/aarch64/cpu_helpers.S
index 573d0b8..abb996d 100644
--- a/lib/aarch64/cpu_helpers.S
+++ b/lib/aarch64/cpu_helpers.S
@@ -35,13 +35,11 @@
 
 
 func cpu_reset_handler
-	mov	x19, x30 // lr
-
 	/* ---------------------------------------------
 	 * As a bare minimal enable the SMP bit.
 	 * ---------------------------------------------
 	 */
-	bl	read_midr
+	mrs	x0, midr_el1
 	lsr	x0, x0, #MIDR_PN_SHIFT
 	and	x0, x0, #MIDR_PN_MASK
 	cmp	x0, #MIDR_PN_A57
@@ -49,8 +47,9 @@
 	cmp	x0, #MIDR_PN_A53
 	b.ne	smp_setup_end
 smp_setup_begin:
-	bl	read_cpuectlr
+	mrs	x0, CPUECTLR_EL1
 	orr	x0, x0, #CPUECTLR_SMP_BIT
-	bl	write_cpuectlr
+	msr	CPUECTLR_EL1, x0
+	isb
 smp_setup_end:
-	ret	x19
+	ret
diff --git a/lib/aarch64/misc_helpers.S b/lib/aarch64/misc_helpers.S
index e7b2331..e7ee015 100644
--- a/lib/aarch64/misc_helpers.S
+++ b/lib/aarch64/misc_helpers.S
@@ -46,22 +46,18 @@
 	.globl	read_daif
 	.globl	write_daif
 
-	.globl	read_spsr
 	.globl	read_spsr_el1
 	.globl	read_spsr_el2
 	.globl	read_spsr_el3
 
-	.globl	write_spsr
 	.globl	write_spsr_el1
 	.globl	write_spsr_el2
 	.globl	write_spsr_el3
 
-	.globl	read_elr
 	.globl	read_elr_el1
 	.globl	read_elr_el2
 	.globl	read_elr_el3
 
-	.globl	write_elr
 	.globl	write_elr_el1
 	.globl	write_elr_el2
 	.globl	write_elr_el3
@@ -79,6 +75,9 @@
 	.globl	zeromem16
 	.globl	memcpy16
 
+	.globl	disable_mmu_el3
+	.globl	disable_mmu_icache_el3
+
 
 func get_afflvl_shift
 	cmp	x0, #3
@@ -150,16 +149,6 @@
 	ret
 
 
-func read_spsr
-	mrs	x0, CurrentEl
-	cmp	x0, #(MODE_EL1 << MODE_EL_SHIFT)
-	b.eq	read_spsr_el1
-	cmp	x0, #(MODE_EL2 << MODE_EL_SHIFT)
-	b.eq	read_spsr_el2
-	cmp	x0, #(MODE_EL3 << MODE_EL_SHIFT)
-	b.eq	read_spsr_el3
-
-
 func read_spsr_el1
 	mrs	x0, spsr_el1
 	ret
@@ -175,44 +164,21 @@
 	ret
 
 
-func write_spsr
-	mrs	x1, CurrentEl
-	cmp	x1, #(MODE_EL1 << MODE_EL_SHIFT)
-	b.eq	write_spsr_el1
-	cmp	x1, #(MODE_EL2 << MODE_EL_SHIFT)
-	b.eq	write_spsr_el2
-	cmp	x1, #(MODE_EL3 << MODE_EL_SHIFT)
-	b.eq	write_spsr_el3
-
-
 func write_spsr_el1
 	msr	spsr_el1, x0
-	isb
 	ret
 
 
 func write_spsr_el2
 	msr	spsr_el2, x0
-	isb
 	ret
 
 
 func write_spsr_el3
 	msr	spsr_el3, x0
-	isb
 	ret
 
 
-func read_elr
-	mrs	x0, CurrentEl
-	cmp	x0, #(MODE_EL1 << MODE_EL_SHIFT)
-	b.eq	read_elr_el1
-	cmp	x0, #(MODE_EL2 << MODE_EL_SHIFT)
-	b.eq	read_elr_el2
-	cmp	x0, #(MODE_EL3 << MODE_EL_SHIFT)
-	b.eq	read_elr_el3
-
-
 func read_elr_el1
 	mrs	x0, elr_el1
 	ret
@@ -228,31 +194,18 @@
 	ret
 
 
-func write_elr
-	mrs	x1, CurrentEl
-	cmp	x1, #(MODE_EL1 << MODE_EL_SHIFT)
-	b.eq	write_elr_el1
-	cmp	x1, #(MODE_EL2 << MODE_EL_SHIFT)
-	b.eq	write_elr_el2
-	cmp	x1, #(MODE_EL3 << MODE_EL_SHIFT)
-	b.eq	write_elr_el3
-
-
 func write_elr_el1
 	msr	elr_el1, x0
-	isb
 	ret
 
 
 func write_elr_el2
 	msr	elr_el2, x0
-	isb
 	ret
 
 
 func write_elr_el3
 	msr	elr_el3, x0
-	isb
 	ret
 
 
@@ -338,3 +291,27 @@
 	subs	x2, x2, #1
 	b.ne	m_loop1
 m_end:	ret
+
+/* ---------------------------------------------------------------------------
+ * Disable the MMU at EL3
+ * This is implemented in assembler to ensure that the data cache is cleaned
+ * and invalidated after the MMU is disabled without any intervening cacheable
+ * data accesses
+ * ---------------------------------------------------------------------------
+ */
+
+func disable_mmu_el3
+	mov	x1, #(SCTLR_M_BIT | SCTLR_C_BIT)	// x1 = sctlr_el3 bits to clear
+do_disable_mmu:				// shared tail: expects x1 = bits to clear
+	mrs	x0, sctlr_el3
+	bic	x0, x0, x1
+	msr	sctlr_el3, x0
+	isb				// ensure MMU is off
+	mov	x0, #DCCISW		// DCache clean and invalidate
+	b	dcsw_op_all		// tail call: dcsw_op_all returns to our caller
+
+
+func disable_mmu_icache_el3
+	mov	x1, #(SCTLR_M_BIT | SCTLR_C_BIT | SCTLR_I_BIT)	// same as disable_mmu_el3, plus the I bit
+	b	do_disable_mmu		// continue in the shared tail of disable_mmu_el3
+
diff --git a/lib/aarch64/sysreg_helpers.S b/lib/aarch64/sysreg_helpers.S
index 61468f9..376da49 100644
--- a/lib/aarch64/sysreg_helpers.S
+++ b/lib/aarch64/sysreg_helpers.S
@@ -125,10 +125,7 @@
 	.globl	write_ttbr0_el3
 
 	.globl	read_ttbr1_el1
-	.globl	read_ttbr1_el2
-	.globl	write_ttbr1
 	.globl	write_ttbr1_el1
-	.globl	write_ttbr1_el2
 
 	.globl	read_cpacr
 	.globl	write_cpacr
@@ -160,8 +157,6 @@
 
 #if SUPPORT_VFP
 	.globl	enable_vfp
-	.globl	read_fpexc
-	.globl	write_fpexc
 #endif
 
 
@@ -201,19 +196,16 @@
 
 func write_vbar_el1
 	msr	vbar_el1, x0
-	isb
 	ret
 
 
 func write_vbar_el2
 	msr	vbar_el2, x0
-	isb
 	ret
 
 
 func write_vbar_el3
 	msr	vbar_el3, x0
-	isb
 	ret
 
 
@@ -238,19 +230,16 @@
 
 func write_afsr0_el1
 	msr	afsr0_el1, x0
-	isb
 	ret
 
 
 func write_afsr0_el2
 	msr	afsr0_el2, x0
-	isb
 	ret
 
 
 func write_afsr0_el3
 	msr	afsr0_el3, x0
-	isb
 	ret
 
 
@@ -275,19 +264,16 @@
 
 func write_far_el1
 	msr	far_el1, x0
-	isb
 	ret
 
 
 func write_far_el2
 	msr	far_el2, x0
-	isb
 	ret
 
 
 func write_far_el3
 	msr	far_el3, x0
-	isb
 	ret
 
 
@@ -312,19 +298,16 @@
 
 func write_mair_el1
 	msr	mair_el1, x0
-	isb
 	ret
 
 
 func write_mair_el2
 	msr	mair_el2, x0
-	isb
 	ret
 
 
 func write_mair_el3
 	msr	mair_el3, x0
-	isb
 	ret
 
 
@@ -349,19 +332,16 @@
 
 func write_amair_el1
 	msr	amair_el1, x0
-	isb
 	ret
 
 
 func write_amair_el2
 	msr	amair_el2, x0
-	isb
 	ret
 
 
 func write_amair_el3
 	msr	amair_el3, x0
-	isb
 	ret
 
 
@@ -405,19 +385,16 @@
 
 func write_rmr_el1
 	msr	rmr_el1, x0
-	isb
 	ret
 
 
 func write_rmr_el2
 	msr	rmr_el2, x0
-	isb
 	ret
 
 
 func write_rmr_el3
 	msr	rmr_el3, x0
-	isb
 	ret
 
 
@@ -442,19 +419,16 @@
 
 func write_afsr1_el1
 	msr	afsr1_el1, x0
-	isb
 	ret
 
 
 func write_afsr1_el2
 	msr	afsr1_el2, x0
-	isb
 	ret
 
 
 func write_afsr1_el3
 	msr	afsr1_el3, x0
-	isb
 	ret
 
 
@@ -479,22 +453,16 @@
 
 func write_sctlr_el1
 	msr	sctlr_el1, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_sctlr_el2
 	msr	sctlr_el2, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_sctlr_el3
 	msr	sctlr_el3, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -519,22 +487,16 @@
 
 func write_actlr_el1
 	msr	actlr_el1, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_actlr_el2
 	msr	actlr_el2, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_actlr_el3
 	msr	actlr_el3, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -559,22 +521,16 @@
 
 func write_esr_el1
 	msr	esr_el1, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_esr_el2
 	msr	esr_el2, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_esr_el3
 	msr	esr_el3, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -599,22 +555,16 @@
 
 func write_tcr_el1
 	msr	tcr_el1, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_tcr_el2
 	msr	tcr_el2, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_tcr_el3
 	msr	tcr_el3, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -622,11 +572,6 @@
 	 * CPTR accessors
 	 * -----------------------------------------------------
 	 */
-func read_cptr_el1
-	b	read_cptr_el1
-	ret
-
-
 func read_cptr_el2
 	mrs	x0, cptr_el2
 	ret
@@ -637,21 +582,13 @@
 	ret
 
 
-func write_cptr_el1
-	b	write_cptr_el1
-
-
 func write_cptr_el2
 	msr	cptr_el2, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_cptr_el3
 	msr	cptr_el3, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -676,19 +613,16 @@
 
 func write_ttbr0_el1
 	msr	ttbr0_el1, x0
-	isb
 	ret
 
 
 func write_ttbr0_el2
 	msr	ttbr0_el2, x0
-	isb
 	ret
 
 
 func write_ttbr0_el3
 	msr	ttbr0_el3, x0
-	isb
 	ret
 
 
@@ -701,28 +635,11 @@
 	ret
 
 
-func read_ttbr1_el2
-	b	read_ttbr1_el2
-
-
-func read_ttbr1_el3
-	b	read_ttbr1_el3
-
-
 func write_ttbr1_el1
 	msr	ttbr1_el1, x0
-	isb
 	ret
 
 
-func write_ttbr1_el2
-	b	write_ttbr1_el2
-
-
-func write_ttbr1_el3
-	b	write_ttbr1_el3
-
-
 func read_hcr
 	mrs	x0, hcr_el2
 	ret
@@ -730,8 +647,6 @@
 
 func write_hcr
 	msr	hcr_el2, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -762,8 +677,6 @@
 
 func write_cpuectlr
 	msr	CPUECTLR_EL1, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -789,8 +702,6 @@
 
 func write_scr
 	msr	scr_el3, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -818,16 +729,7 @@
 	mov	x1, #AARCH64_CPTR_TFP
 	bic	x0, x0, x1
 	msr	cptr_el3, x0
-	ret
-
-
-func read_fpexc
-	b	read_fpexc
-	ret
-
-
-func write_fpexc
-	b	write_fpexc
+	isb
 	ret
 
 #endif
diff --git a/lib/aarch64/tlb_helpers.S b/lib/aarch64/tlb_helpers.S
index ec1558b..8dfae12 100644
--- a/lib/aarch64/tlb_helpers.S
+++ b/lib/aarch64/tlb_helpers.S
@@ -41,47 +41,33 @@
 
 func tlbialle1
 	tlbi	alle1
-	dsb	sy
-	isb
 	ret
 
 
 func tlbialle1is
 	tlbi	alle1is
-	dsb	sy
-	isb
 	ret
 
 
 func tlbialle2
 	tlbi	alle2
-	dsb	sy
-	isb
 	ret
 
 
 func tlbialle2is
 	tlbi	alle2is
-	dsb	sy
-	isb
 	ret
 
 
 func tlbialle3
 	tlbi	alle3
-	dsb	sy
-	isb
 	ret
 
 
 func tlbialle3is
 	tlbi	alle3is
-	dsb	sy
-	isb
 	ret
 
 func tlbivmalle1
 	tlbi	vmalle1
-	dsb	sy
-	isb
 	ret
diff --git a/plat/fvp/aarch64/bl1_plat_helpers.S b/plat/fvp/aarch64/bl1_plat_helpers.S
index 92075ea..b4d4458 100644
--- a/plat/fvp/aarch64/bl1_plat_helpers.S
+++ b/plat/fvp/aarch64/bl1_plat_helpers.S
@@ -67,7 +67,7 @@
 	 * loader zeroes out the zi section.
 	 * ---------------------------------------------
 	 */
-	bl	read_mpidr
+	mrs	x0, mpidr_el1
 	ldr	x1, =PWRC_BASE
 	str	w0, [x1, #PPOFFR_OFF]
 
@@ -173,8 +173,6 @@
 func platform_cold_boot_init
 	mov	x20, x0
 	bl	platform_mem_init
-	bl	read_mpidr
-	mov	x19, x0
 
 	/* ---------------------------------------------
 	 * Give ourselves a small coherent stack to
@@ -182,6 +180,7 @@
 	 * CCI in assembler
 	 * ---------------------------------------------
 	 */
+	mrs	x0, mpidr_el1
 	bl	platform_set_coherent_stack
 
 	/* ---------------------------------------------
@@ -200,7 +199,7 @@
 	 * -IS-WBWA memory
 	 * ---------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	platform_set_stack
 
 	/* ---------------------------------------------
diff --git a/plat/fvp/aarch64/plat_common.c b/plat/fvp/aarch64/plat_common.c
index c8e529d..edeb6e0 100644
--- a/plat/fvp/aarch64/plat_common.c
+++ b/plat/fvp/aarch64/plat_common.c
@@ -69,6 +69,8 @@
 	ttbr = (unsigned long) l1_xlation_table;
 
 	if (GET_EL(current_el) == MODE_EL3) {
+		assert((read_sctlr_el3() & SCTLR_M_BIT) == 0);
+
 		write_mair_el3(mair);
 		tcr |= TCR_EL3_RES1;
 		/* Invalidate EL3 TLBs */
@@ -77,11 +79,19 @@
 		write_tcr_el3(tcr);
 		write_ttbr0_el3(ttbr);
 
+		/* ensure all translation table writes have drained into memory,
+		 * the TLB invalidation is complete, and translation register
+		 * writes are committed before enabling the MMU
+		 */
+		dsb();
+		isb();
+
 		sctlr = read_sctlr_el3();
 		sctlr |= SCTLR_WXN_BIT | SCTLR_M_BIT | SCTLR_I_BIT;
 		sctlr |= SCTLR_A_BIT | SCTLR_C_BIT;
 		write_sctlr_el3(sctlr);
 	} else {
+		assert((read_sctlr_el1() & SCTLR_M_BIT) == 0);
 
 		write_mair_el1(mair);
 		/* Invalidate EL1 TLBs */
@@ -90,32 +100,20 @@
 		write_tcr_el1(tcr);
 		write_ttbr0_el1(ttbr);
 
+		/* ensure all translation table writes have drained into memory,
+		 * the TLB invalidation is complete, and translation register
+		 * writes are committed before enabling the MMU
+		 */
+		dsb();
+		isb();
+
 		sctlr = read_sctlr_el1();
 		sctlr |= SCTLR_WXN_BIT | SCTLR_M_BIT | SCTLR_I_BIT;
 		sctlr |= SCTLR_A_BIT | SCTLR_C_BIT;
 		write_sctlr_el1(sctlr);
 	}
-
-	return;
-}
-
-void disable_mmu(void)
-{
-	unsigned long sctlr;
-	unsigned long current_el = read_current_el();
-
-	if (GET_EL(current_el) == MODE_EL3) {
-		sctlr = read_sctlr_el3();
-		sctlr = sctlr & ~(SCTLR_M_BIT | SCTLR_C_BIT);
-		write_sctlr_el3(sctlr);
-	} else {
-		sctlr = read_sctlr_el1();
-		sctlr = sctlr & ~(SCTLR_M_BIT | SCTLR_C_BIT);
-		write_sctlr_el1(sctlr);
-	}
-
-	/* Flush the caches */
-	dcsw_op_all(DCCISW);
+	/* ensure the MMU enable takes effect immediately */
+	isb();
 
 	return;
 }
diff --git a/plat/fvp/plat_gic.c b/plat/fvp/plat_gic.c
index 8457af1..db3c9cf 100644
--- a/plat/fvp/plat_gic.c
+++ b/plat/fvp/plat_gic.c
@@ -86,6 +86,7 @@
 	 */
 	scr_val = read_scr();
 	write_scr(scr_val | SCR_NS_BIT);
+	isb();	/* ensure NS=1 takes effect before accessing ICC_SRE_EL2 */
 
 	/*
 	 * By default EL2 and NS-EL1 software should be able to enable GICv3
@@ -103,9 +104,11 @@
 	write_icc_sre_el2(val | ICC_SRE_EN | ICC_SRE_SRE);
 
 	write_icc_pmr_el1(GIC_PRI_MASK);
+	isb();	/* commit ICC_* changes before setting NS=0 */
 
 	/* Restore SCR_EL3 */
 	write_scr(scr_val);
+	isb();	/* ensure NS=0 takes effect immediately */
 }
 
 /*******************************************************************************
diff --git a/plat/fvp/plat_pm.c b/plat/fvp/plat_pm.c
index 5430fff..f80e2d7 100644
--- a/plat/fvp/plat_pm.c
+++ b/plat/fvp/plat_pm.c
@@ -54,7 +54,11 @@
 	if (target_afflvl != MPIDR_AFFLVL0)
 		return PSCI_E_INVALID_PARAMS;
 
-	/* Enter standby state */
+	/*
+	 * Enter standby state
+	 * dsb is good practice before using wfi to enter low power states
+	 */
+	dsb();
 	wfi();
 
 	return PSCI_E_SUCCESS;
diff --git a/plat/fvp/platform.h b/plat/fvp/platform.h
index 1f4e432..3fe892e 100644
--- a/plat/fvp/platform.h
+++ b/plat/fvp/platform.h
@@ -298,7 +298,6 @@
 #define PL011_UART1_BASE		0x1c0a0000
 #define PL011_UART2_BASE		0x1c0b0000
 #define PL011_UART3_BASE		0x1c0c0000
-#define PL011_BASE			PL011_UART0_BASE
 
 
 /*******************************************************************************
@@ -371,7 +370,6 @@
 extern void bl31_plat_arch_setup(void);
 extern int platform_setup_pm(const struct plat_pm_ops **);
 extern unsigned int platform_get_core_pos(unsigned long mpidr);
-extern void disable_mmu(void);
 extern void enable_mmu(void);
 extern void configure_mmu(struct meminfo *,
 			  unsigned long,
diff --git a/services/std_svc/psci/psci_afflvl_off.c b/services/std_svc/psci/psci_afflvl_off.c
index e007bc3..21a4d1a 100644
--- a/services/std_svc/psci/psci_afflvl_off.c
+++ b/services/std_svc/psci/psci_afflvl_off.c
@@ -82,6 +82,7 @@
 	sctlr = read_sctlr_el3();
 	sctlr &= ~SCTLR_C_BIT;
 	write_sctlr_el3(sctlr);
+	isb();	/* ensure data cache disable takes immediate effect */
 
 	/*
 	 * CAUTION: This flush to the level of unification makes an assumption
diff --git a/services/std_svc/psci/psci_afflvl_suspend.c b/services/std_svc/psci/psci_afflvl_suspend.c
index dc12f7a..534e4a9 100644
--- a/services/std_svc/psci/psci_afflvl_suspend.c
+++ b/services/std_svc/psci/psci_afflvl_suspend.c
@@ -198,6 +198,7 @@
 	sctlr = read_sctlr_el3();
 	sctlr &= ~SCTLR_C_BIT;
 	write_sctlr_el3(sctlr);
+	isb();	/* ensure data cache disable takes immediate effect */
 
 	/*
 	 * CAUTION: This flush to the level of unification makes an assumption
diff --git a/services/std_svc/psci/psci_entry.S b/services/std_svc/psci/psci_entry.S
index e2c690d..256c538 100644
--- a/services/std_svc/psci/psci_entry.S
+++ b/services/std_svc/psci/psci_entry.S
@@ -75,10 +75,8 @@
 	 * ---------------------------------------------
 	 */
 	msr	spsel, #0
-	isb
 
-	bl	read_mpidr
-	mov	x19, x0
+	mrs	x0, mpidr_el1
 	bl	platform_set_coherent_stack
 
 	/* ---------------------------------------------
@@ -86,14 +84,14 @@
 	 * level 0.
 	 * ---------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	get_power_on_target_afflvl
 	cmp	x0, xzr
 	b.lt	_panic
 	mov	x3, x23
 	mov	x2, x0
-	mov	x0, x19
 	mov	x1, #MPIDR_AFFLVL0
+	mrs	x0, mpidr_el1
 	blr	x22
 
 	/* --------------------------------------------
@@ -101,10 +99,9 @@
 	 * -IS-WBWA memory
 	 * --------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	platform_set_stack
 
-	zero_callee_saved_regs
 	b	el3_exit
 _panic:
 	b	_panic
@@ -120,7 +117,7 @@
 	sub	sp, sp, #0x10
 	stp	x19, x20, [sp, #0]
 	mov	x19, sp
-	bl	read_mpidr
+	mrs	x0, mpidr_el1
 	bl	platform_set_coherent_stack
 	bl	psci_cpu_off
 	mov	x1, #PSCI_E_SUCCESS
@@ -141,7 +138,7 @@
 	mov	x20, x0
 	mov	x21, x1
 	mov	x22, x2
-	bl	read_mpidr
+	mrs	x0, mpidr_el1
 	bl	platform_set_coherent_stack
 	mov	x0, x20
 	mov	x1, x21
@@ -158,7 +155,7 @@
 	ret
 
 func final_wfi
-	dsb	sy
+	dsb	sy		// ensure write buffer empty
 	wfi
 wfi_spill:
 	b	wfi_spill