Merge branch 'x86/urgent' into x86/asm, to resolve a conflict

Conflicts:
	arch/x86/kernel/entry_64.S

Signed-off-by: Ingo Molnar <mingo@kernel.org>
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 1d7fbbc..8ef964d 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -29,6 +29,7 @@
 #include <asm/page_types.h>
 #include <asm/boot.h>
 #include <asm/asm-offsets.h>
+#include <asm/bootparam.h>
 
 	__HEAD
 ENTRY(startup_32)
@@ -102,7 +103,7 @@
 	 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
 	 * us to not reload segments
 	 */
-	testb	$(1<<6), BP_loadflags(%esi)
+	testb	$KEEP_SEGMENTS, BP_loadflags(%esi)
 	jnz	1f
 
 	cli
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 6b1766c..90c1521 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -31,6 +31,7 @@
 #include <asm/msr.h>
 #include <asm/processor-flags.h>
 #include <asm/asm-offsets.h>
+#include <asm/bootparam.h>
 
 	__HEAD
 	.code32
@@ -46,7 +47,7 @@
 	 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
 	 * us to not reload segments
 	 */
-	testb $(1<<6), BP_loadflags(%esi)
+	testb $KEEP_SEGMENTS, BP_loadflags(%esi)
 	jnz 1f
 
 	cli
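
These two hunks (head_32.S and head_64.S) only swap a magic constant for its symbolic name: KEEP_SEGMENTS is bit 6 of boot_params.hdr.loadflags, so the generated testb instruction is unchanged. A minimal C sketch of the check, assuming the (1<<6) value from the UAPI bootparam header; the helper name is invented for illustration:

	#include <assert.h>
	#include <stdint.h>

	#define KEEP_SEGMENTS	(1 << 6)	/* value from asm/bootparam.h */

	/* Mirrors what "testb $KEEP_SEGMENTS, BP_loadflags(%esi)" tests. */
	static int bootloader_wants_segments_kept(uint8_t loadflags)
	{
		return (loadflags & KEEP_SEGMENTS) != 0;
	}

	int main(void)
	{
		assert(KEEP_SEGMENTS == (1 << 6));		/* same bit as the old literal */
		assert(bootloader_wants_segments_kept(0x40));
		assert(!bootloader_wants_segments_kept(0x01));	/* e.g. LOADED_HIGH only */
		return 0;
	}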
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index e785b42..bb635c6 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,7 +3,6 @@
 #
 
 obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
-obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o
 
 obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
 
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index d0165c9..1f5e2b0 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -203,6 +203,8 @@
 
 	err |= restore_xstate_sig(buf, 1);
 
+	force_iret();
+
 	return err;
 }
 
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 156ebca..5d2641c 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -30,24 +30,13 @@
 
 	.section .entry.text, "ax"
 
-	.macro IA32_ARG_FIXUP noebp=0
-	movl	%edi,%r8d
-	.if \noebp
-	.else
-	movl	%ebp,%r9d
-	.endif
-	xchg	%ecx,%esi
-	movl	%ebx,%edi
-	movl	%edx,%edx	/* zero extension */
-	.endm 
-
-	/* clobbers %eax */	
-	.macro  CLEAR_RREGS offset=0, _r9=rax
+	/* clobbers %rax */
+	.macro  CLEAR_RREGS _r9=rax
 	xorl 	%eax,%eax
-	movq	%rax,\offset+R11(%rsp)
-	movq	%rax,\offset+R10(%rsp)
-	movq	%\_r9,\offset+R9(%rsp)
-	movq	%rax,\offset+R8(%rsp)
+	movq	%rax,R11(%rsp)
+	movq	%rax,R10(%rsp)
+	movq	%\_r9,R9(%rsp)
+	movq	%rax,R8(%rsp)
 	.endm
 
 	/*
@@ -60,14 +49,14 @@
 	 * If it's -1 to make us punt the syscall, then (u32)-1 is still
 	 * an appropriately invalid value.
 	 */
-	.macro LOAD_ARGS32 offset, _r9=0
+	.macro LOAD_ARGS32 _r9=0
 	.if \_r9
-	movl \offset+16(%rsp),%r9d
+	movl R9(%rsp),%r9d
 	.endif
-	movl \offset+40(%rsp),%ecx
-	movl \offset+48(%rsp),%edx
-	movl \offset+56(%rsp),%esi
-	movl \offset+64(%rsp),%edi
+	movl RCX(%rsp),%ecx
+	movl RDX(%rsp),%edx
+	movl RSI(%rsp),%esi
+	movl RDI(%rsp),%edi
 	movl %eax,%eax			/* zero extension */
 	.endm
 	
@@ -99,35 +88,38 @@
 /*
  * 32bit SYSENTER instruction entry.
  *
+ * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
+ * IF and VM in rflags are cleared (IOW: interrupts are off).
+ * SYSENTER does not save anything on the stack,
+ * and does not save old rip (!!!) and rflags.
+ *
  * Arguments:
- * %eax	System call number.
- * %ebx Arg1
- * %ecx Arg2
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp user stack
- * 0(%ebp) Arg6	
- * 	
- * Interrupts off.
- *	
+ * eax  system call number
+ * ebx  arg1
+ * ecx  arg2
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * ebp  user stack
+ * 0(%ebp) arg6
+ *
  * This is purely a fast path. For anything complicated we use the int 0x80
- * path below.	Set up a complete hardware stack frame to share code
+ * path below. We set up a complete hardware stack frame to share code
  * with the int 0x80 path.
- */ 	
+ */
 ENTRY(ia32_sysenter_target)
 	CFI_STARTPROC32	simple
 	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA	rsp,0
 	CFI_REGISTER	rsp,rbp
 	SWAPGS_UNSAFE_STACK
-	movq	PER_CPU_VAR(kernel_stack), %rsp
-	addq	$(KERNEL_STACK_OFFSET),%rsp
+	movq	PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
 	/*
 	 * No need to follow this irqs on/off section: the syscall
 	 * disabled irqs, here we enable it straight after entry:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
+	/* Construct iret frame (ss,rsp,rflags,cs,rip) */
  	movl	%ebp,%ebp		/* zero extension */
 	pushq_cfi $__USER32_DS
 	/*CFI_REL_OFFSET ss,0*/
@@ -135,18 +127,24 @@
 	CFI_REL_OFFSET rsp,0
 	pushfq_cfi
 	/*CFI_REL_OFFSET rflags,0*/
-	movl	TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d
+	movl	ASM_THREAD_INFO(TI_sysenter_return, %rsp, 3*8), %r10d
 	CFI_REGISTER rip,r10
 	pushq_cfi $__USER32_CS
 	/*CFI_REL_OFFSET cs,0*/
 	movl	%eax, %eax
+	/* Store thread_info->sysenter_return in rip stack slot */
 	pushq_cfi %r10
 	CFI_REL_OFFSET rip,0
+	/* Store orig_ax */
 	pushq_cfi %rax
+	/* Construct the rest of "struct pt_regs" */
 	cld
-	SAVE_ARGS 0,1,0
- 	/* no need to do an access_ok check here because rbp has been
- 	   32bit zero extended */ 
+	ALLOC_PT_GPREGS_ON_STACK
+	SAVE_C_REGS_EXCEPT_R891011
+	/*
+	 * no need to do an access_ok check here because rbp has been
+	 * 32bit zero extended
+	 */
 	ASM_STAC
 1:	movl	(%rbp),%ebp
 	_ASM_EXTABLE(1b,ia32_badarg)
@@ -157,32 +155,39 @@
 	 * ourselves.  To save a few cycles, we can check whether
 	 * NT was set instead of doing an unconditional popfq.
 	 */
-	testl $X86_EFLAGS_NT,EFLAGS-ARGOFFSET(%rsp)
+	testl $X86_EFLAGS_NT,EFLAGS(%rsp)
 	jnz sysenter_fix_flags
 sysenter_flags_fixed:
 
-	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	orl     $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	CFI_REMEMBER_STATE
 	jnz  sysenter_tracesys
 	cmpq	$(IA32_NR_syscalls-1),%rax
 	ja	ia32_badsys
 sysenter_do_call:
-	IA32_ARG_FIXUP
+	/* 32bit syscall -> 64bit C ABI argument conversion */
+	movl	%edi,%r8d	/* arg5 */
+	movl	%ebp,%r9d	/* arg6 */
+	xchg	%ecx,%esi	/* rsi:arg2, rcx:arg4 */
+	movl	%ebx,%edi	/* arg1 */
+	movl	%edx,%edx	/* arg3 (zero extension) */
 sysenter_dispatch:
 	call	*ia32_sys_call_table(,%rax,8)
-	movq	%rax,RAX-ARGOFFSET(%rsp)
+	movq	%rax,RAX(%rsp)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl	$_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz	sysexit_audit
 sysexit_from_sys_call:
-	andl    $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	andl    $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	/* clear IF, that popfq doesn't enable interrupts early */
-	andl	$~0x200,EFLAGS-ARGOFFSET(%rsp)
-	movl	RIP-ARGOFFSET(%rsp),%edx		/* User %eip */
+	andl	$~0x200,EFLAGS(%rsp)
+	movl	RIP(%rsp),%edx		/* User %eip */
 	CFI_REGISTER rip,rdx
-	RESTORE_ARGS 0,24,0,0,0,0
+	RESTORE_RSI_RDI
+	/* pop everything except ss,rsp,rflags slots */
+	REMOVE_PT_GPREGS_FROM_STACK 3*8
 	xorq	%r8,%r8
 	xorq	%r9,%r9
 	xorq	%r10,%r10
@@ -192,6 +197,10 @@
 	popq_cfi %rcx				/* User %esp */
 	CFI_REGISTER rsp,rcx
 	TRACE_IRQS_ON
+	/*
+	 * 32bit SYSEXIT restores eip from edx, esp from ecx.
+	 * cs and ss are loaded from MSRs.
+	 */
 	ENABLE_INTERRUPTS_SYSEXIT32
 
 	CFI_RESTORE_STATE
@@ -205,18 +214,18 @@
 	movl %ebx,%esi			/* 2nd arg: 1st syscall arg */
 	movl %eax,%edi			/* 1st arg: syscall number */
 	call __audit_syscall_entry
-	movl RAX-ARGOFFSET(%rsp),%eax	/* reload syscall number */
+	movl RAX(%rsp),%eax	/* reload syscall number */
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
 	movl %ebx,%edi			/* reload 1st syscall arg */
-	movl RCX-ARGOFFSET(%rsp),%esi	/* reload 2nd syscall arg */
-	movl RDX-ARGOFFSET(%rsp),%edx	/* reload 3rd syscall arg */
-	movl RSI-ARGOFFSET(%rsp),%ecx	/* reload 4th syscall arg */
-	movl RDI-ARGOFFSET(%rsp),%r8d	/* reload 5th syscall arg */
+	movl RCX(%rsp),%esi	/* reload 2nd syscall arg */
+	movl RDX(%rsp),%edx	/* reload 3rd syscall arg */
+	movl RSI(%rsp),%ecx	/* reload 4th syscall arg */
+	movl RDI(%rsp),%r8d	/* reload 5th syscall arg */
 	.endm
 
 	.macro auditsys_exit exit
-	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz ia32_ret_from_sys_call
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
@@ -227,13 +236,13 @@
 1:	setbe %al		/* 1 if error, 0 if not */
 	movzbl %al,%edi		/* zero-extend that into %edi */
 	call __audit_syscall_exit
-	movq RAX-ARGOFFSET(%rsp),%rax	/* reload syscall return value */
+	movq RAX(%rsp),%rax	/* reload syscall return value */
 	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jz \exit
-	CLEAR_RREGS -ARGOFFSET
+	CLEAR_RREGS
 	jmp int_with_check
 	.endm
 
@@ -253,16 +262,16 @@
 
 sysenter_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jz	sysenter_auditsys
 #endif
-	SAVE_REST
+	SAVE_EXTRA_REGS
 	CLEAR_RREGS
 	movq	$-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
 	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
 	call	syscall_trace_enter
-	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
-	RESTORE_REST
+	LOAD_ARGS32  /* reload args from stack in case ptrace changed it */
+	RESTORE_EXTRA_REGS
 	cmpq	$(IA32_NR_syscalls-1),%rax
 	ja	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
 	jmp	sysenter_do_call
@@ -272,27 +281,37 @@
 /*
  * 32bit SYSCALL instruction entry.
  *
+ * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
+ *
+ * Note: rflags saving+masking-with-MSR happens only in Long mode
+ * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it).
+ * Don't get confused: rflags saving+masking depends on Long Mode Active bit
+ * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
+ * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
+ *
  * Arguments:
- * %eax	System call number.
- * %ebx Arg1
- * %ecx return EIP 
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp Arg2    [note: not saved in the stack frame, should not be touched]
- * %esp user stack 
- * 0(%esp) Arg6
- * 	
- * Interrupts off.
- *	
+ * eax  system call number
+ * ecx  return address
+ * ebx  arg1
+ * ebp  arg2	(note: not saved in the stack frame, should not be touched)
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * esp  user stack
+ * 0(%esp) arg6
+ *
  * This is purely a fast path. For anything complicated we use the int 0x80
- * path below.	Set up a complete hardware stack frame to share code
- * with the int 0x80 path.	
- */ 	
+ * path below. We set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */
 ENTRY(ia32_cstar_target)
 	CFI_STARTPROC32	simple
 	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,KERNEL_STACK_OFFSET
+	CFI_DEF_CFA	rsp,0
 	CFI_REGISTER	rip,rcx
 	/*CFI_REGISTER	rflags,r11*/
 	SWAPGS_UNSAFE_STACK
@@ -304,62 +323,77 @@
 	 * disabled irqs and here we enable it straight after entry:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_ARGS 8,0,0
+	ALLOC_PT_GPREGS_ON_STACK 6*8 /* 6*8: space for orig_ax and iret frame */
+	SAVE_C_REGS_EXCEPT_RCX_R891011
 	movl 	%eax,%eax	/* zero extension */
-	movq	%rax,ORIG_RAX-ARGOFFSET(%rsp)
-	movq	%rcx,RIP-ARGOFFSET(%rsp)
-	CFI_REL_OFFSET rip,RIP-ARGOFFSET
-	movq	%rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
+	movq	%rax,ORIG_RAX(%rsp)
+	movq	%rcx,RIP(%rsp)
+	CFI_REL_OFFSET rip,RIP
+	movq	%rbp,RCX(%rsp) /* this lies slightly to ptrace */
 	movl	%ebp,%ecx
-	movq	$__USER32_CS,CS-ARGOFFSET(%rsp)
-	movq	$__USER32_DS,SS-ARGOFFSET(%rsp)
-	movq	%r11,EFLAGS-ARGOFFSET(%rsp)
-	/*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
-	movq	%r8,RSP-ARGOFFSET(%rsp)	
-	CFI_REL_OFFSET rsp,RSP-ARGOFFSET
-	/* no need to do an access_ok check here because r8 has been
-	   32bit zero extended */ 
-	/* hardware stack frame is complete now */	
+	movq	$__USER32_CS,CS(%rsp)
+	movq	$__USER32_DS,SS(%rsp)
+	movq	%r11,EFLAGS(%rsp)
+	/*CFI_REL_OFFSET rflags,EFLAGS*/
+	movq	%r8,RSP(%rsp)
+	CFI_REL_OFFSET rsp,RSP
+	/* iret stack frame is complete now */
+	/*
+	 * no need to do an access_ok check here because r8 has been
+	 * 32bit zero extended
+	 */
 	ASM_STAC
 1:	movl	(%r8),%r9d
 	_ASM_EXTABLE(1b,ia32_badarg)
 	ASM_CLAC
-	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	orl     $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	CFI_REMEMBER_STATE
 	jnz   cstar_tracesys
 	cmpq $IA32_NR_syscalls-1,%rax
 	ja  ia32_badsys
 cstar_do_call:
-	IA32_ARG_FIXUP 1
+	/* 32bit syscall -> 64bit C ABI argument conversion */
+	movl	%edi,%r8d	/* arg5 */
+	/* r9 already loaded */	/* arg6 */
+	xchg	%ecx,%esi	/* rsi:arg2, rcx:arg4 */
+	movl	%ebx,%edi	/* arg1 */
+	movl	%edx,%edx	/* arg3 (zero extension) */
 cstar_dispatch:
 	call *ia32_sys_call_table(,%rax,8)
-	movq %rax,RAX-ARGOFFSET(%rsp)
+	movq %rax,RAX(%rsp)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz sysretl_audit
 sysretl_from_sys_call:
-	andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	RESTORE_ARGS 0,-ARG_SKIP,0,0,0
-	movl RIP-ARGOFFSET(%rsp),%ecx
+	andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	RESTORE_RSI_RDI_RDX
+	movl RIP(%rsp),%ecx
 	CFI_REGISTER rip,rcx
-	movl EFLAGS-ARGOFFSET(%rsp),%r11d	
+	movl EFLAGS(%rsp),%r11d
 	/*CFI_REGISTER rflags,r11*/
 	xorq	%r10,%r10
 	xorq	%r9,%r9
 	xorq	%r8,%r8
 	TRACE_IRQS_ON
-	movl RSP-ARGOFFSET(%rsp),%esp
+	movl RSP(%rsp),%esp
 	CFI_RESTORE rsp
+	/*
+	 * 64bit->32bit SYSRET restores eip from ecx,
+	 * eflags from r11 (but RF and VM bits are forced to 0),
+	 * cs and ss are loaded from MSRs.
+	 * (Note: 32bit->32bit SYSRET is different: since r11
+	 * does not exist, it merely sets eflags.IF=1).
+	 */
 	USERGS_SYSRET32
-	
+
 #ifdef CONFIG_AUDITSYSCALL
 cstar_auditsys:
 	CFI_RESTORE_STATE
-	movl %r9d,R9-ARGOFFSET(%rsp)	/* register to be clobbered by call */
+	movl %r9d,R9(%rsp)	/* register to be clobbered by call */
 	auditsys_entry_common
-	movl R9-ARGOFFSET(%rsp),%r9d	/* reload 6th syscall arg */
+	movl R9(%rsp),%r9d	/* reload 6th syscall arg */
 	jmp cstar_dispatch
 
 sysretl_audit:
@@ -368,17 +402,17 @@
 
 cstar_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jz cstar_auditsys
 #endif
 	xchgl %r9d,%ebp
-	SAVE_REST
-	CLEAR_RREGS 0, r9
+	SAVE_EXTRA_REGS
+	CLEAR_RREGS r9
 	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
-	LOAD_ARGS32 ARGOFFSET, 1  /* reload args from stack in case ptrace changed it */
-	RESTORE_REST
+	LOAD_ARGS32 1	/* reload args from stack in case ptrace changed it */
+	RESTORE_EXTRA_REGS
 	xchgl %ebp,%r9d
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
@@ -391,26 +425,26 @@
 	jmp ia32_sysret
 	CFI_ENDPROC
 
-/* 
- * Emulated IA32 system calls via int 0x80. 
+/*
+ * Emulated IA32 system calls via int 0x80.
  *
- * Arguments:	 
- * %eax	System call number.
- * %ebx Arg1
- * %ecx Arg2
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp Arg6    [note: not saved in the stack frame, should not be touched]
+ * Arguments:
+ * eax  system call number
+ * ebx  arg1
+ * ecx  arg2
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * ebp  arg6	(note: not saved in the stack frame, should not be touched)
  *
  * Notes:
- * Uses the same stack frame as the x86-64 version.	
- * All registers except %eax must be saved (but ptrace may violate that)
+ * Uses the same stack frame as the x86-64 version.
+ * All registers except eax must be saved (but ptrace may violate that).
  * Arguments are zero extended. For system calls that want sign extension and
  * take long arguments a wrapper is needed. Most calls can just be called
  * directly.
- * Assumes it is only called from user space and entered with interrupts off.	
- */ 				
+ * Assumes it is only called from user space and entered with interrupts off.
+ */
 
 ENTRY(ia32_syscall)
 	CFI_STARTPROC32	simple
@@ -429,40 +463,46 @@
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	movl %eax,%eax
-	pushq_cfi %rax
+	pushq_cfi %rax		/* store orig_ax */
 	cld
 	/* note the registers are not zero extended to the sf.
 	   this could be a problem. */
-	SAVE_ARGS 0,1,0
-	orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	ALLOC_PT_GPREGS_ON_STACK
+	SAVE_C_REGS_EXCEPT_R891011
+	orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz ia32_tracesys
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
 ia32_do_call:
-	IA32_ARG_FIXUP
+	/* 32bit syscall -> 64bit C ABI argument conversion */
+	movl %edi,%r8d	/* arg5 */
+	movl %ebp,%r9d	/* arg6 */
+	xchg %ecx,%esi	/* rsi:arg2, rcx:arg4 */
+	movl %ebx,%edi	/* arg1 */
+	movl %edx,%edx	/* arg3 (zero extension) */
 	call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
 ia32_sysret:
-	movq %rax,RAX-ARGOFFSET(%rsp)
+	movq %rax,RAX(%rsp)
 ia32_ret_from_sys_call:
-	CLEAR_RREGS -ARGOFFSET
-	jmp int_ret_from_sys_call 
+	CLEAR_RREGS
+	jmp int_ret_from_sys_call
 
-ia32_tracesys:			 
-	SAVE_REST
+ia32_tracesys:
+	SAVE_EXTRA_REGS
 	CLEAR_RREGS
 	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
-	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
-	RESTORE_REST
+	LOAD_ARGS32	/* reload args from stack in case ptrace changed it */
+	RESTORE_EXTRA_REGS
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja  int_ret_from_sys_call	/* ia32_tracesys has set RAX(%rsp) */
 	jmp ia32_do_call
 END(ia32_syscall)
 
 ia32_badsys:
-	movq $0,ORIG_RAX-ARGOFFSET(%rsp)
+	movq $0,ORIG_RAX(%rsp)
 	movq $-ENOSYS,%rax
 	jmp ia32_sysret
 
@@ -492,24 +532,23 @@
 
 	ALIGN
 ia32_ptregs_common:
-	popq %r11
 	CFI_ENDPROC
 	CFI_STARTPROC32	simple
 	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,SS+8-ARGOFFSET
-	CFI_REL_OFFSET	rax,RAX-ARGOFFSET
-	CFI_REL_OFFSET	rcx,RCX-ARGOFFSET
-	CFI_REL_OFFSET	rdx,RDX-ARGOFFSET
-	CFI_REL_OFFSET	rsi,RSI-ARGOFFSET
-	CFI_REL_OFFSET	rdi,RDI-ARGOFFSET
-	CFI_REL_OFFSET	rip,RIP-ARGOFFSET
-/*	CFI_REL_OFFSET	cs,CS-ARGOFFSET*/
-/*	CFI_REL_OFFSET	rflags,EFLAGS-ARGOFFSET*/
-	CFI_REL_OFFSET	rsp,RSP-ARGOFFSET
-/*	CFI_REL_OFFSET	ss,SS-ARGOFFSET*/
-	SAVE_REST
+	CFI_DEF_CFA	rsp,SIZEOF_PTREGS
+	CFI_REL_OFFSET	rax,RAX
+	CFI_REL_OFFSET	rcx,RCX
+	CFI_REL_OFFSET	rdx,RDX
+	CFI_REL_OFFSET	rsi,RSI
+	CFI_REL_OFFSET	rdi,RDI
+	CFI_REL_OFFSET	rip,RIP
+/*	CFI_REL_OFFSET	cs,CS*/
+/*	CFI_REL_OFFSET	rflags,EFLAGS*/
+	CFI_REL_OFFSET	rsp,RSP
+/*	CFI_REL_OFFSET	ss,SS*/
+	SAVE_EXTRA_REGS 8
 	call *%rax
-	RESTORE_REST
-	jmp  ia32_sysret	/* misbalances the return cache */
+	RESTORE_EXTRA_REGS 8
+	ret
 	CFI_ENDPROC
 END(ia32_ptregs_common)
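
The open-coded register moves that replaced IA32_ARG_FIXUP implement a fixed mapping from the i386 syscall registers (ebx, ecx, edx, esi, edi, ebp or the user stack) to the 64-bit C calling convention (rdi, rsi, rdx, rcx, r8, r9), with every value zero-extended. A hedged C sketch of the same mapping; sys_example and dispatch_compat are invented names, not kernel functions:

	#include <stdio.h>

	/*
	 * Hypothetical 6-argument handler following the 64-bit C ABI
	 * (arguments 1..6 arrive in rdi, rsi, rdx, rcx, r8, r9).
	 */
	static long sys_example(long a1, long a2, long a3, long a4, long a5, long a6)
	{
		return a1 + a2 + a3 + a4 + a5 + a6;
	}

	/*
	 * What the "32bit syscall -> 64bit C ABI" blocks do:
	 * ebx->rdi, ecx->rsi, edx->rdx, esi->rcx, edi->r8, ebp->r9,
	 * with every value zero-extended to 64 bits (unsigned int -> long).
	 */
	static long dispatch_compat(unsigned int ebx, unsigned int ecx,
				    unsigned int edx, unsigned int esi,
				    unsigned int edi, unsigned int ebp)
	{
		return sys_example(ebx, ecx, edx, esi, edi, ebp);
	}

	int main(void)
	{
		printf("%ld\n", dispatch_compat(1, 2, 3, 4, 5, 6));	/* prints 21 */
		return 0;
	}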
diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c
deleted file mode 100644
index 51ecd5b..0000000
--- a/arch/x86/ia32/nosyscall.c
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/errno.h>
-
-long compat_ni_syscall(void)
-{
-	return -ENOSYS;
-}
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 8e0ceec..719cd70 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -201,20 +201,6 @@
 				advice);
 }
 
-long sys32_vm86_warning(void)
-{
-	struct task_struct *me = current;
-	static char lastcomm[sizeof(me->comm)];
-
-	if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
-		compat_printk(KERN_INFO
-			      "%s: vm86 mode not supported on 64 bit kernel\n",
-			      me->comm);
-		strncpy(lastcomm, me->comm, sizeof(lastcomm));
-	}
-	return -ENOSYS;
-}
-
 asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi,
 				   size_t count)
 {
diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c
deleted file mode 100644
index 4754ba0..0000000
--- a/arch/x86/ia32/syscall_ia32.c
+++ /dev/null
@@ -1,25 +0,0 @@
-/* System call table for ia32 emulation. */
-
-#include <linux/linkage.h>
-#include <linux/sys.h>
-#include <linux/cache.h>
-#include <asm/asm-offsets.h>
-
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ;
-#include <asm/syscalls_32.h>
-#undef __SYSCALL_I386
-
-#define __SYSCALL_I386(nr, sym, compat) [nr] = compat,
-
-typedef void (*sys_call_ptr_t)(void);
-
-extern void compat_ni_syscall(void);
-
-const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
-	/*
-	 * Smells like a compiler bug -- it doesn't work
-	 * when the & below is removed.
-	 */
-	[0 ... __NR_ia32_syscall_max] = &compat_ni_syscall,
-#include <asm/syscalls_32.h>
-};
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 372231c..524bddc 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -18,12 +18,53 @@
 	.endm
 #endif
 
-.macro altinstruction_entry orig alt feature orig_len alt_len
+.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
 	.long \orig - .
 	.long \alt - .
 	.word \feature
 	.byte \orig_len
 	.byte \alt_len
+	.byte \pad_len
+.endm
+
+.macro ALTERNATIVE oldinstr, newinstr, feature
+140:
+	\oldinstr
+141:
+	.skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+142:
+
+	.pushsection .altinstructions,"a"
+	altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
+	.popsection
+
+	.pushsection .altinstr_replacement,"ax"
+143:
+	\newinstr
+144:
+	.popsection
+.endm
+
+.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
+140:
+	\oldinstr
+141:
+	.skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+	.skip -(((145f-144f)-(144f-143f)-(141b-140b)) > 0) * ((145f-144f)-(144f-143f)-(141b-140b)),0x90
+142:
+
+	.pushsection .altinstructions,"a"
+	altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
+	altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
+	.popsection
+
+	.pushsection .altinstr_replacement,"ax"
+143:
+	\newinstr1
+144:
+	\newinstr2
+145:
+	.popsection
 .endm
 
 #endif  /*  __ASSEMBLY__  */
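
The .skip expression used by the new ALTERNATIVE/ALTERNATIVE_2 macros leans on GNU as evaluating a true comparison to -1 in an absolute expression, so "-((repl-orig) > 0) * (repl-orig)" emits max(0, repl-orig) NOP (0x90) bytes at build time. A small C model of that arithmetic, for illustration only:

	#include <assert.h>

	/*
	 * GAS: ".skip -((repl - orig) > 0) * (repl - orig), 0x90"
	 * A true comparison evaluates to -1, a false one to 0.
	 */
	static int build_time_pad(int orig_len, int repl_len)
	{
		int diff = repl_len - orig_len;
		int gas_true = (diff > 0) ? -1 : 0;	/* GAS relational result */

		return -gas_true * diff;		/* == max(0, diff) */
	}

	int main(void)
	{
		assert(build_time_pad(3, 5) == 2);	/* longer replacement: pad 2 NOPs */
		assert(build_time_pad(5, 3) == 0);	/* shorter replacement: no pad */
		assert(build_time_pad(4, 4) == 0);
		return 0;
	}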
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 473bdbe..5aef6a9 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -48,8 +48,9 @@
 	s32 repl_offset;	/* offset to replacement instruction */
 	u16 cpuid;		/* cpuid bit set for replacement */
 	u8  instrlen;		/* length of original instruction */
-	u8  replacementlen;	/* length of new instruction, <= instrlen */
-};
+	u8  replacementlen;	/* length of new instruction */
+	u8  padlen;		/* length of build-time padding */
+} __packed;
 
 extern void alternative_instructions(void);
 extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
@@ -76,50 +77,61 @@
 }
 #endif	/* CONFIG_SMP */
 
-#define OLDINSTR(oldinstr)	"661:\n\t" oldinstr "\n662:\n"
+#define b_replacement(num)	"664"#num
+#define e_replacement(num)	"665"#num
 
-#define b_replacement(number)	"663"#number
-#define e_replacement(number)	"664"#number
+#define alt_end_marker		"663"
+#define alt_slen		"662b-661b"
+#define alt_pad_len		alt_end_marker"b-662b"
+#define alt_total_slen		alt_end_marker"b-661b"
+#define alt_rlen(num)		e_replacement(num)"f-"b_replacement(num)"f"
 
-#define alt_slen "662b-661b"
-#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f"
+#define __OLDINSTR(oldinstr, num)					\
+	"661:\n\t" oldinstr "\n662:\n"					\
+	".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * "		\
+		"((" alt_rlen(num) ")-(" alt_slen ")),0x90\n"
 
-#define ALTINSTR_ENTRY(feature, number)					      \
+#define OLDINSTR(oldinstr, num)						\
+	__OLDINSTR(oldinstr, num)					\
+	alt_end_marker ":\n"
+
+/*
+ * Emit additional NOP padding when the second replacement alternative
+ * is longer than the first one (already padded for by __OLDINSTR).
+ */
+#define OLDINSTR_2(oldinstr, num1, num2)					\
+	__OLDINSTR(oldinstr, num1)						\
+	".skip -(((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)) > 0) * " \
+		"((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)),0x90\n"  \
+	alt_end_marker ":\n"
+
+#define ALTINSTR_ENTRY(feature, num)					      \
 	" .long 661b - .\n"				/* label           */ \
-	" .long " b_replacement(number)"f - .\n"	/* new instruction */ \
+	" .long " b_replacement(num)"f - .\n"		/* new instruction */ \
 	" .word " __stringify(feature) "\n"		/* feature bit     */ \
-	" .byte " alt_slen "\n"				/* source len      */ \
-	" .byte " alt_rlen(number) "\n"			/* replacement len */
+	" .byte " alt_total_slen "\n"			/* source len      */ \
+	" .byte " alt_rlen(num) "\n"			/* replacement len */ \
+	" .byte " alt_pad_len "\n"			/* pad len */
 
-#define DISCARD_ENTRY(number)				/* rlen <= slen */    \
-	" .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n"
-
-#define ALTINSTR_REPLACEMENT(newinstr, feature, number)	/* replacement */     \
-	b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t"
+#define ALTINSTR_REPLACEMENT(newinstr, feature, num)	/* replacement */     \
+	b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t"
 
 /* alternative assembly primitive: */
 #define ALTERNATIVE(oldinstr, newinstr, feature)			\
-	OLDINSTR(oldinstr)						\
+	OLDINSTR(oldinstr, 1)						\
 	".pushsection .altinstructions,\"a\"\n"				\
 	ALTINSTR_ENTRY(feature, 1)					\
 	".popsection\n"							\
-	".pushsection .discard,\"aw\",@progbits\n"			\
-	DISCARD_ENTRY(1)						\
-	".popsection\n"							\
 	".pushsection .altinstr_replacement, \"ax\"\n"			\
 	ALTINSTR_REPLACEMENT(newinstr, feature, 1)			\
 	".popsection"
 
 #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
-	OLDINSTR(oldinstr)						\
+	OLDINSTR_2(oldinstr, 1, 2)					\
 	".pushsection .altinstructions,\"a\"\n"				\
 	ALTINSTR_ENTRY(feature1, 1)					\
 	ALTINSTR_ENTRY(feature2, 2)					\
 	".popsection\n"							\
-	".pushsection .discard,\"aw\",@progbits\n"			\
-	DISCARD_ENTRY(1)						\
-	DISCARD_ENTRY(2)						\
-	".popsection\n"							\
 	".pushsection .altinstr_replacement, \"ax\"\n"			\
 	ALTINSTR_REPLACEMENT(newinstr1, feature1, 1)			\
 	ALTINSTR_REPLACEMENT(newinstr2, feature2, 2)			\
@@ -146,6 +158,9 @@
 #define alternative(oldinstr, newinstr, feature)			\
 	asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
 
+#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
+	asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
+
 /*
  * Alternative inline assembly with input.
  *
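
With the added padlen byte and the __packed attribute, each .altinstructions record is 13 bytes (4 + 4 + 2 + 1 + 1 + 1), matching the six directives emitted by altinstruction_entry and ALTINSTR_ENTRY. A compile-time sketch of that layout, using a local copy of the structure rather than the real header:

	#include <stdint.h>

	/* Local copy of the record layout, for illustration only. */
	struct alt_instr_sketch {
		int32_t  instr_offset;		/* .long 661b - .         */
		int32_t  repl_offset;		/* .long replacement - .  */
		uint16_t cpuid;			/* .word feature          */
		uint8_t  instrlen;		/* .byte total source len */
		uint8_t  replacementlen;	/* .byte replacement len  */
		uint8_t  padlen;		/* .byte pad len          */
	} __attribute__((packed));

	/*
	 * Without __packed the compiler would round the struct up to 16 bytes
	 * and the hand-rolled .altinstructions records would no longer match.
	 */
	_Static_assert(sizeof(struct alt_instr_sketch) == 13,
		       "packed alt_instr records are 13 bytes");

	int main(void) { return 0; }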
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index efc3b22..8118e94 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -91,7 +91,7 @@
 {
 	volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
 
-	alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP,
+	alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
 		       ASM_OUTPUT2("=r" (v), "=m" (*addr)),
 		       ASM_OUTPUT2("0" (v), "m" (*addr)));
 }
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 2ab1eb3..959e45b 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -95,13 +95,11 @@
  * Stop RDTSC speculation. This is needed when you need to use RDTSC
  * (or get_cycles or vread that possibly accesses the TSC) in a defined
  * code region.
- *
- * (Could use an alternative three way for this if there was one.)
  */
 static __always_inline void rdtsc_barrier(void)
 {
-	alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
-	alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+			  "lfence", X86_FEATURE_LFENCE_RDTSC);
 }
 
 #endif /* _ASM_X86_BARRIER_H */
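
The rdtsc_barrier() rewrite folds two alternative() sites into one alternative_2() site: the default body is now empty (the macros above pad it with NOPs as needed), and either mfence or lfence is patched in according to the CPU's feature bits. A rough C-level stand-in for that choice; the enum and helper below are illustrative, not kernel APIs:

	/*
	 * Conceptual model of the patched-at-boot choice made by alternative_2();
	 * in practice a CPU sets at most one of the two RDTSC-fence feature bits.
	 */
	enum rdtsc_fence { FENCE_NONE, FENCE_MFENCE, FENCE_LFENCE };

	static enum rdtsc_fence pick_rdtsc_fence(int cpu_has_mfence_rdtsc,
						 int cpu_has_lfence_rdtsc)
	{
		if (cpu_has_lfence_rdtsc)
			return FENCE_LFENCE;
		if (cpu_has_mfence_rdtsc)
			return FENCE_MFENCE;
		return FENCE_NONE;	/* oldinstr is empty: no barrier patched in */
	}

	int main(void) { return pick_rdtsc_fence(0, 1) == FENCE_LFENCE ? 0 : 1; }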
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 1f1297b..4b5f7bf 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -55,143 +55,148 @@
  * for assembly code:
  */
 
-#define R15		  0
-#define R14		  8
-#define R13		 16
-#define R12		 24
-#define RBP		 32
-#define RBX		 40
+/* The layout forms the "struct pt_regs" on the stack: */
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
+#define R15		0*8
+#define R14		1*8
+#define R13		2*8
+#define R12		3*8
+#define RBP		4*8
+#define RBX		5*8
+/* These regs are callee-clobbered. Always saved on kernel entry. */
+#define R11		6*8
+#define R10		7*8
+#define R9		8*8
+#define R8		9*8
+#define RAX		10*8
+#define RCX		11*8
+#define RDX		12*8
+#define RSI		13*8
+#define RDI		14*8
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+#define ORIG_RAX	15*8
+/* Return frame for iretq */
+#define RIP		16*8
+#define CS		17*8
+#define EFLAGS		18*8
+#define RSP		19*8
+#define SS		20*8
 
-/* arguments: interrupts/non tracing syscalls only save up to here: */
-#define R11		 48
-#define R10		 56
-#define R9		 64
-#define R8		 72
-#define RAX		 80
-#define RCX		 88
-#define RDX		 96
-#define RSI		104
-#define RDI		112
-#define ORIG_RAX	120       /* + error_code */
-/* end of arguments */
+#define SIZEOF_PTREGS	21*8
 
-/* cpu exception frame or undefined in case of fast syscall: */
-#define RIP		128
-#define CS		136
-#define EFLAGS		144
-#define RSP		152
-#define SS		160
-
-#define ARGOFFSET	R11
-
-	.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
-	subq  $9*8+\addskip, %rsp
-	CFI_ADJUST_CFA_OFFSET	9*8+\addskip
-	movq_cfi rdi, 8*8
-	movq_cfi rsi, 7*8
-	movq_cfi rdx, 6*8
-
-	.if \save_rcx
-	movq_cfi rcx, 5*8
-	.endif
-
-	.if \rax_enosys
-	movq $-ENOSYS, 4*8(%rsp)
-	.else
-	movq_cfi rax, 4*8
-	.endif
-
-	.if \save_r891011
-	movq_cfi r8,  3*8
-	movq_cfi r9,  2*8
-	movq_cfi r10, 1*8
-	movq_cfi r11, 0*8
-	.endif
-
+	.macro ALLOC_PT_GPREGS_ON_STACK addskip=0
+	subq	$15*8+\addskip, %rsp
+	CFI_ADJUST_CFA_OFFSET 15*8+\addskip
 	.endm
 
-#define ARG_SKIP	(9*8)
+	.macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
+	.if \r11
+	movq_cfi r11, 6*8+\offset
+	.endif
+	.if \r8910
+	movq_cfi r10, 7*8+\offset
+	movq_cfi r9,  8*8+\offset
+	movq_cfi r8,  9*8+\offset
+	.endif
+	.if \rax
+	movq_cfi rax, 10*8+\offset
+	.endif
+	.if \rcx
+	movq_cfi rcx, 11*8+\offset
+	.endif
+	movq_cfi rdx, 12*8+\offset
+	movq_cfi rsi, 13*8+\offset
+	movq_cfi rdi, 14*8+\offset
+	.endm
+	.macro SAVE_C_REGS offset=0
+	SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
+	.endm
+	.macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0
+	SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1
+	.endm
+	.macro SAVE_C_REGS_EXCEPT_R891011
+	SAVE_C_REGS_HELPER 0, 1, 1, 0, 0
+	.endm
+	.macro SAVE_C_REGS_EXCEPT_RCX_R891011
+	SAVE_C_REGS_HELPER 0, 1, 0, 0, 0
+	.endm
+	.macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11
+	SAVE_C_REGS_HELPER 0, 0, 0, 1, 0
+	.endm
 
-	.macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \
-			    rstor_r8910=1, rstor_rdx=1
+	.macro SAVE_EXTRA_REGS offset=0
+	movq_cfi r15, 0*8+\offset
+	movq_cfi r14, 1*8+\offset
+	movq_cfi r13, 2*8+\offset
+	movq_cfi r12, 3*8+\offset
+	movq_cfi rbp, 4*8+\offset
+	movq_cfi rbx, 5*8+\offset
+	.endm
+	.macro SAVE_EXTRA_REGS_RBP offset=0
+	movq_cfi rbp, 4*8+\offset
+	.endm
+
+	.macro RESTORE_EXTRA_REGS offset=0
+	movq_cfi_restore 0*8+\offset, r15
+	movq_cfi_restore 1*8+\offset, r14
+	movq_cfi_restore 2*8+\offset, r13
+	movq_cfi_restore 3*8+\offset, r12
+	movq_cfi_restore 4*8+\offset, rbp
+	movq_cfi_restore 5*8+\offset, rbx
+	.endm
+
+	.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
 	.if \rstor_r11
-	movq_cfi_restore 0*8, r11
+	movq_cfi_restore 6*8, r11
 	.endif
-
 	.if \rstor_r8910
-	movq_cfi_restore 1*8, r10
-	movq_cfi_restore 2*8, r9
-	movq_cfi_restore 3*8, r8
+	movq_cfi_restore 7*8, r10
+	movq_cfi_restore 8*8, r9
+	movq_cfi_restore 9*8, r8
 	.endif
-
 	.if \rstor_rax
-	movq_cfi_restore 4*8, rax
+	movq_cfi_restore 10*8, rax
 	.endif
-
 	.if \rstor_rcx
-	movq_cfi_restore 5*8, rcx
+	movq_cfi_restore 11*8, rcx
 	.endif
-
 	.if \rstor_rdx
-	movq_cfi_restore 6*8, rdx
+	movq_cfi_restore 12*8, rdx
 	.endif
-
-	movq_cfi_restore 7*8, rsi
-	movq_cfi_restore 8*8, rdi
-
-	.if ARG_SKIP+\addskip > 0
-	addq $ARG_SKIP+\addskip, %rsp
-	CFI_ADJUST_CFA_OFFSET	-(ARG_SKIP+\addskip)
-	.endif
+	movq_cfi_restore 13*8, rsi
+	movq_cfi_restore 14*8, rdi
+	.endm
+	.macro RESTORE_C_REGS
+	RESTORE_C_REGS_HELPER 1,1,1,1,1
+	.endm
+	.macro RESTORE_C_REGS_EXCEPT_RAX
+	RESTORE_C_REGS_HELPER 0,1,1,1,1
+	.endm
+	.macro RESTORE_C_REGS_EXCEPT_RCX
+	RESTORE_C_REGS_HELPER 1,0,1,1,1
+	.endm
+	.macro RESTORE_C_REGS_EXCEPT_R11
+	RESTORE_C_REGS_HELPER 1,1,0,1,1
+	.endm
+	.macro RESTORE_C_REGS_EXCEPT_RCX_R11
+	RESTORE_C_REGS_HELPER 1,0,0,1,1
+	.endm
+	.macro RESTORE_RSI_RDI
+	RESTORE_C_REGS_HELPER 0,0,0,0,0
+	.endm
+	.macro RESTORE_RSI_RDI_RDX
+	RESTORE_C_REGS_HELPER 0,0,0,0,1
 	.endm
 
-	.macro LOAD_ARGS offset, skiprax=0
-	movq \offset(%rsp),    %r11
-	movq \offset+8(%rsp),  %r10
-	movq \offset+16(%rsp), %r9
-	movq \offset+24(%rsp), %r8
-	movq \offset+40(%rsp), %rcx
-	movq \offset+48(%rsp), %rdx
-	movq \offset+56(%rsp), %rsi
-	movq \offset+64(%rsp), %rdi
-	.if \skiprax
-	.else
-	movq \offset+72(%rsp), %rax
-	.endif
-	.endm
-
-#define REST_SKIP	(6*8)
-
-	.macro SAVE_REST
-	subq $REST_SKIP, %rsp
-	CFI_ADJUST_CFA_OFFSET	REST_SKIP
-	movq_cfi rbx, 5*8
-	movq_cfi rbp, 4*8
-	movq_cfi r12, 3*8
-	movq_cfi r13, 2*8
-	movq_cfi r14, 1*8
-	movq_cfi r15, 0*8
-	.endm
-
-	.macro RESTORE_REST
-	movq_cfi_restore 0*8, r15
-	movq_cfi_restore 1*8, r14
-	movq_cfi_restore 2*8, r13
-	movq_cfi_restore 3*8, r12
-	movq_cfi_restore 4*8, rbp
-	movq_cfi_restore 5*8, rbx
-	addq $REST_SKIP, %rsp
-	CFI_ADJUST_CFA_OFFSET	-(REST_SKIP)
-	.endm
-
-	.macro SAVE_ALL
-	SAVE_ARGS
-	SAVE_REST
-	.endm
-
-	.macro RESTORE_ALL addskip=0
-	RESTORE_REST
-	RESTORE_ARGS 1, \addskip
+	.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
+	addq $15*8+\addskip, %rsp
+	CFI_ADJUST_CFA_OFFSET -(15*8+\addskip)
 	.endm
 
 	.macro icebp
@@ -210,37 +215,23 @@
  */
 
 	.macro SAVE_ALL
-	pushl_cfi %eax
-	CFI_REL_OFFSET eax, 0
-	pushl_cfi %ebp
-	CFI_REL_OFFSET ebp, 0
-	pushl_cfi %edi
-	CFI_REL_OFFSET edi, 0
-	pushl_cfi %esi
-	CFI_REL_OFFSET esi, 0
-	pushl_cfi %edx
-	CFI_REL_OFFSET edx, 0
-	pushl_cfi %ecx
-	CFI_REL_OFFSET ecx, 0
-	pushl_cfi %ebx
-	CFI_REL_OFFSET ebx, 0
+	pushl_cfi_reg eax
+	pushl_cfi_reg ebp
+	pushl_cfi_reg edi
+	pushl_cfi_reg esi
+	pushl_cfi_reg edx
+	pushl_cfi_reg ecx
+	pushl_cfi_reg ebx
 	.endm
 
 	.macro RESTORE_ALL
-	popl_cfi %ebx
-	CFI_RESTORE ebx
-	popl_cfi %ecx
-	CFI_RESTORE ecx
-	popl_cfi %edx
-	CFI_RESTORE edx
-	popl_cfi %esi
-	CFI_RESTORE esi
-	popl_cfi %edi
-	CFI_RESTORE edi
-	popl_cfi %ebp
-	CFI_RESTORE ebp
-	popl_cfi %eax
-	CFI_RESTORE eax
+	popl_cfi_reg ebx
+	popl_cfi_reg ecx
+	popl_cfi_reg edx
+	popl_cfi_reg esi
+	popl_cfi_reg edi
+	popl_cfi_reg ebp
+	popl_cfi_reg eax
 	.endm
 
 #endif /* CONFIG_X86_64 */
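
The renumbered R15..SS constants are plain byte offsets into the x86-64 "struct pt_regs" as laid out in the ptrace.h hunk later in this patch, with SIZEOF_PTREGS covering the whole frame. A compile-time cross-check against a local copy of that layout (assuming an LP64 build, i.e. 8-byte unsigned long):

	#include <stddef.h>

	/* Field order copied from the x86-64 "struct pt_regs" in this patch. */
	struct pt_regs_sketch {
		unsigned long r15, r14, r13, r12, bp, bx;
		unsigned long r11, r10, r9, r8, ax, cx, dx, si, di;
		unsigned long orig_ax;
		unsigned long ip, cs, flags, sp, ss;
	};

	_Static_assert(offsetof(struct pt_regs_sketch, r11)     ==  6 * 8, "R11");
	_Static_assert(offsetof(struct pt_regs_sketch, ax)      == 10 * 8, "RAX");
	_Static_assert(offsetof(struct pt_regs_sketch, orig_ax) == 15 * 8, "ORIG_RAX");
	_Static_assert(offsetof(struct pt_regs_sketch, ip)      == 16 * 8, "RIP");
	_Static_assert(sizeof(struct pt_regs_sketch)            == 21 * 8, "SIZEOF_PTREGS");

	int main(void) { return 0; }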
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 59c6c40..acdee09 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -301,7 +301,7 @@
 		sp = task_pt_regs(current)->sp;
 	} else {
 		/* -128 for the x32 ABI redzone */
-		sp = this_cpu_read(old_rsp) - 128;
+		sp = task_pt_regs(current)->sp - 128;
 	}
 
 	return (void __user *)round_down(sp - len, 16);
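
The replacement line reads the saved user stack pointer from pt_regs instead of the old per-CPU old_rsp copy; the surrounding math is unchanged: step below the 128-byte x32 redzone, subtract the frame length, and round down to 16 bytes for ABI alignment. A sketch of that computation with made-up inputs:

	#include <assert.h>
	#include <stddef.h>
	#include <stdint.h>

	static uintptr_t round_down_sketch(uintptr_t x, uintptr_t align)
	{
		return x & ~(align - 1);
	}

	static uintptr_t x32_sigframe_sp(uintptr_t user_sp, size_t len)
	{
		uintptr_t sp = user_sp - 128;		/* stay below the x32 redzone */

		return round_down_sketch(sp - len, 16);	/* 16-byte ABI alignment */
	}

	int main(void)
	{
		assert(x32_sigframe_sp(0x10000, 0x1f3) == 0xfd80);	/* rounded down */
		assert(x32_sigframe_sp(0x10000, 0x1f3) % 16 == 0);
		return 0;
	}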
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 90a5485..0f7a5a1 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -231,6 +231,7 @@
 #define X86_FEATURE_RDSEED	( 9*32+18) /* The RDSEED instruction */
 #define X86_FEATURE_ADX		( 9*32+19) /* The ADCX and ADOX instructions */
 #define X86_FEATURE_SMAP	( 9*32+20) /* Supervisor Mode Access Prevention */
+#define X86_FEATURE_PCOMMIT	( 9*32+22) /* PCOMMIT instruction */
 #define X86_FEATURE_CLFLUSHOPT	( 9*32+23) /* CLFLUSHOPT instruction */
 #define X86_FEATURE_AVX512PF	( 9*32+26) /* AVX-512 Prefetch */
 #define X86_FEATURE_AVX512ER	( 9*32+27) /* AVX-512 Exponential and Reciprocal */
@@ -418,6 +419,7 @@
 			 " .word %P0\n"		/* 1: do replace */
 			 " .byte 2b - 1b\n"	/* source len */
 			 " .byte 0\n"		/* replacement len */
+			 " .byte 0\n"		/* pad len */
 			 ".previous\n"
 			 /* skipping size check since replacement size = 0 */
 			 : : "i" (X86_FEATURE_ALWAYS) : : t_warn);
@@ -432,6 +434,7 @@
 			 " .word %P0\n"		/* feature bit */
 			 " .byte 2b - 1b\n"	/* source len */
 			 " .byte 0\n"		/* replacement len */
+			 " .byte 0\n"		/* pad len */
 			 ".previous\n"
 			 /* skipping size check since replacement size = 0 */
 			 : : "i" (bit) : : t_no);
@@ -457,6 +460,7 @@
 			     " .word %P1\n"		/* feature bit */
 			     " .byte 2b - 1b\n"		/* source len */
 			     " .byte 4f - 3f\n"		/* replacement len */
+			     " .byte 0\n"		/* pad len */
 			     ".previous\n"
 			     ".section .discard,\"aw\",@progbits\n"
 			     " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -483,31 +487,30 @@
 static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
 {
 #ifdef CC_HAVE_ASM_GOTO
-/*
- * We need to spell the jumps to the compiler because, depending on the offset,
- * the replacement jump can be bigger than the original jump, and this we cannot
- * have. Thus, we force the jump to the widest, 4-byte, signed relative
- * offset even though the last would often fit in less bytes.
- */
-		asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n"
+		asm_volatile_goto("1: jmp %l[t_dynamic]\n"
 			 "2:\n"
+			 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
+			         "((5f-4f) - (2b-1b)),0x90\n"
+			 "3:\n"
 			 ".section .altinstructions,\"a\"\n"
 			 " .long 1b - .\n"		/* src offset */
-			 " .long 3f - .\n"		/* repl offset */
+			 " .long 4f - .\n"		/* repl offset */
 			 " .word %P1\n"			/* always replace */
-			 " .byte 2b - 1b\n"		/* src len */
-			 " .byte 4f - 3f\n"		/* repl len */
+			 " .byte 3b - 1b\n"		/* src len */
+			 " .byte 5f - 4f\n"		/* repl len */
+			 " .byte 3b - 2b\n"		/* pad len */
 			 ".previous\n"
 			 ".section .altinstr_replacement,\"ax\"\n"
-			 "3: .byte 0xe9\n .long %l[t_no] - 2b\n"
-			 "4:\n"
+			 "4: jmp %l[t_no]\n"
+			 "5:\n"
 			 ".previous\n"
 			 ".section .altinstructions,\"a\"\n"
 			 " .long 1b - .\n"		/* src offset */
 			 " .long 0\n"			/* no replacement */
 			 " .word %P0\n"			/* feature bit */
-			 " .byte 2b - 1b\n"		/* src len */
+			 " .byte 3b - 1b\n"		/* src len */
 			 " .byte 0\n"			/* repl len */
+			 " .byte 0\n"			/* pad len */
 			 ".previous\n"
 			 : : "i" (bit), "i" (X86_FEATURE_ALWAYS)
 			 : : t_dynamic, t_no);
@@ -527,6 +530,7 @@
 			     " .word %P2\n"		/* always replace */
 			     " .byte 2b - 1b\n"		/* source len */
 			     " .byte 4f - 3f\n"		/* replacement len */
+			     " .byte 0\n"		/* pad len */
 			     ".previous\n"
 			     ".section .discard,\"aw\",@progbits\n"
 			     " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -541,6 +545,7 @@
 			     " .word %P1\n"		/* feature bit */
 			     " .byte 4b - 3b\n"		/* src len */
 			     " .byte 6f - 5f\n"		/* repl len */
+			     " .byte 0\n"		/* pad len */
 			     ".previous\n"
 			     ".section .discard,\"aw\",@progbits\n"
 			     " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index a94b82e..a0bf89f 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -376,11 +376,16 @@
  * Pentium F0 0F bugfix can have resulted in the mapped
  * IDT being write-protected.
  */
-#define set_intr_gate(n, addr)						\
+#define set_intr_gate_notrace(n, addr)					\
 	do {								\
 		BUG_ON((unsigned)n > 0xFF);				\
 		_set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0,	\
 			  __KERNEL_CS);					\
+	} while (0)
+
+#define set_intr_gate(n, addr)						\
+	do {								\
+		set_intr_gate_notrace(n, addr);				\
 		_trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\
 				0, 0, __KERNEL_CS);			\
 	} while (0)
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index f6f1598..de1cdaf 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -86,11 +86,23 @@
 	CFI_ADJUST_CFA_OFFSET 8
 	.endm
 
+	.macro pushq_cfi_reg reg
+	pushq %\reg
+	CFI_ADJUST_CFA_OFFSET 8
+	CFI_REL_OFFSET \reg, 0
+	.endm
+
 	.macro popq_cfi reg
 	popq \reg
 	CFI_ADJUST_CFA_OFFSET -8
 	.endm
 
+	.macro popq_cfi_reg reg
+	popq %\reg
+	CFI_ADJUST_CFA_OFFSET -8
+	CFI_RESTORE \reg
+	.endm
+
 	.macro pushfq_cfi
 	pushfq
 	CFI_ADJUST_CFA_OFFSET 8
@@ -116,11 +128,23 @@
 	CFI_ADJUST_CFA_OFFSET 4
 	.endm
 
+	.macro pushl_cfi_reg reg
+	pushl %\reg
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET \reg, 0
+	.endm
+
 	.macro popl_cfi reg
 	popl \reg
 	CFI_ADJUST_CFA_OFFSET -4
 	.endm
 
+	.macro popl_cfi_reg reg
+	popl %\reg
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE \reg
+	.endm
+
 	.macro pushfl_cfi
 	pushfl
 	CFI_ADJUST_CFA_OFFSET 4
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 47f29b1..e7814b7 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -69,7 +69,7 @@
 	const insn_byte_t *next_byte;
 };
 
-#define MAX_INSN_SIZE	16
+#define MAX_INSN_SIZE	15
 
 #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
 #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 0a8b519..021bee9 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -171,9 +171,9 @@
 #define ARCH_LOCKDEP_SYS_EXIT_IRQ	\
 	TRACE_IRQS_ON; \
 	sti; \
-	SAVE_REST; \
+	SAVE_EXTRA_REGS; \
 	LOCKDEP_SYS_EXIT; \
-	RESTORE_REST; \
+	RESTORE_EXTRA_REGS; \
 	cli; \
 	TRACE_IRQS_OFF;
 
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ec1c935..5720997 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -209,9 +209,24 @@
 	unsigned short		back_link, __blh;
 	unsigned long		sp0;
 	unsigned short		ss0, __ss0h;
-	unsigned long		sp1;
-	/* ss1 caches MSR_IA32_SYSENTER_CS: */
-	unsigned short		ss1, __ss1h;
+
+	/*
+	 * We don't use ring 1, so sp1 and ss1 are convenient scratch
+	 * spaces in the same cacheline as sp0.  We use them to cache
+	 * some MSR values to avoid unnecessary wrmsr instructions.
+	 *
+	 * We use SYSENTER_ESP to find sp0 and for the NMI emergency
+	 * stack, but we need to context switch it because we do
+	 * horrible things to the kernel stack in vm86 mode.
+	 *
+	 * We use SYSENTER_CS to disable sysenter in vm86 mode to avoid
+	 * corrupting the stack if we went through the sysenter path
+	 * from vm86 mode.
+	 */
+	unsigned long		sp1;	/* MSR_IA32_SYSENTER_ESP */
+	unsigned short		ss1;	/* MSR_IA32_SYSENTER_CS */
+
+	unsigned short		__ss1h;
 	unsigned long		sp2;
 	unsigned short		ss2, __ss2h;
 	unsigned long		__cr3;
@@ -276,13 +291,17 @@
 	unsigned long		io_bitmap[IO_BITMAP_LONGS + 1];
 
 	/*
-	 * .. and then another 0x100 bytes for the emergency kernel stack:
+	 * Space for the temporary SYSENTER stack:
 	 */
-	unsigned long		stack[64];
+	unsigned long		SYSENTER_stack[64];
 
 } ____cacheline_aligned;
 
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+
+#ifdef CONFIG_X86_32
+DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
+#endif
 
 /*
  * Save the original ist values for checking stack pointers during debugging
@@ -474,7 +493,6 @@
 #ifdef CONFIG_X86_32
 	unsigned long		sysenter_cs;
 #else
-	unsigned long		usersp;	/* Copy from PDA */
 	unsigned short		es;
 	unsigned short		ds;
 	unsigned short		fsindex;
@@ -564,6 +582,16 @@
 #endif
 }
 
+static inline unsigned long current_top_of_stack(void)
+{
+#ifdef CONFIG_X86_64
+	return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
+#else
+	/* sp0 on x86_32 is special in and around vm86 mode. */
+	return this_cpu_read_stable(cpu_current_top_of_stack);
+#endif
+}
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
@@ -761,10 +789,10 @@
 #define ARCH_HAS_SPINLOCK_PREFETCH
 
 #ifdef CONFIG_X86_32
-# define BASE_PREFETCH		ASM_NOP4
+# define BASE_PREFETCH		""
 # define ARCH_HAS_PREFETCH
 #else
-# define BASE_PREFETCH		"prefetcht0 (%1)"
+# define BASE_PREFETCH		"prefetcht0 %P1"
 #endif
 
 /*
@@ -775,10 +803,9 @@
  */
 static inline void prefetch(const void *x)
 {
-	alternative_input(BASE_PREFETCH,
-			  "prefetchnta (%1)",
+	alternative_input(BASE_PREFETCH, "prefetchnta %P1",
 			  X86_FEATURE_XMM,
-			  "r" (x));
+			  "m" (*(const char *)x));
 }
 
 /*
@@ -788,10 +815,9 @@
  */
 static inline void prefetchw(const void *x)
 {
-	alternative_input(BASE_PREFETCH,
-			  "prefetchw (%1)",
-			  X86_FEATURE_3DNOW,
-			  "r" (x));
+	alternative_input(BASE_PREFETCH, "prefetchw %P1",
+			  X86_FEATURE_3DNOWPREFETCH,
+			  "m" (*(const char *)x));
 }
 
 static inline void spin_lock_prefetch(const void *x)
@@ -799,6 +825,9 @@
 	prefetchw(x);
 }
 
+#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
+			   TOP_OF_KERNEL_STACK_PADDING)
+
 #ifdef CONFIG_X86_32
 /*
  * User space process size: 3GB (default).
@@ -809,39 +838,16 @@
 #define STACK_TOP_MAX		STACK_TOP
 
 #define INIT_THREAD  {							  \
-	.sp0			= sizeof(init_stack) + (long)&init_stack, \
+	.sp0			= TOP_OF_INIT_STACK,			  \
 	.vm86_info		= NULL,					  \
 	.sysenter_cs		= __KERNEL_CS,				  \
 	.io_bitmap_ptr		= NULL,					  \
 }
 
-/*
- * Note that the .io_bitmap member must be extra-big. This is because
- * the CPU will access an additional byte beyond the end of the IO
- * permission bitmap. The extra byte must be all 1 bits, and must
- * be within the limit.
- */
-#define INIT_TSS  {							  \
-	.x86_tss = {							  \
-		.sp0		= sizeof(init_stack) + (long)&init_stack, \
-		.ss0		= __KERNEL_DS,				  \
-		.ss1		= __KERNEL_CS,				  \
-		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,		  \
-	 },								  \
-	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },	  \
-}
-
 extern unsigned long thread_saved_pc(struct task_struct *tsk);
 
-#define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
-#define KSTK_TOP(info)                                                 \
-({                                                                     \
-       unsigned long *__ptr = (unsigned long *)(info);                 \
-       (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
-})
-
 /*
- * The below -8 is to reserve 8 bytes on top of the ring0 stack.
+ * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
  * This is necessary to guarantee that the entire "struct pt_regs"
  * is accessible even if the CPU haven't stored the SS/ESP registers
  * on the stack (interrupt gate does not save these registers
@@ -850,11 +856,11 @@
  * "struct pt_regs" is possible, but they may contain the
  * completely wrong values.
  */
-#define task_pt_regs(task)                                             \
-({                                                                     \
-       struct pt_regs *__regs__;                                       \
-       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
-       __regs__ - 1;                                                   \
+#define task_pt_regs(task) \
+({									\
+	unsigned long __ptr = (unsigned long)task_stack_page(task);	\
+	__ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;		\
+	((struct pt_regs *)__ptr) - 1;					\
 })
 
 #define KSTK_ESP(task)		(task_pt_regs(task)->sp)
@@ -886,11 +892,7 @@
 #define STACK_TOP_MAX		TASK_SIZE_MAX
 
 #define INIT_THREAD  { \
-	.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
-}
-
-#define INIT_TSS  { \
-	.x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+	.sp0 = TOP_OF_INIT_STACK \
 }
 
 /*
@@ -902,11 +904,6 @@
 #define task_pt_regs(tsk)	((struct pt_regs *)(tsk)->thread.sp0 - 1)
 extern unsigned long KSTK_ESP(struct task_struct *task);
 
-/*
- * User space RSP while inside the SYSCALL fast path
- */
-DECLARE_PER_CPU(unsigned long, old_rsp);
-
 #endif /* CONFIG_X86_64 */
 
 extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
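
The new task_pt_regs() keeps the old meaning: the pt_regs frame sits at the very top of the task's kernel stack, just below TOP_OF_KERNEL_STACK_PADDING. A small sketch of the pointer arithmetic with placeholder sizes (the real THREAD_SIZE and padding are config-dependent):

	#include <assert.h>
	#include <stdint.h>

	#define THREAD_SIZE_SKETCH		(16 * 1024)	/* placeholder, not the real value */
	#define TOP_OF_KERNEL_STACK_PADDING	8		/* placeholder, config-dependent */
	#define SIZEOF_PTREGS			(21 * 8)

	/*
	 * Mirrors: __ptr = stack_page; __ptr += THREAD_SIZE - PADDING;
	 *          return ((struct pt_regs *)__ptr) - 1;
	 */
	static uintptr_t task_pt_regs_sketch(uintptr_t stack_page)
	{
		uintptr_t top = stack_page + THREAD_SIZE_SKETCH
				- TOP_OF_KERNEL_STACK_PADDING;

		return top - SIZEOF_PTREGS;
	}

	int main(void)
	{
		uintptr_t page = 0x100000;
		uintptr_t regs = task_pt_regs_sketch(page);

		/* The frame ends exactly at the padded top of the stack. */
		assert(regs + SIZEOF_PTREGS ==
		       page + THREAD_SIZE_SKETCH - TOP_OF_KERNEL_STACK_PADDING);
		return 0;
	}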
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 86fc2bb..d20bae2 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -31,13 +31,17 @@
 #else /* __i386__ */
 
 struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
 	unsigned long r15;
 	unsigned long r14;
 	unsigned long r13;
 	unsigned long r12;
 	unsigned long bp;
 	unsigned long bx;
-/* arguments: non interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
 	unsigned long r11;
 	unsigned long r10;
 	unsigned long r9;
@@ -47,9 +51,12 @@
 	unsigned long dx;
 	unsigned long si;
 	unsigned long di;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
 	unsigned long orig_ax;
-/* end of arguments */
-/* cpu exception frame or undefined */
+/* Return frame for iretq */
 	unsigned long ip;
 	unsigned long cs;
 	unsigned long flags;
@@ -89,11 +96,13 @@
 }
 
 /*
- * user_mode_vm(regs) determines whether a register set came from user mode.
- * This is true if V8086 mode was enabled OR if the register set was from
- * protected mode with RPL-3 CS value.  This tricky test checks that with
- * one comparison.  Many places in the kernel can bypass this full check
- * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
+ * user_mode(regs) determines whether a register set came from user
+ * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
+ * register set was from protected mode with RPL-3 CS value.  This
+ * tricky test checks that with one comparison.
+ *
+ * On x86_64, vm86 mode is mercifully nonexistent, and we don't need
+ * the extra check.
  */
 static inline int user_mode(struct pt_regs *regs)
 {
@@ -104,14 +113,21 @@
 #endif
 }
 
-static inline int user_mode_vm(struct pt_regs *regs)
+/*
+ * This is the fastest way to check whether regs come from user space.
+ * It is unsafe if regs might come from vm86 mode, though -- in vm86
+ * mode, all bits of CS and SS are completely under the user's control.
+ * The CPU considers vm86 mode to be CPL 3 regardless of CS and SS.
+ *
+ * Do NOT use this function unless you have already ruled out the
+ * possibility that regs came from vm86 mode.
+ *
+ * We check for RPL != 0 instead of RPL == 3 because we don't use rings
+ * 1 or 2 and this is more efficient.
+ */
+static inline int user_mode_ignore_vm86(struct pt_regs *regs)
 {
-#ifdef CONFIG_X86_32
-	return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >=
-		USER_RPL;
-#else
-	return user_mode(regs);
-#endif
+	return (regs->cs & SEGMENT_RPL_MASK) != 0;
 }
 
 static inline int v8086_mode(struct pt_regs *regs)
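
The new comments boil down to one observation: outside vm86 mode the low two bits of CS (the RPL) are 0 for kernel frames and 3 for user frames, so "RPL != 0" is a sufficient and cheap user-mode test, while the 32-bit user_mode() must also treat VM-flagged frames as user mode. A tiny standalone illustration, with the mask values taken from the headers and everything else invented:

	#include <assert.h>

	#define SEGMENT_RPL_MASK	0x3
	#define USER_RPL		0x3
	#define X86_EFLAGS_VM		0x00020000UL	/* "virtual 8086 mode" flag */

	/* The cheap check: safe only when vm86 has been ruled out. */
	static int user_mode_ignore_vm86_sketch(unsigned long cs)
	{
		return (cs & SEGMENT_RPL_MASK) != 0;
	}

	/* The 32-bit user_mode() check, which also catches vm86 frames. */
	static int user_mode_sketch(unsigned long cs, unsigned long flags)
	{
		return ((cs & SEGMENT_RPL_MASK) | (flags & X86_EFLAGS_VM)) >= USER_RPL;
	}

	int main(void)
	{
		assert(!user_mode_ignore_vm86_sketch(0x10));	/* kernel selector, RPL 0 */
		assert(user_mode_ignore_vm86_sketch(0x23));	/* user selector, RPL 3 */
		assert(user_mode_sketch(0x00, X86_EFLAGS_VM));	/* vm86: CS is untrustworthy */
		return 0;
	}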
@@ -138,12 +154,8 @@
 #endif
 }
 
-#define current_user_stack_pointer()	this_cpu_read(old_rsp)
-/* ia32 vs. x32 difference */
-#define compat_user_stack_pointer()	\
-	(test_thread_flag(TIF_IA32) 	\
-	 ? current_pt_regs()->sp 	\
-	 : this_cpu_read(old_rsp))
+#define current_user_stack_pointer()	current_pt_regs()->sp
+#define compat_user_stack_pointer()	current_pt_regs()->sp
 #endif
 
 #ifdef CONFIG_X86_32
@@ -248,7 +260,7 @@
  */
 #define arch_ptrace_stop_needed(code, info)				\
 ({									\
-	set_thread_flag(TIF_NOTIFY_RESUME);				\
+	force_iret();							\
 	false;								\
 })
 
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index db257a5..d394899 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -3,8 +3,10 @@
 
 #include <linux/const.h>
 
-/* Constructor for a conventional segment GDT (or LDT) entry */
-/* This is a macro so it can be used in initializers */
+/*
+ * Constructor for a conventional segment GDT (or LDT) entry.
+ * This is a macro so it can be used in initializers.
+ */
 #define GDT_ENTRY(flags, base, limit)			\
 	((((base)  & _AC(0xff000000,ULL)) << (56-24)) |	\
 	 (((flags) & _AC(0x0000f0ff,ULL)) << 40) |	\
@@ -12,198 +14,226 @@
 	 (((base)  & _AC(0x00ffffff,ULL)) << 16) |	\
 	 (((limit) & _AC(0x0000ffff,ULL))))
 
-/* Simple and small GDT entries for booting only */
+/* Simple and small GDT entries for booting only: */
 
 #define GDT_ENTRY_BOOT_CS	2
-#define __BOOT_CS		(GDT_ENTRY_BOOT_CS * 8)
+#define GDT_ENTRY_BOOT_DS	3
+#define GDT_ENTRY_BOOT_TSS	4
+#define __BOOT_CS		(GDT_ENTRY_BOOT_CS*8)
+#define __BOOT_DS		(GDT_ENTRY_BOOT_DS*8)
+#define __BOOT_TSS		(GDT_ENTRY_BOOT_TSS*8)
 
-#define GDT_ENTRY_BOOT_DS	(GDT_ENTRY_BOOT_CS + 1)
-#define __BOOT_DS		(GDT_ENTRY_BOOT_DS * 8)
+/*
+ * Bottom two bits of selector give the ring
+ * privilege level
+ */
+#define SEGMENT_RPL_MASK	0x3
 
-#define GDT_ENTRY_BOOT_TSS	(GDT_ENTRY_BOOT_CS + 2)
-#define __BOOT_TSS		(GDT_ENTRY_BOOT_TSS * 8)
+/* User mode is privilege level 3: */
+#define USER_RPL		0x3
 
-#define SEGMENT_RPL_MASK	0x3 /*
-				     * Bottom two bits of selector give the ring
-				     * privilege level
-				     */
-#define SEGMENT_TI_MASK		0x4 /* Bit 2 is table indicator (LDT/GDT) */
-#define USER_RPL		0x3 /* User mode is privilege level 3 */
-#define SEGMENT_LDT		0x4 /* LDT segment has TI set... */
-#define SEGMENT_GDT		0x0 /* ... GDT has it cleared */
+/* Bit 2 is Table Indicator (TI): selects between LDT or GDT */
+#define SEGMENT_TI_MASK		0x4
+/* LDT segment has TI set ... */
+#define SEGMENT_LDT		0x4
+/* ... GDT has it cleared */
+#define SEGMENT_GDT		0x0
 
 #ifdef CONFIG_X86_32
 /*
  * The layout of the per-CPU GDT under Linux:
  *
- *   0 - null
+ *   0 - null								<=== cacheline #1
  *   1 - reserved
  *   2 - reserved
  *   3 - reserved
  *
- *   4 - unused			<==== new cacheline
+ *   4 - unused								<=== cacheline #2
  *   5 - unused
  *
  *  ------- start of TLS (Thread-Local Storage) segments:
  *
  *   6 - TLS segment #1			[ glibc's TLS segment ]
  *   7 - TLS segment #2			[ Wine's %fs Win32 segment ]
- *   8 - TLS segment #3
+ *   8 - TLS segment #3							<=== cacheline #3
  *   9 - reserved
  *  10 - reserved
  *  11 - reserved
  *
  *  ------- start of kernel segments:
  *
- *  12 - kernel code segment		<==== new cacheline
+ *  12 - kernel code segment						<=== cacheline #4
  *  13 - kernel data segment
  *  14 - default user CS
  *  15 - default user DS
- *  16 - TSS
+ *  16 - TSS								<=== cacheline #5
  *  17 - LDT
  *  18 - PNPBIOS support (16->32 gate)
  *  19 - PNPBIOS support
- *  20 - PNPBIOS support
+ *  20 - PNPBIOS support						<=== cacheline #6
  *  21 - PNPBIOS support
  *  22 - PNPBIOS support
  *  23 - APM BIOS support
- *  24 - APM BIOS support
+ *  24 - APM BIOS support						<=== cacheline #7
  *  25 - APM BIOS support
  *
  *  26 - ESPFIX small SS
  *  27 - per-cpu			[ offset to per-cpu data area ]
- *  28 - stack_canary-20		[ for stack protector ]
+ *  28 - stack_canary-20		[ for stack protector ]		<=== cacheline #8
  *  29 - unused
  *  30 - unused
  *  31 - TSS for double fault handler
  */
-#define GDT_ENTRY_TLS_MIN	6
-#define GDT_ENTRY_TLS_MAX 	(GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
+#define GDT_ENTRY_TLS_MIN		6
+#define GDT_ENTRY_TLS_MAX 		(GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
 
+#define GDT_ENTRY_KERNEL_CS		12
+#define GDT_ENTRY_KERNEL_DS		13
 #define GDT_ENTRY_DEFAULT_USER_CS	14
-
 #define GDT_ENTRY_DEFAULT_USER_DS	15
+#define GDT_ENTRY_TSS			16
+#define GDT_ENTRY_LDT			17
+#define GDT_ENTRY_PNPBIOS_CS32		18
+#define GDT_ENTRY_PNPBIOS_CS16		19
+#define GDT_ENTRY_PNPBIOS_DS		20
+#define GDT_ENTRY_PNPBIOS_TS1		21
+#define GDT_ENTRY_PNPBIOS_TS2		22
+#define GDT_ENTRY_APMBIOS_BASE		23
 
-#define GDT_ENTRY_KERNEL_BASE		(12)
-
-#define GDT_ENTRY_KERNEL_CS		(GDT_ENTRY_KERNEL_BASE+0)
-
-#define GDT_ENTRY_KERNEL_DS		(GDT_ENTRY_KERNEL_BASE+1)
-
-#define GDT_ENTRY_TSS			(GDT_ENTRY_KERNEL_BASE+4)
-#define GDT_ENTRY_LDT			(GDT_ENTRY_KERNEL_BASE+5)
-
-#define GDT_ENTRY_PNPBIOS_BASE		(GDT_ENTRY_KERNEL_BASE+6)
-#define GDT_ENTRY_APMBIOS_BASE		(GDT_ENTRY_KERNEL_BASE+11)
-
-#define GDT_ENTRY_ESPFIX_SS		(GDT_ENTRY_KERNEL_BASE+14)
-#define __ESPFIX_SS			(GDT_ENTRY_ESPFIX_SS*8)
-
-#define GDT_ENTRY_PERCPU		(GDT_ENTRY_KERNEL_BASE+15)
-#ifdef CONFIG_SMP
-#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
-#else
-#define __KERNEL_PERCPU 0
-#endif
-
-#define GDT_ENTRY_STACK_CANARY		(GDT_ENTRY_KERNEL_BASE+16)
-#ifdef CONFIG_CC_STACKPROTECTOR
-#define __KERNEL_STACK_CANARY		(GDT_ENTRY_STACK_CANARY*8)
-#else
-#define __KERNEL_STACK_CANARY		0
-#endif
+#define GDT_ENTRY_ESPFIX_SS		26
+#define GDT_ENTRY_PERCPU		27
+#define GDT_ENTRY_STACK_CANARY		28
 
 #define GDT_ENTRY_DOUBLEFAULT_TSS	31
 
 /*
- * The GDT has 32 entries
+ * Number of entries in the GDT table:
  */
-#define GDT_ENTRIES 32
-
-/* The PnP BIOS entries in the GDT */
-#define GDT_ENTRY_PNPBIOS_CS32		(GDT_ENTRY_PNPBIOS_BASE + 0)
-#define GDT_ENTRY_PNPBIOS_CS16		(GDT_ENTRY_PNPBIOS_BASE + 1)
-#define GDT_ENTRY_PNPBIOS_DS		(GDT_ENTRY_PNPBIOS_BASE + 2)
-#define GDT_ENTRY_PNPBIOS_TS1		(GDT_ENTRY_PNPBIOS_BASE + 3)
-#define GDT_ENTRY_PNPBIOS_TS2		(GDT_ENTRY_PNPBIOS_BASE + 4)
-
-/* The PnP BIOS selectors */
-#define PNP_CS32   (GDT_ENTRY_PNPBIOS_CS32 * 8)	/* segment for calling fn */
-#define PNP_CS16   (GDT_ENTRY_PNPBIOS_CS16 * 8)	/* code segment for BIOS */
-#define PNP_DS     (GDT_ENTRY_PNPBIOS_DS * 8)	/* data segment for BIOS */
-#define PNP_TS1    (GDT_ENTRY_PNPBIOS_TS1 * 8)	/* transfer data segment */
-#define PNP_TS2    (GDT_ENTRY_PNPBIOS_TS2 * 8)	/* another data segment */
-
+#define GDT_ENTRIES			32
 
 /*
- * Matching rules for certain types of segments.
+ * Segment selector values corresponding to the above entries:
  */
 
-/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
-#define SEGMENT_IS_PNP_CODE(x)   (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
+#define __KERNEL_CS			(GDT_ENTRY_KERNEL_CS*8)
+#define __KERNEL_DS			(GDT_ENTRY_KERNEL_DS*8)
+#define __USER_DS			(GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
+#define __USER_CS			(GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
+#define __ESPFIX_SS			(GDT_ENTRY_ESPFIX_SS*8)
 
+/* segment for calling fn: */
+#define PNP_CS32			(GDT_ENTRY_PNPBIOS_CS32*8)
+/* code segment for BIOS: */
+#define PNP_CS16			(GDT_ENTRY_PNPBIOS_CS16*8)
 
+/* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */
+#define SEGMENT_IS_PNP_CODE(x)		(((x) & 0xf4) == PNP_CS32)
+
+/* data segment for BIOS: */
+#define PNP_DS				(GDT_ENTRY_PNPBIOS_DS*8)
+/* transfer data segment: */
+#define PNP_TS1				(GDT_ENTRY_PNPBIOS_TS1*8)
+/* another data segment: */
+#define PNP_TS2				(GDT_ENTRY_PNPBIOS_TS2*8)
+
+#ifdef CONFIG_SMP
+# define __KERNEL_PERCPU		(GDT_ENTRY_PERCPU*8)
 #else
+# define __KERNEL_PERCPU		0
+#endif
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+# define __KERNEL_STACK_CANARY		(GDT_ENTRY_STACK_CANARY*8)
+#else
+# define __KERNEL_STACK_CANARY		0
+#endif
+
+#else /* 64-bit: */
+
 #include <asm/cache.h>
 
-#define GDT_ENTRY_KERNEL32_CS 1
-#define GDT_ENTRY_KERNEL_CS 2
-#define GDT_ENTRY_KERNEL_DS 3
-
-#define __KERNEL32_CS   (GDT_ENTRY_KERNEL32_CS * 8)
+#define GDT_ENTRY_KERNEL32_CS		1
+#define GDT_ENTRY_KERNEL_CS		2
+#define GDT_ENTRY_KERNEL_DS		3
 
 /*
- * we cannot use the same code segment descriptor for user and kernel
- * -- not even in the long flat mode, because of different DPL /kkeil
- * The segment offset needs to contain a RPL. Grr. -AK
- * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
+ * We cannot use the same code segment descriptor for user and kernel mode,
+ * not even in long flat mode, because of different DPL.
+ *
+ * GDT layout to get 64-bit SYSCALL/SYSRET support right. SYSRET hardcodes
+ * selectors:
+ *
+ *   if returning to 32-bit userspace: cs = STAR.SYSRET_CS,
+ *   if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16,
+ *
+ * ss = STAR.SYSRET_CS+8 (in either case)
+ *
+ * thus USER_DS should be between 32-bit and 64-bit code selectors:
  */
-#define GDT_ENTRY_DEFAULT_USER32_CS 4
-#define GDT_ENTRY_DEFAULT_USER_DS 5
-#define GDT_ENTRY_DEFAULT_USER_CS 6
-#define __USER32_CS   (GDT_ENTRY_DEFAULT_USER32_CS*8+3)
-#define __USER32_DS	__USER_DS
+#define GDT_ENTRY_DEFAULT_USER32_CS	4
+#define GDT_ENTRY_DEFAULT_USER_DS	5
+#define GDT_ENTRY_DEFAULT_USER_CS	6
 
-#define GDT_ENTRY_TSS 8	/* needs two entries */
-#define GDT_ENTRY_LDT 10 /* needs two entries */
-#define GDT_ENTRY_TLS_MIN 12
-#define GDT_ENTRY_TLS_MAX 14
+/* Needs two entries */
+#define GDT_ENTRY_TSS			8
+/* Needs two entries */
+#define GDT_ENTRY_LDT			10
 
-#define GDT_ENTRY_PER_CPU 15	/* Abused to load per CPU data from limit */
-#define __PER_CPU_SEG	(GDT_ENTRY_PER_CPU * 8 + 3)
+#define GDT_ENTRY_TLS_MIN		12
+#define GDT_ENTRY_TLS_MAX		14
 
-/* TLS indexes for 64bit - hardcoded in arch_prctl */
-#define FS_TLS 0
-#define GS_TLS 1
+/* Abused to load per CPU data from limit */
+#define GDT_ENTRY_PER_CPU		15
 
-#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
-#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
+/*
+ * Number of entries in the GDT table:
+ */
+#define GDT_ENTRIES			16
 
-#define GDT_ENTRIES 16
+/*
+ * Segment selector values corresponding to the above entries:
+ *
+ * Note, selectors also need to have a correct RPL,
+ * expressed with the +3 value for user-space selectors:
+ */
+#define __KERNEL32_CS			(GDT_ENTRY_KERNEL32_CS*8)
+#define __KERNEL_CS			(GDT_ENTRY_KERNEL_CS*8)
+#define __KERNEL_DS			(GDT_ENTRY_KERNEL_DS*8)
+#define __USER32_CS			(GDT_ENTRY_DEFAULT_USER32_CS*8 + 3)
+#define __USER_DS			(GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
+#define __USER32_DS			__USER_DS
+#define __USER_CS			(GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
+#define __PER_CPU_SEG			(GDT_ENTRY_PER_CPU*8 + 3)
+
+/* TLS indexes for 64-bit - hardcoded in arch_prctl(): */
+#define FS_TLS				0
+#define GS_TLS				1
+
+#define GS_TLS_SEL			((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
+#define FS_TLS_SEL			((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
 
 #endif
 
-#define __KERNEL_CS	(GDT_ENTRY_KERNEL_CS*8)
-#define __KERNEL_DS	(GDT_ENTRY_KERNEL_DS*8)
-#define __USER_DS	(GDT_ENTRY_DEFAULT_USER_DS*8+3)
-#define __USER_CS	(GDT_ENTRY_DEFAULT_USER_CS*8+3)
 #ifndef CONFIG_PARAVIRT
-#define get_kernel_rpl()  0
+# define get_kernel_rpl()		0
 #endif
 
-#define IDT_ENTRIES 256
-#define NUM_EXCEPTION_VECTORS 32
-/* Bitmask of exception vectors which push an error code on the stack */
-#define EXCEPTION_ERRCODE_MASK  0x00027d00
-#define GDT_SIZE (GDT_ENTRIES * 8)
-#define GDT_ENTRY_TLS_ENTRIES 3
-#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
+#define IDT_ENTRIES			256
+#define NUM_EXCEPTION_VECTORS		32
+
+/* Bitmask of exception vectors which push an error code on the stack: */
+#define EXCEPTION_ERRCODE_MASK		0x00027d00
+
+#define GDT_SIZE			(GDT_ENTRIES*8)
+#define GDT_ENTRY_TLS_ENTRIES		3
+#define TLS_SIZE			(GDT_ENTRY_TLS_ENTRIES*8)
 
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
+
 extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5];
 #ifdef CONFIG_TRACING
-#define trace_early_idt_handlers early_idt_handlers
+# define trace_early_idt_handlers early_idt_handlers
 #endif
 
 /*
@@ -228,37 +258,30 @@
 } while (0)
 
 /*
- * Save a segment register away
+ * Save a segment register away:
  */
 #define savesegment(seg, value)				\
 	asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
 
 /*
- * x86_32 user gs accessors.
+ * x86-32 user GS accessors:
  */
 #ifdef CONFIG_X86_32
-#ifdef CONFIG_X86_32_LAZY_GS
-#define get_user_gs(regs)	(u16)({unsigned long v; savesegment(gs, v); v;})
-#define set_user_gs(regs, v)	loadsegment(gs, (unsigned long)(v))
-#define task_user_gs(tsk)	((tsk)->thread.gs)
-#define lazy_save_gs(v)		savesegment(gs, (v))
-#define lazy_load_gs(v)		loadsegment(gs, (v))
-#else	/* X86_32_LAZY_GS */
-#define get_user_gs(regs)	(u16)((regs)->gs)
-#define set_user_gs(regs, v)	do { (regs)->gs = (v); } while (0)
-#define task_user_gs(tsk)	(task_pt_regs(tsk)->gs)
-#define lazy_save_gs(v)		do { } while (0)
-#define lazy_load_gs(v)		do { } while (0)
-#endif	/* X86_32_LAZY_GS */
+# ifdef CONFIG_X86_32_LAZY_GS
+#  define get_user_gs(regs)		(u16)({ unsigned long v; savesegment(gs, v); v; })
+#  define set_user_gs(regs, v)		loadsegment(gs, (unsigned long)(v))
+#  define task_user_gs(tsk)		((tsk)->thread.gs)
+#  define lazy_save_gs(v)		savesegment(gs, (v))
+#  define lazy_load_gs(v)		loadsegment(gs, (v))
+# else	/* X86_32_LAZY_GS */
+#  define get_user_gs(regs)		(u16)((regs)->gs)
+#  define set_user_gs(regs, v)		do { (regs)->gs = (v); } while (0)
+#  define task_user_gs(tsk)		(task_pt_regs(tsk)->gs)
+#  define lazy_save_gs(v)		do { } while (0)
+#  define lazy_load_gs(v)		do { } while (0)
+# endif	/* X86_32_LAZY_GS */
 #endif	/* X86_32 */
 
-static inline unsigned long get_limit(unsigned long segment)
-{
-	unsigned long __limit;
-	asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
-	return __limit + 1;
-}
-
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
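For reference (not part of the patch): the selector values defined above are just the GDT index shifted left by three, with the Table Indicator in bit 2 and the RPL in the low two bits, which is why the __USER_* selectors add 3. A minimal standalone sketch of that decomposition:

	#include <stdio.h>

	#define RPL_MASK	0x3	/* SEGMENT_RPL_MASK: requested privilege level */
	#define TI_MASK		0x4	/* SEGMENT_TI_MASK: 0 = GDT, 1 = LDT */

	int main(void)
	{
		unsigned int user_cs = 14*8 + 3;	/* __USER_CS on 32-bit: GDT_ENTRY_DEFAULT_USER_CS*8 + 3 */

		printf("index=%u TI=%u RPL=%u\n",
		       user_cs >> 3,			/* descriptor table index */
		       (user_cs & TI_MASK) >> 2,	/* 0: selector refers to the GDT */
		       user_cs & RPL_MASK);		/* 3: user mode */
		return 0;
	}

This prints "index=14 TI=0 RPL=3" for the 0x73 selector value.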
 
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 9dfce4e..6fe6b18 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -57,9 +57,9 @@
 	unsigned long ip;
 	unsigned long flags;
 	unsigned short cs;
-	unsigned short gs;
-	unsigned short fs;
-	unsigned short __pad0;
+	unsigned short __pad2;	/* Was called gs, but was always zero. */
+	unsigned short __pad1;	/* Was called fs, but was always zero. */
+	unsigned short ss;
 	unsigned long err;
 	unsigned long trapno;
 	unsigned long oldmask;
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index 8d3120f..ba665eb 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -27,23 +27,11 @@
 
 #ifdef CONFIG_X86_SMAP
 
-#define ASM_CLAC							\
-	661: ASM_NOP3 ;							\
-	.pushsection .altinstr_replacement, "ax" ;			\
-	662: __ASM_CLAC ;						\
-	.popsection ;							\
-	.pushsection .altinstructions, "a" ;				\
-	altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ;	\
-	.popsection
+#define ASM_CLAC \
+	ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP
 
-#define ASM_STAC							\
-	661: ASM_NOP3 ;							\
-	.pushsection .altinstr_replacement, "ax" ;			\
-	662: __ASM_STAC ;						\
-	.popsection ;							\
-	.pushsection .altinstructions, "a" ;				\
-	altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ;	\
-	.popsection
+#define ASM_STAC \
+	ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP
 
 #else /* CONFIG_X86_SMAP */
 
@@ -61,20 +49,20 @@
 static __always_inline void clac(void)
 {
 	/* Note: a barrier is implicit in alternative() */
-	alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
+	alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
 }
 
 static __always_inline void stac(void)
 {
 	/* Note: a barrier is implicit in alternative() */
-	alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP);
+	alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP);
 }
 
 /* These macros can be used in asm() statements */
 #define ASM_CLAC \
-	ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
+	ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
 #define ASM_STAC \
-	ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP)
+	ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP)
 
 #else /* CONFIG_X86_SMAP */
 
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 6a4b00f..2ec1a53 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -4,6 +4,8 @@
 
 #ifdef __KERNEL__
 
+#include <asm/nops.h>
+
 static inline void native_clts(void)
 {
 	asm volatile("clts");
@@ -199,6 +201,14 @@
 		       "+m" (*(volatile char __force *)__p));
 }
 
+static inline void pcommit_sfence(void)
+{
+	alternative(ASM_NOP7,
+		    ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */
+		    "sfence",
+		    X86_FEATURE_PCOMMIT);
+}
+
 #define nop() asm volatile ("nop")
 
 
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 1d4e4f2..ea2dbe8 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -13,6 +13,33 @@
 #include <asm/types.h>
 
 /*
+ * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we
+ * reserve at the top of the kernel stack.  We do it because of a nasty
+ * 32-bit corner case.  On x86_32, the hardware stack frame is
+ * variable-length.  Except for vm86 mode, struct pt_regs assumes a
+ * maximum-length frame.  If we enter from CPL 0, the top 8 bytes of
+ * pt_regs don't actually exist.  Ordinarily this doesn't matter, but it
+ * does in at least one case:
+ *
+ * If we take an NMI early enough in SYSENTER, then we can end up with
+ * pt_regs that extends above sp0.  On the way out, in the espfix code,
+ * we can read the saved SS value, but that value will be above sp0.
+ * Without this offset, that can result in a page fault.  (We are
+ * careful that, in this case, the value we read doesn't matter.)
+ *
+ * In vm86 mode, the hardware frame is much longer still, but we neither
+ * access the extra members from NMI context, nor do we write such a
+ * frame at sp0 at all.
+ *
+ * x86_64 has a fixed-length stack frame.
+ */
+#ifdef CONFIG_X86_32
+# define TOP_OF_KERNEL_STACK_PADDING 8
+#else
+# define TOP_OF_KERNEL_STACK_PADDING 0
+#endif
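For reference (not part of the patch), a standalone sketch of the arithmetic this padding implies; the _SKETCH names and the THREAD_SIZE stand-in are local to the example, not kernel symbols:

	#define THREAD_SIZE_SKETCH			(2 * 4096)	/* e.g. two pages on 32-bit */
	#define TOP_OF_KERNEL_STACK_PADDING_SKETCH	8		/* the 32-bit value above */

	/* One past the highest byte of the stack allocation: */
	static unsigned long top_of_stack_sketch(unsigned long stack_base)
	{
		return stack_base + THREAD_SIZE_SKETCH;
	}

	/*
	 * Where the hardware frame may start (sp0): the padding sits between
	 * sp0 and the very top, so a maximal 32-bit frame never runs past
	 * the allocation.
	 */
	static unsigned long sp0_sketch(unsigned long stack_base)
	{
		return top_of_stack_sketch(stack_base) - TOP_OF_KERNEL_STACK_PADDING_SKETCH;
	}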
+
+/*
  * low level task data that entry.S needs immediate access to
  * - this struct should fit entirely inside of one cache line
  * - this struct shares the supervisor stack pages
@@ -145,7 +172,6 @@
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
 
 #define STACK_WARN		(THREAD_SIZE/8)
-#define KERNEL_STACK_OFFSET	(5*(BITS_PER_LONG/8))
 
 /*
  * macros/functions for gaining access to the thread information structure
@@ -158,10 +184,7 @@
 
 static inline struct thread_info *current_thread_info(void)
 {
-	struct thread_info *ti;
-	ti = (void *)(this_cpu_read_stable(kernel_stack) +
-		      KERNEL_STACK_OFFSET - THREAD_SIZE);
-	return ti;
+	return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE);
 }
 
 static inline unsigned long current_stack_pointer(void)
@@ -177,16 +200,37 @@
 
 #else /* !__ASSEMBLY__ */
 
-/* how to get the thread information struct from ASM */
+/* Load thread_info address into "reg" */
 #define GET_THREAD_INFO(reg) \
 	_ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \
-	_ASM_SUB $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg ;
+	_ASM_SUB $(THREAD_SIZE),reg ;
 
 /*
- * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in
- * a certain register (to be used in assembler memory operands).
+ * ASM operand which evaluates to a 'thread_info' address of
+ * the current task, if it is known that "reg" is exactly "off"
+ * bytes below the top of the stack currently.
+ *
+ * ( The kernel stack's size is known at build time; it is usually
+ *   2 or 4 pages, and the bottom of the kernel stack contains
+ *   the thread_info structure. So to access the thread_info very
+ *   quickly from assembly code we can calculate down from the
+ *   top of the kernel stack to the bottom, using constant,
+ *   build-time calculations only. )
+ *
+ * For example, to fetch the current thread_info->flags value into %eax
+ * on x86-64 defconfig kernels, in syscall entry code where RSP is
+ * currently at exactly SIZEOF_PTREGS bytes away from the top of the
+ * stack:
+ *
+ *      mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax
+ *
+ * will translate to:
+ *
+ *      8b 84 24 b8 c0 ff ff      mov    -0x3f48(%rsp), %eax
+ *
+ * which is below the current RSP by almost 16K.
  */
-#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg)
+#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg)
 
 #endif
 
@@ -236,6 +280,16 @@
 #endif
 	return false;
 }
+
+/*
+ * Force syscall return via IRET by making it look as if there was
+ * some work pending. IRET is our most capable (but slowest) syscall
+ * return path, which is able to restore modified SS, CS and certain
+ * EFLAGS values that other (fast) syscall return instructions
+ * are not able to restore properly.
+ */
+#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME)
+
 #endif	/* !__ASSEMBLY__ */
 
 #ifndef __ASSEMBLY__
diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h
index 7b0a55a..580aee3 100644
--- a/arch/x86/include/uapi/asm/ptrace-abi.h
+++ b/arch/x86/include/uapi/asm/ptrace-abi.h
@@ -25,13 +25,17 @@
 #else /* __i386__ */
 
 #if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
 #define R15 0
 #define R14 8
 #define R13 16
 #define R12 24
 #define RBP 32
 #define RBX 40
-/* arguments: interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
 #define R11 48
 #define R10 56
 #define R9 64
@@ -41,15 +45,17 @@
 #define RDX 96
 #define RSI 104
 #define RDI 112
-#define ORIG_RAX 120       /* = ERROR */
-/* end of arguments */
-/* cpu exception frame or undefined in case of fast syscall. */
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+#define ORIG_RAX 120
+/* Return frame for iretq */
 #define RIP 128
 #define CS 136
 #define EFLAGS 144
 #define RSP 152
 #define SS 160
-#define ARGOFFSET R11
 #endif /* __ASSEMBLY__ */
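For reference (not part of the patch): the numeric offsets above describe an 8-byte-slot layout, so their spacing can be checked mechanically. A standalone sketch:

	#include <assert.h>

	#define R15 0
	#define R11 48
	#define ORIG_RAX 120
	#define RIP 128
	#define SS 160

	int main(void)
	{
		/* 6 callee-preserved slots (r15..rbx) precede R11: */
		assert(R11 == R15 + 6*8);
		/* 9 callee-clobbered slots (r11..rdi) precede ORIG_RAX: */
		assert(ORIG_RAX == R11 + 9*8);
		/* orig_rax, then the 5-slot iret frame rip/cs/eflags/rsp/ss: */
		assert(RIP == ORIG_RAX + 8 && SS == RIP + 4*8);
		return 0;
	}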
 
 /* top of stack page */
diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h
index ac4b9aa..bc16115 100644
--- a/arch/x86/include/uapi/asm/ptrace.h
+++ b/arch/x86/include/uapi/asm/ptrace.h
@@ -41,13 +41,17 @@
 #ifndef __KERNEL__
 
 struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
 	unsigned long r15;
 	unsigned long r14;
 	unsigned long r13;
 	unsigned long r12;
 	unsigned long rbp;
 	unsigned long rbx;
-/* arguments: non interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
 	unsigned long r11;
 	unsigned long r10;
 	unsigned long r9;
@@ -57,9 +61,12 @@
 	unsigned long rdx;
 	unsigned long rsi;
 	unsigned long rdi;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
 	unsigned long orig_rax;
-/* end of arguments */
-/* cpu exception frame or undefined */
+/* Return frame for iretq */
 	unsigned long rip;
 	unsigned long cs;
 	unsigned long eflags;
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index d8b9f90..16dc4e8 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -177,9 +177,24 @@
 	__u64 rip;
 	__u64 eflags;		/* RFLAGS */
 	__u16 cs;
-	__u16 gs;
-	__u16 fs;
-	__u16 __pad0;
+
+	/*
+	 * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"),
+	 * Linux saved and restored fs and gs in these slots.  This
+	 * was counterproductive, as fsbase and gsbase were never
+	 * saved, so arch_prctl was presumably unreliable.
+	 *
+	 * If these slots are ever needed for any other purpose, there
+	 * is some risk that very old 64-bit binaries could get
+	 * confused.  I doubt that many such binaries still work,
+	 * though, since the same patch in 2.5.64 also removed the
+	 * 64-bit set_thread_area syscall, so it appears that there is
+	 * no TLS API that works in both pre- and post-2.5.64 kernels.
+	 */
+	__u16 __pad2;		/* Was gs. */
+	__u16 __pad1;		/* Was fs. */
+
+	__u16 ss;
 	__u64 err;
 	__u64 trapno;
 	__u64 oldmask;
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cdb1b70..c887cd9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -32,6 +32,7 @@
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= mcount_64.o
 obj-y			+= syscall_$(BITS).o vsyscall_gtod.o
+obj-$(CONFIG_IA32_EMULATION)	+= syscall_32.o
 obj-$(CONFIG_X86_VSYSCALL_EMULATION)	+= vsyscall_64.o vsyscall_emu_64.o
 obj-$(CONFIG_X86_ESPFIX64)	+= espfix_64.o
 obj-$(CONFIG_SYSFS)	+= ksysfs.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 703130f..5c993c9 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -52,10 +52,25 @@
 __setup("noreplace-paravirt", setup_noreplace_paravirt);
 #endif
 
-#define DPRINTK(fmt, ...)				\
-do {							\
-	if (debug_alternative)				\
-		printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\
+#define DPRINTK(fmt, args...)						\
+do {									\
+	if (debug_alternative)						\
+		printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args);	\
+} while (0)
+
+#define DUMP_BYTES(buf, len, fmt, args...)				\
+do {									\
+	if (unlikely(debug_alternative)) {				\
+		int j;							\
+									\
+		if (!(len))						\
+			break;						\
+									\
+		printk(KERN_DEBUG fmt, ##args);				\
+		for (j = 0; j < (len) - 1; j++)				\
+			printk(KERN_CONT "%02hhx ", buf[j]);		\
+		printk(KERN_CONT "%02hhx\n", buf[j]);			\
+	}								\
 } while (0)
 
 /*
@@ -243,12 +258,86 @@
 extern s32 __smp_locks[], __smp_locks_end[];
 void *text_poke_early(void *addr, const void *opcode, size_t len);
 
-/* Replace instructions with better alternatives for this CPU type.
-   This runs before SMP is initialized to avoid SMP problems with
-   self modifying code. This implies that asymmetric systems where
-   APs have less capabilities than the boot processor are not handled.
-   Tough. Make sure you disable such features by hand. */
+/*
+ * Are we looking at a near JMP with a 1- or 4-byte displacement?
+ */
+static inline bool is_jmp(const u8 opcode)
+{
+	return opcode == 0xeb || opcode == 0xe9;
+}
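For reference (not part of the patch): each ALTERNATIVE/alternative() site (e.g. the ASM_CLAC/ASM_STAC definitions earlier in this series) emits a record into .altinstructions that the functions below consume. A hedged sketch of its shape, using the field names this code references; the exact kernel types and packing are not reproduced here:

	#include <stdint.h>

	struct alt_instr_sketch {
		int32_t  instr_offset;		/* original instruction (relative offset) */
		int32_t  repl_offset;		/* replacement instruction (relative offset) */
		uint16_t cpuid;			/* X86_FEATURE_* bit gating the patch */
		uint8_t  instrlen;		/* length of original, incl. NOP padding */
		uint8_t  replacementlen;	/* length of replacement */
		uint8_t  padlen;		/* length of the NOP padding that was added */
	};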
 
+static void __init_or_module
+recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
+{
+	u8 *next_rip, *tgt_rip;
+	s32 n_dspl, o_dspl;
+	int repl_len;
+
+	if (a->replacementlen != 5)
+		return;
+
+	o_dspl = *(s32 *)(insnbuf + 1);
+
+	/* next_rip of the replacement JMP */
+	next_rip = repl_insn + a->replacementlen;
+	/* target rip of the replacement JMP */
+	tgt_rip  = next_rip + o_dspl;
+	n_dspl = tgt_rip - orig_insn;
+
+	DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
+
+	if (tgt_rip - orig_insn >= 0) {
+		if (n_dspl - 2 <= 127)
+			goto two_byte_jmp;
+		else
+			goto five_byte_jmp;
+	/* negative offset */
+	} else {
+		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
+			goto two_byte_jmp;
+		else
+			goto five_byte_jmp;
+	}
+
+two_byte_jmp:
+	n_dspl -= 2;
+
+	insnbuf[0] = 0xeb;
+	insnbuf[1] = (s8)n_dspl;
+	add_nops(insnbuf + 2, 3);
+
+	repl_len = 2;
+	goto done;
+
+five_byte_jmp:
+	n_dspl -= 5;
+
+	insnbuf[0] = 0xe9;
+	*(s32 *)&insnbuf[1] = n_dspl;
+
+	repl_len = 5;
+
+done:
+
+	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
+		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
+}
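For reference (not part of the patch): recompute_jump() re-targets a copied near JMP against its original location and shrinks it to the two-byte short form when the new displacement fits in a signed byte. A standalone sketch of that size decision (not byte-for-byte the kernel's range check):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Returns the encoded length: 2 for EB rel8, 5 for E9 rel32. */
	static int encode_jmp_sketch(uint8_t buf[5], int32_t displ_from_insn_start)
	{
		int32_t rel8 = displ_from_insn_start - 2;	/* rel is from the end of the insn */

		if (rel8 == (int8_t)rel8) {
			buf[0] = 0xeb;				/* JMP rel8 */
			buf[1] = (uint8_t)rel8;
			return 2;
		} else {
			int32_t rel32 = displ_from_insn_start - 5;

			buf[0] = 0xe9;				/* JMP rel32 */
			memcpy(&buf[1], &rel32, sizeof(rel32));
			return 5;
		}
	}

	int main(void)
	{
		uint8_t buf[5];

		printf("+100  -> %d bytes\n", encode_jmp_sketch(buf, 100));	/* 2 */
		printf("+1000 -> %d bytes\n", encode_jmp_sketch(buf, 1000));	/* 5 */
		return 0;
	}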
+
+static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
+{
+	add_nops(instr + (a->instrlen - a->padlen), a->padlen);
+
+	DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
+		   instr, a->instrlen - a->padlen, a->padlen);
+}
+
+/*
+ * Replace instructions with better alternatives for this CPU type. This runs
+ * before SMP is initialized to avoid SMP problems with self-modifying code.
+ * This implies that asymmetric systems where APs have fewer capabilities than
+ * the boot processor are not handled. Tough. Make sure you disable such
+ * features by hand.
+ */
 void __init_or_module apply_alternatives(struct alt_instr *start,
 					 struct alt_instr *end)
 {
@@ -256,10 +345,10 @@
 	u8 *instr, *replacement;
 	u8 insnbuf[MAX_PATCH_LEN];
 
-	DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+	DPRINTK("alt table %p -> %p", start, end);
 	/*
 	 * The scan order should be from start to end. A later scanned
-	 * alternative code can overwrite a previous scanned alternative code.
+	 * alternative code can overwrite previously scanned alternative code.
 	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
 	 * patch code.
 	 *
@@ -267,29 +356,54 @@
 	 * order.
 	 */
 	for (a = start; a < end; a++) {
+		int insnbuf_sz = 0;
+
 		instr = (u8 *)&a->instr_offset + a->instr_offset;
 		replacement = (u8 *)&a->repl_offset + a->repl_offset;
-		BUG_ON(a->replacementlen > a->instrlen);
 		BUG_ON(a->instrlen > sizeof(insnbuf));
 		BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
-		if (!boot_cpu_has(a->cpuid))
+		if (!boot_cpu_has(a->cpuid)) {
+			if (a->padlen > 1)
+				optimize_nops(a, instr);
+
 			continue;
+		}
+
+		DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d)",
+			a->cpuid >> 5,
+			a->cpuid & 0x1f,
+			instr, a->instrlen,
+			replacement, a->replacementlen);
+
+		DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
+		DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
 
 		memcpy(insnbuf, replacement, a->replacementlen);
+		insnbuf_sz = a->replacementlen;
 
 		/* 0xe8 is a relative jump; fix the offset. */
-		if (*insnbuf == 0xe8 && a->replacementlen == 5)
-		    *(s32 *)(insnbuf + 1) += replacement - instr;
+		if (*insnbuf == 0xe8 && a->replacementlen == 5) {
+			*(s32 *)(insnbuf + 1) += replacement - instr;
+			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
+				*(s32 *)(insnbuf + 1),
+				(unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
+		}
 
-		add_nops(insnbuf + a->replacementlen,
-			 a->instrlen - a->replacementlen);
+		if (a->replacementlen && is_jmp(replacement[0]))
+			recompute_jump(a, instr, replacement, insnbuf);
 
-		text_poke_early(instr, insnbuf, a->instrlen);
+		if (a->instrlen > a->replacementlen) {
+			add_nops(insnbuf + a->replacementlen,
+				 a->instrlen - a->replacementlen);
+			insnbuf_sz += a->instrlen - a->replacementlen;
+		}
+		DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
+
+		text_poke_early(instr, insnbuf, insnbuf_sz);
 	}
 }
 
 #ifdef CONFIG_SMP
-
 static void alternatives_smp_lock(const s32 *start, const s32 *end,
 				  u8 *text, u8 *text_end)
 {
@@ -371,8 +485,8 @@
 	smp->locks_end	= locks_end;
 	smp->text	= text;
 	smp->text_end	= text_end;
-	DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
-		__func__, smp->locks, smp->locks_end,
+	DPRINTK("locks %p -> %p, text %p -> %p, name %s",
+		smp->locks, smp->locks_end,
 		smp->text, smp->text_end, smp->name);
 
 	list_add_tail(&smp->next, &smp_alt_modules);
@@ -440,7 +554,7 @@
 
 	return 0;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_PARAVIRT
 void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
@@ -601,7 +715,7 @@
 	if (likely(!bp_patching_in_progress))
 		return 0;
 
-	if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr)
+	if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
 		return 0;
 
 	/* set up the specified breakpoint handler */
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 3b3b9d3..47703ae 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -68,7 +68,7 @@
 
 	/* Offset from the sysenter stack to tss.sp0 */
 	DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
-		 sizeof(struct tss_struct));
+	       offsetofend(struct tss_struct, SYSENTER_stack));
 
 #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
 	BLANK();
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index fdcbb4d..5ce6f2d 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -81,6 +81,7 @@
 #undef ENTRY
 
 	OFFSET(TSS_ist, tss_struct, x86_tss.ist);
+	OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
 	BLANK();
 
 	DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a220239..dd9e505 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -711,6 +711,11 @@
 		set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
 
 	rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
+	/* 3DNow or LM implies PREFETCHW */
+	if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
+		if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
+			set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2346c95..c928a7a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -959,38 +959,36 @@
 #endif
 }
 
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_IA32_EMULATION
-/* May not be __init: called during resume */
-static void syscall32_cpu_init(void)
-{
-	/* Load these always in case some future AMD CPU supports
-	   SYSENTER from compat mode too. */
-	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
-	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
-	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
-
-	wrmsrl(MSR_CSTAR, ia32_cstar_target);
-}
-#endif		/* CONFIG_IA32_EMULATION */
-#endif		/* CONFIG_X86_64 */
-
+/*
+ * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
+ * on 32-bit kernels:
+ */
 #ifdef CONFIG_X86_32
 void enable_sep_cpu(void)
 {
-	int cpu = get_cpu();
-	struct tss_struct *tss = &per_cpu(init_tss, cpu);
+	struct tss_struct *tss;
+	int cpu;
 
-	if (!boot_cpu_has(X86_FEATURE_SEP)) {
-		put_cpu();
-		return;
-	}
+	cpu = get_cpu();
+	tss = &per_cpu(cpu_tss, cpu);
+
+	if (!boot_cpu_has(X86_FEATURE_SEP))
+		goto out;
+
+	/*
+	 * The tss_struct::SS1 and tss_struct::SP1 fields are not used by the
+	 * hardware; we cache the SYSENTER CS and ESP values there for easy access:
+	 */
 
 	tss->x86_tss.ss1 = __KERNEL_CS;
-	tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss;
-	wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
+	wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
+
+	tss->x86_tss.sp1 = (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack);
 	wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
-	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
+
+	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0);
+
+out:
 	put_cpu();
 }
 #endif
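For reference (not part of the patch): the cached sp1 value is the address one past tss_struct's SYSENTER_stack member, which is what offsetofend() yields. A standalone sketch of the equivalent arithmetic, with a local macro and struct standing in for the kernel's:

	#include <stddef.h>

	#define offsetofend_sketch(TYPE, MEMBER) \
		(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

	struct demo_tss {
		int before;
		char sysenter_stack[512];	/* stand-in for SYSENTER_stack */
		int after;
	};

	/* One byte past 'sysenter_stack', i.e. the usable top-of-stack offset: */
	static const size_t demo_stack_top =
		offsetofend_sketch(struct demo_tss, sysenter_stack);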
@@ -1118,7 +1116,7 @@
 __setup("clearcpuid=", setup_disablecpuid);
 
 DEFINE_PER_CPU(unsigned long, kernel_stack) =
-	(unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+	(unsigned long)&init_thread_union + THREAD_SIZE;
 EXPORT_PER_CPU_SYMBOL(kernel_stack);
 
 #ifdef CONFIG_X86_64
@@ -1130,8 +1128,8 @@
 		     irq_stack_union) __aligned(PAGE_SIZE) __visible;
 
 /*
- * The following four percpu variables are hot.  Align current_task to
- * cacheline size such that all four fall in the same cacheline.
+ * The following percpu variables are hot.  Align current_task to
+ * cacheline size such that they fall in the same cacheline.
  */
 DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
 	&init_task;
@@ -1171,10 +1169,21 @@
 	 */
 	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
 	wrmsrl(MSR_LSTAR, system_call);
-	wrmsrl(MSR_CSTAR, ignore_sysret);
 
 #ifdef CONFIG_IA32_EMULATION
-	syscall32_cpu_init();
+	wrmsrl(MSR_CSTAR, ia32_cstar_target);
+	/*
+	 * Always load these, in case some future 64-bit CPU supports
+	 * SYSENTER from compat mode too:
+	 */
+	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+#else
+	wrmsrl(MSR_CSTAR, ignore_sysret);
+	wrmsrl_safe(MSR_IA32_SYSENTER_CS, 0);
+	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
 #endif
 
 	/* Flags to clear on syscall */
@@ -1226,6 +1235,15 @@
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
 
+/*
+ * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
+ * the top of the kernel stack.  Use an extra percpu variable to track the
+ * top of the kernel stack directly.
+ */
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
+	(unsigned long)&init_thread_union + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
 #ifdef CONFIG_CC_STACKPROTECTOR
 DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
@@ -1307,7 +1325,7 @@
 	 */
 	load_ucode_ap();
 
-	t = &per_cpu(init_tss, cpu);
+	t = &per_cpu(cpu_tss, cpu);
 	oist = &per_cpu(orig_ist, cpu);
 
 #ifdef CONFIG_NUMA
@@ -1391,7 +1409,7 @@
 {
 	int cpu = smp_processor_id();
 	struct task_struct *curr = current;
-	struct tss_struct *t = &per_cpu(init_tss, cpu);
+	struct tss_struct *t = &per_cpu(cpu_tss, cpu);
 	struct thread_struct *thread = &curr->thread;
 
 	wait_for_master_cpu(cpu);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b71a7f8..56f7e60 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2147,24 +2147,24 @@
 static unsigned long code_segment_base(struct pt_regs *regs)
 {
 	/*
+	 * For IA32 we look at the GDT/LDT segment base to convert the
+	 * effective IP to a linear address.
+	 */
+
+#ifdef CONFIG_X86_32
+	/*
 	 * If we are in VM86 mode, add the segment offset to convert to a
 	 * linear address.
 	 */
 	if (regs->flags & X86_VM_MASK)
 		return 0x10 * regs->cs;
 
-	/*
-	 * For IA32 we look at the GDT/LDT segment base to convert the
-	 * effective IP to a linear address.
-	 */
-#ifdef CONFIG_X86_32
-	if (user_mode(regs) && regs->cs != __USER_CS)
+	if (user_mode_ignore_vm86(regs) && regs->cs != __USER_CS)
 		return get_segment_base(regs->cs);
 #else
-	if (test_thread_flag(TIF_IA32)) {
-		if (user_mode(regs) && regs->cs != __USER32_CS)
-			return get_segment_base(regs->cs);
-	}
+	if (user_mode(regs) && !user_64bit_mode(regs) &&
+	    regs->cs != __USER32_CS)
+		return get_segment_base(regs->cs);
 #endif
 	return 0;
 }
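For reference (not part of the patch): the user_mode_vm()/user_mode_ignore_vm86() conversions in this series rely on user_mode() treating VM86 frames as user mode on 32-bit. A hedged standalone sketch of that check, with local types and constants standing in for the kernel's:

	struct regs_sketch {
		unsigned long cs;
		unsigned long flags;
	};

	#define RPL_MASK	0x3UL		/* SEGMENT_RPL_MASK */
	#define VM_FLAG		0x00020000UL	/* X86_EFLAGS_VM / X86_VM_MASK */

	/* 32-bit flavour: VM86 frames count as user mode despite a CS RPL < 3. */
	static int user_mode_32_sketch(const struct regs_sketch *regs)
	{
		return ((regs->cs & RPL_MASK) | (regs->flags & VM_FLAG)) >= 3;
	}

	/* 64-bit flavour: the CS RPL alone is enough. */
	static int user_mode_64_sketch(const struct regs_sketch *regs)
	{
		return (regs->cs & RPL_MASK) == 3;
	}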
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index aceb2f9..c76d3e3 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -105,7 +105,7 @@
 #ifdef CONFIG_X86_32
 	struct pt_regs fixed_regs;
 
-	if (!user_mode_vm(regs)) {
+	if (!user_mode(regs)) {
 		crash_fixup_ss_esp(&fixed_regs, regs);
 		regs = &fixed_regs;
 	}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index cf3df1d..ab3b6563 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -278,7 +278,7 @@
 	print_modules();
 	show_regs(regs);
 #ifdef CONFIG_X86_32
-	if (user_mode_vm(regs)) {
+	if (user_mode(regs)) {
 		sp = regs->sp;
 		ss = regs->ss & 0xffff;
 	} else {
@@ -307,7 +307,7 @@
 	unsigned long flags = oops_begin();
 	int sig = SIGSEGV;
 
-	if (!user_mode_vm(regs))
+	if (!user_mode(regs))
 		report_bug(regs->ip, regs);
 
 	if (__die(str, regs, err))
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 5abd4cd..39891ff 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -123,13 +123,13 @@
 	int i;
 
 	show_regs_print_info(KERN_EMERG);
-	__show_regs(regs, !user_mode_vm(regs));
+	__show_regs(regs, !user_mode(regs));
 
 	/*
 	 * When in-kernel, we also print out the stack and code at the
 	 * time of the fault..
 	 */
-	if (!user_mode_vm(regs)) {
+	if (!user_mode(regs)) {
 		unsigned int code_prologue = code_bytes * 43 / 64;
 		unsigned int code_len = code_bytes;
 		unsigned char c;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 31e2d5b..4c8cc34 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -398,7 +398,7 @@
 	 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
 	 * pushed above; +8 corresponds to copy_thread's esp0 setting.
 	 */
-	pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
+	pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+TOP_OF_KERNEL_STACK_PADDING+4*4)(%esp)
 	CFI_REL_OFFSET eip, 0
 
 	pushl_cfi %eax
@@ -816,15 +816,9 @@
 	pushl_cfi $0
 #ifdef CONFIG_X86_INVD_BUG
 	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-661:	pushl_cfi $do_general_protection
-662:
-.section .altinstructions,"a"
-	altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
-.previous
-.section .altinstr_replacement,"ax"
-663:	pushl $do_simd_coprocessor_error
-664:
-.previous
+	ALTERNATIVE "pushl_cfi $do_general_protection",	\
+		    "pushl $do_simd_coprocessor_error", \
+		    X86_FEATURE_XMM
 #else
 	pushl_cfi $do_simd_coprocessor_error
 #endif
@@ -1240,20 +1234,13 @@
 	/*CFI_REL_OFFSET es, 0*/
 	pushl_cfi %ds
 	/*CFI_REL_OFFSET ds, 0*/
-	pushl_cfi %eax
-	CFI_REL_OFFSET eax, 0
-	pushl_cfi %ebp
-	CFI_REL_OFFSET ebp, 0
-	pushl_cfi %edi
-	CFI_REL_OFFSET edi, 0
-	pushl_cfi %esi
-	CFI_REL_OFFSET esi, 0
-	pushl_cfi %edx
-	CFI_REL_OFFSET edx, 0
-	pushl_cfi %ecx
-	CFI_REL_OFFSET ecx, 0
-	pushl_cfi %ebx
-	CFI_REL_OFFSET ebx, 0
+	pushl_cfi_reg eax
+	pushl_cfi_reg ebp
+	pushl_cfi_reg edi
+	pushl_cfi_reg esi
+	pushl_cfi_reg edx
+	pushl_cfi_reg ecx
+	pushl_cfi_reg ebx
 	cld
 	movl $(__KERNEL_PERCPU), %ecx
 	movl %ecx, %fs
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 2babb39..bf9afad 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -14,27 +14,14 @@
  * NOTE: This code handles signal-recognition, which happens every time
  * after an interrupt and after each system call.
  *
- * Normal syscalls and interrupts don't save a full stack frame, this is
- * only done for syscall tracing, signals or fork/exec et.al.
- *
  * A note on terminology:
- * - top of stack: Architecture defined interrupt frame from SS to RIP
+ * - iret frame: Architecture defined interrupt frame from SS to RIP
  * at the top of the kernel process stack.
- * - partial stack frame: partially saved registers up to R11.
- * - full stack frame: Like partial stack frame, but all register saved.
  *
  * Some macro usage:
  * - CFI macros are used to generate dwarf2 unwind information for better
  * backtraces. They don't change any code.
- * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
- * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
- * There are unfortunately lots of special cases where some registers
- * not touched. The macro is a big mess that should be cleaned up.
- * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
- * Gives a full stack frame.
  * - ENTRY/END Define functions in the symbol table.
- * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
- * frame that is otherwise undefined after a SYSCALL
  * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
  * - idtentry - Define exception entry points.
  */
@@ -82,9 +69,9 @@
 #endif /* CONFIG_PARAVIRT */
 
 
-.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
+.macro TRACE_IRQS_IRETQ
 #ifdef CONFIG_TRACE_IRQFLAGS
-	bt   $9,EFLAGS-\offset(%rsp)	/* interrupts off? */
+	bt   $9,EFLAGS(%rsp)	/* interrupts off? */
 	jnc  1f
 	TRACE_IRQS_ON
 1:
@@ -116,8 +103,8 @@
 	call debug_stack_reset
 .endm
 
-.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
-	bt   $9,EFLAGS-\offset(%rsp)	/* interrupts off? */
+.macro TRACE_IRQS_IRETQ_DEBUG
+	bt   $9,EFLAGS(%rsp)	/* interrupts off? */
 	jnc  1f
 	TRACE_IRQS_ON_DEBUG
 1:
@@ -130,34 +117,7 @@
 #endif
 
 /*
- * C code is not supposed to know about undefined top of stack. Every time
- * a C function with an pt_regs argument is called from the SYSCALL based
- * fast path FIXUP_TOP_OF_STACK is needed.
- * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
- * manipulation.
- */
-
-	/* %rsp:at FRAMEEND */
-	.macro FIXUP_TOP_OF_STACK tmp offset=0
-	movq PER_CPU_VAR(old_rsp),\tmp
-	movq \tmp,RSP+\offset(%rsp)
-	movq $__USER_DS,SS+\offset(%rsp)
-	movq $__USER_CS,CS+\offset(%rsp)
-	movq RIP+\offset(%rsp),\tmp  /* get rip */
-	movq \tmp,RCX+\offset(%rsp)  /* copy it to rcx as sysret would do */
-	movq R11+\offset(%rsp),\tmp  /* get eflags */
-	movq \tmp,EFLAGS+\offset(%rsp)
-	.endm
-
-	.macro RESTORE_TOP_OF_STACK tmp offset=0
-	movq RSP+\offset(%rsp),\tmp
-	movq \tmp,PER_CPU_VAR(old_rsp)
-	movq EFLAGS+\offset(%rsp),\tmp
-	movq \tmp,R11+\offset(%rsp)
-	.endm
-
-/*
- * initial frame state for interrupts (and exceptions without error code)
+ * empty frame
  */
 	.macro EMPTY_FRAME start=1 offset=0
 	.if \start
@@ -173,12 +133,12 @@
  * initial frame state for interrupts (and exceptions without error code)
  */
 	.macro INTR_FRAME start=1 offset=0
-	EMPTY_FRAME \start, SS+8+\offset-RIP
-	/*CFI_REL_OFFSET ss, SS+\offset-RIP*/
-	CFI_REL_OFFSET rsp, RSP+\offset-RIP
-	/*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
-	/*CFI_REL_OFFSET cs, CS+\offset-RIP*/
-	CFI_REL_OFFSET rip, RIP+\offset-RIP
+	EMPTY_FRAME \start, 5*8+\offset
+	/*CFI_REL_OFFSET ss, 4*8+\offset*/
+	CFI_REL_OFFSET rsp, 3*8+\offset
+	/*CFI_REL_OFFSET rflags, 2*8+\offset*/
+	/*CFI_REL_OFFSET cs, 1*8+\offset*/
+	CFI_REL_OFFSET rip, 0*8+\offset
 	.endm
 
 /*
@@ -186,30 +146,23 @@
  * with vector already pushed)
  */
 	.macro XCPT_FRAME start=1 offset=0
-	INTR_FRAME \start, RIP+\offset-ORIG_RAX
-	.endm
-
-/*
- * frame that enables calling into C.
- */
-	.macro PARTIAL_FRAME start=1 offset=0
-	XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
-	CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
-	CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
-	CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
-	CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
-	CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
-	CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
-	CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
-	CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
-	CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
+	INTR_FRAME \start, 1*8+\offset
 	.endm
 
 /*
  * frame that enables passing a complete pt_regs to a C function.
  */
 	.macro DEFAULT_FRAME start=1 offset=0
-	PARTIAL_FRAME \start, R11+\offset-R15
+	XCPT_FRAME \start, ORIG_RAX+\offset
+	CFI_REL_OFFSET rdi, RDI+\offset
+	CFI_REL_OFFSET rsi, RSI+\offset
+	CFI_REL_OFFSET rdx, RDX+\offset
+	CFI_REL_OFFSET rcx, RCX+\offset
+	CFI_REL_OFFSET rax, RAX+\offset
+	CFI_REL_OFFSET r8, R8+\offset
+	CFI_REL_OFFSET r9, R9+\offset
+	CFI_REL_OFFSET r10, R10+\offset
+	CFI_REL_OFFSET r11, R11+\offset
 	CFI_REL_OFFSET rbx, RBX+\offset
 	CFI_REL_OFFSET rbp, RBP+\offset
 	CFI_REL_OFFSET r12, R12+\offset
@@ -218,105 +171,30 @@
 	CFI_REL_OFFSET r15, R15+\offset
 	.endm
 
-ENTRY(save_paranoid)
-	XCPT_FRAME 1 RDI+8
-	cld
-	movq %rdi, RDI+8(%rsp)
-	movq %rsi, RSI+8(%rsp)
-	movq_cfi rdx, RDX+8
-	movq_cfi rcx, RCX+8
-	movq_cfi rax, RAX+8
-	movq %r8, R8+8(%rsp)
-	movq %r9, R9+8(%rsp)
-	movq %r10, R10+8(%rsp)
-	movq %r11, R11+8(%rsp)
-	movq_cfi rbx, RBX+8
-	movq %rbp, RBP+8(%rsp)
-	movq %r12, R12+8(%rsp)
-	movq %r13, R13+8(%rsp)
-	movq %r14, R14+8(%rsp)
-	movq %r15, R15+8(%rsp)
-	movl $1,%ebx
-	movl $MSR_GS_BASE,%ecx
-	rdmsr
-	testl %edx,%edx
-	js 1f	/* negative -> in kernel */
-	SWAPGS
-	xorl %ebx,%ebx
-1:	ret
-	CFI_ENDPROC
-END(save_paranoid)
-
 /*
- * A newly forked process directly context switches into this address.
+ * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
  *
- * rdi: prev task we switched from
- */
-ENTRY(ret_from_fork)
-	DEFAULT_FRAME
-
-	LOCK ; btr $TIF_FORK,TI_flags(%r8)
-
-	pushq_cfi $0x0002
-	popfq_cfi				# reset kernel eflags
-
-	call schedule_tail			# rdi: 'prev' task parameter
-
-	GET_THREAD_INFO(%rcx)
-
-	RESTORE_REST
-
-	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread?
-	jz   1f
-
-	/*
-	 * By the time we get here, we have no idea whether our pt_regs,
-	 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
-	 * the slow path, or one of the ia32entry paths.
-	 * Use int_ret_from_sys_call to return, since it can safely handle
-	 * all of the above.
-	 */
-	jmp  int_ret_from_sys_call
-
-1:
-	subq $REST_SKIP, %rsp	# leave space for volatiles
-	CFI_ADJUST_CFA_OFFSET	REST_SKIP
-	movq %rbp, %rdi
-	call *%rbx
-	movl $0, RAX(%rsp)
-	RESTORE_REST
-	jmp int_ret_from_sys_call
-	CFI_ENDPROC
-END(ret_from_fork)
-
-/*
- * System call entry. Up to 6 arguments in registers are supported.
+ * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
  *
- * SYSCALL does not save anything on the stack and does not change the
- * stack pointer.  However, it does mask the flags register for us, so
- * CLD and CLAC are not needed.
- */
-
-/*
- * Register setup:
+ * Registers on entry:
  * rax  system call number
+ * rcx  return address
+ * r11  saved rflags (note: r11 is a callee-clobbered register in the C ABI)
  * rdi  arg0
- * rcx  return address for syscall/sysret, C arg3
  * rsi  arg1
  * rdx  arg2
- * r10  arg3 	(--> moved to rcx for C)
+ * r10  arg3 (needs to be moved to rcx to conform to C ABI)
  * r8   arg4
  * r9   arg5
- * r11  eflags for syscall/sysret, temporary for C
- * r12-r15,rbp,rbx saved by C code, not touched.
+ * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
  *
- * Interrupts are off on entry.
  * Only called from user space.
  *
- * XXX	if we had a free scratch register we could save the RSP into the stack frame
- *      and report it properly in ps. Unfortunately we haven't.
- *
- * When user can change the frames always force IRET. That is because
+ * When the user can change pt_regs->foo, always force IRET. That is because
  * it deals with uncanonical addresses better. SYSRET has trouble
  * with them due to bugs in both AMD and Intel CPUs.
  */
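For reference (not part of the patch): the opportunistic-SYSRET code further down only trusts a return RIP that is canonical (and, in this version, in the user half of the address space). A standalone sketch of the canonical-address predicate, assuming 48-bit virtual addresses (__VIRTUAL_MASK_SHIFT == 47) and a 64-bit unsigned long:

	/* Canonical iff bits 63..47 are all copies of bit 47. */
	static inline int is_canonical_sketch(unsigned long addr)
	{
		return ((long)(addr << 16) >> 16) == (long)addr;
	}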
@@ -324,9 +202,15 @@
 ENTRY(system_call)
 	CFI_STARTPROC	simple
 	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,KERNEL_STACK_OFFSET
+	CFI_DEF_CFA	rsp,0
 	CFI_REGISTER	rip,rcx
 	/*CFI_REGISTER	rflags,r11*/
+
+	/*
+	 * Interrupts are off on entry.
+	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON;
+	 * it is too small to ever cause noticeable irq latency.
+	 */
 	SWAPGS_UNSAFE_STACK
 	/*
 	 * A hypervisor implementation might want to use a label
@@ -335,18 +219,37 @@
 	 */
 GLOBAL(system_call_after_swapgs)
 
-	movq	%rsp,PER_CPU_VAR(old_rsp)
+	movq	%rsp,PER_CPU_VAR(rsp_scratch)
 	movq	PER_CPU_VAR(kernel_stack),%rsp
+
+	/* Construct struct pt_regs on stack */
+	pushq_cfi $__USER_DS			/* pt_regs->ss */
+	pushq_cfi PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
 	/*
-	 * No need to follow this irqs off/on section - it's straight
-	 * and short:
+	 * Re-enable interrupts.
+	 * We use 'rsp_scratch' as a scratch space, hence the irq-off block above
+	 * must execute atomically in the face of possible interrupt-driven
+	 * task preemption. We must enable interrupts only after we're done
+	 * with using rsp_scratch:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_ARGS 8, 0, rax_enosys=1
-	movq_cfi rax,(ORIG_RAX-ARGOFFSET)
-	movq  %rcx,RIP-ARGOFFSET(%rsp)
-	CFI_REL_OFFSET rip,RIP-ARGOFFSET
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	pushq_cfi	%r11			/* pt_regs->flags */
+	pushq_cfi	$__USER_CS		/* pt_regs->cs */
+	pushq_cfi	%rcx			/* pt_regs->ip */
+	CFI_REL_OFFSET rip,0
+	pushq_cfi_reg	rax			/* pt_regs->orig_ax */
+	pushq_cfi_reg	rdi			/* pt_regs->di */
+	pushq_cfi_reg	rsi			/* pt_regs->si */
+	pushq_cfi_reg	rdx			/* pt_regs->dx */
+	pushq_cfi_reg	rcx			/* pt_regs->cx */
+	pushq_cfi	$-ENOSYS		/* pt_regs->ax */
+	pushq_cfi_reg	r8			/* pt_regs->r8 */
+	pushq_cfi_reg	r9			/* pt_regs->r9 */
+	pushq_cfi_reg	r10			/* pt_regs->r10 */
+	pushq_cfi_reg	r11			/* pt_regs->r11 */
+	sub	$(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
+
+	testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz tracesys
 system_call_fastpath:
 #if __SYSCALL_MASK == ~0
@@ -358,12 +261,13 @@
 	ja ret_from_sys_call  /* and return regs->ax */
 	movq %r10,%rcx
 	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
-	movq %rax,RAX-ARGOFFSET(%rsp)
+	movq %rax,RAX(%rsp)
 /*
  * Syscall return path ending with SYSRET (fast path)
- * Has incomplete stack frame and undefined top of stack.
+ * Has incompletely filled pt_regs; the iret frame is also incomplete.
  */
 ret_from_sys_call:
+
 	LOCKDEP_SYS_EXIT
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
@@ -376,52 +280,54 @@
 	 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
 	 * very bad.
 	 */
-	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	jnz int_ret_from_sys_call_fixup	/* Go the the slow path */
+	testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jnz int_ret_from_sys_call_irqs_off	/* Go to the slow path */
 
 	CFI_REMEMBER_STATE
 	/*
 	 * sysretq will re-enable interrupts:
 	 */
 	TRACE_IRQS_ON
-	movq RIP-ARGOFFSET(%rsp),%rcx
+	RESTORE_C_REGS_EXCEPT_RCX_R11
+	movq	RIP(%rsp),%rcx
 	CFI_REGISTER	rip,rcx
-	RESTORE_ARGS 1,-ARG_SKIP,0
+	movq	EFLAGS(%rsp),%r11
 	/*CFI_REGISTER	rflags,r11*/
-	movq	PER_CPU_VAR(old_rsp), %rsp
+	movq	RSP(%rsp),%rsp
+	/*
+	 * 64bit SYSRET restores rip from rcx,
+	 * rflags from r11 (but RF and VM bits are forced to 0),
+	 * cs and ss are loaded from MSRs.
+	 */
 	USERGS_SYSRET64
 
 	CFI_RESTORE_STATE
 
-int_ret_from_sys_call_fixup:
-	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
-	jmp int_ret_from_sys_call_irqs_off
-
-	/* Do syscall tracing */
+	/* Do syscall entry tracing */
 tracesys:
-	leaq -REST_SKIP(%rsp), %rdi
+	movq %rsp, %rdi
 	movq $AUDIT_ARCH_X86_64, %rsi
 	call syscall_trace_enter_phase1
 	test %rax, %rax
 	jnz tracesys_phase2		/* if needed, run the slow path */
-	LOAD_ARGS 0			/* else restore clobbered regs */
+	RESTORE_C_REGS_EXCEPT_RAX	/* else restore clobbered regs */
+	movq ORIG_RAX(%rsp), %rax
 	jmp system_call_fastpath	/*      and return to the fast path */
 
 tracesys_phase2:
-	SAVE_REST
-	FIXUP_TOP_OF_STACK %rdi
+	SAVE_EXTRA_REGS
 	movq %rsp, %rdi
 	movq $AUDIT_ARCH_X86_64, %rsi
 	movq %rax,%rdx
 	call syscall_trace_enter_phase2
 
 	/*
-	 * Reload arg registers from stack in case ptrace changed them.
+	 * Reload registers from stack in case ptrace changed them.
 	 * We don't reload %rax because syscall_trace_entry_phase2() returned
 	 * the value it wants us to use in the table lookup.
 	 */
-	LOAD_ARGS ARGOFFSET, 1
-	RESTORE_REST
+	RESTORE_C_REGS_EXCEPT_RAX
+	RESTORE_EXTRA_REGS
 #if __SYSCALL_MASK == ~0
 	cmpq $__NR_syscall_max,%rax
 #else
@@ -431,12 +337,12 @@
 	ja   int_ret_from_sys_call	/* RAX(%rsp) is already set */
 	movq %r10,%rcx	/* fixup for C */
 	call *sys_call_table(,%rax,8)
-	movq %rax,RAX-ARGOFFSET(%rsp)
-	/* Use IRET because user could have changed frame */
+	movq %rax,RAX(%rsp)
+	/* Use IRET because user could have changed pt_regs->foo */
 
 /*
  * Syscall return path ending with IRET.
- * Has correct top of stack, but partial stack frame.
+ * Has correct iret frame.
  */
 GLOBAL(int_ret_from_sys_call)
 	DISABLE_INTERRUPTS(CLBR_NONE)
@@ -468,12 +374,11 @@
 	TRACE_IRQS_OFF
 	jmp int_with_check
 
-	/* handle signals and tracing -- both require a full stack frame */
+	/* handle signals and tracing -- both require a full pt_regs */
 int_very_careful:
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-int_check_syscall_exit_work:
-	SAVE_REST
+	SAVE_EXTRA_REGS
 	/* Check for syscall exit trace */
 	testl $_TIF_WORK_SYSCALL_EXIT,%edx
 	jz int_signal
@@ -492,7 +397,7 @@
 	call do_notify_resume
 1:	movl $_TIF_WORK_MASK,%edi
 int_restore_rest:
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	jmp int_with_check
@@ -502,45 +407,26 @@
 	.macro FORK_LIKE func
 ENTRY(stub_\func)
 	CFI_STARTPROC
-	popq	%r11			/* save return address */
-	PARTIAL_FRAME 0
-	SAVE_REST
-	pushq	%r11			/* put it back on stack */
-	FIXUP_TOP_OF_STACK %r11, 8
-	DEFAULT_FRAME 0 8		/* offset 8: return address */
+	DEFAULT_FRAME 0, 8		/* offset 8: return address */
+	SAVE_EXTRA_REGS 8
 	call sys_\func
-	RESTORE_TOP_OF_STACK %r11, 8
-	ret $REST_SKIP		/* pop extended registers */
-	CFI_ENDPROC
-END(stub_\func)
-	.endm
-
-	.macro FIXED_FRAME label,func
-ENTRY(\label)
-	CFI_STARTPROC
-	PARTIAL_FRAME 0 8		/* offset 8: return address */
-	FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
-	call \func
-	RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
 	ret
 	CFI_ENDPROC
-END(\label)
+END(stub_\func)
 	.endm
 
 	FORK_LIKE  clone
 	FORK_LIKE  fork
 	FORK_LIKE  vfork
-	FIXED_FRAME stub_iopl, sys_iopl
 
 ENTRY(stub_execve)
 	CFI_STARTPROC
 	addq $8, %rsp
-	PARTIAL_FRAME 0
-	SAVE_REST
-	FIXUP_TOP_OF_STACK %r11
+	DEFAULT_FRAME 0
+	SAVE_EXTRA_REGS
 	call sys_execve
 	movq %rax,RAX(%rsp)
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_execve)
@@ -548,13 +434,11 @@
 ENTRY(stub_execveat)
 	CFI_STARTPROC
 	addq $8, %rsp
-	PARTIAL_FRAME 0
-	SAVE_REST
-	FIXUP_TOP_OF_STACK %r11
+	DEFAULT_FRAME 0
+	SAVE_EXTRA_REGS
 	call sys_execveat
-	RESTORE_TOP_OF_STACK %r11
 	movq %rax,RAX(%rsp)
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_execveat)
@@ -566,12 +450,11 @@
 ENTRY(stub_rt_sigreturn)
 	CFI_STARTPROC
 	addq $8, %rsp
-	PARTIAL_FRAME 0
-	SAVE_REST
-	FIXUP_TOP_OF_STACK %r11
+	DEFAULT_FRAME 0
+	SAVE_EXTRA_REGS
 	call sys_rt_sigreturn
 	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_rt_sigreturn)
@@ -580,12 +463,11 @@
 ENTRY(stub_x32_rt_sigreturn)
 	CFI_STARTPROC
 	addq $8, %rsp
-	PARTIAL_FRAME 0
-	SAVE_REST
-	FIXUP_TOP_OF_STACK %r11
+	DEFAULT_FRAME 0
+	SAVE_EXTRA_REGS
 	call sys32_x32_rt_sigreturn
 	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_x32_rt_sigreturn)
@@ -593,13 +475,11 @@
 ENTRY(stub_x32_execve)
 	CFI_STARTPROC
 	addq $8, %rsp
-	PARTIAL_FRAME 0
-	SAVE_REST
-	FIXUP_TOP_OF_STACK %r11
+	DEFAULT_FRAME 0
+	SAVE_EXTRA_REGS
 	call compat_sys_execve
-	RESTORE_TOP_OF_STACK %r11
 	movq %rax,RAX(%rsp)
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_x32_execve)
@@ -607,13 +487,11 @@
 ENTRY(stub_x32_execveat)
 	CFI_STARTPROC
 	addq $8, %rsp
-	PARTIAL_FRAME 0
-	SAVE_REST
-	FIXUP_TOP_OF_STACK %r11
+	DEFAULT_FRAME 0
+	SAVE_EXTRA_REGS
 	call compat_sys_execveat
-	RESTORE_TOP_OF_STACK %r11
 	movq %rax,RAX(%rsp)
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_x32_execveat)
@@ -621,6 +499,46 @@
 #endif
 
 /*
+ * A newly forked process directly context switches into this address.
+ *
+ * rdi: prev task we switched from
+ */
+ENTRY(ret_from_fork)
+	DEFAULT_FRAME
+
+	LOCK ; btr $TIF_FORK,TI_flags(%r8)
+
+	pushq_cfi $0x0002
+	popfq_cfi				# reset kernel eflags
+
+	call schedule_tail			# rdi: 'prev' task parameter
+
+	GET_THREAD_INFO(%rcx)
+
+	RESTORE_EXTRA_REGS
+
+	testl $3,CS(%rsp)			# from kernel_thread?
+	jz   1f
+
+	/*
+	 * By the time we get here, we have no idea whether our pt_regs,
+	 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
+	 * the slow path, or one of the ia32entry paths.
+	 * Use int_ret_from_sys_call to return, since it can safely handle
+	 * all of the above.
+	 */
+	jmp  int_ret_from_sys_call
+
+1:
+	movq %rbp, %rdi
+	call *%rbx
+	movl $0, RAX(%rsp)
+	RESTORE_EXTRA_REGS
+	jmp int_ret_from_sys_call
+	CFI_ENDPROC
+END(ret_from_fork)
+
+/*
  * Build the entry stubs and pointer table with some assembler magic.
  * We pack 7 stubs into a single 32-byte chunk, which will fit in a
  * single cache line on all modern x86 implementations.
@@ -669,47 +587,45 @@
 
 /* 0(%rsp): ~(interrupt number) */
 	.macro interrupt func
-	/* reserve pt_regs for scratch regs and rbp */
-	subq $ORIG_RAX-RBP, %rsp
-	CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
 	cld
-	/* start from rbp in pt_regs and jump over */
-	movq_cfi rdi, (RDI-RBP)
-	movq_cfi rsi, (RSI-RBP)
-	movq_cfi rdx, (RDX-RBP)
-	movq_cfi rcx, (RCX-RBP)
-	movq_cfi rax, (RAX-RBP)
-	movq_cfi  r8,  (R8-RBP)
-	movq_cfi  r9,  (R9-RBP)
-	movq_cfi r10, (R10-RBP)
-	movq_cfi r11, (R11-RBP)
+	/*
+	 * Since nothing in interrupt handling code touches r12...r15 members
+	 * of "struct pt_regs", and since interrupts can nest, we can save
+	 * four stack slots and simultaneously provide
+	 * an unwind-friendly stack layout by saving "truncated" pt_regs
+	 * exactly up to rbp slot, without these members.
+	 */
+	ALLOC_PT_GPREGS_ON_STACK -RBP
+	SAVE_C_REGS -RBP
+	/* this goes to 0(%rsp) for unwinder, not for saving the value: */
+	SAVE_EXTRA_REGS_RBP -RBP
 
-	/* Save rbp so that we can unwind from get_irq_regs() */
-	movq_cfi rbp, 0
+	leaq -RBP(%rsp),%rdi	/* arg1 for \func (pointer to pt_regs) */
 
-	/* Save previous stack value */
-	movq %rsp, %rsi
-
-	leaq -RBP(%rsp),%rdi	/* arg1 for handler */
-	testl $3, CS-RBP(%rsi)
+	testl $3, CS-RBP(%rsp)
 	je 1f
 	SWAPGS
+1:
 	/*
+	 * Save previous stack pointer, optionally switch to interrupt stack.
 	 * irq_count is used to check if a CPU is already on an interrupt stack
 	 * or not. While this is essentially redundant with preempt_count it is
 	 * a little cheaper to use a separate counter in the PDA (short of
 	 * moving irq_enter into assembly, which would be too much work)
 	 */
-1:	incl PER_CPU_VAR(irq_count)
+	movq %rsp, %rsi
+	incl PER_CPU_VAR(irq_count)
 	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
 	CFI_DEF_CFA_REGISTER	rsi
-
-	/* Store previous stack value */
 	pushq %rsi
+	/*
+	 * For debugger:
+	 * "CFA (Canonical Frame Address) is the value on stack + offset"
+	 */
 	CFI_ESCAPE	0x0f /* DW_CFA_def_cfa_expression */, 6, \
-			0x77 /* DW_OP_breg7 */, 0, \
+			0x77 /* DW_OP_breg7 (rsp) */, 0, \
 			0x06 /* DW_OP_deref */, \
-			0x08 /* DW_OP_const1u */, SS+8-RBP, \
+			0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
 			0x22 /* DW_OP_plus */
 	/* We entered an interrupt context - irqs are off: */
 	TRACE_IRQS_OFF
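
As a quick aside on the "truncated" pt_regs comment in the hunk above: only the r12..r15 slots are omitted, so each nested interrupt frame shrinks by four 8-byte slots. A small stand-alone sketch of that arithmetic (the struct below is an illustrative mirror of the pt_regs slot order, not the kernel's asm-offsets):

#include <stdio.h>
#include <stddef.h>

/* Illustrative mirror of the x86-64 pt_regs slot order (lowest address first). */
struct pt_regs_sketch {
	unsigned long r15, r14, r13, r12;		/* "extra" regs, skipped here  */
	unsigned long rbp, rbx;				/* saved from the rbp slot up  */
	unsigned long r11, r10, r9, r8;
	unsigned long rax, rcx, rdx, rsi, rdi;		/* caller-clobbered "C" regs   */
	unsigned long orig_rax, rip, cs, eflags, rsp, ss;
};

int main(void)
{
	size_t full      = sizeof(struct pt_regs_sketch);
	size_t truncated = full - offsetof(struct pt_regs_sketch, rbp);

	printf("full frame: %zu bytes, truncated frame: %zu bytes, saved per nesting level: %zu\n",
	       full, truncated, full - truncated);
	return 0;
}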
@@ -727,7 +643,7 @@
 	ASM_CLAC
 	addq $-0x80,(%rsp)		/* Adjust vector to [-256,-1] range */
 	interrupt do_IRQ
-	/* 0(%rsp): old_rsp-ARGOFFSET */
+	/* 0(%rsp): old RSP */
 ret_from_intr:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
@@ -735,19 +651,20 @@
 
 	/* Restore saved previous stack */
 	popq %rsi
-	CFI_DEF_CFA rsi,SS+8-RBP	/* reg/off reset after def_cfa_expr */
-	leaq ARGOFFSET-RBP(%rsi), %rsp
+	CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */
+	/* return code expects complete pt_regs - adjust rsp accordingly: */
+	leaq -RBP(%rsi),%rsp
 	CFI_DEF_CFA_REGISTER	rsp
-	CFI_ADJUST_CFA_OFFSET	RBP-ARGOFFSET
+	CFI_ADJUST_CFA_OFFSET	RBP
 
 exit_intr:
 	GET_THREAD_INFO(%rcx)
-	testl $3,CS-ARGOFFSET(%rsp)
+	testl $3,CS(%rsp)
 	je retint_kernel
 
 	/* Interrupt came from user space */
 	/*
-	 * Has a correct top of stack, but a partial stack frame
+	 * Has a correct top of stack.
 	 * %rcx: thread info. Interrupts off.
 	 */
 retint_with_reschedule:
@@ -770,8 +687,8 @@
 	 * Try to use SYSRET instead of IRET if we're returning to
 	 * a completely clean 64-bit userspace context.
 	 */
-	movq (RCX-R11)(%rsp), %rcx
-	cmpq %rcx,(RIP-R11)(%rsp)		/* RCX == RIP */
+	movq RCX(%rsp),%rcx
+	cmpq %rcx,RIP(%rsp)		/* RCX == RIP */
 	jne opportunistic_sysret_failed
 
 	/*
@@ -792,19 +709,19 @@
 	shr $__VIRTUAL_MASK_SHIFT, %rcx
 	jnz opportunistic_sysret_failed
 
-	cmpq $__USER_CS,(CS-R11)(%rsp)		/* CS must match SYSRET */
+	cmpq $__USER_CS,CS(%rsp)	/* CS must match SYSRET */
 	jne opportunistic_sysret_failed
 
-	movq (R11-ARGOFFSET)(%rsp), %r11
-	cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp)	/* R11 == RFLAGS */
+	movq R11(%rsp),%r11
+	cmpq %r11,EFLAGS(%rsp)		/* R11 == RFLAGS */
 	jne opportunistic_sysret_failed
 
-	testq $X86_EFLAGS_RF,%r11		/* sysret can't restore RF */
+	testq $X86_EFLAGS_RF,%r11	/* sysret can't restore RF */
 	jnz opportunistic_sysret_failed
 
 	/* nothing to check for RSP */
 
-	cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp)	/* SS must match SYSRET */
+	cmpq $__USER_DS,SS(%rsp)	/* SS must match SYSRET */
 	jne opportunistic_sysret_failed
 
 	/*
@@ -813,8 +730,9 @@
 	 */
 irq_return_via_sysret:
 	CFI_REMEMBER_STATE
-	RESTORE_ARGS 1,8,1
-	movq (RSP-RIP)(%rsp),%rsp
+	/* r11 is already restored (see code above) */
+	RESTORE_C_REGS_EXCEPT_R11
+	movq RSP(%rsp),%rsp
 	USERGS_SYSRET64
 	CFI_RESTORE_STATE
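
The SYSRET-vs-IRET decision spread across the hunks above boils down to a handful of checks on the saved user frame. Here is a hedged C model of that policy; the struct, the selector values and the 47-bit shift are illustrative stand-ins for the real pt_regs, __USER_CS/__USER_DS and __VIRTUAL_MASK_SHIFT:

#include <stdbool.h>
#include <stdint.h>

#define X86_EFLAGS_RF	0x00010000UL
#define USER_CS		0x33UL			/* illustrative selector values */
#define USER_DS		0x2bUL
#define VIRTUAL_MASK_SHIFT 47			/* assumed 4-level paging */

struct saved_frame {				/* hypothetical mirror of the slots used above */
	uint64_t rcx, r11, rip, cs, eflags, ss;
};

static bool can_return_via_sysret(const struct saved_frame *f)
{
	if (f->rcx != f->rip)			/* SYSRET reloads RIP from RCX       */
		return false;
	if (f->rip >> VIRTUAL_MASK_SHIFT)	/* RIP must be user-canonical        */
		return false;
	if (f->cs != USER_CS)			/* CS must match what SYSRET loads   */
		return false;
	if (f->r11 != f->eflags)		/* SYSRET reloads RFLAGS from R11    */
		return false;
	if (f->r11 & X86_EFLAGS_RF)		/* SYSRET can't restore RF           */
		return false;
	if (f->ss != USER_DS)			/* SS must match; RSP needs no check */
		return false;
	return true;				/* safe to take the fast SYSRET path */
}

int main(void)
{
	struct saved_frame f = {
		.rcx = 0x400000, .rip = 0x400000, .cs = USER_CS,
		.r11 = 0x202,    .eflags = 0x202, .ss = USER_DS,
	};
	return can_return_via_sysret(&f) ? 0 : 1;
}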
 
@@ -829,7 +747,8 @@
 	 */
 	TRACE_IRQS_IRETQ
 restore_args:
-	RESTORE_ARGS 1,8,1
+	RESTORE_C_REGS
+	REMOVE_PT_GPREGS_FROM_STACK 8
 
 irq_return:
 	INTERRUPT_RETURN
@@ -900,12 +819,12 @@
 	jz    retint_swapgs
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_REST
+	SAVE_EXTRA_REGS
 	movq $-1,ORIG_RAX(%rsp)
 	xorl %esi,%esi		# oldset
 	movq %rsp,%rdi		# &pt_regs
 	call do_notify_resume
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	GET_THREAD_INFO(%rcx)
@@ -917,7 +836,7 @@
 ENTRY(retint_kernel)
 	cmpl $0,PER_CPU_VAR(__preempt_count)
 	jnz  retint_restore_args
-	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
+	bt   $9,EFLAGS(%rsp)	/* interrupts off? */
 	jnc  retint_restore_args
 	call preempt_schedule_irq
 	jmp exit_intr
@@ -1010,7 +929,7 @@
 /*
  * Exception entry points.
  */
-#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
 
 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
 ENTRY(\sym)
@@ -1032,8 +951,7 @@
 	pushq_cfi $-1			/* ORIG_RAX: no syscall to restart */
 	.endif
 
-	subq $ORIG_RAX-R15, %rsp
-	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+	ALLOC_PT_GPREGS_ON_STACK
 
 	.if \paranoid
 	.if \paranoid == 1
@@ -1041,10 +959,11 @@
 	testl $3, CS(%rsp)		/* If coming from userspace, switch */
 	jnz 1f				/* stacks. */
 	.endif
-	call save_paranoid
+	call paranoid_entry
 	.else
 	call error_entry
 	.endif
+	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
 
 	DEFAULT_FRAME 0
 
@@ -1066,19 +985,20 @@
 	.endif
 
 	.if \shift_ist != -1
-	subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+	subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
 	.endif
 
 	call \do_sym
 
 	.if \shift_ist != -1
-	addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+	addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
 	.endif
 
+	/* these procedures expect "no swapgs" flag in ebx */
 	.if \paranoid
-	jmp paranoid_exit		/* %ebx: no swapgs flag */
+	jmp paranoid_exit
 	.else
-	jmp error_exit			/* %ebx: no swapgs flag */
+	jmp error_exit
 	.endif
 
 	.if \paranoid == 1
@@ -1282,7 +1202,9 @@
 	addq $0x30,%rsp
 	CFI_ADJUST_CFA_OFFSET -0x30
 	pushq_cfi $-1 /* orig_ax = -1 => not a system call */
-	SAVE_ALL
+	ALLOC_PT_GPREGS_ON_STACK
+	SAVE_C_REGS
+	SAVE_EXTRA_REGS
 	jmp error_exit
 	CFI_ENDPROC
 END(xen_failsafe_callback)
@@ -1314,59 +1236,66 @@
 idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
 #endif
 
-	/*
-	 * "Paranoid" exit path from exception stack.  This is invoked
-	 * only on return from non-NMI IST interrupts that came
-	 * from kernel space.
-	 *
-	 * We may be returning to very strange contexts (e.g. very early
-	 * in syscall entry), so checking for preemption here would
-	 * be complicated.  Fortunately, we there's no good reason
-	 * to try to handle preemption here.
-	 */
+/*
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Use slow, but surefire "are we in kernel?" check.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ */
+ENTRY(paranoid_entry)
+	XCPT_FRAME 1 15*8
+	cld
+	SAVE_C_REGS 8
+	SAVE_EXTRA_REGS 8
+	movl $1,%ebx
+	movl $MSR_GS_BASE,%ecx
+	rdmsr
+	testl %edx,%edx
+	js 1f	/* negative -> in kernel */
+	SWAPGS
+	xorl %ebx,%ebx
+1:	ret
+	CFI_ENDPROC
+END(paranoid_entry)
 
-	/* ebx:	no swapgs flag */
+/*
+ * "Paranoid" exit path from exception stack.  This is invoked
+ * only on return from non-NMI IST interrupts that came
+ * from kernel space.
+ *
+ * We may be returning to very strange contexts (e.g. very early
+ * in syscall entry), so checking for preemption here would
+ * be complicated.  Fortunately, there's no good reason
+ * to try to handle preemption here.
+ */
+/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
 ENTRY(paranoid_exit)
 	DEFAULT_FRAME
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF_DEBUG
 	testl %ebx,%ebx				/* swapgs needed? */
-	jnz paranoid_restore
-	TRACE_IRQS_IRETQ 0
+	jnz paranoid_exit_no_swapgs
+	TRACE_IRQS_IRETQ
 	SWAPGS_UNSAFE_STACK
-	RESTORE_ALL 8
-	INTERRUPT_RETURN
-paranoid_restore:
-	TRACE_IRQS_IRETQ_DEBUG 0
-	RESTORE_ALL 8
+	jmp paranoid_exit_restore
+paranoid_exit_no_swapgs:
+	TRACE_IRQS_IRETQ_DEBUG
+paranoid_exit_restore:
+	RESTORE_EXTRA_REGS
+	RESTORE_C_REGS
+	REMOVE_PT_GPREGS_FROM_STACK 8
 	INTERRUPT_RETURN
 	CFI_ENDPROC
 END(paranoid_exit)
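
The %ebx protocol that paranoid_entry establishes and paranoid_exit consumes can be summarized in a few lines of pseudo-C. This is only a behavioral sketch of the flag handshake (helper names and the printed strings are made up); the real decision is the sign test on MSR_GS_BASE shown above:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Returns the "no swapgs needed" flag the asm keeps in %ebx. */
static bool paranoid_entry_model(uint64_t gs_base, bool *swapped)
{
	if ((int64_t)gs_base < 0) {	/* negative -> already the kernel GS base */
		*swapped = false;
		return true;		/* ebx = 1: leave GS alone on exit */
	}
	*swapped = true;		/* SWAPGS to the kernel GS base */
	return false;			/* ebx = 0: must SWAPGS back on exit */
}

static void paranoid_exit_model(bool no_swapgs)
{
	if (!no_swapgs)
		puts("SWAPGS back to the user GS base, then restore regs + IRET");
	else
		puts("restore regs + IRET without touching GS");
}

int main(void)
{
	bool swapped;
	bool ebx = paranoid_entry_model(0x00007f1234560000ULL, &swapped); /* user-like base */

	paranoid_exit_model(ebx);
	return 0;
}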
 
 /*
- * Exception entry point. This expects an error code/orig_rax on the stack.
- * returns in "no swapgs flag" in %ebx.
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
  */
 ENTRY(error_entry)
-	XCPT_FRAME
-	CFI_ADJUST_CFA_OFFSET 15*8
-	/* oldrax contains error code */
+	XCPT_FRAME 1 15*8
 	cld
-	movq %rdi, RDI+8(%rsp)
-	movq %rsi, RSI+8(%rsp)
-	movq %rdx, RDX+8(%rsp)
-	movq %rcx, RCX+8(%rsp)
-	movq %rax, RAX+8(%rsp)
-	movq  %r8,  R8+8(%rsp)
-	movq  %r9,  R9+8(%rsp)
-	movq %r10, R10+8(%rsp)
-	movq %r11, R11+8(%rsp)
-	movq_cfi rbx, RBX+8
-	movq %rbp, RBP+8(%rsp)
-	movq %r12, R12+8(%rsp)
-	movq %r13, R13+8(%rsp)
-	movq %r14, R14+8(%rsp)
-	movq %r15, R15+8(%rsp)
+	SAVE_C_REGS 8
+	SAVE_EXTRA_REGS 8
 	xorl %ebx,%ebx
 	testl $3,CS+8(%rsp)
 	je error_kernelspace
@@ -1376,12 +1305,12 @@
 	TRACE_IRQS_OFF
 	ret
 
-/*
- * There are two places in the kernel that can potentially fault with
- * usergs. Handle them here.  B stepping K8s sometimes report a
- * truncated RIP for IRET exceptions returning to compat mode. Check
- * for these here too.
- */
+	/*
+	 * There are two places in the kernel that can potentially fault with
+	 * usergs. Handle them here.  B stepping K8s sometimes report a
+	 * truncated RIP for IRET exceptions returning to compat mode. Check
+	 * for these here too.
+	 */
 error_kernelspace:
 	CFI_REL_OFFSET rcx, RCX+8
 	incl %ebx
@@ -1411,11 +1340,11 @@
 END(error_entry)
 
 
-/* ebx:	no swapgs flag (1: don't need swapgs, 0: need it) */
+/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
 ENTRY(error_exit)
 	DEFAULT_FRAME
 	movl %ebx,%eax
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	GET_THREAD_INFO(%rcx)
@@ -1597,7 +1526,7 @@
 	.rept 5
 	pushq_cfi 11*8(%rsp)
 	.endr
-	CFI_DEF_CFA_OFFSET SS+8-RIP
+	CFI_DEF_CFA_OFFSET 5*8
 
 	/* Everything up to here is safe from nested NMIs */
 
@@ -1625,7 +1554,7 @@
 	pushq_cfi -6*8(%rsp)
 	.endr
 	subq $(5*8), %rsp
-	CFI_DEF_CFA_OFFSET SS+8-RIP
+	CFI_DEF_CFA_OFFSET 5*8
 end_repeat_nmi:
 
 	/*
@@ -1634,16 +1563,16 @@
 	 * so that we repeat another NMI.
 	 */
 	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
-	subq $ORIG_RAX-R15, %rsp
-	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+	ALLOC_PT_GPREGS_ON_STACK
+
 	/*
-	 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
+	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
 	 * as we should not be calling schedule in NMI context.
 	 * Even with normal interrupts enabled. An NMI should not be
 	 * setting NEED_RESCHED or anything that normal interrupts and
 	 * exceptions might do.
 	 */
-	call save_paranoid
+	call paranoid_entry
 	DEFAULT_FRAME 0
 
 	/*
@@ -1674,8 +1603,10 @@
 nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
+	RESTORE_EXTRA_REGS
+	RESTORE_C_REGS
 	/* Pop the extra iret frame at once */
-	RESTORE_ALL 6*8
+	REMOVE_PT_GPREGS_FROM_STACK 6*8
 
 	/* Clear the NMI executing stack variable */
 	movq $0, 5*8(%rsp)
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f36bd42..d031bad 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -22,6 +22,7 @@
 #include <asm/cpufeature.h>
 #include <asm/percpu.h>
 #include <asm/nops.h>
+#include <asm/bootparam.h>
 
 /* Physical address */
 #define pa(X) ((X) - __PAGE_OFFSET)
@@ -90,7 +91,7 @@
 	
 	/* test KEEP_SEGMENTS flag to see if the bootloader is asking
 		us to not reload segments */
-	testb $(1<<6), BP_loadflags(%esi)
+	testb $KEEP_SEGMENTS, BP_loadflags(%esi)
 	jnz 2f
 
 /*
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6fd514d9..ae6588b 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -1,5 +1,5 @@
 /*
- *  linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
+ *  linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit
  *
  *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
  *  Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
@@ -56,7 +56,7 @@
 	 * %rsi holds a physical pointer to real_mode_data.
 	 *
 	 * We come here either directly from a 64bit bootloader, or from
-	 * arch/x86_64/boot/compressed/head.S.
+	 * arch/x86/boot/compressed/head_64.S.
 	 *
 	 * We only come here initially at boot; nothing else comes here.
 	 *
@@ -146,7 +146,7 @@
 	leaq	level2_kernel_pgt(%rip), %rdi
 	leaq	4096(%rdi), %r8
 	/* See if it is a valid page table entry */
-1:	testq	$1, 0(%rdi)
+1:	testb	$1, 0(%rdi)
 	jz	2f
 	addq	%rbp, 0(%rdi)
 	/* Go to the next page */
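
The testq->testb change above works because the bit being probed (the Present bit of a page-table entry) is bit 0, entirely within the low byte, so a one-byte test is equivalent and encodes shorter. The relocate_kernel_{32,64}.S hunks further down make the same low-byte argument for the kexec flag bits. A tiny sketch (the _PAGE_PRESENT name is borrowed from the kernel, the rest is illustrative):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define _PAGE_PRESENT	0x001ULL	/* bit 0 of a page-table entry */

static bool pte_present_full(uint64_t pte)	/* what "testq $1, (%rdi)" checks */
{
	return pte & _PAGE_PRESENT;
}

static bool pte_present_lowbyte(uint64_t pte)	/* what "testb $1, (%rdi)" checks */
{
	return (uint8_t)pte & _PAGE_PRESENT;
}

int main(void)
{
	uint64_t pte = 0x00000000dead0063ULL;	/* example entry with the P bit set */

	assert(pte_present_full(pte) == pte_present_lowbyte(pte));
	assert(pte_present_full(pte ^ 1) == pte_present_lowbyte(pte ^ 1));
	return 0;
}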
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index d5651fc..29c740d 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -68,7 +68,7 @@
 static inline bool interrupted_user_mode(void)
 {
 	struct pt_regs *regs = get_irq_regs();
-	return regs && user_mode_vm(regs);
+	return regs && user_mode(regs);
 }
 
 /*
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 4ddaf66..37dae79 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -54,7 +54,7 @@
 	 * because the ->io_bitmap_max value must match the bitmap
 	 * contents:
 	 */
-	tss = &per_cpu(init_tss, get_cpu());
+	tss = &per_cpu(cpu_tss, get_cpu());
 
 	if (turn_on)
 		bitmap_clear(t->io_bitmap_ptr, from, num);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 28d28f5e..f9fd86a 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -165,7 +165,7 @@
 	if (unlikely(!desc))
 		return false;
 
-	if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
+	if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
 		if (unlikely(overflow))
 			print_stack_overflow();
 		desc->handle_irq(irq, desc);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index e4b503d..394e643 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -44,7 +44,7 @@
 	u64 estack_top, estack_bottom;
 	u64 curbase = (u64)task_stack_page(current);
 
-	if (user_mode_vm(regs))
+	if (user_mode(regs))
 		return;
 
 	if (regs->sp >= curbase + sizeof(struct thread_info) +
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 7ec1d5f..7fe3a9d 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -126,11 +126,11 @@
 #ifdef CONFIG_X86_32
 	switch (regno) {
 	case GDB_SS:
-		if (!user_mode_vm(regs))
+		if (!user_mode(regs))
 			*(unsigned long *)mem = __KERNEL_DS;
 		break;
 	case GDB_SP:
-		if (!user_mode_vm(regs))
+		if (!user_mode(regs))
 			*(unsigned long *)mem = kernel_stack_pointer(regs);
 		break;
 	case GDB_GS:
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 4e3d5a9..24d0796 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -602,7 +602,7 @@
 	struct kprobe *p;
 	struct kprobe_ctlblk *kcb;
 
-	if (user_mode_vm(regs))
+	if (user_mode(regs))
 		return 0;
 
 	addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
@@ -1007,7 +1007,7 @@
 	struct die_args *args = data;
 	int ret = NOTIFY_DONE;
 
-	if (args->regs && user_mode_vm(args->regs))
+	if (args->regs && user_mode(args->regs))
 		return ret;
 
 	if (val == DIE_GPF) {
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index 781861c..02a8720 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -177,7 +177,7 @@
 		 * than just blindly copying user_regs.
 		 */
 		regs_user->abi = PERF_SAMPLE_REGS_ABI_64;
-		regs_user_copy->sp = this_cpu_read(old_rsp);
+		regs_user_copy->sp = user_regs->sp;
 		regs_user_copy->cs = __USER_CS;
 		regs_user_copy->ss = __USER_DS;
 		regs_user_copy->cx = -1;  /* usually contains garbage */
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 046e2d6..12b1cf6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -37,7 +37,26 @@
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+	.x86_tss = {
+		.sp0 = TOP_OF_INIT_STACK,
+#ifdef CONFIG_X86_32
+		.ss0 = __KERNEL_DS,
+		.ss1 = __KERNEL_CS,
+		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
+#endif
+	 },
+#ifdef CONFIG_X86_32
+	 /*
+	  * Note that the .io_bitmap member must be extra-big. This is because
+	  * the CPU will access an additional byte beyond the end of the IO
+	  * permission bitmap. The extra byte must be all 1 bits, and must
+	  * be within the limit.
+	  */
+	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },
+#endif
+};
+EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss);
 
 #ifdef CONFIG_X86_64
 static DEFINE_PER_CPU(unsigned char, is_idle);
@@ -109,7 +128,7 @@
 	unsigned long *bp = t->io_bitmap_ptr;
 
 	if (bp) {
-		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
+		struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
 
 		t->io_bitmap_ptr = NULL;
 		clear_thread_flag(TIF_IO_BITMAP);
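
The io_bitmap initializer above relies on a GCC range designator: [0 ... IO_BITMAP_LONGS] covers IO_BITMAP_LONGS + 1 elements, so the array ends with one extra all-ones word beyond the 65536-bit map proper, which provides the guard byte the comment describes. A stand-alone illustration (the constants are assumed to mirror the kernel's; the array here is not the real tss_struct member):

#include <stdio.h>

#define IO_BITMAP_BITS	65536
#define BITS_PER_LONG	(8 * (int)sizeof(long))
#define IO_BITMAP_LONGS	(IO_BITMAP_BITS / BITS_PER_LONG)

/* Same pattern as the cpu_tss initializer: note the inclusive upper bound. */
static unsigned long demo_io_bitmap[IO_BITMAP_LONGS + 1] = {
	[0 ... IO_BITMAP_LONGS] = ~0UL
};

int main(void)
{
	printf("bitmap words: %zu (the last one is the required all-ones guard)\n",
	       sizeof(demo_io_bitmap) / sizeof(demo_io_bitmap[0]));
	return 0;
}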
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 603c4f9..8ed2106 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -73,7 +73,7 @@
 	unsigned long sp;
 	unsigned short ss, gs;
 
-	if (user_mode_vm(regs)) {
+	if (user_mode(regs)) {
 		sp = regs->sp;
 		ss = regs->ss & 0xffff;
 		gs = get_user_gs(regs);
@@ -206,11 +206,7 @@
 	regs->ip		= new_ip;
 	regs->sp		= new_sp;
 	regs->flags		= X86_EFLAGS_IF;
-	/*
-	 * force it to the iret return path by making it look as if there was
-	 * some work pending.
-	 */
-	set_thread_flag(TIF_NOTIFY_RESUME);
+	force_iret();
 }
 EXPORT_SYMBOL_GPL(start_thread);
 
@@ -248,7 +244,7 @@
 	struct thread_struct *prev = &prev_p->thread,
 				 *next = &next_p->thread;
 	int cpu = smp_processor_id();
-	struct tss_struct *tss = &per_cpu(init_tss, cpu);
+	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
 	fpu_switch_t fpu;
 
 	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
@@ -256,11 +252,6 @@
 	fpu = switch_fpu_prepare(prev_p, next_p, cpu);
 
 	/*
-	 * Reload esp0.
-	 */
-	load_sp0(tss, next);
-
-	/*
 	 * Save away %gs. No need to save %fs, as it was saved on the
 	 * stack on entry.  No need to save %es and %ds, as those are
 	 * always kernel segments while inside the kernel.  Doing this
@@ -310,9 +301,17 @@
 	 */
 	arch_end_context_switch(next_p);
 
+	/*
+	 * Reload esp0, kernel_stack, and current_top_of_stack.  This changes
+	 * current_thread_info().
+	 */
+	load_sp0(tss, next);
 	this_cpu_write(kernel_stack,
-		  (unsigned long)task_stack_page(next_p) +
-		  THREAD_SIZE - KERNEL_STACK_OFFSET);
+		       (unsigned long)task_stack_page(next_p) +
+		       THREAD_SIZE);
+	this_cpu_write(cpu_current_top_of_stack,
+		       (unsigned long)task_stack_page(next_p) +
+		       THREAD_SIZE);
 
 	/*
 	 * Restore %gs if needed (which is common)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 67fcc43..4baaa97 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,7 +52,7 @@
 
 asmlinkage extern void ret_from_fork(void);
 
-__visible DEFINE_PER_CPU(unsigned long, old_rsp);
+__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
 
 /* Prints also some state that isn't saved in the pt_regs */
 void __show_regs(struct pt_regs *regs, int all)
@@ -161,7 +161,6 @@
 	p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
 	childregs = task_pt_regs(p);
 	p->thread.sp = (unsigned long) childregs;
-	p->thread.usersp = me->thread.usersp;
 	set_tsk_thread_flag(p, TIF_FORK);
 	p->thread.io_bitmap_ptr = NULL;
 
@@ -207,7 +206,7 @@
 	 */
 	if (clone_flags & CLONE_SETTLS) {
 #ifdef CONFIG_IA32_EMULATION
-		if (test_thread_flag(TIF_IA32))
+		if (is_ia32_task())
 			err = do_set_thread_area(p, -1,
 				(struct user_desc __user *)childregs->si, 0);
 		else
@@ -235,13 +234,12 @@
 	loadsegment(es, _ds);
 	loadsegment(ds, _ds);
 	load_gs_index(0);
-	current->thread.usersp	= new_sp;
 	regs->ip		= new_ip;
 	regs->sp		= new_sp;
-	this_cpu_write(old_rsp, new_sp);
 	regs->cs		= _cs;
 	regs->ss		= _ss;
 	regs->flags		= X86_EFLAGS_IF;
+	force_iret();
 }
 
 void
@@ -277,15 +275,12 @@
 	struct thread_struct *prev = &prev_p->thread;
 	struct thread_struct *next = &next_p->thread;
 	int cpu = smp_processor_id();
-	struct tss_struct *tss = &per_cpu(init_tss, cpu);
+	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
 	unsigned fsindex, gsindex;
 	fpu_switch_t fpu;
 
 	fpu = switch_fpu_prepare(prev_p, next_p, cpu);
 
-	/* Reload esp0 and ss1. */
-	load_sp0(tss, next);
-
 	/* We must save %fs and %gs before load_TLS() because
 	 * %fs and %gs may be cleared by load_TLS().
 	 *
@@ -401,8 +396,6 @@
 	/*
 	 * Switch the PDA and FPU contexts.
 	 */
-	prev->usersp = this_cpu_read(old_rsp);
-	this_cpu_write(old_rsp, next->usersp);
 	this_cpu_write(current_task, next_p);
 
 	/*
@@ -413,9 +406,11 @@
 	task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
 	this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
 
+	/* Reload esp0 and ss1.  This changes current_thread_info(). */
+	load_sp0(tss, next);
+
 	this_cpu_write(kernel_stack,
-		  (unsigned long)task_stack_page(next_p) +
-		  THREAD_SIZE - KERNEL_STACK_OFFSET);
+		(unsigned long)task_stack_page(next_p) + THREAD_SIZE);
 
 	/*
 	 * Now maybe reload the debug registers and handle I/O bitmaps
@@ -602,6 +597,5 @@
 
 unsigned long KSTK_ESP(struct task_struct *task)
 {
-	return (test_tsk_thread_flag(task, TIF_IA32)) ?
-			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
+	return task_pt_regs(task)->sp;
 }
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index e510618..a7bc794 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -364,18 +364,12 @@
 	case offsetof(struct user_regs_struct,cs):
 		if (unlikely(value == 0))
 			return -EIO;
-#ifdef CONFIG_IA32_EMULATION
-		if (test_tsk_thread_flag(task, TIF_IA32))
-			task_pt_regs(task)->cs = value;
-#endif
+		task_pt_regs(task)->cs = value;
 		break;
 	case offsetof(struct user_regs_struct,ss):
 		if (unlikely(value == 0))
 			return -EIO;
-#ifdef CONFIG_IA32_EMULATION
-		if (test_tsk_thread_flag(task, TIF_IA32))
-			task_pt_regs(task)->ss = value;
-#endif
+		task_pt_regs(task)->ss = value;
 		break;
 	}
 
@@ -1421,7 +1415,7 @@
 	memset(info, 0, sizeof(*info));
 	info->si_signo = SIGTRAP;
 	info->si_code = si_code;
-	info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL;
+	info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
 }
 
 void user_single_step_siginfo(struct task_struct *tsk,
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index e13f8e7..77630d5 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -226,23 +226,23 @@
 	movl	(%ebx), %ecx
 	addl	$4, %ebx
 1:
-	testl	$0x1,   %ecx  /* is it a destination page */
+	testb	$0x1, %cl     /* is it a destination page */
 	jz	2f
 	movl	%ecx,	%edi
 	andl	$0xfffff000, %edi
 	jmp     0b
 2:
-	testl	$0x2,	%ecx  /* is it an indirection page */
+	testb	$0x2, %cl    /* is it an indirection page */
 	jz	2f
 	movl	%ecx,	%ebx
 	andl	$0xfffff000, %ebx
 	jmp     0b
 2:
-	testl   $0x4,   %ecx /* is it the done indicator */
+	testb   $0x4, %cl    /* is it the done indicator */
 	jz      2f
 	jmp     3f
 2:
-	testl   $0x8,   %ecx /* is it the source indicator */
+	testb   $0x8, %cl    /* is it the source indicator */
 	jz      0b	     /* Ignore it otherwise */
 	movl    %ecx,   %esi /* For every source page do a copy */
 	andl    $0xfffff000, %esi
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 3fd2c69..04cb179 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -221,23 +221,23 @@
 	movq	(%rbx), %rcx
 	addq	$8,	%rbx
 1:
-	testq	$0x1,	%rcx  /* is it a destination page? */
+	testb	$0x1,	%cl   /* is it a destination page? */
 	jz	2f
 	movq	%rcx,	%rdi
 	andq	$0xfffffffffffff000, %rdi
 	jmp	0b
 2:
-	testq	$0x2,	%rcx  /* is it an indirection page? */
+	testb	$0x2,	%cl   /* is it an indirection page? */
 	jz	2f
 	movq	%rcx,   %rbx
 	andq	$0xfffffffffffff000, %rbx
 	jmp	0b
 2:
-	testq	$0x4,	%rcx  /* is it the done indicator? */
+	testb	$0x4,	%cl   /* is it the done indicator? */
 	jz	2f
 	jmp	3f
 2:
-	testq	$0x8,	%rcx  /* is it the source indicator? */
+	testb	$0x8,	%cl   /* is it the source indicator? */
 	jz	0b	      /* Ignore it otherwise */
 	movq	%rcx,   %rsi  /* For every source page do a copy */
 	andq	$0xfffffffffffff000, %rsi
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index e504246..eaa2c5e 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -94,15 +94,8 @@
 		COPY(r15);
 #endif /* CONFIG_X86_64 */
 
-#ifdef CONFIG_X86_32
 		COPY_SEG_CPL3(cs);
 		COPY_SEG_CPL3(ss);
-#else /* !CONFIG_X86_32 */
-		/* Kernel saves and restores only the CS segment register on signals,
-		 * which is the bare minimum needed to allow mixed 32/64-bit code.
-		 * App's signal handler can save/restore other segments if needed. */
-		COPY_SEG_CPL3(cs);
-#endif /* CONFIG_X86_32 */
 
 		get_user_ex(tmpflags, &sc->flags);
 		regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
@@ -115,6 +108,8 @@
 
 	err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32));
 
+	force_iret();
+
 	return err;
 }
 
@@ -162,8 +157,9 @@
 #else /* !CONFIG_X86_32 */
 		put_user_ex(regs->flags, &sc->flags);
 		put_user_ex(regs->cs, &sc->cs);
-		put_user_ex(0, &sc->gs);
-		put_user_ex(0, &sc->fs);
+		put_user_ex(0, &sc->__pad2);
+		put_user_ex(0, &sc->__pad1);
+		put_user_ex(regs->ss, &sc->ss);
 #endif /* CONFIG_X86_32 */
 
 		put_user_ex(fpstate, &sc->fpstate);
@@ -457,9 +453,19 @@
 
 	regs->sp = (unsigned long)frame;
 
-	/* Set up the CS register to run signal handlers in 64-bit mode,
-	   even if the handler happens to be interrupting 32-bit code. */
+	/*
+	 * Set up the CS and SS registers to run signal handlers in
+	 * 64-bit mode, even if the handler happens to be interrupting
+	 * 32-bit or 16-bit code.
+	 *
+	 * SS is subtle.  In 64-bit mode, we don't need any particular
+	 * SS descriptor, but we do need SS to be valid.  It's possible
+	 * that the old SS is entirely bogus -- this can happen if the
+	 * signal we're trying to deliver is #GP or #SS caused by a bad
+	 * SS value.
+	 */
 	regs->cs = __USER_CS;
+	regs->ss = __USER_DS;
 
 	return 0;
 }
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index febc6aa..7b20ffd 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -806,13 +806,14 @@
 #ifdef CONFIG_X86_32
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	irq_ctx_init(cpu);
+	per_cpu(cpu_current_top_of_stack, cpu) =
+		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
 #else
 	clear_tsk_thread_flag(idle, TIF_FORK);
 	initial_gs = per_cpu_offset(cpu);
 #endif
 	per_cpu(kernel_stack, cpu) =
-		(unsigned long)task_stack_page(idle) -
-		KERNEL_STACK_OFFSET + THREAD_SIZE;
+		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
 	initial_code = (unsigned long)start_secondary;
 	stack_start  = idle->thread.sp;
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
index e9bcd57..3777189 100644
--- a/arch/x86/kernel/syscall_32.c
+++ b/arch/x86/kernel/syscall_32.c
@@ -5,21 +5,29 @@
 #include <linux/cache.h>
 #include <asm/asm-offsets.h>
 
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
+#ifdef CONFIG_IA32_EMULATION
+#define SYM(sym, compat) compat
+#else
+#define SYM(sym, compat) sym
+#define ia32_sys_call_table sys_call_table
+#define __NR_ia32_syscall_max __NR_syscall_max
+#endif
+
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ;
 #include <asm/syscalls_32.h>
 #undef __SYSCALL_I386
 
-#define __SYSCALL_I386(nr, sym, compat) [nr] = sym,
+#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
 
 typedef asmlinkage void (*sys_call_ptr_t)(void);
 
 extern asmlinkage void sys_ni_syscall(void);
 
-__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
+__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
 	/*
 	 * Smells like a compiler bug -- it doesn't work
 	 * when the & below is removed.
 	 */
-	[0 ... __NR_syscall_max] = &sys_ni_syscall,
+	[0 ... __NR_ia32_syscall_max] = &sys_ni_syscall,
 #include <asm/syscalls_32.h>
 };
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 25adc0e..d39c091 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -30,7 +30,7 @@
 {
 	unsigned long pc = instruction_pointer(regs);
 
-	if (!user_mode_vm(regs) && in_lock_functions(pc)) {
+	if (!user_mode(regs) && in_lock_functions(pc)) {
 #ifdef CONFIG_FRAME_POINTER
 		return *(unsigned long *)(regs->bp + sizeof(long));
 #else
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4ff5d16..c8eb469 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -112,7 +112,7 @@
 {
 	enum ctx_state prev_state;
 
-	if (user_mode_vm(regs)) {
+	if (user_mode(regs)) {
 		/* Other than that, we're just an exception. */
 		prev_state = exception_enter();
 	} else {
@@ -146,7 +146,7 @@
 	/* Must be before exception_exit. */
 	preempt_count_sub(HARDIRQ_OFFSET);
 
-	if (user_mode_vm(regs))
+	if (user_mode(regs))
 		return exception_exit(prev_state);
 	else
 		rcu_nmi_exit();
@@ -158,7 +158,7 @@
  *
  * IST exception handlers normally cannot schedule.  As a special
  * exception, if the exception interrupted userspace code (i.e.
- * user_mode_vm(regs) would return true) and the exception was not
+ * user_mode(regs) would return true) and the exception was not
  * a double fault, it can be safe to schedule.  ist_begin_non_atomic()
  * begins a non-atomic section within an ist_enter()/ist_exit() region.
  * Callers are responsible for enabling interrupts themselves inside
@@ -167,15 +167,15 @@
  */
 void ist_begin_non_atomic(struct pt_regs *regs)
 {
-	BUG_ON(!user_mode_vm(regs));
+	BUG_ON(!user_mode(regs));
 
 	/*
 	 * Sanity check: we need to be on the normal thread stack.  This
 	 * will catch asm bugs and any attempt to use ist_preempt_enable
 	 * from double_fault.
 	 */
-	BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack))
-		& ~(THREAD_SIZE - 1)) != 0);
+	BUG_ON((unsigned long)(current_top_of_stack() -
+			       current_stack_pointer()) >= THREAD_SIZE);
 
 	preempt_count_sub(HARDIRQ_OFFSET);
 }
@@ -194,8 +194,7 @@
 do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
 		  struct pt_regs *regs,	long error_code)
 {
-#ifdef CONFIG_X86_32
-	if (regs->flags & X86_VM_MASK) {
+	if (v8086_mode(regs)) {
 		/*
 		 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
 		 * On nmi (interrupt 2), do_trap should not be called.
@@ -207,8 +206,8 @@
 		}
 		return -1;
 	}
-#endif
-	if (!user_mode(regs)) {
+
+	if (!user_mode_ignore_vm86(regs)) {
 		if (!fixup_exception(regs)) {
 			tsk->thread.error_code = error_code;
 			tsk->thread.trap_nr = trapnr;
@@ -384,7 +383,7 @@
 		goto exit;
 	conditional_sti(regs);
 
-	if (!user_mode_vm(regs))
+	if (!user_mode(regs))
 		die("bounds", regs, error_code);
 
 	if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
@@ -462,16 +461,14 @@
 	prev_state = exception_enter();
 	conditional_sti(regs);
 
-#ifdef CONFIG_X86_32
-	if (regs->flags & X86_VM_MASK) {
+	if (v8086_mode(regs)) {
 		local_irq_enable();
 		handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
 		goto exit;
 	}
-#endif
 
 	tsk = current;
-	if (!user_mode(regs)) {
+	if (!user_mode_ignore_vm86(regs)) {
 		if (fixup_exception(regs))
 			goto exit;
 
@@ -587,7 +584,7 @@
 	/* Copy the remainder of the stack from the current stack. */
 	memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
 
-	BUG_ON(!user_mode_vm(&new_stack->regs));
+	BUG_ON(!user_mode(&new_stack->regs));
 	return new_stack;
 }
 NOKPROBE_SYMBOL(fixup_bad_iret);
@@ -637,7 +634,7 @@
 	 * then it's very likely the result of an icebp/int01 trap.
 	 * User wants a sigtrap for that.
 	 */
-	if (!dr6 && user_mode_vm(regs))
+	if (!dr6 && user_mode(regs))
 		user_icebp = 1;
 
 	/* Catch kmemcheck conditions first of all! */
@@ -673,7 +670,7 @@
 	/* It's safe to allow irq's after DR6 has been saved */
 	preempt_conditional_sti(regs);
 
-	if (regs->flags & X86_VM_MASK) {
+	if (v8086_mode(regs)) {
 		handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
 					X86_TRAP_DB);
 		preempt_conditional_cli(regs);
@@ -688,7 +685,7 @@
 	 * We already checked v86 mode above, so we can check for kernel mode
 	 * by just checking the CPL of CS.
 	 */
-	if ((dr6 & DR_STEP) && !user_mode(regs)) {
+	if ((dr6 & DR_STEP) && !user_mode_ignore_vm86(regs)) {
 		tsk->thread.debugreg6 &= ~DR_STEP;
 		set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
 		regs->flags &= ~X86_EFLAGS_TF;
@@ -721,7 +718,7 @@
 		return;
 	conditional_sti(regs);
 
-	if (!user_mode_vm(regs))
+	if (!user_mode(regs))
 	{
 		if (!fixup_exception(regs)) {
 			task->thread.error_code = error_code;
@@ -925,9 +922,21 @@
 /* Set of traps needed for early debugging. */
 void __init early_trap_init(void)
 {
-	set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
+	/*
+	 * Don't use IST to set DEBUG_STACK as it doesn't work until TSS
+	 * is ready in cpu_init() <-- trap_init(). Before trap_init(),
+	 * CPU runs at ring 0 so it is impossible to hit an invalid
+	 * stack.  Using the original stack works well enough at this
+	 * early stage. DEBUG_STACK will be equipped after cpu_init() in
+	 * trap_init().
+	 *
+	 * We don't need to set trace_idt_table like set_intr_gate(),
+	 * since we don't have trace_debug and it will be reset to
+	 * 'debug' in trap_init() by set_intr_gate_ist().
+	 */
+	set_intr_gate_notrace(X86_TRAP_DB, debug);
 	/* int3 can be called from all */
-	set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+	set_system_intr_gate(X86_TRAP_BP, &int3);
 #ifdef CONFIG_X86_32
 	set_intr_gate(X86_TRAP_PF, page_fault);
 #endif
@@ -1005,6 +1014,15 @@
 	 */
 	cpu_init();
 
+	/*
+	 * X86_TRAP_DB and X86_TRAP_BP have been set
+	 * in early_trap_init(). However, IST works only after
+	 * cpu_init() loads TSS. See comments in early_trap_init().
+	 */
+	set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
+	/* int3 can be called from all */
+	set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+
 	x86_init.irqs.trap_init();
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 81f8adb0..0b81ad6 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -912,7 +912,7 @@
 	int ret = NOTIFY_DONE;
 
 	/* We are only interested in userspace traps */
-	if (regs && !user_mode_vm(regs))
+	if (regs && !user_mode(regs))
 		return NOTIFY_DONE;
 
 	switch (val) {
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index e8edcf5..fc9db6e 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -150,7 +150,7 @@
 		do_exit(SIGSEGV);
 	}
 
-	tss = &per_cpu(init_tss, get_cpu());
+	tss = &per_cpu(cpu_tss, get_cpu());
 	current->thread.sp0 = current->thread.saved_sp0;
 	current->thread.sysenter_cs = __KERNEL_CS;
 	load_sp0(tss, &current->thread);
@@ -318,7 +318,7 @@
 	tsk->thread.saved_fs = info->regs32->fs;
 	tsk->thread.saved_gs = get_user_gs(info->regs32);
 
-	tss = &per_cpu(init_tss, get_cpu());
+	tss = &per_cpu(cpu_tss, get_cpu());
 	tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
 	if (cpu_has_sep)
 		tsk->thread.sysenter_cs = 0;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index ac4453d..8561585 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1076,6 +1076,7 @@
 {
 	lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
 		   THREAD_SIZE / PAGE_SIZE);
+	tss->x86_tss.sp0 = thread->sp0;
 }
 
 /* Let's just say, I wouldn't do debugging under a Guest. */
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index f5cc9eb..082a851 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -13,16 +13,6 @@
 #include <asm/alternative-asm.h>
 #include <asm/dwarf2.h>
 
-.macro SAVE reg
-	pushl_cfi %\reg
-	CFI_REL_OFFSET \reg, 0
-.endm
-
-.macro RESTORE reg
-	popl_cfi %\reg
-	CFI_RESTORE \reg
-.endm
-
 .macro read64 reg
 	movl %ebx, %eax
 	movl %ecx, %edx
@@ -67,10 +57,10 @@
 .macro addsub_return func ins insc
 ENTRY(atomic64_\func\()_return_cx8)
 	CFI_STARTPROC
-	SAVE ebp
-	SAVE ebx
-	SAVE esi
-	SAVE edi
+	pushl_cfi_reg ebp
+	pushl_cfi_reg ebx
+	pushl_cfi_reg esi
+	pushl_cfi_reg edi
 
 	movl %eax, %esi
 	movl %edx, %edi
@@ -89,10 +79,10 @@
 10:
 	movl %ebx, %eax
 	movl %ecx, %edx
-	RESTORE edi
-	RESTORE esi
-	RESTORE ebx
-	RESTORE ebp
+	popl_cfi_reg edi
+	popl_cfi_reg esi
+	popl_cfi_reg ebx
+	popl_cfi_reg ebp
 	ret
 	CFI_ENDPROC
 ENDPROC(atomic64_\func\()_return_cx8)
@@ -104,7 +94,7 @@
 .macro incdec_return func ins insc
 ENTRY(atomic64_\func\()_return_cx8)
 	CFI_STARTPROC
-	SAVE ebx
+	pushl_cfi_reg ebx
 
 	read64 %esi
 1:
@@ -119,7 +109,7 @@
 10:
 	movl %ebx, %eax
 	movl %ecx, %edx
-	RESTORE ebx
+	popl_cfi_reg ebx
 	ret
 	CFI_ENDPROC
 ENDPROC(atomic64_\func\()_return_cx8)
@@ -130,7 +120,7 @@
 
 ENTRY(atomic64_dec_if_positive_cx8)
 	CFI_STARTPROC
-	SAVE ebx
+	pushl_cfi_reg ebx
 
 	read64 %esi
 1:
@@ -146,18 +136,18 @@
 2:
 	movl %ebx, %eax
 	movl %ecx, %edx
-	RESTORE ebx
+	popl_cfi_reg ebx
 	ret
 	CFI_ENDPROC
 ENDPROC(atomic64_dec_if_positive_cx8)
 
 ENTRY(atomic64_add_unless_cx8)
 	CFI_STARTPROC
-	SAVE ebp
-	SAVE ebx
+	pushl_cfi_reg ebp
+	pushl_cfi_reg ebx
 /* these just push these two parameters on the stack */
-	SAVE edi
-	SAVE ecx
+	pushl_cfi_reg edi
+	pushl_cfi_reg ecx
 
 	movl %eax, %ebp
 	movl %edx, %edi
@@ -179,8 +169,8 @@
 3:
 	addl $8, %esp
 	CFI_ADJUST_CFA_OFFSET -8
-	RESTORE ebx
-	RESTORE ebp
+	popl_cfi_reg ebx
+	popl_cfi_reg ebp
 	ret
 4:
 	cmpl %edx, 4(%esp)
@@ -192,7 +182,7 @@
 
 ENTRY(atomic64_inc_not_zero_cx8)
 	CFI_STARTPROC
-	SAVE ebx
+	pushl_cfi_reg ebx
 
 	read64 %esi
 1:
@@ -209,7 +199,7 @@
 
 	movl $1, %eax
 3:
-	RESTORE ebx
+	popl_cfi_reg ebx
 	ret
 	CFI_ENDPROC
 ENDPROC(atomic64_inc_not_zero_cx8)
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index e78b8ee..9bc944a 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -51,10 +51,8 @@
 	   */		
 ENTRY(csum_partial)
 	CFI_STARTPROC
-	pushl_cfi %esi
-	CFI_REL_OFFSET esi, 0
-	pushl_cfi %ebx
-	CFI_REL_OFFSET ebx, 0
+	pushl_cfi_reg esi
+	pushl_cfi_reg ebx
 	movl 20(%esp),%eax	# Function arg: unsigned int sum
 	movl 16(%esp),%ecx	# Function arg: int len
 	movl 12(%esp),%esi	# Function arg: unsigned char *buff
@@ -127,14 +125,12 @@
 6:	addl %ecx,%eax
 	adcl $0, %eax 
 7:	
-	testl $1, 12(%esp)
+	testb $1, 12(%esp)
 	jz 8f
 	roll $8, %eax
 8:
-	popl_cfi %ebx
-	CFI_RESTORE ebx
-	popl_cfi %esi
-	CFI_RESTORE esi
+	popl_cfi_reg ebx
+	popl_cfi_reg esi
 	ret
 	CFI_ENDPROC
 ENDPROC(csum_partial)
@@ -145,10 +141,8 @@
 
 ENTRY(csum_partial)
 	CFI_STARTPROC
-	pushl_cfi %esi
-	CFI_REL_OFFSET esi, 0
-	pushl_cfi %ebx
-	CFI_REL_OFFSET ebx, 0
+	pushl_cfi_reg esi
+	pushl_cfi_reg ebx
 	movl 20(%esp),%eax	# Function arg: unsigned int sum
 	movl 16(%esp),%ecx	# Function arg: int len
 	movl 12(%esp),%esi	# Function arg:	const unsigned char *buf
@@ -251,14 +245,12 @@
 	addl %ebx,%eax
 	adcl $0,%eax
 80: 
-	testl $1, 12(%esp)
+	testb $1, 12(%esp)
 	jz 90f
 	roll $8, %eax
 90: 
-	popl_cfi %ebx
-	CFI_RESTORE ebx
-	popl_cfi %esi
-	CFI_RESTORE esi
+	popl_cfi_reg ebx
+	popl_cfi_reg esi
 	ret
 	CFI_ENDPROC
 ENDPROC(csum_partial)
@@ -298,12 +290,9 @@
 	CFI_STARTPROC
 	subl  $4,%esp	
 	CFI_ADJUST_CFA_OFFSET 4
-	pushl_cfi %edi
-	CFI_REL_OFFSET edi, 0
-	pushl_cfi %esi
-	CFI_REL_OFFSET esi, 0
-	pushl_cfi %ebx
-	CFI_REL_OFFSET ebx, 0
+	pushl_cfi_reg edi
+	pushl_cfi_reg esi
+	pushl_cfi_reg ebx
 	movl ARGBASE+16(%esp),%eax	# sum
 	movl ARGBASE+12(%esp),%ecx	# len
 	movl ARGBASE+4(%esp),%esi	# src
@@ -412,12 +401,9 @@
 
 .previous
 
-	popl_cfi %ebx
-	CFI_RESTORE ebx
-	popl_cfi %esi
-	CFI_RESTORE esi
-	popl_cfi %edi
-	CFI_RESTORE edi
+	popl_cfi_reg ebx
+	popl_cfi_reg esi
+	popl_cfi_reg edi
 	popl_cfi %ecx			# equivalent to addl $4,%esp
 	ret	
 	CFI_ENDPROC
@@ -441,12 +427,9 @@
 		
 ENTRY(csum_partial_copy_generic)
 	CFI_STARTPROC
-	pushl_cfi %ebx
-	CFI_REL_OFFSET ebx, 0
-	pushl_cfi %edi
-	CFI_REL_OFFSET edi, 0
-	pushl_cfi %esi
-	CFI_REL_OFFSET esi, 0
+	pushl_cfi_reg ebx
+	pushl_cfi_reg edi
+	pushl_cfi_reg esi
 	movl ARGBASE+4(%esp),%esi	#src
 	movl ARGBASE+8(%esp),%edi	#dst	
 	movl ARGBASE+12(%esp),%ecx	#len
@@ -506,12 +489,9 @@
 	jmp  7b			
 .previous				
 
-	popl_cfi %esi
-	CFI_RESTORE esi
-	popl_cfi %edi
-	CFI_RESTORE edi
-	popl_cfi %ebx
-	CFI_RESTORE ebx
+	popl_cfi_reg esi
+	popl_cfi_reg edi
+	popl_cfi_reg ebx
 	ret
 	CFI_ENDPROC
 ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index f2145cf..e67e579 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,31 +1,35 @@
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
 /*
- * Zero a page. 	
- * rdi	page
- */			
-ENTRY(clear_page_c)
+ * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
+ * recommended to use them when possible, and we do use them by default.
+ * If enhanced REP MOVSB/STOSB is not available, try to use fast string
+ * operations. Otherwise, use the original routine.
+ */
+
+/*
+ * Zero a page.
+ * %rdi	- page
+ */
+ENTRY(clear_page)
 	CFI_STARTPROC
+
+	ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
+		      "jmp clear_page_c_e", X86_FEATURE_ERMS
+
 	movl $4096/8,%ecx
 	xorl %eax,%eax
 	rep stosq
 	ret
 	CFI_ENDPROC
-ENDPROC(clear_page_c)
+ENDPROC(clear_page)
 
-ENTRY(clear_page_c_e)
+ENTRY(clear_page_orig)
 	CFI_STARTPROC
-	movl $4096,%ecx
-	xorl %eax,%eax
-	rep stosb
-	ret
-	CFI_ENDPROC
-ENDPROC(clear_page_c_e)
 
-ENTRY(clear_page)
-	CFI_STARTPROC
 	xorl   %eax,%eax
 	movl   $4096/64,%ecx
 	.p2align 4
@@ -45,29 +49,13 @@
 	nop
 	ret
 	CFI_ENDPROC
-.Lclear_page_end:
-ENDPROC(clear_page)
+ENDPROC(clear_page_orig)
 
-	/*
-	 * Some CPUs support enhanced REP MOVSB/STOSB instructions.
-	 * It is recommended to use this when possible.
-	 * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
-	 * Otherwise, use original function.
-	 *
-	 */
-
-#include <asm/cpufeature.h>
-
-	.section .altinstr_replacement,"ax"
-1:	.byte 0xeb					/* jmp <disp8> */
-	.byte (clear_page_c - clear_page) - (2f - 1b)	/* offset */
-2:	.byte 0xeb					/* jmp <disp8> */
-	.byte (clear_page_c_e - clear_page) - (3f - 2b)	/* offset */
-3:
-	.previous
-	.section .altinstructions,"a"
-	altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
-			     .Lclear_page_end-clear_page, 2b-1b
-	altinstruction_entry clear_page,2b,X86_FEATURE_ERMS,   \
-			     .Lclear_page_end-clear_page,3b-2b
-	.previous
+ENTRY(clear_page_c_e)
+	CFI_STARTPROC
+	movl $4096,%ecx
+	xorl %eax,%eax
+	rep stosb
+	ret
+	CFI_ENDPROC
+ENDPROC(clear_page_c_e)
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 176cca6..8239dbc 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,23 +2,26 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
+/*
+ * Some CPUs run faster using the string copy instructions (sane microcode).
+ * It is also a lot simpler. Use this when possible. But, don't use streaming
+ * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the
+ * prefetch distance based on SMP/UP.
+ */
 	ALIGN
-copy_page_rep:
+ENTRY(copy_page)
 	CFI_STARTPROC
+	ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
 	movl	$4096/8, %ecx
 	rep	movsq
 	ret
 	CFI_ENDPROC
-ENDPROC(copy_page_rep)
+ENDPROC(copy_page)
 
-/*
- *  Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
- *  Could vary the prefetch distance based on SMP/UP.
-*/
-
-ENTRY(copy_page)
+ENTRY(copy_page_regs)
 	CFI_STARTPROC
 	subq	$2*8,	%rsp
 	CFI_ADJUST_CFA_OFFSET 2*8
@@ -90,21 +93,5 @@
 	addq	$2*8, %rsp
 	CFI_ADJUST_CFA_OFFSET -2*8
 	ret
-.Lcopy_page_end:
 	CFI_ENDPROC
-ENDPROC(copy_page)
-
-	/* Some CPUs run faster using the string copy instructions.
-	   It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
-	.section .altinstr_replacement,"ax"
-1:	.byte 0xeb					/* jmp <disp8> */
-	.byte (copy_page_rep - copy_page) - (2f - 1b)	/* offset */
-2:
-	.previous
-	.section .altinstructions,"a"
-	altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD,	\
-		.Lcopy_page_end-copy_page, 2b-1b
-	.previous
+ENDPROC(copy_page_regs)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index dee945d..fa997df 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -8,9 +8,6 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
-
-#define FIX_ALIGNMENT 1
-
 #include <asm/current.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
@@ -19,33 +16,7 @@
 #include <asm/asm.h>
 #include <asm/smap.h>
 
-/*
- * By placing feature2 after feature1 in altinstructions section, we logically
- * implement:
- * If CPU has feature2, jmp to alt2 is used
- * else if CPU has feature1, jmp to alt1 is used
- * else jmp to orig is used.
- */
-	.macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
-0:
-	.byte 0xe9	/* 32bit jump */
-	.long \orig-1f	/* by default jump to orig */
-1:
-	.section .altinstr_replacement,"ax"
-2:	.byte 0xe9			/* near jump with 32bit immediate */
-	.long \alt1-1b /* offset */   /* or alternatively to alt1 */
-3:	.byte 0xe9			/* near jump with 32bit immediate */
-	.long \alt2-1b /* offset */   /* or alternatively to alt2 */
-	.previous
-
-	.section .altinstructions,"a"
-	altinstruction_entry 0b,2b,\feature1,5,5
-	altinstruction_entry 0b,3b,\feature2,5,5
-	.previous
-	.endm
-
 	.macro ALIGN_DESTINATION
-#ifdef FIX_ALIGNMENT
 	/* check for bad alignment of destination */
 	movl %edi,%ecx
 	andl $7,%ecx
@@ -67,7 +38,6 @@
 
 	_ASM_EXTABLE(100b,103b)
 	_ASM_EXTABLE(101b,103b)
-#endif
 	.endm
 
 /* Standard copy_to_user with segment limit checking */
@@ -79,9 +49,11 @@
 	jc bad_to_user
 	cmpq TI_addr_limit(%rax),%rcx
 	ja bad_to_user
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
-		copy_user_generic_unrolled,copy_user_generic_string,	\
-		copy_user_enhanced_fast_string
+	ALTERNATIVE_2 "jmp copy_user_generic_unrolled",		\
+		      "jmp copy_user_generic_string",		\
+		      X86_FEATURE_REP_GOOD,			\
+		      "jmp copy_user_enhanced_fast_string",	\
+		      X86_FEATURE_ERMS
 	CFI_ENDPROC
 ENDPROC(_copy_to_user)
 
@@ -94,9 +66,11 @@
 	jc bad_from_user
 	cmpq TI_addr_limit(%rax),%rcx
 	ja bad_from_user
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
-		copy_user_generic_unrolled,copy_user_generic_string,	\
-		copy_user_enhanced_fast_string
+	ALTERNATIVE_2 "jmp copy_user_generic_unrolled",		\
+		      "jmp copy_user_generic_string",		\
+		      X86_FEATURE_REP_GOOD,			\
+		      "jmp copy_user_enhanced_fast_string",	\
+		      X86_FEATURE_ERMS
 	CFI_ENDPROC
 ENDPROC(_copy_from_user)
 
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 2419d5f..9734182 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -196,7 +196,7 @@
 
 	/* handle last odd byte */
 .Lhandle_1:
-	testl $1, %r10d
+	testb $1, %r10b
 	jz    .Lende
 	xorl  %ebx, %ebx
 	source
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 1313ae6..8f72b33 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -52,6 +52,13 @@
  */
 void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64)
 {
+	/*
+	 * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid
+	 * even if the input buffer is long enough to hold them.
+	 */
+	if (buf_len > MAX_INSN_SIZE)
+		buf_len = MAX_INSN_SIZE;
+
 	memset(insn, 0, sizeof(*insn));
 	insn->kaddr = kaddr;
 	insn->end_kaddr = kaddr + buf_len;
@@ -164,6 +171,12 @@
 				/* VEX.W overrides opnd_size */
 				insn->opnd_bytes = 8;
 		} else {
+			/*
+			 * For VEX2, fake VEX3-like byte#2.
+			 * Makes it easier to decode vex.W, vex.vvvv,
+			 * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0.
+			 */
+			insn->vex_prefix.bytes[2] = b2 & 0x7f;
 			insn->vex_prefix.nbytes = 2;
 			insn->next_byte += 2;
 		}
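
To see why masking with 0x7f gives a usable fake third byte: in a 3-byte VEX prefix the last payload byte packs W (bit 7), vvvv (bits 6-3), L (bit 2) and pp (bits 1-0), and a 2-byte VEX payload byte has the same layout except that bit 7 is ~R rather than W. Clearing bit 7 therefore yields a byte that decodes identically for vvvv/L/pp, with W forced to 0. A hedged, self-contained sketch (the macro names are made up, only the bit positions matter):

#include <stdint.h>
#include <stdio.h>

#define VEX_W(p)	(((p) >> 7) & 0x1)
#define VEX_VVVV(p)	(((p) >> 3) & 0xf)
#define VEX_L(p)	(((p) >> 2) & 0x1)
#define VEX_PP(p)	((p) & 0x3)

int main(void)
{
	uint8_t vex2_payload = 0xfd;			/* example 2-byte-VEX payload byte */
	uint8_t fake_byte2   = vex2_payload & 0x7f;	/* what gets stored as vex_prefix.bytes[2] */

	printf("W=%u vvvv=%#x L=%u pp=%u\n",
	       VEX_W(fake_byte2), VEX_VVVV(fake_byte2),
	       VEX_L(fake_byte2), VEX_PP(fake_byte2));
	return 0;
}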
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 89b53c9..b046664f 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,12 +1,20 @@
 /* Copyright 2002 Andi Kleen */
 
 #include <linux/linkage.h>
-
 #include <asm/cpufeature.h>
 #include <asm/dwarf2.h>
 #include <asm/alternative-asm.h>
 
 /*
+ * We build a jump to memcpy_orig by default which gets NOPped out on
+ * the majority of x86 CPUs which set REP_GOOD. In addition, on CPUs
+ * which have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs
+ * are changed to a jmp to memcpy_erms, which does the copy with REP MOVSB.
+ */
+
+.weak memcpy
+
+/*
  * memcpy - Copy a memory block.
  *
  * Input:
@@ -17,15 +25,11 @@
  * Output:
  * rax original destination
  */
+ENTRY(__memcpy)
+ENTRY(memcpy)
+	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
+		      "jmp memcpy_erms", X86_FEATURE_ERMS
 
-/*
- * memcpy_c() - fast string ops (REP MOVSQ) based variant.
- *
- * This gets patched over the unrolled variant (below) via the
- * alternative instructions framework:
- */
-	.section .altinstr_replacement, "ax", @progbits
-.Lmemcpy_c:
 	movq %rdi, %rax
 	movq %rdx, %rcx
 	shrq $3, %rcx
@@ -34,29 +38,21 @@
 	movl %edx, %ecx
 	rep movsb
 	ret
-.Lmemcpy_e:
-	.previous
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
 
 /*
- * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
- * memcpy_c. Use memcpy_c_e when possible.
- *
- * This gets patched over the unrolled variant (below) via the
- * alternative instructions framework:
+ * memcpy_erms() - enhanced fast string memcpy. This is faster and
+ * simpler than memcpy. Use memcpy_erms when possible.
  */
-	.section .altinstr_replacement, "ax", @progbits
-.Lmemcpy_c_e:
+ENTRY(memcpy_erms)
 	movq %rdi, %rax
 	movq %rdx, %rcx
 	rep movsb
 	ret
-.Lmemcpy_e_e:
-	.previous
+ENDPROC(memcpy_erms)
 
-.weak memcpy
-
-ENTRY(__memcpy)
-ENTRY(memcpy)
+ENTRY(memcpy_orig)
 	CFI_STARTPROC
 	movq %rdi, %rax
 
@@ -183,26 +179,4 @@
 .Lend:
 	retq
 	CFI_ENDPROC
-ENDPROC(memcpy)
-ENDPROC(__memcpy)
-
-	/*
-	 * Some CPUs are adding enhanced REP MOVSB/STOSB feature
-	 * If the feature is supported, memcpy_c_e() is the first choice.
-	 * If enhanced rep movsb copy is not available, use fast string copy
-	 * memcpy_c() when possible. This is faster and code is simpler than
-	 * original memcpy().
-	 * Otherwise, original memcpy() is used.
-	 * In .altinstructions section, ERMS feature is placed after REG_GOOD
-         * feature to implement the right patch order.
-	 *
-	 * Replace only beginning, memcpy is used to apply alternatives,
-	 * so it is silly to overwrite itself with nops - reboot is the
-	 * only outcome...
-	 */
-	.section .altinstructions, "a"
-	altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
-			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
-	altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
-			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
-	.previous
+ENDPROC(memcpy_orig)
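
The ALTERNATIVE_2 conversion above (and the analogous ones in clear_page_64.S, copy_page_64.S, copy_user_64.S and memset_64.S) replaces the hand-rolled .altinstructions entries with a boot-time patched jump: ERMS wins over REP_GOOD, which wins over the unrolled fallback. A rough user-space model of that selection order follows; the variants are reduced to trivial stand-ins so the sketch stays runnable, and the names are not kernel symbols:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct cpu_features { bool rep_good; bool erms; };

typedef void *(*memcpy_fn)(void *, const void *, size_t);

/* Trivial stand-ins: the real variants differ in the instructions used,
 * not in the result, so plain memcpy keeps the sketch honest and runnable. */
static void *memcpy_orig_model(void *d, const void *s, size_t n)  { return memcpy(d, s, n); }
static void *memcpy_movsq_model(void *d, const void *s, size_t n) { return memcpy(d, s, n); }
static void *memcpy_erms_model(void *d, const void *s, size_t n)  { return memcpy(d, s, n); }

static memcpy_fn pick_memcpy(const struct cpu_features *c)
{
	if (c->erms)		/* ERMS: plain REP MOVSB, patched in last so it wins */
		return memcpy_erms_model;
	if (c->rep_good)	/* fast strings: the REP MOVSQ body the NOPped jmp falls into */
		return memcpy_movsq_model;
	return memcpy_orig_model;	/* default target of the unpatched jmp */
}

int main(void)
{
	struct cpu_features cpu = { .rep_good = true, .erms = false };
	char dst[16];

	pick_memcpy(&cpu)(dst, "copied", 7);
	puts(dst);
	return 0;
}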
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 9c4b530..0f8a0d0 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -5,7 +5,6 @@
  * This assembly file is re-written from memmove_64.c file.
  *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
  */
-#define _STRING_C
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
 #include <asm/cpufeature.h>
@@ -44,6 +43,8 @@
 	jg 2f
 
 .Lmemmove_begin_forward:
+	ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
+
 	/*
	 * The movsq instruction has a high startup latency,
	 * so we handle small sizes using general registers.
@@ -207,21 +208,5 @@
 13:
 	retq
 	CFI_ENDPROC
-
-	.section .altinstr_replacement,"ax"
-.Lmemmove_begin_forward_efs:
-	/* Forward moving data. */
-	movq %rdx, %rcx
-	rep movsb
-	retq
-.Lmemmove_end_forward_efs:
-	.previous
-
-	.section .altinstructions,"a"
-	altinstruction_entry .Lmemmove_begin_forward,		\
-		.Lmemmove_begin_forward_efs,X86_FEATURE_ERMS,	\
-		.Lmemmove_end_forward-.Lmemmove_begin_forward,	\
-		.Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
-	.previous
 ENDPROC(__memmove)
 ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 6f44935..93118fb 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -5,19 +5,30 @@
 #include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
+.weak memset
+
 /*
  * ISO C memset - set a memory block to a byte value. This function uses fast
  * string to get better performance than the original function. The code is
 * simpler and shorter than the original function as well.
- *	
+ *
  * rdi   destination
- * rsi   value (char) 
- * rdx   count (bytes) 
- * 
+ * rsi   value (char)
+ * rdx   count (bytes)
+ *
  * rax   original destination
- */	
-	.section .altinstr_replacement, "ax", @progbits
-.Lmemset_c:
+ */
+ENTRY(memset)
+ENTRY(__memset)
+	/*
+	 * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
+	 * to use it when possible. If not available, use fast string instructions.
+	 *
+	 * Otherwise, use original memset function.
+	 */
+	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+		      "jmp memset_erms", X86_FEATURE_ERMS
+
 	movq %rdi,%r9
 	movq %rdx,%rcx
 	andl $7,%edx
@@ -31,8 +42,8 @@
 	rep stosb
 	movq %r9,%rax
 	ret
-.Lmemset_e:
-	.previous
+ENDPROC(memset)
+ENDPROC(__memset)
 
 /*
  * ISO C memset - set a memory block to a byte value. This function uses
@@ -45,21 +56,16 @@
  *
  * rax   original destination
  */
-	.section .altinstr_replacement, "ax", @progbits
-.Lmemset_c_e:
+ENTRY(memset_erms)
 	movq %rdi,%r9
 	movb %sil,%al
 	movq %rdx,%rcx
 	rep stosb
 	movq %r9,%rax
 	ret
-.Lmemset_e_e:
-	.previous
+ENDPROC(memset_erms)
 
-.weak memset
-
-ENTRY(memset)
-ENTRY(__memset)
+ENTRY(memset_orig)
 	CFI_STARTPROC
 	movq %rdi,%r10
 
@@ -134,23 +140,4 @@
 	jmp .Lafter_bad_alignment
 .Lfinal:
 	CFI_ENDPROC
-ENDPROC(memset)
-ENDPROC(__memset)
-
-	/* Some CPUs support enhanced REP MOVSB/STOSB feature.
-	 * It is recommended to use this when possible.
-	 *
-	 * If enhanced REP MOVSB/STOSB feature is not available, use fast string
-	 * instructions.
-	 *
-	 * Otherwise, use original memset function.
-	 *
-	 * In .altinstructions section, ERMS feature is placed after REG_GOOD
-         * feature to implement the right patch order.
-	 */
-	.section .altinstructions,"a"
-	altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
-			     .Lfinal-__memset,.Lmemset_e-.Lmemset_c
-	altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
-			     .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
-	.previous
+ENDPROC(memset_orig)
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index f6d13ee..3ca5218 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -14,8 +14,8 @@
 .macro op_safe_regs op
 ENTRY(\op\()_safe_regs)
 	CFI_STARTPROC
-	pushq_cfi %rbx
-	pushq_cfi %rbp
+	pushq_cfi_reg rbx
+	pushq_cfi_reg rbp
 	movq	%rdi, %r10	/* Save pointer */
 	xorl	%r11d, %r11d	/* Return value */
 	movl    (%rdi), %eax
@@ -35,8 +35,8 @@
 	movl    %ebp, 20(%r10)
 	movl    %esi, 24(%r10)
 	movl    %edi, 28(%r10)
-	popq_cfi %rbp
-	popq_cfi %rbx
+	popq_cfi_reg rbp
+	popq_cfi_reg rbx
 	ret
 3:
 	CFI_RESTORE_STATE
@@ -53,10 +53,10 @@
 .macro op_safe_regs op
 ENTRY(\op\()_safe_regs)
 	CFI_STARTPROC
-	pushl_cfi %ebx
-	pushl_cfi %ebp
-	pushl_cfi %esi
-	pushl_cfi %edi
+	pushl_cfi_reg ebx
+	pushl_cfi_reg ebp
+	pushl_cfi_reg esi
+	pushl_cfi_reg edi
 	pushl_cfi $0              /* Return value */
 	pushl_cfi %eax
 	movl    4(%eax), %ecx
@@ -80,10 +80,10 @@
 	movl    %esi, 24(%eax)
 	movl    %edi, 28(%eax)
 	popl_cfi %eax
-	popl_cfi %edi
-	popl_cfi %esi
-	popl_cfi %ebp
-	popl_cfi %ebx
+	popl_cfi_reg edi
+	popl_cfi_reg esi
+	popl_cfi_reg ebp
+	popl_cfi_reg ebx
 	ret
 3:
 	CFI_RESTORE_STATE
diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S
index 5dff5f0..2322abe 100644
--- a/arch/x86/lib/rwsem.S
+++ b/arch/x86/lib/rwsem.S
@@ -34,10 +34,10 @@
  */
 
 #define save_common_regs \
-	pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0
+	pushl_cfi_reg ecx
 
 #define restore_common_regs \
-	popl_cfi %ecx; CFI_RESTORE ecx
+	popl_cfi_reg ecx
 
 	/* Avoid uglifying the argument copying x86-64 needs to do. */
 	.macro movq src, dst
@@ -64,22 +64,22 @@
  */
 
 #define save_common_regs \
-	pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
-	pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
-	pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \
-	pushq_cfi %r8;  CFI_REL_OFFSET r8,  0; \
-	pushq_cfi %r9;  CFI_REL_OFFSET r9,  0; \
-	pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \
-	pushq_cfi %r11; CFI_REL_OFFSET r11, 0
+	pushq_cfi_reg rdi; \
+	pushq_cfi_reg rsi; \
+	pushq_cfi_reg rcx; \
+	pushq_cfi_reg r8;  \
+	pushq_cfi_reg r9;  \
+	pushq_cfi_reg r10; \
+	pushq_cfi_reg r11
 
 #define restore_common_regs \
-	popq_cfi %r11; CFI_RESTORE r11; \
-	popq_cfi %r10; CFI_RESTORE r10; \
-	popq_cfi %r9;  CFI_RESTORE r9; \
-	popq_cfi %r8;  CFI_RESTORE r8; \
-	popq_cfi %rcx; CFI_RESTORE rcx; \
-	popq_cfi %rsi; CFI_RESTORE rsi; \
-	popq_cfi %rdi; CFI_RESTORE rdi
+	popq_cfi_reg r11; \
+	popq_cfi_reg r10; \
+	popq_cfi_reg r9; \
+	popq_cfi_reg r8; \
+	popq_cfi_reg rcx; \
+	popq_cfi_reg rsi; \
+	popq_cfi_reg rdi
 
 #endif
 
@@ -87,12 +87,10 @@
 ENTRY(call_rwsem_down_read_failed)
 	CFI_STARTPROC
 	save_common_regs
-	__ASM_SIZE(push,_cfi) %__ASM_REG(dx)
-	CFI_REL_OFFSET __ASM_REG(dx), 0
+	__ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
 	movq %rax,%rdi
 	call rwsem_down_read_failed
-	__ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
-	CFI_RESTORE __ASM_REG(dx)
+	__ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
 	restore_common_regs
 	ret
 	CFI_ENDPROC
@@ -124,12 +122,10 @@
 ENTRY(call_rwsem_downgrade_wake)
 	CFI_STARTPROC
 	save_common_regs
-	__ASM_SIZE(push,_cfi) %__ASM_REG(dx)
-	CFI_REL_OFFSET __ASM_REG(dx), 0
+	__ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
 	movq %rax,%rdi
 	call rwsem_downgrade_wake
-	__ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
-	CFI_RESTORE __ASM_REG(dx)
+	__ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
 	restore_common_regs
 	ret
 	CFI_ENDPROC
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index e28cdaf..5eb7150 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -13,12 +13,9 @@
 	.globl \name
 \name:
 	CFI_STARTPROC
-	pushl_cfi %eax
-	CFI_REL_OFFSET eax, 0
-	pushl_cfi %ecx
-	CFI_REL_OFFSET ecx, 0
-	pushl_cfi %edx
-	CFI_REL_OFFSET edx, 0
+	pushl_cfi_reg eax
+	pushl_cfi_reg ecx
+	pushl_cfi_reg edx
 
 	.if \put_ret_addr_in_eax
 	/* Place EIP in the arg1 */
@@ -26,12 +23,9 @@
 	.endif
 
 	call \func
-	popl_cfi %edx
-	CFI_RESTORE edx
-	popl_cfi %ecx
-	CFI_RESTORE ecx
-	popl_cfi %eax
-	CFI_RESTORE eax
+	popl_cfi_reg edx
+	popl_cfi_reg ecx
+	popl_cfi_reg eax
 	ret
 	CFI_ENDPROC
 	_ASM_NOKPROBE(\name)
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index b30b5eb..f89ba4e9 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -17,9 +17,18 @@
 	CFI_STARTPROC
 
 	/* this one pushes 9 elems, the next one would be %rIP */
-	SAVE_ARGS
+	pushq_cfi_reg rdi
+	pushq_cfi_reg rsi
+	pushq_cfi_reg rdx
+	pushq_cfi_reg rcx
+	pushq_cfi_reg rax
+	pushq_cfi_reg r8
+	pushq_cfi_reg r9
+	pushq_cfi_reg r10
+	pushq_cfi_reg r11
 
 	.if \put_ret_addr_in_rdi
+	/* 9*8(%rsp) is return addr on stack */
 	movq_cfi_restore 9*8, rdi
 	.endif
 
@@ -45,11 +54,22 @@
 #endif
 #endif
 
-	/* SAVE_ARGS below is used only for the .cfi directives it contains. */
+#if defined(CONFIG_TRACE_IRQFLAGS) \
+ || defined(CONFIG_DEBUG_LOCK_ALLOC) \
+ || defined(CONFIG_PREEMPT)
 	CFI_STARTPROC
-	SAVE_ARGS
+	CFI_ADJUST_CFA_OFFSET 9*8
 restore:
-	RESTORE_ARGS
+	popq_cfi_reg r11
+	popq_cfi_reg r10
+	popq_cfi_reg r9
+	popq_cfi_reg r8
+	popq_cfi_reg rax
+	popq_cfi_reg rcx
+	popq_cfi_reg rdx
+	popq_cfi_reg rsi
+	popq_cfi_reg rdi
 	ret
 	CFI_ENDPROC
 	_ASM_NOKPROBE(restore)
+#endif
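(Side note, not part of the patch: a hypothetical C mirror of the frame built by
the nine explicit pushes above, lowest address -- i.e. the current %rsp -- first,
which is why put_ret_addr_in_rdi reads the caller's return address from 9*8(%rsp):

	struct thunk_frame {		/* illustration only */
		unsigned long r11;	/* 0*8(%rsp), pushed last  */
		unsigned long r10;	/* 1*8(%rsp)               */
		unsigned long r9;	/* 2*8(%rsp)               */
		unsigned long r8;	/* 3*8(%rsp)               */
		unsigned long rax;	/* 4*8(%rsp)               */
		unsigned long rcx;	/* 5*8(%rsp)               */
		unsigned long rdx;	/* 6*8(%rsp)               */
		unsigned long rsi;	/* 7*8(%rsp)               */
		unsigned long rdi;	/* 8*8(%rsp), pushed first */
		unsigned long ret_addr;	/* 9*8(%rsp), caller %rip  */
	};
)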
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 1a2be7c..816488c 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -273,6 +273,9 @@
 de: ESC
 df: ESC
 # 0xe0 - 0xef
+# Note: "forced64" is Intel CPU behavior: Intel CPUs ignore the 0x66 prefix
+# in 64-bit mode. AMD CPUs accept the 0x66 prefix; there it causes RIP
+# truncation to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD.

 e0: LOOPNE/LOOPNZ Jb (f64)
 e1: LOOPE/LOOPZ Jb (f64)
 e2: LOOP Jb (f64)
@@ -281,6 +284,10 @@
 e5: IN eAX,Ib
 e6: OUT Ib,AL
 e7: OUT Ib,eAX
+# With a 0x66 prefix in 64-bit mode, on AMD CPUs the immediate offset
+# in "near" jumps and calls is 16-bit. For CALL, the pushed return
+# address is 16 bits wide and RSP is decremented by 2, but RSP is not
+# truncated to 16 bits, unlike RIP.
 e8: CALL Jz (f64)
 e9: JMP-near Jz (f64)
 ea: JMP-far Ap (i64)
@@ -456,6 +463,7 @@
 7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1)
 7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3)
 # 0x0f 0x80-0x8f
+# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
 80: JO Jz (f64)
 81: JNO Jz (f64)
 82: JB/JC/JNAE Jz (f64)
@@ -842,6 +850,7 @@
 GrpTable: Grp5
 0: INC Ev
 1: DEC Ev
+# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
 2: CALLN Ev (f64)
 3: CALLF Ep
 4: JMPN Ev (f64)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ede025f..181c53b 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -59,7 +59,7 @@
 	int ret = 0;
 
 	/* kprobe_running() needs smp_processor_id() */
-	if (kprobes_built_in() && !user_mode_vm(regs)) {
+	if (kprobes_built_in() && !user_mode(regs)) {
 		preempt_disable();
 		if (kprobe_running() && kprobe_fault_handler(regs, 14))
 			ret = 1;
@@ -148,7 +148,7 @@
 	instr = (void *)convert_ip_to_linear(current, regs);
 	max_instr = instr + 15;
 
-	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
+	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
 		return 0;
 
 	while (instr < max_instr) {
@@ -1035,7 +1035,7 @@
 	if (error_code & PF_USER)
 		return false;
 
-	if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC))
+	if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
 		return false;
 
 	return true;
@@ -1140,7 +1140,7 @@
 	 * User-mode registers count as a user access even for any
 	 * potential system fault or CPU buglet:
 	 */
-	if (user_mode_vm(regs)) {
+	if (user_mode(regs)) {
 		local_irq_enable();
 		error_code |= PF_USER;
 		flags |= FAULT_FLAG_USER;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a110efc..52417e7 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -179,7 +179,8 @@
 	if (cpu_has_pge) {
 		cr4_set_bits_and_update_boot(X86_CR4_PGE);
 		__supported_pte_mask |= _PAGE_GLOBAL;
-	}
+	} else
+		__supported_pte_mask &= ~_PAGE_GLOBAL;
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 5d04be5..4e664bd 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -111,7 +111,7 @@
 {
 	struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
 
-	if (!user_mode_vm(regs)) {
+	if (!user_mode(regs)) {
 		unsigned long stack = kernel_stack_pointer(regs);
 		if (depth)
 			dump_trace(NULL, regs, (unsigned long *)stack, 0,
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 3e32ed5..757678f 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -134,7 +134,7 @@
 static void fix_processor_context(void)
 {
 	int cpu = smp_processor_id();
-	struct tss_struct *t = &per_cpu(init_tss, cpu);
+	struct tss_struct *t = &per_cpu(cpu_tss, cpu);
 #ifdef CONFIG_X86_64
 	struct desc_struct *desc = get_cpu_gdt_table(cpu);
 	tss_desc tss;
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index b3560ec..ef8187f 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -119,7 +119,7 @@
 110	i386	iopl			sys_iopl
 111	i386	vhangup			sys_vhangup
 112	i386	idle
-113	i386	vm86old			sys_vm86old			sys32_vm86_warning
+113	i386	vm86old			sys_vm86old			sys_ni_syscall
 114	i386	wait4			sys_wait4			compat_sys_wait4
 115	i386	swapoff			sys_swapoff
 116	i386	sysinfo			sys_sysinfo			compat_sys_sysinfo
@@ -172,7 +172,7 @@
 163	i386	mremap			sys_mremap
 164	i386	setresuid		sys_setresuid16
 165	i386	getresuid		sys_getresuid16
-166	i386	vm86			sys_vm86			sys32_vm86_warning
+166	i386	vm86			sys_vm86			sys_ni_syscall
 167	i386	query_module
 168	i386	poll			sys_poll
 169	i386	nfsservctl
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 8d656fb..9ef32d5 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -178,7 +178,7 @@
 169	common	reboot			sys_reboot
 170	common	sethostname		sys_sethostname
 171	common	setdomainname		sys_setdomainname
-172	common	iopl			stub_iopl
+172	common	iopl			sys_iopl
 173	common	ioperm			sys_ioperm
 174	64	create_module
 175	common	init_module		sys_init_module
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 2d7d9a1..8ffd214 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -64,8 +64,8 @@
  */
 static inline void rdtsc_barrier(void)
 {
-	alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
-	alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+			  "lfence", X86_FEATURE_LFENCE_RDTSC);
 }
 
 #endif
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 5cdfa9d..a75d8700 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -16,7 +16,7 @@
  */
 
 /* Not going to be implemented by UML, since we have no hardware. */
-#define stub_iopl sys_ni_syscall
+#define sys_iopl sys_ni_syscall
 #define sys_ioperm sys_ni_syscall
 
 /*
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5240f56..81665c9 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -912,6 +912,7 @@
 	mcs = xen_mc_entry(0);
 	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
 	xen_mc_issue(PARAVIRT_LAZY_CPU);
+	tss->x86_tss.sp0 = thread->sp0;
 }
 
 static void xen_set_iopl_mask(unsigned mask)
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 08e8489..765b768 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -452,8 +452,7 @@
 	clear_tsk_thread_flag(idle, TIF_FORK);
 #endif
 	per_cpu(kernel_stack, cpu) =
-		(unsigned long)task_stack_page(idle) -
-		KERNEL_STACK_OFFSET + THREAD_SIZE;
+		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
 
 	xen_setup_runstate_info(cpu);
 	xen_setup_timer(cpu);
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 53adefd..985fc3e 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -68,11 +68,11 @@
 	 * We're already on the usermode stack at this point, but
 	 * still with the kernel gs, so we can easily switch back
 	 */
-	movq %rsp, PER_CPU_VAR(old_rsp)
+	movq %rsp, PER_CPU_VAR(rsp_scratch)
 	movq PER_CPU_VAR(kernel_stack), %rsp
 
 	pushq $__USER_DS
-	pushq PER_CPU_VAR(old_rsp)
+	pushq PER_CPU_VAR(rsp_scratch)
 	pushq %r11
 	pushq $__USER_CS
 	pushq %rcx
@@ -87,11 +87,11 @@
 	 * We're already on the usermode stack at this point, but
 	 * still with the kernel gs, so we can easily switch back
 	 */
-	movq %rsp, PER_CPU_VAR(old_rsp)
+	movq %rsp, PER_CPU_VAR(rsp_scratch)
 	movq PER_CPU_VAR(kernel_stack), %rsp
 
 	pushq $__USER32_DS
-	pushq PER_CPU_VAR(old_rsp)
+	pushq PER_CPU_VAR(rsp_scratch)
 	pushq %r11
 	pushq $__USER32_CS
 	pushq %rcx
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c
index 82dc5748..7f32712 100644
--- a/drivers/misc/sgi-xp/xpc_main.c
+++ b/drivers/misc/sgi-xp/xpc_main.c
@@ -1210,7 +1210,7 @@
 
 		if (((die_args->trapnr == X86_TRAP_MF) ||
 		     (die_args->trapnr == X86_TRAP_XF)) &&
-		    !user_mode_vm(die_args->regs))
+		    !user_mode(die_args->regs))
 			xpc_die_deactivate();
 
 		break;
diff --git a/include/linux/stddef.h b/include/linux/stddef.h
index f4aec0e..076af43 100644
--- a/include/linux/stddef.h
+++ b/include/linux/stddef.h
@@ -19,3 +19,12 @@
 #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
 #endif
 #endif
+
+/**
+ * offsetofend(TYPE, MEMBER)
+ *
+ * @TYPE: The type of the structure
+ * @MEMBER: The member within the structure to get the end offset of
+ */
+#define offsetofend(TYPE, MEMBER) \
+	(offsetof(TYPE, MEMBER)	+ sizeof(((TYPE *)0)->MEMBER))
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 2d67b89..049b2f4 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -78,19 +78,6 @@
 extern void vfio_unregister_iommu_driver(
 				const struct vfio_iommu_driver_ops *ops);
 
-/**
- * offsetofend(TYPE, MEMBER)
- *
- * @TYPE: The type of the structure
- * @MEMBER: The member within the structure to get the end offset of
- *
- * Simple helper macro for dealing with variable sized structures passed
- * from user space.  This allows us to easily determine if the provided
- * structure is sized to include various fields.
- */
-#define offsetofend(TYPE, MEMBER) \
-	(offsetof(TYPE, MEMBER)	+ sizeof(((TYPE *)0)->MEMBER))
-
 /*
  * External user API
  */
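(Side note, not part of the patch: offsetofend() yields the offset of the first
byte past MEMBER, which is what VFIO's ioctl paths -- where the macro is moving
from -- use to check that a user-supplied, variable-sized structure is at least
large enough to contain a given field. A stand-alone user-space example with a
made-up struct:

	#include <stddef.h>
	#include <stdio.h>

	#define offsetofend(TYPE, MEMBER) \
		(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

	struct hdr {
		unsigned int argsz;
		unsigned int flags;
		unsigned long payload;
	};

	int main(void)
	{
		/* First byte past 'flags'; 8 on common ABIs. */
		printf("%zu\n", offsetofend(struct hdr, flags));
		return 0;
	}
)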
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
index d66ab79..8c0c1a2 100644
--- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
@@ -1,12 +1,12 @@
 
-MEMCPY_FN(__memcpy,
+MEMCPY_FN(memcpy_orig,
 	"x86-64-unrolled",
 	"unrolled memcpy() in arch/x86/lib/memcpy_64.S")
 
-MEMCPY_FN(memcpy_c,
+MEMCPY_FN(__memcpy,
 	"x86-64-movsq",
 	"movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
 
-MEMCPY_FN(memcpy_c_e,
+MEMCPY_FN(memcpy_erms,
 	"x86-64-movsb",
 	"movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S
index fcd9cf0..e4c2c30 100644
--- a/tools/perf/bench/mem-memcpy-x86-64-asm.S
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -1,8 +1,6 @@
 #define memcpy MEMCPY /* don't hide glibc's memcpy() */
 #define altinstr_replacement text
 #define globl p2align 4; .globl
-#define Lmemcpy_c globl memcpy_c; memcpy_c
-#define Lmemcpy_c_e globl memcpy_c_e; memcpy_c_e
 #include "../../../arch/x86/lib/memcpy_64.S"
 /*
  * We need to provide note.GNU-stack section, saying that we want
diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index db1d3a2..d3dfb79 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -36,7 +36,7 @@
 		    "Specify length of memory to copy. "
 		    "Available units: B, KB, MB, GB and TB (upper and lower)"),
 	OPT_STRING('r', "routine", &routine, "default",
-		    "Specify routine to copy"),
+		    "Specify routine to copy, \"all\" runs all available routines"),
 	OPT_INTEGER('i', "iterations", &iterations,
 		    "repeat memcpy() invocation this number of times"),
 	OPT_BOOLEAN('c', "cycle", &use_cycle,
@@ -135,55 +135,16 @@
 	const char *const *usage;
 };
 
-static int bench_mem_common(int argc, const char **argv,
-		     const char *prefix __maybe_unused,
-		     struct bench_mem_info *info)
+static void __bench_mem_routine(struct bench_mem_info *info, int r_idx, size_t len, double totallen)
 {
-	int i;
-	size_t len;
-	double totallen;
+	const struct routine *r = &info->routines[r_idx];
 	double result_bps[2];
 	u64 result_cycle[2];
 
-	argc = parse_options(argc, argv, options,
-			     info->usage, 0);
-
-	if (no_prefault && only_prefault) {
-		fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n");
-		return 1;
-	}
-
-	if (use_cycle)
-		init_cycle();
-
-	len = (size_t)perf_atoll((char *)length_str);
-	totallen = (double)len * iterations;
-
 	result_cycle[0] = result_cycle[1] = 0ULL;
 	result_bps[0] = result_bps[1] = 0.0;
 
-	if ((s64)len <= 0) {
-		fprintf(stderr, "Invalid length:%s\n", length_str);
-		return 1;
-	}
-
-	/* same to without specifying either of prefault and no-prefault */
-	if (only_prefault && no_prefault)
-		only_prefault = no_prefault = false;
-
-	for (i = 0; info->routines[i].name; i++) {
-		if (!strcmp(info->routines[i].name, routine))
-			break;
-	}
-	if (!info->routines[i].name) {
-		printf("Unknown routine:%s\n", routine);
-		printf("Available routines...\n");
-		for (i = 0; info->routines[i].name; i++) {
-			printf("\t%s ... %s\n",
-			       info->routines[i].name, info->routines[i].desc);
-		}
-		return 1;
-	}
+	printf("Routine %s (%s)\n", r->name, r->desc);
 
 	if (bench_format == BENCH_FORMAT_DEFAULT)
 		printf("# Copying %s Bytes ...\n\n", length_str);
@@ -191,28 +152,17 @@
 	if (!only_prefault && !no_prefault) {
 		/* show both of results */
 		if (use_cycle) {
-			result_cycle[0] =
-				info->do_cycle(&info->routines[i], len, false);
-			result_cycle[1] =
-				info->do_cycle(&info->routines[i], len, true);
+			result_cycle[0] = info->do_cycle(r, len, false);
+			result_cycle[1] = info->do_cycle(r, len, true);
 		} else {
-			result_bps[0] =
-				info->do_gettimeofday(&info->routines[i],
-						len, false);
-			result_bps[1] =
-				info->do_gettimeofday(&info->routines[i],
-						len, true);
+			result_bps[0]   = info->do_gettimeofday(r, len, false);
+			result_bps[1]   = info->do_gettimeofday(r, len, true);
 		}
 	} else {
-		if (use_cycle) {
-			result_cycle[pf] =
-				info->do_cycle(&info->routines[i],
-						len, only_prefault);
-		} else {
-			result_bps[pf] =
-				info->do_gettimeofday(&info->routines[i],
-						len, only_prefault);
-		}
+		if (use_cycle)
+			result_cycle[pf] = info->do_cycle(r, len, only_prefault);
+		else
+			result_bps[pf] = info->do_gettimeofday(r, len, only_prefault);
 	}
 
 	switch (bench_format) {
@@ -265,6 +215,60 @@
 		die("unknown format: %d\n", bench_format);
 		break;
 	}
+}
+
+static int bench_mem_common(int argc, const char **argv,
+		     const char *prefix __maybe_unused,
+		     struct bench_mem_info *info)
+{
+	int i;
+	size_t len;
+	double totallen;
+
+	argc = parse_options(argc, argv, options,
+			     info->usage, 0);
+
+	if (no_prefault && only_prefault) {
+		fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n");
+		return 1;
+	}
+
+	if (use_cycle)
+		init_cycle();
+
+	len = (size_t)perf_atoll((char *)length_str);
+	totallen = (double)len * iterations;
+
+	if ((s64)len <= 0) {
+		fprintf(stderr, "Invalid length:%s\n", length_str);
+		return 1;
+	}
+
+	/* same as not specifying either prefault or no-prefault */
+	if (only_prefault && no_prefault)
+		only_prefault = no_prefault = false;
+
+	if (!strncmp(routine, "all", 3)) {
+		for (i = 0; info->routines[i].name; i++)
+			__bench_mem_routine(info, i, len, totallen);
+		return 0;
+	}
+
+	for (i = 0; info->routines[i].name; i++) {
+		if (!strcmp(info->routines[i].name, routine))
+			break;
+	}
+	if (!info->routines[i].name) {
+		printf("Unknown routine:%s\n", routine);
+		printf("Available routines...\n");
+		for (i = 0; info->routines[i].name; i++) {
+			printf("\t%s ... %s\n",
+			       info->routines[i].name, info->routines[i].desc);
+		}
+		return 1;
+	}
+
+	__bench_mem_routine(info, i, len, totallen);
 
 	return 0;
 }
diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h
index a71dff9..f02d028 100644
--- a/tools/perf/bench/mem-memset-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h
@@ -1,12 +1,12 @@
 
-MEMSET_FN(__memset,
+MEMSET_FN(memset_orig,
 	"x86-64-unrolled",
 	"unrolled memset() in arch/x86/lib/memset_64.S")
 
-MEMSET_FN(memset_c,
+MEMSET_FN(__memset,
 	"x86-64-stosq",
 	"movsq-based memset() in arch/x86/lib/memset_64.S")
 
-MEMSET_FN(memset_c_e,
+MEMSET_FN(memset_erms,
 	"x86-64-stosb",
 	"movsb-based memset() in arch/x86/lib/memset_64.S")
diff --git a/tools/perf/bench/mem-memset-x86-64-asm.S b/tools/perf/bench/mem-memset-x86-64-asm.S
index 9e5af89..de27878 100644
--- a/tools/perf/bench/mem-memset-x86-64-asm.S
+++ b/tools/perf/bench/mem-memset-x86-64-asm.S
@@ -1,8 +1,6 @@
 #define memset MEMSET /* don't hide glibc's memset() */
 #define altinstr_replacement text
 #define globl p2align 4; .globl
-#define Lmemset_c globl memset_c; memset_c
-#define Lmemset_c_e globl memset_c_e; memset_c_e
 #include "../../../arch/x86/lib/memset_64.S"
 
 /*
diff --git a/tools/perf/util/include/asm/alternative-asm.h b/tools/perf/util/include/asm/alternative-asm.h
index 6789d788..3a3a0f1 100644
--- a/tools/perf/util/include/asm/alternative-asm.h
+++ b/tools/perf/util/include/asm/alternative-asm.h
@@ -4,5 +4,6 @@
 /* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */
 
 #define altinstruction_entry #
+#define ALTERNATIVE_2 #
 
 #endif