x86/asm/entry/64: Always allocate a complete "struct pt_regs" on the kernel stack
The 64-bit entry code was using six stack slots less by not
saving/restoring registers which are callee-preserved according
to the C ABI, and was not allocating space for them.
Only when syscalls needed a complete "struct pt_regs" was
the complete area allocated and filled in.
As an additional twist, on interrupt entry a "slightly less
truncated pt_regs" trick is used, to make nested interrupt
stacks easier to unwind.
This proved to be a source of significant obfuscation and subtle
bugs. For example, 'stub_fork' had to pop the return address,
extend the struct, save registers, and push return address back.
Ugly. 'ia32_ptregs_common' pops return address and "returns" via
jmp insn, throwing a wrench into CPU return stack cache.
This patch changes the code to always allocate a complete
"struct pt_regs" on the kernel stack. The saving of registers
is still done lazily.
"Partial pt_regs" trick on interrupt stack is retained.
Macros which manipulate "struct pt_regs" on stack are reworked:
- ALLOC_PT_GPREGS_ON_STACK allocates the structure.
- SAVE_C_REGS saves to it those registers which are clobbered
by C code.
- SAVE_EXTRA_REGS saves to it all other registers.
- Corresponding RESTORE_* and REMOVE_PT_GPREGS_FROM_STACK macros
reverse it.
'ia32_ptregs_common', 'stub_fork' and friends lost their ugly dance
with the return pointer.
LOAD_ARGS32 in ia32entry.S now uses symbolic stack offsets
instead of magic numbers.
'error_entry' and 'save_paranoid' now use SAVE_C_REGS +
SAVE_EXTRA_REGS instead of having it open-coded yet again.
Patch was run-tested: 64-bit executables, 32-bit executables,
strace works.
Timing tests did not show measurable difference in 32-bit
and 64-bit syscalls.
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1423778052-21038-2-git-send-email-dvlasenk@redhat.com
Link: http://lkml.kernel.org/r/b89763d354aa23e670b9bdf3a40ae320320a7c2e.1424989793.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a57b338..e8372e0 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -26,12 +26,6 @@
* Some macro usage:
* - CFI macros are used to generate dwarf2 unwind information for better
* backtraces. They don't change any code.
- * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
- * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
- * There are unfortunately lots of special cases where some registers
- * not touched. The macro is a big mess that should be cleaned up.
- * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
- * Gives a full stack frame.
* - ENTRY/END Define functions in the symbol table.
* - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
* frame that is otherwise undefined after a SYSCALL
@@ -190,9 +184,9 @@
.endm
/*
- * frame that enables calling into C.
+ * frame that enables passing a complete pt_regs to a C function.
*/
- .macro PARTIAL_FRAME start=1 offset=0
+ .macro DEFAULT_FRAME start=1 offset=0
XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
@@ -203,13 +197,6 @@
CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
- .endm
-
-/*
- * frame that enables passing a complete pt_regs to a C function.
- */
- .macro DEFAULT_FRAME start=1 offset=0
- PARTIAL_FRAME \start, R11+\offset-R15
CFI_REL_OFFSET rbx, RBX+\offset
CFI_REL_OFFSET rbp, RBP+\offset
CFI_REL_OFFSET r12, R12+\offset
@@ -221,21 +208,8 @@
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
- movq %rdi, RDI+8(%rsp)
- movq %rsi, RSI+8(%rsp)
- movq_cfi rdx, RDX+8
- movq_cfi rcx, RCX+8
- movq_cfi rax, RAX+8
- movq %r8, R8+8(%rsp)
- movq %r9, R9+8(%rsp)
- movq %r10, R10+8(%rsp)
- movq %r11, R11+8(%rsp)
- movq_cfi rbx, RBX+8
- movq %rbp, RBP+8(%rsp)
- movq %r12, R12+8(%rsp)
- movq %r13, R13+8(%rsp)
- movq %r14, R14+8(%rsp)
- movq %r15, R15+8(%rsp)
+ SAVE_C_REGS 8
+ SAVE_EXTRA_REGS 8
movl $1,%ebx
movl $MSR_GS_BASE,%ecx
rdmsr
@@ -264,7 +238,7 @@
GET_THREAD_INFO(%rcx)
- RESTORE_REST
+ RESTORE_EXTRA_REGS
testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
jz 1f
@@ -276,12 +250,10 @@
jmp ret_from_sys_call # go to the SYSRET fastpath
1:
- subq $REST_SKIP, %rsp # leave space for volatiles
- CFI_ADJUST_CFA_OFFSET REST_SKIP
movq %rbp, %rdi
call *%rbx
movl $0, RAX(%rsp)
- RESTORE_REST
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
END(ret_from_fork)
@@ -339,9 +311,11 @@
* and short:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8, 0, rax_enosys=1
+ ALLOC_PT_GPREGS_ON_STACK 8
+ SAVE_C_REGS_EXCEPT_RAX_RCX
+ movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
movq_cfi rax,(ORIG_RAX-ARGOFFSET)
- movq %rcx,RIP-ARGOFFSET(%rsp)
+ movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jnz tracesys
@@ -372,9 +346,9 @@
* sysretq will re-enable interrupts:
*/
TRACE_IRQS_ON
+ RESTORE_C_REGS_EXCEPT_RCX
movq RIP-ARGOFFSET(%rsp),%rcx
CFI_REGISTER rip,rcx
- RESTORE_ARGS 1,-ARG_SKIP,0
/*CFI_REGISTER rflags,r11*/
movq PER_CPU_VAR(old_rsp), %rsp
USERGS_SYSRET64
@@ -387,16 +361,17 @@
/* Do syscall tracing */
tracesys:
- leaq -REST_SKIP(%rsp), %rdi
+ movq %rsp, %rdi
movq $AUDIT_ARCH_X86_64, %rsi
call syscall_trace_enter_phase1
test %rax, %rax
jnz tracesys_phase2 /* if needed, run the slow path */
- LOAD_ARGS 0 /* else restore clobbered regs */
+ RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
+ movq ORIG_RAX-ARGOFFSET(%rsp), %rax
jmp system_call_fastpath /* and return to the fast path */
tracesys_phase2:
- SAVE_REST
+ SAVE_EXTRA_REGS
FIXUP_TOP_OF_STACK %rdi
movq %rsp, %rdi
movq $AUDIT_ARCH_X86_64, %rsi
@@ -408,8 +383,8 @@
* We don't reload %rax because syscall_trace_entry_phase2() returned
* the value it wants us to use in the table lookup.
*/
- LOAD_ARGS ARGOFFSET, 1
- RESTORE_REST
+ RESTORE_C_REGS_EXCEPT_RAX
+ RESTORE_EXTRA_REGS
#if __SYSCALL_MASK == ~0
cmpq $__NR_syscall_max,%rax
#else
@@ -460,7 +435,7 @@
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
int_check_syscall_exit_work:
- SAVE_REST
+ SAVE_EXTRA_REGS
/* Check for syscall exit trace */
testl $_TIF_WORK_SYSCALL_EXIT,%edx
jz int_signal
@@ -479,7 +454,7 @@
call do_notify_resume
1: movl $_TIF_WORK_MASK,%edi
int_restore_rest:
- RESTORE_REST
+ RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
@@ -489,15 +464,12 @@
.macro FORK_LIKE func
ENTRY(stub_\func)
CFI_STARTPROC
- popq %r11 /* save return address */
- PARTIAL_FRAME 0
- SAVE_REST
- pushq %r11 /* put it back on stack */
+ DEFAULT_FRAME 0, 8 /* offset 8: return address */
+ SAVE_EXTRA_REGS 8
FIXUP_TOP_OF_STACK %r11, 8
- DEFAULT_FRAME 0 8 /* offset 8: return address */
call sys_\func
RESTORE_TOP_OF_STACK %r11, 8
- ret $REST_SKIP /* pop extended registers */
+ ret
CFI_ENDPROC
END(stub_\func)
.endm
@@ -505,7 +477,7 @@
.macro FIXED_FRAME label,func
ENTRY(\label)
CFI_STARTPROC
- PARTIAL_FRAME 0 8 /* offset 8: return address */
+ DEFAULT_FRAME 0, 8 /* offset 8: return address */
FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
call \func
RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
@@ -522,12 +494,12 @@
ENTRY(stub_execve)
CFI_STARTPROC
addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
+ DEFAULT_FRAME 0
+ SAVE_EXTRA_REGS
FIXUP_TOP_OF_STACK %r11
call sys_execve
movq %rax,RAX(%rsp)
- RESTORE_REST
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_execve)
@@ -535,13 +507,13 @@
ENTRY(stub_execveat)
CFI_STARTPROC
addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
+ DEFAULT_FRAME 0
+ SAVE_EXTRA_REGS
FIXUP_TOP_OF_STACK %r11
call sys_execveat
RESTORE_TOP_OF_STACK %r11
movq %rax,RAX(%rsp)
- RESTORE_REST
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_execveat)
@@ -553,12 +525,12 @@
ENTRY(stub_rt_sigreturn)
CFI_STARTPROC
addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
+ DEFAULT_FRAME 0
+ SAVE_EXTRA_REGS
FIXUP_TOP_OF_STACK %r11
call sys_rt_sigreturn
movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
- RESTORE_REST
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_rt_sigreturn)
@@ -567,12 +539,12 @@
ENTRY(stub_x32_rt_sigreturn)
CFI_STARTPROC
addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
+ DEFAULT_FRAME 0
+ SAVE_EXTRA_REGS
FIXUP_TOP_OF_STACK %r11
call sys32_x32_rt_sigreturn
movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
- RESTORE_REST
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_x32_rt_sigreturn)
@@ -580,13 +552,13 @@
ENTRY(stub_x32_execve)
CFI_STARTPROC
addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
+ DEFAULT_FRAME 0
+ SAVE_EXTRA_REGS
FIXUP_TOP_OF_STACK %r11
call compat_sys_execve
RESTORE_TOP_OF_STACK %r11
movq %rax,RAX(%rsp)
- RESTORE_REST
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_x32_execve)
@@ -594,13 +566,13 @@
ENTRY(stub_x32_execveat)
CFI_STARTPROC
addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
+ DEFAULT_FRAME 0
+ SAVE_EXTRA_REGS
FIXUP_TOP_OF_STACK %r11
call compat_sys_execveat
RESTORE_TOP_OF_STACK %r11
movq %rax,RAX(%rsp)
- RESTORE_REST
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_x32_execveat)
@@ -656,42 +628,28 @@
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
- /* reserve pt_regs for scratch regs and rbp */
- subq $ORIG_RAX-RBP, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
cld
- /* start from rbp in pt_regs and jump over */
- movq_cfi rdi, (RDI-RBP)
- movq_cfi rsi, (RSI-RBP)
- movq_cfi rdx, (RDX-RBP)
- movq_cfi rcx, (RCX-RBP)
- movq_cfi rax, (RAX-RBP)
- movq_cfi r8, (R8-RBP)
- movq_cfi r9, (R9-RBP)
- movq_cfi r10, (R10-RBP)
- movq_cfi r11, (R11-RBP)
+ ALLOC_PT_GPREGS_ON_STACK -RBP
+ SAVE_C_REGS -RBP
+ /* this goes to 0(%rsp) for unwinder, not for saving the value: */
+ SAVE_EXTRA_REGS_RBP -RBP
- /* Save rbp so that we can unwind from get_irq_regs() */
- movq_cfi rbp, 0
+ leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */
- /* Save previous stack value */
- movq %rsp, %rsi
-
- leaq -RBP(%rsp),%rdi /* arg1 for handler */
- testl $3, CS-RBP(%rsi)
+ testl $3, CS-RBP(%rsp)
je 1f
SWAPGS
+1:
/*
* irq_count is used to check if a CPU is already on an interrupt stack
* or not. While this is essentially redundant with preempt_count it is
* a little cheaper to use a separate counter in the PDA (short of
* moving irq_enter into assembly, which would be too much work)
*/
-1: incl PER_CPU_VAR(irq_count)
+ movq %rsp, %rsi
+ incl PER_CPU_VAR(irq_count)
cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
CFI_DEF_CFA_REGISTER rsi
-
- /* Store previous stack value */
pushq %rsi
CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
0x77 /* DW_OP_breg7 */, 0, \
@@ -800,7 +758,8 @@
*/
irq_return_via_sysret:
CFI_REMEMBER_STATE
- RESTORE_ARGS 1,8,1
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
movq (RSP-RIP)(%rsp),%rsp
USERGS_SYSRET64
CFI_RESTORE_STATE
@@ -816,7 +775,8 @@
*/
TRACE_IRQS_IRETQ
restore_args:
- RESTORE_ARGS 1,8,1
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
irq_return:
INTERRUPT_RETURN
@@ -887,12 +847,12 @@
jz retint_swapgs
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_REST
+ SAVE_EXTRA_REGS
movq $-1,ORIG_RAX(%rsp)
xorl %esi,%esi # oldset
movq %rsp,%rdi # &pt_regs
call do_notify_resume
- RESTORE_REST
+ RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
@@ -1019,8 +979,7 @@
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
.endif
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+ ALLOC_PT_GPREGS_ON_STACK
.if \paranoid
.if \paranoid == 1
@@ -1269,7 +1228,9 @@
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
pushq_cfi $-1 /* orig_ax = -1 => not a system call */
- SAVE_ALL
+ ALLOC_PT_GPREGS_ON_STACK
+ SAVE_C_REGS
+ SAVE_EXTRA_REGS
jmp error_exit
CFI_ENDPROC
END(xen_failsafe_callback)
@@ -1321,11 +1282,15 @@
jnz paranoid_restore
TRACE_IRQS_IRETQ 0
SWAPGS_UNSAFE_STACK
- RESTORE_ALL 8
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
INTERRUPT_RETURN
paranoid_restore:
TRACE_IRQS_IRETQ_DEBUG 0
- RESTORE_ALL 8
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
INTERRUPT_RETURN
CFI_ENDPROC
END(paranoid_exit)
@@ -1339,21 +1304,8 @@
CFI_ADJUST_CFA_OFFSET 15*8
/* oldrax contains error code */
cld
- movq %rdi, RDI+8(%rsp)
- movq %rsi, RSI+8(%rsp)
- movq %rdx, RDX+8(%rsp)
- movq %rcx, RCX+8(%rsp)
- movq %rax, RAX+8(%rsp)
- movq %r8, R8+8(%rsp)
- movq %r9, R9+8(%rsp)
- movq %r10, R10+8(%rsp)
- movq %r11, R11+8(%rsp)
- movq_cfi rbx, RBX+8
- movq %rbp, RBP+8(%rsp)
- movq %r12, R12+8(%rsp)
- movq %r13, R13+8(%rsp)
- movq %r14, R14+8(%rsp)
- movq %r15, R15+8(%rsp)
+ SAVE_C_REGS 8
+ SAVE_EXTRA_REGS 8
xorl %ebx,%ebx
testl $3,CS+8(%rsp)
je error_kernelspace
@@ -1402,7 +1354,7 @@
ENTRY(error_exit)
DEFAULT_FRAME
movl %ebx,%eax
- RESTORE_REST
+ RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
@@ -1621,8 +1573,8 @@
* so that we repeat another NMI.
*/
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+ ALLOC_PT_GPREGS_ON_STACK
+
/*
* Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
* as we should not be calling schedule in NMI context.
@@ -1661,8 +1613,10 @@
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
/* Pop the extra iret frame at once */
- RESTORE_ALL 6*8
+ REMOVE_PT_GPREGS_FROM_STACK 6*8
/* Clear the NMI executing stack variable */
movq $0, 5*8(%rsp)