// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/syscalls.h>
#include <linux/iommu.h>

#include <asm/processor.h>
#include <asm/pkru.h>
#include <asm/fpu/sched.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
#include <asm/resctrl.h>
#include <asm/unistd.h>
#include <asm/fsgsbase.h>
#include <asm/fred.h>
#ifdef CONFIG_IA32_EMULATION
/* Not included via unistd.h */
#include <asm/unistd_32_ia32.h>
#endif

#include "process.h"

/* Also prints some state that isn't saved in pt_regs. */
void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
		 const char *log_lvl)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, es;

	show_iret_regs(regs, log_lvl);

	if (regs->orig_ax != -1)
		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
	else
		pr_cont("\n");

	printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
	       log_lvl, regs->ax, regs->bx, regs->cx);
	printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
	       log_lvl, regs->dx, regs->si, regs->di);
	printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
	       log_lvl, regs->bp, regs->r8, regs->r9);
	printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
	       log_lvl, regs->r10, regs->r11, regs->r12);
	printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
	       log_lvl, regs->r13, regs->r14, regs->r15);

	if (mode == SHOW_REGS_SHORT)
		return;

	if (mode == SHOW_REGS_USER) {
		rdmsrl(MSR_FS_BASE, fs);
		rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
		printk("%sFS: %016lx GS: %016lx\n",
		       log_lvl, fs, shadowgs);
		return;
	}

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = __read_cr3();
	cr4 = __read_cr4();

	printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
	printk("%sCS: %04x DS: %04x ES: %04x CR0: %016lx\n",
	       log_lvl, regs->cs, ds, es, cr0);
	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
	       log_lvl, cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);

	/* Only print out debug registers if they are in their non-default state. */
	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
	    (d6 == DR6_RESERVED) && (d7 == 0x400))) {
		printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
		       log_lvl, d0, d1, d2);
		printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
		       log_lvl, d3, d6, d7);
	}

	if (cr4 & X86_CR4_PKE)
		printk("%sPKRU: %08x\n", log_lvl, read_pkru());
}

void release_thread(struct task_struct *dead_task)
{
	WARN_ON(dead_task->mm);
}

enum which_selector {
	FS,
	GS
};

/*
 * Out of line to be protected from kprobes and tracing. If this were
 * traced or probed, then any access to a per-CPU variable would happen
 * with the wrong GS.
 *
 * It is not used on Xen paravirt. When paravirt support is needed, it
 * needs to be renamed with native_ prefix.
 */
static noinstr unsigned long __rdgsbase_inactive(void)
{
	unsigned long gsbase;

	lockdep_assert_irqs_disabled();

	/*
	 * SWAPGS is no longer needed thus NOT allowed with FRED because
	 * FRED transitions ensure that an operating system can _always_
	 * operate with its own GS base address:
	 * - For events that occur in ring 3, FRED event delivery swaps
	 *   the GS base address with the IA32_KERNEL_GS_BASE MSR.
	 * - ERETU (the FRED transition that returns to ring 3) also swaps
	 *   the GS base address with the IA32_KERNEL_GS_BASE MSR.
	 *
	 * And the operating system can still setup the GS segment for a
	 * user thread without the need of loading a user thread GS with:
	 * - Using LKGS, available with FRED, to modify other attributes
	 *   of the GS segment without compromising its ability always to
	 *   operate with its own GS base address.
	 * - Accessing the GS segment base address for a user thread as
	 *   before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR.
	 *
	 * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE
	 * MSR instead of the GS segment's descriptor cache. As such, the
	 * operating system never changes its runtime GS base address.
	 */
	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
		native_swapgs();
		gsbase = rdgsbase();
		native_swapgs();
	} else {
		instrumentation_begin();
		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
		instrumentation_end();
	}

	return gsbase;
}

/*
 * Out of line to be protected from kprobes and tracing. If this were
 * traced or probed, then any access to a per-CPU variable would happen
 * with the wrong GS.
 *
 * It is not used on Xen paravirt. When paravirt support is needed, it
 * needs to be renamed with native_ prefix.
 */
static noinstr void __wrgsbase_inactive(unsigned long gsbase)
{
	lockdep_assert_irqs_disabled();

	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
		native_swapgs();
		wrgsbase(gsbase);
		native_swapgs();
	} else {
		instrumentation_begin();
		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
		instrumentation_end();
	}
}

/*
 * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
 * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
 * It's forcibly inlined because it'll generate better code and this function
 * is hot.
 */
static __always_inline void save_base_legacy(struct task_struct *prev_p,
					     unsigned short selector,
					     enum which_selector which)
{
	if (likely(selector == 0)) {
		/*
		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
		 * be the pre-existing saved base or it could be zero. On AMD
		 * (with X86_BUG_NULL_SEG), the segment base could be almost
		 * anything.
		 *
		 * This branch is very hot (it's hit twice on almost every
		 * context switch between 64-bit programs), and avoiding
		 * the RDMSR helps a lot, so we just assume that whatever
		 * value is already saved is correct. This matches historical
		 * Linux behavior, so it won't break existing applications.
		 *
		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
		 * report that the base is zero, it needs to actually be zero:
		 * see the corresponding logic in load_seg_legacy.
		 */
	} else {
		/*
		 * If the selector is 1, 2, or 3, then the base is zero on
		 * !X86_BUG_NULL_SEG CPUs and could be anything on
		 * X86_BUG_NULL_SEG CPUs. In the latter case, Linux
		 * has never attempted to preserve the base across context
		 * switches.
		 *
		 * If selector > 3, then it refers to a real segment, and
		 * saving the base isn't necessary.
		 */
		if (which == FS)
			prev_p->thread.fsbase = 0;
		else
			prev_p->thread.gsbase = 0;
	}
}

static __always_inline void save_fsgs(struct task_struct *task)
{
	savesegment(fs, task->thread.fsindex);
	savesegment(gs, task->thread.gsindex);
	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
		/*
		 * If FSGSBASE is enabled, we can't make any useful guesses
		 * about the base, and user code expects us to save the current
		 * value. Fortunately, reading the base directly is efficient.
		 */
		task->thread.fsbase = rdfsbase();
		task->thread.gsbase = __rdgsbase_inactive();
	} else {
		save_base_legacy(task, task->thread.fsindex, FS);
		save_base_legacy(task, task->thread.gsindex, GS);
	}
}

/*
 * While a process is running, current->thread.fsbase and current->thread.gsbase
 * may not match the corresponding CPU registers (see save_base_legacy()).
 */
void current_save_fsgs(void)
{
	unsigned long flags;

	/* Interrupts need to be off for FSGSBASE */
	local_irq_save(flags);
	save_fsgs(current);
	local_irq_restore(flags);
}
#if IS_ENABLED(CONFIG_KVM)
EXPORT_SYMBOL_GPL(current_save_fsgs);
#endif

static __always_inline void loadseg(enum which_selector which,
				    unsigned short sel)
{
	if (which == FS)
		loadsegment(fs, sel);
	else
		load_gs_index(sel);
}

static __always_inline void load_seg_legacy(unsigned short prev_index,
					    unsigned long prev_base,
					    unsigned short next_index,
					    unsigned long next_base,
					    enum which_selector which)
{
	if (likely(next_index <= 3)) {
		/*
		 * The next task is using 64-bit TLS, is not using this
		 * segment at all, or is having fun with arcane CPU features.
		 */
		if (next_base == 0) {
			/*
			 * Nasty case: on AMD CPUs, we need to forcibly zero
			 * the base.
			 */
			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
				loadseg(which, __USER_DS);
				loadseg(which, next_index);
			} else {
				/*
				 * We could try to exhaustively detect cases
				 * under which we can skip the segment load,
				 * but there's really only one case that matters
				 * for performance: if both the previous and
				 * next states are fully zeroed, we can skip
				 * the load.
				 *
				 * (This assumes that prev_base == 0 has no
				 * false positives. This is the case on
				 * Intel-style CPUs.)
				 */
				if (likely(prev_index | next_index | prev_base))
					loadseg(which, next_index);
			}
		} else {
			if (prev_index != next_index)
				loadseg(which, next_index);
			wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
			       next_base);
		}
	} else {
		/*
		 * The next task is using a real segment. Loading the selector
		 * is sufficient.
		 */
		loadseg(which, next_index);
	}
}

/*
 * Store prev's PKRU value and load next's PKRU value if they differ. PKRU
 * is not XSTATE managed on context switch because that would require a
 * lookup in the task's FPU xsave buffer and would require keeping that
 * updated in various places.
 */
static __always_inline void x86_pkru_load(struct thread_struct *prev,
					  struct thread_struct *next)
{
	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
		return;

	/* Stash the prev task's value: */
	prev->pkru = rdpkru();

	/*
	 * PKRU writes are slightly expensive. Avoid them when not
	 * strictly necessary:
	 */
	if (prev->pkru != next->pkru)
		wrpkru(next->pkru);
}

static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
					      struct thread_struct *next)
{
	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
		/* Update the FS and GS selectors if they could have changed. */
		if (unlikely(prev->fsindex || next->fsindex))
			loadseg(FS, next->fsindex);
		if (unlikely(prev->gsindex || next->gsindex))
			loadseg(GS, next->gsindex);

		/* Update the bases. */
		wrfsbase(next->fsbase);
		__wrgsbase_inactive(next->gsbase);
	} else {
		load_seg_legacy(prev->fsindex, prev->fsbase,
				next->fsindex, next->fsbase, FS);
		load_seg_legacy(prev->gsindex, prev->gsbase,
				next->gsindex, next->gsbase, GS);
	}
}

unsigned long x86_fsgsbase_read_task(struct task_struct *task,
				     unsigned short selector)
{
	unsigned short idx = selector >> 3;
	unsigned long base;

	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
		if (unlikely(idx >= GDT_ENTRIES))
			return 0;

		/*
		 * There are no user segments in the GDT with nonzero bases
		 * other than the TLS segments.
		 */
		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
			return 0;

		idx -= GDT_ENTRY_TLS_MIN;
		base = get_desc_base(&task->thread.tls_array[idx]);
	} else {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
		struct ldt_struct *ldt;

		/*
		 * If performance here mattered, we could protect the LDT
		 * with RCU. This is a slow path, though, so we can just
		 * take the mutex.
		 */
		mutex_lock(&task->mm->context.lock);
		ldt = task->mm->context.ldt;
		if (unlikely(!ldt || idx >= ldt->nr_entries))
			base = 0;
		else
			base = get_desc_base(ldt->entries + idx);
		mutex_unlock(&task->mm->context.lock);
#else
		base = 0;
#endif
	}

	return base;
}

unsigned long x86_gsbase_read_cpu_inactive(void)
{
	unsigned long gsbase;

	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
		unsigned long flags;

		local_irq_save(flags);
		gsbase = __rdgsbase_inactive();
		local_irq_restore(flags);
	} else {
		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
	}

	return gsbase;
}

void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
{
	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
		unsigned long flags;

		local_irq_save(flags);
		__wrgsbase_inactive(gsbase);
		local_irq_restore(flags);
	} else {
		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
	}
}

unsigned long x86_fsbase_read_task(struct task_struct *task)
{
	unsigned long fsbase;

	if (task == current)
		fsbase = x86_fsbase_read_cpu();
	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
		 (task->thread.fsindex == 0))
		fsbase = task->thread.fsbase;
	else
		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);

	return fsbase;
}

unsigned long x86_gsbase_read_task(struct task_struct *task)
{
	unsigned long gsbase;

	if (task == current)
		gsbase = x86_gsbase_read_cpu_inactive();
	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
		 (task->thread.gsindex == 0))
		gsbase = task->thread.gsbase;
	else
		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);

	return gsbase;
}

void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
{
	WARN_ON_ONCE(task == current);

	task->thread.fsbase = fsbase;
}

void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
{
	WARN_ON_ONCE(task == current);

	task->thread.gsbase = gsbase;
}

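/*
 * Example (illustrative sketch, not kernel code): the accessors above back
 * two user-visible ways for a 64-bit task to manage its FS/GS bases. On a
 * CPU and kernel with FSGSBASE enabled, a program built with -mfsgsbase can
 * touch its own FS base directly, with no system call:
 *
 *	#include <immintrin.h>
 *
 *	unsigned long long base = _readfsbase_u64();	// read current FS base
 *	_writefsbase_u64(base);				// write it back
 *
 * Without FSGSBASE, the same task must go through
 * arch_prctl(ARCH_GET_FS/ARCH_SET_FS), handled by do_arch_prctl_64() below.
 */
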
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    u16 _cs, u16 _ss, u16 _ds)
{
	WARN_ON_ONCE(regs != current_pt_regs());

	if (static_cpu_has(X86_BUG_NULL_SEG)) {
		/* Loading zero below won't clear the base. */
		loadsegment(fs, __USER_DS);
		load_gs_index(__USER_DS);
	}

	reset_thread_features();

	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);

	regs->ip	= new_ip;
	regs->sp	= new_sp;
	regs->csx	= _cs;
	regs->ssx	= _ss;
	/*
	 * Allow single-step trap and NMI when starting a new task, thus
	 * once the new task enters user space, single-step trap and NMI
	 * are both enabled immediately.
	 *
	 * Entering a new task is logically speaking a return from a
	 * system call (exec, fork, clone, etc.). As such, if ptrace
	 * enables single stepping a single step exception should be
	 * allowed to trigger immediately upon entering user space.
	 * This is not optional.
	 *
	 * NMI should *never* be disabled in user space. As such, this
	 * is an optional, opportunistic way to catch errors.
	 *
	 * Paranoia: High-order 48 bits above the lowest 16 bit SS are
	 * discarded by the legacy IRET instruction on all Intel, AMD,
	 * and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally,
	 * even when FRED is not enabled. But we choose the safer side
	 * to use these bits only when FRED is enabled.
	 */
	if (cpu_feature_enabled(X86_FEATURE_FRED)) {
		regs->fred_ss.swevent	= true;
		regs->fred_ss.nmi	= true;
	}

	regs->flags	= X86_EFLAGS_IF | X86_EFLAGS_FIXED;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}
EXPORT_SYMBOL_GPL(start_thread);

#ifdef CONFIG_COMPAT
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
{
	start_thread_common(regs, new_ip, new_sp,
			    x32 ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported either.
 */
__no_kmsan_checks
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();

	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
		     this_cpu_read(pcpu_hot.hardirq_stack_inuse));

	if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD))
		switch_fpu_prepare(prev_p, cpu);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	save_fsgs(prev_p);

	/*
	 * Load TLS before restoring any segments so that segment loads
	 * reference the correct GDT entries.
	 */
	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here. This
	 * must be done after loading TLS entries in the GDT but before
	 * loading segments that might reference them.
	 */
	arch_end_context_switch(next_p);

	/* Switch DS and ES.
	 *
	 * Reading them only returns the selectors, but writing them (if
	 * nonzero) loads the full descriptor from the GDT or LDT. The
	 * LDT for next is loaded in switch_mm, and the GDT is loaded
	 * above.
	 *
	 * We therefore need to write new values to the segment
	 * registers on every context switch unless both the new and old
	 * values are zero.
	 *
	 * Note that we don't need to do anything for CS and SS, as
	 * those are saved and restored as part of pt_regs.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	x86_fsgsbase_load(prev, next);

	x86_pkru_load(prev, next);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	raw_cpu_write(pcpu_hot.current_task, next_p);
	raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p));

	switch_fpu_finish(next_p);

	/* Reload sp0. */
	update_task_stack(next_p);

	switch_to_extra(prev_p, next_p);

	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
		/*
		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
		 * does not update the cached descriptor. As a result, if we
		 * do SYSRET while SS is NULL, we'll end up in user mode with
		 * SS apparently equal to __USER_DS but actually unusable.
		 *
		 * The straightforward workaround would be to fix it up just
		 * before SYSRET, but that would slow down the system call
		 * fast paths. Instead, we ensure that SS is never NULL in
		 * system call context. We do this by replacing NULL SS
		 * selectors at every context switch. SYSCALL sets up a valid
		 * SS, so the only way to get NULL is to re-enter the kernel
		 * from CPL 3 through an interrupt. Since that can't happen
		 * in the same task as a running syscall, we are guaranteed to
		 * context switch between every interrupt vector entry and a
		 * subsequent SYSRET.
		 *
		 * We read SS first because SS reads are much faster than
		 * writes. Out of caution, we force SS to __KERNEL_DS even if
		 * it previously had a different non-NULL value.
		 */
		unsigned short ss_sel;
		savesegment(ss, ss_sel);
		if (ss_sel != __KERNEL_DS)
			loadsegment(ss, __KERNEL_DS);
	}

	/* Load the Intel cache allocation PQR MSR. */
	resctrl_sched_in(next_p);

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_ADDR32);
	/* Pretend that this comes from a 64bit execve */
	task_pt_regs(current)->orig_ax = __NR_execve;
	current_thread_info()->status &= ~TS_COMPAT;
	if (current->mm)
		__set_bit(MM_CONTEXT_HAS_VSYSCALL, &current->mm->context.flags);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32_ABI
	if (current->mm)
		current->mm->context.flags = 0;

	current->personality &= ~READ_IMPLIES_EXEC;
	/*
	 * in_32bit_syscall() uses the presence of the x32 syscall bit
	 * flag to determine compat status. The x86 mmap() code relies on
	 * the syscall bitness so set the x32 syscall bit right here to make
	 * in_32bit_syscall() work during exec().
	 *
	 * Pretend to come from an x32 execve.
	 */
	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
	current_thread_info()->status &= ~TS_COMPAT;
#endif
}

static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
	if (current->mm) {
		/*
		 * uprobes applied to this MM need to know this and
		 * cannot use user_64bit_mode() at that time.
		 */
		__set_bit(MM_CONTEXT_UPROBE_IA32, &current->mm->context.flags);
	}

	current->personality |= force_personality32;
	/* Prepare the first "return" to user space */
	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
	current_thread_info()->status |= TS_COMPAT;
#endif
}

void set_personality_ia32(bool x32)
{
	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	if (x32)
		__set_personality_x32();
	else
		__set_personality_ia32();
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

#ifdef CONFIG_CHECKPOINT_RESTORE
static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
{
	int ret;

	ret = map_vdso_once(image, addr);
	if (ret)
		return ret;

	return (long)image->size;
}
#endif

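/*
 * Example (illustrative sketch, not kernel code): a checkpoint/restore tool
 * can ask for the vDSO to be mapped at a chosen address and learn its size
 * from the return value, assuming a kernel built with
 * CONFIG_CHECKPOINT_RESTORE. Here "addr" is a hypothetical, caller-chosen,
 * page-aligned user address:
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <asm/prctl.h>
 *
 *	long size = syscall(SYS_arch_prctl, ARCH_MAP_VDSO_64, addr);
 *	if (size < 0)
 *		perror("ARCH_MAP_VDSO_64");
 */
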
#ifdef CONFIG_ADDRESS_MASKING

#define LAM_U57_BITS 6

static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
{
	if (!cpu_feature_enabled(X86_FEATURE_LAM))
		return -ENODEV;

	/* PTRACE_ARCH_PRCTL */
	if (current->mm != mm)
		return -EINVAL;

	if (mm_valid_pasid(mm) &&
	    !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags))
		return -EINVAL;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
		mmap_write_unlock(mm);
		return -EBUSY;
	}

	if (!nr_bits) {
		mmap_write_unlock(mm);
		return -EINVAL;
	} else if (nr_bits <= LAM_U57_BITS) {
		mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
		mm->context.untag_mask = ~GENMASK(62, 57);
	} else {
		mmap_write_unlock(mm);
		return -EINVAL;
	}

	write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
	set_tlbstate_lam_mode(mm);
	set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);

	mmap_write_unlock(mm);

	return 0;
}
#endif

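/*
 * Example (illustrative sketch, not kernel code): a process opts in to
 * LAM_U57 through arch_prctl() and may then carry metadata in bits 62:57
 * of its user pointers, assuming a LAM-capable CPU and a kernel built with
 * CONFIG_ADDRESS_MASKING:
 *
 *	#include <stdint.h>
 *	#include <stdlib.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <asm/prctl.h>
 *
 *	void *p = malloc(64);
 *	if (syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, 6) == 0) {
 *		// Put a 6-bit tag in bits 62:57; the CPU ignores those
 *		// bits on access, so the store below lands in *p.
 *		void *tagged = (void *)((uintptr_t)p | (0x2aUL << 57));
 *		*(char *)tagged = 1;
 *	}
 */
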
long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
	int ret = 0;

	switch (option) {
	case ARCH_SET_GS: {
		if (unlikely(arg2 >= TASK_SIZE_MAX))
			return -EPERM;

		preempt_disable();
		/*
		 * ARCH_SET_GS has always overwritten the index
		 * and the base. Zero is the most sensible value
		 * to put in the index, and is the only value that
		 * makes any sense if FSGSBASE is unavailable.
		 */
		if (task == current) {
			loadseg(GS, 0);
			x86_gsbase_write_cpu_inactive(arg2);

			/*
			 * On non-FSGSBASE systems, save_base_legacy() expects
			 * that we also fill in thread.gsbase.
			 */
			task->thread.gsbase = arg2;

		} else {
			task->thread.gsindex = 0;
			x86_gsbase_write_task(task, arg2);
		}
		preempt_enable();
		break;
	}
	case ARCH_SET_FS: {
		/*
		 * Not strictly needed for %fs, but do it for symmetry
		 * with %gs
		 */
		if (unlikely(arg2 >= TASK_SIZE_MAX))
			return -EPERM;

		preempt_disable();
		/*
		 * Set the selector to 0 for the same reason
		 * as %gs above.
		 */
		if (task == current) {
			loadseg(FS, 0);
			x86_fsbase_write_cpu(arg2);

			/*
			 * On non-FSGSBASE systems, save_base_legacy() expects
			 * that we also fill in thread.fsbase.
			 */
			task->thread.fsbase = arg2;
		} else {
			task->thread.fsindex = 0;
			x86_fsbase_write_task(task, arg2);
		}
		preempt_enable();
		break;
	}
	case ARCH_GET_FS: {
		unsigned long base = x86_fsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base = x86_gsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}

#ifdef CONFIG_CHECKPOINT_RESTORE
# ifdef CONFIG_X86_X32_ABI
	case ARCH_MAP_VDSO_X32:
		return prctl_map_vdso(&vdso_image_x32, arg2);
# endif
# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
	case ARCH_MAP_VDSO_32:
		return prctl_map_vdso(&vdso_image_32, arg2);
# endif
	case ARCH_MAP_VDSO_64:
		return prctl_map_vdso(&vdso_image_64, arg2);
#endif
#ifdef CONFIG_ADDRESS_MASKING
	case ARCH_GET_UNTAG_MASK:
		return put_user(task->mm->context.untag_mask,
				(unsigned long __user *)arg2);
	case ARCH_ENABLE_TAGGED_ADDR:
		return prctl_enable_tagged_addr(task->mm, arg2);
	case ARCH_FORCE_TAGGED_SVA:
		if (current != task)
			return -EINVAL;
		set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags);
		return 0;
	case ARCH_GET_MAX_TAG_BITS:
		if (!cpu_feature_enabled(X86_FEATURE_LAM))
			return put_user(0, (unsigned long __user *)arg2);
		else
			return put_user(LAM_U57_BITS, (unsigned long __user *)arg2);
#endif
	case ARCH_SHSTK_ENABLE:
	case ARCH_SHSTK_DISABLE:
	case ARCH_SHSTK_LOCK:
	case ARCH_SHSTK_UNLOCK:
	case ARCH_SHSTK_STATUS:
		return shstk_prctl(task, option, arg2);
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

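/*
 * Example (illustrative sketch, not kernel code): the ARCH_SET_GS and
 * ARCH_GET_GS cases above are what a 64-bit process hits when it manages a
 * segment base by hand through the syscall interface. 0x100000 below is an
 * arbitrary low user address used purely for illustration:
 *
 *	#include <stdio.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <asm/prctl.h>
 *
 *	unsigned long base;
 *
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, 0x100000UL);	// set GS base
 *	syscall(SYS_arch_prctl, ARCH_GET_GS, &base);		// read it back
 *	printf("GS base: %#lx\n", base);
 */
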
SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	long ret;

	ret = do_arch_prctl_64(current, option, arg2);
	if (ret == -EINVAL)
		ret = do_arch_prctl_common(option, arg2);

	return ret;
}

#ifdef CONFIG_IA32_EMULATION
COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	return do_arch_prctl_common(option, arg2);
}
#endif

unsigned long KSTK_ESP(struct task_struct *task)
{
	return task_pt_regs(task)->sp;
}