Blame - arch/x86/mm/tlb.c - linux

blob: cfe6b1e85fa610201b5c8ff9c931bdc461798347 [file] [log] [blame]

Thomas Gleixner	457c899	2019-05-19 13:08:55 +0100	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	2	#include <linux/init.h>
				3
				4	#include <linux/mm.h>
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	5	#include <linux/spinlock.h>
				6	#include <linux/smp.h>
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	7	#include <linux/interrupt.h>
Paul Gortmaker	4b599fed	2016-07-13 20:18:55 -0400	[diff] [blame]	8	#include <linux/export.h>
Shaohua Li	9329672	2010-10-20 11:07:03 +0800	[diff] [blame]	9	#include <linux/cpu.h>
Tim Chen	18bf3c3	2018-01-29 22:04:47 +0000	[diff] [blame]	10	#include <linux/debugfs.h>
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	11
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	12	#include <asm/tlbflush.h>
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	13	#include <asm/mmu_context.h>
Tim Chen	18bf3c3	2018-01-29 22:04:47 +0000	[diff] [blame]	14	#include <asm/nospec-branch.h>
Jan Beulich	350f8f5	2009-11-13 11:54:40 +0000	[diff] [blame]	15	#include <asm/cache.h>
Tejun Heo	6dd01be	2009-01-21 17:26:06 +0900	[diff] [blame]	16	#include <asm/apic.h>
Kan Liang	5471eea5	2021-06-14 10:59:42 -0700	[diff] [blame]	17	#include <asm/perf_event.h>
Glauber Costa	5af5573	2008-03-25 13:28:56 -0300	[diff] [blame]	18
Peter Zijlstra	935f583	2018-12-03 18:03:49 +0100	[diff] [blame]	19	#include "mm_internal.h"
				20
/*
 * Without CONFIG_PARAVIRT the __flush_tlb_*() names used in this file are
 * plain aliases for the native_*() implementations and STATIC_NOPV expands
 * to "static". With CONFIG_PARAVIRT, STATIC_NOPV expands to nothing and the
 * __flush_tlb_*() names are not defined here.
 */
#ifndef CONFIG_PARAVIRT
# define STATIC_NOPV			static
# define __flush_tlb_local		native_flush_tlb_local
# define __flush_tlb_global		native_flush_tlb_global
# define __flush_tlb_one_user(addr)	native_flush_tlb_one_user(addr)
# define __flush_tlb_multi(msk, info)	native_flush_tlb_multi(msk, info)
#else
# define STATIC_NOPV
#endif
				30
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	31	/*
Andy Lutomirski	ce4a4e56	2017-05-28 10:00:14 -0700	[diff] [blame]	32	* TLB flushing, formerly SMP-only
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	33	* c/o Linus Torvalds.
				34	*
				35	* These mean you can really definitely utterly forget about
				36	* writing to user space from interrupts. (It's not allowed anyway).
				37	*
				38	* Optimizations Manfred Spraul <manfred@colorfullife.com>
				39	*
				40	* More scalable flush, from Andi Kleen
				41	*
Alex Shi	52aec33	2012-06-28 09:02:23 +0800	[diff] [blame]	42	* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	43	*/
				44
/*
 * Bit 0 of the value stored in cpu_tlbstate.last_user_mm_ibpb carries the
 * TIF_SPEC_IB state of the task; the remaining bits hold the mm pointer
 * (see mm_mangle_tif_spec_ib()).
 */
#define LAST_USER_MM_IBPB	0x1UL
				50
				51	/*
Thomas Gleixner	6c9b7d7	2020-04-21 11:20:41 +0200	[diff] [blame]	52	* The x86 feature is called PCID (Process Context IDentifier). It is similar
				53	* to what is traditionally called ASID on the RISC processors.
				54	*
				55	* We don't use the traditional ASID implementation, where each process/mm gets
				56	* its own ASID and flush/restart when we run out of ASID space.
				57	*
				58	* Instead we have a small per-cpu array of ASIDs and cache the last few mm's
				59	* that came by on this CPU, allowing cheaper switch_mm between processes on
				60	* this CPU.
				61	*
				62	* We end up with different spaces for different things. To avoid confusion we
				63	* use different names for each of them:
				64	*
				65	* ASID - [0, TLB_NR_DYN_ASIDS-1]
				66	* the canonical identifier for an mm
				67	*
				68	* kPCID - [1, TLB_NR_DYN_ASIDS]
				69	* the value we write into the PCID part of CR3; corresponds to the
				70	* ASID+1, because PCID 0 is special.
				71	*
				72	* uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
				73	* for KPTI each mm has two address spaces and thus needs two
				74	* PCID values, but we can still do with a single ASID denomination
				75	* for each mm. Corresponds to kPCID + 2048.
				76	*
				77	*/
				78
				79	/* There are 12 bits of space for ASIDS in CR3 */
				80	#define CR3_HW_ASID_BITS 12
				81
				82	/*
				83	* When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
				84	* user/kernel switches
				85	*/
				86	#ifdef CONFIG_PAGE_TABLE_ISOLATION
				87	# define PTI_CONSUMED_PCID_BITS 1
				88	#else
				89	# define PTI_CONSUMED_PCID_BITS 0
				90	#endif
				91
				92	#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
				93
				94	/*
				95	* ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
				96	* for them being zero-based. Another -1 is because PCID 0 is reserved for
				97	* use by non-PCID-aware users.
				98	*/
				99	#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
				100
				101	/*
				102	* Given @asid, compute kPCID
				103	*/
				104	static inline u16 kern_pcid(u16 asid)
				105	{
				106	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
				107
				108	#ifdef CONFIG_PAGE_TABLE_ISOLATION
				109	/*
Ingo Molnar	d9f6e12	2021-03-18 15:28:01 +0100	[diff] [blame]	110	* Make sure that the dynamic ASID space does not conflict with the
Thomas Gleixner	6c9b7d7	2020-04-21 11:20:41 +0200	[diff] [blame]	111	* bit we are using to switch between user and kernel ASIDs.
				112	*/
				113	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
				114
				115	/*
				116	* The ASID being passed in here should have respected the
				117	* MAX_ASID_AVAILABLE and thus never have the switch bit set.
				118	*/
				119	VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
				120	#endif
				121	/*
				122	* The dynamically-assigned ASIDs that get passed in are small
				123	* (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
				124	* so do not bother to clear it.
				125	*
				126	* If PCID is on, ASID-aware code paths put the ASID+1 into the
				127	* PCID bits. This serves two purposes. It prevents a nasty
				128	* situation in which PCID-unaware code saves CR3, loads some other
				129	* value (with PCID == 0), and then restores CR3, thus corrupting
				130	* the TLB for ASID 0 if the saved ASID was nonzero. It also means
				131	* that any bugs involving loading a PCID-enabled CR3 with
				132	* CR4.PCIDE off will trigger deterministically.
				133	*/
				134	return asid + 1;
				135	}
				136
				137	/*
				138	* Given @asid, compute uPCID
				139	*/
				140	static inline u16 user_pcid(u16 asid)
				141	{
				142	u16 ret = kern_pcid(asid);
				143	#ifdef CONFIG_PAGE_TABLE_ISOLATION
				144	ret \|= 1 << X86_CR3_PTI_PCID_USER_BIT;
				145	#endif
				146	return ret;
				147	}
				148
				149	static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
				150	{
				151	if (static_cpu_has(X86_FEATURE_PCID)) {
				152	return __sme_pa(pgd) \| kern_pcid(asid);
				153	} else {
				154	VM_WARN_ON_ONCE(asid != 0);
				155	return __sme_pa(pgd);
				156	}
				157	}
				158
				159	static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
				160	{
				161	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
				162	/*
				163	* Use boot_cpu_has() instead of this_cpu_has() as this function
				164	* might be called during early boot. This should work even after
				165	* boot because all CPU's the have same capabilities:
				166	*/
				167	VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
				168	return __sme_pa(pgd) \| kern_pcid(asid) \| CR3_NOFLUSH;
				169	}
				170
				171	/*
Dave Hansen	2ea907c	2017-12-04 15:07:57 +0100	[diff] [blame]	172	* We get here when we do something requiring a TLB invalidation
				173	* but could not go invalidate all of the contexts. We do the
				174	* necessary invalidation by clearing out the 'ctx_id' which
				175	* forces a TLB flush when the context is loaded.
				176	*/
zhong jiang	387048f	2018-07-21 15:55:32 +0800	[diff] [blame]	177	static void clear_asid_other(void)
Dave Hansen	2ea907c	2017-12-04 15:07:57 +0100	[diff] [blame]	178	{
				179	u16 asid;
				180
				181	/*
				182	* This is only expected to be set if we have disabled
				183	* kernel _PAGE_GLOBAL pages.
				184	*/
				185	if (!static_cpu_has(X86_FEATURE_PTI)) {
				186	WARN_ON_ONCE(1);
				187	return;
				188	}
				189
				190	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
				191	/* Do not need to flush the current asid */
				192	if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
				193	continue;
				194	/*
				195	* Make sure the next time we go to switch to
				196	* this asid, we do a flush:
				197	*/
				198	this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
				199	}
				200	this_cpu_write(cpu_tlbstate.invalidate_other, false);
				201	}
				202
Andy Lutomirski	f39681e	2017-06-29 08:53:15 -0700	[diff] [blame]	203	atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
				204
Andy Lutomirski	b956575	2017-10-09 09:50:49 -0700	[diff] [blame]	205
Andy Lutomirski	10af623	2017-07-24 21:41:38 -0700	[diff] [blame]	206	static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
				207	u16 new_asid, bool need_flush)
				208	{
				209	u16 asid;
				210
				211	if (!static_cpu_has(X86_FEATURE_PCID)) {
				212	*new_asid = 0;
				213	*need_flush = true;
				214	return;
				215	}
				216
Dave Hansen	2ea907c	2017-12-04 15:07:57 +0100	[diff] [blame]	217	if (this_cpu_read(cpu_tlbstate.invalidate_other))
				218	clear_asid_other();
				219
Andy Lutomirski	10af623	2017-07-24 21:41:38 -0700	[diff] [blame]	220	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
				221	if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
				222	next->context.ctx_id)
				223	continue;
				224
				225	*new_asid = asid;
				226	*need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
				227	next_tlb_gen);
				228	return;
				229	}
				230
				231	/*
				232	* We don't currently own an ASID slot on this CPU.
				233	* Allocate a slot.
				234	*/
				235	*new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
				236	if (*new_asid >= TLB_NR_DYN_ASIDS) {
				237	*new_asid = 0;
				238	this_cpu_write(cpu_tlbstate.next_asid, 1);
				239	}
				240	*need_flush = true;
				241	}
				242
Thomas Gleixner	127ac91	2020-04-21 11:20:34 +0200	[diff] [blame]	243	/*
				244	* Given an ASID, flush the corresponding user ASID. We can delay this
				245	* until the next time we switch to it.
				246	*
				247	* See SWITCH_TO_USER_CR3.
				248	*/
				249	static inline void invalidate_user_asid(u16 asid)
				250	{
				251	/* There is no user ASID if address space separation is off */
				252	if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
				253	return;
				254
				255	/*
				256	* We only have a single ASID if PCID is off and the CR3
				257	* write will have flushed it.
				258	*/
				259	if (!cpu_feature_enabled(X86_FEATURE_PCID))
				260	return;
				261
				262	if (!static_cpu_has(X86_FEATURE_PTI))
				263	return;
				264
				265	__set_bit(kern_pcid(asid),
				266	(unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
				267	}
				268
Dave Hansen	48e1119	2017-12-04 15:07:58 +0100	[diff] [blame]	269	static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
				270	{
				271	unsigned long new_mm_cr3;
				272
				273	if (need_flush) {
Peter Zijlstra	6fd166a	2017-12-04 15:07:59 +0100	[diff] [blame]	274	invalidate_user_asid(new_asid);
Dave Hansen	48e1119	2017-12-04 15:07:58 +0100	[diff] [blame]	275	new_mm_cr3 = build_cr3(pgdir, new_asid);
				276	} else {
				277	new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
				278	}
				279
				280	/*
				281	* Caution: many callers of this function expect
				282	* that load_cr3() is serializing and orders TLB
				283	* fills with respect to the mm_cpumask writes.
				284	*/
				285	write_cr3(new_mm_cr3);
				286	}
				287
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	288	void leave_mm(int cpu)
				289	{
Andy Lutomirski	3d28ebc	2017-05-28 10:00:15 -0700	[diff] [blame]	290	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
				291
				292	/*
				293	* It's plausible that we're in lazy TLB mode while our mm is init_mm.
				294	* If so, our callers still expect us to flush the TLB, but there
				295	* aren't any user TLB entries in init_mm to worry about.
				296	*
				297	* This needs to happen before any other sanity checks due to
				298	* intel_idle's shenanigans.
				299	*/
				300	if (loaded_mm == &init_mm)
				301	return;
				302
Andy Lutomirski	94b1b03	2017-06-29 08:53:17 -0700	[diff] [blame]	303	/* Warn if we're not lazy. */
Nadav Amit	2f4305b	2021-02-20 15:17:08 -0800	[diff] [blame]	304	WARN_ON(!this_cpu_read(cpu_tlbstate_shared.is_lazy));
Andy Lutomirski	3d28ebc	2017-05-28 10:00:15 -0700	[diff] [blame]	305
				306	switch_mm(NULL, &init_mm, NULL);
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	307	}
Andy Lutomirski	6753573	2017-11-04 04:16:12 -0700	[diff] [blame]	308	EXPORT_SYMBOL_GPL(leave_mm);
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	309
/*
 * IRQ-safe wrapper around switch_mm_irqs_off(): saves and disables local
 * interrupts for the duration of the switch and restores them afterwards.
 *
 * @prev: previous mm (leave_mm() passes NULL)
 * @next: mm to switch to
 * @tsk:  incoming task (leave_mm() passes NULL)
 */
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
	       struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);
	switch_mm_irqs_off(prev, next, tsk);
	local_irq_restore(flags);
}
				319
Nadav Amit	1608e4c	2021-02-20 15:17:11 -0800	[diff] [blame]	320	static unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
Jiri Kosina	dbfe295	2018-09-25 14:38:18 +0200	[diff] [blame]	321	{
Thomas Gleixner	4c71a2b6	2018-11-25 19:33:49 +0100	[diff] [blame]	322	unsigned long next_tif = task_thread_info(next)->flags;
				323	unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
				324
				325	return (unsigned long)next->mm \| ibpb;
				326	}
				327
				328	static void cond_ibpb(struct task_struct *next)
				329	{
				330	if (!next \|\| !next->mm)
				331	return;
				332
Jiri Kosina	dbfe295	2018-09-25 14:38:18 +0200	[diff] [blame]	333	/*
Thomas Gleixner	4c71a2b6	2018-11-25 19:33:49 +0100	[diff] [blame]	334	* Both, the conditional and the always IBPB mode use the mm
				335	* pointer to avoid the IBPB when switching between tasks of the
				336	* same process. Using the mm pointer instead of mm->context.ctx_id
				337	* opens a hypothetical hole vs. mm_struct reuse, which is more or
				338	* less impossible to control by an attacker. Aside of that it
				339	* would only affect the first schedule so the theoretically
				340	* exposed data is not really interesting.
Jiri Kosina	dbfe295	2018-09-25 14:38:18 +0200	[diff] [blame]	341	*/
Thomas Gleixner	4c71a2b6	2018-11-25 19:33:49 +0100	[diff] [blame]	342	if (static_branch_likely(&switch_mm_cond_ibpb)) {
				343	unsigned long prev_mm, next_mm;
				344
				345	/*
				346	* This is a bit more complex than the always mode because
				347	* it has to handle two cases:
				348	*
				349	* 1) Switch from a user space task (potential attacker)
				350	* which has TIF_SPEC_IB set to a user space task
				351	* (potential victim) which has TIF_SPEC_IB not set.
				352	*
				353	* 2) Switch from a user space task (potential attacker)
				354	* which has TIF_SPEC_IB not set to a user space task
				355	* (potential victim) which has TIF_SPEC_IB set.
				356	*
				357	* This could be done by unconditionally issuing IBPB when
				358	* a task which has TIF_SPEC_IB set is either scheduled in
				359	* or out. Though that results in two flushes when:
				360	*
				361	* - the same user space task is scheduled out and later
				362	* scheduled in again and only a kernel thread ran in
				363	* between.
				364	*
				365	* - a user space task belonging to the same process is
				366	* scheduled in after a kernel thread ran in between
				367	*
				368	* - a user space task belonging to the same process is
				369	* scheduled in immediately.
				370	*
				371	* Optimize this with reasonably small overhead for the
				372	* above cases. Mangle the TIF_SPEC_IB bit into the mm
				373	* pointer of the incoming task which is stored in
				374	* cpu_tlbstate.last_user_mm_ibpb for comparison.
				375	*/
				376	next_mm = mm_mangle_tif_spec_ib(next);
				377	prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
				378
				379	/*
				380	* Issue IBPB only if the mm's are different and one or
				381	* both have the IBPB bit set.
				382	*/
				383	if (next_mm != prev_mm &&
				384	(next_mm \| prev_mm) & LAST_USER_MM_IBPB)
				385	indirect_branch_prediction_barrier();
				386
				387	this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
				388	}
				389
				390	if (static_branch_unlikely(&switch_mm_always_ibpb)) {
				391	/*
				392	* Only flush when switching to a user space task with a
				393	* different context than the user space task which ran
				394	* last on this CPU.
				395	*/
				396	if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
				397	indirect_branch_prediction_barrier();
				398	this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
				399	}
				400	}
Jiri Kosina	dbfe295	2018-09-25 14:38:18 +0200	[diff] [blame]	401	}
				402
#ifdef CONFIG_PERF_EVENTS
/*
 * Set or clear X86_CR4_PCE for @mm.
 *
 * The bit is set when rdpmc_always_available_key is enabled, or when
 * rdpmc_never_available_key is not enabled and @mm has a non-zero
 * context.perf_rdpmc_allowed count. Otherwise it is cleared.
 */
static inline void cr4_update_pce_mm(struct mm_struct *mm)
{
	if (static_branch_unlikely(&rdpmc_always_available_key) ||
	    (!static_branch_unlikely(&rdpmc_never_available_key) &&
	     atomic_read(&mm->context.perf_rdpmc_allowed))) {
		/*
		 * Clear the existing dirty counters to
		 * prevent the leak for an RDPMC task.
		 */
		perf_clear_dirty_counters();
		cr4_set_bits_irqsoff(X86_CR4_PCE);
	} else {
		cr4_clear_bits_irqsoff(X86_CR4_PCE);
	}
}

/*
 * Re-evaluate X86_CR4_PCE for the mm currently loaded on this CPU.
 * @ignored is unused.
 */
void cr4_update_pce(void *ignored)
{
	cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm));
}

#else
/* Without CONFIG_PERF_EVENTS there is nothing to update. */
static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
#endif
				427
Andy Lutomirski	078194f	2016-04-26 09:39:09 -0700	[diff] [blame]	428	void switch_mm_irqs_off(struct mm_struct prev, struct mm_struct next,
				429	struct task_struct *tsk)
				430	{
Andy Lutomirski	3d28ebc	2017-05-28 10:00:15 -0700	[diff] [blame]	431	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
Andy Lutomirski	10af623	2017-07-24 21:41:38 -0700	[diff] [blame]	432	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
Nadav Amit	2f4305b	2021-02-20 15:17:08 -0800	[diff] [blame]	433	bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
Andy Lutomirski	94b1b03	2017-06-29 08:53:17 -0700	[diff] [blame]	434	unsigned cpu = smp_processor_id();
				435	u64 next_tlb_gen;
Rik van Riel	12c4d97	2018-09-25 23:58:39 -0400	[diff] [blame]	436	bool need_flush;
				437	u16 new_asid;
Andy Lutomirski	69c0319	2016-04-26 09:39:08 -0700	[diff] [blame]	438
Andy Lutomirski	3d28ebc	2017-05-28 10:00:15 -0700	[diff] [blame]	439	/*
Andy Lutomirski	94b1b03	2017-06-29 08:53:17 -0700	[diff] [blame]	440	* NB: The scheduler will call us with prev == next when switching
				441	* from lazy TLB mode to normal mode if active_mm isn't changing.
				442	* When this happens, we don't assume that CR3 (and hence
				443	* cpu_tlbstate.loaded_mm) matches next.
Andy Lutomirski	3d28ebc	2017-05-28 10:00:15 -0700	[diff] [blame]	444	*
				445	* NB: leave_mm() calls us with prev == NULL and tsk == NULL.
				446	*/
Andy Lutomirski	e37e43a	2016-08-11 02:35:23 -0700	[diff] [blame]	447
Nadav Amit	4c1ba39	2021-02-20 15:17:05 -0800	[diff] [blame]	448	/* We don't want flush_tlb_func() to run concurrently with us. */
Andy Lutomirski	94b1b03	2017-06-29 08:53:17 -0700	[diff] [blame]	449	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
				450	WARN_ON_ONCE(!irqs_disabled());
				451
				452	/*
				453	* Verify that CR3 is what we think it is. This will catch
				454	* hypothetical buggy code that directly switches to swapper_pg_dir
Andy Lutomirski	10af623	2017-07-24 21:41:38 -0700	[diff] [blame]	455	* without going through leave_mm() / switch_mm_irqs_off() or that
				456	* does something like write_cr3(read_cr3_pa()).
Andy Lutomirski	a376e7f	2017-09-07 22:06:57 -0700	[diff] [blame]	457	*
				458	* Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
				459	* isn't free.
Andy Lutomirski	94b1b03	2017-06-29 08:53:17 -0700	[diff] [blame]	460	*/
Andy Lutomirski	a376e7f	2017-09-07 22:06:57 -0700	[diff] [blame]	461	#ifdef CONFIG_DEBUG_VM
Dave Hansen	50fb83a6	2017-12-04 15:07:54 +0100	[diff] [blame]	462	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
Andy Lutomirski	a376e7f	2017-09-07 22:06:57 -0700	[diff] [blame]	463	/*
				464	* If we were to BUG here, we'd be very likely to kill
				465	* the system so hard that we don't see the call trace.
				466	* Try to recover instead by ignoring the error and doing
				467	* a global flush to minimize the chance of corruption.
				468	*
				469	* (This is far from being a fully correct recovery.
				470	* Architecturally, the CPU could prefetch something
				471	* back into an incorrect ASID slot and leave it there
				472	* to cause trouble down the road. It's better than
				473	* nothing, though.)
				474	*/
				475	__flush_tlb_all();
				476	}
				477	#endif
Nadav Amit	09c5272	2021-02-20 15:17:09 -0800	[diff] [blame]	478	if (was_lazy)
				479	this_cpu_write(cpu_tlbstate_shared.is_lazy, false);
Andy Lutomirski	e37e43a	2016-08-11 02:35:23 -0700	[diff] [blame]	480
Mathieu Desnoyers	306e060	2018-01-29 15:20:12 -0500	[diff] [blame]	481	/*
Mathieu Desnoyers	10bcc80	2018-01-29 15:20:18 -0500	[diff] [blame]	482	* The membarrier system call requires a full memory barrier and
				483	* core serialization before returning to user-space, after
Andy Lutomirski	a493d1c	2020-12-03 21:07:03 -0800	[diff] [blame]	484	* storing to rq->curr, when changing mm. This is because
				485	* membarrier() sends IPIs to all CPUs that are in the target mm
				486	* to make them issue memory barriers. However, if another CPU
				487	* switches to/from the target mm concurrently with
				488	* membarrier(), it can cause that CPU not to receive an IPI
				489	* when it really should issue a memory barrier. Writing to CR3
				490	* provides that full memory barrier and core serializing
				491	* instruction.
Mathieu Desnoyers	306e060	2018-01-29 15:20:12 -0500	[diff] [blame]	492	*/
Andy Lutomirski	3d28ebc	2017-05-28 10:00:15 -0700	[diff] [blame]	493	if (real_prev == next) {
Andy Lutomirski	e8b9b0c	2017-10-14 09:59:49 -0700	[diff] [blame]	494	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
				495	next->context.ctx_id);
Andy Lutomirski	94b1b03	2017-06-29 08:53:17 -0700	[diff] [blame]	496
Andy Lutomirski	69c0319	2016-04-26 09:39:08 -0700	[diff] [blame]	497	/*
Rik van Riel	145f573	2018-09-25 23:58:44 -0400	[diff] [blame]	498	* Even in lazy TLB mode, the CPU should stay set in the
				499	* mm_cpumask. The TLB shootdown code can figure out from
Nadav Amit	2f4305b	2021-02-20 15:17:08 -0800	[diff] [blame]	500	* cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
Andy Lutomirski	69c0319	2016-04-26 09:39:08 -0700	[diff] [blame]	501	*/
Andy Lutomirski	b956575	2017-10-09 09:50:49 -0700	[diff] [blame]	502	if (WARN_ON_ONCE(real_prev != &init_mm &&
				503	!cpumask_test_cpu(cpu, mm_cpumask(next))))
				504	cpumask_set_cpu(cpu, mm_cpumask(next));
				505
Rik van Riel	145f573	2018-09-25 23:58:44 -0400	[diff] [blame]	506	/*
				507	* If the CPU is not in lazy TLB mode, we are just switching
				508	* from one thread in a process to another thread in the same
				509	* process. No TLB flush required.
				510	*/
				511	if (!was_lazy)
				512	return;
				513
				514	/*
				515	* Read the tlb_gen to check whether a flush is needed.
				516	* If the TLB is up to date, just use it.
				517	* The barrier synchronizes with the tlb_gen increment in
				518	* the TLB shootdown code.
				519	*/
				520	smp_mb();
				521	next_tlb_gen = atomic64_read(&next->context.tlb_gen);
				522	if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
				523	next_tlb_gen)
				524	return;
				525
				526	/*
				527	* TLB contents went out of date while we were in lazy
				528	* mode. Fall through to the TLB switching code below.
				529	*/
				530	new_asid = prev_asid;
				531	need_flush = true;
Andy Lutomirski	94b1b03	2017-06-29 08:53:17 -0700	[diff] [blame]	532	} else {
Tim Chen	18bf3c3	2018-01-29 22:04:47 +0000	[diff] [blame]	533	/*
				534	* Avoid user/user BTB poisoning by flushing the branch
				535	* predictor when switching between processes. This stops
				536	* one process from doing Spectre-v2 attacks on another.
Tim Chen	18bf3c3	2018-01-29 22:04:47 +0000	[diff] [blame]	537	*/
Thomas Gleixner	4c71a2b6	2018-11-25 19:33:49 +0100	[diff] [blame]	538	cond_ibpb(tsk);
Andy Lutomirski	94b1b03	2017-06-29 08:53:17 -0700	[diff] [blame]	539
Rik van Riel	e9d8c61	2018-07-16 15:03:37 -0400	[diff] [blame]	540	/*
				541	* Stop remote flushes for the previous mm.
				542	* Skip kernel threads; we never send init_mm TLB flushing IPIs,
				543	* but the bitmap manipulation can cause cache line contention.
				544	*/
				545	if (real_prev != &init_mm) {
				546	VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
				547	mm_cpumask(real_prev)));
				548	cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
				549	}
Andy Lutomirski	94b1b03	2017-06-29 08:53:17 -0700	[diff] [blame]	550
				551	/*
				552	* Start remote flushes and then read tlb_gen.
				553	*/
Rik van Riel	e9d8c61	2018-07-16 15:03:37 -0400	[diff] [blame]	554	if (next != &init_mm)
				555	cpumask_set_cpu(cpu, mm_cpumask(next));
Andy Lutomirski	94b1b03	2017-06-29 08:53:17 -0700	[diff] [blame]	556	next_tlb_gen = atomic64_read(&next->context.tlb_gen);
				557
Andy Lutomirski	10af623	2017-07-24 21:41:38 -0700	[diff] [blame]	558	choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
Andy Lutomirski	94b1b03	2017-06-29 08:53:17 -0700	[diff] [blame]	559
Andy Lutomirski	4012e77	2018-08-29 08:47:18 -0700	[diff] [blame]	560	/* Let nmi_uaccess_okay() know that we're changing CR3. */
				561	this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
				562	barrier();
Rik van Riel	12c4d97	2018-09-25 23:58:39 -0400	[diff] [blame]	563	}
Andy Lutomirski	4012e77	2018-08-29 08:47:18 -0700	[diff] [blame]	564
Rik van Riel	12c4d97	2018-09-25 23:58:39 -0400	[diff] [blame]	565	if (need_flush) {
				566	this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
				567	this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
				568	load_new_mm_cr3(next->pgd, new_asid, true);
Andy Lutomirski	10af623	2017-07-24 21:41:38 -0700	[diff] [blame]	569
Peter Zijlstra	bf9282d	2020-08-12 12:22:17 +0200	[diff] [blame]	570	trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
Rik van Riel	12c4d97	2018-09-25 23:58:39 -0400	[diff] [blame]	571	} else {
				572	/* The new ASID is already up to date. */
				573	load_new_mm_cr3(next->pgd, new_asid, false);
Tim Chen	18bf3c3	2018-01-29 22:04:47 +0000	[diff] [blame]	574
Peter Zijlstra	bf9282d	2020-08-12 12:22:17 +0200	[diff] [blame]	575	trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
Andy Lutomirski	3d28ebc	2017-05-28 10:00:15 -0700	[diff] [blame]	576	}
Andy Lutomirski	69c0319	2016-04-26 09:39:08 -0700	[diff] [blame]	577
Rik van Riel	12c4d97	2018-09-25 23:58:39 -0400	[diff] [blame]	578	/* Make sure we write CR3 before loaded_mm. */
				579	barrier();
				580
				581	this_cpu_write(cpu_tlbstate.loaded_mm, next);
				582	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
				583
Rik van Riel	145f573	2018-09-25 23:58:44 -0400	[diff] [blame]	584	if (next != real_prev) {
Thomas Gleixner	cb2a023	2020-04-21 11:20:30 +0200	[diff] [blame]	585	cr4_update_pce_mm(next);
Rik van Riel	145f573	2018-09-25 23:58:44 -0400	[diff] [blame]	586	switch_ldt(real_prev, next);
				587	}
Andy Lutomirski	69c0319	2016-04-26 09:39:08 -0700	[diff] [blame]	588	}
				589
Andy Lutomirski	b0579ad	2017-06-29 08:53:16 -0700	[diff] [blame]	590	/*
Andy Lutomirski	4e57b94	2017-10-14 09:59:50 -0700	[diff] [blame]	591	* Please ignore the name of this function. It should be called
				592	* switch_to_kernel_thread().
				593	*
Andy Lutomirski	b956575	2017-10-09 09:50:49 -0700	[diff] [blame]	594	* enter_lazy_tlb() is a hint from the scheduler that we are entering a
				595	* kernel thread or other context without an mm. Acceptable implementations
				596	* include doing nothing whatsoever, switching to init_mm, or various clever
				597	* lazy tricks to try to minimize TLB flushes.
				598	*
				599	* The scheduler reserves the right to call enter_lazy_tlb() several times
				600	* in a row. It will notify us that we're going back to a real mm by
				601	* calling switch_mm_irqs_off().
				602	*/
				603	void enter_lazy_tlb(struct mm_struct mm, struct task_struct tsk)
				604	{
				605	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
				606	return;
				607
Nadav Amit	2f4305b	2021-02-20 15:17:08 -0800	[diff] [blame]	608	this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
Andy Lutomirski	b956575	2017-10-09 09:50:49 -0700	[diff] [blame]	609	}
				610
				611	/*
Andy Lutomirski	72c0098	2017-09-06 19:54:53 -0700	[diff] [blame]	612	* Call this when reinitializing a CPU. It fixes the following potential
				613	* problems:
				614	*
				615	* - The ASID changed from what cpu_tlbstate thinks it is (most likely
				616	* because the CPU was taken down and came back up with CR3's PCID
				617	* bits clear. CPU hotplug can do this.
				618	*
				619	* - The TLB contains junk in slots corresponding to inactive ASIDs.
				620	*
				621	* - The CPU went so far out to lunch that it may have missed a TLB
				622	* flush.
				623	*/
				624	void initialize_tlbstate_and_flush(void)
				625	{
				626	int i;
				627	struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
				628	u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
				629	unsigned long cr3 = __read_cr3();
				630
				631	/* Assert that CR3 already references the right mm. */
				632	WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
				633
				634	/*
				635	* Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
				636	* doesn't work like other CR4 bits because it can only be set from
				637	* long mode.)
				638	*/
Andy Lutomirski	7898f79	2017-09-10 08:52:58 -0700	[diff] [blame]	639	WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
Andy Lutomirski	72c0098	2017-09-06 19:54:53 -0700	[diff] [blame]	640	!(cr4_read_shadow() & X86_CR4_PCIDE));
				641
				642	/* Force ASID 0 and force a TLB flush. */
Dave Hansen	50fb83a6	2017-12-04 15:07:54 +0100	[diff] [blame]	643	write_cr3(build_cr3(mm->pgd, 0));
Andy Lutomirski	72c0098	2017-09-06 19:54:53 -0700	[diff] [blame]	644
				645	/* Reinitialize tlbstate. */
Thomas Gleixner	4c71a2b6	2018-11-25 19:33:49 +0100	[diff] [blame]	646	this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
Andy Lutomirski	72c0098	2017-09-06 19:54:53 -0700	[diff] [blame]	647	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
				648	this_cpu_write(cpu_tlbstate.next_asid, 1);
				649	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
				650	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
				651
				652	for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
				653	this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
				654	}
				655
				656	/*
Nadav Amit	4c1ba39	2021-02-20 15:17:05 -0800	[diff] [blame]	657	* flush_tlb_func()'s memory ordering requirement is that any
Andy Lutomirski	b0579ad	2017-06-29 08:53:16 -0700	[diff] [blame]	658	* TLB fills that happen after we flush the TLB are ordered after we
				659	* read active_mm's tlb_gen. We don't need any explicit barriers
				660	* because all x86 flush operations are serializing and the
				661	* atomic64_read operation won't be reordered by the compiler.
				662	*/
Nadav Amit	4c1ba39	2021-02-20 15:17:05 -0800	[diff] [blame]	663	static void flush_tlb_func(void *info)
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	664	{
Andy Lutomirski	b0579ad	2017-06-29 08:53:16 -0700	[diff] [blame]	665	/*
				666	* We have three different tlb_gen values in here. They are:
				667	*
				668	* - mm_tlb_gen: the latest generation.
				669	* - local_tlb_gen: the generation that this CPU has already caught
				670	* up to.
				671	* - f->new_tlb_gen: the generation that the requester of the flush
				672	* wants us to catch up to.
				673	*/
Nadav Amit	4c1ba39	2021-02-20 15:17:05 -0800	[diff] [blame]	674	const struct flush_tlb_info *f = info;
Andy Lutomirski	b0579ad	2017-06-29 08:53:16 -0700	[diff] [blame]	675	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
Andy Lutomirski	10af623	2017-07-24 21:41:38 -0700	[diff] [blame]	676	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
Andy Lutomirski	b0579ad	2017-06-29 08:53:16 -0700	[diff] [blame]	677	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
Andy Lutomirski	10af623	2017-07-24 21:41:38 -0700	[diff] [blame]	678	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
Nadav Amit	4c1ba39	2021-02-20 15:17:05 -0800	[diff] [blame]	679	bool local = smp_processor_id() == f->initiating_cpu;
				680	unsigned long nr_invalidate = 0;
Andy Lutomirski	b0579ad	2017-06-29 08:53:16 -0700	[diff] [blame]	681
Andy Lutomirski	bc0d5a8	2017-06-29 08:53:13 -0700	[diff] [blame]	682	/* This code cannot presently handle being reentered. */
				683	VM_WARN_ON(!irqs_disabled());
				684
Nadav Amit	4c1ba39	2021-02-20 15:17:05 -0800	[diff] [blame]	685	if (!local) {
				686	inc_irq_stat(irq_tlb_count);
				687	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
				688
				689	/* Can only happen on remote CPUs */
				690	if (f->mm && f->mm != loaded_mm)
				691	return;
				692	}
				693
Andy Lutomirski	b956575	2017-10-09 09:50:49 -0700	[diff] [blame]	694	if (unlikely(loaded_mm == &init_mm))
				695	return;
				696
Andy Lutomirski	10af623	2017-07-24 21:41:38 -0700	[diff] [blame]	697	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
Andy Lutomirski	b0579ad	2017-06-29 08:53:16 -0700	[diff] [blame]	698	loaded_mm->context.ctx_id);
				699
Nadav Amit	2f4305b	2021-02-20 15:17:08 -0800	[diff] [blame]	700	if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) {
Andy Lutomirski	b0579ad	2017-06-29 08:53:16 -0700	[diff] [blame]	701	/*
Andy Lutomirski	b956575	2017-10-09 09:50:49 -0700	[diff] [blame]	702	* We're in lazy mode. We need to at least flush our
				703	* paging-structure cache to avoid speculatively reading
				704	* garbage into our TLB. Since switching to init_mm is barely
				705	* slower than a minimal flush, just switch to init_mm.
Rik van Riel	145f573	2018-09-25 23:58:44 -0400	[diff] [blame]	706	*
Nadav Amit	4ce94ea	2021-02-20 15:17:07 -0800	[diff] [blame]	707	* This should be rare, with native_flush_tlb_multi() skipping
Rik van Riel	145f573	2018-09-25 23:58:44 -0400	[diff] [blame]	708	* IPIs to lazy TLB mode CPUs.
Andy Lutomirski	b0579ad	2017-06-29 08:53:16 -0700	[diff] [blame]	709	*/
Andy Lutomirski	b956575	2017-10-09 09:50:49 -0700	[diff] [blame]	710	switch_mm_irqs_off(NULL, &init_mm, NULL);
Andy Lutomirski	b3b90e5	2017-05-22 15:30:02 -0700	[diff] [blame]	711	return;
				712	}
				713
Andy Lutomirski	b0579ad	2017-06-29 08:53:16 -0700	[diff] [blame]	714	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
				715	/*
				716	* There's nothing to do: we're already up to date. This can
				717	* happen if two concurrent flushes happen -- the first flush to
				718	* be handled can catch us all the way up, leaving no work for
				719	* the second flush.
				720	*/
Nadav Amit	4c1ba39	2021-02-20 15:17:05 -0800	[diff] [blame]	721	goto done;
Andy Lutomirski	b0579ad	2017-06-29 08:53:16 -0700	[diff] [blame]	722	}
				723
				724	WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
				725	WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
				726
				727	/*
				728	* If we get to this point, we know that our TLB is out of date.
				729	* This does not strictly imply that we need to flush (it's
				730	* possible that f->new_tlb_gen <= local_tlb_gen), but we're
				731	* going to need to flush in the very near future, so we might
				732	* as well get it over with.
				733	*
				734	* The only question is whether to do a full or partial flush.
				735	*
				736	* We do a partial flush if requested and two extra conditions
				737	* are met:
				738	*
				739	* 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
				740	* we've always done all needed flushes to catch up to
				741	* local_tlb_gen. If, for example, local_tlb_gen == 2 and
				742	* f->new_tlb_gen == 3, then we know that the flush needed to bring
				743	* us up to date for tlb_gen 3 is the partial flush we're
				744	* processing.
				745	*
				746	* As an example of why this check is needed, suppose that there
				747	* are two concurrent flushes. The first is a full flush that
				748	* changes context.tlb_gen from 1 to 2. The second is a partial
				749	* flush that changes context.tlb_gen from 2 to 3. If they get
				750	* processed on this CPU in reverse order, we'll see
				751	* local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
Andy Lutomirski	1299ef1	2018-01-31 08:03:10 -0800	[diff] [blame]	752	* If we were to use __flush_tlb_one_user() and set local_tlb_gen to
Andy Lutomirski	b0579ad	2017-06-29 08:53:16 -0700	[diff] [blame]	753	* 3, we'd be break the invariant: we'd update local_tlb_gen above
				754	* 1 without the full flush that's needed for tlb_gen 2.
				755	*
Ingo Molnar	d9f6e12	2021-03-18 15:28:01 +0100	[diff] [blame]	756	* 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
Andy Lutomirski	b0579ad	2017-06-29 08:53:16 -0700	[diff] [blame]	757	* Partial TLB flushes are not all that much cheaper than full TLB
				758	* flushes, so it seems unlikely that it would be a performance win
				759	* to do a partial flush if that won't bring our TLB fully up to
				760	* date. By doing a full flush instead, we can increase
				761	* local_tlb_gen all the way to mm_tlb_gen and we can probably
				762	* avoid another flush in the very near future.
				763	*/
				764	if (f->end != TLB_FLUSH_ALL &&
				765	f->new_tlb_gen == local_tlb_gen + 1 &&
				766	f->new_tlb_gen == mm_tlb_gen) {
				767	/* Partial flush */
Peter Zijlstra	a31acd3	2018-08-26 12:56:48 +0200	[diff] [blame]	768	unsigned long addr = f->start;
Andy Lutomirski	b0579ad	2017-06-29 08:53:16 -0700	[diff] [blame]	769
Nadav Amit	4c1ba39	2021-02-20 15:17:05 -0800	[diff] [blame]	770	nr_invalidate = (f->end - f->start) >> f->stride_shift;
				771
Andy Lutomirski	a2055ab	2017-05-28 10:00:10 -0700	[diff] [blame]	772	while (addr < f->end) {
Thomas Gleixner	127ac91	2020-04-21 11:20:34 +0200	[diff] [blame]	773	flush_tlb_one_user(addr);
Peter Zijlstra	a31acd3	2018-08-26 12:56:48 +0200	[diff] [blame]	774	addr += 1UL << f->stride_shift;
Andy Lutomirski	b3b90e5	2017-05-22 15:30:02 -0700	[diff] [blame]	775	}
Andy Lutomirski	454bbad	2017-05-28 10:00:12 -0700	[diff] [blame]	776	if (local)
Peter Zijlstra	a31acd3	2018-08-26 12:56:48 +0200	[diff] [blame]	777	count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
Andy Lutomirski	b0579ad	2017-06-29 08:53:16 -0700	[diff] [blame]	778	} else {
				779	/* Full flush. */
Nadav Amit	4c1ba39	2021-02-20 15:17:05 -0800	[diff] [blame]	780	nr_invalidate = TLB_FLUSH_ALL;
				781
Thomas Gleixner	2faf153	2020-04-21 11:20:32 +0200	[diff] [blame]	782	flush_tlb_local();
Andy Lutomirski	b0579ad	2017-06-29 08:53:16 -0700	[diff] [blame]	783	if (local)
				784	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
Andy Lutomirski	b3b90e5	2017-05-22 15:30:02 -0700	[diff] [blame]	785	}
Andy Lutomirski	b0579ad	2017-06-29 08:53:16 -0700	[diff] [blame]	786
				787	/* Both paths above update our state to mm_tlb_gen. */
Andy Lutomirski	10af623	2017-07-24 21:41:38 -0700	[diff] [blame]	788	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	789
Nadav Amit	4c1ba39	2021-02-20 15:17:05 -0800	[diff] [blame]	790	/* Tracing is done in a unified manner to reduce the code size */
				791	done:
				792	trace_tlb_flush(!local ? TLB_REMOTE_SHOOTDOWN :
				793	(f->mm == NULL) ? TLB_LOCAL_SHOOTDOWN :
				794	TLB_LOCAL_MM_SHOOTDOWN,
				795	nr_invalidate);
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	796	}
				797
Nadav Amit	6035152	2021-02-20 15:17:06 -0800	[diff] [blame]	798	static bool tlb_is_not_lazy(int cpu)
Andy Lutomirski	454bbad	2017-05-28 10:00:12 -0700	[diff] [blame]	799	{
Nadav Amit	2f4305b	2021-02-20 15:17:08 -0800	[diff] [blame]	800	return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
Andy Lutomirski	454bbad	2017-05-28 10:00:12 -0700	[diff] [blame]	801	}
				802
Nadav Amit	6035152	2021-02-20 15:17:06 -0800	[diff] [blame]	803	static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
Andy Lutomirski	454bbad	2017-05-28 10:00:12 -0700	[diff] [blame]	804
Nadav Amit	2f4305b	2021-02-20 15:17:08 -0800	[diff] [blame]	805	DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
				806	EXPORT_PER_CPU_SYMBOL(cpu_tlbstate_shared);
Andy Lutomirski	454bbad	2017-05-28 10:00:12 -0700	[diff] [blame]	807
Nadav Amit	4ce94ea	2021-02-20 15:17:07 -0800	[diff] [blame]	808	STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
Thomas Gleixner	29def59	2020-04-21 11:20:36 +0200	[diff] [blame]	809	const struct flush_tlb_info *info)
Rusty Russell	4595f96	2009-01-10 21:58:09 -0800	[diff] [blame]	810	{
Nadav Amit	4ce94ea	2021-02-20 15:17:07 -0800	[diff] [blame]	811	/*
				812	* Do accounting and tracing. Note that there are (and have always been)
				813	* cases in which a remote TLB flush will be traced, but eventually
				814	* would not happen.
				815	*/
Mel Gorman	ec65993	2014-01-21 14:33:16 -0800	[diff] [blame]	816	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
Andy Lutomirski	a2055ab	2017-05-28 10:00:10 -0700	[diff] [blame]	817	if (info->end == TLB_FLUSH_ALL)
Nadav Amit	18c9824	2016-04-01 14:31:23 -0700	[diff] [blame]	818	trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
				819	else
				820	trace_tlb_flush(TLB_REMOTE_SEND_IPI,
Andy Lutomirski	a2055ab	2017-05-28 10:00:10 -0700	[diff] [blame]	821	(info->end - info->start) >> PAGE_SHIFT);
Nadav Amit	18c9824	2016-04-01 14:31:23 -0700	[diff] [blame]	822
Rik van Riel	145f573	2018-09-25 23:58:44 -0400	[diff] [blame]	823	/*
				824	* If no page tables were freed, we can skip sending IPIs to
				825	* CPUs in lazy TLB mode. They will flush the CPU themselves
				826	* at the next context switch.
				827	*
				828	* However, if page tables are getting freed, we need to send the
				829	* IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
				830	* up on the new contents of what used to be page tables, while
				831	* doing a speculative memory access.
				832	*/
Nadav Amit	6035152	2021-02-20 15:17:06 -0800	[diff] [blame]	833	if (info->freed_tables) {
Nadav Amit	4ce94ea	2021-02-20 15:17:07 -0800	[diff] [blame]	834	on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
Nadav Amit	6035152	2021-02-20 15:17:06 -0800	[diff] [blame]	835	} else {
				836	/*
				837	* Although we could have used on_each_cpu_cond_mask(),
				838	* open-coding it has performance advantages, as it eliminates
				839	* the need for indirect calls or retpolines. In addition, it
				840	* allows to use a designated cpumask for evaluating the
				841	* condition, instead of allocating one.
				842	*
				843	* This code works under the assumption that there are no nested
				844	* TLB flushes, an assumption that is already made in
				845	* flush_tlb_mm_range().
				846	*
				847	* cond_cpumask is logically a stack-local variable, but it is
				848	* more efficient to have it off the stack and not to allocate
				849	* it on demand. Preemption is disabled and this code is
				850	* non-reentrant.
				851	*/
				852	struct cpumask *cond_cpumask = this_cpu_ptr(&flush_tlb_mask);
				853	int cpu;
				854
				855	cpumask_clear(cond_cpumask);
				856
				857	for_each_cpu(cpu, cpumask) {
				858	if (tlb_is_not_lazy(cpu))
				859	__cpumask_set_cpu(cpu, cond_cpumask);
				860	}
Nadav Amit	4ce94ea	2021-02-20 15:17:07 -0800	[diff] [blame]	861	on_each_cpu_mask(cond_cpumask, flush_tlb_func, (void *)info, true);
Nadav Amit	6035152	2021-02-20 15:17:06 -0800	[diff] [blame]	862	}
Rusty Russell	4595f96	2009-01-10 21:58:09 -0800	[diff] [blame]	863	}
				864
/*
 * Public entry point for a multi-CPU TLB flush; forwards both arguments
 * unchanged to __flush_tlb_multi().
 */
void flush_tlb_multi(const struct cpumask *cpumask,
		     const struct flush_tlb_info *info)
{
	__flush_tlb_multi(cpumask, info);
}
				870
Dave Hansen	a510247	2014-07-31 08:41:03 -0700	[diff] [blame]	871	/*
Mauro Carvalho Chehab	cb1aaeb	2019-06-07 15:54:32 -0300	[diff] [blame]	872	* See Documentation/x86/tlb.rst for details. We choose 33
Dave Hansen	a510247	2014-07-31 08:41:03 -0700	[diff] [blame]	873	* because it is large enough to cover the vast majority (at
				874	* least 95%) of allocations, and is small enough that we are
				875	* confident it will not cause too much overhead. Each single
				876	* flush is about 100 ns, so this caps the maximum overhead at
				877	* _about_ 3,000 ns.
				878	*
				879	* This is in units of pages.
				880	*/
Peter Zijlstra	935f583	2018-12-03 18:03:49 +0100	[diff] [blame]	881	unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
Dave Hansen	e9f4e0a	2014-07-31 08:40:55 -0700	[diff] [blame]	882
Nadav Amit	3db6d5a	2019-04-25 16:01:43 -0700	[diff] [blame]	883	static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
				884
				885	#ifdef CONFIG_DEBUG_VM
				886	static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
				887	#endif
				888
Nadav Amit	1608e4c	2021-02-20 15:17:11 -0800	[diff] [blame]	889	static struct flush_tlb_info get_flush_tlb_info(struct mm_struct mm,
Nadav Amit	3db6d5a	2019-04-25 16:01:43 -0700	[diff] [blame]	890	unsigned long start, unsigned long end,
				891	unsigned int stride_shift, bool freed_tables,
				892	u64 new_tlb_gen)
				893	{
				894	struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
				895
				896	#ifdef CONFIG_DEBUG_VM
				897	/*
				898	* Ensure that the following code is non-reentrant and flush_tlb_info
				899	* is not overwritten. This means no TLB flushing is initiated by
				900	* interrupt handlers and machine-check exception handlers.
				901	*/
				902	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
				903	#endif
				904
				905	info->start = start;
				906	info->end = end;
				907	info->mm = mm;
				908	info->stride_shift = stride_shift;
				909	info->freed_tables = freed_tables;
				910	info->new_tlb_gen = new_tlb_gen;
Nadav Amit	4c1ba39	2021-02-20 15:17:05 -0800	[diff] [blame]	911	info->initiating_cpu = smp_processor_id();
Nadav Amit	3db6d5a	2019-04-25 16:01:43 -0700	[diff] [blame]	912
				913	return info;
				914	}
				915
/*
 * Counterpart of get_flush_tlb_info().  With CONFIG_DEBUG_VM it drops the
 * nesting counter taken there; otherwise it does nothing.
 */
static void put_flush_tlb_info(void)
{
#ifdef CONFIG_DEBUG_VM
	/*
	 * Finish the reentrancy check; the compiler barrier keeps the
	 * decrement from being moved ahead of preceding code.
	 */
	barrier();
	this_cpu_dec(flush_tlb_info_idx);
#endif
}
				924
Alex Shi	611ae8e	2012-06-28 09:02:22 +0800	[diff] [blame]	925	void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
Rik van Riel	016c4d9	2018-09-25 23:58:42 -0400	[diff] [blame]	926	unsigned long end, unsigned int stride_shift,
				927	bool freed_tables)
Alex Shi	611ae8e	2012-06-28 09:02:22 +0800	[diff] [blame]	928	{
Nadav Amit	3db6d5a	2019-04-25 16:01:43 -0700	[diff] [blame]	929	struct flush_tlb_info *info;
				930	u64 new_tlb_gen;
Andy Lutomirski	454bbad	2017-05-28 10:00:12 -0700	[diff] [blame]	931	int cpu;
Alex Shi	611ae8e	2012-06-28 09:02:22 +0800	[diff] [blame]	932
Andy Lutomirski	454bbad	2017-05-28 10:00:12 -0700	[diff] [blame]	933	cpu = get_cpu();
Andy Lutomirski	ce27374	2017-04-22 00:01:21 -0700	[diff] [blame]	934
Andy Lutomirski	454bbad	2017-05-28 10:00:12 -0700	[diff] [blame]	935	/* Should we flush just the requested range? */
Nadav Amit	3db6d5a	2019-04-25 16:01:43 -0700	[diff] [blame]	936	if ((end == TLB_FLUSH_ALL) \|\|
				937	((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
				938	start = 0;
				939	end = TLB_FLUSH_ALL;
Dave Hansen	4995ab9	2014-07-31 08:40:54 -0700	[diff] [blame]	940	}
Andy Lutomirski	454bbad	2017-05-28 10:00:12 -0700	[diff] [blame]	941
Nadav Amit	3db6d5a	2019-04-25 16:01:43 -0700	[diff] [blame]	942	/* This is also a barrier that synchronizes with switch_mm(). */
				943	new_tlb_gen = inc_mm_tlb_gen(mm);
				944
				945	info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
				946	new_tlb_gen);
				947
Nadav Amit	4ce94ea	2021-02-20 15:17:07 -0800	[diff] [blame]	948	/*
				949	* flush_tlb_multi() is not optimized for the common case in which only
				950	* a local TLB flush is needed. Optimize this use-case by calling
				951	* flush_tlb_func_local() directly in this case.
				952	*/
				953	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
				954	flush_tlb_multi(mm_cpumask(mm), info);
				955	} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
Nadav Amit	3db6d5a	2019-04-25 16:01:43 -0700	[diff] [blame]	956	lockdep_assert_irqs_enabled();
Andy Lutomirski	bc0d5a8	2017-06-29 08:53:13 -0700	[diff] [blame]	957	local_irq_disable();
Nadav Amit	4c1ba39	2021-02-20 15:17:05 -0800	[diff] [blame]	958	flush_tlb_func(info);
Andy Lutomirski	bc0d5a8	2017-06-29 08:53:13 -0700	[diff] [blame]	959	local_irq_enable();
				960	}
				961
Nadav Amit	3db6d5a	2019-04-25 16:01:43 -0700	[diff] [blame]	962	put_flush_tlb_info();
Andy Lutomirski	454bbad	2017-05-28 10:00:12 -0700	[diff] [blame]	963	put_cpu();
Alex Shi	e7b52ff	2012-06-28 09:02:17 +0800	[diff] [blame]	964	}
				965
Andy Lutomirski	a2055ab	2017-05-28 10:00:10 -0700	[diff] [blame]	966
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	967	static void do_flush_tlb_all(void *info)
				968	{
Mel Gorman	ec65993	2014-01-21 14:33:16 -0800	[diff] [blame]	969	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	970	__flush_tlb_all();
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	971	}
				972
				973	void flush_tlb_all(void)
				974	{
Mel Gorman	ec65993	2014-01-21 14:33:16 -0800	[diff] [blame]	975	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
Jens Axboe	15c8b6c	2008-05-09 09:39:44 +0200	[diff] [blame]	976	on_each_cpu(do_flush_tlb_all, NULL, 1);
Glauber Costa	c048fdf	2008-03-03 14:12:54 -0300	[diff] [blame]	977	}
Alex Shi	3df3212	2012-06-28 09:02:20 +0800	[diff] [blame]	978
Alex Shi	effee4b	2012-06-28 09:02:24 +0800	[diff] [blame]	979	static void do_kernel_range_flush(void *info)
				980	{
				981	struct flush_tlb_info *f = info;
				982	unsigned long addr;
				983
				984	/* flush range by one by one 'invlpg' */
Andy Lutomirski	a2055ab	2017-05-28 10:00:10 -0700	[diff] [blame]	985	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
Thomas Gleixner	58430c5	2020-04-21 11:20:35 +0200	[diff] [blame]	986	flush_tlb_one_kernel(addr);
Alex Shi	effee4b	2012-06-28 09:02:24 +0800	[diff] [blame]	987	}
				988
				989	void flush_tlb_kernel_range(unsigned long start, unsigned long end)
				990	{
Alex Shi	effee4b	2012-06-28 09:02:24 +0800	[diff] [blame]	991	/* Balance as user space task's flush, a bit conservative */
Dave Hansen	e9f4e0a	2014-07-31 08:40:55 -0700	[diff] [blame]	992	if (end == TLB_FLUSH_ALL \|\|
Andy Lutomirski	be4ffc0d	2017-05-28 10:00:16 -0700	[diff] [blame]	993	(end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
Alex Shi	effee4b	2012-06-28 09:02:24 +0800	[diff] [blame]	994	on_each_cpu(do_flush_tlb_all, NULL, 1);
Dave Hansen	e9f4e0a	2014-07-31 08:40:55 -0700	[diff] [blame]	995	} else {
Nadav Amit	3db6d5a	2019-04-25 16:01:43 -0700	[diff] [blame]	996	struct flush_tlb_info *info;
				997
				998	preempt_disable();
				999	info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
				1000
				1001	on_each_cpu(do_kernel_range_flush, info, 1);
				1002
				1003	put_flush_tlb_info();
				1004	preempt_enable();
Alex Shi	effee4b	2012-06-28 09:02:24 +0800	[diff] [blame]	1005	}
				1006	}
Dave Hansen	2d040a1	2014-07-31 08:41:01 -0700	[diff] [blame]	1007
Nadav Amit	3db6d5a	2019-04-25 16:01:43 -0700	[diff] [blame]	1008	/*
Thomas Gleixner	8c5cc19	2020-04-21 11:20:28 +0200	[diff] [blame]	1009	* This can be used from process context to figure out what the value of
				1010	* CR3 is without needing to do a (slow) __read_cr3().
				1011	*
				1012	* It's intended to be used for code like KVM that sneakily changes CR3
				1013	* and needs to restore it. It needs to be used very carefully.
				1014	*/
				1015	unsigned long __get_current_cr3_fast(void)
				1016	{
				1017	unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
				1018	this_cpu_read(cpu_tlbstate.loaded_mm_asid));
				1019
				1020	/* For now, be very restrictive about when this can be called. */
				1021	VM_WARN_ON(in_nmi() \|\| preemptible());
				1022
				1023	VM_BUG_ON(cr3 != __read_cr3());
				1024	return cr3;
				1025	}
				1026	EXPORT_SYMBOL_GPL(__get_current_cr3_fast);
				1027
				1028	/*
Thomas Gleixner	58430c5	2020-04-21 11:20:35 +0200	[diff] [blame]	1029	* Flush one page in the kernel mapping
				1030	*/
				1031	void flush_tlb_one_kernel(unsigned long addr)
				1032	{
				1033	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
				1034
				1035	/*
				1036	* If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
				1037	* paravirt equivalent. Even with PCID, this is sufficient: we only
				1038	* use PCID if we also use global PTEs for the kernel mapping, and
				1039	* INVLPG flushes global translations across all address spaces.
				1040	*
				1041	* If PTI is on, then the kernel is mapped with non-global PTEs, and
				1042	* __flush_tlb_one_user() will flush the given address for the current
				1043	* kernel address space and for its usermode counterpart, but it does
				1044	* not flush it for other address spaces.
				1045	*/
				1046	flush_tlb_one_user(addr);
				1047
				1048	if (!static_cpu_has(X86_FEATURE_PTI))
				1049	return;
				1050
				1051	/*
				1052	* See above. We need to propagate the flush to all other address
				1053	* spaces. In principle, we only need to propagate it to kernelmode
				1054	* address spaces, but the extra bookkeeping we would need is not
				1055	* worth it.
				1056	*/
				1057	this_cpu_write(cpu_tlbstate.invalidate_other, true);
				1058	}
				1059
				1060	/*
Thomas Gleixner	127ac91	2020-04-21 11:20:34 +0200	[diff] [blame]	1061	* Flush one page in the user mapping
				1062	*/
				1063	STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
				1064	{
				1065	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
				1066
				1067	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
				1068
				1069	if (!static_cpu_has(X86_FEATURE_PTI))
				1070	return;
				1071
				1072	/*
				1073	* Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
				1074	* Just use invalidate_user_asid() in case we are called early.
				1075	*/
				1076	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
				1077	invalidate_user_asid(loaded_mm_asid);
				1078	else
				1079	invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
				1080	}
				1081
				1082	void flush_tlb_one_user(unsigned long addr)
				1083	{
				1084	__flush_tlb_one_user(addr);
				1085	}
				1086
				1087	/*
Thomas Gleixner	cd30d26	2020-04-21 11:20:33 +0200	[diff] [blame]	1088	* Flush everything
				1089	*/
				1090	STATIC_NOPV void native_flush_tlb_global(void)
				1091	{
				1092	unsigned long cr4, flags;
				1093
				1094	if (static_cpu_has(X86_FEATURE_INVPCID)) {
				1095	/*
				1096	* Using INVPCID is considerably faster than a pair of writes
				1097	* to CR4 sandwiched inside an IRQ flag save/restore.
				1098	*
				1099	* Note, this works with CR4.PCIDE=0 or 1.
				1100	*/
				1101	invpcid_flush_all();
				1102	return;
				1103	}
				1104
				1105	/*
				1106	* Read-modify-write to CR4 - protect it from preemption and
				1107	* from interrupts. (Use the raw variant because this code can
				1108	* be called from deep inside debugging code.)
				1109	*/
				1110	raw_local_irq_save(flags);
				1111
				1112	cr4 = this_cpu_read(cpu_tlbstate.cr4);
				1113	/* toggle PGE */
				1114	native_write_cr4(cr4 ^ X86_CR4_PGE);
				1115	/* write old PGE again and flush TLBs */
				1116	native_write_cr4(cr4);
				1117
				1118	raw_local_irq_restore(flags);
				1119	}
				1120
Thomas Gleixner	cd30d26	2020-04-21 11:20:33 +0200	[diff] [blame]	1121	/*
Thomas Gleixner	2faf153	2020-04-21 11:20:32 +0200	[diff] [blame]	1122	* Flush the entire current user mapping
				1123	*/
				1124	STATIC_NOPV void native_flush_tlb_local(void)
				1125	{
				1126	/*
				1127	* Preemption or interrupts must be disabled to protect the access
				1128	* to the per CPU variable and to prevent being preempted between
				1129	* read_cr3() and write_cr3().
				1130	*/
				1131	WARN_ON_ONCE(preemptible());
				1132
				1133	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
				1134
				1135	/* If current->mm == NULL then the read_cr3() "borrows" an mm */
				1136	native_write_cr3(__native_read_cr3());
				1137	}
				1138
				1139	void flush_tlb_local(void)
				1140	{
				1141	__flush_tlb_local();
				1142	}
Thomas Gleixner	4b04e6c	2020-04-21 11:20:37 +0200	[diff] [blame]	1143
				1144	/*
				1145	* Flush everything
				1146	*/
				1147	void __flush_tlb_all(void)
				1148	{
				1149	/*
				1150	* This is to catch users with enabled preemption and the PGE feature
				1151	* and don't trigger the warning in __native_flush_tlb().
				1152	*/
				1153	VM_WARN_ON_ONCE(preemptible());
				1154
				1155	if (boot_cpu_has(X86_FEATURE_PGE)) {
				1156	__flush_tlb_global();
				1157	} else {
				1158	/*
				1159	* !PGE -> !PCID (setup_pcid()), thus every flush is total.
				1160	*/
				1161	flush_tlb_local();
				1162	}
				1163	}
				1164	EXPORT_SYMBOL_GPL(__flush_tlb_all);
Thomas Gleixner	2faf153	2020-04-21 11:20:32 +0200	[diff] [blame]	1165
Andy Lutomirski	e73ad5f	2017-05-22 15:30:03 -0700	[diff] [blame]	1166	void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
				1167	{
Nadav Amit	4c1ba39	2021-02-20 15:17:05 -0800	[diff] [blame]	1168	struct flush_tlb_info *info;
				1169
Andy Lutomirski	e73ad5f	2017-05-22 15:30:03 -0700	[diff] [blame]	1170	int cpu = get_cpu();
				1171
Nadav Amit	4c1ba39	2021-02-20 15:17:05 -0800	[diff] [blame]	1172	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, 0);
Nadav Amit	4ce94ea	2021-02-20 15:17:07 -0800	[diff] [blame]	1173	/*
				1174	* flush_tlb_multi() is not optimized for the common case in which only
				1175	* a local TLB flush is needed. Optimize this use-case by calling
				1176	* flush_tlb_func_local() directly in this case.
				1177	*/
				1178	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
				1179	flush_tlb_multi(&batch->cpumask, info);
				1180	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
Nadav Amit	3db6d5a	2019-04-25 16:01:43 -0700	[diff] [blame]	1181	lockdep_assert_irqs_enabled();
Andy Lutomirski	bc0d5a8	2017-06-29 08:53:13 -0700	[diff] [blame]	1182	local_irq_disable();
Nadav Amit	4c1ba39	2021-02-20 15:17:05 -0800	[diff] [blame]	1183	flush_tlb_func(info);
Andy Lutomirski	bc0d5a8	2017-06-29 08:53:13 -0700	[diff] [blame]	1184	local_irq_enable();
				1185	}
				1186
Andy Lutomirski	e73ad5f	2017-05-22 15:30:03 -0700	[diff] [blame]	1187	cpumask_clear(&batch->cpumask);
				1188
Nadav Amit	4c1ba39	2021-02-20 15:17:05 -0800	[diff] [blame]	1189	put_flush_tlb_info();
Andy Lutomirski	e73ad5f	2017-05-22 15:30:03 -0700	[diff] [blame]	1190	put_cpu();
				1191	}
				1192
Thomas Gleixner	af5c40c	2020-04-21 11:20:40 +0200	[diff] [blame]	1193	/*
				1194	* Blindly accessing user memory from NMI context can be dangerous
				1195	* if we're in the middle of switching the current user task or
				1196	* switching the loaded mm. It can also be dangerous if we
				1197	* interrupted some kernel code that was temporarily using a
				1198	* different mm.
				1199	*/
				1200	bool nmi_uaccess_okay(void)
				1201	{
				1202	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
				1203	struct mm_struct *current_mm = current->mm;
				1204
				1205	VM_WARN_ON_ONCE(!loaded_mm);
				1206
				1207	/*
				1208	* The condition we want to check is
				1209	* current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
				1210	* if we're running in a VM with shadow paging, and nmi_uaccess_okay()
				1211	* is supposed to be reasonably fast.
				1212	*
				1213	* Instead, we check the almost equivalent but somewhat conservative
				1214	* condition below, and we rely on the fact that switch_mm_irqs_off()
				1215	* sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
				1216	*/
				1217	if (loaded_mm != current_mm)
				1218	return false;
				1219
				1220	VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
				1221
				1222	return true;
				1223	}
				1224
Dave Hansen	2d040a1	2014-07-31 08:41:01 -0700	[diff] [blame]	1225	static ssize_t tlbflush_read_file(struct file file, char __user user_buf,
				1226	size_t count, loff_t *ppos)
				1227	{
				1228	char buf[32];
				1229	unsigned int len;
				1230
				1231	len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
				1232	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
				1233	}
				1234
				1235	static ssize_t tlbflush_write_file(struct file *file,
				1236	const char __user user_buf, size_t count, loff_t ppos)
				1237	{
				1238	char buf[32];
				1239	ssize_t len;
				1240	int ceiling;
				1241
				1242	len = min(count, sizeof(buf) - 1);
				1243	if (copy_from_user(buf, user_buf, len))
				1244	return -EFAULT;
				1245
				1246	buf[len] = '\0';
				1247	if (kstrtoint(buf, 0, &ceiling))
				1248	return -EINVAL;
				1249
				1250	if (ceiling < 0)
				1251	return -EINVAL;
				1252
				1253	tlb_single_page_flush_ceiling = ceiling;
				1254	return count;
				1255	}
				1256
				1257	static const struct file_operations fops_tlbflush = {
				1258	.read = tlbflush_read_file,
				1259	.write = tlbflush_write_file,
				1260	.llseek = default_llseek,
				1261	};
				1262
				1263	static int __init create_tlb_single_page_flush_ceiling(void)
				1264	{
				1265	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR \| S_IWUSR,
				1266	arch_debugfs_dir, NULL, &fops_tlbflush);
				1267	return 0;
				1268	}
				1269	late_initcall(create_tlb_single_page_flush_ceiling);