Blame - mm/migrate_device.c - linux

blob: 70c7dc05bbfca4003acd575a786bf950e34d6d32 [file] [log] [blame]

Christoph Hellwig	76cbbea	2022-02-16 15:31:38 +1100	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Device Memory Migration functionality.
				4	*
				5	* Originally written by Jérôme Glisse.
				6	*/
				7	#include <linux/export.h>
				8	#include <linux/memremap.h>
				9	#include <linux/migrate.h>
				10	#include <linux/mm_inline.h>
				11	#include <linux/mmu_notifier.h>
				12	#include <linux/oom.h>
				13	#include <linux/pagewalk.h>
				14	#include <linux/rmap.h>
				15	#include <linux/swapops.h>
				16	#include <asm/tlbflush.h>
				17	#include "internal.h"
				18
				19	static int migrate_vma_collect_skip(unsigned long start,
				20	unsigned long end,
				21	struct mm_walk *walk)
				22	{
				23	struct migrate_vma *migrate = walk->private;
				24	unsigned long addr;
				25
				26	for (addr = start; addr < end; addr += PAGE_SIZE) {
				27	migrate->dst[migrate->npages] = 0;
				28	migrate->src[migrate->npages++] = 0;
				29	}
				30
				31	return 0;
				32	}
				33
				34	static int migrate_vma_collect_hole(unsigned long start,
				35	unsigned long end,
				36	__always_unused int depth,
				37	struct mm_walk *walk)
				38	{
				39	struct migrate_vma *migrate = walk->private;
				40	unsigned long addr;
				41
				42	/* Only allow populating anonymous memory. */
				43	if (!vma_is_anonymous(walk->vma))
				44	return migrate_vma_collect_skip(start, end, walk);
				45
				46	for (addr = start; addr < end; addr += PAGE_SIZE) {
				47	migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
				48	migrate->dst[migrate->npages] = 0;
				49	migrate->npages++;
				50	migrate->cpages++;
				51	}
				52
				53	return 0;
				54	}
				55
				56	static int migrate_vma_collect_pmd(pmd_t *pmdp,
				57	unsigned long start,
				58	unsigned long end,
				59	struct mm_walk *walk)
				60	{
				61	struct migrate_vma *migrate = walk->private;
				62	struct vm_area_struct *vma = walk->vma;
				63	struct mm_struct *mm = vma->vm_mm;
				64	unsigned long addr = start, unmapped = 0;
				65	spinlock_t *ptl;
				66	pte_t *ptep;
				67
				68	again:
				69	if (pmd_none(*pmdp))
				70	return migrate_vma_collect_hole(start, end, -1, walk);
				71
				72	if (pmd_trans_huge(*pmdp)) {
				73	struct page *page;
				74
				75	ptl = pmd_lock(mm, pmdp);
				76	if (unlikely(!pmd_trans_huge(*pmdp))) {
				77	spin_unlock(ptl);
				78	goto again;
				79	}
				80
				81	page = pmd_page(*pmdp);
				82	if (is_huge_zero_page(page)) {
				83	spin_unlock(ptl);
				84	split_huge_pmd(vma, pmdp, addr);
				85	if (pmd_trans_unstable(pmdp))
				86	return migrate_vma_collect_skip(start, end,
				87	walk);
				88	} else {
				89	int ret;
				90
				91	get_page(page);
				92	spin_unlock(ptl);
				93	if (unlikely(!trylock_page(page)))
				94	return migrate_vma_collect_skip(start, end,
				95	walk);
				96	ret = split_huge_page(page);
				97	unlock_page(page);
				98	put_page(page);
				99	if (ret)
				100	return migrate_vma_collect_skip(start, end,
				101	walk);
				102	if (pmd_none(*pmdp))
				103	return migrate_vma_collect_hole(start, end, -1,
				104	walk);
				105	}
				106	}
				107
				108	if (unlikely(pmd_bad(*pmdp)))
				109	return migrate_vma_collect_skip(start, end, walk);
				110
				111	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
				112	arch_enter_lazy_mmu_mode();
				113
				114	for (; addr < end; addr += PAGE_SIZE, ptep++) {
				115	unsigned long mpfn = 0, pfn;
				116	struct page *page;
				117	swp_entry_t entry;
				118	pte_t pte;
				119
				120	pte = *ptep;
				121
				122	if (pte_none(pte)) {
				123	if (vma_is_anonymous(vma)) {
				124	mpfn = MIGRATE_PFN_MIGRATE;
				125	migrate->cpages++;
				126	}
				127	goto next;
				128	}
				129
				130	if (!pte_present(pte)) {
				131	/*
				132	* Only care about unaddressable device page special
				133	* page table entry. Other special swap entries are not
				134	* migratable, and we ignore regular swapped page.
				135	*/
				136	entry = pte_to_swp_entry(pte);
				137	if (!is_device_private_entry(entry))
				138	goto next;
				139
				140	page = pfn_swap_entry_to_page(entry);
				141	if (!(migrate->flags &
				142	MIGRATE_VMA_SELECT_DEVICE_PRIVATE) \|\|
				143	page->pgmap->owner != migrate->pgmap_owner)
				144	goto next;
				145
				146	mpfn = migrate_pfn(page_to_pfn(page)) \|
				147	MIGRATE_PFN_MIGRATE;
				148	if (is_writable_device_private_entry(entry))
				149	mpfn \|= MIGRATE_PFN_WRITE;
				150	} else {
				151	if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
				152	goto next;
				153	pfn = pte_pfn(pte);
				154	if (is_zero_pfn(pfn)) {
				155	mpfn = MIGRATE_PFN_MIGRATE;
				156	migrate->cpages++;
				157	goto next;
				158	}
				159	page = vm_normal_page(migrate->vma, addr, pte);
				160	mpfn = migrate_pfn(pfn) \| MIGRATE_PFN_MIGRATE;
				161	mpfn \|= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
				162	}
				163
				164	/* FIXME support THP */
				165	if (!page \|\| !page->mapping \|\| PageTransCompound(page)) {
				166	mpfn = 0;
				167	goto next;
				168	}
				169
				170	/*
				171	* By getting a reference on the page we pin it and that blocks
				172	* any kind of migration. Side effect is that it "freezes" the
				173	* pte.
				174	*
				175	* We drop this reference after isolating the page from the lru
				176	* for non device page (device page are not on the lru and thus
				177	* can't be dropped from it).
				178	*/
				179	get_page(page);
				180
				181	/*
				182	* Optimize for the common case where page is only mapped once
				183	* in one process. If we can lock the page, then we can safely
				184	* set up a special migration page table entry now.
				185	*/
				186	if (trylock_page(page)) {
				187	pte_t swp_pte;
				188
				189	migrate->cpages++;
				190	ptep_get_and_clear(mm, addr, ptep);
				191
				192	/* Setup special migration page table entry */
				193	if (mpfn & MIGRATE_PFN_WRITE)
				194	entry = make_writable_migration_entry(
				195	page_to_pfn(page));
				196	else
				197	entry = make_readable_migration_entry(
				198	page_to_pfn(page));
				199	swp_pte = swp_entry_to_pte(entry);
				200	if (pte_present(pte)) {
				201	if (pte_soft_dirty(pte))
				202	swp_pte = pte_swp_mksoft_dirty(swp_pte);
				203	if (pte_uffd_wp(pte))
				204	swp_pte = pte_swp_mkuffd_wp(swp_pte);
				205	} else {
				206	if (pte_swp_soft_dirty(pte))
				207	swp_pte = pte_swp_mksoft_dirty(swp_pte);
				208	if (pte_swp_uffd_wp(pte))
				209	swp_pte = pte_swp_mkuffd_wp(swp_pte);
				210	}
				211	set_pte_at(mm, addr, ptep, swp_pte);
				212
				213	/*
				214	* This is like regular unmap: we remove the rmap and
				215	* drop page refcount. Page won't be freed, as we took
				216	* a reference just above.
				217	*/
				218	page_remove_rmap(page, vma, false);
				219	put_page(page);
				220
				221	if (pte_present(pte))
				222	unmapped++;
				223	} else {
				224	put_page(page);
				225	mpfn = 0;
				226	}
				227
				228	next:
				229	migrate->dst[migrate->npages] = 0;
				230	migrate->src[migrate->npages++] = mpfn;
				231	}
				232	arch_leave_lazy_mmu_mode();
				233	pte_unmap_unlock(ptep - 1, ptl);
				234
				235	/* Only flush the TLB if we actually modified any entries */
				236	if (unmapped)
				237	flush_tlb_range(walk->vma, start, end);
				238
				239	return 0;
				240	}
				241
				242	static const struct mm_walk_ops migrate_vma_walk_ops = {
				243	.pmd_entry = migrate_vma_collect_pmd,
				244	.pte_hole = migrate_vma_collect_hole,
				245	};
				246
				247	/*
				248	* migrate_vma_collect() - collect pages over a range of virtual addresses
				249	* @migrate: migrate struct containing all migration information
				250	*
				251	* This will walk the CPU page table. For each virtual address backed by a
				252	* valid page, it updates the src array and takes a reference on the page, in
				253	* order to pin the page until we lock it and unmap it.
				254	*/
				255	static void migrate_vma_collect(struct migrate_vma *migrate)
				256	{
				257	struct mmu_notifier_range range;
				258
				259	/*
				260	* Note that the pgmap_owner is passed to the mmu notifier callback so
				261	* that the registered device driver can skip invalidating device
				262	* private page mappings that won't be migrated.
				263	*/
				264	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
				265	migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
				266	migrate->pgmap_owner);
				267	mmu_notifier_invalidate_range_start(&range);
				268
				269	walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
				270	&migrate_vma_walk_ops, migrate);
				271
				272	mmu_notifier_invalidate_range_end(&range);
				273	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
				274	}
				275
				276	/*
				277	* migrate_vma_check_page() - check if page is pinned or not
				278	* @page: struct page to check
				279	*
				280	* Pinned pages cannot be migrated. This is the same test as in
				281	* folio_migrate_mapping(), except that here we allow migration of a
				282	* ZONE_DEVICE page.
				283	*/
				284	static bool migrate_vma_check_page(struct page *page)
				285	{
				286	/*
				287	* One extra ref because caller holds an extra reference, either from
				288	* isolate_lru_page() for a regular page, or migrate_vma_collect() for
				289	* a device page.
				290	*/
				291	int extra = 1;
				292
				293	/*
				294	* FIXME support THP (transparent huge page), it is bit more complex to
				295	* check them than regular pages, because they can be mapped with a pmd
				296	* or with a pte (split pte mapping).
				297	*/
				298	if (PageCompound(page))
				299	return false;
				300
				301	/* Page from ZONE_DEVICE have one extra reference */
				302	if (is_zone_device_page(page))
				303	extra++;
				304
				305	/* For file back page */
				306	if (page_mapping(page))
				307	extra += 1 + page_has_private(page);
				308
				309	if ((page_count(page) - extra) > page_mapcount(page))
				310	return false;
				311
				312	return true;
				313	}
				314
				315	/*
				316	* migrate_vma_unmap() - replace page mapping with special migration pte entry
				317	* @migrate: migrate struct containing all migration information
				318	*
				319	* Isolate pages from the LRU and replace mappings (CPU page table pte) with a
				320	* special migration pte entry and check if it has been pinned. Pinned pages are
				321	* restored because we cannot migrate them.
				322	*
				323	* This is the last step before we call the device driver callback to allocate
				324	* destination memory and copy contents of original page over to new page.
				325	*/
				326	static void migrate_vma_unmap(struct migrate_vma *migrate)
				327	{
				328	const unsigned long npages = migrate->npages;
				329	unsigned long i, restore = 0;
				330	bool allow_drain = true;
				331
				332	lru_add_drain();
				333
				334	for (i = 0; i < npages; i++) {
				335	struct page *page = migrate_pfn_to_page(migrate->src[i]);
Matthew Wilcox (Oracle)	4b8554c	2022-01-28 14:29:43 -0500	[diff] [blame]	336	struct folio *folio;
Christoph Hellwig	76cbbea	2022-02-16 15:31:38 +1100	[diff] [blame]	337
				338	if (!page)
				339	continue;
				340
				341	/* ZONE_DEVICE pages are not on LRU */
				342	if (!is_zone_device_page(page)) {
				343	if (!PageLRU(page) && allow_drain) {
				344	/* Drain CPU's pagevec */
				345	lru_add_drain_all();
				346	allow_drain = false;
				347	}
				348
				349	if (isolate_lru_page(page)) {
				350	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				351	migrate->cpages--;
				352	restore++;
				353	continue;
				354	}
				355
				356	/* Drop the reference we took in collect */
				357	put_page(page);
				358	}
				359
Matthew Wilcox (Oracle)	4b8554c	2022-01-28 14:29:43 -0500	[diff] [blame]	360	folio = page_folio(page);
				361	if (folio_mapped(folio))
				362	try_to_migrate(folio, 0);
Christoph Hellwig	76cbbea	2022-02-16 15:31:38 +1100	[diff] [blame]	363
				364	if (page_mapped(page) \|\| !migrate_vma_check_page(page)) {
				365	if (!is_zone_device_page(page)) {
				366	get_page(page);
				367	putback_lru_page(page);
				368	}
				369
				370	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				371	migrate->cpages--;
				372	restore++;
				373	continue;
				374	}
				375	}
				376
				377	for (i = 0; i < npages && restore; i++) {
				378	struct page *page = migrate_pfn_to_page(migrate->src[i]);
Matthew Wilcox (Oracle)	4eecb8b	2022-01-28 23:32:59 -0500	[diff] [blame]	379	struct folio *folio;
Christoph Hellwig	76cbbea	2022-02-16 15:31:38 +1100	[diff] [blame]	380
				381	if (!page \|\| (migrate->src[i] & MIGRATE_PFN_MIGRATE))
				382	continue;
				383
Matthew Wilcox (Oracle)	4eecb8b	2022-01-28 23:32:59 -0500	[diff] [blame]	384	folio = page_folio(page);
				385	remove_migration_ptes(folio, folio, false);
Christoph Hellwig	76cbbea	2022-02-16 15:31:38 +1100	[diff] [blame]	386
				387	migrate->src[i] = 0;
Matthew Wilcox (Oracle)	4eecb8b	2022-01-28 23:32:59 -0500	[diff] [blame]	388	folio_unlock(folio);
				389	folio_put(folio);
Christoph Hellwig	76cbbea	2022-02-16 15:31:38 +1100	[diff] [blame]	390	restore--;
				391	}
				392	}
				393
				394	/**
				395	* migrate_vma_setup() - prepare to migrate a range of memory
				396	* @args: contains the vma, start, and pfns arrays for the migration
				397	*
				398	* Returns: negative errno on failures, 0 when 0 or more pages were migrated
				399	* without an error.
				400	*
				401	* Prepare to migrate a range of memory virtual address range by collecting all
				402	* the pages backing each virtual address in the range, saving them inside the
				403	* src array. Then lock those pages and unmap them. Once the pages are locked
				404	* and unmapped, check whether each page is pinned or not. Pages that aren't
				405	* pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
				406	* corresponding src array entry. Then restores any pages that are pinned, by
				407	* remapping and unlocking those pages.
				408	*
				409	* The caller should then allocate destination memory and copy source memory to
				410	* it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
				411	* flag set). Once these are allocated and copied, the caller must update each
				412	* corresponding entry in the dst array with the pfn value of the destination
				413	* page and with MIGRATE_PFN_VALID. Destination pages must be locked via
				414	* lock_page().
				415	*
				416	* Note that the caller does not have to migrate all the pages that are marked
				417	* with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
				418	* device memory to system memory. If the caller cannot migrate a device page
				419	* back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
				420	* consequences for the userspace process, so it must be avoided if at all
				421	* possible.
				422	*
				423	* For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
				424	* do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
				425	* allowing the caller to allocate device memory for those unbacked virtual
				426	* addresses. For this the caller simply has to allocate device memory and
				427	* properly set the destination entry like for regular migration. Note that
				428	* this can still fail, and thus inside the device driver you must check if the
				429	* migration was successful for those entries after calling migrate_vma_pages(),
				430	* just like for regular migration.
				431	*
				432	* After that, the callers must call migrate_vma_pages() to go over each entry
				433	* in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
				434	* set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
				435	* then migrate_vma_pages() to migrate struct page information from the source
				436	* struct page to the destination struct page. If it fails to migrate the
				437	* struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
				438	* src array.
				439	*
				440	* At this point all successfully migrated pages have an entry in the src
				441	* array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
				442	* array entry with MIGRATE_PFN_VALID flag set.
				443	*
				444	* Once migrate_vma_pages() returns the caller may inspect which pages were
				445	* successfully migrated, and which were not. Successfully migrated pages will
				446	* have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
				447	*
				448	* It is safe to update device page table after migrate_vma_pages() because
				449	* both destination and source page are still locked, and the mmap_lock is held
				450	* in read mode (hence no one can unmap the range being migrated).
				451	*
				452	* Once the caller is done cleaning up things and updating its page table (if it
				453	* chose to do so, this is not an obligation) it finally calls
				454	* migrate_vma_finalize() to update the CPU page table to point to new pages
				455	* for successfully migrated pages or otherwise restore the CPU page table to
				456	* point to the original source pages.
				457	*/
				458	int migrate_vma_setup(struct migrate_vma *args)
				459	{
				460	long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
				461
				462	args->start &= PAGE_MASK;
				463	args->end &= PAGE_MASK;
				464	if (!args->vma \|\| is_vm_hugetlb_page(args->vma) \|\|
				465	(args->vma->vm_flags & VM_SPECIAL) \|\| vma_is_dax(args->vma))
				466	return -EINVAL;
				467	if (nr_pages <= 0)
				468	return -EINVAL;
				469	if (args->start < args->vma->vm_start \|\|
				470	args->start >= args->vma->vm_end)
				471	return -EINVAL;
				472	if (args->end <= args->vma->vm_start \|\| args->end > args->vma->vm_end)
				473	return -EINVAL;
				474	if (!args->src \|\| !args->dst)
				475	return -EINVAL;
				476
				477	memset(args->src, 0, sizeof(args->src) nr_pages);
				478	args->cpages = 0;
				479	args->npages = 0;
				480
				481	migrate_vma_collect(args);
				482
				483	if (args->cpages)
				484	migrate_vma_unmap(args);
				485
				486	/*
				487	* At this point pages are locked and unmapped, and thus they have
				488	* stable content and can safely be copied to destination memory that
				489	* is allocated by the drivers.
				490	*/
				491	return 0;
				492
				493	}
				494	EXPORT_SYMBOL(migrate_vma_setup);
				495
				496	/*
				497	* This code closely matches the code in:
				498	* __handle_mm_fault()
				499	* handle_pte_fault()
				500	* do_anonymous_page()
				501	* to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
				502	* private page.
				503	*/
				504	static void migrate_vma_insert_page(struct migrate_vma *migrate,
				505	unsigned long addr,
				506	struct page *page,
				507	unsigned long *src)
				508	{
				509	struct vm_area_struct *vma = migrate->vma;
				510	struct mm_struct *mm = vma->vm_mm;
				511	bool flush = false;
				512	spinlock_t *ptl;
				513	pte_t entry;
				514	pgd_t *pgdp;
				515	p4d_t *p4dp;
				516	pud_t *pudp;
				517	pmd_t *pmdp;
				518	pte_t *ptep;
				519
				520	/* Only allow populating anonymous memory */
				521	if (!vma_is_anonymous(vma))
				522	goto abort;
				523
				524	pgdp = pgd_offset(mm, addr);
				525	p4dp = p4d_alloc(mm, pgdp, addr);
				526	if (!p4dp)
				527	goto abort;
				528	pudp = pud_alloc(mm, p4dp, addr);
				529	if (!pudp)
				530	goto abort;
				531	pmdp = pmd_alloc(mm, pudp, addr);
				532	if (!pmdp)
				533	goto abort;
				534
				535	if (pmd_trans_huge(pmdp) \|\| pmd_devmap(pmdp))
				536	goto abort;
				537
				538	/*
				539	* Use pte_alloc() instead of pte_alloc_map(). We can't run
				540	* pte_offset_map() on pmds where a huge pmd might be created
				541	* from a different thread.
				542	*
				543	* pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
				544	* parallel threads are excluded by other means.
				545	*
				546	* Here we only have mmap_read_lock(mm).
				547	*/
				548	if (pte_alloc(mm, pmdp))
				549	goto abort;
				550
				551	/* See the comment in pte_alloc_one_map() */
				552	if (unlikely(pmd_trans_unstable(pmdp)))
				553	goto abort;
				554
				555	if (unlikely(anon_vma_prepare(vma)))
				556	goto abort;
				557	if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
				558	goto abort;
				559
				560	/*
				561	* The memory barrier inside __SetPageUptodate makes sure that
				562	* preceding stores to the page contents become visible before
				563	* the set_pte_at() write.
				564	*/
				565	__SetPageUptodate(page);
				566
				567	if (is_device_private_page(page)) {
				568	swp_entry_t swp_entry;
				569
				570	if (vma->vm_flags & VM_WRITE)
				571	swp_entry = make_writable_device_private_entry(
				572	page_to_pfn(page));
				573	else
				574	swp_entry = make_readable_device_private_entry(
				575	page_to_pfn(page));
				576	entry = swp_entry_to_pte(swp_entry);
				577	} else {
				578	/*
				579	* For now we only support migrating to un-addressable device
				580	* memory.
				581	*/
				582	if (is_zone_device_page(page)) {
				583	pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
				584	goto abort;
				585	}
				586	entry = mk_pte(page, vma->vm_page_prot);
				587	if (vma->vm_flags & VM_WRITE)
				588	entry = pte_mkwrite(pte_mkdirty(entry));
				589	}
				590
				591	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
				592
				593	if (check_stable_address_space(mm))
				594	goto unlock_abort;
				595
				596	if (pte_present(*ptep)) {
				597	unsigned long pfn = pte_pfn(*ptep);
				598
				599	if (!is_zero_pfn(pfn))
				600	goto unlock_abort;
				601	flush = true;
				602	} else if (!pte_none(*ptep))
				603	goto unlock_abort;
				604
				605	/*
				606	* Check for userfaultfd but do not deliver the fault. Instead,
				607	* just back off.
				608	*/
				609	if (userfaultfd_missing(vma))
				610	goto unlock_abort;
				611
				612	inc_mm_counter(mm, MM_ANONPAGES);
				613	page_add_new_anon_rmap(page, vma, addr, false);
				614	if (!is_zone_device_page(page))
				615	lru_cache_add_inactive_or_unevictable(page, vma);
				616	get_page(page);
				617
				618	if (flush) {
				619	flush_cache_page(vma, addr, pte_pfn(*ptep));
				620	ptep_clear_flush_notify(vma, addr, ptep);
				621	set_pte_at_notify(mm, addr, ptep, entry);
				622	update_mmu_cache(vma, addr, ptep);
				623	} else {
				624	/* No need to invalidate - it was non-present before */
				625	set_pte_at(mm, addr, ptep, entry);
				626	update_mmu_cache(vma, addr, ptep);
				627	}
				628
				629	pte_unmap_unlock(ptep, ptl);
				630	*src = MIGRATE_PFN_MIGRATE;
				631	return;
				632
				633	unlock_abort:
				634	pte_unmap_unlock(ptep, ptl);
				635	abort:
				636	*src &= ~MIGRATE_PFN_MIGRATE;
				637	}
				638
				639	/**
				640	* migrate_vma_pages() - migrate meta-data from src page to dst page
				641	* @migrate: migrate struct containing all migration information
				642	*
				643	* This migrates struct page meta-data from source struct page to destination
				644	* struct page. This effectively finishes the migration from source page to the
				645	* destination page.
				646	*/
				647	void migrate_vma_pages(struct migrate_vma *migrate)
				648	{
				649	const unsigned long npages = migrate->npages;
				650	const unsigned long start = migrate->start;
				651	struct mmu_notifier_range range;
				652	unsigned long addr, i;
				653	bool notified = false;
				654
				655	for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
				656	struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
				657	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				658	struct address_space *mapping;
				659	int r;
				660
				661	if (!newpage) {
				662	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				663	continue;
				664	}
				665
				666	if (!page) {
				667	if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
				668	continue;
				669	if (!notified) {
				670	notified = true;
				671
				672	mmu_notifier_range_init_owner(&range,
				673	MMU_NOTIFY_MIGRATE, 0, migrate->vma,
				674	migrate->vma->vm_mm, addr, migrate->end,
				675	migrate->pgmap_owner);
				676	mmu_notifier_invalidate_range_start(&range);
				677	}
				678	migrate_vma_insert_page(migrate, addr, newpage,
				679	&migrate->src[i]);
				680	continue;
				681	}
				682
				683	mapping = page_mapping(page);
				684
				685	if (is_device_private_page(newpage)) {
				686	/*
				687	* For now only support private anonymous when migrating
				688	* to un-addressable device memory.
				689	*/
				690	if (mapping) {
				691	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				692	continue;
				693	}
				694	} else if (is_zone_device_page(newpage)) {
				695	/*
				696	* Other types of ZONE_DEVICE page are not supported.
				697	*/
				698	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				699	continue;
				700	}
				701
				702	r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
				703	if (r != MIGRATEPAGE_SUCCESS)
				704	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				705	}
				706
				707	/*
				708	* No need to double call mmu_notifier->invalidate_range() callback as
				709	* the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
				710	* did already call it.
				711	*/
				712	if (notified)
				713	mmu_notifier_invalidate_range_only_end(&range);
				714	}
				715	EXPORT_SYMBOL(migrate_vma_pages);
				716
				717	/**
				718	* migrate_vma_finalize() - restore CPU page table entry
				719	* @migrate: migrate struct containing all migration information
				720	*
				721	* This replaces the special migration pte entry with either a mapping to the
				722	* new page if migration was successful for that page, or to the original page
				723	* otherwise.
				724	*
				725	* This also unlocks the pages and puts them back on the lru, or drops the extra
				726	* refcount, for device pages.
				727	*/
				728	void migrate_vma_finalize(struct migrate_vma *migrate)
				729	{
				730	const unsigned long npages = migrate->npages;
				731	unsigned long i;
				732
				733	for (i = 0; i < npages; i++) {
Matthew Wilcox (Oracle)	4eecb8b	2022-01-28 23:32:59 -0500	[diff] [blame]	734	struct folio dst, src;
Christoph Hellwig	76cbbea	2022-02-16 15:31:38 +1100	[diff] [blame]	735	struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
				736	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				737
				738	if (!page) {
				739	if (newpage) {
				740	unlock_page(newpage);
				741	put_page(newpage);
				742	}
				743	continue;
				744	}
				745
				746	if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) \|\| !newpage) {
				747	if (newpage) {
				748	unlock_page(newpage);
				749	put_page(newpage);
				750	}
				751	newpage = page;
				752	}
				753
Matthew Wilcox (Oracle)	4eecb8b	2022-01-28 23:32:59 -0500	[diff] [blame]	754	src = page_folio(page);
				755	dst = page_folio(newpage);
				756	remove_migration_ptes(src, dst, false);
				757	folio_unlock(src);
Christoph Hellwig	76cbbea	2022-02-16 15:31:38 +1100	[diff] [blame]	758
				759	if (is_zone_device_page(page))
				760	put_page(page);
				761	else
				762	putback_lru_page(page);
				763
				764	if (newpage != page) {
				765	unlock_page(newpage);
				766	if (is_zone_device_page(newpage))
				767	put_page(newpage);
				768	else
				769	putback_lru_page(newpage);
				770	}
				771	}
				772	}
				773	EXPORT_SYMBOL(migrate_vma_finalize);