blob: ebdf1077d2d35c0cbf278bbdd5a88795c0d74427 [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0
/*
* Page Size Migration
*
* This file contains the core logic of mitigations to ensure
* app compatibility during the transition from 4kB to 16kB
* page size in Android.
*
* Copyright (c) 2024, Google LLC.
* Author: Kalesh Singh <kaleshsingh@goole.com>
*/
#include <linux/pgsize_migration.h>
#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/kobject.h>
#include <linux/kstrtox.h>
#include <linux/sched/task_stack.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/sysfs.h>
typedef void (*show_pad_maps_fn) (struct seq_file *m, struct vm_area_struct *vma);
typedef int (*show_pad_smaps_fn) (struct seq_file *m, void *v);
#ifdef CONFIG_64BIT
#if PAGE_SIZE == SZ_4K
DEFINE_STATIC_KEY_TRUE(pgsize_migration_enabled);
#define is_pgsize_migration_enabled() (static_branch_likely(&pgsize_migration_enabled))
#else /* PAGE_SIZE != SZ_4K */
DEFINE_STATIC_KEY_FALSE(pgsize_migration_enabled);
#define is_pgsize_migration_enabled() (static_branch_unlikely(&pgsize_migration_enabled))
#endif /* PAGE_SIZE == SZ_4K */
static ssize_t show_pgsize_migration_enabled(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
if (is_pgsize_migration_enabled())
return sprintf(buf, "%d\n", 1);
else
return sprintf(buf, "%d\n", 0);
}
static ssize_t store_pgsize_migration_enabled(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t n)
{
unsigned long val;
/* Migration is only applicable to 4kB kernels */
if (PAGE_SIZE != SZ_4K)
return n;
if (kstrtoul(buf, 10, &val))
return -EINVAL;
if (val > 1)
return -EINVAL;
if (val == 1)
static_branch_enable(&pgsize_migration_enabled);
else if (val == 0)
static_branch_disable(&pgsize_migration_enabled);
return n;
}
static struct kobj_attribute pgsize_migration_enabled_attr = __ATTR(
enabled,
0644,
show_pgsize_migration_enabled,
store_pgsize_migration_enabled
);
static struct attribute *pgsize_migration_attrs[] = {
&pgsize_migration_enabled_attr.attr,
NULL
};
static struct attribute_group pgsize_migration_attr_group = {
.name = "pgsize_migration",
.attrs = pgsize_migration_attrs,
};
/**
* What: /sys/kernel/mm/pgsize_migration/enabled
* Date: April 2024
* KernelVersion: v5.4+ (GKI kernels)
* Contact: Kalesh Singh <kaleshsingh@google.com>
* Description: /sys/kernel/mm/pgsize_migration/enabled
* allows for userspace to turn on or off page size
* migration mitigations necessary for app compatibility
* during Android's transition from 4kB to 16kB page size.
* Such mitigations include preserving /proc/<pid>/[s]maps
* output as if there was no segment extension by the
* dynamic loader; and preventing fault around in the padding
* sections of ELF LOAD segment mappings.
* Users: Bionic's dynamic linker
*/
static int __init init_pgsize_migration(void)
{
if (sysfs_create_group(mm_kobj, &pgsize_migration_attr_group))
pr_err("pgsize_migration: failed to create sysfs group\n");
return 0;
};
late_initcall(init_pgsize_migration);
#if PAGE_SIZE == SZ_4K
void vma_set_pad_pages(struct vm_area_struct *vma,
unsigned long nr_pages)
{
if (!is_pgsize_migration_enabled())
return;
/*
* Usually to modify vm_flags we need to take exclusive mmap_lock but here
* only have the lock in read mode, to avoid all DONTNEED/DONTNEED_LOCKED
* calls needing the write lock.
*
* A race to the flags update can only happen with another MADV_DONTNEED on
* the same process and same range (VMA).
*
* In practice, this specific scenario is not possible because the action that
* could cause it is usually performed at most once per VMA and only by the
* dynamic linker.
*
* Forego protection for this case, to avoid penalties in the common cases.
*/
__vm_flags_mod(vma, 0, VM_PAD_MASK);
__vm_flags_mod(vma, nr_pages << VM_PAD_SHIFT, 0);
}
unsigned long vma_pad_pages(struct vm_area_struct *vma)
{
if (!is_pgsize_migration_enabled())
return 0;
return vma->vm_flags >> VM_PAD_SHIFT;
}
static __always_inline bool str_has_suffix(const char *str, const char *suffix)
{
size_t str_len = strlen(str);
size_t suffix_len = strlen(suffix);
if (str_len < suffix_len)
return false;
return !strncmp(str + str_len - suffix_len, suffix, suffix_len);
}
/*
* The dynamic linker, or interpreter, operates within the process context
* of the binary that necessitated dynamic linking.
*
* Consequently, process context identifiers; like PID, comm, ...; cannot
* be used to differentiate whether the execution context belongs to the
* dynamic linker or not.
*
* linker_ctx() deduces whether execution is currently in the dynamic linker's
* context by correlating the current userspace instruction pointer with the
* VMAs of the current task.
*
* Returns true if in linker context, otherwise false.
*
* Caller must hold mmap lock in read mode.
*/
static inline bool linker_ctx(void)
{
struct pt_regs *regs = task_pt_regs(current);
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct file *file;
if (!regs)
return false;
vma = find_vma(mm, instruction_pointer(regs));
/* Current execution context, the VMA must be present */
BUG_ON(!vma);
file = vma->vm_file;
if (!file)
return false;
if ((vma->vm_flags & VM_EXEC)) {
char buf[64];
const int bufsize = sizeof(buf);
char *path;
memset(buf, 0, bufsize);
path = d_path(&file->f_path, buf, bufsize);
/*
* Depending on interpreter requested, valid paths could be any of:
* 1. /system/bin/bootstrap/linker64
* 2. /system/bin/linker64
* 3. /apex/com.android.runtime/bin/linker64
*
* Check the base name (linker64).
*/
if (!strcmp(kbasename(path), "linker64"))
return true;
}
return false;
}
/*
* Saves the number of padding pages for an ELF segment mapping
* in vm_flags.
*
* The number of padding pages is deduced from the madvise DONTNEED range [start, end)
* if the following conditions are met:
* 1) The range is enclosed by a single VMA
* 2) The range ends at the end address of the VMA
* 3) The range starts at an address greater than the start address of the VMA
* 4) The number of the pages in the range does not exceed VM_TOTAL_PAD_PAGES.
* 5) The VMA is a file backed VMA.
* 6) The file backing the VMA is a shared library (*.so)
* 7) The madvise was requested by bionic's dynamic linker.
*/
void madvise_vma_pad_pages(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
unsigned long nr_pad_pages;
if (!is_pgsize_migration_enabled())
return;
/*
* If the madvise range is it at the end of the file save the number of
* pages in vm_flags (only need 4 bits are needed for up to 64kB aligned ELFs).
*/
if (start <= vma->vm_start || end != vma->vm_end)
return;
nr_pad_pages = (end - start) >> PAGE_SHIFT;
if (!nr_pad_pages || nr_pad_pages > VM_TOTAL_PAD_PAGES)
return;
/* Only handle this for file backed VMAs */
if (!vma->vm_file)
return;
/* Limit this to only shared libraries (*.so) */
if (!str_has_suffix(vma->vm_file->f_path.dentry->d_name.name, ".so"))
return;
/* Only bionic's dynamic linker needs to hint padding pages. */
if (!linker_ctx())
return;
vma_set_pad_pages(vma, nr_pad_pages);
}
static const char *pad_vma_name(struct vm_area_struct *vma)
{
return "[page size compat]";
}
static const struct vm_operations_struct pad_vma_ops = {
.name = pad_vma_name,
};
/*
* Returns a new VMA representing the padding in @vma, if no padding
* in @vma returns NULL.
*/
struct vm_area_struct *get_pad_vma(struct vm_area_struct *vma)
{
struct vm_area_struct *pad;
if (!is_pgsize_migration_enabled() || !(vma->vm_flags & VM_PAD_MASK))
return NULL;
pad = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
memcpy(pad, vma, sizeof(struct vm_area_struct));
/* Remove file */
pad->vm_file = NULL;
/* Add vm_ops->name */
pad->vm_ops = &pad_vma_ops;
/* Adjust the start to begin at the start of the padding section */
pad->vm_start = VMA_PAD_START(pad);
/* Make the pad vma PROT_NONE */
vm_flags_clear(pad, VM_READ|VM_WRITE|VM_EXEC);
/* Remove padding bits */
vm_flags_clear(pad, VM_PAD_MASK);
return pad;
}
/*
* Returns a new VMA exclusing the padding from @vma; if no padding in
* @vma returns @vma.
*/
struct vm_area_struct *get_data_vma(struct vm_area_struct *vma)
{
struct vm_area_struct *data;
if (!is_pgsize_migration_enabled() || !(vma->vm_flags & VM_PAD_MASK))
return vma;
data = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
memcpy(data, vma, sizeof(struct vm_area_struct));
/* Adjust the end to the start of the padding section */
data->vm_end = VMA_PAD_START(data);
return data;
}
/*
* Calls the show_pad_vma_fn on the @pad VMA, and frees the copies of @vma
* and @pad.
*/
void show_map_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *pad,
struct seq_file *m, void *func, bool smaps)
{
if (!pad)
return;
/*
* This cannot happen. If @pad vma was allocated the corresponding
* @vma should have the VM_PAD_MASK bit(s) set.
*/
BUG_ON(!(vma->vm_flags & VM_PAD_MASK));
/*
* This cannot happen. @pad is a section of the original VMA.
* Therefore @vma cannot be null if @pad is not null.
*/
BUG_ON(!vma);
if (smaps)
((show_pad_smaps_fn)func)(m, pad);
else
((show_pad_maps_fn)func)(m, pad);
kfree(pad);
kfree(vma);
}
/*
* When splitting a padding VMA there are a couple of cases to handle.
*
* Given:
*
* | DDDDPPPP |
*
* where:
* - D represents 1 page of data;
* - P represents 1 page of padding;
* - | represents the boundaries (start/end) of the VMA
*
*
* 1) Split exactly at the padding boundary
*
* | DDDDPPPP | --> | DDDD | PPPP |
*
* - Remove padding flags from the first VMA.
* - The second VMA is all padding
*
* 2) Split within the padding area
*
* | DDDDPPPP | --> | DDDDPP | PP |
*
* - Subtract the length of the second VMA from the first VMA's padding.
* - The second VMA is all padding, adjust its padding length (flags)
*
* 3) Split within the data area
*
* | DDDDPPPP | --> | DD | DDPPPP |
*
* - Remove padding flags from the first VMA.
* - The second VMA is has the same padding as from before the split.
*/
void split_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *new,
unsigned long addr, int new_below)
{
unsigned long nr_pad_pages = vma_pad_pages(vma);
unsigned long nr_vma2_pages;
struct vm_area_struct *first;
struct vm_area_struct *second;
if (!nr_pad_pages)
return;
if (new_below) {
first = new;
second = vma;
} else {
first = vma;
second = new;
}
nr_vma2_pages = vma_pages(second);
if (nr_vma2_pages >= nr_pad_pages) { /* Case 1 & 3*/
vm_flags_clear(first, VM_PAD_MASK);
vma_set_pad_pages(second, nr_pad_pages);
} else { /* Case 2 */
vma_set_pad_pages(first, nr_pad_pages - nr_vma2_pages);
vma_set_pad_pages(second, nr_vma2_pages);
}
}
#endif /* PAGE_SIZE == SZ_4K */
#endif /* CONFIG_64BIT */