// SPDX-License-Identifier: GPL-2.0
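
/*
 * page_pinner: debug facility for tracking pages that could not be migrated,
 * e.g. during alloc_contig allocations, typically because of long-lived
 * references (pins).  The moments a failure is detected, the page is finally
 * released and the page is freed are each captured with a stack trace, a
 * timestamp and a snapshot of the page state, and stored in a ring buffer
 * exposed through /sys/kernel/debug/page_pinner/buffer.
 */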
#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/memblock.h>
#include <linux/stacktrace.h>
#include <linux/page_pinner.h>
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
#include <linux/sched/clock.h>
#include "internal.h"
#define PAGE_PINNER_STACK_DEPTH 16
static unsigned long pp_buf_size = 4096;
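
/* Per-page tracking state, kept in the page's page_ext. */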
struct page_pinner {
depot_stack_handle_t handle;
u64 ts_usec;
atomic_t count;
};
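
/* The kind of event a captured record describes. */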
enum pp_state {
PP_PUT,
PP_FREE,
PP_FAIL_DETECTED,
};
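
/*
 * One slot of the ring buffer: the stack handle and time of the event plus
 * a snapshot of the relevant struct page fields taken when the event was
 * recorded.  ts_usec holds the event time for PP_FREE and PP_FAIL_DETECTED
 * records; elapsed holds the pinned duration for PP_PUT records.
 */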
struct captured_pinner {
depot_stack_handle_t handle;
union {
u64 ts_usec;
u64 elapsed;
};
/* struct page fields */
unsigned long pfn;
int count;
int mapcount;
struct address_space *mapping;
unsigned long flags;
enum pp_state state;
};
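
/*
 * Ring buffer of captured events.  @index is the next slot to be written
 * and wraps at pp_buf_size; @index and the buffer contents are protected
 * by @lock.
 */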
struct page_pinner_buffer {
spinlock_t lock;
unsigned long index;
struct captured_pinner *buffer;
};
/* Ring buffer of records for pinned pages that made alloc_contig fail */
static struct page_pinner_buffer pp_buffer;
static bool page_pinner_enabled;
DEFINE_STATIC_KEY_FALSE(page_pinner_inited);
EXPORT_SYMBOL_GPL(page_pinner_inited);
DEFINE_STATIC_KEY_TRUE(failure_tracking);
static depot_stack_handle_t failure_handle;
static int __init early_page_pinner_param(char *buf)
{
page_pinner_enabled = true;
return 0;
}
early_param("page_pinner", early_page_pinner_param);
static bool need_page_pinner(void)
{
return page_pinner_enabled;
}
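
/*
 * Record one reference stack at init time; save_stack() falls back to this
 * handle when stack_depot_save() fails.
 */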
static noinline void register_failure_stack(void)
{
unsigned long entries[4];
unsigned int nr_entries;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
failure_handle = stack_depot_save(entries, nr_entries, GFP_KERNEL);
}
static void init_page_pinner(void)
{
if (!page_pinner_enabled)
return;
	pp_buffer.buffer = kvmalloc_array(pp_buf_size, sizeof(*pp_buffer.buffer),
					  GFP_KERNEL);
	if (!pp_buffer.buffer) {
		pr_info("page_pinner disabled: failed to allocate buffer\n");
		return;
	}
spin_lock_init(&pp_buffer.lock);
pp_buffer.index = 0;
register_failure_stack();
static_branch_enable(&page_pinner_inited);
}
struct page_ext_operations page_pinner_ops = {
.size = sizeof(struct page_pinner),
.need = need_page_pinner,
.init = init_page_pinner,
};
static inline struct page_pinner *get_page_pinner(struct page_ext *page_ext)
{
return (void *)page_ext + page_pinner_ops.offset;
}
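
/*
 * Save the current stack into the stack depot, skipping the two innermost
 * frames; fall back to the pre-registered failure_handle if that fails.
 */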
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
unsigned long entries[PAGE_PINNER_STACK_DEPTH];
depot_stack_handle_t handle;
unsigned int nr_entries;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
handle = stack_depot_save(entries, nr_entries, flags);
if (!handle)
handle = failure_handle;
return handle;
}
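
/* Snapshot the struct page fields that are reported with each record. */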
static void capture_page_state(struct page *page,
struct captured_pinner *record)
{
record->flags = page->flags;
record->mapping = page_mapping(page);
record->pfn = page_to_pfn(page);
record->count = page_count(page);
record->mapcount = page_mapcount(page);
}
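
/* Append a record to the ring buffer, overwriting the oldest entry once full. */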
static void add_record(struct page_pinner_buffer *pp_buf,
struct captured_pinner *record)
{
unsigned long flags;
unsigned int idx;
spin_lock_irqsave(&pp_buf->lock, flags);
idx = pp_buf->index++;
pp_buf->index %= pp_buf_size;
pp_buf->buffer[idx] = *record;
spin_unlock_irqrestore(&pp_buf->lock, flags);
}
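
/*
 * Called when a page is freed: emit a PP_FREE record for every sub-page
 * that was flagged as a migration failure and clear its tracking state.
 */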
void __free_page_pinner(struct page *page, unsigned int order)
{
struct page_pinner *page_pinner;
struct page_ext *page_ext;
int i;
	/* free_page could be called before the buffer is initialized */
if (!pp_buffer.buffer)
return;
page_ext = page_ext_get(page);
if (unlikely(!page_ext))
return;
	/* Walk every sub-page's page_ext, including the ones that are not flagged. */
	for (i = 0; i < (1 << order); i++, page_ext = page_ext_next(page_ext)) {
		struct captured_pinner record;

		if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags))
			continue;

		page_pinner = get_page_pinner(page_ext);

		record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
		record.ts_usec = (u64)ktime_to_us(ktime_get_boottime());
		record.state = PP_FREE;
		capture_page_state(page, &record);
		add_record(&pp_buffer, &record);

		atomic_set(&page_pinner->count, 0);
		page_pinner->ts_usec = 0;
		clear_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
	}
page_ext_put(page_ext);
}
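
/*
 * Format one captured record (event line, page state line and stack trace)
 * into a kernel buffer and copy it to userspace.
 */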
static ssize_t
print_page_pinner(char __user *buf, size_t count, struct captured_pinner *record)
{
int ret;
unsigned long *entries;
unsigned int nr_entries;
char *kbuf;
count = min_t(size_t, count, PAGE_SIZE);
kbuf = kmalloc(count, GFP_KERNEL);
if (!kbuf)
return -ENOMEM;
if (record->state == PP_PUT) {
ret = snprintf(kbuf, count, "At least, pinned for %llu us\n",
record->elapsed);
} else {
u64 ts_usec = record->ts_usec;
unsigned long rem_usec = do_div(ts_usec, 1000000);
ret = snprintf(kbuf, count,
"%s [%5lu.%06lu]\n",
record->state == PP_FREE ? "Freed at" :
"Failure detected at",
(unsigned long)ts_usec, rem_usec);
}
if (ret >= count)
goto err;
/* Print information relevant to grouping pages by mobility */
ret += snprintf(kbuf + ret, count - ret,
"PFN 0x%lx Block %lu count %d mapcount %d mapping %pS Flags %#lx(%pGp)\n",
record->pfn,
record->pfn >> pageblock_order,
record->count, record->mapcount,
record->mapping,
record->flags, &record->flags);
if (ret >= count)
goto err;
nr_entries = stack_depot_fetch(record->handle, &entries);
ret += stack_trace_snprint(kbuf + ret, count - ret, entries,
nr_entries, 0);
if (ret >= count)
goto err;
ret += snprintf(kbuf + ret, count - ret, "\n");
if (ret >= count)
goto err;
if (copy_to_user(buf, kbuf, ret))
ret = -EFAULT;
kfree(kbuf);
return ret;
err:
kfree(kbuf);
return -ENOMEM;
}
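
/*
 * Record a migration failure for @page: flag it, remember when the failure
 * was first seen and log a PP_FAIL_DETECTED record.
 */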
void __page_pinner_failure_detect(struct page *page)
{
struct page_ext *page_ext;
struct page_pinner *page_pinner;
struct captured_pinner record;
u64 now;
if (!static_branch_unlikely(&failure_tracking))
return;
page_ext = page_ext_get(page);
if (unlikely(!page_ext))
return;
if (test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) {
page_ext_put(page_ext);
return;
}
now = (u64)ktime_to_us(ktime_get_boottime());
page_pinner = get_page_pinner(page_ext);
if (!page_pinner->ts_usec)
page_pinner->ts_usec = now;
set_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
record.ts_usec = now;
record.state = PP_FAIL_DETECTED;
capture_page_state(page, &record);
add_record(&pp_buffer, &record);
page_ext_put(page_ext);
}
EXPORT_SYMBOL_GPL(__page_pinner_failure_detect);
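
/*
 * Called when a page that previously failed migration is finally released:
 * log a PP_PUT record with how long the page stayed pinned since the
 * failure was first detected.
 */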
void __page_pinner_put_page(struct page *page)
{
struct page_ext *page_ext;
struct page_pinner *page_pinner;
struct captured_pinner record;
u64 now, ts_usec;
if (!static_branch_unlikely(&failure_tracking))
return;
page_ext = page_ext_get(page);
if (unlikely(!page_ext))
return;
if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags)) {
page_ext_put(page_ext);
return;
}
page_pinner = get_page_pinner(page_ext);
record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
now = (u64)ktime_to_us(ktime_get_boottime());
ts_usec = page_pinner->ts_usec;
if (now > ts_usec)
record.elapsed = now - ts_usec;
else
record.elapsed = 0;
record.state = PP_PUT;
capture_page_state(page, &record);
add_record(&pp_buffer, &record);
page_ext_put(page_ext);
}
EXPORT_SYMBOL_GPL(__page_pinner_put_page);
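
/*
 * debugfs read handler: each read() call returns one formatted record,
 * newest first.  The file offset counts records, not bytes.
 */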
static ssize_t read_buffer(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
u64 tmp;
loff_t i, idx;
struct captured_pinner record;
unsigned long flags;
if (!static_branch_unlikely(&failure_tracking))
return -EINVAL;
if (*ppos >= pp_buf_size)
return 0;
i = *ppos;
*ppos = i + 1;
	/*
	 * Read the records in reverse order: the newest record is returned
	 * first, followed by progressively older ones.
	 */
	spin_lock_irqsave(&pp_buffer.lock, flags);
	tmp = pp_buffer.index - 1 - i + pp_buf_size;
	idx = do_div(tmp, pp_buf_size);
	record = pp_buffer.buffer[idx];
	spin_unlock_irqrestore(&pp_buffer.lock, flags);
if (!record.handle)
return 0;
return print_page_pinner(buf, count, &record);
}
static const struct file_operations proc_buffer_operations = {
.read = read_buffer,
};
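
/* debugfs knob: enable or disable capturing of new records at runtime. */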
static int failure_tracking_set(void *data, u64 val)
{
bool on;
on = (bool)val;
if (on)
static_branch_enable(&failure_tracking);
else
static_branch_disable(&failure_tracking);
return 0;
}
static int failure_tracking_get(void *data, u64 *val)
{
*val = static_branch_unlikely(&failure_tracking);
return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(failure_tracking_fops,
failure_tracking_get,
failure_tracking_set, "%llu\n");
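
/*
 * Resize the ring buffer: allocate a new buffer, swap it in under the lock
 * and discard all previously captured records.
 */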
static int buffer_size_set(void *data, u64 val)
{
unsigned long flags;
struct captured_pinner *new, *old;
new = kvmalloc_array(val, sizeof(*new), GFP_KERNEL);
if (!new)
return -ENOMEM;
spin_lock_irqsave(&pp_buffer.lock, flags);
old = pp_buffer.buffer;
pp_buffer.buffer = new;
pp_buffer.index = 0;
pp_buf_size = val;
spin_unlock_irqrestore(&pp_buffer.lock, flags);
kvfree(old);
return 0;
}
static int buffer_size_get(void *data, u64 *val)
{
*val = pp_buf_size;
return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(buffer_size_fops,
buffer_size_get,
buffer_size_set, "%llu\n");
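
/* Create the page_pinner debugfs directory and its files. */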
static int __init page_pinner_init(void)
{
struct dentry *pp_debugfs_root;
if (!static_branch_unlikely(&page_pinner_inited))
return 0;
pr_info("page_pinner enabled\n");
pp_debugfs_root = debugfs_create_dir("page_pinner", NULL);
debugfs_create_file("buffer", 0444,
pp_debugfs_root, NULL,
&proc_buffer_operations);
debugfs_create_file("failure_tracking", 0644,
pp_debugfs_root, NULL,
&failure_tracking_fops);
debugfs_create_file("buffer_size", 0644,
pp_debugfs_root, NULL,
&buffer_size_fops);
return 0;
}
late_initcall(page_pinner_init)