| /* |
| * zbud.c - Compression buddies allocator |
| * |
| * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp. |
| * |
| * Compression buddies ("zbud") provides for efficiently packing two |
| * (or, possibly in the future, more) compressed pages ("zpages") into |
| * a single "raw" pageframe and for tracking both zpages and pageframes |
| * so that whole pageframes can be easily reclaimed in LRU-like order. |
| * It is designed to be used in conjunction with transcendent memory |
| * ("tmem"); for example separate LRU lists are maintained for persistent |
| * vs. ephemeral pages. |
| * |
| * A zbudpage is an overlay for a struct page and thus each zbudpage |
| * refers to a physical pageframe of RAM. When the caller passes a |
| * struct page from the kernel's page allocator, zbud "transforms" it |
| * to a zbudpage which sets/uses a different set of fields than the |
| * struct-page and thus must "untransform" it back by reinitializing |
| * certain fields before the struct-page can be freed. The fields |
| * of a zbudpage include a page lock for controlling access to the |
| * corresponding pageframe, and there is a size field for each zpage. |
| * Each zbudpage also lives on two linked lists: a "budlist" which is |
| * used to support efficient buddying of zpages; and an "lru" which |
| * is used for reclaiming pageframes in approximately least-recently-used |
| * order. |
| * |
| * A zbudpageframe is a pageframe divided up into aligned 64-byte "chunks" |
| * which contain the compressed data for zero, one, or two zbuds. Contained |
| * with the compressed data is a tmem_handle which is a key to allow |
| * the same data to be found via the tmem interface so the zpage can |
| * be invalidated (for ephemeral pages) or repatriated to the swap cache |
| * (for persistent pages). The contents of a zbudpageframe must never |
| * be accessed without holding the page lock for the corresponding |
| * zbudpage and, to accommodate highmem machines, the contents may |
| * only be examined or changed when kmapped. Thus, when in use, a |
| * kmapped zbudpageframe is referred to in the zbud code as "void *zbpg". |
| * |
| * Note that the term "zbud" refers to the combination of a zpage and |
| * a tmem_handle that is stored as one of possibly two "buddied" zpages; |
| * it also generically refers to this allocator... sorry for any confusion. |
| * |
| * A zbudref is a pointer to a struct zbudpage (which can be cast to a |
| * struct page), with the LSB either cleared or set to indicate, respectively, |
| * the first or second zpage in the zbudpageframe. Since a zbudref can be |
| * cast to a pointer, it is used as the tmem "pampd" pointer and uniquely |
| * references a stored tmem page and so is the only zbud data structure |
| * externally visible to zbud.c/zbud.h. |
| * |
| * Since we wish to reclaim entire pageframes but zpages may be randomly |
| * added and deleted to any given pageframe, we approximate LRU by |
| * promoting a pageframe to MRU when a zpage is added to it, but |
| * leaving it at the current place in the list when a zpage is deleted |
| * from it. As a side effect, zpages that are difficult to buddy (e.g. |
| * very large pages) will be reclaimed faster than average, which seems |
| * reasonable. |
| * |
| * In the current implementation, no more than two zpages may be stored in |
| * any pageframe and no zpage ever crosses a pageframe boundary. While |
| * other zpage allocation mechanisms may allow greater density, this two |
| * zpage-per-pageframe limit both ensures simple reclaim of pageframes |
| * (including garbage collection of references to the contents of those |
| * pageframes from tmem data structures) AND avoids the need for compaction. |
| * With additional complexity, zbud could be modified to support storing |
| * up to three zpages per pageframe or, to handle larger average zpages, |
| * up to three zpages per pair of pageframes, but it is not clear if the |
| * additional complexity would be worth it. So consider it an exercise |
| * for future developers. |
| * |
| * Note also that zbud does no page allocation or freeing. This is so |
| * that the caller has complete control over and, for accounting, visibility |
| * into if/when pages are allocated and freed. |
| * |
| * Finally, note that zbud limits the size of zpages it can store; the |
| * caller must check the zpage size with zbud_max_buddy_size before |
| * storing it, else BUGs will result. User beware. |
| */ |
| |
| #include <linux/module.h> |
| #include <linux/highmem.h> |
| #include <linux/list.h> |
| #include <linux/spinlock.h> |
| #include <linux/pagemap.h> |
| #include <linux/atomic.h> |
| #include <linux/bug.h> |
| #include "tmem.h" |
| #include "zcache.h" |
| #include "zbud.h" |
| |
| /* |
| * We need to ensure that a struct zbudpage is never larger than a |
| * struct page. This is checked with a BUG_ON in zbud_init. |
| * |
| * The unevictable field indicates that a zbud is being added to the |
| * zbudpage. Since this is a two-phase process (due to tmem locking), |
| * this field locks the zbudpage against eviction when a zbud match |
| * or creation is in process. Since this addition process may occur |
| * in parallel for two zbuds in one zbudpage, the field is a counter |
| * that must not exceed two. |
| */ |
| struct zbudpage { |
| union { |
| struct page page; |
| struct { |
| unsigned long space_for_flags; |
| struct { |
| unsigned zbud0_size:PAGE_SHIFT; |
| unsigned zbud1_size:PAGE_SHIFT; |
| unsigned unevictable:2; |
| }; |
| struct list_head budlist; |
| struct list_head lru; |
| }; |
| }; |
| }; |
| #if (PAGE_SHIFT * 2) + 2 > BITS_PER_LONG |
| #error "zbud won't work for this arch, PAGE_SIZE is too large" |
| #endif |
| |
/*
 * A zbudref is a zbudpage pointer whose least-significant bit selects
 * the first (0) or second (1) zbud in that zbudpage.  The union allows
 * the same word to be viewed either as a pointer or as an integer.
 */
struct zbudref {
	union {
		struct zbudpage *zbudpage;
		unsigned long zbudref;
	};
};
| |
/* a pageframe is carved into aligned chunks of (1 << CHUNK_SHIFT) bytes */
#define CHUNK_SHIFT	6
#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
/* masks a byte count down to a whole number of chunks */
#define CHUNK_MASK	(~(CHUNK_SIZE - 1))
/* number of chunks in one pageframe */
#define NCHUNKS		(PAGE_SIZE >> CHUNK_SHIFT)
/* largest number of chunks a single zbud may occupy */
#define MAX_CHUNK	(NCHUNKS - 1)
| |
| /* |
| * The following functions deal with the difference between struct |
| * page and struct zbudpage. Note the hack of using the pageflags |
| * from struct page; this is to avoid duplicating all the complex |
| * pageflag macros. |
| */ |
| static inline void zbudpage_spin_lock(struct zbudpage *zbudpage) |
| { |
| struct page *page = (struct page *)zbudpage; |
| |
| while (unlikely(test_and_set_bit_lock(PG_locked, &page->flags))) { |
| do { |
| cpu_relax(); |
| } while (test_bit(PG_locked, &page->flags)); |
| } |
| } |
| |
| static inline void zbudpage_spin_unlock(struct zbudpage *zbudpage) |
| { |
| struct page *page = (struct page *)zbudpage; |
| |
| clear_bit(PG_locked, &page->flags); |
| } |
| |
/*
 * Single, non-spinning attempt to take the zbudpage lock.  Returns
 * whatever trylock_page() returns for the underlying struct page;
 * callers treat non-zero as "lock acquired".
 */
static inline int zbudpage_spin_trylock(struct zbudpage *zbudpage)
{
	struct page *pg = (struct page *)zbudpage;

	return trylock_page(pg);
}
| |
/* Report whether the underlying page currently has its lock bit set. */
static inline int zbudpage_is_locked(struct zbudpage *zbudpage)
{
	struct page *pg = (struct page *)zbudpage;

	return PageLocked(pg);
}
| |
/*
 * Atomically kmap the pageframe behind a zbudpage and return its kernel
 * address (the "zbpg").  Must be paired with kunmap_zbudpage_atomic().
 */
static inline void *kmap_zbudpage_atomic(struct zbudpage *zbudpage)
{
	struct page *pg = (struct page *)zbudpage;

	return kmap_atomic(pg);
}
| |
| /* |
| * A dying zbudpage is an ephemeral page in the process of being evicted. |
| * Any data contained in the zbudpage is invalid and we are just waiting for |
| * the tmem pampds to be invalidated before freeing the page |
| */ |
| static inline int zbudpage_is_dying(struct zbudpage *zbudpage) |
| { |
| struct page *page = (struct page *)zbudpage; |
| |
| return test_bit(PG_reclaim, &page->flags); |
| } |
| |
| static inline void zbudpage_set_dying(struct zbudpage *zbudpage) |
| { |
| struct page *page = (struct page *)zbudpage; |
| |
| set_bit(PG_reclaim, &page->flags); |
| } |
| |
| static inline void zbudpage_clear_dying(struct zbudpage *zbudpage) |
| { |
| struct page *page = (struct page *)zbudpage; |
| |
| clear_bit(PG_reclaim, &page->flags); |
| } |
| |
| /* |
| * A zombie zbudpage is a persistent page in the process of being evicted. |
| * The data contained in the zbudpage is valid and we are just waiting for |
| * the tmem pampds to be invalidated before freeing the page |
| */ |
| static inline int zbudpage_is_zombie(struct zbudpage *zbudpage) |
| { |
| struct page *page = (struct page *)zbudpage; |
| |
| return test_bit(PG_dirty, &page->flags); |
| } |
| |
| static inline void zbudpage_set_zombie(struct zbudpage *zbudpage) |
| { |
| struct page *page = (struct page *)zbudpage; |
| |
| set_bit(PG_dirty, &page->flags); |
| } |
| |
| static inline void zbudpage_clear_zombie(struct zbudpage *zbudpage) |
| { |
| struct page *page = (struct page *)zbudpage; |
| |
| clear_bit(PG_dirty, &page->flags); |
| } |
| |
/*
 * Undo kmap_zbudpage_atomic(); zbpg is the address that call returned.
 */
static inline void kunmap_zbudpage_atomic(void *zbpg)
{
	kunmap_atomic(zbpg);
}
| |
| /* |
| * zbud "translation" and helper functions |
| */ |
| |
/*
 * Recover the zbudpage pointer from a zbudref by masking off the
 * low-order bit, which holds the bud number.
 */
static inline struct zbudpage *zbudref_to_zbudpage(struct zbudref *zref)
{
	return (struct zbudpage *)((unsigned long)zref & ~1UL);
}
| |
/*
 * Build a zbudref from a zbudpage pointer and a bud number by storing
 * the bud number (which must be 0 or 1) in the pointer's low-order bit.
 */
static inline struct zbudref *zbudpage_to_zbudref(struct zbudpage *zbudpage,
						  unsigned budnum)
{
	BUG_ON(budnum > 1);
	return (struct zbudref *)((unsigned long)zbudpage | budnum);
}
| |
/* Extract the bud number (0 or 1) held in the low-order bit of a zbudref. */
static inline int zbudref_budnum(struct zbudref *zbudref)
{
	return (unsigned long)zbudref & 1UL;
}
| |
| static inline unsigned zbud_max_size(void) |
| { |
| return MAX_CHUNK << CHUNK_SHIFT; |
| } |
| |
| static inline unsigned zbud_size_to_chunks(unsigned size) |
| { |
| BUG_ON(size == 0 || size > zbud_max_size()); |
| return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; |
| } |
| |
| /* can only be used between kmap_zbudpage_atomic/kunmap_zbudpage_atomic! */ |
| static inline char *zbud_data(void *zbpg, |
| unsigned budnum, unsigned size) |
| { |
| char *p; |
| |
| BUG_ON(size == 0 || size > zbud_max_size()); |
| p = (char *)zbpg; |
| if (budnum == 1) |
| p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK); |
| return p; |
| } |
| |
| /* |
| * These are all informative and exposed through debugfs... except for |
| * the arrays... anyone know how to do that? To avoid confusion for |
| * debugfs viewers, some of these should also be atomic_long_t, but |
| * I don't know how to expose atomics via debugfs either... |
| */ |
| static ssize_t zbud_eph_pageframes; |
| static ssize_t zbud_pers_pageframes; |
| static ssize_t zbud_eph_zpages; |
| static ssize_t zbud_pers_zpages; |
| static u64 zbud_eph_zbytes; |
| static u64 zbud_pers_zbytes; |
| static ssize_t zbud_eph_evicted_pageframes; |
| static ssize_t zbud_pers_evicted_pageframes; |
| static ssize_t zbud_eph_cumul_zpages; |
| static ssize_t zbud_pers_cumul_zpages; |
| static u64 zbud_eph_cumul_zbytes; |
| static u64 zbud_pers_cumul_zbytes; |
| static ssize_t zbud_eph_cumul_chunk_counts[NCHUNKS]; |
| static ssize_t zbud_pers_cumul_chunk_counts[NCHUNKS]; |
| static ssize_t zbud_eph_buddied_count; |
| static ssize_t zbud_pers_buddied_count; |
| static ssize_t zbud_eph_unbuddied_count; |
| static ssize_t zbud_pers_unbuddied_count; |
| static ssize_t zbud_eph_zombie_count; |
| static ssize_t zbud_pers_zombie_count; |
| static atomic_t zbud_eph_zombie_atomic; |
| static atomic_t zbud_pers_zombie_atomic; |
| |
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#define zdfs debugfs_create_size_t
#define zdfs64 debugfs_create_u64
/*
 * Create the "zbud" debugfs directory and one read-only file per scalar
 * statistic.  Returns 0 on success or -ENXIO if the directory could not
 * be created.  Failures of individual files are ignored.
 */
static int zbud_debugfs_init(void)
{
	struct dentry *root = debugfs_create_dir("zbud", NULL);
	if (root == NULL)
		return -ENXIO;

	/*
	 * would be nice to dump the sizes of the unbuddied
	 * arrays, like was done with sysfs, but it doesn't
	 * look like debugfs is flexible enough to do that
	 */
	zdfs64("eph_zbytes", S_IRUGO, root, &zbud_eph_zbytes);
	zdfs64("eph_cumul_zbytes", S_IRUGO, root, &zbud_eph_cumul_zbytes);
	zdfs64("pers_zbytes", S_IRUGO, root, &zbud_pers_zbytes);
	zdfs64("pers_cumul_zbytes", S_IRUGO, root, &zbud_pers_cumul_zbytes);
	zdfs("eph_cumul_zpages", S_IRUGO, root, &zbud_eph_cumul_zpages);
	zdfs("eph_evicted_pageframes", S_IRUGO, root,
				&zbud_eph_evicted_pageframes);
	zdfs("eph_zpages", S_IRUGO, root, &zbud_eph_zpages);
	zdfs("eph_pageframes", S_IRUGO, root, &zbud_eph_pageframes);
	zdfs("eph_buddied_count", S_IRUGO, root, &zbud_eph_buddied_count);
	zdfs("eph_unbuddied_count", S_IRUGO, root, &zbud_eph_unbuddied_count);
	/* the ephemeral zombie count is tracked too, so expose it as well */
	zdfs("eph_zombie_count", S_IRUGO, root, &zbud_eph_zombie_count);
	zdfs("pers_cumul_zpages", S_IRUGO, root, &zbud_pers_cumul_zpages);
	zdfs("pers_evicted_pageframes", S_IRUGO, root,
				&zbud_pers_evicted_pageframes);
	zdfs("pers_zpages", S_IRUGO, root, &zbud_pers_zpages);
	zdfs("pers_pageframes", S_IRUGO, root, &zbud_pers_pageframes);
	zdfs("pers_buddied_count", S_IRUGO, root, &zbud_pers_buddied_count);
	zdfs("pers_unbuddied_count", S_IRUGO, root, &zbud_pers_unbuddied_count);
	zdfs("pers_zombie_count", S_IRUGO, root, &zbud_pers_zombie_count);
	return 0;
}
#undef zdfs
#undef zdfs64
#else
/* debugfs not configured: nothing to create */
static inline int zbud_debugfs_init(void)
{
	return 0;
}
#endif
| |
/*
 * One lock per page class (ephemeral, persistent); each protects that
 * class's buddied list and all of its unbuddied lists.
 */
static DEFINE_SPINLOCK(zbud_eph_lists_lock);
static DEFINE_SPINLOCK(zbud_pers_lists_lock);
| |
| struct zbud_unbuddied { |
| struct list_head list; |
| unsigned count; |
| }; |
| |
| /* list N contains pages with N chunks USED and NCHUNKS-N unused */ |
| /* element 0 is never used but optimizing that isn't worth it */ |
| static struct zbud_unbuddied zbud_eph_unbuddied[NCHUNKS]; |
| static struct zbud_unbuddied zbud_pers_unbuddied[NCHUNKS]; |
| static LIST_HEAD(zbud_eph_lru_list); |
| static LIST_HEAD(zbud_pers_lru_list); |
| static LIST_HEAD(zbud_eph_buddied_list); |
| static LIST_HEAD(zbud_pers_buddied_list); |
| static LIST_HEAD(zbud_eph_zombie_list); |
| static LIST_HEAD(zbud_pers_zombie_list); |
| |
| /* |
| * Given a struct page, transform it to a zbudpage so that it can be |
| * used by zbud and initialize fields as necessary. |
| */ |
| static inline struct zbudpage *zbud_init_zbudpage(struct page *page, bool eph) |
| { |
| struct zbudpage *zbudpage = (struct zbudpage *)page; |
| |
| BUG_ON(page == NULL); |
| INIT_LIST_HEAD(&zbudpage->budlist); |
| INIT_LIST_HEAD(&zbudpage->lru); |
| zbudpage->zbud0_size = 0; |
| zbudpage->zbud1_size = 0; |
| zbudpage->unevictable = 0; |
| if (eph) |
| zbud_eph_pageframes++; |
| else |
| zbud_pers_pageframes++; |
| return zbudpage; |
| } |
| |
| /* "Transform" a zbudpage back to a struct page suitable to free. */ |
| static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage, |
| bool eph) |
| { |
| struct page *page = (struct page *)zbudpage; |
| |
| BUG_ON(!list_empty(&zbudpage->budlist)); |
| BUG_ON(!list_empty(&zbudpage->lru)); |
| BUG_ON(zbudpage->zbud0_size != 0); |
| BUG_ON(zbudpage->zbud1_size != 0); |
| BUG_ON(!PageLocked(page)); |
| BUG_ON(zbudpage->unevictable != 0); |
| BUG_ON(zbudpage_is_dying(zbudpage)); |
| BUG_ON(zbudpage_is_zombie(zbudpage)); |
| if (eph) |
| zbud_eph_pageframes--; |
| else |
| zbud_pers_pageframes--; |
| zbudpage_spin_unlock(zbudpage); |
| page_mapcount_reset(page); |
| init_page_count(page); |
| page->index = 0; |
| return page; |
| } |
| |
| /* Mark a zbud as unused and do accounting */ |
| static inline void zbud_unuse_zbud(struct zbudpage *zbudpage, |
| int budnum, bool eph) |
| { |
| unsigned size; |
| |
| BUG_ON(!zbudpage_is_locked(zbudpage)); |
| if (budnum == 0) { |
| size = zbudpage->zbud0_size; |
| zbudpage->zbud0_size = 0; |
| } else { |
| size = zbudpage->zbud1_size; |
| zbudpage->zbud1_size = 0; |
| } |
| if (eph) { |
| zbud_eph_zbytes -= size; |
| zbud_eph_zpages--; |
| } else { |
| zbud_pers_zbytes -= size; |
| zbud_pers_zpages--; |
| } |
| } |
| |
| /* |
| * Given a zbudpage/budnum/size, a tmem handle, and a kmapped pointer |
| * to some data, set up the zbud appropriately including data copying |
| * and accounting. Note that if cdata is NULL, the data copying is |
| * skipped. (This is useful for lazy writes such as for RAMster.) |
| */ |
| static void zbud_init_zbud(struct zbudpage *zbudpage, struct tmem_handle *th, |
| bool eph, void *cdata, |
| unsigned budnum, unsigned size) |
| { |
| char *to; |
| void *zbpg; |
| struct tmem_handle *to_th; |
| unsigned nchunks = zbud_size_to_chunks(size); |
| |
| BUG_ON(!zbudpage_is_locked(zbudpage)); |
| zbpg = kmap_zbudpage_atomic(zbudpage); |
| to = zbud_data(zbpg, budnum, size); |
| to_th = (struct tmem_handle *)to; |
| to_th->index = th->index; |
| to_th->oid = th->oid; |
| to_th->pool_id = th->pool_id; |
| to_th->client_id = th->client_id; |
| to += sizeof(struct tmem_handle); |
| if (cdata != NULL) |
| memcpy(to, cdata, size - sizeof(struct tmem_handle)); |
| kunmap_zbudpage_atomic(zbpg); |
| if (budnum == 0) |
| zbudpage->zbud0_size = size; |
| else |
| zbudpage->zbud1_size = size; |
| if (eph) { |
| zbud_eph_cumul_chunk_counts[nchunks]++; |
| zbud_eph_zpages++; |
| zbud_eph_cumul_zpages++; |
| zbud_eph_zbytes += size; |
| zbud_eph_cumul_zbytes += size; |
| } else { |
| zbud_pers_cumul_chunk_counts[nchunks]++; |
| zbud_pers_zpages++; |
| zbud_pers_cumul_zpages++; |
| zbud_pers_zbytes += size; |
| zbud_pers_cumul_zbytes += size; |
| } |
| } |
| |
| /* |
| * Given a locked dying zbudpage, read out the tmem handles from the data, |
| * unlock the page, then use the handles to tell tmem to flush out its |
| * references |
| */ |
| static void zbud_evict_tmem(struct zbudpage *zbudpage) |
| { |
| int i, j; |
| uint32_t pool_id[2], client_id[2]; |
| uint32_t index[2]; |
| struct tmem_oid oid[2]; |
| struct tmem_pool *pool; |
| void *zbpg; |
| struct tmem_handle *th; |
| unsigned size; |
| |
| /* read out the tmem handles from the data and set aside */ |
| zbpg = kmap_zbudpage_atomic(zbudpage); |
| for (i = 0, j = 0; i < 2; i++) { |
| size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size; |
| if (size) { |
| th = (struct tmem_handle *)zbud_data(zbpg, i, size); |
| client_id[j] = th->client_id; |
| pool_id[j] = th->pool_id; |
| oid[j] = th->oid; |
| index[j] = th->index; |
| j++; |
| zbud_unuse_zbud(zbudpage, i, true); |
| } |
| } |
| kunmap_zbudpage_atomic(zbpg); |
| zbudpage_spin_unlock(zbudpage); |
| /* zbudpage is now an unlocked dying... tell tmem to flush pointers */ |
| for (i = 0; i < j; i++) { |
| pool = zcache_get_pool_by_id(client_id[i], pool_id[i]); |
| if (pool != NULL) { |
| tmem_flush_page(pool, &oid[i], index[i]); |
| zcache_put_pool(pool); |
| } |
| } |
| } |
| |
| /* |
| * Externally callable zbud handling routines. |
| */ |
| |
| /* |
| * Return the maximum size compressed page that can be stored (secretly |
| * setting aside space for the tmem handle. |
| */ |
| unsigned int zbud_max_buddy_size(void) |
| { |
| return zbud_max_size() - sizeof(struct tmem_handle); |
| } |
| |
| /* |
| * Given a zbud reference, free the corresponding zbud from all lists, |
| * mark it as unused, do accounting, and if the freeing of the zbud |
| * frees up an entire pageframe, return it to the caller (else NULL). |
| */ |
| struct page *zbud_free_and_delist(struct zbudref *zref, bool eph, |
| unsigned int *zsize, unsigned int *zpages) |
| { |
| unsigned long budnum = zbudref_budnum(zref); |
| struct zbudpage *zbudpage = zbudref_to_zbudpage(zref); |
| struct page *page = NULL; |
| unsigned chunks, bud_size, other_bud_size; |
| spinlock_t *lists_lock = |
| eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock; |
| struct zbud_unbuddied *unbud = |
| eph ? zbud_eph_unbuddied : zbud_pers_unbuddied; |
| |
| |
| spin_lock(lists_lock); |
| zbudpage_spin_lock(zbudpage); |
| if (zbudpage_is_dying(zbudpage)) { |
| /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */ |
| zbudpage_spin_unlock(zbudpage); |
| spin_unlock(lists_lock); |
| *zpages = 0; |
| *zsize = 0; |
| goto out; |
| } |
| if (budnum == 0) { |
| bud_size = zbudpage->zbud0_size; |
| other_bud_size = zbudpage->zbud1_size; |
| } else { |
| bud_size = zbudpage->zbud1_size; |
| other_bud_size = zbudpage->zbud0_size; |
| } |
| *zsize = bud_size - sizeof(struct tmem_handle); |
| *zpages = 1; |
| zbud_unuse_zbud(zbudpage, budnum, eph); |
| if (other_bud_size == 0) { /* was unbuddied: unlist and free */ |
| chunks = zbud_size_to_chunks(bud_size) ; |
| if (zbudpage_is_zombie(zbudpage)) { |
| if (eph) |
| zbud_pers_zombie_count = |
| atomic_dec_return(&zbud_eph_zombie_atomic); |
| else |
| zbud_pers_zombie_count = |
| atomic_dec_return(&zbud_pers_zombie_atomic); |
| zbudpage_clear_zombie(zbudpage); |
| } else { |
| BUG_ON(list_empty(&unbud[chunks].list)); |
| list_del_init(&zbudpage->budlist); |
| unbud[chunks].count--; |
| } |
| list_del_init(&zbudpage->lru); |
| spin_unlock(lists_lock); |
| if (eph) |
| zbud_eph_unbuddied_count--; |
| else |
| zbud_pers_unbuddied_count--; |
| page = zbud_unuse_zbudpage(zbudpage, eph); |
| } else { /* was buddied: move remaining buddy to unbuddied list */ |
| chunks = zbud_size_to_chunks(other_bud_size) ; |
| if (!zbudpage_is_zombie(zbudpage)) { |
| list_del_init(&zbudpage->budlist); |
| list_add_tail(&zbudpage->budlist, &unbud[chunks].list); |
| unbud[chunks].count++; |
| } |
| if (eph) { |
| zbud_eph_buddied_count--; |
| zbud_eph_unbuddied_count++; |
| } else { |
| zbud_pers_unbuddied_count++; |
| zbud_pers_buddied_count--; |
| } |
| /* don't mess with lru, no need to move it */ |
| zbudpage_spin_unlock(zbudpage); |
| spin_unlock(lists_lock); |
| } |
| out: |
| return page; |
| } |
| |
| /* |
| * Given a tmem handle, and a kmapped pointer to compressed data of |
| * the given size, try to find an unbuddied zbudpage in which to |
| * create a zbud. If found, put it there, mark the zbudpage unevictable, |
| * and return a zbudref to it. Else return NULL. |
| */ |
| struct zbudref *zbud_match_prep(struct tmem_handle *th, bool eph, |
| void *cdata, unsigned size) |
| { |
| struct zbudpage *zbudpage = NULL, *zbudpage2; |
| unsigned long budnum = 0UL; |
| unsigned nchunks; |
| int i, found_good_buddy = 0; |
| spinlock_t *lists_lock = |
| eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock; |
| struct zbud_unbuddied *unbud = |
| eph ? zbud_eph_unbuddied : zbud_pers_unbuddied; |
| |
| size += sizeof(struct tmem_handle); |
| nchunks = zbud_size_to_chunks(size); |
| for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) { |
| spin_lock(lists_lock); |
| if (!list_empty(&unbud[i].list)) { |
| list_for_each_entry_safe(zbudpage, zbudpage2, |
| &unbud[i].list, budlist) { |
| if (zbudpage_spin_trylock(zbudpage)) { |
| found_good_buddy = i; |
| goto found_unbuddied; |
| } |
| } |
| } |
| spin_unlock(lists_lock); |
| } |
| zbudpage = NULL; |
| goto out; |
| |
| found_unbuddied: |
| BUG_ON(!zbudpage_is_locked(zbudpage)); |
| BUG_ON(!((zbudpage->zbud0_size == 0) ^ (zbudpage->zbud1_size == 0))); |
| if (zbudpage->zbud0_size == 0) |
| budnum = 0UL; |
| else if (zbudpage->zbud1_size == 0) |
| budnum = 1UL; |
| list_del_init(&zbudpage->budlist); |
| if (eph) { |
| list_add_tail(&zbudpage->budlist, &zbud_eph_buddied_list); |
| unbud[found_good_buddy].count--; |
| zbud_eph_unbuddied_count--; |
| zbud_eph_buddied_count++; |
| /* "promote" raw zbudpage to most-recently-used */ |
| list_del_init(&zbudpage->lru); |
| list_add_tail(&zbudpage->lru, &zbud_eph_lru_list); |
| } else { |
| list_add_tail(&zbudpage->budlist, &zbud_pers_buddied_list); |
| unbud[found_good_buddy].count--; |
| zbud_pers_unbuddied_count--; |
| zbud_pers_buddied_count++; |
| /* "promote" raw zbudpage to most-recently-used */ |
| list_del_init(&zbudpage->lru); |
| list_add_tail(&zbudpage->lru, &zbud_pers_lru_list); |
| } |
| zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size); |
| zbudpage->unevictable++; |
| BUG_ON(zbudpage->unevictable == 3); |
| zbudpage_spin_unlock(zbudpage); |
| spin_unlock(lists_lock); |
| out: |
| return zbudpage_to_zbudref(zbudpage, budnum); |
| |
| } |
| |
| /* |
| * Given a tmem handle, and a kmapped pointer to compressed data of |
| * the given size, and a newly allocated struct page, create an unevictable |
| * zbud in that new page and return a zbudref to it. |
| */ |
| struct zbudref *zbud_create_prep(struct tmem_handle *th, bool eph, |
| void *cdata, unsigned size, |
| struct page *newpage) |
| { |
| struct zbudpage *zbudpage; |
| unsigned long budnum = 0; |
| unsigned nchunks; |
| spinlock_t *lists_lock = |
| eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock; |
| struct zbud_unbuddied *unbud = |
| eph ? zbud_eph_unbuddied : zbud_pers_unbuddied; |
| |
| #if 0 |
| /* this may be worth it later to support decompress-in-place? */ |
| static unsigned long counter; |
| budnum = counter++ & 1; /* alternate using zbud0 and zbud1 */ |
| #endif |
| |
| if (size > zbud_max_buddy_size()) |
| return NULL; |
| if (newpage == NULL) |
| return NULL; |
| |
| size += sizeof(struct tmem_handle); |
| nchunks = zbud_size_to_chunks(size) ; |
| spin_lock(lists_lock); |
| zbudpage = zbud_init_zbudpage(newpage, eph); |
| zbudpage_spin_lock(zbudpage); |
| list_add_tail(&zbudpage->budlist, &unbud[nchunks].list); |
| if (eph) { |
| list_add_tail(&zbudpage->lru, &zbud_eph_lru_list); |
| zbud_eph_unbuddied_count++; |
| } else { |
| list_add_tail(&zbudpage->lru, &zbud_pers_lru_list); |
| zbud_pers_unbuddied_count++; |
| } |
| unbud[nchunks].count++; |
| zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size); |
| zbudpage->unevictable++; |
| BUG_ON(zbudpage->unevictable == 3); |
| zbudpage_spin_unlock(zbudpage); |
| spin_unlock(lists_lock); |
| return zbudpage_to_zbudref(zbudpage, budnum); |
| } |
| |
| /* |
| * Finish creation of a zbud by, assuming another zbud isn't being created |
| * in parallel, marking it evictable. |
| */ |
| void zbud_create_finish(struct zbudref *zref, bool eph) |
| { |
| struct zbudpage *zbudpage = zbudref_to_zbudpage(zref); |
| spinlock_t *lists_lock = |
| eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock; |
| |
| spin_lock(lists_lock); |
| zbudpage_spin_lock(zbudpage); |
| BUG_ON(zbudpage_is_dying(zbudpage)); |
| zbudpage->unevictable--; |
| BUG_ON((int)zbudpage->unevictable < 0); |
| zbudpage_spin_unlock(zbudpage); |
| spin_unlock(lists_lock); |
| } |
| |
| /* |
| * Given a zbudref and a struct page, decompress the data from |
| * the zbud into the physical page represented by the struct page |
| * by upcalling to zcache_decompress |
| */ |
| int zbud_decompress(struct page *data_page, struct zbudref *zref, bool eph, |
| void (*decompress)(char *, unsigned int, char *)) |
| { |
| struct zbudpage *zbudpage = zbudref_to_zbudpage(zref); |
| unsigned long budnum = zbudref_budnum(zref); |
| void *zbpg; |
| char *to_va, *from_va; |
| unsigned size; |
| int ret = -1; |
| spinlock_t *lists_lock = |
| eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock; |
| |
| spin_lock(lists_lock); |
| zbudpage_spin_lock(zbudpage); |
| if (zbudpage_is_dying(zbudpage)) { |
| /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */ |
| goto out; |
| } |
| zbpg = kmap_zbudpage_atomic(zbudpage); |
| to_va = kmap_atomic(data_page); |
| if (budnum == 0) |
| size = zbudpage->zbud0_size; |
| else |
| size = zbudpage->zbud1_size; |
| BUG_ON(size == 0 || size > zbud_max_size()); |
| from_va = zbud_data(zbpg, budnum, size); |
| from_va += sizeof(struct tmem_handle); |
| size -= sizeof(struct tmem_handle); |
| decompress(from_va, size, to_va); |
| kunmap_atomic(to_va); |
| kunmap_zbudpage_atomic(zbpg); |
| ret = 0; |
| out: |
| zbudpage_spin_unlock(zbudpage); |
| spin_unlock(lists_lock); |
| return ret; |
| } |
| |
| /* |
| * Given a zbudref and a kernel pointer, copy the data from |
| * the zbud to the kernel pointer. |
| */ |
| int zbud_copy_from_zbud(char *to_va, struct zbudref *zref, |
| size_t *sizep, bool eph) |
| { |
| struct zbudpage *zbudpage = zbudref_to_zbudpage(zref); |
| unsigned long budnum = zbudref_budnum(zref); |
| void *zbpg; |
| char *from_va; |
| unsigned size; |
| int ret = -1; |
| spinlock_t *lists_lock = |
| eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock; |
| |
| spin_lock(lists_lock); |
| zbudpage_spin_lock(zbudpage); |
| if (zbudpage_is_dying(zbudpage)) { |
| /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */ |
| goto out; |
| } |
| zbpg = kmap_zbudpage_atomic(zbudpage); |
| if (budnum == 0) |
| size = zbudpage->zbud0_size; |
| else |
| size = zbudpage->zbud1_size; |
| BUG_ON(size == 0 || size > zbud_max_size()); |
| from_va = zbud_data(zbpg, budnum, size); |
| from_va += sizeof(struct tmem_handle); |
| size -= sizeof(struct tmem_handle); |
| *sizep = size; |
| memcpy(to_va, from_va, size); |
| |
| kunmap_zbudpage_atomic(zbpg); |
| ret = 0; |
| out: |
| zbudpage_spin_unlock(zbudpage); |
| spin_unlock(lists_lock); |
| return ret; |
| } |
| |
| /* |
| * Given a zbudref and a kernel pointer, copy the data from |
| * the kernel pointer to the zbud. |
| */ |
| int zbud_copy_to_zbud(struct zbudref *zref, char *from_va, bool eph) |
| { |
| struct zbudpage *zbudpage = zbudref_to_zbudpage(zref); |
| unsigned long budnum = zbudref_budnum(zref); |
| void *zbpg; |
| char *to_va; |
| unsigned size; |
| int ret = -1; |
| spinlock_t *lists_lock = |
| eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock; |
| |
| spin_lock(lists_lock); |
| zbudpage_spin_lock(zbudpage); |
| if (zbudpage_is_dying(zbudpage)) { |
| /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */ |
| goto out; |
| } |
| zbpg = kmap_zbudpage_atomic(zbudpage); |
| if (budnum == 0) |
| size = zbudpage->zbud0_size; |
| else |
| size = zbudpage->zbud1_size; |
| BUG_ON(size == 0 || size > zbud_max_size()); |
| to_va = zbud_data(zbpg, budnum, size); |
| to_va += sizeof(struct tmem_handle); |
| size -= sizeof(struct tmem_handle); |
| memcpy(to_va, from_va, size); |
| |
| kunmap_zbudpage_atomic(zbpg); |
| ret = 0; |
| out: |
| zbudpage_spin_unlock(zbudpage); |
| spin_unlock(lists_lock); |
| return ret; |
| } |
| |
| /* |
| * Choose an ephemeral LRU zbudpage that is evictable (not locked), ensure |
| * there are no references to it remaining, and return the now unused |
| * (and re-init'ed) struct page and the total amount of compressed |
| * data that was evicted. |
| */ |
| struct page *zbud_evict_pageframe_lru(unsigned int *zsize, unsigned int *zpages) |
| { |
| struct zbudpage *zbudpage = NULL, *zbudpage2; |
| struct zbud_unbuddied *unbud = zbud_eph_unbuddied; |
| struct page *page = NULL; |
| bool irqs_disabled = irqs_disabled(); |
| |
| /* |
| * Since this can be called indirectly from cleancache_put, which |
| * has interrupts disabled, as well as frontswap_put, which does not, |
| * we need to be able to handle both cases, even though it is ugly. |
| */ |
| if (irqs_disabled) |
| spin_lock(&zbud_eph_lists_lock); |
| else |
| spin_lock_bh(&zbud_eph_lists_lock); |
| *zsize = 0; |
| if (list_empty(&zbud_eph_lru_list)) |
| goto unlock_out; |
| list_for_each_entry_safe(zbudpage, zbudpage2, &zbud_eph_lru_list, lru) { |
| /* skip a locked zbudpage */ |
| if (unlikely(!zbudpage_spin_trylock(zbudpage))) |
| continue; |
| /* skip an unevictable zbudpage */ |
| if (unlikely(zbudpage->unevictable != 0)) { |
| zbudpage_spin_unlock(zbudpage); |
| continue; |
| } |
| /* got a locked evictable page */ |
| goto evict_page; |
| |
| } |
| unlock_out: |
| /* no unlocked evictable pages, give up */ |
| if (irqs_disabled) |
| spin_unlock(&zbud_eph_lists_lock); |
| else |
| spin_unlock_bh(&zbud_eph_lists_lock); |
| goto out; |
| |
| evict_page: |
| list_del_init(&zbudpage->budlist); |
| list_del_init(&zbudpage->lru); |
| zbudpage_set_dying(zbudpage); |
| /* |
| * the zbudpage is now "dying" and attempts to read, write, |
| * or delete data from it will be ignored |
| */ |
| if (zbudpage->zbud0_size != 0 && zbudpage->zbud1_size != 0) { |
| *zsize = zbudpage->zbud0_size + zbudpage->zbud1_size - |
| (2 * sizeof(struct tmem_handle)); |
| *zpages = 2; |
| } else if (zbudpage->zbud0_size != 0) { |
| unbud[zbud_size_to_chunks(zbudpage->zbud0_size)].count--; |
| *zsize = zbudpage->zbud0_size - sizeof(struct tmem_handle); |
| *zpages = 1; |
| } else if (zbudpage->zbud1_size != 0) { |
| unbud[zbud_size_to_chunks(zbudpage->zbud1_size)].count--; |
| *zsize = zbudpage->zbud1_size - sizeof(struct tmem_handle); |
| *zpages = 1; |
| } else { |
| BUG(); |
| } |
| spin_unlock(&zbud_eph_lists_lock); |
| zbud_eph_evicted_pageframes++; |
| if (*zpages == 1) |
| zbud_eph_unbuddied_count--; |
| else |
| zbud_eph_buddied_count--; |
| zbud_evict_tmem(zbudpage); |
| zbudpage_spin_lock(zbudpage); |
| zbudpage_clear_dying(zbudpage); |
| page = zbud_unuse_zbudpage(zbudpage, true); |
| if (!irqs_disabled) |
| local_bh_enable(); |
| out: |
| return page; |
| } |
| |
| /* |
| * Choose a persistent LRU zbudpage that is evictable (not locked), zombify it, |
| * read the tmem_handle(s) out of it into the passed array, and return the |
| * number of zbuds. Caller must perform necessary tmem functions and, |
| * indirectly, zbud functions to fetch any valid data and cause the |
| * now-zombified zbudpage to eventually be freed. We track the zombified |
| * zbudpage count so it is possible to observe if there is a leak. |
| FIXME: describe (ramster) case where data pointers are passed in for memcpy |
| */ |
| unsigned int zbud_make_zombie_lru(struct tmem_handle *th, unsigned char **data, |
| unsigned int *zsize, bool eph) |
| { |
| struct zbudpage *zbudpage = NULL, *zbudpag2; |
| struct tmem_handle *thfrom; |
| char *from_va; |
| void *zbpg; |
| unsigned size; |
| int ret = 0, i; |
| spinlock_t *lists_lock = |
| eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock; |
| struct list_head *lru_list = |
| eph ? &zbud_eph_lru_list : &zbud_pers_lru_list; |
| |
| spin_lock_bh(lists_lock); |
| if (list_empty(lru_list)) |
| goto out; |
| list_for_each_entry_safe(zbudpage, zbudpag2, lru_list, lru) { |
| /* skip a locked zbudpage */ |
| if (unlikely(!zbudpage_spin_trylock(zbudpage))) |
| continue; |
| /* skip an unevictable zbudpage */ |
| if (unlikely(zbudpage->unevictable != 0)) { |
| zbudpage_spin_unlock(zbudpage); |
| continue; |
| } |
| /* got a locked evictable page */ |
| goto zombify_page; |
| } |
| /* no unlocked evictable pages, give up */ |
| goto out; |
| |
| zombify_page: |
| /* got an unlocked evictable page, zombify it */ |
| list_del_init(&zbudpage->budlist); |
| zbudpage_set_zombie(zbudpage); |
| /* FIXME what accounting do I need to do here? */ |
| list_del_init(&zbudpage->lru); |
| if (eph) { |
| list_add_tail(&zbudpage->lru, &zbud_eph_zombie_list); |
| zbud_eph_zombie_count = |
| atomic_inc_return(&zbud_eph_zombie_atomic); |
| } else { |
| list_add_tail(&zbudpage->lru, &zbud_pers_zombie_list); |
| zbud_pers_zombie_count = |
| atomic_inc_return(&zbud_pers_zombie_atomic); |
| } |
| /* FIXME what accounting do I need to do here? */ |
| zbpg = kmap_zbudpage_atomic(zbudpage); |
| for (i = 0; i < 2; i++) { |
| size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size; |
| if (size) { |
| from_va = zbud_data(zbpg, i, size); |
| thfrom = (struct tmem_handle *)from_va; |
| from_va += sizeof(struct tmem_handle); |
| size -= sizeof(struct tmem_handle); |
| if (th != NULL) |
| th[ret] = *thfrom; |
| if (data != NULL) |
| memcpy(data[ret], from_va, size); |
| if (zsize != NULL) |
| *zsize++ = size; |
| ret++; |
| } |
| } |
| kunmap_zbudpage_atomic(zbpg); |
| zbudpage_spin_unlock(zbudpage); |
| out: |
| spin_unlock_bh(lists_lock); |
| return ret; |
| } |
| |
| void zbud_init(void) |
| { |
| int i; |
| |
| zbud_debugfs_init(); |
| BUG_ON((sizeof(struct tmem_handle) * 2 > CHUNK_SIZE)); |
| BUG_ON(sizeof(struct zbudpage) > sizeof(struct page)); |
| for (i = 0; i < NCHUNKS; i++) { |
| INIT_LIST_HEAD(&zbud_eph_unbuddied[i].list); |
| INIT_LIST_HEAD(&zbud_pers_unbuddied[i].list); |
| } |
| } |