mm/lru: replace pgdat lru_lock with lruvec lock
This patch moves per node lru_lock into lruvec, thus bring a lru_lock for
each of memcg per node. So on a large machine, each of memcg don't have
to suffer from per node pgdat->lru_lock competition. They could go fast
with their self lru_lock.
After move memcg charge before lru inserting, page isolation could
serialize page's memcg, then per memcg lruvec lock is stable and could
replace per node lru lock.
In isolate_migratepages_block(), compact_unlock_should_abort and
lock_page_lruvec_irqsave are open coded to work with compact_control.
Also add a debug func in locking which may give some clues if there are
sth out of hands.
Daniel Jordan's testing show 62% improvement on modified readtwice case on
his 2P * 10 core * 2 HT broadwell box.
https://lore.kernel.org/lkml/20200915165807.kpp7uhiw7l3loofu@ca-dmjordan1.us.oracle.com/
Hugh Dickins helped on the patch polish, thanks!
[alex.shi@linux.alibaba.com: fix comment typo]
Link: https://lkml.kernel.org/r/5b085715-292a-4b43-50b3-d73dc90d1de5@linux.alibaba.com
[alex.shi@linux.alibaba.com: use page_memcg()]
Link: https://lkml.kernel.org/r/5a4c2b72-7ee8-2478-fc0e-85eb83aafec4@linux.alibaba.com
Link: https://lkml.kernel.org/r/1604566549-62481-18-git-send-email-alex.shi@linux.alibaba.com
Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
Acked-by: Hugh Dickins <hughd@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Rong Chen <rong.a.chen@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mika Penttilä <mika.penttila@nextfour.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/mm/compaction.c b/mm/compaction.c
index 50938e6..e5acb97 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -804,7 +804,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
unsigned long nr_scanned = 0, nr_isolated = 0;
struct lruvec *lruvec;
unsigned long flags = 0;
- bool locked = false;
+ struct lruvec *locked = NULL;
struct page *page = NULL, *valid_page = NULL;
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
@@ -868,11 +868,20 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* contention, to give chance to IRQs. Abort completely if
* a fatal signal is pending.
*/
- if (!(low_pfn % SWAP_CLUSTER_MAX)
- && compact_unlock_should_abort(&pgdat->lru_lock,
- flags, &locked, cc)) {
- low_pfn = 0;
- goto fatal_pending;
+ if (!(low_pfn % SWAP_CLUSTER_MAX)) {
+ if (locked) {
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
+ }
+
+ if (fatal_signal_pending(current)) {
+ cc->contended = true;
+
+ low_pfn = 0;
+ goto fatal_pending;
+ }
+
+ cond_resched();
}
if (!pfn_valid_within(low_pfn))
@@ -944,9 +953,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
if (locked) {
- spin_unlock_irqrestore(&pgdat->lru_lock,
- flags);
- locked = false;
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
}
if (!isolate_movable_page(page, isolate_mode))
@@ -987,10 +995,19 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (!TestClearPageLRU(page))
goto isolate_fail_put;
+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+
/* If we already hold the lock, we can skip some rechecking */
- if (!locked) {
- locked = compact_lock_irqsave(&pgdat->lru_lock,
- &flags, cc);
+ if (lruvec != locked) {
+ if (locked)
+ unlock_page_lruvec_irqrestore(locked, flags);
+
+ compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
+ locked = lruvec;
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);
/* Try get exclusive access under lock */
if (!skip_updated) {
@@ -1009,9 +1026,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
SetPageLRU(page);
goto isolate_fail_put;
}
- }
-
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ } else
+ rcu_read_unlock();
/* The whole page is taken off the LRU; skip the tail pages. */
if (PageCompound(page))
@@ -1045,8 +1061,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
isolate_fail_put:
/* Avoid potential deadlock in freeing page under lru_lock */
if (locked) {
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- locked = false;
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
}
put_page(page);
@@ -1061,8 +1077,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
*/
if (nr_isolated) {
if (locked) {
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- locked = false;
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
}
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
@@ -1090,7 +1106,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
isolate_abort:
if (locked)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ unlock_page_lruvec_irqrestore(locked, flags);
if (page) {
SetPageLRU(page);
put_page(page);