diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 2f7e1e90a9e6..cb3ad7e47df0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1648,15 +1648,17 @@ const struct file_operations proc_pagemap_operations = {
 static int reclaim_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
 {
-	struct vm_area_struct *vma = walk->private;
+	struct reclaim_param *rp = walk->private;
+	struct vm_area_struct *vma = rp->vma;
 	pte_t *pte, ptent;
 	spinlock_t *ptl;
 	struct page *page;
 	LIST_HEAD(page_list);
 	int isolated;
+	int reclaimed;
 
 	split_huge_pmd(vma, addr, pmd);
-	if (pmd_trans_unstable(pmd))
+	if (pmd_trans_unstable(pmd) || !rp->nr_to_reclaim)
 		return 0;
 cont:
 	isolated = 0;
@@ -1677,12 +1679,18 @@ cont:
 		inc_node_page_state(page, NR_ISOLATED_ANON +
 				page_is_file_cache(page));
 		isolated++;
-		if (isolated >= SWAP_CLUSTER_MAX)
+		rp->nr_scanned++;
+		if ((isolated >= SWAP_CLUSTER_MAX) || !rp->nr_to_reclaim)
 			break;
 	}
 	pte_unmap_unlock(pte - 1, ptl);
-	reclaim_pages_from_list(&page_list, vma);
-	if (addr != end)
+	reclaimed = reclaim_pages_from_list(&page_list, vma);
+	rp->nr_reclaimed += reclaimed;
+	rp->nr_to_reclaim -= reclaimed;
+	if (rp->nr_to_reclaim < 0)
+		rp->nr_to_reclaim = 0;
+
+	if (rp->nr_to_reclaim && (addr != end))
 		goto cont;
 
 	cond_resched();
@@ -1696,6 +1704,60 @@ enum reclaim_type {
 	RECLAIM_RANGE,
 };
 
+/*
+ * reclaim_task_anon() - reclaim anonymous memory from @task's address space.
+ * @task: target task; a task reference is held across the walk.
+ * @nr_to_reclaim: page budget. A batch may overshoot it, in which case
+ *	reclaim_pte_range() clamps the remainder to zero and the walk stops.
+ *
+ * Walks every VMA that is neither hugetlb nor file-backed with mmap_sem held
+ * for read. Returns the reclaim_param by value; nr_scanned and nr_reclaimed
+ * stay zero when @task has no mm.
+ */
+struct reclaim_param reclaim_task_anon(struct task_struct *task,
+		int nr_to_reclaim)
+{
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	struct mm_walk reclaim_walk = {};
+	struct reclaim_param rp = {
+		.nr_to_reclaim = nr_to_reclaim,
+	};
+
+	get_task_struct(task);
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out;
+
+	reclaim_walk.mm = mm;
+	reclaim_walk.pmd_entry = reclaim_pte_range;
+
+	reclaim_walk.private = &rp;
+
+	down_read(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (is_vm_hugetlb_page(vma))
+			continue;
+
+		if (vma->vm_file)
+			continue;
+
+		if (!rp.nr_to_reclaim)
+			break;
+
+		rp.vma = vma;
+		walk_page_range(vma->vm_start, vma->vm_end,
+			&reclaim_walk);
+	}
+
+	flush_tlb_mm(mm);
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out:
+	put_task_struct(task);
+	return rp;
+}
+
 static ssize_t reclaim_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *ppos)
 {
@@ -1708,6 +1770,7 @@ static ssize_t reclaim_write(struct file *file, const char __user *buf,
 	struct mm_walk reclaim_walk = {};
 	unsigned long start = 0;
 	unsigned long end = 0;
+	struct reclaim_param rp = {};
 
 	memset(buffer, 0, sizeof(buffer));
 	if (count > sizeof(buffer) - 1)
@@ -1770,6 +1833,10 @@ static ssize_t reclaim_write(struct file *file, const char __user *buf,
 	reclaim_walk.mm = mm;
 	reclaim_walk.pmd_entry = reclaim_pte_range;
 
+	rp.nr_to_reclaim = INT_MAX;
+	rp.nr_reclaimed = 0;
+	reclaim_walk.private = &rp;
+
 	down_read(&mm->mmap_sem);
 	if (type == RECLAIM_RANGE) {
 		vma = find_vma(mm, start);
@@ -1779,7 +1846,7 @@ static ssize_t reclaim_write(struct file *file, const char __user *buf,
 			if (is_vm_hugetlb_page(vma))
 				continue;
 
-			reclaim_walk.private = vma;
+			rp.vma = vma;
 			walk_page_range(max(vma->vm_start, start),
 					min(vma->vm_end, end),
 					&reclaim_walk);
@@ -1796,7 +1863,7 @@ static ssize_t reclaim_write(struct file *file, const char __user *buf,
 			if (type == RECLAIM_FILE && !vma->vm_file)
 				continue;
 
-			reclaim_walk.private = vma;
+			rp.vma = vma;
 			walk_page_range(vma->vm_start, vma->vm_end,
 				&reclaim_walk);
 		}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 21385142e63a..f1b0668551e9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2749,5 +2749,21 @@ static inline void setup_nr_node_ids(void) {}
 
 extern int want_old_faultaround_pte;
 
+#ifdef CONFIG_PROCESS_RECLAIM
+/* State shared between a reclaim walk and its reclaim_pte_range() callback */
+struct reclaim_param {
+	/* VMA currently being walked */
+	struct vm_area_struct *vma;
+	/* Number of pages scanned */
+	int nr_scanned;
+	/* max pages to reclaim */
+	int nr_to_reclaim;
+	/* pages reclaimed */
+	int nr_reclaimed;
+};
+extern struct reclaim_param reclaim_task_anon(struct task_struct *task,
+		int nr_to_reclaim);
+#endif
+
 #endif /* __KERNEL__
*/
 #endif /* _LINUX_MM_H */
diff --git a/include/trace/events/process_reclaim.h b/include/trace/events/process_reclaim.h
new file mode 100644
index 000000000000..d79327ee4969
--- /dev/null
+++ b/include/trace/events/process_reclaim.h
@@ -0,0 +1,88 @@
+/* Copyright (c) 2015-2018, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM process_reclaim
+
+#if !defined(_TRACE_EVENT_PROCESSRECLAIM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_EVENT_PROCESSRECLAIM_H
+
+#include <linux/tracepoint.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+
+/* Per-task result of one reclaim_task_anon() call made by swap_fn(). */
+TRACE_EVENT(process_reclaim,
+
+	TP_PROTO(int tasksize,
+		short oom_score_adj,
+		int nr_scanned, int nr_reclaimed,
+		int per_swap_size, int total_sz,
+		int nr_to_reclaim),
+
+	TP_ARGS(tasksize, oom_score_adj, nr_scanned,
+			nr_reclaimed, per_swap_size,
+			total_sz, nr_to_reclaim),
+
+	TP_STRUCT__entry(
+		__field(int, tasksize)
+		__field(short, oom_score_adj)
+		__field(int, nr_scanned)
+		__field(int, nr_reclaimed)
+		__field(int, per_swap_size)
+		__field(int, total_sz)
+		__field(int, nr_to_reclaim)
+	),
+
+	TP_fast_assign(
+		__entry->tasksize = tasksize;
+		__entry->oom_score_adj = oom_score_adj;
+		__entry->nr_scanned = nr_scanned;
+		__entry->nr_reclaimed = nr_reclaimed;
+		__entry->per_swap_size = per_swap_size;
+		__entry->total_sz = total_sz;
+		__entry->nr_to_reclaim = nr_to_reclaim;
+	),
+
+	TP_printk("%d, %hd, %d, %d, %d, %d, %d",
+			__entry->tasksize, __entry->oom_score_adj,
+			__entry->nr_scanned, __entry->nr_reclaimed,
+			__entry->per_swap_size, __entry->total_sz,
+			__entry->nr_to_reclaim)
+);
+
+/* Efficiency of one swap_fn() run and the updated running average. */
+TRACE_EVENT(process_reclaim_eff,
+
+	TP_PROTO(int efficiency, int reclaim_avg_efficiency),
+
+	TP_ARGS(efficiency, reclaim_avg_efficiency),
+
+	TP_STRUCT__entry(
+		__field(int, efficiency)
+		__field(int, reclaim_avg_efficiency)
+	),
+
+	TP_fast_assign(
+		__entry->efficiency = efficiency;
+		__entry->reclaim_avg_efficiency = reclaim_avg_efficiency;
+	),
+
+	TP_printk("%d, %d", __entry->efficiency,
+		__entry->reclaim_avg_efficiency)
+);
+
+#endif
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
diff --git a/mm/Makefile b/mm/Makefile
index 3243322fb58f..61e1aac6a150 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -104,3 +104,4 @@ obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
 obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
 obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
 obj-$(CONFIG_HMM) += hmm.o
+obj-$(CONFIG_PROCESS_RECLAIM) += process_reclaim.o
diff --git a/mm/process_reclaim.c b/mm/process_reclaim.c
new file mode 100644
index 000000000000..92ce0a5c105c
--- /dev/null
+++ b/mm/process_reclaim.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2015-2018, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/sort.h>
+#include <linux/oom.h>
+#include <linux/sched.h>
+#include <linux/rcupdate.h>
+#include <linux/notifier.h>
+#include <linux/vmpressure.h>
+#include <linux/math64.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/process_reclaim.h>
+
+#define MAX_SWAP_TASKS SWAP_CLUSTER_MAX
+
+static void swap_fn(struct work_struct *work);
+DECLARE_WORK(swap_work, swap_fn);
+
+/* User knob to enable/disable process reclaim feature */
+static int enable_process_reclaim;
+module_param_named(enable_process_reclaim, enable_process_reclaim, int, 0644);
+
+/* The max number of pages tried to be reclaimed in a single run */
+int per_swap_size = SWAP_CLUSTER_MAX * 32;
+module_param_named(per_swap_size, per_swap_size, int, 0644);
+
+/* Running average of the efficiency computed by swap_fn(), read-only */
+int reclaim_avg_efficiency;
+module_param_named(reclaim_avg_efficiency, reclaim_avg_efficiency, int, 0444);
+
+/* The vmpressure region where process reclaim operates */
+static unsigned long pressure_min = 50;
+static unsigned long pressure_max = 90;
+module_param_named(pressure_min, pressure_min, ulong, 0644);
+module_param_named(pressure_max, pressure_max, ulong, 0644);
+
+/*
+ * Scheduling the process reclaim work unnecessarily
+ * when the reclaim efficiency is low does not make
+ * sense. We try to detect a drop in efficiency and
+ * disable reclaim for a time period. This period and the
+ * period for which we monitor a drop in efficiency is
+ * defined by swap_eff_win. swap_opt_eff is the optimal
+ * efficiency used as the threshold for this.
+ */
+static int swap_eff_win = 2;
+module_param_named(swap_eff_win, swap_eff_win, int, 0644);
+
+static int swap_opt_eff = 50;
+module_param_named(swap_opt_eff, swap_opt_eff, int, 0644);
+
+static atomic_t skip_reclaim = ATOMIC_INIT(0);
+/* Not atomic since only a single instance of swap_fn runs at a time */
+static int monitor_eff;
+
+/* One reclaim candidate: the task, its anon page count and oom_score_adj */
+struct selected_task {
+	struct task_struct *p;
+	int tasksize;
+	short oom_score_adj;
+};
+
+/*
+ * sort() comparator ordering selected_task entries by ascending tasksize.
+ * Equal sizes compare as 0 so the ordering is consistent in both directions.
+ */
+int selected_cmp(const void *a, const void *b)
+{
+	const struct selected_task *x = a;
+	const struct selected_task *y = b;
+
+	if (x->tasksize == y->tasksize)
+		return 0;
+
+	return x->tasksize < y->tasksize ? -1 : 1;
+}
+
+/* Return 1 if any thread of @p has thread flag @flag set, 0 otherwise. */
+static int test_task_flag(struct task_struct *p, int flag)
+{
+	struct task_struct *t = p;
+
+	rcu_read_lock();
+	for_each_thread(p, t) {
+		task_lock(t);
+		if (test_tsk_thread_flag(t, flag)) {
+			task_unlock(t);
+			rcu_read_unlock();
+			return 1;
+		}
+		task_unlock(t);
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/*
+ * Work handler: pick up to MAX_SWAP_TASKS non-kthread tasks with
+ * oom_score_adj >= min_score_adj and the largest anon size, then split
+ * per_swap_size pages of reclaim between them in proportion to their size.
+ * The resulting efficiency feeds the skip_reclaim back-off.
+ */
+static void swap_fn(struct work_struct *work)
+{
+	struct task_struct *tsk;
+	struct reclaim_param rp;
+
+	/* Pick the best MAX_SWAP_TASKS tasks in terms of anon size */
+	struct selected_task selected[MAX_SWAP_TASKS] = {{0, 0, 0},};
+	int si = 0;
+	int i;
+	int tasksize;
+	int total_sz = 0;
+	short min_score_adj = 360;
+	int total_scan = 0;
+	int total_reclaimed = 0;
+	int nr_to_reclaim;
+	int efficiency;
+
+	rcu_read_lock();
+	for_each_process(tsk) {
+		struct task_struct *p;
+		short oom_score_adj;
+
+		if (tsk->flags & PF_KTHREAD)
+			continue;
+
+		if (test_task_flag(tsk, TIF_MEMDIE))
+			continue;
+
+		p = find_lock_task_mm(tsk);
+		if (!p)
+			continue;
+
+		oom_score_adj = p->signal->oom_score_adj;
+		if (oom_score_adj < min_score_adj) {
+			task_unlock(p);
+			continue;
+		}
+
+		tasksize = get_mm_counter(p->mm, MM_ANONPAGES);
+		task_unlock(p);
+
+		if (tasksize <= 0)
+			continue;
+
+		if (si == MAX_SWAP_TASKS) {
+			/* Full: sort so slot 0 is the smallest entry */
+			sort(&selected[0], MAX_SWAP_TASKS,
+					sizeof(struct selected_task),
+					&selected_cmp, NULL);
+			if (tasksize < selected[0].tasksize)
+				continue;
+			selected[0].p = p;
+			selected[0].oom_score_adj = oom_score_adj;
+			selected[0].tasksize = tasksize;
+		} else {
+			selected[si].p = p;
+			selected[si].oom_score_adj = oom_score_adj;
+			selected[si].tasksize = tasksize;
+			si++;
+		}
+	}
+
+	for (i = 0; i < si; i++)
+		total_sz += selected[i].tasksize;
+
+	/* Skip reclaim if total size is too small */
+	if (total_sz < SWAP_CLUSTER_MAX) {
+		rcu_read_unlock();
+		return;
+	}
+
+	/* Pin the chosen tasks before leaving the RCU read-side section */
+	for (i = 0; i < si; i++)
+		get_task_struct(selected[i].p);
+
+	rcu_read_unlock();
+
+	while (si--) {
+		/* 64-bit math: tasksize * per_swap_size may overflow int */
+		nr_to_reclaim = div_s64((s64)selected[si].tasksize *
+					per_swap_size, total_sz);
+		/* scan at least a page */
+		if (nr_to_reclaim <= 0)
+			nr_to_reclaim = 1;
+
+		rp = reclaim_task_anon(selected[si].p, nr_to_reclaim);
+
+		trace_process_reclaim(selected[si].tasksize,
+				selected[si].oom_score_adj, rp.nr_scanned,
+				rp.nr_reclaimed, per_swap_size, total_sz,
+				nr_to_reclaim);
+		total_scan += rp.nr_scanned;
+		total_reclaimed += rp.nr_reclaimed;
+		put_task_struct(selected[si].p);
+	}
+
+	if (total_scan) {
+		efficiency = div_s64((s64)total_reclaimed * 100, total_scan);
+
+		if (efficiency < swap_opt_eff) {
+			/* >=: swap_eff_win is a runtime knob and may shrink */
+			if (++monitor_eff >= swap_eff_win) {
+				atomic_set(&skip_reclaim, swap_eff_win);
+				monitor_eff = 0;
+			}
+		} else {
+			monitor_eff = 0;
+		}
+
+		reclaim_avg_efficiency =
+			(efficiency + reclaim_avg_efficiency) / 2;
+		trace_process_reclaim_eff(efficiency, reclaim_avg_efficiency);
+	}
+}
+
+/*
+ * vmpressure callback. Queues swap_work when called from kswapd with the
+ * feature enabled, no skip_reclaim back-off outstanding and the reported
+ * pressure inside [pressure_min, pressure_max). Always returns 0.
+ */
+static int vmpressure_notifier(struct notifier_block *nb,
+			unsigned long action, void *data)
+{
+	unsigned long pressure = action;
+
+	if (!enable_process_reclaim)
+		return 0;
+
+	if (!current_is_kswapd())
+		return 0;
+
+	if (atomic_dec_if_positive(&skip_reclaim) >= 0)
+		return 0;
+
+	if ((pressure >= pressure_min) && (pressure < pressure_max))
+		if (!work_pending(&swap_work))
+			queue_work(system_unbound_wq, &swap_work);
+	return 0;
+}
+
+static struct notifier_block vmpr_nb = {
+	.notifier_call = vmpressure_notifier,
+};
+
+static int __init process_reclaim_init(void)
+{
+	vmpressure_notifier_register(&vmpr_nb);
+	return 0;
+}
+
+static void __exit process_reclaim_exit(void)
+{
+	vmpressure_notifier_unregister(&vmpr_nb);
+	/* Do not let a queued or running swap_fn() outlive the module */
+	cancel_work_sync(&swap_work);
+}
+
+module_init(process_reclaim_init);
+module_exit(process_reclaim_exit);