From 3b28df735e69fecdcb732ac07fa9e4a0beb3ae7c Mon Sep 17 00:00:00 2001
From: Sultan Alsawaf
Date: Mon, 24 Jun 2019 07:22:55 -0700
Subject: [PATCH] simple_lmk: Introduce Simple Low Memory Killer for Android

This is a complete low memory killer solution for Android that is small
and simple. Processes are killed according to the priorities that
Android gives them, so that the least important processes are always
killed first. Processes are killed until memory deficits are satisfied,
as observed from kswapd struggling to free up pages. Simple LMK stops
killing processes when kswapd finally goes back to sleep.

The only tunables are the desired amount of memory to be freed per
reclaim event and desired frequency of reclaim events. Simple LMK tries
to free at least the desired amount of memory per reclaim and waits
until all of its victims' memory is freed before proceeding to kill more
processes.

Signed-off-by: Sultan Alsawaf
---
 drivers/android/Kconfig      |  44 +++++
 drivers/android/Makefile     |   1 +
 drivers/android/simple_lmk.c | 376 +++++++++++++++++++++++++++++++++++
 include/linux/simple_lmk.h   |  26 +++
 kernel/fork.c                |   2 +
 mm/vmscan.c                  |   4 +
 6 files changed, 453 insertions(+)
 create mode 100644 drivers/android/simple_lmk.c
 create mode 100644 include/linux/simple_lmk.h

diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig
index 3a90d51d8419..03dbb1d10906 100755
--- a/drivers/android/Kconfig
+++ b/drivers/android/Kconfig
@@ -54,6 +54,50 @@ config ANDROID_BINDER_IPC_SELFTEST
 	  exhaustively with combinations of various buffer sizes and
 	  alignments.
 
+config ANDROID_SIMPLE_LMK
+	bool "Simple Android Low Memory Killer"
+	depends on !ANDROID_LOW_MEMORY_KILLER && !MEMCG
+	---help---
+	  This is a complete low memory killer solution for Android that is
+	  small and simple. Processes are killed according to the priorities
+	  that Android gives them, so that the least important processes are
+	  always killed first. Processes are killed until memory deficits are
+	  satisfied, as observed from kswapd struggling to free up pages. Simple
+	  LMK stops killing processes when kswapd finally goes back to sleep.
+
+if ANDROID_SIMPLE_LMK
+
+config ANDROID_SIMPLE_LMK_AGGRESSION
+	int "Reclaim frequency selection"
+	range 1 3
+	default 1
+	help
+	  This value determines how frequently Simple LMK will perform memory
+	  reclaims. A lower value corresponds to less frequent reclaims, which
+	  maximizes memory usage. The range of values has a logarithmic
+	  correlation; 2 is twice as aggressive as 1, and 3 is twice as
+	  aggressive as 2, which makes 3 four times as aggressive as 1.
+
+	  The aggression is set as a factor of kswapd's scan depth. This means
+	  that a system with more memory will have a more expensive aggression
+	  factor compared to a system with less memory. For example, setting an
+	  aggression factor of 1 with 4 GiB of memory would be like setting a
+	  factor of 2 with 8 GiB of memory; the more memory a system has, the
+	  more expensive it is to use a lower value.
+
+	  Choosing a value of 1 here works well with systems that have 4 GiB of
+	  memory. If the default doesn't work well, then this value should be
+	  tweaked based on empirical results using different values.
+
+config ANDROID_SIMPLE_LMK_MINFREE
+	int "Minimum MiB of memory to free per reclaim"
+	range 8 512
+	default 100
+	help
+	  Simple LMK will try to free at least this much memory per reclaim.
+
+endif
+
 endif # if ANDROID
 
 endmenu
diff --git a/drivers/android/Makefile b/drivers/android/Makefile
index c7856e3200da..7c91293b6d59 100755
--- a/drivers/android/Makefile
+++ b/drivers/android/Makefile
@@ -3,3 +3,4 @@ ccflags-y += -I$(src)	# needed for trace events
 obj-$(CONFIG_ANDROID_BINDERFS)		+= binderfs.o
 obj-$(CONFIG_ANDROID_BINDER_IPC)	+= binder.o binder_alloc.o
 obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
+obj-$(CONFIG_ANDROID_SIMPLE_LMK)	+= simple_lmk.o
diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c
new file mode 100644
index 000000000000..29637ecc5be5
--- /dev/null
+++ b/drivers/android/simple_lmk.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Sultan Alsawaf .
+ */
+
+#define pr_fmt(fmt) "simple_lmk: " fmt
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* The sched_param struct is located elsewhere in newer kernels */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
+#include
+#endif
+
+/* SEND_SIG_FORCED isn't present in newer kernels */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 19, 0)
+#define SIG_INFO_TYPE SEND_SIG_FORCED
+#else
+#define SIG_INFO_TYPE SEND_SIG_PRIV
+#endif
+
+/* The group argument to do_send_sig_info is different in newer kernels */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0)
+#define KILL_GROUP_TYPE true
+#else
+#define KILL_GROUP_TYPE PIDTYPE_TGID
+#endif
+
+/* The minimum number of pages to free per reclaim */
+#define MIN_FREE_PAGES (CONFIG_ANDROID_SIMPLE_LMK_MINFREE * SZ_1M / PAGE_SIZE)
+
+/* Kill up to this many victims per reclaim */
+#define MAX_VICTIMS 1024
+
+struct victim_info {
+	struct task_struct *tsk;	/* Task locked by find_lock_task_mm() */
+	struct mm_struct *mm;		/* Matched against mms freed in __mmput() */
+	unsigned long size;		/* RSS of the task's mm, in pages */
+};
+
+/* Pulled from the Android framework. Lower adj means higher priority. */
+static const short adj_prio[] = {
+	906, /* CACHED_APP_MAX_ADJ */
+	905, /* Cached app */
+	904, /* Cached app */
+	903, /* Cached app */
+	902, /* Cached app */
+	901, /* Cached app */
+	900, /* CACHED_APP_MIN_ADJ */
+	800, /* SERVICE_B_ADJ */
+	700, /* PREVIOUS_APP_ADJ */
+	600, /* HOME_APP_ADJ */
+	500, /* SERVICE_ADJ */
+	400, /* HEAVY_WEIGHT_APP_ADJ */
+	300, /* BACKUP_APP_ADJ */
+	200, /* PERCEPTIBLE_APP_ADJ */
+	100, /* VISIBLE_APP_ADJ */
+	0    /* FOREGROUND_APP_ADJ */
+};
+
+static struct victim_info victims[MAX_VICTIMS];
+static DECLARE_WAIT_QUEUE_HEAD(oom_waitq);
+static DECLARE_COMPLETION(reclaim_done);
+static int victims_to_kill;
+static bool needs_reclaim;
+
+/*
+ * sort() comparator that orders victims by descending size. The sizes are
+ * compared explicitly rather than subtracted, as the difference between two
+ * unsigned longs doesn't reliably fit in the int that must be returned.
+ */
+static int victim_size_cmp(const void *lhs_ptr, const void *rhs_ptr)
+{
+	const struct victim_info *lhs = lhs_ptr;
+	const struct victim_info *rhs = rhs_ptr;
+
+	return (rhs->size > lhs->size) - (rhs->size < lhs->size);
+}
+
+/* Check if varr[0..vlen) already has a task from vtsk's thread group */
+static bool vtsk_is_duplicate(struct victim_info *varr, int vlen,
+			      struct task_struct *vtsk)
+{
+	int i;
+
+	for (i = 0; i < vlen; i++) {
+		if (same_thread_group(varr[i].tsk, vtsk))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Collect every process whose oom_score_adj equals target_adj into varr,
+ * starting at slot *vindex, and sort the newly added entries by descending
+ * size. Each collected task is left locked by find_lock_task_mm(), so every
+ * entry must eventually be passed to task_unlock(). This is called with
+ * tasklist_lock held, and *vindex must be less than vmaxlen on entry because
+ * a slot is filled before the remaining capacity is checked.
+ *
+ * Returns the total size, in pages, of the victims added by this call.
+ */
+static unsigned long find_victims(struct victim_info *varr, int *vindex,
+				  int vmaxlen, short target_adj)
+{
+	unsigned long pages_found = 0;
+	int old_vindex = *vindex;
+	struct task_struct *tsk;
+
+	for_each_process(tsk) {
+		struct task_struct *vtsk;
+		unsigned long tasksize;
+
+		/*
+		 * Search for tasks with the targeted importance (adj). Since
+		 * only tasks with a positive adj can be targeted, that
+		 * naturally excludes tasks which shouldn't be killed, like init
+		 * and kthreads. Although oom_score_adj can still be changed
+		 * while this code runs, it doesn't really matter. We just need
+		 * to make sure that if the adj changes, we won't deadlock
+		 * trying to lock a task that we locked earlier.
+		 */
+		if (READ_ONCE(tsk->signal->oom_score_adj) != target_adj ||
+		    vtsk_is_duplicate(varr, *vindex, tsk))
+			continue;
+
+		vtsk = find_lock_task_mm(tsk);
+		if (!vtsk)
+			continue;
+
+		/* Store this potential victim away for later */
+		tasksize = get_mm_rss(vtsk->mm);
+		varr[*vindex].tsk = vtsk;
+		varr[*vindex].mm = vtsk->mm;
+		varr[*vindex].size = tasksize;
+
+		/* Keep track of the number of pages that have been found */
+		pages_found += tasksize;
+
+		/* Make sure there's space left in the victim array */
+		if (++*vindex == vmaxlen)
+			break;
+	}
+
+	/*
+	 * Sort the victims in descending order of size to prioritize killing
+	 * the larger ones first.
+	 */
+	if (pages_found)
+		sort(&varr[old_vindex], *vindex - old_vindex, sizeof(*varr),
+		     victim_size_cmp, NULL);
+
+	return pages_found;
+}
+
+/*
+ * Walk the first vlen entries of varr in order and keep victims until their
+ * combined size reaches pages_needed. Every entry past that point is spared
+ * and has its task lock dropped, while the kept entries remain locked.
+ *
+ * Returns the number of leading entries that are still slated to be killed.
+ */
+static int process_victims(struct victim_info *varr, int vlen,
+			   unsigned long pages_needed)
+{
+	unsigned long pages_found = 0;
+	int i, nr_to_kill = 0;
+
+	/*
+	 * Calculate the number of tasks that need to be killed and quickly
+	 * release the locks of those that'll live.
+	 */
+	for (i = 0; i < vlen; i++) {
+		struct victim_info *victim = &varr[i];
+		struct task_struct *vtsk = victim->tsk;
+
+		/* The victim's mm lock is taken in find_victims; release it */
+		if (pages_found >= pages_needed) {
+			task_unlock(vtsk);
+			continue;
+		}
+
+		pages_found += victim->size;
+		nr_to_kill++;
+	}
+
+	return nr_to_kill;
+}
+
+/*
+ * Gather victims from the least important adj level upwards until at least
+ * pages_needed pages are covered, send SIGKILL to the chosen ones, and then
+ * block until simple_lmk_mm_freed() has seen every victim's mm get freed.
+ */
+static void scan_and_kill(unsigned long pages_needed)
+{
+	int i, nr_to_kill = 0, nr_victims = 0;
+	unsigned long pages_found = 0;
+
+	/*
+	 * Hold the tasklist lock so tasks don't disappear while scanning. This
+	 * is preferred to holding an RCU read lock so that the list of tasks
+	 * is guaranteed to be up to date.
+	 */
+	read_lock(&tasklist_lock);
+	for (i = 0; i < ARRAY_SIZE(adj_prio); i++) {
+		pages_found += find_victims(victims, &nr_victims, MAX_VICTIMS,
+					    adj_prio[i]);
+		if (pages_found >= pages_needed || nr_victims == MAX_VICTIMS)
+			break;
+	}
+	read_unlock(&tasklist_lock);
+
+	/* Pretty unlikely but it can happen */
+	if (unlikely(!nr_victims))
+		return;
+
+	/* First round of victim processing to weed out unneeded victims */
+	nr_to_kill = process_victims(victims, nr_victims, pages_needed);
+
+	/*
+	 * Try to kill as few of the chosen victims as possible by sorting the
+	 * chosen victims by size, which means larger victims that have a lower
+	 * adj can be killed in place of smaller victims with a high adj.
+	 */
+	sort(victims, nr_to_kill, sizeof(*victims), victim_size_cmp, NULL);
+
+	/* Second round of victim processing to finally select the victims */
+	nr_to_kill = process_victims(victims, nr_to_kill, pages_needed);
+
+	/* Kill the victims */
+	WRITE_ONCE(victims_to_kill, nr_to_kill);
+	for (i = 0; i < nr_to_kill; i++) {
+		struct victim_info *victim = &victims[i];
+		struct task_struct *vtsk = victim->tsk;
+
+		pr_info("Killing %s with adj %d to free %lu KiB\n", vtsk->comm,
+			vtsk->signal->oom_score_adj,
+			victim->size << (PAGE_SHIFT - 10));
+
+		/* Accelerate the victim's death by forcing the kill signal */
+		do_send_sig_info(SIGKILL, SIG_INFO_TYPE, vtsk, KILL_GROUP_TYPE);
+
+		/* Grab a reference to the victim for later before unlocking */
+		get_task_struct(vtsk);
+		task_unlock(vtsk);
+	}
+
+	/* Try to speed up the death process now that we can schedule again */
+	for (i = 0; i < nr_to_kill; i++) {
+		struct task_struct *vtsk = victims[i].tsk;
+
+		/* Increase the victim's priority to make it die faster */
+		set_user_nice(vtsk, MIN_NICE);
+
+		/* Allow the victim to run on any CPU */
+		set_cpus_allowed_ptr(vtsk, cpu_all_mask);
+
+		/* Finally release the victim reference acquired earlier */
+		put_task_struct(vtsk);
+	}
+
+	/* Wait until all the victims die */
+	wait_for_completion(&reclaim_done);
+}
+
+/*
+ * Body of the reclaim kthread: sleep until reclaim is requested, then keep
+ * killing batches of victims for as long as needs_reclaim stays set.
+ */
+static int simple_lmk_reclaim_thread(void *data)
+{
+	static const struct sched_param sched_max_rt_prio = {
+		.sched_priority = MAX_RT_PRIO - 1
+	};
+
+	sched_setscheduler_nocheck(current, SCHED_FIFO, &sched_max_rt_prio);
+
+	while (1) {
+		bool should_stop;
+
+		wait_event(oom_waitq, (should_stop = kthread_should_stop()) ||
+				      READ_ONCE(needs_reclaim));
+
+		if (should_stop)
+			break;
+
+		/*
+		 * Kill a batch of processes and wait for their memory to be
+		 * freed. After their memory is freed, sleep for 20 ms to give
+		 * OOM'd allocations a chance to scavenge for the newly-freed
+		 * pages. Rinse and repeat while there are still OOM'd
+		 * allocations.
+		 */
+		do {
+			scan_and_kill(MIN_FREE_PAGES);
+			msleep(20);
+		} while (READ_ONCE(needs_reclaim));
+	}
+
+	return 0;
+}
+
+/*
+ * Called from balance_pgdat() with kswapd's current scan priority. Reclaim is
+ * requested when that priority equals the configured aggression level, and the
+ * reclaim thread is only woken on needs_reclaim's false->true transition.
+ */
+void simple_lmk_decide_reclaim(int kswapd_priority)
+{
+	if (kswapd_priority != CONFIG_ANDROID_SIMPLE_LMK_AGGRESSION)
+		return;
+
+	if (!cmpxchg(&needs_reclaim, false, true))
+		wake_up(&oom_waitq);
+}
+
+/* Called from kswapd_try_to_sleep() when kswapd is about to go to sleep */
+void simple_lmk_stop_reclaim(void)
+{
+	WRITE_ONCE(needs_reclaim, false);
+}
+
+/*
+ * Called from __mmput() for each mm that is torn down. When the mm belongs to
+ * a pending victim it is counted as dead, and once the last pending victim is
+ * accounted for, scan_and_kill() is woken up.
+ */
+void simple_lmk_mm_freed(struct mm_struct *mm)
+{
+	static atomic_t nr_killed = ATOMIC_INIT(0);
+	int i, nr_to_kill;
+
+	nr_to_kill = READ_ONCE(victims_to_kill);
+	for (i = 0; i < nr_to_kill; i++) {
+		if (cmpxchg(&victims[i].mm, mm, NULL) == mm) {
+			if (atomic_inc_return(&nr_killed) == nr_to_kill) {
+				WRITE_ONCE(victims_to_kill, 0);
+				atomic_set(&nr_killed, 0);
+				complete(&reclaim_done);
+			}
+			break;
+		}
+	}
+}
+
+/* Initialize Simple LMK when lmkd in Android writes to the minfree parameter */
+static int simple_lmk_init_set(const char *val, const struct kernel_param *kp)
+{
+	static bool init_done;
+	struct task_struct *thread;
+
+	if (cmpxchg(&init_done, false, true))
+		return 0;
+
+	thread = kthread_run(simple_lmk_reclaim_thread, NULL, "simple_lmkd");
+	BUG_ON(IS_ERR(thread));
+
+	return 0;
+}
+
+static const struct kernel_param_ops simple_lmk_init_ops = {
+	.set = simple_lmk_init_set
+};
+
+/* Needed to prevent Android from thinking there's no LMK and thus rebooting */
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "lowmemorykiller."
+module_param_cb(minfree, &simple_lmk_init_ops, NULL, 0200);
diff --git a/include/linux/simple_lmk.h b/include/linux/simple_lmk.h
new file mode 100644
index 000000000000..b0c247f2f2a5
--- /dev/null
+++ b/include/linux/simple_lmk.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 Sultan Alsawaf .
+ */
+#ifndef _SIMPLE_LMK_H_
+#define _SIMPLE_LMK_H_
+
+struct mm_struct;
+
+#ifdef CONFIG_ANDROID_SIMPLE_LMK
+void simple_lmk_decide_reclaim(int kswapd_priority);
+void simple_lmk_stop_reclaim(void);
+void simple_lmk_mm_freed(struct mm_struct *mm);
+#else
+static inline void simple_lmk_decide_reclaim(int kswapd_priority)
+{
+}
+static inline void simple_lmk_stop_reclaim(void)
+{
+}
+static inline void simple_lmk_mm_freed(struct mm_struct *mm)
+{
+}
+#endif
+
+#endif /* _SIMPLE_LMK_H_ */
diff --git a/kernel/fork.c b/kernel/fork.c
index 3aa6131ccfba..ef651a322bf6 100755
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -94,6 +94,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -991,6 +992,7 @@ static inline void __mmput(struct mm_struct *mm)
 	}
 	if (mm->binfmt)
 		module_put(mm->binfmt->module);
+	simple_lmk_mm_freed(mm);
 	mmdrop(mm);
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 068285f41938..46c84e8b98ac 100755
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -50,6 +50,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -3624,6 +3625,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		unsigned long nr_reclaimed = sc.nr_reclaimed;
 		bool raise_priority = true;
 
+		simple_lmk_decide_reclaim(sc.priority);
 		sc.reclaim_idx = classzone_idx;
 
 		/*
@@ -3757,6 +3759,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 	 * succeed.
 	 */
 	if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
+		simple_lmk_stop_reclaim();
 		/*
 		 * Compaction records what page blocks it recently failed to
 		 * isolate pages from and skips them in the future scanning.
@@ -3793,6 +3796,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 	 */
 	if (!remaining &&
 	    prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
+		simple_lmk_stop_reclaim();
 		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
 		/*