diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig
index 3a90d51d8419..03dbb1d10906 100755
--- a/drivers/android/Kconfig
+++ b/drivers/android/Kconfig
@@ -54,6 +54,50 @@ config ANDROID_BINDER_IPC_SELFTEST
 	  exhaustively with combinations of various buffer sizes and
 	  alignments.
 
+config ANDROID_SIMPLE_LMK
+	bool "Simple Android Low Memory Killer"
+	depends on !ANDROID_LOW_MEMORY_KILLER && !MEMCG
+	---help---
+	  This is a complete low memory killer solution for Android that is
+	  small and simple. Processes are killed according to the priorities
+	  that Android gives them, so that the least important processes are
+	  always killed first. Processes are killed until memory deficits are
+	  satisfied, as observed from kswapd struggling to free up pages. Simple
+	  LMK stops killing processes when kswapd finally goes back to sleep.
+
+if ANDROID_SIMPLE_LMK
+
+config ANDROID_SIMPLE_LMK_AGGRESSION
+	int "Reclaim frequency selection"
+	range 1 3
+	default 1
+	help
+	  This value determines how frequently Simple LMK will perform memory
+	  reclaims. A lower value corresponds to less frequent reclaims, which
+	  maximizes memory usage. The range of values has a logarithmic
+	  correlation; 2 is twice as aggressive as 1, and 3 is twice as
+	  aggressive as 2, which makes 3 four times as aggressive as 1.
+
+	  The aggression is set as a factor of kswapd's scan depth. This means
+	  that a system with more memory will have a more expensive aggression
+	  factor compared to a system with less memory. For example, setting an
+	  aggression factor of 1 with 4 GiB of memory would be like setting a
+	  factor of 2 with 8 GiB of memory; the more memory a system has, the
+	  more expensive it is to use a lower value.
+
+	  Choosing a value of 1 here works well with systems that have 4 GiB of
+	  memory. If the default doesn't work well, then this value should be
+	  tweaked based on empirical results using different values.
+
+config ANDROID_SIMPLE_LMK_MINFREE
+	int "Minimum MiB of memory to free per reclaim"
+	range 8 512
+	default 100
+	help
+	  Simple LMK will try to free at least this much memory per reclaim.
+
+endif
+
 endif # if ANDROID
 
 endmenu
diff --git a/drivers/android/Makefile b/drivers/android/Makefile
index c7856e3200da..7c91293b6d59 100755
--- a/drivers/android/Makefile
+++ b/drivers/android/Makefile
@@ -3,3 +3,4 @@ ccflags-y += -I$(src)			# needed for trace events
 obj-$(CONFIG_ANDROID_BINDERFS)		+= binderfs.o
 obj-$(CONFIG_ANDROID_BINDER_IPC)	+= binder.o binder_alloc.o
 obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
+obj-$(CONFIG_ANDROID_SIMPLE_LMK)	+= simple_lmk.o
diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c
new file mode 100644
index 000000000000..29637ecc5be5
--- /dev/null
+++ b/drivers/android/simple_lmk.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Sultan Alsawaf.
+ */
+
+#define pr_fmt(fmt) "simple_lmk: " fmt
+
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/mm.h>
+#include <linux/moduleparam.h>
+#include <linux/oom.h>
+#include <linux/sort.h>
+#include <linux/version.h>
+
+/* The sched_param struct is located elsewhere in newer kernels */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
+#include <uapi/linux/sched/types.h>
+#endif
+
+/* SEND_SIG_FORCED isn't present in newer kernels */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 19, 0)
+#define SIG_INFO_TYPE SEND_SIG_FORCED
+#else
+#define SIG_INFO_TYPE SEND_SIG_PRIV
+#endif
+
+/* The group argument to do_send_sig_info is different in newer kernels */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0)
+#define KILL_GROUP_TYPE true
+#else
+#define KILL_GROUP_TYPE PIDTYPE_TGID
+#endif
+
+/* The minimum number of pages to free per reclaim */
+#define MIN_FREE_PAGES (CONFIG_ANDROID_SIMPLE_LMK_MINFREE * SZ_1M / PAGE_SIZE)
+
+/* Kill up to this many victims per reclaim */
+#define MAX_VICTIMS 1024
+
+struct victim_info {
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	unsigned long size;
+};
+
+/* Pulled from the Android framework. Lower adj means higher priority. */
+static const short adj_prio[] = {
+	906, /* CACHED_APP_MAX_ADJ */
+	905, /* Cached app */
+	904, /* Cached app */
+	903, /* Cached app */
+	902, /* Cached app */
+	901, /* Cached app */
+	900, /* CACHED_APP_MIN_ADJ */
+	800, /* SERVICE_B_ADJ */
+	700, /* PREVIOUS_APP_ADJ */
+	600, /* HOME_APP_ADJ */
+	500, /* SERVICE_ADJ */
+	400, /* HEAVY_WEIGHT_APP_ADJ */
+	300, /* BACKUP_APP_ADJ */
+	200, /* PERCEPTIBLE_APP_ADJ */
+	100, /* VISIBLE_APP_ADJ */
+	0    /* FOREGROUND_APP_ADJ */
+};
+
+static struct victim_info victims[MAX_VICTIMS];
+static DECLARE_WAIT_QUEUE_HEAD(oom_waitq);
+static DECLARE_COMPLETION(reclaim_done);
+static int victims_to_kill;
+static bool needs_reclaim;
+
+static int victim_size_cmp(const void *lhs_ptr, const void *rhs_ptr)
+{
+	const struct victim_info *lhs = (typeof(lhs))lhs_ptr;
+	const struct victim_info *rhs = (typeof(rhs))rhs_ptr;
+
+	return rhs->size - lhs->size;
+}
+
+static bool vtsk_is_duplicate(struct victim_info *varr, int vlen,
+			      struct task_struct *vtsk)
+{
+	int i;
+
+	for (i = 0; i < vlen; i++) {
+		if (same_thread_group(varr[i].tsk, vtsk))
+			return true;
+	}
+
+	return false;
+}
+
+static unsigned long find_victims(struct victim_info *varr, int *vindex,
+				  int vmaxlen, short target_adj)
+{
+	unsigned long pages_found = 0;
+	int old_vindex = *vindex;
+	struct task_struct *tsk;
+
+	for_each_process(tsk) {
+		struct task_struct *vtsk;
+		unsigned long tasksize;
+
+		/*
+		 * Search for tasks with the targeted importance (adj). Since
+		 * only tasks with a positive adj can be targeted, that
+		 * naturally excludes tasks which shouldn't be killed, like init
+		 * and kthreads. Although oom_score_adj can still be changed
+		 * while this code runs, it doesn't really matter. We just need
+		 * to make sure that if the adj changes, we won't deadlock
+		 * trying to lock a task that we locked earlier.
+		 */
+		if (READ_ONCE(tsk->signal->oom_score_adj) != target_adj ||
+		    vtsk_is_duplicate(varr, *vindex, tsk))
+			continue;
+
+		vtsk = find_lock_task_mm(tsk);
+		if (!vtsk)
+			continue;
+
+		/* Store this potential victim away for later */
+		tasksize = get_mm_rss(vtsk->mm);
+		varr[*vindex].tsk = vtsk;
+		varr[*vindex].mm = vtsk->mm;
+		varr[*vindex].size = tasksize;
+
+		/* Keep track of the number of pages that have been found */
+		pages_found += tasksize;
+
+		/* Make sure there's space left in the victim array */
+		if (++*vindex == vmaxlen)
+			break;
+	}
+
+	/*
+	 * Sort the victims in descending order of size to prioritize killing
+	 * the larger ones first.
+	 */
+	if (pages_found)
+		sort(&varr[old_vindex], *vindex - old_vindex, sizeof(*varr),
+		     victim_size_cmp, NULL);
+
+	return pages_found;
+}
+
+static int process_victims(struct victim_info *varr, int vlen,
+			   unsigned long pages_needed)
+{
+	unsigned long pages_found = 0;
+	int i, nr_to_kill = 0;
+
+	/*
+	 * Calculate the number of tasks that need to be killed and quickly
+	 * release the references to those that'll live.
+	 */
+	for (i = 0; i < vlen; i++) {
+		struct victim_info *victim = &victims[i];
+		struct task_struct *vtsk = victim->tsk;
+
+		/* The victim's mm lock is taken in find_victims; release it */
+		if (pages_found >= pages_needed) {
+			task_unlock(vtsk);
+			continue;
+		}
+
+		pages_found += victim->size;
+		nr_to_kill++;
+	}
+
+	return nr_to_kill;
+}
+
+static void scan_and_kill(unsigned long pages_needed)
+{
+	int i, nr_to_kill = 0, nr_victims = 0;
+	unsigned long pages_found = 0;
+
+	/*
+	 * Hold the tasklist lock so tasks don't disappear while scanning. This
+	 * is preferred to holding an RCU read lock so that the list of tasks
+	 * is guaranteed to be up to date.
+	 */
+	read_lock(&tasklist_lock);
+	for (i = 0; i < ARRAY_SIZE(adj_prio); i++) {
+		pages_found += find_victims(victims, &nr_victims, MAX_VICTIMS,
+					    adj_prio[i]);
+		if (pages_found >= pages_needed || nr_victims == MAX_VICTIMS)
+			break;
+	}
+	read_unlock(&tasklist_lock);
+
+	/* Pretty unlikely but it can happen */
+	if (unlikely(!nr_victims))
+		return;
+
+	/* First round of victim processing to weed out unneeded victims */
+	nr_to_kill = process_victims(victims, nr_victims, pages_needed);
+
+	/*
+	 * Try to kill as few of the chosen victims as possible by sorting the
+	 * chosen victims by size, which means larger victims that have a lower
+	 * adj can be killed in place of smaller victims with a high adj.
+	 */
+	sort(victims, nr_to_kill, sizeof(*victims), victim_size_cmp, NULL);
+
+	/* Second round of victim processing to finally select the victims */
+	nr_to_kill = process_victims(victims, nr_to_kill, pages_needed);
+
+	/* Kill the victims */
+	WRITE_ONCE(victims_to_kill, nr_to_kill);
+	for (i = 0; i < nr_to_kill; i++) {
+		struct victim_info *victim = &victims[i];
+		struct task_struct *vtsk = victim->tsk;
+
+		pr_info("Killing %s with adj %d to free %lu KiB\n", vtsk->comm,
+			vtsk->signal->oom_score_adj,
+			victim->size << (PAGE_SHIFT - 10));
+
+		/* Accelerate the victim's death by forcing the kill signal */
+		do_send_sig_info(SIGKILL, SIG_INFO_TYPE, vtsk, KILL_GROUP_TYPE);
+
+		/* Grab a reference to the victim for later before unlocking */
+		get_task_struct(vtsk);
+		task_unlock(vtsk);
+	}
+
+	/* Try to speed up the death process now that we can schedule again */
+	for (i = 0; i < nr_to_kill; i++) {
+		struct task_struct *vtsk = victims[i].tsk;
+
+		/* Increase the victim's priority to make it die faster */
+		set_user_nice(vtsk, MIN_NICE);
+
+		/* Allow the victim to run on any CPU */
+		set_cpus_allowed_ptr(vtsk, cpu_all_mask);
+
+		/* Finally release the victim reference acquired earlier */
+		put_task_struct(vtsk);
+	}
+
+	/* Wait until all the victims die */
+	wait_for_completion(&reclaim_done);
+}
+
+static int simple_lmk_reclaim_thread(void *data)
+{
+	static const struct sched_param sched_max_rt_prio = {
+		.sched_priority = MAX_RT_PRIO - 1
+	};
+
+	sched_setscheduler_nocheck(current, SCHED_FIFO, &sched_max_rt_prio);
+
+	while (1) {
+		bool should_stop;
+
+		wait_event(oom_waitq, (should_stop = kthread_should_stop()) ||
+			   READ_ONCE(needs_reclaim));
+
+		if (should_stop)
+			break;
+
+		/*
+		 * Kill a batch of processes and wait for their memory to be
+		 * freed. After their memory is freed, sleep for 20 ms to give
+		 * OOM'd allocations a chance to scavenge for the newly-freed
+		 * pages. Rinse and repeat while there are still OOM'd
+		 * allocations.
+		 */
+		do {
+			scan_and_kill(MIN_FREE_PAGES);
+			msleep(20);
+		} while (READ_ONCE(needs_reclaim));
+	}
+
+	return 0;
+}
+
+void simple_lmk_decide_reclaim(int kswapd_priority)
+{
+	if (kswapd_priority != CONFIG_ANDROID_SIMPLE_LMK_AGGRESSION)
+		return;
+
+	if (!cmpxchg(&needs_reclaim, false, true))
+		wake_up(&oom_waitq);
+}
+
+void simple_lmk_stop_reclaim(void)
+{
+	WRITE_ONCE(needs_reclaim, false);
+}
+
+void simple_lmk_mm_freed(struct mm_struct *mm)
+{
+	static atomic_t nr_killed = ATOMIC_INIT(0);
+	int i, nr_to_kill;
+
+	nr_to_kill = READ_ONCE(victims_to_kill);
+	for (i = 0; i < nr_to_kill; i++) {
+		if (cmpxchg(&victims[i].mm, mm, NULL) == mm) {
+			if (atomic_inc_return(&nr_killed) == nr_to_kill) {
+				WRITE_ONCE(victims_to_kill, 0);
+				nr_killed = (atomic_t)ATOMIC_INIT(0);
+				complete(&reclaim_done);
+			}
+			break;
+		}
+	}
+}
+
+/* Initialize Simple LMK when lmkd in Android writes to the minfree parameter */
+static int simple_lmk_init_set(const char *val, const struct kernel_param *kp)
+{
+	static bool init_done;
+	struct task_struct *thread;
+
+	if (cmpxchg(&init_done, false, true))
+		return 0;
+
+	thread = kthread_run(simple_lmk_reclaim_thread, NULL, "simple_lmkd");
+	BUG_ON(IS_ERR(thread));
+
+	return 0;
+}
+
+static const struct kernel_param_ops simple_lmk_init_ops = {
+	.set = simple_lmk_init_set
+};
+
+/* Needed to prevent Android from thinking there's no LMK and thus rebooting */
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "lowmemorykiller."
+module_param_cb(minfree, &simple_lmk_init_ops, NULL, 0200);
diff --git a/include/linux/simple_lmk.h b/include/linux/simple_lmk.h
new file mode 100644
index 000000000000..b0c247f2f2a5
--- /dev/null
+++ b/include/linux/simple_lmk.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 Sultan Alsawaf.
+ */
+#ifndef _SIMPLE_LMK_H_
+#define _SIMPLE_LMK_H_
+
+struct mm_struct;
+
+#ifdef CONFIG_ANDROID_SIMPLE_LMK
+void simple_lmk_decide_reclaim(int kswapd_priority);
+void simple_lmk_stop_reclaim(void);
+void simple_lmk_mm_freed(struct mm_struct *mm);
+#else
+static inline void simple_lmk_decide_reclaim(int kswapd_priority)
+{
+}
+static inline void simple_lmk_stop_reclaim(void)
+{
+}
+static inline void simple_lmk_mm_freed(struct mm_struct *mm)
+{
+}
+#endif
+
+#endif /* _SIMPLE_LMK_H_ */
diff --git a/kernel/fork.c b/kernel/fork.c
index 3aa6131ccfba..ef651a322bf6 100755
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -94,6 +94,7 @@
 #include
 #include
 #include
+#include <linux/simple_lmk.h>
 
 #include
 #include
@@ -991,6 +992,7 @@ static inline void __mmput(struct mm_struct *mm)
 	}
 	if (mm->binfmt)
 		module_put(mm->binfmt->module);
+	simple_lmk_mm_freed(mm);
 	mmdrop(mm);
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 068285f41938..46c84e8b98ac 100755
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -50,6 +50,7 @@
 #include
 #include
 #include
+#include <linux/simple_lmk.h>
 
 #include
 #include
@@ -3624,6 +3625,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		unsigned long nr_reclaimed = sc.nr_reclaimed;
 		bool raise_priority = true;
 
+		simple_lmk_decide_reclaim(sc.priority);
 		sc.reclaim_idx = classzone_idx;
 
 		/*
@@ -3757,6 +3759,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 	 * succeed.
 	 */
 	if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
+		simple_lmk_stop_reclaim();
 		/*
 		 * Compaction records what page blocks it recently failed to
 		 * isolate pages from and skips them in the future scanning.
@@ -3793,6 +3796,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 	 */
 	if (!remaining &&
 	    prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
+		simple_lmk_stop_reclaim();
 		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
 		/*