simple_lmk: Introduce Simple Low Memory Killer for Android

This is a complete low memory killer solution for Android that is small
and simple. Processes are killed according to the priorities that
Android gives them, so that the least important processes are always
killed first. Processes are killed until memory deficits are satisfied,
as observed from kswapd struggling to free up pages. Simple LMK stops
killing processes when kswapd finally goes back to sleep.

The only tunables are the desired amount of memory to be freed per
reclaim event and desired frequency of reclaim events. Simple LMK tries
to free at least the desired amount of memory per reclaim and waits
until all of its victims' memory is freed before proceeding to kill more
processes.

Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
fourteen
Sultan Alsawaf 6 years ago committed by Jenna
parent 64ae5e767b
commit 3b28df735e
  1. 44
      drivers/android/Kconfig
  2. 1
      drivers/android/Makefile
  3. 332
      drivers/android/simple_lmk.c
  4. 26
      include/linux/simple_lmk.h
  5. 2
      kernel/fork.c
  6. 4
      mm/vmscan.c

@ -54,6 +54,50 @@ config ANDROID_BINDER_IPC_SELFTEST
exhaustively with combinations of various buffer sizes and
alignments.
# Master switch for Simple LMK. The tunables guarded by
# "if ANDROID_SIMPLE_LMK" are only offered when this is enabled.
config ANDROID_SIMPLE_LMK
	bool "Simple Android Low Memory Killer"
	depends on !ANDROID_LOW_MEMORY_KILLER && !MEMCG
	help
	  This is a complete low memory killer solution for Android that is
	  small and simple. Processes are killed according to the priorities
	  that Android gives them, so that the least important processes are
	  always killed first. Processes are killed until memory deficits are
	  satisfied, as observed from kswapd struggling to free up pages. Simple
	  LMK stops killing processes when kswapd finally goes back to sleep.
if ANDROID_SIMPLE_LMK

# Selects the kswapd priority level at which Simple LMK starts killing.
config ANDROID_SIMPLE_LMK_AGGRESSION
	int "Reclaim frequency selection"
	range 1 3
	default 1
	help
	  This value determines how frequently Simple LMK will perform memory
	  reclaims. A lower value corresponds to less frequent reclaims, which
	  maximizes memory usage. The range of values has a logarithmic
	  correlation; 2 is twice as aggressive as 1, and 3 is twice as
	  aggressive as 2, which makes 3 four times as aggressive as 1.
	  The aggression is set as a factor of kswapd's scan depth. This means
	  that a system with more memory will have a more expensive aggression
	  factor compared to a system with less memory. For example, setting an
	  aggression factor of 1 with 4 GiB of memory would be like setting a
	  factor of 2 with 8 GiB of memory; the more memory a system has, the
	  more expensive it is to use a lower value.
	  Choosing a value of 1 here works well with systems that have 4 GiB of
	  memory. If the default doesn't work well, then this value should be
	  tweaked based on empirical results using different values.

# Per-reclaim target in MiB; the driver converts it to a page count.
config ANDROID_SIMPLE_LMK_MINFREE
	int "Minimum MiB of memory to free per reclaim"
	range 8 512
	default 100
	help
	  Simple LMK will try to free at least this much memory per reclaim.

endif
endif # if ANDROID
endmenu

@ -3,3 +3,4 @@ ccflags-y += -I$(src) # needed for trace events
obj-$(CONFIG_ANDROID_BINDERFS) += binderfs.o
obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o
obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
obj-$(CONFIG_ANDROID_SIMPLE_LMK) += simple_lmk.o

@ -0,0 +1,332 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2019 Sultan Alsawaf <sultan@kerneltoast.com>.
*/
#define pr_fmt(fmt) "simple_lmk: " fmt
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/moduleparam.h>
#include <linux/oom.h>
#include <linux/sort.h>
#include <linux/version.h>
/* The sched_param struct is located elsewhere in newer kernels */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
#include <uapi/linux/sched/types.h>
#endif
/*
 * SEND_SIG_FORCED isn't present in newer kernels, so SEND_SIG_PRIV is used
 * from 4.19 onwards. SIG_INFO_TYPE is the info argument handed to
 * do_send_sig_info() when a victim is killed.
 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 19, 0)
#define SIG_INFO_TYPE SEND_SIG_FORCED
#else
#define SIG_INFO_TYPE SEND_SIG_PRIV
#endif
/*
 * The group argument to do_send_sig_info is different in newer kernels:
 * a boolean before 4.18, a pid type from 4.18 onwards. KILL_GROUP_TYPE is
 * the value passed for that argument when a victim is killed.
 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0)
#define KILL_GROUP_TYPE true
#else
#define KILL_GROUP_TYPE PIDTYPE_TGID
#endif
/*
 * The minimum number of pages to free per reclaim: the configured MiB count
 * converted to bytes and then divided by the page size. This is the
 * pages_needed value the reclaim thread passes to scan_and_kill().
 */
#define MIN_FREE_PAGES (CONFIG_ANDROID_SIMPLE_LMK_MINFREE * SZ_1M / PAGE_SIZE)
/*
 * Kill up to this many victims per reclaim. This is also the number of
 * elements in the victims array and the limit handed to find_victims().
 */
#define MAX_VICTIMS 1024
/* One candidate process recorded by find_victims() */
struct victim_info {
/* Task returned by find_lock_task_mm() for the candidate process */
struct task_struct *tsk;
/* The task's mm; simple_lmk_mm_freed() matches on it and clears it */
struct mm_struct *mm;
/* get_mm_rss() of the mm at scan time (logged in KiB via PAGE_SHIFT) */
unsigned long size;
};
/*
 * Pulled from the Android framework. Lower adj means higher priority.
 *
 * scan_and_kill() walks this table from the first element onwards and hands
 * each value to find_victims() as the oom_score_adj to match, so the table
 * order is the order in which adj levels are searched for victims.
 */
static const short adj_prio[] = {
906, /* CACHED_APP_MAX_ADJ */
905, /* Cached app */
904, /* Cached app */
903, /* Cached app */
902, /* Cached app */
901, /* Cached app */
900, /* CACHED_APP_MIN_ADJ */
800, /* SERVICE_B_ADJ */
700, /* PREVIOUS_APP_ADJ */
600, /* HOME_APP_ADJ */
500, /* SERVICE_ADJ */
400, /* HEAVY_WEIGHT_APP_ADJ */
300, /* BACKUP_APP_ADJ */
200, /* PERCEPTIBLE_APP_ADJ */
100, /* VISIBLE_APP_ADJ */
0 /* FOREGROUND_APP_ADJ */
};
/* Victim table filled by scan_and_kill() and read by simple_lmk_mm_freed() */
static struct victim_info victims[MAX_VICTIMS];
/* The reclaim thread sleeps here; simple_lmk_decide_reclaim() wakes it */
static DECLARE_WAIT_QUEUE_HEAD(oom_waitq);
/* Completed by simple_lmk_mm_freed(); scan_and_kill() waits on it */
static DECLARE_COMPLETION(reclaim_done);
/* Size of the batch being killed; reset to 0 once every victim mm is freed */
static int victims_to_kill;
/* Set by simple_lmk_decide_reclaim(), cleared by simple_lmk_stop_reclaim() */
static bool needs_reclaim;
/*
 * sort() comparator that orders victim_info entries by descending size.
 *
 * Returns a positive value when @rhs_ptr is larger than @lhs_ptr, a negative
 * value when it is smaller, and 0 when both sizes are equal. The sizes are
 * compared explicitly rather than subtracted: the difference of two
 * unsigned longs does not fit in the int return type, and narrowing it could
 * yield the wrong sign for large differences.
 */
static int victim_size_cmp(const void *lhs_ptr, const void *rhs_ptr)
{
	const struct victim_info *lhs = lhs_ptr;
	const struct victim_info *rhs = rhs_ptr;

	if (rhs->size > lhs->size)
		return 1;
	if (rhs->size < lhs->size)
		return -1;
	return 0;
}
/*
 * Check whether @vtsk belongs to the same thread group as any task already
 * stored in the first @vlen slots of @varr.
 *
 * Returns true on the first match, false when no slot matches.
 */
static bool vtsk_is_duplicate(struct victim_info *varr, int vlen,
			      struct task_struct *vtsk)
{
	int slot = 0;

	while (slot < vlen) {
		if (same_thread_group(varr[slot].tsk, vtsk))
			return true;
		slot++;
	}

	return false;
}
/*
 * Collect every process whose oom_score_adj equals @target_adj.
 *
 * @varr:       victim array to append to
 * @vindex:     in/out index of the next free slot in @varr
 * @vmaxlen:    capacity of @varr; scanning stops once *@vindex reaches it
 * @target_adj: the oom_score_adj value to look for
 *
 * Each victim appended is the task returned by find_lock_task_mm(), and it is
 * left locked; the callers release that lock later. The entries appended by
 * this call are sorted by descending size before returning.
 *
 * Returns the sum of the sizes (get_mm_rss()) of the victims appended by this
 * call.
 */
static unsigned long find_victims(struct victim_info *varr, int *vindex,
				  int vmaxlen, short target_adj)
{
	unsigned long pages_found = 0;
	int old_vindex = *vindex;
	struct task_struct *tsk;

	for_each_process(tsk) {
		struct task_struct *vtsk;
		unsigned long tasksize;

		/*
		 * Search for tasks with the targeted importance (adj). Since
		 * only tasks with a positive adj can be targeted, that
		 * naturally excludes tasks which shouldn't be killed, like init
		 * and kthreads. Although oom_score_adj can still be changed
		 * while this code runs, it doesn't really matter. We just need
		 * to make sure that if the adj changes, we won't deadlock
		 * trying to lock a task that we locked earlier.
		 */
		if (READ_ONCE(tsk->signal->oom_score_adj) != target_adj ||
		    vtsk_is_duplicate(varr, *vindex, tsk))
			continue;

		vtsk = find_lock_task_mm(tsk);
		if (!vtsk)
			continue;

		/*
		 * Read the size once so that the stored value and the running
		 * total below are guaranteed to agree.
		 */
		tasksize = get_mm_rss(vtsk->mm);

		/* Store this potential victim away for later */
		varr[*vindex].tsk = vtsk;
		varr[*vindex].mm = vtsk->mm;
		varr[*vindex].size = tasksize;

		/* Keep track of the number of pages that have been found */
		pages_found += tasksize;

		/* Make sure there's space left in the victim array */
		if (++*vindex == vmaxlen)
			break;
	}

	/*
	 * Sort the victims in descending order of size to prioritize killing
	 * the larger ones first.
	 */
	if (pages_found)
		sort(&varr[old_vindex], *vindex - old_vindex, sizeof(*varr),
		     victim_size_cmp, NULL);

	return pages_found;
}
/*
 * Decide how many of the first @vlen entries of @varr have to be killed to
 * reach @pages_needed, and release the ones that get to live.
 *
 * Entries are taken in array order and counted as kill targets until their
 * cumulative size reaches @pages_needed. Every entry after that point has
 * its task lock released. The entries kept therefore form a prefix of the
 * array and remain locked.
 *
 * Returns the number of entries kept for killing.
 */
static int process_victims(struct victim_info *varr, int vlen,
			   unsigned long pages_needed)
{
	unsigned long pages_found = 0;
	int i, nr_to_kill = 0;

	/*
	 * Calculate the number of tasks that need to be killed and quickly
	 * release the references to those that'll live.
	 */
	for (i = 0; i < vlen; i++) {
		/* Use the array that was passed in, not the global table */
		struct victim_info *victim = &varr[i];
		struct task_struct *vtsk = victim->tsk;

		/* The victim's mm lock is taken in find_victims; release it */
		if (pages_found >= pages_needed) {
			task_unlock(vtsk);
			continue;
		}

		pages_found += victim->size;
		nr_to_kill++;
	}

	return nr_to_kill;
}
static void scan_and_kill(unsigned long pages_needed)
{
int i, nr_to_kill = 0, nr_victims = 0;
unsigned long pages_found = 0;
/*
* Hold the tasklist lock so tasks don't disappear while scanning. This
* is preferred to holding an RCU read lock so that the list of tasks
* is guaranteed to be up to date.
*/
read_lock(&tasklist_lock);
for (i = 0; i < ARRAY_SIZE(adj_prio); i++) {
pages_found += find_victims(victims, &nr_victims, MAX_VICTIMS,
adj_prio[i]);
if (pages_found >= pages_needed || nr_victims == MAX_VICTIMS)
break;
}
read_unlock(&tasklist_lock);
/* Pretty unlikely but it can happen */
if (unlikely(!nr_victims))
return;
/* First round of victim processing to weed out unneeded victims */
nr_to_kill = process_victims(victims, nr_victims, pages_needed);
/*
* Try to kill as few of the chosen victims as possible by sorting the
* chosen victims by size, which means larger victims that have a lower
* adj can be killed in place of smaller victims with a high adj.
*/
sort(victims, nr_to_kill, sizeof(*victims), victim_size_cmp, NULL);
/* Second round of victim processing to finally select the victims */
nr_to_kill = process_victims(victims, nr_to_kill, pages_needed);
/* Kill the victims */
WRITE_ONCE(victims_to_kill, nr_to_kill);
for (i = 0; i < nr_to_kill; i++) {
struct victim_info *victim = &victims[i];
struct task_struct *vtsk = victim->tsk;
pr_info("Killing %s with adj %d to free %lu KiB\n", vtsk->comm,
vtsk->signal->oom_score_adj,
victim->size << (PAGE_SHIFT - 10));
/* Accelerate the victim's death by forcing the kill signal */
do_send_sig_info(SIGKILL, SIG_INFO_TYPE, vtsk, KILL_GROUP_TYPE);
/* Grab a reference to the victim for later before unlocking */
get_task_struct(vtsk);
task_unlock(vtsk);
}
/* Try to speed up the death process now that we can schedule again */
for (i = 0; i < nr_to_kill; i++) {
struct task_struct *vtsk = victims[i].tsk;
/* Increase the victim's priority to make it die faster */
set_user_nice(vtsk, MIN_NICE);
/* Allow the victim to run on any CPU */
set_cpus_allowed_ptr(vtsk, cpu_all_mask);
/* Finally release the victim reference acquired earlier */
put_task_struct(vtsk);
}
/* Wait until all the victims die */
wait_for_completion(&reclaim_done);
}
static int simple_lmk_reclaim_thread(void *data)
{
static const struct sched_param sched_max_rt_prio = {
.sched_priority = MAX_RT_PRIO - 1
};
sched_setscheduler_nocheck(current, SCHED_FIFO, &sched_max_rt_prio);
while (1) {
bool should_stop;
wait_event(oom_waitq, (should_stop = kthread_should_stop()) ||
READ_ONCE(needs_reclaim));
if (should_stop)
break;
/*
* Kill a batch of processes and wait for their memory to be
* freed. After their memory is freed, sleep for 20 ms to give
* OOM'd allocations a chance to scavenge for the newly-freed
* pages. Rinse and repeat while there are still OOM'd
* allocations.
*/
do {
scan_and_kill(MIN_FREE_PAGES);
msleep(20);
} while (READ_ONCE(needs_reclaim));
}
return 0;
}
/*
 * Request a reclaim when @kswapd_priority matches the configured aggression
 * level. The reclaim thread is woken only by the caller that actually flips
 * needs_reclaim from false to true.
 */
void simple_lmk_decide_reclaim(int kswapd_priority)
{
	if (kswapd_priority == CONFIG_ANDROID_SIMPLE_LMK_AGGRESSION &&
	    !cmpxchg(&needs_reclaim, false, true))
		wake_up(&oom_waitq);
}
/*
 * Withdraw the reclaim request. The reclaim thread re-reads needs_reclaim
 * after each kill batch and stops looping once it sees false.
 */
void simple_lmk_stop_reclaim(void)
{
WRITE_ONCE(needs_reclaim, false);
}
/*
 * Note that @mm has been torn down. If it belongs to a victim of the batch
 * currently being killed, count it. Once every victim of the batch has been
 * counted, reset the bookkeeping and complete reclaim_done.
 */
void simple_lmk_mm_freed(struct mm_struct *mm)
{
	static atomic_t nr_killed = ATOMIC_INIT(0);
	int i, nr_to_kill;

	nr_to_kill = READ_ONCE(victims_to_kill);
	for (i = 0; i < nr_to_kill; i++) {
		/* Claim the slot atomically so each victim is counted once */
		if (cmpxchg(&victims[i].mm, mm, NULL) != mm)
			continue;

		if (atomic_inc_return(&nr_killed) == nr_to_kill) {
			WRITE_ONCE(victims_to_kill, 0);
			/* Reset through the atomic API, not a struct copy */
			atomic_set(&nr_killed, 0);
			complete(&reclaim_done);
		}
		break;
	}
}
/*
 * Setter for the minfree parameter; Simple LMK is initialized when lmkd in
 * Android writes to it. The first write starts the "simple_lmkd" reclaim
 * thread and every later write is a no-op. The value written (@val) is not
 * looked at.
 *
 * Always returns 0; failing to start the thread trips BUG_ON().
 */
static int simple_lmk_init_set(const char *val, const struct kernel_param *kp)
{
	static bool init_done;
	struct task_struct *thread;

	if (!cmpxchg(&init_done, false, true)) {
		thread = kthread_run(simple_lmk_reclaim_thread, NULL, "simple_lmkd");
		BUG_ON(IS_ERR(thread));
	}

	return 0;
}
/* Parameter ops with only a setter; writes are routed to simple_lmk_init_set */
static const struct kernel_param_ops simple_lmk_init_ops = {
.set = simple_lmk_init_set
};
/*
 * Needed to prevent Android from thinking there's no LMK and thus rebooting.
 * The parameter prefix is overridden so that the parameter is registered as
 * "lowmemorykiller.minfree". Mode 0200 grants write permission to the owner
 * only.
 */
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "lowmemorykiller."
module_param_cb(minfree, &simple_lmk_init_ops, NULL, 0200);

@ -0,0 +1,26 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2019 Sultan Alsawaf <sultan@kerneltoast.com>.
*/
#ifndef _SIMPLE_LMK_H_
#define _SIMPLE_LMK_H_
/* Forward declaration; this header only needs a pointer to the type */
struct mm_struct;
#ifdef CONFIG_ANDROID_SIMPLE_LMK
/* Takes the current kswapd scan priority and may request a reclaim */
void simple_lmk_decide_reclaim(int kswapd_priority);
/* Withdraws any pending reclaim request */
void simple_lmk_stop_reclaim(void);
/* Reports that @mm has been torn down */
void simple_lmk_mm_freed(struct mm_struct *mm);
#else
/* Empty stubs so that call sites need no #ifdef when Simple LMK is disabled */
static inline void simple_lmk_decide_reclaim(int kswapd_priority)
{
}
static inline void simple_lmk_stop_reclaim(void)
{
}
static inline void simple_lmk_mm_freed(struct mm_struct *mm)
{
}
#endif
#endif /* _SIMPLE_LMK_H_ */

@ -94,6 +94,7 @@
#include <linux/thread_info.h>
#include <linux/cpufreq_times.h>
#include <linux/scs.h>
#include <linux/simple_lmk.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@ -991,6 +992,7 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
simple_lmk_mm_freed(mm);
mmdrop(mm);
}

@ -50,6 +50,7 @@
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>
#include <linux/simple_lmk.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@ -3624,6 +3625,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
unsigned long nr_reclaimed = sc.nr_reclaimed;
bool raise_priority = true;
simple_lmk_decide_reclaim(sc.priority);
sc.reclaim_idx = classzone_idx;
/*
@ -3757,6 +3759,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
* succeed.
*/
if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
simple_lmk_stop_reclaim();
/*
* Compaction records what page blocks it recently failed to
* isolate pages from and skips them in the future scanning.
@ -3793,6 +3796,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
*/
if (!remaining &&
prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
simple_lmk_stop_reclaim();
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*

Loading…
Cancel
Save