kernel_samsung_sm7125/drivers/android/simple_lmk.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2019-2023 Sultan Alsawaf <sultan@kerneltoast.com>.
 */

#define pr_fmt(fmt) "simple_lmk: " fmt

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/moduleparam.h>
#include <linux/oom.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sort.h>
#include <linux/vmpressure.h>
#include <uapi/linux/sched/types.h>

/* The minimum number of pages to free per reclaim */
#define MIN_FREE_PAGES (CONFIG_ANDROID_SIMPLE_LMK_MINFREE * SZ_1M / PAGE_SIZE)

/* Kill up to this many victims per reclaim */
#define MAX_VICTIMS 1024

/* Timeout in jiffies for each reclaim */
#define RECLAIM_EXPIRES msecs_to_jiffies(CONFIG_ANDROID_SIMPLE_LMK_TIMEOUT_MSEC)

struct victim_info {
        struct task_struct *tsk;
        struct mm_struct *mm;
        unsigned long size;
};

static struct victim_info victims[MAX_VICTIMS] __cacheline_aligned_in_smp;
static struct task_struct *task_bucket[SHRT_MAX + 1] __cacheline_aligned;
static DECLARE_WAIT_QUEUE_HEAD(oom_waitq);
static DECLARE_WAIT_QUEUE_HEAD(reaper_waitq);
static DECLARE_COMPLETION(reclaim_done);
static __cacheline_aligned_in_smp DEFINE_RWLOCK(mm_free_lock);
static int nr_victims;
static bool reclaim_active;
static atomic_t needs_reclaim = ATOMIC_INIT(0);
static atomic_t needs_reap = ATOMIC_INIT(0);
static atomic_t nr_killed = ATOMIC_INIT(0);
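
/* sort() callbacks used to order victims by descending size */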
static int victim_cmp(const void *lhs_ptr, const void *rhs_ptr)
{
        const struct victim_info *lhs = (typeof(lhs))lhs_ptr;
        const struct victim_info *rhs = (typeof(rhs))rhs_ptr;

        return rhs->size - lhs->size;
}

static void victim_swap(void *lhs_ptr, void *rhs_ptr, int size)
{
        struct victim_info *lhs = (typeof(lhs))lhs_ptr;
        struct victim_info *rhs = (typeof(rhs))rhs_ptr;

        swap(*lhs, *rhs);
}
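
/* Sum every RSS counter in the mm to estimate its total memory usage */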
static unsigned long get_total_mm_pages(struct mm_struct *mm)
{
        unsigned long pages = 0;
        int i;

        for (i = 0; i < NR_MM_COUNTERS; i++)
                pages += get_mm_counter(mm, i);

        return pages;
}
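
/*
 * Bucket eligible processes by oom_score_adj, then gather victims into the
 * victims array starting from the highest adj (least important) until either
 * enough pages are found or the array is full. Returns the pages found.
 */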
static unsigned long find_victims(int *vindex)
{
        short i, min_adj = SHRT_MAX, max_adj = 0;
        unsigned long pages_found = 0;
        struct task_struct *tsk;

        rcu_read_lock();
        for_each_process(tsk) {
                struct signal_struct *sig;
                short adj;

                /*
                 * Search for suitable tasks with a non-negative adj
                 * (importance). Since only tasks with a non-negative adj can
                 * be targeted, that naturally excludes tasks which shouldn't
                 * be killed, like init and kthreads. Although oom_score_adj
                 * can still be changed while this code runs, it doesn't
                 * really matter; we just need a snapshot of the task's adj.
                 */
                sig = tsk->signal;
                adj = READ_ONCE(sig->oom_score_adj);
                if (adj < 0 ||
                    sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP) ||
                    (thread_group_empty(tsk) && tsk->flags & PF_EXITING))
                        continue;

                /* Store the task in a linked-list bucket based on its adj */
                tsk->simple_lmk_next = task_bucket[adj];
                task_bucket[adj] = tsk;

                /* Track the min and max adjs to speed up the loop below */
                if (adj > max_adj)
                        max_adj = adj;
                if (adj < min_adj)
                        min_adj = adj;
        }

        /* Start searching for victims from the highest adj (least important) */
        for (i = max_adj; i >= min_adj; i--) {
                int old_vindex;

                tsk = task_bucket[i];
                if (!tsk)
                        continue;

                /* Clear out this bucket for the next time reclaim is done */
                task_bucket[i] = NULL;

                /* Iterate through every task with this adj */
                old_vindex = *vindex;
                do {
                        struct task_struct *vtsk;

                        vtsk = find_lock_task_mm(tsk);
                        if (!vtsk)
                                continue;

                        /* Store this potential victim away for later */
                        victims[*vindex].tsk = vtsk;
                        victims[*vindex].mm = vtsk->mm;
                        victims[*vindex].size = get_total_mm_pages(vtsk->mm);

                        /* Count the number of pages that have been found */
                        pages_found += victims[*vindex].size;

                        /* Make sure there's space left in the victim array */
                        if (++*vindex == MAX_VICTIMS)
                                break;
                } while ((tsk = tsk->simple_lmk_next));

                /* Go to the next bucket if nothing was found */
                if (*vindex == old_vindex)
                        continue;

                /*
                 * Sort the victims in descending order of size to prioritize
                 * killing the larger ones first.
                 */
                sort(&victims[old_vindex], *vindex - old_vindex,
                     sizeof(*victims), victim_cmp, victim_swap);

                /* Stop when we are out of space or have enough pages found */
                if (*vindex == MAX_VICTIMS || pages_found >= MIN_FREE_PAGES) {
                        /* Zero out any remaining buckets we didn't touch */
                        if (i > min_adj)
                                memset(&task_bucket[min_adj], 0,
                                       (i - min_adj) * sizeof(*task_bucket));
                        break;
                }
        }
        rcu_read_unlock();

        return pages_found;
}
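
/*
 * Walk the victims gathered by find_victims() and decide how many actually
 * need to be killed to satisfy MIN_FREE_PAGES, unlocking the tasks that will
 * be spared.
 */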
static int process_victims(int vlen)
{
        unsigned long pages_found = 0;
        int i, nr_to_kill = 0;

        /*
         * Calculate the number of tasks that need to be killed and quickly
         * release the references to those that'll live.
         */
        for (i = 0; i < vlen; i++) {
                struct victim_info *victim = &victims[i];
                struct task_struct *vtsk = victim->tsk;

                /* The victim's mm lock is taken in find_victims; release it */
                if (pages_found >= MIN_FREE_PAGES) {
                        task_unlock(vtsk);
                } else {
                        pages_found += victim->size;
                        nr_to_kill++;
                }
        }

        return nr_to_kill;
}
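
/* Switch a task to SCHED_RR at the given realtime priority */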
static void set_task_rt_prio(struct task_struct *tsk, int priority)
{
        const struct sched_param rt_prio = {
                .sched_priority = priority
        };

        sched_setscheduler_nocheck(tsk, SCHED_RR, &rt_prio);
}
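
/*
 * Core reclaim path: gather victims, trim the selection down to the minimum
 * needed, deliver SIGKILL to each victim, and then wait for the victims to
 * die or for the reclaim timeout to expire.
 */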
static void scan_and_kill(void)
{
        int i, nr_to_kill, nr_found = 0;
        unsigned long pages_found;

        /*
         * Reset nr_victims so the reaper thread and simple_lmk_mm_freed() are
         * aware that the victims array is no longer valid.
         */
        write_lock(&mm_free_lock);
        nr_victims = 0;
        write_unlock(&mm_free_lock);

        /* Populate the victims array with tasks sorted by adj and then size */
        pages_found = find_victims(&nr_found);
        if (unlikely(!nr_found)) {
                pr_err_ratelimited("No processes available to kill!\n");
                return;
        }

        /* Minimize the number of victims if we found more pages than needed */
        if (pages_found > MIN_FREE_PAGES) {
                /* First round of processing to weed out unneeded victims */
                nr_to_kill = process_victims(nr_found);

                /*
                 * Try to kill as few of the chosen victims as possible by
                 * sorting the chosen victims by size, which means larger
                 * victims that have a lower adj can be killed in place of
                 * smaller victims with a high adj.
                 */
                sort(victims, nr_to_kill, sizeof(*victims), victim_cmp,
                     victim_swap);

                /* Second round of processing to finally select the victims */
                nr_to_kill = process_victims(nr_to_kill);
        } else {
                /* Too few pages found, so all the victims need to be killed */
                nr_to_kill = nr_found;
        }

        /*
         * Store the final number of victims for simple_lmk_mm_freed() and the
         * reaper thread, and indicate that reclaim is active.
         */
        write_lock(&mm_free_lock);
        nr_victims = nr_to_kill;
        reclaim_active = true;
        write_unlock(&mm_free_lock);

        /* Kill the victims */
        for (i = 0; i < nr_to_kill; i++) {
                struct victim_info *victim = &victims[i];
                struct task_struct *t, *vtsk = victim->tsk;
                struct mm_struct *mm = victim->mm;

                pr_info("Killing %s with adj %d to free %lu KiB\n", vtsk->comm,
                        vtsk->signal->oom_score_adj,
                        victim->size << (PAGE_SHIFT - 10));

                /* Make the victim reap anonymous memory first in exit_mmap() */
                set_bit(MMF_OOM_VICTIM, &mm->flags);

                /* Accelerate the victim's death by forcing the kill signal */
                do_send_sig_info(SIGKILL, SEND_SIG_FORCED, vtsk, true);

                /*
                 * Mark the thread group dead so that other kernel code knows,
                 * and then elevate the thread group to SCHED_RR with minimum RT
                 * priority. The entire group needs to be elevated because
                 * there's no telling which threads have references to the mm as
                 * well as which thread will happen to put the final reference
                 * and release the mm's memory. If the mm is released from a
                 * thread with low scheduling priority then it may take a very
                 * long time for exit_mmap() to complete.
                 */
                rcu_read_lock();
                for_each_thread(vtsk, t)
                        set_tsk_thread_flag(t, TIF_MEMDIE);
                for_each_thread(vtsk, t)
                        set_task_rt_prio(t, 1);
                rcu_read_unlock();

                /* Allow the victim to run on any CPU. This won't schedule. */
                set_cpus_allowed_ptr(vtsk, cpu_all_mask);

                /* Signals can't wake frozen tasks; only a thaw operation can */
                __thaw_task(vtsk);

                /* Store the number of anon pages to sort victims for reaping */
                victim->size = get_mm_counter(mm, MM_ANONPAGES);

                /* Finally release the victim's task lock acquired earlier */
                task_unlock(vtsk);
        }

        /*
         * Sort the victims by descending order of anonymous pages so the reaper
         * thread can prioritize reaping the victims with the most anonymous
         * pages first. Then wake the reaper thread if it's asleep. The lock
         * orders the needs_reap store before waitqueue_active().
         */
        write_lock(&mm_free_lock);
        sort(victims, nr_to_kill, sizeof(*victims), victim_cmp, victim_swap);
        atomic_set(&needs_reap, 1);
        write_unlock(&mm_free_lock);
        if (waitqueue_active(&reaper_waitq))
                wake_up(&reaper_waitq);

        /* Wait until all the victims die or until the timeout is reached */
        if (!wait_for_completion_timeout(&reclaim_done, RECLAIM_EXPIRES))
                pr_info("Timeout hit waiting for victims to die, proceeding\n");

        /* Clean up for future reclaims but let the reaper thread keep going */
        write_lock(&mm_free_lock);
        reinit_completion(&reclaim_done);
        reclaim_active = false;
        nr_killed = (atomic_t)ATOMIC_INIT(0);
        write_unlock(&mm_free_lock);
}
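
/* Dedicated kthread that runs scan_and_kill() whenever reclaim is requested */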
static int simple_lmk_reclaim_thread(void *data)
{
        /* Use maximum RT priority */
        set_task_rt_prio(current, MAX_RT_PRIO - 1);
        set_freezable();

        while (1) {
                wait_event_freezable(oom_waitq, atomic_read(&needs_reclaim));
                scan_and_kill();
                atomic_set(&needs_reclaim, 0);
        }

        return 0;
}
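
/*
 * Find the next killed victim whose anonymous memory can still be reaped and
 * return it with its mmap_sem held for reading. Returns ERR_PTR(-EAGAIN) when
 * a candidate couldn't be locked or safely reaped right now, and NULL when
 * there is nothing left to reap.
 */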
static struct mm_struct *next_reap_victim(void)
{
        struct mm_struct *mm = NULL;
        bool should_retry = false;
        int i;

        /* Take a write lock so no victim's mm can be freed while scanning */
        write_lock(&mm_free_lock);
        for (i = 0; i < nr_victims; i++, mm = NULL) {
                /* Check if this victim is alive and hasn't been reaped yet */
                mm = victims[i].mm;
                if (!mm || test_bit(MMF_OOM_SKIP, &mm->flags))
                        continue;

                /* Do a trylock so the reaper thread doesn't sleep */
                if (!down_read_trylock(&mm->mmap_sem)) {
                        should_retry = true;
                        continue;
                }

                /* Skip any mm with notifiers for now since they can sleep */
                if (mm_has_notifiers(mm)) {
                        up_read(&mm->mmap_sem);
                        should_retry = true;
                        continue;
                }

                /*
                 * Check MMF_OOM_SKIP again under the lock in case this mm was
                 * reaped by exit_mmap() and then had its page tables destroyed.
                 * No mmgrab() is needed because the reclaim thread sets
                 * MMF_OOM_VICTIM under task_lock() for the mm's task, which
                 * guarantees that MMF_OOM_VICTIM is always set before the
                 * victim mm can enter exit_mmap(). Therefore, an mmap read lock
                 * is sufficient to keep the mm struct itself from being freed.
                 */
                if (!test_bit(MMF_OOM_SKIP, &mm->flags))
                        break;

                up_read(&mm->mmap_sem);
        }

        if (!mm) {
                if (should_retry)
                        /* Return ERR_PTR(-EAGAIN) to try reaping again later */
                        mm = ERR_PTR(-EAGAIN);
                else if (!reclaim_active)
                        /*
                         * Nothing left to reap, so stop simple_lmk_mm_freed()
                         * from iterating over the victims array since reclaim
                         * is no longer active. Return NULL to stop reaping.
                         */
                        nr_victims = 0;
        }
        write_unlock(&mm_free_lock);

        return mm;
}
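
/* Reap anonymous memory from each killed victim so its pages free up sooner */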
static void reap_victims(void)
{
        struct mm_struct *mm;

        while ((mm = next_reap_victim())) {
                if (IS_ERR(mm)) {
                        /* Wait one jiffy before trying to reap again */
                        schedule_timeout_uninterruptible(1);
                        continue;
                }

                /*
                 * Reap the victim, then unflag the mm for exit_mmap() reaping
                 * and mark it as reaped with MMF_OOM_SKIP.
                 */
                __oom_reap_task_mm(mm);
                clear_bit(MMF_OOM_VICTIM, &mm->flags);
                set_bit(MMF_OOM_SKIP, &mm->flags);
                up_read(&mm->mmap_sem);
        }
}
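
/* Dedicated kthread that reaps victims' memory after they have been killed */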
static int simple_lmk_reaper_thread(void *data)
{
        /* Use a lower priority than the reclaim thread */
        set_task_rt_prio(current, MAX_RT_PRIO - 2);
        set_freezable();

        while (1) {
                wait_event_freezable(reaper_waitq,
                                     atomic_cmpxchg_relaxed(&needs_reap, 1, 0));
                reap_victims();
        }

        return 0;
}
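
/*
 * Hook called when a dying process's mm is freed, so that dead victims can be
 * cleared from the victims array and reclaim can be marked complete once
 * every victim has died.
 */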
void simple_lmk_mm_freed(struct mm_struct *mm)
{
        int i;

        /*
         * Victims are guaranteed to have MMF_OOM_SKIP set after exit_mmap()
         * finishes. Use this to ignore unrelated dying processes.
         */
        if (!test_bit(MMF_OOM_SKIP, &mm->flags))
                return;

        read_lock(&mm_free_lock);
        for (i = 0; i < nr_victims; i++) {
                if (victims[i].mm == mm) {
                        /*
                         * Clear out this victim from the victims array and only
                         * increment nr_killed if reclaim is active. If reclaim
                         * isn't active, then clearing out the victim is done
                         * solely for the reaper thread to avoid freed victims.
                         */
                        victims[i].mm = NULL;
                        if (reclaim_active &&
                            atomic_inc_return_relaxed(&nr_killed) == nr_victims)
                                complete(&reclaim_done);
                        break;
                }
        }
        read_unlock(&mm_free_lock);
}
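
/* Kick the reclaim thread when vmpressure reports critical (100%) pressure */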
static int simple_lmk_vmpressure_cb(struct notifier_block *nb,
                                    unsigned long pressure, void *data)
{
        if (pressure == 100) {
                atomic_set(&needs_reclaim, 1);
                smp_mb__after_atomic();
                if (waitqueue_active(&oom_waitq))
                        wake_up(&oom_waitq);
        }

        return NOTIFY_OK;
}

static struct notifier_block vmpressure_notif = {
        .notifier_call = simple_lmk_vmpressure_cb,
        .priority = INT_MAX
};

/* Initialize Simple LMK when lmkd in Android writes to the minfree parameter */
static int simple_lmk_init_set(const char *val, const struct kernel_param *kp)
{
        static atomic_t init_done = ATOMIC_INIT(0);
        struct task_struct *thread;

        if (!atomic_cmpxchg(&init_done, 0, 1)) {
                thread = kthread_run(simple_lmk_reaper_thread, NULL,
                                     "simple_lmkd_reaper");
                BUG_ON(IS_ERR(thread));
                thread = kthread_run(simple_lmk_reclaim_thread, NULL,
                                     "simple_lmkd");
                BUG_ON(IS_ERR(thread));
                BUG_ON(vmpressure_notifier_register(&vmpressure_notif));
        }

        return 0;
}

static const struct kernel_param_ops simple_lmk_init_ops = {
        .set = simple_lmk_init_set
};

/* Needed to prevent Android from thinking there's no LMK and thus rebooting */
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "lowmemorykiller."
module_param_cb(minfree, &simple_lmk_init_ops, NULL, 0200);