// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2023 Sultan Alsawaf <sultan@kerneltoast.com>.
 */

/**
 * DOC: Capacity Aware Superset Scheduler (CASS) description
 *
 * The Capacity Aware Superset Scheduler (CASS) optimizes runqueue selection of
 * CFS tasks. By using CPU capacity as a basis for comparing the relative
 * utilization between different CPUs, CASS fairly balances load across CPUs of
 * varying capacities. This results in improved multi-core performance,
 * especially when CPUs are overutilized, because CASS doesn't clip a CPU's
 * utilization when it eclipses the CPU's capacity.
 *
 * As a superset of capacity aware scheduling, CASS implements a hierarchy of
 * criteria to determine the better CPU to wake a task upon between CPUs that
 * have the same relative utilization. This way, single-core performance,
 * latency, and cache affinity are all optimized where possible.
 *
 * CASS doesn't feature explicit energy awareness, but its basic load balancing
 * principle results in decreased overall energy, often better than what is
 * possible with explicit energy awareness. By fairly balancing load based on
 * relative utilization, all CPUs are kept at their lowest P-state necessary to
 * satisfy the overall load at any given moment.
 */

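/*
 * Worked example (hypothetical values): CASS compares CPUs by "relative
 * utilization", i.e. utilization scaled against capacity:
 *
 *	relative_util = util * SCHED_CAPACITY_SCALE / capacity_of(cpu)
 *
 * With SCHED_CAPACITY_SCALE == 1024, a little CPU of capacity 512 running
 * 256 units of util and a big CPU of capacity 1024 running 512 units both
 * come out to a relative utilization of 512, so CASS treats them as equally
 * loaded even though their absolute utilizations differ by 2x.
 */
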
struct cass_cpu_cand {
	int cpu;		/* candidate CPU number */
	unsigned int exit_lat;	/* 0 if busy, else 1 + idle exit latency */
	unsigned long cap;	/* capacity_of() for this CPU */
	unsigned long util;	/* relative utilization (see cass_best_cpu()) */
};

static __always_inline
unsigned long cass_cpu_util(int cpu, bool sync)
{
	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
	unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);

	/* Deduct @current's util from this CPU if this is a sync wake */
	if (sync && cpu == smp_processor_id())
		sub_positive(&util, task_util(current));

	if (sched_feat(UTIL_EST))
		util = max_t(unsigned long, util,
			     READ_ONCE(cfs_rq->avg.util_est.enqueued));

	return util;
}

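/*
 * Example (hypothetical numbers): on a sync wake from this CPU, the waker is
 * expected to sleep imminently, so its own utilization is deducted first.
 * With util_avg == 400 and task_util(current) == 150, the deduction leaves
 * 250; with UTIL_EST enabled, the reported value is then
 * max(250, util_est.enqueued).
 */
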
/* Returns true if @a is a better CPU than @b */
static __always_inline
bool cass_cpu_better(const struct cass_cpu_cand *a,
		     const struct cass_cpu_cand *b,
		     int prev_cpu, bool sync)
{
#define cass_cmp(a, b) ({ res = (a) - (b); })
#define cass_eq(a, b) ({ res = (a) == (b); })
	long res;

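	/*
	 * Illustration (hypothetical values): cass_cmp() stashes a signed
	 * difference in @res, so each tiebreak below both tests for a
	 * decision (res != 0 takes the goto) and records its direction
	 * (res > 0 means @a wins). E.g. b->util == 600, a->util == 520
	 * gives res == 80: @a has lower relative utilization and wins.
	 * With a->util == 700, res == -100: the goto still fires, but the
	 * final check reports that @b is better. res == 0 falls through to
	 * the next criterion.
	 */
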
	/* Prefer the CPU with lower relative utilization */
	if (cass_cmp(b->util, a->util))
		goto done;

	/* Prefer the current CPU for sync wakes */
	if (sync && (cass_eq(a->cpu, smp_processor_id()) ||
		     !cass_cmp(b->cpu, smp_processor_id())))
		goto done;

	/* Prefer the CPU with higher capacity */
	if (cass_cmp(a->cap, b->cap))
		goto done;

	/* Prefer the CPU with lower idle exit latency */
	if (cass_cmp(b->exit_lat, a->exit_lat))
		goto done;

	/* Prefer the previous CPU */
	if (cass_eq(a->cpu, prev_cpu) || !cass_cmp(b->cpu, prev_cpu))
		goto done;

	/* Prefer the CPU that shares a cache with the previous CPU */
	if (cass_cmp(cpus_share_cache(a->cpu, prev_cpu),
		     cpus_share_cache(b->cpu, prev_cpu)))
		goto done;

	/* @a isn't a better CPU than @b. @res must be <= 0 to indicate such. */
done:
	/* @a is a better CPU than @b if @res is positive */
	return res > 0;
}

static int cass_best_cpu(struct task_struct *p, int prev_cpu, bool sync)
{
	/* Initialize @best such that @best always has a valid CPU at the end */
	struct cass_cpu_cand cands[2], *best = cands, *curr;
	struct cpuidle_state *idle_state;
	bool has_idle = false;
	unsigned long p_util;
	int cidx = 0, cpu;

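	/*
	 * The two @cands slots form a double buffer: @best points at the
	 * winning candidate in one slot while @cidx indexes the free slot
	 * that each loop iteration fills in. When the new candidate wins,
	 * @best is repointed and cidx ^= 1 turns the old best's slot into
	 * scratch space for the next CPU.
	 */
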
	/* Get the utilization for this task */
	p_util = task_util_est(p);

	/*
	 * Find the best CPU to wake @p on. The RCU read lock is needed for
	 * idle_get_state().
	 */
	rcu_read_lock();
	for_each_cpu_and(cpu, &p->cpus_allowed, cpu_active_mask) {
		/* Use the free candidate slot */
		curr = &cands[cidx];
		curr->cpu = cpu;

		/*
		 * Check if this CPU is idle. For sync wakes, always treat the
		 * current CPU as idle.
		 */
		if ((sync && cpu == smp_processor_id()) || idle_cpu(cpu)) {
			/* Discard any previous non-idle candidate */
			if (!has_idle) {
				best = curr;
				cidx ^= 1;
			}
			has_idle = true;

			/* Nonzero exit latency indicates this CPU is idle */
			curr->exit_lat = 1;

			/* Add on the actual idle exit latency, if any */
			idle_state = idle_get_state(cpu_rq(cpu));
			if (idle_state)
				curr->exit_lat += idle_state->exit_latency;
		} else {
			/* Skip non-idle CPUs if there's an idle candidate */
			if (has_idle)
				continue;

			/* Zero exit latency indicates this CPU isn't idle */
			curr->exit_lat = 0;
		}

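		/*
		 * Illustration (hypothetical latencies): an idle CPU in a
		 * shallow C-state with exit_latency == 2 ends up with
		 * exit_lat == 3 and beats an idle CPU in a deep C-state
		 * (exit_latency == 800, so exit_lat == 801) at the idle-exit
		 * tiebreak. Busy CPUs keep exit_lat == 0 and are only ever
		 * compared against other busy CPUs, since non-idle candidates
		 * are skipped once an idle one is found.
		 */
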
		/* Get this CPU's utilization, possibly without @current */
		curr->util = cass_cpu_util(cpu, sync);

		/*
		 * Add @p's utilization to this CPU if it's not @p's CPU, to
		 * find what this CPU's relative utilization would look like
		 * if @p were on it.
		 */
		if (cpu != task_cpu(p))
			curr->util += p_util;

		/*
		 * Get the current capacity of this CPU adjusted for thermal
		 * pressure as well as IRQ and RT-task time.
		 */
		curr->cap = capacity_of(cpu);

		/* Calculate the relative utilization for this CPU candidate */
		curr->util = curr->util * SCHED_CAPACITY_SCALE / curr->cap;

		/* If @best == @curr then there's no need to compare them */
		if (best == curr)
			continue;

		/* Check if this CPU is better than the best CPU found */
		if (cass_cpu_better(curr, best, prev_cpu, sync)) {
			best = curr;
			cidx ^= 1;
		}
	}
	rcu_read_unlock();

	return best->cpu;
}

static int cass_select_task_rq_fair(struct task_struct *p, int prev_cpu,
				    int sd_flag, int wake_flags,
				    int sibling_count_hint)
{
	bool sync;

	/* Don't balance on exec since we don't know what @p will look like */
	if (sd_flag & SD_BALANCE_EXEC)
		return prev_cpu;

	/*
	 * If there aren't any valid CPUs which are active, then just return
	 * the first valid CPU since it's possible for certain types of tasks
	 * to run on inactive CPUs.
	 */
	if (unlikely(!cpumask_intersects(&p->cpus_allowed, cpu_active_mask)))
		return cpumask_first(&p->cpus_allowed);

	/* cass_best_cpu() needs the task's utilization, so sync it up */
	if (!(sd_flag & SD_BALANCE_FORK))
		sync_entity_load_avg(&p->se);

	sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
	return cass_best_cpu(p, prev_cpu, sync);
}
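
/*
 * Hook-up sketch (assumption, not confirmed by this file): CASS is meant to
 * take over CFS runqueue selection, with the stock select_task_rq_fair()
 * forwarding its arguments here so that every CFS wakeup, fork, and exec
 * placement decision flows through cass_best_cpu(). The signature above,
 * including sibling_count_hint, matches the select_task_rq callback of the
 * kernel generation this was written against.
 */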