diff --git a/init/Kconfig b/init/Kconfig
index 1397ad3959bf..b86b4b47a5b9 100755
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1086,6 +1086,31 @@ config NET_NS
 
 endif # NAMESPACES
 
+config SCHED_CASS
+        bool "Capacity Aware Superset Scheduler"
+        depends on SMP
+        help
+          This enables the Capacity Aware Superset Scheduler (CASS), which
+          optimizes runqueue selection of CFS tasks. By using CPU capacity as a
+          basis for comparing the relative utilization between different CPUs,
+          CASS fairly balances load across CPUs of varying capacities. This
+          results in improved multi-core performance, especially when CPUs are
+          overutilized because CASS doesn't clip a CPU's utilization when it
+          eclipses the CPU's capacity.
+
+          As a superset of capacity aware scheduling, CASS implements a
+          hierarchy of criteria to determine the better CPU to wake a task upon
+          between CPUs that have the same relative utilization. This way,
+          single-core performance, latency, and cache affinity are all optimized
+          where possible.
+
+          CASS doesn't feature explicit energy awareness but its basic load
+          balancing principle results in decreased overall energy, often better
+          than what is possible with explicit energy awareness. By fairly
+          balancing load based on relative utilization, all CPUs are kept at
+          their lowest P-state necessary to satisfy the overall load at any
+          given moment.
+
 config SCHED_AUTOGROUP
         bool "Automatic process group scheduling"
         select CGROUPS
diff --git a/kernel/sched/cass.c b/kernel/sched/cass.c
new file mode 100644
index 000000000000..bba9c33f6d7b
--- /dev/null
+++ b/kernel/sched/cass.c
@@ -0,0 +1,203 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 Sultan Alsawaf.
+ */
+
+/**
+ * DOC: Capacity Aware Superset Scheduler (CASS) description
+ *
+ * The Capacity Aware Superset Scheduler (CASS) optimizes runqueue selection of
+ * CFS tasks. By using CPU capacity as a basis for comparing the relative
+ * utilization between different CPUs, CASS fairly balances load across CPUs of
+ * varying capacities. This results in improved multi-core performance,
+ * especially when CPUs are overutilized because CASS doesn't clip a CPU's
+ * utilization when it eclipses the CPU's capacity.
+ *
+ * As a superset of capacity aware scheduling, CASS implements a hierarchy of
+ * criteria to determine the better CPU to wake a task upon between CPUs that
+ * have the same relative utilization. This way, single-core performance,
+ * latency, and cache affinity are all optimized where possible.
+ *
+ * CASS doesn't feature explicit energy awareness but its basic load balancing
+ * principle results in decreased overall energy, often better than what is
+ * possible with explicit energy awareness. By fairly balancing load based on
+ * relative utilization, all CPUs are kept at their lowest P-state necessary to
+ * satisfy the overall load at any given moment.
+ */
+
+struct cass_cpu_cand {
+        int cpu;
+        unsigned int exit_lat;
+        unsigned long cap;
+        unsigned long util;
+};
+
+static __always_inline
+unsigned long cass_cpu_util(int cpu, bool sync)
+{
+        struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+        unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
+
+        /* Deduct @current's util from this CPU if this is a sync wake */
+        if (sync && cpu == smp_processor_id())
+                sub_positive(&util, task_util(current));
+
+        if (sched_feat(UTIL_EST))
+                util = max_t(unsigned long, util,
+                             READ_ONCE(cfs_rq->avg.util_est.enqueued));
+
+        return util;
+}
+
+/* Returns true if @a is a better CPU than @b */
+static __always_inline
+bool cass_cpu_better(const struct cass_cpu_cand *a,
+                     const struct cass_cpu_cand *b,
+                     int prev_cpu, bool sync)
+{
+#define cass_cmp(a, b) ({ res = (a) - (b); })
+#define cass_eq(a, b) ({ res = (a) == (b); })
+        long res;
+
+        /* Prefer the CPU with lower relative utilization */
+        if (cass_cmp(b->util, a->util))
+                goto done;
+
+        /* Prefer the current CPU for sync wakes */
+        if (sync && (cass_eq(a->cpu, smp_processor_id()) ||
+                     !cass_cmp(b->cpu, smp_processor_id())))
+                goto done;
+
+        /* Prefer the CPU with higher capacity */
+        if (cass_cmp(a->cap, b->cap))
+                goto done;
+
+        /* Prefer the CPU with lower idle exit latency */
+        if (cass_cmp(b->exit_lat, a->exit_lat))
+                goto done;
+
+        /* Prefer the previous CPU */
+        if (cass_eq(a->cpu, prev_cpu) || !cass_cmp(b->cpu, prev_cpu))
+                goto done;
+
+        /* Prefer the CPU that shares a cache with the previous CPU */
+        if (cass_cmp(cpus_share_cache(a->cpu, prev_cpu),
+                     cpus_share_cache(b->cpu, prev_cpu)))
+                goto done;
+
+        /* @a isn't a better CPU than @b. @res must be <=0 to indicate such. */
+done:
+        /* @a is a better CPU than @b if @res is positive */
+        return res > 0;
+}
+
+static int cass_best_cpu(struct task_struct *p, int prev_cpu, bool sync)
+{
+        /* Initialize @best such that @best always has a valid CPU at the end */
+        struct cass_cpu_cand cands[2], *best = cands, *curr;
+        struct cpuidle_state *idle_state;
+        bool has_idle = false;
+        unsigned long p_util;
+        int cidx = 0, cpu;
+
+        /* Get the utilization for this task */
+        p_util = task_util_est(p);
+
+        /*
+         * Find the best CPU to wake @p on. The RCU read lock is needed for
+         * idle_get_state().
+         */
+        rcu_read_lock();
+        for_each_cpu_and(cpu, &p->cpus_allowed, cpu_active_mask) {
+                /* Use the free candidate slot */
+                curr = &cands[cidx];
+                curr->cpu = cpu;
+
+                /*
+                 * Check if this CPU is idle. For sync wakes, always treat the
+                 * current CPU as idle.
+                 */
+                if ((sync && cpu == smp_processor_id()) || idle_cpu(cpu)) {
+                        /* Discard any previous non-idle candidate */
+                        if (!has_idle) {
+                                best = curr;
+                                cidx ^= 1;
+                        }
+                        has_idle = true;
+
+                        /* Nonzero exit latency indicates this CPU is idle */
+                        curr->exit_lat = 1;
+
+                        /* Add on the actual idle exit latency, if any */
+                        idle_state = idle_get_state(cpu_rq(cpu));
+                        if (idle_state)
+                                curr->exit_lat += idle_state->exit_latency;
+                } else {
+                        /* Skip non-idle CPUs if there's an idle candidate */
+                        if (has_idle)
+                                continue;
+
+                        /* Zero exit latency indicates this CPU isn't idle */
+                        curr->exit_lat = 0;
+                }
+
+                /* Get this CPU's utilization, possibly without @current */
+                curr->util = cass_cpu_util(cpu, sync);
+
+                /*
+                 * Add @p's utilization to this CPU if it's not @p's CPU, to
+                 * find what this CPU's relative utilization would look like
+                 * if @p were on it.
+                 */
+                if (cpu != task_cpu(p))
+                        curr->util += p_util;
+
+                /*
+                 * Get the current capacity of this CPU adjusted for thermal
+                 * pressure as well as IRQ and RT-task time.
+                 */
+                curr->cap = capacity_of(cpu);
+
+                /* Calculate the relative utilization for this CPU candidate */
+                curr->util = curr->util * SCHED_CAPACITY_SCALE / curr->cap;
+
+                /* If @best == @curr then there's no need to compare them */
+                if (best == curr)
+                        continue;
+
+                /* Check if this CPU is better than the best CPU found */
+                if (cass_cpu_better(curr, best, prev_cpu, sync)) {
+                        best = curr;
+                        cidx ^= 1;
+                }
+        }
+        rcu_read_unlock();
+
+        return best->cpu;
+}
+
+static int cass_select_task_rq_fair(struct task_struct *p, int prev_cpu,
+                                    int sd_flag, int wake_flags,
+                                    int sibling_count_hint)
+{
+        bool sync;
+
+        /* Don't balance on exec since we don't know what @p will look like */
+        if (sd_flag & SD_BALANCE_EXEC)
+                return prev_cpu;
+
+        /*
+         * If there aren't any valid CPUs which are active, then just return the
+         * first valid CPU since it's possible for certain types of tasks to run
+         * on inactive CPUs.
+         */
+        if (unlikely(!cpumask_intersects(&p->cpus_allowed, cpu_active_mask)))
+                return cpumask_first(&p->cpus_allowed);
+
+        /* cass_best_cpu() needs the task's utilization, so sync it up */
+        if (!(sd_flag & SD_BALANCE_FORK))
+                sync_entity_load_avg(&p->se);
+
+        sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
+        return cass_best_cpu(p, prev_cpu, sync);
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2d6f0d82c3af..df40cea1ba3e 100755
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12703,6 +12703,17 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
         return rr_interval;
 }
 
+#ifdef CONFIG_SCHED_CASS
+#include "cass.c"
+
+/* Use CASS. A dummy wrapper ensures the replaced function is still "used". */
+static inline void *select_task_rq_fair_dummy(void)
+{
+        return (void *)select_task_rq_fair;
+}
+#define select_task_rq_fair cass_select_task_rq_fair
+#endif /* CONFIG_SCHED_CASS */
+
 /*
  * All the scheduling class methods:
  */
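
For illustration only (not part of the patch), the selection idea above can be sketched outside the kernel. The following standalone C program mocks cass_best_cpu()'s relative-utilization math and a simplified version of cass_cpu_better()'s hierarchy; the struct, the better() helper, and all values are made up for demonstration, and the sync-wake and cache-affinity criteria are omitted.

/*
 * Standalone sketch of the CASS candidate comparison. All names and numbers
 * are hypothetical; this is not kernel code.
 */
#include <stdio.h>
#include <stdbool.h>

#define SCHED_CAPACITY_SCALE 1024UL

struct cand {
        int cpu;
        unsigned int exit_lat;  /* 0 = busy, >0 = idle (1 + exit latency) */
        unsigned long cap;      /* current CPU capacity */
        unsigned long util;     /* relative utilization after scaling */
};

/* Same ordering as cass_cpu_better(), minus the sync and cache criteria */
static bool better(const struct cand *a, const struct cand *b, int prev_cpu)
{
        if (a->util != b->util)         /* lower relative utilization wins */
                return a->util < b->util;
        if (a->cap != b->cap)           /* then higher capacity wins */
                return a->cap > b->cap;
        if (a->exit_lat != b->exit_lat) /* then lower idle exit latency wins */
                return a->exit_lat < b->exit_lat;
        return a->cpu == prev_cpu;      /* then prefer the previous CPU */
}

int main(void)
{
        /* Hypothetical little (cap 512) and big (cap 1024) CPUs, task util 100 */
        unsigned long p_util = 100;
        struct cand little = { .cpu = 0, .exit_lat = 1, .cap = 512 };
        struct cand big = { .cpu = 4, .exit_lat = 1, .cap = 1024 };
        unsigned long little_util = 150, big_util = 400;        /* raw util_avg */

        /* Relative utilization with @p's util added, as cass_best_cpu() does */
        little.util = (little_util + p_util) * SCHED_CAPACITY_SCALE / little.cap;
        big.util = (big_util + p_util) * SCHED_CAPACITY_SCALE / big.cap;

        printf("little: %lu/1024, big: %lu/1024 -> pick CPU %d\n",
               little.util, big.util, better(&little, &big, 0) ? little.cpu : big.cpu);
        return 0;
}

With these made-up numbers both CPUs land at the same relative utilization (500/1024) even though the big CPU carries twice the absolute load, so the capacity tie-breaker picks the big CPU; this is the capacity-proportional balancing the Kconfig help text describes.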