diff --git a/anolis/configs/L1-RECOMMEND/arm64/CONFIG_GROUP_IDENTITY b/anolis/configs/L1-RECOMMEND/arm64/CONFIG_GROUP_IDENTITY new file mode 100644 index 0000000000000000000000000000000000000000..b5e3b74de109d4e339db4df0a247d2899bafb8e5 --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/arm64/CONFIG_GROUP_IDENTITY @@ -0,0 +1 @@ +# CONFIG_GROUP_IDENTITY is not set diff --git a/anolis/configs/L1-RECOMMEND/loongarch/CONFIG_GROUP_IDENTITY b/anolis/configs/L1-RECOMMEND/loongarch/CONFIG_GROUP_IDENTITY new file mode 100644 index 0000000000000000000000000000000000000000..b5e3b74de109d4e339db4df0a247d2899bafb8e5 --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/loongarch/CONFIG_GROUP_IDENTITY @@ -0,0 +1 @@ +# CONFIG_GROUP_IDENTITY is not set diff --git a/anolis/configs/L1-RECOMMEND/riscv/CONFIG_GROUP_IDENTITY b/anolis/configs/L1-RECOMMEND/riscv/CONFIG_GROUP_IDENTITY new file mode 100644 index 0000000000000000000000000000000000000000..b5e3b74de109d4e339db4df0a247d2899bafb8e5 --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/riscv/CONFIG_GROUP_IDENTITY @@ -0,0 +1 @@ +# CONFIG_GROUP_IDENTITY is not set diff --git a/anolis/configs/L1-RECOMMEND/sw_64-6b/CONFIG_GROUP_IDENTITY b/anolis/configs/L1-RECOMMEND/sw_64-6b/CONFIG_GROUP_IDENTITY new file mode 100644 index 0000000000000000000000000000000000000000..b5e3b74de109d4e339db4df0a247d2899bafb8e5 --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/sw_64-6b/CONFIG_GROUP_IDENTITY @@ -0,0 +1 @@ +# CONFIG_GROUP_IDENTITY is not set diff --git a/anolis/configs/L1-RECOMMEND/sw_64-8a/CONFIG_GROUP_IDENTITY b/anolis/configs/L1-RECOMMEND/sw_64-8a/CONFIG_GROUP_IDENTITY new file mode 100644 index 0000000000000000000000000000000000000000..b5e3b74de109d4e339db4df0a247d2899bafb8e5 --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/sw_64-8a/CONFIG_GROUP_IDENTITY @@ -0,0 +1 @@ +# CONFIG_GROUP_IDENTITY is not set diff --git a/anolis/configs/L1-RECOMMEND/x86/CONFIG_GROUP_IDENTITY b/anolis/configs/L1-RECOMMEND/x86/CONFIG_GROUP_IDENTITY new file mode 100644 index 
0000000000000000000000000000000000000000..a35767a77c2e893035aa7d262669748a39f1e832 --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/x86/CONFIG_GROUP_IDENTITY @@ -0,0 +1 @@ +CONFIG_GROUP_IDENTITY=y diff --git a/include/linux/sched.h b/include/linux/sched.h index 98afbfc297b87e8c3d27cda0337a7a387dd1e1e7..59231a6b087df0deb7bd3db0bdff1b03f3159bff 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -647,6 +647,9 @@ struct sched_entity { #endif struct list_head expel_node; +#ifdef CONFIG_GROUP_IDENTITY + long priority; +#endif CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) diff --git a/init/Kconfig b/init/Kconfig index 80ab159536a5ea25ee2948469c7605468480e784..2f0b5d5f79b400de50aba9898532b7e5c7578bd4 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1012,6 +1012,24 @@ config FAIR_GROUP_SCHED select GROUP_SCHED_WEIGHT default CGROUP_SCHED +config GROUP_IDENTITY + bool "Group identity for SCHED_OTHER" + depends on FAIR_GROUP_SCHED && 64BIT && SMP && SYSFS + default FAIR_GROUP_SCHED + help + Enable per-cgroup identity based preemption semantics for CFS. + This uses the cpu cgroup's 'cpu.priority' as a strict 3-state class knob: + 1: high class + Can preempt normal/low on wakeup and tick. + 0: normal class + Uses the default fair-class behavior. + -1: low class + Yields to high/normal on wakeup and tick. + + Values other than -1, 0 and 1 are rejected. This interface is + intentionally modeled as a 3-class collocation control, not a + continuous priority scale. 
+ config CFS_BANDWIDTH bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED" depends on FAIR_GROUP_SCHED diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index 99bdd96f454f4eba861b11b0aae6991d348dce0e..f2685e6a5b1843a2a2c33055956a616dbec5305a 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -73,6 +73,10 @@ # include "debug.c" #endif +#ifdef CONFIG_GROUP_IDENTITY +# include "group_identity.c" +#endif + #ifdef CONFIG_SCHEDSTATS # include "stats.c" #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e655f64f7bbd17f8f37ffaf50978af4afed53179..a014acdc2fba6cbb884e8565245e4cb7fa982319 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4465,6 +4465,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.nr_migrations = 0; p->se.vruntime = 0; p->se.vlag = 0; +#ifdef CONFIG_GROUP_IDENTITY + p->se.priority = 0; +#endif INIT_LIST_HEAD(&p->se.group_node); /* A delayed task cannot be in clone(). 
*/ @@ -8499,6 +8502,9 @@ struct task_group root_task_group = { .lat_stat_cpu = &root_lat_stat_cpu, .alistats = &root_alistats, #endif +#ifdef CONFIG_GROUP_IDENTITY + .priority = 0, +#endif }; LIST_HEAD(task_groups); @@ -10042,13 +10048,35 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 idle) { int ret; + struct task_group *tg = css_tg(css); - ret = sched_group_set_idle(css_tg(css), idle); + ret = sched_group_set_idle(tg, idle); if (!ret) - scx_group_set_idle(css_tg(css), idle); + scx_group_set_idle(tg, READ_ONCE(tg->idle)); + return ret; +} + +#ifdef CONFIG_GROUP_IDENTITY +static s64 cpu_priority_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->priority; +} + +static int cpu_priority_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 priority) +{ + int ret; + struct task_group *tg = css_tg(css); + + ret = sched_group_set_priority(tg, priority); + if (!ret) + scx_group_set_idle(tg, READ_ONCE(tg->idle)); + return ret; } #endif +#endif #ifdef CONFIG_FAIR_GROUP_SCHED static u64 cpu_slice_read_u64(struct cgroup_subsys_state *css, @@ -10252,6 +10280,13 @@ static struct cftype cpu_legacy_files[] = { .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, }, +#ifdef CONFIG_GROUP_IDENTITY + { + .name = "priority", + .read_s64 = cpu_priority_read_s64, + .write_s64 = cpu_priority_write_s64, + }, +#endif { .name = "slice_us", .flags = CFTYPE_NOT_ON_ROOT, @@ -10966,6 +11001,14 @@ static struct cftype cpu_files[] = { .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, }, +#ifdef CONFIG_GROUP_IDENTITY + { + .name = "priority", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_priority_read_s64, + .write_s64 = cpu_priority_write_s64, + }, +#endif { .name = "slice", .flags = CFTYPE_NOT_ON_ROOT, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1cdbd2e60e91e17b156974d9b82c2515ecf713ce..d724b2f869a50b8ce1505d59c389d571644004a4 100644 --- 
a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1344,6 +1344,150 @@ static inline bool do_preempt_short(struct cfs_rq *cfs_rq, return false; } +#ifdef CONFIG_GROUP_IDENTITY +/* + * cpu.priority is intentionally a strict 3-state class selector: + * 1 => high + * 0 => normal + * -1 => low + * + * sched_group_set_priority() rejects every other value, so the helper + * predicates below can test these exact class values directly while leaving + * the wakeup/tick policy unchanged. + */ + +static inline bool is_highclass(const struct sched_entity *se) +{ + return READ_ONCE(se->priority) == 1; +} + +static inline bool is_underclass(const struct sched_entity *se) +{ + return READ_ONCE(se->priority) == -1; +} + +static inline bool is_normalclass(const struct sched_entity *se) +{ + return READ_ONCE(se->priority) == 0; +} + +static inline struct sched_entity *pick_eevdf_ignore_slice(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr = cfs_rq->curr; + struct sched_entity *best; + s64 vlag; + + if (!curr || !protect_slice(curr)) + return pick_eevdf(cfs_rq); + + vlag = curr->vlag; + cancel_protect_slice(curr); + best = pick_eevdf(cfs_rq); + curr->vlag = vlag; + + return best; +} + +/* + * Identity wakeup policy: + * - {high, normal} waking against low forces a reschedule. The final winner + * still comes from the normal post-resched pick path. + * - high waking against normal only removes current's slice protection, then + * lets the regular EEVDF pick path decide whether the wakee wins. + * - lower classes never wakeup-preempt higher classes. 
+ * + * Returns: + * 2: force reschedule + * 1: drop current's slice protection and defer to EEVDF + * 0: keep the default same-class EEVDF wakeup-preemption + * -1: block wakeup-preemption + */ +static inline int id_wakeup_preempt_action(struct sched_entity *curr, + struct sched_entity *se) +{ + long curr_prio = READ_ONCE(curr->priority); + long se_prio = READ_ONCE(se->priority); + bool under_curr = curr_prio == -1; + bool under_se = se_prio == -1; + bool high_curr = curr_prio == 1; + + if (!sched_feat(GROUP_IDENTITY_PREEMPT)) + return 0; + + /* same class: use default EEVDF */ + if (curr_prio == se_prio) + return 0; + + /* underclass never wakeup-preempts {high, normal} */ + if (under_se) + return -1; + + /* {high, normal} waking against underclass triggers an immediate resched */ + if (under_curr) + return 2; + + /* NORMAL should not wakeup-preempt HIGH */ + if (high_curr) + return -1; + + /* + * HIGH waking against NORMAL current: + * drop RUN_TO_PARITY protection first, then let pick_eevdf() determine + * whether the wakee really wins the post-resched competition. + */ + return 1; +} + +static inline bool id_tick_preempt_class(const struct sched_entity *curr, + const struct sched_entity *se) +{ + if (is_highclass(se)) + return !is_highclass(curr); + + return is_normalclass(se) && is_underclass(curr); +} + +/* + * Tick-side identity preemption ignores current's slice protection, looks at + * the resulting EEVDF winner, and only reschedules when that winner is a + * higher class. 
+ */ +static inline bool id_tick_preempt_needed(struct cfs_rq *cfs_rq, + struct sched_entity *curr) +{ + struct sched_entity *se; + + if (!sched_feat(GROUP_IDENTITY_PREEMPT)) + return false; + + /* highclass can't be preempted by higher class */ + if (is_highclass(curr)) + return false; + + se = pick_eevdf_ignore_slice(cfs_rq); + if (!se || se == curr) + return false; + + if (!id_tick_preempt_class(curr, se)) + return false; + + cancel_protect_slice(curr); + return true; +} +#else +static inline int id_wakeup_preempt_action(struct sched_entity *curr, + struct sched_entity *se) +{ + return 0; +} + +static inline bool id_tick_preempt_needed(struct cfs_rq *cfs_rq, + struct sched_entity *curr) +{ + return false; +} +#endif + /* * Used by other classes to account runtime. */ @@ -4260,6 +4404,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) return u64_u32_load_copy(cfs_rq->avg.last_update_time, cfs_rq->last_update_time_copy); } + #ifdef CONFIG_FAIR_GROUP_SCHED DEFINE_PER_CPU(struct cpumask, cpus_allowed_alt); /* Decide which node for @tg to run on*/ @@ -5842,6 +5987,9 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) return; } #endif + + if (cfs_rq->nr_queued > 1 && id_tick_preempt_needed(cfs_rq, curr)) + resched_curr(rq_of(cfs_rq)); } @@ -8957,8 +9105,11 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; + struct sched_entity *pse_task = pse; struct cfs_rq *cfs_rq = task_cfs_rq(curr); int cse_is_idle, pse_is_idle; + int prio_preempt; + bool next_buddy_marked = false; if (unlikely(se == pse)) return; @@ -8972,8 +9123,11 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) return; - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) { - set_next_buddy(pse); + if (sched_feat(NEXT_BUDDY) && + 
!(wake_flags & WF_FORK) && + !pse_task->sched_delayed) { + set_next_buddy(pse_task); + next_buddy_marked = true; } /* @@ -9022,6 +9176,17 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int cfs_rq = cfs_rq_of(se); update_curr(cfs_rq); + + prio_preempt = id_wakeup_preempt_action(se, pse); + if (prio_preempt == 2) { + if (!next_buddy_marked && !pse_task->sched_delayed) + set_next_buddy(pse_task); + goto preempt; + } + + if (prio_preempt == -1) + return; + /* * If @p has a shorter slice than current and @p is eligible, override * current's slice protection in order to allow preemption. @@ -9029,7 +9194,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * Note that even if @p does not turn out to be the most eligible * task at this moment, current's slice protection will be lost. */ - if (do_preempt_short(cfs_rq, pse, se)) + if (do_preempt_short(cfs_rq, pse, se) || prio_preempt == 1) cancel_protect_slice(se); /* @@ -13841,7 +14006,6 @@ static void task_change_group_fair(struct task_struct *p) update_nr_iowait_fair(p, -1); detach_task_cfs_rq(p); - #ifdef CONFIG_SMP /* Tell se's cfs_rq has been changed -- migrated */ p->se.avg.last_update_time = 0; @@ -13910,6 +14074,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD; +#ifdef CONFIG_GROUP_IDENTITY + WRITE_ONCE(tg->priority, READ_ONCE(parent->priority)); +#endif init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent)); tg_set_specs_ratio(tg); @@ -14039,6 +14206,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, seqcount_init(&se->idle_seqcount); spin_lock_init(&se->iowait_lock); se->cg_idle_start = se->cg_init_time = cpu_clock(cpu); +#ifdef CONFIG_GROUP_IDENTITY + WRITE_ONCE(se->priority, READ_ONCE(tg->priority)); +#endif INIT_LIST_HEAD(&se->expel_node); } @@ -14094,22 +14264,26 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) return ret; } -int 
sched_group_set_idle(struct task_group *tg, long idle) +static int __sched_group_set_idle_locked(struct task_group *tg, long idle) { int i; + lockdep_assert_held(&shares_mutex); + if (tg == &root_task_group) return -EINVAL; if (idle < 0 || idle > 1) return -EINVAL; - mutex_lock(&shares_mutex); +#ifdef CONFIG_GROUP_IDENTITY + if ((!idle && READ_ONCE(tg->priority) == -1) || + (idle && READ_ONCE(tg->priority) == 1)) + return -EINVAL; +#endif - if (tg->idle == idle) { - mutex_unlock(&shares_mutex); + if (tg->idle == idle) return 0; - } tg->idle = idle; @@ -14155,10 +14329,64 @@ int sched_group_set_idle(struct task_group *tg, long idle) else __sched_group_set_shares(tg, NICE_0_LOAD); - mutex_unlock(&shares_mutex); return 0; } +int sched_group_set_idle(struct task_group *tg, long idle) +{ + int ret; + + mutex_lock(&shares_mutex); + ret = __sched_group_set_idle_locked(tg, idle); + mutex_unlock(&shares_mutex); + + return ret; +} + +#ifdef CONFIG_GROUP_IDENTITY +/* + * cpu.priority is a 3-class identity control, not a continuous priority + * scale. Low class reuses idle-group semantics while it is active; writing a + * non-low class through cpu.priority clears any existing idle-group state. 
+ */ +int sched_group_set_priority(struct task_group *tg, s64 priority) +{ + int i; + long old_priority; + int ret; + + if (priority != -1 && priority != 0 && priority != 1) + return -EINVAL; + + if (tg == &root_task_group) + return -EINVAL; + + mutex_lock(&shares_mutex); + + old_priority = READ_ONCE(tg->priority); + if (old_priority != priority) { + WRITE_ONCE(tg->priority, priority); + for_each_possible_cpu(i) { + struct sched_entity *se = tg->se[i]; + + if (!se) + continue; + WRITE_ONCE(se->priority, priority); + } + } + + if (priority == -1) + ret = __sched_group_set_idle_locked(tg, 1); + else if (tg->idle) + ret = __sched_group_set_idle_locked(tg, 0); + else + ret = 0; + + mutex_unlock(&shares_mutex); + return ret; +} +#endif + int sched_group_set_slice(struct task_group *tg, u64 slice_us) { u64 slice = 0; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 6f7ac03fffdd8161874165ef26642e6ced8f3877..9cde367a290e327f83ab262bd984f588aaa39f4f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -133,6 +133,9 @@ SCHED_FEAT(NUMA_AFFINE, false) SCHED_FEAT(ID_BOOK_CPU, false) SCHED_FEAT(ID_LOAD_BALANCE, false) SCHED_FEAT(ID_ABSOLUTE_EXPEL, false) +#ifdef CONFIG_GROUP_IDENTITY +SCHED_FEAT(GROUP_IDENTITY_PREEMPT, false) +#endif #ifdef CONFIG_SMP SCHED_FEAT(ID_PUSH_EXPELLEE, false) SCHED_FEAT(ID_PUSH_EXPELLEE_CONSIDER_HIGHCLASS, true) diff --git a/kernel/sched/group_identity.c b/kernel/sched/group_identity.c new file mode 100644 index 0000000000000000000000000000000000000000..996fb5395bbcc174531b4a24a0889fe6e236d2ba --- /dev/null +++ b/kernel/sched/group_identity.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Scheduler group identity sysfs ABI. 
+ */ + +#include <linux/kobject.h> +#include <linux/sysfs.h> + +#define GROUP_IDENTITY_ABI 2 +#define GROUP_IDENTITY_SCHEME "group_identity_2.0" + +#define GI_ATTR_RO(_name) \ + static struct kobj_attribute group_identity_attr_##_name = __ATTR_RO(_name) + +static ssize_t abi_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", GROUP_IDENTITY_ABI); +} +GI_ATTR_RO(abi); + +static ssize_t scheme_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", GROUP_IDENTITY_SCHEME); +} +GI_ATTR_RO(scheme); + +static struct attribute *group_identity_attrs[] = { + &group_identity_attr_abi.attr, + &group_identity_attr_scheme.attr, + NULL, +}; + +static const struct attribute_group group_identity_attr_group = { + .attrs = group_identity_attrs, +}; + +static struct kset *group_identity_kset; + +static int __init group_identity_sysfs_init(void) +{ + int ret; + + group_identity_kset = kset_create_and_add("group_identity", NULL, + kernel_kobj); + if (!group_identity_kset) + return -ENOMEM; + + ret = sysfs_create_group(&group_identity_kset->kobj, + &group_identity_attr_group); + if (ret) { + kset_unregister(group_identity_kset); + group_identity_kset = NULL; + return ret; + } + + return 0; +} +subsys_initcall(group_identity_sysfs_init); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4b726bb59761f704107a2efee9162f2b21af7d84..905839a8ca0cc75d8fe08e88d22fb08c4c7d35f9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -690,6 +690,9 @@ struct task_group { raw_spinlock_t gb_lock; #endif +#ifdef CONFIG_GROUP_IDENTITY + long priority; +#endif CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) @@ -773,6 +776,15 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern int sched_group_set_idle(struct task_group *tg, long idle); +#ifdef CONFIG_GROUP_IDENTITY +extern int sched_group_set_priority(struct task_group *tg, s64 priority); +#else +static inline int 
sched_group_set_priority(struct task_group *tg, s64 priority) +{ + return 0; +} +#endif + extern int sched_group_set_slice(struct task_group *tg, u64 slice_us); #ifdef CONFIG_SMP