[scheduler] 3. Basic Ideas of the CFS Scheduling Algorithm
The main ideas behind CFS are as follows:
A weight is derived from a normal process's priority (its nice value); this weight is used to convert the process's actual runtime into virtual runtime (vruntime). It follows that a higher-priority process running for more time and a lower-priority process running for less time are equivalent in terms of vruntime;
A total period is computed from the number of processes in rq->cfs_rq, and each process then derives its ideal runtime (ideal_runtime) from the share its weight takes of the total weight. In scheduler_tick() it is checked whether the process's actual runtime (exec_runtime) has reached its ideal_runtime; if so, the process is flagged for rescheduling (test_tsk_need_resched(curr)). With this period in place, every process in cfs_rq is guaranteed to be scheduled within one period;
At the same time, a scheduling period (sched_latency_ns) is defined; the goal is to give every process at least one chance to run within this period, or put another way, no process waits for the CPU longer than this scheduling period;
Another parameter, sched_min_granularity_ns, comes into play when CFS divides the scheduling period sched_latency among the processes, giving each one an average CPU time slice (weighted by nice value, i.e., by weight). When there are too many processes, the per-process slice becomes too small; if it would drop below sched_min_granularity_ns, sched_min_granularity_ns is used instead, and the scheduling period then no longer follows sched_latency_ns but becomes the product (sched_min_granularity_ns * number of processes). A small sketch of this slice calculation follows this list;
A related parameter, sched_wakeup_granularity_ns, limits the condition a waking process must satisfy before it can preempt the current process: preemption is allowed only when the waking process's vruntime is smaller than the current process's vruntime and the difference (vdiff) is larger than sched_wakeup_granularity_ns; otherwise it is not allowed. The larger this parameter, the harder it is for wakeup preemption to occur;
Based on their virtual runtime (vruntime), the processes in rq->cfs_rq are organized into a red-black tree (a balanced binary tree); in pick_next_entity the leftmost node of the tree is then the process with the least vruntime, i.e., the best candidate to be scheduled next.
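To make the period and ideal_runtime calculation above concrete, here is a minimal user-space sketch (not kernel code; the sched_latency_ns / sched_min_granularity_ns values and the task weights are assumed, illustrative numbers) of how the period is chosen and then split by weight, roughly following the logic of __sched_period() and sched_slice():

#include <stdio.h>

#define NSEC_PER_MSEC      1000000ULL
#define SCHED_LATENCY_NS   (6 * NSEC_PER_MSEC)  /* assumed sched_latency_ns */
#define SCHED_MIN_GRAN_NS  750000ULL            /* assumed sched_min_granularity_ns */
#define SCHED_NR_LATENCY   (SCHED_LATENCY_NS / SCHED_MIN_GRAN_NS)

/* period = sched_latency_ns, stretched to nr * min_granularity when nr is large */
static unsigned long long sched_period(unsigned int nr_running)
{
    if (nr_running > SCHED_NR_LATENCY)
        return nr_running * SCHED_MIN_GRAN_NS;
    return SCHED_LATENCY_NS;
}

int main(void)
{
    /* assume three runnable tasks: two at nice 0 and one at nice 5 */
    unsigned long weight[] = { 1024, 1024, 335 };
    unsigned int nr = sizeof(weight) / sizeof(weight[0]);
    unsigned long long period = sched_period(nr), total = 0;
    unsigned int i;

    for (i = 0; i < nr; i++)
        total += weight[i];
    for (i = 0; i < nr; i++)   /* ideal_runtime_i = period * weight_i / total_weight */
        printf("task %u: ideal_runtime = %llu ns\n", i, period * weight[i] / total);
    return 0;
}

With these assumed numbers the period stays at 6 ms (3 tasks is below sched_nr_latency), the two nice-0 tasks each get about 2.58 ms, and the nice-5 task gets about 0.84 ms.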
Since tasks are organized in the red-black tree by vruntime, and the scheduler picks the leftmost leaf node (the smallest vruntime) to run next, let's first look at how vruntime is calculated.
## How vruntime is calculated
Each process's vruntime = runtime * (NICE_0_LOAD / nice_n_weight)
The weights corresponding to each priority are set up as follows:
nice(0) = NICE_0_LOAD = 1024, nice(1) = nice(0)/1.25, nice(-1) = nice(0)*1.25, as shown in the table below:
/* The main idea behind this table: the weight of each nice level is 1.25 times the weight of the level one step below it */
/*
* Nice levels are multiplicative, with a gentle 10% change for every
* nice level changed. I.e. when a CPU-bound task goes from nice 0 to
* nice 1, it will get ~10% less CPU time than another CPU-bound task
* that remained on nice 0.
*
* The "10% effect" is relative and cumulative: from _any_ nice level,
* if you go up 1 level, it's -10% CPU usage, if you go down 1 level
* it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
* If a task goes up by ~10% and another task goes down by ~10% then
* the relative distance between them is ~25%.)
*/
static const int prio_to_weight[40] = {
/* -20 */ 88761, 71755, 56483, 46273, 36291,
/* -15 */ 29154, 23254, 18705, 14949, 11916,
/* -10 */ 9548, 7620, 6100, 4904, 3906,
/* -5 */ 3121, 2501, 1991, 1586, 1277,
/* 0 */ 1024, 820, 655, 526, 423,
/* 5 */ 335, 272, 215, 172, 137,
/* 10 */ 110, 87, 70, 56, 45,
/* 15 */ 36, 29, 23, 18, 15,
};
NICE_0_LOAD (1024) is a magic number in scheduler calculations: it represents the baseline "1". Because the kernel cannot work with fractions, the value 1 is scaled up to 1024.
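As a quick illustration of that baseline (assumed values, plain integer math in user space, not kernel code): the same 10 ms of real runtime advances vruntime by different amounts depending on the weight, since vruntime_delta = delta_exec * NICE_0_LOAD / weight:

#include <stdio.h>

#define NICE_0_LOAD 1024ULL

int main(void)
{
    unsigned long long delta_exec = 10000000ULL; /* 10 ms of real runtime, in ns */
    unsigned long long w_nice0 = 1024;           /* weight of a nice 0 task */
    unsigned long long w_nice5 = 335;            /* weight of a nice 5 task */

    /* nice 0: vruntime advances exactly as fast as real time (10 ms) */
    printf("nice 0: +%llu ns of vruntime\n", delta_exec * NICE_0_LOAD / w_nice0);
    /* nice 5: vruntime advances about 3x faster (~30.5 ms), so it is picked less often */
    printf("nice 5: +%llu ns of vruntime\n", delta_exec * NICE_0_LOAD / w_nice5);
    return 0;
}

A smaller weight makes vruntime grow faster, which pushes the task to the right of the red-black tree and so reduces its share of CPU time.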
vruntime is updated every TICK_NSEC along the following path:
scheduler_tick ----> task_tick_fair ----> entity_tick ----> update_curr, as shown below:
void scheduler_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
sched_clock_tick();
raw_spin_lock(&rq->lock);
/*set the rq's window_start time (WALT)*/
walt_set_window_start(rq);
/*update the task's runnable time, previous runnable time and the current task's demand (WALT)*/
walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
walt_ktime_clock(), 0);
update_rq_clock(rq); /*update the rq clock*/
#ifdef CONFIG_INTEL_DWS
if (sched_feat(INTEL_DWS))
update_rq_runnable_task_avg(rq);
#endif
/*update the current task's vruntime (virtual runtime) for this tick*/
curr->sched_class->task_tick(rq, curr, 0);
/*load here is used for load balancing; load decay and calculation are covered separately later*/
update_cpu_load_active(rq); /*update the cpu load*/
calc_global_load_tick(rq); /*update the system-wide load*/
raw_spin_unlock(&rq->lock);
perf_event_task_tick();
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq);
#endif
rq_last_tick_reset(rq);
if (curr->sched_class == &fair_sched_class)
check_for_migration(rq, curr);
}
/*
* scheduler tick hitting a task of our scheduling class:
*/
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &curr->se;
struct sched_domain *sd;
/*walk the scheduling-entity hierarchy and update each se's (scheduling entity's) vruntime*/
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
#ifdef CONFIG_64BIT_ONLY_CPU
trace_sched_load_per_bitwidth(rq->cpu, weighted_cpuload(rq->cpu),
weighted_cpuload_32bit(rq->cpu));
#endif
#ifdef CONFIG_SMP
rq->misfit_task = !task_fits_max(curr, rq->cpu);
rcu_read_lock();
sd = rcu_dereference(rq->sd);
/*decide whether the sched domain is overutilized based on whether the cpu is overutilized and whether the rq has a misfit task; how overutilization is determined is covered in a dedicated article*/
if (sd) {
if (cpu_overutilized(task_cpu(curr)))
set_sd_overutilized(sd);
if (rq->misfit_task && sd->parent)
set_sd_overutilized(sd->parent);
}
rcu_read_unlock();
#endif
}
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq); /*update the runtime and the virtual runtime*/
/*
* Ensure that runnable average is periodically updated.
*/
update_load_avg(curr, UPDATE_TG); /*update the entity's load (PELT algorithm)*/
update_cfs_shares(curr);
……………………………….
/*when the rq has more than one runnable task, check whether the current task needs to be rescheduled*/
if (cfs_rq->nr_running > 1)
check_preempt_tick(cfs_rq, curr);
}
/*compute vruntime*/
/*
* Update the current task's runtime statistics.
*/
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
u64 now = rq_clock_task(rq_of(cfs_rq));
u64 delta_exec;
if (unlikely(!curr))
return;
/*time curr has been running since exec_start*/
delta_exec = now - curr->exec_start;
if (unlikely((s64)delta_exec <= 0))
return;
/*re-mark the start time so the runtime can be computed again at the next tick*/
curr->exec_start = now;
schedstat_set(curr->statistics.exec_max,
max(delta_exec, curr->statistics.exec_max));
/*accumulated runtime*/
curr->sum_exec_runtime += delta_exec;
/*update the proc/schedstat data*/
schedstat_add(cfs_rq, exec_clock, delta_exec);
/*on each tick, update the current task's vruntime*/
curr->vruntime += calc_delta_fair(delta_exec, curr);
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
cpuacct_charge(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
}
/*
* delta /= w
*/
/*compute the virtual runtime*/
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
if (unlikely(se->load.weight != NICE_0_LOAD))
delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
return delta;
}
The above merely updates a process's virtual runtime from its actual runtime and its weight; so how exactly does the system compute vruntime from the weight? We know:
static void set_load_weight(struct task_struct *p)
{ /*get the task's priority*/
int prio = p->static_prio - MAX_RT_PRIO;
struct load_weight *load = &p->se.load;
/*
* SCHED_IDLE tasks get minimal weight:
*//*set the weight of the idle thread's priority*/
if (idle_policy(p->policy)) {
load->weight = scale_load(WEIGHT_IDLEPRIO);
load->inv_weight = WMULT_IDLEPRIO;
return;
}
/*set the priority weight of a normal process*/
load->weight = scale_load(prio_to_weight[prio]);
/*reciprocal of the process weight, equal to 2^32/weight*/
load->inv_weight = prio_to_wmult[prio];
/*both values above can simply be looked up in the tables*/
}
/*
* Nice levels are multiplicative, with a gentle 10% change for every
* nice level changed. I.e. when a CPU-bound task goes from nice 0 to
* nice 1, it will get ~10% less CPU time than another CPU-bound task
* that remained on nice 0.
*
* The "10% effect" is relative and cumulative: from _any_ nice level,
* if you go up 1 level, it's -10% CPU usage, if you go down 1 level
* it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
* If a task goes up by ~10% and another task goes down by ~10% then
* the relative distance between them is ~25%.)
*/
static const int prio_to_weight[40] = {
/* -20 */ 88761, 71755, 56483, 46273, 36291,
/* -15 */ 29154, 23254, 18705, 14949, 11916,
/* -10 */ 9548, 7620, 6100, 4904, 3906,
/* -5 */ 3121, 2501, 1991, 1586, 1277,
/* 0 */ 1024, 820, 655, 526, 423,
/* 5 */ 335, 272, 215, 172, 137,
/* 10 */ 110, 87, 70, 56, 45,
/* 15 */ 36, 29, 23, 18, 15,
};
/*
* Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
*
* In cases where the weight does not change often, we can use the
* precalculated inverse to speed up arithmetics by turning divisions
* into multiplications:
*/ /*reciprocal of the process weight, 2^32/weight: 2^32 = 4294967296, and 2^32/NICE_0_LOAD = 2^32/1024 = 4194304, which matches the table*/
static const u32 prio_to_wmult[40] = {
/* -20 */ 48388, 59856, 76040, 92818, 118348,
/* -15 */ 147320, 184698, 229616, 287308, 360437,
/* -10 */ 449829, 563644, 704093, 875809, 1099582,
/* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
/* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
/* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
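The comment above claims inv_weight = 2^32/weight. A small user-space check (the sample values are copied from the two tables above; the check itself is mine, not kernel code) confirms that the prio_to_wmult[] entries are simply the rounded inverses of prio_to_weight[]:

#include <stdio.h>

int main(void)
{
    /* a few sample entries: nice -20, 0, 5 and 19 */
    unsigned long long weight[] = { 88761, 1024, 335, 15 };
    unsigned long long wmult[]  = { 48388, 4194304, 12820798, 286331153 };
    int i;

    for (i = 0; i < 4; i++) {
        /* 2^32 / weight, rounded to the nearest integer */
        unsigned long long computed = ((1ULL << 32) + weight[i] / 2) / weight[i];
        printf("weight=%llu  2^32/weight=%llu  prio_to_wmult=%llu\n",
               weight[i], computed, wmult[i]);
    }
    return 0;
}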
/*compute the virtual runtime*/
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
if (unlikely(se->load.weight != NICE_0_LOAD))
delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
return delta;
}
/*
* delta_exec * weight / lw.weight
* OR
* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
*
* Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
* we're guaranteed shift stays positive because inv_weight is guaranteed to
* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
*
* Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
* weight/lw.weight <= 1, and therefore our shift will also be positive.
*/
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
u64 fact = scale_load_down(weight);
int shift = WMULT_SHIFT;
__update_inv_weight(lw);
if (unlikely(fact >> 32)) {
while (fact >> 32) {
fact >>= 1;
shift--;
}
}
/* hint to use a 32x32->64 mul */
fact = (u64)(u32)fact * lw->inv_weight;
while (fact >> 32) {
fact >>= 1;
shift--;
}
return mul_u64_u32_shr(delta_exec, fact, shift);
}
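As a quick numeric check of __calc_delta() above, the following user-space sketch (assumed values: a 10 ms tick of runtime charged to a nice-5 entity) compares the fixed-point form (delta_exec * (weight * inv_weight)) >> 32 against the exact division delta_exec * weight / lw.weight; skipping the 64-bit overflow rescaling is my simplification, valid only because the operands here are small:

#include <stdio.h>

int main(void)
{
    unsigned long long delta_exec = 10000000ULL; /* 10 ms in ns */
    unsigned long long weight     = 1024;        /* NICE_0_LOAD */
    unsigned long long lw_weight  = 335;         /* the entity's weight (nice 5) */
    unsigned long long inv_weight = 12820798;    /* prio_to_wmult[] entry for nice 5 */

    unsigned long long exact = delta_exec * weight / lw_weight;
    /* the shift by 32 replaces the division; __calc_delta also rescales fact/shift
       to keep the intermediate product within 64 bits, which is omitted here */
    unsigned long long fixed = (delta_exec * (weight * inv_weight)) >> 32;

    printf("exact division: %llu ns\n", exact); /* 30567164 ns, about 30.57 ms */
    printf("fixed point   : %llu ns\n", fixed); /* same result here */
    return 0;
}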
So the core calculation function is __calc_delta(); how does it compute the result? We know that vruntime = runtime * (NICE_0_LOAD/nice_n_weight) =