本文档记录dequeue_task_fair函数的详细流程，这个函数是CFS调度类提供的dequeue_task方法的实现，当一个任务进入睡眠状态或者退出时需要将其从运行队列之中移除，从运行队列之中移除时会调用CFS提供的dequeue_task方法实现，更新CFS调度类内部的数据结构。

`dequeue_task_fair`函数

/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 *
 * @rq:    runqueue the task is being removed from
 * @p:     task being dequeued
 * @flags: DEQUEUE_* flags; DEQUEUE_SLEEP is tested here and the whole set
 *         is handed on to dequeue_entity()
 *
 * The hierarchy is walked with for_each_sched_entity() in two passes. The
 * first pass dequeues entities until it reaches a cfs_rq that still has
 * load; the second pass only refreshes load/runnable/group statistics for
 * the levels that stay enqueued. Both passes decrement h_nr_running and
 * idle_h_nr_running, and both jump to dequeue_throttle on a throttled
 * cfs_rq, which skips sub_nr_running() and the early-balance check.
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;
	/*
	 * Sampled before the first loop ORs DEQUEUE_SLEEP into flags, so this
	 * reflects only what the caller passed in.
	 */
	int task_sleep = flags & DEQUEUE_SLEEP;
	/*
	 * Amount subtracted from each level's idle_h_nr_running. Starts as
	 * task_has_idle_policy(p) and is forced to 1 once an idle cfs_rq has
	 * been passed on the way up.
	 */
	int idle_h_nr_running = task_has_idle_policy(p);
	/* sched_idle_rq() state before the dequeue, compared again below. */
	bool was_sched_idle = sched_idle_rq(rq);

	util_est_dequeue(&rq->cfs, p);

	/* Pass 1: dequeue the entity at each level until a cfs_rq keeps load. */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		dequeue_entity(cfs_rq, se, flags);

		cfs_rq->h_nr_running--;
		cfs_rq->idle_h_nr_running -= idle_h_nr_running;

		if (cfs_rq_is_idle(cfs_rq))
			idle_h_nr_running = 1;

		/* end evaluation on encountering a throttled cfs_rq */
		if (cfs_rq_throttled(cfs_rq))
			goto dequeue_throttle;

		/* Don't dequeue parent if it has other entities besides us */
		if (cfs_rq->load.weight) {
			/* Avoid re-evaluating load for this entity: */
			se = parent_entity(se);
			/*
			 * Bias pick_next to pick a task from this cfs_rq, as
			 * p is sleeping when it is within its sched_slice.
			 */
			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
				set_next_buddy(se);
			break;
		}
		/*
		 * This cfs_rq has no load left, so the walk continues and the
		 * next level's entity is dequeued with DEQUEUE_SLEEP set.
		 */
		flags |= DEQUEUE_SLEEP;
	}

	/*
	 * Pass 2: starts at the se the first loop stopped on. These levels stay
	 * enqueued; only their statistics and hierarchical counters are updated.
	 */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);

		update_load_avg(cfs_rq, se, UPDATE_TG);
		se_update_runnable(se);
		update_cfs_group(se);

		cfs_rq->h_nr_running--;
		cfs_rq->idle_h_nr_running -= idle_h_nr_running;

		if (cfs_rq_is_idle(cfs_rq))
			idle_h_nr_running = 1;

		/* end evaluation on encountering a throttled cfs_rq */
		if (cfs_rq_throttled(cfs_rq))
			goto dequeue_throttle;

	}

	/* At this point se is NULL and we are at root level*/
	sub_nr_running(rq, 1);

	/* balance early to pull high priority tasks */
	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
		rq->next_balance = jiffies;

	/*
	 * Reached on the normal path and also when a throttled cfs_rq cut the
	 * walk short; in the latter case the rq-level updates above were skipped.
	 */
dequeue_throttle:
	util_est_update(&rq->cfs, p, task_sleep);
	hrtick_update(rq);
}

这个函数的主要流程都在两个for循环之中执行，这两个for循环都是在遍历调度层级之中的每一级的实体，遍历过程中更新CFS运行队列之中可运行任务(h_nr_running)与可运行空闲任务(idle_h_nr_running)的统计，这里是减小这两个统计的值并且这两个统计的值是在两个循环之中更新的，而enqueue_task_fair是增加这两个统计的值，这两个统计的更新逻辑结合enqueue_task_fair之中的解释理解，尤其需要注意为什么会在两个循环之中更新这两个统计的值。

在第一个循环中对每个实体调用dequeue_entity函数更新任务以及任务组相关统计随后将任务从CFS运行队列之中移除，这里需要注意的是并不是遍历到的所有的实体都需要从其所在的CFS运行队列之中移除，当实体所在的CFS运行队列之中还有任务时停止向上遍历，这里判断CFS运行队列之中是否还有任务是通过当前实体所在CFS运行队列的权重(cfs_rq->load.weight)是否为0来确定的。这里需要留意parent_entity返回的实体与CFS运行队列cfs_rq之间的关系，它们是一个任务组在某个cpu上的实体以及运行队列，实体参与这个cpu上的任务调度、运行队列保存任务组之中可运行的任务。接下来关注CFS运行队列的权重不为0时的处理，通过parent_entity获取当前实体在调度层级之中的上级实体，若任务p在时间片耗尽之前进入睡眠状态并且这一层没有被限流，那么将上级调度实体设置为这个实体所在CFS运行队列之中期望接下来执行的任务对应的实体。这里通过task_sleep来判断任务p是否是一个在时间片耗尽之前就进入睡眠状态的任务，若task_sleep为1意味着调用这个函数的时候flags之中设置了DEQUEUE_SLEEP标志，这意味着正在执行的任务时间片还没有消耗完但要进入睡眠状态、新的任务要抢占正在运行的任务（若任务的时间片已经消耗完时进行抢占，任务处于可运行状态），可以结合__schedule函数之中signal_pending_state(prev_state, prev)为False时执行的代码逻辑理解。第二个for循环对于遍历到的每个实体调用update_load_avg函数更新实体对应任务和实体所在CFS运行队列的平均负载、调用se_update_runnable函数更新任务组在某个cpu上实体的可运行任务统计、调用update_cfs_group更新任务组在某个cpu上实体的权重。

这个函数调用了两次CFS运行队列以及任务组统计指标的更新函数，例如update_load_avg、se_update_runnable、update_cfs_group这三个函数，第一次调用是在第一个for循环中执行dequeue_entity的时候由dequeue_entity函数调用，第二次调用是在第二个for循环之中，同样的在enqueue_task_fair之中也出现了调用这三个函数两次的情况。这背后的原因主要涉及到两点：第一点是调度层次中当前层次的统计指标变化会影响上级的统计指标计算，在后边详细说明是如何影响的；第二点是调度层级的每一层之中h_nr_running、idle_h_nr_running要考虑所有下层之中可运行任务数量、可运行的空闲任务数量变化，作者把数量变化的传递放到了两个循环之中，可以结合enqueue_task_fair函数中的内容理解这一点。可以结合代码理解第一点，在遍历过程中每一层都会递减这一层实体se所在CFS运行队列中可运行任务统计(h_nr_running的值)，这个CFS运行队列是任务组在当前cpu上的运行队列，这一层实体的父实体pse就是任务组在当前cpu上的调度实体，当遍历到父实体时会更新父实体pse的runnable_weight为se所属CFS运行队列的可运行任务统计，这会进一步影响之后对pse执行update_load_avg时load_avg的计算。简而言之，把数量变化放到两个循环之中处理，每个循环之中当前遍历的实体所属CFS运行队列统计指标的变化会影响接下来遍历到的实体。

这个函数最后的部分调用sub_nr_running将运行队列之中可运行任务的数量递减；若经过前边的处理之后，运行队列从“并非只有空闲任务”变为“只剩下空闲任务”(!was_sched_idle && sched_idle_rq(rq)为True)，则将此运行队列下一次负载均衡的时机提前到现在(rq->next_balance = jiffies)；最后调用hrtick_update更新高精度定时器。

以上的流程中提到的update_load_avg、se_update_runnable、update_cfs_group、hrtick_update函数流程详细记录见enqueue_task_cfs.md，sched_idle_rq、dequeue_entity、sub_nr_running函数流程在后边记录，其他函数忽略。

`sched_idle_rq`函数

/*
 * Non-zero when @rq is not empty and every task counted in rq->nr_running
 * is also counted in rq->cfs.idle_h_nr_running, i.e. only SCHED_IDLE tasks
 * are enqueued. An empty runqueue yields 0.
 */
static int sched_idle_rq(struct rq *rq)
{
	/* Check for a non-empty rq first: 0 == 0 must not count as "idle only". */
	return unlikely(rq->nr_running &&
			rq->cfs.idle_h_nr_running == rq->nr_running);
}

这个函数判断一个运行队列之中是否只包含空闲任务，判断的方式为比较运行队列中可运行任务以及可运行空闲任务的数量，二者相等则意味着运行队列之中所有任务都是空闲任务。当运行队列中可运行任务为0时这种判断会误认为运行队列之中所有任务为空闲任务，需要在判断之中排除此种情况，因此会要求nr_running不为0。

`dequeue_entity`函数

/*
 * Dequeue @se from @cfs_rq.
 *
 * @cfs_rq: the cfs_rq that @se is queued on
 * @se:     entity to remove (a task entity or a group entity)
 * @flags:  DEQUEUE_* flags; DEQUEUE_SLEEP, DEQUEUE_SAVE and DEQUEUE_MOVE are
 *          examined here, and the whole set goes to update_stats_dequeue_fair()
 *
 * Brings run-time and load statistics up to date, takes the entity out of
 * the timeline unless it is cfs_rq->curr, clears se->on_rq, drops its
 * accounting from the cfs_rq, and finally adjusts vruntime / min_vruntime
 * according to @flags.
 */
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	int action = UPDATE_TG;

	/* A task entity that is migrating additionally requests DO_DETACH. */
	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
		action |= DO_DETACH;

	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	/*
	 * When dequeuing a sched_entity, we must:
	 *   - Update loads to have both entity and cfs_rq synced with now.
	 *   - For group_entity, update its runnable_weight to reflect the new
	 *     h_nr_running of its group cfs_rq.
	 *   - Subtract its previous weight from cfs_rq->load.weight.
	 *   - For group entity, update its weight to reflect the new share
	 *     of its group cfs_rq.
	 */
	update_load_avg(cfs_rq, se, action);
	se_update_runnable(se);

	update_stats_dequeue_fair(cfs_rq, se, flags);

	clear_buddies(cfs_rq, se);

	/*
	 * Only the tree removal is conditional; on_rq and the accounting below
	 * are updated for every entity.
	 * NOTE(review): presumably cfs_rq->curr is not kept in the timeline
	 * while it runs -- confirm against the pick/put paths.
	 */
	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
	se->on_rq = 0;
	account_entity_dequeue(cfs_rq, se);

	/*
	 * Normalize after update_curr(); which will also have moved
	 * min_vruntime if @se is the one holding it back. But before doing
	 * update_min_vruntime() again, which will discount @se's position and
	 * can move min_vruntime forward still more.
	 */
	/* Without DEQUEUE_SLEEP, vruntime becomes relative to min_vruntime. */
	if (!(flags & DEQUEUE_SLEEP))
		se->vruntime -= cfs_rq->min_vruntime;

	/* return excess runtime on last dequeue */
	return_cfs_rq_runtime(cfs_rq);

	update_cfs_group(se);

	/*
	 * Now advance min_vruntime if @se was the entity holding it back,
	 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
	 * put back on, and if we advance min_vruntime, we'll be placed back
	 * further than we started -- ie. we'll be penalized.
	 */
	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
		update_min_vruntime(cfs_rq);

	/* The cfs_rq has no entities left after this dequeue. */
	if (cfs_rq->nr_running == 0)
		update_idle_cfs_rq_clock_pelt(cfs_rq);
}

若实体se对应一个任务并且这个任务要从运行此函数的当前cpu之中迁移出去，需要为action追加DO_DETACH标记，这个标记会导致在update_load_avg函数中从CFS运行队列的avg以及sum指标中减去来自实体se的贡献。一个任务从CFS运行队列之中移除时，是同步这个任务与其所在的CFS运行队列、任务组统计信息的最后时机，这里调用update_curr函数更新正在运行任务的时间统计、调用update_load_avg将实体se对avg、sum指标的贡献同步到CFS运行队列之中、调用se_update_runnable更新任务组在当前cpu上的实体的可运行任务统计。如果se对应的不是当前cpu中正在运行的任务(se != cfs_rq->curr)，调用__dequeue_entity函数将实体se从当前cpu的CFS运行队列之中移除；随后无论se是否为正在运行的实体，都会将实体se的on_rq设置为0以表明实体已经不在运行队列之中，并调用account_entity_dequeue函数将实体的权重从CFS运行队列之中移除。若实体se对应的任务并非因为进入睡眠状态而从CFS运行队列移除(flags之中没有DEQUEUE_SLEEP)，要从任务的虚拟运行时间之中减去CFS运行队列的最小虚拟运行时间(min_vruntime)，这是将任务的虚拟运行时间转换成相对虚拟运行时间。若实体se为任务组在当前cpu上的实体，重新计算这个实体的权重。除了flags之中指定了DEQUEUE_SAVE却没有指定DEQUEUE_MOVE的情况之外，都会调用update_min_vruntime函数更新CFS运行队列的基准虚拟运行时间；只指定DEQUEUE_SAVE意味着实体马上会被重新放回同一个运行队列，此时若推进min_vruntime，实体被放回时的位置会比原来更靠后，相当于受到了惩罚。结合调用场景理解：在sched_move_task函数中要将任务转移到新的运行队列之中，此时调用此函数时同时指定了DEQUEUE_SAVE和DEQUEUE_MOVE标记，会更新min_vruntime；在set_user_nice函数中调用此函数时指定了DEQUEUE_SAVE而没有指定DEQUEUE_MOVE，不会更新min_vruntime。

以上的流程之中涉及到的update_curr函数详细记录见task_fork_cfs.md，update_load_avg、se_update_runnable、update_cfs_group函数详细记录见enqueue_task_cfs.md，update_min_vruntime函数详细记录见task_fork_cfs.md，在后边详细记录account_entity_dequeue、__dequeue_entity这两个函数的流程，其他的函数忽略。

`account_entity_dequeue`函数

/*
 * Remove @se's contribution from @cfs_rq's bookkeeping: subtract its load
 * weight from cfs_rq->load and decrement nr_running, plus idle_nr_running
 * when se_is_idle(se) holds. With CONFIG_SMP, task entities additionally go
 * through account_numa_dequeue() and are unlinked from the list that
 * se->group_node is on.
 */
static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_sub(&cfs_rq->load, se->load.weight);
#ifdef CONFIG_SMP
	/* Only task entities take part in the NUMA and group_node accounting. */
	if (entity_is_task(se)) {
		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
		list_del_init(&se->group_node);
	}
#endif
	cfs_rq->nr_running--;
	if (se_is_idle(se))
		cfs_rq->idle_nr_running--;
}

这个函数从CFS运行队列的权重之中减去实体se的贡献，递减CFS运行队列之中可运行任务的统计，若se对应的任务为一个空闲任务则递减CFS运行队列中可运行的空闲任务统计，其他的内容忽略。

`__dequeue_entity`函数

/*
 * Erase @se's run_node from the cached rbtree cfs_rq->tasks_timeline.
 * Only the tree is touched here; se->on_rq and the cfs_rq counters are not
 * modified by this function.
 */
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}

这个函数将se从CFS运行队列之中移除，这意味着将任务从CFS运行队列之中移除，这个函数的逻辑正好与__enqueue_entity函数相反。

`sub_nr_running`函数

/*
 * Lower rq->nr_running by @count, report the change through the
 * sched_update_nr_running tracepoint when that tracepoint is enabled, and
 * then re-evaluate the tick dependency of @rq.
 */
static inline void sub_nr_running(struct rq *rq, unsigned count)
{
	rq->nr_running = rq->nr_running - count;

	/* The tracepoint receives the change as a negated count. */
	if (trace_sched_update_nr_running_tp_enabled())
		call_trace_sched_update_nr_running(rq, -count);

	/* With fewer runnable tasks, check whether preemption is still needed. */
	sched_update_tick_dependency(rq);
}

这个函数修改运行队列之中可运行任务的数量，从中减去count的值，其他的内容忽略。

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

dequeue_task_cfs.md

dequeue_task_cfs.md

`dequeue_task_fair`函数

`sched_idle_rq`函数

`dequeue_entity`函数

`account_entity_dequeue`函数

`__dequeue_entity`函数

`sub_nr_running`函数

Files

dequeue_task_cfs.md

Latest commit

History

dequeue_task_cfs.md

File metadata and controls

dequeue_task_fair函数

sched_idle_rq函数

dequeue_entity函数

account_entity_dequeue函数

__dequeue_entity函数

sub_nr_running函数

`dequeue_task_fair`函数

`sched_idle_rq`函数

`dequeue_entity`函数

`account_entity_dequeue`函数

`__dequeue_entity`函数

`sub_nr_running`函数