@@ -1073 +1077 @@ dequeue_entity()
 	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
-	update_min_vruntime(cfs_rq);
-	update_cfs_shares(cfs_rq);
 	 * Normalize the entity after updating the min_vruntime because the
@@ -1350 +1357 @@ dequeue_task_fair()
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
+	int task_sleep = flags & DEQUEUE_SLEEP;

 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);

 		/* Don't dequeue parent if it has other entities besides us */
-		if (cfs_rq->load.weight)
+		if (cfs_rq->load.weight) {
+			 * Bias pick_next to pick a task from this cfs_rq, as
+			 * p is sleeping when it is within its sched_slice.
+			if (task_sleep && parent_entity(se))
+				set_next_buddy(parent_entity(se));
 		flags |= DEQUEUE_SLEEP;
@@ -1373 +1388 @@
 #ifdef CONFIG_SMP

-static void task_waking_fair(struct rq *rq, struct task_struct *p)
+static void task_waking_fair(struct task_struct *p)
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);

-	se->vruntime -= cfs_rq->min_vruntime;
+#ifndef CONFIG_64BIT
+	u64 min_vruntime_copy;
+		min_vruntime_copy = cfs_rq->min_vruntime_copy;
+		min_vruntime = cfs_rq->min_vruntime;
+	} while (min_vruntime != min_vruntime_copy);
+	min_vruntime = cfs_rq->min_vruntime;
+	se->vruntime -= min_vruntime;

 #ifdef CONFIG_FAIR_GROUP_SCHED
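The new task_waking_fair() reads cfs_rq->min_vruntime without holding the runqueue lock, so on 32-bit kernels a torn 64-bit load has to be guarded against. Below is a minimal sketch of the copy/retry idiom the hunk uses, assuming the writer publishes min_vruntime_copy behind a write barrier (the struct and function names here are illustrative, not the kernel's):

struct vruntime_box {
	u64 min_vruntime;
#ifndef CONFIG_64BIT
	u64 min_vruntime_copy;
#endif
};

/* writer: store the value, then publish the copy behind a barrier */
static void vruntime_publish(struct vruntime_box *b, u64 v)
{
	b->min_vruntime = v;
#ifndef CONFIG_64BIT
	smp_wmb();
	b->min_vruntime_copy = v;
#endif
}

/* lockless reader: retry until the value and its copy agree, so a torn
 * 32-bit half-update can never be returned */
static u64 vruntime_read(struct vruntime_box *b)
{
	u64 v;
#ifndef CONFIG_64BIT
	u64 copy;

	do {
		copy = b->min_vruntime_copy;
		smp_rmb();
		v = b->min_vruntime;
	} while (v != copy);
#else
	v = b->min_vruntime;
#endif
	return v;
}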
@@ -1657 +1687 @@
  * preempt must be disabled.
-select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
 	int cpu = smp_processor_id();
@@ -1692 +1723 @@ select_task_rq_fair()
 				nr_running += cpu_rq(i)->cfs.nr_running;

-		capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+		capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);

 		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
 			nr_running /= 2;
@@ -1724 +1755 @@ select_task_rq_fair()
 	if (affine_sd) {
 		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
-			return select_idle_sibling(p, cpu);
-			return select_idle_sibling(p, prev_cpu);
+		new_cpu = select_idle_sibling(p, prev_cpu);
@@ -1789 +1823 @@ wakeup_gran()
 	 * This is especially important for buddies when the leftmost
 	 * task is higher priority than the buddy.
-	if (unlikely(se->load.weight != NICE_0_LOAD))
-		gran = calc_delta_fair(gran, se);
+	return calc_delta_fair(gran, se);
1858
static void set_last_buddy(struct sched_entity *se)
1829
if (likely(task_of(se)->policy != SCHED_IDLE)) {
1830
for_each_sched_entity(se)
1831
cfs_rq_of(se)->last = se;
1860
if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1863
for_each_sched_entity(se)
1864
cfs_rq_of(se)->last = se;
@@ -1835 +1867 @@
 static void set_next_buddy(struct sched_entity *se)
-	if (likely(task_of(se)->policy != SCHED_IDLE)) {
-		for_each_sched_entity(se)
-			cfs_rq_of(se)->next = se;
+	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+		return;
+	for_each_sched_entity(se)
+		cfs_rq_of(se)->next = se;
@@ -1843 +1876 @@
 static void set_skip_buddy(struct sched_entity *se)
-	if (likely(task_of(se)->policy != SCHED_IDLE)) {
-		for_each_sched_entity(se)
-			cfs_rq_of(se)->skip = se;
+	for_each_sched_entity(se)
+		cfs_rq_of(se)->skip = se;
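All three buddy setters above only mark pointers on each cfs_rq in the hierarchy; their effect comes from the pick-next path consulting them. A rough sketch of that preference order follows (a simplification of pick_next_entity() in this file, reusing its helpers; not a verbatim copy):

static struct sched_entity *pick_next_sketch(struct cfs_rq *cfs_rq)
{
	struct sched_entity *left = __pick_first_entity(cfs_rq); /* leftmost in the rbtree */
	struct sched_entity *se = left;

	/* skip buddy: pass over an entity that just yielded, if fair enough */
	if (cfs_rq->skip == se) {
		struct sched_entity *second = __pick_next_entity(se);
		if (second && wakeup_preempt_entity(second, left) < 1)
			se = second;
	}

	/* last buddy: prefer returning the CPU to a recently preempted task */
	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
		se = cfs_rq->last;

	/* next buddy: prefer whatever the dequeue/preempt paths marked */
	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
		se = cfs_rq->next;

	return se;
}

The wakeup_preempt_entity() < 1 test is the "not too unfair" check: a buddy is only honoured while its vruntime stays within roughly one wakeup granularity of the leftmost entity.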
@@ -1857 +1888 @@ check_preempt_wakeup()
 	struct sched_entity *se = &curr->se, *pse = &p->se;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	int scale = cfs_rq->nr_running >= sched_nr_latency;
+	int next_buddy_marked = 0;

 	if (unlikely(se == pse))

-	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
+	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
 		set_next_buddy(pse);
+		next_buddy_marked = 1;

 	 * We can come here with TIF_NEED_RESCHED already set from new task

 	update_curr(cfs_rq);
 	find_matching_se(&se, &pse);
-	if (wakeup_preempt_entity(se, pse) == 1)
+	if (wakeup_preempt_entity(se, pse) == 1) {
+		 * Bias pick_next to pick the sched entity that is
+		 * triggering this preemption.
+		if (!next_buddy_marked)
+			set_next_buddy(pse);
@@ -2102 +2143 @@
 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	      unsigned long max_load_move, struct sched_domain *sd,
 	      enum cpu_idle_type idle, int *all_pinned,
-	      int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
+	      struct cfs_rq *busiest_cfs_rq)
 	int loops = 0, pulled = 0;
 	long rem_load_move = max_load_move;
@@ -2202 +2240 @@
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *all_pinned, int *this_best_prio)
+		  int *all_pinned)
 	long rem_load_move = max_load_move;
 	int busiest_cpu = cpu_of(busiest);

 		rem_load = div_u64(rem_load, busiest_h_load + 1);

 		moved_load = balance_tasks(this_rq, this_cpu, busiest,
-				rem_load, sd, idle, all_pinned, this_best_prio,
+				rem_load, sd, idle, all_pinned,
 				busiest_cfs_rq);

 		if (!moved_load)
@@ -2253 +2291 @@
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *all_pinned, int *this_best_prio)
+		  int *all_pinned)
 	return balance_tasks(this_rq, this_cpu, busiest,
 			max_load_move, sd, idle, all_pinned,
-			this_best_prio, &busiest->cfs);
+			&busiest->cfs);
@@ -2274 +2312 @@ move_tasks()
 		      int *all_pinned)
 	unsigned long total_load_moved = 0, load_moved;
-	int this_best_prio = this_rq->curr->prio;

 		load_moved = load_balance_fair(this_rq, this_cpu, busiest,
 				max_load_move - total_load_moved,
-				sd, idle, all_pinned, &this_best_prio);
+				sd, idle, all_pinned);

 		total_load_moved += load_moved;
@@ -2571 +2608 @@ scale_rt_power()
 	available = total - rq->rt_avg;

-	if (unlikely((s64)total < SCHED_LOAD_SCALE))
-		total = SCHED_LOAD_SCALE;
+	if (unlikely((s64)total < SCHED_POWER_SCALE))
+		total = SCHED_POWER_SCALE;

-	total >>= SCHED_LOAD_SHIFT;
+	total >>= SCHED_POWER_SHIFT;

 	return div_u64(available, total);
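For a feel of the numbers (illustrative values only, not from the source): total is clamped to at least SCHED_POWER_SCALE and pre-shifted by SCHED_POWER_SHIFT, so the quotient comes out in SCHED_POWER_SCALE units:

	u64 total     = 4 * SCHED_POWER_SCALE;  /* hypothetical averaging period */
	u64 rt_avg    = total / 4;              /* assume RT/irq ate 25% of it   */
	u64 available = total - rt_avg;         /* 3 * SCHED_POWER_SCALE         */

	total >>= SCHED_POWER_SHIFT;            /* 4                             */
	/* div_u64(available, total) == 768, i.e. ~75% of 1024 left for CFS      */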
@@ -2582 +2619 @@
 static void update_cpu_power(struct sched_domain *sd, int cpu)
 	unsigned long weight = sd->span_weight;
-	unsigned long power = SCHED_LOAD_SCALE;
+	unsigned long power = SCHED_POWER_SCALE;
 	struct sched_group *sdg = sd->groups;

 	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
 			power *= default_scale_smt_power(sd, cpu);
-		power >>= SCHED_LOAD_SHIFT;
+		power >>= SCHED_POWER_SHIFT;

-	sdg->cpu_power_orig = power;
+	sdg->sgp->power_orig = power;

 	if (sched_feat(ARCH_POWER))
 		power *= arch_scale_freq_power(sd, cpu);
 		power *= default_scale_freq_power(sd, cpu);

-	power >>= SCHED_LOAD_SHIFT;
+	power >>= SCHED_POWER_SHIFT;

 	power *= scale_rt_power(cpu);
-	power >>= SCHED_LOAD_SHIFT;
+	power >>= SCHED_POWER_SHIFT;

 	cpu_rq(cpu)->cpu_power = power;
-	sdg->cpu_power = power;
+	sdg->sgp->power = power;
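Every factor in the chain above is a fixed-point value relative to SCHED_POWER_SCALE (1 << SCHED_POWER_SHIFT = 1024), so each multiply is followed by a shift back down. A worked run with made-up factors (589 is roughly what the default smt_gain yields for two siblings; the rest are illustrative):

	unsigned long power = SCHED_POWER_SCALE;  /* 1024, nominal capacity       */

	power *= 589;                             /* SMT sibling factor (~0.58)   */
	power >>= SCHED_POWER_SHIFT;              /* -> 589                       */

	power *= 1024;                            /* freq scaling factor of 1.0   */
	power >>= SCHED_POWER_SHIFT;              /* -> 589                       */

	power *= 768;                             /* scale_rt_power(): 25% RT/irq */
	power >>= SCHED_POWER_SHIFT;              /* -> 441                       */

	/* cpu_rq(cpu)->cpu_power and sdg->sgp->power end up at ~441 of 1024 */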
@@ -2616 +2653 @@
 static void update_group_power(struct sched_domain *sd, int cpu)
@@ -2646 +2683 @@
 fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
-	 * Only siblings can have significantly less than SCHED_LOAD_SCALE
+	 * Only siblings can have significantly less than SCHED_POWER_SCALE
-	if (sd->level != SD_LV_SIBLING)
+	if (!(sd->flags & SD_SHARE_CPUPOWER))

 	 * If ~90% of the cpu_power is still there, we're good.
-	if (group->cpu_power * 32 > group->cpu_power_orig * 29)
+	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
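The 32/29 comparison is just an integer form of the ~90% remark above: sgp->power * 32 > power_orig * 29 is equivalent to power > (29/32) * power_orig, and 29/32 = 0.90625.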
@@ -2736 +2773 @@ update_sg_lb_stats()
 	/* Adjust by relative CPU power of the group */
-	sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
+	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;

 	 * Consider the group unbalanced when the imbalance is larger

 	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
 		sgs->group_imb = 1;

-	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
+						SCHED_POWER_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(sd, group);
 	sgs->group_weight = group->group_weight;
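avg_load here is group load expressed per unit of group power: with illustrative numbers, a group whose CPUs sum to group_load = 3072 with sgp->power = 2048 gets avg_load = 3072 * 1024 / 2048 = 1536, i.e. it looks 1.5x "full" relative to a nominal CPU.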
@@ -2954 +2992 @@ fix_small_imbalance()
 			cpu_avg_load_per_task(this_cpu);

 	scaled_busy_load_per_task = sds->busiest_load_per_task
-	scaled_busy_load_per_task /= sds->busiest->cpu_power;
+					 * SCHED_POWER_SCALE;
+	scaled_busy_load_per_task /= sds->busiest->sgp->power;

 	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
 			(scaled_busy_load_per_task * imbn)) {

-	pwr_now += sds->busiest->cpu_power *
+	pwr_now += sds->busiest->sgp->power *
 			min(sds->busiest_load_per_task, sds->max_load);
-	pwr_now += sds->this->cpu_power *
+	pwr_now += sds->this->sgp->power *
 			min(sds->this_load_per_task, sds->this_load);
-	pwr_now /= SCHED_LOAD_SCALE;
+	pwr_now /= SCHED_POWER_SCALE;

 	/* Amount of load we'd subtract */
-	tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
-		sds->busiest->cpu_power;
+	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
+		sds->busiest->sgp->power;
 	if (sds->max_load > tmp)
-		pwr_move += sds->busiest->cpu_power *
+		pwr_move += sds->busiest->sgp->power *
 			min(sds->busiest_load_per_task, sds->max_load - tmp);

 	/* Amount of load we'd add */
-	if (sds->max_load * sds->busiest->cpu_power <
-		sds->busiest_load_per_task * SCHED_LOAD_SCALE)
-		tmp = (sds->max_load * sds->busiest->cpu_power) /
-			sds->this->cpu_power;
+	if (sds->max_load * sds->busiest->sgp->power <
+		sds->busiest_load_per_task * SCHED_POWER_SCALE)
+		tmp = (sds->max_load * sds->busiest->sgp->power) /
+			sds->this->sgp->power;
-	tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
-		sds->this->cpu_power;
-	pwr_move += sds->this->cpu_power *
+	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
+		sds->this->sgp->power;
+	pwr_move += sds->this->sgp->power *
 			min(sds->this_load_per_task, sds->this_load + tmp);
-	pwr_move /= SCHED_LOAD_SCALE;
+	pwr_move /= SCHED_POWER_SCALE;

 	/* Move if we gain throughput */
 	if (pwr_move > pwr_now)
@@ -3034 +3072 @@ calculate_imbalance()
 		load_above_capacity = (sds->busiest_nr_running -
 						sds->busiest_group_capacity);

-		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
+		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);

-		load_above_capacity /= sds->busiest->cpu_power;
+		load_above_capacity /= sds->busiest->sgp->power;

 	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);

 	/* How much load to actually move to equalise the imbalance */
-	*imbalance = min(max_pull * sds->busiest->cpu_power,
-		(sds->avg_load - sds->this_load) * sds->this->cpu_power)
+	*imbalance = min(max_pull * sds->busiest->sgp->power,
+		(sds->avg_load - sds->this_load) * sds->this->sgp->power)
+			/ SCHED_POWER_SCALE;

 	 * if *imbalance is less than the average load per runnable task
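The clamped *imbalance comes out in task-load units. With illustrative numbers: max_pull = 512, busiest sgp->power = 1024, (avg_load - this_load) = 300 and this sgp->power = 1024 give min(512 * 1024, 300 * 1024) / 1024 = 300, i.e. never more than the busiest group can shed nor more than the local group sits below the average.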
@@ -3123 +3161 @@ find_busiest_group()
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;

-	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;

 	 * If the busiest group is imbalanced the below checks don't
@@ -3203 +3241 @@ find_busiest_queue()
 	for_each_cpu(i, sched_group_cpus(group)) {
 		unsigned long power = power_of(i);
-		unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+		unsigned long capacity = DIV_ROUND_CLOSEST(power,
+							   SCHED_POWER_SCALE);
 		unsigned long wl;
@@ -3534 +3575 @@
 	double_lock_balance(busiest_rq, target_rq);

 	/* Search for an sd spanning us and the target CPU. */
 	for_each_domain(target_cpu, sd) {
 		if ((sd->flags & SD_LOAD_BALANCE) &&
 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3690 +3734 @@ find_new_ilb()
 	if (cpumask_weight(nohz.idle_cpus_mask) < 2)

 	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
 		ilb_group = sd->groups;

-			if (is_semi_idle_group(ilb_group))
-				return cpumask_first(nohz.grp_idle_mask);
+			if (is_semi_idle_group(ilb_group)) {
+				ilb = cpumask_first(nohz.grp_idle_mask);

 			ilb_group = ilb_group->next;

 		} while (ilb_group != sd->groups);

 #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
 static inline int find_new_ilb(int call_cpu)