/****************************************************************************
 * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
 ****************************************************************************
 *
 *        File: common/csched_credit.c
 *      Author: Emmanuel Ackaouy
 *
 * Description: Credit-based SMP CPU scheduler
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <asm/atomic.h>
#include <xen/errno.h>
#include <xen/keyhandler.h>
/*
 * CSCHED_STATS
 *
 * Manage very basic per-vCPU counters and stats.
 *
 * Useful for debugging live systems. The stats are displayed
 * with runq dumps ('r' on the Xen console).
 */

/*
 * Basic constants
 */
#define CSCHED_DEFAULT_WEIGHT       256
#define CSCHED_TICKS_PER_TSLICE     3
#define CSCHED_TICKS_PER_ACCT       3
#define CSCHED_MSECS_PER_TICK       10
#define CSCHED_MSECS_PER_TSLICE \
    (CSCHED_MSECS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
#define CSCHED_CREDITS_PER_MSEC     10
#define CSCHED_CREDITS_PER_TSLICE \
    (CSCHED_CREDITS_PER_MSEC * CSCHED_MSECS_PER_TSLICE)
#define CSCHED_CREDITS_PER_ACCT \
    (CSCHED_CREDITS_PER_MSEC * CSCHED_MSECS_PER_TICK * CSCHED_TICKS_PER_ACCT)
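/*
 * Worked example of the constants above: with 10ms ticks, 3 ticks per
 * time slice and 10 credits per millisecond, a time slice lasts 30ms and
 * is worth 10 * 30 = 300 credits; an accounting period (3 ticks) likewise
 * covers 10 * 10 * 3 = 300 credits.
 */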
/*
 * Priorities
 */
#define CSCHED_PRI_TS_BOOST      0      /* time-share waking up */
#define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
#define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
#define CSCHED_PRI_IDLE         -64     /* idle */
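/*
 * Numerically greater means more urgent: BOOST (0) beats UNDER (-1),
 * which beats OVER (-2); IDLE (-64) loses to everything. Each runq is
 * kept sorted in decreasing order of this value.
 */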
/*
 * Flags
 */
#define CSCHED_FLAG_VCPU_PARKED 0x0001  /* VCPU over capped credits */
/*
 * Useful macros
 */
#define CSCHED_PCPU(_c)     \
    ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))
/*
 * Stats
 */
#define CSCHED_STAT_CRANK(_X)               (perfc_incr(_X))

#ifdef CSCHED_STATS

#define CSCHED_VCPU_STATS_RESET(_V)                     \
    do                                                  \
    {                                                   \
        memset(&(_V)->stats, 0, sizeof((_V)->stats));   \
    } while ( 0 )

#define CSCHED_VCPU_STAT_CRANK(_V, _X)      (((_V)->stats._X)++)

#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    (((_V)->stats._X) = (_Y))

#else /* CSCHED_STATS */

#define CSCHED_VCPU_STATS_RESET(_V)         do {} while ( 0 )
#define CSCHED_VCPU_STAT_CRANK(_V, _X)      do {} while ( 0 )
#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    do {} while ( 0 )

#endif /* CSCHED_STATS */
/*
 * Physical CPU
 */
struct csched_pcpu {
    struct list_head runq;
    uint32_t runq_sort_last;
    struct timer ticker;
    unsigned int tick;
    unsigned int idle_bias;
};
/*
 * Virtual CPU
 */
struct csched_vcpu {
    struct list_head runq_elem;
    struct list_head active_vcpu_elem;
    struct csched_dom *sdom;
    struct vcpu *vcpu;
    atomic_t credit;
    s_time_t start_time;    /* When we were scheduled (used for credit) */
    uint16_t flags;
    int16_t pri;
#ifdef CSCHED_STATS
    struct {
        int credit_last;
        uint32_t credit_incr;
        uint32_t state_active;
        uint32_t state_idle;
        uint32_t migrate_q;
        uint32_t migrate_r;
    } stats;
#endif
};
/*
 * Domain
 */
struct csched_dom {
    struct list_head active_vcpu;
    struct list_head active_sdom_elem;
    struct domain *dom;
    uint16_t active_vcpu_count;
    uint16_t weight;
    uint16_t cap;
};
/*
 * System-wide private data
 */
struct csched_private {
    spinlock_t lock;
    struct list_head active_sdom;
    uint32_t ncpus;
    struct timer master_ticker;
    unsigned int master;
    cpumask_t idlers;
    uint32_t weight;
    uint32_t credit;
    int credit_balance;
    uint32_t runq_sort;
};

static struct csched_private csched_priv;

static void csched_tick(void *_cpu);
static inline int
__vcpu_on_runq(struct csched_vcpu *svc)
{
    return !list_empty(&svc->runq_elem);
}

static inline struct csched_vcpu *
__runq_elem(struct list_head *elem)
{
    return list_entry(elem, struct csched_vcpu, runq_elem);
}
static inline void
__runq_insert(unsigned int cpu, struct csched_vcpu *svc)
{
    const struct list_head * const runq = RUNQ(cpu);
    struct list_head *iter;

    BUG_ON( __vcpu_on_runq(svc) );
    BUG_ON( cpu != svc->vcpu->processor );

    list_for_each( iter, runq )
    {
        const struct csched_vcpu * const iter_svc = __runq_elem(iter);
        if ( svc->pri > iter_svc->pri )
            break;
    }

    list_add_tail(&svc->runq_elem, iter);
}
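/*
 * Note: insertion stops at the first entry of strictly lower priority,
 * so a newly queued VCPU lands behind all peers of equal priority; the
 * runq is therefore FIFO within each priority level.
 */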
static inline void
__runq_remove(struct csched_vcpu *svc)
{
    BUG_ON( !__vcpu_on_runq(svc) );
    list_del_init(&svc->runq_elem);
}
static void burn_credits(struct csched_vcpu *svc, s_time_t now)
{
    s_time_t delta;
    unsigned int credits;

    /* Assert svc is current */
    ASSERT(svc==CSCHED_VCPU(per_cpu(schedule_data, svc->vcpu->processor).curr));

    if ( (delta = now - svc->start_time) <= 0 )
        return;

    credits = (delta*CSCHED_CREDITS_PER_MSEC + MILLISECS(1)/2) / MILLISECS(1);
    atomic_sub(credits, &svc->credit);
    svc->start_time += (credits * MILLISECS(1)) / CSCHED_CREDITS_PER_MSEC;
}
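/*
 * Example of the rounding above: at 10 credits/ms, a 2.05ms run burns
 * (2050000 * 10 + 500000) / 1000000 = 21 credits (round to nearest), and
 * start_time then advances by the 2.1ms those 21 credits represent.
 */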
static inline void
__runq_tickle(unsigned int cpu, struct csched_vcpu *new)
{
    struct csched_vcpu * const cur =
        CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
    cpumask_t mask;

    ASSERT(cur);
    cpus_clear(mask);

    /* If strictly higher priority than current VCPU, signal the CPU */
    if ( new->pri > cur->pri )
    {
        if ( cur->pri == CSCHED_PRI_IDLE )
            CSCHED_STAT_CRANK(tickle_local_idler);
        else if ( cur->pri == CSCHED_PRI_TS_OVER )
            CSCHED_STAT_CRANK(tickle_local_over);
        else if ( cur->pri == CSCHED_PRI_TS_UNDER )
            CSCHED_STAT_CRANK(tickle_local_under);
        else
            CSCHED_STAT_CRANK(tickle_local_other);

        cpu_set(cpu, mask);
    }

    /*
     * If this CPU has at least two runnable VCPUs, we tickle any idlers to
     * let them know there is runnable work in the system...
     */
    if ( cur->pri > CSCHED_PRI_IDLE )
    {
        if ( cpus_empty(csched_priv.idlers) )
        {
            CSCHED_STAT_CRANK(tickle_idlers_none);
        }
        else
        {
            CSCHED_STAT_CRANK(tickle_idlers_some);
            cpus_or(mask, mask, csched_priv.idlers);
            cpus_and(mask, mask, new->vcpu->cpu_affinity);
        }
    }

    /* Send scheduler interrupts to designated CPUs */
    if ( !cpus_empty(mask) )
        cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
}
static int
csched_pcpu_init(int cpu)
{
    struct csched_pcpu *spc;
    unsigned long flags;

    /* Allocate per-PCPU info */
    spc = xmalloc(struct csched_pcpu);
    if ( spc == NULL )
        return -1;
    memset(spc, 0, sizeof(*spc));

    spin_lock_irqsave(&csched_priv.lock, flags);

    /* Initialize/update system-wide config */
    csched_priv.credit += CSCHED_CREDITS_PER_ACCT;
    if ( csched_priv.ncpus <= cpu )
        csched_priv.ncpus = cpu + 1;
    if ( csched_priv.master >= csched_priv.ncpus )
        csched_priv.master = cpu;

    init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu);
    INIT_LIST_HEAD(&spc->runq);
    spc->runq_sort_last = csched_priv.runq_sort;
    spc->idle_bias = NR_CPUS - 1;
    per_cpu(schedule_data, cpu).sched_priv = spc;

    /* Start off idling... */
    BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
    cpu_set(cpu, csched_priv.idlers);

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    return 0;
}
#ifndef NDEBUG
static inline void
__csched_vcpu_check(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( svc->vcpu != vc );
    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
    if ( sdom )
    {
        BUG_ON( is_idle_vcpu(vc) );
        BUG_ON( sdom->dom != vc->domain );
    }
    else
    {
        BUG_ON( !is_idle_vcpu(vc) );
    }

    CSCHED_STAT_CRANK(vcpu_check);
}
#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
#else
#define CSCHED_VCPU_CHECK(_vc)
#endif
/*
 * Delay, in microseconds, between migrations of a VCPU between PCPUs.
 * This prevents rapid fluttering of a VCPU between CPUs, and reduces the
 * implicit overheads such as cache-warming. 1ms (1000) has been measured
 * as a good value.
 */
static unsigned int vcpu_migration_delay;
integer_param("vcpu_migration_delay", vcpu_migration_delay);

void set_vcpu_migration_delay(unsigned int delay)
{
    vcpu_migration_delay = delay;
}

unsigned int get_vcpu_migration_delay(void)
{
    return vcpu_migration_delay;
}
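/*
 * vcpu_migration_delay defaults to 0, which makes the cache-hot test
 * below always fail; it can be set at boot ("vcpu_migration_delay=1000")
 * or at run time via set_vcpu_migration_delay().
 */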
static inline int
__csched_vcpu_is_cache_hot(struct vcpu *v)
{
    int hot = ((NOW() - v->last_run_time) <
               ((uint64_t)vcpu_migration_delay * 1000u));

    if ( hot )
        CSCHED_STAT_CRANK(vcpu_hot);

    return hot;
}

static inline int
__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
{
    /*
     * Don't pick up work that's in the peer's scheduling tail or hot on
     * peer PCPU. Only pick up work that's allowed to run on our CPU.
     */
    return !vc->is_running &&
           !__csched_vcpu_is_cache_hot(vc) &&
           cpu_isset(dest_cpu, vc->cpu_affinity);
}
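/*
 * Example of the idler preference implemented below: with threads {0,1}
 * on one core and {2,3} on another, a VCPU on CPU0 whose sibling CPU1 is
 * busy while CPUs 2 and 3 idle gets steered to the {2,3} core, since that
 * grouping has two idlers against one. sched_smt_power_savings inverts
 * the comparison to pack work instead of spreading it.
 */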
static int
_csched_cpu_pick(struct vcpu *vc, bool_t commit)
{
    cpumask_t cpus;
    cpumask_t idlers;
    int cpu;

    /*
     * Pick from online CPUs in VCPU's affinity mask, giving a
     * preference to its current processor if it's in there.
     */
    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
    cpu = cpu_isset(vc->processor, cpus)
            ? vc->processor
            : cycle_cpu(vc->processor, cpus);
    ASSERT( !cpus_empty(cpus) && cpu_isset(cpu, cpus) );

    /*
     * Try to find an idle processor within the above constraints.
     *
     * In multi-core and multi-threaded CPUs, not all idle execution
     * vehicles are equal!
     *
     * We give preference to the idle execution vehicle with the most
     * idling neighbours in its grouping. This distributes work across
     * distinct cores first and guarantees we don't do something stupid
     * like run two VCPUs on co-hyperthreads while there are idle cores
     * or sockets.
     */
    idlers = csched_priv.idlers;
    cpu_set(cpu, idlers);
    cpus_and(cpus, cpus, idlers);
    cpu_clear(cpu, cpus);

    while ( !cpus_empty(cpus) )
    {
        cpumask_t cpu_idlers;
        cpumask_t nxt_idlers;
        int nxt, weight_cpu, weight_nxt;

        nxt = cycle_cpu(cpu, cpus);

        if ( cpu_isset(cpu, per_cpu(cpu_core_map, nxt)) )
        {
            /* Same socket: compare idling threads within each core. */
            ASSERT( cpu_isset(nxt, per_cpu(cpu_core_map, cpu)) );
            cpus_and(cpu_idlers, idlers, per_cpu(cpu_sibling_map, cpu));
            cpus_and(nxt_idlers, idlers, per_cpu(cpu_sibling_map, nxt));
        }
        else
        {
            /* Different sockets: compare idling CPUs within each socket. */
            ASSERT( !cpu_isset(nxt, per_cpu(cpu_core_map, cpu)) );
            cpus_and(cpu_idlers, idlers, per_cpu(cpu_core_map, cpu));
            cpus_and(nxt_idlers, idlers, per_cpu(cpu_core_map, nxt));
        }

        weight_cpu = cpus_weight(cpu_idlers);
        weight_nxt = cpus_weight(nxt_idlers);
        if ( ( (weight_cpu < weight_nxt) ^ sched_smt_power_savings )
             && (weight_cpu != weight_nxt) )
        {
            cpu = cycle_cpu(CSCHED_PCPU(nxt)->idle_bias, nxt_idlers);
            if ( commit )
                CSCHED_PCPU(nxt)->idle_bias = cpu;
            cpus_andnot(cpus, cpus, per_cpu(cpu_sibling_map, cpu));
        }
        else
        {
            cpus_andnot(cpus, cpus, nxt_idlers);
        }
    }

    return cpu;
}

static int
csched_cpu_pick(struct vcpu *vc)
{
    return _csched_cpu_pick(vc, 1);
}
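/*
 * csched_vcpu_acct() probes with commit == 0, so merely checking whether
 * a better runqueue exists does not advance the target PCPU's idle_bias;
 * only a committed pick (csched_cpu_pick) updates it.
 */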
static inline void
__csched_vcpu_acct_start(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    spin_lock_irqsave(&csched_priv.lock, flags);

    if ( list_empty(&svc->active_vcpu_elem) )
    {
        CSCHED_VCPU_STAT_CRANK(svc, state_active);
        CSCHED_STAT_CRANK(acct_vcpu_active);

        sdom->active_vcpu_count++;
        list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
        if ( list_empty(&sdom->active_sdom_elem) )
        {
            list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
            csched_priv.weight += sdom->weight;
        }
    }

    spin_unlock_irqrestore(&csched_priv.lock, flags);
}
static inline void
__csched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( list_empty(&svc->active_vcpu_elem) );

    CSCHED_VCPU_STAT_CRANK(svc, state_idle);
    CSCHED_STAT_CRANK(acct_vcpu_idle);

    sdom->active_vcpu_count--;
    list_del_init(&svc->active_vcpu_elem);
    if ( list_empty(&sdom->active_vcpu) )
    {
        BUG_ON( csched_priv.weight < sdom->weight );
        list_del_init(&sdom->active_sdom_elem);
        csched_priv.weight -= sdom->weight;
    }
}
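/*
 * Per-tick accounting for the currently running VCPU: invoked from
 * csched_tick() every CSCHED_MSECS_PER_TICK (10ms) on each PCPU while a
 * non-idle VCPU is running there.
 */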
static void
csched_vcpu_acct(unsigned int cpu)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(current);

    ASSERT( current->processor == cpu );
    ASSERT( svc->sdom != NULL );

    /*
     * If this VCPU's priority was boosted when it last awoke, reset it.
     * If the VCPU is found here, then it's consuming a non-negligible
     * amount of CPU resources and should no longer be boosted.
     */
    if ( svc->pri == CSCHED_PRI_TS_BOOST )
        svc->pri = CSCHED_PRI_TS_UNDER;

    /*
     * Update credits
     */
    if ( !is_idle_vcpu(svc->vcpu) )
        burn_credits(svc, NOW());

    /*
     * Put this VCPU and domain back on the active list if it was
     * idling.
     *
     * If it's been active a while, check if we'd be better off
     * migrating it to run elsewhere (see multi-core and multi-thread
     * support in csched_cpu_pick()).
     */
    if ( list_empty(&svc->active_vcpu_elem) )
    {
        __csched_vcpu_acct_start(svc);
    }
    else if ( _csched_cpu_pick(current, 0) != cpu )
    {
        CSCHED_VCPU_STAT_CRANK(svc, migrate_r);
        CSCHED_STAT_CRANK(migrate_running);
        set_bit(_VPF_migrating, &current->pause_flags);
        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
    }
}
static int
csched_vcpu_init(struct vcpu *vc)
{
    struct domain * const dom = vc->domain;
    struct csched_dom *sdom = CSCHED_DOM(dom);
    struct csched_vcpu *svc;

    CSCHED_STAT_CRANK(vcpu_init);

    /* Allocate per-VCPU info */
    svc = xmalloc(struct csched_vcpu);
    if ( svc == NULL )
        return -1;
    memset(svc, 0, sizeof(*svc));

    INIT_LIST_HEAD(&svc->runq_elem);
    INIT_LIST_HEAD(&svc->active_vcpu_elem);
    svc->sdom = sdom;
    svc->vcpu = vc;
    atomic_set(&svc->credit, 0);
    svc->flags = 0U;
    svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
    CSCHED_VCPU_STATS_RESET(svc);
    vc->sched_priv = svc;

    /* Allocate per-PCPU info */
    if ( unlikely(!CSCHED_PCPU(vc->processor)) )
    {
        if ( csched_pcpu_init(vc->processor) != 0 )
            return -1;
    }

    CSCHED_VCPU_CHECK(vc);

    return 0;
}
static void
csched_vcpu_destroy(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    CSCHED_STAT_CRANK(vcpu_destroy);

    BUG_ON( sdom == NULL );
    BUG_ON( !list_empty(&svc->runq_elem) );

    spin_lock_irqsave(&csched_priv.lock, flags);

    if ( !list_empty(&svc->active_vcpu_elem) )
        __csched_vcpu_acct_stop_locked(svc);

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    xfree(svc);
}
static void
csched_vcpu_sleep(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);

    CSCHED_STAT_CRANK(vcpu_sleep);

    BUG_ON( is_idle_vcpu(vc) );

    if ( per_cpu(schedule_data, vc->processor).curr == vc )
        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
    else if ( __vcpu_on_runq(svc) )
        __runq_remove(svc);
}
static void
csched_vcpu_wake(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    const unsigned int cpu = vc->processor;

    BUG_ON( is_idle_vcpu(vc) );

    if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_running);
        return;
    }
    if ( unlikely(__vcpu_on_runq(svc)) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_onrunq);
        return;
    }

    if ( likely(vcpu_runnable(vc)) )
        CSCHED_STAT_CRANK(vcpu_wake_runnable);
    else
        CSCHED_STAT_CRANK(vcpu_wake_not_runnable);

    /*
     * We temporarily boost the priority of awaking VCPUs!
     *
     * If this VCPU consumes a non-negligible amount of CPU, it
     * will eventually find itself in the credit accounting code
     * path where its priority will be reset to normal.
     *
     * If on the other hand the VCPU consumes little CPU and is
     * blocking and awoken a lot (doing I/O for example), its
     * priority will remain boosted, optimizing its wake-to-run
     * latencies.
     *
     * This allows wake-to-run latency sensitive VCPUs to preempt
     * more CPU resource intensive VCPUs without impacting overall
     * system fairness.
     *
     * The one exception is for VCPUs of capped domains unpausing
     * after earning credits they had overspent. We don't boost
     * those.
     */
    if ( svc->pri == CSCHED_PRI_TS_UNDER &&
         !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
    {
        svc->pri = CSCHED_PRI_TS_BOOST;
    }

    /* Put the VCPU on the runq and tickle CPUs */
    __runq_insert(cpu, svc);
    __runq_tickle(cpu, svc);
}
static int
csched_dom_cntl(
    struct domain *d,
    struct xen_domctl_scheduler_op *op)
{
    struct csched_dom * const sdom = CSCHED_DOM(d);
    unsigned long flags;

    if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
    {
        op->u.credit.weight = sdom->weight;
        op->u.credit.cap = sdom->cap;
    }
    else
    {
        ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);

        spin_lock_irqsave(&csched_priv.lock, flags);

        if ( op->u.credit.weight != 0 )
        {
            if ( !list_empty(&sdom->active_sdom_elem) )
            {
                csched_priv.weight -= sdom->weight;
                csched_priv.weight += op->u.credit.weight;
            }
            sdom->weight = op->u.credit.weight;
        }

        if ( op->u.credit.cap != (uint16_t)~0U )
            sdom->cap = op->u.credit.cap;

        spin_unlock_irqrestore(&csched_priv.lock, flags);
    }

    return 0;
}
static int
csched_dom_init(struct domain *dom)
{
    struct csched_dom *sdom;

    CSCHED_STAT_CRANK(dom_init);

    if ( is_idle_domain(dom) )
        return 0;

    sdom = xmalloc(struct csched_dom);
    if ( sdom == NULL )
        return -ENOMEM;
    memset(sdom, 0, sizeof(*sdom));

    /* Initialize credit and weight */
    INIT_LIST_HEAD(&sdom->active_vcpu);
    sdom->active_vcpu_count = 0;
    INIT_LIST_HEAD(&sdom->active_sdom_elem);
    sdom->dom = dom;
    sdom->weight = CSCHED_DEFAULT_WEIGHT;
    sdom->cap = 0U;
    dom->sched_priv = sdom;

    return 0;
}
static void
csched_dom_destroy(struct domain *dom)
{
    CSCHED_STAT_CRANK(dom_destroy);
    xfree(CSCHED_DOM(dom));
}
/*
 * This is an O(n) optimized sort of the runq.
 *
 * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk
 * through the runq and move up any UNDERs that are preceded by OVERs. We
 * remember the last UNDER to make the move up operation O(1).
 */
static void
csched_runq_sort(unsigned int cpu)
{
    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
    struct list_head *runq, *elem, *next, *last_under;
    struct csched_vcpu *svc_elem;
    unsigned long flags;
    uint32_t sort_epoch;

    sort_epoch = csched_priv.runq_sort;
    if ( sort_epoch == spc->runq_sort_last )
        return;

    spc->runq_sort_last = sort_epoch;

    spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags);

    runq = &spc->runq;
    elem = runq->next;
    last_under = runq;

    while ( elem != runq )
    {
        next = elem->next;
        svc_elem = __runq_elem(elem);

        if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER )
        {
            /* does elem need to move up the runq? */
            if ( elem->prev != last_under )
            {
                list_del(elem);
                list_add(elem, last_under);
            }
            last_under = elem;
        }

        elem = next;
    }

    spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
}
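/*
 * Master accounting: runs every CSCHED_TICKS_PER_ACCT ticks (30ms) on the
 * master PCPU and redistributes CSCHED_CREDITS_PER_ACCT credits per PCPU
 * among active domains in proportion to their weights.
 *
 * Worked example: one PCPU (credit_total = 300) shared by domain A
 * (weight 256) and domain B (weight 512), one active VCPU each: A receives
 * ceil(300 * 256 / 768) = 100 credits and B ceil(300 * 512 / 768) = 200,
 * both below their peak of 300.
 */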
static void
csched_acct(void* dummy)
{
    unsigned long flags;
    struct list_head *iter_vcpu, *next_vcpu;
    struct list_head *iter_sdom, *next_sdom;
    struct csched_vcpu *svc;
    struct csched_dom *sdom;
    uint32_t credit_total;
    uint32_t weight_total;
    uint32_t weight_left;
    uint32_t credit_fair;
    uint32_t credit_peak;
    uint32_t credit_cap;
    int credit_balance;
    int credit_xtra;
    int credit;

    spin_lock_irqsave(&csched_priv.lock, flags);

    weight_total = csched_priv.weight;
    credit_total = csched_priv.credit;

    /* Converge balance towards 0 when it drops negative */
    if ( csched_priv.credit_balance < 0 )
    {
        credit_total -= csched_priv.credit_balance;
        CSCHED_STAT_CRANK(acct_balance);
    }

    if ( unlikely(weight_total == 0) )
    {
        csched_priv.credit_balance = 0;
        spin_unlock_irqrestore(&csched_priv.lock, flags);
        CSCHED_STAT_CRANK(acct_no_work);
        goto out;
    }

    CSCHED_STAT_CRANK(acct_run);

    weight_left = weight_total;
    credit_balance = 0;
    credit_xtra = 0;
    credit_cap = 0U;

    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
    {
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        BUG_ON( is_idle_domain(sdom->dom) );
        BUG_ON( sdom->active_vcpu_count == 0 );
        BUG_ON( sdom->weight == 0 );
        BUG_ON( sdom->weight > weight_left );

        weight_left -= sdom->weight;

        /*
         * A domain's fair share is computed using its weight in competition
         * with that of all other active domains.
         *
         * At most, a domain can use credits to run all its active VCPUs
         * for one full accounting period. We allow a domain to earn more
         * only when the system-wide credit balance is negative.
         */
        credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT;
        if ( csched_priv.credit_balance < 0 )
        {
            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
                             (weight_total - 1)
                           ) / weight_total;
        }

        if ( sdom->cap != 0U )
        {
            credit_cap = ((sdom->cap * CSCHED_CREDITS_PER_ACCT) + 99) / 100;
            if ( credit_cap < credit_peak )
                credit_peak = credit_cap;

            credit_cap = ( credit_cap + ( sdom->active_vcpu_count - 1 )
                         ) / sdom->active_vcpu_count;
        }

        credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
                      ) / weight_total;

        if ( credit_fair < credit_peak )
        {
            credit_xtra = 1;
        }
        else
        {
            if ( weight_left != 0U )
            {
                /* Give other domains a chance at unused credits */
                credit_total += ( ( ( credit_fair - credit_peak
                                    ) * weight_total
                                  ) + ( weight_left - 1 )
                                ) / weight_left;
            }

            if ( credit_xtra )
            {
                /*
                 * Lazily keep domains with extra credits at the head of
                 * the queue to give others a chance at them in future
                 * accounting periods.
                 */
                CSCHED_STAT_CRANK(acct_reorder);
                list_del(&sdom->active_sdom_elem);
                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
            }

            credit_fair = credit_peak;
        }

        /* Compute fair share per VCPU */
        credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
                      ) / sdom->active_vcpu_count;

        list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
        {
            svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
            BUG_ON( sdom != svc->sdom );

            /* Increment credit */
            atomic_add(credit_fair, &svc->credit);
            credit = atomic_read(&svc->credit);

            /*
             * Recompute priority or, if VCPU is idling, remove it from
             * the active list.
             */
            if ( credit < 0 )
            {
                svc->pri = CSCHED_PRI_TS_OVER;

                /* Park running VCPUs of capped-out domains */
                if ( sdom->cap != 0U &&
                     credit < -credit_cap &&
                     !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
                {
                    CSCHED_STAT_CRANK(vcpu_park);
                    vcpu_pause_nosync(svc->vcpu);
                    svc->flags |= CSCHED_FLAG_VCPU_PARKED;
                }

                /* Lower bound on credits */
                if ( credit < -CSCHED_CREDITS_PER_TSLICE )
                {
                    CSCHED_STAT_CRANK(acct_min_credit);
                    credit = -CSCHED_CREDITS_PER_TSLICE;
                    atomic_set(&svc->credit, credit);
                }
            }
            else
            {
                svc->pri = CSCHED_PRI_TS_UNDER;

                /* Unpark any capped domains whose credits go positive */
                if ( svc->flags & CSCHED_FLAG_VCPU_PARKED )
                {
                    /*
                     * It's important to unset the flag AFTER the unpause()
                     * call to make sure the VCPU's priority is not boosted
                     * if it is woken up here.
                     */
                    CSCHED_STAT_CRANK(vcpu_unpark);
                    vcpu_unpause(svc->vcpu);
                    svc->flags &= ~CSCHED_FLAG_VCPU_PARKED;
                }

                /* Upper bound on credits means VCPU stops earning */
                if ( credit > CSCHED_CREDITS_PER_TSLICE )
                {
                    __csched_vcpu_acct_stop_locked(svc);
                    credit = 0;
                    atomic_set(&svc->credit, credit);
                }
            }

            CSCHED_VCPU_STAT_SET(svc, credit_last, credit);
            CSCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair);
            credit_balance += credit;
        }
    }

    csched_priv.credit_balance = credit_balance;

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    /* Inform each CPU that its runq needs to be sorted */
    csched_priv.runq_sort++;

out:
    set_timer( &csched_priv.master_ticker, NOW() +
               MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT );
}
static void
csched_tick(void *_cpu)
{
    unsigned int cpu = (unsigned long)_cpu;
    struct csched_pcpu *spc = CSCHED_PCPU(cpu);

    spc->tick++;

    /*
     * Accounting for running VCPU
     */
    if ( !is_idle_vcpu(current) )
        csched_vcpu_acct(cpu);

    /*
     * Check if runq needs to be sorted
     *
     * Every physical CPU resorts the runq after the accounting master has
     * modified priorities. This is a special O(n) sort and runs at most
     * once per accounting period (currently 30 milliseconds).
     */
    csched_runq_sort(cpu);

    set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
}
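/*
 * Called from csched_load_balance() with the peer PCPU's schedule_lock
 * already held (acquired there with spin_trylock() to avoid deadlock).
 */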
static struct csched_vcpu *
csched_runq_steal(int peer_cpu, int cpu, int pri)
{
    const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
    const struct vcpu * const peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
    struct csched_vcpu *speer;
    struct list_head *iter;
    struct vcpu *vc;

    /*
     * Don't steal from an idle CPU's runq because it's about to
     * pick up work from it itself.
     */
    if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
    {
        list_for_each( iter, &peer_pcpu->runq )
        {
            speer = __runq_elem(iter);

            /*
             * If next available VCPU here is not of strictly higher
             * priority than ours, this PCPU is useless to us.
             */
            if ( speer->pri <= pri )
                break;

            /* Is this VCPU runnable on our PCPU? */
            vc = speer->vcpu;
            BUG_ON( is_idle_vcpu(vc) );

            if (__csched_vcpu_is_migrateable(vc, cpu))
            {
                /* We got a candidate. Grab it! */
                CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
                CSCHED_STAT_CRANK(migrate_queued);
                WARN_ON(vc->is_urgent);
                __runq_remove(speer);
                vc->processor = cpu;
                return speer;
            }
        }
    }

    CSCHED_STAT_CRANK(steal_peer_idle);
    return NULL;
}
static struct csched_vcpu *
csched_load_balance(int cpu, struct csched_vcpu *snext)
{
    struct csched_vcpu *speer;
    cpumask_t workers;
    int peer_cpu;

    BUG_ON( cpu != snext->vcpu->processor );

    /* If this CPU is going offline we shouldn't steal work. */
    if ( unlikely(!cpu_online(cpu)) )
        goto out;

    if ( snext->pri == CSCHED_PRI_IDLE )
        CSCHED_STAT_CRANK(load_balance_idle);
    else if ( snext->pri == CSCHED_PRI_TS_OVER )
        CSCHED_STAT_CRANK(load_balance_over);
    else
        CSCHED_STAT_CRANK(load_balance_other);

    /*
     * Peek at non-idling CPUs in the system, starting with our
     * immediate neighbour.
     */
    cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
    cpu_clear(cpu, workers);
    peer_cpu = cpu;

    while ( !cpus_empty(workers) )
    {
        peer_cpu = cycle_cpu(peer_cpu, workers);
        cpu_clear(peer_cpu, workers);

        /*
         * Get ahold of the scheduler lock for this peer CPU.
         *
         * Note: We don't spin on this lock but simply try it. Spinning could
         * cause a deadlock if the peer CPU is also load balancing and trying
         * to lock this CPU.
         */
        if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
        {
            CSCHED_STAT_CRANK(steal_trylock_failed);
            continue;
        }

        /*
         * Any work over there to steal?
         */
        speer = csched_runq_steal(peer_cpu, cpu, snext->pri);
        spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
        if ( speer != NULL )
            return speer;
    }

 out:
    /* Failed to find more important work elsewhere... */
    __runq_remove(snext);
    return snext;
}
/*
 * This function is in the critical path. It is designed to be simple and
 * fast for the common case.
 */
static struct task_slice
csched_schedule(s_time_t now)
{
    const int cpu = smp_processor_id();
    struct list_head * const runq = RUNQ(cpu);
    struct csched_vcpu * const scurr = CSCHED_VCPU(current);
    struct csched_vcpu *snext;
    struct task_slice ret;

    CSCHED_STAT_CRANK(schedule);
    CSCHED_VCPU_CHECK(current);

    /* Update credits */
    if ( !is_idle_vcpu(scurr->vcpu) )
    {
        burn_credits(scurr, now);
        scurr->start_time -= now;
    }

    /*
     * Select next runnable local VCPU (ie top of local runq)
     */
    if ( vcpu_runnable(current) )
        __runq_insert(cpu, scurr);
    else
        BUG_ON( is_idle_vcpu(current) || list_empty(runq) );

    snext = __runq_elem(runq->next);

    /*
     * SMP Load balance:
     *
     * If the next highest priority local runnable VCPU has already eaten
     * through its credits, look on other PCPUs to see if we have more
     * urgent work... If not, csched_load_balance() will return snext, but
     * already removed from the runq.
     */
    if ( snext->pri > CSCHED_PRI_TS_OVER )
        __runq_remove(snext);
    else
        snext = csched_load_balance(cpu, snext);

    /*
     * Update idlers mask if necessary. When we're idling, other CPUs
     * will tickle us when they get extra work.
     */
    if ( snext->pri == CSCHED_PRI_IDLE )
    {
        if ( !cpu_isset(cpu, csched_priv.idlers) )
            cpu_set(cpu, csched_priv.idlers);
    }
    else if ( cpu_isset(cpu, csched_priv.idlers) )
    {
        cpu_clear(cpu, csched_priv.idlers);
    }

    if ( !is_idle_vcpu(snext->vcpu) )
        snext->start_time += now;

    /*
     * Return task to run next...
     */
    ret.time = (is_idle_vcpu(snext->vcpu) ?
                -1 : MILLISECS(CSCHED_MSECS_PER_TSLICE));
    ret.task = snext->vcpu;

    CSCHED_VCPU_CHECK(ret.task);
    return ret;
}
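/*
 * Dump helpers backing the runq dumps ('r' on the Xen console) mentioned
 * in the CSCHED_STATS note at the top of this file.
 */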
static void
csched_dump_vcpu(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    printk("[%i.%i] pri=%i flags=%x cpu=%i",
            svc->vcpu->domain->domain_id,
            svc->vcpu->vcpu_id,
            svc->pri,
            svc->flags,
            svc->vcpu->processor);

    if ( sdom )
    {
        printk(" credit=%i [w=%u]", atomic_read(&svc->credit), sdom->weight);
#ifdef CSCHED_STATS
        printk(" (%d+%u) {a/i=%u/%u m=%u+%u}",
                svc->stats.credit_last,
                svc->stats.credit_incr,
                svc->stats.state_active,
                svc->stats.state_idle,
                svc->stats.migrate_q,
                svc->stats.migrate_r);
#endif
    }

    printk("\n");
}
static void
csched_dump_pcpu(int cpu)
{
    struct list_head *runq, *iter;
    struct csched_pcpu *spc;
    struct csched_vcpu *svc;
    int loop = 0;
#define cpustr keyhandler_scratch

    spc = CSCHED_PCPU(cpu);
    runq = &spc->runq;

    cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_map, cpu));
    printk(" sort=%d, sibling=%s, ", spc->runq_sort_last, cpustr);
    cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_map, cpu));
    printk("core=%s\n", cpustr);

    /* current VCPU */
    svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
    if ( svc )
    {
        printk("\trun: ");
        csched_dump_vcpu(svc);
    }

    list_for_each( iter, runq )
    {
        svc = __runq_elem(iter);
        if ( svc )
        {
            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
#undef cpustr
}
static void
csched_dump(void)
{
    struct list_head *iter_sdom, *iter_svc;
    int loop = 0;
#define idlers_buf keyhandler_scratch

    printk("info:\n"
           "\tncpus              = %u\n"
           "\tmaster             = %u\n"
           "\tcredit             = %u\n"
           "\tcredit balance     = %d\n"
           "\tweight             = %u\n"
           "\trunq_sort          = %u\n"
           "\tdefault-weight     = %d\n"
           "\tmsecs per tick     = %dms\n"
           "\tcredits per msec   = %d\n"
           "\tticks per tslice   = %d\n"
           "\tticks per acct     = %d\n"
           "\tmigration delay    = %uus\n",
           csched_priv.ncpus,
           csched_priv.master,
           csched_priv.credit,
           csched_priv.credit_balance,
           csched_priv.weight,
           csched_priv.runq_sort,
           CSCHED_DEFAULT_WEIGHT,
           CSCHED_MSECS_PER_TICK,
           CSCHED_CREDITS_PER_MSEC,
           CSCHED_TICKS_PER_TSLICE,
           CSCHED_TICKS_PER_ACCT,
           vcpu_migration_delay);

    cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), csched_priv.idlers);
    printk("idlers: %s\n", idlers_buf);

    printk("active vcpus:\n");
    list_for_each( iter_sdom, &csched_priv.active_sdom )
    {
        struct csched_dom *sdom;
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);

        list_for_each( iter_svc, &sdom->active_vcpu )
        {
            struct csched_vcpu *svc;
            svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);

            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
#undef idlers_buf
}
static void
csched_init(void)
{
    spin_lock_init(&csched_priv.lock);
    INIT_LIST_HEAD(&csched_priv.active_sdom);
    csched_priv.ncpus = 0;
    csched_priv.master = UINT_MAX;
    cpus_clear(csched_priv.idlers);
    csched_priv.weight = 0U;
    csched_priv.credit = 0U;
    csched_priv.credit_balance = 0;
    csched_priv.runq_sort = 0U;
}
/* Tickers cannot be kicked until SMP subsystem is alive. */
static __init int csched_start_tickers(void)
{
    struct csched_pcpu *spc;
    unsigned int cpu;

    /* Is the credit scheduler initialised? */
    if ( csched_priv.ncpus == 0 )
        return 0;

    for_each_online_cpu ( cpu )
    {
        spc = CSCHED_PCPU(cpu);
        set_timer(&spc->ticker, NOW() + MILLISECS(CSCHED_MSECS_PER_TICK));
    }

    init_timer( &csched_priv.master_ticker, csched_acct, NULL,
                csched_priv.master);

    set_timer( &csched_priv.master_ticker, NOW() +
               MILLISECS(CSCHED_MSECS_PER_TICK) * CSCHED_TICKS_PER_ACCT );

    return 0;
}
__initcall(csched_start_tickers);
static void csched_tick_suspend(void)
{
    struct csched_pcpu *spc;

    spc = CSCHED_PCPU(smp_processor_id());

    stop_timer(&spc->ticker);
}

static void csched_tick_resume(void)
{
    struct csched_pcpu *spc;
    uint64_t now = NOW();

    spc = CSCHED_PCPU(smp_processor_id());

    /* Re-arm the ticker, rounded up to the next 10ms tick boundary */
    set_timer(&spc->ticker, now + MILLISECS(CSCHED_MSECS_PER_TICK)
            - now % MILLISECS(CSCHED_MSECS_PER_TICK) );
}
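/* Credit scheduler entry points, wired into the common scheduler framework. */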
const struct scheduler sched_credit_def = {
    .name           = "SMP Credit Scheduler",
    .opt_name       = "credit",
    .sched_id       = XEN_SCHEDULER_CREDIT,

    .init_domain    = csched_dom_init,
    .destroy_domain = csched_dom_destroy,

    .init_vcpu      = csched_vcpu_init,
    .destroy_vcpu   = csched_vcpu_destroy,

    .sleep          = csched_vcpu_sleep,
    .wake           = csched_vcpu_wake,

    .adjust         = csched_dom_cntl,

    .pick_cpu       = csched_cpu_pick,
    .do_schedule    = csched_schedule,

    .dump_cpu_state = csched_dump_pcpu,
    .dump_settings  = csched_dump,
    .init           = csched_init,

    .tick_suspend   = csched_tick_suspend,
    .tick_resume    = csched_tick_resume,
};