/*
 * Copyright (c) 2001-2007 Jakub Jermar
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 * - The name of the author may not be used to endorse or promote products
 *   derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/** @addtogroup genericproc
 * @{
 */

/**
 * @file
 * @brief Scheduler and load balancing.
 *
 * This file contains the scheduler and kcpulb kernel thread which
 * performs load-balancing of per-CPU run queues.
 */

#include <proc/scheduler.h>
#include <proc/thread.h>
#include <proc/task.h>
#include <mm/as.h>
#include <time/timeout.h>
#include <time/delay.h>
#include <arch/asm.h>
#include <arch/faddr.h>
#include <arch/cycle.h>
#include <atomic.h>
#include <synch/spinlock.h>
#include <synch/waitq.h>
#include <config.h>
#include <context.h>
#include <fpu_context.h>
#include <func.h>
#include <arch.h>
#include <adt/list.h>
#include <panic.h>
#include <cpu.h>
#include <print.h>
#include <debug.h>

static void before_task_runs(void);
static void before_thread_runs(void);
static void after_thread_ran(void);
static void scheduler_separated_stack(void);

atomic_t nrdy;	/**< Number of ready threads in the system. */

/** Carry out actions before new task runs. */
void before_task_runs(void)
{
	before_task_runs_arch();
}

/** Take actions before new thread runs.
 *
 * Perform actions that need to be
 * taken before the newly selected
 * thread is passed control.
 *
 * THREAD->lock is locked on entry
 *
 */
void before_thread_runs(void)
{
	before_thread_runs_arch();
#ifdef CONFIG_FPU_LAZY
	if (THREAD == CPU->fpu_owner)
		fpu_enable();
	else
		fpu_disable();
#else
	fpu_enable();
	if (THREAD->fpu_context_exists)
		fpu_context_restore(THREAD->saved_fpu_context);
	else {
		fpu_init();
		THREAD->fpu_context_exists = 1;
	}
#endif
}
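
/*
 * Summary of the two FPU strategies used in before_thread_runs(): with
 * CONFIG_FPU_LAZY the FPU is merely enabled for its current owner and
 * disabled for everybody else, so the FPU context is switched on demand in
 * scheduler_fpu_lazy_request(). Without CONFIG_FPU_LAZY the context is saved
 * eagerly in scheduler() and restored here on every thread switch.
 */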

/** Take actions after THREAD has run.
 *
 * Perform actions that need to be
 * taken after the running thread
 * has been preempted by the scheduler.
 *
 * THREAD->lock is locked on entry
 *
 */
void after_thread_ran(void)
{
	after_thread_ran_arch();
}

#ifdef CONFIG_FPU_LAZY
void scheduler_fpu_lazy_request(void)
{
restart:
	fpu_enable();
	spinlock_lock(&CPU->lock);

	/* Save old context */
	if (CPU->fpu_owner != NULL) {
		spinlock_lock(&CPU->fpu_owner->lock);
		fpu_context_save(CPU->fpu_owner->saved_fpu_context);
		/* don't prevent migration */
		CPU->fpu_owner->fpu_context_engaged = 0;
		spinlock_unlock(&CPU->fpu_owner->lock);
		CPU->fpu_owner = NULL;
	}

	spinlock_lock(&THREAD->lock);
	if (THREAD->fpu_context_exists) {
		fpu_context_restore(THREAD->saved_fpu_context);
	} else {
		/* Allocate FPU context */
		if (!THREAD->saved_fpu_context) {
			/* Might sleep */
			spinlock_unlock(&THREAD->lock);
			spinlock_unlock(&CPU->lock);
			THREAD->saved_fpu_context =
			    (fpu_context_t *) slab_alloc(fpu_context_slab, 0);
			/* We may have switched CPUs during slab_alloc */
			goto restart;
		}
		fpu_init();
		THREAD->fpu_context_exists = 1;
	}
	CPU->fpu_owner = THREAD;
	THREAD->fpu_context_engaged = 1;
	spinlock_unlock(&THREAD->lock);

	spinlock_unlock(&CPU->lock);
}
#endif /* CONFIG_FPU_LAZY */
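
/*
 * With CONFIG_FPU_LAZY, an architecture is expected to call
 * scheduler_fpu_lazy_request() from its "FPU unavailable" trap handler and
 * then return to the faulting instruction, which re-executes with the FPU
 * enabled. A minimal sketch (the handler name below is hypothetical and not
 * part of this file):
 *
 *	void fpu_unavailable_trap(void)
 *	{
 *		scheduler_fpu_lazy_request();
 *	}
 */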

/** Initialize scheduler
 *
 * Initialize kernel scheduler.
 *
 */
void scheduler_init(void)
{
}

/** Get thread to be scheduled
 *
 * Get the optimal thread to be scheduled
 * according to thread accounting and scheduler
 * policy.
 *
 * @return Thread to be scheduled.
 *
 */
static thread_t *find_best_thread(void)
{
	thread_t *t;
	runq_t *r;
	int i;

	ASSERT(CPU != NULL);

loop:
	interrupts_enable();

	if (atomic_get(&CPU->nrdy) == 0) {
		/*
		 * Since there was nothing to run, the CPU goes to sleep
		 * until a hardware interrupt or an IPI comes.
		 * This improves energy saving and hyperthreading.
		 */

		/*
		 * An interrupt might occur right now and wake up a thread.
		 * In such a case, the CPU will continue to go to sleep
		 * even though there is a runnable thread.
		 */
		cpu_sleep();
		goto loop;
	}

	interrupts_disable();

	for (i = 0; i < RQ_COUNT; i++) {
		r = &CPU->rq[i];
		spinlock_lock(&r->lock);
		if (r->n == 0) {
			/*
			 * If this queue is empty, try a lower-priority queue.
			 */
			spinlock_unlock(&r->lock);
			continue;
		}

		atomic_dec(&CPU->nrdy);
		atomic_dec(&nrdy);
		r->n--;

		/*
		 * Take the first thread from the queue.
		 */
		t = list_get_instance(r->rq_head.next, thread_t, rq_link);
		list_remove(&t->rq_link);

		spinlock_unlock(&r->lock);

		spinlock_lock(&t->lock);
		t->cpu = CPU;

		t->ticks = us2ticks((i + 1) * 10000);
		t->priority = i;	/* correct rq index */

		/*
		 * Clear the THREAD_FLAG_STOLEN flag so that t can be migrated
		 * when a need for load balancing emerges.
		 */
		t->flags &= ~THREAD_FLAG_STOLEN;
		spinlock_unlock(&t->lock);

		return t;
	}
	goto loop;
}
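
/*
 * Quantum note for find_best_thread(): the value us2ticks((i + 1) * 10000)
 * grows linearly with the run-queue index, so a thread taken from rq[0] gets
 * a 10 ms time quantum, one taken from rq[1] gets 20 ms, and so on, before
 * the clock interrupt forces it back into the scheduler.
 */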

/** Prevent rq starvation
 *
 * Prevent low priority threads from starving in rq's.
 *
 * When the function decides to relink rq's, it reconnects
 * the respective list pointers so that, as a result, threads
 * whose run queue index is greater than 'start' are each
 * moved one queue up, towards higher priority.
 *
 * @param start Threshold priority.
 *
 */
static void relink_rq(int start)
{
	link_t head;
	runq_t *r;
	int i, n;

	list_initialize(&head);
	spinlock_lock(&CPU->lock);
	if (CPU->needs_relink > NEEDS_RELINK_MAX) {
		for (i = start; i < RQ_COUNT - 1; i++) {
			/* remember and empty rq[i + 1] */
			r = &CPU->rq[i + 1];
			spinlock_lock(&r->lock);
			list_concat(&head, &r->rq_head);
			n = r->n;
			r->n = 0;
			spinlock_unlock(&r->lock);

			/* append rq[i + 1] to rq[i] */
			r = &CPU->rq[i];
			spinlock_lock(&r->lock);
			list_concat(&r->rq_head, &head);
			r->n += n;
			spinlock_unlock(&r->lock);
		}
		CPU->needs_relink = 0;
	}
	spinlock_unlock(&CPU->lock);
}
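
/*
 * Example: a call to relink_rq(0) that passes the NEEDS_RELINK_MAX check
 * moves the contents of rq[1] into rq[0], rq[2] into rq[1], and so on up to
 * rq[RQ_COUNT - 1] into rq[RQ_COUNT - 2], i.e. every waiting thread is
 * promoted by exactly one priority level.
 */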

/** The scheduler
 *
 * The thread scheduling procedure.
 * Passes control directly to
 * scheduler_separated_stack().
 *
 */
void scheduler(void)
{
	volatile ipl_t ipl;

	ASSERT(CPU != NULL);

	ipl = interrupts_disable();

	if (atomic_get(&haltstate))
		halt();

	if (THREAD) {
		spinlock_lock(&THREAD->lock);

		/* Update thread accounting */
		THREAD->cycles += get_cycle() - THREAD->last_cycle;

#ifndef CONFIG_FPU_LAZY
		fpu_context_save(THREAD->saved_fpu_context);
#endif
		if (!context_save(&THREAD->saved_context)) {
			/*
			 * This is the place where threads leave scheduler();
			 */

			/* Save current CPU cycle */
			THREAD->last_cycle = get_cycle();

			spinlock_unlock(&THREAD->lock);
			interrupts_restore(THREAD->saved_context.ipl);

			return;
		}

		/*
		 * Interrupt priority level of preempted thread is recorded
		 * here to facilitate scheduler() invocations from
		 * interrupts_disable()'d code (e.g. waitq_sleep_timeout()).
		 */
		THREAD->saved_context.ipl = ipl;
	}

	/*
	 * Through the 'THE' structure, we keep track of THREAD, TASK, CPU, VM
	 * and preemption counter. At this point THE could be coming either
	 * from THREAD's or CPU's stack.
	 */
	the_copy(THE, (the_t *) CPU->stack);

	/*
	 * We may not keep the old stack.
	 * Reason: If we kept the old stack and got blocked, for instance, in
	 * find_best_thread(), the old thread could get rescheduled by another
	 * CPU and overwrite the part of its own stack that was also used by
	 * the scheduler on this CPU.
	 *
	 * Moreover, we have to bypass the compiler-generated POP sequence
	 * which is fooled by SP being set to the very top of the stack.
	 * Therefore the scheduler() function continues in
	 * scheduler_separated_stack().
	 */
	context_save(&CPU->saved_context);
	context_set(&CPU->saved_context, FADDR(scheduler_separated_stack),
	    (uintptr_t) CPU->stack, CPU_STACK_SIZE);
	context_restore(&CPU->saved_context);
	/* not reached */
}
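
/*
 * The context_save()/context_restore() pair above behaves much like
 * setjmp()/longjmp(): context_save() returns a non-zero value when called
 * directly and appears to return zero once context_restore() switches back
 * to the saved context. That is why the code a thread executes when it is
 * eventually rescheduled sits inside the "if (!context_save(...))" branch.
 */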

/** Scheduler stack switch wrapper
 *
 * Second part of the scheduler() function,
 * running on the new stack. It handles the actual
 * context switch to a new thread.
 *
 * Assume THREAD->lock is held.
 */
void scheduler_separated_stack(void)
{
	int priority;
	DEADLOCK_PROBE_INIT(p_joinwq);

	ASSERT(CPU != NULL);

	if (THREAD) {
		/* must be run after the switch to scheduler stack */
		after_thread_ran();

		switch (THREAD->state) {
		case Running:
			spinlock_unlock(&THREAD->lock);
			thread_ready(THREAD);
			break;

		case Exiting:
repeat:
			if (THREAD->detached) {
				thread_destroy(THREAD);
			} else {
				/*
				 * The thread structure is kept allocated until
				 * somebody calls thread_detach() on it.
				 */
				if (!spinlock_trylock(&THREAD->join_wq.lock)) {
					/*
					 * Avoid deadlock.
					 */
					spinlock_unlock(&THREAD->lock);
					delay(10);
					spinlock_lock(&THREAD->lock);
					DEADLOCK_PROBE(p_joinwq,
					    DEADLOCK_THRESHOLD);
					goto repeat;
				}
				_waitq_wakeup_unsafe(&THREAD->join_wq,
				    WAKEUP_FIRST);
				spinlock_unlock(&THREAD->join_wq.lock);

				THREAD->state = Lingering;
				spinlock_unlock(&THREAD->lock);
			}
			break;

		case Sleeping:
			/*
			 * Prefer the thread after it's woken up.
			 */
			THREAD->priority = -1;

			/*
			 * We need to release wq->lock which we locked in
			 * waitq_sleep(). Address of wq->lock is kept in
			 * THREAD->sleep_queue.
			 */
			spinlock_unlock(&THREAD->sleep_queue->lock);

			/*
			 * Check for possible requests for out-of-context
			 * invocation.
			 */
			if (THREAD->call_me) {
				THREAD->call_me(THREAD->call_me_with);
				THREAD->call_me = NULL;
				THREAD->call_me_with = NULL;
			}

			spinlock_unlock(&THREAD->lock);

			break;

		default:
			/*
			 * Entering state is unexpected.
			 */
			panic("tid%" PRIu64 ": unexpected state %s.",
			    THREAD->tid, thread_states[THREAD->state]);
			break;
		}

		THREAD = NULL;
	}

	THREAD = find_best_thread();

	spinlock_lock(&THREAD->lock);
	priority = THREAD->priority;
	spinlock_unlock(&THREAD->lock);

	relink_rq(priority);

	/*
	 * If both the old and the new task are the same, lots of work is
	 * avoided.
	 */
	if (TASK != THREAD->task) {
		as_t *as1 = NULL;
		as_t *as2;

		if (TASK) {
			spinlock_lock(&TASK->lock);
			as1 = TASK->as;
			spinlock_unlock(&TASK->lock);
		}

		spinlock_lock(&THREAD->task->lock);
		as2 = THREAD->task->as;
		spinlock_unlock(&THREAD->task->lock);

		/*
		 * Note that it is possible for two tasks to share one address
		 * space.
		 */
		if (as1 != as2) {
			/*
			 * Both tasks and address spaces are different.
			 * Replace the old one with the new one.
			 */
			as_switch(as1, as2);
		}

		TASK = THREAD->task;
		before_task_runs();
	}

	spinlock_lock(&THREAD->lock);
	THREAD->state = Running;

#ifdef SCHEDULER_VERBOSE
	printf("cpu%u: tid %" PRIu64 " (priority=%d, ticks=%" PRIu64
	    ", nrdy=%ld)\n", CPU->id, THREAD->tid, THREAD->priority,
	    THREAD->ticks, atomic_get(&CPU->nrdy));
#endif

	/*
	 * Some architectures provide late kernel PA2KA(identity)
	 * mapping in a page fault handler. However, the page fault
	 * handler uses the kernel stack of the running thread and
	 * therefore cannot be used to map it. The kernel stack, if
	 * necessary, is to be mapped in before_thread_runs(). This
	 * function must be executed before the switch to the new stack.
	 */
	before_thread_runs();

	/*
	 * Copy the knowledge of CPU, TASK, THREAD and preemption counter to
	 * thread's stack.
	 */
	the_copy(THE, (the_t *) THREAD->kstack);

	context_restore(&THREAD->saved_context);
	/* not reached */
}
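
/*
 * Note on the_copy(): the THE bookkeeping structure (THREAD, TASK, CPU and
 * the preemption counter) resides at the base of a kernel stack, which is
 * why it is copied first to CPU->stack in scheduler() and then to
 * THREAD->kstack here before context_restore() switches stacks; otherwise
 * the THE-based macros would read stale data after the switch.
 */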

#ifdef CONFIG_SMP
/** Load balancing thread
 *
 * SMP load balancing thread that supervises the supply
 * of ready threads for the CPU it is wired to.
 *
 * @param arg Generic thread argument (unused).
 *
 */
void kcpulb(void *arg)
{
	thread_t *t;
	int count, average, j, k = 0;
	unsigned int i;
	ipl_t ipl;

	/*
	 * Detach kcpulb as nobody will call thread_join_timeout() on it.
	 */
	thread_detach(THREAD);

loop:
	/*
	 * Work in 1s intervals.
	 */
	thread_sleep(1);

not_satisfied:
	/*
	 * Calculate the number of threads that will be migrated/stolen from
	 * other CPUs. Note that the situation can have changed between two
	 * passes. Each time get the most up-to-date counts.
	 */
	average = atomic_get(&nrdy) / config.cpu_active + 1;
	count = average - atomic_get(&CPU->nrdy);

	if (count <= 0)
		goto satisfied;
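
	/*
	 * Worked example for the computation above: with 4 active CPUs and
	 * 11 ready threads in total, average = 11 / 4 + 1 = 3; a CPU that
	 * currently has only one ready thread gets count = 2 and will try to
	 * steal two threads in the loop below.
	 */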

	/*
	 * Search the lowest-priority queues on all CPUs first and the
	 * highest-priority queues on all CPUs last.
	 */
	for (j = RQ_COUNT - 1; j >= 0; j--) {
		for (i = 0; i < config.cpu_active; i++) {
			link_t *l;
			runq_t *r;
			cpu_t *cpu;

			cpu = &cpus[(i + k) % config.cpu_active];

			/*
			 * Not interested in ourselves.
			 * This does not require interrupt disabling because
			 * kcpulb has THREAD_FLAG_WIRED and cannot migrate.
			 */
			if (CPU == cpu)
				continue;
			if (atomic_get(&cpu->nrdy) <= average)
				continue;

			ipl = interrupts_disable();
			r = &cpu->rq[j];
			spinlock_lock(&r->lock);
			if (r->n == 0) {
				spinlock_unlock(&r->lock);
				interrupts_restore(ipl);
				continue;
			}

			t = NULL;
			l = r->rq_head.prev;	/* search rq from the back */
			while (l != &r->rq_head) {
				t = list_get_instance(l, thread_t, rq_link);
				/*
				 * We don't want to steal CPU-wired threads,
				 * nor threads that have already been stolen.
				 * The latter prevents threads from migrating
				 * between CPUs without ever being run. We
				 * also don't want to steal threads whose FPU
				 * context is still in the CPU.
				 */
				spinlock_lock(&t->lock);
				if ((!(t->flags & (THREAD_FLAG_WIRED |
				    THREAD_FLAG_STOLEN))) &&
				    (!(t->fpu_context_engaged))) {
					/*
					 * Remove t from r.
					 */
					spinlock_unlock(&t->lock);

					atomic_dec(&cpu->nrdy);
					atomic_dec(&nrdy);

					r->n--;
					list_remove(&t->rq_link);

					break;
				}
				spinlock_unlock(&t->lock);
				l = l->prev;
				t = NULL;
			}
			spinlock_unlock(&r->lock);

			if (t) {
				/*
				 * Ready t on local CPU
				 */
				spinlock_lock(&t->lock);
#ifdef KCPULB_VERBOSE
				printf("kcpulb%u: TID %" PRIu64 " -> cpu%u, "
				    "nrdy=%ld, avg=%ld\n", CPU->id, t->tid,
				    CPU->id, atomic_get(&CPU->nrdy),
				    atomic_get(&nrdy) / config.cpu_active);
#endif
				t->flags |= THREAD_FLAG_STOLEN;
				t->state = Entering;
				spinlock_unlock(&t->lock);

				thread_ready(t);

				interrupts_restore(ipl);

				if (--count == 0)
					goto satisfied;

				/*
				 * We are not satisfied yet, focus on another
				 * CPU next time.
				 */
				k++;

				continue;
			}
			interrupts_restore(ipl);
		}
	}

	if (atomic_get(&CPU->nrdy)) {
		/*
		 * Be a little bit light-weight and let migrated threads run.
		 */
		scheduler();
	} else {
		/*
		 * We failed to migrate a single thread.
		 * Give up this turn.
		 */
		goto loop;
	}

	goto not_satisfied;

satisfied:
	goto loop;
}

#endif /* CONFIG_SMP */
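
/*
 * kcpulb() is only the thread body. The load-balancing threads themselves
 * are created elsewhere in the kernel, one per active CPU and wired to it
 * (THREAD_FLAG_WIRED), so that the CPU macro used above stays stable. A
 * rough sketch of such a setup, with the thread_create() arguments assumed
 * rather than taken from this file:
 *
 *	t = thread_create(kcpulb, NULL, TASK, THREAD_FLAG_WIRED, "kcpulb", true);
 *	t->cpu = &cpus[i];
 *	thread_ready(t);
 */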

/** Print information about threads and scheduler queues */
void sched_print_list(void)
{
	ipl_t ipl;
	unsigned int cpu, i;
	runq_t *r;
	thread_t *t;
	link_t *cur;

	/*
	 * We are going to mess with scheduler structures,
	 * let's not be interrupted.
	 */
	ipl = interrupts_disable();

	for (cpu = 0; cpu < config.cpu_count; cpu++) {
		if (!cpus[cpu].active)
			continue;

		spinlock_lock(&cpus[cpu].lock);
		printf("cpu%u: address=%p, nrdy=%ld, needs_relink=%" PRIs "\n",
		    cpus[cpu].id, &cpus[cpu], atomic_get(&cpus[cpu].nrdy),
		    cpus[cpu].needs_relink);

		for (i = 0; i < RQ_COUNT; i++) {
			r = &cpus[cpu].rq[i];
			spinlock_lock(&r->lock);
			if (!r->n) {
				spinlock_unlock(&r->lock);
				continue;
			}
			printf("\trq[%u]: ", i);
			for (cur = r->rq_head.next; cur != &r->rq_head;
			    cur = cur->next) {
				t = list_get_instance(cur, thread_t, rq_link);
				printf("%" PRIu64 "(%s) ", t->tid,
				    thread_states[t->state]);
			}
			printf("\n");
			spinlock_unlock(&r->lock);
		}
		spinlock_unlock(&cpus[cpu].lock);
	}

	interrupts_restore(ipl);
}

/** @}
 */