/******************************************************************************
 * Generic domain-handling functions.
 */

#include <xeno/config.h>
#include <xeno/errno.h>
#include <xeno/sched.h>
#include <xeno/skbuff.h>
#include <xeno/interrupt.h>
#include <xeno/delay.h>
#include <xeno/event.h>
#include <xeno/time.h>
#include <hypervisor-ifs/dom0_ops.h>
#include <asm/domain_page.h>
#include <asm/flushtlb.h>
#include <xeno/blkdev.h>
#include <xeno/console.h>

/*
 * NB. No ring-3 access in initial guestOS pagetables. Note that we allow
 * ring-3 privileges in the page directories, so that the guestOS may later
 * decide to share a 4MB region with applications.
 */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
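/*
 * Illustrative note (not in the original source): L1_PROT omits _PAGE_USER,
 * so a data mapping built below with *l1tab++ = mk_l1_pgentry(cur_address|L1_PROT)
 * is supervisor-only (rings 0-2), while L2_PROT includes _PAGE_USER so
 * ring-3 access can later be granted at 4MB granularity via the directory.
 */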
/* Both these structures are protected by the tasklist_lock. */
rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;
struct task_struct *task_hash[TASK_HASH_SIZE];
struct task_struct *do_createdomain(unsigned int dom_id, unsigned int cpu)
struct task_struct *p = NULL;
p = alloc_task_struct();
if ( p == NULL ) return NULL;
memset(p, 0, sizeof(*p));
atomic_set(&p->refcnt, 1);
sprintf(p->name, "Domain-%d", dom_id);
spin_lock_init(&p->blk_ring_lock);
spin_lock_init(&p->page_lock);
p->shared_info = (void *)get_free_page(GFP_KERNEL);
memset(p->shared_info, 0, PAGE_SIZE);
SHARE_PFN_WITH_DOMAIN(virt_to_page(p->shared_info), dom_id);
p->mm.perdomain_pt = (l1_pgentry_t *)get_free_page(GFP_KERNEL);
memset(p->mm.perdomain_pt, 0, PAGE_SIZE);
p->addr_limit = USER_DS;
INIT_LIST_HEAD(&p->pg_head);
p->max_pages = p->tot_pages = 0;
write_lock_irqsave(&tasklist_lock, flags);
p->next_hash = task_hash[TASK_HASH(dom_id)];
task_hash[TASK_HASH(dom_id)] = p;
write_unlock_irqrestore(&tasklist_lock, flags);

struct task_struct *find_domain_by_id(unsigned int dom)
struct task_struct *p;
read_lock_irqsave(&tasklist_lock, flags);
p = task_hash[TASK_HASH(dom)];
if ( p->domain == dom )
read_unlock_irqrestore(&tasklist_lock, flags);

void kill_domain_with_errmsg(const char *err)
printk("DOM%d FATAL ERROR: %s\n",
current->domain, err);
#include <xen/config.h>
#include <xen/compat.h>
#include <xen/ctype.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/event.h>
#include <xen/console.h>
#include <xen/softirq.h>
#include <xen/domain_page.h>
#include <xen/rangeset.h>
#include <xen/guest_access.h>
#include <xen/hypercall.h>
#include <xen/delay.h>
#include <xen/shutdown.h>
#include <xen/percpu.h>
#include <xen/multicall.h>
#include <xen/rcupdate.h>
#include <acpi/cpufreq/cpufreq.h>
#include <asm/debugger.h>
#include <public/sched.h>
#include <public/sysctl.h>
#include <public/vcpu.h>
#include <xen/trace.h>
/* Linux config option: propagated to domain0 */
/* xen_processor_pmbits: xen control Cx, Px, ... */
unsigned int xen_processor_pmbits = XEN_PROCESSOR_PM_PX;

/* opt_dom0_vcpus_pin: If true, dom0 VCPUs are pinned. */
static unsigned int opt_dom0_vcpus_pin;
boolean_param("dom0_vcpus_pin", opt_dom0_vcpus_pin);

/* Set xen as the default cpufreq controller. */
enum cpufreq_controller cpufreq_controller = FREQCTL_xen;

static void __init setup_cpufreq_option(char *str)
if ( !strcmp(str, "dom0-kernel") )
xen_processor_pmbits &= ~XEN_PROCESSOR_PM_PX;
cpufreq_controller = FREQCTL_dom0_kernel;
opt_dom0_vcpus_pin = 1;
if ( !strcmp(str, "none") )
xen_processor_pmbits &= ~XEN_PROCESSOR_PM_PX;
cpufreq_controller = FREQCTL_none;
if ( (arg = strpbrk(str, ",:")) != NULL )
if ( !strcmp(str, "xen") )
cpufreq_cmdline_parse(arg);
custom_param("cpufreq", setup_cpufreq_option);
/* Protect updates/reads (resp.) of domain_list and domain_hash. */
DEFINE_SPINLOCK(domlist_update_lock);
DEFINE_RCU_READ_LOCK(domlist_read_lock);

#define DOMAIN_HASH_SIZE 256
#define DOMAIN_HASH(_id) ((int)(_id)&(DOMAIN_HASH_SIZE-1))
static struct domain *domain_hash[DOMAIN_HASH_SIZE];
struct domain *domain_list;
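/*
 * Worked example (illustrative, not in the original): DOMAIN_HASH masks
 * the domain ID into one of 256 buckets, so domid 300 hashes to
 * 300 & 255 == 44 and shares a hash chain with domid 44; collisions are
 * handled by the next_in_hashbucket links walked in get_domain_by_id().
 */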
struct vcpu *idle_vcpu[NR_CPUS] __read_mostly;

vcpu_info_t dummy_vcpu_info;

int current_domain_id(void)
return current->domain->domain_id;

static void __domain_finalise_shutdown(struct domain *d)
BUG_ON(!spin_is_locked(&d->shutdown_lock));
if ( d->is_shut_down )
for_each_vcpu ( d, v )
if ( !v->paused_for_shutdown )
if ( (d->shutdown_code == SHUTDOWN_suspend) && d->suspend_evtchn )
evtchn_send(d, d->suspend_evtchn);
send_guest_global_virq(dom0, VIRQ_DOM_EXC);

static void vcpu_check_shutdown(struct vcpu *v)
struct domain *d = v->domain;
spin_lock(&d->shutdown_lock);
if ( d->is_shutting_down )
if ( !v->paused_for_shutdown )
vcpu_pause_nosync(v);
v->paused_for_shutdown = 1;
v->defer_shutdown = 0;
__domain_finalise_shutdown(d);
spin_unlock(&d->shutdown_lock);

struct vcpu *alloc_vcpu(
struct domain *d, unsigned int vcpu_id, unsigned int cpu_id)
BUG_ON((!is_idle_domain(d) || vcpu_id) && d->vcpu[vcpu_id]);
if ( (v = alloc_vcpu_struct()) == NULL )
v->vcpu_id = vcpu_id;
spin_lock_init(&v->virq_lock);
if ( is_idle_domain(d) )
v->runstate.state = RUNSTATE_running;
v->runstate.state = RUNSTATE_offline;
v->runstate.state_entry_time = NOW();
set_bit(_VPF_down, &v->pause_flags);
v->vcpu_info = ((vcpu_id < XEN_LEGACY_MAX_VCPUS)
? (vcpu_info_t *)&shared_info(d, vcpu_info[vcpu_id])
if ( sched_init_vcpu(v, cpu_id) != 0 )
if ( vcpu_initialise(v) != 0 )
sched_destroy_vcpu(v);
d->vcpu[vcpu_id] = v;
int prev_id = v->vcpu_id - 1;
while ( (prev_id >= 0) && (d->vcpu[prev_id] == NULL) )
v->next_in_list = d->vcpu[prev_id]->next_in_list;
d->vcpu[prev_id]->next_in_list = v;
/* Must be called after making new vcpu visible to for_each_vcpu(). */
vcpu_check_shutdown(v);

struct vcpu *alloc_idle_vcpu(unsigned int cpu_id)
return idle_vcpu[cpu_id] ?: alloc_vcpu(idle_vcpu[0]->domain,

static unsigned int __read_mostly extra_dom0_irqs = 256;
static unsigned int __read_mostly extra_domU_irqs = 32;
static void __init parse_extra_guest_irqs(const char *s)
extra_domU_irqs = simple_strtoul(s, &s, 0);
if ( *s == ',' && isdigit(*++s) )
extra_dom0_irqs = simple_strtoul(s, &s, 0);
custom_param("extra_guest_irqs", parse_extra_guest_irqs);
struct domain *domain_create(
domid_t domid, unsigned int domcr_flags, ssidref_t ssidref)
struct domain *d, **pd;
enum { INIT_xsm = 1u<<0, INIT_rangeset = 1u<<1, INIT_evtchn = 1u<<2,
INIT_gnttab = 1u<<3, INIT_arch = 1u<<4 };

if ( (d = alloc_domain_struct()) == NULL )
memset(d, 0, sizeof(*d));
d->domain_id = domid;

lock_profile_register_struct(LOCKPROF_TYPE_PERDOM, d, domid, "Domain");

if ( xsm_alloc_security_domain(d) != 0 )
init_status |= INIT_xsm;

atomic_set(&d->refcnt, 1);
spin_lock_init_prof(d, domain_lock);
spin_lock_init_prof(d, page_alloc_lock);
spin_lock_init(&d->shutdown_lock);
spin_lock_init(&d->hypercall_deadlock_mutex);
INIT_PAGE_LIST_HEAD(&d->page_list);
INIT_PAGE_LIST_HEAD(&d->xenpage_list);

if ( domcr_flags & DOMCRF_hvm )

d->is_pinned = opt_dom0_vcpus_pin;
d->disable_migrate = 1;

if ( domcr_flags & DOMCRF_dummy )

rangeset_domain_initialise(d);
init_status |= INIT_rangeset;

if ( !is_idle_domain(d) )
if ( xsm_domain_create(d, ssidref) != 0 )
d->is_paused_by_controller = 1;
atomic_inc(&d->pause_count);
d->nr_pirqs = nr_irqs_gsi + extra_domU_irqs;
d->nr_pirqs = nr_irqs_gsi + extra_dom0_irqs;
d->pirq_to_evtchn = xmalloc_array(u16, d->nr_pirqs);
d->pirq_mask = xmalloc_array(
unsigned long, BITS_TO_LONGS(d->nr_pirqs));
if ( (d->pirq_to_evtchn == NULL) || (d->pirq_mask == NULL) )
memset(d->pirq_to_evtchn, 0, d->nr_pirqs * sizeof(*d->pirq_to_evtchn));
bitmap_zero(d->pirq_mask, d->nr_pirqs);

if ( evtchn_init(d) != 0 )
init_status |= INIT_evtchn;

if ( grant_table_create(d) != 0 )
init_status |= INIT_gnttab;

if ( arch_domain_create(d, domcr_flags) != 0 )
init_status |= INIT_arch;

d->iomem_caps = rangeset_new(d, "I/O Memory", RANGESETF_prettyprint_hex);
d->irq_caps = rangeset_new(d, "Interrupts", 0);
if ( (d->iomem_caps == NULL) || (d->irq_caps == NULL) )

if ( sched_init_domain(d) != 0 )

if ( !is_idle_domain(d) )
spin_lock(&domlist_update_lock);
pd = &domain_list; /* NB. domain_list maintained in order of domid. */
for ( pd = &domain_list; *pd != NULL; pd = &(*pd)->next_in_list )
if ( (*pd)->domain_id > d->domain_id )
d->next_in_list = *pd;
d->next_in_hashbucket = domain_hash[DOMAIN_HASH(domid)];
rcu_assign_pointer(*pd, d);
rcu_assign_pointer(domain_hash[DOMAIN_HASH(domid)], d);
spin_unlock(&domlist_update_lock);
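/*
 * Illustrative note (not in the original): rcu_assign_pointer() above
 * orders the domain's initialisation before publication, so a reader
 * walking domain_list or domain_hash under
 * rcu_read_lock(&domlist_read_lock), as get_domain_by_id() does below,
 * can never observe a partially constructed struct domain.
 */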
d->is_dying = DOMDYING_dead;
atomic_set(&d->refcnt, DOMAIN_DESTROYED);
if ( init_status & INIT_arch )
arch_domain_destroy(d);
if ( init_status & INIT_gnttab )
grant_table_destroy(d);
if ( init_status & INIT_evtchn )
evtchn_destroy_final(d);
if ( init_status & INIT_rangeset )
rangeset_domain_destroy(d);
if ( init_status & INIT_xsm )
xsm_free_security_domain(d);
xfree(d->pirq_to_evtchn);
free_domain_struct(d);

struct domain *get_domain_by_id(domid_t dom)
rcu_read_lock(&domlist_read_lock);
for ( d = rcu_dereference(domain_hash[DOMAIN_HASH(dom)]);
d = rcu_dereference(d->next_in_hashbucket) )
if ( d->domain_id == dom )
if ( unlikely(!get_domain(d)) )
rcu_read_unlock(&domlist_read_lock);
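/*
 * Typical caller pattern (a sketch, not from this file): the returned
 * domain carries a reference that must be dropped with put_domain().
 *
 *     struct domain *d = get_domain_by_id(domid);
 *     if ( d != NULL )
 *     {
 *         ... use d safely: the refcount pins it ...
 *         put_domain(d);
 *     }
 */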
struct domain *rcu_lock_domain_by_id(domid_t dom)
rcu_read_lock(&domlist_read_lock);
for ( d = rcu_dereference(domain_hash[DOMAIN_HASH(dom)]);
d = rcu_dereference(d->next_in_hashbucket) )
if ( d->domain_id == dom )
rcu_read_unlock(&domlist_read_lock);

int rcu_lock_target_domain_by_id(domid_t dom, struct domain **d)
if ( dom == DOMID_SELF )
*d = rcu_lock_current_domain();
if ( (*d = rcu_lock_domain_by_id(dom)) == NULL )
if ( !IS_PRIV_FOR(current->domain, *d) )
rcu_unlock_domain(*d);

int domain_kill(struct domain *d)
if ( d == current->domain )
/* Protected by domctl_lock. */
switch ( d->is_dying )
d->is_dying = DOMDYING_dying;
spin_barrier(&d->domain_lock);
gnttab_release_mappings(d);
tmem_destroy(d->tmem);
rc = domain_relinquish_resources(d);
BUG_ON(rc != -EAGAIN);
d->is_dying = DOMDYING_dead;
send_guest_global_virq(dom0, VIRQ_DOM_EXC);

void __domain_crash(struct domain *d)
if ( d->is_shutting_down )
/* Print nothing: the domain is already shutting down. */
else if ( d == current->domain )
printk("Domain %d (vcpu#%d) crashed on cpu#%d:\n",
d->domain_id, current->vcpu_id, smp_processor_id());
show_execution_state(guest_cpu_user_regs());
printk("Domain %d reported crashed by domain %d on cpu#%d:\n",
d->domain_id, current->domain->domain_id, smp_processor_id());
domain_shutdown(d, SHUTDOWN_crash);

void __domain_crash_synchronous(void)
__domain_crash(current->domain);

/*
 * Flush multicall state before dying if a multicall is in progress.
 * This shouldn't be necessary, but some architectures are calling
 * domain_crash_synchronous() when they really shouldn't (i.e., from
 * within hypercall context).
 */
if ( this_cpu(mc_state).flags != 0 )
"FIXME: synchronous domain crash during a multicall!\n");
this_cpu(mc_state).flags = 0;

vcpu_end_shutdown_deferral(current);

void domain_shutdown(struct domain *d, u8 reason)
if ( d->domain_id == 0 )
dom0_shutdown(reason);
spin_lock(&d->shutdown_lock);
if ( d->is_shutting_down )
spin_unlock(&d->shutdown_lock);
d->is_shutting_down = 1;
d->shutdown_code = reason;
smp_mb(); /* set shutdown status /then/ check for per-cpu deferrals */
for_each_vcpu ( d, v )
if ( reason == SHUTDOWN_crash )
v->defer_shutdown = 0;
else if ( v->defer_shutdown )
vcpu_pause_nosync(v);
v->paused_for_shutdown = 1;
__domain_finalise_shutdown(d);
spin_unlock(&d->shutdown_lock);

void domain_resume(struct domain *d)
/*
 * Some code paths assume that shutdown status does not get reset under
 * their feet (e.g., some assertions make this assumption).
 */
spin_lock(&d->shutdown_lock);
d->is_shutting_down = d->is_shut_down = 0;
for_each_vcpu ( d, v )
if ( v->paused_for_shutdown )
v->paused_for_shutdown = 0;
spin_unlock(&d->shutdown_lock);

int vcpu_start_shutdown_deferral(struct vcpu *v)
if ( v->defer_shutdown )
v->defer_shutdown = 1;
smp_mb(); /* set deferral status /then/ check for shutdown */
if ( unlikely(v->domain->is_shutting_down) )
vcpu_check_shutdown(v);
return v->defer_shutdown;

void vcpu_end_shutdown_deferral(struct vcpu *v)
v->defer_shutdown = 0;
smp_mb(); /* clear deferral status /then/ check for shutdown */
if ( unlikely(v->domain->is_shutting_down) )
vcpu_check_shutdown(v);
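/*
 * Illustrative summary (not in the original): the smp_mb() here and the
 * one in domain_shutdown() form a store/load handshake. Each side
 * publishes its own flag (defer_shutdown vs. is_shutting_down) before
 * checking the other's, so at least one side is guaranteed to observe
 * the conflicting state and call vcpu_check_shutdown().
 */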
void domain_pause_for_debugger(void)
struct domain *d = current->domain;
atomic_inc(&d->pause_count);
if ( test_and_set_bool(d->is_paused_by_controller) )
domain_unpause(d); /* race-free atomic_dec(&d->pause_count) */
for_each_vcpu ( d, v )
vcpu_sleep_nosync(v);
send_guest_global_virq(dom0, VIRQ_DEBUGGER);
void __kill_domain(struct task_struct *p)
struct task_struct **pp;
if ( p->domain == 0 )
extern void machine_restart(char *);
printk("Domain 0 killed: rebooting machine!\n");

/* Only allow the domain to be destroyed once. */
if ( !sched_rem_domain(p) )
printk("Killing domain %d\n", p->domain);
unlink_blkdev_info(p);
for ( i = 0; i < MAX_DOMAIN_VIFS; i++ )
unlink_net_vif(p->net_vif_list[i]);

/*
 * Note this means that find_domain_by_id may fail, even when the caller
 * holds a reference to the domain being queried. Take care!
 */
write_lock_irqsave(&tasklist_lock, flags);
pp = &task_hash[TASK_HASH(p->domain)];
while ( *pp != p ) pp = &(*pp)->next_hash;
write_unlock_irqrestore(&tasklist_lock, flags);
BUG(); /* never get here */

void kill_domain(void)
__kill_domain(current);

long kill_other_domain(unsigned int dom, int force)
struct task_struct *p;
unsigned long cpu_mask = 0;
p = find_domain_by_id(dom);
if ( p == NULL ) return -ESRCH;
if ( p->state == TASK_STOPPED )
cpu_mask = mark_hyp_event(p, _HYP_EVENT_DIE);
hyp_event_notify(cpu_mask);
cpu_mask = mark_guest_event(p, _EVENT_DIE);
guest_event_notify(cpu_mask);

void stop_domain(void)
memcpy(&current->shared_info->execution_context,
get_execution_context(),
sizeof(execution_context_t));
wmb(); /* All CPUs must see saved info in state TASK_STOPPED. */
set_current_state(TASK_STOPPED);

long stop_other_domain(unsigned int dom)
unsigned long cpu_mask;
struct task_struct *p;
p = find_domain_by_id(dom);
if ( p == NULL ) return -ESRCH;
if ( p->state != TASK_STOPPED )
cpu_mask = mark_guest_event(p, _EVENT_STOP);
guest_event_notify(cpu_mask);

unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes)
struct list_head *temp;
unsigned int alloc_pfns;
unsigned int req_pages;

/* how many pages do we need to alloc? */
req_pages = kbytes >> (PAGE_SHIFT - 10);
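/*
 * Worked example (illustrative): with 4KB pages PAGE_SHIFT is 12, so
 * req_pages = kbytes >> 2; a request for 16384 kbytes therefore becomes
 * 4096 pages.
 */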
spin_lock_irqsave(&free_list_lock, flags);

/* is there enough mem to serve the request? */
if ( (req_pages + (SLACK_DOMAIN_MEM_KILOBYTES >> (PAGE_SHIFT-10))) >
spin_unlock_irqrestore(&free_list_lock, flags);

/* allocate pages and build a thread through frame_table */
temp = free_list.next;
for ( alloc_pfns = 0; alloc_pfns < req_pages; alloc_pfns++ )
pf = list_entry(temp, struct pfn_info, list);
pf->flags = p->domain;
set_page_type_count(pf, 0);
set_page_tot_count(pf, 0);
list_add_tail(&pf->list, &p->pg_head);
ASSERT(free_pfns != 0);
spin_unlock_irqrestore(&free_list_lock, flags);

p->tot_pages = req_pages;

/* TEMPORARY: max_pages should be explicitly specified. */
p->max_pages = p->tot_pages;

void free_all_dom_mem(struct task_struct *p)
struct list_head *ent;
spin_lock_irqsave(&free_list_lock, flags);
while ( (ent = p->pg_head.next) != &p->pg_head )
struct pfn_info *pf = list_entry(ent, struct pfn_info, list);
set_page_type_count(pf, 0);
set_page_tot_count(pf, 0);
ASSERT(ent->next->prev == ent);
ASSERT(ent->prev->next == ent);
list_add(ent, &free_list);
spin_unlock_irqrestore(&free_list_lock, flags);
/* Complete domain destroy after RCU readers are not holding old references. */
static void complete_domain_destroy(struct rcu_head *head)
struct domain *d = container_of(head, struct domain, rcu);
for ( i = d->max_vcpus - 1; i >= 0; i-- )
if ( (v = d->vcpu[i]) == NULL )
sched_destroy_vcpu(v);

grant_table_destroy(d);
arch_domain_destroy(d);
rangeset_domain_destroy(d);
sched_destroy_domain(d);

/* Free page used by xen oprofile buffer. */
free_xenoprof_pages(d);

for ( i = d->max_vcpus - 1; i >= 0; i-- )
if ( (v = d->vcpu[i]) != NULL )

if ( d->target != NULL )
put_domain(d->target);

evtchn_destroy_final(d);

xfree(d->pirq_to_evtchn);

xsm_free_security_domain(d);
free_domain_struct(d);

send_guest_global_virq(dom0, VIRQ_DOM_EXC);
/* Release resources belonging to task @p. */
void release_task(struct task_struct *p)
ASSERT(p->state == TASK_DYING);
printk("Releasing task %d\n", p->domain);

/*
 * This frees up blkdev rings and vbd-access lists. Totally safe since
 * blkdev ref counting actually uses the task_struct refcnt.
 */
destroy_blkdev_info(p);

/* Free all memory associated with this domain. */
free_page((unsigned long)p->mm.perdomain_pt);
UNSHARE_PFN(virt_to_page(p->shared_info));
free_page((unsigned long)p->shared_info);
kmem_cache_free(task_struct_cachep, p);

/*
 * final_setup_guestos is used for final setup and launching of domains
 * other than domain 0, i.e. the domains that are being built by the
 * userspace dom0
 */
int final_setup_guestos(struct task_struct *p, dom0_builddomain_t *builddomain)
unsigned long phys_l2tab;
if ( (p->flags & PF_CONSTRUCTED) )
p->flags &= ~PF_DONEFPUINIT;
if ( builddomain->ctxt.flags & ECF_I387_VALID )
p->flags |= PF_DONEFPUINIT;
memcpy(&p->shared_info->execution_context,
&builddomain->ctxt.i386_ctxt,
sizeof(p->shared_info->execution_context));
memcpy(&p->thread.i387,
&builddomain->ctxt.i387_ctxt,
sizeof(p->thread.i387));
memcpy(p->thread.traps,
&builddomain->ctxt.trap_ctxt,
sizeof(p->thread.traps));
SET_DEFAULT_FAST_TRAP(&p->thread);
(void)set_fast_trap(p, builddomain->ctxt.fast_trap_idx);
p->mm.ldt_base = builddomain->ctxt.ldt_base;
p->mm.ldt_ents = builddomain->ctxt.ldt_ents;
SET_GDT_ENTRIES(p, DEFAULT_GDT_ENTRIES);
SET_GDT_ADDRESS(p, DEFAULT_GDT_ADDRESS);
if ( builddomain->ctxt.gdt_ents != 0 )
builddomain->ctxt.gdt_frames,
builddomain->ctxt.gdt_ents);
p->thread.ss1 = builddomain->ctxt.ring1_ss;
p->thread.esp1 = builddomain->ctxt.ring1_esp;
for ( i = 0; i < 8; i++ )
(void)set_debugreg(p, i, builddomain->ctxt.debugreg[i]);
p->event_selector = builddomain->ctxt.event_callback_cs;
p->event_address = builddomain->ctxt.event_callback_eip;
p->failsafe_selector = builddomain->ctxt.failsafe_callback_cs;
p->failsafe_address = builddomain->ctxt.failsafe_callback_eip;

/* NB. Page base must already be pinned! */
phys_l2tab = builddomain->ctxt.pt_base;
p->mm.pagetable = mk_pagetable(phys_l2tab);
get_page_type(&frame_table[phys_l2tab>>PAGE_SHIFT]);
get_page_tot(&frame_table[phys_l2tab>>PAGE_SHIFT]);

/* Set up the shared info structure. */
update_dom_time(p->shared_info);

/* Add virtual network interfaces and point to them in startinfo. */
while ( builddomain->num_vifs-- > 0 )
(void)create_net_vif(p->domain);

p->flags |= PF_CONSTRUCTED;

static unsigned long alloc_page_from_domain(unsigned long * cur_addr,
unsigned long * index)
unsigned long ret = *cur_addr;
struct list_head *ent = frame_table[ret >> PAGE_SHIFT].list.prev;
*cur_addr = list_entry(ent, struct pfn_info, list) - frame_table;
*cur_addr <<= PAGE_SHIFT;

/*
 * setup_guestos is used for building dom0 solely. Other domains are built
 * in userspace dom0, and final setup is done by final_setup_guestos.
 */
int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
unsigned int num_vifs,
char *phy_data_start, unsigned long data_len,
char *cmdline, unsigned long initrd_len)
struct list_head *list_ent;
char *src, *vsrc, *dst, *data_start;
int i, dom = p->domain;
unsigned long phys_l1tab, phys_l2tab;
unsigned long cur_address, alloc_address;
unsigned long virt_load_address, virt_stack_address;
start_info_t *virt_startinfo_address;
unsigned long alloc_index;
l2_pgentry_t *l2tab, *l2start;
l1_pgentry_t *l1tab = NULL, *l1start = NULL;
struct pfn_info *page = NULL;
extern void ide_probe_devices(xen_disk_info_t *);
extern void scsi_probe_devices(xen_disk_info_t *);
extern void cciss_probe_devices(xen_disk_info_t *);

if ( p->domain != 0 ) BUG();
if ( (p->flags & PF_CONSTRUCTED) ) BUG();

/*
 * This is all a bit grim. We've moved the modules to the "safe" physical
 * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this
 * routine, we're going to copy it down into the region that's actually
 * been allocated to domain 0. This is highly likely to be overlapping, so
 * we use a forward copy.
 *
 * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with
 * 4GB and lots of network/disk cards that allocate loads of buffers.
 * We'll have to revisit this if we ever support PAE (64GB).
 */
data_start = map_domain_mem((unsigned long)phy_data_start);
if ( strncmp(data_start, "XenoGues", 8) )
printk("DOM%d: Invalid guest OS image\n", dom);

virt_load_address = *(unsigned long *)(data_start + 8);
if ( (virt_load_address & (PAGE_SIZE-1)) )
printk("DOM%d: Guest OS load address not page-aligned (%08lx)\n",
dom, virt_load_address);

if ( alloc_new_dom_mem(p, params->memory_kb) )
printk("DOM%d: Not enough memory --- reduce dom0_mem ??\n", dom);

alloc_address = list_entry(p->pg_head.prev, struct pfn_info, list) -
alloc_address <<= PAGE_SHIFT;
alloc_index = p->tot_pages;

if ( data_len > (params->memory_kb << 9) )
printk("DOM%d: Guest OS image is too large\n"
" (%luMB is greater than %uMB limit for a\n"
" %uMB address space)\n",
(params->memory_kb)>>11,
(params->memory_kb)>>10);
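/*
 * Illustrative arithmetic (not in the original): memory_kb << 10 would
 * be the full allocation in bytes, so the << 9 test above limits the
 * image to half the domain's memory; the >> 11 and >> 10 in the printk
 * likewise report the half-size limit and the full address-space size
 * in MB.
 */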
printk("DOM%d: Guest OS virtual load address is %08lx\n", dom,

SET_GDT_ENTRIES(p, DEFAULT_GDT_ENTRIES);
SET_GDT_ADDRESS(p, DEFAULT_GDT_ADDRESS);

/*
 * We're basically forcing default RPLs to 1, so that our "what privilege
 * level are we returning to?" logic works.
 */
p->failsafe_selector = FLAT_RING1_CS;
p->event_selector = FLAT_RING1_CS;
p->thread.ss1 = FLAT_RING1_DS;
for ( i = 0; i < 256; i++ )
p->thread.traps[i].cs = FLAT_RING1_CS;

/*
 * WARNING: The new domain must have its 'processor' field
 * filled in by now !!
 */
phys_l2tab = alloc_page_from_domain(&alloc_address, &alloc_index);
l2start = l2tab = map_domain_mem(phys_l2tab);
memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
mk_l2_pgentry(__pa(p->mm.perdomain_pt) | __PAGE_HYPERVISOR);
l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
mk_l2_pgentry(phys_l2tab | __PAGE_HYPERVISOR);
memset(l2tab, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
p->mm.pagetable = mk_pagetable(phys_l2tab);

l2tab += l2_table_offset(virt_load_address);
cur_address = list_entry(p->pg_head.next, struct pfn_info, list) -
cur_address <<= PAGE_SHIFT;
for ( count = 0; count < p->tot_pages; count++ )
if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
if ( l1tab != NULL ) unmap_domain_mem(l1start);
phys_l1tab = alloc_page_from_domain(&alloc_address, &alloc_index);
*l2tab++ = mk_l2_pgentry(phys_l1tab|L2_PROT);
l1start = l1tab = map_domain_mem(phys_l1tab);
l1tab += l1_table_offset(
virt_load_address + (count << PAGE_SHIFT));
*l1tab++ = mk_l1_pgentry(cur_address|L1_PROT);
page = frame_table + (cur_address >> PAGE_SHIFT);
page->flags = dom | PGT_writeable_page | PG_need_flush;
set_page_type_count(page, 1);
set_page_tot_count(page, 1);
/* Set up the MPT entry. */
machine_to_phys_mapping[cur_address >> PAGE_SHIFT] = count;
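/*
 * Worked example (illustrative): if the domain's fifth page (count == 4)
 * is the machine page at 0x00345000, the loop writes the l1 entry
 * 0x00345000|L1_PROT for virtual address virt_load_address + 0x4000 and
 * records machine_to_phys_mapping[0x345] = 4, giving the reverse
 * machine-to-pseudophysical translation.
 */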
list_ent = frame_table[cur_address >> PAGE_SHIFT].list.next;
cur_address = list_entry(list_ent, struct pfn_info, list) -
cur_address <<= PAGE_SHIFT;
unmap_domain_mem(l1start);

/* Pages that are part of page tables must be read-only. */
l2tab = l2start + l2_table_offset(virt_load_address +
(alloc_index << PAGE_SHIFT));
l1start = l1tab = map_domain_mem(l2_pgentry_to_phys(*l2tab));
l1tab += l1_table_offset(virt_load_address + (alloc_index << PAGE_SHIFT));
for ( count = alloc_index; count < p->tot_pages; count++ )
*l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
page = frame_table + l1_pgentry_to_pagenr(*l1tab);
page->flags = dom | PGT_l1_page_table;
if ( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
unmap_domain_mem(l1start);
l1start = l1tab = map_domain_mem(l2_pgentry_to_phys(*l2tab));
get_page_type(page); /* guest_pinned */
get_page_tot(page); /* guest_pinned */
page->flags = dom | PG_guest_pinned | PGT_l2_page_table;
unmap_domain_mem(l1start);

/* Set up shared info area. */
update_dom_time(p->shared_info);
p->shared_info->domain_time = 0;

virt_startinfo_address = (start_info_t *)
(virt_load_address + ((alloc_index - 1) << PAGE_SHIFT));
virt_stack_address = (unsigned long)virt_startinfo_address;

unmap_domain_mem(l2start);

/* Install the new page tables. */
__write_cr3_counted(pagetable_val(p->mm.pagetable));

/* Copy the guest OS image. */
src = (char *)(phy_data_start + 12);
vsrc = (char *)(data_start + 12); /* data_start invalid after first page */
dst = (char *)virt_load_address;
while ( src < (phy_data_start+data_len) )
if ( (((unsigned long)src) & (PAGE_SIZE-1)) == 0 )
unmap_domain_mem( vsrc-1 );
vsrc = map_domain_mem( (unsigned long)src );
unmap_domain_mem( vsrc );

/* Set up start info area. */
memset(virt_startinfo_address, 0, sizeof(*virt_startinfo_address));
virt_startinfo_address->nr_pages = p->tot_pages;
virt_startinfo_address->shared_info = virt_to_phys(p->shared_info);
virt_startinfo_address->pt_base = virt_load_address +
((p->tot_pages - 1) << PAGE_SHIFT);
virt_startinfo_address->dom_id = p->domain;
virt_startinfo_address->flags = 0;
virt_startinfo_address->flags |= SIF_PRIVILEGED;
if ( CONSOLE_ISOWNER(p) )
virt_startinfo_address->flags |= SIF_CONSOLE;
virt_startinfo_address->mod_start = (unsigned long)dst-initrd_len;
virt_startinfo_address->mod_len = initrd_len;
printk("Initrd len 0x%lx, start at 0x%08lx\n",
virt_startinfo_address->mod_len,
virt_startinfo_address->mod_start);

/* Add virtual network interfaces and point to them in startinfo. */
while ( num_vifs-- > 0 )
(void)create_net_vif(dom);

dst = virt_startinfo_address->cmd_line;
if ( cmdline != NULL )
for ( i = 0; i < 255; i++ )
if ( cmdline[i] == '\0' ) break;

/* If this guy's getting the console we'd better let go. */
if ( virt_startinfo_address->flags & SIF_CONSOLE )
/* NB. Should reset the console here. */

/* Reinstate the caller's page tables. */
__write_cr3_counted(pagetable_val(current->mm.pagetable));

/* DOM0 gets access to all real block devices. */
#define MAX_REAL_DISKS 256
xd = kmalloc(MAX_REAL_DISKS * sizeof(xen_disk_t), GFP_KERNEL);
xdi.max = MAX_REAL_DISKS;
ide_probe_devices(&xdi);
scsi_probe_devices(&xdi);
cciss_probe_devices(&xdi);
for ( i = 0; i < xdi.count; i++ )
e.device = xd[i].device;
e.nr_sectors = xd[i].capacity;
if ( (__vbd_create(p, xd[i].device, VBD_MODE_R|VBD_MODE_W,
(__vbd_grow(p, xd[i].device, &e) != 0) )

p->flags |= PF_CONSTRUCTED;
(unsigned long)virt_load_address,
(unsigned long)virt_stack_address,
(unsigned long)virt_startinfo_address);

void __init domain_init(void)
printk("Initialising domains\n");
void domain_destroy(struct domain *d)
BUG_ON(!d->is_dying);

/* May be already destroyed, or get_domain() can race us. */
_atomic_set(new, DOMAIN_DESTROYED);
old = atomic_compareandswap(old, new, &d->refcnt);
if ( _atomic_read(old) != 0 )

/* Delete from task list and task hashtable. */
TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id);
spin_lock(&domlist_update_lock);
pd = &(*pd)->next_in_list;
rcu_assign_pointer(*pd, d->next_in_list);
pd = &domain_hash[DOMAIN_HASH(d->domain_id)];
pd = &(*pd)->next_in_hashbucket;
rcu_assign_pointer(*pd, d->next_in_hashbucket);
spin_unlock(&domlist_update_lock);

/* Schedule RCU asynchronous completion of domain destroy. */
call_rcu(&d->rcu, complete_domain_destroy);
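/*
 * Illustrative note (not in the original): call_rcu() defers
 * complete_domain_destroy() until all CPUs have passed a quiescent
 * state, so readers still traversing the old list or hash chain under
 * rcu_read_lock() cannot touch freed memory.
 */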
void vcpu_pause(struct vcpu *v)
ASSERT(v != current);
atomic_inc(&v->pause_count);

void vcpu_pause_nosync(struct vcpu *v)
atomic_inc(&v->pause_count);
vcpu_sleep_nosync(v);

void vcpu_unpause(struct vcpu *v)
if ( atomic_dec_and_test(&v->pause_count) )

void domain_pause(struct domain *d)
ASSERT(d != current->domain);
atomic_inc(&d->pause_count);
for_each_vcpu( d, v )

void domain_unpause(struct domain *d)
if ( atomic_dec_and_test(&d->pause_count) )
for_each_vcpu( d, v )

void domain_pause_by_systemcontroller(struct domain *d)
if ( test_and_set_bool(d->is_paused_by_controller) )

void domain_unpause_by_systemcontroller(struct domain *d)
if ( test_and_clear_bool(d->is_paused_by_controller) )

int boot_vcpu(struct domain *d, int vcpuid, vcpu_guest_context_u ctxt)
struct vcpu *v = d->vcpu[vcpuid];
BUG_ON(v->is_initialised);
return arch_set_info_guest(v, ctxt);

void vcpu_reset(struct vcpu *v)
struct domain *d = v->domain;
set_bit(_VPF_down, &v->pause_flags);
clear_bit(v->vcpu_id, d->poll_mask);
v->fpu_initialised = 0;
v->is_initialised = 0;
#ifdef VCPU_TRAP_LAST
v->async_exception_mask = 0;
memset(v->async_exception_state, 0, sizeof(v->async_exception_state));
cpus_clear(v->cpu_affinity_tmp);
clear_bit(_VPF_blocked, &v->pause_flags);
domain_unlock(v->domain);

long do_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
struct domain *d = current->domain;
struct vcpu_guest_context *ctxt;
if ( (vcpuid < 0) || (vcpuid >= MAX_VIRT_CPUS) )
if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )

case VCPUOP_initialise:
if ( v->vcpu_info == &dummy_vcpu_info )
if ( (ctxt = xmalloc(struct vcpu_guest_context)) == NULL )
if ( copy_from_guest(ctxt, arg, 1) )
if ( !v->is_initialised )
rc = boot_vcpu(d, vcpuid, ctxt);

if ( !v->is_initialised )
if ( test_and_clear_bit(_VPF_down, &v->pause_flags) )

if ( !test_and_set_bit(_VPF_down, &v->pause_flags) )
vcpu_sleep_nosync(v);

rc = !test_bit(_VPF_down, &v->pause_flags);

case VCPUOP_get_runstate_info:
struct vcpu_runstate_info runstate;
vcpu_runstate_get(v, &runstate);
if ( copy_to_guest(arg, &runstate, 1) )

case VCPUOP_set_periodic_timer:
struct vcpu_set_periodic_timer set;
if ( copy_from_guest(&set, arg, 1) )
if ( set.period_ns < MILLISECS(1) )
v->periodic_period = set.period_ns;
vcpu_force_reschedule(v);

case VCPUOP_stop_periodic_timer:
v->periodic_period = 0;
vcpu_force_reschedule(v);

case VCPUOP_set_singleshot_timer:
struct vcpu_set_singleshot_timer set;
if ( copy_from_guest(&set, arg, 1) )
if ( (set.flags & VCPU_SSHOTTMR_future) &&
(set.timeout_abs_ns < NOW()) )
if ( v->singleshot_timer.cpu != smp_processor_id() )
stop_timer(&v->singleshot_timer);
v->singleshot_timer.cpu = smp_processor_id();
set_timer(&v->singleshot_timer, set.timeout_abs_ns);

case VCPUOP_stop_singleshot_timer:
stop_timer(&v->singleshot_timer);

case VCPUOP_send_nmi:
if ( !guest_handle_is_null(arg) )
if ( !test_and_set_bool(v->nmi_pending) )

rc = arch_do_vcpu_op(cmd, v, arg);

long vm_assist(struct domain *p, unsigned int cmd, unsigned int type)
if ( type > MAX_VMASST_TYPE )

case VMASST_CMD_enable:
set_bit(type, &p->vm_assist);

case VMASST_CMD_disable:
clear_bit(type, &p->vm_assist);
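/*
 * Illustrative usage (not in this excerpt): a guest issues the vm_assist
 * hypercall with e.g. cmd == VMASST_CMD_enable and type ==
 * VMASST_TYPE_writable_pagetables; the bit set in p->vm_assist is then
 * tested by the corresponding fast paths elsewhere in the hypervisor.
 */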
/*
 * Local variables:
 * mode: C
 * indent-tabs-mode: nil
 * End:
 */