/******************************************************************************
 * arch/x86/mm/shadow/multi.c
 *
 * Simple, mostly-synchronous shadow page tables.
 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#include <xen/config.h>
#include <xen/types.h>
#include <xen/trace.h>
#include <xen/sched.h>
#include <xen/perfc.h>
#include <xen/domain_page.h>
#include <asm/current.h>
#include <asm/shadow.h>
#include <asm/flushtlb.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/cacheattr.h>
#include <asm/guest_pt.h>
#include <public/sched.h>
/* THINGS TO DO LATER:
 *
 * Also: have a heuristic for when to destroy a previous paging-mode's
 * shadows.  When a guest is done with its start-of-day 32-bit tables
 * and reuses the memory we want to drop those shadows.  Start with
 * shadows in a page in two modes as a hint, but beware of clever tricks
 * like reusing a pagetable for both PAE and 64-bit during boot...
 *
 * Rework shadow_get_l*e() to have the option of using map_domain_page()
 * instead of linear maps.  Add appropriate unmap_l*e calls in the users.
 * Then we can test the speed difference made by linear maps.  If the
 * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
 * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
 * to share l2h pages again.
 *
 * PSE disabled / PSE36
 * We don't support any modes other than PSE enabled, PSE36 disabled.
 * Neither of those would be hard to change, but we'd need to be able to
 * deal with shadows made in one mode and used in another.
 */
#define FETCH_TYPE_PREFETCH 1
#define FETCH_TYPE_DEMAND   2
#define FETCH_TYPE_WRITE    4
typedef enum {
    ft_prefetch     = FETCH_TYPE_PREFETCH,
    ft_demand_read  = FETCH_TYPE_DEMAND,
    ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
} fetch_type_t;

#ifdef DEBUG_TRACE_DUMP
static char *fetch_type_names[] = {
    [ft_prefetch]     "prefetch",
    [ft_demand_read]  "demand read",
    [ft_demand_write] "demand write",
};
#endif
/**************************************************************************/
/* Hash table mapping from guest pagetables to shadows
 *
 * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
 * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
 *        shadow L1 which maps its "splinters". */
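
/* Illustrative sketch (not part of the original file): the lifecycle that
 * the helpers below implement.  A shadow is entered into the hash table
 * when it is created and removed again when it is destroyed; lookups
 * happen when a shadow needs to be linked into a higher-level shadow. */
#if 0 /* example only */
    /* At creation time (see sh_make_shadow()): */
    set_shadow_status(v, gmfn, SH_type_l1_shadow, smfn);
    /* On lookup, e.g. while building shadows for a demand fault: */
    smfn = get_shadow_status(v, gmfn, SH_type_l1_shadow);
    /* At destruction time (see sh_destroy_l1_shadow()): */
    delete_shadow_status(v, gmfn, SH_type_l1_shadow, smfn);
#endif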
92
get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
93
/* Look for FL1 shadows in the hash table */
95
mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow);
100
get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
101
/* Look for shadows in the hash table */
103
mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
104
perfc_incr(shadow_get_shadow_status);
109
set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
110
/* Put an FL1 shadow into the hash table */
112
SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
113
gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
115
shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
119
set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
120
/* Put a shadow into the hash table */
122
struct domain *d = v->domain;
125
SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
126
d->domain_id, v->vcpu_id, mfn_x(gmfn),
127
shadow_type, mfn_x(smfn));
129
/* 32-on-64 PV guests don't own their l4 pages so can't get_page them */
130
if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
132
res = get_page(mfn_to_page(gmfn), d);
136
shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
140
delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
141
/* Remove a shadow from the hash table */
143
SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
144
gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
145
shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn);
149
delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
150
/* Remove a shadow from the hash table */
152
SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
153
v->domain->domain_id, v->vcpu_id,
154
mfn_x(gmfn), shadow_type, mfn_x(smfn));
155
shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn);
156
/* 32-on-64 PV guests don't own their l4 pages; see set_shadow_status */
157
if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow )
158
put_page(mfn_to_page(gmfn));
162
/**************************************************************************/
163
/* Functions for walking the guest page tables */
165
static inline uint32_t
166
sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
169
return guest_walk_tables(v, va, gw, pfec,
170
#if GUEST_PAGING_LEVELS == 3 /* PAE */
172
v->arch.paging.shadow.gl3e
174
pagetable_get_mfn(v->arch.guest_table),
175
v->arch.paging.shadow.guest_vtable
/* This validation is called with the shadow lock held, and after write
 * permission has been removed.  The check is therefore atomic: no
 * inconsistent content can be observed before the lock is released.
 *
 * Return 1 to indicate success and 0 for inconsistency. */
186
static inline uint32_t
187
shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw, int version)
189
struct domain *d = v->domain;
192
#if GUEST_PAGING_LEVELS >= 4
198
ASSERT(shadow_locked_by_me(d));
200
if ( version == atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
    /* We may consider caching the guest page mapping from the last
     * guest table walk.  However, since this check happens relatively
     * infrequently, paying the small cost of remapping the guest page
     * here is better than caching the mapping on every guest table walk.
     *
     * Also, when an inconsistency is found, simply return and trigger
     * another fault rather than re-validating the new path, to keep
     * the logic simple. */
213
perfc_incr(shadow_check_gwalk);
214
#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
215
#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
216
l4p = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable;
217
mismatch |= (gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4);
218
l3p = sh_map_domain_page(gw->l3mfn);
219
mismatch |= (gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3);
220
sh_unmap_domain_page(l3p);
222
mismatch |= (gw->l3e.l3 !=
223
v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3);
225
l2p = sh_map_domain_page(gw->l2mfn);
226
mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
227
sh_unmap_domain_page(l2p);
229
l2p = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable;
230
mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
232
if ( !(guest_supports_superpages(v) &&
233
(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
235
l1p = sh_map_domain_page(gw->l1mfn);
236
mismatch |= (gw->l1e.l1 != l1p[guest_l1_table_offset(va)].l1);
237
sh_unmap_domain_page(l1p);
/* Remove write access permissions from a gwalk_t in a batch, and
 * return an OR-ed set of flags: whether the TLBs need to be flushed,
 * and whether the guest tables need to be re-walked.
 *
 * Syncing pages will remove write access to that page; but it may
 * also give write access to other pages in the path.  If we resync any
 * pages, re-walk from the beginning.
 */
#define GW_RMWR_FLUSHTLB 1
#define GW_RMWR_REWALK   2
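
/* Illustrative sketch (not part of the original file): how a caller such
 * as the page-fault handler is expected to consume the flags returned by
 * gw_remove_write_accesses() below.  The TLB-flush call and the rewalk
 * label are placeholders; their exact form depends on the caller. */
#if 0 /* example only */
    rc = gw_remove_write_accesses(v, va, &gw);
    if ( rc & GW_RMWR_FLUSHTLB )
        /* Some other vcpu may still hold stale writeable mappings. */
        flush_tlb_mask(&v->domain->domain_dirty_cpumask);
    if ( rc & GW_RMWR_REWALK )
        goto rewalk;  /* resyncing may have changed the guest tables */
#endif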
254
static inline uint32_t
255
gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
259
#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
260
#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
261
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
262
if ( mfn_is_out_of_sync(gw->l3mfn) )
264
sh_resync(v, gw->l3mfn);
269
if ( sh_remove_write_access(v, gw->l3mfn, 3, va) )
270
rc = GW_RMWR_FLUSHTLB;
271
#endif /* GUEST_PAGING_LEVELS >= 4 */
273
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
274
if ( mfn_is_out_of_sync(gw->l2mfn) )
276
sh_resync(v, gw->l2mfn);
277
rc |= GW_RMWR_REWALK;
281
if ( sh_remove_write_access(v, gw->l2mfn, 2, va) )
282
rc |= GW_RMWR_FLUSHTLB;
283
#endif /* GUEST_PAGING_LEVELS >= 3 */
285
if ( !(guest_supports_superpages(v) &&
286
(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE))
287
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
288
&& !mfn_is_out_of_sync(gw->l1mfn)
290
&& sh_remove_write_access(v, gw->l1mfn, 1, va) )
291
rc |= GW_RMWR_FLUSHTLB;
296
#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
297
/* Lightweight audit: pass all the shadows associated with this guest walk
298
* through the audit mechanisms */
299
static void sh_audit_gw(struct vcpu *v, walk_t *gw)
303
if ( !(SHADOW_AUDIT_ENABLE) )
306
#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
307
if ( mfn_valid(gw->l4mfn)
308
&& mfn_valid((smfn = get_shadow_status(v, gw->l4mfn,
309
SH_type_l4_shadow))) )
310
(void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
311
if ( mfn_valid(gw->l3mfn)
312
&& mfn_valid((smfn = get_shadow_status(v, gw->l3mfn,
313
SH_type_l3_shadow))) )
314
(void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
315
#endif /* PAE or 64... */
316
if ( mfn_valid(gw->l2mfn) )
318
if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
319
SH_type_l2_shadow))) )
320
(void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
321
#if GUEST_PAGING_LEVELS == 3
322
if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn,
323
SH_type_l2h_shadow))) )
324
(void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
327
if ( mfn_valid(gw->l1mfn)
328
&& mfn_valid((smfn = get_shadow_status(v, gw->l1mfn,
329
SH_type_l1_shadow))) )
330
(void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
331
else if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PRESENT)
332
&& (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)
334
(smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(gw->l2e)))) )
335
(void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
339
#define sh_audit_gw(_v, _gw) do {} while(0)
340
#endif /* audit code */
343
#if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS)
345
sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
346
unsigned long *gl1mfn)
351
ASSERT(shadow_mode_translate(v->domain));
353
// XXX -- this is expensive, but it's easy to cobble together...
356
if ( sh_walk_guest_tables(v, addr, &gw, PFEC_page_present) == 0
357
&& mfn_valid(gw.l1mfn) )
360
*gl1mfn = mfn_x(gw.l1mfn);
361
pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
362
(guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
369
sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
373
ASSERT(shadow_mode_translate(v->domain));
375
// XXX -- this is expensive, but it's easy to cobble together...
378
(void) sh_walk_guest_tables(v, addr, &gw, PFEC_page_present);
379
*(guest_l1e_t *)eff_l1e = gw.l1e;
381
#endif /* CONFIG == GUEST (== SHADOW) */
383
/**************************************************************************/
384
/* Functions to compute the correct index into a shadow page, given an
385
* index into the guest page (as returned by guest_get_index()).
386
* This is trivial when the shadow and guest use the same sized PTEs, but
387
* gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
388
* PAE- or 64-bit shadows).
390
* These functions also increment the shadow mfn, when necessary. When PTE
391
* sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
392
* page. In this case, we allocate 2 contiguous pages for the shadow L1, and
393
* use simple pointer arithmetic on a pointer to the guest L1e to figure out
394
* which shadow page we really want. Similarly, when PTE sizes are
395
* mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
396
* way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
397
* space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
400
* For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
401
* of shadow (to store both the shadow, and the info that would normally be
402
* stored in page_info fields). This arrangement allows the shadow and the
403
* "page_info" fields to always be stored in the same page (in fact, in
404
* the same cache line), avoiding an extra call to map_domain_page().
408
guest_index(void *ptr)
410
return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
414
shadow_l1_index(mfn_t *smfn, u32 guest_index)
416
#if (GUEST_PAGING_LEVELS == 2)
417
*smfn = _mfn(mfn_x(*smfn) +
418
(guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
419
return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
426
shadow_l2_index(mfn_t *smfn, u32 guest_index)
428
#if (GUEST_PAGING_LEVELS == 2)
429
// Because we use 2 shadow l2 entries for each guest entry, the number of
430
// guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
432
*smfn = _mfn(mfn_x(*smfn) +
433
(guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
435
// We multiply by two to get the index of the first of the two entries
436
// used to shadow the specified guest entry.
437
return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
443
#if GUEST_PAGING_LEVELS >= 4
446
shadow_l3_index(mfn_t *smfn, u32 guest_index)
452
shadow_l4_index(mfn_t *smfn, u32 guest_index)
457
#endif // GUEST_PAGING_LEVELS >= 4
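
/* Illustrative sketch (not part of the original file): worked examples of
 * the 2-on-3 index arithmetic above, assuming the usual sizes of 1024
 * guest L1 entries per page and SHADOW_L1_PAGETABLE_ENTRIES == 512. */
#if 0 /* example only */
    /* Guest l1 index 700 lives in the *second* page of the two-page shadow
     * l1: shadow_l1_index() bumps the mfn by 700 / 512 = 1 and returns
     * slot 700 % 512 = 188. */
    u32 sl1_slot = shadow_l1_index(&sl1mfn, 700);   /* 188, sl1mfn += 1 */

    /* Each guest l2 entry takes two shadow l2 entries, so guest l2 index
     * 300 lands on shadow page 300 / 256 = 1 at slot (300 % 256) * 2 = 88. */
    u32 sl2_slot = shadow_l2_index(&sl2mfn, 300);   /* 88, sl2mfn += 1 */
#endif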
460
/**************************************************************************/
461
/* Function which computes shadow entries from their corresponding guest
462
* entries. This is the "heart" of the shadow code. It operates using
463
* level-1 shadow types, but handles all levels of entry.
464
* Don't call it directly, but use the four wrappers below.
467
static always_inline void
468
_sh_propagate(struct vcpu *v,
469
guest_intpte_t guest_intpte,
471
void *shadow_entry_ptr,
476
guest_l1e_t guest_entry = { guest_intpte };
477
shadow_l1e_t *sp = shadow_entry_ptr;
478
struct domain *d = v->domain;
479
struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
480
gfn_t target_gfn = guest_l1e_get_gfn(guest_entry);
484
/* We don't shadow PAE l3s */
485
ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
487
/* Check there's something for the shadows to map to */
488
if ( !p2m_is_valid(p2mt) && !p2m_is_grant(p2mt) )
490
*sp = shadow_l1e_empty();
494
gflags = guest_l1e_get_flags(guest_entry);
496
if ( unlikely(!(gflags & _PAGE_PRESENT)) )
498
#if !(SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
499
/* If a guest l1 entry is not present, shadow with the magic
500
* guest-not-present entry. */
505
*sp = shadow_l1e_empty();
509
if ( level == 1 && p2mt == p2m_mmio_dm )
511
/* Guest l1e maps emulated MMIO space */
512
*sp = sh_l1e_mmio(target_gfn, gflags);
513
if ( !d->arch.paging.shadow.has_fast_mmio_entries )
514
d->arch.paging.shadow.has_fast_mmio_entries = 1;
518
// Must have a valid target_mfn unless this is a prefetch or an l1
519
// pointing at MMIO space. In the case of a prefetch, an invalid
520
// mfn means that we can not usefully shadow anything, and so we
523
if ( !mfn_valid(target_mfn)
524
&& !(level == 1 && (!shadow_mode_refcounts(d)
525
|| p2mt == p2m_mmio_direct)) )
527
ASSERT((ft == ft_prefetch));
528
*sp = shadow_l1e_empty();
532
// Propagate bits from the guest to the shadow.
533
// Some of these may be overwritten, below.
534
// Since we know the guest's PRESENT bit is set, we also set the shadow's
535
// SHADOW_PRESENT bit.
537
pass_thru_flags = (_PAGE_ACCESSED | _PAGE_USER |
538
_PAGE_RW | _PAGE_PRESENT);
539
if ( guest_supports_nx(v) )
540
pass_thru_flags |= _PAGE_NX_BIT;
541
if ( !shadow_mode_refcounts(d) && !mfn_valid(target_mfn) )
542
pass_thru_flags |= _PAGE_PAT | _PAGE_PCD | _PAGE_PWT;
543
sflags = gflags & pass_thru_flags;
546
* For HVM domains with direct access to MMIO areas, set the correct
547
* caching attributes in the shadows to match what was asked for.
549
if ( (level == 1) && is_hvm_domain(d) && has_arch_pdevs(d) &&
550
!is_xen_heap_mfn(mfn_x(target_mfn)) )
554
/* compute the PAT index for shadow page entry when VT-d is enabled
555
* and device assigned.
556
* 1) direct MMIO: compute the PAT index with gMTRR=UC and gPAT.
557
* 2) if enables snoop control, compute the PAT index as WB.
558
* 3) if disables snoop control, compute the PAT index with
561
if ( hvm_get_mem_pinned_cacheattr(d, gfn_x(target_gfn), &type) )
562
sflags |= pat_type_2_pte_flags(type);
563
else if ( d->arch.hvm_domain.is_in_uc_mode )
564
sflags |= pat_type_2_pte_flags(PAT_TYPE_UNCACHABLE);
565
else if ( p2mt == p2m_mmio_direct )
566
sflags |= get_pat_flags(v,
568
gfn_to_paddr(target_gfn),
569
((paddr_t)mfn_x(target_mfn)) << PAGE_SHIFT,
570
MTRR_TYPE_UNCACHABLE);
571
else if ( iommu_snoop )
572
sflags |= pat_type_2_pte_flags(PAT_TYPE_WRBACK);
574
sflags |= get_pat_flags(v,
576
gfn_to_paddr(target_gfn),
577
((paddr_t)mfn_x(target_mfn)) << PAGE_SHIFT,
578
NO_HARDCODE_MEM_TYPE);
581
// Set the A&D bits for higher level shadows.
582
// Higher level entries do not, strictly speaking, have dirty bits, but
583
// since we use shadow linear tables, each of these entries may, at some
584
// point in time, also serve as a shadow L1 entry.
585
// By setting both the A&D bits in each of these, we eliminate the burden
586
// on the hardware to update these bits on initial accesses.
588
if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
589
sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
591
// If the A or D bit has not yet been set in the guest, then we must
592
// prevent the corresponding kind of access.
594
if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
595
sflags &= ~_PAGE_PRESENT;
597
/* D bits exist in L1es and PSE L2es */
    if ( unlikely(((level == 1) ||
                   ((level == 2) &&
                    (gflags & _PAGE_PSE) &&
                    guest_supports_superpages(v)))
                  && !(gflags & _PAGE_DIRTY)) )
        sflags &= ~_PAGE_RW;
605
// shadow_mode_log_dirty support
607
// Only allow the guest write access to a page a) on a demand fault,
608
// or b) if the page is already marked as dirty.
610
// (We handle log-dirty entirely inside the shadow code, without using the
611
// p2m_ram_logdirty p2m type: only HAP uses that.)
612
if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
614
if ( mfn_valid(target_mfn) ) {
615
if ( ft & FETCH_TYPE_WRITE )
616
paging_mark_dirty(d, mfn_x(target_mfn));
617
else if ( !sh_mfn_is_dirty(d, target_mfn) )
622
if ( unlikely((level == 1) && dirty_vram
623
&& dirty_vram->last_dirty == -1
624
&& gfn_x(target_gfn) >= dirty_vram->begin_pfn
625
&& gfn_x(target_gfn) < dirty_vram->end_pfn) )
627
if ( ft & FETCH_TYPE_WRITE )
628
dirty_vram->last_dirty = NOW();
633
/* Read-only memory */
634
if ( p2m_is_readonly(p2mt) )
637
// protect guest page tables
639
if ( unlikely((level == 1)
640
&& sh_mfn_is_a_page_table(target_mfn)
641
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
642
/* Unless the page is out of sync and the guest is
644
&& !(mfn_oos_may_write(target_mfn)
645
&& (ft == ft_demand_write))
649
if ( shadow_mode_trap_reads(d) )
651
// if we are trapping both reads & writes, then mark this page
654
sflags &= ~_PAGE_PRESENT;
658
// otherwise, just prevent any writes...
664
// PV guests in 64-bit mode use two different page tables for user vs
665
// supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
666
// It is always shadowed as present...
667
if ( (GUEST_PAGING_LEVELS == 4) && !is_pv_32on64_domain(d)
668
&& !is_hvm_domain(d) )
670
sflags |= _PAGE_USER;
673
*sp = shadow_l1e_from_mfn(target_mfn, sflags);
676
SHADOW_DEBUG(PROPAGATE,
677
"%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
678
fetch_type_names[ft], level, guest_entry.l1, sp->l1);
682
/* These four wrappers give us a little bit of type-safety back around
683
* the use of void-* pointers and intpte types in _sh_propagate(), and
684
* allow the compiler to optimize out some level checks. */
686
#if GUEST_PAGING_LEVELS >= 4
688
l4e_propagate_from_guest(struct vcpu *v,
694
_sh_propagate(v, gl4e.l4, sl3mfn, sl4e, 4, ft, p2m_ram_rw);
698
l3e_propagate_from_guest(struct vcpu *v,
704
_sh_propagate(v, gl3e.l3, sl2mfn, sl3e, 3, ft, p2m_ram_rw);
706
#endif // GUEST_PAGING_LEVELS >= 4
709
l2e_propagate_from_guest(struct vcpu *v,
715
_sh_propagate(v, gl2e.l2, sl1mfn, sl2e, 2, ft, p2m_ram_rw);
719
l1e_propagate_from_guest(struct vcpu *v,
726
_sh_propagate(v, gl1e.l1, gmfn, sl1e, 1, ft, p2mt);
730
/**************************************************************************/
731
/* These functions update shadow entries (and do bookkeeping on the shadow
732
* tables they are in). It is intended that they are the only
733
* functions which ever write (non-zero) data onto a shadow page.
static inline void safe_write_entry(void *dst, void *src)
/* Copy one PTE safely when processors might be running on the
 * destination pagetable.   This does *not* give safety against
 * concurrent writes (that's what the shadow lock is for), just
 * stops the hardware picking up partially written entries. */
{
    volatile unsigned long *d = dst;
    unsigned long *s = src;
    ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
#if CONFIG_PAGING_LEVELS == 3
    /* In PAE mode, pagetable entries are larger
     * than machine words, so won't get written atomically.  We need to make
     * sure any other cpu running on these shadows doesn't see a
     * half-written entry.  Do this by marking the entry not-present first,
     * then writing the high word before the low word. */
    BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
    d[0] = 0;
    d[1] = s[1];
    d[0] = s[0];
#else
    /* In 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
     * which will be an atomic write, since the entry is aligned. */
    BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
    *d = *s;
#endif
}
765
shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
766
/* This function does the actual writes to shadow pages.
767
* It must not be called directly, since it doesn't do the bookkeeping
768
* that shadow_set_l*e() functions do. */
770
shadow_l1e_t *dst = d;
771
shadow_l1e_t *src = s;
775
/* Because we mirror access rights at all levels in the shadow, an
776
* l2 (or higher) entry with the RW bit cleared will leave us with
777
* no write access through the linear map.
778
* We detect that by writing to the shadow with copy_to_user() and
779
* using map_domain_page() to get a writeable mapping if we need to. */
780
if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
782
perfc_incr(shadow_linear_map_failed);
783
map = sh_map_domain_page(mfn);
785
dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
789
for ( i = 0; i < entries; i++ )
790
safe_write_entry(dst++, src++);
792
if ( map != NULL ) sh_unmap_domain_page(map);
static inline int
perms_strictly_increased(u32 old_flags, u32 new_flags)
/* Given the flags of two entries, are the new flags a strict
 * increase in rights over the old ones? */
{
    u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
    u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
    /* Flip the NX bit, since it's the only one that decreases rights;
     * we calculate as if it were an "X" bit. */
    of ^= _PAGE_NX_BIT;
    nf ^= _PAGE_NX_BIT;
    /* If the changed bits are all set in the new flags, then rights strictly
     * increased between old and new. */
    return ((of | (of ^ nf)) == nf);
}
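
/* Illustrative sketch (not part of the original file): two worked cases
 * for perms_strictly_increased(), assuming NX is enabled so that
 * _PAGE_NX is non-zero. */
#if 0 /* example only */
    /* Adding RW without removing anything is a strict increase. */
    ASSERT(perms_strictly_increased(_PAGE_PRESENT,
                                    _PAGE_PRESENT | _PAGE_RW));
    /* Adding NX *removes* the execute right (the bit is flipped and
     * treated as an "X" bit above), so this is not a strict increase. */
    ASSERT(!perms_strictly_increased(_PAGE_PRESENT,
                                     _PAGE_PRESENT | _PAGE_NX));
#endif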
811
/* type is only used to distinguish grant map pages from ordinary RAM
812
* i.e. non-p2m_is_grant() pages are treated as p2m_ram_rw. */
814
shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d, p2m_type_t type)
818
struct domain *owner;
820
ASSERT(!sh_l1e_is_magic(sl1e));
822
if ( !shadow_mode_refcounts(d) )
825
res = get_page_from_l1e(sl1e, d, d);
827
// If a privileged domain is attempting to install a map of a page it does
828
// not own, we let it succeed anyway.
830
if ( unlikely(!res) &&
831
!shadow_mode_translate(d) &&
832
mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
833
(owner = page_get_owner(mfn_to_page(mfn))) &&
835
IS_PRIV_FOR(d, owner))
837
res = get_page_from_l1e(sl1e, d, owner);
838
SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
839
"which is owned by domain %d: %s\n",
840
d->domain_id, mfn_x(mfn), owner->domain_id,
841
res ? "success" : "failed");
844
/* Okay, it might still be a grant mapping PTE. Try it. */
845
if ( unlikely(!res) &&
846
(type == p2m_grant_map_rw ||
847
(type == p2m_grant_map_ro &&
848
!(shadow_l1e_get_flags(sl1e) & _PAGE_RW))) )
850
/* It's a grant mapping. The grant table implementation will
851
already have checked that we're supposed to have access, so
852
we can just grab a reference directly. */
853
mfn = shadow_l1e_get_mfn(sl1e);
854
if ( mfn_valid(mfn) )
855
res = get_page_from_l1e(sl1e, d, page_get_owner(mfn_to_page(mfn)));
858
if ( unlikely(!res) )
860
perfc_incr(shadow_get_page_fail);
861
SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n");
868
shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
870
if ( !shadow_mode_refcounts(d) )
873
put_page_from_l1e(sl1e, d);
876
#if GUEST_PAGING_LEVELS >= 4
877
static int shadow_set_l4e(struct vcpu *v,
879
shadow_l4e_t new_sl4e,
883
shadow_l4e_t old_sl4e;
885
ASSERT(sl4e != NULL);
888
if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
890
paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
891
| (((unsigned long)sl4e) & ~PAGE_MASK));
893
if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
895
/* About to install a new reference */
896
mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
897
ok = sh_get_ref(v, sl3mfn, paddr);
898
/* Are we pinning l3 shadows to handle weird Linux behaviour? */
899
if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
900
ok |= sh_pin(v, sl3mfn);
903
domain_crash(v->domain);
904
return SHADOW_SET_ERROR;
908
/* Write the new entry */
909
shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
910
flags |= SHADOW_SET_CHANGED;
912
if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
914
/* We lost a reference to an old mfn. */
915
mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
916
if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
917
|| !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
918
shadow_l4e_get_flags(new_sl4e)) )
920
flags |= SHADOW_SET_FLUSH;
922
sh_put_ref(v, osl3mfn, paddr);
927
static int shadow_set_l3e(struct vcpu *v,
929
shadow_l3e_t new_sl3e,
933
shadow_l3e_t old_sl3e;
935
ASSERT(sl3e != NULL);
938
if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
940
paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
941
| (((unsigned long)sl3e) & ~PAGE_MASK));
943
if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
945
/* About to install a new reference */
946
if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
948
domain_crash(v->domain);
949
return SHADOW_SET_ERROR;
953
/* Write the new entry */
954
shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
955
flags |= SHADOW_SET_CHANGED;
957
if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
959
/* We lost a reference to an old mfn. */
960
mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
961
if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
962
!perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
963
shadow_l3e_get_flags(new_sl3e)) )
965
flags |= SHADOW_SET_FLUSH;
967
sh_put_ref(v, osl2mfn, paddr);
971
#endif /* GUEST_PAGING_LEVELS >= 4 */
973
static int shadow_set_l2e(struct vcpu *v,
975
shadow_l2e_t new_sl2e,
979
shadow_l2e_t old_sl2e;
982
#if GUEST_PAGING_LEVELS == 2
983
/* In 2-on-3 we work with pairs of l2es pointing at two-page
984
* shadows. Reference counting and up-pointers track from the first
985
* page of the shadow to the first l2e, so make sure that we're
986
* working with those:
987
* Align the pointer down so it's pointing at the first of the pair */
988
sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
989
/* Align the mfn of the shadow entry too */
990
new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
993
ASSERT(sl2e != NULL);
996
if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
998
paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
999
| (((unsigned long)sl2e) & ~PAGE_MASK));
1001
if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
1003
mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e);
1005
/* About to install a new reference */
1006
if ( !sh_get_ref(v, sl1mfn, paddr) )
1008
domain_crash(v->domain);
1009
return SHADOW_SET_ERROR;
1011
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
1013
struct page_info *sp = mfn_to_page(sl1mfn);
1014
mfn_t gl1mfn = backpointer(sp);
1016
/* If the shadow is a fl1 then the backpointer contains
1017
the GFN instead of the GMFN, and it's definitely not
1019
if ( (sp->u.sh.type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
1020
&& mfn_is_out_of_sync(gl1mfn) )
1021
sh_resync(v, gl1mfn);
1026
/* Write the new entry */
1027
#if GUEST_PAGING_LEVELS == 2
1029
shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
1030
        /* The l1 shadow is two pages long and needs to be pointed to by
         * two adjacent sl2es.  The pair have the same flags, but point
         * at the even and odd MFNs respectively. */
1033
ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
1034
pair[1].l2 |= (1<<PAGE_SHIFT);
1035
shadow_write_entries(sl2e, &pair, 2, sl2mfn);
1037
#else /* normal case */
1038
shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
1040
flags |= SHADOW_SET_CHANGED;
1042
if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
1044
/* We lost a reference to an old mfn. */
1045
mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
1046
if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
1047
!perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
1048
shadow_l2e_get_flags(new_sl2e)) )
1050
flags |= SHADOW_SET_FLUSH;
1052
sh_put_ref(v, osl1mfn, paddr);
1057
static inline void shadow_vram_get_l1e(shadow_l1e_t new_sl1e,
1062
mfn_t mfn = shadow_l1e_get_mfn(new_sl1e);
1063
int flags = shadow_l1e_get_flags(new_sl1e);
1065
struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
1067
if ( !dirty_vram /* tracking disabled? */
1068
|| !(flags & _PAGE_RW) /* read-only mapping? */
1069
|| !mfn_valid(mfn) ) /* mfn can be invalid in mmio_direct */
1072
gfn = mfn_to_gfn(d, mfn);
1073
/* Page sharing not supported on shadow PTs */
1074
BUG_ON(SHARED_M2P(gfn));
1076
if ( (gfn >= dirty_vram->begin_pfn) && (gfn < dirty_vram->end_pfn) )
1078
unsigned long i = gfn - dirty_vram->begin_pfn;
1079
struct page_info *page = mfn_to_page(mfn);
1081
if ( (page->u.inuse.type_info & PGT_count_mask) == 1 )
1082
/* Initial guest reference, record it */
1083
dirty_vram->sl1ma[i] = pfn_to_paddr(mfn_x(sl1mfn))
1084
| ((unsigned long)sl1e & ~PAGE_MASK);
1088
static inline void shadow_vram_put_l1e(shadow_l1e_t old_sl1e,
1093
mfn_t mfn = shadow_l1e_get_mfn(old_sl1e);
1094
int flags = shadow_l1e_get_flags(old_sl1e);
1096
struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
1098
if ( !dirty_vram /* tracking disabled? */
1099
|| !(flags & _PAGE_RW) /* read-only mapping? */
1100
|| !mfn_valid(mfn) ) /* mfn can be invalid in mmio_direct */
1103
gfn = mfn_to_gfn(d, mfn);
1104
/* Page sharing not supported on shadow PTs */
1105
BUG_ON(SHARED_M2P(gfn));
1107
if ( (gfn >= dirty_vram->begin_pfn) && (gfn < dirty_vram->end_pfn) )
1109
unsigned long i = gfn - dirty_vram->begin_pfn;
1110
struct page_info *page = mfn_to_page(mfn);
1112
paddr_t sl1ma = pfn_to_paddr(mfn_x(sl1mfn))
1113
| ((unsigned long)sl1e & ~PAGE_MASK);
1115
if ( (page->u.inuse.type_info & PGT_count_mask) == 1 )
1117
/* Last reference */
1118
if ( dirty_vram->sl1ma[i] == INVALID_PADDR ) {
1119
/* We didn't know it was that one, let's say it is dirty */
1124
ASSERT(dirty_vram->sl1ma[i] == sl1ma);
1125
dirty_vram->sl1ma[i] = INVALID_PADDR;
1126
if ( flags & _PAGE_DIRTY )
1132
/* We had more than one reference, just consider the page dirty. */
1134
/* Check that it's not the one we recorded. */
1135
if ( dirty_vram->sl1ma[i] == sl1ma )
1137
/* Too bad, we remembered the wrong one... */
1138
dirty_vram->sl1ma[i] = INVALID_PADDR;
1142
/* Ok, our recorded sl1e is still pointing to this page, let's
1143
* just hope it will remain. */
1148
dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8);
1149
dirty_vram->last_dirty = NOW();
1154
static int shadow_set_l1e(struct vcpu *v,
1156
shadow_l1e_t new_sl1e,
1157
p2m_type_t new_type,
1161
struct domain *d = v->domain;
1162
shadow_l1e_t old_sl1e;
1163
#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
1164
mfn_t new_gmfn = shadow_l1e_get_mfn(new_sl1e);
1166
ASSERT(sl1e != NULL);
1168
#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
1169
if ( mfn_valid(new_gmfn) && mfn_oos_may_write(new_gmfn)
1170
&& ((shadow_l1e_get_flags(new_sl1e) & (_PAGE_RW|_PAGE_PRESENT))
1171
== (_PAGE_RW|_PAGE_PRESENT)) )
1172
oos_fixup_add(v, new_gmfn, sl1mfn, pgentry_ptr_to_slot(sl1e));
1177
if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
1179
if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
1180
&& !sh_l1e_is_magic(new_sl1e) )
1182
/* About to install a new reference */
1183
if ( shadow_mode_refcounts(d) ) {
1184
TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_GET_REF);
1185
if ( shadow_get_page_from_l1e(new_sl1e, d, new_type) == 0 )
1187
/* Doesn't look like a pagetable. */
1188
flags |= SHADOW_SET_ERROR;
1189
new_sl1e = shadow_l1e_empty();
1193
shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d);
1198
/* Write the new entry */
1199
shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
1200
flags |= SHADOW_SET_CHANGED;
1202
if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
1203
&& !sh_l1e_is_magic(old_sl1e) )
1205
/* We lost a reference to an old mfn. */
1206
/* N.B. Unlike higher-level sets, never need an extra flush
1207
* when writing an l1e. Because it points to the same guest frame
1208
* as the guest l1e did, it's the guest's responsibility to
1209
* trigger a flush later. */
1210
if ( shadow_mode_refcounts(d) )
1212
shadow_vram_put_l1e(old_sl1e, sl1e, sl1mfn, d);
1213
shadow_put_page_from_l1e(old_sl1e, d);
1214
TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_PUT_REF);
1221
/**************************************************************************/
/* Macros to walk pagetables.  These take the shadow of a pagetable and
 * walk every "interesting" entry.  That is, they don't touch Xen mappings,
 * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
 * second entry (since pairs of entries are managed together).  For multi-page
 * shadows they walk all pages.
 *
 * Arguments are an MFN, the variable to point to each entry, a variable
 * to indicate that we are done (we will shortcut to the end of the scan
 * when _done != 0), a variable to indicate that we should avoid Xen mappings,
 * and some "body" code to execute for each entry (see the sketch below).
 *
 * WARNING: These macros have side-effects.  They change the values of both
 * the pointer and the MFN. */
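
/* Illustrative sketch (not part of the original file): typical use of the
 * FOREACH macros defined below -- walk one l1 shadow and count its present
 * entries.  Callers that don't track the matching guest entry pass a null
 * guest pointer, as several users in this file do; the "body" code is only
 * run for entries with _PAGE_PRESENT set. */
#if 0 /* example only */
static int example_count_present_sl1es(mfn_t sl1mfn)
{
    shadow_l1e_t *sl1e;
    int done = 0, count = 0;
    SHADOW_FOREACH_L1E(sl1mfn, sl1e, NULL, done, {
        count++;
    });
    return count;
}
#endif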
1236
static inline void increment_ptr_to_guest_entry(void *ptr)
1240
guest_l1e_t **entry = ptr;
1245
/* All kinds of l1: touch all entries */
1246
#define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1249
shadow_l1e_t *_sp = sh_map_domain_page((_sl1mfn)); \
1250
ASSERT(mfn_to_page(_sl1mfn)->u.sh.type == SH_type_l1_shadow \
1251
|| mfn_to_page(_sl1mfn)->u.sh.type == SH_type_fl1_shadow);\
1252
for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
1254
(_sl1e) = _sp + _i; \
1255
if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
1257
if ( _done ) break; \
1258
increment_ptr_to_guest_entry(_gl1p); \
1260
sh_unmap_domain_page(_sp); \
1263
/* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
1264
#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
1265
#define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1268
_SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1269
({ (__done = _done); }), _code); \
1270
_sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
1272
_SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
1273
({ (__done = _done); }), _code); \
1275
#else /* Everything else; l1 shadows are only one page */
1276
#define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
1277
_SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
1281
#if GUEST_PAGING_LEVELS == 2
1283
/* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
1284
#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1286
int _i, _j, __done = 0; \
1287
int _xen = !shadow_mode_external(_dom); \
1288
ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_32_shadow);\
1289
for ( _j = 0; _j < 4 && !__done; _j++ ) \
1291
shadow_l2e_t *_sp = sh_map_domain_page(_sl2mfn); \
1292
for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
1294
|| ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
1295
< (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
1297
(_sl2e) = _sp + _i; \
1298
if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1300
if ( (__done = (_done)) ) break; \
1301
increment_ptr_to_guest_entry(_gl2p); \
1303
sh_unmap_domain_page(_sp); \
1304
_sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
1308
#elif GUEST_PAGING_LEVELS == 3
1310
/* PAE: if it's an l2h, don't touch Xen mappings */
1311
#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1314
int _xen = !shadow_mode_external(_dom); \
1315
shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
1316
ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_pae_shadow \
1317
|| mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2h_pae_shadow);\
1318
for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1320
|| mfn_to_page(_sl2mfn)->u.sh.type != SH_type_l2h_pae_shadow\
1321
|| ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
1322
< (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
1324
(_sl2e) = _sp + _i; \
1325
if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1327
if ( _done ) break; \
1328
increment_ptr_to_guest_entry(_gl2p); \
1330
sh_unmap_domain_page(_sp); \
1335
/* 64-bit l2: touch all entries except for PAE compat guests. */
1336
#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \
1339
int _xen = !shadow_mode_external(_dom); \
1340
shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \
1341
ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_64_shadow ||\
1342
mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2h_64_shadow);\
1343
for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
1346
|| !is_pv_32on64_domain(_dom) \
1347
|| mfn_to_page(_sl2mfn)->u.sh.type != SH_type_l2h_64_shadow\
1348
|| (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) ) \
1350
(_sl2e) = _sp + _i; \
1351
if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
1353
if ( _done ) break; \
1354
increment_ptr_to_guest_entry(_gl2p); \
1357
sh_unmap_domain_page(_sp); \
1360
#endif /* different kinds of l2 */
1362
#if GUEST_PAGING_LEVELS == 4
1364
/* 64-bit l3: touch all entries */
1365
#define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
1368
shadow_l3e_t *_sp = sh_map_domain_page((_sl3mfn)); \
1369
ASSERT(mfn_to_page(_sl3mfn)->u.sh.type == SH_type_l3_64_shadow);\
1370
for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
1372
(_sl3e) = _sp + _i; \
1373
if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
1375
if ( _done ) break; \
1376
increment_ptr_to_guest_entry(_gl3p); \
1378
sh_unmap_domain_page(_sp); \
1381
/* 64-bit l4: avoid Xen mappings */
1382
#define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \
1384
shadow_l4e_t *_sp = sh_map_domain_page((_sl4mfn)); \
1385
int _xen = !shadow_mode_external(_dom); \
1387
ASSERT(mfn_to_page(_sl4mfn)->u.sh.type == SH_type_l4_64_shadow);\
1388
for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
1390
if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) ) \
1392
(_sl4e) = _sp + _i; \
1393
if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
1395
if ( _done ) break; \
1397
increment_ptr_to_guest_entry(_gl4p); \
1399
sh_unmap_domain_page(_sp); \
1406
/**************************************************************************/
1407
/* Functions to install Xen mappings and linear mappings in shadow pages */
1409
// XXX -- this function should probably be moved to shadow-common.c, but that
1410
// probably wants to wait until the shadow types have been moved from
1411
// shadow-types.h to shadow-private.h
1413
#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1414
void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
1416
struct domain *d = v->domain;
1419
sl4e = sh_map_domain_page(sl4mfn);
1420
ASSERT(sl4e != NULL);
1421
ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
1423
/* Copy the common Xen mappings from the idle domain */
1424
memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1425
&idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
1426
ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
1428
/* Install the per-domain mappings for this domain */
1429
sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
1430
shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
1433
/* Shadow linear mapping for 4-level shadows. N.B. for 3-level
1434
* shadows on 64-bit xen, this linear mapping is later replaced by the
1435
* monitor pagetable structure, which is built in make_monitor_table
1436
* and maintained by sh_update_linear_entries. */
1437
sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1438
shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
1440
/* Self linear mapping. */
1441
if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
1443
// linear tables may not be used with translated PV guests
1444
sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1449
sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
1450
shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
1453
if ( shadow_mode_translate(v->domain) )
1455
/* install domain-specific P2M table */
1456
sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
1457
shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
1461
sh_unmap_domain_page(sl4e);
1465
#if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1466
// For 3-on-3 PV guests, we need to make sure the xen mappings are in
1467
// place, which means that we need to populate the l2h entry in the l3
1470
static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn)
1472
struct domain *d = v->domain;
1474
#if CONFIG_PAGING_LEVELS == 3
1478
if ( !is_pv_32on64_vcpu(v) )
1482
sl2e = sh_map_domain_page(sl2hmfn);
1483
ASSERT(sl2e != NULL);
1484
ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
1486
#if CONFIG_PAGING_LEVELS == 3
1488
/* Copy the common Xen mappings from the idle domain */
1489
memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
1490
&idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
1491
L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
1493
/* Install the per-domain mappings for this domain */
1494
for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
1495
sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
1496
shadow_l2e_from_mfn(
1497
page_to_mfn(perdomain_pt_page(d, i)),
1500
/* We don't set up a linear mapping here because we can't until this
1501
* l2h is installed in an l3e. sh_update_linear_entries() handles
1502
* the linear mappings when CR3 (and so the fourth l3e) is loaded.
1503
* We zero them here, just as a safety measure.
1505
for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1506
sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1508
for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
1509
sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
1512
if ( shadow_mode_translate(d) )
1514
/* Install the domain-specific p2m table */
1516
ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
1517
p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
1518
for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
1520
sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
1521
(l3e_get_flags(p2m[i]) & _PAGE_PRESENT)
1522
? shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
1524
: shadow_l2e_empty();
1526
sh_unmap_domain_page(p2m);
1531
/* Copy the common Xen mappings from the idle domain */
1533
&sl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1534
&compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1535
COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e));
1539
sh_unmap_domain_page(sl2e);
1547
/**************************************************************************/
1548
/* Create a shadow of a given guest page.
1551
sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
1553
mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
1554
SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
1555
mfn_x(gmfn), shadow_type, mfn_x(smfn));
1557
if ( shadow_type != SH_type_l2_32_shadow
1558
&& shadow_type != SH_type_l2_pae_shadow
1559
&& shadow_type != SH_type_l2h_pae_shadow
1560
&& shadow_type != SH_type_l4_64_shadow )
1561
/* Lower-level shadow, not yet linked from a higher level */
1562
mfn_to_page(smfn)->up = 0;
1564
#if GUEST_PAGING_LEVELS == 4
1565
#if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
1566
if ( shadow_type == SH_type_l4_64_shadow &&
1567
unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
1569
/* We're shadowing a new l4, but we've been assuming the guest uses
1570
* only one l4 per vcpu and context switches using an l4 entry.
1571
* Count the number of active l4 shadows. If there are enough
1572
* of them, decide that this isn't an old linux guest, and stop
1573
* pinning l3es. This is not very quick but it doesn't happen
1575
struct page_info *sp, *t;
1577
int l4count = 0, vcpus = 0;
1578
page_list_for_each(sp, &v->domain->arch.paging.shadow.pinned_shadows)
1580
if ( sp->u.sh.type == SH_type_l4_64_shadow )
1583
for_each_vcpu ( v->domain, v2 )
1585
if ( l4count > 2 * vcpus )
1587
/* Unpin all the pinned l3 tables, and don't pin any more. */
1588
page_list_for_each_safe(sp, t, &v->domain->arch.paging.shadow.pinned_shadows)
1590
if ( sp->u.sh.type == SH_type_l3_64_shadow )
1591
sh_unpin(v, page_to_mfn(sp));
1593
v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
1599
// Create the Xen mappings...
1600
if ( !shadow_mode_external(v->domain) )
1602
switch (shadow_type)
1604
#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
1605
case SH_type_l4_shadow:
1606
sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
1608
#if CONFIG_PAGING_LEVELS >= 3 && GUEST_PAGING_LEVELS >= 3
1609
case SH_type_l2h_shadow:
1610
sh_install_xen_entries_in_l2h(v, smfn); break;
1612
default: /* Do nothing */ break;
1616
shadow_promote(v, gmfn, shadow_type);
1617
set_shadow_status(v, gmfn, shadow_type, smfn);
1622
/* Make a splintered superpage shadow */
1624
make_fl1_shadow(struct vcpu *v, gfn_t gfn)
1626
mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow,
1627
(unsigned long) gfn_x(gfn));
1629
SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n",
1630
gfn_x(gfn), mfn_x(smfn));
1632
set_fl1_shadow_status(v, gfn, smfn);
1637
#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
1639
sh_make_monitor_table(struct vcpu *v)
1641
struct domain *d = v->domain;
1643
ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
1645
/* Guarantee we can get the memory we need */
1646
shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS);
1648
#if CONFIG_PAGING_LEVELS == 4
1651
m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1652
sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
1653
/* Remember the level of this table */
1654
mfn_to_page(m4mfn)->shadow_flags = 4;
1655
#if SHADOW_PAGING_LEVELS < 4
1660
/* Install an l3 table and an l2 table that will hold the shadow
1661
* linear map entries. This overrides the linear map entry that
1662
* was installed by sh_install_xen_entries_in_l4. */
1663
l4e = sh_map_domain_page(m4mfn);
1665
m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1666
mfn_to_page(m3mfn)->shadow_flags = 3;
1667
l4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)]
1668
= l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1670
m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1671
mfn_to_page(m2mfn)->shadow_flags = 2;
1672
l3e = sh_map_domain_page(m3mfn);
1673
l3e[0] = l3e_from_pfn(mfn_x(m2mfn), __PAGE_HYPERVISOR);
1674
sh_unmap_domain_page(l3e);
1676
if ( is_pv_32on64_vcpu(v) )
1678
/* For 32-on-64 PV guests, we need to map the 32-bit Xen
1679
* area into its usual VAs in the monitor tables */
1680
m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1681
mfn_to_page(m3mfn)->shadow_flags = 3;
1682
l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
1684
m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1685
mfn_to_page(m2mfn)->shadow_flags = 2;
1686
l3e = sh_map_domain_page(m3mfn);
1687
l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1688
sh_install_xen_entries_in_l2h(v, m2mfn);
1689
sh_unmap_domain_page(l3e);
1692
sh_unmap_domain_page(l4e);
1694
#endif /* SHADOW_PAGING_LEVELS < 4 */
1698
#elif CONFIG_PAGING_LEVELS == 3
1706
m3mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1707
/* Remember the level of this table */
1708
mfn_to_page(m3mfn)->shadow_flags = 3;
1710
// Install a monitor l2 table in slot 3 of the l3 table.
1711
// This is used for all Xen entries, including linear maps
1712
m2mfn = shadow_alloc(d, SH_type_monitor_table, 0);
1713
mfn_to_page(m2mfn)->shadow_flags = 2;
1714
l3e = sh_map_domain_page(m3mfn);
1715
l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
1716
sh_install_xen_entries_in_l2h(v, m2mfn);
1717
/* Install the monitor's own linear map */
1718
l2e = sh_map_domain_page(m2mfn);
1719
for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
1720
l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
1721
(l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
1722
? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
1724
sh_unmap_domain_page(l2e);
1725
sh_unmap_domain_page(l3e);
1727
SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
1732
#error this should not happen
1733
#endif /* CONFIG_PAGING_LEVELS */
1735
#endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
1737
/**************************************************************************/
1738
/* These functions also take a virtual address and return the level-N
1739
* shadow table mfn and entry, but they create the shadow pagetables if
1740
* they are needed. The "demand" argument is non-zero when handling
1741
* a demand fault (so we know what to do about accessed bits &c).
1742
* If the necessary tables are not present in the guest, they return NULL. */
1744
/* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
1745
* more levels than the guest, the upper levels are always fixed and do not
1746
* reflect any information from the guest, so we do not use these functions
1747
* to access them. */
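
/* Illustrative sketch (not part of the original file): how a demand fault
 * might use the helpers below, assuming 'gw' holds a completed guest walk
 * for the faulting address.  A NULL return means either the guest tables
 * are incomplete or shadow allocation failed, and the fault is bounced
 * back to the guest. */
#if 0 /* example only */
    shadow_l1e_t *ptr_sl1e;
    mfn_t sl1mfn;

    ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft_demand_write);
    if ( ptr_sl1e == NULL )
        return 0;
    /* ... propagate gw.l1e into *ptr_sl1e and call shadow_set_l1e() ... */
#endif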
1749
#if GUEST_PAGING_LEVELS >= 4
1750
static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
1754
/* There is always a shadow of the top level table. Get it. */
1755
*sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1756
/* Reading the top level table is always valid. */
1757
return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
1760
static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
1768
if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */
1770
sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
1771
ASSERT(sl4e != NULL);
1772
if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
1774
*sl3mfn = shadow_l4e_get_mfn(*sl4e);
1775
ASSERT(mfn_valid(*sl3mfn));
1780
shadow_l4e_t new_sl4e;
1781
/* No l3 shadow installed: find and install it. */
1782
*sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow);
1783
if ( !mfn_valid(*sl3mfn) )
1785
/* No l3 shadow of this page exists at all: make one. */
1786
*sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow);
1788
/* Install the new sl3 table in the sl4e */
1789
l4e_propagate_from_guest(v, gw->l4e, *sl3mfn, &new_sl4e, ft);
1790
r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
1791
ASSERT((r & SHADOW_SET_FLUSH) == 0);
1792
if ( r & SHADOW_SET_ERROR )
1795
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
1800
/* Now follow it down a level. Guaranteed to succeed. */
1801
return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
1803
#endif /* GUEST_PAGING_LEVELS >= 4 */
1806
static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
1812
#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
1813
mfn_t sl3mfn = _mfn(INVALID_MFN);
1815
if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
1817
sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft, resync);
1818
if ( sl3e == NULL ) return NULL;
1819
if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
1821
*sl2mfn = shadow_l3e_get_mfn(*sl3e);
1822
ASSERT(mfn_valid(*sl2mfn));
1827
shadow_l3e_t new_sl3e;
1828
unsigned int t = SH_type_l2_shadow;
1830
/* Tag compat L2 containing hypervisor (m2p) mappings */
1831
if ( is_pv_32on64_domain(v->domain) &&
1832
guest_l4_table_offset(gw->va) == 0 &&
1833
guest_l3_table_offset(gw->va) == 3 )
1834
t = SH_type_l2h_shadow;
1836
/* No l2 shadow installed: find and install it. */
1837
*sl2mfn = get_shadow_status(v, gw->l2mfn, t);
1838
if ( !mfn_valid(*sl2mfn) )
1840
/* No l2 shadow of this page exists at all: make one. */
1841
*sl2mfn = sh_make_shadow(v, gw->l2mfn, t);
1843
/* Install the new sl2 table in the sl3e */
1844
l3e_propagate_from_guest(v, gw->l3e, *sl2mfn, &new_sl3e, ft);
1845
r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
1846
ASSERT((r & SHADOW_SET_FLUSH) == 0);
1847
if ( r & SHADOW_SET_ERROR )
1850
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
1855
/* Now follow it down a level. Guaranteed to succeed. */
1856
return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1857
#elif GUEST_PAGING_LEVELS == 3 /* PAE... */
1858
/* We never demand-shadow PAE l3es: they are only created in
1859
* sh_update_cr3(). Check if the relevant sl3e is present. */
1860
shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table)
1861
+ shadow_l3_linear_offset(gw->va);
1862
if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
1864
*sl2mfn = shadow_l3e_get_mfn(*sl3e);
1865
ASSERT(mfn_valid(*sl2mfn));
1866
return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1867
#else /* 32bit... */
1868
/* There is always a shadow of the top level table. Get it. */
1869
*sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]);
1870
/* This next line is important: the guest l2 has a 16k
1871
* shadow, we need to return the right mfn of the four. This
1872
* call will set it for us as a side-effect. */
1873
(void) shadow_l2_index(sl2mfn, guest_l2_table_offset(gw->va));
1874
/* Reading the top level table is always valid. */
1875
return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
1880
static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
    sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft, &resync);
    if ( sl2e == NULL ) return NULL;

    /* Install the sl1 in the l2e if it wasn't there or if we need to
     * re-do it to fix a PSE dirty bit. */
    if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
         && likely(ft != ft_demand_write
                   || (shadow_l2e_get_flags(*sl2e) & _PAGE_RW)
                   || !(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
    *sl1mfn = shadow_l2e_get_mfn(*sl2e);
    ASSERT(mfn_valid(*sl1mfn));
    shadow_l2e_t new_sl2e;
    int r, flags = guest_l2e_get_flags(gw->l2e);
    /* No l1 shadow installed: find and install it. */
    if ( !(flags & _PAGE_PRESENT) )
        return NULL; /* No guest page. */
    if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
    /* Splintering a superpage */
    gfn_t l2gfn = guest_l2e_get_gfn(gw->l2e);
    *sl1mfn = get_fl1_shadow_status(v, l2gfn);
    if ( !mfn_valid(*sl1mfn) )
        /* No fl1 shadow of this superpage exists at all: make one. */
        *sl1mfn = make_fl1_shadow(v, l2gfn);
    /* Shadowing an actual guest l1 table */
    if ( !mfn_valid(gw->l1mfn) ) return NULL; /* No guest page. */
    *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow);
    if ( !mfn_valid(*sl1mfn) )
        /* No l1 shadow of this page exists at all: make one. */
        *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow);
    /* Install the new sl1 table in the sl2e */
    l2e_propagate_from_guest(v, gw->l2e, *sl1mfn, &new_sl2e, ft);
    r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
    ASSERT((r & SHADOW_SET_FLUSH) == 0);
    if ( r & SHADOW_SET_ERROR )
    /* This next line is important: in 32-on-PAE and 32-on-64 modes,
     * the guest l1 table has an 8k shadow, and we need to return
     * the right mfn of the pair.  This call will set it for us as a
     * side-effect.  (In all other cases, it's a no-op and will be
    (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
    /* All pages walked are now pagetables.  Safe to resync pages
       in case level 4 or 3 shadows were set. */
    shadow_resync_all(v, 0);
    /* Now follow it down a level.  Guaranteed to succeed. */
    return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
/**************************************************************************/
/* Destructors for shadow tables:
 * Unregister the shadow, decrement refcounts of any entries present in it,
 * and release the memory.
 *
 * N.B. These destructors do not clear the contents of the shadows.
 *      This allows us to delay TLB shootdowns until the page is being reused.
 *      See shadow_alloc() and shadow_free() for how this is handled.
 */
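/* (Added commentary, not in the original source.)  Each destructor below
 * follows the same pattern: remove the shadow from the hash table with
 * delete_shadow_status()/delete_fl1_shadow_status(), drop the guest page's
 * "is shadowed in this type" state with shadow_demote(), walk the shadow
 * dropping one reference per present entry, and finally hand the page back
 * to the shadow pool with shadow_free(). */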
#if GUEST_PAGING_LEVELS >= 4
void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
    u32 t = mfn_to_page(smfn)->u.sh.type;
    SHADOW_DEBUG(DESTROY_SHADOW,
                 "%s(%05lx)\n", __func__, mfn_x(smfn));
    ASSERT(t == SH_type_l4_shadow);

    /* Record that the guest page isn't shadowed any more (in this type) */
    gmfn = backpointer(mfn_to_page(smfn));
    delete_shadow_status(v, gmfn, t, smfn);
    shadow_demote(v, gmfn, t);

    /* Decrement refcounts of all the old entries */
    SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
        if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
            sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
                       (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
                       | ((unsigned long)sl4e & ~PAGE_MASK));

    /* Put the memory back in the pool */
    shadow_free(v->domain, smfn);

void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
    u32 t = mfn_to_page(smfn)->u.sh.type;
    SHADOW_DEBUG(DESTROY_SHADOW,
                 "%s(%05lx)\n", __func__, mfn_x(smfn));
    ASSERT(t == SH_type_l3_shadow);

    /* Record that the guest page isn't shadowed any more (in this type) */
    gmfn = backpointer(mfn_to_page(smfn));
    delete_shadow_status(v, gmfn, t, smfn);
    shadow_demote(v, gmfn, t);

    /* Decrement refcounts of all the old entries */
    SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
        if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
            sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
                       (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
                       | ((unsigned long)sl3e & ~PAGE_MASK));

    /* Put the memory back in the pool */
    shadow_free(v->domain, smfn);

#endif /* GUEST_PAGING_LEVELS >= 4 */
void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
    u32 t = mfn_to_page(smfn)->u.sh.type;
    SHADOW_DEBUG(DESTROY_SHADOW,
                 "%s(%05lx)\n", __func__, mfn_x(smfn));
#if GUEST_PAGING_LEVELS >= 3
    ASSERT(t == SH_type_l2_shadow || t == SH_type_l2h_shadow);
    ASSERT(t == SH_type_l2_shadow);

    /* Record that the guest page isn't shadowed any more (in this type) */
    gmfn = backpointer(mfn_to_page(smfn));
    delete_shadow_status(v, gmfn, t, smfn);
    shadow_demote(v, gmfn, t);

    /* Decrement refcounts of all the old entries */
    SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
        if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
            sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
                       (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
                       | ((unsigned long)sl2e & ~PAGE_MASK));

    /* Put the memory back in the pool */
    shadow_free(v->domain, smfn);
void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
    struct domain *d = v->domain;
    u32 t = mfn_to_page(smfn)->u.sh.type;
    SHADOW_DEBUG(DESTROY_SHADOW,
                 "%s(%05lx)\n", __func__, mfn_x(smfn));
    ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow);

    /* Record that the guest page isn't shadowed any more (in this type) */
    if ( t == SH_type_fl1_shadow )
        gfn_t gfn = _gfn(mfn_to_page(smfn)->v.sh.back);
        delete_fl1_shadow_status(v, gfn, smfn);
    mfn_t gmfn = backpointer(mfn_to_page(smfn));
    delete_shadow_status(v, gmfn, t, smfn);
    shadow_demote(v, gmfn, t);

    if ( shadow_mode_refcounts(d) )
        /* Decrement refcounts of all the old entries */
        mfn_t sl1mfn = smfn;
        SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
            if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
                 && !sh_l1e_is_magic(*sl1e) ) {
                shadow_vram_put_l1e(*sl1e, sl1e, sl1mfn, d);
                shadow_put_page_from_l1e(*sl1e, d);

    /* Put the memory back in the pool */
    shadow_free(v->domain, smfn);
#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
    struct domain *d = v->domain;
    ASSERT(mfn_to_page(mmfn)->u.sh.type == SH_type_monitor_table);

#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
    l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
    int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);

    /* Need to destroy the l3 and l2 monitor pages used
     * for the linear map */
    ASSERT(l4e_get_flags(l4e[linear_slot]) & _PAGE_PRESENT);
    m3mfn = _mfn(l4e_get_pfn(l4e[linear_slot]));
    l3e = sh_map_domain_page(m3mfn);
    ASSERT(l3e_get_flags(l3e[0]) & _PAGE_PRESENT);
    shadow_free(d, _mfn(l3e_get_pfn(l3e[0])));
    sh_unmap_domain_page(l3e);
    shadow_free(d, m3mfn);

    if ( is_pv_32on64_vcpu(v) )
        /* Need to destroy the l3 and l2 monitor pages that map the
         * Xen VAs at 3GB-4GB */
        ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
        m3mfn = _mfn(l4e_get_pfn(l4e[0]));
        l3e = sh_map_domain_page(m3mfn);
        ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
        shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
        sh_unmap_domain_page(l3e);
        shadow_free(d, m3mfn);
    sh_unmap_domain_page(l4e);
#elif CONFIG_PAGING_LEVELS == 3
    /* Need to destroy the l2 monitor page in slot 4 too */
    l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
    ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
    shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
    sh_unmap_domain_page(l3e);

    /* Put the memory back in the pool */
    shadow_free(d, mmfn);
/**************************************************************************/
/* Functions to destroy non-Xen mappings in a pagetable hierarchy.
 * These are called from common code when we are running out of shadow
 * memory, and unpinning all the top-level shadows hasn't worked.
 *
 * This implementation is pretty crude and slow, but we hope that it won't
 * be called very often. */

#if GUEST_PAGING_LEVELS == 2

void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
    SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
        (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);

#elif GUEST_PAGING_LEVELS == 3

void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn)
    /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */
    SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, {
        (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);

#elif GUEST_PAGING_LEVELS == 4

void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
    SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, {
        (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
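/* (Added commentary, not in the original source.)  Writing
 * shadow_l2e_empty()/shadow_l4e_empty() into every slot visited drops the
 * references that pin the lower-level shadows, so the allocator can
 * reclaim them; my reading of the FOREACH macros' zero flag arguments is
 * that Xen's own reserved slots are left untouched for PV guests, but that
 * detail lives in the macro definitions, not here. */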
/**************************************************************************/
2197
/* Internal translation functions.
2198
* These functions require a pointer to the shadow entry that will be updated.
2201
/* These functions take a new guest entry, translate it to shadow and write
2204
* They return the same bitmaps as the shadow_set_lXe() functions.
2207
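/* (Added commentary, not in the original source.)  Each validate_gl?e()
 * below follows the same shape: snapshot the new guest entry, look up the
 * shadow of whatever page it now points at (making an fl1 shadow for a
 * splintered superpage), turn the guest flags into a shadow entry with
 * l?e_propagate_from_guest(), reject updates that would land in Xen's
 * reserved slots where that applies to PV guests, and finally write the
 * entry with shadow_set_l?e(), returning the same SHADOW_SET_* bitmap as
 * the setter. */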
#if GUEST_PAGING_LEVELS >= 4
2208
static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
2210
shadow_l4e_t new_sl4e;
2211
guest_l4e_t new_gl4e = *(guest_l4e_t *)new_ge;
2212
shadow_l4e_t *sl4p = se;
2213
mfn_t sl3mfn = _mfn(INVALID_MFN);
2214
struct domain *d = v->domain;
2218
perfc_incr(shadow_validate_gl4e_calls);
2220
if ( guest_l4e_get_flags(new_gl4e) & _PAGE_PRESENT )
2222
gfn_t gl3gfn = guest_l4e_get_gfn(new_gl4e);
2223
mfn_t gl3mfn = gfn_to_mfn_query(d, gl3gfn, &p2mt);
2224
if ( p2m_is_ram(p2mt) )
2225
sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
2226
else if ( p2mt != p2m_populate_on_demand )
2227
result |= SHADOW_SET_ERROR;
2229
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
2230
if ( mfn_valid(sl3mfn) )
2231
shadow_resync_all(v, 0);
2234
l4e_propagate_from_guest(v, new_gl4e, sl3mfn, &new_sl4e, ft_prefetch);
2236
// check for updates to xen reserved slots
2237
if ( !shadow_mode_external(d) )
2239
int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
2240
sizeof(shadow_l4e_t));
2241
int reserved_xen_slot = !is_guest_l4_slot(d, shadow_index);
2243
if ( unlikely(reserved_xen_slot) )
2245
// attempt by the guest to write to a xen reserved slot
2247
SHADOW_PRINTK("%s out-of-range update "
2248
"sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2249
__func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
2250
if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
2252
SHADOW_ERROR("out-of-range l4e update\n");
2253
result |= SHADOW_SET_ERROR;
2256
// do not call shadow_set_l4e...
2261
result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
2266
static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
2268
shadow_l3e_t new_sl3e;
2269
guest_l3e_t new_gl3e = *(guest_l3e_t *)new_ge;
2270
shadow_l3e_t *sl3p = se;
2271
mfn_t sl2mfn = _mfn(INVALID_MFN);
2275
perfc_incr(shadow_validate_gl3e_calls);
2277
if ( guest_l3e_get_flags(new_gl3e) & _PAGE_PRESENT )
2279
gfn_t gl2gfn = guest_l3e_get_gfn(new_gl3e);
2280
mfn_t gl2mfn = gfn_to_mfn_query(v->domain, gl2gfn, &p2mt);
2281
if ( p2m_is_ram(p2mt) )
2282
sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
2283
else if ( p2mt != p2m_populate_on_demand )
2284
result |= SHADOW_SET_ERROR;
2286
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
2287
if ( mfn_valid(sl2mfn) )
2288
shadow_resync_all(v, 0);
2291
l3e_propagate_from_guest(v, new_gl3e, sl2mfn, &new_sl3e, ft_prefetch);
2292
result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
2296
#endif // GUEST_PAGING_LEVELS >= 4
2298
static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
2300
shadow_l2e_t new_sl2e;
2301
guest_l2e_t new_gl2e = *(guest_l2e_t *)new_ge;
2302
shadow_l2e_t *sl2p = se;
2303
mfn_t sl1mfn = _mfn(INVALID_MFN);
2307
perfc_incr(shadow_validate_gl2e_calls);
2309
if ( guest_l2e_get_flags(new_gl2e) & _PAGE_PRESENT )
2311
gfn_t gl1gfn = guest_l2e_get_gfn(new_gl2e);
2312
if ( guest_supports_superpages(v) &&
2313
(guest_l2e_get_flags(new_gl2e) & _PAGE_PSE) )
2315
// superpage -- need to look up the shadow L1 which holds the
2317
sl1mfn = get_fl1_shadow_status(v, gl1gfn);
2319
// XXX - it's possible that we want to do some kind of prefetch
2320
// for superpage fl1's here, but this is *not* on the demand path,
2321
// so we'll hold off trying that for now...
2323
if ( !mfn_valid(sl1mfn) )
2324
sl1mfn = make_fl1_shadow(v, gl1gfn);
2329
mfn_t gl1mfn = gfn_to_mfn_query(v->domain, gl1gfn, &p2mt);
2330
if ( p2m_is_ram(p2mt) )
2331
sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
2332
else if ( p2mt != p2m_populate_on_demand )
2333
result |= SHADOW_SET_ERROR;
2336
l2e_propagate_from_guest(v, new_gl2e, sl1mfn, &new_sl2e, ft_prefetch);
2338
// check for updates to xen reserved slots in PV guests...
2339
// XXX -- need to revisit this for PV 3-on-4 guests.
2341
#if SHADOW_PAGING_LEVELS < 4
2342
#if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
2343
if ( !shadow_mode_external(v->domain) )
2345
int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
2346
sizeof(shadow_l2e_t));
2347
int reserved_xen_slot;
2349
#if SHADOW_PAGING_LEVELS == 3
2351
((mfn_to_page(sl2mfn)->u.sh.type == SH_type_l2h_pae_shadow) &&
2353
>= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
2354
#else /* SHADOW_PAGING_LEVELS == 2 */
2355
reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
2358
if ( unlikely(reserved_xen_slot) )
2360
// attempt by the guest to write to a xen reserved slot
2362
SHADOW_PRINTK("%s out-of-range update "
2363
"sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
2364
__func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
2365
if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
2367
SHADOW_ERROR("out-of-range l2e update\n");
2368
result |= SHADOW_SET_ERROR;
2371
// do not call shadow_set_l2e...
2375
#endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
2376
#endif /* SHADOW_PAGING_LEVELS < 4 */
2378
result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
2383
static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
    shadow_l1e_t new_sl1e;
    guest_l1e_t new_gl1e = *(guest_l1e_t *)new_ge;
    shadow_l1e_t *sl1p = se;

#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)

    perfc_incr(shadow_validate_gl1e_calls);

    gfn = guest_l1e_get_gfn(new_gl1e);
    gmfn = gfn_to_mfn_query(v->domain, gfn, &p2mt);

    l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt);
    result |= shadow_set_l1e(v, sl1p, new_sl1e, p2mt, sl1mfn);

#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
    gl1mfn = backpointer(mfn_to_page(sl1mfn));
    if ( mfn_valid(gl1mfn)
         && mfn_is_out_of_sync(gl1mfn) )
        /* Update the OOS snapshot. */
        mfn_t snpmfn = oos_snapshot_lookup(v, gl1mfn);
        ASSERT(mfn_valid(snpmfn));
        snp = sh_map_domain_page(snpmfn);
        snp[guest_index(new_ge)] = new_gl1e;
        sh_unmap_domain_page(snp);
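/* (Added commentary, not in the original source.)  The "OOS snapshot" kept
 * up to date above is a copy of the guest l1 as it looked when it was last
 * in sync; sh_resync_l1() below compares the live guest table against this
 * snapshot so that only entries the guest actually changed need to be
 * re-propagated into the shadow. */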
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/**************************************************************************/
/* Special validation function for re-syncing out-of-sync shadows.
 * Walks the *shadow* page, and for every entry that it finds,
 * revalidates the guest entry that corresponds to it.
 * N.B. This function is called with the vcpu that unsynced the page,
 *      *not* the one that is causing it to be resynced. */
void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn)
    guest_l1e_t *gl1p, *gp, *snp;

    ASSERT(mfn_valid(snpmfn));

    sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
    ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */

    snp = sh_map_domain_page(snpmfn);
    gp = sh_map_domain_page(gl1mfn);

    SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, {
        guest_l1e_t gl1e = *gl1p;
        guest_l1e_t *snpl1p = (guest_l1e_t *)snp + guest_index(gl1p);

        if ( memcmp(snpl1p, &gl1e, sizeof(gl1e)) )
            gfn = guest_l1e_get_gfn(gl1e);
            gmfn = gfn_to_mfn_query(v->domain, gfn, &p2mt);
            l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt);
            rc |= shadow_set_l1e(v, sl1p, nsl1e, p2mt, sl1mfn);

    sh_unmap_domain_page(gp);
    sh_unmap_domain_page(snp);

    /* Setting shadow L1 entries should never need us to flush the TLB */
    ASSERT(!(rc & SHADOW_SET_FLUSH));
/* Figure out whether it's definitely safe not to sync this l1 table.
 * That is: if we can tell that it's only used once, and that the
 * toplevel shadow responsible is not one of ours.
 * N.B. This function is called with the vcpu that required the resync,
 *      *not* the one that originally unsynced the page, but it is
 *      called in the *mode* of the vcpu that unsynced it.  Clear?  Good. */
int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
    struct page_info *sp;

    smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
    ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */

    sp = mfn_to_page(smfn);
    if ( sp->u.sh.count != 1 || !sp->up )
    smfn = _mfn(sp->up >> PAGE_SHIFT);
    ASSERT(mfn_valid(smfn));

#if (SHADOW_PAGING_LEVELS == 4)
    sp = mfn_to_page(smfn);
    if ( sp->u.sh.count != 1 || !sp->up )
    smfn = _mfn(sp->up >> PAGE_SHIFT);
    ASSERT(mfn_valid(smfn));

    sp = mfn_to_page(smfn);
    if ( sp->u.sh.count != 1
         || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up )
    smfn = _mfn(sp->up >> PAGE_SHIFT);
    ASSERT(mfn_valid(smfn));

#if (GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS == 3)
    /* In 2-on-3 shadow mode the up pointer contains the link to the
     * shadow page, but the shadow_table contains only the first of the
     * four pages that makes the PAE top shadow tables. */
    smfn = _mfn(mfn_x(smfn) & ~0x3UL);

    if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
#if (SHADOW_PAGING_LEVELS == 3)
         || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn)
         || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn)
         || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn)

    /* Only in use in one toplevel shadow, and it's not the one we're

#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
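/* (Added commentary, not in the original source.)  sh_safe_not_to_sync()
 * climbs the sp->up back-pointers: a shadow page referenced by exactly one
 * shadow entry records, in sp->up, the machine address of that entry, so
 * shifting right by PAGE_SHIFT recovers the parent shadow's mfn.  Walking
 * l1 -> l2 (and on to l3/l4 for 64-bit shadows) and comparing the result
 * against v->arch.shadow_table[] tells us whether the vcpu doing the
 * resync could itself be running on the chain that maps this l1. */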
/**************************************************************************/
2536
/* Functions which translate and install the shadows of arbitrary guest
2537
* entries that we have just seen the guest write. */
2541
sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
2542
void *new_gp, u32 size, u32 sh_type,
2543
u32 (*shadow_index)(mfn_t *smfn, u32 idx),
2544
int (*validate_ge)(struct vcpu *v, void *ge,
2545
mfn_t smfn, void *se))
2546
/* Generic function for mapping and validating. */
2548
mfn_t smfn, smfn2, map_mfn;
2550
u32 shadow_idx, guest_idx;
2553
/* Align address and size to guest entry boundaries */
2554
size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
2555
new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
2556
size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
2557
ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
2559
/* Map the shadow page */
2560
smfn = get_shadow_status(v, gmfn, sh_type);
2561
ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
2562
guest_idx = guest_index(new_gp);
2564
shadow_idx = shadow_index(&map_mfn, guest_idx);
2565
sl1p = sh_map_domain_page(map_mfn);
2567
/* Validate one entry at a time */
2571
guest_idx = guest_index(new_gp);
2572
shadow_idx = shadow_index(&smfn2, guest_idx);
2573
if ( mfn_x(smfn2) != mfn_x(map_mfn) )
2575
/* We have moved to another page of the shadow */
2577
sh_unmap_domain_page(sl1p);
2578
sl1p = sh_map_domain_page(map_mfn);
2580
result |= validate_ge(v,
2584
size -= sizeof(guest_l1e_t);
2585
new_gp += sizeof(guest_l1e_t);
2587
sh_unmap_domain_page(sl1p);
2593
sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
2594
void *new_gl4p, u32 size)
2596
#if GUEST_PAGING_LEVELS >= 4
2597
return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
2601
#else // ! GUEST_PAGING_LEVELS >= 4
2602
SHADOW_ERROR("called in wrong paging mode!\n");
2609
sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
2610
void *new_gl3p, u32 size)
2612
#if GUEST_PAGING_LEVELS >= 4
2613
return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
2617
#else // ! GUEST_PAGING_LEVELS >= 4
2618
SHADOW_ERROR("called in wrong paging mode!\n");
2625
sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
2626
void *new_gl2p, u32 size)
2628
return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2635
sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
2636
void *new_gl2p, u32 size)
2638
#if GUEST_PAGING_LEVELS >= 3
2639
return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
2643
#else /* Non-PAE guests don't have different kinds of l2 table */
2644
SHADOW_ERROR("called in wrong paging mode!\n");
2651
sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
2652
void *new_gl1p, u32 size)
2654
return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
2661
/**************************************************************************/
/* Optimization: If we see two emulated writes of zeros to the same
 * page-table without another kind of page fault in between, we guess
 * that this is a batch of changes (for process destruction) and
 * unshadow the page so we don't take a pagefault on every entry.  This
 * should also make finding writeable mappings of pagetables much

/* Look to see if this is the second emulated write in a row to this
 * page, and unshadow if it is */
static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
    if ( v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn)
         && sh_mfn_is_a_page_table(gmfn) )
        perfc_incr(shadow_early_unshadow);
        sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
        TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EARLY_UNSHADOW);
    v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn);

/* Stop counting towards early unshadows, as we've seen a real page fault */
static inline void reset_early_unshadow(struct vcpu *v)
#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
    v->arch.paging.shadow.last_emulated_mfn_for_unshadow = INVALID_MFN;
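/* (Added commentary, not in the original source.)  In practice the
 * emulated-write path calls check_for_early_unshadow() with the mfn of the
 * pagetable it just wrote, while faults handled any other way call
 * reset_early_unshadow(); the "fast, can fail" unshadow is only a hint, so
 * nothing breaks if the page is still shadowed afterwards. */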
/**************************************************************************/
/* Optimization: Prefetch multiple L1 entries.  This is called after we have
 * demand-faulted a shadow l1e in the fault handler, to see if it's
 * worth fetching some more.
 */

#if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH

/* XXX magic number */
#define PREFETCH_DISTANCE 32

static void sh_prefetch(struct vcpu *v, walk_t *gw,
                        shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
    guest_l1e_t *gl1p = NULL, gl1e;
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
    guest_l1e_t *snpl1p = NULL;

    /* Prefetch no further than the end of the _shadow_ l1 MFN */
    dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
    /* And no more than a maximum fetches-per-fault */
    if ( dist > PREFETCH_DISTANCE )
        dist = PREFETCH_DISTANCE;

    if ( mfn_valid(gw->l1mfn) )
        /* Normal guest page; grab the next guest entry */
        gl1p = sh_map_domain_page(gw->l1mfn);
        gl1p += guest_l1_table_offset(gw->va);

#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
        if ( mfn_is_out_of_sync(gw->l1mfn) )
            mfn_t snpmfn = oos_snapshot_lookup(v, gw->l1mfn);
            ASSERT(mfn_valid(snpmfn));
            snpl1p = sh_map_domain_page(snpmfn);
            snpl1p += guest_l1_table_offset(gw->va);

    for ( i = 1; i < dist ; i++ )
        /* No point in prefetching if there's already a shadow */
        if ( ptr_sl1e[i].l1 != 0 )

        if ( mfn_valid(gw->l1mfn) )
            /* Normal guest page; grab the next guest entry */
            /* Not worth continuing if we hit an entry that will need another
             * fault for A/D-bit propagation anyway */
            gflags = guest_l1e_get_flags(gl1e);
            if ( (gflags & _PAGE_PRESENT)
                 && (!(gflags & _PAGE_ACCESSED)
                     || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
            /* Fragmented superpage, unless we've been called wrongly */
            ASSERT(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE);
            /* Increment the l1e's GFN by the right number of guest pages */
            gl1e = guest_l1e_from_gfn(
                _gfn(gfn_x(guest_l1e_get_gfn(gw->l1e)) + i),
                guest_l1e_get_flags(gw->l1e));

        /* Look at the gfn that the l1e is pointing at */
        gfn = guest_l1e_get_gfn(gl1e);
        gmfn = gfn_to_mfn_query(v->domain, gfn, &p2mt);

        /* Propagate the entry.  */
        l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt);
        (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, p2mt, sl1mfn);

#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
        if ( snpl1p != NULL )

    sh_unmap_domain_page(gl1p);
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
    if ( snpl1p != NULL )
        sh_unmap_domain_page(snpl1p);

#endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
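/* (Added commentary, not in the original source.)  Worked example of the
 * distance calculation above, assuming 8-byte shadow entries: if ptr_sl1e
 * points at byte offset 0xf80 of its shadow page, then
 * (PAGE_SIZE - 0xf80) / sizeof(sl1e) = 128 / 8 = 16, so dist = 16 and the
 * loop stops at the page boundary; had the fault landed near the start of
 * the page, the PREFETCH_DISTANCE cap of 32 would apply instead. */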
#if GUEST_PAGING_LEVELS == 4
2797
typedef u64 guest_va_t;
2798
typedef u64 guest_pa_t;
2799
#elif GUEST_PAGING_LEVELS == 3
2800
typedef u32 guest_va_t;
2801
typedef u64 guest_pa_t;
2803
typedef u32 guest_va_t;
2804
typedef u32 guest_pa_t;
2807
static inline void trace_shadow_gen(u32 event, guest_va_t va)
2811
event |= (GUEST_PAGING_LEVELS-2)<<8;
2812
__trace_var(event, 0/*!tsc*/, sizeof(va), (unsigned char*)&va);
2816
static inline void trace_shadow_fixup(guest_l1e_t gl1e,
2822
/* for PAE, guest_l1e may be 64 while guest_va may be 32;
2823
so put it first for alignment sake. */
2827
} __attribute__((packed)) d;
2830
event = TRC_SHADOW_FIXUP | ((GUEST_PAGING_LEVELS-2)<<8);
2834
d.flags = this_cpu(trace_shadow_path_flags);
2836
__trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2840
static inline void trace_not_shadow_fault(guest_l1e_t gl1e,
2846
/* for PAE, guest_l1e may be 64 while guest_va may be 32;
2847
so put it first for alignment sake. */
2851
} __attribute__((packed)) d;
2854
event = TRC_SHADOW_NOT_SHADOW | ((GUEST_PAGING_LEVELS-2)<<8);
2858
d.flags = this_cpu(trace_shadow_path_flags);
2860
__trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2864
static inline void trace_shadow_emulate_other(u32 event,
2871
/* for PAE, guest_l1e may be 64 while guest_va may be 32;
2872
so put it first for alignment sake. */
2873
#if GUEST_PAGING_LEVELS == 2
2879
} __attribute__((packed)) d;
2881
event |= ((GUEST_PAGING_LEVELS-2)<<8);
2886
__trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2890
#if GUEST_PAGING_LEVELS == 3
2891
static DEFINE_PER_CPU(guest_va_t,trace_emulate_initial_va);
2892
static DEFINE_PER_CPU(int,trace_extra_emulation_count);
2894
static DEFINE_PER_CPU(guest_pa_t,trace_emulate_write_val);
2896
static inline void trace_shadow_emulate(guest_l1e_t gl1e, unsigned long va)
2901
/* for PAE, guest_l1e may be 64 while guest_va may be 32;
2902
so put it first for alignment sake. */
2903
guest_l1e_t gl1e, write_val;
2905
unsigned flags:29, emulation_count:3;
2906
} __attribute__((packed)) d;
2909
event = TRC_SHADOW_EMULATE | ((GUEST_PAGING_LEVELS-2)<<8);
2912
d.write_val.l1 = this_cpu(trace_emulate_write_val);
2914
#if GUEST_PAGING_LEVELS == 3
2915
d.emulation_count = this_cpu(trace_extra_emulation_count);
2917
d.flags = this_cpu(trace_shadow_path_flags);
2919
__trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
2923
/**************************************************************************/
2924
/* Entry points into the shadow code */
2926
/* Called from pagefault handler in Xen, and from the HVM trap handlers
2927
* for pagefaults. Returns 1 if this fault was an artefact of the
2928
* shadow code (and the guest should retry) or 0 if it is not (and the
2929
* fault should be handled elsewhere or passed to the guest). */
2931
static int sh_page_fault(struct vcpu *v,
2933
struct cpu_user_regs *regs)
2935
struct domain *d = v->domain;
2937
gfn_t gfn = _gfn(0);
2938
mfn_t gmfn, sl1mfn = _mfn(0);
2939
shadow_l1e_t sl1e, *ptr_sl1e;
2941
struct sh_emulate_ctxt emul_ctxt;
2942
const struct x86_emulate_ops *emul_ops;
2944
fetch_type_t ft = 0;
2948
#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
2952
SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u, rip=%lx\n",
2953
v->domain->domain_id, v->vcpu_id, va, regs->error_code,
2956
perfc_incr(shadow_fault);
2958
#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
2959
/* If faulting frame is successfully emulated in last shadow fault
2960
* it's highly likely to reach same emulation action for this frame.
2961
* Then try to emulate early to avoid lock aquisition.
2963
if ( v->arch.paging.last_write_emul_ok
2964
&& v->arch.paging.shadow.last_emulated_frame == (va >> PAGE_SHIFT) )
2966
/* check whether error code is 3, or else fall back to normal path
2967
* in case of some validation is required
2969
if ( regs->error_code == (PFEC_write_access | PFEC_page_present) )
2972
gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
2974
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
2975
/* Fall back to the slow path if we're trying to emulate
2976
writes to an out of sync page. */
2977
if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
2980
v->arch.paging.last_write_emul_ok = 0;
2981
goto page_fault_slow_path;
2985
perfc_incr(shadow_fault_fast_emulate);
2986
goto early_emulation;
2989
v->arch.paging.last_write_emul_ok = 0;
2994
// XXX: Need to think about eventually mapping superpages directly in the
2995
// shadow (when possible), as opposed to splintering them into a
2996
// bunch of 4K maps.
2999
#if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
3000
if ( (regs->error_code & PFEC_reserved_bit) )
3002
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3003
/* First, need to check that this isn't an out-of-sync
3004
* shadow l1e. If it is, we fall back to the slow path, which
3005
* will sync it up again. */
3009
if ( (__copy_from_user(&sl2e,
3010
(sh_linear_l2_table(v)
3011
+ shadow_l2_linear_offset(va)),
3013
|| !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT)
3014
|| !mfn_valid(gl1mfn = backpointer(mfn_to_page(
3015
shadow_l2e_get_mfn(sl2e))))
3016
|| unlikely(mfn_is_out_of_sync(gl1mfn)) )
3018
/* Hit the slow path as if there had been no
3019
* shadow entry at all, and let it tidy up */
3020
ASSERT(regs->error_code & PFEC_page_present);
3021
regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
3022
goto page_fault_slow_path;
3025
#endif /* SHOPT_OUT_OF_SYNC */
3026
/* The only reasons for reserved bits to be set in shadow entries
3027
* are the two "magic" shadow_l1e entries. */
3028
if ( likely((__copy_from_user(&sl1e,
3029
(sh_linear_l1_table(v)
3030
+ shadow_l1_linear_offset(va)),
3032
&& sh_l1e_is_magic(sl1e)) )
3035
if ( sh_l1e_is_gnp(sl1e) )
3037
/* Not-present in a guest PT: pass to the guest as
3038
* a not-present fault (by flipping two bits). */
3039
ASSERT(regs->error_code & PFEC_page_present);
3040
regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
3041
reset_early_unshadow(v);
3042
perfc_incr(shadow_fault_fast_gnp);
3043
SHADOW_PRINTK("fast path not-present\n");
3044
trace_shadow_gen(TRC_SHADOW_FAST_PROPAGATE, va);
3049
/* Magic MMIO marker: extract gfn for MMIO address */
3050
ASSERT(sh_l1e_is_mmio(sl1e));
3051
gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
3053
| (va & ~PAGE_MASK);
3055
perfc_incr(shadow_fault_fast_mmio);
3056
SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
3057
reset_early_unshadow(v);
3058
trace_shadow_gen(TRC_SHADOW_FAST_MMIO, va);
3059
return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3060
? EXCRET_fault_fixed : 0);
3064
/* This should be exceptionally rare: another vcpu has fixed
3065
* the tables between the fault and our reading the l1e.
3066
* Retry and let the hardware give us the right fault next time. */
3067
perfc_incr(shadow_fault_fast_fail);
3068
SHADOW_PRINTK("fast path false alarm!\n");
3069
trace_shadow_gen(TRC_SHADOW_FALSE_FAST_PATH, va);
3070
return EXCRET_fault_fixed;
3074
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3075
page_fault_slow_path:
3077
#endif /* SHOPT_FAST_FAULT_PATH */
3079
/* Detect if this page fault happened while we were already in Xen
3080
* doing a shadow operation. If that happens, the only thing we can
3081
* do is let Xen's normal fault handlers try to fix it. In any case,
3082
* a diagnostic trace of the fault will be more useful than
3083
* a BUG() when we try to take the lock again. */
3084
if ( unlikely(shadow_locked_by_me(d)) )
3086
SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n",
3087
d->arch.paging.shadow.locker_function);
3093
/* The walk is done in a lock-free style, with some sanity check
3094
* postponed after grabbing shadow lock later. Those delayed checks
3095
* will make sure no inconsistent mapping being translated into
3096
* shadow page table. */
3097
version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
3099
rc = sh_walk_guest_tables(v, va, &gw, regs->error_code);
3101
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3102
regs->error_code &= ~PFEC_page_present;
3103
if ( !(rc & _PAGE_PRESENT) )
3104
regs->error_code |= PFEC_page_present;
3109
perfc_incr(shadow_fault_bail_real_fault);
3110
SHADOW_PRINTK("not a shadow fault\n");
3111
reset_early_unshadow(v);
3115
/* It's possible that the guest has put pagetables in memory that it has
3116
* already used for some special purpose (ioreq pages, or granted pages).
3117
* If that happens we'll have killed the guest already but it's still not
3118
* safe to propagate entries out of the guest PT so get out now. */
3119
if ( unlikely(d->is_shutting_down && d->shutdown_code == SHUTDOWN_crash) )
3121
SHADOW_PRINTK("guest is shutting down\n");
3125
/* What kind of access are we dealing with? */
3126
ft = ((regs->error_code & PFEC_write_access)
3127
? ft_demand_write : ft_demand_read);
3129
/* What mfn is the guest trying to access? */
3130
gfn = guest_l1e_get_gfn(gw.l1e);
3131
gmfn = gfn_to_mfn_guest(d, gfn, &p2mt);
3133
if ( shadow_mode_refcounts(d) &&
3134
((!p2m_is_valid(p2mt) && !p2m_is_grant(p2mt)) ||
3135
(!p2m_is_mmio(p2mt) && !mfn_valid(gmfn))) )
3137
perfc_incr(shadow_fault_bail_bad_gfn);
3138
SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
3139
gfn_x(gfn), mfn_x(gmfn));
3140
reset_early_unshadow(v);
3144
#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3145
/* Remember this successful VA->GFN translation for later. */
3146
vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn),
3147
regs->error_code | PFEC_page_present);
3148
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
3152
TRACE_CLEAR_PATH_FLAGS;
3154
/* Make sure there is enough free shadow memory to build a chain of
3155
* shadow tables. (We never allocate a top-level shadow on this path,
3156
* only a 32b l1, pae l1, or 64b l3+2+1. Note that while
3157
* SH_type_l1_shadow isn't correct in the latter case, all page
3158
* tables are the same size there.)
3160
* Preallocate shadow pages *before* removing writable accesses
3161
* otherwhise an OOS L1 might be demoted and promoted again with
3162
* writable mappings. */
3165
GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
3167
rc = gw_remove_write_accesses(v, va, &gw);
3169
/* First bit set: Removed write access to a page. */
3170
if ( rc & GW_RMWR_FLUSHTLB )
3172
/* Write permission removal is also a hint that other gwalks
3173
* overlapping with this one may be inconsistent
3175
perfc_incr(shadow_rm_write_flush_tlb);
3176
atomic_inc(&d->arch.paging.shadow.gtable_dirty_version);
3177
flush_tlb_mask(&d->domain_dirty_cpumask);
3180
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3181
/* Second bit set: Resynced a page. Re-walk needed. */
3182
if ( rc & GW_RMWR_REWALK )
3189
if ( !shadow_check_gwalk(v, va, &gw, version) )
3191
perfc_incr(shadow_inconsistent_gwalk);
3196
shadow_audit_tables(v);
3197
sh_audit_gw(v, &gw);
3199
/* Acquire the shadow. This must happen before we figure out the rights
3200
* for the shadow entry, since we might promote a page here. */
3201
ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
3202
if ( unlikely(ptr_sl1e == NULL) )
3204
/* Couldn't get the sl1e! Since we know the guest entries
3205
* are OK, this can only have been caused by a failed
3206
* shadow_set_l*e(), which will have crashed the guest.
3207
* Get out of the fault handler immediately. */
3208
/* Windows 7 apparently relies on the hardware to do something
3209
* it explicitly hasn't promised to do: load l3 values after
3210
* the cr3 is loaded.
3211
* In any case, in the PAE case, the ASSERT is not true; it can
3212
* happen because of actions the guest is taking. */
3213
#if GUEST_PAGING_LEVELS == 3
3214
v->arch.paging.mode->update_cr3(v, 0);
3216
ASSERT(d->is_shutting_down);
3219
trace_shadow_gen(TRC_SHADOW_DOMF_DYING, va);
3223
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3224
/* Always unsync when writing to L1 page tables. */
3225
if ( sh_mfn_is_a_page_table(gmfn)
3226
&& ft == ft_demand_write )
3229
if ( unlikely(d->is_shutting_down && d->shutdown_code == SHUTDOWN_crash) )
3231
/* We might end up with a crashed domain here if
3232
* sh_remove_shadows() in a previous sh_resync() call has
3233
* failed. We cannot safely continue since some page is still
3234
* OOS but not in the hash table anymore. */
3240
/* Calculate the shadow entry and write it */
3241
l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt);
3242
r = shadow_set_l1e(v, ptr_sl1e, sl1e, p2mt, sl1mfn);
3244
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3245
if ( mfn_valid(gw.l1mfn)
3246
&& mfn_is_out_of_sync(gw.l1mfn) )
3248
/* Update the OOS snapshot. */
3249
mfn_t snpmfn = oos_snapshot_lookup(v, gw.l1mfn);
3252
ASSERT(mfn_valid(snpmfn));
3254
snp = sh_map_domain_page(snpmfn);
3255
snp[guest_l1_table_offset(va)] = gw.l1e;
3256
sh_unmap_domain_page(snp);
3260
#if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
3261
/* Prefetch some more shadow entries */
3262
sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
3265
/* Need to emulate accesses to page tables */
3266
if ( sh_mfn_is_a_page_table(gmfn)
3267
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3268
/* Unless they've been allowed to go out of sync with their
3269
shadows and we don't need to unshadow it. */
3270
&& !(mfn_is_out_of_sync(gmfn)
3271
&& !(regs->error_code & PFEC_user_mode))
3275
if ( ft == ft_demand_write )
3277
perfc_incr(shadow_fault_emulate_write);
3280
else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
3282
perfc_incr(shadow_fault_emulate_read);
3287
/* Need to hand off device-model MMIO to the device model */
3288
if ( p2mt == p2m_mmio_dm )
3290
gpa = guest_walk_to_gpa(&gw);
3294
/* Ignore attempts to write to read-only memory. */
3295
if ( p2m_is_readonly(p2mt) && (ft == ft_demand_write) )
3297
static unsigned long lastpage;
3298
if ( xchg(&lastpage, va & PAGE_MASK) != (va & PAGE_MASK) )
3299
gdprintk(XENLOG_DEBUG, "guest attempted write to read-only memory"
3300
" page. va page=%#lx, mfn=%#lx\n",
3301
va & PAGE_MASK, mfn_x(gmfn));
3302
goto emulate_readonly; /* skip over the instruction */
3305
/* In HVM guests, we force CR0.WP always to be set, so that the
3306
* pagetables are always write-protected. If the guest thinks
3307
* CR0.WP is clear, we must emulate faulting supervisor writes to
3308
* allow the guest to write through read-only PTEs. Emulate if the
3309
* fault was a non-user write to a present page. */
3310
if ( is_hvm_domain(d)
3311
&& unlikely(!hvm_wp_enabled(v))
3312
&& regs->error_code == (PFEC_write_access|PFEC_page_present)
3313
&& mfn_valid(gmfn) )
3315
perfc_incr(shadow_fault_emulate_wp);
3319
perfc_incr(shadow_fault_fixed);
3320
d->arch.paging.log_dirty.fault_count++;
3321
reset_early_unshadow(v);
3323
trace_shadow_fixup(gw.l1e, va);
3325
sh_audit_gw(v, &gw);
3326
SHADOW_PRINTK("fixed\n");
3327
shadow_audit_tables(v);
3329
return EXCRET_fault_fixed;
3332
if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
3333
goto not_a_shadow_fault;
3336
* We do not emulate user writes. Instead we use them as a hint that the
3337
* page is no longer a page table. This behaviour differs from native, but
3338
* it seems very unlikely that any OS grants user access to page tables.
3340
if ( (regs->error_code & PFEC_user_mode) )
3342
SHADOW_PRINTK("user-mode fault to PT, unshadowing mfn %#lx\n",
3344
perfc_incr(shadow_fault_emulate_failed);
3345
sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3346
trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_USER,
3352
* Write from userspace to ro-mem needs to jump here to avoid getting
3353
* caught by user-mode page-table check above.
3357
* We don't need to hold the lock for the whole emulation; we will
3358
* take it again when we write to the pagetables.
3360
sh_audit_gw(v, &gw);
3361
shadow_audit_tables(v);
3364
this_cpu(trace_emulate_write_val) = 0;
3366
#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3369
if ( is_hvm_domain(d) )
3372
* If we are in the middle of injecting an exception or interrupt then
3373
* we should not emulate: it is not the instruction at %eip that caused
3374
* the fault. Furthermore it is almost certainly the case the handler
3375
* stack is currently considered to be a page table, so we should
3376
* unshadow the faulting page before exiting.
3378
if ( unlikely(hvm_event_pending(v)) )
3380
#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3383
perfc_incr(shadow_fault_fast_emulate_fail);
3384
v->arch.paging.last_write_emul_ok = 0;
3387
gdprintk(XENLOG_DEBUG, "write to pagetable during event "
3388
"injection: cr2=%#lx, mfn=%#lx\n",
3390
sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
3391
trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ,
3393
return EXCRET_fault_fixed;
3397
SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
3398
(unsigned long)regs->eip, (unsigned long)regs->esp);
3400
emul_ops = shadow_init_emulation(&emul_ctxt, regs);
3402
r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3405
* NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it
3406
* would be a good unshadow hint. If we *do* decide to unshadow-on-fault
3407
* then it must be 'failable': we cannot require the unshadow to succeed.
3409
if ( r == X86EMUL_UNHANDLEABLE )
3411
perfc_incr(shadow_fault_emulate_failed);
3412
#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3415
perfc_incr(shadow_fault_fast_emulate_fail);
3416
v->arch.paging.last_write_emul_ok = 0;
3419
SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
3421
/* If this is actually a page table, then we have a bug, and need
3422
* to support more operations in the emulator. More likely,
3423
* though, this is a hint that this page should not be shadowed. */
3424
shadow_remove_all_shadows(v, gmfn);
3426
trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED,
3431
#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3432
/* Record successfully emulated information as heuristics to next
3433
* fault on same frame for acceleration. But be careful to verify
3434
* its attribute still as page table, or else unshadow triggered
3435
* in write emulation normally requires a re-sync with guest page
3436
* table to recover r/w permission. Incorrect record for such case
3437
* will cause unexpected more shadow faults due to propagation is
3440
if ( (r == X86EMUL_OKAY) && sh_mfn_is_a_page_table(gmfn) )
3444
v->arch.paging.shadow.last_emulated_frame = va >> PAGE_SHIFT;
3445
v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn);
3446
v->arch.paging.last_write_emul_ok = 1;
3449
else if ( fast_emul )
3450
v->arch.paging.last_write_emul_ok = 0;
3453
#if GUEST_PAGING_LEVELS == 3 /* PAE guest */
3454
if ( r == X86EMUL_OKAY ) {
3455
int i, emulation_count=0;
3456
this_cpu(trace_emulate_initial_va) = va;
3457
/* Emulate up to four extra instructions in the hope of catching
3458
* the "second half" of a 64-bit pagetable write. */
3459
for ( i = 0 ; i < 4 ; i++ )
3461
shadow_continue_emulation(&emul_ctxt, regs);
3462
v->arch.paging.last_write_was_pt = 0;
3463
r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
3464
if ( r == X86EMUL_OKAY )
3467
if ( v->arch.paging.last_write_was_pt )
3469
perfc_incr(shadow_em_ex_pt);
3470
TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN);
3471
break; /* Don't emulate past the other half of the write */
3474
perfc_incr(shadow_em_ex_non_pt);
3478
perfc_incr(shadow_em_ex_fail);
3479
TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_LAST_FAILED);
3480
break; /* Don't emulate again if we failed! */
3483
this_cpu(trace_extra_emulation_count)=emulation_count;
3485
#endif /* PAE guest */
3487
trace_shadow_emulate(gw.l1e, va);
3489
SHADOW_PRINTK("emulated\n");
3490
return EXCRET_fault_fixed;
3493
if ( !guest_mode(regs) )
3494
goto not_a_shadow_fault;
3495
perfc_incr(shadow_fault_mmio);
3496
sh_audit_gw(v, &gw);
3497
SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
3498
shadow_audit_tables(v);
3499
reset_early_unshadow(v);
3501
trace_shadow_gen(TRC_SHADOW_MMIO, va);
3502
return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
3503
? EXCRET_fault_fixed : 0);
3506
sh_audit_gw(v, &gw);
3507
SHADOW_PRINTK("not a shadow fault\n");
3508
shadow_audit_tables(v);
3509
reset_early_unshadow(v);
3513
trace_not_shadow_fault(gw.l1e, va);
3520
sh_invlpg(struct vcpu *v, unsigned long va)
3521
/* Called when the guest requests an invlpg. Returns 1 if the invlpg
3522
* instruction should be issued on the hardware, or 0 if it's safe not
3528
perfc_incr(shadow_invlpg);
3530
#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
3531
/* No longer safe to use cached gva->gfn translations */
3535
#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
3536
v->arch.paging.last_write_emul_ok = 0;
3539
/* First check that we can safely read the shadow l2e. SMP/PAE linux can
3540
* run as high as 6% of invlpg calls where we haven't shadowed the l2
3542
#if SHADOW_PAGING_LEVELS == 4
3545
if ( !(shadow_l4e_get_flags(
3546
sh_linear_l4_table(v)[shadow_l4_linear_offset(va)])
3549
/* This must still be a copy-from-user because we don't have the
3550
* shadow lock, and the higher-level shadows might disappear
3551
* under our feet. */
3552
if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v)
3553
+ shadow_l3_linear_offset(va)),
3554
sizeof (sl3e)) != 0 )
3556
perfc_incr(shadow_invlpg_fault);
3559
    if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) )
3562
#else /* SHADOW_PAGING_LEVELS == 3 */
3563
if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(va)])
3565
// no need to flush anything if there's no SL2...
3569
/* This must still be a copy-from-user because we don't have the shadow
3570
* lock, and the higher-level shadows might disappear under our feet. */
3571
if ( __copy_from_user(&sl2e,
3572
sh_linear_l2_table(v) + shadow_l2_linear_offset(va),
3573
sizeof (sl2e)) != 0 )
3575
perfc_incr(shadow_invlpg_fault);
3579
// If there's nothing shadowed for this particular sl2e, then
3580
// there is no need to do an invlpg, either...
3582
if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3585
// Check to see if the SL2 is a splintered superpage...
3586
// If so, then we'll need to flush the entire TLB (because that's
3587
// easier than invalidating all of the individual 4K pages).
3589
sl1mfn = shadow_l2e_get_mfn(sl2e);
3590
if ( mfn_to_page(sl1mfn)->u.sh.type
3591
== SH_type_fl1_shadow )
3597
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
3598
/* Check to see if the SL1 is out of sync. */
3600
mfn_t gl1mfn = backpointer(mfn_to_page(sl1mfn));
3601
struct page_info *pg = mfn_to_page(gl1mfn);
3602
if ( mfn_valid(gl1mfn)
3603
&& page_is_out_of_sync(pg) )
3605
/* The test above may give false positives, since we don't
3606
* hold the shadow lock yet. Check again with the lock held. */
3607
shadow_lock(v->domain);
3609
/* This must still be a copy-from-user because we didn't
3610
* have the shadow lock last time we checked, and the
3611
* higher-level shadows might have disappeared under our
3613
if ( __copy_from_user(&sl2e,
3614
sh_linear_l2_table(v)
3615
+ shadow_l2_linear_offset(va),
3616
sizeof (sl2e)) != 0 )
3618
perfc_incr(shadow_invlpg_fault);
3619
shadow_unlock(v->domain);
3623
if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
3625
shadow_unlock(v->domain);
3629
sl1mfn = shadow_l2e_get_mfn(sl2e);
3630
gl1mfn = backpointer(mfn_to_page(sl1mfn));
3631
pg = mfn_to_page(gl1mfn);
3633
if ( likely(sh_mfn_is_a_page_table(gl1mfn)
3634
&& page_is_out_of_sync(pg) ) )
3637
sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
3638
/* Remove the shadow entry that maps this VA */
3639
(void) shadow_set_l1e(v, sl1, shadow_l1e_empty(),
3640
p2m_invalid, sl1mfn);
3642
shadow_unlock(v->domain);
3643
/* Need the invlpg, to pick up the disappeareance of the sl1e */
3653
static unsigned long
sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
/* Called to translate a guest virtual address to what the *guest*
 * pagetables would map it to. */
#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
    /* Check the vTLB cache first */
    unsigned long vtlb_gfn = vtlb_lookup(v, va, pfec[0]);
    if ( VALID_GFN(vtlb_gfn) )
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */

    if ( sh_walk_guest_tables(v, va, &gw, pfec[0]) != 0 )
        if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
            pfec[0] &= ~PFEC_page_present;

    gfn = guest_walk_to_gfn(&gw);

#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
    /* Remember this successful VA->GFN translation for later. */
    vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn), pfec[0]);
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
sh_update_linear_entries(struct vcpu *v)
/* Sync up all the linear mappings for this vcpu's pagetables */
    struct domain *d = v->domain;

    /* Linear pagetables in PV guests
     * ------------------------------
     *
     * Guest linear pagetables, which map the guest pages, are at
     * LINEAR_PT_VIRT_START.  Shadow linear pagetables, which map the
     * shadows, are at SH_LINEAR_PT_VIRT_START.  Most of the time these
     * are set up at shadow creation time, but (of course!) the PAE case
     * is subtler.  Normal linear mappings are made by having an entry
     * in the top-level table that points to itself (shadow linear) or
     * to the guest top-level table (guest linear).  For PAE, to set up
     * a linear map requires us to copy the four top-level entries into
     * level-2 entries.  That means that every time we change a PAE l3e,
     * we need to reflect the change into the copy.
     *
     * Linear pagetables in HVM guests
     * -------------------------------
     *
     * For HVM guests, the linear pagetables are installed in the monitor
     * tables (since we can't put them in the shadow).  Shadow linear
     * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
     * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
     * a linear pagetable of the monitor tables themselves.  We have
     * the same issue of having to re-copy PAE l3 entries whenever we use
     *
     * Because HVM guests run on the same monitor tables regardless of the
     * shadow tables in use, the linear mapping of the shadow tables has to
     * be updated every time v->arch.shadow_table changes.
     */
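    /* (Added commentary, not in the original source.)  Concretely, the
     * self-referencing SH_LINEAR slot is what makes expressions used
     * elsewhere in this file, such as
     *     sh_linear_l1_table(v) + shadow_l1_linear_offset(va)
     * resolve to the shadow l1e that covers va: walking through that slot
     * substitutes one level of the shadow tree for the page level, so
     * ordinary loads and stores reach shadow entries without a
     * map_domain_page() call. */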
/* Don't try to update the monitor table if it doesn't exist */
3722
if ( shadow_mode_external(d)
3723
&& pagetable_get_pfn(v->arch.monitor_table) == 0 )
3726
#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
3728
/* For PV, one l4e points at the guest l4, one points at the shadow
3729
* l4. No maintenance required.
3730
* For HVM, just need to update the l4e that points to the shadow l4. */
3732
if ( shadow_mode_external(d) )
3734
/* Use the linear map if we can; otherwise make a new mapping */
3737
__linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
3738
l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3744
ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3745
ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
3746
l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]),
3748
sh_unmap_domain_page(ml4e);
3752
#elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
3756
* HVM: To give ourselves a linear map of the shadows, we need to
3757
* extend a PAE shadow to 4 levels. We do this by having a monitor
3758
* l3 in slot 0 of the monitor l4 table, and copying the PAE l3
3759
* entries into it. Then, by having the monitor l4e for shadow
3760
* pagetables also point to the monitor l4, we can use it to access
3764
if ( shadow_mode_external(d) )
3766
/* Install copies of the shadow l3es into the monitor l2 table
3767
* that maps SH_LINEAR_PT_VIRT_START. */
3772
/* Use linear mappings if we can; otherwise make new mappings */
3774
ml2e = __linear_l2_table
3775
+ l2_linear_offset(SH_LINEAR_PT_VIRT_START);
3781
int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START);
3782
ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
3784
ASSERT(l4e_get_flags(ml4e[linear_slot]) & _PAGE_PRESENT);
3785
l3mfn = _mfn(l4e_get_pfn(ml4e[linear_slot]));
3786
ml3e = sh_map_domain_page(l3mfn);
3787
sh_unmap_domain_page(ml4e);
3789
ASSERT(l3e_get_flags(ml3e[0]) & _PAGE_PRESENT);
3790
l2mfn = _mfn(l3e_get_pfn(ml3e[0]));
3791
ml2e = sh_map_domain_page(l2mfn);
3792
sh_unmap_domain_page(ml3e);
3795
/* Shadow l3 tables are made up by sh_update_cr3 */
3796
sl3e = v->arch.paging.shadow.l3table;
3798
for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3801
(shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
3802
? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
3808
sh_unmap_domain_page(ml2e);
3811
domain_crash(d); /* XXX */
3813
#elif CONFIG_PAGING_LEVELS == 3
3815
/* PV: need to copy the guest's l3 entries into the guest-linear-map l2
3816
* entries in the shadow, and the shadow's l3 entries into the
3817
* shadow-linear-map l2 entries in the shadow. This is safe to do
3818
* because Xen does not let guests share high-slot l2 tables between l3s,
3819
* so we know we're not treading on anyone's toes.
3821
* HVM: need to copy the shadow's l3 entries into the
3822
* shadow-linear-map l2 entries in the monitor table. This is safe
3823
* because we have one monitor table for each vcpu. The monitor's
3824
* own l3es don't need to be copied because they never change.
3825
* XXX That might change if we start stuffing things into the rest
3826
* of the monitor's virtual address space.
3829
l2_pgentry_t *l2e, new_l2e;
3830
shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
3834
#if GUEST_PAGING_LEVELS == 2
3836
/* Shadow l3 tables were built by sh_update_cr3 */
3837
BUG_ON(!shadow_mode_external(d)); /* PV 2-on-3 is unsupported */
3838
shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3840
#else /* GUEST_PAGING_LEVELS == 3 */
3842
shadow_l3e = (shadow_l3e_t *)&v->arch.paging.shadow.l3table;
3843
guest_l3e = (guest_l3e_t *)&v->arch.paging.shadow.gl3e;
3845
#endif /* GUEST_PAGING_LEVELS */
3847
/* Choose where to write the entries, using linear maps if possible */
3848
if ( shadow_mode_external(d) )
3852
/* From the monitor tables, it's safe to use linear maps
3853
* to update monitor l2s */
3854
l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
3858
/* Map the monitor table's high l2 */
3860
l3e = sh_map_domain_page(
3861
pagetable_get_mfn(v->arch.monitor_table));
3862
ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
3863
l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
3865
sh_unmap_domain_page(l3e);
3870
/* Map the shadow table's high l2 */
3871
ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
3872
l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
3876
/* Write linear mapping of guest (only in PV, and only when
3877
* not translated). */
3878
if ( !shadow_mode_translate(d) )
3880
for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3883
((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
3884
? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
3888
&l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
3893
/* Write linear mapping of shadow. */
3894
for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
3896
new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
3897
? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
3901
&l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
3906
sh_unmap_domain_page(l2e);
3910
#error this should not happen
3913
if ( shadow_mode_external(d) )
3916
* Having modified the linear pagetable mapping, flush local host TLBs.
3917
* This was not needed when vmenter/vmexit always had the side effect
3918
* of flushing host TLBs but, with ASIDs, it is possible to finish
3919
* this CR3 update, vmenter the guest, vmexit due to a page fault,
3920
* without an intervening host TLB flush. Then the page fault code
3921
* could use the linear pagetable to read a top-level shadow page
3922
* table entry. But, without this change, it would fetch the wrong
3923
* value due to a stale TLB.
/* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[].
 * Does all appropriate management/bookkeeping/refcounting/etc... */
sh_detach_old_tables(struct vcpu *v)
    //// vcpu->arch.paging.shadow.guest_vtable
#if GUEST_PAGING_LEVELS == 3
    /* PAE guests don't have a mapping of the guest top-level table */
    ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
    if ( v->arch.paging.shadow.guest_vtable )
        struct domain *d = v->domain;
        if ( shadow_mode_external(d) || shadow_mode_translate(d) )
            sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
        v->arch.paging.shadow.guest_vtable = NULL;

    //// vcpu->arch.shadow_table[]
#if GUEST_PAGING_LEVELS == 3
    /* PAE guests have four shadow_table entries */
    for ( i = 0 ; i < 4 ; i++ )
        smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
        sh_put_ref(v, smfn, 0);
        v->arch.shadow_table[i] = pagetable_null();
/* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
3975
sh_set_toplevel_shadow(struct vcpu *v,
3978
unsigned int root_type)
3981
pagetable_t old_entry, new_entry;
3983
struct domain *d = v->domain;
3985
/* Remember the old contents of this slot */
3986
old_entry = v->arch.shadow_table[slot];
3988
/* Now figure out the new contents: is this a valid guest MFN? */
3989
if ( !mfn_valid(gmfn) )
3991
new_entry = pagetable_null();
3992
goto install_new_entry;
3995
/* Guest mfn is valid: shadow it and install the shadow */
3996
smfn = get_shadow_status(v, gmfn, root_type);
3997
if ( !mfn_valid(smfn) )
3999
/* Make sure there's enough free shadow memory. */
4000
shadow_prealloc(d, root_type, 1);
4001
/* Shadow the page. */
4002
smfn = sh_make_shadow(v, gmfn, root_type);
4004
ASSERT(mfn_valid(smfn));
4006
/* Pin the shadow and put it (back) on the list of pinned shadows */
4007
if ( sh_pin(v, smfn) == 0 )
4009
SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
4010
domain_crash(v->domain);
4013
/* Take a ref to this page: it will be released in sh_detach_old_tables()
4014
* or the next call to set_toplevel_shadow() */
4015
if ( !sh_get_ref(v, smfn, 0) )
4017
SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
4018
domain_crash(v->domain);
4021
new_entry = pagetable_from_mfn(smfn);
4024
/* Done. Install it */
4025
SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n",
4026
GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot,
4027
mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
4028
v->arch.shadow_table[slot] = new_entry;
4030
/* Decrement the refcount of the old contents of this slot */
4031
if ( !pagetable_is_null(old_entry) ) {
4032
mfn_t old_smfn = pagetable_get_mfn(old_entry);
4033
/* Need to repin the old toplevel shadow if it's been unpinned
4034
* by shadow_prealloc(): in PV mode we're still running on this
4035
* shadow and it's not safe to free it yet. */
4036
if ( !mfn_to_page(old_smfn)->u.sh.pinned && !sh_pin(v, old_smfn) )
4038
SHADOW_ERROR("can't re-pin %#lx\n", mfn_x(old_smfn));
4039
domain_crash(v->domain);
4041
sh_put_ref(v, old_smfn, 0);
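/* Example caller (see sh_update_cr3() below): a PAE guest gets one
 * toplevel shadow per guest l3 slot, e.g.
 *     sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3) ? SH_type_l2h_shadow
 *                                                   : SH_type_l2_shadow);
 * while 2-level and 4-level guests use only slot 0. */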
sh_update_cr3(struct vcpu *v, int do_locking)
/* Updates vcpu->arch.cr3 after the guest has changed CR3.
 * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
 * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works;
 * this function will call hvm_update_guest_cr(v, 3) to tell them where the
 * shadow tables are.
 * If do_locking != 0, assume we are being called from outside the
 * shadow code, and must take and release the shadow lock; otherwise
 * that is the caller's responsibility.
struct domain *d = v->domain;
#if GUEST_PAGING_LEVELS == 3
/* Don't do anything on an uninitialised vcpu */
if ( !is_hvm_domain(d) && !v->is_initialised )
ASSERT(v->arch.cr3 == 0);
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* Need to resync all the shadow entries on a TLB flush. Resync
 * the current vcpu's OOS pages before switching to the new shadow
 * tables so that the VA hint is still valid. */
shadow_resync_current_vcpu(v, do_locking);
if ( do_locking ) shadow_lock(v->domain);
ASSERT(shadow_locked_by_me(v->domain));
ASSERT(v->arch.paging.mode);
//// vcpu->arch.guest_table is already set
/* Double-check that the HVM code has sent us a sane guest_table */
if ( is_hvm_domain(d) )
ASSERT(shadow_mode_external(d));
if ( hvm_paging_enabled(v) )
ASSERT(pagetable_get_pfn(v->arch.guest_table));
ASSERT(v->arch.guest_table.pfn
== d->arch.paging.shadow.unpaged_pagetable.pfn);
SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
d->domain_id, v->vcpu_id,
(unsigned long)pagetable_get_pfn(v->arch.guest_table));
#if GUEST_PAGING_LEVELS == 4
if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32on64_vcpu(v) )
gmfn = pagetable_get_mfn(v->arch.guest_table_user);
gmfn = pagetable_get_mfn(v->arch.guest_table);
//// vcpu->arch.paging.shadow.guest_vtable
#if GUEST_PAGING_LEVELS == 4
if ( shadow_mode_external(d) || shadow_mode_translate(d) )
if ( v->arch.paging.shadow.guest_vtable )
sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
/* PAGING_LEVELS==4 implies 64-bit, which means that
 * map_domain_page_global can't fail */
BUG_ON(v->arch.paging.shadow.guest_vtable == NULL);
v->arch.paging.shadow.guest_vtable = __linear_l4_table;
#elif GUEST_PAGING_LEVELS == 3
/* On PAE guests we don't use a mapping of the guest's own top-level
 * table. We cache the current state of that table and shadow that,
 * until the next CR3 write makes us refresh our cache. */
ASSERT(v->arch.paging.shadow.guest_vtable == NULL);
if ( shadow_mode_external(d) )
/* Find where in the page the l3 table is */
guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]);
/* PV guest: l3 is at the start of a page */
// Ignore the low 2 bits of guest_idx -- they are really just
gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx;
for ( i = 0; i < 4 ; i++ )
v->arch.paging.shadow.gl3e[i] = gl3e[i];
sh_unmap_domain_page(gl3e);
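/* From here on, the four cached l3es in v->arch.paging.shadow.gl3e stand
 * in for the guest's top-level table: the shadows installed below are
 * built from this snapshot, so (as the comment above says) a guest write
 * to its l3 only takes effect at the next CR3 load. */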
#elif GUEST_PAGING_LEVELS == 2
if ( shadow_mode_external(d) || shadow_mode_translate(d) )
if ( v->arch.paging.shadow.guest_vtable )
sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn);
/* Does this really need map_domain_page_global? Handle the
 * error properly if so. */
BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */
v->arch.paging.shadow.guest_vtable = __linear_l2_table;
#error this should never happen
//// vcpu->arch.shadow_table[]
/* We revoke write access to the new guest toplevel page(s) before we
 * replace the old shadow pagetable(s), so that we can safely use the
 * (old) shadow linear maps in the writeable mapping heuristics. */
#if GUEST_PAGING_LEVELS == 2
if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 )
flush_tlb_mask(&v->domain->domain_dirty_cpumask);
sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow);
#elif GUEST_PAGING_LEVELS == 3
/* PAE guests have four shadow_table entries, based on the
 * current values of the guest's four l3es. */
guest_l3e_t *gl3e = (guest_l3e_t*)&v->arch.paging.shadow.gl3e;
/* First, make all four entries read-only. */
for ( i = 0; i < 4; i++ )
if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
gl2gfn = guest_l3e_get_gfn(gl3e[i]);
gl2mfn = gfn_to_mfn_query(d, gl2gfn, &p2mt);
if ( p2m_is_ram(p2mt) )
flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
flush_tlb_mask(&v->domain->domain_dirty_cpumask);
/* Now install the new shadows. */
for ( i = 0; i < 4; i++ )
if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
gl2gfn = guest_l3e_get_gfn(gl3e[i]);
gl2mfn = gfn_to_mfn_query(d, gl2gfn, &p2mt);
if ( p2m_is_ram(p2mt) )
sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
? SH_type_l2h_shadow
: SH_type_l2_shadow);
sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0);
#elif GUEST_PAGING_LEVELS == 4
if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 )
flush_tlb_mask(&v->domain->domain_dirty_cpumask);
sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow);
#error This should never happen
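/* For 3-level shadows the top level is not a shadow page at all: it is
 * the 4-entry PAE l3 embedded in the vcpu (v->arch.paging.shadow.l3table),
 * rebuilt below from shadow_table[].  A PAE top level only needs 32-byte
 * alignment, which is why CR3 can point straight at it further down. */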
/// v->arch.paging.shadow.l3table
#if SHADOW_PAGING_LEVELS == 3
for ( i = 0; i < 4; i++ )
#if GUEST_PAGING_LEVELS == 2
/* 2-on-3: make a PAE l3 that points at the four-page l2 */
smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[0]) + i);
/* 3-on-3: make a PAE l3 that points at the four l2 pages */
smfn = pagetable_get_mfn(v->arch.shadow_table[i]);
v->arch.paging.shadow.l3table[i] =
? shadow_l3e_empty()
: shadow_l3e_from_mfn(smfn, _PAGE_PRESENT);
#endif /* SHADOW_PAGING_LEVELS == 3 */
if ( shadow_mode_external(d) )
make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
else // not shadow_mode_external...
/* We don't support PV except guest == shadow == config levels */
BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
#if SHADOW_PAGING_LEVELS == 3
/* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated.
 * Don't use make_cr3 because (a) we know it's below 4GB, and
 * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */
ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL);
v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table);
/* 4-on-4: Just use the shadow top-level directly */
make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0]));
/// v->arch.hvm_vcpu.hw_cr[3]
if ( shadow_mode_external(d) )
ASSERT(is_hvm_domain(d));
#if SHADOW_PAGING_LEVELS == 3
/* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */
v->arch.hvm_vcpu.hw_cr[3] =
virt_to_maddr(&v->arch.paging.shadow.l3table);
/* 4-on-4: Just use the shadow top-level directly */
v->arch.hvm_vcpu.hw_cr[3] =
pagetable_get_paddr(v->arch.shadow_table[0]);
hvm_update_guest_cr(v, 3);
/* Fix up the linear pagetable mappings */
sh_update_linear_entries(v);
#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
/* No longer safe to use cached gva->gfn translations */
#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
v->arch.paging.last_write_emul_ok = 0;
/* Release the lock, if we took it (otherwise it's the caller's problem) */
if ( do_locking ) shadow_unlock(v->domain);
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* Need to resync all the shadow entries on a TLB flush. We only
 * update the shadows, leaving the pages out of sync. Also, we try
 * to skip synchronization of shadows not mapped in the new
shadow_sync_other_vcpus(v, do_locking);
/**************************************************************************/
/* Functions to revoke guest rights */
#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
mfn_t smfn, unsigned long off)
shadow_l1e_t *sl1p, sl1e;
struct page_info *sp;
ASSERT(mfn_valid(gmfn));
ASSERT(mfn_valid(smfn));
sp = mfn_to_page(smfn);
if ( ((sp->count_info & PGC_count_mask) != 0)
|| (sp->u.sh.type != SH_type_l1_shadow
&& sp->u.sh.type != SH_type_fl1_shadow) )
sl1p = sh_map_domain_page(smfn);
if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
!= (_PAGE_PRESENT|_PAGE_RW))
|| (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
sh_unmap_domain_page(sl1p);
/* Found it! Need to remove its write permissions. */
sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
r = shadow_set_l1e(v, sl1p, sl1e, p2m_ram_rw, smfn);
ASSERT( !(r & SHADOW_SET_ERROR) );
sh_unmap_domain_page(sl1p);
perfc_incr(shadow_writeable_h_7);
perfc_incr(shadow_writeable_h_8);
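/* Fast-path removal of a writeable mapping: the caller (the out-of-sync
 * machinery, given the #if above) already knows which shadow l1 page and
 * which offset within it should map gmfn, so we go straight to that sl1e
 * instead of searching.  The checks above bail out if the remembered
 * location has been freed or retyped, or no longer maps gmfn read-write. */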
#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
/* Look up this vaddr in the current shadow and see if it's a writeable
 * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
shadow_l1e_t sl1e, *sl1p;
#if SHADOW_PAGING_LEVELS >= 4
/* Carefully look in the shadow linear map for the l1e we expect */
#if SHADOW_PAGING_LEVELS >= 4
sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
#else /* SHADOW_PAGING_LEVELS == 3 */
sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table)
+ shadow_l3_linear_offset(vaddr);
if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
!= (_PAGE_PRESENT|_PAGE_RW))
|| (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
/* Found it! Need to remove its write permissions. */
sl1mfn = shadow_l2e_get_mfn(*sl2p);
sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
r = shadow_set_l1e(v, sl1p, sl1e, p2m_ram_rw, sl1mfn);
if ( r & SHADOW_SET_ERROR ) {
/* Can only currently happen if we found a grant-mapped
 * page. Just make the guess fail. */
TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND);
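/* This is the writeable-mapping heuristic: rather than search every
 * shadow, the caller supplies a virtual address at which the guest is
 * likely to have mapped the page, and we simply walk the shadow linear
 * map at that address.  A correct guess removes the writeable mapping
 * with a handful of memory accesses instead of a brute-force sweep. */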
int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn,
/* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */
SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
flags = shadow_l1e_get_flags(*sl1e);
if ( (flags & _PAGE_PRESENT)
&& (flags & _PAGE_RW)
&& (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW);
(void) shadow_set_l1e(v, sl1e, ro_sl1e, p2m_ram_rw, sl1mfn);
#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
/* Remember the last shadow that we shot a writeable mapping in */
v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn);
if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
& PGT_count_mask) == 0 )
/* This breaks us cleanly out of the FOREACH macro */
int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
/* Excises all mappings to guest frame from this shadow l1 table */
SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done,
flags = shadow_l1e_get_flags(*sl1e);
if ( (flags & _PAGE_PRESENT)
&& (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
(void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(),
p2m_invalid, sl1mfn);
if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
/* This breaks us cleanly out of the FOREACH macro */
/**************************************************************************/
/* Functions to excise all pointers to shadows from higher-level shadows. */
void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
/* Blank out a single shadow entry */
switch ( mfn_to_page(smfn)->u.sh.type )
case SH_type_l1_shadow:
(void) shadow_set_l1e(v, ep, shadow_l1e_empty(), p2m_invalid, smfn);
case SH_type_l2_shadow:
#if GUEST_PAGING_LEVELS >= 3
case SH_type_l2h_shadow:
(void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn);
#if GUEST_PAGING_LEVELS >= 4
case SH_type_l3_shadow:
(void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn);
case SH_type_l4_shadow:
(void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn);
default: BUG(); /* Called with the wrong kind of shadow. */
int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
/* Remove all mappings of this l1 shadow from this l2 shadow */
SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, v->domain,
flags = shadow_l2e_get_flags(*sl2e);
if ( (flags & _PAGE_PRESENT)
&& (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
(void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
if ( mfn_to_page(sl1mfn)->u.sh.type == 0 )
/* This breaks us cleanly out of the FOREACH macro */
#if GUEST_PAGING_LEVELS >= 4
int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
/* Remove all mappings of this l2 shadow from this l3 shadow */
SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done,
flags = shadow_l3e_get_flags(*sl3e);
if ( (flags & _PAGE_PRESENT)
&& (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
(void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
if ( mfn_to_page(sl2mfn)->u.sh.type == 0 )
/* This breaks us cleanly out of the FOREACH macro */
int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
/* Remove all mappings of this l3 shadow from this l4 shadow */
SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, v->domain,
flags = shadow_l4e_get_flags(*sl4e);
if ( (flags & _PAGE_PRESENT)
&& (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
(void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
if ( mfn_to_page(sl3mfn)->u.sh.type == 0 )
/* This breaks us cleanly out of the FOREACH macro */
#endif /* 64bit guest */
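/* These helpers are the other half of unhooking: given a shadow that is
 * being destroyed, they sweep the shadow one level up and clear any
 * entries that still point at it.  Each loop ends early (via 'done') once
 * the target shadow's type has gone to zero, i.e. it has been freed and
 * the sweep can stop. */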
/**************************************************************************/
/* Handling HVM guest writes to pagetables */
/* Translate a VA to an MFN, injecting a page-fault if we fail */
#define BAD_GVA_TO_GFN (~0UL)
#define BAD_GFN_TO_MFN (~1UL)
#define READONLY_GFN (~2UL)
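/* These values are returned from emulate_gva_to_mfn() wrapped in _mfn():
 * they are deliberately invalid MFNs, so mfn_valid() fails on them and
 * emulate_map_dest() below can tell the three failure cases apart when it
 * picks a MAPPING_* return value. */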
static mfn_t emulate_gva_to_mfn(struct vcpu *v,
unsigned long vaddr,
struct sh_emulate_ctxt *sh_ctxt)
uint32_t pfec = PFEC_page_present | PFEC_write_access;
/* Translate the VA to a GFN */
gfn = sh_gva_to_gfn(v, vaddr, &pfec);
if ( gfn == INVALID_GFN )
if ( is_hvm_vcpu(v) )
hvm_inject_exception(TRAP_page_fault, pfec, vaddr);
propagate_page_fault(vaddr, pfec);
return _mfn(BAD_GVA_TO_GFN);
/* Translate the GFN to an MFN */
/* PoD: query only if shadow lock is held (to avoid deadlock) */
if ( shadow_locked_by_me(v->domain) )
mfn = gfn_to_mfn_query(v->domain, _gfn(gfn), &p2mt);
mfn = gfn_to_mfn(v->domain, _gfn(gfn), &p2mt);
if ( p2m_is_readonly(p2mt) )
return _mfn(READONLY_GFN);
if ( !p2m_is_ram(p2mt) )
return _mfn(BAD_GFN_TO_MFN);
ASSERT(mfn_valid(mfn));
v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn);
/* Check that the user is allowed to perform this write.
 * Returns a mapped pointer to write to, or one of the MAPPING_* error
 * values below (tested with emulate_map_dest_failed()). */
#define MAPPING_UNHANDLEABLE ((void *)(unsigned long)X86EMUL_UNHANDLEABLE)
#define MAPPING_EXCEPTION ((void *)(unsigned long)X86EMUL_EXCEPTION)
#define MAPPING_SILENT_FAIL ((void *)(unsigned long)X86EMUL_OKAY)
#define emulate_map_dest_failed(rc) ((unsigned long)(rc) <= 3)
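/* The MAPPING_* values are small integers cast to pointers, so a failed
 * mapping is distinguishable from a real mapped address purely by
 * magnitude (that is what emulate_map_dest_failed() tests), and the value
 * itself encodes the X86EMUL_* code the caller should return. */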
static void *emulate_map_dest(struct vcpu *v,
unsigned long vaddr,
struct sh_emulate_ctxt *sh_ctxt)
unsigned long offset;
sh_ctxt->mfn1 = emulate_gva_to_mfn(v, vaddr, sh_ctxt);
if ( !mfn_valid(sh_ctxt->mfn1) )
return ((mfn_x(sh_ctxt->mfn1) == BAD_GVA_TO_GFN) ?
(mfn_x(sh_ctxt->mfn1) == READONLY_GFN) ?
MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
/* We don't emulate user-mode writes to page tables */
if ( hvm_get_seg_reg(x86_seg_ss, sh_ctxt)->attr.fields.dpl == 3 )
gdprintk(XENLOG_DEBUG, "User-mode write to pagetable reached "
"emulate_map_dest(). This should never happen!\n");
return MAPPING_UNHANDLEABLE;
/* Unaligned writes probably mean this isn't a pagetable */
if ( vaddr & (bytes - 1) )
sh_remove_shadows(v, sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ );
if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) )
/* Whole write fits on a single page */
sh_ctxt->mfn2 = _mfn(INVALID_MFN);
map = sh_map_domain_page(sh_ctxt->mfn1) + (vaddr & ~PAGE_MASK);
/* Cross-page emulated writes are only supported for HVM guests;
 * PV guests ought to know better */
if ( !is_hvm_vcpu(v) )
return MAPPING_UNHANDLEABLE;
/* This write crosses a page boundary. Translate the second page */
sh_ctxt->mfn2 = emulate_gva_to_mfn(v, (vaddr + bytes - 1) & PAGE_MASK,
if ( !mfn_valid(sh_ctxt->mfn2) )
return ((mfn_x(sh_ctxt->mfn2) == BAD_GVA_TO_GFN) ?
(mfn_x(sh_ctxt->mfn2) == READONLY_GFN) ?
MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
/* Cross-page writes probably mean this isn't a pagetable */
sh_remove_shadows(v, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ );
/* Hack: we map the pages into the vcpu's LDT space, since we
 * know that we're not going to need the LDT for HVM guests,
 * and only HVM guests are allowed unaligned writes. */
ASSERT(is_hvm_vcpu(v));
map = (void *)LDT_VIRT_START(v);
offset = l1_linear_offset((unsigned long) map);
l1e_write(&__linear_l1_table[offset],
l1e_from_pfn(mfn_x(sh_ctxt->mfn1), __PAGE_HYPERVISOR));
l1e_write(&__linear_l1_table[offset + 1],
l1e_from_pfn(mfn_x(sh_ctxt->mfn2), __PAGE_HYPERVISOR));
map += (vaddr & ~PAGE_MASK);
#if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
/* Remember if the bottom bit was clear, so we can choose not to run
 * the change through the verify code if it's still clear afterwards */
sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT);
/* Tidy up after the emulated write: mark pages dirty, verify the new
 * contents, and undo the mapping */
static void emulate_unmap_dest(struct vcpu *v,
struct sh_emulate_ctxt *sh_ctxt)
u32 b1 = bytes, b2 = 0, shflags;
ASSERT(mfn_valid(sh_ctxt->mfn1));
/* If we are writing lots of PTE-aligned zeros, might want to unshadow */
if ( likely(bytes >= 4)
&& (*(u32 *)addr == 0)
&& ((unsigned long) addr & ((sizeof (guest_intpte_t)) - 1)) == 0 )
check_for_early_unshadow(v, sh_ctxt->mfn1);
reset_early_unshadow(v);
/* We can avoid re-verifying the page contents after the write if:
 * - it was no larger than the PTE type of this pagetable;
 * - it was aligned to the PTE boundaries; and
 * - _PAGE_PRESENT was clear before and after the write. */
shflags = mfn_to_page(sh_ctxt->mfn1)->shadow_flags;
#if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY)
if ( sh_ctxt->low_bit_was_clear
&& !(*(u8 *)addr & _PAGE_PRESENT)
&& ((!(shflags & SHF_32)
/* Not shadowed 32-bit: aligned 64-bit writes that leave
 * the present bit unset are safe to ignore. */
&& ((unsigned long)addr & 7) == 0
(!(shflags & (SHF_PAE|SHF_64))
/* Not shadowed PAE/64-bit: aligned 32-bit writes that
 * leave the present bit unset are safe to ignore. */
&& ((unsigned long)addr & 3) == 0
/* Writes with this alignment constraint can't possibly cross pages */
ASSERT(!mfn_valid(sh_ctxt->mfn2));
#endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */
if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
/* Validate as two writes, one to each page */
b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK);
if ( likely(b1 > 0) )
sh_validate_guest_pt_write(v, sh_ctxt->mfn1, addr, b1);
if ( unlikely(b2 > 0) )
sh_validate_guest_pt_write(v, sh_ctxt->mfn2, addr + b1, b2);
paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn1));
if ( unlikely(mfn_valid(sh_ctxt->mfn2)) )
unsigned long offset;
paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn2));
/* Undo the hacky two-frame contiguous map. */
ASSERT(((unsigned long) addr & PAGE_MASK) == LDT_VIRT_START(v));
offset = l1_linear_offset((unsigned long) addr);
l1e_write(&__linear_l1_table[offset], l1e_empty());
l1e_write(&__linear_l1_table[offset + 1], l1e_empty());
sh_unmap_domain_page(addr);
atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
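/* Presumably this version bump lets other paths notice that guest
 * pagetable contents have just been modified by emulation. */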
sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
/* Unaligned writes are only acceptable on HVM */
if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
return X86EMUL_UNHANDLEABLE;
addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
if ( emulate_map_dest_failed(addr) )
shadow_lock(v->domain);
memcpy(addr, src, bytes);
#if GUEST_PAGING_LEVELS == 3
if ( vaddr == this_cpu(trace_emulate_initial_va) )
memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
else if ( (vaddr & ~(0x7UL)) == this_cpu(trace_emulate_initial_va) )
TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATE_FULL_PT);
memcpy(&this_cpu(trace_emulate_write_val),
(void *)(((unsigned long) addr) & ~(0x7UL)), GUEST_PTE_SIZE);
memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
emulate_unmap_dest(v, addr, bytes, sh_ctxt);
shadow_audit_tables(v);
shadow_unlock(v->domain);
return X86EMUL_OKAY;
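/* The write path above is the template for the cmpxchg variants below:
 * map the destination with emulate_map_dest(), take the shadow lock,
 * perform the access, then let emulate_unmap_dest() validate the new
 * guest-pagetable contents, mark the page dirty and undo the mapping
 * before the lock is released. */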
sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
unsigned long old, unsigned long new,
unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
int rv = X86EMUL_OKAY;
/* Unaligned writes are only acceptable on HVM */
if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) )
return X86EMUL_UNHANDLEABLE;
addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt);
if ( emulate_map_dest_failed(addr) )
shadow_lock(v->domain);
case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
rv = X86EMUL_CMPXCHG_FAILED;
SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
" wanted %#lx now %#lx bytes %u\n",
vaddr, prev, old, new, *(unsigned long *)addr, bytes);
emulate_unmap_dest(v, addr, bytes, sh_ctxt);
shadow_audit_tables(v);
shadow_unlock(v->domain);
sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
unsigned long old_lo, unsigned long old_hi,
unsigned long new_lo, unsigned long new_hi,
struct sh_emulate_ctxt *sh_ctxt)
int rv = X86EMUL_OKAY;
/* Unaligned writes are only acceptable on HVM */
if ( (vaddr & 7) && !is_hvm_vcpu(v) )
return X86EMUL_UNHANDLEABLE;
addr = emulate_map_dest(v, vaddr, 8, sh_ctxt);
if ( emulate_map_dest_failed(addr) )
old = (((u64) old_hi) << 32) | (u64) old_lo;
new = (((u64) new_hi) << 32) | (u64) new_lo;
shadow_lock(v->domain);
prev = cmpxchg(((u64 *)addr), old, new);
rv = X86EMUL_CMPXCHG_FAILED;
emulate_unmap_dest(v, addr, 8, sh_ctxt);
shadow_audit_tables(v);
shadow_unlock(v->domain);
/**************************************************************************/
#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
#define AUDIT_FAIL(_level, _fmt, _a...) do { \
printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \
"gl" #_level "mfn = %" PRI_mfn \
" sl" #_level "mfn = %" PRI_mfn \
" &gl" #_level "e = %p &sl" #_level "e = %p" \
" gl" #_level "e = %" SH_PRI_gpte \
" sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \
GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
_level, guest_index(gl ## _level ## e), \
mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
gl ## _level ## e, sl ## _level ## e, \
gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
#define AUDIT_FAIL_MIN(_level, _fmt, _a...) do { \
printk("Shadow %u-on-%u audit failed at level %i\n" \
"gl" #_level "mfn = %" PRI_mfn \
" sl" #_level "mfn = %" PRI_mfn \
" Error: " _fmt "\n", \
GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
static char * sh_audit_flags(struct vcpu *v, int level,
int gflags, int sflags)
/* Common code for auditing flag bits */
if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
return "shadow is present but guest is not present";
if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) )
return "global bit set in PV shadow";
if ( level == 2 && (sflags & _PAGE_PSE) )
return "PS bit set in shadow";
#if SHADOW_PAGING_LEVELS == 3
if ( level == 3 ) return NULL; /* All the other bits are blank in a PAE l3 */
if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) )
return "accessed bit not propagated";
if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
&& ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) )
return "dirty bit not propagated";
if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
return "user/supervisor bit does not match";
if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
return "NX bit does not match";
if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
return "shadow grants write access but guest does not";
int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
guest_l1e_t *gl1e, *gp;
mfn_t mfn, gmfn, gl1mfn;
/* Follow the backpointer */
gl1mfn = backpointer(mfn_to_page(sl1mfn));
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* Out-of-sync l1 shadows can contain anything: just check the OOS hash */
if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) )
oos_audit_hash_is_present(v->domain, gl1mfn);
gl1e = gp = sh_map_domain_page(gl1mfn);
SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
if ( sh_l1e_is_magic(*sl1e) )
#if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
if ( sh_l1e_is_gnp(*sl1e) )
if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
ASSERT(sh_l1e_is_mmio(*sl1e));
gfn = sh_l1e_mmio_get_gfn(*sl1e);
if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn
" but guest gfn is %" SH_PRI_gfn,
gfn_x(guest_l1e_get_gfn(*gl1e)));
s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
shadow_l1e_get_flags(*sl1e));
if ( s ) AUDIT_FAIL(1, "%s", s);
if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
gfn = guest_l1e_get_gfn(*gl1e);
mfn = shadow_l1e_get_mfn(*sl1e);
gmfn = gfn_to_mfn_query(v->domain, gfn, &p2mt);
if ( !p2m_is_grant(p2mt) && mfn_x(gmfn) != mfn_x(mfn) )
AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
" --> %" PRI_mfn " != mfn %" PRI_mfn,
gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
sh_unmap_domain_page(gp);
int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
guest_l1e_t *gl1e, e;
mfn_t gl1mfn = _mfn(INVALID_MFN);
/* fl1 has no useful backpointer: all we can check are flags */
e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
f = shadow_l1e_get_flags(*sl1e);
f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
|| f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
|| f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED)
|| f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
_PAGE_ACCESSED|_PAGE_DIRTY)
|| f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
|| sh_l1e_is_magic(*sl1e)) )
AUDIT_FAIL(1, "fl1e has bad flags");
int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
guest_l2e_t *gl2e, *gp;
mfn_t mfn, gmfn, gl2mfn;
/* Follow the backpointer */
gl2mfn = backpointer(mfn_to_page(sl2mfn));
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* Only L1's may be out of sync. */
if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) )
AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn));
gl2e = gp = sh_map_domain_page(gl2mfn);
SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
shadow_l2e_get_flags(*sl2e));
if ( s ) AUDIT_FAIL(2, "%s", s);
if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
gfn = guest_l2e_get_gfn(*gl2e);
mfn = shadow_l2e_get_mfn(*sl2e);
gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
? get_fl1_shadow_status(v, gfn)
: get_shadow_status(v, gfn_to_mfn_query(v->domain, gfn, &p2mt),
if ( mfn_x(gmfn) != mfn_x(mfn) )
AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
" (--> %" PRI_mfn ")"
" --> %" PRI_mfn " != mfn %" PRI_mfn,
(guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
: mfn_x(gfn_to_mfn_query(v->domain, gfn, &p2mt)),
mfn_x(gmfn), mfn_x(mfn));
sh_unmap_domain_page(gp);
#if GUEST_PAGING_LEVELS >= 4
int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
guest_l3e_t *gl3e, *gp;
mfn_t mfn, gmfn, gl3mfn;
/* Follow the backpointer */
gl3mfn = backpointer(mfn_to_page(sl3mfn));
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* Only L1's may be out of sync. */
if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) )
AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn));
gl3e = gp = sh_map_domain_page(gl3mfn);
SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
shadow_l3e_get_flags(*sl3e));
if ( s ) AUDIT_FAIL(3, "%s", s);
if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
gfn = guest_l3e_get_gfn(*gl3e);
mfn = shadow_l3e_get_mfn(*sl3e);
gmfn = get_shadow_status(v, gfn_to_mfn_query(v->domain, gfn, &p2mt),
((GUEST_PAGING_LEVELS == 3 ||
is_pv_32on64_vcpu(v))
&& !shadow_mode_external(v->domain)
&& (guest_index(gl3e) % 4) == 3)
? SH_type_l2h_shadow
: SH_type_l2_shadow);
if ( mfn_x(gmfn) != mfn_x(mfn) )
AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
" --> %" PRI_mfn " != mfn %" PRI_mfn,
gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
sh_unmap_domain_page(gp);
int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
guest_l4e_t *gl4e, *gp;
mfn_t mfn, gmfn, gl4mfn;
/* Follow the backpointer */
gl4mfn = backpointer(mfn_to_page(sl4mfn));
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* Only L1's may be out of sync. */
if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) )
AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn));
gl4e = gp = sh_map_domain_page(gl4mfn);
SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
shadow_l4e_get_flags(*sl4e));
if ( s ) AUDIT_FAIL(4, "%s", s);
if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
gfn = guest_l4e_get_gfn(*gl4e);
mfn = shadow_l4e_get_mfn(*sl4e);
gmfn = get_shadow_status(v, gfn_to_mfn_query(v->domain, gfn, &p2mt),
if ( mfn_x(gmfn) != mfn_x(mfn) )
AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
" --> %" PRI_mfn " != mfn %" PRI_mfn,
gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
sh_unmap_domain_page(gp);
#endif /* GUEST_PAGING_LEVELS >= 4 */
/**************************************************************************/
/* Entry points into this mode of the shadow code.
 * This will all be mangled by the preprocessor to uniquify everything. */
const struct paging_mode sh_paging_mode = {
.page_fault = sh_page_fault,
.invlpg = sh_invlpg,
.gva_to_gfn = sh_gva_to_gfn,
.update_cr3 = sh_update_cr3,
.update_paging_modes = shadow_update_paging_modes,
.write_p2m_entry = shadow_write_p2m_entry,
.write_guest_entry = shadow_write_guest_entry,
.cmpxchg_guest_entry = shadow_cmpxchg_guest_entry,
.guest_map_l1e = sh_guest_map_l1e,
.guest_get_eff_l1e = sh_guest_get_eff_l1e,
.guest_levels = GUEST_PAGING_LEVELS,
.shadow.detach_old_tables = sh_detach_old_tables,
.shadow.x86_emulate_write = sh_x86_emulate_write,
.shadow.x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg,
.shadow.x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b,
.shadow.make_monitor_table = sh_make_monitor_table,
.shadow.destroy_monitor_table = sh_destroy_monitor_table,
#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
.shadow.guess_wrmap = sh_guess_wrmap,
.shadow.shadow_levels = SHADOW_PAGING_LEVELS,
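/* This file is compiled once per (guest, shadow) level pairing and every
 * name above is uniquified by the preprocessor, so there is one such
 * sh_paging_mode per mode; the rest of the shadow code picks the right
 * one when it switches a vcpu's paging mode. */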
* c-set-style: "BSD"
* indent-tabs-mode: nil