diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 80b7ba4..ab2df55 100644
@@ -116,6 +116,10 @@ config ARCH_SUPPORTS_OPROFILE
+config ARCH_SUPPORTS_KVM
@@ -1619,4 +1623,6 @@ source "security/Kconfig"
source "crypto/Kconfig"
+source "arch/x86/kvm/Kconfig"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 7aa1dc6..96f79eb 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -17,3 +17,5 @@ else
include $(srctree)/arch/x86/Makefile_64
+core-$(CONFIG_KVM) += arch/x86/kvm/
diff --git a/drivers/kvm/Kconfig b/arch/x86/kvm/Kconfig
rename from drivers/kvm/Kconfig
rename to arch/x86/kvm/Kconfig
index 6569206..4086080 100644
--- a/drivers/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
menuconfig VIRTUALIZATION
+ depends on ARCH_SUPPORTS_KVM || X86
Say Y here to get to see options for using your Linux host to run other
@@ -16,7 +16,7 @@ if VIRTUALIZATION
tristate "Kernel-based Virtual Machine (KVM) support"
- depends on X86 && EXPERIMENTAL
+ depends on ARCH_SUPPORTS_KVM && EXPERIMENTAL
select PREEMPT_NOTIFIERS
diff --git a/drivers/kvm/Makefile b/arch/x86/kvm/Makefile
rename from drivers/kvm/Makefile
rename to arch/x86/kvm/Makefile
index e5a8f4d..ffdd0b3 100644
--- a/drivers/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
# Makefile for Kernel-based Virtual Machine module
-kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o
+common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
+EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
+kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o
obj-$(CONFIG_KVM) += kvm.o
kvm-intel-objs = vmx.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/drivers/kvm/i8259.c b/arch/x86/kvm/i8259.c
rename from drivers/kvm/i8259.c
rename to arch/x86/kvm/i8259.c
index a679157..ab29cf2 100644
--- a/drivers/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
+#include <linux/kvm_host.h>
* set irq level. If an edge is detected, then the IRR is set to 1
@@ -181,10 +183,8 @@ int kvm_pic_read_irq(struct kvm_pic *s)
-static void pic_reset(void *opaque)
+void kvm_pic_reset(struct kvm_kpic_state *s)
- struct kvm_kpic_state *s = opaque;
@@ -209,7 +209,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
- pic_reset(s); /* init */
+ kvm_pic_reset(s); /* init */
* deassert a pending interrupt
diff --git a/drivers/kvm/irq.c b/arch/x86/kvm/irq.c
rename from drivers/kvm/irq.c
rename to arch/x86/kvm/irq.c
index 7628c7f..e571475 100644
--- a/drivers/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
#include <linux/module.h>
+#include <linux/kvm_host.h>
@@ -63,26 +63,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
-static void vcpu_kick_intr(void *info)
- struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
- printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
- int ipi_pcpu = vcpu->cpu;
- if (waitqueue_active(&vcpu->wq)) {
- wake_up_interruptible(&vcpu->wq);
- ++vcpu->stat.halt_wakeup;
- if (vcpu->guest_mode)
- smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
kvm_inject_apic_timer_irqs(vcpu);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 0000000..53c7f48
+++ b/arch/x86/kvm/irq.h
+ * irq.h: in kernel interrupt controller related definitions
+ * Copyright (c) 2007, Intel Corporation.
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+#include <linux/mm_types.h>
+#include <linux/hrtimer.h>
+#include <linux/kvm_host.h>
+typedef void irq_request_func(void *opaque, int level);
+struct kvm_kpic_state {
+ u8 last_irr; /* edge detection */
+ u8 irr; /* interrupt request register */
+ u8 imr; /* interrupt mask register */
+ u8 isr; /* interrupt service register */
+ u8 priority_add; /* highest irq priority */
+ u8 read_reg_select;
+ u8 rotate_on_auto_eoi;
+ u8 special_fully_nested_mode;
+ u8 init4; /* true if 4 byte init */
+ u8 elcr; /* PIIX edge/trigger selection */
+ struct kvm_pic *pics_state;
+ struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
+ irq_request_func *irq_request;
+ void *irq_request_opaque;
+ int output; /* intr from master PIC */
+ struct kvm_io_device dev;
+struct kvm_pic *kvm_create_pic(struct kvm *kvm);
+void kvm_pic_set_irq(void *opaque, int irq, int level);
+int kvm_pic_read_irq(struct kvm_pic *s);
+void kvm_pic_update_irq(struct kvm_pic *s);
+static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
+ return kvm->arch.vpic;
+static inline int irqchip_in_kernel(struct kvm *kvm)
+ return pic_irqchip(kvm) != NULL;
+void kvm_pic_reset(struct kvm_kpic_state *s);
+void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
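
The pic_irqchip()/irqchip_in_kernel() helpers above gate every in-kernel PIC path: pic_irqchip() stays NULL until kvm_create_pic() has run, so callers probe irqchip_in_kernel() first. A minimal sketch of that idiom, assuming only the declarations in this header (the wrapper function itself is hypothetical, not part of the patch):

	static void deliver_isa_irq(struct kvm *kvm, int irq, int level)
	{
		/* Route through the in-kernel PIC only when one exists;
		 * otherwise the interrupt controller is emulated in
		 * userspace and the request must be handled there. */
		if (irqchip_in_kernel(kvm))
			kvm_pic_set_irq(pic_irqchip(kvm), irq, level);
	}
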
diff --git a/drivers/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
rename from drivers/kvm/kvm_svm.h
rename to arch/x86/kvm/kvm_svm.h
index a0e415d..ecdfe97 100644
--- a/drivers/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/list.h>
+#include <linux/kvm_host.h>
static const u32 host_save_user_msrs[] = {
diff --git a/drivers/kvm/lapic.c b/arch/x86/kvm/lapic.c
rename from drivers/kvm/lapic.c
rename to arch/x86/kvm/lapic.c
index 238fcad..4076331 100644
--- a/drivers/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
* the COPYING file in the top-level directory.
+#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#define VEC_POS(v) ((v) & (32 - 1))
#define REG_POS(v) (((v) >> 5) << 4)
static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
return *((u32 *) (apic->regs + reg_off));
@@ -88,7 +89,7 @@ static inline void apic_clear_vector(int vec, void *bitmap)
static inline int apic_hw_enabled(struct kvm_lapic *apic)
- return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE;
+ return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
static inline int apic_sw_enabled(struct kvm_lapic *apic)
@@ -172,7 +173,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
- struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
@@ -183,8 +184,10 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
-int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig)
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
+ struct kvm_lapic *apic = vcpu->arch.apic;
if (!apic_test_and_set_irr(vec, apic)) {
/* a new pending irq is set in IRR */
@@ -268,7 +271,7 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int short_hand, int dest, int dest_mode)
- struct kvm_lapic *target = vcpu->apic;
+ struct kvm_lapic *target = vcpu->arch.apic;
apic_debug("target %p, source %p, dest 0x%x, "
"dest_mode 0x%x, short_hand 0x%x",
@@ -335,10 +338,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
apic_clear_vector(vector, apic->regs + APIC_TMR);
- if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
+ if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
- else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) {
- vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+ else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
+ vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
if (waitqueue_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq);
@@ -359,11 +362,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
- if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
+ if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
"INIT on a runnable vcpu %d\n",
- vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED;
+ vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
@@ -376,9 +379,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
case APIC_DM_STARTUP:
printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
vcpu->vcpu_id, vector);
- if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
- vcpu->sipi_vector = vector;
- vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
+ if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
+ vcpu->arch.sipi_vector = vector;
+ vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
if (waitqueue_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq);
@@ -392,15 +395,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
-struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
+static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
unsigned long bitmap)
- struct kvm_lapic *apic;
+ struct kvm_lapic *apic = NULL;
- last = kvm->round_robin_prev_vcpu;
+ last = kvm->arch.round_robin_prev_vcpu;
@@ -408,25 +410,30 @@ struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
- apic = kvm->vcpus[next]->apic;
+ apic = kvm->vcpus[next]->arch.apic;
if (apic && apic_enabled(apic))
} while (next != last);
- kvm->round_robin_prev_vcpu = next;
+ kvm->arch.round_robin_prev_vcpu = next;
- vcpu_id = ffs(bitmap) - 1;
- printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
- apic = kvm->vcpus[vcpu_id]->apic;
+ printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
+struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
+ unsigned long bitmap)
+ struct kvm_lapic *apic;
+ apic = kvm_apic_round_robin(kvm, vector, bitmap);
static void apic_set_eoi(struct kvm_lapic *apic)
int vector = apic_find_highest_isr(apic);
@@ -458,7 +465,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
unsigned int vector = icr_low & APIC_VECTOR_MASK;
- struct kvm_lapic *target;
+ struct kvm_vcpu *target;
struct kvm_vcpu *vcpu;
unsigned long lpr_map = 0;
@@ -474,20 +481,20 @@ static void apic_send_ipi(struct kvm_lapic *apic)
+ if (vcpu->arch.apic &&
apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
if (delivery_mode == APIC_DM_LOWEST)
set_bit(vcpu->vcpu_id, &lpr_map);
- __apic_accept_irq(vcpu->apic, delivery_mode,
+ __apic_accept_irq(vcpu->arch.apic, delivery_mode,
vector, level, trig_mode);
if (delivery_mode == APIC_DM_LOWEST) {
- target = kvm_apic_round_robin(vcpu->kvm, vector, lpr_map);
+ target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
- __apic_accept_irq(target, delivery_mode,
+ __apic_accept_irq(target->arch.apic, delivery_mode,
vector, level, trig_mode);
@@ -762,19 +769,17 @@ static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
-void kvm_free_apic(struct kvm_lapic *apic)
+void kvm_free_lapic(struct kvm_vcpu *vcpu)
+ if (!vcpu->arch.apic)
- hrtimer_cancel(&apic->timer.dev);
+ hrtimer_cancel(&vcpu->arch.apic->timer.dev);
- if (apic->regs_page) {
- __free_page(apic->regs_page);
- apic->regs_page = 0;
+ if (vcpu->arch.apic->regs_page)
+ __free_page(vcpu->arch.apic->regs_page);
+ kfree(vcpu->arch.apic);
@@ -785,7 +790,7 @@ void kvm_free_apic(struct kvm_lapic *apic)
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
- struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
@@ -794,7 +799,7 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
- struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
@@ -807,29 +812,29 @@ EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
- struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
value |= MSR_IA32_APICBASE_BSP;
- vcpu->apic_base = value;
+ vcpu->arch.apic_base = value;
if (apic->vcpu->vcpu_id)
value &= ~MSR_IA32_APICBASE_BSP;
- vcpu->apic_base = value;
- apic->base_address = apic->vcpu->apic_base &
+ vcpu->arch.apic_base = value;
+ apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
/* with FSB delivery interrupt, we can restart APIC functionality */
apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
- "0x%lx.\n", apic->apic_base, apic->base_address);
+ "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
- return vcpu->apic_base;
+ return vcpu->arch.apic_base;
EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
@@ -841,7 +846,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
apic_debug("%s\n", __FUNCTION__);
+ apic = vcpu->arch.apic;
ASSERT(apic != NULL);
/* Stop the timer in case it's a reset to an active apic */
@@ -872,19 +877,19 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
update_divide_count(apic);
atomic_set(&apic->timer.pending, 0);
if (vcpu->vcpu_id == 0)
- vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
+ vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
apic_update_ppr(apic);
apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
"0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
vcpu, kvm_apic_id(apic),
- vcpu->apic_base, apic->base_address);
+ vcpu->arch.apic_base, apic->base_address);
EXPORT_SYMBOL_GPL(kvm_lapic_reset);
int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
- struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
@@ -908,9 +913,8 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
wait_queue_head_t *q = &apic->vcpu->wq;
atomic_inc(&apic->timer.pending);
- if (waitqueue_active(q))
- apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+ if (waitqueue_active(q)) {
+ apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
wake_up_interruptible(q);
if (apic_lvtt_period(apic)) {
@@ -956,13 +960,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
+ vcpu->arch.apic = apic;
apic->regs_page = alloc_page(GFP_KERNEL);
if (apic->regs_page == NULL) {
printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
+ goto nomem_free_apic;
apic->regs = page_address(apic->regs_page);
memset(apic->regs, 0, PAGE_SIZE);
@@ -971,7 +975,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
apic->timer.dev.function = apic_timer_fn;
apic->base_address = APIC_DEFAULT_PHYS_BASE;
- vcpu->apic_base = APIC_DEFAULT_PHYS_BASE;
+ vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
kvm_lapic_reset(vcpu);
apic->dev.read = apic_mmio_read;
@@ -980,15 +984,16 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
apic->dev.private = apic;
- kvm_free_apic(apic);
EXPORT_SYMBOL_GPL(kvm_create_lapic);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
- struct kvm_lapic *apic = vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
if (!apic || !apic_enabled(apic))
@@ -1004,11 +1009,11 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
- u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0);
+ u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
if (vcpu->vcpu_id == 0) {
- if (!apic_hw_enabled(vcpu->apic))
+ if (!apic_hw_enabled(vcpu->arch.apic))
if ((lvt0 & APIC_LVT_MASKED) == 0 &&
GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
@@ -1019,7 +1024,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
- struct kvm_lapic *apic = vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
atomic_read(&apic->timer.pending) > 0) {
@@ -1030,7 +1035,7 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
- struct kvm_lapic *apic = vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
apic->timer.last_update = ktime_add_ns(
@@ -1041,7 +1046,7 @@ void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
int vector = kvm_apic_has_interrupt(vcpu);
- struct kvm_lapic *apic = vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1054,9 +1059,9 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
- struct kvm_lapic *apic = vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
- apic->base_address = vcpu->apic_base &
+ apic->base_address = vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
apic_set_reg(apic, APIC_LVR, APIC_VERSION);
apic_update_ppr(apic);
@@ -1067,7 +1072,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
- struct kvm_lapic *apic = vcpu->apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
struct hrtimer *timer;
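
For lowest-priority delivery, kvm_apic_round_robin() above walks the candidate bitmap circularly, starting one past the vcpu that won last time. A standalone model of that walk, runnable in userspace (MAX_VCPUS and the function name are illustrative; the real code additionally skips vcpus whose APIC is disabled):

	#include <stdio.h>

	#define MAX_VCPUS 16

	static int round_robin_pick(unsigned long bitmap, int last)
	{
		int next = last;

		do {
			next = (next + 1) % MAX_VCPUS;
			if (bitmap & (1UL << next))
				return next;	/* first candidate found */
		} while (next != last);
		return -1;			/* no candidate was ready */
	}

	int main(void)
	{
		/* vcpus 1, 3 and 5 are candidates; vcpu 3 won last time */
		printf("next = %d\n", round_robin_pick(0x2a, 3));	/* prints 5 */
		return 0;
	}
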
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 0000000..447b654
+++ b/arch/x86/kvm/lapic.h
+#ifndef __KVM_X86_LAPIC_H
+#define __KVM_X86_LAPIC_H
+#include <linux/kvm_host.h>
+ unsigned long base_address;
+ struct kvm_io_device dev;
+ s64 period; /* unit: ns */
+ ktime_t last_update;
+ struct hrtimer dev;
+ struct kvm_vcpu *vcpu;
+ struct page *regs_page;
+int kvm_create_lapic(struct kvm_vcpu *vcpu);
+void kvm_free_lapic(struct kvm_vcpu *vcpu);
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
+void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
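
kvm_lapic_set_base() in lapic.c masks the stored value with MSR_IA32_APICBASE_BASE, and the reset path sets MSR_IA32_APICBASE_BSP for vcpu 0; the bit layout of IA32_APICBASE is architectural (bit 8 = BSP, bit 11 = global enable, bits 12 and up = APIC MMIO base). A hedged standalone decoder using a plausible post-reset value rather than the kernel's own headers:

	#include <stdio.h>
	#include <stdint.h>

	#define APICBASE_BSP	(1ULL << 8)	/* bootstrap processor */
	#define APICBASE_ENABLE	(1ULL << 11)	/* APIC globally enabled */
	#define APICBASE_BASE	0xfffff000ULL	/* base bits, low-32 view */

	int main(void)
	{
		/* default base 0xfee00000 with enable and BSP set */
		uint64_t apic_base = 0xfee00000ULL | APICBASE_ENABLE | APICBASE_BSP;

		printf("enabled=%d bsp=%d base=%#llx\n",
		       !!(apic_base & APICBASE_ENABLE),
		       !!(apic_base & APICBASE_BSP),
		       (unsigned long long)(apic_base & APICBASE_BASE));
		return 0;
	}
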
diff --git a/drivers/kvm/mmu.c b/arch/x86/kvm/mmu.c
rename from drivers/kvm/mmu.c
rename to arch/x86/kvm/mmu.c
index feb5ac9..9a57e1a 100644
--- a/drivers/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
+#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/module.h>
+#include <linux/swap.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
@@ -82,7 +85,8 @@ static int dbg = 1;
#define PT_PAGE_SIZE_MASK (1ULL << 7)
#define PT_PAT_MASK (1ULL << 7)
#define PT_GLOBAL_MASK (1ULL << 8)
-#define PT64_NX_MASK (1ULL << 63)
+#define PT64_NX_SHIFT 63
+#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
#define PT_PAT_SHIFT 7
#define PT_DIR_PAT_SHIFT 12
@@ -90,7 +94,8 @@ static int dbg = 1;
#define PT32_DIR_PSE36_SIZE 4
#define PT32_DIR_PSE36_SHIFT 13
-#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
+#define PT32_DIR_PSE36_MASK \
+ (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
#define PT_FIRST_AVAIL_BITS_SHIFT 9
@@ -103,7 +108,7 @@ static int dbg = 1;
#define PT64_LEVEL_BITS 9
#define PT64_LEVEL_SHIFT(level) \
- ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )
+ (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
#define PT64_LEVEL_MASK(level) \
(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
@@ -115,7 +120,7 @@ static int dbg = 1;
#define PT32_LEVEL_BITS 10
#define PT32_LEVEL_SHIFT(level) \
- ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )
+ (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
#define PT32_LEVEL_MASK(level) \
(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
@@ -132,6 +137,8 @@ static int dbg = 1;
#define PT32_DIR_BASE_ADDR_MASK \
(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
+#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
@@ -147,6 +154,11 @@ static int dbg = 1;
+#define ACC_EXEC_MASK 1
+#define ACC_WRITE_MASK PT_WRITABLE_MASK
+#define ACC_USER_MASK PT_USER_MASK
+#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
struct kvm_rmap_desc {
u64 *shadow_ptes[RMAP_EXT];
struct kvm_rmap_desc *more;
@@ -156,9 +168,19 @@ static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
+static u64 __read_mostly shadow_trap_nonpresent_pte;
+static u64 __read_mostly shadow_notrap_nonpresent_pte;
+void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
+ shadow_trap_nonpresent_pte = trap_pte;
+ shadow_notrap_nonpresent_pte = notrap_pte;
+EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
static int is_write_protection(struct kvm_vcpu *vcpu)
- return vcpu->cr0 & X86_CR0_WP;
+ return vcpu->arch.cr0 & X86_CR0_WP;
static int is_cpuid_PSE36(void)
@@ -168,7 +190,7 @@ static int is_cpuid_PSE36(void)
static int is_nx(struct kvm_vcpu *vcpu)
- return vcpu->shadow_efer & EFER_NX;
+ return vcpu->arch.shadow_efer & EFER_NX;
static int is_present_pte(unsigned long pte)
@@ -176,11 +198,23 @@ static int is_present_pte(unsigned long pte)
return pte & PT_PRESENT_MASK;
+static int is_shadow_present_pte(u64 pte)
+ pte &= ~PT_SHADOW_IO_MARK;
+ return pte != shadow_trap_nonpresent_pte
+ && pte != shadow_notrap_nonpresent_pte;
static int is_writeble_pte(unsigned long pte)
return pte & PT_WRITABLE_MASK;
+static int is_dirty_pte(unsigned long pte)
+ return pte & PT_DIRTY_MASK;
static int is_io_pte(unsigned long pte)
return pte & PT_SHADOW_IO_MARK;
@@ -188,8 +222,15 @@ static int is_io_pte(unsigned long pte)
static int is_rmap_pte(u64 pte)
- return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
- == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
+ return pte != shadow_trap_nonpresent_pte
+ && pte != shadow_notrap_nonpresent_pte;
+static gfn_t pse36_gfn_delta(u32 gpte)
+ int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
+ return (gpte & PT32_DIR_PSE36_MASK) << shift;
static void set_shadow_pte(u64 *sptep, u64 spte)
@@ -251,18 +292,18 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
kvm_mmu_free_some_pages(vcpu);
- r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
+ r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
- r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
+ r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
- r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4);
+ r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
- r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
+ r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
mmu_page_header_cache, 4);
@@ -270,10 +311,10 @@ out:
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
- mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
- mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
- mmu_free_memory_cache_page(&vcpu->mmu_page_cache);
- mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
+ mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
@@ -289,7 +330,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
- return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
+ return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
sizeof(struct kvm_pte_chain));
@@ -300,7 +341,7 @@ static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
- return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
+ return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
sizeof(struct kvm_rmap_desc));
@@ -310,35 +351,52 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
+ * Take gfn and return the reverse mapping to it.
+ * Note: gfn must be unaliased before this function gets called
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
+ struct kvm_memory_slot *slot;
+ slot = gfn_to_memslot(kvm, gfn);
+ return &slot->rmap[gfn - slot->base_gfn];
* Reverse mapping data structures:
- * If page->private bit zero is zero, then page->private points to the
- * shadow page table entry that points to page_address(page).
+ * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
+ * that points to page_address(page).
- * If page->private bit zero is one, (then page->private & ~1) points
- * to a struct kvm_rmap_desc containing more mappings.
+ * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
+ * containing more mappings.
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+ struct kvm_mmu_page *sp;
struct kvm_rmap_desc *desc;
+ unsigned long *rmapp;
if (!is_rmap_pte(*spte))
- page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
- if (!page_private(page)) {
+ gfn = unalias_gfn(vcpu->kvm, gfn);
+ sp = page_header(__pa(spte));
+ sp->gfns[spte - sp->spt] = gfn;
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn);
rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
- set_page_private(page,(unsigned long)spte);
- } else if (!(page_private(page) & 1)) {
+ *rmapp = (unsigned long)spte;
+ } else if (!(*rmapp & 1)) {
rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
desc = mmu_alloc_rmap_desc(vcpu);
- desc->shadow_ptes[0] = (u64 *)page_private(page);
+ desc->shadow_ptes[0] = (u64 *)*rmapp;
desc->shadow_ptes[1] = spte;
- set_page_private(page,(unsigned long)desc | 1);
+ *rmapp = (unsigned long)desc | 1;
rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
- desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
+ desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
if (desc->shadow_ptes[RMAP_EXT-1]) {
@@ -351,7 +409,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
-static void rmap_desc_remove_entry(struct page *page,
+static void rmap_desc_remove_entry(unsigned long *rmapp,
struct kvm_rmap_desc *desc,
struct kvm_rmap_desc *prev_desc)
@@ -365,44 +423,53 @@ static void rmap_desc_remove_entry(struct page *page,
if (!prev_desc && !desc->more)
- set_page_private(page,(unsigned long)desc->shadow_ptes[0]);
+ *rmapp = (unsigned long)desc->shadow_ptes[0];
prev_desc->more = desc->more;
- set_page_private(page,(unsigned long)desc->more | 1);
+ *rmapp = (unsigned long)desc->more | 1;
mmu_free_rmap_desc(desc);
-static void rmap_remove(u64 *spte)
+static void rmap_remove(struct kvm *kvm, u64 *spte)
- struct page *page;
struct kvm_rmap_desc *desc;
struct kvm_rmap_desc *prev_desc;
+ struct kvm_mmu_page *sp;
+ struct page *page;
+ unsigned long *rmapp;
if (!is_rmap_pte(*spte))
+ sp = page_header(__pa(spte));
page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
- if (!page_private(page)) {
+ mark_page_accessed(page);
+ if (is_writeble_pte(*spte))
+ kvm_release_page_dirty(page);
+ kvm_release_page_clean(page);
+ rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
- } else if (!(page_private(page) & 1)) {
+ } else if (!(*rmapp & 1)) {
rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
- if ((u64 *)page_private(page) != spte) {
+ if ((u64 *)*rmapp != spte) {
printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
- set_page_private(page,0);
rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
- desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
+ desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
if (desc->shadow_ptes[i] == spte) {
- rmap_desc_remove_entry(page,
+ rmap_desc_remove_entry(rmapp,
@@ -414,33 +481,56 @@ static void rmap_remove(u64 *spte)
-static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
- struct kvm *kvm = vcpu->kvm;
- struct page *page;
struct kvm_rmap_desc *desc;
+ struct kvm_rmap_desc *prev_desc;
+ else if (!(*rmapp & 1)) {
+ return (u64 *)*rmapp;
+ desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+ for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
+ if (prev_spte == spte)
+ return desc->shadow_ptes[i];
+ prev_spte = desc->shadow_ptes[i];
+ desc = desc->more;
+static void rmap_write_protect(struct kvm *kvm, u64 gfn)
+ unsigned long *rmapp;
+ int write_protected = 0;
- page = gfn_to_page(kvm, gfn);
+ gfn = unalias_gfn(kvm, gfn);
+ rmapp = gfn_to_rmap(kvm, gfn);
- while (page_private(page)) {
- if (!(page_private(page) & 1))
- spte = (u64 *)page_private(page);
- desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
- spte = desc->shadow_ptes[0];
+ spte = rmap_next(kvm, rmapp, NULL);
- BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT
- != page_to_pfn(page));
BUG_ON(!(*spte & PT_PRESENT_MASK));
- BUG_ON(!(*spte & PT_WRITABLE_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
- rmap_remove(spte);
- set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
- kvm_flush_remote_tlbs(vcpu->kvm);
+ if (is_writeble_pte(*spte)) {
+ set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
+ write_protected = 1;
+ spte = rmap_next(kvm, rmapp, spte);
+ if (write_protected)
+ kvm_flush_remote_tlbs(kvm);
@@ -450,7 +540,7 @@ static int is_empty_shadow_page(u64 *spt)
for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
+ if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
@@ -459,14 +549,14 @@ static int is_empty_shadow_page(u64 *spt)
-static void kvm_mmu_free_page(struct kvm *kvm,
- struct kvm_mmu_page *page_head)
+static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
- ASSERT(is_empty_shadow_page(page_head->spt));
- list_del(&page_head->link);
- __free_page(virt_to_page(page_head->spt));
- ++kvm->n_free_mmu_pages;
+ ASSERT(is_empty_shadow_page(sp->spt));
+ list_del(&sp->link);
+ __free_page(virt_to_page(sp->spt));
+ __free_page(virt_to_page(sp->gfns));
+ ++kvm->arch.n_free_mmu_pages;
static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -477,26 +567,26 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *page;
+ struct kvm_mmu_page *sp;
- if (!vcpu->kvm->n_free_mmu_pages)
+ if (!vcpu->kvm->arch.n_free_mmu_pages)
- page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
- page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
- set_page_private(virt_to_page(page->spt), (unsigned long)page);
- list_add(&page->link, &vcpu->kvm->active_mmu_pages);
- ASSERT(is_empty_shadow_page(page->spt));
- page->slot_bitmap = 0;
- page->multimapped = 0;
- page->parent_pte = parent_pte;
- --vcpu->kvm->n_free_mmu_pages;
+ sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
+ sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+ sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+ set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+ ASSERT(is_empty_shadow_page(sp->spt));
+ sp->slot_bitmap = 0;
+ sp->multimapped = 0;
+ sp->parent_pte = parent_pte;
+ --vcpu->kvm->arch.n_free_mmu_pages;
static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *page, u64 *parent_pte)
+ struct kvm_mmu_page *sp, u64 *parent_pte)
struct kvm_pte_chain *pte_chain;
struct hlist_node *node;
@@ -504,20 +594,20 @@ static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
- if (!page->multimapped) {
- u64 *old = page->parent_pte;
+ if (!sp->multimapped) {
+ u64 *old = sp->parent_pte;
- page->parent_pte = parent_pte;
+ sp->parent_pte = parent_pte;
- page->multimapped = 1;
+ sp->multimapped = 1;
pte_chain = mmu_alloc_pte_chain(vcpu);
- INIT_HLIST_HEAD(&page->parent_ptes);
- hlist_add_head(&pte_chain->link, &page->parent_ptes);
+ INIT_HLIST_HEAD(&sp->parent_ptes);
+ hlist_add_head(&pte_chain->link, &sp->parent_ptes);
pte_chain->parent_ptes[0] = old;
- hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
+ hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
@@ -528,23 +618,23 @@ static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
pte_chain = mmu_alloc_pte_chain(vcpu);
- hlist_add_head(&pte_chain->link, &page->parent_ptes);
+ hlist_add_head(&pte_chain->link, &sp->parent_ptes);
pte_chain->parent_ptes[0] = parent_pte;
-static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
+static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
struct kvm_pte_chain *pte_chain;
struct hlist_node *node;
- if (!page->multimapped) {
- BUG_ON(page->parent_pte != parent_pte);
- page->parent_pte = NULL;
+ if (!sp->multimapped) {
+ BUG_ON(sp->parent_pte != parent_pte);
+ sp->parent_pte = NULL;
- hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
+ hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
if (!pte_chain->parent_ptes[i])
@@ -560,9 +650,9 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
hlist_del(&pte_chain->link);
mmu_free_pte_chain(pte_chain);
- if (hlist_empty(&page->parent_ptes)) {
- page->multimapped = 0;
- page->parent_pte = NULL;
+ if (hlist_empty(&sp->parent_ptes)) {
+ sp->multimapped = 0;
+ sp->parent_pte = NULL;
@@ -570,22 +660,21 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
-static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
+static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
struct hlist_head *bucket;
- struct kvm_mmu_page *page;
+ struct kvm_mmu_page *sp;
struct hlist_node *node;
pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
- bucket = &vcpu->kvm->mmu_page_hash[index];
- hlist_for_each_entry(page, node, bucket, hash_link)
- if (page->gfn == gfn && !page->role.metaphysical) {
+ bucket = &kvm->arch.mmu_page_hash[index];
+ hlist_for_each_entry(sp, node, bucket, hash_link)
+ if (sp->gfn == gfn && !sp->role.metaphysical) {
pgprintk("%s: found role %x\n",
- __FUNCTION__, page->role.word);
+ __FUNCTION__, sp->role.word);
@@ -595,22 +684,23 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
- unsigned hugepage_access,
union kvm_mmu_page_role role;
struct hlist_head *bucket;
- struct kvm_mmu_page *page;
+ struct kvm_mmu_page *sp;
struct hlist_node *node;
- role.glevels = vcpu->mmu.root_level;
+ role.glevels = vcpu->arch.mmu.root_level;
role.metaphysical = metaphysical;
- role.hugepage_access = hugepage_access;
- if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
+ role.access = access;
+ if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
@@ -618,39 +708,42 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
- bucket = &vcpu->kvm->mmu_page_hash[index];
- hlist_for_each_entry(page, node, bucket, hash_link)
- if (page->gfn == gfn && page->role.word == role.word) {
- mmu_page_add_parent_pte(vcpu, page, parent_pte);
+ bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+ hlist_for_each_entry(sp, node, bucket, hash_link)
+ if (sp->gfn == gfn && sp->role.word == role.word) {
+ mmu_page_add_parent_pte(vcpu, sp, parent_pte);
pgprintk("%s: found\n", __FUNCTION__);
- page = kvm_mmu_alloc_page(vcpu, parent_pte);
+ sp = kvm_mmu_alloc_page(vcpu, parent_pte);
pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
- page->role = role;
- hlist_add_head(&page->hash_link, bucket);
+ hlist_add_head(&sp->hash_link, bucket);
+ vcpu->arch.mmu.prefetch_page(vcpu, sp);
- rmap_write_protect(vcpu, gfn);
+ rmap_write_protect(vcpu->kvm, gfn);
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
- struct kvm_mmu_page *page)
+ struct kvm_mmu_page *sp)
- if (page->role.level == PT_PAGE_TABLE_LEVEL) {
+ if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (pt[i] & PT_PRESENT_MASK)
- rmap_remove(&pt[i]);
+ if (is_shadow_present_pte(pt[i]))
+ rmap_remove(kvm, &pt[i]);
+ pt[i] = shadow_trap_nonpresent_pte;
kvm_flush_remote_tlbs(kvm);
@@ -659,8 +752,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (!(ent & PT_PRESENT_MASK))
+ pt[i] = shadow_trap_nonpresent_pte;
+ if (!is_shadow_present_pte(ent))
ent &= PT64_BASE_ADDR_MASK;
mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
@@ -668,147 +761,238 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
kvm_flush_remote_tlbs(kvm);
-static void kvm_mmu_put_page(struct kvm_mmu_page *page,
+static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
- mmu_page_remove_parent_pte(page, parent_pte);
+ mmu_page_remove_parent_pte(sp, parent_pte);
-static void kvm_mmu_zap_page(struct kvm *kvm,
- struct kvm_mmu_page *page)
+static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
+ for (i = 0; i < KVM_MAX_VCPUS; ++i)
+ if (kvm->vcpus[i])
+ kvm->vcpus[i]->arch.last_pte_updated = NULL;
+static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
- while (page->multimapped || page->parent_pte) {
- if (!page->multimapped)
- parent_pte = page->parent_pte;
+ ++kvm->stat.mmu_shadow_zapped;
+ while (sp->multimapped || sp->parent_pte) {
+ if (!sp->multimapped)
+ parent_pte = sp->parent_pte;
struct kvm_pte_chain *chain;
- chain = container_of(page->parent_ptes.first,
+ chain = container_of(sp->parent_ptes.first,
struct kvm_pte_chain, link);
parent_pte = chain->parent_ptes[0];
BUG_ON(!parent_pte);
- kvm_mmu_put_page(page, parent_pte);
- set_shadow_pte(parent_pte, 0);
+ kvm_mmu_put_page(sp, parent_pte);
+ set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
- kvm_mmu_page_unlink_children(kvm, page);
- if (!page->root_count) {
- hlist_del(&page->hash_link);
- kvm_mmu_free_page(kvm, page);
+ kvm_mmu_page_unlink_children(kvm, sp);
+ if (!sp->root_count) {
+ hlist_del(&sp->hash_link);
+ kvm_mmu_free_page(kvm, sp);
- list_move(&page->link, &kvm->active_mmu_pages);
+ list_move(&sp->link, &kvm->arch.active_mmu_pages);
+ kvm_mmu_reset_last_pte_updated(kvm);
+ * Changing the number of mmu pages allocated to the vm
+ * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
+ * If we set the number of mmu pages to be smaller than the
+ * number of active pages, we must free some mmu pages before we
+ * change the value
1529
+ if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
1530
+ kvm_nr_mmu_pages) {
1531
+ int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
1532
+ - kvm->arch.n_free_mmu_pages;
1534
+ while (n_used_mmu_pages > kvm_nr_mmu_pages) {
1535
+ struct kvm_mmu_page *page;
1537
+ page = container_of(kvm->arch.active_mmu_pages.prev,
1538
+ struct kvm_mmu_page, link);
1539
+ kvm_mmu_zap_page(kvm, page);
1540
+ n_used_mmu_pages--;
1542
+ kvm->arch.n_free_mmu_pages = 0;
1545
+ kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
1546
+ - kvm->arch.n_alloc_mmu_pages;
1548
+ kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
1551
-static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
1552
+static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1555
struct hlist_head *bucket;
1556
- struct kvm_mmu_page *page;
1557
+ struct kvm_mmu_page *sp;
1558
struct hlist_node *node, *n;
1561
pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
1563
index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
1564
- bucket = &vcpu->kvm->mmu_page_hash[index];
1565
- hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
1566
- if (page->gfn == gfn && !page->role.metaphysical) {
1567
+ bucket = &kvm->arch.mmu_page_hash[index];
1568
+ hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
1569
+ if (sp->gfn == gfn && !sp->role.metaphysical) {
1570
pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
1572
- kvm_mmu_zap_page(vcpu->kvm, page);
1574
+ kvm_mmu_zap_page(kvm, sp);
1580
-static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
1581
+static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1583
- struct kvm_mmu_page *page;
1584
+ struct kvm_mmu_page *sp;
1586
- while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
1587
- pgprintk("%s: zap %lx %x\n",
1588
- __FUNCTION__, gfn, page->role.word);
1589
- kvm_mmu_zap_page(vcpu->kvm, page);
1590
+ while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
1591
+ pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
1592
+ kvm_mmu_zap_page(kvm, sp);
1596
-static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
1597
+static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1599
- int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
1600
- struct kvm_mmu_page *page_head = page_header(__pa(pte));
1601
+ int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
1602
+ struct kvm_mmu_page *sp = page_header(__pa(pte));
1604
- __set_bit(slot, &page_head->slot_bitmap);
1605
+ __set_bit(slot, &sp->slot_bitmap);
1608
-hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
1609
+struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1611
- hpa_t hpa = gpa_to_hpa(vcpu, gpa);
1612
+ gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
1614
- return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa;
1615
+ if (gpa == UNMAPPED_GVA)
1617
+ return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1620
-hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
1621
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1622
+ unsigned pt_access, unsigned pte_access,
1623
+ int user_fault, int write_fault, int dirty,
1624
+ int *ptwrite, gfn_t gfn)
1627
+ int was_rmapped = is_rmap_pte(*shadow_pte);
1630
- ASSERT((gpa & HPA_ERR_MASK) == 0);
1631
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1633
- return gpa | HPA_ERR_MASK;
1634
- return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
1635
- | (gpa & (PAGE_SIZE-1));
1637
+ pgprintk("%s: spte %llx access %x write_fault %d"
1638
+ " user_fault %d gfn %lx\n",
1639
+ __FUNCTION__, *shadow_pte, pt_access,
1640
+ write_fault, user_fault, gfn);
1642
-hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
1644
- gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
1646
+ * We don't set the accessed bit, since we sometimes want to see
1647
+ * whether the guest actually used the pte (in order to detect
1650
+ spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
1652
+ pte_access &= ~ACC_WRITE_MASK;
1653
+ if (!(pte_access & ACC_EXEC_MASK))
1654
+ spte |= PT64_NX_MASK;
1656
+ page = gfn_to_page(vcpu->kvm, gfn);
1658
+ spte |= PT_PRESENT_MASK;
1659
+ if (pte_access & ACC_USER_MASK)
1660
+ spte |= PT_USER_MASK;
1662
+ if (is_error_page(page)) {
1663
+ set_shadow_pte(shadow_pte,
1664
+ shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
1665
+ kvm_release_page_clean(page);
1669
- if (gpa == UNMAPPED_GVA)
1670
- return UNMAPPED_GVA;
1671
- return gpa_to_hpa(vcpu, gpa);
1673
+ spte |= page_to_phys(page);
1675
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1677
- gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
1678
+ if ((pte_access & ACC_WRITE_MASK)
1679
+ || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
1680
+ struct kvm_mmu_page *shadow;
1682
- if (gpa == UNMAPPED_GVA)
1684
- return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT);
1685
+ spte |= PT_WRITABLE_MASK;
1687
+ mmu_unshadow(vcpu->kvm, gfn);
1691
+ shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
1693
+ pgprintk("%s: found shadow page for %lx, marking ro\n",
1694
+ __FUNCTION__, gfn);
1695
+ pte_access &= ~ACC_WRITE_MASK;
1696
+ if (is_writeble_pte(spte)) {
1697
+ spte &= ~PT_WRITABLE_MASK;
1698
+ kvm_x86_ops->tlb_flush(vcpu);
1707
+ if (pte_access & ACC_WRITE_MASK)
1708
+ mark_page_dirty(vcpu->kvm, gfn);
1710
+ pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
1711
+ set_shadow_pte(shadow_pte, spte);
1712
+ page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
1713
+ if (!was_rmapped) {
1714
+ rmap_add(vcpu, shadow_pte, gfn);
1715
+ if (!is_rmap_pte(*shadow_pte))
1716
+ kvm_release_page_clean(page);
1719
+ kvm_release_page_clean(page);
1720
+ if (!ptwrite || !*ptwrite)
1721
+ vcpu->arch.last_pte_updated = shadow_pte;
1724
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
1728
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
1729
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1731
int level = PT32E_ROOT_LEVEL;
1732
- hpa_t table_addr = vcpu->mmu.root_hpa;
1733
+ hpa_t table_addr = vcpu->arch.mmu.root_hpa;
1737
u32 index = PT64_INDEX(v, level);
1741
ASSERT(VALID_PAGE(table_addr));
1742
table = __va(table_addr);
1745
- pte = table[index];
1746
- if (is_present_pte(pte) && is_writeble_pte(pte))
1748
- mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
1749
- page_header_update_slot(vcpu->kvm, table, v);
1750
- table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
1752
- rmap_add(vcpu, &table[index]);
1754
+ mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
1755
+ 0, write, 1, &pt_write, gfn);
1756
+ return pt_write || is_io_pte(table[index]);
1759
- if (table[index] == 0) {
1760
+ if (table[index] == shadow_trap_nonpresent_pte) {
1761
struct kvm_mmu_page *new_table;
1764
@@ -816,7 +1000,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
1766
new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
1768
- 1, 0, &table[index]);
1769
+ 1, ACC_ALL, &table[index],
1772
pgprintk("nonpaging_map: ENOMEM\n");
1774
@@ -829,77 +1014,86 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
1778
+static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1779
+ struct kvm_mmu_page *sp)
1783
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1784
+ sp->spt[i] = shadow_trap_nonpresent_pte;
1787
static void mmu_free_roots(struct kvm_vcpu *vcpu)
1790
- struct kvm_mmu_page *page;
1791
+ struct kvm_mmu_page *sp;
1793
- if (!VALID_PAGE(vcpu->mmu.root_hpa))
1794
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1796
#ifdef CONFIG_X86_64
1797
- if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1798
- hpa_t root = vcpu->mmu.root_hpa;
1799
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1800
+ hpa_t root = vcpu->arch.mmu.root_hpa;
1802
- page = page_header(root);
1803
- --page->root_count;
1804
- vcpu->mmu.root_hpa = INVALID_PAGE;
1805
+ sp = page_header(root);
1807
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1811
for (i = 0; i < 4; ++i) {
1812
- hpa_t root = vcpu->mmu.pae_root[i];
1813
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
1816
root &= PT64_BASE_ADDR_MASK;
1817
- page = page_header(root);
1818
- --page->root_count;
1819
+ sp = page_header(root);
1822
- vcpu->mmu.pae_root[i] = INVALID_PAGE;
1823
+ vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1825
- vcpu->mmu.root_hpa = INVALID_PAGE;
1826
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1829
static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1833
- struct kvm_mmu_page *page;
1834
+ struct kvm_mmu_page *sp;
1836
- root_gfn = vcpu->cr3 >> PAGE_SHIFT;
1837
+ root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1839
#ifdef CONFIG_X86_64
1840
- if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1841
- hpa_t root = vcpu->mmu.root_hpa;
1842
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1843
+ hpa_t root = vcpu->arch.mmu.root_hpa;
1845
ASSERT(!VALID_PAGE(root));
1846
- page = kvm_mmu_get_page(vcpu, root_gfn, 0,
1847
- PT64_ROOT_LEVEL, 0, 0, NULL);
1848
- root = __pa(page->spt);
1849
- ++page->root_count;
1850
- vcpu->mmu.root_hpa = root;
1851
+ sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1852
+ PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
1853
+ root = __pa(sp->spt);
1855
+ vcpu->arch.mmu.root_hpa = root;
1859
for (i = 0; i < 4; ++i) {
1860
- hpa_t root = vcpu->mmu.pae_root[i];
1861
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
1863
ASSERT(!VALID_PAGE(root));
1864
- if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) {
1865
- if (!is_present_pte(vcpu->pdptrs[i])) {
1866
- vcpu->mmu.pae_root[i] = 0;
1867
+ if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
1868
+ if (!is_present_pte(vcpu->arch.pdptrs[i])) {
1869
+ vcpu->arch.mmu.pae_root[i] = 0;
1872
- root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
1873
- } else if (vcpu->mmu.root_level == 0)
1874
+ root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
1875
+ } else if (vcpu->arch.mmu.root_level == 0)
1877
- page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1878
- PT32_ROOT_LEVEL, !is_paging(vcpu),
1880
- root = __pa(page->spt);
1881
- ++page->root_count;
1882
- vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
1883
+ sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1884
+ PT32_ROOT_LEVEL, !is_paging(vcpu),
+				      ACC_ALL, NULL, NULL);
+		root = __pa(sp->spt);
+		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
-	vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
+	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -908,26 +1102,23 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
+	pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
 	r = mmu_topup_memory_caches(vcpu);
-	ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
+	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	gfn = gva >> PAGE_SHIFT;
-	paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
-	if (is_error_hpa(paddr))
-	return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
+	return nonpaging_map(vcpu, gva & PAGE_MASK,
+			     error_code & PFERR_WRITE_MASK, gfn);
 static void nonpaging_free(struct kvm_vcpu *vcpu)
@@ -937,19 +1128,20 @@ static void nonpaging_free(struct kvm_vcpu *vcpu)
 static int nonpaging_init_context(struct kvm_vcpu *vcpu)
-	struct kvm_mmu *context = &vcpu->mmu;
+	struct kvm_mmu *context = &vcpu->arch.mmu;
 	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = nonpaging_page_fault;
 	context->gva_to_gpa = nonpaging_gva_to_gpa;
 	context->free = nonpaging_free;
+	context->prefetch_page = nonpaging_prefetch_page;
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
-static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
+void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 	++vcpu->stat.tlb_flush;
 	kvm_x86_ops->tlb_flush(vcpu);
@@ -965,7 +1157,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
-	kvm_x86_ops->inject_page_fault(vcpu, addr, err_code);
+	kvm_inject_page_fault(vcpu, addr, err_code);
 static void paging_free(struct kvm_vcpu *vcpu)
@@ -983,12 +1175,13 @@ static void paging_free(struct kvm_vcpu *vcpu)
 static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
-	struct kvm_mmu *context = &vcpu->mmu;
+	struct kvm_mmu *context = &vcpu->arch.mmu;
 	ASSERT(is_pae(vcpu));
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging64_page_fault;
 	context->gva_to_gpa = paging64_gva_to_gpa;
+	context->prefetch_page = paging64_prefetch_page;
 	context->free = paging_free;
 	context->root_level = level;
 	context->shadow_root_level = level;
@@ -1003,12 +1196,13 @@ static int paging64_init_context(struct kvm_vcpu *vcpu)
 static int paging32_init_context(struct kvm_vcpu *vcpu)
-	struct kvm_mmu *context = &vcpu->mmu;
+	struct kvm_mmu *context = &vcpu->arch.mmu;
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging32_page_fault;
 	context->gva_to_gpa = paging32_gva_to_gpa;
 	context->free = paging_free;
+	context->prefetch_page = paging32_prefetch_page;
 	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
@@ -1023,7 +1217,7 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu)
 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
-	ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
+	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 	if (!is_paging(vcpu))
 		return nonpaging_init_context(vcpu);
@@ -1038,9 +1232,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
-	if (VALID_PAGE(vcpu->mmu.root_hpa)) {
-		vcpu->mmu.free(vcpu);
-		vcpu->mmu.root_hpa = INVALID_PAGE;
+	if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+		vcpu->arch.mmu.free(vcpu);
+		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
@@ -1060,7 +1254,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	mmu_alloc_roots(vcpu);
-	kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
+	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
 	kvm_mmu_flush_tlb(vcpu);
 	mutex_unlock(&vcpu->kvm->lock);
@@ -1074,47 +1268,79 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
-				  struct kvm_mmu_page *page,
+				  struct kvm_mmu_page *sp,
 	struct kvm_mmu_page *child;
-	if (is_present_pte(pte)) {
-		if (page->role.level == PT_PAGE_TABLE_LEVEL)
-			rmap_remove(spte);
+	if (is_shadow_present_pte(pte)) {
+		if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+			rmap_remove(vcpu->kvm, spte);
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			mmu_page_remove_parent_pte(child, spte);
-	set_shadow_pte(spte, 0);
-	kvm_flush_remote_tlbs(vcpu->kvm);
+	set_shadow_pte(spte, shadow_trap_nonpresent_pte);
 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
-				  struct kvm_mmu_page *page,
+				  struct kvm_mmu_page *sp,
-				  const void *new, int bytes)
+				  const void *new, int bytes,
+				  int offset_in_pte)
-	if (page->role.level != PT_PAGE_TABLE_LEVEL)
+	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+		++vcpu->kvm->stat.mmu_pde_zapped;
-	if (page->role.glevels == PT32_ROOT_LEVEL)
-		paging32_update_pte(vcpu, page, spte, new, bytes);
+	++vcpu->kvm->stat.mmu_pte_updated;
+	if (sp->role.glevels == PT32_ROOT_LEVEL)
+		paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
-		paging64_update_pte(vcpu, page, spte, new, bytes);
+		paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+static bool need_remote_flush(u64 old, u64 new)
+	if (!is_shadow_present_pte(old))
+	if (!is_shadow_present_pte(new))
+	if ((old ^ new) & PT64_BASE_ADDR_MASK)
+	old ^= PT64_NX_MASK;
+	new ^= PT64_NX_MASK;
+	return (old & ~new & PT64_PERM_MASK) != 0;
+static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
+	if (need_remote_flush(old, new))
+		kvm_flush_remote_tlbs(vcpu->kvm);
+		kvm_mmu_flush_tlb(vcpu);
+static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
+	u64 *spte = vcpu->arch.last_pte_updated;
+	return !!(spte && (*spte & PT_ACCESSED_MASK));
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       const u8 *new, int bytes)
 	gfn_t gfn = gpa >> PAGE_SHIFT;
-	struct kvm_mmu_page *page;
+	struct kvm_mmu_page *sp;
 	struct hlist_node *node, *n;
 	struct hlist_head *bucket;
 	unsigned offset = offset_in_page(gpa);
@@ -1126,20 +1352,24 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
-	if (gfn == vcpu->last_pt_write_gfn) {
-		++vcpu->last_pt_write_count;
-		if (vcpu->last_pt_write_count >= 3)
+	++vcpu->kvm->stat.mmu_pte_write;
+	kvm_mmu_audit(vcpu, "pre pte write");
+	if (gfn == vcpu->arch.last_pt_write_gfn
+	    && !last_updated_pte_accessed(vcpu)) {
+		++vcpu->arch.last_pt_write_count;
+		if (vcpu->arch.last_pt_write_count >= 3)
-		vcpu->last_pt_write_gfn = gfn;
-		vcpu->last_pt_write_count = 1;
+		vcpu->arch.last_pt_write_gfn = gfn;
+		vcpu->arch.last_pt_write_count = 1;
+		vcpu->arch.last_pte_updated = NULL;
 	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
-	bucket = &vcpu->kvm->mmu_page_hash[index];
-	hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
-		if (page->gfn != gfn || page->role.metaphysical)
+	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
+		if (sp->gfn != gfn || sp->role.metaphysical)
-		pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
+		pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
 		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
 		misaligned |= bytes < 4;
 		if (misaligned || flooded) {
@@ -1154,14 +1384,15 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
-				 gpa, bytes, page->role.word);
-			kvm_mmu_zap_page(vcpu->kvm, page);
+				 gpa, bytes, sp->role.word);
+			kvm_mmu_zap_page(vcpu->kvm, sp);
+			++vcpu->kvm->stat.mmu_flooded;
 		page_offset = offset;
-		level = page->role.level;
+		level = sp->role.level;
-		if (page->role.glevels == PT32_ROOT_LEVEL) {
+		if (sp->role.glevels == PT32_ROOT_LEVEL) {
 			page_offset <<= 1;	/* 32->64 */
 			 * A 32-bit pde maps 4MB while the shadow pdes map
@@ -1175,46 +1406,91 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 				quadrant = page_offset >> PAGE_SHIFT;
 				page_offset &= ~PAGE_MASK;
-				if (quadrant != page->role.quadrant)
+				if (quadrant != sp->role.quadrant)
-		spte = &page->spt[page_offset / sizeof(*spte)];
+		spte = &sp->spt[page_offset / sizeof(*spte)];
-			mmu_pte_write_zap_pte(vcpu, page, spte);
-			mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
+			mmu_pte_write_zap_pte(vcpu, sp, spte);
+			mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
+					      page_offset & (pte_size - 1));
+			mmu_pte_write_flush_tlb(vcpu, entry, *spte);
+	kvm_mmu_audit(vcpu, "post pte write");
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
-	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
+	gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
-	return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
+	return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-	while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
-		struct kvm_mmu_page *page;
+	while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
+		struct kvm_mmu_page *sp;
-		page = container_of(vcpu->kvm->active_mmu_pages.prev,
-				    struct kvm_mmu_page, link);
-		kvm_mmu_zap_page(vcpu->kvm, page);
+		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
+				  struct kvm_mmu_page, link);
+		kvm_mmu_zap_page(vcpu->kvm, sp);
+		++vcpu->kvm->stat.mmu_recycled;
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
+	enum emulation_result er;
+	mutex_lock(&vcpu->kvm->lock);
+	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
+	r = mmu_topup_memory_caches(vcpu);
+	er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
+	mutex_unlock(&vcpu->kvm->lock);
+	case EMULATE_DONE:
+	case EMULATE_DO_MMIO:
+		++vcpu->stat.mmio_exits;
+	case EMULATE_FAIL:
+		kvm_report_emulation_failure(vcpu, "pagetable");
+	mutex_unlock(&vcpu->kvm->lock);
+EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
-	struct kvm_mmu_page *page;
+	struct kvm_mmu_page *sp;
-	while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
-		page = container_of(vcpu->kvm->active_mmu_pages.next,
-				    struct kvm_mmu_page, link);
-		kvm_mmu_zap_page(vcpu->kvm, page);
+	while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
+		sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
+				  struct kvm_mmu_page, link);
+		kvm_mmu_zap_page(vcpu->kvm, sp);
-	free_page((unsigned long)vcpu->mmu.pae_root);
+	free_page((unsigned long)vcpu->arch.mmu.pae_root);
 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -1224,8 +1500,12 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
-	vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;
+	if (vcpu->kvm->arch.n_requested_mmu_pages)
+		vcpu->kvm->arch.n_free_mmu_pages =
+					vcpu->kvm->arch.n_requested_mmu_pages;
+		vcpu->kvm->arch.n_free_mmu_pages =
+					vcpu->kvm->arch.n_alloc_mmu_pages;
 	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
 	 * Therefore we need to allocate shadow page tables in the first
@@ -1234,9 +1514,9 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
-	vcpu->mmu.pae_root = page_address(page);
+	vcpu->arch.mmu.pae_root = page_address(page);
 	for (i = 0; i < 4; ++i)
-		vcpu->mmu.pae_root[i] = INVALID_PAGE;
+		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
@@ -1248,7 +1528,7 @@ error_1:
 int kvm_mmu_create(struct kvm_vcpu *vcpu)
-	ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
+	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 	return alloc_mmu_pages(vcpu);
@@ -1256,7 +1536,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
-	ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
+	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 	return init_kvm_mmu(vcpu);
@@ -1272,31 +1552,29 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
-	struct kvm_mmu_page *page;
+	struct kvm_mmu_page *sp;
-	list_for_each_entry(page, &kvm->active_mmu_pages, link) {
+	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
-		if (!test_bit(slot, &page->slot_bitmap))
+		if (!test_bit(slot, &sp->slot_bitmap))
 		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
-			if (pt[i] & PT_WRITABLE_MASK) {
-				rmap_remove(&pt[i]);
+			if (pt[i] & PT_WRITABLE_MASK)
 				pt[i] &= ~PT_WRITABLE_MASK;
 void kvm_mmu_zap_all(struct kvm *kvm)
-	struct kvm_mmu_page *page, *node;
+	struct kvm_mmu_page *sp, *node;
-	list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link)
-		kvm_mmu_zap_page(kvm, page);
+	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
+		kvm_mmu_zap_page(kvm, sp);
 	kvm_flush_remote_tlbs(kvm);
@@ -1337,6 +1615,25 @@ nomem:
+ * Calculate mmu pages needed for kvm.
+unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
+	unsigned int nr_mmu_pages;
+	unsigned int  nr_pages = 0;
+	for (i = 0; i < kvm->nmemslots; i++)
+		nr_pages += kvm->memslots[i].npages;
+	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
+	nr_mmu_pages = max(nr_mmu_pages,
+			(unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
+	return nr_mmu_pages;
 static const char *audit_msg;
@@ -1359,22 +1656,36 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
-		if (!(ent & PT_PRESENT_MASK))
+		if (ent == shadow_trap_nonpresent_pte)
 		va = canonicalize(va);
+			if (ent == shadow_notrap_nonpresent_pte)
+				printk(KERN_ERR "audit: (%s) nontrapping pte"
+				       " in nonleaf level: levels %d gva %lx"
+				       " level %d pte %llx\n", audit_msg,
+				       vcpu->arch.mmu.root_level, va, level, ent);
 			audit_mappings_page(vcpu, ent, va, level - 1);
-			gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
-			hpa_t hpa = gpa_to_hpa(vcpu, gpa);
+			gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
+			struct page *page = gpa_to_page(vcpu, gpa);
+			hpa_t hpa = page_to_phys(page);
-			if ((ent & PT_PRESENT_MASK)
+			if (is_shadow_present_pte(ent)
 			    && (ent & PT64_BASE_ADDR_MASK) != hpa)
-				printk(KERN_ERR "audit error: (%s) levels %d"
-				       " gva %lx gpa %llx hpa %llx ent %llx\n",
-				       audit_msg, vcpu->mmu.root_level,
-				       va, gpa, hpa, ent);
+				printk(KERN_ERR "xx audit error: (%s) levels %d"
+				       " gva %lx gpa %llx hpa %llx ent %llx %d\n",
+				       audit_msg, vcpu->arch.mmu.root_level,
+				       va, gpa, hpa, ent,
+				       is_shadow_present_pte(ent));
+			else if (ent == shadow_notrap_nonpresent_pte
+				 && !is_error_hpa(hpa))
+				printk(KERN_ERR "audit: (%s) notrap shadow,"
+				       " valid guest gva %lx\n", audit_msg, va);
+			kvm_release_page_clean(page);
@@ -1383,13 +1694,13 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
-	if (vcpu->mmu.root_level == 4)
-		audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
+	if (vcpu->arch.mmu.root_level == 4)
+		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
 		for (i = 0; i < 4; ++i)
-			if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
+			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
 				audit_mappings_page(vcpu,
-						    vcpu->mmu.pae_root[i],
+						    vcpu->arch.mmu.pae_root[i],
@@ -1404,15 +1715,15 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
 		struct kvm_rmap_desc *d;
 		for (j = 0; j < m->npages; ++j) {
-			struct page *page = m->phys_mem[j];
+			unsigned long *rmapp = &m->rmap[j];
-			if (!page->private)
-			if (!(page->private & 1)) {
+			if (!(*rmapp & 1)) {
-			d = (struct kvm_rmap_desc *)(page->private & ~1ul);
+			d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
 				for (k = 0; k < RMAP_EXT; ++k)
 					if (d->shadow_ptes[k])
@@ -1429,13 +1740,13 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
 static int count_writable_mappings(struct kvm_vcpu *vcpu)
-	struct kvm_mmu_page *page;
+	struct kvm_mmu_page *sp;
-	list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
-		u64 *pt = page->spt;
+	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+		u64 *pt = sp->spt;
-		if (page->role.level != PT_PAGE_TABLE_LEVEL)
+		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
 		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
@@ -1463,23 +1774,23 @@ static void audit_rmap(struct kvm_vcpu *vcpu)
 static void audit_write_protection(struct kvm_vcpu *vcpu)
-	struct kvm_mmu_page *page;
-	list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
+	struct kvm_mmu_page *sp;
+	struct kvm_memory_slot *slot;
+	unsigned long *rmapp;
-		if (page->role.metaphysical)
+	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+		if (sp->role.metaphysical)
-		hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
-		pg = pfn_to_page(hfn);
+		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
+		gfn = unalias_gfn(vcpu->kvm, sp->gfn);
+		rmapp = &slot->rmap[gfn - slot->base_gfn];
 			printk(KERN_ERR "%s: (%s) shadow page has writable"
 			       " mappings: gfn %lx role %x\n",
-			       __FUNCTION__, audit_msg, page->gfn,
+			       __FUNCTION__, audit_msg, sp->gfn,
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
new file mode 100644
index 0000000..1fce19e
+++ b/arch/x86/kvm/mmu.h
+#ifndef __KVM_X86_MMU_H
+#define __KVM_X86_MMU_H
+#include <linux/kvm_host.h>
+static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+	if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
+		__kvm_mmu_free_some_pages(vcpu);
+static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
+	if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+	return kvm_mmu_load(vcpu);
+static inline int is_long_mode(struct kvm_vcpu *vcpu)
+#ifdef CONFIG_X86_64
+	return vcpu->arch.shadow_efer & EFER_LME;
+static inline int is_pae(struct kvm_vcpu *vcpu)
+	return vcpu->arch.cr4 & X86_CR4_PAE;
+static inline int is_pse(struct kvm_vcpu *vcpu)
+	return vcpu->arch.cr4 & X86_CR4_PSE;
+static inline int is_paging(struct kvm_vcpu *vcpu)
+	return vcpu->arch.cr0 & X86_CR0_PG;
diff --git a/drivers/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
similarity index 50%
rename from drivers/kvm/paging_tmpl.h
rename to arch/x86/kvm/paging_tmpl.h
index 6b094b4..56b88f7 100644
--- a/drivers/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
 	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
 	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 	#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
+	#define PT_LEVEL_BITS PT64_LEVEL_BITS
 	#ifdef CONFIG_X86_64
 	#define PT_MAX_FULL_LEVELS 4
+	#define CMPXCHG cmpxchg
+	#define CMPXCHG cmpxchg64
 	#define PT_MAX_FULL_LEVELS 2
 	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
 	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 	#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
+	#define PT_LEVEL_BITS PT32_LEVEL_BITS
 	#define PT_MAX_FULL_LEVELS 2
+	#define CMPXCHG cmpxchg
 	#error Invalid PTTYPE value
+#define gpte_to_gfn FNAME(gpte_to_gfn)
+#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
  * The guest_walker structure emulates the behavior of the hardware page
 struct guest_walker {
 	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
-	pt_element_t *table;
-	pt_element_t *ptep;
-	struct page *page;
-	pt_element_t inherited_ar;
+	pt_element_t ptes[PT_MAX_FULL_LEVELS];
+	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+	unsigned pt_access;
+	unsigned pte_access;
+static gfn_t gpte_to_gfn(pt_element_t gpte)
+	return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
+static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
+	return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
+			 gfn_t table_gfn, unsigned index,
+			 pt_element_t orig_pte, pt_element_t new_pte)
+	pt_element_t *table;
+	struct page *page;
+	page = gfn_to_page(kvm, table_gfn);
+	table = kmap_atomic(page, KM_USER0);
+	ret = CMPXCHG(&table[index], orig_pte, new_pte);
+	kunmap_atomic(table, KM_USER0);
+	kvm_release_page_dirty(page);
+	return (ret != orig_pte);
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
+	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+	access &= ~(gpte >> PT64_NX_SHIFT);
  * Fetch a guest pte for a guest virtual address
@@ -74,103 +122,104 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 			    struct kvm_vcpu *vcpu, gva_t addr,
 			    int write_fault, int user_fault, int fetch_fault)
-	struct kvm_memory_slot *slot;
-	pt_element_t *ptep;
-	pt_element_t root;
+	unsigned index, pt_access, pte_access;
 	pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
-	walker->level = vcpu->mmu.root_level;
-	walker->table = NULL;
-	walker->page = NULL;
-	walker->ptep = NULL;
+	walker->level = vcpu->arch.mmu.root_level;
+	pte = vcpu->arch.cr3;
 	if (!is_long_mode(vcpu)) {
-		walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
-		root = *walker->ptep;
-		walker->pte = root;
-		if (!(root & PT_PRESENT_MASK))
+		pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
+		if (!is_present_pte(pte))
-	table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
-	walker->table_gfn[walker->level - 1] = table_gfn;
-	pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
-		 walker->level - 1, table_gfn);
-	slot = gfn_to_memslot(vcpu->kvm, table_gfn);
-	hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
-	walker->page = pfn_to_page(hpa >> PAGE_SHIFT);
-	walker->table = kmap_atomic(walker->page, KM_USER0);
 	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
 	       (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
-	walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
+	pt_access = ACC_ALL;
-		int index = PT_INDEX(addr, walker->level);
+		index = PT_INDEX(addr, walker->level);
-		ptep = &walker->table[index];
-		walker->index = index;
-		ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
-		       ((unsigned long)ptep & PAGE_MASK));
+		table_gfn = gpte_to_gfn(pte);
+		pte_gpa = gfn_to_gpa(table_gfn);
+		pte_gpa += index * sizeof(pt_element_t);
+		walker->table_gfn[walker->level - 1] = table_gfn;
+		walker->pte_gpa[walker->level - 1] = pte_gpa;
+		pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
+			 walker->level - 1, table_gfn);
+		kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
-		if (!is_present_pte(*ptep))
+		if (!is_present_pte(pte))
-		if (write_fault && !is_writeble_pte(*ptep))
+		if (write_fault && !is_writeble_pte(pte))
 			if (user_fault || is_write_protection(vcpu))
-		if (user_fault && !(*ptep & PT_USER_MASK))
+		if (user_fault && !(pte & PT_USER_MASK))
-		if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK))
+		if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
-		if (!(*ptep & PT_ACCESSED_MASK)) {
+		if (!(pte & PT_ACCESSED_MASK)) {
 			mark_page_dirty(vcpu->kvm, table_gfn);
-			*ptep |= PT_ACCESSED_MASK;
+			if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
+			    index, pte, pte|PT_ACCESSED_MASK))
+			pte |= PT_ACCESSED_MASK;
+		pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
+		walker->ptes[walker->level - 1] = pte;
 		if (walker->level == PT_PAGE_TABLE_LEVEL) {
-			walker->gfn = (*ptep & PT_BASE_ADDR_MASK)
+			walker->gfn = gpte_to_gfn(pte);
 		if (walker->level == PT_DIRECTORY_LEVEL
-		    && (*ptep & PT_PAGE_SIZE_MASK)
+		    && (pte & PT_PAGE_SIZE_MASK)
 		    && (PTTYPE == 64 || is_pse(vcpu))) {
-			walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK)
+			walker->gfn = gpte_to_gfn_pde(pte);
 			walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
+			if (PTTYPE == 32 && is_cpuid_PSE36())
+				walker->gfn += pse36_gfn_delta(pte);
-		walker->inherited_ar &= walker->table[index];
-		table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
-		kunmap_atomic(walker->table, KM_USER0);
-		paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT);
-		walker->page = pfn_to_page(paddr >> PAGE_SHIFT);
-		walker->table = kmap_atomic(walker->page, KM_USER0);
+		pt_access = pte_access;
-		walker->table_gfn[walker->level - 1 ] = table_gfn;
-		pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
-			 walker->level - 1, table_gfn);
-	walker->pte = *ptep;
-	walker->ptep = NULL;
-	if (walker->table)
-		kunmap_atomic(walker->table, KM_USER0);
-	pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
+	if (write_fault && !is_dirty_pte(pte)) {
+		mark_page_dirty(vcpu->kvm, table_gfn);
+		ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
+			    pte|PT_DIRTY_MASK);
+		pte |= PT_DIRTY_MASK;
+		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
+		walker->ptes[walker->level - 1] = pte;
+	walker->pt_access = pt_access;
+	walker->pte_access = pte_access;
+	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
+		 __FUNCTION__, (u64)pte, pt_access, pte_access);
@@ -187,153 +236,28 @@ err:
 		walker->error_code |= PFERR_USER_MASK;
 		walker->error_code |= PFERR_FETCH_MASK;
-	if (walker->table)
-		kunmap_atomic(walker->table, KM_USER0);
-static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
-					struct guest_walker *walker)
-	mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]);
-static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
-				  pt_element_t gpte,
-				  struct guest_walker *walker,
-	int dirty = gpte & PT_DIRTY_MASK;
-	u64 spte = *shadow_pte;
-	int was_rmapped = is_rmap_pte(spte);
-	pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
-		 " user_fault %d gfn %lx\n",
-		 __FUNCTION__, spte, (u64)gpte, access_bits,
-		 write_fault, user_fault, gfn);
-	if (write_fault && !dirty) {
-		pt_element_t *guest_ent, *tmp = NULL;
-			guest_ent = walker->ptep;
-			tmp = kmap_atomic(walker->page, KM_USER0);
-			guest_ent = &tmp[walker->index];
-		*guest_ent |= PT_DIRTY_MASK;
-		if (!walker->ptep)
-			kunmap_atomic(tmp, KM_USER0);
-		FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
-	spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
-	spte |= gpte & PT64_NX_MASK;
-		access_bits &= ~PT_WRITABLE_MASK;
-	paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
-	spte |= PT_PRESENT_MASK;
-	if (access_bits & PT_USER_MASK)
-		spte |= PT_USER_MASK;
-	if (is_error_hpa(paddr)) {
-		spte |= PT_SHADOW_IO_MARK;
-		spte &= ~PT_PRESENT_MASK;
-		set_shadow_pte(shadow_pte, spte);
-	if ((access_bits & PT_WRITABLE_MASK)
-	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
-		struct kvm_mmu_page *shadow;
-		spte |= PT_WRITABLE_MASK;
-			mmu_unshadow(vcpu, gfn);
-		shadow = kvm_mmu_lookup_page(vcpu, gfn);
-			pgprintk("%s: found shadow page for %lx, marking ro\n",
-				 __FUNCTION__, gfn);
-			access_bits &= ~PT_WRITABLE_MASK;
-			if (is_writeble_pte(spte)) {
-				spte &= ~PT_WRITABLE_MASK;
-				kvm_x86_ops->tlb_flush(vcpu);
-	if (access_bits & PT_WRITABLE_MASK)
-		mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
-	set_shadow_pte(shadow_pte, spte);
-	page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
-		rmap_add(vcpu, shadow_pte);
-static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte,
-			   u64 *shadow_pte, u64 access_bits,
-			   int user_fault, int write_fault, int *ptwrite,
-			   struct guest_walker *walker, gfn_t gfn)
-	access_bits &= gpte;
-	FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK,
-			      gpte, access_bits, user_fault, write_fault,
-			      ptwrite, walker, gfn);
 static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
-			      u64 *spte, const void *pte, int bytes)
+			      u64 *spte, const void *pte, int bytes,
+			      int offset_in_pte)
+	unsigned pte_access;
-	if (bytes < sizeof(pt_element_t))
 	gpte = *(const pt_element_t *)pte;
-	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
+	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
+		if (!offset_in_pte && !is_present_pte(gpte))
+			set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
+	if (bytes < sizeof(pt_element_t))
 	pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
-	FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
-		       (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT);
-static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde,
-			   u64 *shadow_pte, u64 access_bits,
-			   int user_fault, int write_fault, int *ptwrite,
-			   struct guest_walker *walker, gfn_t gfn)
-	access_bits &= gpde;
-	gaddr = (gpa_t)gfn << PAGE_SHIFT;
-	if (PTTYPE == 32 && is_cpuid_PSE36())
-		gaddr |= (gpde & PT32_DIR_PSE36_MASK) <<
-			(32 - PT32_DIR_PSE36_SHIFT);
-	FNAME(set_pte_common)(vcpu, shadow_pte, gaddr,
-			      gpde, access_bits, user_fault, write_fault,
-			      ptwrite, walker, gfn);
+	pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
+	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
+		     gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte));
@@ -346,15 +270,15 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
-	u64 *prev_shadow_ent = NULL;
+	unsigned access = walker->pt_access;
-	if (!is_present_pte(walker->pte))
+	if (!is_present_pte(walker->ptes[walker->level - 1]))
-	shadow_addr = vcpu->mmu.root_hpa;
-	level = vcpu->mmu.shadow_root_level;
+	shadow_addr = vcpu->arch.mmu.root_hpa;
+	level = vcpu->arch.mmu.shadow_root_level;
 	if (level == PT32E_ROOT_LEVEL) {
-		shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3];
+		shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
 		shadow_addr &= PT64_BASE_ADDR_MASK;
@@ -365,14 +289,13 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
-		unsigned hugepage_access = 0;
+		bool new_page = 0;
 		shadow_ent = ((u64 *)__va(shadow_addr)) + index;
-		if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
+		if (is_shadow_present_pte(*shadow_ent)) {
 			if (level == PT_PAGE_TABLE_LEVEL)
 			shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
-			prev_shadow_ent = shadow_ent;
@@ -382,37 +305,34 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		if (level - 1 == PT_PAGE_TABLE_LEVEL
 		    && walker->level == PT_DIRECTORY_LEVEL) {
-			hugepage_access = walker->pte;
-			hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK;
-			if (walker->pte & PT64_NX_MASK)
-				hugepage_access |= (1 << 2);
-			hugepage_access >>= PT_WRITABLE_SHIFT;
-			table_gfn = (walker->pte & PT_BASE_ADDR_MASK)
+			if (!is_dirty_pte(walker->ptes[level - 1]))
+				access &= ~ACC_WRITE_MASK;
+			table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
 			table_gfn = walker->table_gfn[level - 2];
 		shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
-					       metaphysical, hugepage_access,
+					       metaphysical, access,
+					       shadow_ent, &new_page);
+		if (new_page && !metaphysical) {
+			pt_element_t curr_pte;
+			kvm_read_guest(vcpu->kvm, walker->pte_gpa[level - 2],
+				       &curr_pte, sizeof(curr_pte));
+			if (curr_pte != walker->ptes[level - 2])
 		shadow_addr = __pa(shadow_page->spt);
 		shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
 			| PT_WRITABLE_MASK | PT_USER_MASK;
 		*shadow_ent = shadow_pte;
-		prev_shadow_ent = shadow_ent;
-	if (walker->level == PT_DIRECTORY_LEVEL) {
-		FNAME(set_pde)(vcpu, walker->pte, shadow_ent,
-			       walker->inherited_ar, user_fault, write_fault,
-			       ptwrite, walker, walker->gfn);
-		ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
-		FNAME(set_pte)(vcpu, walker->pte, shadow_ent,
-			       walker->inherited_ar, user_fault, write_fault,
-			       ptwrite, walker, walker->gfn);
+	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
+		     user_fault, write_fault,
+		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
+		     ptwrite, walker->gfn);
@@ -460,7 +380,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		pgprintk("%s: guest page fault\n", __FUNCTION__);
 		inject_page_fault(vcpu, addr, walker.error_code);
-		vcpu->last_pt_write_count = 0; /* reset fork detector */
+		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
@@ -470,12 +390,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		 shadow_pte, *shadow_pte, write_pt);
-		vcpu->last_pt_write_count = 0; /* reset fork detector */
+		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
 	 * mmio: emulate if accessible, otherwise its a guest fault.
-	if (is_io_pte(*shadow_pte))
+	if (shadow_pte && is_io_pte(*shadow_pte))
 	++vcpu->stat.pf_fixed;
@@ -493,13 +413,39 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
 	r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
-		gpa = (gpa_t)walker.gfn << PAGE_SHIFT;
+		gpa = gfn_to_gpa(walker.gfn);
 		gpa |= vaddr & ~PAGE_MASK;
+static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
+				 struct kvm_mmu_page *sp)
+	int i, offset = 0;
+	pt_element_t *gpt;
+	struct page *page;
+	if (sp->role.metaphysical
+	    || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
+		nonpaging_prefetch_page(vcpu, sp);
+		offset = sp->role.quadrant << PT64_LEVEL_BITS;
+	page = gfn_to_page(vcpu->kvm, sp->gfn);
+	gpt = kmap_atomic(page, KM_USER0);
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+		if (is_present_pte(gpt[offset + i]))
+			sp->spt[i] = shadow_trap_nonpresent_pte;
+			sp->spt[i] = shadow_notrap_nonpresent_pte;
+	kunmap_atomic(gpt, KM_USER0);
+	kvm_release_page_clean(page);
@@ -508,4 +454,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
 #undef SHADOW_PT_INDEX
 #undef PT_LEVEL_MASK
 #undef PT_DIR_BASE_ADDR_MASK
+#undef PT_LEVEL_BITS
 #undef PT_MAX_FULL_LEVELS
+#undef gpte_to_gfn_pde
diff --git a/arch/x86/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h
new file mode 100644
index 0000000..56fc4c8
+++ b/arch/x86/kvm/segment_descriptor.h
+#ifndef __SEGMENT_DESCRIPTOR_H
+#define __SEGMENT_DESCRIPTOR_H
+struct segment_descriptor {
+	u8 limit_high : 4;
+	u8 default_op : 1;
+	u8 granularity : 1;
+} __attribute__((packed));
+#ifdef CONFIG_X86_64
+/* LDT or TSS descriptor in the GDT. 16 bytes. */
+struct segment_descriptor_64 {
+	struct segment_descriptor s;
diff --git a/drivers/kvm/svm.c b/arch/x86/kvm/svm.c
similarity index 84%
rename from drivers/kvm/svm.c
rename to arch/x86/kvm/svm.c
index 4e04e49..745b1ec 100644
--- a/drivers/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
  * the COPYING file in the top-level directory.
+#include <linux/kvm_host.h>
 #include "kvm_svm.h"
-#include "x86_emulate.h"
 #include <linux/module.h>
 #include <linux/kernel.h>
@@ -42,9 +43,6 @@ MODULE_LICENSE("GPL");
 #define SEG_TYPE_LDT 2
 #define SEG_TYPE_BUSY_TSS16 3
-#define KVM_EFER_LMA (1 << 10)
-#define KVM_EFER_LME (1 << 8)
 #define SVM_FEATURE_NPT  (1 << 0)
 #define SVM_FEATURE_LBRV (1 << 1)
 #define SVM_DEATURE_SVML (1 << 2)
@@ -102,20 +100,20 @@ static inline u32 svm_has(u32 feat)
 static inline u8 pop_irq(struct kvm_vcpu *vcpu)
-	int word_index = __ffs(vcpu->irq_summary);
-	int bit_index = __ffs(vcpu->irq_pending[word_index]);
+	int word_index = __ffs(vcpu->arch.irq_summary);
+	int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
 	int irq = word_index * BITS_PER_LONG + bit_index;
-	clear_bit(bit_index, &vcpu->irq_pending[word_index]);
-	if (!vcpu->irq_pending[word_index])
-		clear_bit(word_index, &vcpu->irq_summary);
+	clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+	if (!vcpu->arch.irq_pending[word_index])
+		clear_bit(word_index, &vcpu->arch.irq_summary);
 static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
-	set_bit(irq, vcpu->irq_pending);
-	set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
+	set_bit(irq, vcpu->arch.irq_pending);
+	set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
 static inline void clgi(void)
@@ -184,35 +182,30 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
 static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
-	if (!(efer & KVM_EFER_LMA))
-		efer &= ~KVM_EFER_LME;
+	if (!(efer & EFER_LMA))
+		efer &= ~EFER_LME;
 	to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
-	vcpu->shadow_efer = efer;
+	vcpu->arch.shadow_efer = efer;
-static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
+static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+				bool has_error_code, u32 error_code)
 	struct vcpu_svm *svm = to_svm(vcpu);
-	svm->vmcb->control.event_inj =		SVM_EVTINJ_VALID |
-						SVM_EVTINJ_VALID_ERR |
-						SVM_EVTINJ_TYPE_EXEPT |
+	svm->vmcb->control.event_inj = nr
+		| SVM_EVTINJ_VALID
+		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+		| SVM_EVTINJ_TYPE_EXEPT;
 	svm->vmcb->control.event_inj_err = error_code;
-static void inject_ud(struct kvm_vcpu *vcpu)
+static bool svm_exception_injected(struct kvm_vcpu *vcpu)
-	to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID |
-						SVM_EVTINJ_TYPE_EXEPT |
+	struct vcpu_svm *svm = to_svm(vcpu);
-static int is_page_fault(uint32_t info)
-	info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
-	return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT);
+	return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
 static int is_external_interrupt(u32 info)
@@ -229,17 +222,16 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 		printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
-	if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) {
+	if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
 		printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
 		       svm->vmcb->save.rip,
-	vcpu->rip = svm->vmcb->save.rip = svm->next_rip;
+	vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
 	svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
-	vcpu->interrupt_window_open = 1;
+	vcpu->arch.interrupt_window_open = 1;
 static int has_svm(void)
@@ -312,7 +304,7 @@ static void svm_hardware_enable(void *garbage)
 	svm_data->next_asid = svm_data->max_asid + 1;
 	svm_features = cpuid_edx(SVM_CPUID_FUNC);
-	asm volatile ( "sgdt %0" : "=m"(gdt_descr) );
+	asm volatile ("sgdt %0" : "=m"(gdt_descr));
 	gdt = (struct desc_struct *)gdt_descr.address;
 	svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
@@ -458,11 +450,13 @@ static void init_vmcb(struct vmcb *vmcb)
 	control->intercept_cr_read = 	INTERCEPT_CR0_MASK |
 					INTERCEPT_CR3_MASK |
-					INTERCEPT_CR4_MASK;
+					INTERCEPT_CR4_MASK |
+					INTERCEPT_CR8_MASK;
 	control->intercept_cr_write = 	INTERCEPT_CR0_MASK |
 					INTERCEPT_CR3_MASK |
-					INTERCEPT_CR4_MASK;
+					INTERCEPT_CR4_MASK |
+					INTERCEPT_CR8_MASK;
 	control->intercept_dr_read = 	INTERCEPT_DR0_MASK |
 					INTERCEPT_DR1_MASK |
@@ -476,7 +470,8 @@ static void init_vmcb(struct vmcb *vmcb)
 					INTERCEPT_DR5_MASK |
-	control->intercept_exceptions = 1 << PF_VECTOR;
+	control->intercept_exceptions = (1 << PF_VECTOR) |
 	control->intercept = 	(1ULL << INTERCEPT_INTR) |
@@ -543,8 +538,7 @@ static void init_vmcb(struct vmcb *vmcb)
 	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
 	save->efer = MSR_EFER_SVME_MASK;
-	save->dr6 = 0xffff0ff0;
+	save->dr6 = 0xffff0ff0;
 	save->rip = 0x0000fff0;
@@ -558,7 +552,7 @@ static void init_vmcb(struct vmcb *vmcb)
-static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
+static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -566,9 +560,11 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
 	if (vcpu->vcpu_id != 0) {
 		svm->vmcb->save.rip = 0;
-		svm->vmcb->save.cs.base = svm->vcpu.sipi_vector << 12;
-		svm->vmcb->save.cs.selector = svm->vcpu.sipi_vector << 8;
+		svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
+		svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
 static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
@@ -587,12 +583,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
-	if (irqchip_in_kernel(kvm)) {
-		err = kvm_create_lapic(&svm->vcpu);
 	page = alloc_page(GFP_KERNEL);
@@ -608,9 +598,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	fx_init(&svm->vcpu);
 	svm->vcpu.fpu_active = 1;
-	svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+	svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
 	if (svm->vcpu.vcpu_id == 0)
-		svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP;
+		svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
@@ -644,7 +634,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-		delta = vcpu->host_tsc - tsc_this;
+		delta = vcpu->arch.host_tsc - tsc_this;
 		svm->vmcb->control.tsc_offset += delta;
 		kvm_migrate_apic_timer(vcpu);
@@ -659,11 +649,11 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
+	++vcpu->stat.host_state_reload;
 	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
 		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-	rdtscll(vcpu->host_tsc);
-	kvm_put_guest_fpu(vcpu);
+	rdtscll(vcpu->arch.host_tsc);
 static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
@@ -674,17 +664,17 @@ static void svm_cache_regs(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
-	vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
-	vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
-	vcpu->rip = svm->vmcb->save.rip;
+	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+	vcpu->arch.rip = svm->vmcb->save.rip;
 static void svm_decache_regs(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
-	svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX];
-	svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP];
-	svm->vmcb->save.rip = vcpu->rip;
+	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+	svm->vmcb->save.rip = vcpu->arch.rip;
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -782,24 +772,24 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	struct vcpu_svm *svm = to_svm(vcpu);
 #ifdef CONFIG_X86_64
-	if (vcpu->shadow_efer & KVM_EFER_LME) {
+	if (vcpu->arch.shadow_efer & EFER_LME) {
 		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
-			vcpu->shadow_efer |= KVM_EFER_LMA;
-			svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME;
+			vcpu->arch.shadow_efer |= EFER_LMA;
+			svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
-		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) {
-			vcpu->shadow_efer &= ~KVM_EFER_LMA;
-			svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME);
+		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
+			vcpu->arch.shadow_efer &= ~EFER_LMA;
+			svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
-	if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
+	if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
 		svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
 		vcpu->fpu_active = 1;
+	vcpu->arch.cr0 = cr0;
 	cr0 |= X86_CR0_PG | X86_CR0_WP;
 	cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
 	svm->vmcb->save.cr0 = cr0;
@@ -807,7 +797,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+	vcpu->arch.cr4 = cr4;
 	to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
@@ -912,7 +902,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
 		svm->db_regs[dr] = value;
-		if (vcpu->cr4 & X86_CR4_DE) {
+		if (vcpu->arch.cr4 & X86_CR4_DE) {
 			*exception = UD_VECTOR;
@@ -938,51 +928,30 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	struct kvm *kvm = svm->vcpu.kvm;
-	enum emulation_result er;
 	if (!irqchip_in_kernel(kvm) &&
 		is_external_interrupt(exit_int_info))
 		push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
-	mutex_lock(&kvm->lock);
 	fault_address  = svm->vmcb->control.exit_info_2;
 	error_code = svm->vmcb->control.exit_info_1;
-	r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
-		mutex_unlock(&kvm->lock);
-		mutex_unlock(&kvm->lock);
-	er = emulate_instruction(&svm->vcpu, kvm_run, fault_address,
-	mutex_unlock(&kvm->lock);
+	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
-	case EMULATE_DONE:
-	case EMULATE_DO_MMIO:
-		++svm->vcpu.stat.mmio_exits;
-	case EMULATE_FAIL:
-		kvm_report_emulation_failure(&svm->vcpu, "pagetable");
+static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-	kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+	er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0);
+	if (er != EMULATE_DONE)
+		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
 static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
-	if (!(svm->vcpu.cr0 & X86_CR0_TS))
+	if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
 		svm->vmcb->save.cr0 &= ~X86_CR0_TS;
 	svm->vcpu.fpu_active = 1;
@@ -1004,7 +973,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-	u32 io_info = svm->vmcb->control.exit_info_1; //address size bug?
+	u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
 	int size, down, in, string, rep;
@@ -1015,7 +984,8 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	string = (io_info & SVM_IOIO_STR_MASK) != 0;
-		if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
+		if (emulate_instruction(&svm->vcpu,
+					kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
@@ -1045,13 +1015,14 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	svm->next_rip = svm->vmcb->save.rip + 3;
 	skip_emulated_instruction(&svm->vcpu);
-	return kvm_hypercall(&svm->vcpu, kvm_run);
+	kvm_emulate_hypercall(&svm->vcpu);
 static int invalid_op_interception(struct vcpu_svm *svm,
 				   struct kvm_run *kvm_run)
-	inject_ud(&svm->vcpu);
+	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
@@ -1073,11 +1044,20 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 static int emulate_on_interception(struct vcpu_svm *svm,
 				   struct kvm_run *kvm_run)
-	if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE)
+	if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
 		pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
+static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+	emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
+	if (irqchip_in_kernel(svm->vcpu.kvm))
+	kvm_run->exit_reason = KVM_EXIT_SET_TPR;
 static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -1124,14 +1104,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-	u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
+	u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
 	if (svm_get_msr(&svm->vcpu, ecx, &data))
-		svm_inject_gp(&svm->vcpu, 0);
+		kvm_inject_gp(&svm->vcpu, 0);
 		svm->vmcb->save.rax = data & 0xffffffff;
-		svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32;
+		svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
 		svm->next_rip = svm->vmcb->save.rip + 2;
 		skip_emulated_instruction(&svm->vcpu);
@@ -1176,7 +1156,20 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 	case MSR_IA32_SYSENTER_ESP:
 		svm->vmcb->save.sysenter_esp = data;
+	case MSR_K7_EVNTSEL0:
+	case MSR_K7_EVNTSEL1:
+	case MSR_K7_EVNTSEL2:
+	case MSR_K7_EVNTSEL3:
+		 * only support writing 0 to the performance counters for now
+		 * to make Windows happy. Should be replaced by a real
+		 * performance counter emulation later.
 		return kvm_set_msr_common(vcpu, ecx, data);
@@ -1184,12 +1177,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-	u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
+	u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
 	u64 data = (svm->vmcb->save.rax & -1u)
-		| ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32);
+		| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
 	svm->next_rip = svm->vmcb->save.rip + 2;
 	if (svm_set_msr(&svm->vcpu, ecx, data))
-		svm_inject_gp(&svm->vcpu, 0);
+		kvm_inject_gp(&svm->vcpu, 0);
 		skip_emulated_instruction(&svm->vcpu);
@@ -1213,7 +1206,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
 	if (kvm_run->request_interrupt_window &&
-	    !svm->vcpu.irq_summary) {
+	    !svm->vcpu.arch.irq_summary) {
 		++svm->vcpu.stat.irq_window_exits;
 		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
@@ -1227,10 +1220,12 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
 	[SVM_EXIT_READ_CR0]           		= emulate_on_interception,
 	[SVM_EXIT_READ_CR3]           		= emulate_on_interception,
 	[SVM_EXIT_READ_CR4]           		= emulate_on_interception,
+	[SVM_EXIT_READ_CR8]           		= emulate_on_interception,
 	[SVM_EXIT_WRITE_CR0]          		= emulate_on_interception,
 	[SVM_EXIT_WRITE_CR3]          		= emulate_on_interception,
 	[SVM_EXIT_WRITE_CR4]          		= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR8]          		= cr8_write_interception,
 	[SVM_EXIT_READ_DR0] 			= emulate_on_interception,
 	[SVM_EXIT_READ_DR1]			= emulate_on_interception,
 	[SVM_EXIT_READ_DR2] 			= emulate_on_interception,
@@ -1241,6 +1236,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
 	[SVM_EXIT_WRITE_DR3]			= emulate_on_interception,
 	[SVM_EXIT_WRITE_DR5]			= emulate_on_interception,
 	[SVM_EXIT_WRITE_DR7]			= emulate_on_interception,
+	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
 	[SVM_EXIT_EXCP_BASE + PF_VECTOR] 	= pf_interception,
 	[SVM_EXIT_EXCP_BASE + NM_VECTOR] 	= nm_interception,
 	[SVM_EXIT_INTR] 			= nop_on_interception,
@@ -1293,7 +1289,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
-	    || svm_exit_handlers[exit_code] == 0) {
+	    || !svm_exit_handlers[exit_code]) {
 		kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
 		kvm_run->hw.hardware_exit_reason = exit_code;
@@ -1307,7 +1303,7 @@ static void reload_tss(struct kvm_vcpu *vcpu)
 	int cpu = raw_smp_processor_id();
 	struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
-	svm_data->tss_desc->type = 9; //available 32/64-bit TSS
+	svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
@@ -1348,7 +1344,6 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu)
 	struct vmcb *vmcb = svm->vmcb;
 	int intr_vector = -1;
-	kvm_inject_pending_timer_irqs(vcpu);
 	if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
 	    ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
 		intr_vector = vmcb->control.exit_int_info &
@@ -1388,20 +1383,20 @@ static void kvm_reput_irq(struct vcpu_svm *svm)
 		push_irq(&svm->vcpu, control->int_vector);
-	svm->vcpu.interrupt_window_open =
+	svm->vcpu.arch.interrupt_window_open =
 		!(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
 static void svm_do_inject_vector(struct vcpu_svm *svm)
 	struct kvm_vcpu *vcpu = &svm->vcpu;
-	int word_index = __ffs(vcpu->irq_summary);
-	int bit_index = __ffs(vcpu->irq_pending[word_index]);
+	int word_index = __ffs(vcpu->arch.irq_summary);
+	int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
 	int irq = word_index * BITS_PER_LONG + bit_index;
-	clear_bit(bit_index, &vcpu->irq_pending[word_index]);
-	if (!vcpu->irq_pending[word_index])
-		clear_bit(word_index, &vcpu->irq_summary);
+	clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+	if (!vcpu->arch.irq_pending[word_index])
+		clear_bit(word_index, &vcpu->arch.irq_summary);
 	svm_inject_irq(svm, irq);
@@ -1411,11 +1406,11 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb_control_area *control = &svm->vmcb->control;
-	svm->vcpu.interrupt_window_open =
+	svm->vcpu.arch.interrupt_window_open =
 		(!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
 		 (svm->vmcb->save.rflags & X86_EFLAGS_IF));
-	if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary)
+	if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
 		 * If interrupts enabled, and not blocked by sti or mov ss. Good.
@@ -1424,13 +1419,18 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 	 * Interrupts blocked.  Wait for unblock.
-	if (!svm->vcpu.interrupt_window_open &&
-	    (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) {
+	if (!svm->vcpu.arch.interrupt_window_open &&
+	    (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
 		control->intercept |= 1ULL << INTERCEPT_VINTR;
 		control->intercept &= ~(1ULL << INTERCEPT_VINTR);
+static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
 static void save_db_regs(unsigned long *db_regs)
 	asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
@@ -1472,7 +1472,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	svm->host_cr2 = kvm_read_cr2();
 	svm->host_dr6 = read_dr6();
 	svm->host_dr7 = read_dr7();
-	svm->vmcb->save.cr2 = vcpu->cr2;
+	svm->vmcb->save.cr2 = vcpu->arch.cr2;
 	if (svm->vmcb->save.dr7 & 0xff) {
@@ -1486,13 +1486,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 #ifdef CONFIG_X86_64
-		"push %%rbx; push %%rcx; push %%rdx;"
-		"push %%rsi; push %%rdi; push %%rbp;"
-		"push %%r8;  push %%r9;  push %%r10; push %%r11;"
-		"push %%r12; push %%r13; push %%r14; push %%r15;"
+		"push %%rbp; \n\t"
-		"push %%ebx; push %%ecx; push %%edx;"
-		"push %%esi; push %%edi; push %%ebp;"
+		"push %%ebp; \n\t"
 #ifdef CONFIG_X86_64
@@ -1554,10 +1550,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		"mov %%r14, %c[r14](%[svm]) \n\t"
 		"mov %%r15, %c[r15](%[svm]) \n\t"
-		"pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
-		"pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
-		"pop  %%rbp; pop  %%rdi; pop  %%rsi;"
-		"pop  %%rdx; pop  %%rcx; pop  %%rbx; \n\t"
 		"mov %%ebx, %c[rbx](%[svm]) \n\t"
 		"mov %%ecx, %c[rcx](%[svm]) \n\t"
@@ -1566,34 +1559,40 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		"mov %%edi, %c[rdi](%[svm]) \n\t"
 		"mov %%ebp, %c[rbp](%[svm]) \n\t"
-		"pop  %%ebp; pop  %%edi; pop  %%esi;"
-		"pop  %%edx; pop  %%ecx; pop  %%ebx; \n\t"
 		  [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
-		  [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])),
-		  [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])),
-		  [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])),
-		  [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])),
-		  [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])),
-		  [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP]))
+		  [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
+		  [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
+		  [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
+		  [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
+		  [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
+		  [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
+#ifdef CONFIG_X86_64
+		  , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
+		  [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
+		  [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
+		  [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
+		  [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
+		  [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
+		  [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
+		  [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
 #ifdef CONFIG_X86_64
-		  ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])),
-		  [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])),
-		  [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])),
-		  [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])),
-		  [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])),
-		  [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])),
-		  [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])),
-		  [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15]))
+		, "rbx", "rcx", "rdx", "rsi", "rdi"
+		, "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
+		, "ebx", "ecx", "edx" , "esi", "edi"
-		: "cc", "memory" );
 	if ((svm->vmcb->save.dr7 & 0xff))
 		load_db_regs(svm->host_db_regs);
-	vcpu->cr2 = svm->vmcb->save.cr2;
+	vcpu->arch.cr2 = svm->vmcb->save.cr2;
 	write_dr6(svm->host_dr6);
 	write_dr7(svm->host_dr7);
@@ -1627,34 +1626,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
-static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
-				  unsigned long  addr,
-				  uint32_t err_code)
-	struct vcpu_svm *svm = to_svm(vcpu);
-	uint32_t exit_int_info = svm->vmcb->control.exit_int_info;
-	++vcpu->stat.pf_guest;
-	if (is_page_fault(exit_int_info)) {
-		svm->vmcb->control.event_inj_err = 0;
-		svm->vmcb->control.event_inj = 	SVM_EVTINJ_VALID |
-						SVM_EVTINJ_VALID_ERR |
-						SVM_EVTINJ_TYPE_EXEPT |
-	svm->vmcb->save.cr2 = addr;
-	svm->vmcb->control.event_inj = 	SVM_EVTINJ_VALID |
-					SVM_EVTINJ_VALID_ERR |
-					SVM_EVTINJ_TYPE_EXEPT |
-	svm->vmcb->control.event_inj_err = err_code;
 static int is_disabled(void)
@@ -1675,7 +1646,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
 	hypercall[0] = 0x0f;
 	hypercall[1] = 0x01;
 	hypercall[2] = 0xd9;
-	hypercall[3] = 0xc3;
 static void svm_check_processor_compat(void *rtn)
@@ -1725,9 +1695,6 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.set_rflags = svm_set_rflags,
 	.tlb_flush = svm_flush_tlb,
-	.inject_page_fault = svm_inject_page_fault,
-	.inject_gp = svm_inject_gp,
 	.run = svm_vcpu_run,
 	.handle_exit = handle_exit,
@@ -1735,19 +1702,23 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.patch_hypercall = svm_patch_hypercall,
 	.get_irq = svm_get_irq,
 	.set_irq = svm_set_irq,
+	.queue_exception = svm_queue_exception,
+	.exception_injected = svm_exception_injected,
 	.inject_pending_irq = svm_intr_assist,
 	.inject_pending_vectors = do_interrupt_requests,
+	.set_tss_addr = svm_set_tss_addr,
 static int __init svm_init(void)
-	return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm),
+	return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
 static void __exit svm_exit(void)
 module_init(svm_init)
diff --git a/drivers/kvm/svm.h b/arch/x86/kvm/svm.h
3994
similarity index 98%
3995
rename from drivers/kvm/svm.h
3996
rename to arch/x86/kvm/svm.h
3997
index 3b1b0f3..5fd5049 100644
3998
--- a/drivers/kvm/svm.h
3999
+++ b/arch/x86/kvm/svm.h
4000
@@ -204,6 +204,7 @@ struct __attribute__ ((__packed__)) vmcb {
4001
#define INTERCEPT_CR0_MASK 1
4002
#define INTERCEPT_CR3_MASK (1 << 3)
4003
#define INTERCEPT_CR4_MASK (1 << 4)
4004
+#define INTERCEPT_CR8_MASK (1 << 8)
4006
#define INTERCEPT_DR0_MASK 1
4007
#define INTERCEPT_DR1_MASK (1 << 1)
4008
@@ -311,7 +312,7 @@ struct __attribute__ ((__packed__)) vmcb {
4010
#define SVM_EXIT_ERR -1
4012
-#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) // TS and MP
4013
+#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
4015
#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
4016
#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
4017
diff --git a/drivers/kvm/vmx.c b/arch/x86/kvm/vmx.c
4018
similarity index 75%
4019
rename from drivers/kvm/vmx.c
4020
rename to arch/x86/kvm/vmx.c
4021
index bb56ae3..20c0f5e 100644
4022
--- a/drivers/kvm/vmx.c
4023
+++ b/arch/x86/kvm/vmx.c
4029
-#include "x86_emulate.h"
4032
#include "segment_descriptor.h"
4035
+#include <linux/kvm_host.h>
4036
#include <linux/module.h>
4037
#include <linux/kernel.h>
4038
#include <linux/mm.h>
4039
#include <linux/highmem.h>
4040
#include <linux/sched.h>
4041
+#include <linux/moduleparam.h>
4044
#include <asm/desc.h>
4046
MODULE_AUTHOR("Qumranet");
4047
MODULE_LICENSE("GPL");
4049
+static int bypass_guest_pf = 1;
4050
+module_param(bypass_guest_pf, bool, 0);
4055
@@ -43,6 +47,7 @@ struct vcpu_vmx {
4056
struct kvm_vcpu vcpu;
4059
+ u32 idt_vectoring_info;
4060
struct kvm_msr_entry *guest_msrs;
4061
struct kvm_msr_entry *host_msrs;
4063
@@ -57,8 +62,15 @@ struct vcpu_vmx {
4064
u16 fs_sel, gs_sel, ldt_sel;
4065
int gs_ldt_reload_needed;
4066
int fs_reload_needed;
4069
+ int guest_efer_loaded;
4080
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
4081
@@ -74,14 +86,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
4082
static struct page *vmx_io_bitmap_a;
4083
static struct page *vmx_io_bitmap_b;
4085
-#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE)
4087
static struct vmcs_config {
4091
u32 pin_based_exec_ctrl;
4092
u32 cpu_based_exec_ctrl;
4093
+ u32 cpu_based_2nd_exec_ctrl;
4097
@@ -138,18 +149,6 @@ static void save_msrs(struct kvm_msr_entry *e, int n)
4098
rdmsrl(e[i].index, e[i].data);
4101
-static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr)
4103
- return (u64)msr.data & EFER_SAVE_RESTORE_BITS;
4106
-static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx)
4108
- int efer_offset = vmx->msr_offset_efer;
4109
- return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) !=
4110
- msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
4113
static inline int is_page_fault(u32 intr_info)
4115
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
4116
@@ -164,6 +163,13 @@ static inline int is_no_device(u32 intr_info)
4117
(INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
4120
+static inline int is_invalid_opcode(u32 intr_info)
4122
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
4123
+ INTR_INFO_VALID_MASK)) ==
4124
+ (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
4127
static inline int is_external_interrupt(u32 intr_info)
4129
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
4130
@@ -180,6 +186,24 @@ static inline int vm_need_tpr_shadow(struct kvm *kvm)
4131
return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
4134
+static inline int cpu_has_secondary_exec_ctrls(void)
4136
+ return (vmcs_config.cpu_based_exec_ctrl &
4137
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
4140
+static inline int cpu_has_vmx_virtualize_apic_accesses(void)
4142
+ return (vmcs_config.cpu_based_2nd_exec_ctrl &
4143
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
4146
+static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
4148
+ return ((cpu_has_vmx_virtualize_apic_accesses()) &&
4149
+ (irqchip_in_kernel(kvm)));
4152
static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
4155
@@ -222,16 +246,14 @@ static void __vcpu_clear(void *arg)
4156
vmcs_clear(vmx->vmcs);
4157
if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
4158
per_cpu(current_vmcs, cpu) = NULL;
4159
- rdtscll(vmx->vcpu.host_tsc);
4160
+ rdtscll(vmx->vcpu.arch.host_tsc);
4163
static void vcpu_clear(struct vcpu_vmx *vmx)
4165
- if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1)
4166
- smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear,
4169
- __vcpu_clear(vmx);
4170
+ if (vmx->vcpu.cpu == -1)
4172
+ smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
4176
@@ -275,7 +297,7 @@ static void vmcs_writel(unsigned long field, unsigned long value)
4179
asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
4180
- : "=q"(error) : "a"(value), "d"(field) : "cc" );
4181
+ : "=q"(error) : "a"(value), "d"(field) : "cc");
4182
if (unlikely(error))
4183
vmwrite_error(field, value);
4185
@@ -315,12 +337,12 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
4189
- eb = 1u << PF_VECTOR;
4190
+ eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
4191
if (!vcpu->fpu_active)
4192
eb |= 1u << NM_VECTOR;
4193
if (vcpu->guest_debug.enabled)
4195
- if (vcpu->rmode.active)
4196
+ if (vcpu->arch.rmode.active)
4198
vmcs_write32(EXCEPTION_BITMAP, eb);
4200
@@ -344,16 +366,42 @@ static void reload_tss(void)
4202
static void load_transition_efer(struct vcpu_vmx *vmx)
4205
int efer_offset = vmx->msr_offset_efer;
4206
+ u64 host_efer = vmx->host_msrs[efer_offset].data;
4207
+ u64 guest_efer = vmx->guest_msrs[efer_offset].data;
4210
- trans_efer = vmx->host_msrs[efer_offset].data;
4211
- trans_efer &= ~EFER_SAVE_RESTORE_BITS;
4212
- trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
4213
- wrmsrl(MSR_EFER, trans_efer);
4214
+ if (efer_offset < 0)
4217
+ * NX is emulated; LMA and LME handled by hardware; SCE meaninless
4218
+ * outside long mode
4220
+ ignore_bits = EFER_NX | EFER_SCE;
4221
+#ifdef CONFIG_X86_64
4222
+ ignore_bits |= EFER_LMA | EFER_LME;
4223
+ /* SCE is meaningful only in long mode on Intel */
4224
+ if (guest_efer & EFER_LMA)
4225
+ ignore_bits &= ~(u64)EFER_SCE;
4227
+ if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
4230
+ vmx->host_state.guest_efer_loaded = 1;
4231
+ guest_efer &= ~ignore_bits;
4232
+ guest_efer |= host_efer & ignore_bits;
4233
+ wrmsrl(MSR_EFER, guest_efer);
4234
vmx->vcpu.stat.efer_reload++;
4237
+static void reload_host_efer(struct vcpu_vmx *vmx)
4239
+ if (vmx->host_state.guest_efer_loaded) {
4240
+ vmx->host_state.guest_efer_loaded = 0;
4241
+ load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
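The rewritten load_transition_efer() above replaces the old save/restore-bits test with an ignore-bits comparison: EFER is written on entry only when guest and host values differ in bits that matter while the guest runs. A condensed sketch of that decision, assuming a 64-bit host and the EFER_* masks this file already uses:

/* Condensed restatement of the test above (illustrative only). */
static int efer_write_needed(u64 guest_efer, u64 host_efer)
{
	u64 ignore_bits = EFER_NX | EFER_SCE | EFER_LMA | EFER_LME;

	if (guest_efer & EFER_LMA)	/* SCE does matter in long mode */
		ignore_bits &= ~(u64)EFER_SCE;
	return (guest_efer & ~ignore_bits) != (host_efer & ~ignore_bits);
}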
static void vmx_save_host_state(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -393,14 +441,13 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu)) {
+ if (is_long_mode(&vmx->vcpu))
save_msrs(vmx->host_msrs +
vmx->msr_offset_kernel_gs_base, 1);
load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
- if (msr_efer_need_save_restore(vmx))
- load_transition_efer(vmx);
+ load_transition_efer(vmx);
static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -410,6 +457,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
if (!vmx->host_state.loaded)
+ ++vmx->vcpu.stat.host_state_reload;
vmx->host_state.loaded = 0;
if (vmx->host_state.fs_reload_needed)
load_fs(vmx->host_state.fs_sel);
@@ -429,8 +477,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
load_msrs(vmx->host_msrs, vmx->save_nmsrs);
- if (msr_efer_need_save_restore(vmx))
- load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
+ reload_host_efer(vmx);
@@ -480,7 +527,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* Make sure the time stamp counter is monotonous.
- delta = vcpu->host_tsc - tsc_this;
+ delta = vcpu->arch.host_tsc - tsc_this;
vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
@@ -488,7 +535,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
vmx_load_host_state(to_vmx(vcpu));
- kvm_put_guest_fpu(vcpu);
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -497,7 +543,7 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
vcpu->fpu_active = 1;
vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
- if (vcpu->cr0 & X86_CR0_TS)
+ if (vcpu->arch.cr0 & X86_CR0_TS)
vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
update_exception_bitmap(vcpu);
@@ -523,7 +569,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
- if (vcpu->rmode.active)
+ if (vcpu->arch.rmode.active)
rflags |= IOPL_MASK | X86_EFLAGS_VM;
vmcs_writel(GUEST_RFLAGS, rflags);
@@ -545,19 +591,25 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
if (interruptibility & 3)
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
interruptibility & ~3);
- vcpu->interrupt_window_open = 1;
+ vcpu->arch.interrupt_window_open = 1;
-static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
+static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+ bool has_error_code, u32 error_code)
- printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n",
- vmcs_readl(GUEST_RIP));
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- INTR_TYPE_EXCEPTION |
- INTR_INFO_DELIEVER_CODE_MASK |
- INTR_INFO_VALID_MASK);
+ nr | INTR_TYPE_EXCEPTION
+ | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
+ | INTR_INFO_VALID_MASK);
+ if (has_error_code)
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
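vmx_queue_exception() above packs the event into the VM-entry interruption-information field: the vector sits in the low byte, joined by a type field, an error-code-valid bit, and a valid bit. Restated as a standalone helper (a sketch built only from constants the hunk itself uses):

static u32 entry_intr_info(unsigned nr, bool has_error_code)
{
	return nr | INTR_TYPE_EXCEPTION
		| (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
		| INTR_INFO_VALID_MASK;
}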
@@ -608,7 +660,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
* if efer.sce is enabled.
index = __find_msr_index(vmx, MSR_K6_STAR);
- if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE))
+ if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
move_msr_up(vmx, index, save_nmsrs++);
@@ -712,8 +764,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
#ifdef CONFIG_X86_64
ret = kvm_set_msr_common(vcpu, msr_index, data);
- if (vmx->host_state.loaded)
+ if (vmx->host_state.loaded) {
+ reload_host_efer(vmx);
load_transition_efer(vmx);
vmcs_writel(GUEST_FS_BASE, data);
@@ -750,12 +804,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
* Sync the rsp and rip registers into the vcpu structure. This allows
- * registers to be accessed by indexing vcpu->regs.
+ * registers to be accessed by indexing vcpu->arch.regs.
static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
- vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
- vcpu->rip = vmcs_readl(GUEST_RIP);
+ vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
+ vcpu->arch.rip = vmcs_readl(GUEST_RIP);
@@ -764,8 +818,8 @@ static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
- vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
- vmcs_writel(GUEST_RIP, vcpu->rip);
+ vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+ vmcs_writel(GUEST_RIP, vcpu->arch.rip);
static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
@@ -808,14 +862,15 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
static int vmx_get_irq(struct kvm_vcpu *vcpu)
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 idtv_info_field;
- idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+ idtv_info_field = vmx->idt_vectoring_info;
if (idtv_info_field & INTR_INFO_VALID_MASK) {
if (is_external_interrupt(idtv_info_field))
return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
- printk("pending exception: not handled yet\n");
+ printk(KERN_DEBUG "pending exception: not handled yet\n");
@@ -863,7 +918,7 @@ static void hardware_disable(void *garbage)
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
- u32 msr, u32* result)
+ u32 msr, u32 *result)
u32 vmx_msr_low, vmx_msr_high;
u32 ctl = ctl_min | ctl_opt;
@@ -887,6 +942,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
u32 _pin_based_exec_control = 0;
u32 _cpu_based_exec_control = 0;
+ u32 _cpu_based_2nd_exec_control = 0;
u32 _vmexit_control = 0;
u32 _vmentry_control = 0;
@@ -904,11 +960,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_USE_IO_BITMAPS |
CPU_BASED_MOV_DR_EXITING |
CPU_BASED_USE_TSC_OFFSETING;
-#ifdef CONFIG_X86_64
- opt = CPU_BASED_TPR_SHADOW;
+ opt = CPU_BASED_TPR_SHADOW |
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
&_cpu_based_exec_control) < 0)
@@ -917,6 +970,19 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
~CPU_BASED_CR8_STORE_EXITING;
+ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
+ opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_WBINVD_EXITING;
+ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
+ &_cpu_based_2nd_exec_control) < 0)
+#ifndef CONFIG_X86_64
+ if (!(_cpu_based_2nd_exec_control &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#ifdef CONFIG_X86_64
@@ -954,6 +1020,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
+ vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
@@ -1043,15 +1110,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
unsigned long flags;
- vcpu->rmode.active = 0;
+ vcpu->arch.rmode.active = 0;
- vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
- vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
- vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
+ vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
+ vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
+ vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
flags = vmcs_readl(GUEST_RFLAGS);
flags &= ~(IOPL_MASK | X86_EFLAGS_VM);
- flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
+ flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@ -1059,10 +1126,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
update_exception_bitmap(vcpu);
- fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es);
- fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds);
- fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs);
- fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs);
+ fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
+ fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
+ fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
+ fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
vmcs_write16(GUEST_SS_SELECTOR, 0);
vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
@@ -1072,10 +1139,14 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
-static gva_t rmode_tss_base(struct kvm* kvm)
+static gva_t rmode_tss_base(struct kvm *kvm)
- gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3;
- return base_gfn << PAGE_SHIFT;
+ if (!kvm->arch.tss_addr) {
+ gfn_t base_gfn = kvm->memslots[0].base_gfn +
+ kvm->memslots[0].npages - 3;
+ return base_gfn << PAGE_SHIFT;
+ return kvm->arch.tss_addr;
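For a feel of the fallback path above: when no TSS address has been set, the real-mode TSS is carved out of the last three pages of the first memory slot. A worked example with illustrative numbers only:

/* A 128 MB slot at gfn 0 has npages = 32768, so with 4 KB pages: */
gfn_t base_gfn = 0 + 32768 - 3;            /* = 32765 */
gva_t tss_base = base_gfn << PAGE_SHIFT;   /* = 0x7ffd000 */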
static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
@@ -1086,7 +1157,8 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
save->base = vmcs_readl(sf->base);
save->limit = vmcs_read32(sf->limit);
save->ar = vmcs_read32(sf->ar_bytes);
- vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4);
+ vmcs_write16(sf->selector, save->base >> 4);
+ vmcs_write32(sf->base, save->base & 0xfffff);
vmcs_write32(sf->limit, 0xffff);
vmcs_write32(sf->ar_bytes, 0xf3);
@@ -1095,19 +1167,19 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
unsigned long flags;
- vcpu->rmode.active = 1;
+ vcpu->arch.rmode.active = 1;
- vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
+ vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
- vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
+ vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
- vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
+ vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
flags = vmcs_readl(GUEST_RFLAGS);
- vcpu->rmode.save_iopl = (flags & IOPL_MASK) >> IOPL_SHIFT;
+ vcpu->arch.rmode.save_iopl = (flags & IOPL_MASK) >> IOPL_SHIFT;
flags |= IOPL_MASK | X86_EFLAGS_VM;
@@ -1125,10 +1197,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_CS_BASE, 0xf0000);
vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
- fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es);
- fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds);
- fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
- fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
+ fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
+ fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
+ fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
+ fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
kvm_mmu_reset_context(vcpu);
init_rmode_tss(vcpu->kvm);
@@ -1149,7 +1221,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
| AR_TYPE_BUSY_64_TSS);
- vcpu->shadow_efer |= EFER_LMA;
+ vcpu->arch.shadow_efer |= EFER_LMA;
find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
vmcs_write32(VM_ENTRY_CONTROLS,
@@ -1159,7 +1231,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
static void exit_lmode(struct kvm_vcpu *vcpu)
- vcpu->shadow_efer &= ~EFER_LMA;
+ vcpu->arch.shadow_efer &= ~EFER_LMA;
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS)
@@ -1170,22 +1242,22 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
- vcpu->cr4 &= KVM_GUEST_CR4_MASK;
- vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
+ vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
+ vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmx_fpu_deactivate(vcpu);
- if (vcpu->rmode.active && (cr0 & X86_CR0_PE))
+ if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
- if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE))
+ if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
#ifdef CONFIG_X86_64
- if (vcpu->shadow_efer & EFER_LME) {
+ if (vcpu->arch.shadow_efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
@@ -1196,7 +1268,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0,
(cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
+ vcpu->arch.cr0 = cr0;
if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
vmx_fpu_activate(vcpu);
@@ -1205,16 +1277,16 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
vmcs_writel(GUEST_CR3, cr3);
- if (vcpu->cr0 & X86_CR0_PE)
+ if (vcpu->arch.cr0 & X86_CR0_PE)
vmx_fpu_deactivate(vcpu);
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vmcs_writel(CR4_READ_SHADOW, cr4);
- vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
+ vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
+ vcpu->arch.cr4 = cr4;
#ifdef CONFIG_X86_64
@@ -1224,7 +1296,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
- vcpu->shadow_efer = efer;
+ vcpu->arch.shadow_efer = efer;
if (efer & EFER_LMA) {
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1301,17 +1373,17 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
- if (vcpu->rmode.active && seg == VCPU_SREG_TR) {
- vcpu->rmode.tr.selector = var->selector;
- vcpu->rmode.tr.base = var->base;
- vcpu->rmode.tr.limit = var->limit;
- vcpu->rmode.tr.ar = vmx_segment_access_rights(var);
+ if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
+ vcpu->arch.rmode.tr.selector = var->selector;
+ vcpu->arch.rmode.tr.base = var->base;
+ vcpu->arch.rmode.tr.limit = var->limit;
+ vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
vmcs_writel(sf->base, var->base);
vmcs_write32(sf->limit, var->limit);
vmcs_write16(sf->selector, var->selector);
- if (vcpu->rmode.active && var->s) {
+ if (vcpu->arch.rmode.active && var->s) {
* Hack real-mode segments into vm86 compatibility.
@@ -1355,35 +1427,30 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
vmcs_writel(GUEST_GDTR_BASE, dt->base);
-static int init_rmode_tss(struct kvm* kvm)
+static int init_rmode_tss(struct kvm *kvm)
- struct page *p1, *p2, *p3;
gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
- p1 = gfn_to_page(kvm, fn++);
- p2 = gfn_to_page(kvm, fn++);
- p3 = gfn_to_page(kvm, fn);
- if (!p1 || !p2 || !p3) {
- kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__);
+ r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+ data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
+ r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
+ r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
+ r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+ r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
- page = kmap_atomic(p1, KM_USER0);
- *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
- kunmap_atomic(page, KM_USER0);
- page = kmap_atomic(p2, KM_USER0);
- kunmap_atomic(page, KM_USER0);
- page = kmap_atomic(p3, KM_USER0);
- *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
- kunmap_atomic(page, KM_USER0);
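The rewritten init_rmode_tss() above trades kmap_atomic() poking for the kvm_clear_guest_page()/kvm_write_guest_page() helpers, which take a gfn and return a negative errno on failure. The pattern in isolation (a hedged sketch; the full error handling is abbreviated, and the function appears to report failure as 0, judging by its caller later in this patch):

u16 limit = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
int r;

r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
if (r < 0)
	return 0;
r = kvm_write_guest_page(kvm, fn, &limit, 0x66, sizeof(limit));
if (r < 0)
	return 0;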
@@ -1397,6 +1464,27 @@ static void seg_setup(int seg)
vmcs_write32(sf->ar_bytes, 0x93);
+static int alloc_apic_access_page(struct kvm *kvm)
+ struct kvm_userspace_memory_region kvm_userspace_mem;
+ mutex_lock(&kvm->lock);
+ if (kvm->arch.apic_access_page)
+ kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
+ kvm_userspace_mem.flags = 0;
+ kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
+ kvm_userspace_mem.memory_size = PAGE_SIZE;
+ r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
+ kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
+ mutex_unlock(&kvm->lock);
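alloc_apic_access_page() reserves a one-page private memslot at the APIC MMIO base (0xfee00000) and pins the backing page, giving the SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES control a concrete page to trap on. Its call site appears in vmx_vcpu_setup() further down; a sketch of it, with an assumed error value where the hunk elides one:

if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
	if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
		return -ENOMEM;	/* assumed; the hunk does not show the value */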
* Sets up the vmcs for emulated real mode.
@@ -1407,92 +1495,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
struct descriptor_table dt;
unsigned long kvm_vmx_return;
- if (!init_rmode_tss(vmx->vcpu.kvm)) {
- vmx->vcpu.rmode.active = 0;
- vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val();
- set_cr8(&vmx->vcpu, 0);
- msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
- if (vmx->vcpu.vcpu_id == 0)
- msr |= MSR_IA32_APICBASE_BSP;
- kvm_set_apic_base(&vmx->vcpu, msr);
- fx_init(&vmx->vcpu);
- * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
- * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
- if (vmx->vcpu.vcpu_id == 0) {
- vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
- vmcs_writel(GUEST_CS_BASE, 0x000f0000);
- vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8);
- vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12);
- vmcs_write32(GUEST_CS_LIMIT, 0xffff);
- vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
- seg_setup(VCPU_SREG_DS);
- seg_setup(VCPU_SREG_ES);
- seg_setup(VCPU_SREG_FS);
- seg_setup(VCPU_SREG_GS);
- seg_setup(VCPU_SREG_SS);
- vmcs_write16(GUEST_TR_SELECTOR, 0);
- vmcs_writel(GUEST_TR_BASE, 0);
- vmcs_write32(GUEST_TR_LIMIT, 0xffff);
- vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
- vmcs_write16(GUEST_LDTR_SELECTOR, 0);
- vmcs_writel(GUEST_LDTR_BASE, 0);
- vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
- vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
- vmcs_write32(GUEST_SYSENTER_CS, 0);
- vmcs_writel(GUEST_SYSENTER_ESP, 0);
- vmcs_writel(GUEST_SYSENTER_EIP, 0);
- vmcs_writel(GUEST_RFLAGS, 0x02);
- if (vmx->vcpu.vcpu_id == 0)
- vmcs_writel(GUEST_RIP, 0xfff0);
- vmcs_writel(GUEST_RIP, 0);
- vmcs_writel(GUEST_RSP, 0);
- //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
- vmcs_writel(GUEST_DR7, 0x400);
- vmcs_writel(GUEST_GDTR_BASE, 0);
- vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
- vmcs_writel(GUEST_IDTR_BASE, 0);
- vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
- vmcs_write32(GUEST_ACTIVITY_STATE, 0);
- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
- vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
- guest_write_tsc(0);
vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
- /* Special registers */
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
vmcs_config.pin_based_exec_ctrl);
@@ -1507,8 +1518,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+ if (cpu_has_secondary_exec_ctrls()) {
+ exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+ if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+ ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
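The two writes above configure page-fault error-code filtering for the new bypass_guest_pf module parameter. As we read the VT control semantics (hedged; the Intel SDM is authoritative): with the #PF bit set in the exception bitmap, a guest page fault exits only when (error_code & PFEC_MASK) == PFEC_MATCH, so mask = match = 1 lets not-present faults (P bit clear) go straight to the guest while protection faults still exit. A rough model of the rule:

/* Rough model only, not kernel code. */
static int pf_exits(u32 error_code, u32 pfec_mask, u32 pfec_match)
{
	return (error_code & pfec_mask) == pfec_match;
}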
vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
@@ -1536,7 +1555,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
- asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
+ asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
@@ -1567,97 +1586,145 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
/* 22.2.1, 20.8.1 */
vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
-#ifdef CONFIG_X86_64
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
- if (vm_need_tpr_shadow(vmx->vcpu.kvm))
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
- page_to_phys(vmx->vcpu.apic->regs_page));
- vmcs_write32(TPR_THRESHOLD, 0);
vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
- vmx->vcpu.cr0 = 0x60000010;
- vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode
- vmx_set_cr4(&vmx->vcpu, 0);
-#ifdef CONFIG_X86_64
- vmx_set_efer(&vmx->vcpu, 0);
- vmx_fpu_activate(&vmx->vcpu);
- update_exception_bitmap(&vmx->vcpu);
+ if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+ if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
- vmx_vcpu_setup(vmx);
-static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
- unsigned long flags;
- unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
- u16 sp = vmcs_readl(GUEST_RSP);
- u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT);
- if (sp > ss_limit || sp < 6 ) {
- vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
- vmcs_readl(GUEST_RSP),
- vmcs_readl(GUEST_SS_BASE),
- vmcs_read32(GUEST_SS_LIMIT));
+ if (!init_rmode_tss(vmx->vcpu.kvm)) {
- if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) !=
- X86EMUL_CONTINUE) {
- vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
+ vmx->vcpu.arch.rmode.active = 0;
+ vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
+ set_cr8(&vmx->vcpu, 0);
+ msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+ if (vmx->vcpu.vcpu_id == 0)
+ msr |= MSR_IA32_APICBASE_BSP;
+ kvm_set_apic_base(&vmx->vcpu, msr);
+ fx_init(&vmx->vcpu);
+ * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
+ * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
+ if (vmx->vcpu.vcpu_id == 0) {
+ vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
+ vmcs_writel(GUEST_CS_BASE, 0x000f0000);
+ vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
+ vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
+ vmcs_write32(GUEST_CS_LIMIT, 0xffff);
+ vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
- flags = vmcs_readl(GUEST_RFLAGS);
- cs = vmcs_readl(GUEST_CS_BASE) >> 4;
- ip = vmcs_readl(GUEST_RIP);
+ seg_setup(VCPU_SREG_DS);
+ seg_setup(VCPU_SREG_ES);
+ seg_setup(VCPU_SREG_FS);
+ seg_setup(VCPU_SREG_GS);
+ seg_setup(VCPU_SREG_SS);
+ vmcs_write16(GUEST_TR_SELECTOR, 0);
+ vmcs_writel(GUEST_TR_BASE, 0);
+ vmcs_write32(GUEST_TR_LIMIT, 0xffff);
+ vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
- if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE ||
- emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE ||
- emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) {
- vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
+ vmcs_write16(GUEST_LDTR_SELECTOR, 0);
+ vmcs_writel(GUEST_LDTR_BASE, 0);
+ vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
+ vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
+ vmcs_write32(GUEST_SYSENTER_CS, 0);
+ vmcs_writel(GUEST_SYSENTER_ESP, 0);
+ vmcs_writel(GUEST_SYSENTER_EIP, 0);
+ vmcs_writel(GUEST_RFLAGS, 0x02);
+ if (vmx->vcpu.vcpu_id == 0)
+ vmcs_writel(GUEST_RIP, 0xfff0);
+ vmcs_writel(GUEST_RIP, 0);
+ vmcs_writel(GUEST_RSP, 0);
+ /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
+ vmcs_writel(GUEST_DR7, 0x400);
+ vmcs_writel(GUEST_GDTR_BASE, 0);
+ vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
+ vmcs_writel(GUEST_IDTR_BASE, 0);
+ vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
+ vmcs_write32(GUEST_ACTIVITY_STATE, 0);
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
+ vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+ guest_write_tsc(0);
+ /* Special registers */
+ vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
+ if (cpu_has_vmx_tpr_shadow()) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+ if (vm_need_tpr_shadow(vmx->vcpu.kvm))
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+ page_to_phys(vmx->vcpu.arch.apic->regs_page));
+ vmcs_write32(TPR_THRESHOLD, 0);
- vmcs_writel(GUEST_RFLAGS, flags &
- ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
- vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ;
- vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
- vmcs_writel(GUEST_RIP, ent[0]);
- vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
+ if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+ vmcs_write64(APIC_ACCESS_ADDR,
+ page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
+ vmx->vcpu.arch.cr0 = 0x60000010;
+ vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
+ vmx_set_cr4(&vmx->vcpu, 0);
+#ifdef CONFIG_X86_64
+ vmx_set_efer(&vmx->vcpu, 0);
+ vmx_fpu_activate(&vmx->vcpu);
+ update_exception_bitmap(&vmx->vcpu);
static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
- if (vcpu->rmode.active) {
- inject_rmode_irq(vcpu, irq);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (vcpu->arch.rmode.active) {
+ vmx->rmode.irq.pending = true;
+ vmx->rmode.irq.vector = irq;
+ vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+ vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -1666,13 +1733,13 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
- int word_index = __ffs(vcpu->irq_summary);
- int bit_index = __ffs(vcpu->irq_pending[word_index]);
+ int word_index = __ffs(vcpu->arch.irq_summary);
+ int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
int irq = word_index * BITS_PER_LONG + bit_index;
- clear_bit(bit_index, &vcpu->irq_pending[word_index]);
- if (!vcpu->irq_pending[word_index])
- clear_bit(word_index, &vcpu->irq_summary);
+ clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+ if (!vcpu->arch.irq_pending[word_index])
+ clear_bit(word_index, &vcpu->arch.irq_summary);
vmx_inject_irq(vcpu, irq);
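kvm_do_inject_irq() above walks a two-level pending bitmap: irq_summary keeps one bit per word of irq_pending, so the next pending vector falls out of two __ffs() calls. Restated in isolation:

/* Restatement of the two-level lookup above (illustrative only). */
int word = __ffs(vcpu->arch.irq_summary);
int bit  = __ffs(vcpu->arch.irq_pending[word]);
int irq  = word * BITS_PER_LONG + bit;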
@@ -1682,12 +1749,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
u32 cpu_based_vm_exec_control;
- vcpu->interrupt_window_open =
+ vcpu->arch.interrupt_window_open =
((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
- if (vcpu->interrupt_window_open &&
- vcpu->irq_summary &&
+ if (vcpu->arch.interrupt_window_open &&
+ vcpu->arch.irq_summary &&
!(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
* If interrupts enabled, and not blocked by sti or mov ss. Good.
@@ -1695,8 +1762,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
kvm_do_inject_irq(vcpu);
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- if (!vcpu->interrupt_window_open &&
- (vcpu->irq_summary || kvm_run->request_interrupt_window))
+ if (!vcpu->arch.interrupt_window_open &&
+ (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
* Interrupts blocked. Wait for unblock.
@@ -1706,6 +1773,23 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+ struct kvm_userspace_memory_region tss_mem = {
+ .guest_phys_addr = addr,
+ .memory_size = PAGE_SIZE * 3,
+ ret = kvm_set_memory_region(kvm, &tss_mem, 0);
+ kvm->arch.tss_addr = addr;
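vmx_set_tss_addr() pins three pages of guest physical space for the real-mode TSS through an internal memory region; userspace only has to pick an address the guest does not otherwise use. A hypothetical caller (the address is illustrative, not mandated by the patch):

r = vmx_set_tss_addr(kvm, 0xfffbd000);	/* made-up example address */
if (r < 0)
	return r;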
static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
struct kvm_guest_debug *dbg = &vcpu->guest_debug;
@@ -1727,7 +1811,7 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
static int handle_rmode_exception(struct kvm_vcpu *vcpu,
int vec, u32 err_code)
- if (!vcpu->rmode.active)
+ if (!vcpu->arch.rmode.active)
@@ -1735,32 +1819,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
* Cause the #SS fault with 0 error code in VM86 mode.
if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
- if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE)
+ if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 intr_info, error_code;
unsigned long cr2, rip;
enum emulation_result er;
- vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+ vect_info = vmx->idt_vectoring_info;
intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
if ((vect_info & VECTORING_INFO_VALID_MASK) &&
- !is_page_fault(intr_info)) {
+ !is_page_fault(intr_info))
printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
"intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
- set_bit(irq, vcpu->irq_pending);
- set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
+ set_bit(irq, vcpu->arch.irq_pending);
+ set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
@@ -1771,52 +1854,34 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+ if (is_invalid_opcode(intr_info)) {
+ er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+ if (er != EMULATE_DONE)
+ kvm_queue_exception(vcpu, UD_VECTOR);
rip = vmcs_readl(GUEST_RIP);
if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
if (is_page_fault(intr_info)) {
cr2 = vmcs_readl(EXIT_QUALIFICATION);
- mutex_lock(&vcpu->kvm->lock);
- r = kvm_mmu_page_fault(vcpu, cr2, error_code);
- mutex_unlock(&vcpu->kvm->lock);
- mutex_unlock(&vcpu->kvm->lock);
- er = emulate_instruction(vcpu, kvm_run, cr2, error_code);
- mutex_unlock(&vcpu->kvm->lock);
- case EMULATE_DONE:
- case EMULATE_DO_MMIO:
- ++vcpu->stat.mmio_exits;
- case EMULATE_FAIL:
- kvm_report_emulation_failure(vcpu, "pagetable");
+ return kvm_mmu_page_fault(vcpu, cr2, error_code);
- if (vcpu->rmode.active &&
+ if (vcpu->arch.rmode.active &&
handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
- if (vcpu->halt_request) {
- vcpu->halt_request = 0;
+ if (vcpu->arch.halt_request) {
+ vcpu->arch.halt_request = 0;
return kvm_emulate_halt(vcpu);
- if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
+ if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
+ (INTR_TYPE_EXCEPTION | 1)) {
kvm_run->exit_reason = KVM_EXIT_DEBUG;
@@ -1850,7 +1915,8 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
string = (exit_qualification & 16) != 0;
- if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
+ if (emulate_instruction(vcpu,
+ kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
@@ -1873,7 +1939,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[0] = 0x0f;
hypercall[1] = 0x01;
hypercall[2] = 0xc1;
- hypercall[3] = 0xc3;
static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -1890,23 +1955,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu_load_rsp_rip(vcpu);
- set_cr0(vcpu, vcpu->regs[reg]);
+ set_cr0(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
vcpu_load_rsp_rip(vcpu);
- set_cr3(vcpu, vcpu->regs[reg]);
+ set_cr3(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
vcpu_load_rsp_rip(vcpu);
- set_cr4(vcpu, vcpu->regs[reg]);
+ set_cr4(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
vcpu_load_rsp_rip(vcpu);
- set_cr8(vcpu, vcpu->regs[reg]);
+ set_cr8(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
+ if (irqchip_in_kernel(vcpu->kvm))
kvm_run->exit_reason = KVM_EXIT_SET_TPR;
@@ -1914,8 +1981,8 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu_load_rsp_rip(vcpu);
vmx_fpu_deactivate(vcpu);
- vcpu->cr0 &= ~X86_CR0_TS;
- vmcs_writel(CR0_READ_SHADOW, vcpu->cr0);
+ vcpu->arch.cr0 &= ~X86_CR0_TS;
+ vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
vmx_fpu_activate(vcpu);
skip_emulated_instruction(vcpu);
@@ -1923,13 +1990,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu_load_rsp_rip(vcpu);
- vcpu->regs[reg] = vcpu->cr3;
+ vcpu->arch.regs[reg] = vcpu->arch.cr3;
vcpu_put_rsp_rip(vcpu);
skip_emulated_instruction(vcpu);
vcpu_load_rsp_rip(vcpu);
- vcpu->regs[reg] = get_cr8(vcpu);
+ vcpu->arch.regs[reg] = get_cr8(vcpu);
vcpu_put_rsp_rip(vcpu);
skip_emulated_instruction(vcpu);
@@ -1975,7 +2042,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
- vcpu->regs[reg] = val;
+ vcpu->arch.regs[reg] = val;
@@ -1992,29 +2059,29 @@ static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
- u32 ecx = vcpu->regs[VCPU_REGS_RCX];
+ u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
if (vmx_get_msr(vcpu, ecx, &data)) {
- vmx_inject_gp(vcpu, 0);
+ kvm_inject_gp(vcpu, 0);
/* FIXME: handling of bits 32:63 of rax, rdx */
- vcpu->regs[VCPU_REGS_RAX] = data & -1u;
- vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
+ vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
+ vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
skip_emulated_instruction(vcpu);
static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
- u32 ecx = vcpu->regs[VCPU_REGS_RCX];
- u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
- | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
+ u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+ u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
+ | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
if (vmx_set_msr(vcpu, ecx, data) != 0) {
- vmx_inject_gp(vcpu, 0);
+ kvm_inject_gp(vcpu, 0);
@@ -2042,7 +2109,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
if (kvm_run->request_interrupt_window &&
- !vcpu->irq_summary) {
+ !vcpu->arch.irq_summary) {
kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
++vcpu->stat.irq_window_exits;
@@ -2059,7 +2126,35 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
skip_emulated_instruction(vcpu);
- return kvm_hypercall(vcpu, kvm_run);
+ kvm_emulate_hypercall(vcpu);
+static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+ skip_emulated_instruction(vcpu);
+ /* TODO: Add support for VT-d/pass-through device */
+static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+ u64 exit_qualification;
+ enum emulation_result er;
+ unsigned long offset;
+ exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+ offset = exit_qualification & 0xffful;
+ er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+ if (er != EMULATE_DONE) {
+ "Fail to handle apic access vmexit! Offset is 0x%lx\n",
@@ -2081,7 +2176,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
[EXIT_REASON_HLT] = handle_halt,
[EXIT_REASON_VMCALL] = handle_vmcall,
- [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold
+ [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
+ [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
+ [EXIT_REASON_WBINVD] = handle_wbinvd,
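The two new entries slot into the exit-reason dispatch table; kvm_handle_exit() below indexes it directly, along the lines of this sketch (the handlers[exit_reason] NULL check is assumed from context):

/* Dispatch sketch, mirroring kvm_handle_exit() below. */
if (exit_reason < kvm_vmx_max_exit_handlers
    && kvm_vmx_exit_handlers[exit_reason])
	return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);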
static const int kvm_vmx_max_exit_handlers =
@@ -2093,9 +2190,9 @@ static const int kvm_vmx_max_exit_handlers =
static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
- u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 vectoring_info = vmx->idt_vectoring_info;
if (unlikely(vmx->fail)) {
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2104,8 +2201,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
- if ( (vectoring_info & VECTORING_INFO_VALID_MASK) &&
- exit_reason != EXIT_REASON_EXCEPTION_NMI )
+ if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ exit_reason != EXIT_REASON_EXCEPTION_NMI)
printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
"exit reason is 0x%x\n", __FUNCTION__, exit_reason);
if (exit_reason < kvm_vmx_max_exit_handlers
@@ -2150,26 +2247,38 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
static void vmx_intr_assist(struct kvm_vcpu *vcpu)
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 idtv_info_field, intr_info_field;
int has_ext_irq, interrupt_window_open;
- kvm_inject_pending_timer_irqs(vcpu);
update_tpr_threshold(vcpu);
has_ext_irq = kvm_cpu_has_interrupt(vcpu);
intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
- idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+ idtv_info_field = vmx->idt_vectoring_info;
if (intr_info_field & INTR_INFO_VALID_MASK) {
if (idtv_info_field & INTR_INFO_VALID_MASK) {
/* TODO: fault when IDT_Vectoring */
- printk(KERN_ERR "Fault when IDT_Vectoring\n");
+ if (printk_ratelimit())
+ printk(KERN_ERR "Fault when IDT_Vectoring\n");
enable_irq_window(vcpu);
if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
+ if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
+ == INTR_TYPE_EXT_INTR
+ && vcpu->arch.rmode.active) {
+ u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
+ vmx_inject_irq(vcpu, vect);
+ if (unlikely(has_ext_irq))
+ enable_irq_window(vcpu);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
@@ -2194,6 +2303,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
enable_irq_window(vcpu);
+ * Failure to inject an interrupt should give us the information
+ * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
+ * when fetching the interrupt redirection bitmap in the real-mode
+ * tss, this doesn't happen. So we do it ourselves.
+static void fixup_rmode_irq(struct vcpu_vmx *vmx)
+ vmx->rmode.irq.pending = 0;
+ if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
+ vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
+ if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
+ vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
+ vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
+ vmx->idt_vectoring_info =
+ VECTORING_INFO_VALID_MASK
+ | INTR_TYPE_EXT_INTR
+ | vmx->rmode.irq.vector;
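fixup_rmode_irq() pairs with the vmx_inject_irq() change earlier in the patch: in real mode the interrupt is injected as a one-byte soft interrupt with GUEST_RIP rewound by one, so whether the event actually reached the guest can be read back from RIP afterwards. Restated (with the early-return control flow that the hunk appears to elide stated explicitly; hedged):

if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
	return;		/* rip moved on: the event was delivered */
vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);	/* undo the one-byte rewind */
/* ...then rebuild idt_vectoring_info as an external interrupt by hand. */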
5560
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5562
struct vcpu_vmx *vmx = to_vmx(vcpu);
5563
@@ -2204,50 +2336,47 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5565
vmcs_writel(HOST_CR0, read_cr0());
5569
/* Store host registers */
5570
#ifdef CONFIG_X86_64
5571
- "push %%rax; push %%rbx; push %%rdx;"
5572
- "push %%rsi; push %%rdi; push %%rbp;"
5573
- "push %%r8; push %%r9; push %%r10; push %%r11;"
5574
- "push %%r12; push %%r13; push %%r14; push %%r15;"
5575
+ "push %%rdx; push %%rbp;"
5577
- ASM_VMX_VMWRITE_RSP_RDX "\n\t"
5579
- "pusha; push %%ecx \n\t"
5580
- ASM_VMX_VMWRITE_RSP_RDX "\n\t"
5581
+ "push %%edx; push %%ebp;"
5584
+ ASM_VMX_VMWRITE_RSP_RDX "\n\t"
5585
/* Check if vmlaunch of vmresume is needed */
5587
+ "cmpl $0, %c[launched](%0) \n\t"
5588
/* Load guest registers. Don't clobber flags. */
5589
#ifdef CONFIG_X86_64
5590
- "mov %c[cr2](%3), %%rax \n\t"
5591
+ "mov %c[cr2](%0), %%rax \n\t"
5592
"mov %%rax, %%cr2 \n\t"
5593
- "mov %c[rax](%3), %%rax \n\t"
5594
- "mov %c[rbx](%3), %%rbx \n\t"
5595
- "mov %c[rdx](%3), %%rdx \n\t"
5596
- "mov %c[rsi](%3), %%rsi \n\t"
5597
- "mov %c[rdi](%3), %%rdi \n\t"
5598
- "mov %c[rbp](%3), %%rbp \n\t"
5599
- "mov %c[r8](%3), %%r8 \n\t"
5600
- "mov %c[r9](%3), %%r9 \n\t"
5601
- "mov %c[r10](%3), %%r10 \n\t"
5602
- "mov %c[r11](%3), %%r11 \n\t"
5603
- "mov %c[r12](%3), %%r12 \n\t"
5604
- "mov %c[r13](%3), %%r13 \n\t"
5605
- "mov %c[r14](%3), %%r14 \n\t"
5606
- "mov %c[r15](%3), %%r15 \n\t"
5607
- "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
5608
+ "mov %c[rax](%0), %%rax \n\t"
5609
+ "mov %c[rbx](%0), %%rbx \n\t"
5610
+ "mov %c[rdx](%0), %%rdx \n\t"
5611
+ "mov %c[rsi](%0), %%rsi \n\t"
5612
+ "mov %c[rdi](%0), %%rdi \n\t"
5613
+ "mov %c[rbp](%0), %%rbp \n\t"
5614
+ "mov %c[r8](%0), %%r8 \n\t"
5615
+ "mov %c[r9](%0), %%r9 \n\t"
5616
+ "mov %c[r10](%0), %%r10 \n\t"
5617
+ "mov %c[r11](%0), %%r11 \n\t"
5618
+ "mov %c[r12](%0), %%r12 \n\t"
5619
+ "mov %c[r13](%0), %%r13 \n\t"
5620
+ "mov %c[r14](%0), %%r14 \n\t"
5621
+ "mov %c[r15](%0), %%r15 \n\t"
5622
+ "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
5624
- "mov %c[cr2](%3), %%eax \n\t"
5625
+ "mov %c[cr2](%0), %%eax \n\t"
5626
"mov %%eax, %%cr2 \n\t"
5627
- "mov %c[rax](%3), %%eax \n\t"
5628
- "mov %c[rbx](%3), %%ebx \n\t"
5629
- "mov %c[rdx](%3), %%edx \n\t"
5630
- "mov %c[rsi](%3), %%esi \n\t"
5631
- "mov %c[rdi](%3), %%edi \n\t"
5632
- "mov %c[rbp](%3), %%ebp \n\t"
5633
- "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
5634
+ "mov %c[rax](%0), %%eax \n\t"
5635
+ "mov %c[rbx](%0), %%ebx \n\t"
5636
+ "mov %c[rdx](%0), %%edx \n\t"
5637
+ "mov %c[rsi](%0), %%esi \n\t"
5638
+ "mov %c[rdi](%0), %%edi \n\t"
5639
+ "mov %c[rbp](%0), %%ebp \n\t"
5640
+ "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
5642
/* Enter guest mode */
5643
"jne .Llaunched \n\t"
@@ -2257,72 +2386,79 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
".Lkvm_vmx_return: "
/* Save guest registers, load host registers, keep flags */
#ifdef CONFIG_X86_64
- "xchg %3, (%%rsp) \n\t"
- "mov %%rax, %c[rax](%3) \n\t"
- "mov %%rbx, %c[rbx](%3) \n\t"
- "pushq (%%rsp); popq %c[rcx](%3) \n\t"
- "mov %%rdx, %c[rdx](%3) \n\t"
- "mov %%rsi, %c[rsi](%3) \n\t"
- "mov %%rdi, %c[rdi](%3) \n\t"
- "mov %%rbp, %c[rbp](%3) \n\t"
- "mov %%r8, %c[r8](%3) \n\t"
- "mov %%r9, %c[r9](%3) \n\t"
- "mov %%r10, %c[r10](%3) \n\t"
- "mov %%r11, %c[r11](%3) \n\t"
- "mov %%r12, %c[r12](%3) \n\t"
- "mov %%r13, %c[r13](%3) \n\t"
- "mov %%r14, %c[r14](%3) \n\t"
- "mov %%r15, %c[r15](%3) \n\t"
+ "xchg %0, (%%rsp) \n\t"
+ "mov %%rax, %c[rax](%0) \n\t"
+ "mov %%rbx, %c[rbx](%0) \n\t"
+ "pushq (%%rsp); popq %c[rcx](%0) \n\t"
+ "mov %%rdx, %c[rdx](%0) \n\t"
+ "mov %%rsi, %c[rsi](%0) \n\t"
+ "mov %%rdi, %c[rdi](%0) \n\t"
+ "mov %%rbp, %c[rbp](%0) \n\t"
+ "mov %%r8, %c[r8](%0) \n\t"
+ "mov %%r9, %c[r9](%0) \n\t"
+ "mov %%r10, %c[r10](%0) \n\t"
+ "mov %%r11, %c[r11](%0) \n\t"
+ "mov %%r12, %c[r12](%0) \n\t"
+ "mov %%r13, %c[r13](%0) \n\t"
+ "mov %%r14, %c[r14](%0) \n\t"
+ "mov %%r15, %c[r15](%0) \n\t"
"mov %%cr2, %%rax   \n\t"
- "mov %%rax, %c[cr2](%3) \n\t"
- "mov (%%rsp), %3 \n\t"
+ "mov %%rax, %c[cr2](%0) \n\t"
- "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;"
- "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
- "pop %%rbp; pop %%rdi; pop %%rsi;"
- "pop %%rdx; pop %%rbx; pop %%rax \n\t"
+ "pop %%rbp; pop %%rbp; pop %%rdx \n\t"
- "xchg %3, (%%esp) \n\t"
- "mov %%eax, %c[rax](%3) \n\t"
- "mov %%ebx, %c[rbx](%3) \n\t"
- "pushl (%%esp); popl %c[rcx](%3) \n\t"
- "mov %%edx, %c[rdx](%3) \n\t"
- "mov %%esi, %c[rsi](%3) \n\t"
- "mov %%edi, %c[rdi](%3) \n\t"
- "mov %%ebp, %c[rbp](%3) \n\t"
+ "xchg %0, (%%esp) \n\t"
+ "mov %%eax, %c[rax](%0) \n\t"
+ "mov %%ebx, %c[rbx](%0) \n\t"
+ "pushl (%%esp); popl %c[rcx](%0) \n\t"
+ "mov %%edx, %c[rdx](%0) \n\t"
+ "mov %%esi, %c[rsi](%0) \n\t"
+ "mov %%edi, %c[rdi](%0) \n\t"
+ "mov %%ebp, %c[rbp](%0) \n\t"
"mov %%cr2, %%eax  \n\t"
- "mov %%eax, %c[cr2](%3) \n\t"
- "mov (%%esp), %3 \n\t"
+ "mov %%eax, %c[cr2](%0) \n\t"
- "pop %%ecx; popa \n\t"
+ "pop %%ebp; pop %%ebp; pop %%edx \n\t"
- : "=q" (vmx->fail)
- : "r"(vmx->launched), "d"((unsigned long)HOST_RSP),
- [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])),
- [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])),
- [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])),
- [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])),
- [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])),
- [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])),
- [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])),
+ "setbe %c[fail](%0) \n\t"
+ : : "c"(vmx), "d"((unsigned long)HOST_RSP),
+ [launched]"i"(offsetof(struct vcpu_vmx, launched)),
+ [fail]"i"(offsetof(struct vcpu_vmx, fail)),
+ [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
+ [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
+ [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
+ [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
+ [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
+ [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
+ [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
#ifdef CONFIG_X86_64
- [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])),
- [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])),
- [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])),
- [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])),
- [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])),
- [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])),
- [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])),
- [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])),
+ [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
+ [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
+ [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
+ [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
+ [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
+ [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
+ [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
+ [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
- [cr2]"i"(offsetof(struct kvm_vcpu, cr2))
- : "cc", "memory" );
+ [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
+#ifdef CONFIG_X86_64
+ , "rbx", "rdi", "rsi"
+ , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
+ , "ebx", "edi", "rsi"
+ vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+ if (vmx->rmode.irq.pending)
+ fixup_rmode_irq(vmx);
- vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
+ vcpu->arch.interrupt_window_open =
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
- asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
+ asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
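
The rewritten exit sequence above leans on a GCC inline-asm idiom worth spelling out: with the single input "c"(vmx), the vcpu_vmx pointer is pinned in %rcx/%ecx, and every guest register slot is reached as a compile-time constant offset from it. A "%c[name]" operand prints the bare value of the matching "i"(offsetof(...)) constraint, so "mov %%rax, %c[rax](%0)" assembles to a plain store at that structure offset. A minimal standalone sketch of the idiom (illustrative only, not part of the patch):

    #include <stddef.h>
    #include <stdio.h>

    struct vcpu { unsigned long rax, rbx; };

    int main(void)
    {
            struct vcpu v = { 0, 0 };
            unsigned long val = 42;

            /* "%c[rax]" emits the offsetof constant without the usual '$'
             * immediate prefix, so this assembles to a plain store such
             * as "mov %rsi, 0(%rdi)" (register choice is up to GCC). */
            asm("mov %1, %c[rax](%0)"
                : /* no outputs; the memory clobber covers the store */
                : "r"(&v), "r"(val), [rax]"i"(offsetof(struct vcpu, rax))
                : "memory");

            printf("v.rax = %lu\n", v.rax);
            return 0;
    }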
@@ -2332,36 +2468,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
- unsigned long addr,
- u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
- ++vcpu->stat.pf_guest;
- if (is_page_fault(vect_info)) {
- printk(KERN_DEBUG "inject_page_fault: "
- "double fault 0x%lx @ 0x%lx\n",
- addr, vmcs_readl(GUEST_RIP));
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- INTR_TYPE_EXCEPTION |
- INTR_INFO_DELIEVER_CODE_MASK |
- INTR_INFO_VALID_MASK);
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code);
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- INTR_TYPE_EXCEPTION |
- INTR_INFO_DELIEVER_CODE_MASK |
- INTR_INFO_VALID_MASK);
static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2397,12 +2503,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
- if (irqchip_in_kernel(kvm)) {
- err = kvm_create_lapic(&vmx->vcpu);
vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!vmx->guest_msrs) {
@@ -2499,9 +2599,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_rflags = vmx_set_rflags,
.tlb_flush = vmx_flush_tlb,
- .inject_page_fault = vmx_inject_page_fault,
- .inject_gp = vmx_inject_gp,
.run = vmx_vcpu_run,
.handle_exit = kvm_handle_exit,
@@ -2509,8 +2606,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
.patch_hypercall = vmx_patch_hypercall,
.get_irq = vmx_get_irq,
.set_irq = vmx_inject_irq,
+ .queue_exception = vmx_queue_exception,
+ .exception_injected = vmx_exception_injected,
.inject_pending_irq = vmx_intr_assist,
.inject_pending_vectors = do_interrupt_requests,
+ .set_tss_addr = vmx_set_tss_addr,
static int __init vmx_init(void)
@@ -2541,10 +2642,13 @@ static int __init vmx_init(void)
memset(iova, 0xff, PAGE_SIZE);
kunmap(vmx_io_bitmap_b);
- r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
+ r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
+ if (bypass_guest_pf)
+ kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
@@ -2559,7 +2663,7 @@ static void __exit vmx_exit(void)
__free_page(vmx_io_bitmap_b);
__free_page(vmx_io_bitmap_a);
module_init(vmx_init)
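
One detail of the exit path above that is easy to miss: VMX reports a failed VMLAUNCH/VMRESUME through the flags register (CF for VMfailInvalid, ZF for VMfailValid), which is exactly the "below or equal" condition, so the old "=q"(vmx->fail) output operand could be replaced by a single "setbe %c[fail](%0)" store. A standalone sketch of the same pattern (illustrative, not kernel code):

    #include <stdio.h>

    struct status { unsigned char fail; };

    int main(void)
    {
            struct status s = { 0 };

            /* "cmp $2, reg" with reg == 1 sets CF (borrow), and setbe
             * stores 1 whenever CF or ZF is set - the same condition
             * the VMX instructions use to report failure. */
            asm("cmp $2, %[a]\n\t"
                "setbe %[fail]"
                : [fail]"=m"(s.fail)
                : [a]"r"(1UL)
                : "cc");

            printf("fail = %d\n", s.fail);
            return 0;
    }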
diff --git a/drivers/kvm/vmx.h b/arch/x86/kvm/vmx.h
similarity index 96%
rename from drivers/kvm/vmx.h
rename to arch/x86/kvm/vmx.h
index fd4e146..d52ae8d 100644
--- a/drivers/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
+ * Definitions of Primary Processor-Based VM-Execution Controls.
#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
#define CPU_BASED_HLT_EXITING 0x00000080
#define CPU_BASED_MONITOR_EXITING 0x20000000
#define CPU_BASED_PAUSE_EXITING 0x40000000
#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
+ * Definitions of Secondary Processor-Based VM-Execution Controls.
+#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
#define PIN_BASED_EXT_INTR_MASK 0x00000001
#define PIN_BASED_NMI_EXITING 0x00000008
#define VM_ENTRY_SMM 0x00000400
#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
-#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
/* VMCS Encodings */
GUEST_ES_SELECTOR = 0x00000800,
@@ -89,6 +96,8 @@ enum vmcs_field {
TSC_OFFSET_HIGH = 0x00002011,
VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
+ APIC_ACCESS_ADDR = 0x00002014,
+ APIC_ACCESS_ADDR_HIGH = 0x00002015,
VMCS_LINK_POINTER = 0x00002800,
VMCS_LINK_POINTER_HIGH = 0x00002801,
GUEST_IA32_DEBUGCTL = 0x00002802,
@@ -214,6 +223,8 @@ enum vmcs_field {
#define EXIT_REASON_MSR_WRITE 32
#define EXIT_REASON_MWAIT_INSTRUCTION 36
#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+#define EXIT_REASON_APIC_ACCESS 44
+#define EXIT_REASON_WBINVD 54
* Interruption-information format
@@ -230,13 +241,14 @@ enum vmcs_field {
#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
+#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
* Exit Qualifications for MOV for Control Register Access
-#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */
+#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/
#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */
-#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose register */
+#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */
#define LMSW_SOURCE_DATA_SHIFT 16
#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
#define REG_EAX (0 << 8)
@@ -259,11 +271,11 @@ enum vmcs_field {
* Exit Qualifications for MOV for Debug Register Access
-#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */
+#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */
#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
#define TYPE_MOV_TO_DR (0 << 4)
#define TYPE_MOV_FROM_DR (1 << 4)
-#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */
+#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */
@@ -307,4 +319,6 @@ enum vmcs_field {
#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
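
The header changes group the execution controls properly: SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES moves out of the VM-entry block into a new section for the secondary processor-based controls, which are only honored when CPU_BASED_ACTIVATE_SECONDARY_CONTROLS is set in the primary word, and the new APIC_ACCESS_ADDR field plus the private memslot give the APIC-access page a home. A hedged sketch of how one such control would be switched on (the VMCS field names CPU_BASED_VM_EXEC_CONTROL and SECONDARY_VM_EXEC_CONTROL are assumed from the full header, which this hunk does not show):

    /* Sketch: enable APIC access virtualization, gated by the primary
     * "activate secondary controls" bit. vmcs_read32()/vmcs_write32()
     * are the VMCS accessors used throughout vmx.c. */
    static void enable_apic_access_virtualization(void)
    {
            u32 primary = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
            u32 secondary;

            vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
                         primary | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);

            secondary = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
            vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
                         secondary | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
    }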
diff --git a/drivers/kvm/kvm_main.c b/arch/x86/kvm/x86.c
similarity index 52%
rename from drivers/kvm/kvm_main.c
rename to arch/x86/kvm/x86.c
index 47c10b8..4b26270 100644
--- a/drivers/kvm/kvm_main.c
+++ b/arch/x86/kvm/x86.c
* Kernel-based Virtual Machine driver for Linux
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
+ * derived from drivers/kvm/kvm_main.c
* Copyright (C) 2006 Qumranet, Inc.
-#include "x86_emulate.h"
+#include <linux/kvm_host.h>
#include "segment_descriptor.h"
#include <linux/kvm.h>
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/percpu.h>
-#include <linux/gfp.h>
-#include <linux/mm.h>
-#include <linux/miscdevice.h>
+#include <linux/fs.h>
#include <linux/vmalloc.h>
-#include <linux/reboot.h>
-#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/mman.h>
#include <linux/highmem.h>
-#include <linux/file.h>
-#include <linux/sysdev.h>
-#include <linux/cpu.h>
-#include <linux/sched.h>
-#include <linux/cpumask.h>
-#include <linux/smp.h>
-#include <linux/anon_inodes.h>
-#include <linux/profile.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/io.h>
-#include <asm/uaccess.h>
-#include <asm/desc.h>
-MODULE_AUTHOR("Qumranet");
-MODULE_LICENSE("GPL");
-static DEFINE_SPINLOCK(kvm_lock);
-static LIST_HEAD(vm_list);
-static cpumask_t cpus_hardware_enabled;
-struct kvm_x86_ops *kvm_x86_ops;
-struct kmem_cache *kvm_vcpu_cache;
-EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
-static __read_mostly struct preempt_ops kvm_preempt_ops;
-#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
-static struct kvm_stats_debugfs_item {
- struct dentry *dentry;
-} debugfs_entries[] = {
- { "pf_fixed", STAT_OFFSET(pf_fixed) },
- { "pf_guest", STAT_OFFSET(pf_guest) },
- { "tlb_flush", STAT_OFFSET(tlb_flush) },
- { "invlpg", STAT_OFFSET(invlpg) },
- { "exits", STAT_OFFSET(exits) },
- { "io_exits", STAT_OFFSET(io_exits) },
- { "mmio_exits", STAT_OFFSET(mmio_exits) },
- { "signal_exits", STAT_OFFSET(signal_exits) },
- { "irq_window", STAT_OFFSET(irq_window_exits) },
- { "halt_exits", STAT_OFFSET(halt_exits) },
- { "halt_wakeup", STAT_OFFSET(halt_wakeup) },
- { "request_irq", STAT_OFFSET(request_irq_exits) },
- { "irq_exits", STAT_OFFSET(irq_exits) },
- { "light_exits", STAT_OFFSET(light_exits) },
- { "efer_reload", STAT_OFFSET(efer_reload) },
-static struct dentry *debugfs_dir;
+#include <asm/uaccess.h>
+#include <asm/msr.h>
#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
| X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
@@ -102,317 +43,150 @@ static struct dentry *debugfs_dir;
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
#define EFER_RESERVED_BITS 0xfffffffffffff2fe
-#ifdef CONFIG_X86_64
-// LDT or TSS descriptor in the GDT. 16 bytes.
-struct segment_descriptor_64 {
- struct segment_descriptor s;
+#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
+#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+struct kvm_x86_ops *kvm_x86_ops;
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+ { "pf_fixed", VCPU_STAT(pf_fixed) },
+ { "pf_guest", VCPU_STAT(pf_guest) },
+ { "tlb_flush", VCPU_STAT(tlb_flush) },
+ { "invlpg", VCPU_STAT(invlpg) },
+ { "exits", VCPU_STAT(exits) },
+ { "io_exits", VCPU_STAT(io_exits) },
+ { "mmio_exits", VCPU_STAT(mmio_exits) },
+ { "signal_exits", VCPU_STAT(signal_exits) },
+ { "irq_window", VCPU_STAT(irq_window_exits) },
+ { "halt_exits", VCPU_STAT(halt_exits) },
+ { "halt_wakeup", VCPU_STAT(halt_wakeup) },
+ { "request_irq", VCPU_STAT(request_irq_exits) },
+ { "irq_exits", VCPU_STAT(irq_exits) },
+ { "host_state_reload", VCPU_STAT(host_state_reload) },
+ { "efer_reload", VCPU_STAT(efer_reload) },
+ { "fpu_reload", VCPU_STAT(fpu_reload) },
+ { "insn_emulation", VCPU_STAT(insn_emulation) },
+ { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+ { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
+ { "mmu_pte_write", VM_STAT(mmu_pte_write) },
+ { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
+ { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
+ { "mmu_flooded", VM_STAT(mmu_flooded) },
+ { "mmu_recycled", VM_STAT(mmu_recycled) },
+ { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
-static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
- unsigned long arg);
unsigned long segment_base(u16 selector)
struct descriptor_table gdt;
struct segment_descriptor *d;
unsigned long table_base;
- typedef unsigned long ul;
- asm ("sgdt %0" : "=m"(gdt));
+ asm("sgdt %0" : "=m"(gdt));
table_base = gdt.base;
if (selector & 4) { /* from ldt */
- asm ("sldt %0" : "=g"(ldt_selector));
+ asm("sldt %0" : "=g"(ldt_selector));
table_base = segment_base(ldt_selector);
d = (struct segment_descriptor *)(table_base + (selector & ~7));
- v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
+ v = d->base_low | ((unsigned long)d->base_mid << 16) |
+ ((unsigned long)d->base_high << 24);
#ifdef CONFIG_X86_64
- if (d->system == 0
- && (d->type == 2 || d->type == 9 || d->type == 11))
- v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
+ if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+ v |= ((unsigned long) \
+ ((struct segment_descriptor_64 *)d)->base_higher) << 32;
EXPORT_SYMBOL_GPL(segment_base);
-static inline int valid_vcpu(int n)
- return likely(n >= 0 && n < KVM_MAX_VCPUS);
-void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
- if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
- vcpu->guest_fpu_loaded = 1;
- fx_save(&vcpu->host_fx_image);
- fx_restore(&vcpu->guest_fx_image);
-EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
-void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
- if (!vcpu->guest_fpu_loaded)
- vcpu->guest_fpu_loaded = 0;
- fx_save(&vcpu->guest_fx_image);
- fx_restore(&vcpu->host_fx_image);
-EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
- * Switches to specified vcpu, until a matching vcpu_put()
-static void vcpu_load(struct kvm_vcpu *vcpu)
- mutex_lock(&vcpu->mutex);
- preempt_notifier_register(&vcpu->preempt_notifier);
- kvm_x86_ops->vcpu_load(vcpu, cpu);
-static void vcpu_put(struct kvm_vcpu *vcpu)
- preempt_disable();
- kvm_x86_ops->vcpu_put(vcpu);
- preempt_notifier_unregister(&vcpu->preempt_notifier);
- mutex_unlock(&vcpu->mutex);
-static void ack_flush(void *_completed)
-void kvm_flush_remote_tlbs(struct kvm *kvm)
- struct kvm_vcpu *vcpu;
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = kvm->vcpus[i];
- if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
- if (cpu != -1 && cpu != raw_smp_processor_id())
- cpu_set(cpu, cpus);
- smp_call_function_mask(cpus, ack_flush, NULL, 1);
-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
- struct page *page;
- mutex_init(&vcpu->mutex);
- vcpu->mmu.root_hpa = INVALID_PAGE;
- vcpu->vcpu_id = id;
- if (!irqchip_in_kernel(kvm) || id == 0)
- vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+ if (irqchip_in_kernel(vcpu->kvm))
+ return vcpu->arch.apic_base;
- vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
- init_waitqueue_head(&vcpu->wq);
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- vcpu->run = page_address(page);
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- goto fail_free_run;
- vcpu->pio_data = page_address(page);
- r = kvm_mmu_create(vcpu);
- goto fail_free_pio_data;
-fail_free_pio_data:
- free_page((unsigned long)vcpu->pio_data);
- free_page((unsigned long)vcpu->run);
-EXPORT_SYMBOL_GPL(kvm_vcpu_init);
-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
- kvm_mmu_destroy(vcpu);
- hrtimer_cancel(&vcpu->apic->timer.dev);
- kvm_free_apic(vcpu->apic);
- free_page((unsigned long)vcpu->pio_data);
- free_page((unsigned long)vcpu->run);
-EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
-static struct kvm *kvm_create_vm(void)
- struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
- return ERR_PTR(-ENOMEM);
- kvm_io_bus_init(&kvm->pio_bus);
- mutex_init(&kvm->lock);
- INIT_LIST_HEAD(&kvm->active_mmu_pages);
- kvm_io_bus_init(&kvm->mmio_bus);
- spin_lock(&kvm_lock);
- list_add(&kvm->vm_list, &vm_list);
- spin_unlock(&kvm_lock);
- * Free any memory in @free but not in @dont.
-static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
- struct kvm_memory_slot *dont)
- if (!dont || free->phys_mem != dont->phys_mem)
- if (free->phys_mem) {
- for (i = 0; i < free->npages; ++i)
- if (free->phys_mem[i])
- __free_page(free->phys_mem[i]);
- vfree(free->phys_mem);
- if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
- vfree(free->dirty_bitmap);
- free->phys_mem = NULL;
- free->dirty_bitmap = NULL;
-static void kvm_free_physmem(struct kvm *kvm)
- for (i = 0; i < kvm->nmemslots; ++i)
- kvm_free_physmem_slot(&kvm->memslots[i], NULL);
+ return vcpu->arch.apic_base;
+EXPORT_SYMBOL_GPL(kvm_get_apic_base);
-static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
+void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
- for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
- if (vcpu->pio.guest_pages[i]) {
- __free_page(vcpu->pio.guest_pages[i]);
- vcpu->pio.guest_pages[i] = NULL;
+ /* TODO: reserve bits check */
+ if (irqchip_in_kernel(vcpu->kvm))
+ kvm_lapic_set_base(vcpu, data);
+ vcpu->arch.apic_base = data;
+EXPORT_SYMBOL_GPL(kvm_set_apic_base);
-static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
- kvm_mmu_unload(vcpu);
+ WARN_ON(vcpu->arch.exception.pending);
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.has_error_code = false;
+ vcpu->arch.exception.nr = nr;
+EXPORT_SYMBOL_GPL(kvm_queue_exception);
-static void kvm_free_vcpus(struct kvm *kvm)
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
- * Unpin any mmu pages first.
- for (i = 0; i < KVM_MAX_VCPUS; ++i)
- if (kvm->vcpus[i])
- kvm_unload_vcpu_mmu(kvm->vcpus[i]);
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- if (kvm->vcpus[i]) {
- kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
- kvm->vcpus[i] = NULL;
+ ++vcpu->stat.pf_guest;
+ if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
+ printk(KERN_DEBUG "kvm: inject_page_fault:"
+ " double fault 0x%lx\n", addr);
+ vcpu->arch.exception.nr = DF_VECTOR;
+ vcpu->arch.exception.error_code = 0;
+ vcpu->arch.cr2 = addr;
+ kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
-static void kvm_destroy_vm(struct kvm *kvm)
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
- spin_lock(&kvm_lock);
- list_del(&kvm->vm_list);
- spin_unlock(&kvm_lock);
- kvm_io_bus_destroy(&kvm->pio_bus);
- kvm_io_bus_destroy(&kvm->mmio_bus);
- kfree(kvm->vioapic);
- kvm_free_vcpus(kvm);
- kvm_free_physmem(kvm);
-static int kvm_vm_release(struct inode *inode, struct file *filp)
- struct kvm *kvm = filp->private_data;
- kvm_destroy_vm(kvm);
+ WARN_ON(vcpu->arch.exception.pending);
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.has_error_code = true;
+ vcpu->arch.exception.nr = nr;
+ vcpu->arch.exception.error_code = error_code;
+EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
-static void inject_gp(struct kvm_vcpu *vcpu)
+static void __queue_exception(struct kvm_vcpu *vcpu)
- kvm_x86_ops->inject_gp(vcpu, 0);
+ kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
+ vcpu->arch.exception.has_error_code,
+ vcpu->arch.exception.error_code);
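
The functions above replace the per-vendor inject_gp()/inject_page_fault() callbacks with one architecture-level queue: at most one exception is pending in vcpu->arch.exception, and __queue_exception() hands it to the vendor's queue_exception hook just before entry. The interesting rule sits in kvm_inject_page_fault(): a page fault queued while another page fault is still pending is promoted to a double fault. A compact standalone model of that state machine (field names mirror the patch; this is not kernel code):

    #include <stdbool.h>
    #include <stdio.h>

    #define PF_VECTOR 14
    #define DF_VECTOR 8

    struct exception_queue {
            bool pending;
            bool has_error_code;
            unsigned nr;
            unsigned error_code;
    };

    /* Mirrors kvm_inject_page_fault(): fault-on-fault becomes #DF. */
    static void inject_page_fault(struct exception_queue *q, unsigned ec)
    {
            if (q->pending && q->nr == PF_VECTOR) {
                    q->nr = DF_VECTOR;
                    q->error_code = 0;
                    return;
            }
            q->pending = true;
            q->has_error_code = true;
            q->nr = PF_VECTOR;
            q->error_code = ec;
    }

    int main(void)
    {
            struct exception_queue q = { 0 };

            inject_page_fault(&q, 2);
            inject_page_fault(&q, 2);       /* promoted to #DF */
            printf("vector %u, error code %u\n", q.nr, q.error_code);
            return 0;
    }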
* Load the pae pdptrs. Return true if they are all valid.
-static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
- struct page *page;
- u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
+ u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
mutex_lock(&vcpu->kvm->lock);
- page = gfn_to_page(vcpu->kvm, pdpt_gfn);
+ ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
+ offset * sizeof(u64), sizeof(pdpte));
- pdpt = kmap_atomic(page, KM_USER0);
- memcpy(pdpte, pdpt+offset, sizeof(pdpte));
- kunmap_atomic(pdpt, KM_USER0);
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
@@ -421,67 +195,87 @@ static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
- memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
+ memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
mutex_unlock(&vcpu->kvm->lock);
+static bool pdptrs_changed(struct kvm_vcpu *vcpu)
+ u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+ bool changed = true;
+ if (is_long_mode(vcpu) || !is_pae(vcpu))
+ mutex_lock(&vcpu->kvm->lock);
+ r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
+ changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
+ mutex_unlock(&vcpu->kvm->lock);
void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (cr0 & CR0_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
+ cr0, vcpu->arch.cr0);
+ kvm_inject_gp(vcpu, 0);
if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
+ kvm_inject_gp(vcpu, 0);
if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
"and a clear PE flag\n");
+ kvm_inject_gp(vcpu, 0);
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
- if ((vcpu->shadow_efer & EFER_LME)) {
+ if ((vcpu->arch.shadow_efer & EFER_LME)) {
if (!is_pae(vcpu)) {
printk(KERN_DEBUG "set_cr0: #GP, start paging "
"in long mode while PAE is disabled\n");
+ kvm_inject_gp(vcpu, 0);
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
printk(KERN_DEBUG "set_cr0: #GP, start paging "
"in long mode while CS.L == 1\n");
+ kvm_inject_gp(vcpu, 0);
- if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
+ if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
+ kvm_inject_gp(vcpu, 0);
kvm_x86_ops->set_cr0(vcpu, cr0);
+ vcpu->arch.cr0 = cr0;
mutex_lock(&vcpu->kvm->lock);
kvm_mmu_reset_context(vcpu);
@@ -492,7 +286,7 @@ EXPORT_SYMBOL_GPL(set_cr0);
void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
- set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
+ set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
EXPORT_SYMBOL_GPL(lmsw);
@@ -500,7 +294,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (cr4 & CR4_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
+ kvm_inject_gp(vcpu, 0);
@@ -508,23 +302,23 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (!(cr4 & X86_CR4_PAE)) {
printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
+ kvm_inject_gp(vcpu, 0);
} else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
- && !load_pdptrs(vcpu, vcpu->cr3)) {
+ && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
+ kvm_inject_gp(vcpu, 0);
if (cr4 & X86_CR4_VMXE) {
printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
+ kvm_inject_gp(vcpu, 0);
kvm_x86_ops->set_cr4(vcpu, cr4);
+ vcpu->arch.cr4 = cr4;
mutex_lock(&vcpu->kvm->lock);
kvm_mmu_reset_context(vcpu);
mutex_unlock(&vcpu->kvm->lock);
@@ -533,10 +327,15 @@ EXPORT_SYMBOL_GPL(set_cr4);
void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+ if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+ kvm_mmu_flush_tlb(vcpu);
if (is_long_mode(vcpu)) {
if (cr3 & CR3_L_MODE_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
+ kvm_inject_gp(vcpu, 0);
@@ -544,23 +343,20 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (cr3 & CR3_PAE_RESERVED_BITS) {
"set_cr3: #GP, reserved bits\n");
+ kvm_inject_gp(vcpu, 0);
if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
- if (cr3 & CR3_NONPAE_RESERVED_BITS) {
- "set_cr3: #GP, reserved bits\n");
+ kvm_inject_gp(vcpu, 0);
+ * We don't check reserved bits in nonpae mode, because
+ * this isn't enforced, and VMware depends on this.
mutex_lock(&vcpu->kvm->lock);
@@ -574,10 +370,10 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
* to debug) behavior on the guest side.
if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
+ kvm_inject_gp(vcpu, 0);
- vcpu->mmu.new_cr3(vcpu);
+ vcpu->arch.cr3 = cr3;
+ vcpu->arch.mmu.new_cr3(vcpu);
mutex_unlock(&vcpu->kvm->lock);
@@ -587,13 +383,13 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
if (cr8 & CR8_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
+ kvm_inject_gp(vcpu, 0);
if (irqchip_in_kernel(vcpu->kvm))
kvm_lapic_set_tpr(vcpu, cr8);
+ vcpu->arch.cr8 = cr8;
EXPORT_SYMBOL_GPL(set_cr8);
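
Together, pdptrs_changed() and the new head of set_cr3() form a fast path: when the guest reloads the CR3 value it already has and the cached PAE PDPTEs still match guest memory, only a TLB flush is architecturally required, and the full shadow-MMU rebuild through vcpu->arch.mmu.new_cr3() is skipped. The control flow, reduced to a sketch:

    /* Sketch of the set_cr3() fast path introduced above. */
    void set_cr3_sketch(struct kvm_vcpu *vcpu, unsigned long cr3)
    {
            if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
                    kvm_mmu_flush_tlb(vcpu);        /* cheap path */
                    return;
            }
            /* ...mode-specific reserved-bit and PDPTE checks... */
            vcpu->arch.cr3 = cr3;
            vcpu->arch.mmu.new_cr3(vcpu);           /* full reload */
    }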
@@ -602,210 +398,806 @@ unsigned long get_cr8(struct kvm_vcpu *vcpu)
if (irqchip_in_kernel(vcpu->kvm))
return kvm_lapic_get_cr8(vcpu);
+ return vcpu->arch.cr8;
EXPORT_SYMBOL_GPL(get_cr8);
-u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
+ * List of msr numbers which we expose to userspace through KVM_GET_MSRS
+ * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
+ * This list is modified at module load time to reflect the
+ * capabilities of the host cpu.
+static u32 msrs_to_save[] = {
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+#ifdef CONFIG_X86_64
+ MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+ MSR_IA32_TIME_STAMP_COUNTER,
+static unsigned num_msrs_to_save;
+static u32 emulated_msrs[] = {
+ MSR_IA32_MISC_ENABLE,
+#ifdef CONFIG_X86_64
+static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
- if (irqchip_in_kernel(vcpu->kvm))
- return vcpu->apic_base;
- return vcpu->apic_base;
+ if (efer & EFER_RESERVED_BITS) {
+ printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
+ kvm_inject_gp(vcpu, 0);
+ if (is_paging(vcpu)
+ && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
+ printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
+ kvm_inject_gp(vcpu, 0);
+ kvm_x86_ops->set_efer(vcpu, efer);
+ efer &= ~EFER_LMA;
+ efer |= vcpu->arch.shadow_efer & EFER_LMA;
+ vcpu->arch.shadow_efer = efer;
-EXPORT_SYMBOL_GPL(kvm_get_apic_base);
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
+ * Writes msr value into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
- /* TODO: reserve bits check */
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_lapic_set_base(vcpu, data);
- vcpu->apic_base = data;
+ return kvm_x86_ops->set_msr(vcpu, msr_index, data);
-EXPORT_SYMBOL_GPL(kvm_set_apic_base);
-void fx_init(struct kvm_vcpu *vcpu)
+ * Adapt set_msr() to msr_io()'s calling convention
+static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
- unsigned after_mxcsr_mask;
+ return kvm_set_msr(vcpu, index, *data);
- /* Initialize guest FPU by resetting ours and saving into guest's */
- preempt_disable();
- fx_save(&vcpu->host_fx_image);
- fx_save(&vcpu->guest_fx_image);
- fx_restore(&vcpu->host_fx_image);
- vcpu->cr0 |= X86_CR0_ET;
- after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
- vcpu->guest_fx_image.mxcsr = 0x1f80;
- memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask,
- 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+#ifdef CONFIG_X86_64
+ set_efer(vcpu, data);
+ case MSR_IA32_MC0_STATUS:
+ pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
+ __FUNCTION__, data);
+ case MSR_IA32_MCG_STATUS:
+ pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
+ __FUNCTION__, data);
+ case MSR_IA32_UCODE_REV:
+ case MSR_IA32_UCODE_WRITE:
+ case 0x200 ... 0x2ff: /* MTRRs */
+ case MSR_IA32_APICBASE:
+ kvm_set_apic_base(vcpu, data);
+ case MSR_IA32_MISC_ENABLE:
+ vcpu->arch.ia32_misc_enable_msr = data;
+ pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
-EXPORT_SYMBOL_GPL(fx_init);
+EXPORT_SYMBOL_GPL(kvm_set_msr_common);
+ * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+ return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+ case 0xc0010010: /* SYSCFG */
+ case 0xc0010015: /* HWCR */
+ case MSR_IA32_PLATFORM_ID:
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ case MSR_IA32_MC0_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MCG_CAP:
+ case MSR_IA32_MC0_MISC:
+ case MSR_IA32_MC0_MISC+4:
+ case MSR_IA32_MC0_MISC+8:
+ case MSR_IA32_MC0_MISC+12:
+ case MSR_IA32_MC0_MISC+16:
+ case MSR_IA32_UCODE_REV:
+ case MSR_IA32_PERF_STATUS:
+ case MSR_IA32_EBL_CR_POWERON:
+ /* MTRR registers */
+ case 0x200 ... 0x2ff:
+ case 0xcd: /* fsb frequency */
+ case MSR_IA32_APICBASE:
+ data = kvm_get_apic_base(vcpu);
+ case MSR_IA32_MISC_ENABLE:
+ data = vcpu->arch.ia32_misc_enable_msr;
+#ifdef CONFIG_X86_64
+ data = vcpu->arch.shadow_efer;
+ pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+EXPORT_SYMBOL_GPL(kvm_get_msr_common);
- * Allocate some memory and give it an address in the guest physical address
+ * Read or write a bunch of msrs. All parameters are kernel addresses.
- * Discontiguous memory is allowed, mostly for framebuffers.
+ * @return number of msrs set successfully.
-static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
- struct kvm_memory_region *mem)
+static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
+ struct kvm_msr_entry *entries,
+ int (*do_msr)(struct kvm_vcpu *vcpu,
+ unsigned index, u64 *data))
- unsigned long npages;
- struct kvm_memory_slot *memslot;
- struct kvm_memory_slot old, new;
- /* General sanity checks */
- if (mem->memory_size & (PAGE_SIZE - 1))
- if (mem->guest_phys_addr & (PAGE_SIZE - 1))
+ for (i = 0; i < msrs->nmsrs; ++i)
+ if (do_msr(vcpu, entries[i].index, &entries[i].data))
+ * Read or write a bunch of msrs. Parameters are user addresses.
+ * @return number of msrs set successfully.
+static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
+ int (*do_msr)(struct kvm_vcpu *vcpu,
+ unsigned index, u64 *data),
+ struct kvm_msrs msrs;
+ struct kvm_msr_entry *entries;
+ if (copy_from_user(&msrs, user_msrs, sizeof msrs))
- if (mem->slot >= KVM_MEMORY_SLOTS)
+ if (msrs.nmsrs >= MAX_IO_MSRS)
- if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
+ size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
+ entries = vmalloc(size);
- memslot = &kvm->memslots[mem->slot];
- base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
- npages = mem->memory_size >> PAGE_SHIFT;
+ if (copy_from_user(entries, user_msrs->entries, size))
- mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
+ r = n = __msr_io(vcpu, &msrs, entries, do_msr);
- mutex_lock(&kvm->lock);
+ if (writeback && copy_to_user(user_msrs->entries, entries, size))
- new = old = *memslot;
- new.base_gfn = base_gfn;
- new.npages = npages;
- new.flags = mem->flags;
- /* Disallow changing a memory slot's size. */
- if (npages && old.npages && npages != old.npages)
+ * Make sure that a cpu that is being hot-unplugged does not have any vcpus
+void decache_vcpus_on_cpu(int cpu)
+ struct kvm_vcpu *vcpu;
+ spin_lock(&kvm_lock);
+ list_for_each_entry(vm, &vm_list, vm_list)
+ for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+ vcpu = vm->vcpus[i];
+ * If the vcpu is locked, then it is running on some
+ * other cpu and therefore it is not cached on the
+ * cpu in question.
+ * If it's not locked, check the last cpu it executed
+ if (mutex_trylock(&vcpu->mutex)) {
+ if (vcpu->cpu == cpu) {
+ kvm_x86_ops->vcpu_decache(vcpu);
+ mutex_unlock(&vcpu->mutex);
+ spin_unlock(&kvm_lock);
- /* Check for overlaps */
- for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
- struct kvm_memory_slot *s = &kvm->memslots[i];
+int kvm_dev_ioctl_check_extension(long ext)
- if (!((base_gfn + npages <= s->base_gfn) ||
- (base_gfn >= s->base_gfn + s->npages)))
+ case KVM_CAP_IRQCHIP:
+ case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
+ case KVM_CAP_USER_MEMORY:
+ case KVM_CAP_SET_TSS_ADDR:
+ case KVM_CAP_EXT_CPUID:
- /* Deallocate if slot is being removed */
- new.phys_mem = NULL;
- /* Free page dirty bitmap if unneeded */
- if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
- new.dirty_bitmap = NULL;
+long kvm_arch_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+ void __user *argp = (void __user *)arg;
+ case KVM_GET_MSR_INDEX_LIST: {
+ struct kvm_msr_list __user *user_msr_list = argp;
+ struct kvm_msr_list msr_list;
- /* Allocate if a slot is being created */
- if (npages && !new.phys_mem) {
- new.phys_mem = vmalloc(npages * sizeof(struct page *));
+ if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
+ n = msr_list.nmsrs;
+ msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
+ if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
+ if (n < num_msrs_to_save)
+ if (copy_to_user(user_msr_list->indices, &msrs_to_save,
+ num_msrs_to_save * sizeof(u32)))
+ if (copy_to_user(user_msr_list->indices
+ + num_msrs_to_save * sizeof(u32),
+ ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
- if (!new.phys_mem)
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+ kvm_x86_ops->vcpu_load(vcpu, cpu);
- memset(new.phys_mem, 0, npages * sizeof(struct page *));
- for (i = 0; i < npages; ++i) {
- new.phys_mem[i] = alloc_page(GFP_HIGHUSER
- if (!new.phys_mem[i])
- set_page_private(new.phys_mem[i],0);
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+ kvm_x86_ops->vcpu_put(vcpu);
+ kvm_put_guest_fpu(vcpu);
+static int is_efer_nx(void)
- /* Allocate page dirty bitmap if needed */
- if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
- unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
+ rdmsrl(MSR_EFER, efer);
+ return efer & EFER_NX;
+static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+ struct kvm_cpuid_entry2 *e, *entry;
- new.dirty_bitmap = vmalloc(dirty_bytes);
- if (!new.dirty_bitmap)
- memset(new.dirty_bitmap, 0, dirty_bytes);
+ for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+ e = &vcpu->arch.cpuid_entries[i];
+ if (e->function == 0x80000001) {
+ if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
+ entry->edx &= ~(1 << 20);
+ printk(KERN_INFO "kvm: guest NX capability removed\n");
- if (mem->slot >= kvm->nmemslots)
- kvm->nmemslots = mem->slot + 1;
+/* when an old userspace process fills a new kernel module */
+static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid *cpuid,
+ struct kvm_cpuid_entry __user *entries)
+ struct kvm_cpuid_entry *cpuid_entries;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
+ if (!cpuid_entries)
+ if (copy_from_user(cpuid_entries, entries,
+ cpuid->nent * sizeof(struct kvm_cpuid_entry)))
+ for (i = 0; i < cpuid->nent; i++) {
+ vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
+ vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
+ vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
+ vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
+ vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
+ vcpu->arch.cpuid_entries[i].index = 0;
+ vcpu->arch.cpuid_entries[i].flags = 0;
+ vcpu->arch.cpuid_entries[i].padding[0] = 0;
+ vcpu->arch.cpuid_entries[i].padding[1] = 0;
+ vcpu->arch.cpuid_entries[i].padding[2] = 0;
+ vcpu->arch.cpuid_nent = cpuid->nent;
+ cpuid_fix_nx_cap(vcpu);
- kvm_mmu_slot_remove_write_access(kvm, mem->slot);
- kvm_flush_remote_tlbs(kvm);
+ vfree(cpuid_entries);
- mutex_unlock(&kvm->lock);
+static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
- kvm_free_physmem_slot(&old, &new);
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
+ cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
+ vcpu->arch.cpuid_nent = cpuid->nent;
- mutex_unlock(&kvm->lock);
- kvm_free_physmem_slot(&new, &old);
- * Get (and clear) the dirty memory log for a memory slot.
-static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
- struct kvm_dirty_log *log)
+static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
- struct kvm_memory_slot *memslot;
- unsigned long any = 0;
- mutex_lock(&kvm->lock);
- if (log->slot >= KVM_MEMORY_SLOTS)
+ if (cpuid->nent < vcpu->arch.cpuid_nent)
- memslot = &kvm->memslots[log->slot];
- if (!memslot->dirty_bitmap)
+ if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
+ vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+ cpuid->nent = vcpu->arch.cpuid_nent;
+static inline u32 bit(int bitno)
+ return 1 << (bitno & 31);
+static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+ entry->function = function;
+ entry->index = index;
+ cpuid_count(entry->function, entry->index,
+ &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+ u32 index, int *nent, int maxnent)
+ const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
+ bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
+ bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
+ bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
+ bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
+ bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
+ bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
+ bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
+ bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
+ bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
+ const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
+ bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
+ bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
+ bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
+ bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
+ bit(X86_FEATURE_PGE) |
+ bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
+ bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
+ bit(X86_FEATURE_SYSCALL) |
+ (bit(X86_FEATURE_NX) && is_efer_nx()) |
+#ifdef CONFIG_X86_64
+ bit(X86_FEATURE_LM) |
+ bit(X86_FEATURE_MMXEXT) |
+ bit(X86_FEATURE_3DNOWEXT) |
+ bit(X86_FEATURE_3DNOW);
+ const u32 kvm_supported_word3_x86_features =
+ bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
+ const u32 kvm_supported_word6_x86_features =
+ bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
+ /* all func 2 cpuid_count() should be called on the same cpu */
+ do_cpuid_1_ent(entry, function, index);
+ switch (function) {
+ entry->eax = min(entry->eax, (u32)0xb);
+ entry->edx &= kvm_supported_word0_x86_features;
+ entry->ecx &= kvm_supported_word3_x86_features;
+ /* function 2 entries are STATEFUL. That is, repeated cpuid commands
+ * may return different values. This forces us to get_cpu() before
+ * issuing the first command, and also to emulate this annoying behavior
+ * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
+ int t, times = entry->eax & 0xff;
+ entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+ for (t = 1; t < times && *nent < maxnent; ++t) {
+ do_cpuid_1_ent(&entry[t], function, 0);
+ entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+ /* function 4 and 0xb have additional index. */
+ int index, cache_type;
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ /* read more entries until cache_type is zero */
+ for (index = 1; *nent < maxnent; ++index) {
+ cache_type = entry[index - 1].eax & 0x1f;
+ do_cpuid_1_ent(&entry[index], function, index);
+ entry[index].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ int index, level_type;
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ /* read more entries until level_type is zero */
+ for (index = 1; *nent < maxnent; ++index) {
+ level_type = entry[index - 1].ecx & 0xff;
+ do_cpuid_1_ent(&entry[index], function, index);
+ entry[index].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ entry->eax = min(entry->eax, 0x8000001a);
+ entry->edx &= kvm_supported_word1_x86_features;
+ entry->ecx &= kvm_supported_word6_x86_features;
- n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+ struct kvm_cpuid_entry2 *cpuid_entries;
+ int limit, nent = 0, r = -E2BIG;
- for (i = 0; !any && i < n/sizeof(long); ++i)
- any = memslot->dirty_bitmap[i];
+ if (cpuid->nent < 1)
+ cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
+ if (!cpuid_entries)
+ do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
+ limit = cpuid_entries[0].eax;
+ for (func = 1; func <= limit && nent < cpuid->nent; ++func)
+ do_cpuid_ent(&cpuid_entries[nent], func, 0,
+ &nent, cpuid->nent);
+ if (nent >= cpuid->nent)
+ do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
+ limit = cpuid_entries[nent - 1].eax;
+ for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
+ do_cpuid_ent(&cpuid_entries[nent], func, 0,
+ &nent, cpuid->nent);
- if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
+ if (copy_to_user(entries, cpuid_entries,
+ nent * sizeof(struct kvm_cpuid_entry2)))
+ cpuid->nent = nent;
- /* If nothing is dirty, don't bother messing with page tables. */
- kvm_mmu_slot_remove_write_access(kvm, log->slot);
- kvm_flush_remote_tlbs(kvm);
- memset(memslot->dirty_bitmap, 0, n);
+ vfree(cpuid_entries);
+static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s)
+ memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
+static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s)
+ memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
+ kvm_apic_post_state_restore(vcpu);
+static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
+ struct kvm_interrupt *irq)
+ if (irq->irq < 0 || irq->irq >= 256)
+ if (irqchip_in_kernel(vcpu->kvm))
+ set_bit(irq->irq, vcpu->arch.irq_pending);
+ set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
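
kvm_vcpu_ioctl_interrupt() is the landing point for userspace irqchip models: the vector is bounds-checked against 0..255, rejected when the in-kernel irqchip owns delivery, and otherwise recorded in the irq_pending bitmap together with its summary bit. From userspace the call is a plain vcpu ioctl; a minimal sketch (device and vcpu setup elided, vcpufd assumed to exist):

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Inject vector 32 into a vcpu of a VM that uses a userspace
     * irqchip; fails if KVM_CREATE_IRQCHIP was used for this VM. */
    int inject_irq(int vcpufd)
    {
            struct kvm_interrupt irq = { .irq = 32 };

            return ioctl(vcpufd, KVM_INTERRUPT, &irq);
    }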
+long kvm_arch_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ case KVM_GET_LAPIC: {
+ struct kvm_lapic_state lapic;
+ memset(&lapic, 0, sizeof lapic);
+ r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
+ if (copy_to_user(argp, &lapic, sizeof lapic))
+ case KVM_SET_LAPIC: {
+ struct kvm_lapic_state lapic;
+ if (copy_from_user(&lapic, argp, sizeof lapic))
+ r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
+ case KVM_INTERRUPT: {
+ struct kvm_interrupt irq;
+ if (copy_from_user(&irq, argp, sizeof irq))
+ r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
+ case KVM_SET_CPUID: {
+ struct kvm_cpuid __user *cpuid_arg = argp;
+ struct kvm_cpuid cpuid;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
+ case KVM_SET_CPUID2: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
+ cpuid_arg->entries);
+ case KVM_GET_CPUID2: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
+ cpuid_arg->entries);
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+ case KVM_GET_MSRS:
+ r = msr_io(vcpu, argp, kvm_get_msr, 1);
+ case KVM_SET_MSRS:
+ r = msr_io(vcpu, argp, do_set_msr, 0);
- mutex_unlock(&kvm->lock);
+static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
+ if (addr > (unsigned int)(-3 * PAGE_SIZE))
+ ret = kvm_x86_ops->set_tss_addr(kvm, addr);
+static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
+ u32 kvm_nr_mmu_pages)
+ if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
+ mutex_lock(&kvm->lock);
+ kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
+ kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
+ mutex_unlock(&kvm->lock);
+static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
+ return kvm->arch.n_alloc_mmu_pages;
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+ struct kvm_mem_alias *alias;
+ for (i = 0; i < kvm->arch.naliases; ++i) {
+ alias = &kvm->arch.aliases[i];
+ if (gfn >= alias->base_gfn
+ && gfn < alias->base_gfn + alias->npages)
+ return alias->target_gfn + gfn - alias->base_gfn;
* Set a new alias region. Aliases map a portion of physical memory into
* another portion. This is useful for memory windows, for example the PC
@@ -834,15 +1226,15 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
mutex_lock(&kvm->lock);
- p = &kvm->aliases[alias->slot];
+ p = &kvm->arch.aliases[alias->slot];
p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
p->npages = alias->memory_size >> PAGE_SHIFT;
p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
for (n = KVM_ALIAS_SLOTS; n > 0; --n)
- if (kvm->aliases[n - 1].npages)
+ if (kvm->arch.aliases[n - 1].npages)
- kvm->naliases = n;
+ kvm->arch.naliases = n;
kvm_mmu_zap_all(kvm);
@@ -861,17 +1253,17 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
- memcpy (&chip->chip.pic,
+ memcpy(&chip->chip.pic,
&pic_irqchip(kvm)->pics[0],
sizeof(struct kvm_pic_state));
case KVM_IRQCHIP_PIC_SLAVE:
- memcpy (&chip->chip.pic,
+ memcpy(&chip->chip.pic,
&pic_irqchip(kvm)->pics[1],
sizeof(struct kvm_pic_state));
case KVM_IRQCHIP_IOAPIC:
- memcpy (&chip->chip.ioapic,
+ memcpy(&chip->chip.ioapic,
ioapic_irqchip(kvm),
sizeof(struct kvm_ioapic_state));
@@ -889,17 +1281,17 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
- memcpy (&pic_irqchip(kvm)->pics[0],
+ memcpy(&pic_irqchip(kvm)->pics[0],
sizeof(struct kvm_pic_state));
case KVM_IRQCHIP_PIC_SLAVE:
- memcpy (&pic_irqchip(kvm)->pics[1],
+ memcpy(&pic_irqchip(kvm)->pics[1],
sizeof(struct kvm_pic_state));
case KVM_IRQCHIP_IOAPIC:
- memcpy (ioapic_irqchip(kvm),
+ memcpy(ioapic_irqchip(kvm),
sizeof(struct kvm_ioapic_state));
@@ -911,110 +1303,191 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+ * Get (and clear) the dirty memory log for a memory slot.
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+ struct kvm_dirty_log *log)
- struct kvm_mem_alias *alias;
- for (i = 0; i < kvm->naliases; ++i) {
- alias = &kvm->aliases[i];
- if (gfn >= alias->base_gfn
- && gfn < alias->base_gfn + alias->npages)
- return alias->target_gfn + gfn - alias->base_gfn;
+ struct kvm_memory_slot *memslot;
-static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+ mutex_lock(&kvm->lock);
- for (i = 0; i < kvm->nmemslots; ++i) {
- struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ r = kvm_get_dirty_log(kvm, log, &is_dirty);
- if (gfn >= memslot->base_gfn
- && gfn < memslot->base_gfn + memslot->npages)
+ /* If nothing is dirty, don't bother messing with page tables. */
+ kvm_mmu_slot_remove_write_access(kvm, log->slot);
+ kvm_flush_remote_tlbs(kvm);
+ memslot = &kvm->memslots[log->slot];
+ n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+ memset(memslot->dirty_bitmap, 0, n);
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
- gfn = unalias_gfn(kvm, gfn);
- return __gfn_to_memslot(kvm, gfn);
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
- struct kvm_memory_slot *slot;
- gfn = unalias_gfn(kvm, gfn);
- slot = __gfn_to_memslot(kvm, gfn);
- return slot->phys_mem[gfn - slot->base_gfn];
+ mutex_unlock(&kvm->lock);
-EXPORT_SYMBOL_GPL(gfn_to_page);
-/* WARNING: Does not work on aliased pages. */
-void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
+long kvm_arch_vm_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
- struct kvm_memory_slot *memslot;
+ struct kvm *kvm = filp->private_data;
+ void __user *argp = (void __user *)arg;
- memslot = __gfn_to_memslot(kvm, gfn);
- if (memslot && memslot->dirty_bitmap) {
- unsigned long rel_gfn = gfn - memslot->base_gfn;
+ case KVM_SET_TSS_ADDR:
+ r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
+ case KVM_SET_MEMORY_REGION: {
+ struct kvm_memory_region kvm_mem;
+ struct kvm_userspace_memory_region kvm_userspace_mem;
- if (!test_bit(rel_gfn, memslot->dirty_bitmap))
- set_bit(rel_gfn, memslot->dirty_bitmap);
+ if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
+ kvm_userspace_mem.slot = kvm_mem.slot;
+ kvm_userspace_mem.flags = kvm_mem.flags;
+ kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
+ kvm_userspace_mem.memory_size = kvm_mem.memory_size;
+ r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
+ case KVM_SET_NR_MMU_PAGES:
+ r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
+ case KVM_GET_NR_MMU_PAGES:
+ r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
+ case KVM_SET_MEMORY_ALIAS: {
+ struct kvm_memory_alias alias;
-int emulator_read_std(unsigned long addr,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
+ if (copy_from_user(&alias, argp, sizeof alias))
+ r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
+ case KVM_CREATE_IRQCHIP:
+ kvm->arch.vpic = kvm_create_pic(kvm);
+ if (kvm->arch.vpic) {
+ r = kvm_ioapic_init(kvm);
+ kfree(kvm->arch.vpic);
+ kvm->arch.vpic = NULL;
+ case KVM_IRQ_LINE: {
+ struct kvm_irq_level irq_event;
- gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
- unsigned offset = addr & (PAGE_SIZE-1);
- unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
- unsigned long pfn;
- struct page *page;
+ if (copy_from_user(&irq_event, argp, sizeof irq_event))
+ if (irqchip_in_kernel(kvm)) {
+ mutex_lock(&kvm->lock);
+ if (irq_event.irq < 16)
+ kvm_pic_set_irq(pic_irqchip(kvm),
+ kvm_ioapic_set_irq(kvm->arch.vioapic,
+ mutex_unlock(&kvm->lock);
+ case KVM_GET_IRQCHIP: {
+ /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+ struct kvm_irqchip chip;
- if (gpa == UNMAPPED_GVA)
- return X86EMUL_PROPAGATE_FAULT;
- pfn = gpa >> PAGE_SHIFT;
- page = gfn_to_page(vcpu->kvm, pfn);
- return X86EMUL_UNHANDLEABLE;
- page_virt = kmap_atomic(page, KM_USER0);
+ if (copy_from_user(&chip, argp, sizeof chip))
+ if (!irqchip_in_kernel(kvm))
+ r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
+ if (copy_to_user(argp, &chip, sizeof chip))
+ case KVM_SET_IRQCHIP: {
+ /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+ struct kvm_irqchip chip;
- memcpy(data, page_virt + offset, tocopy);
+ if (copy_from_user(&chip, argp, sizeof chip))
+ if (!irqchip_in_kernel(kvm))
+ r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
+ case KVM_GET_SUPPORTED_CPUID: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
- kunmap_atomic(page_virt, KM_USER0);
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
+ cpuid_arg->entries);
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
- return X86EMUL_CONTINUE;
-EXPORT_SYMBOL_GPL(emulator_read_std);
-static int emulator_write_std(unsigned long addr,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
+static void kvm_init_msr_list(void)
- pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes);
- return X86EMUL_UNHANDLEABLE;
+ for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
+ if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
+ msrs_to_save[j] = msrs_to_save[i];
+ num_msrs_to_save = j;
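
kvm_init_msr_list() compacts msrs_to_save in place at module load: each candidate is probed with rdmsr_safe(), and any index the host cannot read is dropped, so KVM_GET_MSR_INDEX_LIST never advertises an MSR that cannot actually be saved and restored. Userspace reads the list through a header followed by a flexible array of indices; a sketch (error handling trimmed, the 256-entry bound is an assumption of this example):

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);
            unsigned int n = 256;
            struct kvm_msr_list *list;

            /* Over-allocate; the ioctl rewrites nmsrs with the real count. */
            list = calloc(1, sizeof(*list) + n * sizeof(__u32));
            list->nmsrs = n;
            if (ioctl(kvm, KVM_GET_MSR_INDEX_LIST, list) == 0) {
                    unsigned int i;
                    for (i = 0; i < list->nmsrs; i++)
                            printf("0x%x\n", list->indices[i]);
            }
            free(list);
            return 0;
    }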
@@ -1025,14 +1498,15 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
struct kvm_io_device *dev;
- dev = &vcpu->apic->dev;
+ if (vcpu->arch.apic) {
+ dev = &vcpu->arch.apic->dev;
if (dev->in_range(dev, addr))
static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
@@ -1044,11 +1518,33 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
-static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
+int emulator_read_std(unsigned long addr,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu)
- return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
+ gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ unsigned offset = addr & (PAGE_SIZE-1);
+ unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
+ if (gpa == UNMAPPED_GVA)
+ return X86EMUL_PROPAGATE_FAULT;
+ ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
+ return X86EMUL_UNHANDLEABLE;
+ return X86EMUL_CONTINUE;
+EXPORT_SYMBOL_GPL(emulator_read_std);
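The rewritten emulator_read_std() translates one guest virtual address at a time and clamps each copy to the end of the current page, since contiguous guest-virtual pages need not be contiguous in guest-physical space; the full function (its loop header is elided by the truncated hunk above) repeats this until all bytes are copied. A sketch of that per-page loop shape, with translate() and copy_from() as hypothetical stand-ins for gva_to_gpa() and kvm_read_guest():

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096UL
#define UNMAPPED  (~0ULL)

/* Hypothetical stand-ins for gva_to_gpa() / kvm_read_guest(). */
uint64_t translate(unsigned long gva);
int copy_from(uint64_t gpa, void *dst, size_t len);

int read_guest_virt(unsigned long addr, void *val, unsigned int bytes)
{
	char *data = val;

	while (bytes) {
		uint64_t gpa = translate(addr);
		unsigned offset = addr & (PAGE_SIZE - 1);
		/* Never cross a page in one copy: the next virtual page
		 * may live at an unrelated physical address. */
		unsigned tocopy = bytes < PAGE_SIZE - offset
				? bytes : (unsigned)(PAGE_SIZE - offset);

		if (gpa == UNMAPPED)
			return -1;	/* propagate a fault */
		if (copy_from(gpa, data, tocopy) < 0)
			return -1;
		bytes -= tocopy;
		data += tocopy;
		addr += tocopy;
	}
	return 0;
}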
static int emulator_read_emulated(unsigned long addr,
@@ -1062,14 +1558,21 @@ static int emulator_read_emulated(unsigned long addr,
memcpy(val, vcpu->mmio_data, bytes);
vcpu->mmio_read_completed = 0;
return X86EMUL_CONTINUE;
- } else if (emulator_read_std(addr, val, bytes, vcpu)
- == X86EMUL_CONTINUE)
- return X86EMUL_CONTINUE;
+ gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
- gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+ /* For APIC access vmexit */
+ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ if (emulator_read_std(addr, val, bytes, vcpu)
+ == X86EMUL_CONTINUE)
+ return X86EMUL_CONTINUE;
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
* Is this MMIO handled locally?
@@ -1090,19 +1593,12 @@ static int emulator_read_emulated(unsigned long addr,
static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes)
- struct page *page;
- if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
- mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
- virt = kmap_atomic(page, KM_USER0);
kvm_mmu_pte_write(vcpu, gpa, val, bytes);
- memcpy(virt + offset_in_page(gpa), val, bytes);
- kunmap_atomic(virt, KM_USER0);
@@ -1112,16 +1608,21 @@ static int emulator_write_emulated_onepage(unsigned long addr,
struct kvm_vcpu *vcpu)
struct kvm_io_device *mmio_dev;
- gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+ gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
if (gpa == UNMAPPED_GVA) {
- kvm_x86_ops->inject_page_fault(vcpu, addr, 2);
+ kvm_inject_page_fault(vcpu, addr, 2);
return X86EMUL_PROPAGATE_FAULT;
+ /* For APIC access vmexit */
+ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
if (emulator_write_phys(vcpu, gpa, val, bytes))
return X86EMUL_CONTINUE;
* Is this MMIO handled locally?
@@ -1173,6 +1674,31 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
printk(KERN_WARNING "kvm: emulating exchange as write\n");
+#ifndef CONFIG_X86_64
+ /* guests cmpxchg8b have to be emulated atomically */
+ gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ struct page *page;
+ if (gpa == UNMAPPED_GVA ||
+ (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+ page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ addr = kmap_atomic(page, KM_USER0);
+ set_64bit((u64 *)(addr + offset_in_page(gpa)), val);
+ kunmap_atomic(addr, KM_USER0);
+ kvm_release_page_dirty(page);
return emulator_write_emulated(addr, new, bytes, vcpu);
@@ -1188,11 +1714,11 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
int emulate_clts(struct kvm_vcpu *vcpu)
- kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS);
+ kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
return X86EMUL_CONTINUE;
-int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest)
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
struct kvm_vcpu *vcpu = ctxt->vcpu;
@@ -1223,7 +1749,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
static int reported;
- unsigned long rip = vcpu->rip;
+ unsigned long rip = vcpu->arch.rip;
unsigned long rip_linear;
rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
@@ -1241,7 +1767,6 @@ EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
struct x86_emulate_ops emulate_ops = {
.read_std = emulator_read_std,
- .write_std = emulator_write_std,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
.cmpxchg_emulated = emulator_cmpxchg_emulated,
@@ -1250,44 +1775,63 @@ struct x86_emulate_ops emulate_ops = {
int emulate_instruction(struct kvm_vcpu *vcpu,
struct kvm_run *run,
- struct x86_emulate_ctxt emulate_ctxt;
- vcpu->mmio_fault_cr2 = cr2;
+ vcpu->arch.mmio_fault_cr2 = cr2;
kvm_x86_ops->cache_regs(vcpu);
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- emulate_ctxt.vcpu = vcpu;
- emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
- emulate_ctxt.cr2 = cr2;
- emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
- ? X86EMUL_MODE_REAL : cs_l
- ? X86EMUL_MODE_PROT64 : cs_db
- ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
- if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
- emulate_ctxt.cs_base = 0;
- emulate_ctxt.ds_base = 0;
- emulate_ctxt.es_base = 0;
- emulate_ctxt.ss_base = 0;
- emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
- emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
- emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
- emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
+ vcpu->mmio_is_write = 0;
+ vcpu->arch.pio.string = 0;
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ vcpu->arch.emulate_ctxt.vcpu = vcpu;
+ vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+ vcpu->arch.emulate_ctxt.mode =
+ (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+ ? X86EMUL_MODE_REAL : cs_l
+ ? X86EMUL_MODE_PROT64 : cs_db
+ ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
+ vcpu->arch.emulate_ctxt.cs_base = 0;
+ vcpu->arch.emulate_ctxt.ds_base = 0;
+ vcpu->arch.emulate_ctxt.es_base = 0;
+ vcpu->arch.emulate_ctxt.ss_base = 0;
+ vcpu->arch.emulate_ctxt.cs_base =
+ get_segment_base(vcpu, VCPU_SREG_CS);
+ vcpu->arch.emulate_ctxt.ds_base =
+ get_segment_base(vcpu, VCPU_SREG_DS);
+ vcpu->arch.emulate_ctxt.es_base =
+ get_segment_base(vcpu, VCPU_SREG_ES);
+ vcpu->arch.emulate_ctxt.ss_base =
+ get_segment_base(vcpu, VCPU_SREG_SS);
+ vcpu->arch.emulate_ctxt.gs_base =
+ get_segment_base(vcpu, VCPU_SREG_GS);
+ vcpu->arch.emulate_ctxt.fs_base =
+ get_segment_base(vcpu, VCPU_SREG_FS);
+ r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+ ++vcpu->stat.insn_emulation;
+ ++vcpu->stat.insn_emulation_fail;
+ if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+ return EMULATE_DONE;
+ return EMULATE_FAIL;
- emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
- emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
+ r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
- vcpu->mmio_is_write = 0;
- vcpu->pio.string = 0;
- r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
- if (vcpu->pio.string)
+ if (vcpu->arch.pio.string)
return EMULATE_DO_MMIO;
if ((r || vcpu->mmio_is_write) && run) {
@@ -1309,7 +1853,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
kvm_x86_ops->decache_regs(vcpu);
- kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags);
+ kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
if (vcpu->mmio_is_write) {
vcpu->mmio_needed = 0;
@@ -1320,439 +1864,45 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
EXPORT_SYMBOL_GPL(emulate_instruction);
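The emulation mode chosen above folds three bits into one nested conditional: EFLAGS.VM selects real mode outright, otherwise CS.L selects 64-bit and CS.DB picks between 32- and 16-bit protected mode. The same decision written out flat, as a sketch whose enum names merely mirror the X86EMUL_MODE_* constants:

enum emul_mode { MODE_REAL, MODE_PROT16, MODE_PROT32, MODE_PROT64 };

/* vm86: EFLAGS.VM; cs_l: CS.L (long mode); cs_db: CS.D/B (default size) */
static enum emul_mode pick_mode(int vm86, int cs_l, int cs_db)
{
	if (vm86)
		return MODE_REAL;
	if (cs_l)
		return MODE_PROT64;
	return cs_db ? MODE_PROT32 : MODE_PROT16;
}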
- * The vCPU has executed a HLT instruction with in-kernel mode enabled.
-static void kvm_vcpu_block(struct kvm_vcpu *vcpu)
- DECLARE_WAITQUEUE(wait, current);
- add_wait_queue(&vcpu->wq, &wait);
- * We will block until either an interrupt or a signal wakes us up
- while (!kvm_cpu_has_interrupt(vcpu)
- && !signal_pending(current)
- && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE
- && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) {
- set_current_state(TASK_INTERRUPTIBLE);
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&vcpu->wq, &wait);
-int kvm_emulate_halt(struct kvm_vcpu *vcpu)
- ++vcpu->stat.halt_exits;
- if (irqchip_in_kernel(vcpu->kvm)) {
- vcpu->mp_state = VCPU_MP_STATE_HALTED;
- kvm_vcpu_block(vcpu);
- if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
- vcpu->run->exit_reason = KVM_EXIT_HLT;
-EXPORT_SYMBOL_GPL(kvm_emulate_halt);
-int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
- unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
- kvm_x86_ops->cache_regs(vcpu);
- ret = -KVM_EINVAL;
-#ifdef CONFIG_X86_64
- if (is_long_mode(vcpu)) {
- nr = vcpu->regs[VCPU_REGS_RAX];
- a0 = vcpu->regs[VCPU_REGS_RDI];
- a1 = vcpu->regs[VCPU_REGS_RSI];
- a2 = vcpu->regs[VCPU_REGS_RDX];
- a3 = vcpu->regs[VCPU_REGS_RCX];
- a4 = vcpu->regs[VCPU_REGS_R8];
- a5 = vcpu->regs[VCPU_REGS_R9];
- nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
- a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
- a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
- a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
- a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
- a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
- a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
- run->hypercall.nr = nr;
- run->hypercall.args[0] = a0;
- run->hypercall.args[1] = a1;
- run->hypercall.args[2] = a2;
- run->hypercall.args[3] = a3;
- run->hypercall.args[4] = a4;
- run->hypercall.args[5] = a5;
- run->hypercall.ret = ret;
- run->hypercall.longmode = is_long_mode(vcpu);
- kvm_x86_ops->decache_regs(vcpu);
- vcpu->regs[VCPU_REGS_RAX] = ret;
- kvm_x86_ops->decache_regs(vcpu);
-EXPORT_SYMBOL_GPL(kvm_hypercall);
-static u64 mk_cr_64(u64 curr_cr, u32 new_val)
- return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
-void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
- struct descriptor_table dt = { limit, base };
- kvm_x86_ops->set_gdt(vcpu, &dt);
-void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
- struct descriptor_table dt = { limit, base };
- kvm_x86_ops->set_idt(vcpu, &dt);
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
- unsigned long *rflags)
- *rflags = kvm_x86_ops->get_rflags(vcpu);
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
- unsigned long *rflags)
- set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
- *rflags = kvm_x86_ops->get_rflags(vcpu);
- set_cr3(vcpu, val);
- set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
- * Register the para guest with the host:
-static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
- struct kvm_vcpu_para_state *para_state;
- hpa_t para_state_hpa, hypercall_hpa;
- struct page *para_state_page;
- unsigned char *hypercall;
- gpa_t hypercall_gpa;
- printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
- printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
- * Needs to be page aligned:
- if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
- para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
- printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
- if (is_error_hpa(para_state_hpa))
- mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
- para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
- para_state = kmap(para_state_page);
- printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version);
- printk(KERN_DEBUG ".... size: %d\n", para_state->size);
- para_state->host_version = KVM_PARA_API_VERSION;
- * We cannot support guests that try to register themselves
- * with a newer API version than the host supports:
- if (para_state->guest_version > KVM_PARA_API_VERSION) {
- para_state->ret = -KVM_EINVAL;
- goto err_kunmap_skip;
- hypercall_gpa = para_state->hypercall_gpa;
- hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
- printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
- if (is_error_hpa(hypercall_hpa)) {
- para_state->ret = -KVM_EINVAL;
- goto err_kunmap_skip;
- printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
- vcpu->para_state_page = para_state_page;
- vcpu->para_state_gpa = para_state_gpa;
- vcpu->hypercall_gpa = hypercall_gpa;
- mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
- hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
- KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
- kvm_x86_ops->patch_hypercall(vcpu, hypercall);
- kunmap_atomic(hypercall, KM_USER1);
- para_state->ret = 0;
- kunmap(para_state_page);
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
- case 0xc0010010: /* SYSCFG */
- case 0xc0010015: /* HWCR */
- case MSR_IA32_PLATFORM_ID:
- case MSR_IA32_P5_MC_ADDR:
- case MSR_IA32_P5_MC_TYPE:
- case MSR_IA32_MC0_CTL:
- case MSR_IA32_MCG_STATUS:
- case MSR_IA32_MCG_CAP:
- case MSR_IA32_MC0_MISC:
- case MSR_IA32_MC0_MISC+4:
- case MSR_IA32_MC0_MISC+8:
- case MSR_IA32_MC0_MISC+12:
- case MSR_IA32_MC0_MISC+16:
- case MSR_IA32_UCODE_REV:
- case MSR_IA32_PERF_STATUS:
- case MSR_IA32_EBL_CR_POWERON:
- /* MTRR registers */
- case 0x200 ... 0x2ff:
- case 0xcd: /* fsb frequency */
- case MSR_IA32_APICBASE:
- data = kvm_get_apic_base(vcpu);
- case MSR_IA32_MISC_ENABLE:
- data = vcpu->ia32_misc_enable_msr;
-#ifdef CONFIG_X86_64
- data = vcpu->shadow_efer;
- pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
-EXPORT_SYMBOL_GPL(kvm_get_msr_common);
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
- return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
-#ifdef CONFIG_X86_64
-static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
- if (efer & EFER_RESERVED_BITS) {
- printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
- if (is_paging(vcpu)
- && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
- printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
- kvm_x86_ops->set_efer(vcpu, efer);
- efer &= ~EFER_LMA;
- efer |= vcpu->shadow_efer & EFER_LMA;
- vcpu->shadow_efer = efer;
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-#ifdef CONFIG_X86_64
- set_efer(vcpu, data);
- case MSR_IA32_MC0_STATUS:
- pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
- __FUNCTION__, data);
- case MSR_IA32_MCG_STATUS:
- pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
- __FUNCTION__, data);
- case MSR_IA32_UCODE_REV:
- case MSR_IA32_UCODE_WRITE:
- case 0x200 ... 0x2ff: /* MTRRs */
- case MSR_IA32_APICBASE:
- kvm_set_apic_base(vcpu, data);
- case MSR_IA32_MISC_ENABLE:
- vcpu->ia32_misc_enable_msr = data;
- * This is the 'probe whether the host is KVM' logic:
- case MSR_KVM_API_MAGIC:
- return vcpu_register_para(vcpu, data);
- pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
-EXPORT_SYMBOL_GPL(kvm_set_msr_common);
- * Writes msr value into into the appropriate "register".
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
- return kvm_x86_ops->set_msr(vcpu, msr_index, data);
-void kvm_resched(struct kvm_vcpu *vcpu)
- if (!need_resched())
-EXPORT_SYMBOL_GPL(kvm_resched);
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
- struct kvm_cpuid_entry *e, *best;
- kvm_x86_ops->cache_regs(vcpu);
- function = vcpu->regs[VCPU_REGS_RAX];
- vcpu->regs[VCPU_REGS_RAX] = 0;
- vcpu->regs[VCPU_REGS_RBX] = 0;
- vcpu->regs[VCPU_REGS_RCX] = 0;
- vcpu->regs[VCPU_REGS_RDX] = 0;
- for (i = 0; i < vcpu->cpuid_nent; ++i) {
- e = &vcpu->cpuid_entries[i];
- if (e->function == function) {
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
+ if (vcpu->arch.pio.guest_pages[i]) {
+ kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
+ vcpu->arch.pio.guest_pages[i] = NULL;
- * Both basic or both extended?
- if (((e->function ^ function) & 0x80000000) == 0)
- if (!best || e->function > best->function)
- vcpu->regs[VCPU_REGS_RAX] = best->eax;
- vcpu->regs[VCPU_REGS_RBX] = best->ebx;
- vcpu->regs[VCPU_REGS_RCX] = best->ecx;
- vcpu->regs[VCPU_REGS_RDX] = best->edx;
- kvm_x86_ops->decache_regs(vcpu);
- kvm_x86_ops->skip_emulated_instruction(vcpu);
-EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
static int pio_copy_data(struct kvm_vcpu *vcpu)
- void *p = vcpu->pio_data;
+ void *p = vcpu->arch.pio_data;
- int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
+ int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
- q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
+ q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
free_pio_guest_pages(vcpu);
- q += vcpu->pio.guest_page_offset;
- bytes = vcpu->pio.size * vcpu->pio.cur_count;
+ q += vcpu->arch.pio.guest_page_offset;
+ bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
+ if (vcpu->arch.pio.in)
memcpy(q, p, bytes);
memcpy(p, q, bytes);
- q -= vcpu->pio.guest_page_offset;
+ q -= vcpu->arch.pio.guest_page_offset;
free_pio_guest_pages(vcpu);
-static int complete_pio(struct kvm_vcpu *vcpu)
+int complete_pio(struct kvm_vcpu *vcpu)
- struct kvm_pio_request *io = &vcpu->pio;
+ struct kvm_pio_request *io = &vcpu->arch.pio;
@@ -1760,7 +1910,7 @@ static int complete_pio(struct kvm_vcpu *vcpu)
- memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
+ memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
@@ -1778,15 +1928,15 @@ static int complete_pio(struct kvm_vcpu *vcpu)
* The size of the register should really depend on
* current address size.
- vcpu->regs[VCPU_REGS_RCX] -= delta;
+ vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
- vcpu->regs[VCPU_REGS_RDI] += delta;
+ vcpu->arch.regs[VCPU_REGS_RDI] += delta;
- vcpu->regs[VCPU_REGS_RSI] += delta;
+ vcpu->arch.regs[VCPU_REGS_RSI] += delta;
kvm_x86_ops->decache_regs(vcpu);
@@ -1804,13 +1954,13 @@ static void kernel_pio(struct kvm_io_device *pio_dev,
/* TODO: String I/O for in kernel device */
mutex_lock(&vcpu->kvm->lock);
- kvm_iodevice_read(pio_dev, vcpu->pio.port,
+ if (vcpu->arch.pio.in)
+ kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
+ vcpu->arch.pio.size,
- kvm_iodevice_write(pio_dev, vcpu->pio.port,
+ kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
+ vcpu->arch.pio.size,
mutex_unlock(&vcpu->kvm->lock);
@@ -1818,8 +1968,8 @@ static void kernel_pio(struct kvm_io_device *pio_dev,
static void pio_string_write(struct kvm_io_device *pio_dev,
struct kvm_vcpu *vcpu)
- struct kvm_pio_request *io = &vcpu->pio;
- void *pd = vcpu->pio_data;
+ struct kvm_pio_request *io = &vcpu->arch.pio;
+ void *pd = vcpu->arch.pio_data;
mutex_lock(&vcpu->kvm->lock);
@@ -1832,32 +1982,38 @@ static void pio_string_write(struct kvm_io_device *pio_dev,
mutex_unlock(&vcpu->kvm->lock);
-int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
+ return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
int size, unsigned port)
struct kvm_io_device *pio_dev;
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
- vcpu->run->io.size = vcpu->pio.size = size;
+ vcpu->run->io.size = vcpu->arch.pio.size = size;
vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
- vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1;
- vcpu->run->io.port = vcpu->pio.port = port;
- vcpu->pio.in = in;
- vcpu->pio.string = 0;
- vcpu->pio.down = 0;
- vcpu->pio.guest_page_offset = 0;
- vcpu->pio.rep = 0;
+ vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
+ vcpu->run->io.port = vcpu->arch.pio.port = port;
+ vcpu->arch.pio.in = in;
+ vcpu->arch.pio.string = 0;
+ vcpu->arch.pio.down = 0;
+ vcpu->arch.pio.guest_page_offset = 0;
+ vcpu->arch.pio.rep = 0;
kvm_x86_ops->cache_regs(vcpu);
- memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
+ memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
kvm_x86_ops->decache_regs(vcpu);
kvm_x86_ops->skip_emulated_instruction(vcpu);
pio_dev = vcpu_find_pio_dev(vcpu, port);
- kernel_pio(pio_dev, vcpu, vcpu->pio_data);
+ kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
@@ -1877,15 +2033,15 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
- vcpu->run->io.size = vcpu->pio.size = size;
+ vcpu->run->io.size = vcpu->arch.pio.size = size;
vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
- vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count;
- vcpu->run->io.port = vcpu->pio.port = port;
- vcpu->pio.in = in;
- vcpu->pio.string = 1;
- vcpu->pio.down = down;
- vcpu->pio.guest_page_offset = offset_in_page(address);
- vcpu->pio.rep = rep;
+ vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
+ vcpu->run->io.port = vcpu->arch.pio.port = port;
+ vcpu->arch.pio.in = in;
+ vcpu->arch.pio.string = 1;
+ vcpu->arch.pio.down = down;
+ vcpu->arch.pio.guest_page_offset = offset_in_page(address);
+ vcpu->arch.pio.rep = rep;
kvm_x86_ops->skip_emulated_instruction(vcpu);
@@ -1911,37 +2067,35 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
* String I/O in reverse. Yuck. Kill the guest, fix later.
pr_unimpl(vcpu, "guest string pio down\n");
+ kvm_inject_gp(vcpu, 0);
vcpu->run->io.count = now;
- vcpu->pio.cur_count = now;
+ vcpu->arch.pio.cur_count = now;
- if (vcpu->pio.cur_count == vcpu->pio.count)
+ if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
kvm_x86_ops->skip_emulated_instruction(vcpu);
for (i = 0; i < nr_pages; ++i) {
mutex_lock(&vcpu->kvm->lock);
page = gva_to_page(vcpu, address + i * PAGE_SIZE);
- vcpu->pio.guest_pages[i] = page;
+ vcpu->arch.pio.guest_pages[i] = page;
mutex_unlock(&vcpu->kvm->lock);
+ kvm_inject_gp(vcpu, 0);
free_pio_guest_pages(vcpu);
pio_dev = vcpu_find_pio_dev(vcpu, port);
- if (!vcpu->pio.in) {
+ if (!vcpu->arch.pio.in) {
/* string PIO write */
ret = pio_copy_data(vcpu);
if (ret >= 0 && pio_dev) {
pio_string_write(pio_dev, vcpu);
- if (vcpu->pio.count == 0)
+ if (vcpu->arch.pio.count == 0)
@@ -1953,6 +2107,265 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
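In complete_pio() above, string I/O roughly decrements RCX by the element count just completed and advances RDI (for INS) or RSI (for OUTS) by size times that count, walking downwards instead when EFLAGS.DF is set. A small sketch of that bookkeeping; the field names are illustrative, not the kernel's struct kvm_pio_request:

struct pio_req {
	int size;	/* bytes per element */
	int cur_count;	/* elements completed this round */
	int in;		/* 1 = INS (writes RDI), 0 = OUTS (reads RSI) */
	int down;	/* EFLAGS.DF: walk memory downwards */
};

static void advance_string_pio(unsigned long *rcx, unsigned long *rsi,
			       unsigned long *rdi, const struct pio_req *io)
{
	long delta = (long)io->size * io->cur_count;

	if (io->down)
		delta = -delta;
	*rcx -= io->cur_count;		/* rep counter */
	if (io->in)
		*rdi += delta;		/* INS writes to ES:(E)DI */
	else
		*rsi += delta;		/* OUTS reads from DS:(E)SI */
}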
+int kvm_arch_init(void *opaque)
+ struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
+ r = kvm_mmu_module_init();
+ kvm_init_msr_list();
+ if (kvm_x86_ops) {
+ printk(KERN_ERR "kvm: already loaded the other module\n");
+ if (!ops->cpu_has_kvm_support()) {
+ printk(KERN_ERR "kvm: no hardware support\n");
+ if (ops->disabled_by_bios()) {
+ printk(KERN_ERR "kvm: disabled by bios\n");
+ kvm_x86_ops = ops;
+ kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
+ kvm_mmu_module_exit();
+void kvm_arch_exit(void)
+ kvm_x86_ops = NULL;
+ kvm_mmu_module_exit();
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+ ++vcpu->stat.halt_exits;
+ if (irqchip_in_kernel(vcpu->kvm)) {
+ vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
+ kvm_vcpu_block(vcpu);
+ if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
+ vcpu->run->exit_reason = KVM_EXIT_HLT;
+EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+ unsigned long nr, a0, a1, a2, a3, ret;
+ kvm_x86_ops->cache_regs(vcpu);
+ nr = vcpu->arch.regs[VCPU_REGS_RAX];
+ a0 = vcpu->arch.regs[VCPU_REGS_RBX];
+ a1 = vcpu->arch.regs[VCPU_REGS_RCX];
+ a2 = vcpu->arch.regs[VCPU_REGS_RDX];
+ a3 = vcpu->arch.regs[VCPU_REGS_RSI];
+ if (!is_long_mode(vcpu)) {
+ ret = -KVM_ENOSYS;
+ vcpu->arch.regs[VCPU_REGS_RAX] = ret;
+ kvm_x86_ops->decache_regs(vcpu);
+EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
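kvm_emulate_hypercall() reads the hypercall number from RAX and the arguments from RBX, RCX, RDX and RSI, returning the result in RAX; the !is_long_mode() branch (its body elided above) narrows the inputs outside long mode. A hedged guest-side sketch of issuing one such call with gcc inline assembly; whether a given host accepts a particular nr is an assumption:

static inline long hypercall1(unsigned long nr, unsigned long a0)
{
	long ret;

	/* nr in RAX, first argument in RBX; result comes back in RAX. */
	asm volatile("vmcall"
		     : "=a" (ret)
		     : "a" (nr), "b" (a0)
		     : "memory");
	return ret;
}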
+int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
+ char instruction[3];
+ mutex_lock(&vcpu->kvm->lock);
+ * Blow out the MMU to ensure that no other VCPU has an active mapping
+ * to ensure that the updated hypercall appears atomically across all
+ kvm_mmu_zap_all(vcpu->kvm);
+ kvm_x86_ops->cache_regs(vcpu);
+ kvm_x86_ops->patch_hypercall(vcpu, instruction);
+ if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
+ != X86EMUL_CONTINUE)
+ mutex_unlock(&vcpu->kvm->lock);
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+ return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
+ struct descriptor_table dt = { limit, base };
+ kvm_x86_ops->set_gdt(vcpu, &dt);
+void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
+ struct descriptor_table dt = { limit, base };
+ kvm_x86_ops->set_idt(vcpu, &dt);
+void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
+ unsigned long *rflags)
+ *rflags = kvm_x86_ops->get_rflags(vcpu);
+unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
+ kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+ return vcpu->arch.cr0;
+ return vcpu->arch.cr2;
+ return vcpu->arch.cr3;
+ return vcpu->arch.cr4;
+ return get_cr8(vcpu);
+ vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
+ unsigned long *rflags)
+ set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
+ *rflags = kvm_x86_ops->get_rflags(vcpu);
+ vcpu->arch.cr2 = val;
+ set_cr3(vcpu, val);
+ set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
+ set_cr8(vcpu, val & 0xfUL);
+ vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
+ struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
+ int j, nent = vcpu->arch.cpuid_nent;
+ e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
+ /* when no next entry is found, the current entry[i] is reselected */
+ for (j = i + 1; j == i; j = (j + 1) % nent) {
+ struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
+ if (ej->function == e->function) {
+ ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+ return 0; /* silence gcc, even though control never reaches here */
+/* find an entry with matching function, matching index (if needed), and that
+ * should be read next (if it's stateful) */
+static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
+ u32 function, u32 index)
+ if (e->function != function)
+ if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
+ if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
+ !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+ u32 function, index;
+ struct kvm_cpuid_entry2 *e, *best;
+ kvm_x86_ops->cache_regs(vcpu);
+ function = vcpu->arch.regs[VCPU_REGS_RAX];
+ index = vcpu->arch.regs[VCPU_REGS_RCX];
+ vcpu->arch.regs[VCPU_REGS_RAX] = 0;
+ vcpu->arch.regs[VCPU_REGS_RBX] = 0;
+ vcpu->arch.regs[VCPU_REGS_RCX] = 0;
+ vcpu->arch.regs[VCPU_REGS_RDX] = 0;
+ for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+ e = &vcpu->arch.cpuid_entries[i];
+ if (is_matching_cpuid_entry(e, function, index)) {
+ if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
+ move_to_next_stateful_cpuid_entry(vcpu, i);
+ * Both basic or both extended?
+ if (((e->function ^ function) & 0x80000000) == 0)
+ if (!best || e->function > best->function)
+ vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
+ vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
+ vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
+ vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
+ kvm_x86_ops->decache_regs(vcpu);
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
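The fallback search above only accepts a candidate from the same CPUID class as the query: bit 31 of the function number distinguishes the basic (0x0000xxxx) from the extended (0x8000xxxx) range, so XORing the two numbers and masking bit 31 yields zero exactly when the classes match. The test in isolation, as a sketch:

#include <stdint.h>

/* 1 if both functions are basic leaves or both are extended leaves. */
static int same_cpuid_class(uint32_t a, uint32_t b)
{
	return ((a ^ b) & 0x80000000u) == 0;
}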
* Check if userspace requested an interrupt window, and that the
* interrupt window is open.
@@ -1962,9 +2375,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run)
- return (!vcpu->irq_summary &&
+ return (!vcpu->arch.irq_summary &&
kvm_run->request_interrupt_window &&
- vcpu->interrupt_window_open &&
+ vcpu->arch.interrupt_window_open &&
(kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
@@ -1978,20 +2391,22 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
kvm_run->ready_for_interrupt_injection = 1;
kvm_run->ready_for_interrupt_injection =
- (vcpu->interrupt_window_open &&
- vcpu->irq_summary == 0);
+ (vcpu->arch.interrupt_window_open &&
+ vcpu->arch.irq_summary == 0);
static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
- if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
- printk("vcpu %d received sipi with vector # %x\n",
- vcpu->vcpu_id, vcpu->sipi_vector);
+ if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
+ pr_debug("vcpu %d received sipi with vector # %x\n",
+ vcpu->vcpu_id, vcpu->arch.sipi_vector);
kvm_lapic_reset(vcpu);
- kvm_x86_ops->vcpu_reset(vcpu);
- vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+ r = kvm_x86_ops->vcpu_reset(vcpu);
+ vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
@@ -2003,6 +2418,8 @@ again:
+ kvm_inject_pending_timer_irqs(vcpu);
kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -2019,16 +2436,18 @@ again:
- if (irqchip_in_kernel(vcpu->kvm))
+ if (vcpu->arch.exception.pending)
+ __queue_exception(vcpu);
+ else if (irqchip_in_kernel(vcpu->kvm))
kvm_x86_ops->inject_pending_irq(vcpu);
- else if (!vcpu->mmio_read_completed)
kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
vcpu->guest_mode = 1;
- if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
+ if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
kvm_x86_ops->tlb_flush(vcpu);
kvm_x86_ops->run(vcpu, kvm_run);
@@ -2055,9 +2474,12 @@ again:
if (unlikely(prof_on == KVM_PROFILING)) {
kvm_x86_ops->cache_regs(vcpu);
- profile_hit(KVM_PROFILING, (void *)vcpu->rip);
+ profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
+ if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
+ vcpu->arch.exception.pending = false;
r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
@@ -2067,10 +2489,8 @@ again:
++vcpu->stat.request_irq_exits;
- if (!need_resched()) {
- ++vcpu->stat.light_exits;
+ if (!need_resched())
@@ -2084,15 +2504,14 @@ out:
-static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
- if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
+ if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
kvm_vcpu_block(vcpu);
@@ -2105,18 +2524,18 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (!irqchip_in_kernel(vcpu->kvm))
set_cr8(vcpu, kvm_run->cr8);
- if (vcpu->pio.cur_count) {
+ if (vcpu->arch.pio.cur_count) {
r = complete_pio(vcpu);
+#if CONFIG_HAS_IOMEM
if (vcpu->mmio_needed) {
memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
vcpu->mmio_read_completed = 1;
vcpu->mmio_needed = 0;
r = emulate_instruction(vcpu, kvm_run,
- vcpu->mmio_fault_cr2, 0);
+ vcpu->arch.mmio_fault_cr2, 0, 1);
if (r == EMULATE_DO_MMIO) {
* Read-modify-write. Back to userspace.
@@ -2125,10 +2544,10 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
kvm_x86_ops->cache_regs(vcpu);
- vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
+ vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
kvm_x86_ops->decache_regs(vcpu);
@@ -2142,33 +2561,32 @@ out:
-static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
- struct kvm_regs *regs)
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
kvm_x86_ops->cache_regs(vcpu);
- regs->rax = vcpu->regs[VCPU_REGS_RAX];
- regs->rbx = vcpu->regs[VCPU_REGS_RBX];
- regs->rcx = vcpu->regs[VCPU_REGS_RCX];
- regs->rdx = vcpu->regs[VCPU_REGS_RDX];
- regs->rsi = vcpu->regs[VCPU_REGS_RSI];
- regs->rdi = vcpu->regs[VCPU_REGS_RDI];
- regs->rsp = vcpu->regs[VCPU_REGS_RSP];
- regs->rbp = vcpu->regs[VCPU_REGS_RBP];
+ regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
+ regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
+ regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
+ regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
+ regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
+ regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
#ifdef CONFIG_X86_64
- regs->r8 = vcpu->regs[VCPU_REGS_R8];
- regs->r9 = vcpu->regs[VCPU_REGS_R9];
- regs->r10 = vcpu->regs[VCPU_REGS_R10];
- regs->r11 = vcpu->regs[VCPU_REGS_R11];
- regs->r12 = vcpu->regs[VCPU_REGS_R12];
- regs->r13 = vcpu->regs[VCPU_REGS_R13];
- regs->r14 = vcpu->regs[VCPU_REGS_R14];
- regs->r15 = vcpu->regs[VCPU_REGS_R15];
+ regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
+ regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
+ regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
+ regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
+ regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
+ regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
+ regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
+ regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
- regs->rip = vcpu->rip;
+ regs->rip = vcpu->arch.rip;
regs->rflags = kvm_x86_ops->get_rflags(vcpu);
@@ -2182,31 +2600,30 @@ static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
-static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
- struct kvm_regs *regs)
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
- vcpu->regs[VCPU_REGS_RAX] = regs->rax;
- vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
- vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
- vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
- vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
- vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
- vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
- vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
+ vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
+ vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
+ vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
+ vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
+ vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
+ vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
+ vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
+ vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
#ifdef CONFIG_X86_64
- vcpu->regs[VCPU_REGS_R8] = regs->r8;
- vcpu->regs[VCPU_REGS_R9] = regs->r9;
- vcpu->regs[VCPU_REGS_R10] = regs->r10;
- vcpu->regs[VCPU_REGS_R11] = regs->r11;
- vcpu->regs[VCPU_REGS_R12] = regs->r12;
- vcpu->regs[VCPU_REGS_R13] = regs->r13;
- vcpu->regs[VCPU_REGS_R14] = regs->r14;
- vcpu->regs[VCPU_REGS_R15] = regs->r15;
+ vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
+ vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
+ vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
+ vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
+ vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
+ vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
+ vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
+ vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
- vcpu->rip = regs->rip;
+ vcpu->arch.rip = regs->rip;
kvm_x86_ops->set_rflags(vcpu, regs->rflags);
kvm_x86_ops->decache_regs(vcpu);
@@ -2222,8 +2639,18 @@ static void get_segment(struct kvm_vcpu *vcpu,
return kvm_x86_ops->get_segment(vcpu, var, seg);
-static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
- struct kvm_sregs *sregs)
+void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+ struct kvm_segment cs;
+ get_segment(vcpu, &cs, VCPU_SREG_CS);
+EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
struct descriptor_table dt;
@@ -2248,12 +2675,12 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
sregs->gdt.base = dt.base;
kvm_x86_ops->decache_cr4_guest_bits(vcpu);
- sregs->cr0 = vcpu->cr0;
- sregs->cr2 = vcpu->cr2;
- sregs->cr3 = vcpu->cr3;
- sregs->cr4 = vcpu->cr4;
+ sregs->cr0 = vcpu->arch.cr0;
+ sregs->cr2 = vcpu->arch.cr2;
+ sregs->cr3 = vcpu->arch.cr3;
+ sregs->cr4 = vcpu->arch.cr4;
sregs->cr8 = get_cr8(vcpu);
- sregs->efer = vcpu->shadow_efer;
+ sregs->efer = vcpu->arch.shadow_efer;
sregs->apic_base = kvm_get_apic_base(vcpu);
if (irqchip_in_kernel(vcpu->kvm)) {
@@ -2261,9 +2688,10 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
sizeof sregs->interrupt_bitmap);
pending_vec = kvm_x86_ops->get_irq(vcpu);
if (pending_vec >= 0)
- set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap);
+ set_bit(pending_vec,
+ (unsigned long *)sregs->interrupt_bitmap);
- memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
+ memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
sizeof sregs->interrupt_bitmap);
@@ -2277,8 +2705,8 @@ static void set_segment(struct kvm_vcpu *vcpu,
return kvm_x86_ops->set_segment(vcpu, var, seg);
-static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
- struct kvm_sregs *sregs)
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
int mmu_reset_needed = 0;
int i, pending_vec, max_bits;
@@ -2293,13 +2721,13 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
dt.base = sregs->gdt.base;
kvm_x86_ops->set_gdt(vcpu, &dt);
- vcpu->cr2 = sregs->cr2;
- mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
- vcpu->cr3 = sregs->cr3;
+ vcpu->arch.cr2 = sregs->cr2;
+ mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
+ vcpu->arch.cr3 = sregs->cr3;
set_cr8(vcpu, sregs->cr8);
- mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
+ mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
#ifdef CONFIG_X86_64
kvm_x86_ops->set_efer(vcpu, sregs->efer);
@@ -2307,25 +2735,25 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_x86_ops->decache_cr4_guest_bits(vcpu);
- mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
- vcpu->cr0 = sregs->cr0;
+ mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
+ vcpu->arch.cr0 = sregs->cr0;
kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
- mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
+ mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
if (!is_long_mode(vcpu) && is_pae(vcpu))
- load_pdptrs(vcpu, vcpu->cr3);
+ load_pdptrs(vcpu, vcpu->arch.cr3);
if (mmu_reset_needed)
kvm_mmu_reset_context(vcpu);
if (!irqchip_in_kernel(vcpu->kvm)) {
- memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
- sizeof vcpu->irq_pending);
- vcpu->irq_summary = 0;
- for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
- if (vcpu->irq_pending[i])
- __set_bit(i, &vcpu->irq_summary);
+ memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
+ sizeof vcpu->arch.irq_pending);
+ vcpu->arch.irq_summary = 0;
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
+ if (vcpu->arch.irq_pending[i])
+ __set_bit(i, &vcpu->arch.irq_summary);
max_bits = (sizeof sregs->interrupt_bitmap) << 3;
pending_vec = find_first_bit(
@@ -2334,7 +2762,8 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
/* Only pending external irq is handled here */
if (pending_vec < max_bits) {
kvm_x86_ops->set_irq(vcpu, pending_vec);
- printk("Set back pending irq %d\n", pending_vec);
+ pr_debug("Set back pending irq %d\n",
@@ -2353,174 +2782,8 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
- struct kvm_segment cs;
- get_segment(vcpu, &cs, VCPU_SREG_CS);
-EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
- * List of msr numbers which we expose to userspace through KVM_GET_MSRS
- * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
- * This list is modified at module load time to reflect the
- * capabilities of the host cpu.
-static u32 msrs_to_save[] = {
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
-#ifdef CONFIG_X86_64
- MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
- MSR_IA32_TIME_STAMP_COUNTER,
-static unsigned num_msrs_to_save;
-static u32 emulated_msrs[] = {
- MSR_IA32_MISC_ENABLE,
-static __init void kvm_init_msr_list(void)
- for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
- if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
- msrs_to_save[j] = msrs_to_save[i];
- num_msrs_to_save = j;
- * Adapt set_msr() to msr_io()'s calling convention
-static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
- return kvm_set_msr(vcpu, index, *data);
- * Read or write a bunch of msrs. All parameters are kernel addresses.
- * @return number of msrs set successfully.
-static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
- struct kvm_msr_entry *entries,
- int (*do_msr)(struct kvm_vcpu *vcpu,
- unsigned index, u64 *data))
- for (i = 0; i < msrs->nmsrs; ++i)
- if (do_msr(vcpu, entries[i].index, &entries[i].data))
- * Read or write a bunch of msrs. Parameters are user addresses.
- * @return number of msrs set successfully.
-static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
- int (*do_msr)(struct kvm_vcpu *vcpu,
- unsigned index, u64 *data),
- struct kvm_msrs msrs;
- struct kvm_msr_entry *entries;
- if (copy_from_user(&msrs, user_msrs, sizeof msrs))
- if (msrs.nmsrs >= MAX_IO_MSRS)
- size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
- entries = vmalloc(size);
- if (copy_from_user(entries, user_msrs->entries, size))
- r = n = __msr_io(vcpu, &msrs, entries, do_msr);
- if (writeback && copy_to_user(user_msrs->entries, entries, size))
- * Translate a guest virtual address to a guest physical address.
-static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
- struct kvm_translation *tr)
- unsigned long vaddr = tr->linear_address;
- mutex_lock(&vcpu->kvm->lock);
- gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
- tr->physical_address = gpa;
- tr->valid = gpa != UNMAPPED_GVA;
- tr->writeable = 1;
- mutex_unlock(&vcpu->kvm->lock);
-static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
- struct kvm_interrupt *irq)
- if (irq->irq < 0 || irq->irq >= 256)
- if (irqchip_in_kernel(vcpu->kvm))
- set_bit(irq->irq, vcpu->irq_pending);
- set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
-static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
- struct kvm_debug_guest *dbg)
+int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
+ struct kvm_debug_guest *dbg)
@@ -2533,179 +2796,6 @@ static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
-static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
- unsigned long address,
- struct kvm_vcpu *vcpu = vma->vm_file->private_data;
- unsigned long pgoff;
- struct page *page;
- pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- page = virt_to_page(vcpu->run);
- else if (pgoff == KVM_PIO_PAGE_OFFSET)
- page = virt_to_page(vcpu->pio_data);
- return NOPAGE_SIGBUS;
- *type = VM_FAULT_MINOR;
-static struct vm_operations_struct kvm_vcpu_vm_ops = {
- .nopage = kvm_vcpu_nopage,
-static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
- vma->vm_ops = &kvm_vcpu_vm_ops;
-static int kvm_vcpu_release(struct inode *inode, struct file *filp)
- struct kvm_vcpu *vcpu = filp->private_data;
- fput(vcpu->kvm->filp);
-static struct file_operations kvm_vcpu_fops = {
- .release = kvm_vcpu_release,
- .unlocked_ioctl = kvm_vcpu_ioctl,
- .compat_ioctl = kvm_vcpu_ioctl,
- .mmap = kvm_vcpu_mmap,
- * Allocates an inode for the vcpu.
-static int create_vcpu_fd(struct kvm_vcpu *vcpu)
- struct inode *inode;
- struct file *file;
- r = anon_inode_getfd(&fd, &inode, &file,
- "kvm-vcpu", &kvm_vcpu_fops, vcpu);
- atomic_inc(&vcpu->kvm->filp->f_count);
- * Creates some virtual cpus. Good luck creating more than one.
-static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
- struct kvm_vcpu *vcpu;
- if (!valid_vcpu(n))
- vcpu = kvm_x86_ops->vcpu_create(kvm, n);
- return PTR_ERR(vcpu);
- preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
- /* We do fxsave: this must be aligned. */
- BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
- r = kvm_mmu_setup(vcpu);
- mutex_lock(&kvm->lock);
- if (kvm->vcpus[n]) {
- mutex_unlock(&kvm->lock);
- kvm->vcpus[n] = vcpu;
- mutex_unlock(&kvm->lock);
- /* Now it's all set up, let userspace reach it */
- r = create_vcpu_fd(vcpu);
- mutex_lock(&kvm->lock);
- kvm->vcpus[n] = NULL;
- mutex_unlock(&kvm->lock);
- kvm_mmu_unload(vcpu);
- kvm_x86_ops->vcpu_free(vcpu);
-static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
- struct kvm_cpuid_entry *e, *entry;
- rdmsrl(MSR_EFER, efer);
- for (i = 0; i < vcpu->cpuid_nent; ++i) {
- e = &vcpu->cpuid_entries[i];
- if (e->function == 0x80000001) {
- if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
- entry->edx &= ~(1 << 20);
- printk(KERN_INFO "kvm: guest NX capability removed\n");
-static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
- struct kvm_cpuid *cpuid,
- struct kvm_cpuid_entry __user *entries)
- if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
- if (copy_from_user(&vcpu->cpuid_entries, entries,
- cpuid->nent * sizeof(struct kvm_cpuid_entry)))
- vcpu->cpuid_nent = cpuid->nent;
- cpuid_fix_nx_cap(vcpu);
-static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
- sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
- vcpu->sigset_active = 1;
- vcpu->sigset = *sigset;
- vcpu->sigset_active = 0;
* fxsave fpu state. Taken from x86_64/processor.h. To be killed when
* we have asm/x86/processor.h
@@ -2727,9 +2817,31 @@ struct fxsave {
-static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+ * Translate a guest virtual address to a guest physical address.
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+ struct kvm_translation *tr)
+ unsigned long vaddr = tr->linear_address;
+ mutex_lock(&vcpu->kvm->lock);
+ gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
+ tr->physical_address = gpa;
+ tr->valid = gpa != UNMAPPED_GVA;
+ tr->writeable = 1;
+ mutex_unlock(&vcpu->kvm->lock);
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
- struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
+ struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
@@ -2747,9 +2859,9 @@ static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
-static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
- struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
+ struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
@@ -2767,862 +2879,288 @@ static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
-static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
- struct kvm_lapic_state *s)
+void fx_init(struct kvm_vcpu *vcpu)
- memcpy(s->regs, vcpu->apic->regs, sizeof *s);
+ unsigned after_mxcsr_mask;
-static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
- struct kvm_lapic_state *s)
- memcpy(vcpu->apic->regs, s->regs, sizeof *s);
- kvm_apic_post_state_restore(vcpu);
+ /* Initialize guest FPU by resetting ours and saving into guest's */
+ preempt_disable();
+ fx_save(&vcpu->arch.host_fx_image);
+ fx_save(&vcpu->arch.guest_fx_image);
+ fx_restore(&vcpu->arch.host_fx_image);
+ vcpu->arch.cr0 |= X86_CR0_ET;
+ after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
+ vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
+ memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
+ 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
+EXPORT_SYMBOL_GPL(fx_init);
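fx_init() resets the guest image by round-tripping through the host's own FPU, then clears everything past the MXCSR mask field and sets MXCSR to its power-on value of 0x1f80. The fxsave/fxrstor instructions this relies on write a fixed 512-byte image and fault unless the buffer is 16-byte aligned, which is what the BUG_ON alignment checks elsewhere in the patch guard. A standalone illustration of that layout assumption, not the kernel's struct:

#include <assert.h>
#include <stdint.h>

/* fxsave writes a fixed 512-byte image; hardware requires the
 * buffer to be 16-byte aligned or the instruction faults. */
struct fx_image {
	uint8_t bytes[512];
} __attribute__((aligned(16)));

static void check_fx_alignment(const struct fx_image *img)
{
	assert(((uintptr_t)img & 0xF) == 0);
}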
9997
-static long kvm_vcpu_ioctl(struct file *filp,
9998
- unsigned int ioctl, unsigned long arg)
9999
+void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
10001
- struct kvm_vcpu *vcpu = filp->private_data;
10002
- void __user *argp = (void __user *)arg;
10010
- r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
10012
- case KVM_GET_REGS: {
10013
- struct kvm_regs kvm_regs;
10015
- memset(&kvm_regs, 0, sizeof kvm_regs);
10016
- r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
10020
- if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
10025
- case KVM_SET_REGS: {
10026
- struct kvm_regs kvm_regs;
10029
- if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
10031
- r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
10037
- case KVM_GET_SREGS: {
10038
- struct kvm_sregs kvm_sregs;
10040
- memset(&kvm_sregs, 0, sizeof kvm_sregs);
10041
- r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
10045
- if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
10050
- case KVM_SET_SREGS: {
10051
- struct kvm_sregs kvm_sregs;
10054
- if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
10056
- r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
10062
- case KVM_TRANSLATE: {
10063
- struct kvm_translation tr;
10066
- if (copy_from_user(&tr, argp, sizeof tr))
10068
- r = kvm_vcpu_ioctl_translate(vcpu, &tr);
10072
- if (copy_to_user(argp, &tr, sizeof tr))
10077
- case KVM_INTERRUPT: {
10078
- struct kvm_interrupt irq;
10081
- if (copy_from_user(&irq, argp, sizeof irq))
10083
- r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
- case KVM_DEBUG_GUEST: {
- struct kvm_debug_guest dbg;
- if (copy_from_user(&dbg, argp, sizeof dbg))
- r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
- case KVM_GET_MSRS:
- r = msr_io(vcpu, argp, kvm_get_msr, 1);
- case KVM_SET_MSRS:
- r = msr_io(vcpu, argp, do_set_msr, 0);
- case KVM_SET_CPUID: {
- struct kvm_cpuid __user *cpuid_arg = argp;
- struct kvm_cpuid cpuid;
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
- r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
- case KVM_SET_SIGNAL_MASK: {
- struct kvm_signal_mask __user *sigmask_arg = argp;
- struct kvm_signal_mask kvm_sigmask;
- sigset_t sigset, *p;
- if (copy_from_user(&kvm_sigmask, argp,
- sizeof kvm_sigmask))
- if (kvm_sigmask.len != sizeof sigset)
- if (copy_from_user(&sigset, sigmask_arg->sigset,
- r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
- case KVM_GET_FPU: {
- struct kvm_fpu fpu;
- memset(&fpu, 0, sizeof fpu);
- r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
- if (copy_to_user(argp, &fpu, sizeof fpu))
- case KVM_SET_FPU: {
- struct kvm_fpu fpu;
- if (copy_from_user(&fpu, argp, sizeof fpu))
- r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
- case KVM_GET_LAPIC: {
- struct kvm_lapic_state lapic;
- memset(&lapic, 0, sizeof lapic);
- r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
- if (copy_to_user(argp, &lapic, sizeof lapic))
- case KVM_SET_LAPIC: {
- struct kvm_lapic_state lapic;
+ if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
- if (copy_from_user(&lapic, argp, sizeof lapic))
- r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);;
+ vcpu->guest_fpu_loaded = 1;
+ fx_save(&vcpu->arch.host_fx_image);
+ fx_restore(&vcpu->arch.guest_fx_image);
+EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
-static long kvm_vm_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
- struct kvm *kvm = filp->private_data;
- void __user *argp = (void __user *)arg;
- case KVM_CREATE_VCPU:
- r = kvm_vm_ioctl_create_vcpu(kvm, arg);
- case KVM_SET_MEMORY_REGION: {
- struct kvm_memory_region kvm_mem;
- if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
- r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
- case KVM_GET_DIRTY_LOG: {
- struct kvm_dirty_log log;
- if (copy_from_user(&log, argp, sizeof log))
- r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
- case KVM_SET_MEMORY_ALIAS: {
- struct kvm_memory_alias alias;
- if (copy_from_user(&alias, argp, sizeof alias))
- r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
- case KVM_CREATE_IRQCHIP:
- kvm->vpic = kvm_create_pic(kvm);
- r = kvm_ioapic_init(kvm);
- kfree(kvm->vpic);
- kvm->vpic = NULL;
- case KVM_IRQ_LINE: {
- struct kvm_irq_level irq_event;
- if (copy_from_user(&irq_event, argp, sizeof irq_event))
- if (irqchip_in_kernel(kvm)) {
- mutex_lock(&kvm->lock);
- if (irq_event.irq < 16)
- kvm_pic_set_irq(pic_irqchip(kvm),
- irq_event.level);
- kvm_ioapic_set_irq(kvm->vioapic,
- irq_event.level);
- mutex_unlock(&kvm->lock);
- case KVM_GET_IRQCHIP: {
- /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip chip;
- if (copy_from_user(&chip, argp, sizeof chip))
- if (!irqchip_in_kernel(kvm))
- r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
- if (copy_to_user(argp, &chip, sizeof chip))
- case KVM_SET_IRQCHIP: {
- /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip chip;
+ if (!vcpu->guest_fpu_loaded)
- if (copy_from_user(&chip, argp, sizeof chip))
- if (!irqchip_in_kernel(kvm))
- r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
+ vcpu->guest_fpu_loaded = 0;
+ fx_save(&vcpu->arch.guest_fx_image);
+ fx_restore(&vcpu->arch.host_fx_image);
+ ++vcpu->stat.fpu_reload;
+EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
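The load/put pair above implements lazy FPU switching: guest FPU state is swapped in only when needed and swapped back out afterwards, with fpu_reload counting the round trips. A minimal sketch of the intended call pattern; the vcpu_run_sketch() wrapper is illustrative and not part of the patch:

/* Sketch of the lazy-FPU call pattern implied above; illustrative only. */
static void vcpu_run_sketch(struct kvm_vcpu *vcpu)
{
	kvm_load_guest_fpu(vcpu);	/* no-op if already loaded or !fpu_active */
	/* ... enter the guest and handle exits ... */
	kvm_put_guest_fpu(vcpu);	/* no-op unless guest FPU state is loaded */
}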
-static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
- unsigned long address,
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
- struct kvm *kvm = vma->vm_file->private_data;
- unsigned long pgoff;
- struct page *page;
- pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- page = gfn_to_page(kvm, pgoff);
- return NOPAGE_SIGBUS;
- if (type != NULL)
- *type = VM_FAULT_MINOR;
+ kvm_x86_ops->vcpu_free(vcpu);
-static struct vm_operations_struct kvm_vm_vm_ops = {
- .nopage = kvm_vm_nopage,
-static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
- vma->vm_ops = &kvm_vm_vm_ops;
+ return kvm_x86_ops->vcpu_create(kvm, id);
-static struct file_operations kvm_vm_fops = {
- .release = kvm_vm_release,
- .unlocked_ioctl = kvm_vm_ioctl,
- .compat_ioctl = kvm_vm_ioctl,
- .mmap = kvm_vm_mmap,
-static int kvm_dev_ioctl_create_vm(void)
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
- struct inode *inode;
- struct file *file;
- kvm = kvm_create_vm();
- return PTR_ERR(kvm);
- r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
- kvm_destroy_vm(kvm);
+ /* We do fxsave: this must be aligned. */
+ BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
- kvm->filp = file;
+ r = kvm_arch_vcpu_reset(vcpu);
+ r = kvm_mmu_setup(vcpu);
+ kvm_x86_ops->vcpu_free(vcpu);
-static long kvm_dev_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
- void __user *argp = (void __user *)arg;
- long r = -EINVAL;
- case KVM_GET_API_VERSION:
- r = KVM_API_VERSION;
- case KVM_CREATE_VM:
- r = kvm_dev_ioctl_create_vm();
- case KVM_GET_MSR_INDEX_LIST: {
- struct kvm_msr_list __user *user_msr_list = argp;
- struct kvm_msr_list msr_list;
- if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
- n = msr_list.nmsrs;
- msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
- if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
- if (n < num_msrs_to_save)
- if (copy_to_user(user_msr_list->indices, &msrs_to_save,
- num_msrs_to_save * sizeof(u32)))
- if (copy_to_user(user_msr_list->indices
- + num_msrs_to_save * sizeof(u32),
- ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
- case KVM_CHECK_EXTENSION: {
- int ext = (long)argp;
+ kvm_mmu_unload(vcpu);
- case KVM_CAP_IRQCHIP:
- case KVM_CAP_HLT:
- case KVM_GET_VCPU_MMAP_SIZE:
- r = 2 * PAGE_SIZE;
+ kvm_x86_ops->vcpu_free(vcpu);
-static struct file_operations kvm_chardev_ops = {
- .unlocked_ioctl = kvm_dev_ioctl,
- .compat_ioctl = kvm_dev_ioctl,
-static struct miscdevice kvm_dev = {
- &kvm_chardev_ops,
- * Make sure that a cpu that is being hot-unplugged does not have any vcpus
-static void decache_vcpus_on_cpu(int cpu)
+int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
- struct kvm_vcpu *vcpu;
- spin_lock(&kvm_lock);
- list_for_each_entry(vm, &vm_list, vm_list)
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = vm->vcpus[i];
- * If the vcpu is locked, then it is running on some
- * other cpu and therefore it is not cached on the
- * cpu in question.
- * If it's not locked, check the last cpu it executed
- if (mutex_trylock(&vcpu->mutex)) {
- if (vcpu->cpu == cpu) {
- kvm_x86_ops->vcpu_decache(vcpu);
- mutex_unlock(&vcpu->mutex);
- spin_unlock(&kvm_lock);
+ return kvm_x86_ops->vcpu_reset(vcpu);
-static void hardware_enable(void *junk)
+void kvm_arch_hardware_enable(void *garbage)
- int cpu = raw_smp_processor_id();
- if (cpu_isset(cpu, cpus_hardware_enabled))
- cpu_set(cpu, cpus_hardware_enabled);
- kvm_x86_ops->hardware_enable(NULL);
+ kvm_x86_ops->hardware_enable(garbage);
-static void hardware_disable(void *junk)
+void kvm_arch_hardware_disable(void *garbage)
- int cpu = raw_smp_processor_id();
- if (!cpu_isset(cpu, cpus_hardware_enabled))
- cpu_clear(cpu, cpus_hardware_enabled);
- decache_vcpus_on_cpu(cpu);
- kvm_x86_ops->hardware_disable(NULL);
+ kvm_x86_ops->hardware_disable(garbage);
-static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
+int kvm_arch_hardware_setup(void)
- int cpu = (long)v;
- case CPU_DYING_FROZEN:
- printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
- hardware_disable(NULL);
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
- smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
- case CPU_ONLINE_FROZEN:
- printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
- smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
- return NOTIFY_OK;
+ return kvm_x86_ops->hardware_setup();
-static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
+void kvm_arch_hardware_unsetup(void)
- if (val == SYS_RESTART) {
- * Some (well, at least mine) BIOSes hang on reboot if
- * in vmx root mode.
- printk(KERN_INFO "kvm: exiting hardware virtualization\n");
- on_each_cpu(hardware_disable, NULL, 0, 1);
- return NOTIFY_OK;
+ kvm_x86_ops->hardware_unsetup();
-static struct notifier_block kvm_reboot_notifier = {
- .notifier_call = kvm_reboot,
-void kvm_io_bus_init(struct kvm_io_bus *bus)
+void kvm_arch_check_processor_compat(void *rtn)
- memset(bus, 0, sizeof(*bus));
+ kvm_x86_ops->check_processor_compatibility(rtn);
-void kvm_io_bus_destroy(struct kvm_io_bus *bus)
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+ struct page *page;
- for (i = 0; i < bus->dev_count; i++) {
- struct kvm_io_device *pos = bus->devs[i];
+ BUG_ON(vcpu->kvm == NULL);
- kvm_iodevice_destructor(pos);
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
+ vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+ vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
-struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ vcpu->arch.pio_data = page_address(page);
- for (i = 0; i < bus->dev_count; i++) {
- struct kvm_io_device *pos = bus->devs[i];
+ r = kvm_mmu_create(vcpu);
+ goto fail_free_pio_data;
- if (pos->in_range(pos, addr))
+ if (irqchip_in_kernel(kvm)) {
+ r = kvm_create_lapic(vcpu);
+ goto fail_mmu_destroy;
-void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
- BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
- bus->devs[bus->dev_count++] = dev;
+ kvm_mmu_destroy(vcpu);
+fail_free_pio_data:
+ free_page((unsigned long)vcpu->arch.pio_data);
-static struct notifier_block kvm_cpu_notifier = {
- .notifier_call = kvm_cpu_hotplug,
- .priority = 20, /* must be > scheduler priority */
-static u64 stat_get(void *_offset)
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
- unsigned offset = (long)_offset;
- struct kvm_vcpu *vcpu;
- spin_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list)
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = kvm->vcpus[i];
- total += *(u32 *)((void *)vcpu + offset);
- spin_unlock(&kvm_lock);
+ kvm_free_lapic(vcpu);
+ kvm_mmu_destroy(vcpu);
+ free_page((unsigned long)vcpu->arch.pio_data);
-DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n");
-static __init void kvm_init_debug(void)
+struct kvm *kvm_arch_create_vm(void)
- struct kvm_stats_debugfs_item *p;
- debugfs_dir = debugfs_create_dir("kvm", NULL);
- for (p = debugfs_entries; p->name; ++p)
- p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
- (void *)(long)p->offset,
+ struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
-static void kvm_exit_debug(void)
- struct kvm_stats_debugfs_item *p;
+ return ERR_PTR(-ENOMEM);
- for (p = debugfs_entries; p->name; ++p)
- debugfs_remove(p->dentry);
- debugfs_remove(debugfs_dir);
+ INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
-static int kvm_suspend(struct sys_device *dev, pm_message_t state)
- hardware_disable(NULL);
-static int kvm_resume(struct sys_device *dev)
+static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
- hardware_enable(NULL);
+ kvm_mmu_unload(vcpu);
-static struct sysdev_class kvm_sysdev_class = {
- set_kset_name("kvm"),
- .suspend = kvm_suspend,
- .resume = kvm_resume,
-static struct sys_device kvm_sysdev = {
- .cls = &kvm_sysdev_class,
-hpa_t bad_page_address;
-struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
+static void kvm_free_vcpus(struct kvm *kvm)
- return container_of(pn, struct kvm_vcpu, preempt_notifier);
-static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
- struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
+ * Unpin any mmu pages first.
+ for (i = 0; i < KVM_MAX_VCPUS; ++i)
+ if (kvm->vcpus[i])
+ kvm_unload_vcpu_mmu(kvm->vcpus[i]);
+ for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+ if (kvm->vcpus[i]) {
+ kvm_arch_vcpu_free(kvm->vcpus[i]);
+ kvm->vcpus[i] = NULL;
- kvm_x86_ops->vcpu_load(vcpu, cpu);
-static void kvm_sched_out(struct preempt_notifier *pn,
- struct task_struct *next)
+void kvm_arch_destroy_vm(struct kvm *kvm)
- struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
- kvm_x86_ops->vcpu_put(vcpu);
+ kfree(kvm->arch.vpic);
+ kfree(kvm->arch.vioapic);
+ kvm_free_vcpus(kvm);
+ kvm_free_physmem(kvm);
-int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
- struct module *module)
+int kvm_arch_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old,
- if (kvm_x86_ops) {
- printk(KERN_ERR "kvm: already loaded the other module\n");
- if (!ops->cpu_has_kvm_support()) {
- printk(KERN_ERR "kvm: no hardware support\n");
- return -EOPNOTSUPP;
- if (ops->disabled_by_bios()) {
- printk(KERN_ERR "kvm: disabled by bios\n");
- return -EOPNOTSUPP;
- kvm_x86_ops = ops;
- r = kvm_x86_ops->hardware_setup();
+ int npages = mem->memory_size >> PAGE_SHIFT;
+ struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
- for_each_online_cpu(cpu) {
- smp_call_function_single(cpu,
- kvm_x86_ops->check_processor_compatibility,
- on_each_cpu(hardware_enable, NULL, 0, 1);
- r = register_cpu_notifier(&kvm_cpu_notifier);
- register_reboot_notifier(&kvm_reboot_notifier);
- r = sysdev_class_register(&kvm_sysdev_class);
- r = sysdev_register(&kvm_sysdev);
- /* A kmem cache lets us meet the alignment requirements of fx_save. */
- kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
- __alignof__(struct kvm_vcpu), 0, 0);
- if (!kvm_vcpu_cache) {
+ /* To keep backward compatibility with older userspace,
+ * x86 needs to handle the !user_alloc case.
+ if (!user_alloc) {
+ if (npages && !old.rmap) {
+ down_write(&current->mm->mmap_sem);
+ memslot->userspace_addr = do_mmap(NULL, 0,
+ npages * PAGE_SIZE,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS,
+ up_write(&current->mm->mmap_sem);
+ if (IS_ERR((void *)memslot->userspace_addr))
+ return PTR_ERR((void *)memslot->userspace_addr);
+ if (!old.user_alloc && old.rmap) {
+ down_write(&current->mm->mmap_sem);
+ ret = do_munmap(current->mm, old.userspace_addr,
+ old.npages * PAGE_SIZE);
+ up_write(&current->mm->mmap_sem);
+ printk(KERN_WARNING
+ "kvm_vm_ioctl_set_memory_region: "
+ "failed to munmap memory\n");
- kvm_chardev_ops.owner = module;
- r = misc_register(&kvm_dev);
- printk (KERN_ERR "kvm: misc device register failed\n");
+ if (!kvm->arch.n_requested_mmu_pages) {
+ unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
+ kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
- kvm_preempt_ops.sched_in = kvm_sched_in;
- kvm_preempt_ops.sched_out = kvm_sched_out;
+ kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+ kvm_flush_remote_tlbs(kvm);
- kmem_cache_destroy(kvm_vcpu_cache);
- sysdev_unregister(&kvm_sysdev);
- sysdev_class_unregister(&kvm_sysdev_class);
- unregister_reboot_notifier(&kvm_reboot_notifier);
- unregister_cpu_notifier(&kvm_cpu_notifier);
- on_each_cpu(hardware_disable, NULL, 0, 1);
- kvm_x86_ops->hardware_unsetup();
- kvm_x86_ops = NULL;
-void kvm_exit_x86(void)
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
- misc_deregister(&kvm_dev);
- kmem_cache_destroy(kvm_vcpu_cache);
- sysdev_unregister(&kvm_sysdev);
- sysdev_class_unregister(&kvm_sysdev_class);
- unregister_reboot_notifier(&kvm_reboot_notifier);
- unregister_cpu_notifier(&kvm_cpu_notifier);
- on_each_cpu(hardware_disable, NULL, 0, 1);
- kvm_x86_ops->hardware_unsetup();
- kvm_x86_ops = NULL;
+ return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE
+ || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
-static __init int kvm_init(void)
+static void vcpu_kick_intr(void *info)
- static struct page *bad_page;
- r = kvm_mmu_module_init();
- kvm_init_debug();
- kvm_init_msr_list();
- if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
- bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
- memset(__va(bad_page_address), 0, PAGE_SIZE);
- kvm_exit_debug();
- kvm_mmu_module_exit();
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
+ printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
-static __exit void kvm_exit(void)
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
- kvm_exit_debug();
- __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
- kvm_mmu_module_exit();
-module_init(kvm_init)
-module_exit(kvm_exit)
+ int ipi_pcpu = vcpu->cpu;
-EXPORT_SYMBOL_GPL(kvm_init_x86);
-EXPORT_SYMBOL_GPL(kvm_exit_x86);
+ if (waitqueue_active(&vcpu->wq)) {
+ wake_up_interruptible(&vcpu->wq);
+ ++vcpu->stat.halt_wakeup;
+ if (vcpu->guest_mode)
+ smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
new file mode 100644
index 0000000..b2f6f0c
+++ b/arch/x86/kvm/x86_emulate.c
+/******************************************************************************
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ * Copyright (c) 2005 Keir Fraser
+ * Linux coding style, mod r/m decoder, segment base fixes, real-mode
+ * privileged instructions:
+ * Copyright (C) 2006 Qumranet
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+#ifndef __KERNEL__
+#include <stdio.h>
+#include <stdint.h>
+#include <public/xen.h>
+#define DPRINTF(_f, _a ...) printf(_f , ## _a)
+#include <linux/kvm_host.h>
+#define DPRINTF(x...) do {} while (0)
+#include <linux/module.h>
+#include <asm/kvm_x86_emulate.h>
+ * Opcode effective-address decode tables.
+ * Note that we only emulate instructions that have at least one memory
+ * operand (excluding implicit stack references). We assume that stack
+ * references and instruction fetches will never occur in special memory
+ * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
+ * not be handled.
+/* Operand sizes: 8-bit operands or specified/overridden size. */
+#define ByteOp (1<<0) /* 8-bit operands. */
+/* Destination operand type. */
+#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
+#define DstReg (2<<1) /* Register operand. */
+#define DstMem (3<<1) /* Memory operand. */
+#define DstMask (3<<1)
+/* Source operand type. */
+#define SrcNone (0<<3) /* No source operand. */
+#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
+#define SrcReg (1<<3) /* Register operand. */
+#define SrcMem (2<<3) /* Memory operand. */
+#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
+#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
+#define SrcImm (5<<3) /* Immediate operand. */
+#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
+#define SrcMask (7<<3)
+/* Generic ModRM decode. */
+#define ModRM (1<<6)
+/* Destination is only written; never read. */
+#define Mov (1<<7)
+#define BitOp (1<<8)
+#define MemAbs (1<<9) /* Memory operand is absolute displacement */
+#define String (1<<10) /* String instruction (rep capable) */
+#define Stack (1<<11) /* Stack instruction (push/pop) */
+static u16 opcode_table[256] = {
+ /* 0x00 - 0x07 */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ /* 0x08 - 0x0F */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ /* 0x10 - 0x17 */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ /* 0x18 - 0x1F */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ /* 0x20 - 0x27 */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ SrcImmByte, SrcImm, 0, 0,
+ /* 0x28 - 0x2F */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ /* 0x30 - 0x37 */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ /* 0x38 - 0x3F */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+ /* 0x40 - 0x47 */
+ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
+ /* 0x48 - 0x4F */
+ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
+ /* 0x50 - 0x57 */
+ SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
+ SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
+ /* 0x58 - 0x5F */
+ DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
+ DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
+ /* 0x60 - 0x67 */
+ 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
+ /* 0x68 - 0x6F */
+ 0, 0, ImplicitOps | Mov | Stack, 0,
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
+ /* 0x70 - 0x77 */
+ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ /* 0x78 - 0x7F */
+ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ /* 0x80 - 0x87 */
+ ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ /* 0x88 - 0x8F */
+ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
+ ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack,
+ /* 0x90 - 0x9F */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
+ /* 0xA0 - 0xA7 */
+ ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
+ ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
+ ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+ ByteOp | ImplicitOps | String, ImplicitOps | String,
+ /* 0xA8 - 0xAF */
+ 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+ ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+ ByteOp | ImplicitOps | String, ImplicitOps | String,
+ /* 0xB0 - 0xBF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xC0 - 0xC7 */
+ ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
+ 0, ImplicitOps | Stack, 0, 0,
+ ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
+ /* 0xC8 - 0xCF */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xD0 - 0xD7 */
+ ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
+ ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
+ /* 0xD8 - 0xDF */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xE0 - 0xE7 */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xE8 - 0xEF */
+ ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps,
+ /* 0xF0 - 0xF7 */
+ ImplicitOps, ImplicitOps,
+ ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+ /* 0xF8 - 0xFF */
+ ImplicitOps, 0, ImplicitOps, ImplicitOps,
+ 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
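Each table entry packs the decode flags defined above into a u16. As an illustrative reading, not code from the patch, the 0x88 entry (mov r/m8, r8) decomposes as follows; the opcode_flags_example() wrapper is hypothetical:

/* Illustration only: how a consumer of the table reads the 0x88 entry. */
static void opcode_flags_example(void)
{
	u16 d = opcode_table[0x88];		/* ByteOp | DstMem | SrcReg | ModRM | Mov */
	int byteop  = d & ByteOp;		/* 8-bit operands */
	int dst_mem = (d & DstMask) == DstMem;	/* destination decoded from ModRM memory */
	int src_reg = (d & SrcMask) == SrcReg;	/* source is the ModRM reg field */
	(void)byteop; (void)dst_mem; (void)src_reg;
}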
+static u16 twobyte_table[256] = {
+ /* 0x00 - 0x0F */
+ 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
+ ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
+ /* 0x10 - 0x1F */
+ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x20 - 0x2F */
+ ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x30 - 0x3F */
+ ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x40 - 0x47 */
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ /* 0x48 - 0x4F */
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+ /* 0x50 - 0x5F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x60 - 0x6F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x70 - 0x7F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x80 - 0x8F */
+ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ /* 0x90 - 0x9F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xA0 - 0xA7 */
+ 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
+ /* 0xA8 - 0xAF */
+ 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
+ /* 0xB0 - 0xB7 */
+ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
+ DstMem | SrcReg | ModRM | BitOp,
+ 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem16 | ModRM | Mov,
+ /* 0xB8 - 0xBF */
+ 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
+ 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
+ DstReg | SrcMem16 | ModRM | Mov,
+ /* 0xC0 - 0xCF */
+ 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xD0 - 0xDF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xE0 - 0xEF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xF0 - 0xFF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+/* EFLAGS bit definitions. */
+#define EFLG_OF (1<<11)
+#define EFLG_DF (1<<10)
+#define EFLG_SF (1<<7)
+#define EFLG_ZF (1<<6)
+#define EFLG_AF (1<<4)
+#define EFLG_PF (1<<2)
+#define EFLG_CF (1<<0)
+ * Instruction emulation:
+ * Most instructions are emulated directly via a fragment of inline assembly
+ * code. This allows us to save/restore EFLAGS and thus very easily pick up
+ * any modified flags.
+#if defined(CONFIG_X86_64)
+#define _LO32 "k" /* force 32-bit operand */
+#define _STK "%%rsp" /* stack pointer */
+#elif defined(__i386__)
+#define _LO32 "" /* force 32-bit operand */
+#define _STK "%%esp" /* stack pointer */
+ * These EFLAGS bits are restored from saved value during emulation, and
+ * any changes are written back to the saved value after emulation.
+#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
+/* Before executing instruction: restore necessary bits in EFLAGS. */
+#define _PRE_EFLAGS(_sav, _msk, _tmp) \
+ /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
+ "movl %"_sav",%"_LO32 _tmp"; " \
+ "push %"_tmp"; " \
+ "push %"_tmp"; " \
+ "movl %"_msk",%"_LO32 _tmp"; " \
+ "andl %"_LO32 _tmp",("_STK"); " \
+ "notl %"_LO32 _tmp"; " \
+ "andl %"_LO32 _tmp",("_STK"); " \
+ "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \
+ "pop %"_tmp"; " \
+ "orl %"_LO32 _tmp",("_STK"); " \
+/* After executing instruction: write-back necessary bits in EFLAGS. */
+#define _POST_EFLAGS(_sav, _msk, _tmp) \
+ /* _sav |= EFLAGS & _msk; */ \
+ "pop %"_tmp"; " \
+ "andl %"_msk",%"_LO32 _tmp"; " \
+ "orl %"_LO32 _tmp",%"_sav"; "
+/* Raw emulation: instruction has two explicit operands. */
+#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
+ unsigned long _tmp; \
+ switch ((_dst).bytes) { \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "4", "2") \
+ _op"w %"_wx"3,%1; " \
+ _POST_EFLAGS("0", "4", "2") \
+ : "=m" (_eflags), "=m" ((_dst).val), \
+ : _wy ((_src).val), "i" (EFLAGS_MASK)); \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "4", "2") \
+ _op"l %"_lx"3,%1; " \
+ _POST_EFLAGS("0", "4", "2") \
+ : "=m" (_eflags), "=m" ((_dst).val), \
+ : _ly ((_src).val), "i" (EFLAGS_MASK)); \
+ __emulate_2op_8byte(_op, _src, _dst, \
+ _eflags, _qx, _qy); \
+#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
+ unsigned long _tmp; \
+ switch ((_dst).bytes) { \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "4", "2") \
+ _op"b %"_bx"3,%1; " \
+ _POST_EFLAGS("0", "4", "2") \
+ : "=m" (_eflags), "=m" ((_dst).val), \
+ : _by ((_src).val), "i" (EFLAGS_MASK)); \
+ __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
+ _wx, _wy, _lx, _ly, _qx, _qy); \
+/* Source operand is byte-sized and may be restricted to just %cl. */
+#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
+ __emulate_2op(_op, _src, _dst, _eflags, \
+ "b", "c", "b", "c", "b", "c", "b", "c")
+/* Source operand is byte, word, long or quad sized. */
+#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
+ __emulate_2op(_op, _src, _dst, _eflags, \
+ "b", "q", "w", "r", _LO32, "r", "", "r")
+/* Source operand is word, long or quad sized. */
+#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
+ __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
+ "w", "r", _LO32, "r", "", "r")
+/* Instruction has only one explicit operand (no source operand). */
+#define emulate_1op(_op, _dst, _eflags) \
+ unsigned long _tmp; \
+ switch ((_dst).bytes) { \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "3", "2") \
+ _POST_EFLAGS("0", "3", "2") \
+ : "=m" (_eflags), "=m" ((_dst).val), \
+ : "i" (EFLAGS_MASK)); \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "3", "2") \
+ _POST_EFLAGS("0", "3", "2") \
+ : "=m" (_eflags), "=m" ((_dst).val), \
+ : "i" (EFLAGS_MASK)); \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "3", "2") \
+ _POST_EFLAGS("0", "3", "2") \
+ : "=m" (_eflags), "=m" ((_dst).val), \
+ : "i" (EFLAGS_MASK)); \
+ __emulate_1op_8byte(_op, _dst, _eflags); \
+/* Emulate an instruction with quadword operands (x86/64 only). */
+#if defined(CONFIG_X86_64)
+#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "4", "2") \
+ _op"q %"_qx"3,%1; " \
+ _POST_EFLAGS("0", "4", "2") \
+ : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
+ : _qy ((_src).val), "i" (EFLAGS_MASK)); \
+#define __emulate_1op_8byte(_op, _dst, _eflags) \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "3", "2") \
+ _POST_EFLAGS("0", "3", "2") \
+ : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
+ : "i" (EFLAGS_MASK)); \
+#elif defined(__i386__)
+#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
+#define __emulate_1op_8byte(_op, _dst, _eflags)
+#endif /* __i386__ */
+/* Fetch next part of the instruction being emulated. */
+#define insn_fetch(_type, _size, _eip) \
+({ unsigned long _x; \
+ rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
+ (_eip) += (_size); \
+/* Access/update address held in a register, based on addressing mode. */
+#define address_mask(reg) \
+ ((c->ad_bytes == sizeof(unsigned long)) ? \
+ (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
+#define register_address(base, reg) \
+ ((base) + address_mask(reg))
+#define register_address_increment(reg, inc) \
+ /* signed type ensures sign extension to long */ \
+ int _inc = (inc); \
+ if (c->ad_bytes == sizeof(unsigned long)) \
+ (reg) = ((reg) & \
+ ~((1UL << (c->ad_bytes << 3)) - 1)) | \
+ (((reg) + _inc) & \
+ ((1UL << (c->ad_bytes << 3)) - 1)); \
+#define JMP_REL(rel) \
+ register_address_increment(c->eip, rel); \
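For a 16-bit address size (c->ad_bytes == 2) the mask is (1UL << 16) - 1, so only the low word of the register takes part in address formation. A standalone rendition of the masking, for illustration only:

/* Standalone rendition of address_mask(); illustrative, not part of the patch. */
static unsigned long address_mask_example(unsigned long reg, int ad_bytes)
{
	if (ad_bytes == sizeof(unsigned long))
		return reg;				/* full-width addressing */
	return reg & ((1UL << (ad_bytes << 3)) - 1);	/* e.g. reg & 0xffff when ad_bytes == 2 */
}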
+static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ unsigned long linear, u8 *dest)
+ struct fetch_cache *fc = &ctxt->decode.fetch;
+ if (linear < fc->start || linear >= fc->end) {
+ size = min(15UL, PAGE_SIZE - offset_in_page(linear));
+ rc = ops->read_std(linear, fc->data, size, ctxt->vcpu);
+ fc->start = linear;
+ fc->end = linear + size;
+ *dest = fc->data[linear - fc->start];
+static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ unsigned long eip, void *dest, unsigned size)
+ eip += ctxt->cs_base;
+ rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
+ * Given the 'reg' portion of a ModRM byte, and a register block, return a
+ * pointer into the block that addresses the relevant register.
+ * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
+static void *decode_register(u8 modrm_reg, unsigned long *regs,
+ int highbyte_regs)
+ p = &regs[modrm_reg];
+ if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
+ p = (unsigned char *)&regs[modrm_reg & 3] + 1;
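Without a REX prefix, byte registers 4 to 7 name AH, CH, DH and BH, which live in byte 1 of RAX through RBX; so, as an illustration (the regs array standing in for c->regs), register 7 with highbyte_regs set resolves like this:

/* Illustration only: reg 7 with highbyte_regs set is BH, byte 1 of regs[3] (RBX). */
u8 *bh = (u8 *)&regs[7 & 3] + 1;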
+static int read_descriptor(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 *size, unsigned long *address, int op_bytes)
+ if (op_bytes == 2)
+ rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
+ rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
+static int test_cc(unsigned int condition, unsigned int flags)
+ switch ((condition & 15) >> 1) {
+ rc |= (flags & EFLG_OF);
+ case 1: /* b/c/nae */
+ rc |= (flags & EFLG_CF);
+ case 2: /* z/e */
+ rc |= (flags & EFLG_ZF);
+ case 3: /* be/na */
+ rc |= (flags & (EFLG_CF|EFLG_ZF));
+ rc |= (flags & EFLG_SF);
+ case 5: /* p/pe */
+ rc |= (flags & EFLG_PF);
+ case 7: /* le/ng */
+ rc |= (flags & EFLG_ZF);
+ /* fall through */
+ case 6: /* l/nge */
+ rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
+ /* Odd condition identifiers (lsb == 1) have inverted sense. */
+ return (!!rc ^ (condition & 1));
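A worked example, not taken from the patch: JNZ is condition 0x5, so pair (5 & 15) >> 1 == 2 tests ZF, and the set low bit inverts the sense, meaning the branch is taken exactly when ZF is clear. Assuming a live ctxt as in the surrounding code:

/* Worked example: test_cc(0x5, flags) is nonzero iff ZF is clear (JNZ taken). */
int jnz_taken = test_cc(0x5, ctxt->eflags);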
+static void decode_register_operand(struct operand *op,
+ struct decode_cache *c,
+ int inhibit_bytereg)
+ unsigned reg = c->modrm_reg;
+ int highbyte_regs = c->rex_prefix == 0;
+ if (!(c->d & ModRM))
+ reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
+ op->type = OP_REG;
+ if ((c->d & ByteOp) && !inhibit_bytereg) {
+ op->ptr = decode_register(reg, c->regs, highbyte_regs);
+ op->val = *(u8 *)op->ptr;
+ op->ptr = decode_register(reg, c->regs, 0);
+ op->bytes = c->op_bytes;
+ switch (op->bytes) {
+ op->val = *(u16 *)op->ptr;
+ op->val = *(u32 *)op->ptr;
+ op->val = *(u64 *) op->ptr;
+ op->orig_val = op->val;
+static int decode_modrm(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+ struct decode_cache *c = &ctxt->decode;
+ int index_reg = 0, base_reg = 0, scale, rip_relative = 0;
+ if (c->rex_prefix) {
+ c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
+ index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
+ c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REX.B */
+ c->modrm = insn_fetch(u8, 1, c->eip);
+ c->modrm_mod |= (c->modrm & 0xc0) >> 6;
+ c->modrm_reg |= (c->modrm & 0x38) >> 3;
+ c->modrm_rm |= (c->modrm & 0x07);
+ c->use_modrm_ea = 1;
+ if (c->modrm_mod == 3) {
+ c->modrm_val = *(unsigned long *)
+ decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
+ if (c->ad_bytes == 2) {
+ unsigned bx = c->regs[VCPU_REGS_RBX];
+ unsigned bp = c->regs[VCPU_REGS_RBP];
+ unsigned si = c->regs[VCPU_REGS_RSI];
+ unsigned di = c->regs[VCPU_REGS_RDI];
+ /* 16-bit ModR/M decode. */
+ switch (c->modrm_mod) {
+ if (c->modrm_rm == 6)
+ c->modrm_ea += insn_fetch(u16, 2, c->eip);
+ c->modrm_ea += insn_fetch(s8, 1, c->eip);
+ c->modrm_ea += insn_fetch(u16, 2, c->eip);
+ switch (c->modrm_rm) {
+ c->modrm_ea += bx + si;
+ c->modrm_ea += bx + di;
+ c->modrm_ea += bp + si;
+ c->modrm_ea += bp + di;
+ c->modrm_ea += si;
+ c->modrm_ea += di;
+ if (c->modrm_mod != 0)
+ c->modrm_ea += bp;
+ c->modrm_ea += bx;
+ if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
+ (c->modrm_rm == 6 && c->modrm_mod != 0))
+ if (!c->override_base)
+ c->override_base = &ctxt->ss_base;
+ c->modrm_ea = (u16)c->modrm_ea;
+ /* 32/64-bit ModR/M decode. */
+ switch (c->modrm_rm) {
+ sib = insn_fetch(u8, 1, c->eip);
+ index_reg |= (sib >> 3) & 7;
+ base_reg |= sib & 7;
+ scale = sib >> 6;
+ switch (base_reg) {
+ if (c->modrm_mod != 0)
+ c->modrm_ea += c->regs[base_reg];
+ insn_fetch(s32, 4, c->eip);
+ c->modrm_ea += c->regs[base_reg];
+ switch (index_reg) {
+ c->modrm_ea += c->regs[index_reg] << scale;
+ if (c->modrm_mod != 0)
+ c->modrm_ea += c->regs[c->modrm_rm];
+ else if (ctxt->mode == X86EMUL_MODE_PROT64)
+ rip_relative = 1;
+ c->modrm_ea += c->regs[c->modrm_rm];
+ switch (c->modrm_mod) {
+ if (c->modrm_rm == 5)
+ c->modrm_ea += insn_fetch(s32, 4, c->eip);
+ c->modrm_ea += insn_fetch(s8, 1, c->eip);
+ c->modrm_ea += insn_fetch(s32, 4, c->eip);
+ if (rip_relative) {
+ c->modrm_ea += c->eip;
+ switch (c->d & SrcMask) {
+ c->modrm_ea += 1;
+ if (c->d & ByteOp)
+ c->modrm_ea += 1;
+ if (c->op_bytes == 8)
+ c->modrm_ea += 4;
+ c->modrm_ea += c->op_bytes;
+static int decode_abs(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+ struct decode_cache *c = &ctxt->decode;
+ switch (c->ad_bytes) {
+ c->modrm_ea = insn_fetch(u16, 2, c->eip);
+ c->modrm_ea = insn_fetch(u32, 4, c->eip);
+ c->modrm_ea = insn_fetch(u64, 8, c->eip);
+x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+ struct decode_cache *c = &ctxt->decode;
+ int mode = ctxt->mode;
+ int def_op_bytes, def_ad_bytes;
+ /* Shadow copy of register state. Committed on successful emulation. */
+ memset(c, 0, sizeof(struct decode_cache));
+ c->eip = ctxt->vcpu->arch.rip;
+ memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+ case X86EMUL_MODE_REAL:
+ case X86EMUL_MODE_PROT16:
+ def_op_bytes = def_ad_bytes = 2;
+ case X86EMUL_MODE_PROT32:
+ def_op_bytes = def_ad_bytes = 4;
+#ifdef CONFIG_X86_64
+ case X86EMUL_MODE_PROT64:
+ def_op_bytes = 4;
+ def_ad_bytes = 8;
+ c->op_bytes = def_op_bytes;
+ c->ad_bytes = def_ad_bytes;
+ /* Legacy prefixes. */
+ switch (c->b = insn_fetch(u8, 1, c->eip)) {
+ case 0x66: /* operand-size override */
+ /* switch between 2/4 bytes */
+ c->op_bytes = def_op_bytes ^ 6;
+ case 0x67: /* address-size override */
+ if (mode == X86EMUL_MODE_PROT64)
+ /* switch between 4/8 bytes */
+ c->ad_bytes = def_ad_bytes ^ 12;
+ /* switch between 2/4 bytes */
+ c->ad_bytes = def_ad_bytes ^ 6;
+ case 0x2e: /* CS override */
+ c->override_base = &ctxt->cs_base;
+ case 0x3e: /* DS override */
+ c->override_base = &ctxt->ds_base;
+ case 0x26: /* ES override */
+ c->override_base = &ctxt->es_base;
+ case 0x64: /* FS override */
+ c->override_base = &ctxt->fs_base;
+ case 0x65: /* GS override */
+ c->override_base = &ctxt->gs_base;
+ case 0x36: /* SS override */
+ c->override_base = &ctxt->ss_base;
+ case 0x40 ... 0x4f: /* REX */
+ if (mode != X86EMUL_MODE_PROT64)
+ goto done_prefixes;
+ c->rex_prefix = c->b;
+ case 0xf0: /* LOCK */
+ c->lock_prefix = 1;
+ case 0xf2: /* REPNE/REPNZ */
+ c->rep_prefix = REPNE_PREFIX;
+ case 0xf3: /* REP/REPE/REPZ */
+ c->rep_prefix = REPE_PREFIX;
+ goto done_prefixes;
+ /* Any legacy prefix after a REX prefix nullifies its effect. */
+ c->rex_prefix = 0;
+ /* REX prefix. */
+ if (c->rex_prefix)
+ if (c->rex_prefix & 8)
+ c->op_bytes = 8; /* REX.W */
+ /* Opcode byte(s). */
+ c->d = opcode_table[c->b];
+ /* Two-byte opcode? */
+ if (c->b == 0x0f) {
+ c->b = insn_fetch(u8, 1, c->eip);
+ c->d = twobyte_table[c->b];
+ /* Unrecognised? */
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
+ /* ModRM and SIB bytes. */
+ if (c->d & ModRM)
+ rc = decode_modrm(ctxt, ops);
+ else if (c->d & MemAbs)
+ rc = decode_abs(ctxt, ops);
+ if (!c->override_base)
+ c->override_base = &ctxt->ds_base;
+ if (mode == X86EMUL_MODE_PROT64 &&
+ c->override_base != &ctxt->fs_base &&
+ c->override_base != &ctxt->gs_base)
+ c->override_base = NULL;
+ if (c->override_base)
+ c->modrm_ea += *c->override_base;
+ if (c->ad_bytes != 8)
+ c->modrm_ea = (u32)c->modrm_ea;
+ * Decode and fetch the source operand: register, memory
+ switch (c->d & SrcMask) {
+ decode_register_operand(&c->src, c, 0);
+ c->src.bytes = 2;
+ goto srcmem_common;
+ c->src.bytes = 4;
+ goto srcmem_common;
+ c->src.bytes = (c->d & ByteOp) ? 1 :
+ /* Don't fetch the address for invlpg: it could be unmapped. */
+ if (c->twobyte && c->b == 0x01
+ && c->modrm_reg == 7)
+ * For instructions with a ModR/M byte, switch to register
+ * access if Mod = 3.
+ if ((c->d & ModRM) && c->modrm_mod == 3) {
+ c->src.type = OP_REG;
+ c->src.type = OP_MEM;
+ c->src.type = OP_IMM;
+ c->src.ptr = (unsigned long *)c->eip;
+ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ if (c->src.bytes == 8)
+ c->src.bytes = 4;
+ /* NB. Immediates are sign-extended as necessary. */
+ switch (c->src.bytes) {
+ c->src.val = insn_fetch(s8, 1, c->eip);
+ c->src.val = insn_fetch(s16, 2, c->eip);
+ c->src.val = insn_fetch(s32, 4, c->eip);
+ c->src.type = OP_IMM;
+ c->src.ptr = (unsigned long *)c->eip;
+ c->src.bytes = 1;
+ c->src.val = insn_fetch(s8, 1, c->eip);
+ /* Decode and fetch the destination operand: register or memory. */
+ switch (c->d & DstMask) {
+ case ImplicitOps:
+ /* Special instructions do their own operand decoding. */
+ decode_register_operand(&c->dst, c,
+ c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
+ * For instructions with a ModR/M byte, switch to register
+ * access if Mod = 3.
+ if ((c->d & ModRM) && c->modrm_mod == 3)
+ c->dst.type = OP_REG;
+ c->dst.type = OP_MEM;
+ return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
+ struct decode_cache *c = &ctxt->decode;
+ c->dst.type = OP_MEM;
+ c->dst.bytes = c->op_bytes;
+ c->dst.val = c->src.val;
+ register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
+ c->dst.ptr = (void *) register_address(ctxt->ss_base,
+ c->regs[VCPU_REGS_RSP]);
+static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+ struct decode_cache *c = &ctxt->decode;
+ rc = ops->read_std(register_address(ctxt->ss_base,
+ c->regs[VCPU_REGS_RSP]),
+ &c->dst.val, c->dst.bytes, ctxt->vcpu);
+ register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
+static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
+ struct decode_cache *c = &ctxt->decode;
+ switch (c->modrm_reg) {
+ case 0: /* rol */
+ emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
+ case 1: /* ror */
+ emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
+ case 2: /* rcl */
+ emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
+ case 3: /* rcr */
+ emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
+ case 4: /* sal/shl */
+ case 6: /* sal/shl */
+ emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
+ case 5: /* shr */
+ emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
+ case 7: /* sar */
+ emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
+static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+ struct decode_cache *c = &ctxt->decode;
+ switch (c->modrm_reg) {
+ case 0 ... 1: /* test */
+ * Special case in Grp3: test has an immediate
+ * source operand.
+ c->src.type = OP_IMM;
+ c->src.ptr = (unsigned long *)c->eip;
+ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ if (c->src.bytes == 8)
+ c->src.bytes = 4;
+ switch (c->src.bytes) {
+ c->src.val = insn_fetch(s8, 1, c->eip);
+ c->src.val = insn_fetch(s16, 2, c->eip);
+ c->src.val = insn_fetch(s32, 4, c->eip);
+ emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+ case 2: /* not */
+ c->dst.val = ~c->dst.val;
+ case 3: /* neg */
+ emulate_1op("neg", c->dst, ctxt->eflags);
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ rc = X86EMUL_UNHANDLEABLE;
+static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+ struct decode_cache *c = &ctxt->decode;
+ switch (c->modrm_reg) {
+ case 0: /* inc */
+ emulate_1op("inc", c->dst, ctxt->eflags);
+ case 1: /* dec */
+ emulate_1op("dec", c->dst, ctxt->eflags);
+ case 4: /* jmp abs */
+ if (c->b == 0xff)
+ c->eip = c->dst.val;
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ return X86EMUL_UNHANDLEABLE;
+ case 6: /* push */
+ /* 64-bit mode: PUSH always pushes a 64-bit operand. */
+ if (ctxt->mode == X86EMUL_MODE_PROT64) {
+ c->dst.bytes = 8;
+ rc = ops->read_std((unsigned long)c->dst.ptr,
+ &c->dst.val, 8, ctxt->vcpu);
+ register_address_increment(c->regs[VCPU_REGS_RSP],
+ rc = ops->write_emulated(register_address(ctxt->ss_base,
+ c->regs[VCPU_REGS_RSP]),
+ c->dst.bytes, ctxt->vcpu);
+ c->dst.type = OP_NONE;
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ return X86EMUL_UNHANDLEABLE;
+static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ unsigned long memop)
+ struct decode_cache *c = &ctxt->decode;
+ rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
+ if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
+ ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
+ c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
+ c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
+ ctxt->eflags &= ~EFLG_ZF;
+ new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
+ (u32) c->regs[VCPU_REGS_RBX];
+ rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
+ ctxt->eflags |= EFLG_ZF;
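Grp9 here is CMPXCHG8B: compare EDX:EAX against the 64-bit memory operand, store ECX:EBX and set ZF on a match, otherwise load the old value into EDX:EAX and clear ZF. A plain-C restatement of those semantics, as a sketch rather than the handler itself (mem64, eax, ebx, ecx, edx and eflags are stand-ins):

/* CMPXCHG8B semantics restated as a sketch; all names here are stand-ins. */
u64 old = mem64, cmp = ((u64)edx << 32) | eax;
if (old == cmp) {
	mem64 = ((u64)ecx << 32) | ebx;	/* store ECX:EBX */
	eflags |= EFLG_ZF;
} else {
	edx = (u32)(old >> 32);		/* load the old value into EDX:EAX */
	eax = (u32)old;
	eflags &= ~EFLG_ZF;
}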
+static inline int writeback(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+ struct decode_cache *c = &ctxt->decode;
+ switch (c->dst.type) {
+ /* The 4-byte case *is* correct:
+ * in 64-bit mode we zero-extend.
+ switch (c->dst.bytes) {
+ *(u8 *)c->dst.ptr = (u8)c->dst.val;
+ *(u16 *)c->dst.ptr = (u16)c->dst.val;
+ *c->dst.ptr = (u32)c->dst.val;
+ break; /* 64b: zero-ext */
+ *c->dst.ptr = c->dst.val;
+ if (c->lock_prefix)
+ rc = ops->cmpxchg_emulated(
+ (unsigned long)c->dst.ptr,
+ &c->dst.orig_val,
+ rc = ops->write_emulated(
+ (unsigned long)c->dst.ptr,
+ /* no writeback */
+x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+ unsigned long memop = 0;
+ unsigned long saved_eip;
+ struct decode_cache *c = &ctxt->decode;
+ /* Shadow copy of register state. Committed on successful emulation.
+ * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
+ memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+ saved_eip = c->eip;
+ if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
+ memop = c->modrm_ea;
+ if (c->rep_prefix && (c->d & String)) {
+ /* All REP prefixes have the same first termination condition */
+ if (c->regs[VCPU_REGS_RCX] == 0) {
+ ctxt->vcpu->arch.rip = c->eip;
+ /* The second termination condition only applies to REPE
+ * and REPNE. Test whether the repeat string operation prefix
+ * is REPE/REPZ or REPNE/REPNZ and, if so, check the
+ * corresponding termination condition according to:
+ * - if REPE/REPZ and ZF = 0 then done
+ * - if REPNE/REPNZ and ZF = 1 then done
+ if ((c->b == 0xa6) || (c->b == 0xa7) ||
+ (c->b == 0xae) || (c->b == 0xaf)) {
+ if ((c->rep_prefix == REPE_PREFIX) &&
+ ((ctxt->eflags & EFLG_ZF) == 0)) {
+ ctxt->vcpu->arch.rip = c->eip;
+ if ((c->rep_prefix == REPNE_PREFIX) &&
+ ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
+ ctxt->vcpu->arch.rip = c->eip;
+ c->regs[VCPU_REGS_RCX]--;
+ c->eip = ctxt->vcpu->arch.rip;
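Restated, the REP handling above runs one string operation per pass through the emulator, decrementing RCX each time, and re-checks the ZF-based condition only for CMPS/SCAS (opcodes 0xa6/0xa7/0xae/0xaf). Roughly, as a sketch (is_cmps_or_scas, repe and repne are hypothetical flags, not variables in the patch):

/* Sketch of the REP termination logic; the three predicates are hypothetical. */
if (regs[VCPU_REGS_RCX] == 0)
	goto done;			/* first condition: count exhausted */
if (is_cmps_or_scas && repe && !(eflags & EFLG_ZF))
	goto done;			/* REPE/REPZ stops when ZF = 0 */
if (is_cmps_or_scas && repne && (eflags & EFLG_ZF))
	goto done;			/* REPNE/REPNZ stops when ZF = 1 */
regs[VCPU_REGS_RCX]--;			/* otherwise run one more iteration */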
+ if (c->src.type == OP_MEM) {
+ c->src.ptr = (unsigned long *)memop;
+ rc = ops->read_emulated((unsigned long)c->src.ptr,
+ c->src.orig_val = c->src.val;
+ if ((c->d & DstMask) == ImplicitOps)
+ goto special_insn;
+ if (c->dst.type == OP_MEM) {
+ c->dst.ptr = (unsigned long *)memop;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ if (c->d & BitOp) {
+ unsigned long mask = ~(c->dst.bytes * 8 - 1);
+ c->dst.ptr = (void *)c->dst.ptr +
+ (c->src.val & mask) / 8;
+ if (!(c->d & Mov) &&
+ /* optimisation - avoid slow emulated read */
+ ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
+ c->dst.bytes, ctxt->vcpu)) != 0))
+ c->dst.orig_val = c->dst.val;
+ goto twobyte_insn;
+ case 0x00 ... 0x05:
+ emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+ case 0x08 ... 0x0d:
+ emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+ case 0x10 ... 0x15:
+ emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
+ case 0x18 ... 0x1d:
+ emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
+ case 0x20 ... 0x23:
+ emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
+ case 0x24: /* and al imm8 */
+ c->dst.type = OP_REG;
+ c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+ c->dst.val = *(u8 *)c->dst.ptr;
+ c->dst.bytes = 1;
+ c->dst.orig_val = c->dst.val;
+ case 0x25: /* and ax imm16, or eax imm32 */
+ c->dst.type = OP_REG;
+ c->dst.bytes = c->op_bytes;
+ c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+ if (c->op_bytes == 2)
+ c->dst.val = *(u16 *)c->dst.ptr;
+ c->dst.val = *(u32 *)c->dst.ptr;
+ c->dst.orig_val = c->dst.val;
+ case 0x28 ... 0x2d:
+ emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
+ case 0x30 ... 0x35:
+ emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
+ case 0x38 ... 0x3d:
+ emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+ case 0x40 ... 0x47: /* inc r16/r32 */
+ emulate_1op("inc", c->dst, ctxt->eflags);
+ case 0x48 ... 0x4f: /* dec r16/r32 */
+ emulate_1op("dec", c->dst, ctxt->eflags);
+ case 0x50 ... 0x57: /* push reg */
+ c->dst.type = OP_MEM;
+ c->dst.bytes = c->op_bytes;
+ c->dst.val = c->src.val;
+ register_address_increment(c->regs[VCPU_REGS_RSP],
+ c->dst.ptr = (void *) register_address(
+ ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
+ case 0x58 ... 0x5f: /* pop reg */
+ if ((rc = ops->read_std(register_address(ctxt->ss_base,
+ c->regs[VCPU_REGS_RSP]), c->dst.ptr,
+ c->op_bytes, ctxt->vcpu)) != 0)
+ register_address_increment(c->regs[VCPU_REGS_RSP],
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ case 0x63: /* movsxd */
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ goto cannot_emulate;
+ c->dst.val = (s32) c->src.val;
+ case 0x6a: /* push imm8 */
+ c->src.val = insn_fetch(s8, 1, c->eip);
+ emulate_push(ctxt);
+ case 0x6c: /* insb */
+ case 0x6d: /* insw/insd */
+ if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
+ (c->d & ByteOp) ? 1 : c->op_bytes,
+ address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+ (ctxt->eflags & EFLG_DF),
+ register_address(ctxt->es_base,
+ c->regs[VCPU_REGS_RDI]),
+ c->regs[VCPU_REGS_RDX]) == 0) {
+ c->eip = saved_eip;
+ case 0x6e: /* outsb */
+ case 0x6f: /* outsw/outsd */
+ if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
+ (c->d & ByteOp) ? 1 : c->op_bytes,
+ address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+ (ctxt->eflags & EFLG_DF),
+ register_address(c->override_base ?
+ *c->override_base :
+ c->regs[VCPU_REGS_RSI]),
+ c->regs[VCPU_REGS_RDX]) == 0) {
+ c->eip = saved_eip;
+ case 0x70 ... 0x7f: /* jcc (short) */ {
+ int rel = insn_fetch(s8, 1, c->eip);
+ if (test_cc(c->b, ctxt->eflags))
+ case 0x80 ... 0x83: /* Grp1 */
+ switch (c->modrm_reg) {
+ case 0x84 ... 0x85:
+ emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+ case 0x86 ... 0x87: /* xchg */
+ /* Write back the register source. */
+ switch (c->dst.bytes) {
+ *(u8 *) c->src.ptr = (u8) c->dst.val;
+ *(u16 *) c->src.ptr = (u16) c->dst.val;
+ *c->src.ptr = (u32) c->dst.val;
+ break; /* 64b reg: zero-extend */
+ *c->src.ptr = c->dst.val;
+ * Write back the memory destination with implicit LOCK
+ c->dst.val = c->src.val;
+ c->lock_prefix = 1;
+ case 0x88 ... 0x8b: /* mov */
+ case 0x8d: /* lea r16/r32, m */
+ c->dst.val = c->modrm_val;
+ case 0x8f: /* pop (sole member of Grp1a) */
+ rc = emulate_grp1a(ctxt, ops);
+ case 0x9c: /* pushf */
+ c->src.val = (unsigned long) ctxt->eflags;
+ emulate_push(ctxt);
+ case 0x9d: /* popf */
+ c->dst.ptr = (unsigned long *) &ctxt->eflags;
+ goto pop_instruction;
+ case 0xa0 ... 0xa1: /* mov */
+ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+ c->dst.val = c->src.val;
+ case 0xa2 ... 0xa3: /* mov */
+ c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
+ case 0xa4 ... 0xa5: /* movs */
+ c->dst.type = OP_MEM;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.ptr = (unsigned long *)register_address(
+ c->regs[VCPU_REGS_RDI]);
+ if ((rc = ops->read_emulated(register_address(
+ c->override_base ? *c->override_base :
+ c->regs[VCPU_REGS_RSI]),
+ c->dst.bytes, ctxt->vcpu)) != 0)
+ register_address_increment(c->regs[VCPU_REGS_RSI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ register_address_increment(c->regs[VCPU_REGS_RDI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ case 0xa6 ... 0xa7: /* cmps */
+ c->src.type = OP_NONE; /* Disable writeback. */
+ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->src.ptr = (unsigned long *)register_address(
+ c->override_base ? *c->override_base :
+ c->regs[VCPU_REGS_RSI]);
+ if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
+ ctxt->vcpu)) != 0)
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.ptr = (unsigned long *)register_address(
+ c->regs[VCPU_REGS_RDI]);
+ if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
+ ctxt->vcpu)) != 0)
+ DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
+ emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+ register_address_increment(c->regs[VCPU_REGS_RSI],
+ (ctxt->eflags & EFLG_DF) ? -c->src.bytes
+ register_address_increment(c->regs[VCPU_REGS_RDI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ case 0xaa ... 0xab: /* stos */
+ c->dst.type = OP_MEM;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.ptr = (unsigned long *)register_address(
+ c->regs[VCPU_REGS_RDI]);
+ c->dst.val = c->regs[VCPU_REGS_RAX];
+ register_address_increment(c->regs[VCPU_REGS_RDI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ case 0xac ... 0xad: /* lods */
+ c->dst.type = OP_REG;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+ if ((rc = ops->read_emulated(register_address(
+ c->override_base ? *c->override_base :
+ c->regs[VCPU_REGS_RSI]),
+ ctxt->vcpu)) != 0)
+ register_address_increment(c->regs[VCPU_REGS_RSI],
+ (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+ case 0xae ... 0xaf: /* scas */
+ DPRINTF("Urk! I don't handle SCAS.\n");
+ goto cannot_emulate;
+ case 0xc0 ... 0xc1:
+ emulate_grp2(ctxt);
+ case 0xc3: /* ret */
+ c->dst.ptr = &c->eip;
+ goto pop_instruction;
+ case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
+ c->dst.val = c->src.val;
+ case 0xd0 ... 0xd1: /* Grp2 */
+ emulate_grp2(ctxt);
+ case 0xd2 ... 0xd3: /* Grp2 */
+ c->src.val = c->regs[VCPU_REGS_RCX];
+ emulate_grp2(ctxt);
+ case 0xe8: /* call (near) */ {
+ switch (c->op_bytes) {
+ rel = insn_fetch(s16, 2, c->eip);
+ rel = insn_fetch(s32, 4, c->eip);
+ DPRINTF("Call: Invalid op_bytes\n");
+ goto cannot_emulate;
+ c->src.val = (unsigned long) c->eip;
12643
+ * emulate_push() saves a value of c->op_bytes size, so set
+ * op_bytes to the address size here to make sure the whole
+ * value of eip is pushed
+ c->op_bytes = c->ad_bytes;
+ emulate_push(ctxt);
+ case 0xe9: /* jmp rel */
+ case 0xeb: /* jmp rel short */
+ JMP_REL(c->src.val);
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ case 0xf4: /* hlt */
+ ctxt->vcpu->arch.halt_request = 1;
+ case 0xf5: /* cmc */
+ /* complement carry flag from eflags reg */
+ ctxt->eflags ^= EFLG_CF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ case 0xf6 ... 0xf7: /* Grp3 */
+ rc = emulate_grp3(ctxt, ops);
+ case 0xf8: /* clc */
+ ctxt->eflags &= ~EFLG_CF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ case 0xfa: /* cli */
+ ctxt->eflags &= ~X86_EFLAGS_IF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ case 0xfb: /* sti */
+ ctxt->eflags |= X86_EFLAGS_IF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ case 0xfe ... 0xff: /* Grp4/Grp5 */
+ rc = emulate_grp45(ctxt, ops);
+ rc = writeback(ctxt, ops);
+ /* Commit shadow register state. */
+ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
+ ctxt->vcpu->arch.rip = c->eip;
+ if (rc == X86EMUL_UNHANDLEABLE) {
+ c->eip = saved_eip;
+ case 0x01: /* lgdt, lidt, lmsw */
+ switch (c->modrm_reg) {
+ unsigned long address;
+ case 0: /* vmcall */
+ if (c->modrm_mod != 3 || c->modrm_rm != 1)
+ goto cannot_emulate;
+ rc = kvm_fix_hypercall(ctxt->vcpu);
+ kvm_emulate_hypercall(ctxt->vcpu);
+ case 2: /* lgdt */
+ rc = read_descriptor(ctxt, ops, c->src.ptr,
+ &size, &address, c->op_bytes);
+ realmode_lgdt(ctxt->vcpu, size, address);
+ case 3: /* lidt/vmmcall */
+ if (c->modrm_mod == 3 && c->modrm_rm == 1) {
+ rc = kvm_fix_hypercall(ctxt->vcpu);
+ kvm_emulate_hypercall(ctxt->vcpu);
+ rc = read_descriptor(ctxt, ops, c->src.ptr,
+ realmode_lidt(ctxt->vcpu, size, address);
+ case 4: /* smsw */
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+ *(u16 *)&c->regs[c->modrm_rm]
+ = realmode_get_cr(ctxt->vcpu, 0);
+ case 6: /* lmsw */
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+ realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
+ case 7: /* invlpg */
+ emulate_invlpg(ctxt->vcpu, memop);
+ goto cannot_emulate;
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+ emulate_clts(ctxt->vcpu);
+ c->dst.type = OP_NONE;
+ case 0x08: /* invd */
+ case 0x09: /* wbinvd */
+ case 0x0d: /* GrpP (prefetch) */
+ case 0x18: /* Grp16 (prefetch/nop) */
+ c->dst.type = OP_NONE;
+ case 0x20: /* mov cr, reg */
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+ c->regs[c->modrm_rm] =
+ realmode_get_cr(ctxt->vcpu, c->modrm_reg);
+ c->dst.type = OP_NONE; /* no writeback */
+ case 0x21: /* mov from dr to reg */
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+ rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
+ goto cannot_emulate;
+ c->dst.type = OP_NONE; /* no writeback */
+ case 0x22: /* mov reg, cr */
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+ realmode_set_cr(ctxt->vcpu,
+ c->modrm_reg, c->modrm_val, &ctxt->eflags);
+ c->dst.type = OP_NONE;
+ case 0x23: /* mov from reg to dr */
+ if (c->modrm_mod != 3)
+ goto cannot_emulate;
+ rc = emulator_set_dr(ctxt, c->modrm_reg,
+ c->regs[c->modrm_rm]);
+ goto cannot_emulate;
+ c->dst.type = OP_NONE; /* no writeback */
+ msr_data = (u32)c->regs[VCPU_REGS_RAX]
+ | ((u64)c->regs[VCPU_REGS_RDX] << 32);
+ rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
+ kvm_inject_gp(ctxt->vcpu, 0);
+ c->eip = ctxt->vcpu->arch.rip;
+ rc = X86EMUL_CONTINUE;
+ c->dst.type = OP_NONE;
+ rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
+ kvm_inject_gp(ctxt->vcpu, 0);
+ c->eip = ctxt->vcpu->arch.rip;
+ c->regs[VCPU_REGS_RAX] = (u32)msr_data;
+ c->regs[VCPU_REGS_RDX] = msr_data >> 32;
+ rc = X86EMUL_CONTINUE;
+ c->dst.type = OP_NONE;
+ case 0x40 ... 0x4f: /* cmov */
+ c->dst.val = c->dst.orig_val = c->src.val;
+ if (!test_cc(c->b, ctxt->eflags))
+ c->dst.type = OP_NONE; /* no writeback */
+ case 0x80 ... 0x8f: /* jnz rel, etc. */ {
+ switch (c->op_bytes) {
+ rel = insn_fetch(s16, 2, c->eip);
+ rel = insn_fetch(s32, 4, c->eip);
+ rel = insn_fetch(s64, 8, c->eip);
+ DPRINTF("jnz: Invalid op_bytes\n");
+ goto cannot_emulate;
+ if (test_cc(c->b, ctxt->eflags))
+ c->dst.type = OP_NONE;
+ c->dst.type = OP_NONE;
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
+ emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
+ emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
+ case 0xb0 ... 0xb1: /* cmpxchg */
+ * Save real source value, then compare EAX against
+ c->src.orig_val = c->src.val;
+ c->src.val = c->regs[VCPU_REGS_RAX];
+ emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+ if (ctxt->eflags & EFLG_ZF) {
+ /* Success: write back to memory. */
+ c->dst.val = c->src.orig_val;
+ /* Failure: write the value we saw to EAX. */
+ c->dst.type = OP_REG;
+ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
+ emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
+ case 0xb6 ... 0xb7: /* movzx */
+ c->dst.bytes = c->op_bytes;
+ c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
+ : (u16) c->src.val;
+ case 0xba: /* Grp8 */
+ switch (c->modrm_reg & 3) {
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
+ emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
+ case 0xbe ... 0xbf: /* movsx */
+ c->dst.bytes = c->op_bytes;
+ c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
+ (s16) c->src.val;
+ case 0xc3: /* movnti */
+ c->dst.bytes = c->op_bytes;
+ c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
+ (u64) c->src.val;
+ case 0xc7: /* Grp9 (cmpxchg8b) */
+ rc = emulate_grp9(ctxt, ops, memop);
+ c->dst.type = OP_NONE;
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ c->eip = saved_eip;
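[Editor's note: the wrmsr/rdmsr cases above move the 64-bit MSR value through the EDX:EAX register pair. A user-space sketch of that packing, not part of the patch:]

    #include <stdint.h>

    /* WRMSR: the guest supplies EDX as the high 32 bits, EAX as the low 32. */
    static uint64_t msr_from_edx_eax(uint32_t eax, uint32_t edx)
    {
            return (uint64_t)eax | ((uint64_t)edx << 32);
    }

    /* RDMSR: the emulator splits the value back into the register pair. */
    static void msr_to_edx_eax(uint64_t msr, uint32_t *eax, uint32_t *edx)
    {
            *eax = (uint32_t)msr;         /* low half into EAX */
            *edx = (uint32_t)(msr >> 32); /* high half into EDX */
    }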
diff --git a/drivers/Kconfig b/drivers/Kconfig
index f4076d9..08d4ae2 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -90,8 +90,6 @@ source "drivers/dca/Kconfig"
source "drivers/auxdisplay/Kconfig"
-source "drivers/kvm/Kconfig"
source "drivers/uio/Kconfig"
source "drivers/virtio/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 8cb37e3..513ae86 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -47,7 +47,6 @@ obj-$(CONFIG_SPI) += spi/
obj-$(CONFIG_PCCARD) += pcmcia/
obj-$(CONFIG_DIO) += dio/
obj-$(CONFIG_SBUS) += sbus/
-obj-$(CONFIG_KVM) += kvm/
obj-$(CONFIG_ZORRO) += zorro/
obj-$(CONFIG_MAC) += macintosh/
obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/
diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h
deleted file mode 100644
index 11fc014..0000000
--- a/drivers/kvm/irq.h
- * irq.h: in kernel interrupt controller related definitions
- * Copyright (c) 2007, Intel Corporation.
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
-typedef void irq_request_func(void *opaque, int level);
-struct kvm_kpic_state {
- u8 last_irr; /* edge detection */
- u8 irr; /* interrupt request register */
- u8 imr; /* interrupt mask register */
- u8 isr; /* interrupt service register */
- u8 priority_add; /* highest irq priority */
- u8 read_reg_select;
- u8 rotate_on_auto_eoi;
- u8 special_fully_nested_mode;
- u8 init4; /* true if 4 byte init */
- u8 elcr; /* PIIX edge/trigger selection */
- struct kvm_pic *pics_state;
- struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
- irq_request_func *irq_request;
- void *irq_request_opaque;
- int output; /* intr from master PIC */
- struct kvm_io_device dev;
-struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_pic_set_irq(void *opaque, int irq, int level);
-int kvm_pic_read_irq(struct kvm_pic *s);
-int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
-int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
-void kvm_pic_update_irq(struct kvm_pic *s);
-#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
-#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
-#define IOAPIC_EDGE_TRIG 0
-#define IOAPIC_LEVEL_TRIG 1
-#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
-#define IOAPIC_MEM_LENGTH 0x100
-/* Direct registers. */
-#define IOAPIC_REG_SELECT 0x00
-#define IOAPIC_REG_WINDOW 0x10
-#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
-/* Indirect registers. */
-#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
-#define IOAPIC_REG_VERSION 0x01
-#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
-struct kvm_ioapic {
- u64 base_address;
- union ioapic_redir_entry {
- u8 delivery_mode:3;
- u8 delivery_status:1;
- } redirtbl[IOAPIC_NUM_PINS];
- struct kvm_io_device dev;
-struct kvm_lapic {
- unsigned long base_address;
- struct kvm_io_device dev;
- atomic_t pending;
- s64 period; /* unit: ns */
- u32 divide_count;
- ktime_t last_update;
- struct hrtimer dev;
- struct kvm_vcpu *vcpu;
- struct page *regs_page;
-#define ASSERT(x) \
- printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
- __FILE__, __LINE__, #x); \
-#define ASSERT(x) do { } while (0)
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
-int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
-int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
-int kvm_create_lapic(struct kvm_vcpu *vcpu);
-void kvm_lapic_reset(struct kvm_vcpu *vcpu);
-void kvm_free_apic(struct kvm_lapic *apic);
-u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
-void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
-void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
-struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
- unsigned long bitmap);
-u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
-int kvm_ioapic_init(struct kvm *kvm);
-void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
-int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
-void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
-void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
-void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
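[Editor's note: ioapic_redir_entry above overlays C bitfields on the 64-bit I/O APIC redirection-table register. A minimal sketch of reading the same fields with explicit shifts, assuming the architectural layout (vector in bits 0-7, delivery mode in bits 8-10, delivery status in bit 12):]

    #include <stdint.h>

    static uint8_t redir_vector(uint64_t e)          { return e & 0xff; }        /* bits 0-7  */
    static uint8_t redir_delivery_mode(uint64_t e)   { return (e >> 8) & 0x7; }  /* bits 8-10 */
    static uint8_t redir_delivery_status(uint64_t e) { return (e >> 12) & 0x1; } /* bit 12    */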
diff --git a/drivers/kvm/segment_descriptor.h b/drivers/kvm/segment_descriptor.h
deleted file mode 100644
index 71fdf45..0000000
--- a/drivers/kvm/segment_descriptor.h
-struct segment_descriptor {
- u8 limit_high : 4;
- u8 long_mode : 1;
- u8 default_op : 1;
- u8 granularity : 1;
-} __attribute__((packed));
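[Editor's note: segment_descriptor.h packed the 8-byte x86 GDT/LDT descriptor into C bitfields, of which only the high-byte fields survive in the extract above. A minimal sketch of the full layout, assuming the architectural descriptor format; field names here are illustrative:]

    #include <stdint.h>

    struct seg_desc {
            uint16_t limit_low;
            uint16_t base_low;
            uint8_t  base_mid;
            uint8_t  type : 4, system : 1, dpl : 2, present : 1;
            uint8_t  limit_high : 4, avl : 1, long_mode : 1,
                     default_op : 1, granularity : 1;
            uint8_t  base_high;
    } __attribute__((packed));

    /* The descriptor must be exactly 8 bytes for the GDT layout to line up. */
    _Static_assert(sizeof(struct seg_desc) == 8, "x86 descriptor is 8 bytes");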
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
deleted file mode 100644
index bd46de6..0000000
--- a/drivers/kvm/x86_emulate.c
-/******************************************************************************
- * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
- * Copyright (c) 2005 Keir Fraser
- * Linux coding style, mod r/m decoder, segment base fixes, real-mode
- * privileged instructions:
- * Copyright (C) 2006 Qumranet
- * Avi Kivity <avi@qumranet.com>
- * Yaniv Kamay <yaniv@qumranet.com>
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
-#ifndef __KERNEL__
-#include <stdio.h>
-#include <stdint.h>
-#include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf( _f , ## _a )
-#define DPRINTF(x...) do {} while (0)
-#include "x86_emulate.h"
-#include <linux/module.h>
- * Opcode effective-address decode tables.
- * Note that we only emulate instructions that have at least one memory
- * operand (excluding implicit stack references). We assume that stack
- * references and instruction fetches will never occur in special memory
- * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
- * not be handled.
-/* Operand sizes: 8-bit operands or specified/overridden size. */
-#define ByteOp (1<<0) /* 8-bit operands. */
-/* Destination operand type. */
-#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
-#define DstReg (2<<1) /* Register operand. */
-#define DstMem (3<<1) /* Memory operand. */
-#define DstMask (3<<1)
-/* Source operand type. */
-#define SrcNone (0<<3) /* No source operand. */
-#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
-#define SrcReg (1<<3) /* Register operand. */
-#define SrcMem (2<<3) /* Memory operand. */
-#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
-#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
-#define SrcImm (5<<3) /* Immediate operand. */
-#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
-#define SrcMask (7<<3)
-/* Generic ModRM decode. */
-#define ModRM (1<<6)
-/* Destination is only written; never read. */
-#define Mov (1<<7)
-#define BitOp (1<<8)
-static u8 opcode_table[256] = {
- /* 0x00 - 0x07 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- /* 0x08 - 0x0F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- /* 0x10 - 0x17 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- /* 0x18 - 0x1F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- /* 0x20 - 0x27 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- SrcImmByte, SrcImm, 0, 0,
- /* 0x28 - 0x2F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- /* 0x30 - 0x37 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- /* 0x38 - 0x3F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- /* 0x40 - 0x4F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x50 - 0x57 */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- /* 0x58 - 0x5F */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- /* 0x60 - 0x67 */
- 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
- /* 0x68 - 0x6F */
- 0, 0, ImplicitOps|Mov, 0,
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
- /* 0x70 - 0x77 */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- /* 0x78 - 0x7F */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- /* 0x80 - 0x87 */
- ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- /* 0x88 - 0x8F */
- ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
- ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov,
- /* 0x90 - 0x9F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0,
- /* 0xA0 - 0xA7 */
- ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov,
- ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov,
- ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
- ByteOp | ImplicitOps, ImplicitOps,
- /* 0xA8 - 0xAF */
- 0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
- ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
- ByteOp | ImplicitOps, ImplicitOps,
- /* 0xB0 - 0xBF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xC0 - 0xC7 */
- ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
- 0, ImplicitOps, 0, 0,
- ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
- /* 0xC8 - 0xCF */
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xD0 - 0xD7 */
- ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
- ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
- /* 0xD8 - 0xDF */
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE0 - 0xE7 */
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE8 - 0xEF */
- ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0,
- /* 0xF0 - 0xF7 */
- ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
- /* 0xF8 - 0xFF */
- 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
-static u16 twobyte_table[256] = {
- /* 0x00 - 0x0F */
- 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
- ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
- /* 0x10 - 0x1F */
- 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
- /* 0x20 - 0x2F */
- ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x30 - 0x3F */
- ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x40 - 0x47 */
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- /* 0x48 - 0x4F */
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- /* 0x50 - 0x5F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x60 - 0x6F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x70 - 0x7F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x80 - 0x8F */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- /* 0x90 - 0x9F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xA0 - 0xA7 */
- 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
- /* 0xA8 - 0xAF */
- 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
- /* 0xB0 - 0xB7 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
- DstMem | SrcReg | ModRM | BitOp,
- 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem16 | ModRM | Mov,
- /* 0xB8 - 0xBF */
- 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
- 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem16 | ModRM | Mov,
- /* 0xC0 - 0xCF */
- 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xD0 - 0xDF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE0 - 0xEF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xF0 - 0xFF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
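[Editor's note: each table entry above packs the per-opcode decode information into bitfields: bit 0 is ByteOp, bits 1-2 the destination class (DstMask), bits 3-5 the source class (SrcMask), bit 6 ModRM, bit 7 Mov, bit 8 BitOp. A minimal stand-alone sketch of how the decoder reads an entry back; the helper name is illustrative:]

    #include <stdint.h>

    /* Mirrors the decoder's "d = opcode_table[b]" followed by mask tests. */
    static void classify_opcode(uint16_t d)
    {
            unsigned dst_class = d & (3u << 1); /* DstMask: ImplicitOps/DstReg/DstMem */
            unsigned src_class = d & (7u << 3); /* SrcMask: SrcNone ... SrcImmByte */
            int byte_op   = d & (1u << 0);      /* ByteOp: 8-bit operand size */
            int has_modrm = d & (1u << 6);      /* a ModRM byte follows the opcode */
            (void)dst_class; (void)src_class; (void)byte_op; (void)has_modrm;
    }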
-/* Type, address-of, and value of an instruction's operand. */
- enum { OP_REG, OP_MEM, OP_IMM } type;
- unsigned int bytes;
- unsigned long val, orig_val, *ptr;
-/* EFLAGS bit definitions. */
-#define EFLG_OF (1<<11)
-#define EFLG_DF (1<<10)
-#define EFLG_SF (1<<7)
-#define EFLG_ZF (1<<6)
-#define EFLG_AF (1<<4)
-#define EFLG_PF (1<<2)
-#define EFLG_CF (1<<0)
- * Instruction emulation:
- * Most instructions are emulated directly via a fragment of inline assembly
- * code. This allows us to save/restore EFLAGS and thus very easily pick up
- * any modified flags.
-#if defined(CONFIG_X86_64)
-#define _LO32 "k" /* force 32-bit operand */
-#define _STK "%%rsp" /* stack pointer */
-#elif defined(__i386__)
-#define _LO32 "" /* force 32-bit operand */
-#define _STK "%%esp" /* stack pointer */
- * These EFLAGS bits are restored from saved value during emulation, and
- * any changes are written back to the saved value after emulation.
-#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
-/* Before executing instruction: restore necessary bits in EFLAGS. */
-#define _PRE_EFLAGS(_sav, _msk, _tmp) \
- /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \
- "push %"_sav"; " \
- "movl %"_msk",%"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",("_STK"); " \
- "notl %"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",("_STK"); " \
- "pop %"_tmp"; " \
- "orl %"_LO32 _tmp",("_STK"); " \
- /* _sav &= ~msk; */ \
- "movl %"_msk",%"_LO32 _tmp"; " \
- "notl %"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",%"_sav"; "
-/* After executing instruction: write-back necessary bits in EFLAGS. */
-#define _POST_EFLAGS(_sav, _msk, _tmp) \
- /* _sav |= EFLAGS & _msk; */ \
- "pop %"_tmp"; " \
- "andl %"_msk",%"_LO32 _tmp"; " \
- "orl %"_LO32 _tmp",%"_sav"; "
-/* Raw emulation: instruction has two explicit operands. */
-#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
- unsigned long _tmp; \
- switch ((_dst).bytes) { \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","4","2") \
- _op"w %"_wx"3,%1; " \
- _POST_EFLAGS("0","4","2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- : _wy ((_src).val), "i" (EFLAGS_MASK) ); \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","4","2") \
- _op"l %"_lx"3,%1; " \
- _POST_EFLAGS("0","4","2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- : _ly ((_src).val), "i" (EFLAGS_MASK) ); \
- __emulate_2op_8byte(_op, _src, _dst, \
- _eflags, _qx, _qy); \
-#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
- unsigned long _tmp; \
- switch ( (_dst).bytes ) \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","4","2") \
- _op"b %"_bx"3,%1; " \
- _POST_EFLAGS("0","4","2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- : _by ((_src).val), "i" (EFLAGS_MASK) ); \
- __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
- _wx, _wy, _lx, _ly, _qx, _qy); \
-/* Source operand is byte-sized and may be restricted to just %cl. */
-#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
- __emulate_2op(_op, _src, _dst, _eflags, \
- "b", "c", "b", "c", "b", "c", "b", "c")
-/* Source operand is byte, word, long or quad sized. */
-#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
- __emulate_2op(_op, _src, _dst, _eflags, \
- "b", "q", "w", "r", _LO32, "r", "", "r")
-/* Source operand is word, long or quad sized. */
-#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
- __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
- "w", "r", _LO32, "r", "", "r")
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(_op, _dst, _eflags) \
- unsigned long _tmp; \
- switch ( (_dst).bytes ) \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","3","2") \
- _POST_EFLAGS("0","3","2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- : "i" (EFLAGS_MASK) ); \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","3","2") \
- _POST_EFLAGS("0","3","2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- : "i" (EFLAGS_MASK) ); \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","3","2") \
- _POST_EFLAGS("0","3","2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- : "i" (EFLAGS_MASK) ); \
- __emulate_1op_8byte(_op, _dst, _eflags); \
-/* Emulate an instruction with quadword operands (x86/64 only). */
-#if defined(CONFIG_X86_64)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","4","2") \
- _op"q %"_qx"3,%1; " \
- _POST_EFLAGS("0","4","2") \
- : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
- : _qy ((_src).val), "i" (EFLAGS_MASK) ); \
-#define __emulate_1op_8byte(_op, _dst, _eflags) \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","3","2") \
- _POST_EFLAGS("0","3","2") \
- : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
- : "i" (EFLAGS_MASK) ); \
-#elif defined(__i386__)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
-#define __emulate_1op_8byte(_op, _dst, _eflags)
-#endif /* __i386__ */
-/* Fetch next part of the instruction being emulated. */
-#define insn_fetch(_type, _size, _eip) \
-({ unsigned long _x; \
- rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \
- (_size), ctxt->vcpu); \
- (_eip) += (_size); \
-/* Access/update address held in a register, based on addressing mode. */
-#define address_mask(reg) \
- ((ad_bytes == sizeof(unsigned long)) ? \
- (reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1)))
-#define register_address(base, reg) \
- ((base) + address_mask(reg))
-#define register_address_increment(reg, inc) \
- /* signed type ensures sign extension to long */ \
- int _inc = (inc); \
- if ( ad_bytes == sizeof(unsigned long) ) \
- (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \
- (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \
-#define JMP_REL(rel) \
- register_address_increment(_eip, rel); \
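[Editor's note: address_mask()/register_address() above truncate a register to the current address size so 16- and 32-bit address arithmetic wraps correctly. A stand-alone sketch of the same masking, assuming ad_bytes is 2, 4 or 8:]

    #include <stdint.h>

    static unsigned long addr_mask(unsigned long reg, int ad_bytes)
    {
            if (ad_bytes == (int)sizeof(unsigned long))
                    return reg;                         /* native width: no mask */
            return reg & ((1UL << (ad_bytes * 8)) - 1); /* e.g. 0xffff when ad_bytes == 2 */
    }

    /* 16-bit wrap-around: addr_mask(0x10000, 2) == 0, just like "inc si" at 0xffff. */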
- * Given the 'reg' portion of a ModRM byte, and a register block, return a
- * pointer into the block that addresses the relevant register.
- * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
-static void *decode_register(u8 modrm_reg, unsigned long *regs,
- int highbyte_regs)
- p = &regs[modrm_reg];
- if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
- p = (unsigned char *)&regs[modrm_reg & 3] + 1;
-static int read_descriptor(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- u16 *size, unsigned long *address, int op_bytes)
- if (op_bytes == 2)
- rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
- rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
-static int test_cc(unsigned int condition, unsigned int flags)
- switch ((condition & 15) >> 1) {
- rc |= (flags & EFLG_OF);
- case 1: /* b/c/nae */
- rc |= (flags & EFLG_CF);
- case 2: /* z/e */
- rc |= (flags & EFLG_ZF);
- case 3: /* be/na */
- rc |= (flags & (EFLG_CF|EFLG_ZF));
- rc |= (flags & EFLG_SF);
- case 5: /* p/pe */
- rc |= (flags & EFLG_PF);
- case 7: /* le/ng */
- rc |= (flags & EFLG_ZF);
- /* fall through */
- case 6: /* l/nge */
- rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
- /* Odd condition identifiers (lsb == 1) have inverted sense. */
- return (!!rc ^ (condition & 1));
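[Editor's note: test_cc() above evaluates all sixteen x86 condition codes with eight flag tests: each even/odd pair checks the same flag combination and the odd member returns the inverse. A small illustration, assuming test_cc() and the EFLG_* masks from this file:]

    #include <assert.h>

    static void test_cc_examples(void)
    {
            assert(test_cc(0x4, EFLG_ZF) == 1); /* jz: taken when ZF is set */
            assert(test_cc(0x5, EFLG_ZF) == 0); /* jnz: odd partner, inverted sense */
            assert(test_cc(0xc, EFLG_SF) == 1); /* jl: taken when SF != OF */
    }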
-x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
13692
- u8 b, sib, twobyte = 0, rex_prefix = 0;
13693
- u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
13694
- unsigned long *override_base = NULL;
13695
- unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i;
13697
- struct operand src, dst;
13698
- unsigned long cr2 = ctxt->cr2;
13699
- int mode = ctxt->mode;
13700
- unsigned long modrm_ea;
13701
- int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0;
13705
- /* Shadow copy of register state. Committed on successful emulation. */
13706
- unsigned long _regs[NR_VCPU_REGS];
13707
- unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags;
13708
- unsigned long modrm_val = 0;
13710
- memcpy(_regs, ctxt->vcpu->regs, sizeof _regs);
13713
- case X86EMUL_MODE_REAL:
13714
- case X86EMUL_MODE_PROT16:
13715
- op_bytes = ad_bytes = 2;
13717
- case X86EMUL_MODE_PROT32:
13718
- op_bytes = ad_bytes = 4;
13720
-#ifdef CONFIG_X86_64
13721
- case X86EMUL_MODE_PROT64:
13730
- /* Legacy prefixes. */
13731
- for (i = 0; i < 8; i++) {
13732
- switch (b = insn_fetch(u8, 1, _eip)) {
13733
- case 0x66: /* operand-size override */
13734
- op_bytes ^= 6; /* switch between 2/4 bytes */
13736
- case 0x67: /* address-size override */
13737
- if (mode == X86EMUL_MODE_PROT64)
13738
- ad_bytes ^= 12; /* switch between 4/8 bytes */
13740
- ad_bytes ^= 6; /* switch between 2/4 bytes */
13742
- case 0x2e: /* CS override */
13743
- override_base = &ctxt->cs_base;
13745
- case 0x3e: /* DS override */
13746
- override_base = &ctxt->ds_base;
13748
- case 0x26: /* ES override */
13749
- override_base = &ctxt->es_base;
13751
- case 0x64: /* FS override */
13752
- override_base = &ctxt->fs_base;
13754
- case 0x65: /* GS override */
13755
- override_base = &ctxt->gs_base;
13757
- case 0x36: /* SS override */
13758
- override_base = &ctxt->ss_base;
13760
- case 0xf0: /* LOCK */
13763
- case 0xf2: /* REPNE/REPNZ */
13764
- case 0xf3: /* REP/REPE/REPZ */
13768
- goto done_prefixes;
13774
- /* REX prefix. */
13775
- if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) {
13778
- op_bytes = 8; /* REX.W */
13779
- modrm_reg = (b & 4) << 1; /* REX.R */
13780
- index_reg = (b & 2) << 2; /* REX.X */
13781
- modrm_rm = base_reg = (b & 1) << 3; /* REG.B */
13782
- b = insn_fetch(u8, 1, _eip);
13785
- /* Opcode byte(s). */
13786
- d = opcode_table[b];
13788
- /* Two-byte opcode? */
13791
- b = insn_fetch(u8, 1, _eip);
13792
- d = twobyte_table[b];
13795
- /* Unrecognised? */
13797
- goto cannot_emulate;
13800
- /* ModRM and SIB bytes. */
13802
- modrm = insn_fetch(u8, 1, _eip);
13803
- modrm_mod |= (modrm & 0xc0) >> 6;
13804
- modrm_reg |= (modrm & 0x38) >> 3;
13805
- modrm_rm |= (modrm & 0x07);
13807
- use_modrm_ea = 1;
13809
- if (modrm_mod == 3) {
13810
- modrm_val = *(unsigned long *)
13811
- decode_register(modrm_rm, _regs, d & ByteOp);
13815
- if (ad_bytes == 2) {
13816
- unsigned bx = _regs[VCPU_REGS_RBX];
13817
- unsigned bp = _regs[VCPU_REGS_RBP];
13818
- unsigned si = _regs[VCPU_REGS_RSI];
13819
- unsigned di = _regs[VCPU_REGS_RDI];
13821
- /* 16-bit ModR/M decode. */
13822
- switch (modrm_mod) {
13824
- if (modrm_rm == 6)
13825
- modrm_ea += insn_fetch(u16, 2, _eip);
13828
- modrm_ea += insn_fetch(s8, 1, _eip);
13831
- modrm_ea += insn_fetch(u16, 2, _eip);
13834
- switch (modrm_rm) {
13836
- modrm_ea += bx + si;
13839
- modrm_ea += bx + di;
13842
- modrm_ea += bp + si;
13845
- modrm_ea += bp + di;
13854
- if (modrm_mod != 0)
13861
- if (modrm_rm == 2 || modrm_rm == 3 ||
13862
- (modrm_rm == 6 && modrm_mod != 0))
13863
- if (!override_base)
13864
- override_base = &ctxt->ss_base;
13865
- modrm_ea = (u16)modrm_ea;
13867
- /* 32/64-bit ModR/M decode. */
13868
- switch (modrm_rm) {
13871
- sib = insn_fetch(u8, 1, _eip);
13872
- index_reg |= (sib >> 3) & 7;
13873
- base_reg |= sib & 7;
13874
- scale = sib >> 6;
13876
- switch (base_reg) {
13878
- if (modrm_mod != 0)
13879
- modrm_ea += _regs[base_reg];
13881
- modrm_ea += insn_fetch(s32, 4, _eip);
13884
- modrm_ea += _regs[base_reg];
13886
- switch (index_reg) {
13890
- modrm_ea += _regs[index_reg] << scale;
13895
- if (modrm_mod != 0)
13896
- modrm_ea += _regs[modrm_rm];
13897
- else if (mode == X86EMUL_MODE_PROT64)
13898
- rip_relative = 1;
13901
- modrm_ea += _regs[modrm_rm];
13904
- switch (modrm_mod) {
13906
- if (modrm_rm == 5)
13907
- modrm_ea += insn_fetch(s32, 4, _eip);
13910
- modrm_ea += insn_fetch(s8, 1, _eip);
13913
- modrm_ea += insn_fetch(s32, 4, _eip);
13917
- if (!override_base)
13918
- override_base = &ctxt->ds_base;
13919
- if (mode == X86EMUL_MODE_PROT64 &&
13920
- override_base != &ctxt->fs_base &&
13921
- override_base != &ctxt->gs_base)
13922
- override_base = NULL;
13924
- if (override_base)
13925
- modrm_ea += *override_base;
13927
- if (rip_relative) {
13928
- modrm_ea += _eip;
13929
- switch (d & SrcMask) {
13937
- if (op_bytes == 8)
13940
- modrm_ea += op_bytes;
13943
- if (ad_bytes != 8)
13944
- modrm_ea = (u32)modrm_ea;
13951
- * Decode and fetch the source operand: register, memory
13954
- switch (d & SrcMask) {
13958
- src.type = OP_REG;
13959
- if (d & ByteOp) {
13960
- src.ptr = decode_register(modrm_reg, _regs,
13961
- (rex_prefix == 0));
13962
- src.val = src.orig_val = *(u8 *) src.ptr;
13965
- src.ptr = decode_register(modrm_reg, _regs, 0);
13966
- switch ((src.bytes = op_bytes)) {
13968
- src.val = src.orig_val = *(u16 *) src.ptr;
13971
- src.val = src.orig_val = *(u32 *) src.ptr;
13974
- src.val = src.orig_val = *(u64 *) src.ptr;
13981
- goto srcmem_common;
13984
- goto srcmem_common;
13986
- src.bytes = (d & ByteOp) ? 1 : op_bytes;
13987
- /* Don't fetch the address for invlpg: it could be unmapped. */
13988
- if (twobyte && b == 0x01 && modrm_reg == 7)
13992
- * For instructions with a ModR/M byte, switch to register
13993
- * access if Mod = 3.
13995
- if ((d & ModRM) && modrm_mod == 3) {
13996
- src.type = OP_REG;
13999
- src.type = OP_MEM;
14000
- src.ptr = (unsigned long *)cr2;
14002
- if ((rc = ops->read_emulated((unsigned long)src.ptr,
14003
- &src.val, src.bytes, ctxt->vcpu)) != 0)
14005
- src.orig_val = src.val;
14008
- src.type = OP_IMM;
14009
- src.ptr = (unsigned long *)_eip;
14010
- src.bytes = (d & ByteOp) ? 1 : op_bytes;
14011
- if (src.bytes == 8)
14013
- /* NB. Immediates are sign-extended as necessary. */
14014
- switch (src.bytes) {
14016
- src.val = insn_fetch(s8, 1, _eip);
14019
- src.val = insn_fetch(s16, 2, _eip);
14022
- src.val = insn_fetch(s32, 4, _eip);
14027
- src.type = OP_IMM;
14028
- src.ptr = (unsigned long *)_eip;
14030
- src.val = insn_fetch(s8, 1, _eip);
14034
- /* Decode and fetch the destination operand: register or memory. */
14035
- switch (d & DstMask) {
14036
- case ImplicitOps:
14037
- /* Special instructions do their own operand decoding. */
14038
- goto special_insn;
14040
- dst.type = OP_REG;
14042
- && !(twobyte && (b == 0xb6 || b == 0xb7))) {
14043
- dst.ptr = decode_register(modrm_reg, _regs,
14044
- (rex_prefix == 0));
14045
- dst.val = *(u8 *) dst.ptr;
14048
- dst.ptr = decode_register(modrm_reg, _regs, 0);
14049
- switch ((dst.bytes = op_bytes)) {
14051
- dst.val = *(u16 *)dst.ptr;
14054
- dst.val = *(u32 *)dst.ptr;
14057
- dst.val = *(u64 *)dst.ptr;
14063
- dst.type = OP_MEM;
14064
- dst.ptr = (unsigned long *)cr2;
14065
- dst.bytes = (d & ByteOp) ? 1 : op_bytes;
14068
- * For instructions with a ModR/M byte, switch to register
14069
- * access if Mod = 3.
14071
- if ((d & ModRM) && modrm_mod == 3) {
14072
- dst.type = OP_REG;
14076
- unsigned long mask = ~(dst.bytes * 8 - 1);
14078
- dst.ptr = (void *)dst.ptr + (src.val & mask) / 8;
14080
- if (!(d & Mov) && /* optimisation - avoid slow emulated read */
14081
- ((rc = ops->read_emulated((unsigned long)dst.ptr,
14082
- &dst.val, dst.bytes, ctxt->vcpu)) != 0))
14086
- dst.orig_val = dst.val;
14089
- goto twobyte_insn;
14092
- case 0x00 ... 0x05:
14094
- emulate_2op_SrcV("add", src, dst, _eflags);
14096
- case 0x08 ... 0x0d:
14098
- emulate_2op_SrcV("or", src, dst, _eflags);
14100
- case 0x10 ... 0x15:
14102
- emulate_2op_SrcV("adc", src, dst, _eflags);
14104
- case 0x18 ... 0x1d:
14106
- emulate_2op_SrcV("sbb", src, dst, _eflags);
14108
- case 0x20 ... 0x23:
14110
- emulate_2op_SrcV("and", src, dst, _eflags);
14112
- case 0x24: /* and al imm8 */
14113
- dst.type = OP_REG;
14114
- dst.ptr = &_regs[VCPU_REGS_RAX];
14115
- dst.val = *(u8 *)dst.ptr;
14117
- dst.orig_val = dst.val;
14119
- case 0x25: /* and ax imm16, or eax imm32 */
14120
- dst.type = OP_REG;
14121
- dst.bytes = op_bytes;
14122
- dst.ptr = &_regs[VCPU_REGS_RAX];
14123
- if (op_bytes == 2)
14124
- dst.val = *(u16 *)dst.ptr;
14126
- dst.val = *(u32 *)dst.ptr;
14127
- dst.orig_val = dst.val;
14129
- case 0x28 ... 0x2d:
14131
- emulate_2op_SrcV("sub", src, dst, _eflags);
14133
- case 0x30 ... 0x35:
14135
- emulate_2op_SrcV("xor", src, dst, _eflags);
14137
- case 0x38 ... 0x3d:
14139
- emulate_2op_SrcV("cmp", src, dst, _eflags);
14141
- case 0x63: /* movsxd */
14142
- if (mode != X86EMUL_MODE_PROT64)
14143
- goto cannot_emulate;
14144
- dst.val = (s32) src.val;
14146
- case 0x80 ... 0x83: /* Grp1 */
14147
- switch (modrm_reg) {
14166
- case 0x84 ... 0x85:
14168
- emulate_2op_SrcV("test", src, dst, _eflags);
14170
- case 0x86 ... 0x87: /* xchg */
14171
- /* Write back the register source. */
14172
- switch (dst.bytes) {
14174
- *(u8 *) src.ptr = (u8) dst.val;
14177
- *(u16 *) src.ptr = (u16) dst.val;
14180
- *src.ptr = (u32) dst.val;
14181
- break; /* 64b reg: zero-extend */
14183
- *src.ptr = dst.val;
14187
- * Write back the memory destination with implicit LOCK
14190
- dst.val = src.val;
14193
- case 0x88 ... 0x8b: /* mov */
14195
- case 0x8d: /* lea r16/r32, m */
14196
- dst.val = modrm_val;
14198
- case 0x8f: /* pop (sole member of Grp1a) */
14199
- /* 64-bit mode: POP always pops a 64-bit operand. */
14200
- if (mode == X86EMUL_MODE_PROT64)
14202
- if ((rc = ops->read_std(register_address(ctxt->ss_base,
14203
- _regs[VCPU_REGS_RSP]),
14204
- &dst.val, dst.bytes, ctxt->vcpu)) != 0)
14206
- register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes);
14208
- case 0xa0 ... 0xa1: /* mov */
14209
- dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
14210
- dst.val = src.val;
14211
- _eip += ad_bytes; /* skip src displacement */
14213
- case 0xa2 ... 0xa3: /* mov */
14214
- dst.val = (unsigned long)_regs[VCPU_REGS_RAX];
14215
- _eip += ad_bytes; /* skip dst displacement */
14217
- case 0xc0 ... 0xc1:
14219
- switch (modrm_reg) {
14220
- case 0: /* rol */
14221
- emulate_2op_SrcB("rol", src, dst, _eflags);
14223
- case 1: /* ror */
14224
- emulate_2op_SrcB("ror", src, dst, _eflags);
14226
- case 2: /* rcl */
14227
- emulate_2op_SrcB("rcl", src, dst, _eflags);
14229
- case 3: /* rcr */
14230
- emulate_2op_SrcB("rcr", src, dst, _eflags);
14232
- case 4: /* sal/shl */
14233
- case 6: /* sal/shl */
14234
- emulate_2op_SrcB("sal", src, dst, _eflags);
14236
- case 5: /* shr */
14237
- emulate_2op_SrcB("shr", src, dst, _eflags);
14239
- case 7: /* sar */
14240
- emulate_2op_SrcB("sar", src, dst, _eflags);
14244
- case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
14246
- dst.val = src.val;
14248
- case 0xd0 ... 0xd1: /* Grp2 */
14251
- case 0xd2 ... 0xd3: /* Grp2 */
14252
- src.val = _regs[VCPU_REGS_RCX];
14254
- case 0xf6 ... 0xf7: /* Grp3 */
14255
- switch (modrm_reg) {
14256
- case 0 ... 1: /* test */
14258
- * Special case in Grp3: test has an immediate
14259
- * source operand.
14261
- src.type = OP_IMM;
14262
- src.ptr = (unsigned long *)_eip;
14263
- src.bytes = (d & ByteOp) ? 1 : op_bytes;
14264
- if (src.bytes == 8)
14266
- switch (src.bytes) {
14268
- src.val = insn_fetch(s8, 1, _eip);
14271
- src.val = insn_fetch(s16, 2, _eip);
14274
- src.val = insn_fetch(s32, 4, _eip);
14278
- case 2: /* not */
14279
- dst.val = ~dst.val;
14281
- case 3: /* neg */
14282
- emulate_1op("neg", dst, _eflags);
14285
- goto cannot_emulate;
14288
- case 0xfe ... 0xff: /* Grp4/Grp5 */
14289
- switch (modrm_reg) {
14290
- case 0: /* inc */
14291
- emulate_1op("inc", dst, _eflags);
14293
- case 1: /* dec */
14294
- emulate_1op("dec", dst, _eflags);
14296
- case 4: /* jmp abs */
14300
- goto cannot_emulate;
14302
- case 6: /* push */
14303
- /* 64-bit mode: PUSH always pushes a 64-bit operand. */
14304
- if (mode == X86EMUL_MODE_PROT64) {
14306
- if ((rc = ops->read_std((unsigned long)dst.ptr,
14308
- ctxt->vcpu)) != 0)
14311
- register_address_increment(_regs[VCPU_REGS_RSP],
14313
- if ((rc = ops->write_emulated(
14314
- register_address(ctxt->ss_base,
14315
- _regs[VCPU_REGS_RSP]),
14316
- &dst.val, dst.bytes, ctxt->vcpu)) != 0)
14321
- goto cannot_emulate;
14328
- switch (dst.type) {
14330
- /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
14331
- switch (dst.bytes) {
14333
- *(u8 *)dst.ptr = (u8)dst.val;
14336
- *(u16 *)dst.ptr = (u16)dst.val;
14339
- *dst.ptr = (u32)dst.val;
14340
- break; /* 64b: zero-ext */
14342
- *dst.ptr = dst.val;
14348
- rc = ops->cmpxchg_emulated((unsigned long)dst.
14349
- ptr, &dst.orig_val,
14350
- &dst.val, dst.bytes,
14353
- rc = ops->write_emulated((unsigned long)dst.ptr,
14354
- &dst.val, dst.bytes,
14363
- /* Commit shadow register state. */
14364
- memcpy(ctxt->vcpu->regs, _regs, sizeof _regs);
14365
- ctxt->eflags = _eflags;
14366
- ctxt->vcpu->rip = _eip;
14369
- return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
14373
- goto twobyte_special_insn;
14375
- case 0x50 ... 0x57: /* push reg */
14376
- if (op_bytes == 2)
14377
- src.val = (u16) _regs[b & 0x7];
14379
- src.val = (u32) _regs[b & 0x7];
14380
- dst.type = OP_MEM;
14381
- dst.bytes = op_bytes;
14382
- dst.val = src.val;
14383
- register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
14384
- dst.ptr = (void *) register_address(
14385
- ctxt->ss_base, _regs[VCPU_REGS_RSP]);
14387
- case 0x58 ... 0x5f: /* pop reg */
14388
- dst.ptr = (unsigned long *)&_regs[b & 0x7];
14390
- if ((rc = ops->read_std(register_address(ctxt->ss_base,
14391
- _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu))
14395
- register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
14396
- no_wb = 1; /* Disable writeback. */
14398
- case 0x6a: /* push imm8 */
14400
- src.val = insn_fetch(s8, 1, _eip);
14402
- dst.type = OP_MEM;
14403
- dst.bytes = op_bytes;
14404
- dst.val = src.val;
14405
- register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
14406
- dst.ptr = (void *) register_address(ctxt->ss_base,
14407
- _regs[VCPU_REGS_RSP]);
14409
- case 0x6c: /* insb */
14410
- case 0x6d: /* insw/insd */
14411
- if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
14413
- (d & ByteOp) ? 1 : op_bytes, /* size */
14415
- address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
14416
- (_eflags & EFLG_DF), /* down */
14417
- register_address(ctxt->es_base,
14418
- _regs[VCPU_REGS_RDI]), /* address */
14420
- _regs[VCPU_REGS_RDX] /* port */
14424
- case 0x6e: /* outsb */
14425
- case 0x6f: /* outsw/outsd */
14426
- if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
14428
- (d & ByteOp) ? 1 : op_bytes, /* size */
14430
- address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
14431
- (_eflags & EFLG_DF), /* down */
14432
- register_address(override_base ?
14433
- *override_base : ctxt->ds_base,
14434
- _regs[VCPU_REGS_RSI]), /* address */
14436
- _regs[VCPU_REGS_RDX] /* port */
14440
- case 0x70 ... 0x7f: /* jcc (short) */ {
14441
- int rel = insn_fetch(s8, 1, _eip);
14443
- if (test_cc(b, _eflags))
14447
- case 0x9c: /* pushf */
14448
- src.val = (unsigned long) _eflags;
14450
- case 0x9d: /* popf */
14451
- dst.ptr = (unsigned long *) &_eflags;
14452
- goto pop_instruction;
14453
- case 0xc3: /* ret */
14455
- goto pop_instruction;
14456
- case 0xf4: /* hlt */
14457
- ctxt->vcpu->halt_request = 1;
14460
- if (rep_prefix) {
14461
- if (_regs[VCPU_REGS_RCX] == 0) {
14462
- ctxt->vcpu->rip = _eip;
14465
- _regs[VCPU_REGS_RCX]--;
14466
- _eip = ctxt->vcpu->rip;
14469
- case 0xa4 ... 0xa5: /* movs */
14470
- dst.type = OP_MEM;
14471
- dst.bytes = (d & ByteOp) ? 1 : op_bytes;
14472
- dst.ptr = (unsigned long *)register_address(ctxt->es_base,
14473
- _regs[VCPU_REGS_RDI]);
14474
- if ((rc = ops->read_emulated(register_address(
14475
- override_base ? *override_base : ctxt->ds_base,
14476
- _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0)
14478
- register_address_increment(_regs[VCPU_REGS_RSI],
14479
- (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
14480
- register_address_increment(_regs[VCPU_REGS_RDI],
14481
- (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
14483
- case 0xa6 ... 0xa7: /* cmps */
14484
- DPRINTF("Urk! I don't handle CMPS.\n");
14485
- goto cannot_emulate;
14486
- case 0xaa ... 0xab: /* stos */
14487
- dst.type = OP_MEM;
14488
- dst.bytes = (d & ByteOp) ? 1 : op_bytes;
14489
- dst.ptr = (unsigned long *)cr2;
14490
- dst.val = _regs[VCPU_REGS_RAX];
14491
- register_address_increment(_regs[VCPU_REGS_RDI],
14492
- (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
14494
- case 0xac ... 0xad: /* lods */
14495
- dst.type = OP_REG;
14496
- dst.bytes = (d & ByteOp) ? 1 : op_bytes;
14497
- dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
14498
- if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes,
14499
- ctxt->vcpu)) != 0)
14501
- register_address_increment(_regs[VCPU_REGS_RSI],
14502
- (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
14504
- case 0xae ... 0xaf: /* scas */
14505
- DPRINTF("Urk! I don't handle SCAS.\n");
14506
- goto cannot_emulate;
14507
- case 0xe8: /* call (near) */ {
14509
- switch (op_bytes) {
14511
- rel = insn_fetch(s16, 2, _eip);
14514
- rel = insn_fetch(s32, 4, _eip);
14517
- rel = insn_fetch(s64, 8, _eip);
14520
- DPRINTF("Call: Invalid op_bytes\n");
14521
- goto cannot_emulate;
14523
- src.val = (unsigned long) _eip;
14525
- op_bytes = ad_bytes;
14528
- case 0xe9: /* jmp rel */
14529
- case 0xeb: /* jmp rel short */
14530
- JMP_REL(src.val);
14531
- no_wb = 1; /* Disable writeback. */
14540
- case 0x01: /* lgdt, lidt, lmsw */
14541
- /* Disable writeback. */
14543
- switch (modrm_reg) {
14545
- unsigned long address;
14547
- case 2: /* lgdt */
14548
- rc = read_descriptor(ctxt, ops, src.ptr,
14549
- &size, &address, op_bytes);
14552
- realmode_lgdt(ctxt->vcpu, size, address);
14554
- case 3: /* lidt */
14555
- rc = read_descriptor(ctxt, ops, src.ptr,
14556
- &size, &address, op_bytes);
14559
- realmode_lidt(ctxt->vcpu, size, address);
14561
- case 4: /* smsw */
14562
- if (modrm_mod != 3)
14563
- goto cannot_emulate;
14564
- *(u16 *)&_regs[modrm_rm]
14565
- = realmode_get_cr(ctxt->vcpu, 0);
14567
- case 6: /* lmsw */
14568
- if (modrm_mod != 3)
14569
- goto cannot_emulate;
14570
- realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags);
14572
- case 7: /* invlpg*/
14573
- emulate_invlpg(ctxt->vcpu, cr2);
14576
- goto cannot_emulate;
14579
- case 0x21: /* mov from dr to reg */
14581
- if (modrm_mod != 3)
14582
- goto cannot_emulate;
14583
- rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]);
14585
- case 0x23: /* mov from reg to dr */
14587
- if (modrm_mod != 3)
14588
- goto cannot_emulate;
14589
- rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]);
14591
- case 0x40 ... 0x4f: /* cmov */
14592
- dst.val = dst.orig_val = src.val;
14595
- * First, assume we're decoding an even cmov opcode
14598
- switch ((b & 15) >> 1) {
14599
- case 0: /* cmovo */
14600
- no_wb = (_eflags & EFLG_OF) ? 0 : 1;
14602
- case 1: /* cmovb/cmovc/cmovnae */
14603
- no_wb = (_eflags & EFLG_CF) ? 0 : 1;
14605
- case 2: /* cmovz/cmove */
14606
- no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
14608
- case 3: /* cmovbe/cmovna */
14609
- no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1;
14611
- case 4: /* cmovs */
14612
- no_wb = (_eflags & EFLG_SF) ? 0 : 1;
14614
- case 5: /* cmovp/cmovpe */
14615
- no_wb = (_eflags & EFLG_PF) ? 0 : 1;
14617
- case 7: /* cmovle/cmovng */
14618
- no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
14619
- /* fall through */
14620
- case 6: /* cmovl/cmovnge */
14621
- no_wb &= (!(_eflags & EFLG_SF) !=
14622
- !(_eflags & EFLG_OF)) ? 0 : 1;
14625
- /* Odd cmov opcodes (lsb == 1) have inverted sense. */
14630
- src.val &= (dst.bytes << 3) - 1; /* only subword offset */
14631
- emulate_2op_SrcV_nobyte("bt", src, dst, _eflags);
14635
- src.val &= (dst.bytes << 3) - 1; /* only subword offset */
14636
- emulate_2op_SrcV_nobyte("bts", src, dst, _eflags);
14638
- case 0xb0 ... 0xb1: /* cmpxchg */
14640
- * Save real source value, then compare EAX against
14643
- src.orig_val = src.val;
- src.val = _regs[VCPU_REGS_RAX];
- emulate_2op_SrcV("cmp", src, dst, _eflags);
- if (_eflags & EFLG_ZF) {
- /* Success: write back to memory. */
- dst.val = src.orig_val;
- /* Failure: write the value we saw to EAX. */
- dst.type = OP_REG;
- dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
- src.val &= (dst.bytes << 3) - 1; /* only subword offset */
- emulate_2op_SrcV_nobyte("btr", src, dst, _eflags);
- case 0xb6 ... 0xb7: /* movzx */
- dst.bytes = op_bytes;
- dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val;
- case 0xba: /* Grp8 */
- switch (modrm_reg & 3) {
- src.val &= (dst.bytes << 3) - 1; /* only subword offset */
- emulate_2op_SrcV_nobyte("btc", src, dst, _eflags);
- case 0xbe ... 0xbf: /* movsx */
- dst.bytes = op_bytes;
- dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
- case 0xc3: /* movnti */
- dst.bytes = op_bytes;
- dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val;
-twobyte_special_insn:
- /* Disable writeback. */
- emulate_clts(ctxt->vcpu);
- case 0x08: /* invd */
- case 0x09: /* wbinvd */
- case 0x0d: /* GrpP (prefetch) */
- case 0x18: /* Grp16 (prefetch/nop) */
- case 0x20: /* mov cr, reg */
- if (modrm_mod != 3)
- goto cannot_emulate;
- _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg);
- case 0x22: /* mov reg, cr */
- if (modrm_mod != 3)
- goto cannot_emulate;
- realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags);
- msr_data = (u32)_regs[VCPU_REGS_RAX]
- | ((u64)_regs[VCPU_REGS_RDX] << 32);
- rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data);
- kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
- _eip = ctxt->vcpu->rip;
- rc = X86EMUL_CONTINUE;
- rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data);
- kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
- _eip = ctxt->vcpu->rip;
- _regs[VCPU_REGS_RAX] = (u32)msr_data;
- _regs[VCPU_REGS_RDX] = msr_data >> 32;
- rc = X86EMUL_CONTINUE;
- case 0x80 ... 0x8f: /* jnz rel, etc*/ {
- switch (op_bytes) {
- rel = insn_fetch(s16, 2, _eip);
- rel = insn_fetch(s32, 4, _eip);
- rel = insn_fetch(s64, 8, _eip);
- DPRINTF("jnz: Invalid op_bytes\n");
- goto cannot_emulate;
- if (test_cc(b, _eflags))
- case 0xc7: /* Grp9 (cmpxchg8b) */
- if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu))
- if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) ||
- ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) {
- _regs[VCPU_REGS_RAX] = (u32) (old >> 0);
- _regs[VCPU_REGS_RDX] = (u32) (old >> 32);
- _eflags &= ~EFLG_ZF;
- new = ((u64)_regs[VCPU_REGS_RCX] << 32)
- | (u32) _regs[VCPU_REGS_RBX];
- if ((rc = ops->cmpxchg_emulated(cr2, &old,
- &new, 8, ctxt->vcpu)) != 0)
- _eflags |= EFLG_ZF;
- DPRINTF("Cannot emulate %02x\n", b);
-#include <asm/mm.h>
-#include <asm/uaccess.h>
-x86_emulate_read_std(unsigned long addr,
- unsigned long *val,
- unsigned int bytes, struct x86_emulate_ctxt *ctxt)
- if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) {
- propagate_page_fault(addr + bytes - rc, 0); /* read fault */
- return X86EMUL_PROPAGATE_FAULT;
- return X86EMUL_CONTINUE;
-x86_emulate_write_std(unsigned long addr,
- unsigned long val,
- unsigned int bytes, struct x86_emulate_ctxt *ctxt)
- if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) {
- propagate_page_fault(addr + bytes - rc, PGERR_write_access);
- return X86EMUL_PROPAGATE_FAULT;
- return X86EMUL_CONTINUE;
diff --git a/include/asm-x86/Kbuild b/include/asm-x86/Kbuild
index 12db5a1..da5eb69 100644
--- a/include/asm-x86/Kbuild
+++ b/include/asm-x86/Kbuild
@@ -3,6 +3,7 @@ include include/asm-generic/Kbuild.asm
header-y += bootparam.h
header-y += debugreg.h
+header-y += kvm.h
header-y += msr-index.h
header-y += prctl.h
diff --git a/include/asm-x86/kvm.h b/include/asm-x86/kvm.h
new file mode 100644
index 0000000..17afa81
+++ b/include/asm-x86/kvm.h
+#ifndef __LINUX_KVM_X86_H
+#define __LINUX_KVM_X86_H
+ * KVM x86 specific structures and definitions
+#include <asm/types.h>
+#include <linux/ioctl.h>
+/* Architectural interrupt line count. */
+#define KVM_NR_INTERRUPTS 256
+struct kvm_memory_alias {
+ __u32 slot; /* this has a different namespace than memory slots */
+ __u64 guest_phys_addr;
+ __u64 memory_size;
+ __u64 target_phys_addr;
+/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
+struct kvm_pic_state {
+ __u8 last_irr; /* edge detection */
+ __u8 irr; /* interrupt request register */
+ __u8 imr; /* interrupt mask register */
+ __u8 isr; /* interrupt service register */
+ __u8 priority_add; /* highest irq priority */
+ __u8 read_reg_select;
+ __u8 special_mask;
+ __u8 rotate_on_auto_eoi;
+ __u8 special_fully_nested_mode;
+ __u8 init4; /* true if 4 byte init */
+ __u8 elcr; /* PIIX edge/trigger selection */
+#define KVM_IOAPIC_NUM_PINS 24
+struct kvm_ioapic_state {
+ __u64 base_address;
+ __u8 delivery_mode:3;
+ __u8 dest_mode:1;
+ __u8 delivery_status:1;
+ __u8 remote_irr:1;
+ __u8 trig_mode:1;
+ __u8 reserved[4];
+ } redirtbl[KVM_IOAPIC_NUM_PINS];
+#define KVM_IRQCHIP_PIC_MASTER 0
+#define KVM_IRQCHIP_PIC_SLAVE 1
+#define KVM_IRQCHIP_IOAPIC 2
+/* for KVM_GET_REGS and KVM_SET_REGS */
+ /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+ __u64 rax, rbx, rcx, rdx;
+ __u64 rsi, rdi, rsp, rbp;
+ __u64 r8, r9, r10, r11;
+ __u64 r12, r13, r14, r15;
+ __u64 rip, rflags;
+/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+ char regs[KVM_APIC_REG_SIZE];
+struct kvm_segment {
+ __u8 present, dpl, db, s, l, g, avl;
+struct kvm_dtable {
+ __u16 padding[3];
+/* for KVM_GET_SREGS and KVM_SET_SREGS */
+struct kvm_sregs {
+ /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
+ struct kvm_segment cs, ds, es, fs, gs, ss;
+ struct kvm_segment tr, ldt;
+ struct kvm_dtable gdt, idt;
+ __u64 cr0, cr2, cr3, cr4, cr8;
+ __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
+struct kvm_msr_entry {
+/* for KVM_GET_MSRS and KVM_SET_MSRS */
+ __u32 nmsrs; /* number of msrs in entries */
+ struct kvm_msr_entry entries[0];
+/* for KVM_GET_MSR_INDEX_LIST */
+struct kvm_msr_list {
+ __u32 nmsrs; /* number of msrs in entries */
+ __u32 indices[0];
+struct kvm_cpuid_entry {
+/* for KVM_SET_CPUID */
+struct kvm_cpuid {
+ struct kvm_cpuid_entry entries[0];
+struct kvm_cpuid_entry2 {
+ __u32 padding[3];
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
+#define KVM_CPUID_FLAG_STATEFUL_FUNC 2
+#define KVM_CPUID_FLAG_STATE_READ_NEXT 4
+/* for KVM_SET_CPUID2 */
+struct kvm_cpuid2 {
+ struct kvm_cpuid_entry2 entries[0];
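Both CPUID tables above end in a zero-length array, so userspace sizes the buffer at run time. A minimal allocation sketch (illustrative, not part of this patch; it assumes the count field is named nent, as in the final ABI, and uses the KVM_GET_SUPPORTED_CPUID ioctl added further down):

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_cpuid2 *alloc_supported_cpuid(int kvm_fd, int nent)
{
	struct kvm_cpuid2 *cpuid;

	/* Flexible-array allocation: fixed header plus nent entries. */
	cpuid = calloc(1, sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2));
	if (!cpuid)
		return NULL;
	cpuid->nent = nent;	/* assumed field name; tells the kernel the capacity */
	if (ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid) < 0) {
		free(cpuid);
		return NULL;
	}
	return cpuid;
}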
diff --git a/drivers/kvm/kvm.h b/include/asm-x86/kvm_host.h
similarity index 64%
rename from drivers/kvm/kvm.h
rename to include/asm-x86/kvm_host.h
index 3b0bc4b..28940e1 100644
--- a/drivers/kvm/kvm.h
+++ b/include/asm-x86/kvm_host.h
+ * Kernel-based Virtual Machine driver for Linux
+ * This header defines architecture specific interfaces, x86 version
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
+#ifndef ASM_KVM_HOST_H
+#define ASM_KVM_HOST_H
#include <linux/types.h>
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
#include <linux/mm.h>
-#include <linux/preempt.h>
-#include <asm/signal.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
+#include <linux/kvm_types.h>
+#include <asm/desc.h>
#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
#define INVALID_PAGE (~(hpa_t)0)
#define UNMAPPED_GVA (~(gpa_t)0)
-#define KVM_MAX_VCPUS 4
-#define KVM_ALIAS_SLOTS 4
-#define KVM_MEMORY_SLOTS 8
-#define KVM_NUM_MMU_PAGES 1024
-#define KVM_MIN_FREE_MMU_PAGES 5
-#define KVM_REFILL_PAGES 25
-#define KVM_MAX_CPUID_ENTRIES 40
#define DE_VECTOR 0
+#define UD_VECTOR 6
#define NM_VECTOR 7
#define DF_VECTOR 8
#define TS_VECTOR 10
@@ -59,31 +53,66 @@
#define IOPL_SHIFT 12
-#define KVM_PIO_PAGE_OFFSET 1
+#define KVM_ALIAS_SLOTS 4
- * vcpu->requests bit members
-#define KVM_TLB_FLUSH 0
+#define KVM_PERMILLE_MMU_PAGES 20
+#define KVM_MIN_ALLOC_MMU_PAGES 64
+#define KVM_NUM_MMU_PAGES 1024
+#define KVM_MIN_FREE_MMU_PAGES 5
+#define KVM_REFILL_PAGES 25
+#define KVM_MAX_CPUID_ENTRIES 40
- * gva - guest virtual address
- * gpa - guest physical address
- * gfn - guest frame number
- * hva - host virtual address
- * hpa - host physical address
- * hfn - host frame number
+extern spinlock_t kvm_lock;
+extern struct list_head vm_list;
+ VCPU_REGS_RAX = 0,
+ VCPU_REGS_RCX = 1,
+ VCPU_REGS_RDX = 2,
+ VCPU_REGS_RBX = 3,
+ VCPU_REGS_RSP = 4,
+ VCPU_REGS_RBP = 5,
+ VCPU_REGS_RSI = 6,
+ VCPU_REGS_RDI = 7,
+#ifdef CONFIG_X86_64
+ VCPU_REGS_R8 = 8,
+ VCPU_REGS_R9 = 9,
+ VCPU_REGS_R10 = 10,
+ VCPU_REGS_R11 = 11,
+ VCPU_REGS_R12 = 12,
+ VCPU_REGS_R13 = 13,
+ VCPU_REGS_R14 = 14,
+ VCPU_REGS_R15 = 15,
-typedef unsigned long gva_t;
-typedef u64 gpa_t;
-typedef unsigned long gfn_t;
+#include <asm/kvm_x86_emulate.h>
-typedef unsigned long hva_t;
-typedef u64 hpa_t;
-typedef unsigned long hfn_t;
+#define KVM_NR_MEM_OBJS 40
+ * We don't want allocation failures within the mmu code, so we preallocate
+ * enough memory for a single page fault in a cache.
+struct kvm_mmu_memory_cache {
+ void *objects[KVM_NR_MEM_OBJS];
#define NR_PTE_CHAIN_ENTRIES 5
@@ -99,7 +128,7 @@ struct kvm_pte_chain {
* bits 4:7 - page table level for this shadow (1-4)
* bits 8:9 - page table quadrant for 2-level guests
* bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
- * bits 17:19 - "access" - the user, writable, and nx bits of a huge page pde
+ * bits 17:19 - common access permissions for all ptes in this shadow page
union kvm_mmu_page_role {
@@ -109,7 +138,7 @@ union kvm_mmu_page_role {
unsigned quadrant : 2;
unsigned pad_for_nice_hex_output : 6;
unsigned metaphysical : 1;
- unsigned hugepage_access : 3;
+ unsigned access : 3;
@@ -125,6 +154,8 @@ struct kvm_mmu_page {
union kvm_mmu_page_role role;
+ /* hold the gfn of each spte inside spt */
unsigned long slot_bitmap; /* One bit set per slot which has memory
* in this shadow page.
@@ -136,9 +167,6 @@ struct kvm_mmu_page {
-extern struct kmem_cache *kvm_vcpu_cache;
* x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
* 32-bit). The kvm_mmu structure abstracts the details of the current mmu
@@ -149,6 +177,8 @@ struct kvm_mmu {
int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
void (*free)(struct kvm_vcpu *vcpu);
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
+ void (*prefetch_page)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *page);
int shadow_root_level;
@@ -156,159 +186,9 @@ struct kvm_mmu {
-#define KVM_NR_MEM_OBJS 20
-struct kvm_mmu_memory_cache {
- void *objects[KVM_NR_MEM_OBJS];
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
-struct kvm_guest_debug {
- unsigned long bp[4];
- VCPU_REGS_RAX = 0,
- VCPU_REGS_RCX = 1,
- VCPU_REGS_RDX = 2,
- VCPU_REGS_RBX = 3,
- VCPU_REGS_RSP = 4,
- VCPU_REGS_RBP = 5,
- VCPU_REGS_RSI = 6,
- VCPU_REGS_RDI = 7,
-#ifdef CONFIG_X86_64
- VCPU_REGS_R8 = 8,
- VCPU_REGS_R9 = 9,
- VCPU_REGS_R10 = 10,
- VCPU_REGS_R11 = 11,
- VCPU_REGS_R12 = 12,
- VCPU_REGS_R13 = 13,
- VCPU_REGS_R14 = 14,
- VCPU_REGS_R15 = 15,
-struct kvm_pio_request {
- unsigned long count;
- struct page *guest_pages[2];
- unsigned guest_page_offset;
- u32 signal_exits;
- u32 irq_window_exits;
- u32 request_irq_exits;
-struct kvm_io_device {
- void (*read)(struct kvm_io_device *this,
- void (*write)(struct kvm_io_device *this,
- const void *val);
- int (*in_range)(struct kvm_io_device *this, gpa_t addr);
- void (*destructor)(struct kvm_io_device *this);
-static inline void kvm_iodevice_read(struct kvm_io_device *dev,
- dev->read(dev, addr, len, val);
-static inline void kvm_iodevice_write(struct kvm_io_device *dev,
- dev->write(dev, addr, len, val);
-static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
- return dev->in_range(dev, addr);
-static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
- if (dev->destructor)
- dev->destructor(dev);
- * It would be nice to use something smarter than a linear search, TBD...
- * Thankfully we dont expect many devices to register (famous last words :),
- * so until then it will suffice. At least its abstracted so we can change
-struct kvm_io_bus {
-#define NR_IOBUS_DEVS 6
- struct kvm_io_device *devs[NR_IOBUS_DEVS];
-void kvm_io_bus_init(struct kvm_io_bus *bus);
-void kvm_io_bus_destroy(struct kvm_io_bus *bus);
-struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
-void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
- struct kvm_io_device *dev);
- struct preempt_notifier preempt_notifier;
- struct mutex mutex;
+struct kvm_vcpu_arch {
- struct kvm_run *run;
int interrupt_window_open;
- unsigned long requests;
unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
@@ -317,9 +197,6 @@ struct kvm_vcpu {
- gpa_t para_state_gpa;
- struct page *para_state_page;
- gpa_t hypercall_gpa;
u64 pdptrs[4]; /* pae */
@@ -344,29 +221,21 @@ struct kvm_vcpu {
gfn_t last_pt_write_gfn;
int last_pt_write_count;
- struct kvm_guest_debug guest_debug;
+ u64 *last_pte_updated;
struct i387_fxsave_struct host_fx_image;
struct i387_fxsave_struct guest_fx_image;
- int guest_fpu_loaded;
- int mmio_read_completed;
- int mmio_is_write;
- unsigned char mmio_data[8];
- gpa_t mmio_phys_addr;
gva_t mmio_fault_cr2;
struct kvm_pio_request pio;
- wait_queue_head_t wq;
- int sigset_active;
- struct kvm_stat stat;
+ struct kvm_queued_exception {
+ bool has_error_code;
@@ -381,7 +250,10 @@ struct kvm_vcpu {
int halt_request; /* real mode on Intel only */
- struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+ struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+ /* emulate context */
+ struct x86_emulate_ctxt emulate_ctxt;
struct kvm_mem_alias {
@@ -390,51 +262,57 @@ struct kvm_mem_alias {
-struct kvm_memory_slot {
- unsigned long npages;
- unsigned long flags;
- struct page **phys_mem;
- unsigned long *dirty_bitmap;
- struct mutex lock; /* protects everything except vcpus */
struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
- struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS];
+ unsigned int n_free_mmu_pages;
+ unsigned int n_requested_mmu_pages;
+ unsigned int n_alloc_mmu_pages;
+ struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
* Hash table of struct kvm_mmu_page.
struct list_head active_mmu_pages;
- int n_free_mmu_pages;
- struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
- struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
- unsigned long rmap_overflow;
- struct list_head vm_list;
- struct file *filp;
- struct kvm_io_bus mmio_bus;
- struct kvm_io_bus pio_bus;
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
int round_robin_prev_vcpu;
+ unsigned int tss_addr;
+ struct page *apic_access_page;
-static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
- return kvm->vpic;
+struct kvm_vm_stat {
+ u32 mmu_shadow_zapped;
+ u32 mmu_pte_write;
+ u32 mmu_pte_updated;
+ u32 mmu_pde_zapped;
+ u32 mmu_recycled;
+ u32 remote_tlb_flush;
-static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
- return kvm->vioapic;
+struct kvm_vcpu_stat {
-static inline int irqchip_in_kernel(struct kvm *kvm)
- return pic_irqchip(kvm) != 0;
+ u32 signal_exits;
+ u32 irq_window_exits;
+ u32 request_irq_exits;
+ u32 host_state_reload;
+ u32 insn_emulation;
+ u32 insn_emulation_fail;
struct descriptor_table {
@@ -453,7 +331,7 @@ struct kvm_x86_ops {
/* Create, but do not attach this VCPU */
struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
void (*vcpu_free)(struct kvm_vcpu *vcpu);
- void (*vcpu_reset)(struct kvm_vcpu *vcpu);
+ int (*vcpu_reset)(struct kvm_vcpu *vcpu);
void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
@@ -489,10 +367,6 @@ struct kvm_x86_ops {
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
void (*tlb_flush)(struct kvm_vcpu *vcpu);
- void (*inject_page_fault)(struct kvm_vcpu *vcpu,
- unsigned long addr, u32 err_code);
- void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code);
void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
@@ -501,54 +375,31 @@ struct kvm_x86_ops {
unsigned char *hypercall_addr);
int (*get_irq)(struct kvm_vcpu *vcpu);
void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
+ void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
+ bool has_error_code, u32 error_code);
+ bool (*exception_injected)(struct kvm_vcpu *vcpu);
void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
struct kvm_run *run);
+ int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
extern struct kvm_x86_ops *kvm_x86_ops;
-/* The guest did something we don't support. */
-#define pr_unimpl(vcpu, fmt, ...) \
- if (printk_ratelimit()) \
- printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
- current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
-#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
-#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
-int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
- struct module *module);
-void kvm_exit_x86(void);
int kvm_mmu_module_init(void);
void kvm_mmu_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
int kvm_mmu_setup(struct kvm_vcpu *vcpu);
+void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
void kvm_mmu_zap_all(struct kvm *kvm);
-hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa);
-#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
-#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
-static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
-hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva);
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
-extern hpa_t bad_page_address;
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
-void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
+unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
enum emulation_result {
EMULATE_DONE, /* no further processing */
@@ -557,7 +408,7 @@ enum emulation_result {
int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
- unsigned long cr2, u16 error_code);
+ unsigned long cr2, u16 error_code, int no_decode);
void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context);
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -572,7 +423,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
struct x86_emulate_ctxt;
-int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
int size, unsigned port);
int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
int size, unsigned long count, int down,
@@ -581,7 +432,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
int emulate_clts(struct kvm_vcpu *vcpu);
-int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr,
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
unsigned long *dest);
int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
unsigned long value);
@@ -597,15 +448,15 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
-void fx_init(struct kvm_vcpu *vcpu);
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
-void kvm_resched(struct kvm_vcpu *vcpu);
-void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
-void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
-void kvm_flush_remote_tlbs(struct kvm *kvm);
+void fx_init(struct kvm_vcpu *vcpu);
int emulator_read_std(unsigned long addr,
unsigned int bytes,
struct kvm_vcpu *vcpu);
int emulator_write_emulated(unsigned long addr,
@@ -615,6 +466,7 @@ int emulator_write_emulated(unsigned long addr,
unsigned long segment_base(u16 selector);
+void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes);
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
@@ -622,66 +474,14 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
-int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
-static inline void kvm_guest_enter(void)
- current->flags |= PF_VCPU;
+int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
-static inline void kvm_guest_exit(void)
- current->flags &= ~PF_VCPU;
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
-static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
- return vcpu->mmu.page_fault(vcpu, gva, error_code);
-static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
- if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
- __kvm_mmu_free_some_pages(vcpu);
-static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
- if (likely(vcpu->mmu.root_hpa != INVALID_PAGE))
- return kvm_mmu_load(vcpu);
-static inline int is_long_mode(struct kvm_vcpu *vcpu)
-#ifdef CONFIG_X86_64
- return vcpu->shadow_efer & EFER_LME;
-static inline int is_pae(struct kvm_vcpu *vcpu)
- return vcpu->cr4 & X86_CR4_PAE;
-static inline int is_pse(struct kvm_vcpu *vcpu)
- return vcpu->cr4 & X86_CR4_PSE;
-static inline int is_paging(struct kvm_vcpu *vcpu)
- return vcpu->cr0 & X86_CR0_PG;
-static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
- return slot - kvm->memslots;
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
+int complete_pio(struct kvm_vcpu *vcpu);
static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
@@ -693,55 +493,55 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
static inline u16 read_fs(void)
- asm ("mov %%fs, %0" : "=g"(seg));
+ asm("mov %%fs, %0" : "=g"(seg));
static inline u16 read_gs(void)
- asm ("mov %%gs, %0" : "=g"(seg));
+ asm("mov %%gs, %0" : "=g"(seg));
static inline u16 read_ldt(void)
- asm ("sldt %0" : "=g"(ldt));
+ asm("sldt %0" : "=g"(ldt));
static inline void load_fs(u16 sel)
- asm ("mov %0, %%fs" : : "rm"(sel));
+ asm("mov %0, %%fs" : : "rm"(sel));
static inline void load_gs(u16 sel)
- asm ("mov %0, %%gs" : : "rm"(sel));
+ asm("mov %0, %%gs" : : "rm"(sel));
static inline void load_ldt(u16 sel)
- asm ("lldt %0" : : "rm"(sel));
+ asm("lldt %0" : : "rm"(sel));
static inline void get_idt(struct descriptor_table *table)
- asm ("sidt %0" : "=m"(*table));
+ asm("sidt %0" : "=m"(*table));
static inline void get_gdt(struct descriptor_table *table)
- asm ("sgdt %0" : "=m"(*table));
+ asm("sgdt %0" : "=m"(*table));
static inline unsigned long read_tr_base(void)
- asm ("str %0" : "=g"(tr));
+ asm("str %0" : "=g"(tr));
return segment_base(tr);
@@ -757,17 +557,17 @@ static inline unsigned long read_msr(unsigned long msr)
static inline void fx_save(struct i387_fxsave_struct *image)
- asm ("fxsave (%0)":: "r" (image));
+ asm("fxsave (%0)":: "r" (image));
static inline void fx_restore(struct i387_fxsave_struct *image)
- asm ("fxrstor (%0)":: "r" (image));
+ asm("fxrstor (%0)":: "r" (image));
static inline void fpu_init(void)
static inline u32 get_rdx_init_val(void)
@@ -775,6 +575,11 @@ static inline u32 get_rdx_init_val(void)
return 0x600; /* P6 family */
+static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
new file mode 100644
index 0000000..c6f3fd8
+++ b/include/asm-x86/kvm_para.h
+#ifndef __X86_KVM_PARA_H
+#define __X86_KVM_PARA_H
+/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
+ * should be used to determine that a VM is running under KVM.
+#define KVM_CPUID_SIGNATURE 0x40000000
+/* This CPUID returns a feature bitmap in eax. Before enabling a particular
+ * paravirtualization, the appropriate feature bit should be checked.
+#define KVM_CPUID_FEATURES 0x40000001
+#include <asm/processor.h>
+/* This instruction is vmcall. On non-VT architectures, it will generate a
+ * trap that we will then rewrite to the appropriate instruction.
+#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
+/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
+ * instruction. The hypervisor may replace it with something else but only the
+ * instructions are guaranteed to be supported.
+ * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
+ * The hypercall number should be placed in rax and the return value will be
+ * placed in rax. No other registers will be clobbered unless explicitly
+ * noted by the particular hypercall.
+static inline long kvm_hypercall0(unsigned int nr)
+ asm volatile(KVM_HYPERCALL
+static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
+ asm volatile(KVM_HYPERCALL
+ : "a"(nr), "b"(p1));
+static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
+ unsigned long p2)
+ asm volatile(KVM_HYPERCALL
+ : "a"(nr), "b"(p1), "c"(p2));
+static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3)
+ asm volatile(KVM_HYPERCALL
+ : "a"(nr), "b"(p1), "c"(p2), "d"(p3));
+static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3,
+ unsigned long p4)
+ asm volatile(KVM_HYPERCALL
+ : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4));
+static inline int kvm_para_available(void)
+ unsigned int eax, ebx, ecx, edx;
+ char signature[13];
+ cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
+ memcpy(signature + 0, &ebx, 4);
+ memcpy(signature + 4, &ecx, 4);
+ memcpy(signature + 8, &edx, 4);
+ signature[12] = 0;
+ if (strcmp(signature, "KVMKVMKVM") == 0)
+static inline unsigned int kvm_arch_para_features(void)
+ return cpuid_eax(KVM_CPUID_FEATURES);
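Put together, a guest kernel would use the pieces above roughly as follows; this is a sketch under the stated calling convention, not code from this patch, and hypercall number 0 is purely illustrative:

#include <linux/kernel.h>
#include <linux/kvm_para.h>

static void kvm_guest_probe(void)
{
	long ret;

	if (!kvm_para_available())
		return;	/* CPUID signature says we are not running under KVM */

	ret = kvm_hypercall0(0);	/* illustrative nr */
	if (ret == -KVM_ENOSYS)	/* KVM_ENOSYS comes from linux/kvm_para.h below */
		printk(KERN_INFO "kvm guest: hypercall 0 not supported by host\n");
}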
diff --git a/drivers/kvm/x86_emulate.h b/include/asm-x86/kvm_x86_emulate.h
similarity index 83%
rename from drivers/kvm/x86_emulate.h
rename to include/asm-x86/kvm_x86_emulate.h
index 92c73aa..7db91b9 100644
--- a/drivers/kvm/x86_emulate.h
+++ b/include/asm-x86/kvm_x86_emulate.h
@@ -63,17 +63,6 @@ struct x86_emulate_ops {
unsigned int bytes, struct kvm_vcpu *vcpu);
- * write_std: Write bytes of standard (non-emulated/special) memory.
- * Used for stack operations, and others.
- * @addr: [IN ] Linear address to which to write.
- * @val: [IN ] Value to write to memory (low-order bytes used as
- * @bytes: [IN ] Number of bytes to write to memory.
- int (*write_std)(unsigned long addr, const void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu);
* read_emulated: Read bytes from emulated/special memory area.
* @addr: [IN ] Linear address from which to read.
* @val: [OUT] Value read from memory, zero-extended to 'u_long'.
@@ -112,13 +101,50 @@ struct x86_emulate_ops {
+/* Type, address-of, and value of an instruction's operand. */
+ enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
+ unsigned int bytes;
+ unsigned long val, orig_val, *ptr;
+struct fetch_cache {
+ unsigned long start;
+ unsigned long end;
+struct decode_cache {
+ struct operand src;
+ struct operand dst;
+ unsigned long *override_base;
+ unsigned long regs[NR_VCPU_REGS];
+ unsigned long eip;
+ unsigned long modrm_ea;
+ unsigned long modrm_val;
+ struct fetch_cache fetch;
struct x86_emulate_ctxt {
/* Register state before/after emulation. */
struct kvm_vcpu *vcpu;
/* Linear faulting address (if emulating a page-faulting instruction). */
unsigned long eflags;
- unsigned long cr2;
/* Emulated execution mode, represented by an X86EMUL_MODE value. */
@@ -129,8 +155,16 @@ struct x86_emulate_ctxt {
unsigned long ss_base;
unsigned long gs_base;
unsigned long fs_base;
+ /* decode cache */
+ struct decode_cache decode;
+/* Repeat String Operation Prefix */
+#define REPE_PREFIX 1
+#define REPNE_PREFIX 2
/* Execution mode, passed to the emulator. */
#define X86EMUL_MODE_REAL 0 /* Real mode. */
#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
@@ -144,12 +178,9 @@ struct x86_emulate_ctxt {
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
- * x86_emulate_memop: Emulate an instruction that faulted attempting to
- * read/write a 'special' memory area.
- * Returns -1 on failure, 0 on success.
-int x86_emulate_memop(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops);
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops);
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops);
#endif /* __X86_EMULATE_H__ */
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 37bfa19..397197f 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -98,7 +98,6 @@ header-y += iso_fs.h
header-y += ixjuser.h
header-y += jffs2.h
header-y += keyctl.h
-header-y += kvm.h
header-y += limits.h
header-y += lock_dlm_plock.h
header-y += magic.h
@@ -255,6 +254,7 @@ unifdef-y += kd.h
unifdef-y += kernelcapi.h
unifdef-y += kernel.h
unifdef-y += keyboard.h
+unifdef-$(CONFIG_ARCH_SUPPORTS_KVM) += kvm.h
unifdef-y += loop.h
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 057a7f3..de9f28d 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
#include <asm/types.h>
#include <linux/ioctl.h>
+#include <asm/kvm.h>
#define KVM_API_VERSION 12
-/* Architectural interrupt line count. */
-#define KVM_NR_INTERRUPTS 256
/* for KVM_CREATE_MEMORY_REGION */
struct kvm_memory_region {
@@ -23,17 +21,19 @@ struct kvm_memory_region {
__u64 memory_size; /* bytes */
-/* for kvm_memory_region::flags */
-#define KVM_MEM_LOG_DIRTY_PAGES 1UL
-struct kvm_memory_alias {
- __u32 slot; /* this has a different namespace than memory slots */
+/* for KVM_SET_USER_MEMORY_REGION */
+struct kvm_userspace_memory_region {
__u64 guest_phys_addr;
- __u64 memory_size;
- __u64 target_phys_addr;
+ __u64 memory_size; /* bytes */
+ __u64 userspace_addr; /* start of the userspace allocated memory */
+/* for kvm_memory_region::flags */
+#define KVM_MEM_LOG_DIRTY_PAGES 1UL
/* for KVM_IRQ_LINE */
struct kvm_irq_level {
@@ -45,62 +45,18 @@ struct kvm_irq_level {
-/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
-struct kvm_pic_state {
- __u8 last_irr; /* edge detection */
- __u8 irr; /* interrupt request register */
- __u8 imr; /* interrupt mask register */
- __u8 isr; /* interrupt service register */
- __u8 priority_add; /* highest irq priority */
- __u8 read_reg_select;
- __u8 special_mask;
- __u8 rotate_on_auto_eoi;
- __u8 special_fully_nested_mode;
- __u8 init4; /* true if 4 byte init */
- __u8 elcr; /* PIIX edge/trigger selection */
-#define KVM_IOAPIC_NUM_PINS 24
-struct kvm_ioapic_state {
- __u64 base_address;
- __u8 delivery_mode:3;
- __u8 dest_mode:1;
- __u8 delivery_status:1;
- __u8 remote_irr:1;
- __u8 trig_mode:1;
- __u8 reserved[4];
- } redirtbl[KVM_IOAPIC_NUM_PINS];
-#define KVM_IRQCHIP_PIC_MASTER 0
-#define KVM_IRQCHIP_PIC_SLAVE 1
-#define KVM_IRQCHIP_IOAPIC 2
struct kvm_irqchip {
char dummy[512]; /* reserving space */
struct kvm_pic_state pic;
+#if defined(CONFIG_X86) || defined(CONFIG_IA64)
struct kvm_ioapic_state ioapic;
@@ -179,15 +135,6 @@ struct kvm_run {
-/* for KVM_GET_REGS and KVM_SET_REGS */
- /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
- __u64 rax, rbx, rcx, rdx;
- __u64 rsi, rdi, rsp, rbp;
- __u64 r8, r9, r10, r11;
- __u64 r12, r13, r14, r15;
- __u64 rip, rflags;
/* for KVM_GET_FPU and KVM_SET_FPU */
@@ -204,59 +151,6 @@ struct kvm_fpu {
-/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
-#define KVM_APIC_REG_SIZE 0x400
-struct kvm_lapic_state {
- char regs[KVM_APIC_REG_SIZE];
-struct kvm_segment {
- __u8 present, dpl, db, s, l, g, avl;
-struct kvm_dtable {
- __u16 padding[3];
-/* for KVM_GET_SREGS and KVM_SET_SREGS */
-struct kvm_sregs {
- /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
- struct kvm_segment cs, ds, es, fs, gs, ss;
- struct kvm_segment tr, ldt;
- struct kvm_dtable gdt, idt;
- __u64 cr0, cr2, cr3, cr4, cr8;
- __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
-struct kvm_msr_entry {
-/* for KVM_GET_MSRS and KVM_SET_MSRS */
- __u32 nmsrs; /* number of msrs in entries */
- struct kvm_msr_entry entries[0];
-/* for KVM_GET_MSR_INDEX_LIST */
-struct kvm_msr_list {
- __u32 nmsrs; /* number of msrs in entries */
- __u32 indices[0];
/* for KVM_TRANSLATE */
struct kvm_translation {
@@ -302,22 +196,6 @@ struct kvm_dirty_log {
-struct kvm_cpuid_entry {
-/* for KVM_SET_CPUID */
-struct kvm_cpuid {
- struct kvm_cpuid_entry entries[0];
/* for KVM_SET_SIGNAL_MASK */
struct kvm_signal_mask {
@@ -347,11 +225,20 @@ struct kvm_signal_mask {
#define KVM_CAP_IRQCHIP 0
#define KVM_CAP_HLT 1
+#define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2
+#define KVM_CAP_USER_MEMORY 3
+#define KVM_CAP_SET_TSS_ADDR 4
+#define KVM_CAP_EXT_CPUID 5
* ioctls for VM fds
#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region)
+#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44)
+#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45)
+#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\
+ struct kvm_userspace_memory_region)
+#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47)
* KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
#define KVM_CREATE_VCPU _IO(KVMIO, 0x41)
#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log)
#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias)
+#define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x48, struct kvm_cpuid2)
/* Device model IOC */
#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60)
#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level)
@@ -384,5 +272,7 @@ struct kvm_signal_mask {
#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu)
#define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state)
#define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state)
+#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2)
+#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2)
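From the other side of the new ioctl, handing RAM to a guest now means mmap()ing it in userspace and registering the region. A minimal sketch (illustrative; KVM_CREATE_VM is part of the pre-existing /dev/kvm API, and the slot and flags members follow the kvm_userspace_memory_region layout above):

#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

static int give_guest_ram(int vm_fd, size_t size)
{
	void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct kvm_userspace_memory_region region = {
		.slot = 0,
		.flags = 0,			/* or KVM_MEM_LOG_DIRTY_PAGES */
		.guest_phys_addr = 0,
		.memory_size = size,
		.userspace_addr = (unsigned long)mem,
	};

	if (mem == MAP_FAILED)
		return -1;
	/* The kernel faults guest pages in from this anonymous mapping. */
	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}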
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
new file mode 100644
index 0000000..953b50a
+++ b/include/linux/kvm_host.h
+#ifndef __KVM_HOST_H
+#define __KVM_HOST_H
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+#include <linux/types.h>
+#include <linux/hardirq.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/preempt.h>
+#include <asm/signal.h>
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+#include <linux/kvm_types.h>
+#include <asm/kvm_host.h>
+#define KVM_MAX_VCPUS 4
+#define KVM_MEMORY_SLOTS 8
+/* memory slots that are not exposed to userspace */
+#define KVM_PRIVATE_MEM_SLOTS 4
+#define KVM_PIO_PAGE_OFFSET 1
+ * vcpu->requests bit members
+#define KVM_REQ_TLB_FLUSH 0
+extern struct kmem_cache *kvm_vcpu_cache;
+struct kvm_guest_debug {
+ unsigned long bp[4];
+ * It would be nice to use something smarter than a linear search, TBD...
+ * Thankfully we don't expect many devices to register (famous last words :),
+ * so until then it will suffice. At least it's abstracted so we can change
+struct kvm_io_bus {
+#define NR_IOBUS_DEVS 6
+ struct kvm_io_device *devs[NR_IOBUS_DEVS];
+void kvm_io_bus_init(struct kvm_io_bus *bus);
+void kvm_io_bus_destroy(struct kvm_io_bus *bus);
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
+void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
+ struct kvm_io_device *dev);
+ struct preempt_notifier preempt_notifier;
+ struct mutex mutex;
+ struct kvm_run *run;
+ unsigned long requests;
+ struct kvm_guest_debug guest_debug;
+ int guest_fpu_loaded;
+ wait_queue_head_t wq;
+ int sigset_active;
+ struct kvm_vcpu_stat stat;
+#ifdef CONFIG_HAS_IOMEM
+ int mmio_read_completed;
+ int mmio_is_write;
+ unsigned char mmio_data[8];
+ gpa_t mmio_phys_addr;
+ struct kvm_vcpu_arch arch;
+struct kvm_memory_slot {
+ unsigned long npages;
+ unsigned long flags;
+ unsigned long *rmap;
+ unsigned long *dirty_bitmap;
+ unsigned long userspace_addr;
+ struct mutex lock; /* protects everything except vcpus */
+ struct mm_struct *mm; /* userspace tied to this vm */
+ struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
+ KVM_PRIVATE_MEM_SLOTS];
+ struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+ struct list_head vm_list;
+ struct file *filp;
+ struct kvm_io_bus mmio_bus;
+ struct kvm_io_bus pio_bus;
+ struct kvm_vm_stat stat;
+ struct kvm_arch arch;
+/* The guest did something we don't support. */
+#define pr_unimpl(vcpu, fmt, ...) \
+ if (printk_ratelimit()) \
+ printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
+ current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
+#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
+#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
+int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
+void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
+void vcpu_load(struct kvm_vcpu *vcpu);
+void vcpu_put(struct kvm_vcpu *vcpu);
+void decache_vcpus_on_cpu(int cpu);
+int kvm_init(void *opaque, unsigned int vcpu_size,
+ struct module *module);
+void kvm_exit(void);
+#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
+#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
+static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
+struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
+extern struct page *bad_page;
+int is_error_page(struct page *page);
+int kvm_is_error_hva(unsigned long addr);
+int kvm_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+int __kvm_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+int kvm_arch_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old,
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+void kvm_release_page_clean(struct page *page);
+void kvm_release_page_dirty(struct page *page);
+int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
+int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
+ int offset, int len);
+int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
+ unsigned long len);
+int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
+int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
+int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
+void kvm_vcpu_block(struct kvm_vcpu *vcpu);
+void kvm_resched(struct kvm_vcpu *vcpu);
+void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
+void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
+void kvm_flush_remote_tlbs(struct kvm *kvm);
+long kvm_arch_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg);
+long kvm_arch_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg);
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
+int kvm_dev_ioctl_check_extension(long ext);
+int kvm_get_dirty_log(struct kvm *kvm,
+ struct kvm_dirty_log *log, int *is_dirty);
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+ struct kvm_dirty_log *log);
+int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
+ kvm_userspace_memory_region *mem,
+long kvm_arch_vm_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg);
+void kvm_arch_destroy_vm(struct kvm *kvm);
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+ struct kvm_translation *tr);
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs);
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs);
+int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
+ struct kvm_debug_guest *dbg);
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
+int kvm_arch_init(void *opaque);
+void kvm_arch_exit(void);
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
+int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
+void kvm_arch_hardware_enable(void *garbage);
+void kvm_arch_hardware_disable(void *garbage);
+int kvm_arch_hardware_setup(void);
+void kvm_arch_hardware_unsetup(void);
+void kvm_arch_check_processor_compat(void *rtn);
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
+void kvm_free_physmem(struct kvm *kvm);
+struct kvm *kvm_arch_create_vm(void);
+void kvm_arch_destroy_vm(struct kvm *kvm);
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
+static inline void kvm_guest_enter(void)
+ account_system_vtime(current);
+ current->flags |= PF_VCPU;
+static inline void kvm_guest_exit(void)
+ account_system_vtime(current);
+ current->flags &= ~PF_VCPU;
+static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
+ return slot - kvm->memslots;
+static inline gpa_t gfn_to_gpa(gfn_t gfn)
+ return (gpa_t)gfn << PAGE_SHIFT;
+enum kvm_stat_kind {
+struct kvm_stats_debugfs_item {
+ const char *name;
+ enum kvm_stat_kind kind;
+ struct dentry *dentry;
+extern struct kvm_stats_debugfs_item debugfs_entries[];
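kvm_guest_enter() and kvm_guest_exit() bracket actual guest execution: the account_system_vtime() calls plus PF_VCPU let the scheduler charge that stretch of CPU time as guest time. Schematically, an arch run loop uses them like this (shape only, not the literal x86 code):

	local_irq_disable();
	kvm_guest_enter();		/* CPU time is charged to the guest from here */
	kvm_x86_ops->run(vcpu, kvm_run);
	kvm_guest_exit();		/* back to ordinary host accounting */
	local_irq_enable();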
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index 3b29256..e4db25f 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
#define __LINUX_KVM_PARA_H
- * Guest OS interface for KVM paravirtualization
- * Note: this interface is totally experimental, and is certain to change
- * as we make progress.
+ * This header file provides a method for making a hypercall to the host
+ * Architectures should define:
+ * - kvm_hypercall0, kvm_hypercall1...
+ * - kvm_arch_para_features
+ * - kvm_para_available
- * Per-VCPU descriptor area shared between guest and host. Writable to
- * both guest and host. Registered with the host by the guest when
- * a guest acknowledges paravirtual mode.
- * NOTE: all addresses are guest-physical addresses (gpa), to make it
- * easier for the hypervisor to map between the various addresses.
-struct kvm_vcpu_para_state {
- * API version information for compatibility. If there's any support
- * mismatch (too old host trying to execute too new guest) then
- * the host will deny entry into paravirtual mode. Any other
- * combination (new host + old guest and new host + new guest)
- * is supposed to work - new host versions will support all old
- * guest API versions.
- u32 guest_version;
- u32 host_version;
- * The address of the vm exit instruction (VMCALL or VMMCALL),
- * which the host will patch according to the CPU model the
- u64 hypercall_gpa;
-} __attribute__ ((aligned(PAGE_SIZE)));
-#define KVM_PARA_API_VERSION 1
+/* Return values for hypercalls */
+#define KVM_ENOSYS 1000
- * This is used for an RDMSR's ECX parameter to probe for a KVM host.
- * Hopefully no CPU vendor will use up this number. This is placed well
- * out of way of the typical space occupied by CPU vendors' MSR indices,
- * and we think (or at least hope) it wont be occupied in the future
+ * hypercalls use architecture specific
-#define MSR_KVM_API_MAGIC 0x87655678
-#define KVM_EINVAL 1
+#include <asm/kvm_para.h>
- * Hypercall calling convention:
- * Each hypercall may have 0-6 parameters.
- * 64-bit hypercall index is in RAX, goes from 0 to __NR_hypercalls-1
- * 64-bit parameters 1-6 are in the standard gcc x86_64 calling convention
- * order: RDI, RSI, RDX, RCX, R8, R9.
- * 32-bit index is EBX, parameters are: EAX, ECX, EDX, ESI, EDI, EBP.
- * (the first 3 are according to the gcc regparm calling convention)
- * No registers are clobbered by the hypercall, except that the
- * return value is in RAX.
-#define __NR_hypercalls 0
+static inline int kvm_para_has_feature(unsigned int feature)
+ if (kvm_arch_para_features() & (1UL << feature))
+#endif /* __KERNEL__ */
+#endif /* __LINUX_KVM_PARA_H */
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
new file mode 100644
index 0000000..1c4e46d
+++ b/include/linux/kvm_types.h
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+#ifndef __KVM_TYPES_H__
+#define __KVM_TYPES_H__
+#include <asm/types.h>
+ * gva - guest virtual address
+ * gpa - guest physical address
+ * gfn - guest frame number
+ * hva - host virtual address
+ * hpa - host physical address
+ * hfn - host frame number
+typedef unsigned long gva_t;
+typedef u64 gpa_t;
+typedef unsigned long gfn_t;
+typedef unsigned long hva_t;
+typedef u64 hpa_t;
+typedef unsigned long hfn_t;
+struct kvm_pio_request {
+ unsigned long count;
+ struct page *guest_pages[2];
+ unsigned guest_page_offset;
+#endif /* __KVM_TYPES_H__ */
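These typedefs encode the glossary above: the frame-number types index pages while the address types count bytes, so conversions are PAGE_SHIFT shifts, exactly what the gfn_to_gpa() helper in kvm_host.h does. For example:

	gfn_t gfn = gpa >> PAGE_SHIFT;		/* which guest page holds this address */
	gpa_t base = (gpa_t)gfn << PAGE_SHIFT;	/* base address of that page */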
diff --git a/kernel/fork.c b/kernel/fork.c
index 8dd8ff2..591c8df 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -392,6 +392,7 @@ void fastcall __mmdrop(struct mm_struct *mm)
destroy_context(mm);
+EXPORT_SYMBOL_GPL(__mmdrop);
* Decrement the use count and release all resources for an mm.
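The export is needed because the now-modular KVM core pins the creating process's address space (the new kvm->mm field) for the lifetime of the VM, and dropping the last reference from the module lands in __mmdrop(). Roughly the following pattern, shown here as an illustration rather than the exact KVM code:

	/* at VM creation */
	atomic_inc(&current->mm->mm_count);	/* pin the mm_struct itself */
	kvm->mm = current->mm;

	/* at VM teardown */
	mmdrop(kvm->mm);	/* inline helper that calls __mmdrop() on the last ref */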
diff --git a/drivers/kvm/ioapic.c b/virt/kvm/ioapic.c
similarity index 83%
rename from drivers/kvm/ioapic.c
rename to virt/kvm/ioapic.c
index c7992e6..317f8e2 100644
--- a/drivers/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
* Based on Xen 3.1 code.
+#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/mm.h>
#include <linux/highmem.h>
@@ -34,14 +34,17 @@
#include <linux/hrtimer.h>
#include <linux/io.h>
#include <asm/processor.h>
-#include <asm/msr.h>
#include <asm/page.h>
#include <asm/current.h>
-#include <asm/apicdef.h>
-#include <asm/io_apic.h>
-/* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
+#include "ioapic.h"
+#include "lapic.h"
+#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
#define ioapic_debug(fmt, arg...)
static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
@@ -113,7 +116,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
index = (ioapic->ioregsel - 0x10) >> 1;
- ioapic_debug("change redir index %x val %x", index, val);
+ ioapic_debug("change redir index %x val %x\n", index, val);
if (index >= IOAPIC_NUM_PINS)
if (ioapic->ioregsel & 1) {
@@ -131,16 +134,16 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
static void ioapic_inj_irq(struct kvm_ioapic *ioapic,
- struct kvm_lapic *target,
+ struct kvm_vcpu *vcpu,
u8 vector, u8 trig_mode, u8 delivery_mode)
- ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode,
+ ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode,
- ASSERT((delivery_mode == dest_Fixed) ||
- (delivery_mode == dest_LowestPrio));
+ ASSERT((delivery_mode == IOAPIC_FIXED) ||
+ (delivery_mode == IOAPIC_LOWEST_PRIORITY));
- kvm_apic_set_irq(target, vector, trig_mode);
+ kvm_apic_set_irq(vcpu, vector, trig_mode);
static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
@@ -151,12 +154,12 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
struct kvm *kvm = ioapic->kvm;
struct kvm_vcpu *vcpu;
- ioapic_debug("dest %d dest_mode %d", dest, dest_mode);
+ ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode);
if (dest_mode == 0) { /* Physical mode. */
if (dest == 0xFF) { /* Broadcast. */
for (i = 0; i < KVM_MAX_VCPUS; ++i)
- if (kvm->vcpus[i] && kvm->vcpus[i]->apic)
+ if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
@@ -164,8 +167,8 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
vcpu = kvm->vcpus[i];
- if (kvm_apic_match_physical_addr(vcpu->apic, dest)) {
+ if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) {
+ if (vcpu->arch.apic)
@@ -175,11 +178,11 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
vcpu = kvm->vcpus[i];
- if (vcpu->apic &&
- kvm_apic_match_logical_addr(vcpu->apic, dest))
+ if (vcpu->arch.apic &&
+ kvm_apic_match_logical_addr(vcpu->arch.apic, dest))
mask |= 1 << vcpu->vcpu_id;
- ioapic_debug("mask %x", mask);
+ ioapic_debug("mask %x\n", mask);
@@ -191,41 +194,39 @@ static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
u8 vector = ioapic->redirtbl[irq].fields.vector;
u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
u32 deliver_bitmask;
- struct kvm_lapic *target;
struct kvm_vcpu *vcpu;
ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
- "vector=%x trig_mode=%x",
+ "vector=%x trig_mode=%x\n",
dest, dest_mode, delivery_mode, vector, trig_mode);
deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
if (!deliver_bitmask) {
- ioapic_debug("no target on destination");
+ ioapic_debug("no target on destination\n");
switch (delivery_mode) {
- case dest_LowestPrio:
- kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask);
- if (target != NULL)
- ioapic_inj_irq(ioapic, target, vector,
+ case IOAPIC_LOWEST_PRIORITY:
+ vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
+ deliver_bitmask);
+ if (vcpu != NULL)
+ ioapic_inj_irq(ioapic, vcpu, vector,
trig_mode, delivery_mode);
- ioapic_debug("null round robin: "
- "mask=%x vector=%x delivery_mode=%x",
- deliver_bitmask, vector, dest_LowestPrio);
+ ioapic_debug("null lowest prio vcpu: "
+ "mask=%x vector=%x delivery_mode=%x\n",
+ deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY);
+ case IOAPIC_FIXED:
for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
if (!(deliver_bitmask & (1 << vcpu_id)))
deliver_bitmask &= ~(1 << vcpu_id);
vcpu = ioapic->kvm->vcpus[vcpu_id];
- target = vcpu->apic;
- ioapic_inj_irq(ioapic, target, vector,
+ ioapic_inj_irq(ioapic, vcpu, vector,
trig_mode, delivery_mode);
@@ -271,7 +272,7 @@ static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
- struct kvm_ioapic *ioapic = kvm->vioapic;
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
union ioapic_redir_entry *ent;
@@ -304,7 +305,7 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
- ioapic_debug("addr %lx", (unsigned long)addr);
+ ioapic_debug("addr %lx\n", (unsigned long)addr);
ASSERT(!(addr & 0xf)); /* check alignment */
@@ -341,8 +342,8 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
- ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n",
+ ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
+ (void*)addr, len, val);
ASSERT(!(addr & 0xf)); /* check alignment */
if (len == 4 || len == 8)
data = *(u32 *) val;
@@ -360,24 +361,38 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
case IOAPIC_REG_WINDOW:
ioapic_write_indirect(ioapic, data);
+#ifdef CONFIG_IA64
+ case IOAPIC_REG_EOI:
+ kvm_ioapic_update_eoi(ioapic->kvm, data);
+void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
+ for (i = 0; i < IOAPIC_NUM_PINS; i++)
+ ioapic->redirtbl[i].fields.mask = 1;
+ ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+ ioapic->ioregsel = 0;
int kvm_ioapic_init(struct kvm *kvm)
struct kvm_ioapic *ioapic;
ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
- kvm->vioapic = ioapic;
- for (i = 0; i < IOAPIC_NUM_PINS; i++)
- ioapic->redirtbl[i].fields.mask = 1;
- ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+ kvm->arch.vioapic = ioapic;
+ kvm_ioapic_reset(ioapic);
ioapic->dev.read = ioapic_mmio_read;
ioapic->dev.write = ioapic_mmio_write;
ioapic->dev.in_range = ioapic_in_range;
17027
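Note on the ioapic_deliver() hunks above: lowest-priority arbitration now returns a struct kvm_vcpu * from kvm_get_lowest_prio_vcpu() instead of round-robining over local APICs, and IOAPIC_FIXED walks the delivery bitmask by vcpu_id, injecting into each remaining vcpu. The standalone C sketch below mirrors that dispatch; deliver_sketch(), inject() and the prio[] table are illustrative stand-ins, not code from this patch.

#include <stdio.h>

#define MAX_VCPUS 16

static int prio[MAX_VCPUS];	/* stand-in for per-vcpu arbitration priority */

static void inject(int vcpu_id, unsigned char vector)
{
	printf("inject vector 0x%x into vcpu %d\n", vector, vcpu_id);
}

/* Mirrors the IOAPIC_LOWEST_PRIORITY vs. IOAPIC_FIXED split above. */
static void deliver_sketch(unsigned mask, unsigned char vector, int lowest)
{
	int id, best = -1;

	if (lowest) {
		for (id = 0; id < MAX_VCPUS; id++)
			if ((mask & (1u << id)) &&
			    (best < 0 || prio[id] < prio[best]))
				best = id;
		if (best >= 0)
			inject(best, vector);	/* single target */
		return;
	}
	for (id = 0; mask != 0; id++) {		/* fixed: every vcpu in the mask */
		if (!(mask & (1u << id)))
			continue;
		mask &= ~(1u << id);
		inject(id, vector);
	}
}

int main(void)
{
	prio[0] = 2; prio[1] = 0; prio[2] = 1;
	deliver_sketch(0x7, 0x30, 1);	/* lowest priority: vcpu 1 wins */
	deliver_sketch(0x7, 0x31, 0);	/* fixed: vcpus 0, 1 and 2 */
	return 0;
}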
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
new file mode 100644
index 0000000..7f16675
+++ b/virt/kvm/ioapic.h
+#ifndef __KVM_IO_APIC_H
+#define __KVM_IO_APIC_H
+#include <linux/kvm_host.h>
+#include "iodev.h"
+#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
+#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
+#define IOAPIC_EDGE_TRIG 0
+#define IOAPIC_LEVEL_TRIG 1
+#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
+#define IOAPIC_MEM_LENGTH 0x100
+/* Direct registers. */
+#define IOAPIC_REG_SELECT 0x00
+#define IOAPIC_REG_WINDOW 0x10
+#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
+/* Indirect registers. */
+#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
+#define IOAPIC_REG_VERSION 0x01
+#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
+/*ioapic delivery mode*/
+#define IOAPIC_FIXED 0x0
+#define IOAPIC_LOWEST_PRIORITY 0x1
+#define IOAPIC_PMI 0x2
+#define IOAPIC_NMI 0x4
+#define IOAPIC_INIT 0x5
+#define IOAPIC_EXTINT 0x7
+struct kvm_ioapic {
+ u64 base_address;
+ union ioapic_redir_entry {
+ u8 delivery_mode:3;
+ u8 delivery_status:1;
+ } redirtbl[IOAPIC_NUM_PINS];
+ struct kvm_io_device dev;
+#define ASSERT(x) \
+ printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
+ __FILE__, __LINE__, #x); \
+#define ASSERT(x) do { } while (0)
+static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
+ return kvm->arch.vioapic;
+#ifdef CONFIG_IA64
+static inline int irqchip_in_kernel(struct kvm *kvm)
+struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
+ unsigned long bitmap);
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
+int kvm_ioapic_init(struct kvm *kvm);
+void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
+void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
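The redirection table entry in struct kvm_ioapic above is a u64 overlaid with bitfields; only a few of the fields survive in this excerpt, but ioapic.c also dereferences .fields.vector, .fields.trig_mode and .fields.mask. A compilable sketch of such an entry is below; the exact field order and widths follow the x86 IOAPIC register layout as an assumption, not copied from the patch.

#include <stdio.h>
#include <stdint.h>

/* Illustrative subset of union ioapic_redir_entry; the real header
 * carries additional fields (reserved bytes, dest_id, ...) not shown. */
union redir_entry {
	uint64_t bits;
	struct {
		uint8_t vector;
		uint8_t delivery_mode:3;
		uint8_t dest_mode:1;
		uint8_t delivery_status:1;
		uint8_t polarity:1;
		uint8_t remote_irr:1;
		uint8_t trig_mode:1;
		uint8_t mask:1;
	} fields;
};

int main(void)
{
	union redir_entry e = { .bits = 0 };

	e.fields.mask = 1;		/* what kvm_ioapic_reset() does per pin */
	e.fields.vector = 0x30;
	printf("bits=%#llx masked=%u\n",
	       (unsigned long long)e.bits, (unsigned)e.fields.mask);
	return 0;
}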
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
new file mode 100644
index 0000000..c14e642
+++ b/virt/kvm/iodev.h
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+#ifndef __KVM_IODEV_H__
+#define __KVM_IODEV_H__
+#include <linux/kvm_types.h>
+struct kvm_io_device {
+ void (*read)(struct kvm_io_device *this,
+ void (*write)(struct kvm_io_device *this,
+ const void *val);
+ int (*in_range)(struct kvm_io_device *this, gpa_t addr);
+ void (*destructor)(struct kvm_io_device *this);
+static inline void kvm_iodevice_read(struct kvm_io_device *dev,
+ dev->read(dev, addr, len, val);
+static inline void kvm_iodevice_write(struct kvm_io_device *dev,
+ dev->write(dev, addr, len, val);
+static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
+ return dev->in_range(dev, addr);
+static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
+ if (dev->destructor)
+ dev->destructor(dev);
+#endif /* __KVM_IODEV_H__ */
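iodev.h pins down the contract for in-kernel MMIO/PIO devices: a kvm_io_device is four callbacks plus a private pointer, and the kvm_iodevice_*() inlines are thin dispatch wrappers, so the ioapic, PIC and LAPIC can all sit behind one interface. A self-contained userspace analogue of the same pattern (the toy register file and its address range are invented for illustration):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

typedef uint64_t gpa_t;

struct io_device {
	void (*read)(struct io_device *this, gpa_t addr, int len, void *val);
	void (*write)(struct io_device *this, gpa_t addr, int len,
		      const void *val);
	int (*in_range)(struct io_device *this, gpa_t addr);
	void *private;
};

static uint8_t regs[16];	/* toy device: 16 bytes at 0xfec00000 */

static void toy_read(struct io_device *d, gpa_t a, int len, void *val)
{
	memcpy(val, &regs[a & 0xf], len);
}

static void toy_write(struct io_device *d, gpa_t a, int len, const void *val)
{
	memcpy(&regs[a & 0xf], val, len);
}

static int toy_in_range(struct io_device *d, gpa_t a)
{
	return a >= 0xfec00000 && a < 0xfec00000 + sizeof(regs);
}

int main(void)
{
	struct io_device dev = { toy_read, toy_write, toy_in_range, NULL };
	uint32_t v = 0x11;

	if (dev.in_range(&dev, 0xfec00000))	/* same shape as kvm_iodevice_*() */
		dev.write(&dev, 0xfec00000, 4, &v);
	v = 0;
	dev.read(&dev, 0xfec00000, 4, &v);
	printf("read back %#x\n", v);
	return 0;
}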
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
new file mode 100644
index 0000000..845beb2
+++ b/virt/kvm/kvm_main.c
+ * Kernel-based Virtual Machine driver for Linux
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+#include "iodev.h"
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/miscdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/reboot.h>
+#include <linux/debugfs.h>
+#include <linux/highmem.h>
+#include <linux/file.h>
+#include <linux/sysdev.h>
+#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/smp.h>
+#include <linux/anon_inodes.h>
+#include <linux/profile.h>
+#include <linux/kvm_para.h>
+#include <linux/pagemap.h>
+#include <linux/mman.h>
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+DEFINE_SPINLOCK(kvm_lock);
+LIST_HEAD(vm_list);
+static cpumask_t cpus_hardware_enabled;
+struct kmem_cache *kvm_vcpu_cache;
+EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
+static __read_mostly struct preempt_ops kvm_preempt_ops;
+static struct dentry *debugfs_dir;
+static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
+ unsigned long arg);
+static inline int valid_vcpu(int n)
+ return likely(n >= 0 && n < KVM_MAX_VCPUS);
+ * Switches to specified vcpu, until a matching vcpu_put()
+void vcpu_load(struct kvm_vcpu *vcpu)
+ mutex_lock(&vcpu->mutex);
+ preempt_notifier_register(&vcpu->preempt_notifier);
+ kvm_arch_vcpu_load(vcpu, cpu);
+void vcpu_put(struct kvm_vcpu *vcpu)
+ preempt_disable();
+ kvm_arch_vcpu_put(vcpu);
+ preempt_notifier_unregister(&vcpu->preempt_notifier);
+ preempt_enable();
+ mutex_unlock(&vcpu->mutex);
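vcpu_load() and vcpu_put() bracket every access to vcpu state: the mutex serializes callers, and the registered preempt notifier replays kvm_arch_vcpu_load()/kvm_arch_vcpu_put() whenever the holding task is scheduled out and back in, so the hardware context follows the task across CPUs. A compilable userspace analogue of the bracket itself (the arch hooks are stand-in printfs; the notifier half is omitted):

#include <pthread.h>
#include <stdio.h>

struct vcpu { pthread_mutex_t mutex; };

static void arch_vcpu_load(struct vcpu *v) { printf("state -> cpu\n"); }
static void arch_vcpu_put(struct vcpu *v)  { printf("state -> memory\n"); }

static void vcpu_load(struct vcpu *v)	/* acquire, then make state resident */
{
	pthread_mutex_lock(&v->mutex);
	arch_vcpu_load(v);
}

static void vcpu_put(struct vcpu *v)	/* flush state, then release */
{
	arch_vcpu_put(v);
	pthread_mutex_unlock(&v->mutex);
}

int main(void)
{
	struct vcpu v = { PTHREAD_MUTEX_INITIALIZER };

	vcpu_load(&v);
	/* every ioctl that touches guest state runs in here */
	vcpu_put(&v);
	return 0;
}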
+static void ack_flush(void *_completed)
+void kvm_flush_remote_tlbs(struct kvm *kvm)
+ struct kvm_vcpu *vcpu;
+ cpus_clear(cpus);
+ for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+ vcpu = kvm->vcpus[i];
+ if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+ if (cpu != -1 && cpu != raw_smp_processor_id())
+ cpu_set(cpu, cpus);
+ if (cpus_empty(cpus))
+ ++kvm->stat.remote_tlb_flush;
+ smp_call_function_mask(cpus, ack_flush, NULL, 1);
+int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+ struct page *page;
+ mutex_init(&vcpu->mutex);
+ vcpu->vcpu_id = id;
+ init_waitqueue_head(&vcpu->wq);
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ vcpu->run = page_address(page);
+ r = kvm_arch_vcpu_init(vcpu);
+ goto fail_free_run;
+ free_page((unsigned long)vcpu->run);
+EXPORT_SYMBOL_GPL(kvm_vcpu_init);
+void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
+ kvm_arch_vcpu_uninit(vcpu);
+ free_page((unsigned long)vcpu->run);
+EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
+static struct kvm *kvm_create_vm(void)
+ struct kvm *kvm = kvm_arch_create_vm();
+ kvm->mm = current->mm;
+ atomic_inc(&kvm->mm->mm_count);
+ kvm_io_bus_init(&kvm->pio_bus);
+ mutex_init(&kvm->lock);
+ kvm_io_bus_init(&kvm->mmio_bus);
+ spin_lock(&kvm_lock);
+ list_add(&kvm->vm_list, &vm_list);
+ spin_unlock(&kvm_lock);
+ * Free any memory in @free but not in @dont.
+static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+ if (!dont || free->rmap != dont->rmap)
+ vfree(free->rmap);
+ if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
+ vfree(free->dirty_bitmap);
+ free->npages = 0;
+ free->dirty_bitmap = NULL;
+ free->rmap = NULL;
+void kvm_free_physmem(struct kvm *kvm)
+ for (i = 0; i < kvm->nmemslots; ++i)
+ kvm_free_physmem_slot(&kvm->memslots[i], NULL);
+static void kvm_destroy_vm(struct kvm *kvm)
+ struct mm_struct *mm = kvm->mm;
+ spin_lock(&kvm_lock);
+ list_del(&kvm->vm_list);
+ spin_unlock(&kvm_lock);
+ kvm_io_bus_destroy(&kvm->pio_bus);
+ kvm_io_bus_destroy(&kvm->mmio_bus);
+ kvm_arch_destroy_vm(kvm);
+static int kvm_vm_release(struct inode *inode, struct file *filp)
+ struct kvm *kvm = filp->private_data;
+ kvm_destroy_vm(kvm);
+ * Allocate some memory and give it an address in the guest physical address
+ * Discontiguous memory is allowed, mostly for framebuffers.
+ * Must be called holding kvm->lock.
+int __kvm_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ unsigned long npages;
+ struct kvm_memory_slot *memslot;
+ struct kvm_memory_slot old, new;
+ /* General sanity checks */
+ if (mem->memory_size & (PAGE_SIZE - 1))
+ if (mem->guest_phys_addr & (PAGE_SIZE - 1))
+ if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+ if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
+ memslot = &kvm->memslots[mem->slot];
+ base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
+ npages = mem->memory_size >> PAGE_SHIFT;
+ mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
+ new = old = *memslot;
+ new.base_gfn = base_gfn;
+ new.npages = npages;
+ new.flags = mem->flags;
+ /* Disallow changing a memory slot's size. */
+ if (npages && old.npages && npages != old.npages)
+ /* Check for overlaps */
+ for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+ struct kvm_memory_slot *s = &kvm->memslots[i];
+ if (s == memslot)
+ if (!((base_gfn + npages <= s->base_gfn) ||
+ (base_gfn >= s->base_gfn + s->npages)))
+ /* Free page dirty bitmap if unneeded */
+ if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
+ new.dirty_bitmap = NULL;
+ /* Allocate if a slot is being created */
+ if (npages && !new.rmap) {
+ new.rmap = vmalloc(npages * sizeof(struct page *));
+ memset(new.rmap, 0, npages * sizeof(*new.rmap));
+ new.user_alloc = user_alloc;
+ new.userspace_addr = mem->userspace_addr;
+ /* Allocate page dirty bitmap if needed */
+ if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
+ unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
+ new.dirty_bitmap = vmalloc(dirty_bytes);
+ if (!new.dirty_bitmap)
+ memset(new.dirty_bitmap, 0, dirty_bytes);
+ if (mem->slot >= kvm->nmemslots)
+ kvm->nmemslots = mem->slot + 1;
+ r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
+ kvm_free_physmem_slot(&old, &new);
+ kvm_free_physmem_slot(&new, &old);
+EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
+int kvm_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ mutex_lock(&kvm->lock);
+ r = __kvm_set_memory_region(kvm, mem, user_alloc);
+ mutex_unlock(&kvm->lock);
+EXPORT_SYMBOL_GPL(kvm_set_memory_region);
+int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ if (mem->slot >= KVM_MEMORY_SLOTS)
+ return kvm_set_memory_region(kvm, mem, user_alloc);
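__kvm_set_memory_region() is what userspace reaches through the KVM_SET_USER_MEMORY_REGION vm ioctl handled further down in kvm_vm_ioctl(): it checks alignment and overlap, allocates the rmap and the optional dirty bitmap, then defers to kvm_arch_set_memory_region(). A minimal caller, with error handling mostly elided and the region size picked arbitrarily:

#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
	void *mem = mmap(NULL, 0x10000, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct kvm_userspace_memory_region region = {
		.slot = 0,
		.guest_phys_addr = 0,		/* page aligned, as the checks demand */
		.memory_size = 0x10000,		/* page aligned, too */
		.userspace_addr = (unsigned long)mem,
	};

	if (ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region) < 0)
		perror("KVM_SET_USER_MEMORY_REGION");
	else
		printf("slot 0 mapped\n");
	return 0;
}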
+int kvm_get_dirty_log(struct kvm *kvm,
+ struct kvm_dirty_log *log, int *is_dirty)
+ struct kvm_memory_slot *memslot;
+ unsigned long any = 0;
+ if (log->slot >= KVM_MEMORY_SLOTS)
+ memslot = &kvm->memslots[log->slot];
+ if (!memslot->dirty_bitmap)
+ n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+ for (i = 0; !any && i < n/sizeof(long); ++i)
+ any = memslot->dirty_bitmap[i];
+ if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
+int is_error_page(struct page *page)
+ return page == bad_page;
+EXPORT_SYMBOL_GPL(is_error_page);
+static inline unsigned long bad_hva(void)
+ return PAGE_OFFSET;
+int kvm_is_error_hva(unsigned long addr)
+ return addr == bad_hva();
+EXPORT_SYMBOL_GPL(kvm_is_error_hva);
+static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+ for (i = 0; i < kvm->nmemslots; ++i) {
+ struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ if (gfn >= memslot->base_gfn
+ && gfn < memslot->base_gfn + memslot->npages)
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+ gfn = unalias_gfn(kvm, gfn);
+ return __gfn_to_memslot(kvm, gfn);
+int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
+ gfn = unalias_gfn(kvm, gfn);
+ for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+ struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ if (gfn >= memslot->base_gfn
+ && gfn < memslot->base_gfn + memslot->npages)
+EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
+static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+ struct kvm_memory_slot *slot;
+ gfn = unalias_gfn(kvm, gfn);
+ slot = __gfn_to_memslot(kvm, gfn);
+ return bad_hva();
+ return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
+ * Requires current->mm->mmap_sem to be held
+static struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn)
+ struct page *page[1];
+ unsigned long addr;
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr)) {
+ get_page(bad_page);
+ npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
+ if (npages != 1) {
+ get_page(bad_page);
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
+ struct page *page;
+ down_read(&current->mm->mmap_sem);
+ page = __gfn_to_page(kvm, gfn);
+ up_read(&current->mm->mmap_sem);
+EXPORT_SYMBOL_GPL(gfn_to_page);
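gfn_to_hva() above is pure slot arithmetic: locate the memslot covering the frame, then index into its userspace mapping; a miss yields the bad_hva()/PAGE_OFFSET sentinel. Worked through with made-up numbers:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)

int main(void)
{
	/* illustrative slot: guest frames 0x100..0x1ff backed at this hva */
	uint64_t base_gfn = 0x100, npages = 0x100;
	uint64_t userspace_addr = 0x7f0000000000UL;
	uint64_t gfn = 0x123;

	if (gfn >= base_gfn && gfn < base_gfn + npages)
		printf("hva = %#llx\n", (unsigned long long)
		       (userspace_addr + (gfn - base_gfn) * PAGE_SIZE));
	else
		printf("bad_hva()\n");
	return 0;
}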
+void kvm_release_page_clean(struct page *page)
+EXPORT_SYMBOL_GPL(kvm_release_page_clean);
+void kvm_release_page_dirty(struct page *page)
+ if (!PageReserved(page))
+ SetPageDirty(page);
+EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
+static int next_segment(unsigned long len, int offset)
+ if (len > PAGE_SIZE - offset)
+ return PAGE_SIZE - offset;
+int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+ unsigned long addr;
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ r = copy_from_user(data, (void __user *)addr + offset, len);
+EXPORT_SYMBOL_GPL(kvm_read_guest_page);
+int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int offset = offset_in_page(gpa);
+ while ((seg = next_segment(len, offset)) != 0) {
+ ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
+EXPORT_SYMBOL_GPL(kvm_read_guest);
+int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
+ int offset, int len)
+ unsigned long addr;
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ r = copy_to_user((void __user *)addr + offset, data, len);
+ mark_page_dirty(kvm, gfn);
+EXPORT_SYMBOL_GPL(kvm_write_guest_page);
+int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
+ unsigned long len)
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int offset = offset_in_page(gpa);
+ while ((seg = next_segment(len, offset)) != 0) {
+ ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
+int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
+ return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
+EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
+int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int offset = offset_in_page(gpa);
+ while ((seg = next_segment(len, offset)) != 0) {
+ ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
+EXPORT_SYMBOL_GPL(kvm_clear_guest);
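kvm_read_guest(), kvm_write_guest() and kvm_clear_guest() share one shape: next_segment() caps each step at the end of the current guest page, and the loop advances gfn and offset until len is consumed, so a single call may touch several pages. The chunking in isolation:

#include <stdio.h>

#define PAGE_SIZE 4096

static int next_segment(unsigned long len, int offset)
{
	if (len > PAGE_SIZE - offset)
		return PAGE_SIZE - offset;	/* stop at the page boundary */
	else
		return len;
}

int main(void)
{
	/* a 10000-byte access starting 100 bytes into guest frame 5 */
	unsigned long len = 10000, gfn = 5;
	int offset = 100, seg;

	while ((seg = next_segment(len, offset)) != 0) {
		printf("gfn %lu: offset %d, %d bytes\n", gfn, offset, seg);
		len -= seg;
		offset = 0;	/* only the first chunk can be misaligned */
		++gfn;
	}
	return 0;
}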
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
+ struct kvm_memory_slot *memslot;
+ gfn = unalias_gfn(kvm, gfn);
+ memslot = __gfn_to_memslot(kvm, gfn);
+ if (memslot && memslot->dirty_bitmap) {
+ unsigned long rel_gfn = gfn - memslot->base_gfn;
+ if (!test_bit(rel_gfn, memslot->dirty_bitmap))
+ set_bit(rel_gfn, memslot->dirty_bitmap);
+ * The vCPU has executed a HLT instruction with in-kernel mode enabled.
+void kvm_vcpu_block(struct kvm_vcpu *vcpu)
+ DECLARE_WAITQUEUE(wait, current);
+ add_wait_queue(&vcpu->wq, &wait);
+ * We will block until either an interrupt or a signal wakes us up
+ while (!kvm_cpu_has_interrupt(vcpu)
+ && !signal_pending(current)
+ && !kvm_arch_vcpu_runnable(vcpu)) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&vcpu->wq, &wait);
+void kvm_resched(struct kvm_vcpu *vcpu)
+ if (!need_resched())
+EXPORT_SYMBOL_GPL(kvm_resched);
+static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+ struct kvm_vcpu *vcpu = vma->vm_file->private_data;
+ struct page *page;
+ if (vmf->pgoff == 0)
+ page = virt_to_page(vcpu->run);
+ else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
+ page = virt_to_page(vcpu->arch.pio_data);
+ return VM_FAULT_SIGBUS;
+ vmf->page = page;
+static struct vm_operations_struct kvm_vcpu_vm_ops = {
+ .fault = kvm_vcpu_fault,
+static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
+ vma->vm_ops = &kvm_vcpu_vm_ops;
+static int kvm_vcpu_release(struct inode *inode, struct file *filp)
+ struct kvm_vcpu *vcpu = filp->private_data;
+ fput(vcpu->kvm->filp);
+static struct file_operations kvm_vcpu_fops = {
+ .release = kvm_vcpu_release,
+ .unlocked_ioctl = kvm_vcpu_ioctl,
+ .compat_ioctl = kvm_vcpu_ioctl,
+ .mmap = kvm_vcpu_mmap,
+ * Allocates an inode for the vcpu.
+static int create_vcpu_fd(struct kvm_vcpu *vcpu)
+ struct inode *inode;
+ struct file *file;
+ r = anon_inode_getfd(&fd, &inode, &file,
+ "kvm-vcpu", &kvm_vcpu_fops, vcpu);
+ atomic_inc(&vcpu->kvm->filp->f_count);
+ * Creates some virtual cpus. Good luck creating more than one.
+static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
+ struct kvm_vcpu *vcpu;
+ if (!valid_vcpu(n))
+ vcpu = kvm_arch_vcpu_create(kvm, n);
+ if (IS_ERR(vcpu))
+ return PTR_ERR(vcpu);
+ preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
+ r = kvm_arch_vcpu_setup(vcpu);
+ goto vcpu_destroy;
+ mutex_lock(&kvm->lock);
+ if (kvm->vcpus[n]) {
+ mutex_unlock(&kvm->lock);
+ goto vcpu_destroy;
+ kvm->vcpus[n] = vcpu;
+ mutex_unlock(&kvm->lock);
+ /* Now it's all set up, let userspace reach it */
+ r = create_vcpu_fd(vcpu);
+ mutex_lock(&kvm->lock);
+ kvm->vcpus[n] = NULL;
+ mutex_unlock(&kvm->lock);
+ kvm_arch_vcpu_destroy(vcpu);
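From userspace the path above is KVM_CREATE_VCPU on the vm fd, then an mmap() of the returned vcpu fd: page 0 is the kvm_run area served by kvm_vcpu_fault(), and on x86 the pio_data page sits behind it, which is why KVM_GET_VCPU_MMAP_SIZE reports 2 * PAGE_SIZE later in this file. A minimal sketch, error handling elided:

#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);	/* vcpu id 0 */
	int size = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
	struct kvm_run *run = mmap(NULL, size, PROT_READ | PROT_WRITE,
				   MAP_SHARED, vcpu, 0);

	printf("mmap size %d, exit_reason %u\n", size, run->exit_reason);
	return 0;
}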
+static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
+ sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
+ vcpu->sigset_active = 1;
+ vcpu->sigset = *sigset;
+ vcpu->sigset_active = 0;
+static long kvm_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ if (vcpu->kvm->mm != current->mm)
+ r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
+ case KVM_GET_REGS: {
+ struct kvm_regs kvm_regs;
+ memset(&kvm_regs, 0, sizeof kvm_regs);
+ r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
+ if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
+ case KVM_SET_REGS: {
+ struct kvm_regs kvm_regs;
+ if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
+ r = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
+ case KVM_GET_SREGS: {
+ struct kvm_sregs kvm_sregs;
+ memset(&kvm_sregs, 0, sizeof kvm_sregs);
+ r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
+ if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
+ case KVM_SET_SREGS: {
+ struct kvm_sregs kvm_sregs;
+ if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
+ r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
+ case KVM_TRANSLATE: {
+ struct kvm_translation tr;
+ if (copy_from_user(&tr, argp, sizeof tr))
+ r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+ if (copy_to_user(argp, &tr, sizeof tr))
+ case KVM_DEBUG_GUEST: {
+ struct kvm_debug_guest dbg;
+ if (copy_from_user(&dbg, argp, sizeof dbg))
+ r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
+ case KVM_SET_SIGNAL_MASK: {
+ struct kvm_signal_mask __user *sigmask_arg = argp;
+ struct kvm_signal_mask kvm_sigmask;
+ sigset_t sigset, *p;
+ if (copy_from_user(&kvm_sigmask, argp,
+ sizeof kvm_sigmask))
+ if (kvm_sigmask.len != sizeof sigset)
+ if (copy_from_user(&sigset, sigmask_arg->sigset,
+ r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
+ case KVM_GET_FPU: {
+ struct kvm_fpu fpu;
+ memset(&fpu, 0, sizeof fpu);
+ r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
+ if (copy_to_user(argp, &fpu, sizeof fpu))
+ case KVM_SET_FPU: {
+ struct kvm_fpu fpu;
+ if (copy_from_user(&fpu, argp, sizeof fpu))
+ r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
+ r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
+static long kvm_vm_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+ struct kvm *kvm = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ if (kvm->mm != current->mm)
+ case KVM_CREATE_VCPU:
+ r = kvm_vm_ioctl_create_vcpu(kvm, arg);
+ case KVM_SET_USER_MEMORY_REGION: {
+ struct kvm_userspace_memory_region kvm_userspace_mem;
+ if (copy_from_user(&kvm_userspace_mem, argp,
+ sizeof kvm_userspace_mem))
+ r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
+ case KVM_GET_DIRTY_LOG: {
+ struct kvm_dirty_log log;
+ if (copy_from_user(&log, argp, sizeof log))
+ r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
+ r = kvm_arch_vm_ioctl(filp, ioctl, arg);
+static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+ struct kvm *kvm = vma->vm_file->private_data;
+ struct page *page;
+ if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
+ return VM_FAULT_SIGBUS;
+ /* current->mm->mmap_sem is already held so call lockless version */
+ page = __gfn_to_page(kvm, vmf->pgoff);
+ if (is_error_page(page)) {
+ kvm_release_page_clean(page);
+ return VM_FAULT_SIGBUS;
+ vmf->page = page;
+static struct vm_operations_struct kvm_vm_vm_ops = {
+ .fault = kvm_vm_fault,
+static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
+ vma->vm_ops = &kvm_vm_vm_ops;
+static struct file_operations kvm_vm_fops = {
+ .release = kvm_vm_release,
+ .unlocked_ioctl = kvm_vm_ioctl,
+ .compat_ioctl = kvm_vm_ioctl,
+ .mmap = kvm_vm_mmap,
+static int kvm_dev_ioctl_create_vm(void)
+ struct inode *inode;
+ struct file *file;
+ kvm = kvm_create_vm();
+ return PTR_ERR(kvm);
+ r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
+ kvm_destroy_vm(kvm);
+ kvm->filp = file;
+static long kvm_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+ void __user *argp = (void __user *)arg;
+ long r = -EINVAL;
+ case KVM_GET_API_VERSION:
+ r = KVM_API_VERSION;
+ case KVM_CREATE_VM:
+ r = kvm_dev_ioctl_create_vm();
+ case KVM_CHECK_EXTENSION:
+ r = kvm_dev_ioctl_check_extension((long)argp);
+ case KVM_GET_VCPU_MMAP_SIZE:
+ r = 2 * PAGE_SIZE;
+ return kvm_arch_dev_ioctl(filp, ioctl, arg);
+static struct file_operations kvm_chardev_ops = {
+ .unlocked_ioctl = kvm_dev_ioctl,
+ .compat_ioctl = kvm_dev_ioctl,
+static struct miscdevice kvm_dev = {
+ &kvm_chardev_ops,
+static void hardware_enable(void *junk)
+ int cpu = raw_smp_processor_id();
+ if (cpu_isset(cpu, cpus_hardware_enabled))
+ cpu_set(cpu, cpus_hardware_enabled);
+ kvm_arch_hardware_enable(NULL);
+static void hardware_disable(void *junk)
+ int cpu = raw_smp_processor_id();
+ if (!cpu_isset(cpu, cpus_hardware_enabled))
+ cpu_clear(cpu, cpus_hardware_enabled);
+ decache_vcpus_on_cpu(cpu);
+ kvm_arch_hardware_disable(NULL);
+static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
+ int cpu = (long)v;
+ val &= ~CPU_TASKS_FROZEN;
+ printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
+ hardware_disable(NULL);
+ case CPU_UP_CANCELED:
+ printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
+ smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
+ printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
+ smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
+ return NOTIFY_OK;
+static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
+ if (val == SYS_RESTART) {
+ * Some (well, at least mine) BIOSes hang on reboot if
+ * in vmx root mode.
+ printk(KERN_INFO "kvm: exiting hardware virtualization\n");
+ on_each_cpu(hardware_disable, NULL, 0, 1);
+ return NOTIFY_OK;
+static struct notifier_block kvm_reboot_notifier = {
+ .notifier_call = kvm_reboot,
+void kvm_io_bus_init(struct kvm_io_bus *bus)
+ memset(bus, 0, sizeof(*bus));
+void kvm_io_bus_destroy(struct kvm_io_bus *bus)
+ for (i = 0; i < bus->dev_count; i++) {
+ struct kvm_io_device *pos = bus->devs[i];
+ kvm_iodevice_destructor(pos);
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
+ for (i = 0; i < bus->dev_count; i++) {
+ struct kvm_io_device *pos = bus->devs[i];
+ if (pos->in_range(pos, addr))
+void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
+ BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
+ bus->devs[bus->dev_count++] = dev;
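kvm_io_bus_find_dev() is a linear scan over the registered devices, asking each in_range() callback whether it claims the address; MMIO and PIO dispatch then call through the winner's read/write hooks. The same lookup standalone (the six-entry bound mirrors NR_IOBUS_DEVS as an assumption, and the ioapic range is taken from ioapic.h above):

#include <stdio.h>
#include <stdint.h>

#define NR_IOBUS_DEVS 6	/* assumed value of the kvm_host.h constant */

typedef uint64_t gpa_t;

struct io_device {
	int (*in_range)(struct io_device *this, gpa_t addr);
	const char *name;
};

struct io_bus {
	int dev_count;
	struct io_device *devs[NR_IOBUS_DEVS];
};

static struct io_device *bus_find_dev(struct io_bus *bus, gpa_t addr)
{
	int i;

	for (i = 0; i < bus->dev_count; i++)	/* first claimant wins */
		if (bus->devs[i]->in_range(bus->devs[i], addr))
			return bus->devs[i];
	return NULL;
}

static int ioapic_range(struct io_device *d, gpa_t a)
{
	return a >= 0xfec00000 && a < 0xfec00000 + 0x100;
}

int main(void)
{
	struct io_device ioapic = { ioapic_range, "ioapic" };
	struct io_bus bus = { 0, { NULL } };
	struct io_device *hit;

	bus.devs[bus.dev_count++] = &ioapic;	/* as kvm_io_bus_register_dev() */
	hit = bus_find_dev(&bus, 0xfec00010);
	printf("%s\n", hit ? hit->name : "no device");
	return 0;
}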
+static struct notifier_block kvm_cpu_notifier = {
+ .notifier_call = kvm_cpu_hotplug,
+ .priority = 20, /* must be > scheduler priority */
+static u64 vm_stat_get(void *_offset)
+ unsigned offset = (long)_offset;
+ spin_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ total += *(u32 *)((void *)kvm + offset);
+ spin_unlock(&kvm_lock);
+DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
+static u64 vcpu_stat_get(void *_offset)
+ unsigned offset = (long)_offset;
+ struct kvm_vcpu *vcpu;
+ spin_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+ vcpu = kvm->vcpus[i];
+ total += *(u32 *)((void *)vcpu + offset);
+ spin_unlock(&kvm_lock);
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
+static struct file_operations *stat_fops[] = {
+ [KVM_STAT_VCPU] = &vcpu_stat_fops,
+ [KVM_STAT_VM] = &vm_stat_fops,
+static void kvm_init_debug(void)
+ struct kvm_stats_debugfs_item *p;
+ debugfs_dir = debugfs_create_dir("kvm", NULL);
+ for (p = debugfs_entries; p->name; ++p)
+ p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
+ (void *)(long)p->offset,
+ stat_fops[p->kind]);
+static void kvm_exit_debug(void)
+ struct kvm_stats_debugfs_item *p;
+ for (p = debugfs_entries; p->name; ++p)
+ debugfs_remove(p->dentry);
+ debugfs_remove(debugfs_dir);
+static int kvm_suspend(struct sys_device *dev, pm_message_t state)
+ hardware_disable(NULL);
+static int kvm_resume(struct sys_device *dev)
+ hardware_enable(NULL);
+static struct sysdev_class kvm_sysdev_class = {
+ set_kset_name("kvm"),
+ .suspend = kvm_suspend,
+ .resume = kvm_resume,
+static struct sys_device kvm_sysdev = {
+ .cls = &kvm_sysdev_class,
+struct page *bad_page;
+struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
+ return container_of(pn, struct kvm_vcpu, preempt_notifier);
+static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
+ struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
+ kvm_arch_vcpu_load(vcpu, cpu);
+static void kvm_sched_out(struct preempt_notifier *pn,
+ struct task_struct *next)
+ struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
+ kvm_arch_vcpu_put(vcpu);
+int kvm_init(void *opaque, unsigned int vcpu_size,
+ struct module *module)
+ kvm_init_debug();
+ r = kvm_arch_init(opaque);
+ bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (bad_page == NULL) {
+ r = kvm_arch_hardware_setup();
+ for_each_online_cpu(cpu) {
+ smp_call_function_single(cpu,
+ kvm_arch_check_processor_compat,
+ on_each_cpu(hardware_enable, NULL, 0, 1);
+ r = register_cpu_notifier(&kvm_cpu_notifier);
+ register_reboot_notifier(&kvm_reboot_notifier);
+ r = sysdev_class_register(&kvm_sysdev_class);
+ r = sysdev_register(&kvm_sysdev);
+ /* A kmem cache lets us meet the alignment requirements of fx_save. */
+ kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
+ __alignof__(struct kvm_vcpu),
+ if (!kvm_vcpu_cache) {
+ kvm_chardev_ops.owner = module;
+ r = misc_register(&kvm_dev);
+ printk(KERN_ERR "kvm: misc device register failed\n");
+ kvm_preempt_ops.sched_in = kvm_sched_in;
+ kvm_preempt_ops.sched_out = kvm_sched_out;
+ kmem_cache_destroy(kvm_vcpu_cache);
+ sysdev_unregister(&kvm_sysdev);
+ sysdev_class_unregister(&kvm_sysdev_class);
+ unregister_reboot_notifier(&kvm_reboot_notifier);
+ unregister_cpu_notifier(&kvm_cpu_notifier);
+ on_each_cpu(hardware_disable, NULL, 0, 1);
+ kvm_arch_hardware_unsetup();
+ __free_page(bad_page);
+ kvm_exit_debug();
+EXPORT_SYMBOL_GPL(kvm_init);
+void kvm_exit(void)
+ misc_deregister(&kvm_dev);
+ kmem_cache_destroy(kvm_vcpu_cache);
+ sysdev_unregister(&kvm_sysdev);
+ sysdev_class_unregister(&kvm_sysdev_class);
+ unregister_reboot_notifier(&kvm_reboot_notifier);
+ unregister_cpu_notifier(&kvm_cpu_notifier);
+ on_each_cpu(hardware_disable, NULL, 0, 1);
+ kvm_arch_hardware_unsetup();
+ kvm_exit_debug();
+ __free_page(bad_page);
+EXPORT_SYMBOL_GPL(kvm_exit);
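kvm_init() and kvm_exit() are the entry points the per-arch backends (kvm-intel, kvm-amd) invoke from their own module hooks, passing their ops table through the opaque pointer and the size of their vcpu container so the kmem cache is sized for the embedding structure. A skeleton of such a consumer; struct vcpu_foo and foo_ops are placeholders, not names from this patch:

#include <linux/module.h>
#include <linux/kvm_host.h>

/* Placeholder backend: a real one embeds struct kvm_vcpu in its own
 * vcpu container and hands kvm_init() a populated ops table. */
struct vcpu_foo {
	struct kvm_vcpu vcpu;
	/* arch-specific register state would follow */
};

static void *foo_ops;	/* stand-in for the arch ops passed as 'opaque' */

static int __init foo_init(void)
{
	return kvm_init(foo_ops, sizeof(struct vcpu_foo), THIS_MODULE);
}

static void __exit foo_exit(void)
{
	kvm_exit();
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");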