/*
 * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
 *
 * Authors:
 *    Paul Mackerras <paulus@au1.ibm.com>
 *    Alexander Graf <agraf@suse.de>
 *    Kevin Wolf <mail@kevin-wolf.de>
 *
 * Description: KVM functions specific to running on Book 3S
 * processors in hypervisor mode (specifically POWER7 and later).
 *
 * This file is derived from arch/powerpc/kvm/book3s.c,
 * by Alexander Graf <agraf@suse.de>.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 */

#include <linux/kvm_host.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/preempt.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/anon_inodes.h>
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>

#include <asm/cputable.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu_context.h>
#include <asm/lppaca.h>
#include <asm/processor.h>
#include <asm/cputhreads.h>
#include <asm/hvcall.h>
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>

/*
 * For now, limit memory to 64GB and require it to be large pages.
 * This value is chosen because it makes the ram_pginfo array be
 * 64kB in size, which is about as large as we want to be trying
 * to allocate with kmalloc.
 */
#define MAX_MEM_ORDER		36

#define LARGE_PAGE_ORDER	24	/* 16MB pages */

/* #define EXIT_DEBUG */
/* #define EXIT_DEBUG_SIMPLE */
/* #define EXIT_DEBUG_INT */

static void kvmppc_end_cede(struct kvm_vcpu *vcpu);

void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
	local_paca->kvm_hstate.kvm_vcpu = vcpu;
	local_paca->kvm_hstate.kvm_vcore = vcpu->arch.vcore;

void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)

void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
	vcpu->arch.shregs.msr = msr;
	kvmppc_end_cede(vcpu);

void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)

void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
	pr_err("pc = %.16lx msr = %.16llx trap = %x\n",
		vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap);
	for (r = 0; r < 16; ++r)
		pr_err("r%2d = %.16lx r%d = %.16lx\n",
			r, kvmppc_get_gpr(vcpu, r),
			r+16, kvmppc_get_gpr(vcpu, r+16));
	pr_err("ctr = %.16lx lr = %.16lx\n",
		vcpu->arch.ctr, vcpu->arch.lr);
	pr_err("srr0 = %.16llx srr1 = %.16llx\n",
		vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
	pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
		vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
	pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
		vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
	pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n",
		vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr);
	pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
	pr_err("fault dar = %.16lx dsisr = %.8x\n",
		vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
	for (r = 0; r < vcpu->arch.slb_max; ++r)
		pr_err(" ESID = %.16llx VSID = %.16llx\n",
			vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
		vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1,
		vcpu->arch.last_inst);

struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
	struct kvm_vcpu *v, *ret = NULL;

	mutex_lock(&kvm->lock);
	kvm_for_each_vcpu(r, v, kvm) {
		if (v->vcpu_id == id) {
	mutex_unlock(&kvm->lock);

static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
	vpa->shared_proc = 1;
	vpa->yield_count = 1;
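
/*
 * Handle the H_REGISTER_VPA hypercall for a target vcpu.  Flag values
 * 1-3 register the VPA, dispatch trace log (DTL) and SLB shadow buffer
 * respectively, and 5-7 unregister them.  The logical address passed by
 * the guest is translated to a real address via the ram_pginfo array.
 */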
static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
				       unsigned long vcpuid, unsigned long vpa)
	struct kvm *kvm = vcpu->kvm;
	unsigned long pg_index, ra, len;
	unsigned long pg_offset;
	struct kvm_vcpu *tvcpu;

	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
	if (flags == 0 || flags == 4)
	/* registering new area; convert logical addr to real */
	pg_index = vpa >> kvm->arch.ram_porder;
	pg_offset = vpa & (kvm->arch.ram_psize - 1);
	if (pg_index >= kvm->arch.ram_npages)
	if (kvm->arch.ram_pginfo[pg_index].pfn == 0)
	ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT;
	len = *(unsigned short *)(va + 4);
	len = *(unsigned int *)(va + 4);
	if (pg_offset + len > kvm->arch.ram_psize)
	case 1:		/* register VPA */
		tvcpu->arch.vpa = va;
	case 2:		/* register DTL */
		if (!tvcpu->arch.vpa)
		tvcpu->arch.dtl = va;
		tvcpu->arch.dtl_end = va + len;
	case 3:		/* register SLB shadow buffer */
		if (!tvcpu->arch.vpa)
		tvcpu->arch.slb_shadow = va;
		len = (len - 16) / 16;
		tvcpu->arch.slb_shadow = va;
	case 5:		/* unregister VPA */
		if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
		tvcpu->arch.vpa = NULL;
	case 6:		/* unregister DTL */
		tvcpu->arch.dtl = NULL;
	case 7:		/* unregister SLB shadow buffer */
		tvcpu->arch.slb_shadow = NULL;
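
/*
 * Handle a PAPR hypercall passed up from the real-mode handlers.  The
 * hcall number is taken from GPR3; the visible cases look up the target
 * vcpu, prod (wake) it if it has ceded, or register per-vcpu areas via
 * do_h_register_vpa().  The result is returned to the guest in GPR3.
 */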
int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
	unsigned long req = kvmppc_get_gpr(vcpu, 3);
	unsigned long target, ret = H_SUCCESS;
	struct kvm_vcpu *tvcpu;

		target = kvmppc_get_gpr(vcpu, 4);
		tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
		tvcpu->arch.prodded = 1;
		if (vcpu->arch.ceded) {
			if (waitqueue_active(&vcpu->wq)) {
				wake_up_interruptible(&vcpu->wq);
				vcpu->stat.halt_wakeup++;
		ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
					kvmppc_get_gpr(vcpu, 5),
					kvmppc_get_gpr(vcpu, 6));
	kvmppc_set_gpr(vcpu, 3, ret);
	vcpu->arch.hcall_needed = 0;
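
/*
 * First-level handling of a guest exit in the host: update exit
 * statistics, then dispatch on the trap number that brought us out of
 * the guest (decrementer, external interrupt, program check,
 * syscall/hcall, HV storage interrupts, or emulation assist).
 */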
static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
			      struct task_struct *tsk)
	vcpu->stat.sum_exits++;

	run->exit_reason = KVM_EXIT_UNKNOWN;
	run->ready_for_interrupt_injection = 1;
	switch (vcpu->arch.trap) {
	/* We're good on these - the host merely wanted to get our attention */
	case BOOK3S_INTERRUPT_HV_DECREMENTER:
		vcpu->stat.dec_exits++;
	case BOOK3S_INTERRUPT_EXTERNAL:
		vcpu->stat.ext_intr_exits++;
	case BOOK3S_INTERRUPT_PERFMON:
	case BOOK3S_INTERRUPT_PROGRAM:
		/*
		 * Normally program interrupts are delivered directly
		 * to the guest by the hardware, but we can get here
		 * as a result of a hypervisor emulation interrupt
		 * (e40) getting turned into a 700 by BML RTAS.
		 */
		flags = vcpu->arch.shregs.msr & 0x1f0000ull;
		kvmppc_core_queue_program(vcpu, flags);
	case BOOK3S_INTERRUPT_SYSCALL:
		/* hcall - punt to userspace */
		if (vcpu->arch.shregs.msr & MSR_PR) {
			/* sc 1 from userspace - reflect to guest syscall */
			kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL);
		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
		for (i = 0; i < 9; ++i)
			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
		run->exit_reason = KVM_EXIT_PAPR_HCALL;
		vcpu->arch.hcall_needed = 1;
	/*
	 * We get these next two if the guest does a bad real-mode access,
	 * as we have enabled VRMA (virtualized real mode area) mode in the
	 * LPCR. We just generate an appropriate DSI/ISI to the guest.
	 */
	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
		vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr;
		vcpu->arch.shregs.dar = vcpu->arch.fault_dar;
		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
	case BOOK3S_INTERRUPT_H_INST_STORAGE:
		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
	/*
	 * This occurs if the guest executes an illegal instruction.
	 * We just generate a program interrupt to the guest, since
	 * we don't emulate any guest instructions at this stage.
	 */
	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
		kvmppc_core_queue_program(vcpu, 0x80000);
		kvmppc_dump_regs(vcpu);
		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
			vcpu->arch.trap, kvmppc_get_pc(vcpu),
			vcpu->arch.shregs.msr);

int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
				  struct kvm_sregs *sregs)
	sregs->pvr = vcpu->arch.pvr;

	memset(sregs, 0, sizeof(struct kvm_sregs));
	for (i = 0; i < vcpu->arch.slb_max; i++) {
		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;

int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
				  struct kvm_sregs *sregs)
	kvmppc_set_pvr(vcpu, sregs->pvr);

	for (i = 0; i < vcpu->arch.slb_nr; i++) {
		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
	vcpu->arch.slb_max = j;

int kvmppc_core_check_processor_compat(void)
	if (cpu_has_feature(CPU_FTR_HVMODE))
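
/*
 * Create a vcpu and attach it to its virtual core (vcore).  Vcpus are
 * grouped by guest core number (id / threads_per_core); the first vcpu
 * in a core allocates the kvmppc_vcore structure that the threads of
 * that core share.
 */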
struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
	struct kvm_vcpu *vcpu;
	struct kvmppc_vcore *vcore;

	core = id / threads_per_core;
	if (core >= KVM_MAX_VCORES)

	vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);

	err = kvm_vcpu_init(vcpu, kvm, id);

	vcpu->arch.shared = &vcpu->arch.shregs;
	vcpu->arch.last_cpu = -1;
	vcpu->arch.mmcr[0] = MMCR0_FC;
	vcpu->arch.ctrl = CTRL_RUNLATCH;
	/* default to host PVR, since we can't spoof it */
	vcpu->arch.pvr = mfspr(SPRN_PVR);
	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);

	kvmppc_mmu_book3s_hv_init(vcpu);

	/*
	 * We consider the vcpu stopped until we see the first run ioctl for it.
	 */
	vcpu->arch.state = KVMPPC_VCPU_STOPPED;

	init_waitqueue_head(&vcpu->arch.cpu_run);

	mutex_lock(&kvm->lock);
	vcore = kvm->arch.vcores[core];
		vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
		INIT_LIST_HEAD(&vcore->runnable_threads);
		spin_lock_init(&vcore->lock);
		init_waitqueue_head(&vcore->wq);
		kvm->arch.vcores[core] = vcore;
	mutex_unlock(&kvm->lock);

	spin_lock(&vcore->lock);
	++vcore->num_threads;
	spin_unlock(&vcore->lock);
	vcpu->arch.vcore = vcore;

	vcpu->arch.cpu_type = KVM_CPU_3S_64;
	kvmppc_sanity_check(vcpu);

void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
	kvm_vcpu_uninit(vcpu);

static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
	unsigned long dec_nsec, now;

	if (now > vcpu->arch.dec_expires) {
		/* decrementer has already gone negative */
		kvmppc_core_queue_dec(vcpu);
		kvmppc_core_deliver_interrupts(vcpu);
	dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
	hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
	vcpu->arch.timer_running = 1;

static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
	vcpu->arch.ceded = 0;
	if (vcpu->arch.timer_running) {
		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
		vcpu->arch.timer_running = 0;

extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
extern void xics_wake_cpu(int cpu);

static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
				   struct kvm_vcpu *vcpu)
	if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
	/* decrement the physical thread id of each following vcpu */
	list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
	list_del(&vcpu->arch.run_list);

static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
	struct paca_struct *tpaca;
	struct kvmppc_vcore *vc = vcpu->arch.vcore;

	if (vcpu->arch.timer_running) {
		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
		vcpu->arch.timer_running = 0;
	cpu = vc->pcpu + vcpu->arch.ptid;
	tpaca->kvm_hstate.kvm_vcpu = vcpu;
	tpaca->kvm_hstate.kvm_vcore = vc;
	tpaca->kvm_hstate.napping = 0;
	vcpu->cpu = vc->pcpu;
#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
	if (vcpu->arch.ptid) {
		tpaca->cpu_start = 0x80;

static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
	while (vc->nap_count < vc->n_woken) {
		if (++i >= 1000000) {
			pr_err("kvmppc_wait_for_nap timeout %d %d\n",
				vc->nap_count, vc->n_woken);

/*
 * Check that we are on thread 0 and that any other threads in
 * this core are off-line.
 */
static int on_primary_thread(void)
	int cpu = smp_processor_id();
	int thr = cpu_thread_in_core(cpu);

	while (++thr < threads_per_core)
		if (cpu_online(cpu + thr))

/*
 * Run a set of guest threads on a physical core.
 * Called with vc->lock held.
 */
static int kvmppc_run_core(struct kvmppc_vcore *vc)
	struct kvm_vcpu *vcpu, *vcpu0, *vnext;

	/* don't start if any threads have a signal pending */
	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
		if (signal_pending(vcpu->arch.run_task))

	/*
	 * Make sure we are running on thread 0, and that
	 * secondary threads are offline.
	 * XXX we should also block attempts to bring any
	 * secondary threads online.
	 */
	if (threads_per_core > 1 && !on_primary_thread()) {
		list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
			vcpu->arch.ret = -EBUSY;

	/*
	 * Assign physical thread IDs, first to non-ceded vcpus
	 * and then to ceded ones.
	 */
	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
		if (!vcpu->arch.ceded) {
			vcpu->arch.ptid = ptid++;
		return 0;	/* nothing to run */
	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
		if (vcpu->arch.ceded)
			vcpu->arch.ptid = ptid++;

	vc->entry_exit_count = 0;
	vc->vcore_state = VCORE_RUNNING;
	vc->pcpu = smp_processor_id();
	vc->napping_threads = 0;
	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
		kvmppc_start_thread(vcpu);

	spin_unlock(&vc->lock);

	__kvmppc_vcore_entry(NULL, vcpu0);

	spin_lock(&vc->lock);
	/* disable sending of IPIs on virtual external irqs */
	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
	/* wait for secondary threads to finish writing their state to memory */
	if (vc->nap_count < vc->n_woken)
		kvmppc_wait_for_nap(vc);
	/* prevent other vcpu threads from doing kvmppc_start_thread() now */
	vc->vcore_state = VCORE_EXITING;
	spin_unlock(&vc->lock);

	/* make sure updates to secondary vcpu structs are visible now */

	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
		/* cancel pending dec exception if dec is positive */
		if (now < vcpu->arch.dec_expires &&
		    kvmppc_core_pending_dec(vcpu))
			kvmppc_core_dequeue_dec(vcpu);

		ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
					 vcpu->arch.run_task);

		vcpu->arch.ret = ret;

		if (vcpu->arch.ceded) {
			if (ret != RESUME_GUEST)
				kvmppc_end_cede(vcpu);
				kvmppc_set_timer(vcpu);

	spin_lock(&vc->lock);

	vc->vcore_state = VCORE_INACTIVE;
	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
		if (vcpu->arch.ret != RESUME_GUEST) {
			kvmppc_remove_runnable(vc, vcpu);
			wake_up(&vcpu->arch.cpu_run);

/*
 * Wait for some other vcpu thread to execute us, and
 * wake us up when we need to handle something in the host.
 */
static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
	prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
	finish_wait(&vcpu->arch.cpu_run, &wait);

/*
 * All the vcpus in this vcore are idle, so wait for a decrementer
 * or external interrupt to one of the vcpus. vc->lock is held.
 */
static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
	prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
	vc->vcore_state = VCORE_SLEEPING;
	spin_unlock(&vc->lock);
	list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
		if (!v->arch.ceded || v->arch.pending_exceptions) {
	finish_wait(&vc->wq, &wait);
	spin_lock(&vc->lock);
	vc->vcore_state = VCORE_INACTIVE;

static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
	struct kvmppc_vcore *vc;
	struct kvm_vcpu *v, *vn;

	kvm_run->exit_reason = 0;
	vcpu->arch.ret = RESUME_GUEST;

	/*
	 * Synchronize with other threads in this virtual core
	 */
	vc = vcpu->arch.vcore;
	spin_lock(&vc->lock);
	vcpu->arch.ceded = 0;
	vcpu->arch.run_task = current;
	vcpu->arch.kvm_run = kvm_run;
	prev_state = vcpu->arch.state;
	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
	list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);

	/*
	 * This happens the first time this is called for a vcpu.
	 * If the vcore is already running, we may be able to start
	 * this thread straight away and have it join in.
	 */
	if (prev_state == KVMPPC_VCPU_STOPPED) {
		if (vc->vcore_state == VCORE_RUNNING &&
		    VCORE_EXIT_COUNT(vc) == 0) {
			vcpu->arch.ptid = vc->n_runnable - 1;
			kvmppc_start_thread(vcpu);
	} else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST)

	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
	       !signal_pending(current)) {
		if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) {
			spin_unlock(&vc->lock);
			kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
			spin_lock(&vc->lock);
		list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
			n_ceded += v->arch.ceded;
		if (n_ceded == vc->n_runnable)
			kvmppc_vcore_blocked(vc);

		list_for_each_entry_safe(v, vn, &vc->runnable_threads,
			kvmppc_core_deliver_interrupts(v);
			if (signal_pending(v->arch.run_task)) {
				kvmppc_remove_runnable(vc, v);
				v->stat.signal_exits++;
				v->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
				v->arch.ret = -EINTR;
				wake_up(&v->arch.cpu_run);

	if (signal_pending(current)) {
		if (vc->vcore_state == VCORE_RUNNING ||
		    vc->vcore_state == VCORE_EXITING) {
			spin_unlock(&vc->lock);
			kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
			spin_lock(&vc->lock);
		if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
			kvmppc_remove_runnable(vc, vcpu);
			vcpu->stat.signal_exits++;
			kvm_run->exit_reason = KVM_EXIT_INTR;
			vcpu->arch.ret = -EINTR;

	spin_unlock(&vc->lock);
	return vcpu->arch.ret;
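
/*
 * Top-level entry for running a vcpu on Book3S HV: flush the FP,
 * Altivec and VSX state to the thread struct, run the vcpu via
 * kvmppc_run_vcpu(), handle any PAPR hypercall the guest made before
 * re-entering, and loop while the result is RESUME_GUEST.
 */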
int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
	if (!vcpu->arch.sane) {
		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;

	/* No need to go into the guest when all we'll do is come back out */
	if (signal_pending(current)) {
		run->exit_reason = KVM_EXIT_INTR;

	/* On PPC970, check that we have an RMA region */
	if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))

	flush_fp_to_thread(current);
	flush_altivec_to_thread(current);
	flush_vsx_to_thread(current);
	vcpu->arch.wqp = &vcpu->arch.vcore->wq;

		r = kvmppc_run_vcpu(run, vcpu);

		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
		    !(vcpu->arch.shregs.msr & MSR_PR)) {
			r = kvmppc_pseries_do_hcall(vcpu);
			kvmppc_core_deliver_interrupts(vcpu);
	} while (r == RESUME_GUEST);

static long kvmppc_stt_npages(unsigned long window_size)
	return ALIGN((window_size >> SPAPR_TCE_SHIFT)
		     * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;

static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
	struct kvm *kvm = stt->kvm;

	mutex_lock(&kvm->lock);
	list_del(&stt->list);
	for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
		__free_page(stt->pages[i]);
	mutex_unlock(&kvm->lock);

static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;

	if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
		return VM_FAULT_SIGBUS;

	page = stt->pages[vmf->pgoff];

static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
	.fault = kvm_spapr_tce_fault,

static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
	vma->vm_ops = &kvm_spapr_tce_vm_ops;

static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
	struct kvmppc_spapr_tce_table *stt = filp->private_data;

	release_spapr_tce_table(stt);

static struct file_operations kvm_spapr_tce_fops = {
	.mmap = kvm_spapr_tce_mmap,
	.release = kvm_spapr_tce_release,
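
/*
 * Create a TCE table for a virtual I/O bus (LIOBN): allocate zeroed
 * pages to back the table, add it to the per-VM list, and hand
 * userspace an anonymous-inode fd whose mmap exposes those pages.
 */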
long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
				   struct kvm_create_spapr_tce *args)
	struct kvmppc_spapr_tce_table *stt = NULL;

	/* Check this LIOBN hasn't been previously allocated */
	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
		if (stt->liobn == args->liobn)

	npages = kvmppc_stt_npages(args->window_size);

	stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),

	stt->liobn = args->liobn;
	stt->window_size = args->window_size;

	for (i = 0; i < npages; i++) {
		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);

	mutex_lock(&kvm->lock);
	list_add(&stt->list, &kvm->arch.spapr_tce_tables);
	mutex_unlock(&kvm->lock);

	return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,

	for (i = 0; i < npages; i++)
		__free_page(stt->pages[i]);

/* Work out RMLS (real mode limit selector) field value for a given RMA size.
   Assumes POWER7 or PPC970. */
static inline int lpcr_rmls(unsigned long rma_size)
	case 32ul << 20:	/* 32 MB */
		if (cpu_has_feature(CPU_FTR_ARCH_206))
			return 8;	/* only supported on POWER7 */
	case 64ul << 20:	/* 64 MB */
	case 128ul << 20:	/* 128 MB */
	case 256ul << 20:	/* 256 MB */
	case 1ul << 30:		/* 1 GB */
	case 16ul << 30:	/* 16 GB */
	case 256ul << 30:	/* 256 GB */

static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	struct kvmppc_rma_info *ri = vma->vm_file->private_data;

	if (vmf->pgoff >= ri->npages)
		return VM_FAULT_SIGBUS;

	page = pfn_to_page(ri->base_pfn + vmf->pgoff);

static const struct vm_operations_struct kvm_rma_vm_ops = {
	.fault = kvm_rma_fault,

static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
	vma->vm_flags |= VM_RESERVED;
	vma->vm_ops = &kvm_rma_vm_ops;

static int kvm_rma_release(struct inode *inode, struct file *filp)
	struct kvmppc_rma_info *ri = filp->private_data;

	kvm_release_rma(ri);

static struct file_operations kvm_rma_fops = {
	.mmap = kvm_rma_mmap,
	.release = kvm_rma_release,

long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
	struct kvmppc_rma_info *ri;

	ri = kvm_alloc_rma();

	fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR);
		kvm_release_rma(ri);

	ret->rma_size = ri->npages << PAGE_SHIFT;

static struct page *hva_to_page(unsigned long addr)
	struct page *page[1];

	npages = get_user_pages_fast(addr, 1, 1, page);

	if (unlikely(npages != 1))
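
/*
 * Prepare a guest memory region: only 16MB backing pages are accepted,
 * the region at guest physical address 0 may be one of the preallocated
 * RMAs (in which case the RMLS/RMOR fields of the LPCR, or HID4 on
 * PPC970, are set up), and the pfn of every backing page is recorded in
 * the ram_pginfo array.
 */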
int kvmppc_core_prepare_memory_region(struct kvm *kvm,
				      struct kvm_userspace_memory_region *mem)
	unsigned long psize, porder;
	unsigned long i, npages, totalpages;
	unsigned long pg_ix;
	struct kvmppc_pginfo *pginfo;
	struct kvmppc_rma_info *ri = NULL;

	/* For now, only allow 16MB pages */
	porder = LARGE_PAGE_ORDER;
	psize = 1ul << porder;
	if ((mem->memory_size & (psize - 1)) ||
	    (mem->guest_phys_addr & (psize - 1))) {
		pr_err("bad memory_size=%llx @  %llx\n",
			mem->memory_size, mem->guest_phys_addr);

	npages = mem->memory_size >> porder;
	totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder;

	/* More memory than we have space to track? */
	if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER)))

	/* Do we already have an RMA registered? */
	if (mem->guest_phys_addr == 0 && kvm->arch.rma)

	if (totalpages > kvm->arch.ram_npages)
		kvm->arch.ram_npages = totalpages;

	/* Is this one of our preallocated RMAs? */
	if (mem->guest_phys_addr == 0) {
		struct vm_area_struct *vma;

		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, mem->userspace_addr);
		if (vma && vma->vm_file &&
		    vma->vm_file->f_op == &kvm_rma_fops &&
		    mem->userspace_addr == vma->vm_start)
			ri = vma->vm_file->private_data;
		up_read(&current->mm->mmap_sem);
		if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) {
			pr_err("CPU requires an RMO\n");

		unsigned long rma_size;

		rma_size = ri->npages << PAGE_SHIFT;
		if (rma_size > mem->memory_size)
			rma_size = mem->memory_size;
		rmls = lpcr_rmls(rma_size);
			pr_err("Can't use RMA of 0x%lx bytes\n", rma_size);
		atomic_inc(&ri->use_count);
		kvm->arch.n_rma_pages = rma_size >> porder;

		/* Update LPCR and RMOR */
		lpcr = kvm->arch.lpcr;
		if (cpu_has_feature(CPU_FTR_ARCH_201)) {
			/* PPC970; insert RMLS value (split field) in HID4 */
			lpcr &= ~((1ul << HID4_RMLS0_SH) |
				  (3ul << HID4_RMLS2_SH));
			lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) |
				((rmls & 3) << HID4_RMLS2_SH);
			/* RMOR is also in HID4 */
			lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff)
			lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L);
			lpcr |= rmls << LPCR_RMLS_SH;
			kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
		kvm->arch.lpcr = lpcr;
		pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n",
			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);

	pg_ix = mem->guest_phys_addr >> porder;
	pginfo = kvm->arch.ram_pginfo + pg_ix;
	for (i = 0; i < npages; ++i, ++pg_ix) {
		if (ri && pg_ix < kvm->arch.n_rma_pages) {
			pginfo[i].pfn = ri->base_pfn +
				(pg_ix << (porder - PAGE_SHIFT));
		hva = mem->userspace_addr + (i << porder);
		page = hva_to_page(hva);
			pr_err("oops, no pfn for hva %lx\n", hva);
		/* Check it's a 16MB page */
		if (!PageHead(page) ||
		    compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) {
			pr_err("page at %lx isn't 16MB (o=%d)\n",
				hva, compound_order(page));
		pginfo[i].pfn = page_to_pfn(page);

void kvmppc_core_commit_memory_region(struct kvm *kvm,
				      struct kvm_userspace_memory_region *mem)
	if (mem->guest_phys_addr == 0 && mem->memory_size != 0 &&
		kvmppc_map_vrma(kvm, mem);
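
/*
 * Per-VM initialization: allocate the hashed page table, allocate the
 * ram_pginfo array sized for MAX_MEM_ORDER worth of 16MB pages, and set
 * up the initial LPCR (or HID4 on PPC970) for this guest's LPID.
 */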
int kvmppc_core_init_vm(struct kvm *kvm)
	unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER);

	/* Allocate hashed page table */
	r = kvmppc_alloc_hpt(kvm);

	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);

	kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo),
	if (!kvm->arch.ram_pginfo) {
		pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n",
			npages * sizeof(struct kvmppc_pginfo));

	kvm->arch.ram_npages = 0;
	kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER;
	kvm->arch.ram_porder = LARGE_PAGE_ORDER;
	kvm->arch.rma = NULL;
	kvm->arch.n_rma_pages = 0;

	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);

	if (cpu_has_feature(CPU_FTR_ARCH_201)) {
		/* PPC970; HID4 is effectively the LPCR */
		unsigned long lpid = kvm->arch.lpid;
		kvm->arch.host_lpid = 0;
		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4);
		lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH));
		lpcr |= ((lpid >> 4) << HID4_LPID1_SH) |
			((lpid & 0xf) << HID4_LPID5_SH);
		/* POWER7; init LPCR for virtual RMA mode */
		kvm->arch.host_lpid = mfspr(SPRN_LPID);
		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
		lpcr &= LPCR_PECE | LPCR_LPES;
		lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
			LPCR_VPM0 | LPCR_VRMA_L;
	kvm->arch.lpcr = lpcr;

	kvmppc_free_hpt(kvm);

void kvmppc_core_destroy_vm(struct kvm *kvm)
	struct kvmppc_pginfo *pginfo;

	if (kvm->arch.ram_pginfo) {
		pginfo = kvm->arch.ram_pginfo;
		kvm->arch.ram_pginfo = NULL;
		for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i)
			put_page(pfn_to_page(pginfo[i].pfn));

	if (kvm->arch.rma) {
		kvm_release_rma(kvm->arch.rma);
		kvm->arch.rma = NULL;

	kvmppc_free_hpt(kvm);
	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));

/* These are stubs for now */
void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)

/* We don't need to emulate any privileged instructions or dcbz */
int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
			   unsigned int inst, int *advance)
	return EMULATE_FAIL;

int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
	return EMULATE_FAIL;

int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
	return EMULATE_FAIL;

static int kvmppc_book3s_hv_init(void)
	r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);

	r = kvmppc_mmu_hv_init();

static void kvmppc_book3s_hv_exit(void)

module_init(kvmppc_book3s_hv_init);
module_exit(kvmppc_book3s_hv_exit);