~ubuntu-branches/debian/wheezy/linux-2.6/wheezy

« back to all changes in this revision

Viewing changes to virt/kvm/kvm_main.c

  • Committer: Bazaar Package Importer
  • Author(s): Ben Hutchings, Ben Hutchings, Aurelien Jarno, Martin Michlmayr
  • Date: 2011-04-06 13:53:30 UTC
  • mfrom: (43.1.5 sid)
  • Revision ID: james.westby@ubuntu.com-20110406135330-wjufxhd0tvn3zx4z
Tags: 2.6.38-3
[ Ben Hutchings ]
* [ppc64] Add to linux-tools package architectures (Closes: #620124)
* [amd64] Save cr4 to mmu_cr4_features at boot time (Closes: #620284)
* appletalk: Fix bugs introduced when removing use of BKL
* ALSA: Fix yet another race in disconnection
* cciss: Fix lost command issue
* ath9k: Fix kernel panic in AR2427
* ses: Avoid kernel panic when lun 0 is not mapped
* PCI/ACPI: Report ASPM support to BIOS if not disabled from command line

[ Aurelien Jarno ]
* rtlwifi: fix build when PCI is not enabled.

[ Martin Michlmayr ]
* rtlwifi: Eliminate udelay calls with too large values (Closes: #620204)

Show diffs side-by-side

added added

removed removed

Lines of Context:
5
5
 * machines without emulation or binary translation.
6
6
 *
7
7
 * Copyright (C) 2006 Qumranet, Inc.
 
8
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8
9
 *
9
10
 * Authors:
10
11
 *   Avi Kivity   <avi@qumranet.com>
22
23
#include <linux/module.h>
23
24
#include <linux/errno.h>
24
25
#include <linux/percpu.h>
25
 
#include <linux/gfp.h>
26
26
#include <linux/mm.h>
27
27
#include <linux/miscdevice.h>
28
28
#include <linux/vmalloc.h>
43
43
#include <linux/swap.h>
44
44
#include <linux/bitops.h>
45
45
#include <linux/spinlock.h>
 
46
#include <linux/compat.h>
 
47
#include <linux/srcu.h>
 
48
#include <linux/hugetlb.h>
 
49
#include <linux/slab.h>
46
50
 
47
51
#include <asm/processor.h>
48
52
#include <asm/io.h>
49
53
#include <asm/uaccess.h>
50
54
#include <asm/pgtable.h>
 
55
#include <asm-generic/bitops/le.h>
51
56
 
52
 
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
53
57
#include "coalesced_mmio.h"
54
 
#endif
55
 
 
56
 
#ifdef KVM_CAP_DEVICE_ASSIGNMENT
57
 
#include <linux/pci.h>
58
 
#include <linux/interrupt.h>
59
 
#include "irq.h"
60
 
#endif
 
58
#include "async_pf.h"
61
59
 
62
60
#define CREATE_TRACE_POINTS
63
61
#include <trace/events/kvm.h>
68
66
/*
69
67
 * Ordering of locks:
70
68
 *
71
 
 *              kvm->slots_lock --> kvm->lock --> kvm->irq_lock
 
69
 *              kvm->lock --> kvm->slots_lock --> kvm->irq_lock
72
70
 */
73
71
 
74
72
DEFINE_SPINLOCK(kvm_lock);
75
73
LIST_HEAD(vm_list);
76
74
 
77
75
static cpumask_var_t cpus_hardware_enabled;
 
76
static int kvm_usage_count = 0;
 
77
static atomic_t hardware_enable_failed;
78
78
 
79
79
struct kmem_cache *kvm_vcpu_cache;
80
80
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
85
85
 
86
86
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
87
87
                           unsigned long arg);
88
 
 
89
 
static bool kvm_rebooting;
 
88
static int hardware_enable_all(void);
 
89
static void hardware_disable_all(void);
 
90
 
 
91
static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 
92
 
 
93
bool kvm_rebooting;
 
94
EXPORT_SYMBOL_GPL(kvm_rebooting);
90
95
 
91
96
static bool largepages_enabled = true;
92
97
 
93
 
#ifdef KVM_CAP_DEVICE_ASSIGNMENT
94
 
static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
95
 
                                                      int assigned_dev_id)
96
 
{
97
 
        struct list_head *ptr;
98
 
        struct kvm_assigned_dev_kernel *match;
99
 
 
100
 
        list_for_each(ptr, head) {
101
 
                match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
102
 
                if (match->assigned_dev_id == assigned_dev_id)
103
 
                        return match;
104
 
        }
105
 
        return NULL;
106
 
}
107
 
 
108
 
static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
109
 
                                    *assigned_dev, int irq)
110
 
{
111
 
        int i, index;
112
 
        struct msix_entry *host_msix_entries;
113
 
 
114
 
        host_msix_entries = assigned_dev->host_msix_entries;
115
 
 
116
 
        index = -1;
117
 
        for (i = 0; i < assigned_dev->entries_nr; i++)
118
 
                if (irq == host_msix_entries[i].vector) {
119
 
                        index = i;
120
 
                        break;
121
 
                }
122
 
        if (index < 0) {
123
 
                printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
124
 
                return 0;
125
 
        }
126
 
 
127
 
        return index;
128
 
}
129
 
 
130
 
static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
131
 
{
132
 
        struct kvm_assigned_dev_kernel *assigned_dev;
133
 
        struct kvm *kvm;
134
 
        int i;
135
 
 
136
 
        assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
137
 
                                    interrupt_work);
138
 
        kvm = assigned_dev->kvm;
139
 
 
140
 
        mutex_lock(&kvm->irq_lock);
141
 
        spin_lock_irq(&assigned_dev->assigned_dev_lock);
142
 
        if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
143
 
                struct kvm_guest_msix_entry *guest_entries =
144
 
                        assigned_dev->guest_msix_entries;
145
 
                for (i = 0; i < assigned_dev->entries_nr; i++) {
146
 
                        if (!(guest_entries[i].flags &
147
 
                                        KVM_ASSIGNED_MSIX_PENDING))
148
 
                                continue;
149
 
                        guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
150
 
                        kvm_set_irq(assigned_dev->kvm,
151
 
                                    assigned_dev->irq_source_id,
152
 
                                    guest_entries[i].vector, 1);
153
 
                }
154
 
        } else
155
 
                kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
156
 
                            assigned_dev->guest_irq, 1);
157
 
 
158
 
        spin_unlock_irq(&assigned_dev->assigned_dev_lock);
159
 
        mutex_unlock(&assigned_dev->kvm->irq_lock);
160
 
}
161
 
 
162
 
static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
163
 
{
164
 
        unsigned long flags;
165
 
        struct kvm_assigned_dev_kernel *assigned_dev =
166
 
                (struct kvm_assigned_dev_kernel *) dev_id;
167
 
 
168
 
        spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
169
 
        if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
170
 
                int index = find_index_from_host_irq(assigned_dev, irq);
171
 
                if (index < 0)
172
 
                        goto out;
173
 
                assigned_dev->guest_msix_entries[index].flags |=
174
 
                        KVM_ASSIGNED_MSIX_PENDING;
175
 
        }
176
 
 
177
 
        schedule_work(&assigned_dev->interrupt_work);
178
 
 
179
 
        if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
180
 
                disable_irq_nosync(irq);
181
 
                assigned_dev->host_irq_disabled = true;
182
 
        }
183
 
 
184
 
out:
185
 
        spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
186
 
        return IRQ_HANDLED;
187
 
}
188
 
 
189
 
/* Ack the irq line for an assigned device */
190
 
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
191
 
{
192
 
        struct kvm_assigned_dev_kernel *dev;
193
 
        unsigned long flags;
194
 
 
195
 
        if (kian->gsi == -1)
196
 
                return;
197
 
 
198
 
        dev = container_of(kian, struct kvm_assigned_dev_kernel,
199
 
                           ack_notifier);
200
 
 
201
 
        kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
202
 
 
203
 
        /* The guest irq may be shared so this ack may be
204
 
         * from another device.
205
 
         */
206
 
        spin_lock_irqsave(&dev->assigned_dev_lock, flags);
207
 
        if (dev->host_irq_disabled) {
208
 
                enable_irq(dev->host_irq);
209
 
                dev->host_irq_disabled = false;
210
 
        }
211
 
        spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
212
 
}
213
 
 
214
 
static void deassign_guest_irq(struct kvm *kvm,
215
 
                               struct kvm_assigned_dev_kernel *assigned_dev)
216
 
{
217
 
        kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
218
 
        assigned_dev->ack_notifier.gsi = -1;
219
 
 
220
 
        if (assigned_dev->irq_source_id != -1)
221
 
                kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
222
 
        assigned_dev->irq_source_id = -1;
223
 
        assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
224
 
}
225
 
 
226
 
/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
227
 
static void deassign_host_irq(struct kvm *kvm,
228
 
                              struct kvm_assigned_dev_kernel *assigned_dev)
229
 
{
230
 
        /*
231
 
         * In kvm_free_device_irq, cancel_work_sync return true if:
232
 
         * 1. work is scheduled, and then cancelled.
233
 
         * 2. work callback is executed.
234
 
         *
235
 
         * The first one ensured that the irq is disabled and no more events
236
 
         * would happen. But for the second one, the irq may be enabled (e.g.
237
 
         * for MSI). So we disable irq here to prevent further events.
238
 
         *
239
 
         * Notice this maybe result in nested disable if the interrupt type is
240
 
         * INTx, but it's OK for we are going to free it.
241
 
         *
242
 
         * If this function is a part of VM destroy, please ensure that till
243
 
         * now, the kvm state is still legal for probably we also have to wait
244
 
         * interrupt_work done.
245
 
         */
246
 
        if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
247
 
                int i;
248
 
                for (i = 0; i < assigned_dev->entries_nr; i++)
249
 
                        disable_irq_nosync(assigned_dev->
250
 
                                           host_msix_entries[i].vector);
251
 
 
252
 
                cancel_work_sync(&assigned_dev->interrupt_work);
253
 
 
254
 
                for (i = 0; i < assigned_dev->entries_nr; i++)
255
 
                        free_irq(assigned_dev->host_msix_entries[i].vector,
256
 
                                 (void *)assigned_dev);
257
 
 
258
 
                assigned_dev->entries_nr = 0;
259
 
                kfree(assigned_dev->host_msix_entries);
260
 
                kfree(assigned_dev->guest_msix_entries);
261
 
                pci_disable_msix(assigned_dev->dev);
262
 
        } else {
263
 
                /* Deal with MSI and INTx */
264
 
                disable_irq_nosync(assigned_dev->host_irq);
265
 
                cancel_work_sync(&assigned_dev->interrupt_work);
266
 
 
267
 
                free_irq(assigned_dev->host_irq, (void *)assigned_dev);
268
 
 
269
 
                if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
270
 
                        pci_disable_msi(assigned_dev->dev);
271
 
        }
272
 
 
273
 
        assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
274
 
}
275
 
 
276
 
static int kvm_deassign_irq(struct kvm *kvm,
277
 
                            struct kvm_assigned_dev_kernel *assigned_dev,
278
 
                            unsigned long irq_requested_type)
279
 
{
280
 
        unsigned long guest_irq_type, host_irq_type;
281
 
 
282
 
        if (!irqchip_in_kernel(kvm))
283
 
                return -EINVAL;
284
 
        /* no irq assignment to deassign */
285
 
        if (!assigned_dev->irq_requested_type)
286
 
                return -ENXIO;
287
 
 
288
 
        host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
289
 
        guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
290
 
 
291
 
        if (host_irq_type)
292
 
                deassign_host_irq(kvm, assigned_dev);
293
 
        if (guest_irq_type)
294
 
                deassign_guest_irq(kvm, assigned_dev);
295
 
 
296
 
        return 0;
297
 
}
298
 
 
299
 
static void kvm_free_assigned_irq(struct kvm *kvm,
300
 
                                  struct kvm_assigned_dev_kernel *assigned_dev)
301
 
{
302
 
        kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
303
 
}
304
 
 
305
 
static void kvm_free_assigned_device(struct kvm *kvm,
306
 
                                     struct kvm_assigned_dev_kernel
307
 
                                     *assigned_dev)
308
 
{
309
 
        kvm_free_assigned_irq(kvm, assigned_dev);
310
 
 
311
 
        pci_reset_function(assigned_dev->dev);
312
 
 
313
 
        pci_release_regions(assigned_dev->dev);
314
 
        pci_disable_device(assigned_dev->dev);
315
 
        pci_dev_put(assigned_dev->dev);
316
 
 
317
 
        list_del(&assigned_dev->list);
318
 
        kfree(assigned_dev);
319
 
}
320
 
 
321
 
void kvm_free_all_assigned_devices(struct kvm *kvm)
322
 
{
323
 
        struct list_head *ptr, *ptr2;
324
 
        struct kvm_assigned_dev_kernel *assigned_dev;
325
 
 
326
 
        list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
327
 
                assigned_dev = list_entry(ptr,
328
 
                                          struct kvm_assigned_dev_kernel,
329
 
                                          list);
330
 
 
331
 
                kvm_free_assigned_device(kvm, assigned_dev);
332
 
        }
333
 
}
334
 
 
335
 
static int assigned_device_enable_host_intx(struct kvm *kvm,
336
 
                                            struct kvm_assigned_dev_kernel *dev)
337
 
{
338
 
        dev->host_irq = dev->dev->irq;
339
 
        /* Even though this is PCI, we don't want to use shared
340
 
         * interrupts. Sharing host devices with guest-assigned devices
341
 
         * on the same interrupt line is not a happy situation: there
342
 
         * are going to be long delays in accepting, acking, etc.
343
 
         */
344
 
        if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
345
 
                        0, "kvm_assigned_intx_device", (void *)dev))
346
 
                return -EIO;
347
 
        return 0;
348
 
}
349
 
 
350
 
#ifdef __KVM_HAVE_MSI
351
 
static int assigned_device_enable_host_msi(struct kvm *kvm,
352
 
                                           struct kvm_assigned_dev_kernel *dev)
353
 
{
354
 
        int r;
355
 
 
356
 
        if (!dev->dev->msi_enabled) {
357
 
                r = pci_enable_msi(dev->dev);
358
 
                if (r)
359
 
                        return r;
360
 
        }
361
 
 
362
 
        dev->host_irq = dev->dev->irq;
363
 
        if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
364
 
                        "kvm_assigned_msi_device", (void *)dev)) {
365
 
                pci_disable_msi(dev->dev);
366
 
                return -EIO;
367
 
        }
368
 
 
369
 
        return 0;
370
 
}
371
 
#endif
372
 
 
373
 
#ifdef __KVM_HAVE_MSIX
374
 
static int assigned_device_enable_host_msix(struct kvm *kvm,
375
 
                                            struct kvm_assigned_dev_kernel *dev)
376
 
{
377
 
        int i, r = -EINVAL;
378
 
 
379
 
        /* host_msix_entries and guest_msix_entries should have been
380
 
         * initialized */
381
 
        if (dev->entries_nr == 0)
382
 
                return r;
383
 
 
384
 
        r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
385
 
        if (r)
386
 
                return r;
387
 
 
388
 
        for (i = 0; i < dev->entries_nr; i++) {
389
 
                r = request_irq(dev->host_msix_entries[i].vector,
390
 
                                kvm_assigned_dev_intr, 0,
391
 
                                "kvm_assigned_msix_device",
392
 
                                (void *)dev);
393
 
                /* FIXME: free requested_irq's on failure */
394
 
                if (r)
395
 
                        return r;
396
 
        }
397
 
 
398
 
        return 0;
399
 
}
400
 
 
401
 
#endif
402
 
 
403
 
static int assigned_device_enable_guest_intx(struct kvm *kvm,
404
 
                                struct kvm_assigned_dev_kernel *dev,
405
 
                                struct kvm_assigned_irq *irq)
406
 
{
407
 
        dev->guest_irq = irq->guest_irq;
408
 
        dev->ack_notifier.gsi = irq->guest_irq;
409
 
        return 0;
410
 
}
411
 
 
412
 
#ifdef __KVM_HAVE_MSI
413
 
static int assigned_device_enable_guest_msi(struct kvm *kvm,
414
 
                        struct kvm_assigned_dev_kernel *dev,
415
 
                        struct kvm_assigned_irq *irq)
416
 
{
417
 
        dev->guest_irq = irq->guest_irq;
418
 
        dev->ack_notifier.gsi = -1;
419
 
        dev->host_irq_disabled = false;
420
 
        return 0;
421
 
}
422
 
#endif
423
 
#ifdef __KVM_HAVE_MSIX
424
 
static int assigned_device_enable_guest_msix(struct kvm *kvm,
425
 
                        struct kvm_assigned_dev_kernel *dev,
426
 
                        struct kvm_assigned_irq *irq)
427
 
{
428
 
        dev->guest_irq = irq->guest_irq;
429
 
        dev->ack_notifier.gsi = -1;
430
 
        dev->host_irq_disabled = false;
431
 
        return 0;
432
 
}
433
 
#endif
434
 
 
435
 
static int assign_host_irq(struct kvm *kvm,
436
 
                           struct kvm_assigned_dev_kernel *dev,
437
 
                           __u32 host_irq_type)
438
 
{
439
 
        int r = -EEXIST;
440
 
 
441
 
        if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
442
 
                return r;
443
 
 
444
 
        switch (host_irq_type) {
445
 
        case KVM_DEV_IRQ_HOST_INTX:
446
 
                r = assigned_device_enable_host_intx(kvm, dev);
447
 
                break;
448
 
#ifdef __KVM_HAVE_MSI
449
 
        case KVM_DEV_IRQ_HOST_MSI:
450
 
                r = assigned_device_enable_host_msi(kvm, dev);
451
 
                break;
452
 
#endif
453
 
#ifdef __KVM_HAVE_MSIX
454
 
        case KVM_DEV_IRQ_HOST_MSIX:
455
 
                r = assigned_device_enable_host_msix(kvm, dev);
456
 
                break;
457
 
#endif
458
 
        default:
459
 
                r = -EINVAL;
460
 
        }
461
 
 
462
 
        if (!r)
463
 
                dev->irq_requested_type |= host_irq_type;
464
 
 
465
 
        return r;
466
 
}
467
 
 
468
 
static int assign_guest_irq(struct kvm *kvm,
469
 
                            struct kvm_assigned_dev_kernel *dev,
470
 
                            struct kvm_assigned_irq *irq,
471
 
                            unsigned long guest_irq_type)
472
 
{
473
 
        int id;
474
 
        int r = -EEXIST;
475
 
 
476
 
        if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
477
 
                return r;
478
 
 
479
 
        id = kvm_request_irq_source_id(kvm);
480
 
        if (id < 0)
481
 
                return id;
482
 
 
483
 
        dev->irq_source_id = id;
484
 
 
485
 
        switch (guest_irq_type) {
486
 
        case KVM_DEV_IRQ_GUEST_INTX:
487
 
                r = assigned_device_enable_guest_intx(kvm, dev, irq);
488
 
                break;
489
 
#ifdef __KVM_HAVE_MSI
490
 
        case KVM_DEV_IRQ_GUEST_MSI:
491
 
                r = assigned_device_enable_guest_msi(kvm, dev, irq);
492
 
                break;
493
 
#endif
494
 
#ifdef __KVM_HAVE_MSIX
495
 
        case KVM_DEV_IRQ_GUEST_MSIX:
496
 
                r = assigned_device_enable_guest_msix(kvm, dev, irq);
497
 
                break;
498
 
#endif
499
 
        default:
500
 
                r = -EINVAL;
501
 
        }
502
 
 
503
 
        if (!r) {
504
 
                dev->irq_requested_type |= guest_irq_type;
505
 
                kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
506
 
        } else
507
 
                kvm_free_irq_source_id(kvm, dev->irq_source_id);
508
 
 
509
 
        return r;
510
 
}
511
 
 
512
 
/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
513
 
static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
514
 
                                   struct kvm_assigned_irq *assigned_irq)
515
 
{
516
 
        int r = -EINVAL;
517
 
        struct kvm_assigned_dev_kernel *match;
518
 
        unsigned long host_irq_type, guest_irq_type;
519
 
 
520
 
        if (!capable(CAP_SYS_RAWIO))
521
 
                return -EPERM;
522
 
 
523
 
        if (!irqchip_in_kernel(kvm))
524
 
                return r;
525
 
 
526
 
        mutex_lock(&kvm->lock);
527
 
        r = -ENODEV;
528
 
        match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
529
 
                                      assigned_irq->assigned_dev_id);
530
 
        if (!match)
531
 
                goto out;
532
 
 
533
 
        host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
534
 
        guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
535
 
 
536
 
        r = -EINVAL;
537
 
        /* can only assign one type at a time */
538
 
        if (hweight_long(host_irq_type) > 1)
539
 
                goto out;
540
 
        if (hweight_long(guest_irq_type) > 1)
541
 
                goto out;
542
 
        if (host_irq_type == 0 && guest_irq_type == 0)
543
 
                goto out;
544
 
 
545
 
        r = 0;
546
 
        if (host_irq_type)
547
 
                r = assign_host_irq(kvm, match, host_irq_type);
548
 
        if (r)
549
 
                goto out;
550
 
 
551
 
        if (guest_irq_type)
552
 
                r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
553
 
out:
554
 
        mutex_unlock(&kvm->lock);
555
 
        return r;
556
 
}
557
 
 
558
 
static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
559
 
                                         struct kvm_assigned_irq
560
 
                                         *assigned_irq)
561
 
{
562
 
        int r = -ENODEV;
563
 
        struct kvm_assigned_dev_kernel *match;
564
 
 
565
 
        mutex_lock(&kvm->lock);
566
 
 
567
 
        match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
568
 
                                      assigned_irq->assigned_dev_id);
569
 
        if (!match)
570
 
                goto out;
571
 
 
572
 
        r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
573
 
out:
574
 
        mutex_unlock(&kvm->lock);
575
 
        return r;
576
 
}
577
 
 
578
 
static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
579
 
                                      struct kvm_assigned_pci_dev *assigned_dev)
580
 
{
581
 
        int r = 0;
582
 
        struct kvm_assigned_dev_kernel *match;
583
 
        struct pci_dev *dev;
584
 
 
585
 
        down_read(&kvm->slots_lock);
586
 
        mutex_lock(&kvm->lock);
587
 
 
588
 
        match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
589
 
                                      assigned_dev->assigned_dev_id);
590
 
        if (match) {
591
 
                /* device already assigned */
592
 
                r = -EEXIST;
593
 
                goto out;
594
 
        }
595
 
 
596
 
        match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
597
 
        if (match == NULL) {
598
 
                printk(KERN_INFO "%s: Couldn't allocate memory\n",
599
 
                       __func__);
600
 
                r = -ENOMEM;
601
 
                goto out;
602
 
        }
603
 
        dev = pci_get_bus_and_slot(assigned_dev->busnr,
604
 
                                   assigned_dev->devfn);
605
 
        if (!dev) {
606
 
                printk(KERN_INFO "%s: host device not found\n", __func__);
607
 
                r = -EINVAL;
608
 
                goto out_free;
609
 
        }
610
 
        if (pci_enable_device(dev)) {
611
 
                printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
612
 
                r = -EBUSY;
613
 
                goto out_put;
614
 
        }
615
 
        r = pci_request_regions(dev, "kvm_assigned_device");
616
 
        if (r) {
617
 
                printk(KERN_INFO "%s: Could not get access to device regions\n",
618
 
                       __func__);
619
 
                goto out_disable;
620
 
        }
621
 
 
622
 
        pci_reset_function(dev);
623
 
 
624
 
        match->assigned_dev_id = assigned_dev->assigned_dev_id;
625
 
        match->host_busnr = assigned_dev->busnr;
626
 
        match->host_devfn = assigned_dev->devfn;
627
 
        match->flags = assigned_dev->flags;
628
 
        match->dev = dev;
629
 
        spin_lock_init(&match->assigned_dev_lock);
630
 
        match->irq_source_id = -1;
631
 
        match->kvm = kvm;
632
 
        match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
633
 
        INIT_WORK(&match->interrupt_work,
634
 
                  kvm_assigned_dev_interrupt_work_handler);
635
 
 
636
 
        list_add(&match->list, &kvm->arch.assigned_dev_head);
637
 
 
638
 
        if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
639
 
                if (!kvm->arch.iommu_domain) {
640
 
                        r = kvm_iommu_map_guest(kvm);
641
 
                        if (r)
642
 
                                goto out_list_del;
643
 
                }
644
 
                r = kvm_assign_device(kvm, match);
645
 
                if (r)
646
 
                        goto out_list_del;
647
 
        }
648
 
 
649
 
out:
650
 
        mutex_unlock(&kvm->lock);
651
 
        up_read(&kvm->slots_lock);
652
 
        return r;
653
 
out_list_del:
654
 
        list_del(&match->list);
655
 
        pci_release_regions(dev);
656
 
out_disable:
657
 
        pci_disable_device(dev);
658
 
out_put:
659
 
        pci_dev_put(dev);
660
 
out_free:
661
 
        kfree(match);
662
 
        mutex_unlock(&kvm->lock);
663
 
        up_read(&kvm->slots_lock);
664
 
        return r;
665
 
}
666
 
#endif
667
 
 
668
 
#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
669
 
static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
670
 
                struct kvm_assigned_pci_dev *assigned_dev)
671
 
{
672
 
        int r = 0;
673
 
        struct kvm_assigned_dev_kernel *match;
674
 
 
675
 
        mutex_lock(&kvm->lock);
676
 
 
677
 
        match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
678
 
                                      assigned_dev->assigned_dev_id);
679
 
        if (!match) {
680
 
                printk(KERN_INFO "%s: device hasn't been assigned before, "
681
 
                  "so cannot be deassigned\n", __func__);
682
 
                r = -EINVAL;
683
 
                goto out;
684
 
        }
685
 
 
686
 
        if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
687
 
                kvm_deassign_device(kvm, match);
688
 
 
689
 
        kvm_free_assigned_device(kvm, match);
690
 
 
691
 
out:
692
 
        mutex_unlock(&kvm->lock);
693
 
        return r;
694
 
}
695
 
#endif
 
98
static struct page *hwpoison_page;
 
99
static pfn_t hwpoison_pfn;
 
100
 
 
101
static struct page *fault_page;
 
102
static pfn_t fault_pfn;
696
103
 
697
104
inline int kvm_is_mmio_pfn(pfn_t pfn)
698
105
{
699
106
        if (pfn_valid(pfn)) {
700
 
                struct page *page = compound_head(pfn_to_page(pfn));
701
 
                return PageReserved(page);
 
107
                int reserved;
 
108
                struct page *tail = pfn_to_page(pfn);
 
109
                struct page *head = compound_trans_head(tail);
 
110
                reserved = PageReserved(head);
 
111
                if (head != tail) {
 
112
                        /*
 
113
                         * "head" is not a dangling pointer
 
114
                         * (compound_trans_head takes care of that)
 
115
                         * but the hugepage may have been splitted
 
116
                         * from under us (and we may not hold a
 
117
                         * reference count on the head page so it can
 
118
                         * be reused before we run PageReferenced), so
 
119
                         * we've to check PageTail before returning
 
120
                         * what we just read.
 
121
                         */
 
122
                        smp_rmb();
 
123
                        if (PageTail(tail))
 
124
                                return reserved;
 
125
                }
 
126
                return PageReserved(tail);
702
127
        }
703
128
 
704
129
        return true;
740
165
 
741
166
        zalloc_cpumask_var(&cpus, GFP_ATOMIC);
742
167
 
743
 
        spin_lock(&kvm->requests_lock);
 
168
        raw_spin_lock(&kvm->requests_lock);
744
169
        me = smp_processor_id();
745
170
        kvm_for_each_vcpu(i, vcpu, kvm) {
746
 
                if (test_and_set_bit(req, &vcpu->requests))
 
171
                if (kvm_make_check_request(req, vcpu))
747
172
                        continue;
748
173
                cpu = vcpu->cpu;
749
174
                if (cpus != NULL && cpu != -1 && cpu != me)
755
180
                smp_call_function_many(cpus, ack_flush, NULL, 1);
756
181
        else
757
182
                called = false;
758
 
        spin_unlock(&kvm->requests_lock);
 
183
        raw_spin_unlock(&kvm->requests_lock);
759
184
        free_cpumask_var(cpus);
760
185
        return called;
761
186
}
762
187
 
763
188
void kvm_flush_remote_tlbs(struct kvm *kvm)
764
189
{
 
190
        int dirty_count = kvm->tlbs_dirty;
 
191
 
 
192
        smp_mb();
765
193
        if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
766
194
                ++kvm->stat.remote_tlb_flush;
 
195
        cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
767
196
}
768
197
 
769
198
void kvm_reload_remote_mmus(struct kvm *kvm)
781
210
        vcpu->kvm = kvm;
782
211
        vcpu->vcpu_id = id;
783
212
        init_waitqueue_head(&vcpu->wq);
 
213
        kvm_async_pf_vcpu_init(vcpu);
784
214
 
785
215
        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
786
216
        if (!page) {
819
249
                                             unsigned long address)
820
250
{
821
251
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
822
 
        int need_tlb_flush;
 
252
        int need_tlb_flush, idx;
823
253
 
824
254
        /*
825
255
         * When ->invalidate_page runs, the linux pte has been zapped
839
269
         * pte after kvm_unmap_hva returned, without noticing the page
840
270
         * is going to be freed.
841
271
         */
 
272
        idx = srcu_read_lock(&kvm->srcu);
842
273
        spin_lock(&kvm->mmu_lock);
843
274
        kvm->mmu_notifier_seq++;
844
 
        need_tlb_flush = kvm_unmap_hva(kvm, address);
 
275
        need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
845
276
        spin_unlock(&kvm->mmu_lock);
 
277
        srcu_read_unlock(&kvm->srcu, idx);
846
278
 
847
279
        /* we've to flush the tlb before the pages can be freed */
848
280
        if (need_tlb_flush)
856
288
                                        pte_t pte)
857
289
{
858
290
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 
291
        int idx;
859
292
 
 
293
        idx = srcu_read_lock(&kvm->srcu);
860
294
        spin_lock(&kvm->mmu_lock);
861
295
        kvm->mmu_notifier_seq++;
862
296
        kvm_set_spte_hva(kvm, address, pte);
863
297
        spin_unlock(&kvm->mmu_lock);
 
298
        srcu_read_unlock(&kvm->srcu, idx);
864
299
}
865
300
 
866
301
static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
869
304
                                                    unsigned long end)
870
305
{
871
306
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
872
 
        int need_tlb_flush = 0;
 
307
        int need_tlb_flush = 0, idx;
873
308
 
 
309
        idx = srcu_read_lock(&kvm->srcu);
874
310
        spin_lock(&kvm->mmu_lock);
875
311
        /*
876
312
         * The count increase must become visible at unlock time as no
880
316
        kvm->mmu_notifier_count++;
881
317
        for (; start < end; start += PAGE_SIZE)
882
318
                need_tlb_flush |= kvm_unmap_hva(kvm, start);
 
319
        need_tlb_flush |= kvm->tlbs_dirty;
883
320
        spin_unlock(&kvm->mmu_lock);
 
321
        srcu_read_unlock(&kvm->srcu, idx);
884
322
 
885
323
        /* we've to flush the tlb before the pages can be freed */
886
324
        if (need_tlb_flush)
918
356
                                              unsigned long address)
919
357
{
920
358
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
921
 
        int young;
 
359
        int young, idx;
922
360
 
 
361
        idx = srcu_read_lock(&kvm->srcu);
923
362
        spin_lock(&kvm->mmu_lock);
924
363
        young = kvm_age_hva(kvm, address);
925
364
        spin_unlock(&kvm->mmu_lock);
 
365
        srcu_read_unlock(&kvm->srcu, idx);
926
366
 
927
367
        if (young)
928
368
                kvm_flush_remote_tlbs(kvm);
930
370
        return young;
931
371
}
932
372
 
 
373
static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 
374
                                       struct mm_struct *mm,
 
375
                                       unsigned long address)
 
376
{
 
377
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 
378
        int young, idx;
 
379
 
 
380
        idx = srcu_read_lock(&kvm->srcu);
 
381
        spin_lock(&kvm->mmu_lock);
 
382
        young = kvm_test_age_hva(kvm, address);
 
383
        spin_unlock(&kvm->mmu_lock);
 
384
        srcu_read_unlock(&kvm->srcu, idx);
 
385
 
 
386
        return young;
 
387
}
 
388
 
933
389
static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
934
390
                                     struct mm_struct *mm)
935
391
{
936
392
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 
393
        int idx;
 
394
 
 
395
        idx = srcu_read_lock(&kvm->srcu);
937
396
        kvm_arch_flush_shadow(kvm);
 
397
        srcu_read_unlock(&kvm->srcu, idx);
938
398
}
939
399
 
940
400
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
942
402
        .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
943
403
        .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
944
404
        .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
 
405
        .test_young             = kvm_mmu_notifier_test_young,
945
406
        .change_pte             = kvm_mmu_notifier_change_pte,
946
407
        .release                = kvm_mmu_notifier_release,
947
408
};
 
409
 
 
410
static int kvm_init_mmu_notifier(struct kvm *kvm)
 
411
{
 
412
        kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
 
413
        return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
 
414
}
 
415
 
 
416
#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
 
417
 
 
418
static int kvm_init_mmu_notifier(struct kvm *kvm)
 
419
{
 
420
        return 0;
 
421
}
 
422
 
948
423
#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
949
424
 
950
425
static struct kvm *kvm_create_vm(void)
951
426
{
952
 
        struct kvm *kvm = kvm_arch_create_vm();
953
 
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
954
 
        struct page *page;
955
 
#endif
956
 
 
957
 
        if (IS_ERR(kvm))
958
 
                goto out;
 
427
        int r, i;
 
428
        struct kvm *kvm = kvm_arch_alloc_vm();
 
429
 
 
430
        if (!kvm)
 
431
                return ERR_PTR(-ENOMEM);
 
432
 
 
433
        r = kvm_arch_init_vm(kvm);
 
434
        if (r)
 
435
                goto out_err_nodisable;
 
436
 
 
437
        r = hardware_enable_all();
 
438
        if (r)
 
439
                goto out_err_nodisable;
 
440
 
959
441
#ifdef CONFIG_HAVE_KVM_IRQCHIP
960
 
        INIT_LIST_HEAD(&kvm->irq_routing);
961
442
        INIT_HLIST_HEAD(&kvm->mask_notifier_list);
962
 
#endif
963
 
 
964
 
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
965
 
        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
966
 
        if (!page) {
967
 
                kfree(kvm);
968
 
                return ERR_PTR(-ENOMEM);
969
 
        }
970
 
        kvm->coalesced_mmio_ring =
971
 
                        (struct kvm_coalesced_mmio_ring *)page_address(page);
972
 
#endif
973
 
 
974
 
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
975
 
        {
976
 
                int err;
977
 
                kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
978
 
                err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
979
 
                if (err) {
980
 
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
981
 
                        put_page(page);
982
 
#endif
983
 
                        kfree(kvm);
984
 
                        return ERR_PTR(err);
985
 
                }
986
 
        }
987
 
#endif
 
443
        INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
 
444
#endif
 
445
 
 
446
        r = -ENOMEM;
 
447
        kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
 
448
        if (!kvm->memslots)
 
449
                goto out_err_nosrcu;
 
450
        if (init_srcu_struct(&kvm->srcu))
 
451
                goto out_err_nosrcu;
 
452
        for (i = 0; i < KVM_NR_BUSES; i++) {
 
453
                kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
 
454
                                        GFP_KERNEL);
 
455
                if (!kvm->buses[i])
 
456
                        goto out_err;
 
457
        }
 
458
 
 
459
        r = kvm_init_mmu_notifier(kvm);
 
460
        if (r)
 
461
                goto out_err;
988
462
 
989
463
        kvm->mm = current->mm;
990
464
        atomic_inc(&kvm->mm->mm_count);
991
465
        spin_lock_init(&kvm->mmu_lock);
992
 
        spin_lock_init(&kvm->requests_lock);
993
 
        kvm_io_bus_init(&kvm->pio_bus);
 
466
        raw_spin_lock_init(&kvm->requests_lock);
994
467
        kvm_eventfd_init(kvm);
995
468
        mutex_init(&kvm->lock);
996
469
        mutex_init(&kvm->irq_lock);
997
 
        kvm_io_bus_init(&kvm->mmio_bus);
998
 
        init_rwsem(&kvm->slots_lock);
 
470
        mutex_init(&kvm->slots_lock);
999
471
        atomic_set(&kvm->users_count, 1);
1000
472
        spin_lock(&kvm_lock);
1001
473
        list_add(&kvm->vm_list, &vm_list);
1002
474
        spin_unlock(&kvm_lock);
1003
 
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1004
 
        kvm_coalesced_mmio_init(kvm);
1005
 
#endif
1006
 
out:
 
475
 
1007
476
        return kvm;
 
477
 
 
478
out_err:
 
479
        cleanup_srcu_struct(&kvm->srcu);
 
480
out_err_nosrcu:
 
481
        hardware_disable_all();
 
482
out_err_nodisable:
 
483
        for (i = 0; i < KVM_NR_BUSES; i++)
 
484
                kfree(kvm->buses[i]);
 
485
        kfree(kvm->memslots);
 
486
        kvm_arch_free_vm(kvm);
 
487
        return ERR_PTR(r);
 
488
}
 
489
 
 
490
static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 
491
{
 
492
        if (!memslot->dirty_bitmap)
 
493
                return;
 
494
 
 
495
        if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
 
496
                vfree(memslot->dirty_bitmap_head);
 
497
        else
 
498
                kfree(memslot->dirty_bitmap_head);
 
499
 
 
500
        memslot->dirty_bitmap = NULL;
 
501
        memslot->dirty_bitmap_head = NULL;
1008
502
}
1009
503
 
1010
504
/*
1019
513
                vfree(free->rmap);
1020
514
 
1021
515
        if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
1022
 
                vfree(free->dirty_bitmap);
 
516
                kvm_destroy_dirty_bitmap(free);
1023
517
 
1024
518
 
1025
519
        for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
1030
524
        }
1031
525
 
1032
526
        free->npages = 0;
1033
 
        free->dirty_bitmap = NULL;
1034
527
        free->rmap = NULL;
1035
528
}
1036
529
 
1037
530
void kvm_free_physmem(struct kvm *kvm)
1038
531
{
1039
532
        int i;
1040
 
 
1041
 
        for (i = 0; i < kvm->nmemslots; ++i)
1042
 
                kvm_free_physmem_slot(&kvm->memslots[i], NULL);
 
533
        struct kvm_memslots *slots = kvm->memslots;
 
534
 
 
535
        for (i = 0; i < slots->nmemslots; ++i)
 
536
                kvm_free_physmem_slot(&slots->memslots[i], NULL);
 
537
 
 
538
        kfree(kvm->memslots);
1043
539
}
1044
540
 
1045
541
static void kvm_destroy_vm(struct kvm *kvm)
1046
542
{
 
543
        int i;
1047
544
        struct mm_struct *mm = kvm->mm;
1048
545
 
1049
546
        kvm_arch_sync_events(kvm);
1051
548
        list_del(&kvm->vm_list);
1052
549
        spin_unlock(&kvm_lock);
1053
550
        kvm_free_irq_routing(kvm);
1054
 
        kvm_io_bus_destroy(&kvm->pio_bus);
1055
 
        kvm_io_bus_destroy(&kvm->mmio_bus);
1056
 
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1057
 
        if (kvm->coalesced_mmio_ring != NULL)
1058
 
                free_page((unsigned long)kvm->coalesced_mmio_ring);
1059
 
#endif
 
551
        for (i = 0; i < KVM_NR_BUSES; i++)
 
552
                kvm_io_bus_destroy(kvm->buses[i]);
 
553
        kvm_coalesced_mmio_free(kvm);
1060
554
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1061
555
        mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
1062
556
#else
1063
557
        kvm_arch_flush_shadow(kvm);
1064
558
#endif
1065
559
        kvm_arch_destroy_vm(kvm);
 
560
        kvm_free_physmem(kvm);
 
561
        cleanup_srcu_struct(&kvm->srcu);
 
562
        kvm_arch_free_vm(kvm);
 
563
        hardware_disable_all();
1066
564
        mmdrop(mm);
1067
565
}
1068
566
 
1091
589
}
1092
590
 
1093
591
/*
 
592
 * Allocation size is twice as large as the actual dirty bitmap size.
 
593
 * This makes it possible to do double buffering: see x86's
 
594
 * kvm_vm_ioctl_get_dirty_log().
 
595
 */
 
596
static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 
597
{
 
598
        unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
 
599
 
 
600
        if (dirty_bytes > PAGE_SIZE)
 
601
                memslot->dirty_bitmap = vzalloc(dirty_bytes);
 
602
        else
 
603
                memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
 
604
 
 
605
        if (!memslot->dirty_bitmap)
 
606
                return -ENOMEM;
 
607
 
 
608
        memslot->dirty_bitmap_head = memslot->dirty_bitmap;
 
609
        return 0;
 
610
}
 
611
 
 
612
/*
1094
613
 * Allocate some memory and give it an address in the guest physical address
1095
614
 * space.
1096
615
 *
1102
621
                            struct kvm_userspace_memory_region *mem,
1103
622
                            int user_alloc)
1104
623
{
1105
 
        int r;
 
624
        int r, flush_shadow = 0;
1106
625
        gfn_t base_gfn;
1107
626
        unsigned long npages;
1108
627
        unsigned long i;
1109
628
        struct kvm_memory_slot *memslot;
1110
629
        struct kvm_memory_slot old, new;
 
630
        struct kvm_memslots *slots, *old_memslots;
1111
631
 
1112
632
        r = -EINVAL;
1113
633
        /* General sanity checks */
1122
642
        if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1123
643
                goto out;
1124
644
 
1125
 
        memslot = &kvm->memslots[mem->slot];
 
645
        memslot = &kvm->memslots->memslots[mem->slot];
1126
646
        base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1127
647
        npages = mem->memory_size >> PAGE_SHIFT;
1128
648
 
 
649
        r = -EINVAL;
 
650
        if (npages > KVM_MEM_MAX_NR_PAGES)
 
651
                goto out;
 
652
 
1129
653
        if (!npages)
1130
654
                mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
1131
655
 
1132
656
        new = old = *memslot;
1133
657
 
 
658
        new.id = mem->slot;
1134
659
        new.base_gfn = base_gfn;
1135
660
        new.npages = npages;
1136
661
        new.flags = mem->flags;
1143
668
        /* Check for overlaps */
1144
669
        r = -EEXIST;
1145
670
        for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1146
 
                struct kvm_memory_slot *s = &kvm->memslots[i];
 
671
                struct kvm_memory_slot *s = &kvm->memslots->memslots[i];
1147
672
 
1148
673
                if (s == memslot || !s->npages)
1149
674
                        continue;
1161
686
        /* Allocate if a slot is being created */
1162
687
#ifndef CONFIG_S390
1163
688
        if (npages && !new.rmap) {
1164
 
                new.rmap = vmalloc(npages * sizeof(struct page *));
 
689
                new.rmap = vzalloc(npages * sizeof(*new.rmap));
1165
690
 
1166
691
                if (!new.rmap)
1167
692
                        goto out_free;
1168
693
 
1169
 
                memset(new.rmap, 0, npages * sizeof(*new.rmap));
1170
 
 
1171
694
                new.user_alloc = user_alloc;
1172
 
                /*
1173
 
                 * hva_to_rmmap() serialzies with the mmu_lock and to be
1174
 
                 * safe it has to ignore memslots with !user_alloc &&
1175
 
                 * !userspace_addr.
1176
 
                 */
1177
 
                if (user_alloc)
1178
 
                        new.userspace_addr = mem->userspace_addr;
1179
 
                else
1180
 
                        new.userspace_addr = 0;
 
695
                new.userspace_addr = mem->userspace_addr;
1181
696
        }
1182
697
        if (!npages)
1183
698
                goto skip_lpage;
1194
709
                if (new.lpage_info[i])
1195
710
                        continue;
1196
711
 
1197
 
                lpages = 1 + (base_gfn + npages - 1) /
1198
 
                             KVM_PAGES_PER_HPAGE(level);
1199
 
                lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level);
 
712
                lpages = 1 + ((base_gfn + npages - 1)
 
713
                             >> KVM_HPAGE_GFN_SHIFT(level));
 
714
                lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
1200
715
 
1201
 
                new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i]));
 
716
                new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
1202
717
 
1203
718
                if (!new.lpage_info[i])
1204
719
                        goto out_free;
1205
720
 
1206
 
                memset(new.lpage_info[i], 0,
1207
 
                       lpages * sizeof(*new.lpage_info[i]));
1208
 
 
1209
 
                if (base_gfn % KVM_PAGES_PER_HPAGE(level))
 
721
                if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
1210
722
                        new.lpage_info[i][0].write_count = 1;
1211
 
                if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level))
 
723
                if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
1212
724
                        new.lpage_info[i][lpages - 1].write_count = 1;
1213
725
                ugfn = new.userspace_addr >> PAGE_SHIFT;
1214
726
                /*
1226
738
 
1227
739
        /* Allocate page dirty bitmap if needed */
1228
740
        if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1229
 
                unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
1230
 
 
1231
 
                new.dirty_bitmap = vmalloc(dirty_bytes);
1232
 
                if (!new.dirty_bitmap)
 
741
                if (kvm_create_dirty_bitmap(&new) < 0)
1233
742
                        goto out_free;
1234
 
                memset(new.dirty_bitmap, 0, dirty_bytes);
 
743
                /* destroy any largepage mappings for dirty tracking */
1235
744
                if (old.npages)
1236
 
                        kvm_arch_flush_shadow(kvm);
 
745
                        flush_shadow = 1;
1237
746
        }
1238
747
#else  /* not defined CONFIG_S390 */
1239
748
        new.user_alloc = user_alloc;
1241
750
                new.userspace_addr = mem->userspace_addr;
1242
751
#endif /* not defined CONFIG_S390 */
1243
752
 
1244
 
        if (!npages)
 
753
        if (!npages) {
 
754
                r = -ENOMEM;
 
755
                slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
 
756
                if (!slots)
 
757
                        goto out_free;
 
758
                memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
 
759
                if (mem->slot >= slots->nmemslots)
 
760
                        slots->nmemslots = mem->slot + 1;
 
761
                slots->generation++;
 
762
                slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
 
763
 
 
764
                old_memslots = kvm->memslots;
 
765
                rcu_assign_pointer(kvm->memslots, slots);
 
766
                synchronize_srcu_expedited(&kvm->srcu);
 
767
                /* From this point no new shadow pages pointing to a deleted
 
768
                 * memslot will be created.
 
769
                 *
 
770
                 * validation of sp->gfn happens in:
 
771
                 *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
 
772
                 *      - kvm_is_visible_gfn (mmu_check_roots)
 
773
                 */
1245
774
                kvm_arch_flush_shadow(kvm);
1246
 
 
1247
 
        spin_lock(&kvm->mmu_lock);
1248
 
        if (mem->slot >= kvm->nmemslots)
1249
 
                kvm->nmemslots = mem->slot + 1;
1250
 
 
1251
 
        *memslot = new;
1252
 
        spin_unlock(&kvm->mmu_lock);
1253
 
 
1254
 
        r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
1255
 
        if (r) {
1256
 
                spin_lock(&kvm->mmu_lock);
1257
 
                *memslot = old;
1258
 
                spin_unlock(&kvm->mmu_lock);
 
775
                kfree(old_memslots);
 
776
        }
 
777
 
 
778
        r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
 
779
        if (r)
1259
780
                goto out_free;
1260
 
        }
1261
781
 
1262
 
        kvm_free_physmem_slot(&old, npages ? &new : NULL);
1263
 
        /* Slot deletion case: we have to update the current slot */
1264
 
        spin_lock(&kvm->mmu_lock);
1265
 
        if (!npages)
1266
 
                *memslot = old;
1267
 
        spin_unlock(&kvm->mmu_lock);
1268
 
#ifdef CONFIG_DMAR
1269
782
        /* map the pages in iommu page table */
1270
 
        r = kvm_iommu_map_pages(kvm, base_gfn, npages);
1271
 
        if (r)
1272
 
                goto out;
1273
 
#endif
 
783
        if (npages) {
 
784
                r = kvm_iommu_map_pages(kvm, &new);
 
785
                if (r)
 
786
                        goto out_free;
 
787
        }
 
788
 
 
789
        r = -ENOMEM;
 
790
        slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
 
791
        if (!slots)
 
792
                goto out_free;
 
793
        memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
 
794
        if (mem->slot >= slots->nmemslots)
 
795
                slots->nmemslots = mem->slot + 1;
 
796
        slots->generation++;
 
797
 
 
798
        /* actual memory is freed via old in kvm_free_physmem_slot below */
 
799
        if (!npages) {
 
800
                new.rmap = NULL;
 
801
                new.dirty_bitmap = NULL;
 
802
                for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
 
803
                        new.lpage_info[i] = NULL;
 
804
        }
 
805
 
 
806
        slots->memslots[mem->slot] = new;
 
807
        old_memslots = kvm->memslots;
 
808
        rcu_assign_pointer(kvm->memslots, slots);
 
809
        synchronize_srcu_expedited(&kvm->srcu);
 
810
 
 
811
        kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
 
812
 
 
813
        kvm_free_physmem_slot(&old, &new);
 
814
        kfree(old_memslots);
 
815
 
 
816
        if (flush_shadow)
 
817
                kvm_arch_flush_shadow(kvm);
 
818
 
1274
819
        return 0;
1275
820
 
1276
821
out_free:
1287
832
{
1288
833
        int r;
1289
834
 
1290
 
        down_write(&kvm->slots_lock);
 
835
        mutex_lock(&kvm->slots_lock);
1291
836
        r = __kvm_set_memory_region(kvm, mem, user_alloc);
1292
 
        up_write(&kvm->slots_lock);
 
837
        mutex_unlock(&kvm->slots_lock);
1293
838
        return r;
1294
839
}
1295
840
EXPORT_SYMBOL_GPL(kvm_set_memory_region);
1309
854
{
1310
855
        struct kvm_memory_slot *memslot;
1311
856
        int r, i;
1312
 
        int n;
 
857
        unsigned long n;
1313
858
        unsigned long any = 0;
1314
859
 
1315
860
        r = -EINVAL;
1316
861
        if (log->slot >= KVM_MEMORY_SLOTS)
1317
862
                goto out;
1318
863
 
1319
 
        memslot = &kvm->memslots[log->slot];
 
864
        memslot = &kvm->memslots->memslots[log->slot];
1320
865
        r = -ENOENT;
1321
866
        if (!memslot->dirty_bitmap)
1322
867
                goto out;
1323
868
 
1324
 
        n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
 
869
        n = kvm_dirty_bitmap_bytes(memslot);
1325
870
 
1326
871
        for (i = 0; !any && i < n/sizeof(long); ++i)
1327
872
                any = memslot->dirty_bitmap[i];
1346
891
 
1347
892
int is_error_page(struct page *page)
1348
893
{
1349
 
        return page == bad_page;
 
894
        return page == bad_page || page == hwpoison_page || page == fault_page;
1350
895
}
1351
896
EXPORT_SYMBOL_GPL(is_error_page);
1352
897
 
1353
898
int is_error_pfn(pfn_t pfn)
1354
899
{
1355
 
        return pfn == bad_pfn;
 
900
        return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
1356
901
}
1357
902
EXPORT_SYMBOL_GPL(is_error_pfn);
1358
903
 
 
904
int is_hwpoison_pfn(pfn_t pfn)
 
905
{
 
906
        return pfn == hwpoison_pfn;
 
907
}
 
908
EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
 
909
 
 
910
int is_fault_pfn(pfn_t pfn)
 
911
{
 
912
        return pfn == fault_pfn;
 
913
}
 
914
EXPORT_SYMBOL_GPL(is_fault_pfn);
 
915
 
1359
916
static inline unsigned long bad_hva(void)
1360
917
{
1361
918
        return PAGE_OFFSET;
1367
924
}
1368
925
EXPORT_SYMBOL_GPL(kvm_is_error_hva);
1369
926
 
1370
 
struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
 
927
static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
 
928
                                                gfn_t gfn)
1371
929
{
1372
930
        int i;
1373
931
 
1374
 
        for (i = 0; i < kvm->nmemslots; ++i) {
1375
 
                struct kvm_memory_slot *memslot = &kvm->memslots[i];
 
932
        for (i = 0; i < slots->nmemslots; ++i) {
 
933
                struct kvm_memory_slot *memslot = &slots->memslots[i];
1376
934
 
1377
935
                if (gfn >= memslot->base_gfn
1378
936
                    && gfn < memslot->base_gfn + memslot->npages)
1380
938
        }
1381
939
        return NULL;
1382
940
}
1383
 
EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased);
1384
941
 
1385
942
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
1386
943
{
1387
 
        gfn = unalias_gfn(kvm, gfn);
1388
 
        return gfn_to_memslot_unaliased(kvm, gfn);
 
944
        return __gfn_to_memslot(kvm_memslots(kvm), gfn);
1389
945
}
 
946
EXPORT_SYMBOL_GPL(gfn_to_memslot);
1390
947
 
1391
948
int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
1392
949
{
1393
950
        int i;
 
951
        struct kvm_memslots *slots = kvm_memslots(kvm);
1394
952
 
1395
 
        gfn = unalias_gfn(kvm, gfn);
1396
953
        for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1397
 
                struct kvm_memory_slot *memslot = &kvm->memslots[i];
 
954
                struct kvm_memory_slot *memslot = &slots->memslots[i];
 
955
 
 
956
                if (memslot->flags & KVM_MEMSLOT_INVALID)
 
957
                        continue;
1398
958
 
1399
959
                if (gfn >= memslot->base_gfn
1400
960
                    && gfn < memslot->base_gfn + memslot->npages)
1404
964
}
1405
965
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
1406
966
 
 
967
unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
 
968
{
 
969
        struct vm_area_struct *vma;
 
970
        unsigned long addr, size;
 
971
 
 
972
        size = PAGE_SIZE;
 
973
 
 
974
        addr = gfn_to_hva(kvm, gfn);
 
975
        if (kvm_is_error_hva(addr))
 
976
                return PAGE_SIZE;
 
977
 
 
978
        down_read(&current->mm->mmap_sem);
 
979
        vma = find_vma(current->mm, addr);
 
980
        if (!vma)
 
981
                goto out;
 
982
 
 
983
        size = vma_kernel_pagesize(vma);
 
984
 
 
985
out:
 
986
        up_read(&current->mm->mmap_sem);
 
987
 
 
988
        return size;
 
989
}
 
990
 
 
991
int memslot_id(struct kvm *kvm, gfn_t gfn)
 
992
{
 
993
        int i;
 
994
        struct kvm_memslots *slots = kvm_memslots(kvm);
 
995
        struct kvm_memory_slot *memslot = NULL;
 
996
 
 
997
        for (i = 0; i < slots->nmemslots; ++i) {
 
998
                memslot = &slots->memslots[i];
 
999
 
 
1000
                if (gfn >= memslot->base_gfn
 
1001
                    && gfn < memslot->base_gfn + memslot->npages)
 
1002
                        break;
 
1003
        }
 
1004
 
 
1005
        return memslot - slots->memslots;
 
1006
}
 
1007
 
 
1008
static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
 
1009
                                     gfn_t *nr_pages)
 
1010
{
 
1011
        if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
 
1012
                return bad_hva();
 
1013
 
 
1014
        if (nr_pages)
 
1015
                *nr_pages = slot->npages - (gfn - slot->base_gfn);
 
1016
 
 
1017
        return gfn_to_hva_memslot(slot, gfn);
 
1018
}
 
1019
 
1407
1020
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
1408
1021
{
1409
 
        struct kvm_memory_slot *slot;
1410
 
 
1411
 
        gfn = unalias_gfn(kvm, gfn);
1412
 
        slot = gfn_to_memslot_unaliased(kvm, gfn);
1413
 
        if (!slot)
1414
 
                return bad_hva();
1415
 
        return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
 
1022
        return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
1416
1023
}
1417
1024
EXPORT_SYMBOL_GPL(gfn_to_hva);
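/*
 * Illustrative sketch, not part of the diff: a minimal helper in the style of
 * kvm_read_guest_page(), showing how gfn_to_hva() is typically consumed.  The
 * helper name (example_read_from_gfn) is hypothetical; gfn_to_hva(),
 * kvm_is_error_hva() and copy_from_user() are the functions visible above.
 */
static int example_read_from_gfn(struct kvm *kvm, gfn_t gfn, void *data,
                                 int offset, int len)
{
        unsigned long addr = gfn_to_hva(kvm, gfn);

        if (kvm_is_error_hva(addr))             /* gfn not backed by a memslot */
                return -EFAULT;
        if (copy_from_user(data, (void __user *)(addr + offset), len))
                return -EFAULT;                 /* host mapping faulted */
        return 0;
}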
1418
1025
 
1419
 
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 
1026
static pfn_t get_fault_pfn(void)
 
1027
{
 
1028
        get_page(fault_page);
 
1029
        return fault_pfn;
 
1030
}
 
1031
 
 
1032
static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
 
1033
                        bool *async, bool write_fault, bool *writable)
1420
1034
{
1421
1035
        struct page *page[1];
1422
 
        unsigned long addr;
1423
 
        int npages;
 
1036
        int npages = 0;
1424
1037
        pfn_t pfn;
1425
1038
 
1426
 
        might_sleep();
1427
 
 
1428
 
        addr = gfn_to_hva(kvm, gfn);
1429
 
        if (kvm_is_error_hva(addr)) {
1430
 
                get_page(bad_page);
1431
 
                return page_to_pfn(bad_page);
 
1039
        /* we can do it either atomically or asynchronously, not both */
 
1040
        BUG_ON(atomic && async);
 
1041
 
 
1042
        BUG_ON(!write_fault && !writable);
 
1043
 
 
1044
        if (writable)
 
1045
                *writable = true;
 
1046
 
 
1047
        if (atomic || async)
 
1048
                npages = __get_user_pages_fast(addr, 1, 1, page);
 
1049
 
 
1050
        if (unlikely(npages != 1) && !atomic) {
 
1051
                might_sleep();
 
1052
 
 
1053
                if (writable)
 
1054
                        *writable = write_fault;
 
1055
 
 
1056
                npages = get_user_pages_fast(addr, 1, write_fault, page);
 
1057
 
 
1058
                /* map read fault as writable if possible */
 
1059
                if (unlikely(!write_fault) && npages == 1) {
 
1060
                        struct page *wpage[1];
 
1061
 
 
1062
                        npages = __get_user_pages_fast(addr, 1, 1, wpage);
 
1063
                        if (npages == 1) {
 
1064
                                *writable = true;
 
1065
                                put_page(page[0]);
 
1066
                                page[0] = wpage[0];
 
1067
                        }
 
1068
                        npages = 1;
 
1069
                }
1432
1070
        }
1433
1071
 
1434
 
        npages = get_user_pages_fast(addr, 1, 1, page);
1435
 
 
1436
1072
        if (unlikely(npages != 1)) {
1437
1073
                struct vm_area_struct *vma;
1438
1074
 
 
1075
                if (atomic)
 
1076
                        return get_fault_pfn();
 
1077
 
1439
1078
                down_read(&current->mm->mmap_sem);
1440
 
                vma = find_vma(current->mm, addr);
1441
 
 
1442
 
                if (vma == NULL || addr < vma->vm_start ||
1443
 
                    !(vma->vm_flags & VM_PFNMAP)) {
 
1079
                if (is_hwpoison_address(addr)) {
1444
1080
                        up_read(&current->mm->mmap_sem);
1445
 
                        get_page(bad_page);
1446
 
                        return page_to_pfn(bad_page);
1447
 
                }
1448
 
 
1449
 
                pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 
1081
                        get_page(hwpoison_page);
 
1082
                        return page_to_pfn(hwpoison_page);
 
1083
                }
 
1084
 
 
1085
                vma = find_vma_intersection(current->mm, addr, addr+1);
 
1086
 
 
1087
                if (vma == NULL)
 
1088
                        pfn = get_fault_pfn();
 
1089
                else if ((vma->vm_flags & VM_PFNMAP)) {
 
1090
                        pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
 
1091
                                vma->vm_pgoff;
 
1092
                        BUG_ON(!kvm_is_mmio_pfn(pfn));
 
1093
                } else {
 
1094
                        if (async && (vma->vm_flags & VM_WRITE))
 
1095
                                *async = true;
 
1096
                        pfn = get_fault_pfn();
 
1097
                }
1450
1098
                up_read(&current->mm->mmap_sem);
1451
 
                BUG_ON(!kvm_is_mmio_pfn(pfn));
1452
1099
        } else
1453
1100
                pfn = page_to_pfn(page[0]);
1454
1101
 
1455
1102
        return pfn;
1456
1103
}
1457
1104
 
 
1105
pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
 
1106
{
 
1107
        return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
 
1108
}
 
1109
EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
 
1110
 
 
1111
static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
 
1112
                          bool write_fault, bool *writable)
 
1113
{
 
1114
        unsigned long addr;
 
1115
 
 
1116
        if (async)
 
1117
                *async = false;
 
1118
 
 
1119
        addr = gfn_to_hva(kvm, gfn);
 
1120
        if (kvm_is_error_hva(addr)) {
 
1121
                get_page(bad_page);
 
1122
                return page_to_pfn(bad_page);
 
1123
        }
 
1124
 
 
1125
        return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
 
1126
}
 
1127
 
 
1128
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
 
1129
{
 
1130
        return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
 
1131
}
 
1132
EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
 
1133
 
 
1134
pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
 
1135
                       bool write_fault, bool *writable)
 
1136
{
 
1137
        return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
 
1138
}
 
1139
EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
 
1140
 
 
1141
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 
1142
{
 
1143
        return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
 
1144
}
1458
1145
EXPORT_SYMBOL_GPL(gfn_to_pfn);
1459
1146
 
 
1147
pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 
1148
                      bool *writable)
 
1149
{
 
1150
        return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
 
1151
}
 
1152
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
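/*
 * Illustrative sketch, not part of the diff: probing whether a gfn is backed
 * by a host-writable mapping via the new gfn_to_pfn_prot().  The helper name
 * is hypothetical; a successful lookup pins the page, so it is paired here
 * with kvm_release_pfn_clean().
 */
static bool example_gfn_is_host_writable(struct kvm *kvm, gfn_t gfn)
{
        bool writable = false;
        pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, false, &writable);

        if (!is_error_pfn(pfn))
                kvm_release_pfn_clean(pfn);     /* drop the reference taken above */

        return writable;
}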
 
1153
 
 
1154
pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
 
1155
                         struct kvm_memory_slot *slot, gfn_t gfn)
 
1156
{
 
1157
        unsigned long addr = gfn_to_hva_memslot(slot, gfn);
 
1158
        return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
 
1159
}
 
1160
 
 
1161
int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
 
1162
                                                                  int nr_pages)
 
1163
{
 
1164
        unsigned long addr;
 
1165
        gfn_t entry;
 
1166
 
 
1167
        addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
 
1168
        if (kvm_is_error_hva(addr))
 
1169
                return -1;
 
1170
 
 
1171
        if (entry < nr_pages)
 
1172
                return 0;
 
1173
 
 
1174
        return __get_user_pages_fast(addr, nr_pages, 1, pages);
 
1175
}
 
1176
EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
 
1177
 
1460
1178
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1461
1179
{
1462
1180
        pfn_t pfn;
1630
1348
        return 0;
1631
1349
}
1632
1350
 
 
1351
int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 
1352
                              gpa_t gpa)
 
1353
{
 
1354
        struct kvm_memslots *slots = kvm_memslots(kvm);
 
1355
        int offset = offset_in_page(gpa);
 
1356
        gfn_t gfn = gpa >> PAGE_SHIFT;
 
1357
 
 
1358
        ghc->gpa = gpa;
 
1359
        ghc->generation = slots->generation;
 
1360
        ghc->memslot = __gfn_to_memslot(slots, gfn);
 
1361
        ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
 
1362
        if (!kvm_is_error_hva(ghc->hva))
 
1363
                ghc->hva += offset;
 
1364
        else
 
1365
                return -EFAULT;
 
1366
 
 
1367
        return 0;
 
1368
}
 
1369
EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
 
1370
 
 
1371
int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 
1372
                           void *data, unsigned long len)
 
1373
{
 
1374
        struct kvm_memslots *slots = kvm_memslots(kvm);
 
1375
        int r;
 
1376
 
 
1377
        if (slots->generation != ghc->generation)
 
1378
                kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
 
1379
 
 
1380
        if (kvm_is_error_hva(ghc->hva))
 
1381
                return -EFAULT;
 
1382
 
 
1383
        r = copy_to_user((void __user *)ghc->hva, data, len);
 
1384
        if (r)
 
1385
                return -EFAULT;
 
1386
        mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
 
1387
 
 
1388
        return 0;
 
1389
}
 
1390
EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
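/*
 * Illustrative sketch, not part of the diff: the intended usage pattern for
 * the gfn_to_hva_cache helpers added above -- resolve the gpa once, then do
 * cheap repeated writes.  The struct and function names marked "example" are
 * hypothetical; kvm_gfn_to_hva_cache_init()/kvm_write_guest_cached() are the
 * functions introduced in this hunk.
 */
struct example_shared_area {
        struct gfn_to_hva_cache cache;
        u64 counter;
};

static int example_publish_counter(struct kvm *kvm,
                                   struct example_shared_area *area,
                                   gpa_t gpa)
{
        int r = kvm_gfn_to_hva_cache_init(kvm, &area->cache, gpa);

        if (r)
                return r;                       /* gpa not covered by a memslot */

        area->counter++;
        /* Re-validated internally if the memslot generation changed. */
        return kvm_write_guest_cached(kvm, &area->cache, &area->counter,
                                      sizeof(area->counter));
}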
 
1391
 
1633
1392
int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
1634
1393
{
1635
 
        return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
 
1394
        return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
 
1395
                                    offset, len);
1636
1396
}
1637
1397
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
1638
1398
 
1655
1415
}
1656
1416
EXPORT_SYMBOL_GPL(kvm_clear_guest);
1657
1417
 
1658
 
void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
 
1418
void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
1419
                             gfn_t gfn)
1659
1420
{
1660
 
        struct kvm_memory_slot *memslot;
1661
 
 
1662
 
        gfn = unalias_gfn(kvm, gfn);
1663
 
        memslot = gfn_to_memslot_unaliased(kvm, gfn);
1664
1421
        if (memslot && memslot->dirty_bitmap) {
1665
1422
                unsigned long rel_gfn = gfn - memslot->base_gfn;
1666
1423
 
1667
 
                /* avoid RMW */
1668
 
                if (!test_bit(rel_gfn, memslot->dirty_bitmap))
1669
 
                        set_bit(rel_gfn, memslot->dirty_bitmap);
 
1424
                generic___set_le_bit(rel_gfn, memslot->dirty_bitmap);
1670
1425
        }
1671
1426
}
1672
1427
 
 
1428
void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
 
1429
{
 
1430
        struct kvm_memory_slot *memslot;
 
1431
 
 
1432
        memslot = gfn_to_memslot(kvm, gfn);
 
1433
        mark_page_dirty_in_slot(kvm, memslot, gfn);
 
1434
}
 
1435
 
1673
1436
/*
1674
1437
 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1675
1438
 */
1681
1444
                prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
1682
1445
 
1683
1446
                if (kvm_arch_vcpu_runnable(vcpu)) {
1684
 
                        set_bit(KVM_REQ_UNHALT, &vcpu->requests);
 
1447
                        kvm_make_request(KVM_REQ_UNHALT, vcpu);
1685
1448
                        break;
1686
1449
                }
1687
1450
                if (kvm_cpu_has_pending_timer(vcpu))
1689
1452
                if (signal_pending(current))
1690
1453
                        break;
1691
1454
 
1692
 
                vcpu_put(vcpu);
1693
1455
                schedule();
1694
 
                vcpu_load(vcpu);
1695
1456
        }
1696
1457
 
1697
1458
        finish_wait(&vcpu->wq, &wait);
1705
1466
}
1706
1467
EXPORT_SYMBOL_GPL(kvm_resched);
1707
1468
 
 
1469
void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
 
1470
{
 
1471
        ktime_t expires;
 
1472
        DEFINE_WAIT(wait);
 
1473
 
 
1474
        prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
1475
 
 
1476
        /* Sleep for 100 us, and hope lock-holder got scheduled */
 
1477
        expires = ktime_add_ns(ktime_get(), 100000UL);
 
1478
        schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
 
1479
 
 
1480
        finish_wait(&vcpu->wq, &wait);
 
1481
}
 
1482
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
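/*
 * Illustrative sketch, not part of the diff: kvm_vcpu_on_spin() is meant to be
 * called from an architecture's PAUSE/yield intercept.  The handler name below
 * is hypothetical; x86, for instance, invokes it from its pause-loop exit.
 */
static int example_handle_pause_exit(struct kvm_vcpu *vcpu)
{
        kvm_vcpu_on_spin(vcpu);   /* back off ~100us so the lock holder can run */
        return 1;                 /* resume guest execution */
}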
 
1483
 
1708
1484
static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1709
1485
{
1710
1486
        struct kvm_vcpu *vcpu = vma->vm_file->private_data;
1750
1526
        .unlocked_ioctl = kvm_vcpu_ioctl,
1751
1527
        .compat_ioctl   = kvm_vcpu_ioctl,
1752
1528
        .mmap           = kvm_vcpu_mmap,
 
1529
        .llseek         = noop_llseek,
1753
1530
};
1754
1531
 
1755
1532
/*
1757
1534
 */
1758
1535
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
1759
1536
{
1760
 
        return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0);
 
1537
        return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
1761
1538
}
1762
1539
 
1763
1540
/*
1828
1605
        return 0;
1829
1606
}
1830
1607
 
1831
 
#ifdef __KVM_HAVE_MSIX
1832
 
static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
1833
 
                                    struct kvm_assigned_msix_nr *entry_nr)
1834
 
{
1835
 
        int r = 0;
1836
 
        struct kvm_assigned_dev_kernel *adev;
1837
 
 
1838
 
        mutex_lock(&kvm->lock);
1839
 
 
1840
 
        adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
1841
 
                                      entry_nr->assigned_dev_id);
1842
 
        if (!adev) {
1843
 
                r = -EINVAL;
1844
 
                goto msix_nr_out;
1845
 
        }
1846
 
 
1847
 
        if (adev->entries_nr == 0) {
1848
 
                adev->entries_nr = entry_nr->entry_nr;
1849
 
                if (adev->entries_nr == 0 ||
1850
 
                    adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) {
1851
 
                        r = -EINVAL;
1852
 
                        goto msix_nr_out;
1853
 
                }
1854
 
 
1855
 
                adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
1856
 
                                                entry_nr->entry_nr,
1857
 
                                                GFP_KERNEL);
1858
 
                if (!adev->host_msix_entries) {
1859
 
                        r = -ENOMEM;
1860
 
                        goto msix_nr_out;
1861
 
                }
1862
 
                adev->guest_msix_entries = kzalloc(
1863
 
                                sizeof(struct kvm_guest_msix_entry) *
1864
 
                                entry_nr->entry_nr, GFP_KERNEL);
1865
 
                if (!adev->guest_msix_entries) {
1866
 
                        kfree(adev->host_msix_entries);
1867
 
                        r = -ENOMEM;
1868
 
                        goto msix_nr_out;
1869
 
                }
1870
 
        } else /* Setting the MSI-X entry count twice is not allowed */
1871
 
                r = -EINVAL;
1872
 
msix_nr_out:
1873
 
        mutex_unlock(&kvm->lock);
1874
 
        return r;
1875
 
}
1876
 
 
1877
 
static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
1878
 
                                       struct kvm_assigned_msix_entry *entry)
1879
 
{
1880
 
        int r = 0, i;
1881
 
        struct kvm_assigned_dev_kernel *adev;
1882
 
 
1883
 
        mutex_lock(&kvm->lock);
1884
 
 
1885
 
        adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
1886
 
                                      entry->assigned_dev_id);
1887
 
 
1888
 
        if (!adev) {
1889
 
                r = -EINVAL;
1890
 
                goto msix_entry_out;
1891
 
        }
1892
 
 
1893
 
        for (i = 0; i < adev->entries_nr; i++)
1894
 
                if (adev->guest_msix_entries[i].vector == 0 ||
1895
 
                    adev->guest_msix_entries[i].entry == entry->entry) {
1896
 
                        adev->guest_msix_entries[i].entry = entry->entry;
1897
 
                        adev->guest_msix_entries[i].vector = entry->gsi;
1898
 
                        adev->host_msix_entries[i].entry = entry->entry;
1899
 
                        break;
1900
 
                }
1901
 
        if (i == adev->entries_nr) {
1902
 
                r = -ENOSPC;
1903
 
                goto msix_entry_out;
1904
 
        }
1905
 
 
1906
 
msix_entry_out:
1907
 
        mutex_unlock(&kvm->lock);
1908
 
 
1909
 
        return r;
1910
 
}
1911
 
#endif
1912
 
 
1913
1608
static long kvm_vcpu_ioctl(struct file *filp,
1914
1609
                           unsigned int ioctl, unsigned long arg)
1915
1610
{
1921
1616
 
1922
1617
        if (vcpu->kvm->mm != current->mm)
1923
1618
                return -EIO;
 
1619
 
 
1620
#if defined(CONFIG_S390) || defined(CONFIG_PPC)
 
1621
        /*
 
1622
         * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
 
1623
         * so vcpu_load() would break it.
 
1624
         */
 
1625
        if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
 
1626
                return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
 
1627
#endif
 
1628
 
 
1629
 
 
1630
        vcpu_load(vcpu);
1924
1631
        switch (ioctl) {
1925
1632
        case KVM_RUN:
1926
1633
                r = -EINVAL;
1927
1634
                if (arg)
1928
1635
                        goto out;
1929
1636
                r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
 
1637
                trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
1930
1638
                break;
1931
1639
        case KVM_GET_REGS: {
1932
1640
                struct kvm_regs *kvm_regs;
2063
1771
                                goto out;
2064
1772
                        p = &sigset;
2065
1773
                }
2066
 
                r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
 
1774
                r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
2067
1775
                break;
2068
1776
        }
2069
1777
        case KVM_GET_FPU: {
2098
1806
                r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
2099
1807
        }
2100
1808
out:
 
1809
        vcpu_put(vcpu);
2101
1810
        kfree(fpu);
2102
1811
        kfree(kvm_sregs);
2103
1812
        return r;
2148
1857
                r = -EFAULT;
2149
1858
                if (copy_from_user(&zone, argp, sizeof zone))
2150
1859
                        goto out;
2151
 
                r = -ENXIO;
2152
1860
                r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
2153
1861
                if (r)
2154
1862
                        goto out;
2160
1868
                r = -EFAULT;
2161
1869
                if (copy_from_user(&zone, argp, sizeof zone))
2162
1870
                        goto out;
2163
 
                r = -ENXIO;
2164
1871
                r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
2165
1872
                if (r)
2166
1873
                        goto out;
2168
1875
                break;
2169
1876
        }
2170
1877
#endif
2171
 
#ifdef KVM_CAP_DEVICE_ASSIGNMENT
2172
 
        case KVM_ASSIGN_PCI_DEVICE: {
2173
 
                struct kvm_assigned_pci_dev assigned_dev;
2174
 
 
2175
 
                r = -EFAULT;
2176
 
                if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
2177
 
                        goto out;
2178
 
                r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
2179
 
                if (r)
2180
 
                        goto out;
2181
 
                break;
2182
 
        }
2183
 
        case KVM_ASSIGN_IRQ: {
2184
 
                r = -EOPNOTSUPP;
2185
 
                break;
2186
 
        }
2187
 
#ifdef KVM_CAP_ASSIGN_DEV_IRQ
2188
 
        case KVM_ASSIGN_DEV_IRQ: {
2189
 
                struct kvm_assigned_irq assigned_irq;
2190
 
 
2191
 
                r = -EFAULT;
2192
 
                if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
2193
 
                        goto out;
2194
 
                r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
2195
 
                if (r)
2196
 
                        goto out;
2197
 
                break;
2198
 
        }
2199
 
        case KVM_DEASSIGN_DEV_IRQ: {
2200
 
                struct kvm_assigned_irq assigned_irq;
2201
 
 
2202
 
                r = -EFAULT;
2203
 
                if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
2204
 
                        goto out;
2205
 
                r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
2206
 
                if (r)
2207
 
                        goto out;
2208
 
                break;
2209
 
        }
2210
 
#endif
2211
 
#endif
2212
 
#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
2213
 
        case KVM_DEASSIGN_PCI_DEVICE: {
2214
 
                struct kvm_assigned_pci_dev assigned_dev;
2215
 
 
2216
 
                r = -EFAULT;
2217
 
                if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
2218
 
                        goto out;
2219
 
                r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
2220
 
                if (r)
2221
 
                        goto out;
2222
 
                break;
2223
 
        }
2224
 
#endif
2225
 
#ifdef KVM_CAP_IRQ_ROUTING
2226
 
        case KVM_SET_GSI_ROUTING: {
2227
 
                struct kvm_irq_routing routing;
2228
 
                struct kvm_irq_routing __user *urouting;
2229
 
                struct kvm_irq_routing_entry *entries;
2230
 
 
2231
 
                r = -EFAULT;
2232
 
                if (copy_from_user(&routing, argp, sizeof(routing)))
2233
 
                        goto out;
2234
 
                r = -EINVAL;
2235
 
                if (routing.nr >= KVM_MAX_IRQ_ROUTES)
2236
 
                        goto out;
2237
 
                if (routing.flags)
2238
 
                        goto out;
2239
 
                r = -ENOMEM;
2240
 
                entries = vmalloc(routing.nr * sizeof(*entries));
2241
 
                if (!entries)
2242
 
                        goto out;
2243
 
                r = -EFAULT;
2244
 
                urouting = argp;
2245
 
                if (copy_from_user(entries, urouting->entries,
2246
 
                                   routing.nr * sizeof(*entries)))
2247
 
                        goto out_free_irq_routing;
2248
 
                r = kvm_set_irq_routing(kvm, entries, routing.nr,
2249
 
                                        routing.flags);
2250
 
        out_free_irq_routing:
2251
 
                vfree(entries);
2252
 
                break;
2253
 
        }
2254
 
#endif /* KVM_CAP_IRQ_ROUTING */
2255
 
#ifdef __KVM_HAVE_MSIX
2256
 
        case KVM_ASSIGN_SET_MSIX_NR: {
2257
 
                struct kvm_assigned_msix_nr entry_nr;
2258
 
                r = -EFAULT;
2259
 
                if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
2260
 
                        goto out;
2261
 
                r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
2262
 
                if (r)
2263
 
                        goto out;
2264
 
                break;
2265
 
        }
2266
 
        case KVM_ASSIGN_SET_MSIX_ENTRY: {
2267
 
                struct kvm_assigned_msix_entry entry;
2268
 
                r = -EFAULT;
2269
 
                if (copy_from_user(&entry, argp, sizeof entry))
2270
 
                        goto out;
2271
 
                r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
2272
 
                if (r)
2273
 
                        goto out;
2274
 
                break;
2275
 
        }
2276
 
#endif
2277
1878
        case KVM_IRQFD: {
2278
1879
                struct kvm_irqfd data;
2279
1880
 
2305
1906
#endif
2306
1907
        default:
2307
1908
                r = kvm_arch_vm_ioctl(filp, ioctl, arg);
2308
 
        }
2309
 
out:
2310
 
        return r;
2311
 
}
 
1909
                if (r == -ENOTTY)
 
1910
                        r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
 
1911
        }
 
1912
out:
 
1913
        return r;
 
1914
}
 
1915
 
 
1916
#ifdef CONFIG_COMPAT
 
1917
struct compat_kvm_dirty_log {
 
1918
        __u32 slot;
 
1919
        __u32 padding1;
 
1920
        union {
 
1921
                compat_uptr_t dirty_bitmap; /* one bit per page */
 
1922
                __u64 padding2;
 
1923
        };
 
1924
};
 
1925
 
 
1926
static long kvm_vm_compat_ioctl(struct file *filp,
 
1927
                           unsigned int ioctl, unsigned long arg)
 
1928
{
 
1929
        struct kvm *kvm = filp->private_data;
 
1930
        int r;
 
1931
 
 
1932
        if (kvm->mm != current->mm)
 
1933
                return -EIO;
 
1934
        switch (ioctl) {
 
1935
        case KVM_GET_DIRTY_LOG: {
 
1936
                struct compat_kvm_dirty_log compat_log;
 
1937
                struct kvm_dirty_log log;
 
1938
 
 
1939
                r = -EFAULT;
 
1940
                if (copy_from_user(&compat_log, (void __user *)arg,
 
1941
                                   sizeof(compat_log)))
 
1942
                        goto out;
 
1943
                log.slot         = compat_log.slot;
 
1944
                log.padding1     = compat_log.padding1;
 
1945
                log.padding2     = compat_log.padding2;
 
1946
                log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
 
1947
 
 
1948
                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
 
1949
                if (r)
 
1950
                        goto out;
 
1951
                break;
 
1952
        }
 
1953
        default:
 
1954
                r = kvm_vm_ioctl(filp, ioctl, arg);
 
1955
        }
 
1956
 
 
1957
out:
 
1958
        return r;
 
1959
}
 
1960
#endif
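/*
 * Illustrative sketch, not part of the diff: what the (non-compat) userspace
 * side of KVM_GET_DIRTY_LOG looks like.  "vm_fd", "slot" and the bitmap size
 * are hypothetical; struct kvm_dirty_log and the ioctl number come from
 * <linux/kvm.h>.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <string.h>

static int example_get_dirty_log(int vm_fd, __u32 slot,
                                 void *bitmap, size_t bitmap_bytes)
{
        struct kvm_dirty_log log;

        memset(&log, 0, sizeof(log));
        log.slot = slot;
        log.dirty_bitmap = bitmap;              /* one bit per page in the slot */
        memset(bitmap, 0, bitmap_bytes);

        return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
}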
2312
1961
 
2313
1962
static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2314
1963
{
2344
1993
static struct file_operations kvm_vm_fops = {
2345
1994
        .release        = kvm_vm_release,
2346
1995
        .unlocked_ioctl = kvm_vm_ioctl,
2347
 
        .compat_ioctl   = kvm_vm_ioctl,
 
1996
#ifdef CONFIG_COMPAT
 
1997
        .compat_ioctl   = kvm_vm_compat_ioctl,
 
1998
#endif
2348
1999
        .mmap           = kvm_vm_mmap,
 
2000
        .llseek         = noop_llseek,
2349
2001
};
2350
2002
 
2351
2003
static int kvm_dev_ioctl_create_vm(void)
2352
2004
{
2353
 
        int fd;
 
2005
        int r;
2354
2006
        struct kvm *kvm;
2355
2007
 
2356
2008
        kvm = kvm_create_vm();
2357
2009
        if (IS_ERR(kvm))
2358
2010
                return PTR_ERR(kvm);
2359
 
        fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0);
2360
 
        if (fd < 0)
 
2011
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
 
2012
        r = kvm_coalesced_mmio_init(kvm);
 
2013
        if (r < 0) {
 
2014
                kvm_put_kvm(kvm);
 
2015
                return r;
 
2016
        }
 
2017
#endif
 
2018
        r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
 
2019
        if (r < 0)
2361
2020
                kvm_put_kvm(kvm);
2362
2021
 
2363
 
        return fd;
 
2022
        return r;
2364
2023
}
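/*
 * Illustrative sketch, not part of the diff: the userspace counterpart of
 * kvm_dev_ioctl_create_vm() -- the fd returned by KVM_CREATE_VM is the
 * anon-inode fd created above.  Error handling is kept to a minimum.
 */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int example_create_vm_and_vcpu(int *vcpu_fd)
{
        int kvm_fd = open("/dev/kvm", O_RDWR);
        int vm_fd;

        if (kvm_fd < 0)
                return -1;

        vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);        /* handled in this file */
        if (vm_fd < 0)
                return -1;

        *vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);    /* vcpu id 0 */
        return vm_fd;
}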
2365
2024
 
2366
2025
static long kvm_dev_ioctl_check_extension_generic(long arg)
2372
2031
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
2373
2032
        case KVM_CAP_SET_BOOT_CPU_ID:
2374
2033
#endif
 
2034
        case KVM_CAP_INTERNAL_ERROR_DATA:
2375
2035
                return 1;
2376
2036
#ifdef CONFIG_HAVE_KVM_IRQCHIP
2377
2037
        case KVM_CAP_IRQ_ROUTING:
2431
2091
static struct file_operations kvm_chardev_ops = {
2432
2092
        .unlocked_ioctl = kvm_dev_ioctl,
2433
2093
        .compat_ioctl   = kvm_dev_ioctl,
 
2094
        .llseek         = noop_llseek,
2434
2095
};
2435
2096
 
2436
2097
static struct miscdevice kvm_dev = {
2439
2100
        &kvm_chardev_ops,
2440
2101
};
2441
2102
 
2442
 
static void hardware_enable(void *junk)
 
2103
static void hardware_enable_nolock(void *junk)
2443
2104
{
2444
2105
        int cpu = raw_smp_processor_id();
 
2106
        int r;
2445
2107
 
2446
2108
        if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
2447
2109
                return;
 
2110
 
2448
2111
        cpumask_set_cpu(cpu, cpus_hardware_enabled);
2449
 
        kvm_arch_hardware_enable(NULL);
2450
 
}
2451
 
 
2452
 
static void hardware_disable(void *junk)
 
2112
 
 
2113
        r = kvm_arch_hardware_enable(NULL);
 
2114
 
 
2115
        if (r) {
 
2116
                cpumask_clear_cpu(cpu, cpus_hardware_enabled);
 
2117
                atomic_inc(&hardware_enable_failed);
 
2118
                printk(KERN_INFO "kvm: enabling virtualization on "
 
2119
                                 "CPU%d failed\n", cpu);
 
2120
        }
 
2121
}
 
2122
 
 
2123
static void hardware_enable(void *junk)
 
2124
{
 
2125
        spin_lock(&kvm_lock);
 
2126
        hardware_enable_nolock(junk);
 
2127
        spin_unlock(&kvm_lock);
 
2128
}
 
2129
 
 
2130
static void hardware_disable_nolock(void *junk)
2453
2131
{
2454
2132
        int cpu = raw_smp_processor_id();
2455
2133
 
2459
2137
        kvm_arch_hardware_disable(NULL);
2460
2138
}
2461
2139
 
 
2140
static void hardware_disable(void *junk)
 
2141
{
 
2142
        spin_lock(&kvm_lock);
 
2143
        hardware_disable_nolock(junk);
 
2144
        spin_unlock(&kvm_lock);
 
2145
}
 
2146
 
 
2147
static void hardware_disable_all_nolock(void)
 
2148
{
 
2149
        BUG_ON(!kvm_usage_count);
 
2150
 
 
2151
        kvm_usage_count--;
 
2152
        if (!kvm_usage_count)
 
2153
                on_each_cpu(hardware_disable_nolock, NULL, 1);
 
2154
}
 
2155
 
 
2156
static void hardware_disable_all(void)
 
2157
{
 
2158
        spin_lock(&kvm_lock);
 
2159
        hardware_disable_all_nolock();
 
2160
        spin_unlock(&kvm_lock);
 
2161
}
 
2162
 
 
2163
static int hardware_enable_all(void)
 
2164
{
 
2165
        int r = 0;
 
2166
 
 
2167
        spin_lock(&kvm_lock);
 
2168
 
 
2169
        kvm_usage_count++;
 
2170
        if (kvm_usage_count == 1) {
 
2171
                atomic_set(&hardware_enable_failed, 0);
 
2172
                on_each_cpu(hardware_enable_nolock, NULL, 1);
 
2173
 
 
2174
                if (atomic_read(&hardware_enable_failed)) {
 
2175
                        hardware_disable_all_nolock();
 
2176
                        r = -EBUSY;
 
2177
                }
 
2178
        }
 
2179
 
 
2180
        spin_unlock(&kvm_lock);
 
2181
 
 
2182
        return r;
 
2183
}
 
2184
 
2462
2185
static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2463
2186
                           void *v)
2464
2187
{
2465
2188
        int cpu = (long)v;
2466
2189
 
 
2190
        if (!kvm_usage_count)
 
2191
                return NOTIFY_OK;
 
2192
 
2467
2193
        val &= ~CPU_TASKS_FROZEN;
2468
2194
        switch (val) {
2469
2195
        case CPU_DYING:
2471
2197
                       cpu);
2472
2198
                hardware_disable(NULL);
2473
2199
                break;
2474
 
        case CPU_UP_CANCELED:
2475
 
                printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
2476
 
                       cpu);
2477
 
                smp_call_function_single(cpu, hardware_disable, NULL, 1);
2478
 
                break;
2479
 
        case CPU_ONLINE:
 
2200
        case CPU_STARTING:
2480
2201
                printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
2481
2202
                       cpu);
2482
 
                smp_call_function_single(cpu, hardware_enable, NULL, 1);
 
2203
                hardware_enable(NULL);
2483
2204
                break;
2484
2205
        }
2485
2206
        return NOTIFY_OK;
2486
2207
}
2487
2208
 
2488
2209
 
2489
 
asmlinkage void kvm_handle_fault_on_reboot(void)
 
2210
asmlinkage void kvm_spurious_fault(void)
2490
2211
{
2491
 
        if (kvm_rebooting)
2492
 
                /* spin while reset goes on */
2493
 
                while (true)
2494
 
                        ;
2495
2212
        /* Fault while not rebooting.  We want the trace. */
2496
2213
        BUG();
2497
2214
}
2498
 
EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);
 
2215
EXPORT_SYMBOL_GPL(kvm_spurious_fault);
2499
2216
 
2500
2217
static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2501
2218
                      void *v)
2508
2225
         */
2509
2226
        printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2510
2227
        kvm_rebooting = true;
2511
 
        on_each_cpu(hardware_disable, NULL, 1);
 
2228
        on_each_cpu(hardware_disable_nolock, NULL, 1);
2512
2229
        return NOTIFY_OK;
2513
2230
}
2514
2231
 
2517
2234
        .priority = 0,
2518
2235
};
2519
2236
 
2520
 
void kvm_io_bus_init(struct kvm_io_bus *bus)
2521
 
{
2522
 
        memset(bus, 0, sizeof(*bus));
2523
 
}
2524
 
 
2525
 
void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 
2237
static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
2526
2238
{
2527
2239
        int i;
2528
2240
 
2531
2243
 
2532
2244
                kvm_iodevice_destructor(pos);
2533
2245
        }
 
2246
        kfree(bus);
2534
2247
}
2535
2248
 
2536
2249
/* kvm_io_bus_write - called under kvm->slots_lock */
2537
 
int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr,
 
2250
int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2538
2251
                     int len, const void *val)
2539
2252
{
2540
2253
        int i;
 
2254
        struct kvm_io_bus *bus;
 
2255
 
 
2256
        bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2541
2257
        for (i = 0; i < bus->dev_count; i++)
2542
2258
                if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
2543
2259
                        return 0;
2545
2261
}
2546
2262
 
2547
2263
/* kvm_io_bus_read - called under kvm->slots_lock */
2548
 
int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val)
 
2264
int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 
2265
                    int len, void *val)
2549
2266
{
2550
2267
        int i;
 
2268
        struct kvm_io_bus *bus;
 
2269
 
 
2270
        bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2551
2271
        for (i = 0; i < bus->dev_count; i++)
2552
2272
                if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
2553
2273
                        return 0;
2554
2274
        return -EOPNOTSUPP;
2555
2275
}
2556
2276
 
2557
 
int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus,
2558
 
                             struct kvm_io_device *dev)
2559
 
{
2560
 
        int ret;
2561
 
 
2562
 
        down_write(&kvm->slots_lock);
2563
 
        ret = __kvm_io_bus_register_dev(bus, dev);
2564
 
        up_write(&kvm->slots_lock);
2565
 
 
2566
 
        return ret;
2567
 
}
2568
 
 
2569
 
/* An unlocked version. Caller must have write lock on slots_lock. */
2570
 
int __kvm_io_bus_register_dev(struct kvm_io_bus *bus,
2571
 
                              struct kvm_io_device *dev)
2572
 
{
 
2277
/* Caller must hold slots_lock. */
 
2278
int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 
2279
                            struct kvm_io_device *dev)
 
2280
{
 
2281
        struct kvm_io_bus *new_bus, *bus;
 
2282
 
 
2283
        bus = kvm->buses[bus_idx];
2573
2284
        if (bus->dev_count > NR_IOBUS_DEVS-1)
2574
2285
                return -ENOSPC;
2575
2286
 
2576
 
        bus->devs[bus->dev_count++] = dev;
 
2287
        new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
 
2288
        if (!new_bus)
 
2289
                return -ENOMEM;
 
2290
        memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
 
2291
        new_bus->devs[new_bus->dev_count++] = dev;
 
2292
        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
 
2293
        synchronize_srcu_expedited(&kvm->srcu);
 
2294
        kfree(bus);
2577
2295
 
2578
2296
        return 0;
2579
2297
}
2580
2298
 
2581
 
void kvm_io_bus_unregister_dev(struct kvm *kvm,
2582
 
                               struct kvm_io_bus *bus,
2583
 
                               struct kvm_io_device *dev)
2584
 
{
2585
 
        down_write(&kvm->slots_lock);
2586
 
        __kvm_io_bus_unregister_dev(bus, dev);
2587
 
        up_write(&kvm->slots_lock);
2588
 
}
2589
 
 
2590
 
/* An unlocked version. Caller must have write lock on slots_lock. */
2591
 
void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus,
2592
 
                                 struct kvm_io_device *dev)
2593
 
{
2594
 
        int i;
2595
 
 
2596
 
        for (i = 0; i < bus->dev_count; i++)
2597
 
                if (bus->devs[i] == dev) {
2598
 
                        bus->devs[i] = bus->devs[--bus->dev_count];
 
2299
/* Caller must hold slots_lock. */
 
2300
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 
2301
                              struct kvm_io_device *dev)
 
2302
{
 
2303
        int i, r;
 
2304
        struct kvm_io_bus *new_bus, *bus;
 
2305
 
 
2306
        new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
 
2307
        if (!new_bus)
 
2308
                return -ENOMEM;
 
2309
 
 
2310
        bus = kvm->buses[bus_idx];
 
2311
        memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
 
2312
 
 
2313
        r = -ENOENT;
 
2314
        for (i = 0; i < new_bus->dev_count; i++)
 
2315
                if (new_bus->devs[i] == dev) {
 
2316
                        r = 0;
 
2317
                        new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
2599
2318
                        break;
2600
2319
                }
 
2320
 
 
2321
        if (r) {
 
2322
                kfree(new_bus);
 
2323
                return r;
 
2324
        }
 
2325
 
 
2326
        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
 
2327
        synchronize_srcu_expedited(&kvm->srcu);
 
2328
        kfree(bus);
 
2329
        return r;
2601
2330
}
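/*
 * Illustrative sketch, not part of the diff: registering a kvm_io_device on
 * the MMIO bus with the copy-and-publish helpers above.  The ops structure and
 * helper name are hypothetical; slots_lock (a mutex in this tree) must be held
 * around the register call, as the comment above requires.
 */
static const struct kvm_io_device_ops example_mmio_ops; /* .read/.write filled in elsewhere (hypothetical) */

static int example_register_mmio_device(struct kvm *kvm,
                                        struct kvm_io_device *dev)
{
        int r;

        kvm_iodevice_init(dev, &example_mmio_ops);

        mutex_lock(&kvm->slots_lock);
        r = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dev);
        mutex_unlock(&kvm->slots_lock);

        return r;
}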
2602
2331
 
2603
2332
static struct notifier_block kvm_cpu_notifier = {
2604
2333
        .notifier_call = kvm_cpu_hotplug,
2605
 
        .priority = 20, /* must be > scheduler priority */
2606
2334
};
2607
2335
 
2608
2336
static int vm_stat_get(void *_offset, u64 *val)
2666
2394
 
2667
2395
static int kvm_suspend(struct sys_device *dev, pm_message_t state)
2668
2396
{
2669
 
        hardware_disable(NULL);
 
2397
        if (kvm_usage_count)
 
2398
                hardware_disable_nolock(NULL);
2670
2399
        return 0;
2671
2400
}
2672
2401
 
2673
2402
static int kvm_resume(struct sys_device *dev)
2674
2403
{
2675
 
        hardware_enable(NULL);
 
2404
        if (kvm_usage_count) {
 
2405
                WARN_ON(spin_is_locked(&kvm_lock));
 
2406
                hardware_enable_nolock(NULL);
 
2407
        }
2676
2408
        return 0;
2677
2409
}
2678
2410
 
2711
2443
        kvm_arch_vcpu_put(vcpu);
2712
2444
}
2713
2445
 
2714
 
int kvm_init(void *opaque, unsigned int vcpu_size,
 
2446
int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2715
2447
                  struct module *module)
2716
2448
{
2717
2449
        int r;
2730
2462
 
2731
2463
        bad_pfn = page_to_pfn(bad_page);
2732
2464
 
 
2465
        hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 
2466
 
 
2467
        if (hwpoison_page == NULL) {
 
2468
                r = -ENOMEM;
 
2469
                goto out_free_0;
 
2470
        }
 
2471
 
 
2472
        hwpoison_pfn = page_to_pfn(hwpoison_page);
 
2473
 
 
2474
        fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 
2475
 
 
2476
        if (fault_page == NULL) {
 
2477
                r = -ENOMEM;
 
2478
                goto out_free_0;
 
2479
        }
 
2480
 
 
2481
        fault_pfn = page_to_pfn(fault_page);
 
2482
 
2733
2483
        if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
2734
2484
                r = -ENOMEM;
2735
2485
                goto out_free_0;
2747
2497
                        goto out_free_1;
2748
2498
        }
2749
2499
 
2750
 
        on_each_cpu(hardware_enable, NULL, 1);
2751
2500
        r = register_cpu_notifier(&kvm_cpu_notifier);
2752
2501
        if (r)
2753
2502
                goto out_free_2;
2762
2511
                goto out_free_4;
2763
2512
 
2764
2513
        /* A kmem cache lets us meet the alignment requirements of fx_save. */
2765
 
        kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
2766
 
                                           __alignof__(struct kvm_vcpu),
 
2514
        if (!vcpu_align)
 
2515
                vcpu_align = __alignof__(struct kvm_vcpu);
 
2516
        kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
2767
2517
                                           0, NULL);
2768
2518
        if (!kvm_vcpu_cache) {
2769
2519
                r = -ENOMEM;
2770
2520
                goto out_free_5;
2771
2521
        }
2772
2522
 
 
2523
        r = kvm_async_pf_init();
 
2524
        if (r)
 
2525
                goto out_free;
 
2526
 
2773
2527
        kvm_chardev_ops.owner = module;
2774
2528
        kvm_vm_fops.owner = module;
2775
2529
        kvm_vcpu_fops.owner = module;
2777
2531
        r = misc_register(&kvm_dev);
2778
2532
        if (r) {
2779
2533
                printk(KERN_ERR "kvm: misc device register failed\n");
2780
 
                goto out_free;
 
2534
                goto out_unreg;
2781
2535
        }
2782
2536
 
2783
2537
        kvm_preempt_ops.sched_in = kvm_sched_in;
2787
2541
 
2788
2542
        return 0;
2789
2543
 
 
2544
out_unreg:
 
2545
        kvm_async_pf_deinit();
2790
2546
out_free:
2791
2547
        kmem_cache_destroy(kvm_vcpu_cache);
2792
2548
out_free_5:
2797
2553
        unregister_reboot_notifier(&kvm_reboot_notifier);
2798
2554
        unregister_cpu_notifier(&kvm_cpu_notifier);
2799
2555
out_free_2:
2800
 
        on_each_cpu(hardware_disable, NULL, 1);
2801
2556
out_free_1:
2802
2557
        kvm_arch_hardware_unsetup();
2803
2558
out_free_0a:
2804
2559
        free_cpumask_var(cpus_hardware_enabled);
2805
2560
out_free_0:
 
2561
        if (fault_page)
 
2562
                __free_page(fault_page);
 
2563
        if (hwpoison_page)
 
2564
                __free_page(hwpoison_page);
2806
2565
        __free_page(bad_page);
2807
2566
out:
2808
2567
        kvm_arch_exit();
2813
2572
 
2814
2573
void kvm_exit(void)
2815
2574
{
2816
 
        tracepoint_synchronize_unregister();
2817
2575
        kvm_exit_debug();
2818
2576
        misc_deregister(&kvm_dev);
2819
2577
        kmem_cache_destroy(kvm_vcpu_cache);
 
2578
        kvm_async_pf_deinit();
2820
2579
        sysdev_unregister(&kvm_sysdev);
2821
2580
        sysdev_class_unregister(&kvm_sysdev_class);
2822
2581
        unregister_reboot_notifier(&kvm_reboot_notifier);
2823
2582
        unregister_cpu_notifier(&kvm_cpu_notifier);
2824
 
        on_each_cpu(hardware_disable, NULL, 1);
 
2583
        on_each_cpu(hardware_disable_nolock, NULL, 1);
2825
2584
        kvm_arch_hardware_unsetup();
2826
2585
        kvm_arch_exit();
2827
2586
        free_cpumask_var(cpus_hardware_enabled);
 
2587
        __free_page(hwpoison_page);
2828
2588
        __free_page(bad_page);
2829
2589
}
2830
2590
EXPORT_SYMBOL_GPL(kvm_exit);
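/*
 * Illustrative sketch, not part of the diff: how an architecture module would
 * call the updated kvm_init(), which now takes an explicit vcpu alignment
 * (pass 0 to fall back to __alignof__(struct kvm_vcpu)).  The container type
 * is hypothetical; a real architecture passes its hardware ops as the opaque
 * pointer instead of NULL.
 */
struct example_arch_vcpu {                      /* hypothetical arch container */
        struct kvm_vcpu vcpu;
        /* ... architecture-specific state ... */
};

static int __init example_arch_module_init(void)
{
        /* NULL stands in for the arch ops handed to kvm_arch_init(). */
        return kvm_init(NULL, sizeof(struct example_arch_vcpu),
                        __alignof__(struct example_arch_vcpu), THIS_MODULE);
}
module_init(example_arch_module_init);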