/*
 * Copyright (C) 2011 Citrix Systems Inc.
 *
 * This file is part of Blktap2.
 *
 * Blktap2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation.
 *
 * Blktap2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License version 2 for more details.
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with Blktap2.  If not, see
 * <http://www.gnu.org/licenses/>.
 */

#include <linux/module.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/poll.h>
#include <linux/blkdev.h>
#include <linux/mman.h>
#include <linux/mm.h>

#include "blktap.h"

/* VM_RESERVED has disappeared starting from Linux 3.7 and has been
 * replaced by VM_DONTDUMP since then. */
#ifndef VM_DONTDUMP
#define VM_DONTDUMP VM_RESERVED
#endif

int blktap_ring_major;
static struct cdev blktap_ring_cdev;

/*
 * BLKTAP - immediately before the mmap area,
 * we have a bunch of pages reserved for shared memory rings.
 */
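
/* A sketch of the resulting layout, as implied by MMAP_VADDR() and the
 * mmap handlers below (one sring page, per blktap_ring_mmap_sring()):
 *
 *   pgoff 0                          ring_vstart: shared sring page
 *   pgoff 1 + usr_idx * SEGMENT_MAX  user_vstart: data-page window of
 *                                    pending request 'usr_idx'
 */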

#define BLKTAP_INFO_SIZE_AT(_memb)				\
	(offsetof(struct blktap_device_info, _memb) +		\
	 sizeof(((struct blktap_device_info*)0)->_memb))
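
/* Worked example (offsets illustrative, not the real struct layout):
 * if 'flags' sat at offset 16 and were 4 bytes wide, then
 * BLKTAP_INFO_SIZE_AT(flags) == 20, i.e. the size of the struct prefix
 * up to and including 'flags'.  The CREATE_DEVICE ioctl below uses this
 * to copy successively larger prefixes of a versioned struct from
 * userspace. */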

static void
blktap_ring_read_response(struct blktap *tap,
			  const struct blktap_ring_response *rsp)
{
	struct blktap_ring *ring = &tap->ring;
	struct blktap_request *request;
	int usr_idx, err;

	request = NULL;

	usr_idx = rsp->id;
	if (usr_idx < 0 || usr_idx >= BLKTAP_RING_SIZE) {
		err = -ERANGE;
		goto invalid;
	}

	request = ring->pending[usr_idx];
	if (!request) {
		err = -ESRCH;
		goto invalid;
	}

	if (rsp->operation != request->operation) {
		err = -EINVAL;
		goto invalid;
	}

	dev_dbg(ring->dev,
		"request %d [%p] response: %d\n",
		request->usr_idx, request, rsp->status);

	err = rsp->status == BLKTAP_RSP_OKAY ? 0 : -EIO;
end_request:
	blktap_device_end_request(tap, request, err);
	return;

invalid:
	dev_warn(ring->dev,
		 "invalid response, idx:%d status:%d op:%d/%d: err %d\n",
		 usr_idx, rsp->status,
		 rsp->operation, request ? request->operation : -1,
		 err);
	if (request)
		goto end_request;
}

static void
blktap_read_ring(struct blktap *tap)
{
	struct blktap_ring *ring = &tap->ring;
	struct blktap_ring_response rsp;
	RING_IDX rc, rp;

	mutex_lock(&ring->vma_lock);
	if (!ring->vma) {
		mutex_unlock(&ring->vma_lock);
		return;
	}

	/* for each outstanding message on the ring */
	rp = ring->ring.sring->rsp_prod;
	rmb(); /* see responses /before/ reading their contents */

	for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
		memcpy(&rsp, RING_GET_RESPONSE(&ring->ring, rc), sizeof(rsp));
		blktap_ring_read_response(tap, &rsp);
	}

	ring->ring.rsp_cons = rc;

	mutex_unlock(&ring->vma_lock);
}

#define MMAP_VADDR(_start, _req, _seg)					\
	((_start) +							\
	 ((_req) * BLKTAP_SEGMENT_MAX * BLKTAP_PAGE_SIZE) +		\
	 ((_seg) * BLKTAP_PAGE_SIZE))
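
/* Worked example (values illustrative): with 4 KiB pages and
 * BLKTAP_SEGMENT_MAX == 11, segment 3 of request 2 lands at
 * MMAP_VADDR(user_vstart, 2, 3)
 *   == user_vstart + (2 * 11 + 3) * 4096
 *   == user_vstart + 0x19000. */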

static int blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	/* the area is fully populated with vm_insert_page(), so any
	 * fault is a stray access */
	return VM_FAULT_SIGBUS;
}

static void
blktap_ring_fail_pending(struct blktap *tap)
{
	struct blktap_ring *ring = &tap->ring;
	struct blktap_request *request;
	int usr_idx;

	for (usr_idx = 0; usr_idx < BLKTAP_RING_SIZE; usr_idx++) {
		request = ring->pending[usr_idx];
		if (!request)
			continue;

		blktap_device_end_request(tap, request, -EIO);
	}
}

static void
blktap_ring_vm_close_sring(struct blktap *tap,
			   struct vm_area_struct *vma)
{
	struct blktap_ring *ring = &tap->ring;
	struct page *page = virt_to_page(ring->ring.sring);

	blktap_ring_fail_pending(tap);

	ClearPageReserved(page);
	__free_page(page);

	ring->vma = NULL;

	if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
		blktap_control_destroy_tap(tap);
}

static void
blktap_ring_vm_close(struct vm_area_struct *vma)
{
	struct blktap *tap = vma->vm_private_data;

	dev_dbg(tap->ring.dev,
		"vm_close %lx-%lx (%lu) pgoff %lu\n",
		vma->vm_start, vma->vm_end, vma_pages(vma),
		vma->vm_pgoff);

	if (!vma->vm_pgoff)
		blktap_ring_vm_close_sring(tap, vma);
}

static struct vm_operations_struct blktap_ring_vm_operations = {
	.close = blktap_ring_vm_close,
	.fault = blktap_ring_fault,
};

static int
blktap_ring_map_request(struct blktap *tap, struct file *filp,
			struct blktap_request *request)
{
	struct blktap_ring *ring = &tap->ring;
	unsigned long addr, len, pgoff;
	int read, write, prot, flags;

	write = request->operation == BLKTAP_OP_WRITE;
	read  = request->operation == BLKTAP_OP_READ;

	/* write data is staged through the bounce buffer, when one is in use */
	blktap_request_bounce(tap, request, write);

	prot  = PROT_READ;
	prot |= read ? PROT_WRITE : 0;

	flags = MAP_FIXED|MAP_SHARED;

	addr = MMAP_VADDR(ring->user_vstart, request->usr_idx, 0);
	len  = request->nr_pages << PAGE_SHIFT;

	/* +1: skip the shared ring page at pgoff 0 */
	pgoff = 1 + request->usr_idx * BLKTAP_SEGMENT_MAX;

	addr = vm_mmap(filp, addr, len, prot, flags, pgoff << PAGE_SHIFT);

	return IS_ERR_VALUE(addr) ? addr : 0;
}
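
/* The pgoff chosen above is exactly what blktap_ring_mmap_request()
 * inverts when userspace faults the window back in: for usr_idx 2 (an
 * illustrative value), pgoff = 1 + 2 * BLKTAP_SEGMENT_MAX, and the mmap
 * path recovers usr_idx = (pgoff - 1) / BLKTAP_SEGMENT_MAX and
 * seg = (pgoff - 1) % BLKTAP_SEGMENT_MAX. */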

static void
blktap_ring_unmap_request(struct blktap *tap,
			  struct blktap_request *request)
{
	struct blktap_ring *ring = &tap->ring;
	unsigned long addr, len;
	int read, err;

	read = request->operation == BLKTAP_OP_READ;

	/* copy read data back out of the bounce buffer, when one is in use */
	blktap_request_bounce(tap, request, !read);

	addr = MMAP_VADDR(ring->user_vstart, request->usr_idx, 0);
	len  = request->nr_pages << PAGE_SHIFT;

	err = vm_munmap(addr, len);
	WARN_ON(err);
}

static void
blktap_ring_free_request(struct blktap *tap,
			 struct blktap_request *request)
{
	struct blktap_ring *ring = &tap->ring;

	ring->pending[request->usr_idx] = NULL;
	ring->n_pending--;

	blktap_request_free(tap, request);
}

struct blktap_request*
blktap_ring_make_request(struct blktap *tap)
{
	struct blktap_ring *ring = &tap->ring;
	struct blktap_request *request;
	int usr_idx;

	if (RING_FULL(&ring->ring))
		return ERR_PTR(-ENOSPC);

	request = blktap_request_alloc(tap);
	if (!request)
		return ERR_PTR(-ENOMEM);

	for (usr_idx = 0; usr_idx < BLKTAP_RING_SIZE; usr_idx++)
		if (!ring->pending[usr_idx])
			break;

	BUG_ON(usr_idx >= BLKTAP_RING_SIZE);

	request->usr_idx = usr_idx;

	ring->pending[usr_idx] = request;
	ring->n_pending++;

	return request;
}

static int
blktap_ring_make_rw_request(struct blktap *tap,
			    struct blktap_request *request,
			    struct blktap_ring_request *breq)
{
	struct scatterlist *sg;
	unsigned int i, nsecs = 0;

	blktap_for_each_sg(sg, request, i) {
		struct blktap_segment *seg = &breq->u.rw.seg[i];
		int first, count;

		/* >> 9: bytes to 512-byte sectors */
		count = sg->length >> 9;
		first = sg->offset >> 9;

		seg->first_sect = first;
		seg->last_sect  = first + count - 1;

		nsecs += count;
	}

	breq->u.rw.sector_number = blk_rq_pos(request->rq);

	return nsecs;
}

static int
blktap_ring_make_tr_request(struct blktap *tap,
			    struct blktap_request *request,
			    struct blktap_ring_request *breq)
{
	struct bio *bio = request->rq->bio;
	unsigned int nsecs;

	breq->u.tr.nr_sectors    = nsecs = bio_sectors(bio);
	breq->u.tr.sector_number = bio->bi_iter.bi_sector;

	return nsecs;
}

void
blktap_ring_submit_request(struct blktap *tap,
			   struct blktap_request *request)
{
	struct blktap_ring *ring = &tap->ring;
	struct blktap_ring_request *breq;
	int nsecs;

	dev_dbg(ring->dev,
		"request %d [%p] submit\n", request->usr_idx, request);

	breq = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);

	breq->id          = request->usr_idx;
	breq->operation   = request->operation;
	breq->nr_segments = request->nr_pages;

	switch (breq->operation) {
	case BLKTAP_OP_READ:
		nsecs = blktap_ring_make_rw_request(tap, request, breq);

		tap->stats.st_rd_sect += nsecs;
		tap->stats.st_rd_req++;
		break;

	case BLKTAP_OP_WRITE:
		nsecs = blktap_ring_make_rw_request(tap, request, breq);

		tap->stats.st_wr_sect += nsecs;
		tap->stats.st_wr_req++;
		break;

	case BLKTAP_OP_FLUSH:
		breq->u.rw.sector_number = 0;
		tap->stats.st_fl_req++;
		break;

	case BLKTAP_OP_TRIM:
		nsecs = blktap_ring_make_tr_request(tap, request, breq);

		tap->stats.st_tr_sect += nsecs;
		tap->stats.st_tr_req++;
		break;

	default:
		BUG();
	}

	ring->ring.req_prod_pvt++;
}
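
/* Note that req_prod_pvt is the ring's *private* producer index:
 * requests queued here only become visible to userspace once
 * blktap_ring_poll() issues RING_PUSH_REQUESTS(), which publishes the
 * private index to the shared sring->req_prod. */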

static int
blktap_ring_open(struct inode *inode, struct file *filp)
{
	struct blktap *tap = NULL;
	int minor;

	minor = iminor(inode);

	if (minor < blktap_max_minor)
		tap = blktaps[minor];

	if (!tap)
		return -ENXIO;

	if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
		return -ENXIO;

	if (tap->ring.task)
		return -EBUSY;

	filp->private_data = tap;
	tap->ring.task = current;

	return 0;
}

static int
blktap_ring_release(struct inode *inode, struct file *filp)
{
	struct blktap *tap = filp->private_data;

	blktap_device_destroy_sync(tap);

	tap->ring.task = NULL;

	if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
		blktap_control_destroy_tap(tap);

	return 0;
}

static int
blktap_ring_mmap_request(struct blktap *tap,
			 struct vm_area_struct *vma)
{
	struct blktap_ring *ring = &tap->ring;
	struct blktap_request *request;
	int usr_idx, seg, err;
	unsigned long addr, n_segs;

	usr_idx  = vma->vm_pgoff - 1;
	seg      = usr_idx % BLKTAP_SEGMENT_MAX;
	usr_idx /= BLKTAP_SEGMENT_MAX;

	request = ring->pending[usr_idx];
	if (!request)
		return -EINVAL;

	n_segs = request->nr_pages - seg;
	n_segs = min(n_segs, vma_pages(vma));

	for (addr = vma->vm_start;
	     n_segs-- > 0;
	     seg++, addr += PAGE_SIZE) {
		struct page *page = request->pages[seg];

		dev_dbg(tap->ring.dev,
			"mmap request %d seg %d addr %lx\n",
			usr_idx, seg, addr);

		err = vm_insert_page(vma, addr, page);
		if (err)
			return err;
	}

	vma->vm_flags |= VM_DONTCOPY;
	vma->vm_flags |= VM_DONTDUMP;

	return 0;
}

static int
blktap_ring_mmap_sring(struct blktap *tap, struct vm_area_struct *vma)
{
	struct blktap_ring *ring = &tap->ring;
	struct blktap_sring *sring;
	struct page *page = NULL;
	int err;

	if (ring->vma)
		return -EBUSY;

	page = alloc_page(GFP_KERNEL|__GFP_ZERO);
	if (!page)
		return -ENOMEM;

	SetPageReserved(page);

	err = vm_insert_page(vma, vma->vm_start, page);
	if (err)
		goto fail;

	sring = page_address(page);
	SHARED_RING_INIT(sring);
	FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);

	ring->ring_vstart = vma->vm_start;
	ring->user_vstart = ring->ring_vstart + PAGE_SIZE;

	vma->vm_private_data = tap;

	vma->vm_flags |= VM_DONTCOPY;
	vma->vm_flags |= VM_DONTDUMP;

	vma->vm_ops = &blktap_ring_vm_operations;

	ring->vma = vma;
	return 0;

fail:
	ClearPageReserved(page);
	__free_page(page);

	return err;
}

static int
blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct blktap *tap = filp->private_data;

	dev_dbg(tap->ring.dev,
		"mmap %lx-%lx (%lu) pgoff %lu\n",
		vma->vm_start, vma->vm_end, vma_pages(vma),
		vma->vm_pgoff);

	if (!vma->vm_pgoff)
		return blktap_ring_mmap_sring(tap, vma);

	return blktap_ring_mmap_request(tap, vma);
}

static long
blktap_ring_ioctl(struct file *filp,
		  unsigned int cmd, unsigned long arg)
{
	struct blktap *tap = filp->private_data;
	struct blktap_ring *ring = &tap->ring;
	void __user *ptr = (void __user *)arg;
	int err;

	BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);

	if (!ring->vma || ring->vma->vm_mm != current->mm)
		return -EACCES;

	switch (cmd) {
	case BLKTAP_IOCTL_RESPOND:

		blktap_read_ring(tap);
		return 0;

	case BLKTAP_IOCTL_CREATE_DEVICE_COMPAT: {
		struct blktap_device_info info;
		struct blktap2_params params;

		if (copy_from_user(&params, ptr, sizeof(params)))
			return -EFAULT;

		info.capacity    = params.capacity;
		info.sector_size = params.sector_size;
		info.flags       = 0;

		err = blktap_device_create(tap, &info);
		if (err)
			return err;

		if (params.name[0]) {
			strncpy(tap->name, params.name, sizeof(params.name));
			tap->name[sizeof(tap->name)-1] = 0;
		}

		return 0;
	}

	case BLKTAP_IOCTL_CREATE_DEVICE: {
		struct blktap_device_info __user *ptr = (void __user *)arg;
		struct blktap_device_info info;
		unsigned long mask;
		size_t base_sz, sz;

		mask  = BLKTAP_DEVICE_FLAG_RO;
		mask |= BLKTAP_DEVICE_FLAG_PSZ;
		mask |= BLKTAP_DEVICE_FLAG_FLUSH;
		mask |= BLKTAP_DEVICE_FLAG_TRIM;
		mask |= BLKTAP_DEVICE_FLAG_TRIM_RZ;

		memset(&info, 0, sizeof(info));
		sz = base_sz = BLKTAP_INFO_SIZE_AT(flags);

		if (copy_from_user(&info, ptr, sz))
			return -EFAULT;

		if (info.flags & BLKTAP_DEVICE_FLAG_PSZ)
			sz = BLKTAP_INFO_SIZE_AT(phys_block_offset);

		if (info.flags & BLKTAP_DEVICE_FLAG_TRIM)
			sz = BLKTAP_INFO_SIZE_AT(trim_block_offset);

		if (sz > base_sz)
			if (copy_from_user(&info, ptr, sz))
				return -EFAULT;

		/* hand the recognized flags back to userspace */
		if (put_user(info.flags & mask, &ptr->flags))
			return -EFAULT;

		return blktap_device_create(tap, &info);
	}

	case BLKTAP_IOCTL_REMOVE_DEVICE:

		return blktap_device_destroy(tap);
	}

	return -ENOTTY;
}
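
/* A sketch of the CREATE_DEVICE call from userspace, illustrating the
 * feature negotiation above (field values hypothetical; the real
 * layout comes from blktap.h):
 *
 *	struct blktap_device_info info = {
 *		.capacity    = nsectors,
 *		.sector_size = 512,
 *		.flags       = BLKTAP_DEVICE_FLAG_TRIM,
 *	};
 *	ioctl(ring_fd, BLKTAP_IOCTL_CREATE_DEVICE, &info);
 *
 * On return the kernel has rewritten info.flags to the subset it
 * recognized (info.flags & mask), so a newer tapdisk can detect which
 * features an older kernel module actually honored. */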

static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
{
	struct blktap *tap = filp->private_data;
	struct blktap_ring *ring = &tap->ring;
	int work;

	poll_wait(filp, &tap->pool->wait, wait);
	poll_wait(filp, &ring->poll_wait, wait);

	mutex_lock(&ring->vma_lock);
	if (ring->vma && tap->device.gd)
		blktap_device_run_queue(tap, filp);
	mutex_unlock(&ring->vma_lock);

	work = ring->ring.req_prod_pvt - ring->ring.sring->req_prod;
	RING_PUSH_REQUESTS(&ring->ring);

	if (work ||
	    *BLKTAP_RING_MESSAGE(ring->ring.sring) ||
	    test_and_clear_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse))
		return POLLIN | POLLRDNORM;

	return 0;
}
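
/* A sketch of the event loop this poll contract assumes on the
 * userspace side (tapdisk-like pseudo-code; names illustrative):
 *
 *	struct pollfd pfd = { .fd = ring_fd, .events = POLLIN };
 *	while (poll(&pfd, 1, -1) >= 0) {
 *		consume_requests(sring);                  // up to req_prod
 *		queue_responses(sring);                   // bump rsp_prod
 *		ioctl(ring_fd, BLKTAP_IOCTL_RESPOND, 0);  // drain into kernel
 *	}
 *
 * Note that poll() is also the submission path: it runs the device
 * queue and pushes privately queued requests before testing for work. */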

static struct file_operations blktap_ring_file_operations = {
	.owner          = THIS_MODULE,
	.open           = blktap_ring_open,
	.release        = blktap_ring_release,
	.unlocked_ioctl = blktap_ring_ioctl,
	.mmap           = blktap_ring_mmap,
	.poll           = blktap_ring_poll,
};

void
blktap_ring_kick_user(struct blktap *tap)
{
	wake_up(&tap->ring.poll_wait);
}

int
blktap_ring_destroy(struct blktap *tap)
{
	struct blktap_ring *ring = &tap->ring;

	if (ring->task || ring->vma)
		return -EBUSY;

	return 0;
}

int
blktap_ring_create(struct blktap *tap)
{
	struct blktap_ring *ring = &tap->ring;

	init_waitqueue_head(&ring->poll_wait);
	ring->devno = MKDEV(blktap_ring_major, tap->minor);
	mutex_init(&ring->vma_lock);

	return 0;
}

int
blktap_ring_debug(struct blktap *tap, char *buf, size_t size)
{
	struct blktap_ring *ring = &tap->ring;
	char *s = buf, *end = buf + size;
	int usr_idx;

	s += snprintf(s, end - s,
		      "begin pending:%d\n", ring->n_pending);

	for (usr_idx = 0; usr_idx < BLKTAP_RING_SIZE; usr_idx++) {
		struct blktap_request *request;
		struct timeval t;

		request = ring->pending[usr_idx];
		if (!request)
			continue;

		jiffies_to_timeval(jiffies - request->rq->start_time, &t);

		s += snprintf(s, end - s,
			      "%02d: usr_idx:%02d "
			      "op:%x nr_pages:%02d time:%lu.%06lu\n",
			      usr_idx, request->usr_idx,
			      request->operation, request->nr_pages,
			      t.tv_sec, t.tv_usec);
	}

	s += snprintf(s, end - s, "end pending\n");

	return s - buf;
}

int __init
blktap_ring_init(void)
{
	dev_t dev = 0;
	int err;

	cdev_init(&blktap_ring_cdev, &blktap_ring_file_operations);
	blktap_ring_cdev.owner = THIS_MODULE;

	err = alloc_chrdev_region(&dev, 0, MAX_BLKTAP_DEVICE, "blktap2");
	if (err < 0) {
		BTERR("error registering ring devices: %d\n", err);
		return err;
	}

	err = cdev_add(&blktap_ring_cdev, dev, MAX_BLKTAP_DEVICE);
	if (err) {
		BTERR("error adding ring device: %d\n", err);
		unregister_chrdev_region(dev, MAX_BLKTAP_DEVICE);
		return err;
	}

	blktap_ring_major = MAJOR(dev);
	BTINFO("blktap ring major: %d\n", blktap_ring_major);

	return 0;
}

void
blktap_ring_exit(void)
{
	if (!blktap_ring_major)
		return;

	cdev_del(&blktap_ring_cdev);
	unregister_chrdev_region(MKDEV(blktap_ring_major, 0),
				 MAX_BLKTAP_DEVICE);

	blktap_ring_major = 0;
}