5
#include <linux/module.h>
7
#include <linux/genhd.h>
8
#include <linux/kdev_t.h>
9
#include <linux/kernel.h>
10
#include <linux/blkdev.h>
11
#include <linux/init.h>
12
#include <linux/spinlock.h>
13
#include <linux/proc_fs.h>
14
#include <linux/seq_file.h>
15
#include <linux/slab.h>
16
#include <linux/kmod.h>
17
#include <linux/kobj_map.h>
18
#include <linux/buffer_head.h>
19
#include <linux/mutex.h>
20
#include <linux/idr.h>
21
#include <linux/log2.h>
25
static DEFINE_MUTEX(block_class_lock);
26
struct kobject *block_depr;
28
/* for extended dynamic devt allocation, currently only one major is used */
29
#define MAX_EXT_DEVT (1 << MINORBITS)
31
/* For extended devt allocation. ext_devt_mutex prevents look up
32
* results from going away underneath its user.
34
static DEFINE_MUTEX(ext_devt_mutex);
35
static DEFINE_IDR(ext_devt_idr);
37
static struct device_type disk_type;
39
static void disk_add_events(struct gendisk *disk);
40
static void disk_del_events(struct gendisk *disk);
41
static void disk_release_events(struct gendisk *disk);
44
* disk_get_part - get partition
45
* @disk: disk to look up the partition from
46
* @partno: partition number
48
* Look for partition @partno from @disk. If found, increment
49
* reference count and return it.
55
* Pointer to the found partition on success, NULL if not found.
57
struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
59
struct hd_struct *part = NULL;
60
struct disk_part_tbl *ptbl;
62
if (unlikely(partno < 0))
67
ptbl = rcu_dereference(disk->part_tbl);
68
if (likely(partno < ptbl->len)) {
69
part = rcu_dereference(ptbl->part[partno]);
71
get_device(part_to_dev(part));
78
EXPORT_SYMBOL_GPL(disk_get_part);
81
* disk_part_iter_init - initialize partition iterator
82
* @piter: iterator to initialize
83
* @disk: disk to iterate over
84
* @flags: DISK_PITER_* flags
86
* Initialize @piter so that it iterates over partitions of @disk.
91
void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
94
struct disk_part_tbl *ptbl;
97
ptbl = rcu_dereference(disk->part_tbl);
102
if (flags & DISK_PITER_REVERSE)
103
piter->idx = ptbl->len - 1;
104
else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0))
109
piter->flags = flags;
113
EXPORT_SYMBOL_GPL(disk_part_iter_init);
116
* disk_part_iter_next - proceed iterator to the next partition and return it
117
* @piter: iterator of interest
119
* Proceed @piter to the next partition and return it.
124
struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
126
struct disk_part_tbl *ptbl;
129
/* put the last partition */
130
disk_put_part(piter->part);
135
ptbl = rcu_dereference(piter->disk->part_tbl);
137
/* determine iteration parameters */
138
if (piter->flags & DISK_PITER_REVERSE) {
140
if (piter->flags & (DISK_PITER_INCL_PART0 |
141
DISK_PITER_INCL_EMPTY_PART0))
150
/* iterate to the next partition */
151
for (; piter->idx != end; piter->idx += inc) {
152
struct hd_struct *part;
154
part = rcu_dereference(ptbl->part[piter->idx]);
157
if (!part->nr_sects &&
158
!(piter->flags & DISK_PITER_INCL_EMPTY) &&
159
!(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
163
get_device(part_to_dev(part));
173
EXPORT_SYMBOL_GPL(disk_part_iter_next);
176
* disk_part_iter_exit - finish up partition iteration
177
* @piter: iter of interest
179
* Called when iteration is over. Cleans up @piter.
184
void disk_part_iter_exit(struct disk_part_iter *piter)
186
disk_put_part(piter->part);
189
EXPORT_SYMBOL_GPL(disk_part_iter_exit);
191
static inline int sector_in_part(struct hd_struct *part, sector_t sector)
193
return part->start_sect <= sector &&
194
sector < part->start_sect + part->nr_sects;
198
* disk_map_sector_rcu - map sector to partition
199
* @disk: gendisk of interest
200
* @sector: sector to map
202
* Find out which partition @sector maps to on @disk. This is
203
* primarily used for stats accounting.
206
* RCU read locked. The returned partition pointer is valid only
207
* while preemption is disabled.
210
* Found partition on success; part0 is returned if no partition matches.
212
struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
214
struct disk_part_tbl *ptbl;
215
struct hd_struct *part;
218
ptbl = rcu_dereference(disk->part_tbl);
220
part = rcu_dereference(ptbl->last_lookup);
221
if (part && sector_in_part(part, sector))
224
for (i = 1; i < ptbl->len; i++) {
225
part = rcu_dereference(ptbl->part[i]);
227
if (part && sector_in_part(part, sector)) {
228
rcu_assign_pointer(ptbl->last_lookup, part);
234
EXPORT_SYMBOL_GPL(disk_map_sector_rcu);
237
* Can be deleted altogether. Later.
240
static struct blk_major_name {
241
struct blk_major_name *next;
244
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
246
/* index in the above - for now: assume no multimajor ranges */
247
static inline int major_to_index(unsigned major)
249
return major % BLKDEV_MAJOR_HASH_SIZE;
252
#ifdef CONFIG_PROC_FS
253
void blkdev_show(struct seq_file *seqf, off_t offset)
255
struct blk_major_name *dp;
257
if (offset < BLKDEV_MAJOR_HASH_SIZE) {
258
mutex_lock(&block_class_lock);
259
for (dp = major_names[offset]; dp; dp = dp->next)
260
seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
261
mutex_unlock(&block_class_lock);
264
#endif /* CONFIG_PROC_FS */
267
* register_blkdev - register a new block device
269
* @major: the requested major device number [1..255]. If @major=0, try to
270
* allocate any unused major number.
271
* @name: the name of the new block device as a zero terminated string
273
* The @name must be unique within the system.
275
* The return value depends on the @major input parameter.
276
* - if a major device number was requested in range [1..255] then the
277
* function returns zero on success, or a negative error code
278
* - if any unused major number was requested with @major=0 parameter
279
* then the return value is the allocated major number in range
280
* [1..255] or a negative error code otherwise
282
int register_blkdev(unsigned int major, const char *name)
284
struct blk_major_name **n, *p;
287
mutex_lock(&block_class_lock);
291
for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
292
if (major_names[index] == NULL)
297
printk("register_blkdev: failed to get major for %s\n",
306
p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
313
strlcpy(p->name, name, sizeof(p->name));
315
index = major_to_index(major);
317
for (n = &major_names[index]; *n; n = &(*n)->next) {
318
if ((*n)->major == major)
327
printk("register_blkdev: cannot get major %d for %s\n",
332
mutex_unlock(&block_class_lock);
336
EXPORT_SYMBOL(register_blkdev);
338
void unregister_blkdev(unsigned int major, const char *name)
340
struct blk_major_name **n;
341
struct blk_major_name *p = NULL;
342
int index = major_to_index(major);
344
mutex_lock(&block_class_lock);
345
for (n = &major_names[index]; *n; n = &(*n)->next)
346
if ((*n)->major == major)
348
if (!*n || strcmp((*n)->name, name)) {
354
mutex_unlock(&block_class_lock);
358
EXPORT_SYMBOL(unregister_blkdev);
360
static struct kobj_map *bdev_map;
363
* blk_mangle_minor - scatter minor numbers apart
364
* @minor: minor number to mangle
366
* Scatter consecutively allocated minors apart if CONFIG_DEBUG_BLOCK_EXT_DEVT
367
* is enabled. Mangling twice gives the original value.
375
static int blk_mangle_minor(int minor)
377
#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
380
for (i = 0; i < MINORBITS / 2; i++) {
381
int low = minor & (1 << i);
382
int high = minor & (1 << (MINORBITS - 1 - i));
383
int distance = MINORBITS - 1 - 2 * i;
385
minor ^= low | high; /* clear both bits */
386
low <<= distance; /* swap the positions */
388
minor |= low | high; /* and set */
395
* blk_alloc_devt - allocate a dev_t for a partition
396
* @part: partition to allocate dev_t for
397
* @devt: out parameter for resulting dev_t
399
* Allocate a dev_t for block device.
402
* 0 on success, allocated dev_t is returned in *@devt. -errno on
408
int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
410
struct gendisk *disk = part_to_disk(part);
413
/* in consecutive minor range? */
414
if (part->partno < disk->minors) {
415
*devt = MKDEV(disk->major, disk->first_minor + part->partno);
419
/* allocate ext devt */
421
if (!idr_pre_get(&ext_devt_idr, GFP_KERNEL))
423
rc = idr_get_new(&ext_devt_idr, part, &idx);
424
} while (rc == -EAGAIN);
429
if (idx > MAX_EXT_DEVT) {
430
idr_remove(&ext_devt_idr, idx);
434
*devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx));
439
* blk_free_devt - free a dev_t
440
* @devt: dev_t to free
442
* Free @devt which was allocated using blk_alloc_devt().
447
void blk_free_devt(dev_t devt)
451
if (devt == MKDEV(0, 0))
454
if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
455
mutex_lock(&ext_devt_mutex);
456
idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
457
mutex_unlock(&ext_devt_mutex);
461
static char *bdevt_str(dev_t devt, char *buf)
463
if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) {
464
char tbuf[BDEVT_SIZE];
465
snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt));
466
snprintf(buf, BDEVT_SIZE, "%-9s", tbuf);
468
snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt));
474
* Register device numbers dev..(dev+range-1)
475
* range must be nonzero
476
* The hash chain is sorted on range, so that subranges can override.
478
void blk_register_region(dev_t devt, unsigned long range, struct module *module,
479
struct kobject *(*probe)(dev_t, int *, void *),
480
int (*lock)(dev_t, void *), void *data)
482
kobj_map(bdev_map, devt, range, module, probe, lock, data);
485
EXPORT_SYMBOL(blk_register_region);
487
void blk_unregister_region(dev_t devt, unsigned long range)
489
kobj_unmap(bdev_map, devt, range);
492
EXPORT_SYMBOL(blk_unregister_region);
494
static struct kobject *exact_match(dev_t devt, int *partno, void *data)
496
struct gendisk *p = data;
498
return &disk_to_dev(p)->kobj;
501
static int exact_lock(dev_t devt, void *data)
503
struct gendisk *p = data;
510
void register_disk(struct gendisk *disk)
512
struct device *ddev = disk_to_dev(disk);
513
struct block_device *bdev;
514
struct disk_part_iter piter;
515
struct hd_struct *part;
518
ddev->parent = disk->driverfs_dev;
520
dev_set_name(ddev, disk->disk_name);
522
/* delay uevents, until we scanned partition table */
523
dev_set_uevent_suppress(ddev, 1);
525
if (device_add(ddev))
527
if (!sysfs_deprecated) {
528
err = sysfs_create_link(block_depr, &ddev->kobj,
529
kobject_name(&ddev->kobj));
535
disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
536
disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
538
/* No minors to use for partitions */
539
if (!disk_part_scan_enabled(disk))
542
/* No such device (e.g., media were just removed) */
543
if (!get_capacity(disk))
546
bdev = bdget_disk(disk, 0);
550
bdev->bd_invalidated = 1;
551
err = blkdev_get(bdev, FMODE_READ, NULL);
554
blkdev_put(bdev, FMODE_READ);
557
/* announce disk after possible partitions are created */
558
dev_set_uevent_suppress(ddev, 0);
559
kobject_uevent(&ddev->kobj, KOBJ_ADD);
561
/* announce possible partitions */
562
disk_part_iter_init(&piter, disk, 0);
563
while ((part = disk_part_iter_next(&piter)))
564
kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
565
disk_part_iter_exit(&piter);
569
* add_disk - add partitioning information to kernel list
570
* @disk: per-device partitioning information
572
* This function registers the partitioning information in @disk
575
* FIXME: error handling
577
void add_disk(struct gendisk *disk)
579
struct backing_dev_info *bdi;
583
/* minors == 0 indicates to use ext devt from part0 and should
584
* be accompanied with EXT_DEVT flag. Make sure all
585
* parameters make sense.
587
WARN_ON(disk->minors && !(disk->major || disk->first_minor));
588
WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT));
590
disk->flags |= GENHD_FL_UP;
592
retval = blk_alloc_devt(&disk->part0, &devt);
597
disk_to_dev(disk)->devt = devt;
599
/* ->major and ->first_minor aren't supposed to be
600
* dereferenced from here on, but set them just in case.
602
disk->major = MAJOR(devt);
603
disk->first_minor = MINOR(devt);
605
/* Register BDI before referencing it from bdev */
606
bdi = &disk->queue->backing_dev_info;
607
bdi_register_dev(bdi, disk_devt(disk));
609
blk_register_region(disk_devt(disk), disk->minors, NULL,
610
exact_match, exact_lock, disk);
612
blk_register_queue(disk);
615
* Take an extra ref on queue which will be put on disk_release()
616
* so that it sticks around as long as @disk is there.
618
WARN_ON_ONCE(blk_get_queue(disk->queue));
620
retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
624
disk_add_events(disk);
626
EXPORT_SYMBOL(add_disk);
628
void del_gendisk(struct gendisk *disk)
630
struct disk_part_iter piter;
631
struct hd_struct *part;
633
disk_del_events(disk);
635
/* invalidate stuff */
636
disk_part_iter_init(&piter, disk,
637
DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
638
while ((part = disk_part_iter_next(&piter))) {
639
invalidate_partition(disk, part->partno);
640
delete_partition(disk, part->partno);
642
disk_part_iter_exit(&piter);
644
invalidate_partition(disk, 0);
645
blk_free_devt(disk_to_dev(disk)->devt);
646
set_capacity(disk, 0);
647
disk->flags &= ~GENHD_FL_UP;
649
sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
650
bdi_unregister(&disk->queue->backing_dev_info);
651
blk_unregister_queue(disk);
652
blk_unregister_region(disk_devt(disk), disk->minors);
654
part_stat_set_all(&disk->part0, 0);
655
disk->part0.stamp = 0;
657
kobject_put(disk->part0.holder_dir);
658
kobject_put(disk->slave_dir);
659
disk->driverfs_dev = NULL;
660
if (!sysfs_deprecated)
661
sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
662
device_del(disk_to_dev(disk));
664
EXPORT_SYMBOL(del_gendisk);
667
* get_gendisk - get partitioning information for a given device
668
* @devt: device to get partitioning information for
669
* @partno: returned partition index
671
* This function gets the structure containing partitioning
672
* information for the given device @devt.
674
struct gendisk *get_gendisk(dev_t devt, int *partno)
676
struct gendisk *disk = NULL;
678
if (MAJOR(devt) != BLOCK_EXT_MAJOR) {
679
struct kobject *kobj;
681
kobj = kobj_lookup(bdev_map, devt, partno);
683
disk = dev_to_disk(kobj_to_dev(kobj));
685
struct hd_struct *part;
687
mutex_lock(&ext_devt_mutex);
688
part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
689
if (part && get_disk(part_to_disk(part))) {
690
*partno = part->partno;
691
disk = part_to_disk(part);
693
mutex_unlock(&ext_devt_mutex);
698
EXPORT_SYMBOL(get_gendisk);
701
* bdget_disk - do bdget() by gendisk and partition number
702
* @disk: gendisk of interest
703
* @partno: partition number
705
* Find partition @partno from @disk, do bdget() on it.
711
* Resulting block_device on success, NULL on failure.
713
struct block_device *bdget_disk(struct gendisk *disk, int partno)
715
struct hd_struct *part;
716
struct block_device *bdev = NULL;
718
part = disk_get_part(disk, partno);
720
bdev = bdget(part_devt(part));
725
EXPORT_SYMBOL(bdget_disk);
728
* print a full list of all partitions - intended for places where the root
729
* filesystem can't be mounted and thus to give the victim some idea of what
732
void __init printk_all_partitions(void)
734
struct class_dev_iter iter;
737
class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
738
while ((dev = class_dev_iter_next(&iter))) {
739
struct gendisk *disk = dev_to_disk(dev);
740
struct disk_part_iter piter;
741
struct hd_struct *part;
742
char name_buf[BDEVNAME_SIZE];
743
char devt_buf[BDEVT_SIZE];
744
u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1];
747
* Don't show empty devices or things that have been
750
if (get_capacity(disk) == 0 ||
751
(disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
755
* Note, unlike /proc/partitions, I am showing the
756
* numbers in hex - the same format as the root=
759
disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
760
while ((part = disk_part_iter_next(&piter))) {
761
bool is_part0 = part == &disk->part0;
765
part_unpack_uuid(part->info->uuid, uuid);
767
printk("%s%s %10llu %s %s", is_part0 ? "" : " ",
768
bdevt_str(part_devt(part), devt_buf),
769
(unsigned long long)part->nr_sects >> 1,
770
disk_name(disk, part->partno, name_buf), uuid);
772
if (disk->driverfs_dev != NULL &&
773
disk->driverfs_dev->driver != NULL)
774
printk(" driver: %s\n",
775
disk->driverfs_dev->driver->name);
777
printk(" (driver?)\n");
781
disk_part_iter_exit(&piter);
783
class_dev_iter_exit(&iter);
786
#ifdef CONFIG_PROC_FS
788
static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
791
struct class_dev_iter *iter;
794
iter = kmalloc(sizeof(*iter), GFP_KERNEL);
796
return ERR_PTR(-ENOMEM);
798
seqf->private = iter;
799
class_dev_iter_init(iter, &block_class, NULL, &disk_type);
801
dev = class_dev_iter_next(iter);
806
return dev_to_disk(dev);
809
static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos)
814
dev = class_dev_iter_next(seqf->private);
816
return dev_to_disk(dev);
821
static void disk_seqf_stop(struct seq_file *seqf, void *v)
823
struct class_dev_iter *iter = seqf->private;
825
/* stop is called even after start failed :-( */
827
class_dev_iter_exit(iter);
832
static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
836
p = disk_seqf_start(seqf, pos);
837
if (!IS_ERR_OR_NULL(p) && !*pos)
838
seq_puts(seqf, "major minor #blocks name\n\n");
842
static int show_partition(struct seq_file *seqf, void *v)
844
struct gendisk *sgp = v;
845
struct disk_part_iter piter;
846
struct hd_struct *part;
847
char buf[BDEVNAME_SIZE];
849
/* Don't show non-partitionable removable devices or empty devices */
850
if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
851
(sgp->flags & GENHD_FL_REMOVABLE)))
853
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
856
/* show the full disk and all non-0 size partitions of it */
857
disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0);
858
while ((part = disk_part_iter_next(&piter)))
859
seq_printf(seqf, "%4d %7d %10llu %s\n",
860
MAJOR(part_devt(part)), MINOR(part_devt(part)),
861
(unsigned long long)part->nr_sects >> 1,
862
disk_name(sgp, part->partno, buf));
863
disk_part_iter_exit(&piter);
868
static const struct seq_operations partitions_op = {
869
.start = show_partition_start,
870
.next = disk_seqf_next,
871
.stop = disk_seqf_stop,
872
.show = show_partition
875
static int partitions_open(struct inode *inode, struct file *file)
877
return seq_open(file, &partitions_op);
880
static const struct file_operations proc_partitions_operations = {
881
.open = partitions_open,
884
.release = seq_release,
889
static struct kobject *base_probe(dev_t devt, int *partno, void *data)
891
if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
892
/* Make old-style 2.4 aliases work */
893
request_module("block-major-%d", MAJOR(devt));
897
static int __init genhd_device_init(void)
901
block_class.dev_kobj = sysfs_dev_block_kobj;
902
error = class_register(&block_class);
905
bdev_map = kobj_map_init(base_probe, &block_class_lock);
908
register_blkdev(BLOCK_EXT_MAJOR, "blkext");
910
/* create top-level block dir */
911
if (!sysfs_deprecated)
912
block_depr = kobject_create_and_add("block", NULL);
916
subsys_initcall(genhd_device_init);
918
static ssize_t disk_range_show(struct device *dev,
919
struct device_attribute *attr, char *buf)
921
struct gendisk *disk = dev_to_disk(dev);
923
return sprintf(buf, "%d\n", disk->minors);
926
static ssize_t disk_ext_range_show(struct device *dev,
927
struct device_attribute *attr, char *buf)
929
struct gendisk *disk = dev_to_disk(dev);
931
return sprintf(buf, "%d\n", disk_max_parts(disk));
934
static ssize_t disk_removable_show(struct device *dev,
935
struct device_attribute *attr, char *buf)
937
struct gendisk *disk = dev_to_disk(dev);
939
return sprintf(buf, "%d\n",
940
(disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
943
static ssize_t disk_ro_show(struct device *dev,
944
struct device_attribute *attr, char *buf)
946
struct gendisk *disk = dev_to_disk(dev);
948
return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0);
951
static ssize_t disk_capability_show(struct device *dev,
952
struct device_attribute *attr, char *buf)
954
struct gendisk *disk = dev_to_disk(dev);
956
return sprintf(buf, "%x\n", disk->flags);
959
static ssize_t disk_alignment_offset_show(struct device *dev,
960
struct device_attribute *attr,
963
struct gendisk *disk = dev_to_disk(dev);
965
return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
968
static ssize_t disk_discard_alignment_show(struct device *dev,
969
struct device_attribute *attr,
972
struct gendisk *disk = dev_to_disk(dev);
974
return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
977
static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
978
static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
979
static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
980
static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
981
static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
982
static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
983
static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
985
static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
986
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
987
static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
988
#ifdef CONFIG_FAIL_MAKE_REQUEST
989
static struct device_attribute dev_attr_fail =
990
__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
992
#ifdef CONFIG_FAIL_IO_TIMEOUT
993
static struct device_attribute dev_attr_fail_timeout =
994
__ATTR(io-timeout-fail, S_IRUGO|S_IWUSR, part_timeout_show,
998
static struct attribute *disk_attrs[] = {
999
&dev_attr_range.attr,
1000
&dev_attr_ext_range.attr,
1001
&dev_attr_removable.attr,
1003
&dev_attr_size.attr,
1004
&dev_attr_alignment_offset.attr,
1005
&dev_attr_discard_alignment.attr,
1006
&dev_attr_capability.attr,
1007
&dev_attr_stat.attr,
1008
&dev_attr_inflight.attr,
1009
#ifdef CONFIG_FAIL_MAKE_REQUEST
1010
&dev_attr_fail.attr,
1012
#ifdef CONFIG_FAIL_IO_TIMEOUT
1013
&dev_attr_fail_timeout.attr,
1018
static struct attribute_group disk_attr_group = {
1019
.attrs = disk_attrs,
1022
static const struct attribute_group *disk_attr_groups[] = {
1028
* disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way
1029
* @disk: disk to replace part_tbl for
1030
* @new_ptbl: new part_tbl to install
1032
* Replace disk->part_tbl with @new_ptbl in RCU-safe way. The
1033
* original ptbl is freed using RCU callback.
1036
* Matching bd_mutex locked.
1038
static void disk_replace_part_tbl(struct gendisk *disk,
1039
struct disk_part_tbl *new_ptbl)
1041
struct disk_part_tbl *old_ptbl = disk->part_tbl;
1043
rcu_assign_pointer(disk->part_tbl, new_ptbl);
1046
rcu_assign_pointer(old_ptbl->last_lookup, NULL);
1047
kfree_rcu(old_ptbl, rcu_head);
1052
* disk_expand_part_tbl - expand disk->part_tbl
1053
* @disk: disk to expand part_tbl for
1054
* @partno: expand such that this partno can fit in
1056
* Expand disk->part_tbl such that @partno can fit in. disk->part_tbl
1057
* uses RCU to allow unlocked dereferencing for stats and other stuff.
1060
* Matching bd_mutex locked, might sleep.
1063
* 0 on success, -errno on failure.
1065
int disk_expand_part_tbl(struct gendisk *disk, int partno)
1067
struct disk_part_tbl *old_ptbl = disk->part_tbl;
1068
struct disk_part_tbl *new_ptbl;
1069
int len = old_ptbl ? old_ptbl->len : 0;
1070
int target = partno + 1;
1074
/* disk_max_parts() is zero during initialization, ignore if so */
1075
if (disk_max_parts(disk) && target > disk_max_parts(disk))
1081
size = sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]);
1082
new_ptbl = kzalloc_node(size, GFP_KERNEL, disk->node_id);
1086
new_ptbl->len = target;
1088
for (i = 0; i < len; i++)
1089
rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
1091
disk_replace_part_tbl(disk, new_ptbl);
1095
static void disk_release(struct device *dev)
1097
struct gendisk *disk = dev_to_disk(dev);
1099
disk_release_events(disk);
1100
kfree(disk->random);
1101
disk_replace_part_tbl(disk, NULL);
1102
free_part_stats(&disk->part0);
1103
free_part_info(&disk->part0);
1105
blk_put_queue(disk->queue);
1108
struct class block_class = {
1112
static char *block_devnode(struct device *dev, mode_t *mode)
1114
struct gendisk *disk = dev_to_disk(dev);
1117
return disk->devnode(disk, mode);
1121
static struct device_type disk_type = {
1123
.groups = disk_attr_groups,
1124
.release = disk_release,
1125
.devnode = block_devnode,
1128
#ifdef CONFIG_PROC_FS
1130
* aggregate disk stat collector. Uses the same stats that the sysfs
1131
* entries do, above, but makes them available through one seq_file.
1133
* The output looks suspiciously like /proc/partitions with a bunch of
1136
static int diskstats_show(struct seq_file *seqf, void *v)
1138
struct gendisk *gp = v;
1139
struct disk_part_iter piter;
1140
struct hd_struct *hd;
1141
char buf[BDEVNAME_SIZE];
1145
if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
1146
seq_puts(seqf, "major minor name"
1147
" rio rmerge rsect ruse wio wmerge "
1148
"wsect wuse running use aveq"
1152
disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
1153
while ((hd = disk_part_iter_next(&piter))) {
1154
cpu = part_stat_lock();
1155
part_round_stats(cpu, hd);
1157
seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
1158
"%u %lu %lu %lu %u %u %u %u\n",
1159
MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
1160
disk_name(gp, hd->partno, buf),
1161
part_stat_read(hd, ios[READ]),
1162
part_stat_read(hd, merges[READ]),
1163
part_stat_read(hd, sectors[READ]),
1164
jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
1165
part_stat_read(hd, ios[WRITE]),
1166
part_stat_read(hd, merges[WRITE]),
1167
part_stat_read(hd, sectors[WRITE]),
1168
jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
1170
jiffies_to_msecs(part_stat_read(hd, io_ticks)),
1171
jiffies_to_msecs(part_stat_read(hd, time_in_queue))
1174
disk_part_iter_exit(&piter);
1179
static const struct seq_operations diskstats_op = {
1180
.start = disk_seqf_start,
1181
.next = disk_seqf_next,
1182
.stop = disk_seqf_stop,
1183
.show = diskstats_show
1186
static int diskstats_open(struct inode *inode, struct file *file)
1188
return seq_open(file, &diskstats_op);
1191
static const struct file_operations proc_diskstats_operations = {
1192
.open = diskstats_open,
1194
.llseek = seq_lseek,
1195
.release = seq_release,
1198
static int __init proc_genhd_init(void)
1200
proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
1201
proc_create("partitions", 0, NULL, &proc_partitions_operations);
1204
module_init(proc_genhd_init);
1205
#endif /* CONFIG_PROC_FS */
1207
dev_t blk_lookup_devt(const char *name, int partno)
1209
dev_t devt = MKDEV(0, 0);
1210
struct class_dev_iter iter;
1213
class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
1214
while ((dev = class_dev_iter_next(&iter))) {
1215
struct gendisk *disk = dev_to_disk(dev);
1216
struct hd_struct *part;
1218
if (strcmp(dev_name(dev), name))
1221
if (partno < disk->minors) {
1222
/* We need to return the right devno, even
1223
* if the partition doesn't exist yet.
1225
devt = MKDEV(MAJOR(dev->devt),
1226
MINOR(dev->devt) + partno);
1229
part = disk_get_part(disk, partno);
1231
devt = part_devt(part);
1232
disk_put_part(part);
1235
disk_put_part(part);
1237
class_dev_iter_exit(&iter);
1240
EXPORT_SYMBOL(blk_lookup_devt);
1242
struct gendisk *alloc_disk(int minors)
1244
return alloc_disk_node(minors, -1);
1246
EXPORT_SYMBOL(alloc_disk);
1248
struct gendisk *alloc_disk_node(int minors, int node_id)
1250
struct gendisk *disk;
1252
disk = kmalloc_node(sizeof(struct gendisk),
1253
GFP_KERNEL | __GFP_ZERO, node_id);
1255
if (!init_part_stats(&disk->part0)) {
1259
disk->node_id = node_id;
1260
if (disk_expand_part_tbl(disk, 0)) {
1261
free_part_stats(&disk->part0);
1265
disk->part_tbl->part[0] = &disk->part0;
1267
hd_ref_init(&disk->part0);
1269
disk->minors = minors;
1270
rand_initialize_disk(disk);
1271
disk_to_dev(disk)->class = &block_class;
1272
disk_to_dev(disk)->type = &disk_type;
1273
device_initialize(disk_to_dev(disk));
1277
EXPORT_SYMBOL(alloc_disk_node);
1279
struct kobject *get_disk(struct gendisk *disk)
1281
struct module *owner;
1282
struct kobject *kobj;
1286
owner = disk->fops->owner;
1287
if (owner && !try_module_get(owner))
1289
kobj = kobject_get(&disk_to_dev(disk)->kobj);
1298
EXPORT_SYMBOL(get_disk);
1300
void put_disk(struct gendisk *disk)
1303
kobject_put(&disk_to_dev(disk)->kobj);
1306
EXPORT_SYMBOL(put_disk);
1308
static void set_disk_ro_uevent(struct gendisk *gd, int ro)
1310
char event[] = "DISK_RO=1";
1311
char *envp[] = { event, NULL };
1315
kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
1318
void set_device_ro(struct block_device *bdev, int flag)
1320
bdev->bd_part->policy = flag;
1323
EXPORT_SYMBOL(set_device_ro);
1325
void set_disk_ro(struct gendisk *disk, int flag)
1327
struct disk_part_iter piter;
1328
struct hd_struct *part;
1330
if (disk->part0.policy != flag) {
1331
set_disk_ro_uevent(disk, flag);
1332
disk->part0.policy = flag;
1335
disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
1336
while ((part = disk_part_iter_next(&piter)))
1337
part->policy = flag;
1338
disk_part_iter_exit(&piter);
1341
EXPORT_SYMBOL(set_disk_ro);
1343
int bdev_read_only(struct block_device *bdev)
1347
return bdev->bd_part->policy;
1350
EXPORT_SYMBOL(bdev_read_only);
1352
int invalidate_partition(struct gendisk *disk, int partno)
1355
struct block_device *bdev = bdget_disk(disk, partno);
1358
res = __invalidate_device(bdev, true);
1364
EXPORT_SYMBOL(invalidate_partition);
1367
* Disk events - monitor disk events like media change and eject request.
1369
struct disk_events {
1370
struct list_head node; /* all disk_event's */
1371
struct gendisk *disk; /* the associated disk */
1374
struct mutex block_mutex; /* protects blocking */
1375
int block; /* event blocking depth */
1376
unsigned int pending; /* events already sent out */
1377
unsigned int clearing; /* events being cleared */
1379
long poll_msecs; /* interval, -1 for default */
1380
struct delayed_work dwork;
1383
static const char *disk_events_strs[] = {
1384
[ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change",
1385
[ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request",
1388
static char *disk_uevents[] = {
1389
[ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1",
1390
[ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1",
1393
/* list of all disk_events */
1394
static DEFINE_MUTEX(disk_events_mutex);
1395
static LIST_HEAD(disk_events);
1397
/* disable in-kernel polling by default */
1398
static unsigned long disk_events_dfl_poll_msecs = 0;
1400
static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
1402
struct disk_events *ev = disk->ev;
1403
long intv_msecs = 0;
1406
* If device-specific poll interval is set, always use it. If
1407
* the default is being used, poll iff there are events which
1408
* can't be monitored asynchronously.
1410
if (ev->poll_msecs >= 0)
1411
intv_msecs = ev->poll_msecs;
1412
else if (disk->events & ~disk->async_events)
1413
intv_msecs = disk_events_dfl_poll_msecs;
1415
return msecs_to_jiffies(intv_msecs);
1419
* disk_block_events - block and flush disk event checking
1420
* @disk: disk to block events for
1422
* On return from this function, it is guaranteed that event checking
1423
* isn't in progress and won't happen until unblocked by
1424
* disk_unblock_events(). Events blocking is counted and the actual
1425
* unblocking happens after the matching number of unblocks are done.
1427
* Note that this intentionally does not block event checking from
1428
* disk_clear_events().
1433
void disk_block_events(struct gendisk *disk)
1435
struct disk_events *ev = disk->ev;
1436
unsigned long flags;
1443
* Outer mutex ensures that the first blocker completes canceling
1444
* the event work before further blockers are allowed to finish.
1446
mutex_lock(&ev->block_mutex);
1448
spin_lock_irqsave(&ev->lock, flags);
1449
cancel = !ev->block++;
1450
spin_unlock_irqrestore(&ev->lock, flags);
1453
cancel_delayed_work_sync(&disk->ev->dwork);
1455
mutex_unlock(&ev->block_mutex);
1458
/*
 * Drop one level of event blocking on @disk.  When the block count
 * reaches zero, event checking is restarted: immediately if @check_now
 * is set, otherwise after the polling interval if polling is enabled.
 * @disk->ev must be valid.
 */
static void __disk_unblock_events(struct gendisk *disk, bool check_now)
{
	struct disk_events *ev = disk->ev;
	unsigned long intv;
	unsigned long flags;

	spin_lock_irqsave(&ev->lock, flags);

	/* unbalanced unblock, leave the count alone */
	if (WARN_ON_ONCE(ev->block <= 0))
		goto out_unlock;

	/* still blocked by someone else */
	if (--ev->block)
		goto out_unlock;

	/*
	 * Not exactly a latency critical operation, set poll timer
	 * slack to 25% and kick event check.
	 */
	intv = disk_events_poll_jiffies(disk);
	set_timer_slack(&ev->dwork.timer, intv / 4);
	if (check_now)
		queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
	else if (intv)
		queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
out_unlock:
	spin_unlock_irqrestore(&ev->lock, flags);
}

* disk_unblock_events - unblock disk event checking
1488
* @disk: disk to unblock events for
1490
* Undo disk_block_events(). When the block count reaches zero, it
1491
* starts events polling if configured.
1494
* Don't care. Safe to call from irq context.
1496
void disk_unblock_events(struct gendisk *disk)
1499
__disk_unblock_events(disk, false);
1503
* disk_flush_events - schedule immediate event checking and flushing
1504
* @disk: disk to check and flush events for
1505
* @mask: events to flush
1507
* Schedule immediate event checking on @disk if not blocked. Events in
1508
* @mask are scheduled to be cleared from the driver. Note that this
1509
* doesn't clear the events from @disk->ev.
1512
* If @mask is non-zero must be called with bdev->bd_mutex held.
1514
void disk_flush_events(struct gendisk *disk, unsigned int mask)
1516
struct disk_events *ev = disk->ev;
1521
spin_lock_irq(&ev->lock);
1522
ev->clearing |= mask;
1524
cancel_delayed_work(&ev->dwork);
1525
queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
1527
spin_unlock_irq(&ev->lock);
1531
* disk_clear_events - synchronously check, clear and return pending events
1532
* @disk: disk to fetch and clear events from
1533
* @mask: mask of events to be fetched and clearted
1535
* Disk events are synchronously checked and pending events in @mask
1536
* are cleared and returned. This ignores the block count.
1541
unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
1543
const struct block_device_operations *bdops = disk->fops;
1544
struct disk_events *ev = disk->ev;
1545
unsigned int pending;
1548
/* for drivers still using the old ->media_changed method */
1549
if ((mask & DISK_EVENT_MEDIA_CHANGE) &&
1550
bdops->media_changed && bdops->media_changed(disk))
1551
return DISK_EVENT_MEDIA_CHANGE;
1555
/* tell the workfn about the events being cleared */
1556
spin_lock_irq(&ev->lock);
1557
ev->clearing |= mask;
1558
spin_unlock_irq(&ev->lock);
1560
/* uncondtionally schedule event check and wait for it to finish */
1561
disk_block_events(disk);
1562
queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
1563
flush_delayed_work(&ev->dwork);
1564
__disk_unblock_events(disk, false);
1566
/* then, fetch and clear pending events */
1567
spin_lock_irq(&ev->lock);
1568
WARN_ON_ONCE(ev->clearing & mask); /* cleared by workfn */
1569
pending = ev->pending & mask;
1570
ev->pending &= ~mask;
1571
spin_unlock_irq(&ev->lock);
1576
static void disk_events_workfn(struct work_struct *work)
1578
struct delayed_work *dwork = to_delayed_work(work);
1579
struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
1580
struct gendisk *disk = ev->disk;
1581
char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
1582
unsigned int clearing = ev->clearing;
1583
unsigned int events;
1585
int nr_events = 0, i;
1588
events = disk->fops->check_events(disk, clearing);
1590
/* accumulate pending events and schedule next poll if necessary */
1591
spin_lock_irq(&ev->lock);
1593
events &= ~ev->pending;
1594
ev->pending |= events;
1595
ev->clearing &= ~clearing;
1597
intv = disk_events_poll_jiffies(disk);
1598
if (!ev->block && intv)
1599
queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
1601
spin_unlock_irq(&ev->lock);
1604
* Tell userland about new events. Only the events listed in
1605
* @disk->events are reported. Unlisted events are processed the
1606
* same internally but never get reported to userland.
1608
for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
1609
if (events & disk->events & (1 << i))
1610
envp[nr_events++] = disk_uevents[i];
1613
kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
1617
/*
 * A disk events enabled device has the following sysfs nodes under
 * its /sys/block/X/ directory.
 *
 * events		: list of all supported events
 * events_async		: list of events which can be detected w/o polling
 * events_poll_msecs	: polling interval, 0: disable, -1: system default
 */
static ssize_t __disk_events_show(unsigned int events, char *buf)
1626
const char *delim = "";
1630
for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
1631
if (events & (1 << i)) {
1632
pos += sprintf(buf + pos, "%s%s",
1633
delim, disk_events_strs[i]);
1637
pos += sprintf(buf + pos, "\n");
1641
static ssize_t disk_events_show(struct device *dev,
1642
struct device_attribute *attr, char *buf)
1644
struct gendisk *disk = dev_to_disk(dev);
1646
return __disk_events_show(disk->events, buf);
1649
static ssize_t disk_events_async_show(struct device *dev,
1650
struct device_attribute *attr, char *buf)
1652
struct gendisk *disk = dev_to_disk(dev);
1654
return __disk_events_show(disk->async_events, buf);
1657
static ssize_t disk_events_poll_msecs_show(struct device *dev,
1658
struct device_attribute *attr,
1661
struct gendisk *disk = dev_to_disk(dev);
1663
return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
1666
/*
 * sysfs store handler for "events_poll_msecs".  Accepts a non-negative
 * interval in msecs or -1; anything else is rejected with -EINVAL.
 * The new interval takes effect through an immediate event check.
 */
static ssize_t disk_events_poll_msecs_store(struct device *dev,
					    struct device_attribute *attr,
					    const char *buf, size_t count)
{
	struct gendisk *disk = dev_to_disk(dev);
	long intv;

	if (!count || sscanf(buf, "%ld", &intv) != 1)
		return -EINVAL;

	if (intv < 0 && intv != -1)
		return -EINVAL;

	/* update the interval with event checking quiesced, then re-kick it */
	disk_block_events(disk);
	disk->ev->poll_msecs = intv;
	__disk_unblock_events(disk, true);

	return count;
}

static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL);
1687
static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL);
1688
static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR,
1689
disk_events_poll_msecs_show,
1690
disk_events_poll_msecs_store);
1692
static const struct attribute *disk_events_attrs[] = {
1693
&dev_attr_events.attr,
1694
&dev_attr_events_async.attr,
1695
&dev_attr_events_poll_msecs.attr,
1700
/*
 * The default polling interval can be specified by the kernel
 * parameter block.events_dfl_poll_msecs which defaults to 0
 * (disable).  This can also be modified at runtime by writing to
 * /sys/module/block/parameters/events_dfl_poll_msecs.
 */
static int disk_events_set_dfl_poll_msecs(const char *val,
1706
const struct kernel_param *kp)
1708
struct disk_events *ev;
1711
ret = param_set_ulong(val, kp);
1715
mutex_lock(&disk_events_mutex);
1717
list_for_each_entry(ev, &disk_events, node)
1718
disk_flush_events(ev->disk, 0);
1720
mutex_unlock(&disk_events_mutex);
1725
static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
1726
.set = disk_events_set_dfl_poll_msecs,
1727
.get = param_get_ulong,
1730
#undef MODULE_PARAM_PREFIX
1731
#define MODULE_PARAM_PREFIX "block."
1733
module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
1734
&disk_events_dfl_poll_msecs, 0644);
1737
/*
 * disk_{add|del|release}_events - initialize and destroy disk_events.
 */
static void disk_add_events(struct gendisk *disk)
1741
struct disk_events *ev;
1743
if (!disk->fops->check_events)
1746
ev = kzalloc(sizeof(*ev), GFP_KERNEL);
1748
pr_warn("%s: failed to initialize events\n", disk->disk_name);
1752
if (sysfs_create_files(&disk_to_dev(disk)->kobj,
1753
disk_events_attrs) < 0) {
1754
pr_warn("%s: failed to create sysfs files for events\n",
1762
INIT_LIST_HEAD(&ev->node);
1764
spin_lock_init(&ev->lock);
1765
mutex_init(&ev->block_mutex);
1767
ev->poll_msecs = -1;
1768
INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
1770
mutex_lock(&disk_events_mutex);
1771
list_add_tail(&ev->node, &disk_events);
1772
mutex_unlock(&disk_events_mutex);
1775
* Block count is initialized to 1 and the following initial
1776
* unblock kicks it into action.
1778
__disk_unblock_events(disk, true);
1781
static void disk_del_events(struct gendisk *disk)
1786
disk_block_events(disk);
1788
mutex_lock(&disk_events_mutex);
1789
list_del_init(&disk->ev->node);
1790
mutex_unlock(&disk_events_mutex);
1792
sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
1795
static void disk_release_events(struct gendisk *disk)
1797
/* the block count should be 1 from disk_del_events() */
1798
WARN_ON_ONCE(disk->ev && disk->ev->block != 1);