136
139
* that covers the entire device
138
141
static struct extent_map *btree_get_extent(struct inode *inode,
139
struct page *page, size_t page_offset, u64 start, u64 len,
142
struct page *page, size_t pg_offset, u64 start, u64 len,
142
145
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
253
256
memcpy(&found, result, csum_size);
255
258
read_extent_buffer(buf, &val, 0, csum_size);
256
if (printk_ratelimit()) {
257
printk(KERN_INFO "btrfs: %s checksum verify "
259
printk_ratelimited(KERN_INFO "btrfs: %s checksum verify "
258
260
"failed on %llu wanted %X found %X "
260
262
root->fs_info->sb->s_id,
261
263
(unsigned long long)buf->start, val, found,
262
264
btrfs_header_level(buf));
264
265
if (result != (char *)&inline_result)
298
if (printk_ratelimit()) {
299
printk("parent transid verify failed on %llu wanted %llu "
299
printk_ratelimited("parent transid verify failed on %llu wanted %llu "
301
301
(unsigned long long)eb->start,
302
302
(unsigned long long)parent_transid,
303
303
(unsigned long long)btrfs_header_generation(eb));
306
305
clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
331
331
!verify_parent_transid(io_tree, eb, parent_transid))
335
* This buffer's crc is fine, but its contents are corrupted, so
336
* there is no reason to read the other copies, they won't be
339
if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
334
342
num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
335
343
eb->start, eb->len);
336
344
if (num_copies == 1)
430
/*
 * CORRUPT - report a corrupt leaf at KERN_CRIT level.
 *
 * @reason: string describing what is wrong (printed with %s)
 * @eb:     buffer whose btrfs_header_bytenr() is printed as "block"
 * @root:   root whose ->objectid is printed as "root"
 * @slot:   slot number printed with %d
 *
 * Expands to a single printk() call without a trailing semicolon, so the
 * caller supplies it.  The macro only logs; it does not return or
 * otherwise change control flow.
 */
#define CORRUPT(reason, eb, root, slot) \
431
printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \
432
"root=%llu, slot=%d\n", reason, \
433
(unsigned long long)btrfs_header_bytenr(eb), \
434
(unsigned long long)root->objectid, slot)
436
static noinline int check_leaf(struct btrfs_root *root,
437
struct extent_buffer *leaf)
439
struct btrfs_key key;
440
struct btrfs_key leaf_key;
441
u32 nritems = btrfs_header_nritems(leaf);
447
/* Check the 0 item */
448
if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
449
BTRFS_LEAF_DATA_SIZE(root)) {
450
CORRUPT("invalid item offset size pair", leaf, root, 0);
455
* Check to make sure each item's keys are in the correct order and their
456
* offsets make sense. We only have to loop through nritems-1 because
457
* we check the current slot against the next slot, which verifies the
458
* next slot's offset+size makes sense and that the current slot's
461
for (slot = 0; slot < nritems - 1; slot++) {
462
btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
463
btrfs_item_key_to_cpu(leaf, &key, slot + 1);
465
/* Make sure the keys are in the right order */
466
if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
467
CORRUPT("bad key order", leaf, root, slot);
472
* Make sure the offset and ends are right, remember that the
473
* item data starts at the end of the leaf and grows towards the
476
if (btrfs_item_offset_nr(leaf, slot) !=
477
btrfs_item_end_nr(leaf, slot + 1)) {
478
CORRUPT("slot offset bad", leaf, root, slot);
483
* Check to make sure that we don't point outside of the leaf,
484
* just in case all the items are consistent with each other, but
485
* all point outside of the leaf.
487
if (btrfs_item_end_nr(leaf, slot) >
488
BTRFS_LEAF_DATA_SIZE(root)) {
489
CORRUPT("slot end outside of leaf", leaf, root, slot);
422
497
#ifdef CONFIG_DEBUG_LOCK_ALLOC
423
498
void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
457
532
found_start = btrfs_header_bytenr(eb);
458
533
if (found_start != start) {
459
if (printk_ratelimit()) {
460
printk(KERN_INFO "btrfs bad tree block start "
534
printk_ratelimited(KERN_INFO "btrfs bad tree block start "
462
536
(unsigned long long)found_start,
463
537
(unsigned long long)eb->start);
485
556
btrfs_set_buffer_lockdep_class(eb, found_level);
487
558
ret = csum_tree_block(root, eb, 1);
565
* If this is a leaf block and it is corrupt, set the corrupt bit so
566
* that we don't try and read the other copies of this block, just
569
if (found_level == 0 && check_leaf(root, eb)) {
570
set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
491
574
end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
492
575
end = eb->start + end - 1;
968
1044
root->last_trans = 0;
969
1045
root->highest_objectid = 0;
970
1046
root->name = NULL;
972
1047
root->inode_tree = RB_ROOT;
1048
INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
973
1049
root->block_rsv = NULL;
974
1050
root->orphan_block_rsv = NULL;
976
1052
INIT_LIST_HEAD(&root->dirty_list);
977
1053
INIT_LIST_HEAD(&root->orphan_list);
978
1054
INIT_LIST_HEAD(&root->root_list);
979
spin_lock_init(&root->node_lock);
980
1055
spin_lock_init(&root->orphan_lock);
981
1056
spin_lock_init(&root->inode_lock);
982
1057
spin_lock_init(&root->accounting_lock);
992
1067
root->log_transid = 0;
993
1068
root->last_log_commit = 0;
994
1069
extent_io_tree_init(&root->dirty_log_pages,
995
fs_info->btree_inode->i_mapping, GFP_NOFS);
1070
fs_info->btree_inode->i_mapping);
997
1072
memset(&root->root_key, 0, sizeof(root->root_key));
998
1073
memset(&root->root_item, 0, sizeof(root->root_item));
1195
struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1198
struct btrfs_root *root;
1200
if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
1201
return fs_info->tree_root;
1202
if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
1203
return fs_info->extent_root;
1205
root = radix_tree_lookup(&fs_info->fs_roots_radix,
1206
(unsigned long)root_objectid);
1210
1273
struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1211
1274
struct btrfs_key *location)
1235
1298
if (IS_ERR(root))
1238
set_anon_super(&root->anon_super, NULL);
1301
root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1302
root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
1304
if (!root->free_ino_pinned || !root->free_ino_ctl) {
1309
btrfs_init_free_ino_ctl(root);
1310
mutex_init(&root->fs_commit_mutex);
1311
spin_lock_init(&root->cache_lock);
1312
init_waitqueue_head(&root->cache_wait);
1314
ret = set_anon_super(&root->anon_super, NULL);
1240
1318
if (btrfs_root_refs(&root->root_item) == 0) {
1278
1356
return ERR_PTR(ret);
1281
struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1282
struct btrfs_key *location,
1283
const char *name, int namelen)
1285
return btrfs_read_fs_root_no_name(fs_info, location);
1287
struct btrfs_root *root;
1290
root = btrfs_read_fs_root_no_name(fs_info, location);
1297
ret = btrfs_set_root_name(root, name, namelen);
1299
free_extent_buffer(root->node);
1301
return ERR_PTR(ret);
1304
ret = btrfs_sysfs_add_root(root);
1306
free_extent_buffer(root->node);
1309
return ERR_PTR(ret);
1316
1359
static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1318
1361
struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1336
* this unplugs every device on the box, and it is only used when page
1339
static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1341
struct btrfs_device *device;
1342
struct btrfs_fs_info *info;
1344
info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1345
list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1349
bdi = blk_get_backing_dev_info(device->bdev);
1350
if (bdi->unplug_io_fn)
1351
bdi->unplug_io_fn(bdi, page);
1355
static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1357
struct inode *inode;
1358
struct extent_map_tree *em_tree;
1359
struct extent_map *em;
1360
struct address_space *mapping;
1363
/* the generic O_DIRECT read code does this */
1365
__unplug_io_fn(bdi, page);
1370
* page->mapping may change at any time. Get a consistent copy
1371
* and use that for everything below
1374
mapping = page->mapping;
1378
inode = mapping->host;
1381
* don't do the expensive searching for a small number of
1384
if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
1385
__unplug_io_fn(bdi, page);
1389
offset = page_offset(page);
1391
em_tree = &BTRFS_I(inode)->extent_tree;
1392
read_lock(&em_tree->lock);
1393
em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1394
read_unlock(&em_tree->lock);
1396
__unplug_io_fn(bdi, page);
1400
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1401
free_extent_map(em);
1402
__unplug_io_fn(bdi, page);
1405
offset = offset - em->start;
1406
btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
1407
em->block_start + offset, page);
1408
free_extent_map(em);
1412
1381
* If this fails, caller must call bdi_destroy() to get rid of the
1538
1506
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1539
1507
mutex_lock(&root->fs_info->transaction_kthread_mutex);
1541
spin_lock(&root->fs_info->new_trans_lock);
1509
spin_lock(&root->fs_info->trans_lock);
1542
1510
cur = root->fs_info->running_transaction;
1544
spin_unlock(&root->fs_info->new_trans_lock);
1512
spin_unlock(&root->fs_info->trans_lock);
1548
1516
now = get_seconds();
1549
1517
if (!cur->blocked &&
1550
1518
(now < cur->start_time || now - cur->start_time < 30)) {
1551
spin_unlock(&root->fs_info->new_trans_lock);
1519
spin_unlock(&root->fs_info->trans_lock);
1552
1520
delay = HZ * 5;
1555
1523
transid = cur->transid;
1556
spin_unlock(&root->fs_info->new_trans_lock);
1524
spin_unlock(&root->fs_info->trans_lock);
1558
trans = btrfs_join_transaction(root, 1);
1526
trans = btrfs_join_transaction(root);
1559
1527
BUG_ON(IS_ERR(trans));
1560
1528
if (transid == trans->transid) {
1561
1529
ret = btrfs_commit_transaction(trans, root);
1598
1566
struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1600
1568
struct btrfs_root *tree_root = btrfs_sb(sb);
1601
struct btrfs_fs_info *fs_info = tree_root->fs_info;
1569
struct btrfs_fs_info *fs_info = NULL;
1602
1570
struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
1604
1572
struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
1643
1614
INIT_LIST_HEAD(&fs_info->ordered_operations);
1644
1615
INIT_LIST_HEAD(&fs_info->caching_block_groups);
1645
1616
spin_lock_init(&fs_info->delalloc_lock);
1646
spin_lock_init(&fs_info->new_trans_lock);
1617
spin_lock_init(&fs_info->trans_lock);
1647
1618
spin_lock_init(&fs_info->ref_cache_lock);
1648
1619
spin_lock_init(&fs_info->fs_roots_radix_lock);
1649
1620
spin_lock_init(&fs_info->delayed_iput_lock);
1621
spin_lock_init(&fs_info->defrag_inodes_lock);
1622
mutex_init(&fs_info->reloc_mutex);
1651
1624
init_completion(&fs_info->kobj_unregister);
1652
1625
fs_info->tree_root = tree_root;
1669
1642
atomic_set(&fs_info->async_delalloc_pages, 0);
1670
1643
atomic_set(&fs_info->async_submit_draining, 0);
1671
1644
atomic_set(&fs_info->nr_async_bios, 0);
1645
atomic_set(&fs_info->defrag_running, 0);
1672
1646
fs_info->sb = sb;
1673
1647
fs_info->max_inline = 8192 * 1024;
1674
1648
fs_info->metadata_ratio = 0;
1649
fs_info->defrag_inodes = RB_ROOT;
1650
fs_info->trans_no_join = 0;
1676
1652
fs_info->thread_pool_size = min_t(unsigned long,
1677
1653
num_online_cpus() + 2, 8);
1679
1655
INIT_LIST_HEAD(&fs_info->ordered_extents);
1680
1656
spin_lock_init(&fs_info->ordered_extent_lock);
1657
fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
1659
if (!fs_info->delayed_root) {
1663
btrfs_init_delayed_root(fs_info->delayed_root);
1665
mutex_init(&fs_info->scrub_lock);
1666
atomic_set(&fs_info->scrubs_running, 0);
1667
atomic_set(&fs_info->scrub_pause_req, 0);
1668
atomic_set(&fs_info->scrubs_paused, 0);
1669
atomic_set(&fs_info->scrub_cancel_req, 0);
1670
init_waitqueue_head(&fs_info->scrub_pause_wait);
1671
init_rwsem(&fs_info->scrub_super_lock);
1672
fs_info->scrub_workers_refcnt = 0;
1682
1674
sb->s_blocksize = 4096;
1683
1675
sb->s_blocksize_bits = blksize_bits(4096);
1697
1689
RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
1698
1690
extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
1699
fs_info->btree_inode->i_mapping,
1701
extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
1691
fs_info->btree_inode->i_mapping);
1692
extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
1704
1694
BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
1713
1703
fs_info->block_group_cache_tree = RB_ROOT;
1715
1705
extent_io_tree_init(&fs_info->freed_extents[0],
1716
fs_info->btree_inode->i_mapping, GFP_NOFS);
1706
fs_info->btree_inode->i_mapping);
1717
1707
extent_io_tree_init(&fs_info->freed_extents[1],
1718
fs_info->btree_inode->i_mapping, GFP_NOFS);
1708
fs_info->btree_inode->i_mapping);
1719
1709
fs_info->pinned_extents = &fs_info->freed_extents[0];
1720
1710
fs_info->do_barriers = 1;
1723
mutex_init(&fs_info->trans_mutex);
1724
1713
mutex_init(&fs_info->ordered_operations_mutex);
1725
1714
mutex_init(&fs_info->tree_log_mutex);
1726
1715
mutex_init(&fs_info->chunk_mutex);
1758
1747
disk_super = &fs_info->super_copy;
1759
1748
if (!btrfs_super_root(disk_super))
1762
1751
/* check FS state, whether FS is broken. */
1763
1752
fs_info->fs_state |= btrfs_super_flags(disk_super);
1765
1754
btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
1757
* In the long term, we'll store the compression type in the super
1758
* block, and it'll be used for per file compression control.
1760
fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
1767
1762
ret = btrfs_parse_options(tree_root, options);
1773
1768
features = btrfs_super_incompat_flags(disk_super) &
1840
1835
&fs_info->generic_worker);
1841
1836
btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
1842
1837
1, &fs_info->generic_worker);
1838
btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
1839
fs_info->thread_pool_size,
1840
&fs_info->generic_worker);
1845
1843
* endios are largely parallel and should have a very
1861
1859
btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
1862
1860
btrfs_start_workers(&fs_info->endio_write_workers, 1);
1863
1861
btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
1862
btrfs_start_workers(&fs_info->delayed_workers, 1);
1865
1864
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1866
1865
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
1969
1968
fs_info->metadata_alloc_profile = (u64)-1;
1970
1969
fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1971
ret = btrfs_init_space_info(fs_info);
1973
printk(KERN_ERR "Failed to initial space info: %d\n", ret);
1974
goto fail_block_groups;
1972
1977
ret = btrfs_read_block_groups(extent_root);
1974
1979
printk(KERN_ERR "Failed to read block groups: %d\n", ret);
2106
2116
btrfs_stop_workers(&fs_info->endio_write_workers);
2107
2117
btrfs_stop_workers(&fs_info->endio_freespace_worker);
2108
2118
btrfs_stop_workers(&fs_info->submit_workers);
2119
btrfs_stop_workers(&fs_info->delayed_workers);
2121
kfree(fs_info->delayed_root);
2110
2123
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2111
2124
iput(fs_info->btree_inode);
2372
2383
if (btrfs_root_refs(&root->root_item) == 0)
2373
2384
synchronize_srcu(&fs_info->subvol_srcu);
2386
__btrfs_remove_free_space_cache(root->free_ino_pinned);
2387
__btrfs_remove_free_space_cache(root->free_ino_ctl);
2375
2388
free_fs_root(root);
2379
2392
static void free_fs_root(struct btrfs_root *root)
2394
iput(root->cache_inode);
2381
2395
WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
2382
2396
if (root->anon_super.s_dev) {
2383
2397
down_write(&root->anon_super.s_umount);
2459
2479
down_write(&root->fs_info->cleanup_work_sem);
2460
2480
up_write(&root->fs_info->cleanup_work_sem);
2462
trans = btrfs_join_transaction(root, 1);
2482
trans = btrfs_join_transaction(root);
2463
2483
if (IS_ERR(trans))
2464
2484
return PTR_ERR(trans);
2465
2485
ret = btrfs_commit_transaction(trans, root);
2467
2487
/* run commit again to drop the original snapshot */
2468
trans = btrfs_join_transaction(root, 1);
2488
trans = btrfs_join_transaction(root);
2469
2489
if (IS_ERR(trans))
2470
2490
return PTR_ERR(trans);
2471
2491
btrfs_commit_transaction(trans, root);
2484
2504
fs_info->closing = 1;
2507
btrfs_scrub_cancel(root);
2509
/* wait for any defraggers to finish */
2510
wait_event(fs_info->transaction_wait,
2511
(atomic_read(&fs_info->defrag_running) == 0));
2513
/* clear out the rbtree of defraggable inodes */
2514
btrfs_run_defrag_inodes(root->fs_info);
2487
2516
btrfs_put_block_group_cache(fs_info);
2495
2524
* ERROR state on disk.
2497
2526
* 2. when btrfs flips readonly just in btrfs_commit_super,
2498
* and in such case, btrfs cannnot write sb via btrfs_commit_super,
2527
* and in such case, btrfs cannot write sb via btrfs_commit_super,
2499
2528
* and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
2500
2529
* btrfs will cleanup all FS resources first and write sb then.
2629
2660
if (current->flags & PF_MEMALLOC)
2663
btrfs_balance_delayed_items(root);
2665
num_dirty = root->fs_info->dirty_metadata_bytes;
2667
if (num_dirty > thresh) {
2668
balance_dirty_pages_ratelimited_nr(
2669
root->fs_info->btree_inode->i_mapping, 1);
2674
void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2677
* looks as though older kernels can get into trouble with
2678
* this code, they end up stuck in balance_dirty_pages forever
2681
unsigned long thresh = 32 * 1024 * 1024;
2683
if (current->flags & PF_MEMALLOC)
2632
2686
num_dirty = root->fs_info->dirty_metadata_bytes;
2634
2688
if (num_dirty > thresh) {
2951
3005
/* opt_discard */
2952
ret = btrfs_error_discard_extent(root, start, end + 1 - start);
3006
if (btrfs_test_opt(root, DISCARD))
3007
ret = btrfs_error_discard_extent(root, start,
2954
3011
clear_extent_dirty(unpin, start, end, GFP_NOFS);
2955
3012
btrfs_error_unpin_extent_range(root, start, end);
2969
mutex_lock(&root->fs_info->trans_mutex);
2970
3026
mutex_lock(&root->fs_info->transaction_kthread_mutex);
3028
spin_lock(&root->fs_info->trans_lock);
2972
3029
list_splice_init(&root->fs_info->trans_list, &list);
3030
root->fs_info->trans_no_join = 1;
3031
spin_unlock(&root->fs_info->trans_lock);
2973
3033
while (!list_empty(&list)) {
2974
3034
t = list_entry(list.next, struct btrfs_transaction, list);
2994
3054
t->blocked = 0;
2995
3055
if (waitqueue_active(&root->fs_info->transaction_wait))
2996
3056
wake_up(&root->fs_info->transaction_wait);
2997
mutex_unlock(&root->fs_info->trans_mutex);
2999
mutex_lock(&root->fs_info->trans_mutex);
3000
3058
t->commit_done = 1;
3001
3059
if (waitqueue_active(&t->commit_wait))
3002
3060
wake_up(&t->commit_wait);
3003
mutex_unlock(&root->fs_info->trans_mutex);
3005
mutex_lock(&root->fs_info->trans_mutex);
3007
3062
btrfs_destroy_pending_snapshots(t);
3009
3064
btrfs_destroy_delalloc_inodes(root);
3011
spin_lock(&root->fs_info->new_trans_lock);
3066
spin_lock(&root->fs_info->trans_lock);
3012
3067
root->fs_info->running_transaction = NULL;
3013
spin_unlock(&root->fs_info->new_trans_lock);
3068
spin_unlock(&root->fs_info->trans_lock);
3015
3070
btrfs_destroy_marked_extents(root, &t->dirty_pages,
3018
3073
btrfs_destroy_pinned_extent(root,
3019
3074
root->fs_info->pinned_extents);
3076
atomic_set(&t->use_count, 0);
3022
3077
list_del_init(&t->list);
3023
3078
memset(t, 0, sizeof(*t));
3024
3079
kmem_cache_free(btrfs_transaction_cachep, t);
3082
spin_lock(&root->fs_info->trans_lock);
3083
root->fs_info->trans_no_join = 0;
3084
spin_unlock(&root->fs_info->trans_lock);
3027
3085
mutex_unlock(&root->fs_info->transaction_kthread_mutex);
3028
mutex_unlock(&root->fs_info->trans_mutex);