~ubuntu-branches/ubuntu/precise/linux-ti-omap4/precise

Viewing changes to fs/btrfs/disk-io.c

  • Committer: Bazaar Package Importer
  • Author(s): Paolo Pisati
  • Date: 2011-06-29 15:23:51 UTC
  • mfrom: (26.1.1 natty-proposed)
  • Revision ID: james.westby@ubuntu.com-20110629152351-xs96tm303d95rpbk
Tags: 3.0.0-1200.2
* Rebased against 3.0.0-6.7
* BSP from TI based on 3.0.0

@@ -29 +29 @@
 #include <linux/crc32c.h>
 #include <linux/slab.h>
 #include <linux/migrate.h>
+#include <linux/ratelimit.h>
+#include <asm/unaligned.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -40 +42 @@
 #include "locking.h"
 #include "tree-log.h"
 #include "free-space-cache.h"
+#include "inode-map.h"
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
@@ -136 +139 @@
  * that covers the entire device
  */
 static struct extent_map *btree_get_extent(struct inode *inode,
-                struct page *page, size_t page_offset, u64 start, u64 len,
+                struct page *page, size_t pg_offset, u64 start, u64 len,
                 int create)
 {
         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
@@ -153 +156 @@
         }
         read_unlock(&em_tree->lock);
 
-        em = alloc_extent_map(GFP_NOFS);
+        em = alloc_extent_map();
         if (!em) {
                 em = ERR_PTR(-ENOMEM);
                 goto out;
@@ -198 +201 @@
 
 void btrfs_csum_final(u32 crc, char *result)
 {
-        *(__le32 *)result = ~cpu_to_le32(crc);
+        put_unaligned_le32(~crc, result);
 }
 
 /*
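Note on the hunk above: put_unaligned_le32() (from the newly included <asm/unaligned.h>) stores the checksum byte-wise, so it is safe even when the result buffer is not 32-bit aligned, unlike the old cast-and-store. A minimal sketch of the two forms, assuming a caller-supplied byte buffer (the function name is illustrative, not part of the patch):

    #include <asm/unaligned.h>      /* put_unaligned_le32() */

    /* Illustrative only: write a finalized CRC into a byte buffer. */
    static void csum_store_sketch(u32 crc, char *result)
    {
            /* Old form: assumes 'result' is 4-byte aligned; may fault or be
             * slow on architectures without hardware unaligned access:
             *     *(__le32 *)result = ~cpu_to_le32(crc);
             */

            /* New form: little-endian store that is valid at any alignment. */
            put_unaligned_le32(~crc, result);
    }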
@@ -253 +256 @@
                         memcpy(&found, result, csum_size);
 
                         read_extent_buffer(buf, &val, 0, csum_size);
-                        if (printk_ratelimit()) {
-                                printk(KERN_INFO "btrfs: %s checksum verify "
+                        printk_ratelimited(KERN_INFO "btrfs: %s checksum verify "
                                        "failed on %llu wanted %X found %X "
                                        "level %d\n",
                                        root->fs_info->sb->s_id,
                                        (unsigned long long)buf->start, val, found,
                                        btrfs_header_level(buf));
-                        }
                         if (result != (char *)&inline_result)
                                 kfree(result);
                         return 1;
@@ -295 +296 @@
                 ret = 0;
                 goto out;
         }
-        if (printk_ratelimit()) {
-                printk("parent transid verify failed on %llu wanted %llu "
+        printk_ratelimited("parent transid verify failed on %llu wanted %llu "
                "found %llu\n",
                (unsigned long long)eb->start,
                (unsigned long long)parent_transid,
                (unsigned long long)btrfs_header_generation(eb));
-        }
         ret = 1;
         clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
 out:
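The two hunks above (and similar ones later in this diff) replace the open-coded rate-limit check with printk_ratelimited(), which keeps per-call-site ratelimit state; <linux/ratelimit.h> is added to the includes at the top of the file. A minimal sketch of the before/after pattern, with an illustrative message:

    #include <linux/kernel.h>
    #include <linux/ratelimit.h>

    static void report_sketch(u64 block)
    {
            /* Old pattern: shared, global printk rate limit checked by hand. */
            if (printk_ratelimit())
                    printk(KERN_INFO "btrfs: example message for block %llu\n",
                           (unsigned long long)block);

            /* New pattern: one line, rate-limited per call site. */
            printk_ratelimited(KERN_INFO "btrfs: example message for block %llu\n",
                               (unsigned long long)block);
    }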
@@ -323 +322 @@
         int num_copies = 0;
         int mirror_num = 0;
 
+        clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
         io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
         while (1) {
                 ret = read_extent_buffer_pages(io_tree, eb, start, 1,
@@ -331 +331 @@
                     !verify_parent_transid(io_tree, eb, parent_transid))
                         return ret;
 
+                /*
+                 * This buffer's crc is fine, but its contents are corrupted, so
+                 * there is no reason to read the other copies, they won't be
+                 * any less wrong.
+                 */
+                if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
+                        return ret;
+
                 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
                                               eb->start, eb->len);
                 if (num_copies == 1)
@@ -370 +378 @@
         len = page->private >> 2;
         WARN_ON(len == 0);
 
-        eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+        eb = alloc_extent_buffer(tree, start, len, page);
         if (eb == NULL) {
                 WARN_ON(1);
                 goto out;
@@ -419 +427 @@
         return ret;
 }
 
+#define CORRUPT(reason, eb, root, slot)                         \
+        printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \
+               "root=%llu, slot=%d\n", reason,                  \
+               (unsigned long long)btrfs_header_bytenr(eb),     \
+               (unsigned long long)root->objectid, slot)
+
+static noinline int check_leaf(struct btrfs_root *root,
+                               struct extent_buffer *leaf)
+{
+        struct btrfs_key key;
+        struct btrfs_key leaf_key;
+        u32 nritems = btrfs_header_nritems(leaf);
+        int slot;
+
+        if (nritems == 0)
+                return 0;
+
+        /* Check the 0 item */
+        if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
+            BTRFS_LEAF_DATA_SIZE(root)) {
+                CORRUPT("invalid item offset size pair", leaf, root, 0);
+                return -EIO;
+        }
+
+        /*
+         * Check to make sure each items keys are in the correct order and their
+         * offsets make sense.  We only have to loop through nritems-1 because
+         * we check the current slot against the next slot, which verifies the
+         * next slot's offset+size makes sense and that the current's slot
+         * offset is correct.
+         */
+        for (slot = 0; slot < nritems - 1; slot++) {
+                btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
+                btrfs_item_key_to_cpu(leaf, &key, slot + 1);
+
+                /* Make sure the keys are in the right order */
+                if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
+                        CORRUPT("bad key order", leaf, root, slot);
+                        return -EIO;
+                }
+
+                /*
+                 * Make sure the offset and ends are right, remember that the
+                 * item data starts at the end of the leaf and grows towards the
+                 * front.
+                 */
+                if (btrfs_item_offset_nr(leaf, slot) !=
+                        btrfs_item_end_nr(leaf, slot + 1)) {
+                        CORRUPT("slot offset bad", leaf, root, slot);
+                        return -EIO;
+                }
+
+                /*
+                 * Check to make sure that we don't point outside of the leaf,
+                 * just incase all the items are consistent to eachother, but
+                 * all point outside of the leaf.
+                 */
+                if (btrfs_item_end_nr(leaf, slot) >
+                    BTRFS_LEAF_DATA_SIZE(root)) {
+                        CORRUPT("slot end outside of leaf", leaf, root, slot);
+                        return -EIO;
+                }
+        }
+
+        return 0;
+}
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
 {
@@ -448 +523 @@
         len = page->private >> 2;
         WARN_ON(len == 0);
 
-        eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+        eb = alloc_extent_buffer(tree, start, len, page);
         if (eb == NULL) {
                 ret = -EIO;
                 goto out;
@@ -456 +531 @@
 
         found_start = btrfs_header_bytenr(eb);
         if (found_start != start) {
-                if (printk_ratelimit()) {
-                        printk(KERN_INFO "btrfs bad tree block start "
+                printk_ratelimited(KERN_INFO "btrfs bad tree block start "
                                "%llu %llu\n",
                                (unsigned long long)found_start,
                                (unsigned long long)eb->start);
-                }
                 ret = -EIO;
                 goto err;
         }
@@ -473 +546 @@
                 goto err;
         }
         if (check_tree_block_fsid(root, eb)) {
-                if (printk_ratelimit()) {
-                        printk(KERN_INFO "btrfs bad fsid on block %llu\n",
+                printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
                                (unsigned long long)eb->start);
-                }
                 ret = -EIO;
                 goto err;
         }
@@ -485 +556 @@
         btrfs_set_buffer_lockdep_class(eb, found_level);
 
         ret = csum_tree_block(root, eb, 1);
-        if (ret)
-                ret = -EIO;
+        if (ret) {
+                ret = -EIO;
+                goto err;
+        }
+
+        /*
+         * If this is a leaf block and it is corrupt, set the corrupt bit so
+         * that we don't try and read the other copies of this block, just
+         * return -EIO.
+         */
+        if (found_level == 0 && check_leaf(root, eb)) {
+                set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
+                ret = -EIO;
+        }
 
         end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
         end = eb->start + end - 1;
@@ -561 +644 @@
         return 256 * limit;
 }
 
-int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
-{
-        return atomic_read(&info->nr_async_bios) >
-                btrfs_async_submit_limit(info);
-}
-
 static void run_one_async_start(struct btrfs_work *work)
 {
         struct async_submit_bio *async;
@@ -847 +924 @@
         .writepages     = btree_writepages,
         .releasepage    = btree_releasepage,
         .invalidatepage = btree_invalidatepage,
-        .sync_page      = block_sync_page,
 #ifdef CONFIG_MIGRATION
         .migratepage    = btree_migratepage,
 #endif
@@ -875 +951 @@
         struct inode *btree_inode = root->fs_info->btree_inode;
         struct extent_buffer *eb;
         eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
-                                bytenr, blocksize, GFP_NOFS);
+                                bytenr, blocksize);
         return eb;
 }
 
@@ -886 +962 @@
         struct extent_buffer *eb;
 
         eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
-                                 bytenr, blocksize, NULL, GFP_NOFS);
+                                 bytenr, blocksize, NULL);
         return eb;
 }
 
@@ -968 +1044 @@
         root->last_trans = 0;
         root->highest_objectid = 0;
         root->name = NULL;
-        root->in_sysfs = 0;
         root->inode_tree = RB_ROOT;
+        INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
         root->block_rsv = NULL;
         root->orphan_block_rsv = NULL;
 
         INIT_LIST_HEAD(&root->dirty_list);
         INIT_LIST_HEAD(&root->orphan_list);
         INIT_LIST_HEAD(&root->root_list);
-        spin_lock_init(&root->node_lock);
         spin_lock_init(&root->orphan_lock);
         spin_lock_init(&root->inode_lock);
         spin_lock_init(&root->accounting_lock);
@@ -992 +1067 @@
         root->log_transid = 0;
         root->last_log_commit = 0;
         extent_io_tree_init(&root->dirty_log_pages,
-                             fs_info->btree_inode->i_mapping, GFP_NOFS);
+                             fs_info->btree_inode->i_mapping);
 
         memset(&root->root_key, 0, sizeof(root->root_key));
         memset(&root->root_item, 0, sizeof(root->root_item));
@@ -1160 +1235 @@
                      root, fs_info, location->objectid);
 
         path = btrfs_alloc_path();
-        BUG_ON(!path);
+        if (!path) {
+                kfree(root);
+                return ERR_PTR(-ENOMEM);
+        }
         ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
         if (ret == 0) {
                 l = path->nodes[0];
@@ -1192 +1270 @@
         return root;
 }
 
-struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
-                                        u64 root_objectid)
-{
-        struct btrfs_root *root;
-
-        if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
-                return fs_info->tree_root;
-        if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
-                return fs_info->extent_root;
-
-        root = radix_tree_lookup(&fs_info->fs_roots_radix,
-                                 (unsigned long)root_objectid);
-        return root;
-}
-
 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
                                               struct btrfs_key *location)
 {
@@ -1235 +1298 @@
         if (IS_ERR(root))
                 return root;
 
-        set_anon_super(&root->anon_super, NULL);
+        root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
+        root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
+                                        GFP_NOFS);
+        if (!root->free_ino_pinned || !root->free_ino_ctl) {
+                ret = -ENOMEM;
+                goto fail;
+        }
+
+        btrfs_init_free_ino_ctl(root);
+        mutex_init(&root->fs_commit_mutex);
+        spin_lock_init(&root->cache_lock);
+        init_waitqueue_head(&root->cache_wait);
+
+        ret = set_anon_super(&root->anon_super, NULL);
+        if (ret)
+                goto fail;
 
         if (btrfs_root_refs(&root->root_item) == 0) {
                 ret = -ENOENT;
@@ -1278 +1356 @@
         return ERR_PTR(ret);
 }
 
-struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
-                                      struct btrfs_key *location,
-                                      const char *name, int namelen)
-{
-        return btrfs_read_fs_root_no_name(fs_info, location);
-#if 0
-        struct btrfs_root *root;
-        int ret;
-
-        root = btrfs_read_fs_root_no_name(fs_info, location);
-        if (!root)
-                return NULL;
-
-        if (root->in_sysfs)
-                return root;
-
-        ret = btrfs_set_root_name(root, name, namelen);
-        if (ret) {
-                free_extent_buffer(root->node);
-                kfree(root);
-                return ERR_PTR(ret);
-        }
-
-        ret = btrfs_sysfs_add_root(root);
-        if (ret) {
-                free_extent_buffer(root->node);
-                kfree(root->name);
-                kfree(root);
-                return ERR_PTR(ret);
-        }
-        root->in_sysfs = 1;
-        return root;
-#endif
-}
-
 static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 {
         struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
@@ -1320 +1363 @@
         struct btrfs_device *device;
         struct backing_dev_info *bdi;
 
-        list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
+        rcu_read_lock();
+        list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
                 if (!device->bdev)
                         continue;
                 bdi = blk_get_backing_dev_info(device->bdev);
@@ -1329 +1373 @@
                         break;
                 }
         }
+        rcu_read_unlock();
         return ret;
 }
 
 /*
- * this unplugs every device on the box, and it is only used when page
- * is null
- */
-static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-        struct btrfs_device *device;
-        struct btrfs_fs_info *info;
-
-        info = (struct btrfs_fs_info *)bdi->unplug_io_data;
-        list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
-                if (!device->bdev)
-                        continue;
-
-                bdi = blk_get_backing_dev_info(device->bdev);
-                if (bdi->unplug_io_fn)
-                        bdi->unplug_io_fn(bdi, page);
-        }
-}
-
-static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-        struct inode *inode;
-        struct extent_map_tree *em_tree;
-        struct extent_map *em;
-        struct address_space *mapping;
-        u64 offset;
-
-        /* the generic O_DIRECT read code does this */
-        if (1 || !page) {
-                __unplug_io_fn(bdi, page);
-                return;
-        }
-
-        /*
-         * page->mapping may change at any time.  Get a consistent copy
-         * and use that for everything below
-         */
-        smp_mb();
-        mapping = page->mapping;
-        if (!mapping)
-                return;
-
-        inode = mapping->host;
-
-        /*
-         * don't do the expensive searching for a small number of
-         * devices
-         */
-        if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
-                __unplug_io_fn(bdi, page);
-                return;
-        }
-
-        offset = page_offset(page);
-
-        em_tree = &BTRFS_I(inode)->extent_tree;
-        read_lock(&em_tree->lock);
-        em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
-        read_unlock(&em_tree->lock);
-        if (!em) {
-                __unplug_io_fn(bdi, page);
-                return;
-        }
-
-        if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-                free_extent_map(em);
-                __unplug_io_fn(bdi, page);
-                return;
-        }
-        offset = offset - em->start;
-        btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
-                          em->block_start + offset, page);
-        free_extent_map(em);
-}
-
-/*
  * If this fails, caller must call bdi_destroy() to get rid of the
  * bdi again.
  */
@@ -1422 +1391 @@
                 return err;
 
         bdi->ra_pages   = default_backing_dev_info.ra_pages;
-        bdi->unplug_io_fn       = btrfs_unplug_io_fn;
-        bdi->unplug_io_data     = info;
         bdi->congested_fn       = btrfs_congested_fn;
         bdi->congested_data     = info;
         return 0;
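The congestion callback above now walks fs_devices->devices under RCU instead of bare list iteration, and the per-device unplug path is dropped entirely. A minimal sketch of the RCU read-side list traversal pattern the new code adopts (the struct and list here are illustrative, not btrfs types):

    #include <linux/rculist.h>
    #include <linux/rcupdate.h>

    struct demo_dev {
            struct list_head dev_list;
            int congested;
    };

    /* Readers hold rcu_read_lock() and use the _rcu iterator; writers
     * publish with list_add_rcu()/list_del_rcu() and free entries only
     * after a grace period has elapsed. */
    static int any_congested_sketch(struct list_head *devices)
    {
            struct demo_dev *dev;
            int ret = 0;

            rcu_read_lock();
            list_for_each_entry_rcu(dev, devices, dev_list) {
                    if (dev->congested) {
                            ret = 1;
                            break;
                    }
            }
            rcu_read_unlock();
            return ret;
    }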
@@ -1509 +1476 @@
                         btrfs_run_delayed_iputs(root);
                         btrfs_clean_old_snapshots(root);
                         mutex_unlock(&root->fs_info->cleaner_mutex);
+                        btrfs_run_defrag_inodes(root->fs_info);
                 }
 
                 if (freezing(current)) {
@@ -1538 +1506 @@
                 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
                 mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
-                spin_lock(&root->fs_info->new_trans_lock);
+                spin_lock(&root->fs_info->trans_lock);
                 cur = root->fs_info->running_transaction;
                 if (!cur) {
-                        spin_unlock(&root->fs_info->new_trans_lock);
+                        spin_unlock(&root->fs_info->trans_lock);
                         goto sleep;
                 }
 
                 now = get_seconds();
                 if (!cur->blocked &&
                     (now < cur->start_time || now - cur->start_time < 30)) {
-                        spin_unlock(&root->fs_info->new_trans_lock);
+                        spin_unlock(&root->fs_info->trans_lock);
                         delay = HZ * 5;
                         goto sleep;
                 }
                 transid = cur->transid;
-                spin_unlock(&root->fs_info->new_trans_lock);
+                spin_unlock(&root->fs_info->trans_lock);
 
-                trans = btrfs_join_transaction(root, 1);
+                trans = btrfs_join_transaction(root);
                 BUG_ON(IS_ERR(trans));
                 if (transid == trans->transid) {
                         ret = btrfs_commit_transaction(trans, root);
@@ -1598 +1566 @@
         struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
                                                  GFP_NOFS);
         struct btrfs_root *tree_root = btrfs_sb(sb);
-        struct btrfs_fs_info *fs_info = tree_root->fs_info;
+        struct btrfs_fs_info *fs_info = NULL;
         struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
                                                 GFP_NOFS);
         struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
@@ -1610 +1578 @@
 
         struct btrfs_super_block *disk_super;
 
-        if (!extent_root || !tree_root || !fs_info ||
+        if (!extent_root || !tree_root || !tree_root->fs_info ||
             !chunk_root || !dev_root || !csum_root) {
                 err = -ENOMEM;
                 goto fail;
         }
+        fs_info = tree_root->fs_info;
 
         ret = init_srcu_struct(&fs_info->subvol_srcu);
         if (ret) {
@@ -1634 +1603 @@
                 goto fail_bdi;
         }
 
+        fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS;
+
         INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
         INIT_LIST_HEAD(&fs_info->trans_list);
         INIT_LIST_HEAD(&fs_info->dead_roots);
@@ -1643 +1614 @@
         INIT_LIST_HEAD(&fs_info->ordered_operations);
         INIT_LIST_HEAD(&fs_info->caching_block_groups);
         spin_lock_init(&fs_info->delalloc_lock);
-        spin_lock_init(&fs_info->new_trans_lock);
+        spin_lock_init(&fs_info->trans_lock);
         spin_lock_init(&fs_info->ref_cache_lock);
         spin_lock_init(&fs_info->fs_roots_radix_lock);
         spin_lock_init(&fs_info->delayed_iput_lock);
+        spin_lock_init(&fs_info->defrag_inodes_lock);
+        mutex_init(&fs_info->reloc_mutex);
 
         init_completion(&fs_info->kobj_unregister);
         fs_info->tree_root = tree_root;
@@ -1669 +1642 @@
         atomic_set(&fs_info->async_delalloc_pages, 0);
         atomic_set(&fs_info->async_submit_draining, 0);
         atomic_set(&fs_info->nr_async_bios, 0);
+        atomic_set(&fs_info->defrag_running, 0);
         fs_info->sb = sb;
         fs_info->max_inline = 8192 * 1024;
         fs_info->metadata_ratio = 0;
+        fs_info->defrag_inodes = RB_ROOT;
+        fs_info->trans_no_join = 0;
 
         fs_info->thread_pool_size = min_t(unsigned long,
                                           num_online_cpus() + 2, 8);
 
         INIT_LIST_HEAD(&fs_info->ordered_extents);
         spin_lock_init(&fs_info->ordered_extent_lock);
+        fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
+                                        GFP_NOFS);
+        if (!fs_info->delayed_root) {
+                err = -ENOMEM;
+                goto fail_iput;
+        }
+        btrfs_init_delayed_root(fs_info->delayed_root);
+
+        mutex_init(&fs_info->scrub_lock);
+        atomic_set(&fs_info->scrubs_running, 0);
+        atomic_set(&fs_info->scrub_pause_req, 0);
+        atomic_set(&fs_info->scrubs_paused, 0);
+        atomic_set(&fs_info->scrub_cancel_req, 0);
+        init_waitqueue_head(&fs_info->scrub_pause_wait);
+        init_rwsem(&fs_info->scrub_super_lock);
+        fs_info->scrub_workers_refcnt = 0;
 
         sb->s_blocksize = 4096;
         sb->s_blocksize_bits = blksize_bits(4096);
@@ -1696 +1688 @@
 
         RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
         extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
-                             fs_info->btree_inode->i_mapping,
-                             GFP_NOFS);
-        extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
-                             GFP_NOFS);
+                             fs_info->btree_inode->i_mapping);
+        extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
 
         BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
 
@@ -1713 +1703 @@
         fs_info->block_group_cache_tree = RB_ROOT;
 
         extent_io_tree_init(&fs_info->freed_extents[0],
-                             fs_info->btree_inode->i_mapping, GFP_NOFS);
+                             fs_info->btree_inode->i_mapping);
         extent_io_tree_init(&fs_info->freed_extents[1],
-                             fs_info->btree_inode->i_mapping, GFP_NOFS);
+                             fs_info->btree_inode->i_mapping);
         fs_info->pinned_extents = &fs_info->freed_extents[0];
         fs_info->do_barriers = 1;
 
 
-        mutex_init(&fs_info->trans_mutex);
         mutex_init(&fs_info->ordered_operations_mutex);
         mutex_init(&fs_info->tree_log_mutex);
         mutex_init(&fs_info->chunk_mutex);
@@ -1745 +1734 @@
         bh = btrfs_read_dev_super(fs_devices->latest_bdev);
         if (!bh) {
                 err = -EINVAL;
-                goto fail_iput;
+                goto fail_alloc;
         }
 
         memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
@@ -1757 +1746 @@
 
         disk_super = &fs_info->super_copy;
         if (!btrfs_super_root(disk_super))
-                goto fail_iput;
+                goto fail_alloc;
 
         /* check FS state, whether FS is broken. */
         fs_info->fs_state |= btrfs_super_flags(disk_super);
 
         btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
 
+        /*
+         * In the long term, we'll store the compression type in the super
+         * block, and it'll be used for per file compression control.
+         */
+        fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+
         ret = btrfs_parse_options(tree_root, options);
         if (ret) {
                 err = ret;
-                goto fail_iput;
+                goto fail_alloc;
         }
 
         features = btrfs_super_incompat_flags(disk_super) &
@@ -1777 +1772 @@
                        "unsupported optional features (%Lx).\n",
                        (unsigned long long)features);
                 err = -EINVAL;
-                goto fail_iput;
+                goto fail_alloc;
         }
 
         features = btrfs_super_incompat_flags(disk_super);
@@ -1793 +1788 @@
                        "unsupported option features (%Lx).\n",
                        (unsigned long long)features);
                 err = -EINVAL;
-                goto fail_iput;
+                goto fail_alloc;
         }
 
         btrfs_init_workers(&fs_info->generic_worker,
@@ -1840 +1835 @@
                            &fs_info->generic_worker);
         btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
                            1, &fs_info->generic_worker);
+        btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
+                           fs_info->thread_pool_size,
+                           &fs_info->generic_worker);
 
         /*
          * endios are largely parallel and should have a very
@@ -1861 +1859 @@
         btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
         btrfs_start_workers(&fs_info->endio_write_workers, 1);
         btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
+        btrfs_start_workers(&fs_info->delayed_workers, 1);
 
         fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
         fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1969 +1968 @@
         fs_info->metadata_alloc_profile = (u64)-1;
         fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
 
+        ret = btrfs_init_space_info(fs_info);
+        if (ret) {
+                printk(KERN_ERR "Failed to initial space info: %d\n", ret);
+                goto fail_block_groups;
+        }
+
         ret = btrfs_read_block_groups(extent_root);
         if (ret) {
                 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
@@ -2060 +2065 @@
 
         if (!(sb->s_flags & MS_RDONLY)) {
                 down_read(&fs_info->cleanup_work_sem);
-                btrfs_orphan_cleanup(fs_info->fs_root);
-                btrfs_orphan_cleanup(fs_info->tree_root);
+                err = btrfs_orphan_cleanup(fs_info->fs_root);
+                if (!err)
+                        err = btrfs_orphan_cleanup(fs_info->tree_root);
                 up_read(&fs_info->cleanup_work_sem);
+                if (err) {
+                        close_ctree(tree_root);
+                        return ERR_PTR(err);
+                }
         }
 
         return tree_root;
@@ -2106 +2116 @@
         btrfs_stop_workers(&fs_info->endio_write_workers);
         btrfs_stop_workers(&fs_info->endio_freespace_worker);
         btrfs_stop_workers(&fs_info->submit_workers);
+        btrfs_stop_workers(&fs_info->delayed_workers);
+fail_alloc:
+        kfree(fs_info->delayed_root);
 fail_iput:
         invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
         iput(fs_info->btree_inode);
@@ -2133 +2146 @@
         if (uptodate) {
                 set_buffer_uptodate(bh);
         } else {
-                if (printk_ratelimit()) {
-                        printk(KERN_WARNING "lost page write due to "
+                printk_ratelimited(KERN_WARNING "lost page write due to "
                                         "I/O error on %s\n",
                                        bdevname(bh->b_bdev, b));
-                }
                 /* note, we dont' set_buffer_write_io_error because we have
                  * our own ways of dealing with the IO errors
                  */
@@ -2301 +2312 @@
 
         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
         head = &root->fs_info->fs_devices->devices;
-        list_for_each_entry(dev, head, dev_list) {
+        list_for_each_entry_rcu(dev, head, dev_list) {
                 if (!dev->bdev) {
                         total_errors++;
                         continue;
@@ -2334 +2345 @@
         }
 
         total_errors = 0;
-        list_for_each_entry(dev, head, dev_list) {
+        list_for_each_entry_rcu(dev, head, dev_list) {
                 if (!dev->bdev)
                         continue;
                 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2372 +2383 @@
         if (btrfs_root_refs(&root->root_item) == 0)
                 synchronize_srcu(&fs_info->subvol_srcu);
 
+        __btrfs_remove_free_space_cache(root->free_ino_pinned);
+        __btrfs_remove_free_space_cache(root->free_ino_ctl);
         free_fs_root(root);
         return 0;
 }
 
 static void free_fs_root(struct btrfs_root *root)
 {
+        iput(root->cache_inode);
         WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
         if (root->anon_super.s_dev) {
                 down_write(&root->anon_super.s_umount);
@@ -2385 +2399 @@
         }
         free_extent_buffer(root->node);
         free_extent_buffer(root->commit_root);
+        kfree(root->free_ino_ctl);
+        kfree(root->free_ino_pinned);
         kfree(root->name);
         kfree(root);
 }
@@ -2437 +2453 @@
 
                 root_objectid = gang[ret - 1]->root_key.objectid + 1;
                 for (i = 0; i < ret; i++) {
+                        int err;
+
                         root_objectid = gang[i]->root_key.objectid;
-                        btrfs_orphan_cleanup(gang[i]);
+                        err = btrfs_orphan_cleanup(gang[i]);
+                        if (err)
+                                return err;
                 }
                 root_objectid++;
         }
@@ -2459 +2479 @@
         down_write(&root->fs_info->cleanup_work_sem);
         up_write(&root->fs_info->cleanup_work_sem);
 
-        trans = btrfs_join_transaction(root, 1);
+        trans = btrfs_join_transaction(root);
         if (IS_ERR(trans))
                 return PTR_ERR(trans);
         ret = btrfs_commit_transaction(trans, root);
         BUG_ON(ret);
         /* run commit again to drop the original snapshot */
-        trans = btrfs_join_transaction(root, 1);
+        trans = btrfs_join_transaction(root);
         if (IS_ERR(trans))
                 return PTR_ERR(trans);
         btrfs_commit_transaction(trans, root);
@@ -2484 +2504 @@
         fs_info->closing = 1;
         smp_mb();
 
+        btrfs_scrub_cancel(root);
+
+        /* wait for any defraggers to finish */
+        wait_event(fs_info->transaction_wait,
+                   (atomic_read(&fs_info->defrag_running) == 0));
+
+        /* clear out the rbtree of defraggable inodes */
+        btrfs_run_defrag_inodes(root->fs_info);
+
         btrfs_put_block_group_cache(fs_info);
 
         /*
@@ -2495 +2524 @@
          * ERROR state on disk.
          *
          * 2. when btrfs flips readonly just in btrfs_commit_super,
-         * and in such case, btrfs cannnot write sb via btrfs_commit_super,
+         * and in such case, btrfs cannot write sb via btrfs_commit_super,
          * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
          * btrfs will cleanup all FS resources first and write sb then.
          */
@@ -2542 +2571 @@
         del_fs_roots(fs_info);
 
         iput(fs_info->btree_inode);
+        kfree(fs_info->delayed_root);
 
         btrfs_stop_workers(&fs_info->generic_worker);
         btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2553 +2583 @@
         btrfs_stop_workers(&fs_info->endio_write_workers);
         btrfs_stop_workers(&fs_info->endio_freespace_worker);
         btrfs_stop_workers(&fs_info->submit_workers);
+        btrfs_stop_workers(&fs_info->delayed_workers);
 
         btrfs_close_devices(fs_info->fs_devices);
         btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2629 +2660 @@
         if (current->flags & PF_MEMALLOC)
                 return;
 
+        btrfs_balance_delayed_items(root);
+
+        num_dirty = root->fs_info->dirty_metadata_bytes;
+
+        if (num_dirty > thresh) {
+                balance_dirty_pages_ratelimited_nr(
+                                   root->fs_info->btree_inode->i_mapping, 1);
+        }
+        return;
+}
+
+void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+{
+        /*
+         * looks as though older kernels can get into trouble with
+         * this code, they end up stuck in balance_dirty_pages forever
+         */
+        u64 num_dirty;
+        unsigned long thresh = 32 * 1024 * 1024;
+
+        if (current->flags & PF_MEMALLOC)
+                return;
+
         num_dirty = root->fs_info->dirty_metadata_bytes;
 
         if (num_dirty > thresh) {
@@ -2661 +2715 @@
                 goto out;
 
         len = page->private >> 2;
-        eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
+        eb = find_extent_buffer(io_tree, bytenr, len);
         if (!eb)
                 goto out;
 
@@ -2788 +2842 @@
 
         spin_lock(&delayed_refs->lock);
         if (delayed_refs->num_entries == 0) {
+                spin_unlock(&delayed_refs->lock);
                 printk(KERN_INFO "delayed_refs has NO entry\n");
                 return ret;
         }
@@ -2856 +2911 @@
 
         INIT_LIST_HEAD(&splice);
 
+        spin_lock(&root->fs_info->delalloc_lock);
         list_splice_init(&root->fs_info->delalloc_inodes, &splice);
 
-        spin_lock(&root->fs_info->delalloc_lock);
-
         while (!list_empty(&splice)) {
                 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
                                     delalloc_inodes);
@@ -2949 +3003 @@
                         break;
 
                 /* opt_discard */
-                ret = btrfs_error_discard_extent(root, start, end + 1 - start);
+                if (btrfs_test_opt(root, DISCARD))
+                        ret = btrfs_error_discard_extent(root, start,
+                                                         end + 1 - start,
+                                                         NULL);
 
                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
                 btrfs_error_unpin_extent_range(root, start, end);
@@ -2966 +3023 @@
 
         WARN_ON(1);
 
-        mutex_lock(&root->fs_info->trans_mutex);
         mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
+        spin_lock(&root->fs_info->trans_lock);
         list_splice_init(&root->fs_info->trans_list, &list);
+        root->fs_info->trans_no_join = 1;
+        spin_unlock(&root->fs_info->trans_lock);
+
         while (!list_empty(&list)) {
                 t = list_entry(list.next, struct btrfs_transaction, list);
                 if (!t)
@@ -2994 +3054 @@
                 t->blocked = 0;
                 if (waitqueue_active(&root->fs_info->transaction_wait))
                         wake_up(&root->fs_info->transaction_wait);
-                mutex_unlock(&root->fs_info->trans_mutex);
 
-                mutex_lock(&root->fs_info->trans_mutex);
                 t->commit_done = 1;
                 if (waitqueue_active(&t->commit_wait))
                         wake_up(&t->commit_wait);
-                mutex_unlock(&root->fs_info->trans_mutex);
-
-                mutex_lock(&root->fs_info->trans_mutex);
 
                 btrfs_destroy_pending_snapshots(t);
 
                 btrfs_destroy_delalloc_inodes(root);
 
-                spin_lock(&root->fs_info->new_trans_lock);
+                spin_lock(&root->fs_info->trans_lock);
                 root->fs_info->running_transaction = NULL;
-                spin_unlock(&root->fs_info->new_trans_lock);
+                spin_unlock(&root->fs_info->trans_lock);
 
                 btrfs_destroy_marked_extents(root, &t->dirty_pages,
                                              EXTENT_DIRTY);
@@ -3018 +3073 @@
                 btrfs_destroy_pinned_extent(root,
                                             root->fs_info->pinned_extents);
 
-                t->use_count = 0;
+                atomic_set(&t->use_count, 0);
                 list_del_init(&t->list);
                 memset(t, 0, sizeof(*t));
                 kmem_cache_free(btrfs_transaction_cachep, t);
         }
 
+        spin_lock(&root->fs_info->trans_lock);
+        root->fs_info->trans_no_join = 0;
+        spin_unlock(&root->fs_info->trans_lock);
         mutex_unlock(&root->fs_info->transaction_kthread_mutex);
-        mutex_unlock(&root->fs_info->trans_mutex);
 
         return 0;
 }