* Page Region interfaces.
* For pages in filesystems where the blocksize is smaller than the
* pagesize, we use the page->private field (long) to hold a bitmap
* of uptodate regions within the page.
* Each such region is "bytes per page / bits per long" bytes long.
* NBPPR == number-of-bytes-per-page-region
* BTOPR == bytes-to-page-region (rounded up)
* BTOPRT == bytes-to-page-region-truncated (rounded down)
#if (BITS_PER_LONG == 32)
#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */
#elif (BITS_PER_LONG == 64)
#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
#error BITS_PER_LONG must be 32 or 64
#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
first = BTOPR(offset);
final = BTOPRT(offset + length - 1);
first = min(first, final);
mask <<= BITS_PER_LONG - (final - first);
mask >>= BITS_PER_LONG - (final);
ASSERT(offset + length <= PAGE_CACHE_SIZE);
ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
set_page_private(page,
page_private(page) | page_region_mask(offset, length));
if (page_private(page) == ~0UL)
SetPageUptodate(page);
unsigned long mask = page_region_mask(offset, length);
return (mask && (page_private(page) & mask) == mask);
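The constants above set the granularity of the uptodate bitmap. A minimal standalone sketch of that arithmetic, assuming a 4 KiB page (PAGE_CACHE_SHIFT == 12) and BITS_PER_LONG == 64; it is illustrative only and not part of this patch:

#include <stdio.h>

#define PAGE_CACHE_SHIFT	12
#define PAGE_CACHE_SIZE		(1UL << PAGE_CACHE_SHIFT)
#define BITS_PER_LONG		64
#define PRSHIFT			(PAGE_CACHE_SHIFT - 6)	/* 64 == 1 << 6 */
#define NBPPR			(PAGE_CACHE_SIZE / BITS_PER_LONG)
#define BTOPR(b)		(((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
#define BTOPRT(b)		(((unsigned int)(b) >> PRSHIFT))

int main(void)
{
	/* Each bit of page->private covers NBPPR bytes: 4096 / 64 = 64. */
	printf("region size   = %lu bytes\n", (unsigned long)NBPPR);
	printf("regions/page  = %lu\n", (unsigned long)(PAGE_CACHE_SIZE / NBPPR));

	/* BTOPR rounds a byte count up to regions, BTOPRT rounds down. */
	printf("BTOPR(100)    = %lu\n", (unsigned long)BTOPR(100));	/* 2 */
	printf("BTOPRT(100)   = %lu\n", (unsigned long)BTOPRT(100));	/* 1 */
	printf("BTOPRT(4095)  = %lu\n", (unsigned long)BTOPRT(4095));	/* 63 */
	return 0;
}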
* xfs_buf_lru_add - add a buffer to the LRU.
* The LRU takes a new reference to the buffer so that it will only be freed
* The unlocked check is safe here because it only occurs when there are not
* b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is there
* to optimise the shrinker removing the buffer from the LRU and calling
* xfs_buf_free(). i.e. it removes an unnecessary round trip on the pag->pag_buf_lock.
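The pattern the comment describes, an unlocked read of the reference count so the lock is only taken when there is actually LRU state to tear down, can be sketched in userspace C11. The names below (lru_lock, lru_ref, on_lru) are stand-ins, not the kernel's; this is the shape of the optimisation, not the xfs_buf_rele() code:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;

struct buf {
	atomic_int	lru_ref;	/* references held by the LRU */
	int		on_lru;		/* protected by lru_lock */
};

static void buf_release(struct buf *bp)
{
	/*
	 * Unlocked pre-check: if the LRU holds no reference, there is no
	 * LRU state to tear down, so skip the lock round trip entirely.
	 */
	if (atomic_load(&bp->lru_ref) == 0)
		return;

	pthread_mutex_lock(&lru_lock);
	if (bp->on_lru) {
		bp->on_lru = 0;
		atomic_store(&bp->lru_ref, 0);
	}
	pthread_mutex_unlock(&lru_lock);
}

int main(void)
{
	struct buf b = { .lru_ref = 1, .on_lru = 1 };

	buf_release(&b);
	printf("on_lru=%d lru_ref=%d\n", b.on_lru, atomic_load(&b.lru_ref));
	return 0;
}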
for (i = 0; i < bp->b_page_count; i++) {
struct page *page = bp->b_pages[i];
if (bp->b_flags & _XBF_PAGE_CACHE)
ASSERT(!PagePrivate(page));
page_cache_release(page);
} else if (bp->b_flags & _XBF_KMEM)
kmem_free(bp->b_addr);
_xfs_buf_free_pages(bp);
xfs_buf_deallocate(bp);
* Finds all pages for buffer in question and builds its page list.
* Allocates all the pages for buffer in question and builds its page list.
_xfs_buf_lookup_pages(
xfs_buf_allocate_memory(
struct address_space *mapping = bp->b_target->bt_mapping;
size_t blocksize = bp->b_target->bt_bsize;
size_t size = bp->b_count_desired;
size_t nbytes, offset;
gfp_t gfp_mask = xb_to_gfp(flags);
unsigned short page_count, i;
* for buffers that are contained within a single page, just allocate
* the memory from the heap - there's no need for the complexity of
* page arrays to keep allocation down to order 0.
if (bp->b_buffer_length < PAGE_SIZE) {
bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
/* low memory - use alloc_page loop instead */
if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) & PAGE_MASK) !=
((unsigned long)bp->b_addr & PAGE_MASK)) {
/* b_addr spans two pages - use alloc_page instead */
kmem_free(bp->b_addr);
bp->b_offset = offset_in_page(bp->b_addr);
bp->b_pages = bp->b_page_array;
bp->b_pages[0] = virt_to_page(bp->b_addr);
bp->b_page_count = 1;
bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
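The kmem path above is only taken when the allocation fits entirely inside one page; otherwise a single b_pages[] entry could not describe it. A standalone sketch of that straddle test, assuming 4 KiB pages (illustrative only, not part of this patch):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

static int spans_two_pages(unsigned long addr, unsigned long len)
{
	/* Do the first and last bytes of [addr, addr+len) share a page? */
	return ((addr + len - 1) & PAGE_MASK) != (addr & PAGE_MASK);
}

int main(void)
{
	/* 512 bytes starting 256 bytes into a page: stays in one page. */
	printf("%d\n", spans_two_pages(0x1000 + 256, 512));	/* 0 */
	/* 512 bytes starting 3840 bytes into a page: crosses into the next. */
	printf("%d\n", spans_two_pages(0x1000 + 3840, 512));	/* 1 */
	return 0;
}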
end = bp->b_file_offset + bp->b_buffer_length;
page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
error = _xfs_buf_get_pages(bp, page_count, flags);
if (unlikely(error))
bp->b_flags |= _XBF_PAGE_CACHE;
offset = bp->b_offset;
first = bp->b_file_offset >> PAGE_CACHE_SHIFT;
bp->b_flags |= _XBF_PAGES;
for (i = 0; i < bp->b_page_count; i++) {
struct page *page;
uint retries = 0;
page = find_or_create_page(mapping, first + i, gfp_mask);
page = alloc_page(gfp_mask);
if (unlikely(page == NULL)) {
if (flags & XBF_READ_AHEAD) {
bp->b_page_count = i;
for (i = 0; i < bp->b_page_count; i++)
unlock_page(bp->b_pages[i]);
* handle buffer allocation failures we can't do much.
if (!(++retries % 100))
"XFS: possible memory allocation "
"deadlock in %s (mode:0x%x)\n",
"possible memory allocation deadlock in %s (mode:0x%x)",
__func__, gfp_mask);
XFS_STATS_INC(xb_page_retries);
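For non-readahead allocations the loop above keeps retrying, warning every 100 attempts and backing off before the next try. A rough userspace model of that shape (the allocator, message and delay are stand-ins, not the kernel's congestion_wait()-based code):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static void *try_alloc(size_t sz, int readahead)
{
	unsigned int retries = 0;
	void *p;

	while ((p = malloc(sz)) == NULL) {	/* alloc_page() stand-in */
		if (readahead)
			return NULL;		/* readahead is best effort only */
		if (!(++retries % 100))
			fprintf(stderr,
				"possible allocation deadlock (size %zu)\n", sz);
		/* brief backoff before retrying, roughly 20ms */
		nanosleep(&(struct timespec){ .tv_nsec = 20 * 1000 * 1000 }, NULL);
	}
	return p;
}

int main(void)
{
	free(try_alloc(4096, 0));
	return 0;
}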
XFS_STATS_INC(xb_page_found);
nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
nbytes = min_t(size_t, size, PAGE_SIZE - offset);
ASSERT(!PagePrivate(page));
if (!PageUptodate(page)) {
if (blocksize >= PAGE_CACHE_SIZE) {
if (flags & XBF_READ)
bp->b_flags |= _XBF_PAGE_LOCKED;
} else if (!PagePrivate(page)) {
if (test_page_region(page, offset, nbytes))
bp->b_pages[i] = page;
if (!(bp->b_flags & _XBF_PAGE_LOCKED)) {
for (i = 0; i < bp->b_page_count; i++)
unlock_page(bp->b_pages[i]);
if (page_count == bp->b_page_count)
bp->b_flags |= XBF_DONE;
for (i = 0; i < bp->b_page_count; i++)
__free_page(bp->b_pages[i]);
* Map buffer into kernel address-space if necessary.
_xfs_buf_map_pages(
/* A single page buffer is always mappable */
ASSERT(bp->b_flags & _XBF_PAGES);
if (bp->b_page_count == 1) {
/* A single page buffer is always mappable */
bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
bp->b_flags |= XBF_MAPPED;
} else if (flags & XBF_MAPPED) {
bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
if (unlikely(bp->b_addr == NULL))
bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
} while (retried++ <= 1);
bp->b_addr += bp->b_offset;
bp->b_flags |= XBF_MAPPED;
* if the buffer is stale, clear all the external state associated with
* it. We need to keep flags such as how we allocated the buffer memory
if (bp->b_flags & XBF_STALE) {
ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
bp->b_flags &= XBF_MAPPED;
bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
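When a stale buffer is found and reused, the mask above deliberately keeps only the flags that record how the buffer's memory was obtained, so a later xfs_buf_free() still tears the right thing down. A tiny standalone illustration with stand-in flag values (not the kernel's definitions):

#include <stdio.h>

#define XBF_MAPPED	(1U << 0)	/* stand-in values, not the kernel's */
#define _XBF_KMEM	(1U << 1)
#define _XBF_PAGES	(1U << 2)
#define XBF_STALE	(1U << 3)
#define XBF_DONE	(1U << 4)

int main(void)
{
	unsigned int b_flags = XBF_STALE | XBF_DONE | XBF_MAPPED | _XBF_PAGES;

	/* keep only the allocation/mapping state across reuse */
	b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
	printf("flags after reuse: %#x\n", b_flags);	/* MAPPED|PAGES */
	return 0;
}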
trace_xfs_buf_find(bp, flags, _RET_IP_);
bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
if (bp == new_bp) {
error = _xfs_buf_lookup_pages(bp, flags);
error = xfs_buf_allocate_memory(bp, flags);
for (i = 0; i < bp->b_page_count; i++)
mark_page_accessed(bp->b_pages[i]);
if (!(bp->b_flags & XBF_MAPPED)) {
error = _xfs_buf_map_pages(bp, flags);
if (unlikely(error)) {
printk(KERN_WARNING "%s: failed to map pages\n",
xfs_warn(target->bt_mount,
"%s: failed to map pages\n", __func__);
struct backing_dev_info *bdi;
bdi = target->bt_mapping->backing_dev_info;
if (bdi_read_congested(bdi))
if (bdi_read_congested(target->bt_bdi))
xfs_buf_read(target, ioff, isize,
* Return a buffer allocated as an empty buffer and associated to external
* memory via xfs_buf_associate_memory() back to its empty state.
_xfs_buf_free_pages(bp);
bp->b_page_count = 0;
bp->b_file_offset = 0;
bp->b_buffer_length = bp->b_count_desired = len;
bp->b_bn = XFS_BUF_DADDR_NULL;
bp->b_flags &= ~XBF_MAPPED;
static inline struct page *
pageaddr = (unsigned long)mem & PAGE_CACHE_MASK;
pageaddr = (unsigned long)mem & PAGE_MASK;
offset = (unsigned long)mem - pageaddr;
buflen = PAGE_CACHE_ALIGN(len + offset);
page_count = buflen >> PAGE_CACHE_SHIFT;
buflen = PAGE_ALIGN(len + offset);
page_count = buflen >> PAGE_SHIFT;
/* Free any previous set of page pointers */
for (i = 0; i < bp->b_page_count; i++) {
bp->b_pages[i] = mem_to_page((void *)pageaddr);
pageaddr += PAGE_CACHE_SIZE;
pageaddr += PAGE_SIZE;
bp->b_count_desired = len;
bp->b_buffer_length = buflen;
bp->b_flags |= XBF_MAPPED;
bp->b_flags &= ~_XBF_PAGE_LOCKED;
error = _xfs_buf_map_pages(bp, XBF_MAPPED);
if (unlikely(error)) {
printk(KERN_WARNING "%s: failed to map pages\n",
xfs_warn(target->bt_mount,
"%s: failed to map pages\n", __func__);
goto fail_free_mem;
* Mutual exclusion on buffers. Locking model:
* Buffers associated with inodes for which buffer locking
* is not enabled are not protected by semaphores, and are
* assumed to be exclusively owned by the caller. There is a
* spinlock in the buffer, used by the caller when concurrent
* access is possible.
* Locks a buffer object, if it is not already locked. Note that this in
* no way locks the underlying pages, so it is only useful for
* synchronizing concurrent use of buffer objects, not for synchronizing
* independent access to the underlying pages.
* Lock a buffer object, if it is not already locked.
* If we come across a stale, pinned, locked buffer, we know that we are
* being asked to lock a buffer that has been reallocated. Because it is
* Locks a buffer object.
* Note that this in no way locks the underlying pages, so it is only
* useful for synchronizing concurrent use of buffer objects, not for
* synchronizing independent access to the underlying pages.
* Lock a buffer object.
* If we come across a stale, pinned, locked buffer, we know that we
* are being asked to lock a buffer that has been reallocated. Because
set_current_state(TASK_UNINTERRUPTIBLE);
if (atomic_read(&bp->b_pin_count) == 0)
if (atomic_read(&bp->b_io_remaining))
blk_run_address_space(bp->b_target->bt_mapping);
remove_wait_queue(&bp->b_waiters, &wait);
set_current_state(TASK_RUNNING);
xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
unsigned int blocksize = bp->b_target->bt_bsize;
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
xfs_buf_ioerror(bp, -error);
if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
struct page *page = bvec->bv_page;
ASSERT(!PagePrivate(page));
if (unlikely(bp->b_error)) {
if (bp->b_flags & XBF_READ)
ClearPageUptodate(page);
} else if (blocksize >= PAGE_CACHE_SIZE) {
SetPageUptodate(page);
} else if (!PagePrivate(page) &&
(bp->b_flags & _XBF_PAGE_CACHE)) {
set_page_region(page, bvec->bv_offset, bvec->bv_len);
if (--bvec >= bio->bi_io_vec)
prefetchw(&bvec->bv_page->flags);
if (bp->b_flags & _XBF_PAGE_LOCKED)
} while (bvec >= bio->bi_io_vec);
_xfs_buf_ioend(bp, 1);
(bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
/* Special code path for reading a sub page size buffer -- we populate
* the whole page, and hence the other metadata in the same page. This
* optimization is only valid when the filesystem block size is not
* smaller than the page size.
if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
(XBF_READ|_XBF_PAGE_LOCKED)) &&
(blocksize >= PAGE_CACHE_SIZE)) {
bio = bio_alloc(GFP_NOIO, 1);
bio->bi_bdev = bp->b_target->bt_bdev;
bio->bi_sector = sector - (offset >> BBSHIFT);
bio->bi_end_io = xfs_buf_bio_end_io;
bio->bi_private = bp;
bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0);
atomic_inc(&bp->b_io_remaining);
atomic_inc(&bp->b_io_remaining);
bio->bi_end_io = xfs_buf_bio_end_io;
bio->bi_private = bp;
for (; size && nr_pages; nr_pages--, map_i++) {
int rbytes, nbytes = PAGE_CACHE_SIZE - offset;
int rbytes, nbytes = PAGE_SIZE - offset;
if (nbytes > size)
goto next_chunk;
* if we get here, no pages were added to the bio. However,
* we can't just error out here - if the pages are locked then
* we have to unlock them otherwise we can hang on a later
* access to the page.
xfs_buf_ioerror(bp, EIO);
if (bp->b_flags & _XBF_PAGE_LOCKED) {
for (i = 0; i < bp->b_page_count; i++)
unlock_page(bp->b_pages[i]);
trace_xfs_buf_iowait(bp, _RET_IP_);
if (atomic_read(&bp->b_io_remaining))
blk_run_address_space(bp->b_target->bt_mapping);
wait_for_completion(&bp->b_iowait);
trace_xfs_buf_iowait_done(bp, _RET_IP_);
return XFS_BUF_PTR(bp) + offset;
offset += bp->b_offset;
page = bp->b_pages[offset >> PAGE_CACHE_SHIFT];
return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1));
page = bp->b_pages[offset >> PAGE_SHIFT];
return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
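The unmapped case above turns a byte offset within the buffer into a page index plus an offset within that page. The same arithmetic, standalone, assuming 4 KiB pages (illustrative only):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long offset = 5000;	/* byte 5000 of the buffer mapping */

	printf("page index       = %lu\n", offset >> PAGE_SHIFT);	/* 1 */
	printf("offset into page = %lu\n", offset & (PAGE_SIZE - 1));	/* 904 */
	return 0;
}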
page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
cpoff = xfs_buf_poff(boff + bp->b_offset);
csize = min_t(size_t,
PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff);
PAGE_SIZE-cpoff, bp->b_count_desired-boff);
ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
ASSERT(((csize + cpoff) <= PAGE_SIZE));
switch (mode) {
case XBRW_ZERO:
xfs_flush_buftarg(btp, 1);
if (mp->m_flags & XFS_MOUNT_BARRIER)
xfs_blkdev_issue_flush(btp);
iput(btp->bt_mapping->host);
kthread_stop(btp->bt_task);
kmem_free(btp);
btp->bt_smask = sectorsize - 1;
if (set_blocksize(btp->bt_bdev, sectorsize)) {
"XFS: Cannot set_blocksize to %u on device %s\n",
xfs_warn(btp->bt_mount,
"Cannot set_blocksize to %u on device %s\n",
sectorsize, XFS_BUFTARG_NAME(btp));
(PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
"XFS: %u byte sectors in use on device %s. "
"This is suboptimal; %u or greater is ideal.\n",
sectorsize, XFS_BUFTARG_NAME(btp),
(unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
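The warning above fires when one bit of the page->private bitmap would cover more bytes than a sector, i.e. when PAGE_CACHE_SIZE / BITS_PER_LONG exceeds the sector size, since sub-region updates cannot be tracked exactly by the bitmap. A small standalone check of that threshold; the 64 KiB-page, 32-bit-long combination below is purely hypothetical:

#include <stdio.h>

static void check(unsigned long page_size, unsigned long bits_per_long,
		  unsigned long sectorsize)
{
	unsigned long region = page_size / bits_per_long;

	printf("page=%lu bits=%lu sector=%lu region=%lu -> %s\n",
	       page_size, bits_per_long, sectorsize, region,
	       region > sectorsize ? "suboptimal" : "ok");
}

int main(void)
{
	check(4096, 64, 512);	/* 64-byte regions: ok */
	check(65536, 32, 512);	/* 2048-byte regions: would warn */
	return 0;
}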
struct block_device *bdev)
return xfs_setsize_buftarg_flags(btp,
PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0);
PAGE_SIZE, bdev_logical_block_size(bdev), 0);
xfs_mapping_buftarg(
struct block_device *bdev)
struct backing_dev_info *bdi;
struct inode *inode;
struct address_space *mapping;
static const struct address_space_operations mapping_aops = {
.sync_page = block_sync_page,
.migratepage = fail_migrate_page,
inode = new_inode(bdev->bd_inode->i_sb);
"XFS: Cannot allocate mapping inode for device %s\n",
XFS_BUFTARG_NAME(btp));
inode->i_ino = get_next_ino();
inode->i_mode = S_IFBLK;
inode->i_bdev = bdev;
inode->i_rdev = bdev->bd_dev;
bdi = blk_get_backing_dev_info(bdev);
bdi = &default_backing_dev_info;
mapping = &inode->i_data;
mapping->a_ops = &mapping_aops;
mapping->backing_dev_info = bdi;
mapping_set_gfp_mask(mapping, GFP_NOFS);
btp->bt_mapping = mapping;
xfs_alloc_delwrite_queue(
xfs_buftarg_t *btp,
const char *fsname)
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
btp->bt_bdev = bdev;
btp->bt_bdi = blk_get_backing_dev_info(bdev);
INIT_LIST_HEAD(&btp->bt_lru);
spin_lock_init(&btp->bt_lru_lock);
if (xfs_setsize_buftarg_early(btp, bdev))
if (xfs_mapping_buftarg(btp, bdev))
if (xfs_alloc_delwrite_queue(btp, fsname))
btp->bt_shrinker.shrink = xfs_buftarg_shrink;
xfs_buf_delwri_split(target, &tmp, age);
list_sort(NULL, &tmp, xfs_buf_cmp);
blk_start_plug(&plug);
while (!list_empty(&tmp)) {
struct xfs_buf *bp;
bp = list_first_entry(&tmp, struct xfs_buf, b_list);
list_del_init(&bp->b_list);
xfs_bdstrat_cb(bp);
blk_run_address_space(target->bt_mapping);
blk_finish_plug(&plug);
} while (!kthread_should_stop());
* we do that after issuing all the IO.
list_sort(NULL, &tmp_list, xfs_buf_cmp);
blk_start_plug(&plug);
while (!list_empty(&tmp_list)) {
bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
ASSERT(target == bp->b_target);
xfs_bdstrat_cb(bp);
blk_finish_plug(&plug);
/* Expedite and wait for IO to complete. */
blk_run_address_space(target->bt_mapping);
/* Wait for IO to complete. */
while (!list_empty(&wait_list)) {
bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
if (!xfslogd_workqueue)
goto out_free_buf_zone;
xfsdatad_workqueue = create_workqueue("xfsdatad");
xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
if (!xfsdatad_workqueue)
goto out_destroy_xfslogd_workqueue;
xfsconvertd_workqueue = create_workqueue("xfsconvertd");
xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
if (!xfsconvertd_workqueue)
goto out_destroy_xfsdatad_workqueue;