/*
 * get_block callback used for a direct IO write landing inside i_size on
 * an extent-based file.  It overrides the create flag handed in by the
 * DIO VFS layer with EXT4_GET_BLOCKS_DIO_CREATE_EXT so that holes are
 * allocated as uninitialized extents (instead of falling back to
 * buffered IO), then starts a journal handle sized for at most
 * DIO_MAX_BLOCKS blocks and calls ext4_get_blocks().
 *
 * NOTE(review): this chunk is an incomplete extraction — the original
 * file's line numbers are interleaved below and several lines are
 * missing (opening brace, declarations of "ret" and "dio_credits",
 * the error/out paths, closing brace).  Code is kept byte-identical;
 * only comments were added.
 */
3449
static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
3450
struct buffer_head *bh_result, int create)
3452
handle_t *handle = NULL;
3454
/* bh_result->b_size is the request size in bytes; convert to blocks */
unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3457
ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
3458
inode->i_ino, create);
3460
* DIO VFS code passes create = 0 flag for write to
3461
* the middle of file. It does this to avoid block
3462
* allocation for holes, to prevent expose stale data
3463
* out when there is parallel buffered read (which does
3464
* not hold the i_mutex lock) while direct IO write has
3465
* not completed. DIO request on holes finally falls back
3466
* to buffered IO for this reason.
3468
* For ext4 extent based file, since we support fallocate,
3469
* new allocated extent as uninitialized, for holes, we
3470
* could fallocate blocks for holes, thus parallel
3471
* buffered IO read will zero out the page when read on
3472
* a hole while parallel DIO write to the hole has not completed.
3474
* when we come here, we know it's a direct IO write to
3475
* to the middle of file (<i_size)
3476
* so it's safe to override the create flag from VFS.
3478
create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
3480
/* bound the allocation so the journal credits stay reasonable */
if (max_blocks > DIO_MAX_BLOCKS)
3481
max_blocks = DIO_MAX_BLOCKS;
3482
dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3483
handle = ext4_journal_start(inode, dio_credits);
3484
if (IS_ERR(handle)) {
3485
ret = PTR_ERR(handle);
3488
ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3491
/* ext4_get_blocks() returns a block count on success — report the
 * mapped size back to the DIO layer via bh_result->b_size */
bh_result->b_size = (ret << inode->i_blkbits);
3494
ext4_journal_stop(handle);
3499
/*
 * Release an ext4_io_end_t.
 * NOTE(review): the function body is missing from this extraction (the
 * interleaved original line numbers jump straight past it); only the
 * signature is visible here.  Presumably it drops the inode reference
 * taken at init time and frees the structure — confirm against the
 * full source.
 */
static void ext4_free_io_end(ext4_io_end_t *io)
3505
static void dump_aio_dio_list(struct inode * inode)
3508
struct list_head *cur, *before, *after;
3509
ext4_io_end_t *io, *io0, *io1;
3511
if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3512
ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
3516
ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
3517
list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
3520
io0 = container_of(before, ext4_io_end_t, list);
3522
io1 = container_of(after, ext4_io_end_t, list);
3524
ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3525
io, inode->i_ino, io0, io1);
3531
* check a range of space and convert unwritten extents to written.
3533
/*
 * Convert the unwritten extents covered by a completed AIO DIO
 * (io->offset .. io->offset + io->size) to written.  The "_nolock"
 * suffix and the callers (ext4_end_aio_dio_work takes i_mutex first)
 * indicate the caller is expected to hold i_mutex.
 *
 * Skips the work when the io is no longer on the completion list or is
 * not flagged DIO_AIO_UNWRITTEN.  On conversion failure it logs at
 * KERN_EMERG and leaves the io on the inode's list.
 *
 * NOTE(review): incomplete extraction — the "ret" declaration, the
 * early-return statements after the two guard ifs, and the final
 * flag-clear/return lines are missing here.
 */
static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
3535
struct inode *inode = io->inode;
3536
loff_t offset = io->offset;
3537
size_t size = io->size;
3540
ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
3541
"list->prev 0x%p\n",
3542
io, inode->i_ino, io->list.next, io->list.prev);
3544
if (list_empty(&io->list))
3547
if (io->flag != DIO_AIO_UNWRITTEN)
3550
/* only convert when the range is fully inside i_size */
if (offset + size <= i_size_read(inode))
3551
ret = ext4_convert_unwritten_extents(inode, offset, size);
3554
printk(KERN_EMERG "%s: failed to convert unwritten"
3555
"extents to written extents, error is %d"
3556
" io is still on inode %lu aio dio list\n",
3557
__func__, ret, inode->i_ino);
3561
/* clear the DIO AIO unwritten flag */
3566
* work on completed aio dio IO, to convert unwritten extents to extents
3568
/*
 * Workqueue callback (queued from ext4_end_io_dio): take i_mutex, run
 * the unwritten-to-written conversion, then unlink the io_end from the
 * per-inode completion list and free it.
 *
 * NOTE(review): incomplete extraction — the "ret" declaration and the
 * condition guarding the unlink/free (presumably "if (ret >= 0)" or
 * similar) are missing between the visible lines.
 */
static void ext4_end_aio_dio_work(struct work_struct *work)
3570
ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3571
struct inode *inode = io->inode;
3574
mutex_lock(&inode->i_mutex);
3575
ret = ext4_end_aio_dio_nolock(io);
3577
if (!list_empty(&io->list))
3578
list_del_init(&io->list);
3579
ext4_free_io_end(io);
3581
mutex_unlock(&inode->i_mutex);
3584
* This function is called from ext4_sync_file().
3586
* When AIO DIO IO is completed, the work to convert unwritten
3587
* extents to written is queued on workqueue but may not get immediately
3588
* scheduled. When fsync is called, we need to ensure the
3589
* conversion is complete before fsync returns.
3590
* The inode keeps track of a list of completed AIO from DIO path
3591
* that might needs to do the conversion. This function walks through
3592
* the list and convert the related unwritten extents to written.
3594
/*
 * NOTE(review): incomplete extraction — the declarations of "io",
 * "ret" and "ret2", the early return for the empty-list case, and the
 * error-accumulation lines between ext4_end_aio_dio_nolock() and
 * list_del_init() are missing here.  Code kept byte-identical.
 */
int flush_aio_dio_completed_IO(struct inode *inode)
3600
if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
3603
dump_aio_dio_list(inode);
3604
/* drain the list: convert each completed io, then unlink it */
while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3605
io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
3606
ext4_io_end_t, list);
3608
* Calling ext4_end_aio_dio_nolock() to convert completed
3611
* When ext4_sync_file() is called, run_queue() may already
3612
* about to flush the work corresponding to this io structure.
3613
* It will be upset if it founds the io structure related
3614
* to the work-to-be schedule is freed.
3616
* Thus we need to keep the io structure still valid here after
3617
* convertion finished. The io structure has a flag to
3618
* avoid double converting from both fsync and background work
3621
ret = ext4_end_aio_dio_nolock(io);
3625
list_del_init(&io->list);
3627
/* report the first/accumulated error, success otherwise */
return (ret2 < 0) ? ret2 : 0;
3630
/*
 * Allocate and initialise an ext4_io_end_t for an async direct IO,
 * wiring its work item to ext4_end_aio_dio_work.  Uses GFP_NOFS since
 * this is called from the filesystem write path.
 *
 * NOTE(review): incomplete extraction — the kmalloc NULL check, the
 * field initialisation (inode reference, flag, offset, size) and the
 * return statement are missing between the visible lines.
 */
static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
3632
ext4_io_end_t *io = NULL;
3634
io = kmalloc(sizeof(*io), GFP_NOFS);
3643
INIT_WORK(&io->work, ext4_end_aio_dio_work);
3644
INIT_LIST_HEAD(&io->list);
3650
/*
 * end_io callback handed to blockdev_direct_IO for async direct IO.
 * For a sync iocb or a zero-byte write there is nothing to do.  If the
 * io did not touch unwritten extents the io_end is freed immediately;
 * otherwise the (offset, size) of the completed IO is recorded, the
 * conversion work is queued on the superblock's dio_unwritten_wq, and
 * the io_end is linked on the per-inode completion list so fsync can
 * find it (see flush_aio_dio_completed_IO).
 *
 * NOTE(review): incomplete extraction — braces, the early "return"
 * statements after the two guard blocks, and the tail of the debug
 * printout's argument list are missing.
 */
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3651
ssize_t size, void *private)
3653
ext4_io_end_t *io_end = iocb->private;
3654
struct workqueue_struct *wq;
3656
/* if not async direct IO or dio with 0 bytes write, just return */
3657
if (!io_end || !size)
3660
ext_debug("ext4_end_io_dio(): io_end 0x%p"
3661
"for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
3662
iocb->private, io_end->inode->i_ino, iocb, offset,
3665
/* if not aio dio with unwritten extents, just free io and return */
3666
if (io_end->flag != DIO_AIO_UNWRITTEN){
3667
ext4_free_io_end(io_end);
3668
iocb->private = NULL;
3672
io_end->offset = offset;
3673
io_end->size = size;
3674
wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3676
/* queue the work to convert unwritten extents to written */
3677
queue_work(wq, &io_end->work);
3679
/* Add the io_end to per-inode completed aio dio list*/
3680
list_add_tail(&io_end->list,
3681
&EXT4_I(io_end->inode)->i_aio_dio_complete_list);
3682
iocb->private = NULL;
3685
* For ext4 extent files, ext4 will do direct-io write to holes,
3686
* preallocated extents, and those write extend the file, no need to
3687
* fall back to buffered IO.
3689
* For holes, we fallocate those blocks, mark them as unintialized
3690
* If those blocks were preallocated, we mark sure they are splited, but
3691
* still keep the range to write as unintialized.
3693
* The unwrritten extents will be converted to written when DIO is completed.
3694
* For async direct IO, since the IO may still pending when return, we
3695
* set up an end_io call back function, which will do the convertion
3696
* when async direct IO completed.
3698
* If the O_DIRECT write will extend the file then add this inode to the
3699
* orphan list. So recovery will truncate it back to the original size
3700
* if the machine crashes during the write.
3703
/*
 * NOTE(review): incomplete extraction — the opening/closing braces,
 * the "ret"/"err" declarations, the NULL check after
 * ext4_init_io_end(), the tail of the blockdev_direct_IO() argument
 * list (the ext4_end_io_dio callback is presumably passed there —
 * confirm), and the arguments to the final
 * ext4_convert_unwritten_extents() call are all missing between the
 * visible lines.  Code kept byte-identical; only comments added.
 */
static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3704
const struct iovec *iov, loff_t offset,
3705
unsigned long nr_segs)
3707
struct file *file = iocb->ki_filp;
3708
struct inode *inode = file->f_mapping->host;
3710
size_t count = iov_length(iov, nr_segs);
3712
loff_t final_size = offset + count;
3713
/* fast path only for non-extending writes; extending writes fall
 * through to ext4_ind_direct_IO() at the bottom */
if (rw == WRITE && final_size <= inode->i_size) {
3715
* We could direct write to holes and fallocate.
3717
* Allocated blocks to fill the hole are marked as uninitialized
3718
* to prevent paralel buffered read to expose the stale data
3719
* before DIO complete the data IO.
3721
* As to previously fallocated extents, ext4 get_block
3722
* will just simply mark the buffer mapped but still
3723
* keep the extents uninitialized.
3725
* for non AIO case, we will convert those unwritten extents
3726
* to written after return back from blockdev_direct_IO.
3728
* for async DIO, the conversion needs to be defered when
3729
* the IO is completed. The ext4 end_io callback function
3730
* will be called to take care of the conversion work.
3731
* Here for async case, we allocate an io_end structure to
3734
iocb->private = NULL;
3735
EXT4_I(inode)->cur_aio_dio = NULL;
3736
if (!is_sync_kiocb(iocb)) {
3737
iocb->private = ext4_init_io_end(inode);
3741
* we save the io structure for current async
3742
* direct IO, so that later ext4_get_blocks()
3743
* could flag the io structure whether there
3744
* is a unwritten extents needs to be converted
3745
* when IO is completed.
3747
EXT4_I(inode)->cur_aio_dio = iocb->private;
3750
ret = blockdev_direct_IO(rw, iocb, inode,
3751
inode->i_sb->s_bdev, iov,
3753
ext4_get_block_dio_write,
3756
EXT4_I(inode)->cur_aio_dio = NULL;
3758
* The io_end structure takes a reference to the inode,
3759
* that structure needs to be destroyed and the
3760
* reference to the inode need to be dropped, when IO is
3761
* complete, even with 0 byte write, or failed.
3763
* In the successful AIO DIO case, the io_end structure will be
3764
* desctroyed and the reference to the inode will be dropped
3765
* after the end_io call back function is called.
3767
* In the case there is 0 byte write, or error case, since
3768
* VFS direct IO won't invoke the end_io call back function,
3769
* we need to free the end_io structure here.
3771
if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3772
ext4_free_io_end(iocb->private);
3773
iocb->private = NULL;
3774
} else if (ret > 0 && (EXT4_I(inode)->i_state &
3775
EXT4_STATE_DIO_UNWRITTEN)) {
3778
* for non AIO case, since the IO is already
3779
* completed, we could do the convertion right here
3781
err = ext4_convert_unwritten_extents(inode,
3785
EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
3790
/* for write the the end of file case, we fall back to old way */
3791
return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3794
static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3795
const struct iovec *iov, loff_t offset,
3796
unsigned long nr_segs)
3798
struct file *file = iocb->ki_filp;
3799
struct inode *inode = file->f_mapping->host;
3801
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
3802
return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3804
return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
/*
 * Pages can be marked dirty completely asynchronously from ext4's journalling
 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do