 	sector_t block_in_file;		/* Current offset into the underlying
 					   file in dio_block units. */
 	unsigned blocks_available;	/* At block_in_file.  changes */
+	int reap_counter;		/* rate limit reaping */
 	sector_t final_block_in_request;/* doesn't change */
 	unsigned first_block_in_page;	/* doesn't change, Used only once */
 	int boundary;			/* prev block is at a boundary */
-	int reap_counter;		/* rate limit reaping */
 	get_block_t *get_block;		/* block mapping function */
-	dio_iodone_t *end_io;		/* IO completion function */
 	dio_submit_t *submit_io;	/* IO submission function */
 	loff_t logical_offset_in_bio;	/* current first logical block in bio */
 	sector_t final_block_in_bio;	/* current final block in bio + 1 */
 	sector_t next_block_for_io;	/* next block to be put under IO,
 					   in dio_blocks units */
-	struct buffer_head map_bh;	/* last get_block() result */

 	/*
 	 * Deferred addition of a page to the dio.  These variables are
 	 * protected by dio->bio_lock.
 	 */
 	struct page *cur_page;		/* The page */
 	unsigned cur_page_offset;	/* Offset into it, in bytes */
 	unsigned cur_page_len;		/* Nr of bytes at cur_page_offset */
 	sector_t cur_page_block;	/* Where it starts */
 	loff_t cur_page_fs_offset;	/* Offset in file */

-	/* BIO completion state */
-	spinlock_t bio_lock;		/* protects BIO fields below */
-	unsigned long refcount;		/* direct_io_worker() and bios */
-	struct bio *bio_list;		/* singly linked via bi_private */
-	struct task_struct *waiter;	/* waiting task (NULL if none) */
-
-	/* AIO related stuff */
-	struct kiocb *iocb;		/* kiocb */
-	int is_async;			/* is IO async ? */
-	int io_error;			/* IO error in completion path */
-	ssize_t result;			/* IO result */
-
 	/*
 	 * Page fetching state. These variables belong to dio_refill_pages().
 	 */
 	int curr_page;			/* changes */
 	int total_pages;		/* doesn't change */
 	unsigned long curr_user_address;/* changes */

 	/*
 	 * Page queue.  These variables belong to dio_refill_pages() and
 	 * dio_get_page().
 	 */
 	unsigned head;			/* next page to process */
 	unsigned tail;			/* last valid page + 1 */
+};
+
+/* dio_state communicated between submission path and end_io */
+struct dio {
+	int flags;			/* doesn't change */
+	struct inode *inode;
+	int rw;
+	loff_t i_size;			/* i_size when submitted */
+	dio_iodone_t *end_io;		/* IO completion function */
+
+	void *private;			/* copy from map_bh.b_private */
+
+	/* BIO completion state */
+	spinlock_t bio_lock;		/* protects BIO fields below */
 	int page_errors;		/* errno from get_user_pages() */
+	int is_async;			/* is IO async ? */
+	int io_error;			/* IO error in completion path */
+	unsigned long refcount;		/* direct_io_worker() and bios */
+	struct bio *bio_list;		/* singly linked via bi_private */
+	struct task_struct *waiter;	/* waiting task (NULL if none) */
+
+	/* AIO related stuff */
+	struct kiocb *iocb;		/* kiocb */
+	ssize_t result;			/* IO result */

 	/*
 	 * pages[] (and any fields placed after it) are not zeroed out at
 	 * allocation time.  Don't add new fields after pages[] unless you
 	 * wish that they not be zeroed.
 	 */
 	struct page *pages[DIO_PAGES];	/* page buffer */
-};
+} ____cacheline_aligned_in_smp;
+
+static struct kmem_cache *dio_cache __read_mostly;
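The pages[] comment is what makes dio allocation cheap: only the head of the structure up to pages[] needs clearing on each request. A minimal sketch of the idea (the actual clearing happens at the allocation site later in this file; the helper name dio_alloc is hypothetical):

#include <linux/slab.h>
#include <linux/string.h>
#include <linux/stddef.h>

/* Sketch: zero everything before pages[].  pages[] is always written
 * by dio_refill_pages() before it is read, so clearing DIO_PAGES page
 * pointers on every direct I/O would be wasted work. */
static struct dio *dio_alloc(void)
{
	struct dio *dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);

	if (dio)
		memset(dio, 0, offsetof(struct dio, pages));
	return dio;
}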
+static void __inode_dio_wait(struct inode *inode)
+{
+	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
+	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
+
+	do {
+		prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(&inode->i_dio_count))
+			schedule();
+	} while (atomic_read(&inode->i_dio_count));
+	finish_wait(wq, &q.wait);
+}
+
+/**
+ * inode_dio_wait - wait for outstanding DIO requests to finish
+ * @inode: inode to wait for
+ *
+ * Waits for all pending direct I/O requests to finish so that we can
+ * proceed with a truncate or equivalent operation.
+ *
+ * Must be called under a lock that serializes taking new references
+ * to i_dio_count, usually by inode->i_mutex.
+ */
+void inode_dio_wait(struct inode *inode)
+{
+	if (atomic_read(&inode->i_dio_count))
+		__inode_dio_wait(inode);
+}
+EXPORT_SYMBOL_GPL(inode_dio_wait);
+
+/**
+ * inode_dio_done - signal finish of a direct I/O request
+ * @inode: inode the direct I/O happens on
+ *
+ * This is called once we've finished processing a direct I/O request,
+ * and is used to wake up callers waiting for direct I/O to be quiesced.
+ */
+void inode_dio_done(struct inode *inode)
+{
+	if (atomic_dec_and_test(&inode->i_dio_count))
+		wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
+}
+EXPORT_SYMBOL_GPL(inode_dio_done);
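inode_dio_wait()/inode_dio_done() replace the old i_alloc_sem handshake. A hedged sketch of the intended caller, a filesystem quiescing direct I/O before shrinking a file (myfs_setattr is hypothetical; it assumes setattr runs under inode->i_mutex, which serializes new i_dio_count references as the kernel-doc above requires):

static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode)) {
		/* Drain all in-flight direct I/O before changing i_size. */
		inode_dio_wait(inode);
		truncate_setsize(inode, attr->ia_size);
	}

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
}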
 /*
  * How many pages are in the queue?
  */
-static inline unsigned dio_pages_present(struct dio *dio)
+static inline unsigned dio_pages_present(struct dio_submit *sdio)
 {
-	return dio->tail - dio->head;
+	return sdio->tail - sdio->head;
 }
 /*
  * Go grab and pin some userspace pages.   Typically we'll get 64 at a time.
  */
-static int dio_refill_pages(struct dio *dio)
+static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 {
 	int ret;
 	int nr_pages;

-	nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
+	nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES);
 	ret = get_user_pages_fast(
-		dio->curr_user_address,		/* Where from? */
+		sdio->curr_user_address,	/* Where from? */
 		nr_pages,			/* How many pages? */
 		dio->rw == READ,		/* Write to memory? */
 		&dio->pages[0]);		/* Put results here */

-	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
+	if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
 		struct page *page = ZERO_PAGE(0);
 		/*
 		 * A memory fault, but the filesystem has some outstanding
 		 * mapped blocks.  We need to use those blocks up to avoid
 		 * leaking stale data in the file.
 		 */
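The excerpt stops inside that fallback; for context, the remainder of the branch in the post-patch file substitutes the shared zero page so that a write still consumes its already-mapped blocks instead of leaking stale on-disk data (reconstructed from the same kernel generation, not part of the hunks above):

		if (dio->page_errors == 0)
			dio->page_errors = ret;
		page_cache_get(page);
		dio->pages[0] = page;
		sdio->head = 0;
		sdio->tail = 1;
		ret = 0;
		goto out;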
 /*
  * Get another userspace page.  Returns an ERR_PTR on error.  Pages are
  * buffered inside the dio so that we can call get_user_pages() against a
  * decent number of pages, less frequently.  To provide nicer use of the
  * L1 cache.
  */
-static struct page *dio_get_page(struct dio *dio)
+static inline struct page *dio_get_page(struct dio *dio,
+					struct dio_submit *sdio)
 {
-	if (dio_pages_present(dio) == 0) {
+	if (dio_pages_present(sdio) == 0) {
 		int ret;

-		ret = dio_refill_pages(dio);
+		ret = dio_refill_pages(dio, sdio);
 		if (ret)
 			return ERR_PTR(ret);
-		BUG_ON(dio_pages_present(dio) == 0);
+		BUG_ON(dio_pages_present(sdio) == 0);
 	}
-	return dio->pages[dio->head++];
+	return dio->pages[sdio->head++];
 }
 	if (dio->end_io && dio->result) {
 		dio->end_io(dio->iocb, offset, transferred,
-			    dio->map_bh.b_private, ret, is_async);
-	} else if (is_async) {
-		aio_complete(dio->iocb, ret, 0);
+			    dio->private, ret, is_async);
+	} else {
+		if (is_async)
+			aio_complete(dio->iocb, ret, 0);
+		inode_dio_done(dio->inode);
 	}
-
-	if (dio->flags & DIO_LOCKING)
-		/* lockdep: non-owner release */
-		up_read_non_owner(&dio->inode->i_alloc_sem);
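When a filesystem supplies end_io, dio_complete() delegates, and the callback inherits the duties of the else-branch above: completing the kiocb for AIO and dropping i_dio_count. A hedged sketch (myfs_end_dio is hypothetical; the signature follows dio_iodone_t in this kernel generation):

static void myfs_end_dio(struct kiocb *iocb, loff_t offset,
			 ssize_t bytes, void *private, int ret, bool is_async)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;

	/* Post-process [offset, offset + bytes), e.g. convert unwritten
	 * extents allocated for this write; 'private' carries the
	 * map_bh.b_private cookie saved by the submission path. */

	/* Mirror dio_complete()'s else-branch: the callback now owns
	 * AIO completion and the i_dio_count drop. */
	if (is_async)
		aio_complete(iocb, ret, 0);
	inode_dio_done(inode);
}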
 	if (dio->is_async && dio->rw == READ)
 		bio_set_pages_dirty(bio);

-	if (dio->submit_io)
-		dio->submit_io(dio->rw, bio, dio->inode,
-			       dio->logical_offset_in_bio);
+	if (sdio->submit_io)
+		sdio->submit_io(dio->rw, bio, dio->inode,
+				sdio->logical_offset_in_bio);
 	else
 		submit_bio(dio->rw, bio);

-	dio->bio = NULL;
-	dio->boundary = 0;
-	dio->logical_offset_in_bio = 0;
+	sdio->bio = NULL;
+	sdio->boundary = 0;
+	sdio->logical_offset_in_bio = 0;
 }
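The submit_io hook lets a filesystem intercept each direct-I/O bio before it reaches the block layer (btrfs uses this for its own submission path). A hedged pass-through sketch (myfs_submit_dio_bio is hypothetical; the signature follows dio_submit_t):

static void myfs_submit_dio_bio(int rw, struct bio *bio,
				struct inode *inode, loff_t file_offset)
{
	/* Inspect or account for the bio here, then hand it on. */
	pr_debug("myfs: dio bio ino=%lu file_offset=%lld\n",
		 inode->i_ino, (long long)file_offset);
	submit_bio(rw, bio);
}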
 /*
  * Release any resources in case of a failure
  */
-static void dio_cleanup(struct dio *dio)
+static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
 {
-	while (dio_pages_present(dio))
-		page_cache_release(dio_get_page(dio));
+	while (dio_pages_present(sdio))
+		page_cache_release(dio_get_page(dio, sdio));
 }
 /*
  * There is no bio.  Make one now.
  */
-static int dio_new_bio(struct dio *dio, sector_t start_sector)
+static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
+		sector_t start_sector, struct buffer_head *map_bh)
 {
 	sector_t sector;
 	int ret, nr_pages;

-	ret = dio_bio_reap(dio);
+	ret = dio_bio_reap(dio, sdio);
 	if (ret)
 		goto out;
-	sector = start_sector << (dio->blkbits - 9);
-	nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
+	sector = start_sector << (sdio->blkbits - 9);
+	nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev));
 	nr_pages = min(nr_pages, BIO_MAX_PAGES);
 	BUG_ON(nr_pages <= 0);
-	dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
-	dio->boundary = 0;
+	dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
+	sdio->boundary = 0;
 out:
 	return ret;
 }
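Worked example for the sector conversion above: sectors are 512 bytes, so the shift is blkbits - 9. Assuming 4096-byte dio blocks (blkbits == 12), block 100 starts at 512-byte sector 100 << 3 == 800. Note that sdio->blkbits is the dio block size, which for unaligned I/O is finer than the filesystem's block size.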
 * Return zero on success.  Non-zero means the caller needs to start a new BIO.
 */
-static int dio_bio_add_page(struct dio *dio)
+static inline int dio_bio_add_page(struct dio_submit *sdio)
 {
 	int ret;

-	ret = bio_add_page(dio->bio, dio->cur_page,
-			dio->cur_page_len, dio->cur_page_offset);
-	if (ret == dio->cur_page_len) {
+	ret = bio_add_page(sdio->bio, sdio->cur_page,
+			sdio->cur_page_len, sdio->cur_page_offset);
+	if (ret == sdio->cur_page_len) {
 		/*
 		 * Decrement count only if we are done with this page
 		 */
-		if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE)
-			dio->pages_in_io--;
-		page_cache_get(dio->cur_page);
-		dio->final_block_in_bio = dio->cur_page_block +
-			(dio->cur_page_len >> dio->blkbits);
+		if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE)
+			sdio->pages_in_io--;
+		page_cache_get(sdio->cur_page);
+		sdio->final_block_in_bio = sdio->cur_page_block +
+			(sdio->cur_page_len >> sdio->blkbits);
 		ret = 0;
 	} else {
 		ret = 1;
 	}
 	return ret;
 }
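The test against cur_page_len works because bio_add_page() returns the number of bytes it accepted: the whole chunk, or nothing when the bio is already full or the device's segment restrictions refuse it. On a short return the chunk is simply not in the bio, and the caller dio_send_cur_page() responds by submitting the current bio and retrying against a freshly allocated one, asserting (BUG_ON) that an empty bio must accept the chunk.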
 		 * be the next logical offset in the bio, submit the bio we
 		 * have.
 		 */
-		if (dio->final_block_in_bio != dio->cur_page_block ||
+		if (sdio->final_block_in_bio != sdio->cur_page_block ||
 		    cur_offset != bio_next_offset)
-			dio_bio_submit(dio);
+			dio_bio_submit(dio, sdio);
 		/*
 		 * Submit now if the underlying fs is about to perform a
 		 * metadata read
 		 */
-		else if (dio->boundary)
-			dio_bio_submit(dio);
+		else if (sdio->boundary)
+			dio_bio_submit(dio, sdio);
 	}

-	if (dio->bio == NULL) {
-		ret = dio_new_bio(dio, dio->cur_page_block);
+	if (sdio->bio == NULL) {
+		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
 		if (ret)
 			goto out;
 	}

-	if (dio_bio_add_page(dio) != 0) {
-		dio_bio_submit(dio);
-		ret = dio_new_bio(dio, dio->cur_page_block);
+	if (dio_bio_add_page(sdio) != 0) {
+		dio_bio_submit(dio, sdio);
+		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
 		if (ret == 0) {
-			ret = dio_bio_add_page(dio);
+			ret = dio_bio_add_page(sdio);
 			BUG_ON(ret != 0);
 		}
 	}
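A concrete instance of the contiguity test above: with 4096-byte blocks, a bio already covering file range [0, 4096) has bio_next_offset == 4096. If the next deferred page covers file offset 8192 (a hole at [4096, 8192) was skipped), then cur_offset != bio_next_offset even when the two extents happen to be physically adjacent on disk, so the bio is submitted rather than grown. The check exists because some filesystems (historically btrfs, which is why this function compares logical offsets at all) cannot accept one bio spanning a logical discontinuity.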
 	/*
 	 * Can we just grow the current page's presence in the dio?
 	 */
-	if (	(dio->cur_page == page) &&
-		(dio->cur_page_offset + dio->cur_page_len == offset) &&
-		(dio->cur_page_block +
-			(dio->cur_page_len >> dio->blkbits) == blocknr)) {
-		dio->cur_page_len += len;
+	if (sdio->cur_page == page &&
+	    sdio->cur_page_offset + sdio->cur_page_len == offset &&
+	    sdio->cur_page_block +
+	    (sdio->cur_page_len >> sdio->blkbits) == blocknr) {
+		sdio->cur_page_len += len;

 		/*
-		 * If dio->boundary then we want to schedule the IO now to
+		 * If sdio->boundary then we want to schedule the IO now to
 		 * avoid metadata seeks.
 		 */
-		if (dio->boundary) {
-			ret = dio_send_cur_page(dio);
-			page_cache_release(dio->cur_page);
-			dio->cur_page = NULL;
+		if (sdio->boundary) {
+			ret = dio_send_cur_page(dio, sdio, map_bh);
+			page_cache_release(sdio->cur_page);
+			sdio->cur_page = NULL;
 		}
 		goto out;
 	}

 	/*
 	 * If there's a deferred page already there then send it.
 	 */
-	if (dio->cur_page) {
-		ret = dio_send_cur_page(dio);
-		page_cache_release(dio->cur_page);
-		dio->cur_page = NULL;
+	if (sdio->cur_page) {
+		ret = dio_send_cur_page(dio, sdio, map_bh);
+		page_cache_release(sdio->cur_page);
+		sdio->cur_page = NULL;
 		if (ret)
 			goto out;
 	}

 	page_cache_get(page);		/* It is in dio */
-	dio->cur_page = page;
-	dio->cur_page_offset = offset;
-	dio->cur_page_len = len;
-	dio->cur_page_block = blocknr;
-	dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
+	sdio->cur_page = page;
+	sdio->cur_page_offset = offset;
+	sdio->cur_page_len = len;
+	sdio->cur_page_block = blocknr;
+	sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
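A short trace of the grow path, assuming 512-byte dio blocks (blkbits == 9): the first call with (page P, offset 0, len 512, blocknr 100) takes the install path at the bottom and records cur_page_len = 512, cur_page_block = 100. The next call with (P, 512, 512, 101) satisfies all three grow conditions, since offset equals cur_page_offset + cur_page_len and blocknr equals cur_page_block + (cur_page_len >> 9) == 101, so cur_page_len simply becomes 1024 and no bio is built or submitted yet.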
 /*
  * `end' is zero if we're doing the start of the IO, 1 at the end of the
  * IO.
  */
-static void dio_zero_block(struct dio *dio, int end)
+static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio,
+		int end, struct buffer_head *map_bh)
 {
 	unsigned dio_blocks_per_fs_block;
 	unsigned this_chunk_blocks;	/* In dio_blocks */
 	unsigned this_chunk_bytes;
 	struct page *page;

-	dio->start_zero_done = 1;
-	if (!dio->blkfactor || !buffer_new(&dio->map_bh))
+	sdio->start_zero_done = 1;
+	if (!sdio->blkfactor || !buffer_new(map_bh))
 		return;

-	dio_blocks_per_fs_block = 1 << dio->blkfactor;
-	this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1);
+	dio_blocks_per_fs_block = 1 << sdio->blkfactor;
+	this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1);

 	if (!this_chunk_blocks)
 		return;

 	/*
 	 * We need to zero out part of an fs block.  It is either at the
 	 * beginning or the end of the fs block.
 	 */
 	if (end)
 		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;

-	this_chunk_bytes = this_chunk_blocks << dio->blkbits;
+	this_chunk_bytes = this_chunk_blocks << sdio->blkbits;

 	page = ZERO_PAGE(0);
-	if (submit_page_section(dio, page, 0, this_chunk_bytes,
-				dio->next_block_for_io))
+	if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes,
+				sdio->next_block_for_io, map_bh))
 		return;

-	dio->next_block_for_io += this_chunk_blocks;
+	sdio->next_block_for_io += this_chunk_blocks;
 }
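Worked numbers for the masking above: with 512-byte dio blocks inside 4096-byte fs blocks, blkfactor == 3 and dio_blocks_per_fs_block == 8. A write beginning at dio block 13 of the file gives this_chunk_blocks = 13 & 7 = 5 at the start (end == 0), so the first 5 sub-blocks (2560 bytes) of the newly allocated fs block are zeroed ahead of the data. At the end of a write finishing 5 sub-blocks into an fs block, end == 1 flips it to 8 - 5 = 3 trailing sub-blocks (1536 bytes) of zeroes.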
 * it should set b_size to PAGE_SIZE or more inside get_block().  This gives
 * fine alignment but still allows this function to work in PAGE_SIZE units.
 */
-static int do_direct_IO(struct dio *dio)
+static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
+			struct buffer_head *map_bh)
 {
-	const unsigned blkbits = dio->blkbits;
+	const unsigned blkbits = sdio->blkbits;
 	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
 	struct page *page;
 	unsigned block_in_page;
-	struct buffer_head *map_bh = &dio->map_bh;
 	int ret = 0;

 	/* The I/O can start at any block offset within the first page */
-	block_in_page = dio->first_block_in_page;
+	block_in_page = sdio->first_block_in_page;

-	while (dio->block_in_file < dio->final_block_in_request) {
-		page = dio_get_page(dio);
+	while (sdio->block_in_file < sdio->final_block_in_request) {
+		page = dio_get_page(dio, sdio);
 		if (IS_ERR(page)) {
 			ret = PTR_ERR(page);
 			goto out;
 		}

 			if (!buffer_mapped(map_bh))
 				goto do_holes;

-			dio->blocks_available =
-					map_bh->b_size >> dio->blkbits;
-			dio->next_block_for_io =
-					map_bh->b_blocknr << dio->blkfactor;
+			sdio->blocks_available =
+					map_bh->b_size >> sdio->blkbits;
+			sdio->next_block_for_io =
+					map_bh->b_blocknr << sdio->blkfactor;
 			if (buffer_new(map_bh))
-				clean_blockdev_aliases(dio);
+				clean_blockdev_aliases(dio, map_bh);

-			if (!dio->blkfactor)
+			if (!sdio->blkfactor)
 				goto do_holes;

-			blkmask = (1 << dio->blkfactor) - 1;
-			dio_remainder = (dio->block_in_file & blkmask);
+			blkmask = (1 << sdio->blkfactor) - 1;
+			dio_remainder = (sdio->block_in_file & blkmask);

 			/*
 			 * If we are at the start of IO and that IO

 			 * is finer than the underlying fs, go check to see if
 			 * we must zero out the start of this block.
 			 */
-			if (unlikely(dio->blkfactor && !dio->start_zero_done))
-				dio_zero_block(dio, 0);
+			if (unlikely(sdio->blkfactor && !sdio->start_zero_done))
+				dio_zero_block(dio, sdio, 0, map_bh);

 			/*
 			 * Work out, in this_chunk_blocks, how much disk we
 			 * can add to this page
 			 */
-			this_chunk_blocks = dio->blocks_available;
+			this_chunk_blocks = sdio->blocks_available;
 			u = (PAGE_SIZE - offset_in_page) >> blkbits;
 			if (this_chunk_blocks > u)
 				this_chunk_blocks = u;
-			u = dio->final_block_in_request - dio->block_in_file;
+			u = sdio->final_block_in_request - sdio->block_in_file;
 			if (this_chunk_blocks > u)
 				this_chunk_blocks = u;
 			this_chunk_bytes = this_chunk_blocks << blkbits;
 			BUG_ON(this_chunk_bytes == 0);

-			dio->boundary = buffer_boundary(map_bh);
-			ret = submit_page_section(dio, page, offset_in_page,
-				this_chunk_bytes, dio->next_block_for_io);
+			sdio->boundary = buffer_boundary(map_bh);
+			ret = submit_page_section(dio, sdio, page,
+						  offset_in_page,
+						  this_chunk_bytes,
+						  sdio->next_block_for_io,
+						  map_bh);
 			if (ret) {
 				page_cache_release(page);
 				goto out;
 			}
-			dio->next_block_for_io += this_chunk_blocks;
+			sdio->next_block_for_io += this_chunk_blocks;

-			dio->block_in_file += this_chunk_blocks;
+			sdio->block_in_file += this_chunk_blocks;
 			block_in_page += this_chunk_blocks;
-			dio->blocks_available -= this_chunk_blocks;
+			sdio->blocks_available -= this_chunk_blocks;
 next_block:
-			BUG_ON(dio->block_in_file > dio->final_block_in_request);
-			if (dio->block_in_file == dio->final_block_in_request)
+			BUG_ON(sdio->block_in_file > sdio->final_block_in_request);
+			if (sdio->block_in_file == sdio->final_block_in_request)
 				break;
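The header comment above asks filesystems to map as much as possible per call by returning a large b_size. A hedged toy get_block illustrating the protocol (myfs_get_block and MYFS_DATA_START are invented; a real implementation walks its extent or allocation structures). On entry bh_result->b_size is the length the caller would like mapped; leaving a multi-block b_size on exit lets do_direct_IO() consume many blocks per get_block() call:

#include <linux/fs.h>
#include <linux/buffer_head.h>

#define MYFS_DATA_START	8192	/* hypothetical fixed data area */

static int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create)
{
	/* Toy layout: file block i lives at disk block DATA_START + i,
	 * so any requested span is contiguous; keep the caller's b_size
	 * unchanged to map the whole span in one call. */
	map_bh(bh_result, inode->i_sb, MYFS_DATA_START + iblock);
	return 0;
}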
-/*
- * Releases both i_mutex and i_alloc_sem
- */
-static ssize_t
-direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
-	const struct iovec *iov, loff_t offset, unsigned long nr_segs,
-	unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
-	dio_submit_t submit_io, struct dio *dio)
+static inline int drop_refcount(struct dio *dio)
 {
-	unsigned long user_addr;
+	int ret2;
 	unsigned long flags;
-	dio->blkbits = blkbits;
-	dio->blkfactor = inode->i_blkbits - blkbits;
-	dio->block_in_file = offset >> blkbits;
-
-	dio->get_block = get_block;
-	dio->end_io = end_io;
-	dio->submit_io = submit_io;
-	dio->final_block_in_bio = -1;
-	dio->next_block_for_io = -1;
-
-	dio->i_size = i_size_read(inode);
-
-	spin_lock_init(&dio->bio_lock);
-
-	/*
-	 * In case of non-aligned buffers, we may need 2 more
-	 * pages since we need to zero out first and last block.
-	 */
-	if (unlikely(dio->blkfactor))
-		dio->pages_in_io = 2;
-
-	for (seg = 0; seg < nr_segs; seg++) {
-		user_addr = (unsigned long)iov[seg].iov_base;
-		dio->pages_in_io +=
-			((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE
-				- user_addr/PAGE_SIZE);
-	}
-
-	for (seg = 0; seg < nr_segs; seg++) {
-		user_addr = (unsigned long)iov[seg].iov_base;
-		dio->size += bytes = iov[seg].iov_len;
-
-		/* Index into the first page of the first block */
-		dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
-		dio->final_block_in_request = dio->block_in_file +
-						(bytes >> blkbits);
-		/* Page fetching state */
-		dio->head = 0;
-		dio->tail = 0;
-		dio->curr_page = 0;
-
-		dio->total_pages = 0;
-		if (user_addr & (PAGE_SIZE-1)) {
-			dio->total_pages++;
-			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
-		}
-		dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
-		dio->curr_user_address = user_addr;
-
-		ret = do_direct_IO(dio);
-
-		dio->result += iov[seg].iov_len -
-			((dio->final_block_in_request - dio->block_in_file) <<
-					blkbits);
-
-		if (ret) {
-			dio_cleanup(dio);
-			break;
-		}
-	} /* end iovec loop */
-
-	if (ret == -ENOTBLK) {
-		/*
-		 * The remaining part of the request will be
-		 * handled by buffered I/O when we return
-		 */
-		ret = 0;
-	}
-	/*
-	 * There may be some unwritten disk at the end of a part-written
-	 * fs-block-sized block.  Go zero that now.
-	 */
-	dio_zero_block(dio, 1);
-
-	if (dio->cur_page) {
-		ret2 = dio_send_cur_page(dio);
-		if (ret == 0)
-			ret = ret2;
-		page_cache_release(dio->cur_page);
-		dio->cur_page = NULL;
-	}
-	if (dio->bio)
-		dio_bio_submit(dio);
-
-	/*
-	 * It is possible that we return short IO due to end of file.
-	 * In that case, we need to release all the pages we got hold on.
-	 */
-	dio_cleanup(dio);
-
-	/*
-	 * All block lookups have been performed.  For READ requests
-	 * we can let i_mutex go now that it's achieved its purpose
-	 * of protecting us from looking up uninitialized blocks.
-	 */
-	if (rw == READ && (dio->flags & DIO_LOCKING))
-		mutex_unlock(&dio->inode->i_mutex);
-
-	/*
-	 * The only time we want to leave bios in flight is when a successful
-	 * partial aio read or full aio write have been set up.  In that case
-	 * bio completion will call aio_complete.  The only time it's safe to
-	 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
-	 * This had *better* be the only place that raises -EIOCBQUEUED.
-	 */
-	BUG_ON(ret == -EIOCBQUEUED);
-	if (dio->is_async && ret == 0 && dio->result &&
-	    ((rw & READ) || (dio->result == dio->size)))
-		ret = -EIOCBQUEUED;
-
-	if (ret != -EIOCBQUEUED)
-		dio_await_completion(dio);
-
 	/*
 	 * Sync will always be dropping the final ref and completing the
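Only drop_refcount()'s opening and this shared comment line survive the extraction; its full body in the post-patch file is the old tail of direct_io_worker() factored out (reconstructed from the same kernel generation for context):

	spin_lock_irqsave(&dio->bio_lock, flags);
	ret2 = --dio->refcount;
	spin_unlock_irqrestore(&dio->bio_lock, flags);
	return ret2;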
 * For writes this function is called under i_mutex and returns with
 * i_mutex held, for reads, i_mutex is not held on entry, but it is
 * taken and dropped again before returning.
-* For reads and writes i_alloc_sem is taken in shared mode and released
-* on I/O completion (which may happen asynchronously after returning to
-* the caller).
 *
 * - if the flags value does NOT contain DIO_LOCKING we don't use any
 *   internal locking but rather rely on the filesystem to synchronize
 *   direct I/O reads/writes versus each other and truncate.
-*   For reads and writes both i_mutex and i_alloc_sem are not held on
-*   entry and are never taken.
+*
+* To help with locking against truncate we increment the i_dio_count
+* counter before starting direct I/O, and decrement it once we are done.
+* Truncate can wait for it to reach zero to provide exclusion.  It is
+* expected that filesystems provide exclusion between new direct I/O
+* and truncates.  For DIO_LOCKING filesystems this is done by i_mutex,
+* but other filesystems need to take care of this on their own.
+*
+* NOTE: if you pass "sdio" to anything by pointer make sure that function
+* is always inlined. Otherwise gcc is unable to split the structure into
+* individual fields and will generate much worse code. This is important
+* for the whole file.
+*/
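For orientation, a typical caller of the function below: a filesystem's ->direct_IO method wires its get_block into __blockdev_direct_IO. A sketch modeled on the blockdev_direct_IO() convenience wrapper (myfs_direct_IO and myfs_get_block are hypothetical, and the DIO_LOCKING flag choice is an assumption):

static ssize_t myfs_direct_IO(int rw, struct kiocb *iocb,
			      const struct iovec *iov, loff_t offset,
			      unsigned long nr_segs)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;

	return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
				    iov, offset, nr_segs, myfs_get_block,
				    NULL, NULL, DIO_LOCKING);
}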
 ssize_t
 __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset,
 	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
 	dio_submit_t submit_io,	int flags)

 	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
 		(end > i_size_read(inode)));

-	retval = direct_io_worker(rw, iocb, inode, iov, offset,
-				nr_segs, blkbits, get_block, end_io,
-				submit_io, dio);
+	retval = 0;
+
+	dio->inode = inode;
+	dio->rw = rw;
+	sdio.blkbits = blkbits;
+	sdio.blkfactor = inode->i_blkbits - blkbits;
+	sdio.block_in_file = offset >> blkbits;
+
+	sdio.get_block = get_block;
+	dio->end_io = end_io;
+	sdio.submit_io = submit_io;
+	sdio.final_block_in_bio = -1;
+	sdio.next_block_for_io = -1;
+
+	dio->iocb = iocb;
+	dio->i_size = i_size_read(inode);
+
+	spin_lock_init(&dio->bio_lock);
+	dio->refcount = 1;
+
+	/*
+	 * In case of non-aligned buffers, we may need 2 more
+	 * pages since we need to zero out first and last block.
+	 */
+	if (unlikely(sdio.blkfactor))
+		sdio.pages_in_io = 2;
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		user_addr = (unsigned long)iov[seg].iov_base;
+		sdio.pages_in_io +=
+			((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
+				PAGE_SIZE - user_addr / PAGE_SIZE);
+	}
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		user_addr = (unsigned long)iov[seg].iov_base;
+		sdio.size += bytes = iov[seg].iov_len;
+
+		/* Index into the first page of the first block */
+		sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
+		sdio.final_block_in_request = sdio.block_in_file +
+						(bytes >> blkbits);
+		/* Page fetching state */
+		sdio.head = 0;
+		sdio.tail = 0;
+		sdio.curr_page = 0;
+
+		sdio.total_pages = 0;
+		if (user_addr & (PAGE_SIZE-1)) {
+			sdio.total_pages++;
+			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
+		}
+		sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+		sdio.curr_user_address = user_addr;
+
+		retval = do_direct_IO(dio, &sdio, &map_bh);
+
+		dio->result += iov[seg].iov_len -
+			((sdio.final_block_in_request - sdio.block_in_file) <<
+					blkbits);
+
+		if (retval) {
+			dio_cleanup(dio, &sdio);
+			break;
+		}
+	} /* end iovec loop */
+
+	if (retval == -ENOTBLK) {
+		/*
+		 * The remaining part of the request will be
+		 * handled by buffered I/O when we return
+		 */
+		retval = 0;
+	}
+	/*
+	 * There may be some unwritten disk at the end of a part-written
+	 * fs-block-sized block.  Go zero that now.
+	 */
+	dio_zero_block(dio, &sdio, 1, &map_bh);
+
+	if (sdio.cur_page) {
+		ssize_t ret2;
+
+		ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
+		if (retval == 0)
+			retval = ret2;
+		page_cache_release(sdio.cur_page);
+		sdio.cur_page = NULL;
+	}
+	if (sdio.bio)
+		dio_bio_submit(dio, &sdio);
+
+	/*
+	 * It is possible that we return short IO due to end of file.
+	 * In that case, we need to release all the pages we got hold on.
+	 */
+	dio_cleanup(dio, &sdio);
+
+	/*
+	 * All block lookups have been performed.  For READ requests
+	 * we can let i_mutex go now that it's achieved its purpose
+	 * of protecting us from looking up uninitialized blocks.
+	 */
+	if (rw == READ && (dio->flags & DIO_LOCKING))
+		mutex_unlock(&dio->inode->i_mutex);
+
+	/*
+	 * The only time we want to leave bios in flight is when a successful
+	 * partial aio read or full aio write have been set up.  In that case
+	 * bio completion will call aio_complete.  The only time it's safe to
+	 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
+	 * This had *better* be the only place that raises -EIOCBQUEUED.
+	 */
+	BUG_ON(retval == -EIOCBQUEUED);
+	if (dio->is_async && retval == 0 && dio->result &&
+	    ((rw & READ) || (dio->result == sdio.size)))
+		retval = -EIOCBQUEUED;
+
+	if (retval != -EIOCBQUEUED)
+		dio_await_completion(dio);
+
+	if (drop_refcount(dio) == 0) {
+		retval = dio_complete(dio, offset, retval, false);
+		kmem_cache_free(dio_cache, dio);
+	} else
+		BUG_ON(retval != -EIOCBQUEUED);

 out:
 	return retval;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);

+static __init int dio_init(void)
+{
+	dio_cache = KMEM_CACHE(dio, SLAB_PANIC);
+	return 0;
+}
+module_init(dio_init)
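KMEM_CACHE() is shorthand: the line above is equivalent to creating the slab from the structure's own name, size, and alignment, i.e.:

	dio_cache = kmem_cache_create("dio", sizeof(struct dio),
				      __alignof__(struct dio),
				      SLAB_PANIC, NULL);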