1
/******************************************************
2
The low-level file system
6
Created 10/25/1995 Heikki Tuuri
7
*******************************************************/
12
#include "sync0sync.h"
13
#include "hash0hash.h"
16
#include "mach0data.h"
17
#include "ibuf0ibuf.h"
24
/* IMPLEMENTATION OF THE LOW-LEVEL FILE SYSTEM
25
===========================================
27
The file system is responsible for providing fast read/write access to
28
tablespaces and logs of the database. File creation and deletion is done
29
in other modules which know more of the logic of the operation, however.
31
A tablespace consists of a chain of files. The size of the files does not
32
have to be divisible by the database block size, because we may just leave
33
the last incomplete block unused. When a new file is appended to the
34
tablespace, the maximum size of the file is also specified. At the moment,
35
we think that it is best to extend the file to its maximum size already at
36
the creation of the file, because then we can avoid dynamically extending
37
the file when more space is needed for the tablespace.
39
A block's position in the tablespace is specified with a 32-bit unsigned
40
integer. The files in the chain are thought to be catenated, and the block
41
corresponding to an address n is the nth block in the catenated file (where
42
the first block is named the 0th block, and the incomplete block fragments
43
at the end of files are not taken into account). A tablespace can be extended
44
by appending a new file at the end of the chain.
46
Our tablespace concept is similar to the one of Oracle.
48
To acquire more speed in disk transfers, a technique called disk striping is
49
sometimes used. This means that logical block addresses are divided in a
50
round-robin fashion across several disks. Windows NT supports disk striping,
51
so there we do not need to support it in the database. Disk striping is
52
implemented in hardware in RAID disks. We conclude that it is not necessary
53
to implement it in the database. Oracle 7 does not support disk striping,
either.
56
Another trick used at some database sites is replacing tablespace files by
57
raw disks, that is, the whole physical disk drive, or a partition of it, is
58
opened as a single file, and it is accessed through byte offsets calculated
59
from the start of the disk or the partition. This is recommended in some
60
books on database tuning to achieve more speed in i/o. Using raw disk
61
certainly prevents the OS from fragmenting disk space, but it is not clear
62
if it really adds speed. We measured on the Pentium 100 MHz + NT + NTFS file
63
system + EIDE Conner disk only a negligible difference in speed when reading
64
from a file, versus reading from a raw disk.
66
To have fast access to a tablespace or a log file, we put the data structures
67
to a hash table. Each tablespace and log file is given a unique 32-bit
identifier.
70
Some operating systems do not support many open files at the same time,
71
though NT seems to tolerate at least 900 open files. Therefore, we put the
72
open files in an LRU-list. If we need to open another file, we may close the
73
file at the end of the LRU-list. When an i/o-operation is pending on a file,
74
the file cannot be closed. We take the file nodes with pending i/o-operations
75
out of the LRU-list and keep a count of pending operations. When an operation
76
completes, we decrement the count and return the file node to the LRU-list if
77
the count drops to zero. */
79
/* Null file address: a file address whose page field equals FIL_NULL is
treated as undefined by the null-address test in this file.
NOTE(review): presumably the first initializer below is the page field and
the second the byte offset within the page -- confirm against the
declaration of fil_addr_t. */
80
fil_addr_t fil_addr_null = {FIL_NULL, 0};
82
/* File system file node data structure */
83
typedef struct fil_node_struct fil_node_t;
84
struct fil_node_struct {
85
char* name; /* the file name or path */
86
ibool open; /* TRUE if file open */
87
os_file_t handle; /* OS handle to the file, if file open */
88
ulint size; /* size of the file in database blocks
89
(where the possible last incomplete block
is ignored) */
92
/* count of pending i/o-ops on this file */
93
UT_LIST_NODE_T(fil_node_t) chain;
94
/* link field for the file chain */
95
UT_LIST_NODE_T(fil_node_t) LRU;
96
/* link field for the LRU list */
100
/* Magic number stored in the magic_n field of a file node when the node is
appended to a space; asserted as a sanity check when the node is freed */
#define FIL_NODE_MAGIC_N 89389
102
/* File system tablespace or log data structure: let us call them by a common
name, space */
104
struct fil_space_struct {
105
char* name; /* space name */
106
ulint id; /* space id */
107
ulint purpose;/* FIL_TABLESPACE, FIL_LOG, or FIL_ARCH_LOG */
108
UT_LIST_BASE_NODE_T(fil_node_t) chain;
109
/* base node for the file chain */
110
ulint size; /* space size in pages */
111
ulint n_reserved_extents;
112
/* number of reserved free extents for
113
ongoing operations like B-tree page split */
114
hash_node_t hash; /* hash chain node */
115
rw_lock_t latch; /* latch protecting the file space storage
117
UT_LIST_NODE_T(fil_space_t) space_list;
118
/* list of all spaces */
119
ibuf_data_t* ibuf_data;
120
/* insert buffer data */
124
/* Magic number stored in the magic_n field of a space object when the space
is created; asserted as a sanity check when the space is freed */
#define FIL_SPACE_MAGIC_N 89472
126
/* The file system data structure */
128
typedef struct fil_system_struct fil_system_t;
129
struct fil_system_struct {
130
mutex_t mutex; /* The mutex protecting the system */
131
hash_table_t* spaces; /* The hash table of spaces in the
133
UT_LIST_BASE_NODE_T(fil_node_t) LRU;
134
/* base node for the LRU list of the
135
most recently used open files */
136
ulint n_open_pending; /* current number of open files with
137
pending i/o-ops on them */
138
ulint max_n_open; /* maximum allowed open files */
139
os_event_t can_open; /* this event is set to the signaled
140
state when the system is capable of
141
opening a new file, i.e.,
142
n_open_pending < max_n_open */
143
UT_LIST_BASE_NODE_T(fil_space_t) space_list;
144
/* list of all file spaces */
147
/* The file system. This variable is NULL before the module is initialized;
the module initialization routine asserts that it is still NULL and then
assigns it the object returned by fil_system_create() */
148
fil_system_t* fil_system = NULL;
150
/* The file system hash table size: passed as the hash_size argument of
fil_system_create() when the module is initialized, and from there to
hash_create() for the hash table of spaces */
151
#define FIL_SYSTEM_HASH_SIZE 500
154
/***********************************************************************
155
Reserves a right to open a single file. The right must be released with
156
fil_release_right_to_open. */
159
fil_reserve_right_to_open(void)
160
/*===========================*/
163
mutex_enter(&(fil_system->mutex));
165
if (fil_system->n_open_pending == fil_system->max_n_open) {
167
/* It is not sure we can open the file if it is closed: wait */
169
os_event_reset(fil_system->can_open);
171
mutex_exit(&(fil_system->mutex));
173
os_event_wait(fil_system->can_open);
178
fil_system->max_n_open--;
180
mutex_exit(&(fil_system->mutex));
183
/***********************************************************************
184
Releases a right to open a single file. */
187
fil_release_right_to_open(void)
188
/*===========================*/
190
mutex_enter(&(fil_system->mutex));
192
if (fil_system->n_open_pending == fil_system->max_n_open) {
194
os_event_set(fil_system->can_open);
197
fil_system->max_n_open++;
199
mutex_exit(&(fil_system->mutex));
202
/***********************************************************************
203
Returns the latch of a file space. */
208
/* out: latch protecting storage allocation */
209
ulint id) /* in: space id */
212
fil_system_t* system = fil_system;
216
mutex_enter(&(system->mutex));
218
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
220
mutex_exit(&(system->mutex));
222
return(&(space->latch));
225
/***********************************************************************
226
Returns the type of a file space. */
231
/* out: FIL_TABLESPACE or FIL_LOG */
232
ulint id) /* in: space id */
235
fil_system_t* system = fil_system;
239
mutex_enter(&(system->mutex));
241
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
243
mutex_exit(&(system->mutex));
245
return(space->purpose);
248
/***********************************************************************
249
Returns the ibuf data of a file space. */
252
fil_space_get_ibuf_data(
253
/*====================*/
254
/* out: ibuf data for this space */
255
ulint id) /* in: space id */
258
fil_system_t* system = fil_system;
262
mutex_enter(&(system->mutex));
264
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
266
mutex_exit(&(system->mutex));
268
return(space->ibuf_data);
271
/***********************************************************************
272
Appends a new file to the chain of files of a space. File must be closed. */
277
char* name, /* in: file name (file must be closed) */
278
ulint size, /* in: file size in database blocks, rounded downwards
280
ulint id) /* in: space id where to append */
285
fil_system_t* system = fil_system;
291
mutex_enter(&(system->mutex));
293
node = mem_alloc(sizeof(fil_node_t));
295
name2 = mem_alloc(ut_strlen(name) + 1);
297
ut_strcpy(name2, name);
302
node->magic_n = FIL_NODE_MAGIC_N;
305
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
309
UT_LIST_ADD_LAST(chain, space->chain, node);
311
mutex_exit(&(system->mutex));
314
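/**************************************************************************
Closes the file of a file node and removes the node from the LRU list of
open files. The node must not have pending i/o-operations. */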
320
fil_node_t* node, /* in: file node */
321
fil_system_t* system) /* in: file system */
325
ut_ad(node && system);
326
ut_ad(mutex_own(&(system->mutex)));
328
ut_a(node->n_pending == 0);
330
ret = os_file_close(node->handle);
335
/* The node is in the LRU list, remove it */
336
UT_LIST_REMOVE(LRU, system->LRU, node);
339
/***********************************************************************
340
Frees a file node object from a file system. */
345
fil_node_t* node, /* in, own: file node */
346
fil_system_t* system, /* in: file system */
347
fil_space_t* space) /* in: space where the file node is chained */
349
ut_ad(node && system && space);
350
ut_ad(mutex_own(&(system->mutex)));
351
ut_a(node->magic_n == FIL_NODE_MAGIC_N);
354
fil_node_close(node, system);
357
space->size -= node->size;
359
UT_LIST_REMOVE(chain, space->chain, node);
361
mem_free(node->name);
365
/********************************************************************
366
Drops files from the start of a file space, so that its size is cut by
the given amount. */
370
fil_space_truncate_start(
371
/*=====================*/
372
ulint id, /* in: space id */
373
ulint trunc_len) /* in: truncate by this much; it is an error
374
if this does not equal the combined size of
375
some initial files in the space */
379
fil_system_t* system = fil_system;
381
mutex_enter(&(system->mutex));
383
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
387
while (trunc_len > 0) {
389
node = UT_LIST_GET_FIRST(space->chain);
391
ut_a(node->size * UNIV_PAGE_SIZE >= trunc_len);
393
trunc_len -= node->size * UNIV_PAGE_SIZE;
395
fil_node_free(node, system, space);
398
mutex_exit(&(system->mutex));
401
/********************************************************************
402
Creates a file system object. */
407
/* out, own: file system object */
408
ulint hash_size, /* in: hash table size */
409
ulint max_n_open) /* in: maximum number of open files */
411
fil_system_t* system;
414
ut_a(max_n_open > 0);
416
system = mem_alloc(sizeof(fil_system_t));
418
mutex_create(&(system->mutex));
420
mutex_set_level(&(system->mutex), SYNC_ANY_LATCH);
422
system->spaces = hash_create(hash_size);
424
UT_LIST_INIT(system->LRU);
426
system->n_open_pending = 0;
427
system->max_n_open = max_n_open;
428
system->can_open = os_event_create(NULL);
430
UT_LIST_INIT(system->space_list);
435
/********************************************************************
436
Initializes the file system of this module. */
441
ulint max_n_open) /* in: max number of open files */
443
ut_a(fil_system == NULL);
445
fil_system = fil_system_create(FIL_SYSTEM_HASH_SIZE, max_n_open);
448
/********************************************************************
449
Initializes the insert buffer data of every tablespace in the file system. */
452
fil_ibuf_init_at_db_start(void)
453
/*===========================*/
457
space = UT_LIST_GET_FIRST(fil_system->space_list);
460
if (space->purpose == FIL_TABLESPACE) {
461
space->ibuf_data = ibuf_data_init_for_space(space->id);
464
space = UT_LIST_GET_NEXT(space_list, space);
468
/********************************************************************
469
Writes the flushed lsn and the latest archived log number to the page
470
header of the first page of a data file. */
473
fil_write_lsn_and_arch_no_to_file(
474
/*==============================*/
475
ulint space_id, /* in: space number */
476
ulint sum_of_sizes, /* in: combined size of previous files in space,
478
dulint lsn, /* in: lsn to write */
479
ulint arch_log_no) /* in: archived log number to write */
484
buf1 = mem_alloc(2 * UNIV_PAGE_SIZE);
485
buf = ut_align(buf1, UNIV_PAGE_SIZE);
487
fil_read(TRUE, space_id, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
489
mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn);
490
mach_write_to_4(buf + FIL_PAGE_ARCH_LOG_NO, arch_log_no);
492
fil_write(TRUE, space_id, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
497
/********************************************************************
498
Writes the flushed lsn and the latest archived log number to the page
499
header of the first page of each data file. */
502
fil_write_flushed_lsn_to_data_files(
503
/*================================*/
504
/* out: DB_SUCCESS or error number */
505
dulint lsn, /* in: lsn to write */
506
ulint arch_log_no) /* in: latest archived log file number */
513
mutex_enter(&(fil_system->mutex));
515
space = UT_LIST_GET_FIRST(fil_system->space_list);
518
if (space->purpose == FIL_TABLESPACE) {
521
node = UT_LIST_GET_FIRST(space->chain);
524
mutex_exit(&(fil_system->mutex));
526
err = fil_write_lsn_and_arch_no_to_file(
530
if (err != DB_SUCCESS) {
535
mutex_enter(&(fil_system->mutex));
537
sum_of_sizes += node->size;
539
node = UT_LIST_GET_NEXT(chain, node);
543
space = UT_LIST_GET_NEXT(space_list, space);
546
mutex_exit(&(fil_system->mutex));
549
/***********************************************************************
550
Reads the flushed lsn and arch no fields from a data file at database
startup. */
554
fil_read_flushed_lsn_and_arch_log_no(
555
/*=================================*/
556
os_file_t data_file, /* in: open data file */
557
ibool one_read_already, /* in: TRUE if min and max parameters
558
below already contain sensible data */
559
dulint* min_flushed_lsn, /* in/out: */
560
ulint* min_arch_log_no, /* in/out: */
561
dulint* max_flushed_lsn, /* in/out: */
562
ulint* max_arch_log_no) /* in/out: */
568
buf = ut_malloc(UNIV_PAGE_SIZE);
570
os_file_read(data_file, buf, 0, 0, UNIV_PAGE_SIZE);
572
flushed_lsn = mach_read_from_8(buf + FIL_PAGE_FILE_FLUSH_LSN);
573
arch_log_no = mach_read_from_4(buf + FIL_PAGE_ARCH_LOG_NO);
577
if (!one_read_already) {
578
*min_flushed_lsn = flushed_lsn;
579
*max_flushed_lsn = flushed_lsn;
580
*min_arch_log_no = arch_log_no;
581
*max_arch_log_no = arch_log_no;
586
if (ut_dulint_cmp(*min_flushed_lsn, flushed_lsn) > 0) {
587
*min_flushed_lsn = flushed_lsn;
589
if (ut_dulint_cmp(*max_flushed_lsn, flushed_lsn) < 0) {
590
*max_flushed_lsn = flushed_lsn;
592
if (*min_arch_log_no > arch_log_no) {
593
*min_arch_log_no = arch_log_no;
595
if (*max_arch_log_no < arch_log_no) {
596
*max_arch_log_no = arch_log_no;
600
/***********************************************************************
601
Creates a space object and puts it to the file system. */
606
char* name, /* in: space name */
607
ulint id, /* in: space id */
608
ulint purpose)/* in: FIL_TABLESPACE, or FIL_LOG if log */
612
fil_system_t* system = fil_system;
617
#ifndef UNIV_BASIC_LOG_DEBUG
618
/* Spaces with an odd id number are reserved to replicate spaces
619
used in log debugging */
621
ut_a((purpose == FIL_LOG) || (id % 2 == 0));
623
mutex_enter(&(system->mutex));
625
space = mem_alloc(sizeof(fil_space_t));
627
name2 = mem_alloc(ut_strlen(name) + 1);
629
ut_strcpy(name2, name);
633
space->purpose = purpose;
636
space->n_reserved_extents = 0;
638
UT_LIST_INIT(space->chain);
639
space->magic_n = FIL_SPACE_MAGIC_N;
641
space->ibuf_data = NULL;
643
rw_lock_create(&(space->latch));
644
rw_lock_set_level(&(space->latch), SYNC_FSP);
646
HASH_INSERT(fil_space_t, hash, system->spaces, id, space);
648
UT_LIST_ADD_LAST(space_list, system->space_list, space);
650
mutex_exit(&(system->mutex));
653
/***********************************************************************
654
Frees a space object from a file system. Closes the files in the chain
655
but does not delete them. */
660
ulint id) /* in: space id */
663
fil_node_t* fil_node;
664
fil_system_t* system = fil_system;
666
mutex_enter(&(system->mutex));
668
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
670
HASH_DELETE(fil_space_t, hash, system->spaces, id, space);
672
UT_LIST_REMOVE(space_list, system->space_list, space);
674
ut_a(space->magic_n == FIL_SPACE_MAGIC_N);
676
fil_node = UT_LIST_GET_FIRST(space->chain);
678
ut_d(UT_LIST_VALIDATE(chain, fil_node_t, space->chain));
680
while (fil_node != NULL) {
681
fil_node_free(fil_node, system, space);
683
fil_node = UT_LIST_GET_FIRST(space->chain);
686
ut_d(UT_LIST_VALIDATE(chain, fil_node_t, space->chain));
687
ut_ad(0 == UT_LIST_GET_LEN(space->chain));
689
mutex_exit(&(system->mutex));
691
mem_free(space->name);
695
/***********************************************************************
696
Returns the size of the space in pages. */
701
/* out: space size */
702
ulint id) /* in: space id */
705
fil_system_t* system = fil_system;
710
mutex_enter(&(system->mutex));
712
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
716
mutex_exit(&(system->mutex));
721
/***********************************************************************
722
Tries to reserve free extents in a file space. */
725
fil_space_reserve_free_extents(
726
/*===========================*/
727
/* out: TRUE if succeed */
728
ulint id, /* in: space id */
729
ulint n_free_now, /* in: number of free extents now */
730
ulint n_to_reserve) /* in: how many one wants to reserve */
733
fil_system_t* system = fil_system;
738
mutex_enter(&(system->mutex));
740
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
742
if (space->n_reserved_extents + n_to_reserve > n_free_now) {
745
space->n_reserved_extents += n_to_reserve;
749
mutex_exit(&(system->mutex));
754
/***********************************************************************
755
Releases free extents in a file space. */
758
fil_space_release_free_extents(
759
/*===========================*/
760
ulint id, /* in: space id */
761
ulint n_reserved) /* in: how many one reserved */
764
fil_system_t* system = fil_system;
768
mutex_enter(&(system->mutex));
770
HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
772
ut_a(space->n_reserved_extents >= n_reserved);
774
space->n_reserved_extents -= n_reserved;
776
mutex_exit(&(system->mutex));
779
/************************************************************************
780
Prepares a file node for i/o. Opens the file if it is closed. Updates the
781
pending i/o's field in the node and the system appropriately. Takes the node
782
off the LRU list if it is in the LRU list. */
785
fil_node_prepare_for_io(
786
/*====================*/
787
fil_node_t* node, /* in: file node */
788
fil_system_t* system, /* in: file system */
789
fil_space_t* space) /* in: space */
792
fil_node_t* last_node;
794
ut_ad(node && system && space);
795
ut_ad(mutex_own(&(system->mutex)));
797
if (node->open == FALSE) {
799
ut_a(node->n_pending == 0);
801
/* If too many files are open, close one */
803
if (system->n_open_pending + UT_LIST_GET_LEN(system->LRU)
804
== system->max_n_open) {
806
ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
808
last_node = UT_LIST_GET_LAST(system->LRU);
810
fil_node_close(last_node, system);
813
node->handle = os_file_create(node->name, OS_FILE_OPEN,
819
system->n_open_pending++;
822
/* File was closed: the node was not in the LRU list */
828
if (node->n_pending == 0) {
829
/* The node is in the LRU list, remove it */
831
UT_LIST_REMOVE(LRU, system->LRU, node);
833
system->n_open_pending++;
836
/* There is already a pending i/o-op on the file: the node is
837
not in the LRU list */
843
/************************************************************************
844
Updates the data structures when an i/o operation finishes. Updates the
845
pending i/os field in the node and the system appropriately. Puts the node
846
in the LRU list if there are no other pending i/os. */
849
fil_node_complete_io(
850
/*=================*/
851
fil_node_t* node, /* in: file node */
852
fil_system_t* system) /* in: file system */
856
ut_ad(mutex_own(&(system->mutex)));
857
ut_a(node->n_pending > 0);
861
if (node->n_pending == 0) {
862
/* The node must be put back to the LRU list */
863
UT_LIST_ADD_FIRST(LRU, system->LRU, node);
865
ut_a(system->n_open_pending > 0);
867
system->n_open_pending--;
869
if (system->n_open_pending == system->max_n_open - 1) {
871
os_event_set(system->can_open);
876
/************************************************************************
877
Reads or writes data. This operation is asynchronous (aio). */
882
ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE,
883
ORed to OS_FILE_LOG, if a log i/o
884
and ORed to OS_AIO_SIMULATED_WAKE_LATER
885
if simulated aio and we want to post a
886
batch of i/os; NOTE that a simulated batch
887
may introduce hidden chances of deadlocks,
888
because i/os are not actually handled until
889
all have been posted: use with great
891
ibool sync, /* in: TRUE if synchronous aio is desired */
892
ulint space_id, /* in: space id */
893
ulint block_offset, /* in: offset in number of blocks */
894
ulint byte_offset, /* in: remainder of offset in bytes; in
895
aio this must be divisible by the OS block
size */
897
ulint len, /* in: how many bytes to read; this must
898
not cross a file boundary; in aio this must
899
be a block size multiple */
900
void* buf, /* in/out: buffer where to store read data
901
or from where to write; in aio this must be
902
appropriately aligned */
903
void* message) /* in: message for aio handler if non-sync
904
aio used, else ignored */
911
fil_system_t* system;
917
is_log = type & OS_FILE_LOG;
918
type = type & ~OS_FILE_LOG;
920
wake_later = type & OS_AIO_SIMULATED_WAKE_LATER;
921
type = type & ~OS_AIO_SIMULATED_WAKE_LATER;
923
ut_ad(byte_offset < UNIV_PAGE_SIZE);
926
ut_ad((1 << UNIV_PAGE_SIZE_SHIFT) == UNIV_PAGE_SIZE);
927
ut_ad(fil_validate());
928
#ifndef UNIV_LOG_DEBUG
929
/* ibuf bitmap pages must be read in the sync aio mode: */
930
ut_ad(recv_no_ibuf_operations || (type == OS_FILE_WRITE)
931
|| !ibuf_bitmap_page(block_offset) || sync || is_log);
932
#ifdef UNIV_SYNC_DEBUG
933
ut_ad(!ibuf_inside() || is_log || (type == OS_FILE_WRITE)
934
|| ibuf_page(space_id, block_offset));
939
} else if ((type == OS_FILE_READ) && !is_log
940
&& ibuf_page(space_id, block_offset)) {
945
mode = OS_AIO_NORMAL;
950
mutex_enter(&(system->mutex));
952
if (system->n_open_pending == system->max_n_open) {
954
/* It is not sure we can open the file if it is closed: wait */
956
event = system->can_open;
957
os_event_reset(event);
959
mutex_exit(&(system->mutex));
961
os_event_wait(event);
966
HASH_SEARCH(hash, system->spaces, space_id, space,
967
space->id == space_id);
970
ut_ad((mode != OS_AIO_IBUF) || (space->purpose == FIL_TABLESPACE));
972
node = UT_LIST_GET_FIRST(space->chain);
977
if (node->size > block_offset) {
981
block_offset -= node->size;
982
node = UT_LIST_GET_NEXT(chain, node);
986
/* Open file if closed */
987
fil_node_prepare_for_io(node, system, space);
989
/* Now we have made the changes in the data structures of system */
990
mutex_exit(&(system->mutex));
992
/* Calculate the low 32 bits and the high 32 bits of the file offset */
994
offset_high = (block_offset >> (32 - UNIV_PAGE_SIZE_SHIFT));
995
offset_low = ((block_offset << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFF)
998
ut_a(node->size - block_offset >=
999
(byte_offset + len + (UNIV_PAGE_SIZE - 1)) / UNIV_PAGE_SIZE);
1003
ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0);
1004
ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0);
1006
/* Queue the aio request */
1007
ret = os_aio(type, mode | wake_later, node->name, node->handle, buf,
1008
offset_low, offset_high, len, node, message);
1011
if (mode == OS_AIO_SYNC) {
1012
/* The i/o operation is already completed when we return from
os_aio: */
1015
mutex_enter(&(system->mutex));
1017
fil_node_complete_io(node, system);
1019
mutex_exit(&(system->mutex));
1021
ut_ad(fil_validate());
1025
/************************************************************************
1026
Reads data from a space to a buffer. Remember that the possible incomplete
1027
blocks at the end of file are ignored: they are not taken into account when
1028
calculating the byte offset within a space. */
1033
ibool sync, /* in: TRUE if synchronous aio is desired */
1034
ulint space_id, /* in: space id */
1035
ulint block_offset, /* in: offset in number of blocks */
1036
ulint byte_offset, /* in: remainder of offset in bytes; in aio
1037
this must be divisible by the OS block size */
1038
ulint len, /* in: how many bytes to read; this must not
1039
cross a file boundary; in aio this must be a
1040
block size multiple */
1041
void* buf, /* in/out: buffer where to store data read;
1042
in aio this must be appropriately aligned */
1043
void* message) /* in: message for aio handler if non-sync
1044
aio used, else ignored */
1046
fil_io(OS_FILE_READ, sync, space_id, block_offset, byte_offset, len,
1050
/************************************************************************
1051
Writes data to a space from a buffer. Remember that the possible incomplete
1052
blocks at the end of file are ignored: they are not taken into account when
1053
calculating the byte offset within a space. */
1058
ibool sync, /* in: TRUE if synchronous aio is desired */
1059
ulint space_id, /* in: space id */
1060
ulint block_offset, /* in: offset in number of blocks */
1061
ulint byte_offset, /* in: remainder of offset in bytes; in aio
1062
this must be divisible by the OS block size */
1063
ulint len, /* in: how many bytes to write; this must
1064
not cross a file boundary; in aio this must
1065
be a block size multiple */
1066
void* buf, /* in: buffer from which to write; in aio
1067
this must be appropriately aligned */
1068
void* message) /* in: message for aio handler if non-sync
1069
aio used, else ignored */
1071
fil_io(OS_FILE_WRITE, sync, space_id, block_offset, byte_offset, len,
1075
/**************************************************************************
1076
Waits for an aio operation to complete. This function is used to write the
1077
handler for completed requests. The aio array of pending requests is divided
1078
into segments (see os0file.c for more info). The thread specifies which
1079
segment it wants to wait for. */
1084
ulint segment) /* in: the number of the segment in the aio
1085
array to wait for */
1088
fil_node_t* fil_node;
1089
fil_system_t* system = fil_system;
1092
ut_ad(fil_validate());
1094
if (os_aio_use_native_aio) {
1096
ret = os_aio_windows_handle(segment, 0, &fil_node, &message);
1097
#elif defined(POSIX_ASYNC_IO)
1098
ret = os_aio_posix_handle(segment, &fil_node, &message);
1103
ret = os_aio_simulated_handle(segment, (void**) &fil_node,
1109
mutex_enter(&(system->mutex));
1111
fil_node_complete_io(fil_node, fil_system);
1113
mutex_exit(&(system->mutex));
1115
ut_ad(fil_validate());
1117
/* Do the i/o handling */
1119
if (buf_pool_is_block(message)) {
1121
buf_page_io_complete(message);
1123
log_io_complete(message);
1127
/**************************************************************************
1128
Flushes to disk possible writes cached by the OS. */
1133
ulint space_id) /* in: file space id (this can be a group of
1134
log files or a tablespace of the database) */
1136
fil_system_t* system = fil_system;
1141
mutex_enter(&(system->mutex));
1143
HASH_SEARCH(hash, system->spaces, space_id, space,
1144
space->id == space_id);
1147
node = UT_LIST_GET_FIRST(space->chain);
1151
file = node->handle;
1153
mutex_exit(&(system->mutex));
1155
/* Note that it is not certain, when we have
1156
released the mutex above, that the file of the
1157
handle is still open: we assume that the OS
1158
will not crash or trap even if we pass a handle
1159
to a closed file below in os_file_flush! */
1161
os_file_flush(file);
1163
mutex_enter(&(system->mutex));
1166
node = UT_LIST_GET_NEXT(chain, node);
1169
mutex_exit(&(system->mutex));
1172
/**************************************************************************
1173
Flushes to disk writes in file spaces of the given type possibly cached by
the OS. */
1177
fil_flush_file_spaces(
1178
/*==================*/
1179
ulint purpose) /* in: FIL_TABLESPACE, FIL_LOG */
1181
fil_system_t* system = fil_system;
1184
mutex_enter(&(system->mutex));
1186
space = UT_LIST_GET_FIRST(system->space_list);
1189
if (space->purpose == purpose) {
1190
mutex_exit(&(system->mutex));
1192
fil_flush(space->id);
1194
mutex_enter(&(system->mutex));
1197
space = UT_LIST_GET_NEXT(space_list, space);
1200
mutex_exit(&(system->mutex));
1203
/**********************************************************************
1204
Checks the consistency of the file system. */
1209
/* out: TRUE if ok */
1212
fil_node_t* fil_node;
1213
ulint pending_count = 0;
1214
fil_system_t* system;
1217
system = fil_system;
1219
mutex_enter(&(system->mutex));
1221
/* Look for spaces in the hash table */
1223
for (i = 0; i < hash_get_n_cells(system->spaces); i++) {
1225
space = HASH_GET_FIRST(system->spaces, i);
1227
while (space != NULL) {
1229
UT_LIST_VALIDATE(chain, fil_node_t, space->chain);
1231
fil_node = UT_LIST_GET_FIRST(space->chain);
1233
while (fil_node != NULL) {
1235
if (fil_node->n_pending > 0) {
1238
ut_a(fil_node->open);
1241
fil_node = UT_LIST_GET_NEXT(chain, fil_node);
1244
space = HASH_GET_NEXT(hash, space);
1248
ut_a(pending_count == system->n_open_pending);
1250
UT_LIST_VALIDATE(LRU, fil_node_t, system->LRU);
1252
fil_node = UT_LIST_GET_FIRST(system->LRU);
1254
while (fil_node != NULL) {
1256
ut_a(fil_node->n_pending == 0);
1257
ut_a(fil_node->open);
1259
fil_node = UT_LIST_GET_NEXT(LRU, fil_node);
1262
mutex_exit(&(system->mutex));
1267
/************************************************************************
1268
Returns TRUE if file address is undefined. */
1272
/* out: TRUE if undefined */
1273
fil_addr_t addr) /* in: address */
1275
if (addr.page == FIL_NULL) {
1283
/************************************************************************
1284
Accessor functions for a file page */
1287
fil_page_get_prev(byte* page)
1289
return(mach_read_from_4(page + FIL_PAGE_PREV));
1293
fil_page_get_next(byte* page)
1295
return(mach_read_from_4(page + FIL_PAGE_NEXT));
1298
/*************************************************************************
1299
Sets the file page type. */
1304
byte* page, /* in: file page */
1305
ulint type) /* in: type */
1308
ut_ad((type == FIL_PAGE_INDEX) || (type == FIL_PAGE_INDEX));
1310
mach_write_to_2(page + FIL_PAGE_TYPE, type);
1313
/*************************************************************************
1314
Gets the file page type. */
1319
/* out: type; NOTE that if the type has not been
1320
written to the page, the return value is not defined */
1321
byte* page) /* in: file page */
1325
return(mach_read_from_2(page + FIL_PAGE_TYPE));