1
/*****************************************************************************
3
Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
5
This program is free software; you can redistribute it and/or modify it under
6
the terms of the GNU General Public License as published by the Free Software
7
Foundation; version 2 of the License.
9
This program is distributed in the hope that it will be useful, but WITHOUT
10
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
You should have received a copy of the GNU General Public License along with
14
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15
Place, Suite 330, Boston, MA 02111-1307 USA
17
*****************************************************************************/
18
/***********************************************************************
20
Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
21
Copyright (c) 2009, Percona Inc.
23
Portions of this file contain modifications contributed and copyrighted
24
by Percona Inc.. Those modifications are
25
gratefully acknowledged and are described briefly in the InnoDB
26
documentation. The contributions by Percona Inc. are incorporated with
27
their permission, and subject to the conditions contained in the file
30
This program is free software; you can redistribute it and/or modify it
31
under the terms of the GNU General Public License as published by the
32
Free Software Foundation; version 2 of the License.
34
This program is distributed in the hope that it will be useful, but
35
WITHOUT ANY WARRANTY; without even the implied warranty of
36
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
37
Public License for more details.
39
You should have received a copy of the GNU General Public License along
40
with this program; if not, write to the Free Software Foundation, Inc.,
41
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
43
***********************************************************************/
45
/**************************************************//**
47
The interface to the operating system file i/o primitives
49
Created 10/21/1995 Heikki Tuuri
50
*******************************************************/
55
#include "srv0start.h"
58
#ifndef UNIV_HOTBACKUP
60
# include "os0thread.h"
61
#else /* !UNIV_HOTBACKUP */
63
/* Add includes for the _stat() call to compile on Windows */
64
# include <sys/types.h>
65
# include <sys/stat.h>
68
#endif /* !UNIV_HOTBACKUP */
70
/* This specifies the file permissions InnoDB uses when it creates files in
71
Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
75
/** Umask for creating files: read and write permission for owner and group */
76
UNIV_INTERN ulint os_innodb_umask
77
= S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
79
/** Umask for creating files; this second definition starts at 0 (the
preprocessor conditional that selects between the two definitions is not
visible here) */
80
UNIV_INTERN ulint os_innodb_umask = 0;
84
/* If the following is set to TRUE, we do not call os_file_flush in every
85
os_file_write. We can set this TRUE when the doublewrite buffer is used. */
86
UNIV_INTERN ibool os_do_not_call_flush_at_each_write = FALSE;
88
/* We do not call os_file_flush in every os_file_write. */
89
#endif /* UNIV_DO_FLUSH */
92
/* In this branch native aio is compiled out: os_aio_use_native_aio is the
constant FALSE. NOTE(review): the opening conditional is not visible here;
judging by the #else comment below it is presumably #ifdef UNIV_HOTBACKUP. */
# define os_aio_use_native_aio FALSE
93
#else /* UNIV_HOTBACKUP */
94
/* We use these mutexes to protect lseek + file i/o operation, if the
95
OS does not provide an atomic pread or pwrite, or similar */
96
#define OS_FILE_N_SEEK_MUTEXES 16
97
UNIV_INTERN os_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
99
/* In simulated aio, merge at most this many consecutive i/os */
100
#define OS_AIO_MERGE_N_CONSECUTIVE 64
102
/** If this flag is TRUE, then we will use the native aio of the
103
OS (provided we compiled Innobase with it in), otherwise we will
104
use simulated aio we build below with threads */
106
UNIV_INTERN ibool os_aio_use_native_aio = FALSE;
108
/** Flag: enable debug printout for asynchronous i/o */
109
UNIV_INTERN ibool os_aio_print_debug = FALSE;
111
/** The asynchronous i/o array slot structure */
112
typedef struct os_aio_slot_struct os_aio_slot_t;
114
/** One slot of an asynchronous i/o array: it records a single i/o request
(file, offset, length, buffer) together with the two messages associated
with the requester of the operation */
115
struct os_aio_slot_struct{
116
ibool is_read; /*!< TRUE if a read operation */
117
ulint pos; /*!< index of the slot in the aio
119
ibool reserved; /*!< TRUE if this slot is reserved */
120
time_t reservation_time;/*!< time when reserved */
121
ulint len; /*!< length of the block to read or
123
byte* buf; /*!< buffer used in i/o */
124
ulint type; /*!< OS_FILE_READ or OS_FILE_WRITE */
125
ulint offset; /*!< 32 low bits of file offset in
127
ulint offset_high; /*!< 32 high bits of file offset */
128
os_file_t file; /*!< file where to read or write */
129
const char* name; /*!< file name or path */
130
ibool io_already_done;/*!< used only in simulated aio:
131
TRUE if the physical i/o already
132
made and only the slot message
133
needs to be passed to the caller
134
of os_aio_simulated_handle */
135
fil_node_t* message1; /*!< message which is given by the */
136
void* message2; /*!< the requester of an aio operation
137
and which can be used to identify
138
which pending aio operation was
141
os_event_t event; /*!< event object we need in the
143
OVERLAPPED control; /*!< Windows control block for the
148
/** The asynchronous i/o array structure */
149
typedef struct os_aio_array_struct os_aio_array_t;
151
/** An array of aio slots protected by a mutex: slots points to the slot
storage and n_slots is the total number of slots */
152
struct os_aio_array_struct{
153
os_mutex_t mutex; /*!< the mutex protecting the aio array */
155
/*!< The event which is set to the
156
signaled state when there is space in
157
the aio outside the ibuf segment */
159
/*!< The event which is set to the
160
signaled state when there are no
161
pending i/os in this array */
162
ulint n_slots;/*!< Total number of slots in the aio
163
array. This must be divisible by
166
/*!< Number of segments in the aio
167
array of pending aio requests. A
168
thread can wait separately for any one
171
/*!< Number of reserved slots in the
172
aio array outside the ibuf segment */
173
os_aio_slot_t* slots; /*!< Pointer to the slots in the array */
175
os_native_event_t* native_events;
176
/*!< Pointer to an array of OS native
177
event handles where we copied the
178
handles from slots, in the same
179
order. This can be used in
180
WaitForMultipleObjects; used only in
185
/** Array of events used in simulated aio */
186
static os_event_t* os_aio_segment_wait_events = NULL;
188
/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
189
are NULL when the module has not yet been initialized. @{ */
190
static os_aio_array_t* os_aio_read_array = NULL; /*!< Reads */
191
static os_aio_array_t* os_aio_write_array = NULL; /*!< Writes */
192
static os_aio_array_t* os_aio_ibuf_array = NULL; /*!< Insert buffer */
193
static os_aio_array_t* os_aio_log_array = NULL; /*!< Redo log */
194
static os_aio_array_t* os_aio_sync_array = NULL; /*!< Synchronous I/O */
197
/** Number of asynchronous I/O segments. Set by os_aio_init(). */
198
static ulint os_aio_n_segments = ULINT_UNDEFINED;
200
/** If the following is TRUE, read i/o handler threads try to
201
wait until a batch of new read requests have been posted */
202
static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
203
#endif /* UNIV_HOTBACKUP */
205
/* File i/o statistics counters, all starting at zero.
NOTE(review): the *_old variables and os_last_printout presumably hold the
counter values and the time of the previous statistics printout - the code
that updates them is not visible here, confirm before relying on this. */
UNIV_INTERN ulint os_n_file_reads = 0;
206
UNIV_INTERN ulint os_bytes_read_since_printout = 0;
207
UNIV_INTERN ulint os_n_file_writes = 0;
208
UNIV_INTERN ulint os_n_fsyncs = 0;
209
UNIV_INTERN ulint os_n_file_reads_old = 0;
210
UNIV_INTERN ulint os_n_file_writes_old = 0;
211
UNIV_INTERN ulint os_n_fsyncs_old = 0;
212
UNIV_INTERN time_t os_last_printout;
214
/* Set to TRUE by the error handler once the "Disk is full" warning has been
printed, so that the warning is given only once */
UNIV_INTERN ibool os_has_said_disk_full = FALSE;
216
#ifndef UNIV_HOTBACKUP
217
/** The mutex protecting the following counts of pending I/O operations */
218
static os_mutex_t os_file_count_mutex;
219
#endif /* !UNIV_HOTBACKUP */
220
/** Number of pending os_file_pread() operations */
221
UNIV_INTERN ulint os_file_n_pending_preads = 0;
222
/** Number of pending os_file_pwrite() operations */
223
UNIV_INTERN ulint os_file_n_pending_pwrites = 0;
224
/** Number of pending write operations */
225
UNIV_INTERN ulint os_n_pending_writes = 0;
226
/** Number of pending read operations */
227
UNIV_INTERN ulint os_n_pending_reads = 0;
229
/***********************************************************************//**
230
Gets the operating system version. Currently works only on Windows: the
version is read with GetVersionEx() and classified by the platform id and,
for the NT platform, by the major version number.
231
@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */
234
os_get_os_version(void)
235
/*===================*/
238
OSVERSIONINFO os_info;
240
/* the structure size must be filled in before the call */
os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
242
/* the result of GetVersionEx() is checked with ut_a() */
ut_a(GetVersionEx(&os_info));
244
if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) {
246
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
248
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
249
/* NT platform: major version 4 or lower is told apart from newer ones */
if (os_info.dwMajorVersion <= 4) {
265
/***********************************************************************//**
266
Retrieves the last error number if an error occurs in a file io function.
267
The number should be retrieved before any other OS calls (because they may
268
overwrite the error number). If the number is not known to this program,
269
the OS error number + 100 is returned.
270
@return error number, or OS error number + 100 */
273
os_file_get_last_error(
274
/*===================*/
275
ibool report_all_errors) /*!< in: TRUE if we want an error message
276
printed for all errors */
282
/* Windows variant: read the last-error code of the calling thread */
err = (ulint) GetLastError();
284
/* disk-full and file-exists are only reported when report_all_errors is
set; every other error is always reported */
if (report_all_errors
285
|| (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) {
287
ut_print_timestamp(stderr);
289
" InnoDB: Operating system error number %lu"
290
" in a file operation.\n", (ulong) err);
292
if (err == ERROR_PATH_NOT_FOUND) {
294
"InnoDB: The error means the system"
295
" cannot find the path specified.\n");
297
if (srv_is_being_started) {
299
"InnoDB: If you are installing InnoDB,"
300
" remember that you must create\n"
301
"InnoDB: directories yourself, InnoDB"
302
" does not create them.\n");
304
} else if (err == ERROR_ACCESS_DENIED) {
306
"InnoDB: The error means mysqld does not have"
307
" the access rights to\n"
308
"InnoDB: the directory. It may also be"
309
" you have created a subdirectory\n"
310
"InnoDB: of the same name as a data file.\n");
311
} else if (err == ERROR_SHARING_VIOLATION
312
|| err == ERROR_LOCK_VIOLATION) {
314
"InnoDB: The error means that another program"
315
" is using InnoDB's files.\n"
316
"InnoDB: This might be a backup or antivirus"
317
" software or another instance\n"
319
" Please close it to get rid of this error.\n");
320
} else if (err == ERROR_WORKING_SET_QUOTA
321
|| err == ERROR_NO_SYSTEM_RESOURCES) {
323
"InnoDB: The error means that there are no"
324
" sufficient system resources or quota to"
325
" complete the operation.\n");
326
} else if (err == ERROR_OPERATION_ABORTED) {
328
"InnoDB: The error means that the I/O"
329
" operation has been aborted\n"
330
"InnoDB: because of either a thread exit"
331
" or an application request.\n"
332
"InnoDB: Retry attempt is made.\n");
335
"InnoDB: Some operating system error numbers"
336
" are described at\n"
339
"operating-system-error-codes.html\n");
345
/* translate the Windows error code into the matching OS_FILE_* code */
if (err == ERROR_FILE_NOT_FOUND) {
346
return(OS_FILE_NOT_FOUND);
347
} else if (err == ERROR_DISK_FULL) {
348
return(OS_FILE_DISK_FULL);
349
} else if (err == ERROR_FILE_EXISTS) {
350
return(OS_FILE_ALREADY_EXISTS);
351
} else if (err == ERROR_SHARING_VIOLATION
352
|| err == ERROR_LOCK_VIOLATION) {
353
return(OS_FILE_SHARING_VIOLATION);
354
} else if (err == ERROR_WORKING_SET_QUOTA
355
|| err == ERROR_NO_SYSTEM_RESOURCES) {
356
return(OS_FILE_INSUFFICIENT_RESOURCE);
357
} else if (err == ERROR_OPERATION_ABORTED) {
358
return(OS_FILE_OPERATION_ABORTED);
365
/* errno-based variant: the same reporting rule, with ENOSPC and EEXIST as
the errors that are only reported when report_all_errors is set */
if (report_all_errors
366
|| (err != ENOSPC && err != EEXIST)) {
368
ut_print_timestamp(stderr);
370
" InnoDB: Operating system error number %lu"
371
" in a file operation.\n", (ulong) err);
375
"InnoDB: The error means the system"
376
" cannot find the path specified.\n");
378
if (srv_is_being_started) {
380
"InnoDB: If you are installing InnoDB,"
381
" remember that you must create\n"
382
"InnoDB: directories yourself, InnoDB"
383
" does not create them.\n");
385
} else if (err == EACCES) {
387
"InnoDB: The error means mysqld does not have"
388
" the access rights to\n"
389
"InnoDB: the directory.\n");
391
/* NOTE(review): err is passed to %lu below without the (ulong) cast that
the other %lu call sites in this function use; that is only correct if
ulint has the same representation as unsigned long - confirm. */
if (strerror((int)err) != NULL) {
393
"InnoDB: Error number %lu"
395
err, strerror((int)err));
399
"InnoDB: Some operating system"
400
" error numbers are described at\n"
403
"operating-system-error-codes.html\n");
410
return(OS_FILE_DISK_FULL);
411
} else if (err == ENOENT) {
412
return(OS_FILE_NOT_FOUND);
413
} else if (err == EEXIST) {
414
return(OS_FILE_ALREADY_EXISTS);
415
} else if (err == EXDEV || err == ENOTDIR || err == EISDIR) {
416
return(OS_FILE_PATH_ERROR);
423
/****************************************************************//**
424
Does error handling when a file operation fails.
425
Conditionally exits (calling exit(3)) based on the should_exit value and on
whether the error is an unknown one (see the should_exit parameter).
427
@return TRUE if we should retry the operation */
430
os_file_handle_error_cond_exit(
431
/*===========================*/
432
const char* name, /*!< in: name of a file or NULL */
433
const char* operation, /*!< in: operation */
434
ibool should_exit) /*!< in: call exit(3) if unknown error
435
and this parameter is TRUE */
439
/* FALSE: the lookup itself stays silent about disk-full and
already-exists errors; they are handled below */
err = os_file_get_last_error(FALSE);
441
if (err == OS_FILE_DISK_FULL) {
442
/* We only print a warning about disk full once */
444
if (os_has_said_disk_full) {
450
ut_print_timestamp(stderr);
452
" InnoDB: Encountered a problem with"
456
ut_print_timestamp(stderr);
458
" InnoDB: Disk is full. Try to clean the disk"
459
" to free space.\n");
461
os_has_said_disk_full = TRUE;
466
} else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
469
} else if (err == OS_FILE_ALREADY_EXISTS
470
|| err == OS_FILE_PATH_ERROR) {
473
} else if (err == OS_FILE_SHARING_VIOLATION) {
475
/* NOTE(review): the sleeps in these branches presumably give the
condition time to clear before the caller retries; the return statements
are not visible here - confirm. */
os_thread_sleep(10000000); /* 10 sec */
477
} else if (err == OS_FILE_INSUFFICIENT_RESOURCE) {
479
os_thread_sleep(100000); /* 100 ms */
481
} else if (err == OS_FILE_OPERATION_ABORTED) {
483
os_thread_sleep(100000); /* 100 ms */
487
fprintf(stderr, "InnoDB: File name %s\n", name);
490
fprintf(stderr, "InnoDB: File operation call: '%s'.\n",
494
fprintf(stderr, "InnoDB: Cannot continue operation.\n");
505
/****************************************************************//**
506
Does error handling when a file operation fails. This is
os_file_handle_error_cond_exit() with should_exit = TRUE, i.e. the variant
that calls exit(3) on an unknown error.
507
@return TRUE if we should retry the operation */
510
os_file_handle_error(
511
/*=================*/
512
const char* name, /*!< in: name of a file or NULL */
513
const char* operation)/*!< in: operation */
515
/* exit in case of unknown error */
516
return(os_file_handle_error_cond_exit(name, operation, TRUE));
519
/****************************************************************//**
520
Does error handling when a file operation fails. This is
os_file_handle_error_cond_exit() with should_exit = FALSE, i.e. the variant
that does not call exit(3) on an unknown error.
521
@return TRUE if we should retry the operation */
524
os_file_handle_error_no_exit(
525
/*=========================*/
526
const char* name, /*!< in: name of a file or NULL */
527
const char* operation)/*!< in: operation */
529
/* don't exit in case of unknown error */
530
return(os_file_handle_error_cond_exit(name, operation, FALSE));
534
#define USE_FILE_LOCK
535
#if defined(UNIV_HOTBACKUP) || defined(__WIN__) || defined(__NETWARE__)
536
/* InnoDB Hot Backup does not lock the data files.
537
* On Windows, mandatory locking is used.
539
# undef USE_FILE_LOCK
542
/****************************************************************//**
543
Obtain an exclusive lock on a file. The lock is requested with
fcntl(F_SETLK), which fails immediately instead of waiting when the lock
cannot be granted; on failure a diagnostic with the file name and errno is
printed.
544
@return 0 on success */
549
int fd, /*!< in: file descriptor */
550
const char* name) /*!< in: file name */
554
/* l_whence = SEEK_SET with l_start = l_len = 0 describes the whole file:
a zero l_len extends the lock to the end of the file */
lk.l_whence = SEEK_SET;
555
lk.l_start = lk.l_len = 0;
556
if (fcntl(fd, F_SETLK, &lk) == -1) {
558
"InnoDB: Unable to lock %s, error: %d\n", name, errno);
560
/* EAGAIN or EACCES: the lock is held by another process */
if (errno == EAGAIN || errno == EACCES) {
562
"InnoDB: Check that you do not already have"
563
" another mysqld process\n"
564
"InnoDB: using the same InnoDB data"
573
#endif /* USE_FILE_LOCK */
575
#ifndef UNIV_HOTBACKUP
576
/****************************************************************//**
577
Creates the mutex protecting the pending i/o counts (os_file_count_mutex)
and the OS_FILE_N_SEEK_MUTEXES seek mutexes used in positioned reads and
writes. */
580
os_io_init_simple(void)
581
/*===================*/
585
os_file_count_mutex = os_mutex_create(NULL);
587
/* one mutex per element of os_file_seek_mutexes */
for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
588
os_file_seek_mutexes[i] = os_mutex_create(NULL);
592
/***********************************************************************//**
593
Creates a temporary file. This function is like tmpfile(3), but
594
the temporary file is created in the MySQL temporary directory.
595
On Netware, this function is like tmpfile(3), because the C run-time
596
library of Netware does not expose the delete-on-close flag.
597
@return temporary file handle, or NULL on error */
600
os_file_create_tmpfile(void)
601
/*========================*/
604
/* Netware branch: plain tmpfile() */
FILE* file = tmpfile();
605
#else /* __NETWARE__ */
607
/* other platforms: take a descriptor from innobase_mysql_tmpfile() and
wrap it in a stdio stream opened in binary read/write mode */
int fd = innobase_mysql_tmpfile();
610
file = fdopen(fd, "w+b");
612
#endif /* __NETWARE__ */
615
ut_print_timestamp(stderr);
617
" InnoDB: Error: unable to create temporary file;"
618
" errno: %d\n", errno);
623
#endif /* !__NETWARE__ */
628
#endif /* !UNIV_HOTBACKUP */
630
/***********************************************************************//**
631
The os_file_opendir() function opens a directory stream corresponding to the
632
directory named by the dirname argument. The directory stream is positioned
633
at the first entry. In both Unix and Windows we automatically skip the '.'
634
and '..' items at the start of the directory listing.
635
@return directory stream, NULL if error */
640
const char* dirname, /*!< in: directory name; it must not
641
contain a trailing '\' or '/' */
642
ibool error_is_fatal) /*!< in: TRUE if we should treat an
643
error as a fatal error; if we try to
644
open symlinks then we do not wish a
645
fatal error if it happens not to be
650
LPWIN32_FIND_DATA lpFindFileData;
651
char path[OS_FILE_MAX_PATH + 3];
653
ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
655
strcpy(path, dirname);
656
strcpy(path + strlen(path), "\\*");
658
/* Note that in Windows opening the 'directory stream' also retrieves
659
the first entry in the directory. Since it is '.', that is no problem,
660
as we will skip over the '.' and '..' entries anyway. */
662
/* the find-data buffer only receives that first entry, which is not used,
so it is freed again right after the call */
lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
664
dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
666
ut_free(lpFindFileData);
668
if (dir == INVALID_HANDLE_VALUE) {
670
if (error_is_fatal) {
671
os_file_handle_error(dirname, "opendir");
679
/* non-Windows branch: plain opendir(); a failure is passed to the error
handler only when error_is_fatal is set */
dir = opendir(dirname);
681
if (dir == NULL && error_is_fatal) {
682
os_file_handle_error(dirname, "opendir");
689
/***********************************************************************//**
690
Closes a directory stream. A failure is passed to
os_file_handle_error_no_exit(), i.e. the error handler variant that does not
exit on an unknown error.
691
@return 0 if success, -1 if failure */
696
os_file_dir_t dir) /*!< in: directory stream */
701
/* Windows branch: the stream is a search handle closed with FindClose() */
ret = FindClose(dir);
704
os_file_handle_error_no_exit(NULL, "closedir");
716
os_file_handle_error_no_exit(NULL, "closedir");
723
/***********************************************************************//**
724
This function returns information of the next file in the directory. We jump
725
over the '.' and '..' entries in the directory. The name, size and type of
the entry are stored in info.
726
@return 0 if ok, -1 if error, 1 if at the end of the directory */
729
os_file_readdir_next_file(
730
/*======================*/
731
const char* dirname,/*!< in: directory name or path */
732
os_file_dir_t dir, /*!< in: directory stream */
733
os_file_stat_t* info) /*!< in/out: buffer where the info is returned */
736
LPWIN32_FIND_DATA lpFindFileData;
739
lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
741
ret = FindNextFile(dir, lpFindFileData);
744
ut_a(strlen((char *) lpFindFileData->cFileName)
747
if (strcmp((char *) lpFindFileData->cFileName, ".") == 0
748
|| strcmp((char *) lpFindFileData->cFileName, "..") == 0) {
753
strcpy(info->name, (char *) lpFindFileData->cFileName);
755
/* the file size is reported as two separate halves, which are combined
into one 64-bit value here */
info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
756
+ (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
759
if (lpFindFileData->dwFileAttributes
760
& FILE_ATTRIBUTE_REPARSE_POINT) {
761
/* TODO: test Windows symlinks */
762
/* TODO: MySQL has apparently its own symlink
763
implementation in Windows, dbname.sym can
764
redirect a database directory:
765
REFMAN "windows-symbolic-links.html" */
766
info->type = OS_FILE_TYPE_LINK;
767
} else if (lpFindFileData->dwFileAttributes
768
& FILE_ATTRIBUTE_DIRECTORY) {
769
info->type = OS_FILE_TYPE_DIR;
771
/* It is probably safest to assume that all other
772
file types are normal. Better to check them rather
773
than blindly skip them. */
775
info->type = OS_FILE_TYPE_FILE;
779
ut_free(lpFindFileData);
783
} else if (GetLastError() == ERROR_NO_MORE_FILES) {
787
os_file_handle_error_no_exit(dirname,
788
"readdir_next_file");
795
struct stat statinfo;
796
#ifdef HAVE_READDIR_R
797
char dirent_buf[sizeof(struct dirent)
798
+ _POSIX_PATH_MAX + 100];
799
/* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
800
the max file name len; but in most standards, the
801
length is NAME_MAX; we add 100 to be even safer */
806
#ifdef HAVE_READDIR_R
807
ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent);
811
"InnoDB: cannot read directory %s, error %lu\n",
812
dirname, (ulong)ret);
818
/* End of directory */
823
ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
832
ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
834
if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
839
strcpy(info->name, ent->d_name);
841
/* room for dirname, the '/' separator, the entry name and the terminating
NUL; the + 10 leaves some slack */
full_path = ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10);
843
sprintf(full_path, "%s/%s", dirname, ent->d_name);
845
/* NOTE(review): stat() follows symbolic links, so the S_ISLNK() test
further down can presumably never be true here; lstat() would be needed to
see the link itself - confirm which behaviour is intended. */
ret = stat(full_path, &statinfo);
849
if (errno == ENOENT) {
850
/* readdir() returned a file that does not exist,
851
it must have been deleted in the meantime. Do what
852
would have happened if the file was deleted before
853
readdir() - ignore and go to the next entry.
854
If this is the last entry then info->name will still
855
contain the name of the deleted file when this
856
function returns, but this is not an issue since the
857
caller shouldn't be looking at info when end of
858
directory is returned. */
865
os_file_handle_error_no_exit(full_path, "stat");
872
info->size = (ib_int64_t)statinfo.st_size;
874
if (S_ISDIR(statinfo.st_mode)) {
875
info->type = OS_FILE_TYPE_DIR;
876
} else if (S_ISLNK(statinfo.st_mode)) {
877
info->type = OS_FILE_TYPE_LINK;
878
} else if (S_ISREG(statinfo.st_mode)) {
879
info->type = OS_FILE_TYPE_FILE;
881
info->type = OS_FILE_TYPE_UNKNOWN;
890
/*****************************************************************//**
891
This function attempts to create a directory named pathname. The new directory
892
gets default permissions. On Unix the permissions are (0770 & ~umask). If the
893
directory exists already, nothing is done and the call succeeds, unless the
894
fail_if_exists argument is true.
895
@return TRUE if call succeeds, FALSE on error */
898
os_file_create_directory(
899
/*=====================*/
900
const char* pathname, /*!< in: directory name as
901
null-terminated string */
902
ibool fail_if_exists) /*!< in: if TRUE, pre-existing directory
903
is treated as an error. */
908
/* Windows branch: NULL selects the default security attributes */
rcode = CreateDirectory((LPCTSTR) pathname, NULL);
910
|| (GetLastError() == ERROR_ALREADY_EXISTS
911
&& !fail_if_exists))) {
913
os_file_handle_error(pathname, "CreateDirectory");
922
/* Unix branch: an already existing directory (EEXIST) counts as success
unless fail_if_exists is set */
rcode = mkdir(pathname, 0770);
924
if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
926
os_file_handle_error(pathname, "mkdir");
935
/****************************************************************//**
936
A simple function to open or create a file. A failure is passed to
os_file_handle_error(), whose result is stored in retry.
937
@return own: handle to the file, not defined if error, error number
938
can be retrieved with os_file_get_last_error */
941
os_file_create_simple(
942
/*==================*/
943
const char* name, /*!< in: name of the file or path as a
944
null-terminated string */
945
ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file is
946
opened (if does not exist, error), or
947
OS_FILE_CREATE if a new file is created
948
(if exists, error), or
949
OS_FILE_CREATE_PATH if new file
950
(if exists, error) and subdirectories along
951
its path are created (if needed)*/
952
ulint access_type,/*!< in: OS_FILE_READ_ONLY or
953
OS_FILE_READ_WRITE */
954
ibool* success)/*!< out: TRUE if succeed, FALSE if error */
960
DWORD attributes = 0;
966
if (create_mode == OS_FILE_OPEN) {
967
create_flag = OPEN_EXISTING;
968
} else if (create_mode == OS_FILE_CREATE) {
969
create_flag = CREATE_NEW;
970
} else if (create_mode == OS_FILE_CREATE_PATH) {
971
/* create subdirs along the path if needed */
972
*success = os_file_create_subdirs_if_needed(name);
976
/* once the directories exist, CREATE_PATH is handled exactly like CREATE */
create_flag = CREATE_NEW;
977
create_mode = OS_FILE_CREATE;
983
if (access_type == OS_FILE_READ_ONLY) {
984
access = GENERIC_READ;
985
} else if (access_type == OS_FILE_READ_WRITE) {
986
access = GENERIC_READ | GENERIC_WRITE;
992
file = CreateFile((LPCTSTR) name,
994
FILE_SHARE_READ | FILE_SHARE_WRITE,
995
/* file can be read and written also
996
by other processes */
997
NULL, /* default security attributes */
1000
NULL); /*!< no template file */
1002
if (file == INVALID_HANDLE_VALUE) {
1005
retry = os_file_handle_error(name,
1006
create_mode == OS_FILE_OPEN ?
1024
/* non-Windows branch: the same mode mapping expressed as open() flags */
if (create_mode == OS_FILE_OPEN) {
1025
if (access_type == OS_FILE_READ_ONLY) {
1026
create_flag = O_RDONLY;
1028
create_flag = O_RDWR;
1030
} else if (create_mode == OS_FILE_CREATE) {
1031
/* O_CREAT | O_EXCL makes open() fail if the file already exists */
create_flag = O_RDWR | O_CREAT | O_EXCL;
1032
} else if (create_mode == OS_FILE_CREATE_PATH) {
1033
/* create subdirs along the path if needed */
1034
*success = os_file_create_subdirs_if_needed(name);
1038
create_flag = O_RDWR | O_CREAT | O_EXCL;
1039
create_mode = OS_FILE_CREATE;
1045
if (create_mode == OS_FILE_CREATE) {
1046
/* NOTE(review): the permission bits are spelled out here although
os_innodb_umask is defined in this file with the same default bits -
confirm whether os_innodb_umask was meant to be used. */
file = open(name, create_flag, S_IRUSR | S_IWUSR
1047
| S_IRGRP | S_IWGRP);
1049
file = open(name, create_flag);
1055
retry = os_file_handle_error(name,
1056
create_mode == OS_FILE_OPEN ?
1061
#ifdef USE_FILE_LOCK
1062
/* a file opened for read/write is also locked; a non-zero result from
os_file_lock() means the lock was not obtained */
} else if (access_type == OS_FILE_READ_WRITE
1063
&& os_file_lock(file, name)) {
1073
#endif /* __WIN__ */
1076
/****************************************************************//**
1077
A simple function to open or create a file. Unlike os_file_create_simple(),
the code visible here never calls os_file_handle_error(), and it accepts the
additional access type OS_FILE_READ_ALLOW_DELETE.
1078
@return own: handle to the file, not defined if error, error number
1079
can be retrieved with os_file_get_last_error */
1082
os_file_create_simple_no_error_handling(
1083
/*====================================*/
1084
const char* name, /*!< in: name of the file or path as a
1085
null-terminated string */
1086
ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
1087
is opened (if does not exist, error), or
1088
OS_FILE_CREATE if a new file is created
1089
(if exists, error) */
1090
ulint access_type,/*!< in: OS_FILE_READ_ONLY,
1091
OS_FILE_READ_WRITE, or
1092
OS_FILE_READ_ALLOW_DELETE; the last option is
1093
used by a backup program reading the file */
1094
ibool* success)/*!< out: TRUE if succeed, FALSE if error */
1100
DWORD attributes = 0;
1101
/* default sharing: other processes may read and write the file */
DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
1105
if (create_mode == OS_FILE_OPEN) {
1106
create_flag = OPEN_EXISTING;
1107
} else if (create_mode == OS_FILE_CREATE) {
1108
create_flag = CREATE_NEW;
1114
if (access_type == OS_FILE_READ_ONLY) {
1115
access = GENERIC_READ;
1116
} else if (access_type == OS_FILE_READ_WRITE) {
1117
access = GENERIC_READ | GENERIC_WRITE;
1118
} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
1119
access = GENERIC_READ;
1120
share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ
1121
| FILE_SHARE_WRITE; /*!< A backup program has to give
1122
mysqld the maximum freedom to
1123
do what it likes with the
1130
file = CreateFile((LPCTSTR) name,
1133
NULL, /* default security attributes */
1136
NULL); /*!< no template file */
1138
if (file == INVALID_HANDLE_VALUE) {
1151
/* non-Windows branch: the same mode mapping expressed as open() flags */
if (create_mode == OS_FILE_OPEN) {
1152
if (access_type == OS_FILE_READ_ONLY) {
1153
create_flag = O_RDONLY;
1155
create_flag = O_RDWR;
1157
} else if (create_mode == OS_FILE_CREATE) {
1158
/* O_CREAT | O_EXCL makes open() fail if the file already exists */
create_flag = O_RDWR | O_CREAT | O_EXCL;
1164
if (create_mode == OS_FILE_CREATE) {
1165
/* a newly created file gets read/write permission for owner and group
(further restricted by the process umask) */
file = open(name, create_flag, S_IRUSR | S_IWUSR
1166
| S_IRGRP | S_IWGRP);
1168
file = open(name, create_flag);
1173
#ifdef USE_FILE_LOCK
1174
/* a file opened for read/write is also locked; a non-zero result from
os_file_lock() means the lock was not obtained */
} else if (access_type == OS_FILE_READ_WRITE
1175
&& os_file_lock(file, name)) {
1185
#endif /* __WIN__ */
1188
/****************************************************************//**
1189
Tries to disable OS caching on an opened file descriptor. A failure is only
reported in a diagnostic message ("continuing anyway"). */
1192
os_file_set_nocache(
1193
/*================*/
1194
int fd, /*!< in: file descriptor to alter */
1195
const char* file_name, /*!< in: file name, used in the
1196
diagnostic message */
1197
const char* operation_name) /*!< in: "open" or "create"; used in the
1198
diagnostic message */
1200
/* some versions of Solaris may not have DIRECTIO_ON */
1201
#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
1202
if (directio(fd, DIRECTIO_ON) == -1) {
1204
/* errno is copied before the logging calls so that the value reported
below is the one set by the failed call */
errno_save = (int)errno;
1205
ut_print_timestamp(stderr);
1207
" InnoDB: Failed to set DIRECTIO_ON "
1208
"on file %s: %s: %s, continuing anyway\n",
1209
file_name, operation_name, strerror(errno_save));
1211
#elif defined(O_DIRECT)
1212
/* NOTE(review): F_SETFL replaces the descriptor's settable file status
flags with exactly O_DIRECT, so flags such as O_APPEND or O_NONBLOCK would
be cleared; an F_GETFL read-modify-write would preserve them - confirm that
no such flags are ever set on the descriptors passed in. */
if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
1214
errno_save = (int)errno;
1215
ut_print_timestamp(stderr);
1217
" InnoDB: Failed to set O_DIRECT "
1218
"on file %s: %s: %s, continuing anyway\n",
1219
file_name, operation_name, strerror(errno_save));
1220
/* EINVAL gets an extra hint because of the known tmpfs limitation */
if (errno_save == EINVAL) {
1221
ut_print_timestamp(stderr);
1223
" InnoDB: O_DIRECT is known to result in "
1224
"'Invalid argument' on Linux on tmpfs, "
1225
"see MySQL Bug#26662\n");
1231
/****************************************************************//**
1232
Opens an existing file or creates a new.
1233
@return own: handle to the file, not defined if error, error number
1234
can be retrieved with os_file_get_last_error */
1239
const char* name, /*!< in: name of the file or path as a
1240
null-terminated string */
1241
ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
1242
is opened (if does not exist, error), or
1243
OS_FILE_CREATE if a new file is created
1245
OS_FILE_OVERWRITE if a new file is created
1246
or an old overwritten;
1247
OS_FILE_OPEN_RAW, if a raw device or disk
1248
partition should be opened */
1249
ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous,
1250
non-buffered i/o is desired,
1251
OS_FILE_NORMAL, if any normal file;
1252
NOTE that it also depends on type, os_aio_..
1253
and srv_.. variables whether we really use
1254
async i/o or unbuffered i/o: look in the
1255
function source code for the exact rules */
1256
ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
1257
ibool* success)/*!< out: TRUE if succeed, FALSE if error */
1261
DWORD share_mode = FILE_SHARE_READ;
1268
if (create_mode == OS_FILE_OPEN_RAW) {
1269
create_flag = OPEN_EXISTING;
1270
share_mode = FILE_SHARE_WRITE;
1271
} else if (create_mode == OS_FILE_OPEN
1272
|| create_mode == OS_FILE_OPEN_RETRY) {
1273
create_flag = OPEN_EXISTING;
1274
} else if (create_mode == OS_FILE_CREATE) {
1275
create_flag = CREATE_NEW;
1276
} else if (create_mode == OS_FILE_OVERWRITE) {
1277
create_flag = CREATE_ALWAYS;
1283
if (purpose == OS_FILE_AIO) {
1284
/* If specified, use asynchronous (overlapped) io and no
1285
buffering of writes in the OS */
1288
if (os_aio_use_native_aio) {
1289
attributes = attributes | FILE_FLAG_OVERLAPPED;
1292
#ifdef UNIV_NON_BUFFERED_IO
1293
# ifndef UNIV_HOTBACKUP
1294
if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1295
/* Do not use unbuffered i/o to log files because
1296
value 2 denotes that we do not flush the log at every
1297
commit, but only once per second */
1298
} else if (srv_win_file_flush_method
1299
== SRV_WIN_IO_UNBUFFERED) {
1300
attributes = attributes | FILE_FLAG_NO_BUFFERING;
1302
# else /* !UNIV_HOTBACKUP */
1303
attributes = attributes | FILE_FLAG_NO_BUFFERING;
1304
# endif /* !UNIV_HOTBACKUP */
1305
#endif /* UNIV_NON_BUFFERED_IO */
1306
} else if (purpose == OS_FILE_NORMAL) {
1308
#ifdef UNIV_NON_BUFFERED_IO
1309
# ifndef UNIV_HOTBACKUP
1310
if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1311
/* Do not use unbuffered i/o to log files because
1312
value 2 denotes that we do not flush the log at every
1313
commit, but only once per second */
1314
} else if (srv_win_file_flush_method
1315
== SRV_WIN_IO_UNBUFFERED) {
1316
attributes = attributes | FILE_FLAG_NO_BUFFERING;
1318
# else /* !UNIV_HOTBACKUP */
1319
attributes = attributes | FILE_FLAG_NO_BUFFERING;
1320
# endif /* !UNIV_HOTBACKUP */
1321
#endif /* UNIV_NON_BUFFERED_IO */
1327
file = CreateFile((LPCTSTR) name,
1328
GENERIC_READ | GENERIC_WRITE, /* read and write
1330
share_mode, /* File can be read also by other
1331
processes; we must give the read
1332
permission because of ibbackup. We do
1333
not give the write permission to
1334
others because if one would succeed to
1335
start 2 instances of mysqld on the
1336
SAME files, that could cause severe
1337
database corruption! When opening
1338
raw disk partitions, Microsoft manuals
1339
say that we must give also the write
1341
NULL, /* default security attributes */
1344
NULL); /*!< no template file */
1346
if (file == INVALID_HANDLE_VALUE) {
1349
/* When srv_file_per_table is on, file creation failure may not
1350
be critical to the whole instance. Do not crash the server in
1351
case of unknown errors. */
1352
if (srv_file_per_table) {
1353
retry = os_file_handle_error_no_exit(name,
1354
create_mode == OS_FILE_CREATE ?
1357
retry = os_file_handle_error(name,
1358
create_mode == OS_FILE_CREATE ?
1374
const char* mode_str = NULL;
1375
const char* type_str = NULL;
1376
const char* purpose_str = NULL;
1381
if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW
1382
|| create_mode == OS_FILE_OPEN_RETRY) {
1384
create_flag = O_RDWR;
1385
} else if (create_mode == OS_FILE_CREATE) {
1386
mode_str = "CREATE";
1387
create_flag = O_RDWR | O_CREAT | O_EXCL;
1388
} else if (create_mode == OS_FILE_OVERWRITE) {
1389
mode_str = "OVERWRITE";
1390
create_flag = O_RDWR | O_CREAT | O_TRUNC;
1396
if (type == OS_LOG_FILE) {
1398
} else if (type == OS_DATA_FILE) {
1404
if (purpose == OS_FILE_AIO) {
1405
purpose_str = "AIO";
1406
} else if (purpose == OS_FILE_NORMAL) {
1407
purpose_str = "NORMAL";
1413
fprintf(stderr, "Opening file %s, mode %s, type %s, purpose %s\n",
1414
name, mode_str, type_str, purpose_str);
1417
/* We let O_SYNC only affect log files; note that we map O_DSYNC to
1418
O_SYNC because the datasync options seemed to corrupt files in 2001
1419
in both Linux and Solaris */
1420
if (type == OS_LOG_FILE
1421
&& srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
1424
fprintf(stderr, "Using O_SYNC for file %s\n", name);
1427
create_flag = create_flag | O_SYNC;
1431
file = open(name, create_flag, os_innodb_umask);
1436
/* When srv_file_per_table is on, file creation failure may not
1437
be critical to the whole instance. Do not crash the server in
1438
case of unknown errors. */
1439
if (srv_file_per_table) {
1440
retry = os_file_handle_error_no_exit(name,
1441
create_mode == OS_FILE_CREATE ?
1444
retry = os_file_handle_error(name,
1445
create_mode == OS_FILE_CREATE ?
1452
return(file /* -1 */);
1459
/* We disable OS caching (O_DIRECT) only on data files */
1460
if (type != OS_LOG_FILE
1461
&& srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
1463
os_file_set_nocache(file, name, mode_str);
1466
#ifdef USE_FILE_LOCK
1467
if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
1469
if (create_mode == OS_FILE_OPEN_RETRY) {
1471
ut_print_timestamp(stderr);
1472
fputs(" InnoDB: Retrying to lock"
1473
" the first data file\n",
1475
for (i = 0; i < 100; i++) {
1476
os_thread_sleep(1000000);
1477
if (!os_file_lock(file, name)) {
1482
ut_print_timestamp(stderr);
1483
fputs(" InnoDB: Unable to open the first data file\n",
1491
#endif /* USE_FILE_LOCK */
1494
#endif /* __WIN__ */
1497
/***********************************************************************//**
1498
Deletes a file if it exists. The file has to be closed before calling this.
1499
@return TRUE if success */
1502
os_file_delete_if_exists(
1503
/*=====================*/
1504
const char* name) /*!< in: file path as a null-terminated string */
1510
/* In Windows, deleting an .ibd file may fail if ibbackup is copying
1513
ret = DeleteFile((LPCTSTR)name);
1519
if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1520
/* the file does not exist, this is not an error */
1527
if (count > 100 && 0 == (count % 10)) {
1529
"InnoDB: Warning: cannot delete file %s\n"
1530
"InnoDB: Are you running ibbackup"
1531
" to back up the file?\n", name);
1533
os_file_get_last_error(TRUE); /* print error information */
1536
os_thread_sleep(1000000); /* sleep for a second */
1549
if (ret != 0 && errno != ENOENT) {
1550
os_file_handle_error_no_exit(name, "delete");
1559
/***********************************************************************//**
1560
Deletes a file. The file has to be closed before calling this.
1561
@return TRUE if success */
1566
const char* name) /*!< in: file path as a null-terminated string */
1572
/* In Windows, deleting an .ibd file may fail if ibbackup is copying
1575
ret = DeleteFile((LPCTSTR)name);
1581
if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1582
/* If the file does not exist, we classify this as a 'mild'
1590
if (count > 100 && 0 == (count % 10)) {
1592
"InnoDB: Warning: cannot delete file %s\n"
1593
"InnoDB: Are you running ibbackup"
1594
" to back up the file?\n", name);
1596
os_file_get_last_error(TRUE); /* print error information */
1599
os_thread_sleep(1000000); /* sleep for a second */
1613
os_file_handle_error_no_exit(name, "delete");
1622
/***********************************************************************//**
1623
Renames a file (can also move it to another directory). It is safest that the
1624
file is closed before calling this function.
1625
@return TRUE if success */
1630
const char* oldpath,/*!< in: old file path as a null-terminated
1632
const char* newpath)/*!< in: new file path */
1637
ret = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath);
1643
os_file_handle_error_no_exit(oldpath, "rename");
1649
ret = rename(oldpath, newpath);
1652
os_file_handle_error_no_exit(oldpath, "rename");
1661
/***********************************************************************//**
1662
Closes a file handle. In case of error, error number can be retrieved with
1663
os_file_get_last_error.
1664
@return TRUE if success */
1669
os_file_t file) /*!< in, own: handle to a file */
1676
ret = CloseHandle(file);
1682
os_file_handle_error(NULL, "close");
1691
os_file_handle_error(NULL, "close");
1700
#ifdef UNIV_HOTBACKUP
1701
/***********************************************************************//**
1702
Closes a file handle.
1703
@return TRUE if success */
1706
os_file_close_no_error_handling(
1707
/*============================*/
1708
os_file_t file) /*!< in, own: handle to a file */
1715
ret = CloseHandle(file);
1735
#endif /* UNIV_HOTBACKUP */
1737
/***********************************************************************//**
1739
@return TRUE if success */
1744
os_file_t file, /*!< in: handle to a file */
1745
ulint* size, /*!< out: least significant 32 bits of file
1747
ulint* size_high)/*!< out: most significant 32 bits of size */
1753
low = GetFileSize(file, &high);
1755
if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
1766
offs = lseek(file, 0, SEEK_END);
1768
if (offs == ((off_t)-1)) {
1773
if (sizeof(off_t) > 4) {
1774
*size = (ulint)(offs & 0xFFFFFFFFUL);
1775
*size_high = (ulint)(offs >> 32);
1777
*size = (ulint) offs;
1785
/***********************************************************************//**
1786
Gets file size as a 64-bit integer ib_int64_t.
1787
@return size in bytes, -1 if error */
1790
os_file_get_size_as_iblonglong(
1791
/*===========================*/
1792
os_file_t file) /*!< in: handle to a file */
1798
success = os_file_get_size(file, &size, &size_high);
1805
return((((ib_int64_t)size_high) << 32) + (ib_int64_t)size);
1808
/***********************************************************************//**
1809
Write the specified number of zeros to a newly created file.
1810
@return TRUE if success */
1815
const char* name, /*!< in: name of the file or path as a
1816
null-terminated string */
1817
os_file_t file, /*!< in: handle to a file */
1818
ulint size, /*!< in: least significant 32 bits of file
1820
ulint size_high)/*!< in: most significant 32 bits of size */
1822
ib_int64_t current_size;
1823
ib_int64_t desired_size;
1829
ut_a(size == (size & 0xFFFFFFFF));
1832
desired_size = (ib_int64_t)size + (((ib_int64_t)size_high) << 32);
1834
/* Write up to 1 megabyte at a time. */
1835
buf_size = ut_min(64, (ulint) (desired_size / UNIV_PAGE_SIZE))
1837
buf2 = ut_malloc(buf_size + UNIV_PAGE_SIZE);
1839
/* Align the buffer for possible raw i/o */
1840
buf = ut_align(buf2, UNIV_PAGE_SIZE);
1842
/* Write buffer full of zeros */
1843
memset(buf, 0, buf_size);
1845
if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
1847
fprintf(stderr, "InnoDB: Progress in MB:");
1850
while (current_size < desired_size) {
1853
if (desired_size - current_size < (ib_int64_t) buf_size) {
1854
n_bytes = (ulint) (desired_size - current_size);
1859
ret = os_file_write(name, file, buf,
1860
(ulint)(current_size & 0xFFFFFFFF),
1861
(ulint)(current_size >> 32),
1865
goto error_handling;
1868
/* Print about progress for each 100 MB written */
1869
if ((ib_int64_t) (current_size + n_bytes) / (ib_int64_t)(100 * 1024 * 1024)
1870
!= current_size / (ib_int64_t)(100 * 1024 * 1024)) {
1872
fprintf(stderr, " %lu00",
1873
(ulong) ((current_size + n_bytes)
1874
/ (ib_int64_t)(100 * 1024 * 1024)));
1877
current_size += n_bytes;
1880
if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
1882
fprintf(stderr, "\n");
1887
ret = os_file_flush(file);
1897
/***********************************************************************//**
1898
Truncates a file at its current position.
1899
@return TRUE if success */
1904
FILE* file) /*!< in: file to be truncated */
1907
HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
1908
return(SetEndOfFile(h));
1910
return(!ftruncate(fileno(file), ftell(file)));
1911
#endif /* __WIN__ */
1915
/***********************************************************************//**
1916
Wrapper to fsync(2) that retries the call on some errors.
1917
Returns the value 0 if successful; otherwise the value -1 is returned and
1918
the global variable errno is set to indicate the error.
1919
@return 0 if success, -1 otherwise */
1925
os_file_t file) /*!< in: handle to a file */
1938
if (ret == -1 && errno == ENOLCK) {
1940
if (failures % 100 == 0) {
1942
ut_print_timestamp(stderr);
1944
" InnoDB: fsync(): "
1945
"No locks available; retrying\n");
1948
os_thread_sleep(200000 /* 0.2 sec */);
1961
#endif /* !__WIN__ */
1963
/***********************************************************************//**
1964
Flushes the write buffers of a given file to the disk.
1965
@return TRUE if success */
1970
os_file_t file) /*!< in, own: handle to a file */
1979
ret = FlushFileBuffers(file);
1985
/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
1986
actually a raw device, we choose to ignore that error if we are using
1989
if (srv_start_raw_disk_in_use && GetLastError()
1990
== ERROR_INVALID_FUNCTION) {
1994
os_file_handle_error(NULL, "flush");
1996
/* It is a fatal error if a file flush does not succeed, because then
1997
the database can get corrupt on disk */
2004
#if defined(HAVE_DARWIN_THREADS)
2005
# ifndef F_FULLFSYNC
2006
/* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
2007
# define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
2008
# elif F_FULLFSYNC != 51
2009
# error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
2011
/* Apple has disabled fsync() for internal disk drives in OS X. That
2012
caused corruption for a user when he tested a power outage. Let us in
2013
OS X use a nonstandard flush method recommended by an Apple
2016
if (!srv_have_fullfsync) {
2017
/* If we are not on an operating system that supports this,
2018
then fall back to a plain fsync. */
2020
ret = os_file_fsync(file);
2022
ret = fcntl(file, F_FULLFSYNC, NULL);
2025
/* If we are not on a file system that supports this,
2026
then fall back to a plain fsync. */
2027
ret = os_file_fsync(file);
2031
ret = os_file_fsync(file);
2038
/* Since Linux returns EINVAL if the 'file' is actually a raw device,
2039
we choose to ignore that error if we are using raw disks */
2041
if (srv_start_raw_disk_in_use && errno == EINVAL) {
2046
ut_print_timestamp(stderr);
2049
" InnoDB: Error: the OS said file flush did not succeed\n");
2051
os_file_handle_error(NULL, "flush");
2053
/* It is a fatal error if a file flush does not succeed, because then
2054
the database can get corrupt on disk */
2062
/*******************************************************************//**
2063
Does a synchronous read operation in Posix.
2064
@return number of bytes read, -1 if error */
2069
os_file_t file, /*!< in: handle to a file */
2070
void* buf, /*!< in: buffer where to read */
2071
ulint n, /*!< in: number of bytes to read */
2072
ulint offset, /*!< in: least significant 32 bits of file
2073
offset from where to read */
2074
ulint offset_high) /*!< in: most significant 32 bits of
2078
#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2080
#endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */
2082
ut_a((offset & 0xFFFFFFFFUL) == offset);
2084
/* If off_t is > 4 bytes in size, then we assume we can pass a
2087
if (sizeof(off_t) > 4) {
2088
offs = (off_t)offset + (((off_t)offset_high) << 32);
2091
offs = (off_t)offset;
2093
if (offset_high > 0) {
2095
"InnoDB: Error: file read at offset > 4 GB\n");
2101
#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2102
os_mutex_enter(os_file_count_mutex);
2103
os_file_n_pending_preads++;
2104
os_n_pending_reads++;
2105
os_mutex_exit(os_file_count_mutex);
2107
n_bytes = pread(file, buf, (ssize_t)n, offs);
2109
os_mutex_enter(os_file_count_mutex);
2110
os_file_n_pending_preads--;
2111
os_n_pending_reads--;
2112
os_mutex_exit(os_file_count_mutex);
2119
#ifndef UNIV_HOTBACKUP
2121
#endif /* !UNIV_HOTBACKUP */
2123
os_mutex_enter(os_file_count_mutex);
2124
os_n_pending_reads++;
2125
os_mutex_exit(os_file_count_mutex);
2127
#ifndef UNIV_HOTBACKUP
2128
/* Protect the seek / read operation with a mutex */
2129
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2131
os_mutex_enter(os_file_seek_mutexes[i]);
2132
#endif /* !UNIV_HOTBACKUP */
2134
ret_offset = lseek(file, offs, SEEK_SET);
2136
if (ret_offset < 0) {
2139
ret = read(file, buf, (ssize_t)n);
2142
#ifndef UNIV_HOTBACKUP
2143
os_mutex_exit(os_file_seek_mutexes[i]);
2144
#endif /* !UNIV_HOTBACKUP */
2146
os_mutex_enter(os_file_count_mutex);
2147
os_n_pending_reads--;
2148
os_mutex_exit(os_file_count_mutex);
2155
/*******************************************************************//**
2156
Does a synchronous write operation in Posix.
2157
@return number of bytes written, -1 if error */
2162
os_file_t file, /*!< in: handle to a file */
2163
const void* buf, /*!< in: buffer from where to write */
2164
ulint n, /*!< in: number of bytes to write */
2165
ulint offset, /*!< in: least significant 32 bits of file
2166
offset where to write */
2167
ulint offset_high) /*!< in: most significant 32 bits of
2173
ut_a((offset & 0xFFFFFFFFUL) == offset);
2175
/* If off_t is > 4 bytes in size, then we assume we can pass a
2178
if (sizeof(off_t) > 4) {
2179
offs = (off_t)offset + (((off_t)offset_high) << 32);
2181
offs = (off_t)offset;
2183
if (offset_high > 0) {
2185
"InnoDB: Error: file write"
2186
" at offset > 4 GB\n");
2192
#if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
2193
os_mutex_enter(os_file_count_mutex);
2194
os_file_n_pending_pwrites++;
2195
os_n_pending_writes++;
2196
os_mutex_exit(os_file_count_mutex);
2198
ret = pwrite(file, buf, (ssize_t)n, offs);
2200
os_mutex_enter(os_file_count_mutex);
2201
os_file_n_pending_pwrites--;
2202
os_n_pending_writes--;
2203
os_mutex_exit(os_file_count_mutex);
2205
# ifdef UNIV_DO_FLUSH
2206
if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
2207
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC
2208
&& !os_do_not_call_flush_at_each_write) {
2210
/* Always do fsync to reduce the probability that when
2211
the OS crashes, a database page is only partially
2212
physically written to disk. */
2214
ut_a(TRUE == os_file_flush(file));
2216
# endif /* UNIV_DO_FLUSH */
2222
# ifndef UNIV_HOTBACKUP
2224
# endif /* !UNIV_HOTBACKUP */
2226
os_mutex_enter(os_file_count_mutex);
2227
os_n_pending_writes++;
2228
os_mutex_exit(os_file_count_mutex);
2230
# ifndef UNIV_HOTBACKUP
2231
/* Protect the seek / write operation with a mutex */
2232
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2234
os_mutex_enter(os_file_seek_mutexes[i]);
2235
# endif /* !UNIV_HOTBACKUP */
2237
ret_offset = lseek(file, offs, SEEK_SET);
2239
if (ret_offset < 0) {
2245
ret = write(file, buf, (ssize_t)n);
2247
# ifdef UNIV_DO_FLUSH
2248
if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
2249
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC
2250
&& !os_do_not_call_flush_at_each_write) {
2252
/* Always do fsync to reduce the probability that when
2253
the OS crashes, a database page is only partially
2254
physically written to disk. */
2256
ut_a(TRUE == os_file_flush(file));
2258
# endif /* UNIV_DO_FLUSH */
2261
# ifndef UNIV_HOTBACKUP
2262
os_mutex_exit(os_file_seek_mutexes[i]);
2263
# endif /* !UNIV_HOTBACKUP */
2265
os_mutex_enter(os_file_count_mutex);
2266
os_n_pending_writes--;
2267
os_mutex_exit(os_file_count_mutex);
2275
/*******************************************************************//**
2276
Requests a synchronous positioned read operation.
2277
@return TRUE if request was successful, FALSE if fail */
2282
os_file_t file, /*!< in: handle to a file */
2283
void* buf, /*!< in: buffer where to read */
2284
ulint offset, /*!< in: least significant 32 bits of file
2285
offset where to read */
2286
ulint offset_high, /*!< in: most significant 32 bits of
2288
ulint n) /*!< in: number of bytes to read */
2297
#ifndef UNIV_HOTBACKUP
2299
#endif /* !UNIV_HOTBACKUP */
2301
ut_a((offset & 0xFFFFFFFFUL) == offset);
2304
os_bytes_read_since_printout += n;
2311
low = (DWORD) offset;
2312
high = (DWORD) offset_high;
2314
os_mutex_enter(os_file_count_mutex);
2315
os_n_pending_reads++;
2316
os_mutex_exit(os_file_count_mutex);
2318
#ifndef UNIV_HOTBACKUP
2319
/* Protect the seek / read operation with a mutex */
2320
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2322
os_mutex_enter(os_file_seek_mutexes[i]);
2323
#endif /* !UNIV_HOTBACKUP */
2325
ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2327
if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2329
#ifndef UNIV_HOTBACKUP
2330
os_mutex_exit(os_file_seek_mutexes[i]);
2331
#endif /* !UNIV_HOTBACKUP */
2333
os_mutex_enter(os_file_count_mutex);
2334
os_n_pending_reads--;
2335
os_mutex_exit(os_file_count_mutex);
2337
goto error_handling;
2340
ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2342
#ifndef UNIV_HOTBACKUP
2343
os_mutex_exit(os_file_seek_mutexes[i]);
2344
#endif /* !UNIV_HOTBACKUP */
2346
os_mutex_enter(os_file_count_mutex);
2347
os_n_pending_reads--;
2348
os_mutex_exit(os_file_count_mutex);
2350
if (ret && len == n) {
2357
os_bytes_read_since_printout += n;
2360
ret = os_file_pread(file, buf, n, offset, offset_high);
2362
if ((ulint)ret == n) {
2368
"InnoDB: Error: tried to read %lu bytes at offset %lu %lu.\n"
2369
"InnoDB: Was only able to read %ld.\n",
2370
(ulong)n, (ulong)offset_high,
2371
(ulong)offset, (long)ret);
2372
#endif /* __WIN__ */
2376
retry = os_file_handle_error(NULL, "read");
2383
"InnoDB: Fatal error: cannot read from file."
2384
" OS error number %lu.\n",
2386
(ulong) GetLastError()
2398
/*******************************************************************//**
2399
Requests a synchronous positioned read operation. This function does not do
2400
any error handling. In case of error it returns FALSE.
2401
@return TRUE if request was successful, FALSE if fail */
2404
os_file_read_no_error_handling(
2405
/*===========================*/
2406
os_file_t file, /*!< in: handle to a file */
2407
void* buf, /*!< in: buffer where to read */
2408
ulint offset, /*!< in: least significant 32 bits of file
2409
offset where to read */
2410
ulint offset_high, /*!< in: most significant 32 bits of
2412
ulint n) /*!< in: number of bytes to read */
2421
#ifndef UNIV_HOTBACKUP
2423
#endif /* !UNIV_HOTBACKUP */
2425
ut_a((offset & 0xFFFFFFFFUL) == offset);
2428
os_bytes_read_since_printout += n;
2435
low = (DWORD) offset;
2436
high = (DWORD) offset_high;
2438
os_mutex_enter(os_file_count_mutex);
2439
os_n_pending_reads++;
2440
os_mutex_exit(os_file_count_mutex);
2442
#ifndef UNIV_HOTBACKUP
2443
/* Protect the seek / read operation with a mutex */
2444
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2446
os_mutex_enter(os_file_seek_mutexes[i]);
2447
#endif /* !UNIV_HOTBACKUP */
2449
ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2451
if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2453
#ifndef UNIV_HOTBACKUP
2454
os_mutex_exit(os_file_seek_mutexes[i]);
2455
#endif /* !UNIV_HOTBACKUP */
2457
os_mutex_enter(os_file_count_mutex);
2458
os_n_pending_reads--;
2459
os_mutex_exit(os_file_count_mutex);
2461
goto error_handling;
2464
ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2466
#ifndef UNIV_HOTBACKUP
2467
os_mutex_exit(os_file_seek_mutexes[i]);
2468
#endif /* !UNIV_HOTBACKUP */
2470
os_mutex_enter(os_file_count_mutex);
2471
os_n_pending_reads--;
2472
os_mutex_exit(os_file_count_mutex);
2474
if (ret && len == n) {
2481
os_bytes_read_since_printout += n;
2484
ret = os_file_pread(file, buf, n, offset, offset_high);
2486
if ((ulint)ret == n) {
2490
#endif /* __WIN__ */
2494
retry = os_file_handle_error_no_exit(NULL, "read");
2503
/*******************************************************************//**
2504
Rewind file to its start, read at most size - 1 bytes from it to str, and
2505
NUL-terminate str. All errors are silently ignored. This function is
2506
mostly meant to be used with temporary files. */
2509
os_file_read_string(
2510
/*================*/
2511
FILE* file, /*!< in: file to read from */
2512
char* str, /*!< in: buffer where to read */
2513
ulint size) /*!< in: size of buffer */
2522
flen = fread(str, 1, size - 1, file);
2526
/*******************************************************************//**
2527
Requests a synchronous write operation.
2528
@return TRUE if request was successful, FALSE if fail */
2533
const char* name, /*!< in: name of the file or path as a
2534
null-terminated string */
2535
os_file_t file, /*!< in: handle to a file */
2536
const void* buf, /*!< in: buffer from which to write */
2537
ulint offset, /*!< in: least significant 32 bits of file
2538
offset where to write */
2539
ulint offset_high, /*!< in: most significant 32 bits of
2541
ulint n) /*!< in: number of bytes to write */
2549
ulint n_retries = 0;
2551
#ifndef UNIV_HOTBACKUP
2553
#endif /* !UNIV_HOTBACKUP */
2555
ut_a((offset & 0xFFFFFFFF) == offset);
2563
low = (DWORD) offset;
2564
high = (DWORD) offset_high;
2566
os_mutex_enter(os_file_count_mutex);
2567
os_n_pending_writes++;
2568
os_mutex_exit(os_file_count_mutex);
2570
#ifndef UNIV_HOTBACKUP
2571
/* Protect the seek / write operation with a mutex */
2572
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2574
os_mutex_enter(os_file_seek_mutexes[i]);
2575
#endif /* !UNIV_HOTBACKUP */
2577
ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2579
if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2581
#ifndef UNIV_HOTBACKUP
2582
os_mutex_exit(os_file_seek_mutexes[i]);
2583
#endif /* !UNIV_HOTBACKUP */
2585
os_mutex_enter(os_file_count_mutex);
2586
os_n_pending_writes--;
2587
os_mutex_exit(os_file_count_mutex);
2589
ut_print_timestamp(stderr);
2592
" InnoDB: Error: File pointer positioning to"
2593
" file %s failed at\n"
2594
"InnoDB: offset %lu %lu. Operating system"
2595
" error number %lu.\n"
2596
"InnoDB: Some operating system error numbers"
2597
" are described at\n"
2599
REFMAN "operating-system-error-codes.html\n",
2600
name, (ulong) offset_high, (ulong) offset,
2601
(ulong) GetLastError());
2606
ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
2608
/* Always do fsync to reduce the probability that when the OS crashes,
2609
a database page is only partially physically written to disk. */
2611
# ifdef UNIV_DO_FLUSH
2612
if (!os_do_not_call_flush_at_each_write) {
2613
ut_a(TRUE == os_file_flush(file));
2615
# endif /* UNIV_DO_FLUSH */
2617
#ifndef UNIV_HOTBACKUP
2618
os_mutex_exit(os_file_seek_mutexes[i]);
2619
#endif /* !UNIV_HOTBACKUP */
2621
os_mutex_enter(os_file_count_mutex);
2622
os_n_pending_writes--;
2623
os_mutex_exit(os_file_count_mutex);
2625
if (ret && len == n) {
2630
/* If some background file system backup tool is running, then, at
2631
least in Windows 2000, we may get here a specific error. Let us
2632
retry the operation 100 times, with 1 second waits. */
2634
if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
2636
os_thread_sleep(1000000);
2643
if (!os_has_said_disk_full) {
2645
err = (ulint)GetLastError();
2647
ut_print_timestamp(stderr);
2650
" InnoDB: Error: Write to file %s failed"
2651
" at offset %lu %lu.\n"
2652
"InnoDB: %lu bytes should have been written,"
2653
" only %lu were written.\n"
2654
"InnoDB: Operating system error number %lu.\n"
2655
"InnoDB: Check that your OS and file system"
2656
" support files of this size.\n"
2657
"InnoDB: Check also that the disk is not full"
2658
" or a disk quota exceeded.\n",
2659
name, (ulong) offset_high, (ulong) offset,
2660
(ulong) n, (ulong) len, (ulong) err);
2662
if (strerror((int)err) != NULL) {
2664
"InnoDB: Error number %lu means '%s'.\n",
2665
(ulong) err, strerror((int)err));
2669
"InnoDB: Some operating system error numbers"
2670
" are described at\n"
2672
REFMAN "operating-system-error-codes.html\n");
2674
os_has_said_disk_full = TRUE;
2681
ret = os_file_pwrite(file, buf, n, offset, offset_high);
2683
if ((ulint)ret == n) {
2688
if (!os_has_said_disk_full) {
2690
ut_print_timestamp(stderr);
2693
" InnoDB: Error: Write to file %s failed"
2694
" at offset %lu %lu.\n"
2695
"InnoDB: %lu bytes should have been written,"
2696
" only %ld were written.\n"
2697
"InnoDB: Operating system error number %lu.\n"
2698
"InnoDB: Check that your OS and file system"
2699
" support files of this size.\n"
2700
"InnoDB: Check also that the disk is not full"
2701
" or a disk quota exceeded.\n",
2702
name, offset_high, offset, n, (long int)ret,
2704
if (strerror(errno) != NULL) {
2706
"InnoDB: Error number %lu means '%s'.\n",
2707
(ulint)errno, strerror(errno));
2711
"InnoDB: Some operating system error numbers"
2712
" are described at\n"
2714
REFMAN "operating-system-error-codes.html\n");
2716
os_has_said_disk_full = TRUE;
2723
/*******************************************************************//**
2724
Check the existence and type of the given file.
2725
@return TRUE if call succeeded */
2730
const char* path, /*!< in: pathname of the file */
2731
ibool* exists, /*!< out: TRUE if file exists */
2732
os_file_type_t* type) /*!< out: type of the file (if it exists) */
2736
struct _stat statinfo;
2738
ret = _stat(path, &statinfo);
2739
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2740
/* file does not exist */
2744
/* file exists, but stat call failed */
2746
os_file_handle_error_no_exit(path, "stat");
2751
if (_S_IFDIR & statinfo.st_mode) {
2752
*type = OS_FILE_TYPE_DIR;
2753
} else if (_S_IFREG & statinfo.st_mode) {
2754
*type = OS_FILE_TYPE_FILE;
2756
*type = OS_FILE_TYPE_UNKNOWN;
2764
struct stat statinfo;
2766
ret = stat(path, &statinfo);
2767
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2768
/* file does not exist */
2772
/* file exists, but stat call failed */
2774
os_file_handle_error_no_exit(path, "stat");
2779
if (S_ISDIR(statinfo.st_mode)) {
2780
*type = OS_FILE_TYPE_DIR;
2781
} else if (S_ISLNK(statinfo.st_mode)) {
2782
*type = OS_FILE_TYPE_LINK;
2783
} else if (S_ISREG(statinfo.st_mode)) {
2784
*type = OS_FILE_TYPE_FILE;
2786
*type = OS_FILE_TYPE_UNKNOWN;
2795
/*******************************************************************//**
2796
This function returns information about the specified file
2797
@return TRUE if stat information found */
2802
const char* path, /*!< in: pathname of the file */
2803
os_file_stat_t* stat_info) /*!< information of a file in a
2808
struct _stat statinfo;
2810
ret = _stat(path, &statinfo);
2811
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2812
/* file does not exist */
2816
/* file exists, but stat call failed */
2818
os_file_handle_error_no_exit(path, "stat");
2822
if (_S_IFDIR & statinfo.st_mode) {
2823
stat_info->type = OS_FILE_TYPE_DIR;
2824
} else if (_S_IFREG & statinfo.st_mode) {
2825
stat_info->type = OS_FILE_TYPE_FILE;
2827
stat_info->type = OS_FILE_TYPE_UNKNOWN;
2830
stat_info->ctime = statinfo.st_ctime;
2831
stat_info->atime = statinfo.st_atime;
2832
stat_info->mtime = statinfo.st_mtime;
2833
stat_info->size = statinfo.st_size;
2838
struct stat statinfo;
2840
ret = stat(path, &statinfo);
2842
if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2843
/* file does not exist */
2847
/* file exists, but stat call failed */
2849
os_file_handle_error_no_exit(path, "stat");
2854
if (S_ISDIR(statinfo.st_mode)) {
2855
stat_info->type = OS_FILE_TYPE_DIR;
2856
} else if (S_ISLNK(statinfo.st_mode)) {
2857
stat_info->type = OS_FILE_TYPE_LINK;
2858
} else if (S_ISREG(statinfo.st_mode)) {
2859
stat_info->type = OS_FILE_TYPE_FILE;
2861
stat_info->type = OS_FILE_TYPE_UNKNOWN;
2864
stat_info->ctime = statinfo.st_ctime;
2865
stat_info->atime = statinfo.st_atime;
2866
stat_info->mtime = statinfo.st_mtime;
2867
stat_info->size = statinfo.st_size;
2873
/* path name separator character */
2875
# define OS_FILE_PATH_SEPARATOR '\\'
2877
# define OS_FILE_PATH_SEPARATOR '/'
2880
/****************************************************************//**
2881
The function os_file_dirname returns a directory component of a
2882
null-terminated pathname string. In the usual case, dirname returns
2883
the string up to, but not including, the final '/', and basename
2884
is the component following the final '/'. Trailing '/' characters
2885
are not counted as part of the pathname.
2887
If path does not contain a slash, dirname returns the string ".".
2889
Concatenating the string returned by dirname, a "/", and the basename
2890
yields a complete pathname.
2892
The return value is a copy of the directory component of the pathname.
2893
The copy is allocated from heap. It is the caller's responsibility
2894
to free it after it is no longer needed.
2896
The following list of examples (taken from SUSv2) shows the strings
2897
returned by dirname and basename for different paths:
2899
path dirname basename
2900
"/usr/lib" "/usr" "lib"
2907
@return own: directory component of the pathname */
2912
const char* path) /*!< in: pathname */
2914
/* Find the offset of the last slash */
2915
const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
2917
/* No slash in the path, return "." */
2919
return(mem_strdup("."));
2922
/* Ok, there is a slash */
2924
if (last_slash == path) {
2925
/* last slash is the first char of the path */
2927
return(mem_strdup("/"));
2930
/* Non-trivial directory component */
2932
return(mem_strdupl(path, last_slash - path));
2935
/****************************************************************//**
2936
Creates all missing subdirectories along the given path.
2937
@return TRUE if call succeeded, FALSE otherwise */
2940
os_file_create_subdirs_if_needed(
2941
/*=============================*/
2942
const char* path) /*!< in: path name */
2945
ibool success, subdir_exists;
2946
os_file_type_t type;
2948
subdir = os_file_dirname(path);
2949
if (strlen(subdir) == 1
2950
&& (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
2951
/* subdir is root or cwd, nothing to do */
2957
/* Test if subdir exists */
2958
success = os_file_status(subdir, &subdir_exists, &type);
2959
if (success && !subdir_exists) {
2960
/* subdir does not exist, create it */
2961
success = os_file_create_subdirs_if_needed(subdir);
2967
success = os_file_create_directory(subdir, FALSE);
2975
#ifndef UNIV_HOTBACKUP
2976
/****************************************************************//**
2977
Returns a pointer to the nth slot in the aio array. The index is
zero-based and must be smaller than array->n_slots.
2978
@return pointer to slot */
2981
os_aio_array_get_nth_slot(
2982
/*======================*/
2983
os_aio_array_t* array, /*!< in: aio array */
2984
ulint index) /*!< in: index of the slot */
2986
/* Bounds check: index must denote an existing slot */
ut_a(index < array->n_slots);
2988
/* Plain pointer arithmetic on the slots array */
return((array->slots) + index);
2991
/************************************************************************//**
2992
Creates an aio wait array. The array gets a mutex, the events not_full and
is_empty, and n slots which are all marked as not reserved. Each slot gets
an event of its own.
NOTE(review): some lines of this function (for example the assignment of
n_slots, the return statement and any #ifdef directives) are missing from
this excerpt.
2993
@return own: aio array */
2996
os_aio_array_create(
2997
/*================*/
2998
ulint n, /*!< in: maximum number of pending aio operations
2999
allowed; n must be divisible by n_segments */
3000
ulint n_segments) /*!< in: number of segments in the aio array */
3002
os_aio_array_t* array;
3004
os_aio_slot_t* slot;
3009
ut_a(n_segments > 0);
3011
/* NOTE(review): the results of the ut_malloc() calls in this function
are used without a check -- presumably ut_malloc() does not return on
failure; confirm */
array = ut_malloc(sizeof(os_aio_array_t));
3013
array->mutex = os_mutex_create(NULL);
3014
array->not_full = os_event_create(NULL);
3015
array->is_empty = os_event_create(NULL);
3017
/* No slot is reserved in a new array (n_reserved is set to 0 below),
so the is_empty event starts out signaled */
os_event_set(array->is_empty);
3020
array->n_segments = n_segments;
3021
array->n_reserved = 0;
3022
/* NOTE(review): n * sizeof(...) is not checked for overflow here */
array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
3024
array->native_events = ut_malloc(n * sizeof(os_native_event_t));
3026
/* Initialize every slot as free and give it an event */
for (i = 0; i < n; i++) {
3027
slot = os_aio_array_get_nth_slot(array, i);
3030
slot->reserved = FALSE;
3032
slot->event = os_event_create(NULL);
3034
/* Store the handle of the slot's event in the slot's control block */
over = &(slot->control);
3036
over->hEvent = slot->event->handle;
3038
/* Keep a copy of the handle in native_events, at the same index as the
slot; os_aio_windows_handle() passes that array to
os_event_wait_multiple() */
*((array->native_events) + i) = over->hEvent;
3045
/************************************************************************//**
3046
Frees an aio wait array: the events of the individual slots, the
native_events array, the mutex, the not_full and is_empty events, and the
slots array.
NOTE(review): the line carrying the function name, the preprocessor lines
that open the two conditional sections closed below, and the release of
the array structure itself are not visible in this excerpt. */
3051
os_aio_array_t* array) /*!< in, own: array to free */
3056
/* Free the event that was created for each slot */
for (i = 0; i < array->n_slots; i++) {
3057
os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i);
3058
os_event_free(slot->event);
3060
#endif /* WIN_ASYNC_IO */
3063
ut_free(array->native_events);
3064
#endif /* __WIN__ */
3065
/* Free the synchronization objects of the array */
os_mutex_free(array->mutex);
3066
os_event_free(array->not_full);
3067
os_event_free(array->is_empty);
3069
ut_free(array->slots);
3073
/***********************************************************************
3074
Initializes the asynchronous io system. Creates one array each for ibuf
3075
and log i/o. Also creates one array each for read and write where each
3076
array is divided logically into n_read_segs and n_write_segs
3077
respectively. The caller must create an i/o handler thread for each
3078
segment in these arrays. This function also creates the sync array.
3079
No i/o handler thread needs to be created for that.
Global segment numbering as set up below: segment 0 is the insert buffer
thread, segment 1 the log thread, segments 2 .. 2 + n_read_segs - 1 are
read threads and the remaining segments up to n_segments - 1 are write
threads. One wait event is created for each of the n_segments segments.
NOTE(review): in this excerpt the line with the function name is missing
and the comment on n_slots_sync is not terminated, so the lines up to the
next comment terminator are lexically part of that comment; restore the
missing lines before compiling. */
3084
ulint n_per_seg, /*!< in: maximum number of pending aio
3085
operations allowed per segment */
3086
ulint n_read_segs, /*!< in: number of reader threads */
3087
ulint n_write_segs, /*!< in: number of writer threads */
3088
ulint n_slots_sync) /*!< in: number of slots in the sync aio
3092
ulint n_segments = 2 + n_read_segs + n_write_segs;
3094
ut_ad(n_segments >= 4);
3096
os_io_init_simple();
3098
for (i = 0; i < n_segments; i++) {
3099
srv_set_io_thread_op_info(i, "not started yet");
3103
/* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
3105
/* The ibuf and log arrays consist of a single segment each */
os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
3107
srv_io_thread_function[0] = "insert buffer thread";
3109
os_aio_log_array = os_aio_array_create(n_per_seg, 1);
3111
srv_io_thread_function[1] = "log thread";
3113
/* The read array holds n_per_seg slots for every read segment */
os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
3115
for (i = 2; i < 2 + n_read_segs; i++) {
3116
ut_a(i < SRV_MAX_N_IO_THREADS);
3117
srv_io_thread_function[i] = "read thread";
3120
/* The write array holds n_per_seg slots for every write segment */
os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
3122
for (i = 2 + n_read_segs; i < n_segments; i++) {
3123
ut_a(i < SRV_MAX_N_IO_THREADS);
3124
srv_io_thread_function[i] = "write thread";
3127
/* The sync array is a single segment with n_slots_sync slots; it is
not counted in n_segments */
os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
3129
os_aio_n_segments = n_segments;
3133
/* One wait event per global segment; the simulated aio handler of a
segment waits on its event in os_aio_simulated_handle() */
os_aio_segment_wait_events = ut_malloc(n_segments * sizeof(void*));
3135
for (i = 0; i < n_segments; i++) {
3136
os_aio_segment_wait_events[i] = os_event_create(NULL);
3139
/* Start of the interval that os_aio_print() uses for its per-second
figures */
os_last_printout = time(NULL);
3143
/***********************************************************************
3144
Frees the asynchronous io system: the ibuf, log, read, write and sync aio
arrays and the per-segment wait events. The global pointers and the
segment count are cleared afterwards.
NOTE(review): the line carrying the function name is missing from this
excerpt. */
3152
/* Free each aio array and clear the global pointer to it */
os_aio_array_free(os_aio_ibuf_array);
3153
os_aio_ibuf_array = NULL;
3154
os_aio_array_free(os_aio_log_array);
3155
os_aio_log_array = NULL;
3156
os_aio_array_free(os_aio_read_array);
3157
os_aio_read_array = NULL;
3158
os_aio_array_free(os_aio_write_array);
3159
os_aio_write_array = NULL;
3160
os_aio_array_free(os_aio_sync_array);
3161
os_aio_sync_array = NULL;
3163
/* Free the wait event of every segment, then the array that held them */
for (i = 0; i < os_aio_n_segments; i++) {
3164
os_event_free(os_aio_segment_wait_events[i]);
3167
ut_free(os_aio_segment_wait_events);
3168
os_aio_segment_wait_events = 0;
3169
os_aio_n_segments = 0;
3173
/************************************************************************//**
3174
Wakes up all async i/o threads in the array in Windows async i/o at
(NOTE(review): the end of this sentence is missing from this excerpt;
judging by the function name it presumably read "shutdown". The comment is
also left unterminated here.)
3178
os_aio_array_wake_win_aio_at_shutdown(
3179
/*==================================*/
3180
os_aio_array_t* array) /*!< in: aio array */
3184
/* Set the event of every slot in the array */
for (i = 0; i < array->n_slots; i++) {
3186
os_event_set((array->slots + i)->event);
3191
/************************************************************************//**
3192
Wakes up all async i/o threads so that they know to exit themselves in
(NOTE(review): the end of this sentence is missing from this excerpt;
judging by the function name it presumably read "shutdown".)
3196
os_aio_wake_all_threads_at_shutdown(void)
3197
/*=====================================*/
3202
/* This code wakes up all aio threads in Windows native aio. The read,
write, ibuf and log arrays are covered; the sync array is not. */
3203
os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
3204
os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
3205
os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
3206
os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
3208
/* This loop wakes up all simulated aio threads by setting the wait
event of every segment */
3210
for (i = 0; i < os_aio_n_segments; i++) {
3212
os_event_set(os_aio_segment_wait_events[i]);
3216
/************************************************************************//**
3217
Waits until there are no pending writes in os_aio_write_array. There can
3218
be other, synchronous, pending writes. */
3221
os_aio_wait_until_no_pending_writes(void)
3222
/*=====================================*/
3224
/* is_empty is set by os_aio_array_free_slot() when the number of
reserved slots drops to 0 and reset by os_aio_array_reserve_slot() when
the first slot is reserved */
os_event_wait(os_aio_write_array->is_empty);
3227
/**********************************************************************//**
3228
Calculates segment number for a slot. The array must be the ibuf, log,
read or write array (the last case is asserted).
NOTE(review): the statements of the ibuf and log branches and the return
statement are missing from this excerpt; os_aio_get_array_and_local_segment()
maps global segment 0 to the ibuf array and 1 to the log array.
3229
@return segment number (which is the number used by, for example,
3230
i/o-handler threads) */
3233
os_aio_get_segment_no_from_slot(
3234
/*============================*/
3235
os_aio_array_t* array, /*!< in: aio wait array */
3236
os_aio_slot_t* slot) /*!< in: slot in this array */
3241
if (array == os_aio_ibuf_array) {
3244
} else if (array == os_aio_log_array) {
3247
} else if (array == os_aio_read_array) {
3248
/* seg_len: number of slots in one segment of the read array */
seg_len = os_aio_read_array->n_slots
3249
/ os_aio_read_array->n_segments;
3251
/* Read segments are numbered from 2 upwards */
segment = 2 + slot->pos / seg_len;
3253
ut_a(array == os_aio_write_array);
3254
/* seg_len: number of slots in one segment of the write array */
seg_len = os_aio_write_array->n_slots
3255
/ os_aio_write_array->n_segments;
3257
/* Write segments are numbered after all read segments */
segment = os_aio_read_array->n_segments + 2
3258
+ slot->pos / seg_len;
3264
/**********************************************************************//**
3265
Calculates local segment number and aio array from global segment number.
Global segment 0 maps to the ibuf array and 1 to the log array; the next
os_aio_read_array->n_segments numbers map to the read array and all higher
numbers to the write array. The sync array is never returned.
NOTE(review): the assignments of the local segment in the ibuf and log
branches and the return statement are missing from this excerpt.
3266
@return local segment number within the aio array */
3269
os_aio_get_array_and_local_segment(
3270
/*===============================*/
3271
os_aio_array_t** array, /*!< out: aio wait array */
3272
ulint global_segment)/*!< in: global segment number */
3276
ut_a(global_segment < os_aio_n_segments);
3278
if (global_segment == 0) {
3279
*array = os_aio_ibuf_array;
3282
} else if (global_segment == 1) {
3283
*array = os_aio_log_array;
3286
} else if (global_segment < os_aio_read_array->n_segments + 2) {
3287
*array = os_aio_read_array;
3289
/* Subtract the two single-segment arrays that precede the read
segments */
segment = global_segment - 2;
3291
*array = os_aio_write_array;
3293
/* Subtract the ibuf, log and read segments that precede the write
segments */
segment = global_segment - (os_aio_read_array->n_segments + 2);
3299
/*******************************************************************//**
3300
Requests for a slot in the aio array. If no slot is available, waits until
3301
not_full-event becomes signaled.
The slot is searched for starting at the first slot of a preferred local
segment, which is derived from the file offset; if that search finds
nothing, the whole array is scanned. The reserved slot is filled in with
the request data while the array mutex is held.
NOTE(review): several lines of this function (local declarations, loop and
branch endings, some slot assignments, #ifdef directives and the return
statement) are missing from this excerpt, and the comments on buf, offset
and offset_high are not terminated here.
3302
@return pointer to slot */
3305
os_aio_array_reserve_slot(
3306
/*======================*/
3307
ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
3308
os_aio_array_t* array, /*!< in: aio array */
3309
fil_node_t* message1,/*!< in: message to be passed along with
3310
the aio operation */
3311
void* message2,/*!< in: message to be passed along with
3312
the aio operation */
3313
os_file_t file, /*!< in: file handle */
3314
const char* name, /*!< in: name of the file or path as a
3315
null-terminated string */
3316
void* buf, /*!< in: buffer where to read or from which
3318
ulint offset, /*!< in: least significant 32 bits of file
3320
ulint offset_high, /*!< in: most significant 32 bits of
3322
ulint len) /*!< in: length of the block to read or write */
3324
os_aio_slot_t* slot;
3326
OVERLAPPED* control;
3329
ulint slots_per_seg;
3332
/* No need of a mutex. Only reading constant fields */
3333
slots_per_seg = array->n_slots / array->n_segments;
3335
/* We attempt to keep adjacent blocks in the same local
3336
segment. This can help in merging IO requests when we are
3337
doing simulated AIO */
/* Offsets that agree in all bits above the lowest
UNIV_PAGE_SIZE_SHIFT + 6 bits get the same local segment. Only the low
32 bits of the file offset take part in this choice. */
3338
local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
3339
% array->n_segments;
3342
os_mutex_enter(array->mutex);
3344
/* All slots are in use: release the mutex while waiting for a slot to
be freed */
if (array->n_reserved == array->n_slots) {
3345
os_mutex_exit(array->mutex);
3347
if (!os_aio_use_native_aio) {
3348
/* If the handler threads are suspended, wake them
3349
so that we get more slots */
3351
os_aio_simulated_wake_handler_threads();
3354
/* not_full is set by os_aio_array_free_slot() when a slot of a full
array is released */
os_event_wait(array->not_full);
3359
/* First try to find a slot in the preferred local segment */
/* NOTE(review): the scan starts at the first slot of the preferred
segment but its upper bound is the end of the whole array */
3360
for (i = local_seg * slots_per_seg; i < array->n_slots; i++) {
3361
slot = os_aio_array_get_nth_slot(array, i);
3363
if (slot->reserved == FALSE) {
3368
/* Fall back to a full scan. We are guaranteed to find a slot */
3370
slot = os_aio_array_get_nth_slot(array, i);
3372
if (slot->reserved == FALSE) {
3378
ut_a(slot->reserved == FALSE);
3379
array->n_reserved++;
3381
/* First reservation: the array is no longer empty */
if (array->n_reserved == 1) {
3382
os_event_reset(array->is_empty);
3385
/* Last free slot taken: later callers have to wait on not_full */
if (array->n_reserved == array->n_slots) {
3386
os_event_reset(array->not_full);
3389
/* Record the request in the slot */
slot->reserved = TRUE;
3390
slot->reservation_time = time(NULL);
3391
slot->message1 = message1;
3392
slot->message2 = message2;
3398
slot->offset = offset;
3399
slot->offset_high = offset_high;
3400
slot->io_already_done = FALSE;
3403
/* Copy the 64-bit file position, given as two 32-bit halves, into the
OVERLAPPED structure of the slot and reset the slot's event */
control = &(slot->control);
3404
control->Offset = (DWORD)offset;
3405
control->OffsetHigh = (DWORD)offset_high;
3406
os_event_reset(slot->event);
3409
os_mutex_exit(array->mutex);
3414
/*******************************************************************//**
3415
Frees a slot in the aio array. The slot is marked as not reserved and the
reservation count of the array is decremented under the array mutex. The
not_full event is set if the array was full before the call, and the
is_empty event is set if no reserved slot remains. */
3418
os_aio_array_free_slot(
3419
/*===================*/
3420
os_aio_array_t* array, /*!< in: aio array */
3421
os_aio_slot_t* slot) /*!< in: pointer to slot */
3426
os_mutex_enter(array->mutex);
3428
ut_ad(slot->reserved);
3430
slot->reserved = FALSE;
3432
array->n_reserved--;
3434
/* The array was full until now: signal not_full, on which
os_aio_array_reserve_slot() waits when it finds no free slot */
if (array->n_reserved == array->n_slots - 1) {
3435
os_event_set(array->not_full);
3438
/* No reserved slot is left: signal is_empty, on which
os_aio_wait_until_no_pending_writes() waits */
if (array->n_reserved == 0) {
3439
os_event_set(array->is_empty);
3443
os_event_reset(slot->event);
3445
os_mutex_exit(array->mutex);
3448
/**********************************************************************//**
3449
Wakes up a simulated aio i/o-handler thread if it has something to do.
The slots of the given segment are scanned under the array mutex for a
reserved slot; the thread is woken by setting the wait event of the global
segment. Must only be called when simulated aio is in use (debug
assertion).
NOTE(review): the loop exit after a reserved slot is found and the
condition that guards the final os_event_set() are missing from this
excerpt, as is the end of the comment on global_segment. */
3452
os_aio_simulated_wake_handler_thread(
3453
/*=================================*/
3454
ulint global_segment) /*!< in: the number of the segment in the aio
3457
os_aio_array_t* array;
3458
os_aio_slot_t* slot;
3463
ut_ad(!os_aio_use_native_aio);
3465
segment = os_aio_get_array_and_local_segment(&array, global_segment);
3467
n = array->n_slots / array->n_segments;
3469
/* Look through n slots after the segment * n'th slot */
3471
os_mutex_enter(array->mutex);
3473
for (i = 0; i < n; i++) {
3474
slot = os_aio_array_get_nth_slot(array, i + segment * n);
3476
if (slot->reserved) {
3477
/* Found an i/o request */
3483
os_mutex_exit(array->mutex);
3486
/* os_aio_simulated_handle() waits on this event when it has found no
request in its segment */
os_event_set(os_aio_segment_wait_events[global_segment]);
3490
/**********************************************************************//**
3491
Wakes up simulated aio i/o-handler threads if they have something to do.
Also clears the flag os_aio_recommend_sleep_for_read_threads, which is set
by os_aio_simulated_put_read_threads_to_sleep(). */
3494
os_aio_simulated_wake_handler_threads(void)
3495
/*=======================================*/
3499
if (os_aio_use_native_aio) {
3500
/* We do not use simulated aio: do nothing */
3505
/* Read handler threads need not postpone their work any longer */
os_aio_recommend_sleep_for_read_threads = FALSE;
3507
/* Visit every global segment; each call wakes the handler of one
segment */
for (i = 0; i < os_aio_n_segments; i++) {
3508
os_aio_simulated_wake_handler_thread(i);
3512
/**********************************************************************//**
3513
This function can be called if one wants to post a batch of reads and
3514
prefers an i/o-handler thread to handle them all at once later. You must
3515
call os_aio_simulated_wake_handler_threads later to ensure the threads
3516
are not left sleeping!
The function sets os_aio_recommend_sleep_for_read_threads and resets the
wait events of all segments that belong to the read array. */
3519
os_aio_simulated_put_read_threads_to_sleep(void)
3520
/*============================================*/
3523
/* The idea of putting background IO threads to sleep is only for
3524
Windows when using simulated AIO. Windows XP seems to schedule
3525
background threads too eagerly to allow for coalescing during
3526
readahead requests. */
3528
os_aio_array_t* array;
3531
if (os_aio_use_native_aio) {
3532
/* We do not use simulated aio: do nothing */
3537
/* os_aio_simulated_handle() checks this flag before it looks for read
requests */
os_aio_recommend_sleep_for_read_threads = TRUE;
3539
for (g = 0; g < os_aio_n_segments; g++) {
3540
/* Only the array is of interest here; the returned local segment
number is not used */
os_aio_get_array_and_local_segment(&array, g);
3542
if (array == os_aio_read_array) {
3544
os_event_reset(os_aio_segment_wait_events[g]);
3547
#endif /* __WIN__ */
3550
/*******************************************************************//**
3551
Requests an asynchronous i/o operation.
A request with mode OS_AIO_SYNC that is made while native aio is not in
use is served directly with os_file_read() or os_file_write(). Otherwise a
slot is reserved in the aio array that matches the mode: the read or write
array for OS_AIO_NORMAL, the ibuf, log or sync array for the other modes.
With native aio the request is then issued with ReadFile() or WriteFile();
in the other case the simulated i/o-handler thread of the slot's segment
is woken.
NOTE(review): many lines of this function are missing from this excerpt
(the line with the function name, local declarations, branch endings,
#ifdef directives, the condition involving wake_later and the return
statements), and several parameter comments are not terminated here.
3552
@return TRUE if request was queued successfully, FALSE if fail */
3557
ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
3558
ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
3559
to OS_AIO_SIMULATED_WAKE_LATER: the
3560
last flag advises this function not to wake
3561
i/o-handler threads, but the caller will
3562
do the waking explicitly later, in this
3563
way the caller can post several requests in
3564
a batch; NOTE that the batch must not be
3565
so big that it exhausts the slots in aio
3566
arrays! NOTE that a simulated batch
3567
may introduce hidden chances of deadlocks,
3568
because i/os are not actually handled until
3569
all have been posted: use with great
3571
const char* name, /*!< in: name of the file or path as a
3572
null-terminated string */
3573
os_file_t file, /*!< in: handle to a file */
3574
void* buf, /*!< in: buffer where to read or from which
3576
ulint offset, /*!< in: least significant 32 bits of file
3577
offset where to read or write */
3578
ulint offset_high, /*!< in: most significant 32 bits of
3580
ulint n, /*!< in: number of bytes to read or write */
3581
fil_node_t* message1,/*!< in: message for the aio handler
3582
(can be used to identify a completed
3583
aio operation); ignored if mode is
3585
void* message2)/*!< in: message for the aio handler
3586
(can be used to identify a completed
3587
aio operation); ignored if mode is
3590
os_aio_array_t* array;
3591
os_aio_slot_t* slot;
3595
DWORD len = (DWORD) n;
3596
struct fil_node_struct * dummy_mess1;
3607
ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
3608
ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
3609
ut_ad(os_aio_validate());
3611
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
3612
mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
3614
if (mode == OS_AIO_SYNC
3616
&& !os_aio_use_native_aio
3619
/* This is actually an ordinary synchronous read or write:
3620
no need to use an i/o-handler thread. NOTE that if we use
3621
Windows async i/o, Windows does not allow us to use
3622
ordinary synchronous os_file_read etc. on the same file,
3623
therefore we have built a special mechanism for synchronous
3624
wait in the Windows case. */
3626
if (type == OS_FILE_READ) {
3627
return(os_file_read(file, buf, offset,
3631
ut_a(type == OS_FILE_WRITE);
3633
return(os_file_write(name, file, buf, offset, offset_high, n));
3637
/* Select the aio array that matches the requested mode */
if (mode == OS_AIO_NORMAL) {
3638
if (type == OS_FILE_READ) {
3639
array = os_aio_read_array;
3641
array = os_aio_write_array;
3643
} else if (mode == OS_AIO_IBUF) {
3644
ut_ad(type == OS_FILE_READ);
3645
/* Reduce probability of deadlock bugs in connection with ibuf:
3646
do not let the ibuf i/o handler sleep */
3650
array = os_aio_ibuf_array;
3651
} else if (mode == OS_AIO_LOG) {
3653
array = os_aio_log_array;
3654
} else if (mode == OS_AIO_SYNC) {
3655
array = os_aio_sync_array;
3657
array = NULL; /* Eliminate compiler warning */
3661
/* Reserve a slot that describes this request; the call waits if the
array has no free slot */
slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
3662
name, buf, offset, offset_high, n);
3663
if (type == OS_FILE_READ) {
3664
if (os_aio_use_native_aio) {
3667
/* len was initialized from n above; it is added to the read
statistics before the read is issued */
os_bytes_read_since_printout += len;
3669
ret = ReadFile(file, buf, (DWORD)n, &len,
3674
/* Simulated aio: wake the handler thread of the segment that the
reserved slot belongs to */
os_aio_simulated_wake_handler_thread(
3675
os_aio_get_segment_no_from_slot(
3679
} else if (type == OS_FILE_WRITE) {
3680
if (os_aio_use_native_aio) {
3683
ret = WriteFile(file, buf, (DWORD)n, &len,
3688
os_aio_simulated_wake_handler_thread(
3689
os_aio_get_segment_no_from_slot(
3698
/* Native aio: the request counts as queued if the call completed at
once with all n bytes transferred, or if it failed with
ERROR_IO_PENDING, i.e. the operation is still in progress */
if (os_aio_use_native_aio) {
3699
if ((ret && len == n)
3700
|| (!ret && GetLastError() == ERROR_IO_PENDING)) {
3701
/* aio was queued successfully! */
3703
if (mode == OS_AIO_SYNC) {
3704
/* We want a synchronous i/o operation on a
3705
file where we also use async i/o: in Windows
3706
we must use the same wait mechanism as for
3709
retval = os_aio_windows_handle(ULINT_UNDEFINED,
3721
err = 1; /* Fall through the next if */
3725
/* aio was queued successfully! */
3730
/* Failure path: give the slot back before the error is handled */
os_aio_array_free_slot(array, slot);
3732
retry = os_file_handle_error(name,
3733
type == OS_FILE_READ
3734
? "aio read" : "aio write");
3744
/**********************************************************************//**
3745
This function is only used in Windows asynchronous i/o.
3746
Waits for an aio operation to complete. This function is used to wait
3747
for the completed requests. The aio array of pending requests is divided
3748
into segments. The thread specifies which segment or slot it wants to wait
3749
for. NOTE: this function will also take care of freeing the aio slot,
3750
therefore no other thread is allowed to do the freeing!
If the operation did not transfer slot->len bytes and
os_file_handle_error() asks for a retry, the read or write is issued again
and waited for synchronously.
NOTE(review): several lines of this function are missing from this excerpt
(the message2 parameter, local declarations, branch endings, call
arguments and the return statement), and two comments are not terminated
here.
3751
@return TRUE if the aio operation succeeded */
3754
os_aio_windows_handle(
3755
/*==================*/
3756
ulint segment, /*!< in: the number of the segment in the aio
3757
arrays to wait for; segment 0 is the ibuf
3758
i/o thread, segment 1 the log i/o thread,
3759
then follow the non-ibuf read threads, and as
3760
the last are the non-ibuf write threads; if
3761
this is ULINT_UNDEFINED, then it means that
3762
sync aio is used, and this parameter is
3764
ulint pos, /*!< this parameter is used only in sync aio:
3765
wait for the aio slot at this position */
3766
fil_node_t**message1, /*!< out: the messages passed with the aio
3767
request; note that also in the case where
3768
the aio operation failed, these output
3769
parameters are valid and can be used to
3770
restart the operation, for example */
3772
ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
3774
/* Remember the caller's segment number: segment is overwritten with
the local segment number below */
ulint orig_seg = segment;
3775
os_aio_array_t* array;
3776
os_aio_slot_t* slot;
3784
/* ULINT_UNDEFINED selects the sync array; any other value is a global
segment number that is translated to an array and a local segment */
if (segment == ULINT_UNDEFINED) {
3785
array = os_aio_sync_array;
3788
segment = os_aio_get_array_and_local_segment(&array, segment);
3791
/* NOTE! We only access constant fields in os_aio_array. Therefore
3792
we do not have to acquire the protecting mutex yet */
3794
ut_ad(os_aio_validate());
3795
ut_ad(segment < array->n_segments);
3797
/* n: number of slots in one segment of the array */
n = array->n_slots / array->n_segments;
3799
/* Sync aio waits for the event of the single slot at pos; otherwise
the wait covers the events of a whole segment */
if (array == os_aio_sync_array) {
3800
os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);
3803
srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
3804
i = os_event_wait_multiple(n,
3805
(array->native_events)
3809
os_mutex_enter(array->mutex);
3811
/* i is used as the index of the completed slot within the segment */
slot = os_aio_array_get_nth_slot(array, i + segment * n);
3813
ut_a(slot->reserved);
3815
if (orig_seg != ULINT_UNDEFINED) {
3816
srv_set_io_thread_op_info(orig_seg,
3817
"get windows aio return value");
3820
/* The last argument (TRUE) makes GetOverlappedResult() wait until the
operation has completed; the number of bytes transferred is stored in
len */
ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE);
3822
/* Hand the messages of the request back to the caller; as documented
above they are valid also when the operation failed */
*message1 = slot->message1;
3823
*message2 = slot->message2;
3827
/* Success requires that all slot->len bytes were transferred */
if (ret && len == slot->len) {
3830
#ifdef UNIV_DO_FLUSH
3831
if (slot->type == OS_FILE_WRITE
3832
&& !os_do_not_call_flush_at_each_write) {
3833
ut_a(TRUE == os_file_flush(slot->file));
3835
#endif /* UNIV_DO_FLUSH */
3836
} else if (os_file_handle_error(slot->name, "Windows aio")) {
3844
os_mutex_exit(array->mutex);
3847
/* retry failed read/write operation synchronously.
3848
No need to hold array->mutex. */
3850
switch (slot->type) {
3852
ret = WriteFile(slot->file, slot->buf,
3858
ret = ReadFile(slot->file, slot->buf,
3867
if (!ret && GetLastError() == ERROR_IO_PENDING) {
3868
/* aio was queued successfully!
3869
We want a synchronous i/o operation on a
3870
file where we also use async i/o: in Windows
3871
we must use the same wait mechanism as for
3874
ret = GetOverlappedResult(slot->file,
3879
ret_val = ret && len == slot->len;
3882
os_aio_array_free_slot(array, slot);
3888
/**********************************************************************//**
3889
Does simulated aio. This function should be called by an i/o-handler
thread.
Outline of the visible code: the slots of the caller's segment are
searched for a request, preferring one whose i/o is already done, then the
oldest request that is at least 2 seconds old, then the request at the
lowest offset. Requests that directly follow the chosen one in the same
file are collected, up to OS_AIO_MERGE_N_CONSECUTIVE, and served with one
synchronous os_file_read() or os_file_write() call while the array mutex is
released. If the segment holds no request, the thread waits on the wait
event of its segment.
NOTE(review): many lines of this function are missing from this excerpt
(the message2 parameter, local declarations, labels, branch endings and
return statements), and several comments are not terminated here.
3891
@return TRUE if the aio operation succeeded */
3894
os_aio_simulated_handle(
3895
/*====================*/
3896
ulint global_segment, /*!< in: the number of the segment in the aio
3897
arrays to wait for; segment 0 is the ibuf
3898
i/o thread, segment 1 the log i/o thread,
3899
then follow the non-ibuf read threads, and as
3900
the last are the non-ibuf write threads */
3901
fil_node_t**message1, /*!< out: the messages passed with the aio
3902
request; note that also in the case where
3903
the aio operation failed, these output
3904
parameters are valid and can be used to
3905
restart the operation, for example */
3907
ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
3909
os_aio_array_t* array;
3911
os_aio_slot_t* slot;
3912
os_aio_slot_t* slot2;
3913
/* Requests that are served together by one i/o call */
os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
3914
ulint n_consecutive;
3917
ulint lowest_offset;
3921
/* Unaligned allocation behind combined_buf; NULL when the buffer of the
request itself is used */
byte* combined_buf2;
3926
segment = os_aio_get_array_and_local_segment(&array, global_segment);
3929
/* NOTE! We only access constant fields in os_aio_array. Therefore
3930
we do not have to acquire the protecting mutex yet */
3932
srv_set_io_thread_op_info(global_segment,
3933
"looking for i/o requests (a)");
3934
ut_ad(os_aio_validate());
3935
ut_ad(segment < array->n_segments);
3937
/* n: number of slots in one segment of the array */
n = array->n_slots / array->n_segments;
3939
/* Look through n slots after the segment * n'th slot */
3941
/* The flag is set by os_aio_simulated_put_read_threads_to_sleep() and
cleared by os_aio_simulated_wake_handler_threads() */
if (array == os_aio_read_array
3942
&& os_aio_recommend_sleep_for_read_threads) {
3944
/* Give other threads chance to add several i/os to the array
3947
goto recommended_sleep;
3950
os_mutex_enter(array->mutex);
3952
srv_set_io_thread_op_info(global_segment,
3953
"looking for i/o requests (b)");
3955
/* Check if there is a slot for which the i/o has already been
3958
for (i = 0; i < n; i++) {
3959
slot = os_aio_array_get_nth_slot(array, i + segment * n);
3961
if (slot->reserved && slot->io_already_done) {
3963
if (os_aio_print_debug) {
3965
"InnoDB: i/o for slot %lu"
3966
" already done, returning\n",
3978
/* If there are at least 2 seconds old requests, then pick the oldest
3979
one to prevent starvation. If several requests have the same age,
3980
then pick the one at the lowest offset. */
3983
lowest_offset = ULINT_MAX;
3985
for (i = 0; i < n; i++) {
3986
slot = os_aio_array_get_nth_slot(array, i + segment * n);
3988
if (slot->reserved) {
3989
/* Age of the request in whole seconds since it was reserved */
age = (ulint)difftime(time(NULL),
3990
slot->reservation_time);
3992
if ((age >= 2 && age > biggest_age)
3993
|| (age >= 2 && age == biggest_age
3994
&& slot->offset < lowest_offset)) {
3996
/* Found an i/o request */
3997
consecutive_ios[0] = slot;
4002
lowest_offset = slot->offset;
4007
if (n_consecutive == 0) {
4008
/* There were no old requests. Look for an i/o request at the
4009
lowest offset in the array (we ignore the high 32 bits of the
4010
offset in these heuristics) */
4012
lowest_offset = ULINT_MAX;
4014
for (i = 0; i < n; i++) {
4015
slot = os_aio_array_get_nth_slot(array,
4018
if (slot->reserved && slot->offset < lowest_offset) {
4020
/* Found an i/o request */
4021
consecutive_ios[0] = slot;
4025
lowest_offset = slot->offset;
4030
if (n_consecutive == 0) {
4032
/* No i/o requested at the moment */
4037
slot = consecutive_ios[0];
4039
/* Check if there are several consecutive blocks to read or write */
4042
for (i = 0; i < n; i++) {
4043
slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
4045
/* slot2 continues slot if it starts exactly where slot ends (low
32 bits of the offset) and has the same high 32 bits, the same type and
the same file */
if (slot2->reserved && slot2 != slot
4046
&& slot2->offset == slot->offset + slot->len
4047
/* check that sum does not wrap over */
4048
&& slot->offset + slot->len > slot->offset
4049
&& slot2->offset_high == slot->offset_high
4050
&& slot2->type == slot->type
4051
&& slot2->file == slot->file) {
4053
/* Found a consecutive i/o request */
4055
consecutive_ios[n_consecutive] = slot2;
4060
/* Keep looking for a follower as long as consecutive_ios has room */
if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
4062
goto consecutive_loop;
4069
srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
4071
/* We have now collected n_consecutive i/o requests in the array;
4072
allocate a single buffer which can hold all data, and perform the
4076
slot = consecutive_ios[0];
4078
for (i = 0; i < n_consecutive; i++) {
4079
total_len += consecutive_ios[i]->len;
4082
if (n_consecutive == 1) {
4083
/* We can use the buffer of the i/o request */
4084
combined_buf = slot->buf;
4085
combined_buf2 = NULL;
4087
/* Allocate UNIV_PAGE_SIZE extra bytes so that the buffer can be
aligned to UNIV_PAGE_SIZE below; combined_buf2 keeps the pointer that is
passed to ut_free() later */
combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE);
4089
ut_a(combined_buf2);
4091
combined_buf = ut_align(combined_buf2, UNIV_PAGE_SIZE);
4094
/* We release the array mutex for the time of the i/o: NOTE that
4095
this assumes that there is just one i/o-handler thread serving
4096
a single segment of slots! */
4098
os_mutex_exit(array->mutex);
4100
if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
4101
/* Copy the buffers to the combined buffer */
4104
for (i = 0; i < n_consecutive; i++) {
4106
ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
4107
consecutive_ios[i]->len);
4108
offs += consecutive_ios[i]->len;
4112
srv_set_io_thread_op_info(global_segment, "doing file i/o");
4114
if (os_aio_print_debug) {
4116
"InnoDB: doing i/o of type %lu at offset %lu %lu,"
4118
(ulong) slot->type, (ulong) slot->offset_high,
4119
(ulong) slot->offset, (ulong) total_len);
4122
/* Do the i/o with ordinary, synchronous i/o functions: */
4123
if (slot->type == OS_FILE_WRITE) {
4124
ret = os_file_write(slot->name, slot->file, combined_buf,
4125
slot->offset, slot->offset_high,
4128
ret = os_file_read(slot->file, combined_buf,
4129
slot->offset, slot->offset_high, total_len);
4133
srv_set_io_thread_op_info(global_segment, "file i/o done");
4137
"aio: %lu consecutive %lu:th segment, first offs %lu blocks\n",
4138
n_consecutive, global_segment, slot->offset / UNIV_PAGE_SIZE);
4141
if (slot->type == OS_FILE_READ && n_consecutive > 1) {
4142
/* Copy the combined buffer to individual buffers */
4145
for (i = 0; i < n_consecutive; i++) {
4147
ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
4148
consecutive_ios[i]->len);
4149
offs += consecutive_ios[i]->len;
4153
if (combined_buf2) {
4154
ut_free(combined_buf2);
4157
os_mutex_enter(array->mutex);
4159
/* Mark the i/os done in slots */
4161
for (i = 0; i < n_consecutive; i++) {
4162
consecutive_ios[i]->io_already_done = TRUE;
4165
/* We return the messages for the first slot now, and if there were
4166
several slots, the messages will be returned with subsequent calls
4171
ut_a(slot->reserved);
4173
*message1 = slot->message1;
4174
*message2 = slot->message2;
4178
os_mutex_exit(array->mutex);
4180
os_aio_array_free_slot(array, slot);
4185
srv_set_io_thread_op_info(global_segment, "resetting wait event");
4187
/* We wait here until there again can be i/os in the segment
4190
os_event_reset(os_aio_segment_wait_events[global_segment]);
4192
os_mutex_exit(array->mutex);
4195
srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
4197
os_event_wait(os_aio_segment_wait_events[global_segment]);
4199
if (os_aio_print_debug) {
4201
"InnoDB: i/o handler thread for i/o"
4202
" segment %lu wakes up\n",
4203
(ulong) global_segment);
4209
/**********************************************************************//**
4210
Validates the consistency of an aio array. Under the array mutex it is
asserted that the array has at least one slot and one segment, that every
reserved slot has a positive length, and that the reservation count of the
array matches a local count of the reserved slots.
NOTE(review): the statement that increments the local count and the
return statement are missing from this excerpt.
4211
@return TRUE if ok */
4214
os_aio_array_validate(
4215
/*==================*/
4216
os_aio_array_t* array) /*!< in: aio wait array */
4218
os_aio_slot_t* slot;
4219
ulint n_reserved = 0;
4224
os_mutex_enter(array->mutex);
4226
ut_a(array->n_slots > 0);
4227
ut_a(array->n_segments > 0);
4229
for (i = 0; i < array->n_slots; i++) {
4230
slot = os_aio_array_get_nth_slot(array, i);
4232
if (slot->reserved) {
4234
ut_a(slot->len > 0);
4238
/* The counter kept in the array must agree with the local count */
ut_a(array->n_reserved == n_reserved);
4240
os_mutex_exit(array->mutex);
4245
/**********************************************************************//**
4246
Validates the consistency of the aio system by validating each of the five
aio arrays (read, write, ibuf, log and sync) with os_aio_array_validate().
The values returned by those calls are not examined in the visible code.
4247
@return TRUE if ok */
4250
os_aio_validate(void)
4251
/*=================*/
4253
os_aio_array_validate(os_aio_read_array);
4254
os_aio_array_validate(os_aio_write_array);
4255
os_aio_array_validate(os_aio_ibuf_array);
4256
os_aio_array_validate(os_aio_log_array);
4257
os_aio_array_validate(os_aio_sync_array);
4262
/**********************************************************************//**
4263
Prints info of the aio arrays: the state of every i/o thread, the number
of reserved slots in the read, write, ibuf, log and sync arrays, the
pending flush counters, the totals of OS file reads, writes and fsyncs,
and per-second figures for the time since the previous printout. At the
end the "old" counters and os_last_printout are updated, so the next call
reports on a new interval.
NOTE(review): several lines of this function are missing from this excerpt
(the line with the function name, local declarations, the label and jumps
that chain the five arrays, branch endings, some fprintf() calls and the
divisions by time_elapsed). */
4268
FILE* file) /*!< in: file where to print */
4270
os_aio_array_t* array;
4271
os_aio_slot_t* slot;
4273
time_t current_time;
4274
double time_elapsed;
4275
double avg_bytes_read;
4278
/* One line for each i/o thread */
for (i = 0; i < srv_n_file_io_threads; i++) {
4279
fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i,
4280
srv_io_thread_op_info[i],
4281
srv_io_thread_function[i]);
4284
if (os_aio_segment_wait_events[i]->is_set) {
4285
fprintf(file, " ev set");
4289
fprintf(file, "\n");
4292
fputs("Pending normal aio reads:", file);
4294
/* The code from here to the printing of n_reserved is applied to one
array at a time, starting with the read array */
array = os_aio_read_array;
4298
os_mutex_enter(array->mutex);
4300
ut_a(array->n_slots > 0);
4301
ut_a(array->n_segments > 0);
4305
for (i = 0; i < array->n_slots; i++) {
4306
slot = os_aio_array_get_nth_slot(array, i);
4308
if (slot->reserved) {
4311
/* NOTE(review): this line writes to stderr, not to file;
presumably it is debug output inside a disabled conditional section that
is not visible here -- confirm */
fprintf(stderr, "Reserved slot, messages %p %p\n",
4312
(void*) slot->message1,
4313
(void*) slot->message2);
4315
ut_a(slot->len > 0);
4319
ut_a(array->n_reserved == n_reserved);
4321
fprintf(file, " %lu", (ulong) n_reserved);
4323
os_mutex_exit(array->mutex);
4325
/* Move on to the next array: read, write, ibuf, log, sync */
if (array == os_aio_read_array) {
4326
fputs(", aio writes:", file);
4328
array = os_aio_write_array;
4333
if (array == os_aio_write_array) {
4334
fputs(",\n ibuf aio reads:", file);
4335
array = os_aio_ibuf_array;
4340
if (array == os_aio_ibuf_array) {
4341
fputs(", log i/o's:", file);
4342
array = os_aio_log_array;
4347
if (array == os_aio_log_array) {
4348
fputs(", sync i/o's:", file);
4349
array = os_aio_sync_array;
4355
current_time = time(NULL);
4356
/* Seconds since the previous printout; presumably the added 0.001
keeps the per-second divisions away from a zero divisor -- the divisions
themselves are not visible here */
time_elapsed = 0.001 + difftime(current_time, os_last_printout);
4359
"Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
4360
"%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
4361
(ulong) fil_n_pending_log_flushes,
4362
(ulong) fil_n_pending_tablespace_flushes,
4363
(ulong) os_n_file_reads, (ulong) os_n_file_writes,
4364
(ulong) os_n_fsyncs);
4366
if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) {
4368
"%lu pending preads, %lu pending pwrites\n",
4369
(ulong) os_file_n_pending_preads,
4370
(ulong) os_file_n_pending_pwrites);
4373
/* No read since the previous printout: report 0 instead of dividing
by zero */
if (os_n_file_reads == os_n_file_reads_old) {
4374
avg_bytes_read = 0.0;
4376
avg_bytes_read = (double) os_bytes_read_since_printout
4377
/ (os_n_file_reads - os_n_file_reads_old);
4381
"%.2f reads/s, %lu avg bytes/read,"
4382
" %.2f writes/s, %.2f fsyncs/s\n",
4383
(os_n_file_reads - os_n_file_reads_old)
4385
(ulong)avg_bytes_read,
4386
(os_n_file_writes - os_n_file_writes_old)
4388
(os_n_fsyncs - os_n_fsyncs_old)
4391
/* Start a new statistics interval */
os_n_file_reads_old = os_n_file_reads;
4392
os_n_file_writes_old = os_n_file_writes;
4393
os_n_fsyncs_old = os_n_fsyncs;
4394
os_bytes_read_since_printout = 0;
4396
os_last_printout = current_time;
4399
/**********************************************************************//**
4400
Refreshes the statistics used to print per-second averages: the "old"
counters are set to the current counter values, the count of bytes read
since the last printout is cleared and the printout time is set to the
current time. os_aio_print() ends with the same updates. */
4403
os_aio_refresh_stats(void)
4404
/*======================*/
4406
os_n_file_reads_old = os_n_file_reads;
4407
os_n_file_writes_old = os_n_file_writes;
4408
os_n_fsyncs_old = os_n_fsyncs;
4409
os_bytes_read_since_printout = 0;
4411
os_last_printout = time(NULL);
4415
/**********************************************************************//**
4416
Checks that all slots in the system have been freed, that is, there are
4417
no pending io operations.
The reservation counts of the read, write, ibuf, log and sync arrays are
added up, each one read under the mutex of its array. The mutexes are
taken one after another, so the sum is not a snapshot of a single moment.
NOTE(review): the declaration of n_res and the final comparison and return
statements are missing from this excerpt.
4418
@return TRUE if all free */
4421
os_aio_all_slots_free(void)
4422
/*=======================*/
4424
os_aio_array_t* array;
4427
array = os_aio_read_array;
4429
os_mutex_enter(array->mutex);
4431
n_res += array->n_reserved;
4433
os_mutex_exit(array->mutex);
4435
array = os_aio_write_array;
4437
os_mutex_enter(array->mutex);
4439
n_res += array->n_reserved;
4441
os_mutex_exit(array->mutex);
4443
array = os_aio_ibuf_array;
4445
os_mutex_enter(array->mutex);
4447
n_res += array->n_reserved;
4449
os_mutex_exit(array->mutex);
4451
array = os_aio_log_array;
4453
os_mutex_enter(array->mutex);
4455
n_res += array->n_reserved;
4457
os_mutex_exit(array->mutex);
4459
array = os_aio_sync_array;
4461
os_mutex_enter(array->mutex);
4463
n_res += array->n_reserved;
4465
os_mutex_exit(array->mutex);
4474
#endif /* UNIV_DEBUG */
4476
#endif /* !UNIV_HOTBACKUP */