Drizzled Public API Documentation

os0file.cc
00001 /*****************************************************************************
00002 
00003 Copyright (C) 1995, 2010, Innobase Oy. All Rights Reserved.
00004 Copyright (C) 2009, Percona Inc.
00005 
00006 Portions of this file contain modifications contributed and copyrighted
00007 by Percona Inc.. Those modifications are
00008 gratefully acknowledged and are described briefly in the InnoDB
00009 documentation. The contributions by Percona Inc. are incorporated with
00010 their permission, and subject to the conditions contained in the file
00011 COPYING.Percona.
00012 
00013 This program is free software; you can redistribute it and/or modify it under
00014 the terms of the GNU General Public License as published by the Free Software
00015 Foundation; version 2 of the License.
00016 
00017 This program is distributed in the hope that it will be useful, but WITHOUT
00018 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00019 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
00020 
00021 You should have received a copy of the GNU General Public License along with
00022 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
00023 St, Fifth Floor, Boston, MA 02110-1301 USA
00024 
00025 *****************************************************************************/
00026 
00027 /**************************************************/
00034 #include "os0file.h"
00035 
00036 #ifdef UNIV_NONINL
00037 #include "os0file.ic"
00038 #endif
00039 
00040 #include "ut0mem.h"
00041 #include "srv0srv.h"
00042 #include "srv0start.h"
00043 #include "fil0fil.h"
00044 #include "buf0buf.h"
00045 #include <errno.h>
00046 #include <fcntl.h>
00047 #include <limits.h>
00048 #include <unistd.h>
00049 #ifndef UNIV_HOTBACKUP
00050 # include "os0sync.h"
00051 # include "os0thread.h"
00052 #else /* !UNIV_HOTBACKUP */
00053 # ifdef __WIN__
00054 /* Add includes for the _stat() call to compile on Windows */
00055 #  include <sys/types.h>
00056 #  include <sys/stat.h>
00057 # endif /* __WIN__ */
00058 #endif /* !UNIV_HOTBACKUP */
00059 
00060 #if defined(LINUX_NATIVE_AIO)
00061 #include <libaio.h>
00062 #endif
00063 
00064 /* This specifies the file permissions InnoDB uses when it creates files in
00065 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
00066 my_umask */
/* Despite the "umask" in the name, the value is a set of permission bits:
outside Windows the built-in default is read and write for the owner and
the group; the Windows build defines the variable as 0. */
00067 
00068 #ifndef __WIN__
00069 
00070 UNIV_INTERN ulint os_innodb_umask
00071       = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
00072 #else
00073 
00074 UNIV_INTERN ulint os_innodb_umask   = 0;
00075 #endif
00076 
/* The flag below exists only in builds that define UNIV_DO_FLUSH; in all
other builds there is no such variable at all. */
00077 #ifdef UNIV_DO_FLUSH
00078 /* If the following is set to TRUE, we do not call os_file_flush in every
00079 os_file_write. We can set this TRUE when the doublewrite buffer is used. */
00080 UNIV_INTERN ibool os_do_not_call_flush_at_each_write  = FALSE;
00081 #else
00082 /* We do not call os_file_flush in every os_file_write. */
00083 #endif /* UNIV_DO_FLUSH */
00084 
00085 #ifndef UNIV_HOTBACKUP
00086 /* We use these mutexes to protect lseek + file i/o operation, if the
00087 OS does not provide an atomic pread or pwrite, or similar */
/* All OS_FILE_N_SEEK_MUTEXES entries of the array are created in
os_io_init_simple(). */
00088 #define OS_FILE_N_SEEK_MUTEXES  16
00089 UNIV_INTERN os_mutex_t  os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
00090 
00091 /* In simulated aio, merge at most this many consecutive i/os */
00092 #define OS_AIO_MERGE_N_CONSECUTIVE  64
00093 
00094 /**********************************************************************
00095 
00096 InnoDB AIO Implementation:
00097 =========================
00098 
00099 We support native AIO for Windows and Linux. On all other platforms
00100 we simulate AIO with special io-threads that service the IO requests.
00101 
00102 Simulated AIO:
00103 ==============
00104 
00105 On platforms where we 'simulate' AIO, the following is a rough
00106 explanation of the high-level design.
00107 There are four io-threads (for ibuf, log, read, write).
00108 All synchronous IO requests are serviced by the calling thread using
00109 os_file_write/os_file_read. The Asynchronous requests are queued up
00110 in an array (there are four such arrays) by the calling thread. 
00111 Later these requests are picked up by the io-thread and are serviced
00112 synchronously.
00113 
00114 Windows native AIO:
00115 ==================
00116 
00117 If srv_use_native_aio is not set, then Windows follows the same
00118 code path as simulated AIO. If the flag is set, then the native AIO
00119 interface is used. On Windows, one of the limitations is that if a file
00120 is opened for AIO, no synchronous IO can be done on it. Therefore we have
00121 an extra fifth array to queue up synchronous IO requests.
00122 There are innodb_file_io_threads helper threads. These threads work
00123 on the four arrays mentioned above in Simulated AIO. No thread is
00124 required for the sync array.
00125 If a synchronous IO request is made, it is first queued in the sync
00126 array. Then the calling thread itself waits on the request, thus
00127 making the call synchronous.
00128 If an AIO request is made the calling thread not only queues it in the
00129 array but also submits the requests. The helper thread then collects
00130 the completed IO request and calls completion routine on it.
00131 
00132 Linux native AIO:
00133 =================
00134 
00135 If we have libaio installed on the system and innodb_use_native_aio
00136 is set to TRUE we follow the code path of native AIO, otherwise we
00137 do simulated AIO.
00138 There are innodb_file_io_threads helper threads. These threads work
00139 on the four arrays mentioned above in Simulated AIO.
00140 If a synchronous IO request is made, it is handled by calling
00141 os_file_write/os_file_read.
00142 If an AIO request is made the calling thread not only queues it in the
00143 array but also submits the requests. The helper thread then collects
00144 the completed IO request and calls completion routine on it.
00145 
00146 **********************************************************************/
00147 
/* Debug switch for the AIO code; off by default.
NOTE(review): the code that reads it is outside this chunk - presumably it
turns on extra diagnostic printing; confirm. */
00149 UNIV_INTERN ibool os_aio_print_debug  = FALSE;
00150 
00151 #ifdef UNIV_PFS_IO
00152 /* Keys to register InnoDB I/O with performance schema */
/* NOTE(review): judging by the names only, there is one key each for data
files, log files and temporary files. */
00153 UNIV_INTERN mysql_pfs_key_t  innodb_file_data_key;
00154 UNIV_INTERN mysql_pfs_key_t  innodb_file_log_key;
00155 UNIV_INTERN mysql_pfs_key_t  innodb_file_temp_key;
00156 #endif /* UNIV_PFS_IO */
00157 
/* One request slot of an AIO array (see the design comment above: requests
are queued in arrays and later serviced or collected by the io-threads).
NOTE(review): the code that fills and consumes slots is outside this chunk;
the per-field notes marked "presumably" are inferred from names and types
only and must be confirmed before they are relied on. */
00159 typedef struct os_aio_slot_struct os_aio_slot_t;
00160 
00162 struct os_aio_slot_struct{
00163   ibool   is_read;  /* presumably TRUE for a read request */
00164   ulint   pos;    /* presumably the index of this slot in its array */
00166   ibool   reserved; /* presumably TRUE while the slot is in use */
00167   time_t    reservation_time; /* presumably the time of reservation */
00168   ulint   len;    /* presumably the transfer length in bytes */
00170   byte*   buf;    /* presumably the data buffer of the request */
00171   ulint   type;   /* request type code; its values are not visible here */
00172   ulint   offset;   /* presumably the low part of the file offset */
00174   ulint   offset_high;  /* presumably the high part of the file offset */
00175   os_file_t file;   /* presumably the file the request operates on */
00176   const char* name;   /* presumably the name of that file */
00177   ibool   io_already_done; /* presumably set once the i/o itself is done */
00182   fil_node_t* message1; /* presumably caller data returned on completion */
00183   void*   message2; /* presumably caller data returned on completion */
00187 #ifdef WIN_ASYNC_IO
00188   HANDLE    handle;   /* Windows handle; its role is not visible here */
00190   OVERLAPPED  control;  /* Windows counterpart of the Linux control block */
00192 #elif defined(LINUX_NATIVE_AIO)
00193   struct iocb control;  /* Linux control block for aio */
00194   int   n_bytes;  /* bytes written/read. */
00195   int   ret;    /* AIO return code */
00196 #endif
00197 };
00198 
/* An array of AIO request slots; the design comment above describes one
such array each for read, write, ibuf and log requests plus a sync array.
NOTE(review): the code that manipulates the arrays is outside this chunk;
the notes marked "presumably" are inferred from names and types only. */
00200 typedef struct os_aio_array_struct  os_aio_array_t;
00201 
00203 struct os_aio_array_struct{
00204   os_mutex_t  mutex;  /* presumably protects the fields of this array */
00205   os_event_t  not_full; /* presumably signals that a slot is free */
00209   os_event_t  is_empty; /* presumably signals that nothing is pending */
00213   ulint   n_slots; /* number of slots; also the size of aio_events */
00216   ulint   n_segments; /* number of segments; one aio_ctx per segment */
00221   ulint   cur_seg; /* presumably the segment to use for the next request */
00225   ulint   n_reserved; /* presumably the number of reserved slots */
00228   os_aio_slot_t*  slots;  /* presumably points to the n_slots slots */
00229 #ifdef __WIN__
00230   HANDLE*   handles; /* presumably the handles of the slots, as an array */
00237 #endif
00238 
00239 #if defined(LINUX_NATIVE_AIO)
00240   io_context_t*   aio_ctx;
00241         /* completion queue for IO. There is 
00242         one such queue per segment. Each thread
00243         will work on one ctx exclusively. */
00244   struct io_event*  aio_events;
00245         /* The array to collect completed IOs.
00246         There is one such event for each
00247         possible pending IO. The size of the
00248         array is equal to n_slots. */
00249 #endif
00250 };
00251 
/* Tuning constants for Linux native AIO.
NOTE(review): their users are outside this chunk, so units are assumptions:
the reap timeout is presumably in nanoseconds (0.5 s) and the retry sleep
presumably in microseconds (0.5 s); the last one is presumably the number
of AIO setup attempts made before giving up. Confirm at the call sites. */
00252 #if defined(LINUX_NATIVE_AIO)
00253 
00254 #define OS_AIO_REAP_TIMEOUT (500000000UL)
00255 
00257 #define OS_AIO_IO_SETUP_RETRY_SLEEP (500000UL)
00258 
00260 #define OS_AIO_IO_SETUP_RETRY_ATTEMPTS  5
00261 #endif
00262 
/* NOTE(review): presumably one event per AIO segment that its handler
thread can wait on; the users are outside this chunk. */
00264 static os_event_t*  os_aio_segment_wait_events  = NULL;
00265 
/* The request arrays named in the design comment above: read, write, ibuf
and log, plus the sync array. All of them start out as NULL; the code that
allocates them is outside this chunk. */
00268 static os_aio_array_t*  os_aio_read_array = NULL; 
00269 static os_aio_array_t*  os_aio_write_array  = NULL; 
00270 static os_aio_array_t*  os_aio_ibuf_array = NULL; 
00271 static os_aio_array_t*  os_aio_log_array  = NULL; 
00272 static os_aio_array_t*  os_aio_sync_array = NULL; 
00273 /* @} */
00274 
/* Total number of AIO segments; ULINT_UNDEFINED until it is assigned
(the assignment is outside this chunk). */
00276 static ulint  os_aio_n_segments = ULINT_UNDEFINED;
00277 
/* NOTE(review): presumably a hint that read handler threads should wait
for more requests before starting i/o; confirm against its users. */
00280 static ibool  os_aio_recommend_sleep_for_read_threads = FALSE;
00281 #endif /* !UNIV_HOTBACKUP */
00282 
/* File i/o statistics, all starting at zero.
NOTE(review): the code that updates and prints them is outside this chunk;
judging by the names, the *_old variables hold the values seen at the
previous printout and os_last_printout the time of that printout. */
00283 UNIV_INTERN ulint os_n_file_reads   = 0;
00284 UNIV_INTERN ulint os_bytes_read_since_printout = 0;
00285 UNIV_INTERN ulint os_n_file_writes  = 0;
00286 UNIV_INTERN ulint os_n_fsyncs   = 0;
00287 UNIV_INTERN ulint os_n_file_reads_old = 0;
00288 UNIV_INTERN ulint os_n_file_writes_old  = 0;
00289 UNIV_INTERN ulint os_n_fsyncs_old   = 0;
00290 UNIV_INTERN time_t  os_last_printout;
00291 
/* Set by os_file_handle_error_cond_exit() once the "Disk is full" warning
has been printed, so that the warning is printed only once. */
00292 UNIV_INTERN ibool os_has_said_disk_full = FALSE;
00293 
00294 #ifndef UNIV_HOTBACKUP
00295 
/* Created in os_io_init_simple(). NOTE(review): presumably protects the
pending-operation counters below; confirm against their users. */
00296 static os_mutex_t os_file_count_mutex;
00297 #endif /* !UNIV_HOTBACKUP */
00298 
/* Counters of i/o operations currently in progress (presumably; they are
only initialized here and updated outside this chunk). */
00299 UNIV_INTERN ulint os_file_n_pending_preads  = 0;
00301 UNIV_INTERN ulint os_file_n_pending_pwrites = 0;
00303 UNIV_INTERN ulint os_n_pending_writes = 0;
00305 UNIV_INTERN ulint os_n_pending_reads = 0;
00306 
00307 /***********************************************************************/
00311 UNIV_INTERN
00312 ulint
00313 os_get_os_version(void)
00314 /*===================*/
00315 {
00316 #ifdef __WIN__
00317   OSVERSIONINFO   os_info;
00318 
00319   os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
00320 
00321   ut_a(GetVersionEx(&os_info));
00322 
00323   if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) {
00324     return(OS_WIN31);
00325   } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
00326     return(OS_WIN95);
00327   } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
00328     switch (os_info.dwMajorVersion) {
00329     case 3:
00330     case 4:
00331       return OS_WINNT;
00332     case 5:
00333       return (os_info.dwMinorVersion == 0) ? OS_WIN2000
00334                    : OS_WINXP;
00335     case 6:
00336       return (os_info.dwMinorVersion == 0) ? OS_WINVISTA
00337                    : OS_WIN7;
00338     default:
00339       return OS_WIN7;
00340     }
00341   } else {
00342     ut_error;
00343     return(0);
00344   }
00345 #else
00346   ut_error;
00347 
00348   return(0);
00349 #endif
00350 }
00351 
00352 /***********************************************************************/
00358 UNIV_INTERN
00359 ulint
00360 os_file_get_last_error(
00361 /*===================*/
00362   ibool report_all_errors)  
00364 {
00365   ulint err;
00366 
00367 #ifdef __WIN__
00368 
00369   err = (ulint) GetLastError();
00370 
00371   if (report_all_errors
00372       || (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) {
00373 
00374     ut_print_timestamp(stderr);
00375     fprintf(stderr,
00376       "  InnoDB: Operating system error number %lu"
00377       " in a file operation.\n", (ulong) err);
00378 
00379     if (err == ERROR_PATH_NOT_FOUND) {
00380       fprintf(stderr,
00381         "InnoDB: The error means the system"
00382         " cannot find the path specified.\n");
00383 
00384       if (srv_is_being_started) {
00385         fprintf(stderr,
00386           "InnoDB: If you are installing InnoDB,"
00387           " remember that you must create\n"
00388           "InnoDB: directories yourself, InnoDB"
00389           " does not create them.\n");
00390       }
00391     } else if (err == ERROR_ACCESS_DENIED) {
00392       fprintf(stderr,
00393         "InnoDB: The error means mysqld does not have"
00394         " the access rights to\n"
00395         "InnoDB: the directory. It may also be"
00396         " you have created a subdirectory\n"
00397         "InnoDB: of the same name as a data file.\n");
00398     } else if (err == ERROR_SHARING_VIOLATION
00399          || err == ERROR_LOCK_VIOLATION) {
00400       fprintf(stderr,
00401         "InnoDB: The error means that another program"
00402         " is using InnoDB's files.\n"
00403         "InnoDB: This might be a backup or antivirus"
00404         " software or another instance\n"
00405         "InnoDB: of MySQL."
00406         " Please close it to get rid of this error.\n");
00407     } else if (err == ERROR_WORKING_SET_QUOTA
00408          || err == ERROR_NO_SYSTEM_RESOURCES) {
00409       fprintf(stderr,
00410         "InnoDB: The error means that there are no"
00411         " sufficient system resources or quota to"
00412         " complete the operation.\n");
00413     } else if (err == ERROR_OPERATION_ABORTED) {
00414       fprintf(stderr,
00415         "InnoDB: The error means that the I/O"
00416         " operation has been aborted\n"
00417         "InnoDB: because of either a thread exit"
00418         " or an application request.\n"
00419         "InnoDB: Retry attempt is made.\n");
00420     } else {
00421       fprintf(stderr,
00422         "InnoDB: Some operating system error numbers"
00423         " are described at\n"
00424         "InnoDB: "
00425         REFMAN
00426         "operating-system-error-codes.html\n");
00427     }
00428   }
00429 
00430   fflush(stderr);
00431 
00432   if (err == ERROR_FILE_NOT_FOUND) {
00433     return(OS_FILE_NOT_FOUND);
00434   } else if (err == ERROR_DISK_FULL) {
00435     return(OS_FILE_DISK_FULL);
00436   } else if (err == ERROR_FILE_EXISTS) {
00437     return(OS_FILE_ALREADY_EXISTS);
00438   } else if (err == ERROR_SHARING_VIOLATION
00439        || err == ERROR_LOCK_VIOLATION) {
00440     return(OS_FILE_SHARING_VIOLATION);
00441   } else if (err == ERROR_WORKING_SET_QUOTA
00442        || err == ERROR_NO_SYSTEM_RESOURCES) {
00443     return(OS_FILE_INSUFFICIENT_RESOURCE);
00444   } else if (err == ERROR_OPERATION_ABORTED) {
00445     return(OS_FILE_OPERATION_ABORTED);
00446   } else {
00447     return(100 + err);
00448   }
00449 #else
00450   err = (ulint) errno;
00451 
00452   if (report_all_errors
00453       || (err != ENOSPC && err != EEXIST)) {
00454 
00455     ut_print_timestamp(stderr);
00456     fprintf(stderr,
00457       "  InnoDB: Operating system error number %lu"
00458       " in a file operation.\n", (ulong) err);
00459 
00460     if (err == ENOENT) {
00461       fprintf(stderr,
00462         "InnoDB: The error means the system"
00463         " cannot find the path specified.\n");
00464 
00465       if (srv_is_being_started) {
00466         fprintf(stderr,
00467           "InnoDB: If you are installing InnoDB,"
00468           " remember that you must create\n"
00469           "InnoDB: directories yourself, InnoDB"
00470           " does not create them.\n");
00471       }
00472     } else if (err == EACCES) {
00473       fprintf(stderr,
00474         "InnoDB: The error means mysqld does not have"
00475         " the access rights to\n"
00476         "InnoDB: the directory.\n");
00477     } else {
00478       if (strerror((int)err) != NULL) {
00479         fprintf(stderr,
00480           "InnoDB: Error number %lu"
00481           " means '%s'.\n",
00482           err, strerror((int)err));
00483       }
00484 
00485       fprintf(stderr,
00486         "InnoDB: Some operating system"
00487         " error numbers are described at\n"
00488         "InnoDB: "
00489         REFMAN
00490         "operating-system-error-codes.html\n");
00491     }
00492   }
00493 
00494   fflush(stderr);
00495 
00496   switch (err) {
00497   case ENOSPC:
00498     return(OS_FILE_DISK_FULL);
00499   case ENOENT:
00500     return(OS_FILE_NOT_FOUND);
00501   case EEXIST:
00502     return(OS_FILE_ALREADY_EXISTS);
00503   case EXDEV:
00504   case ENOTDIR:
00505   case EISDIR:
00506     return(OS_FILE_PATH_ERROR);
00507   case EAGAIN:
00508     if (srv_use_native_aio) {
00509       return(OS_FILE_AIO_RESOURCES_RESERVED);
00510     }
00511     break;
00512   case EINTR:
00513     if (srv_use_native_aio) {
00514       return(OS_FILE_AIO_INTERRUPTED);
00515     }
00516     break;
00517   }
00518   return(100 + err);
00519 #endif
00520 }
00521 
00522 /****************************************************************/
00527 static
00528 ibool
00529 os_file_handle_error_cond_exit(
00530 /*===========================*/
00531   const char* name,   
00532   const char* operation,  
00533   ibool   should_exit)  
00535 {
00536   ulint err;
00537 
00538   err = os_file_get_last_error(FALSE);
00539 
00540   if (err == OS_FILE_DISK_FULL) {
00541     /* We only print a warning about disk full once */
00542 
00543     if (os_has_said_disk_full) {
00544 
00545       return(FALSE);
00546     }
00547 
00548     if (name) {
00549       ut_print_timestamp(stderr);
00550       fprintf(stderr,
00551         "  InnoDB: Encountered a problem with"
00552         " file %s\n", name);
00553     }
00554 
00555     ut_print_timestamp(stderr);
00556     fprintf(stderr,
00557       "  InnoDB: Disk is full. Try to clean the disk"
00558       " to free space.\n");
00559 
00560     os_has_said_disk_full = TRUE;
00561 
00562     fflush(stderr);
00563 
00564     return(FALSE);
00565   } else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
00566 
00567     return(TRUE);
00568   } else if (err == OS_FILE_AIO_INTERRUPTED) {
00569 
00570     return(TRUE);
00571   } else if (err == OS_FILE_ALREADY_EXISTS
00572        || err == OS_FILE_PATH_ERROR) {
00573 
00574     return(FALSE);
00575   } else if (err == OS_FILE_SHARING_VIOLATION) {
00576 
00577     os_thread_sleep(10000000);  /* 10 sec */
00578     return(TRUE);
00579   } else if (err == OS_FILE_INSUFFICIENT_RESOURCE) {
00580 
00581     os_thread_sleep(100000);  /* 100 ms */
00582     return(TRUE);
00583   } else if (err == OS_FILE_OPERATION_ABORTED) {
00584 
00585     os_thread_sleep(100000);  /* 100 ms */
00586     return(TRUE);
00587   } else {
00588     if (name) {
00589       fprintf(stderr, "InnoDB: File name %s\n", name);
00590     }
00591 
00592     fprintf(stderr, "InnoDB: File operation call: '%s'.\n",
00593       operation);
00594 
00595     if (should_exit) {
00596       fprintf(stderr, "InnoDB: Cannot continue operation.\n");
00597 
00598       fflush(stderr);
00599 
00600       exit(1);
00601     }
00602   }
00603 
00604   return(FALSE);
00605 }
00606 
00607 /****************************************************************/
00610 static
00611 ibool
00612 os_file_handle_error(
00613 /*=================*/
00614   const char* name, 
00615   const char* operation)
00616 {
00617   /* exit in case of unknown error */
00618   return(os_file_handle_error_cond_exit(name, operation, TRUE));
00619 }
00620 
00621 /****************************************************************/
00624 static
00625 ibool
00626 os_file_handle_error_no_exit(
00627 /*=========================*/
00628   const char* name, 
00629   const char* operation)
00630 {
00631   /* don't exit in case of unknown error */
00632   return(os_file_handle_error_cond_exit(name, operation, FALSE));
00633 }
00634 
/* USE_FILE_LOCK decides whether os_file_lock() is compiled and whether a
file opened for reading and writing is locked right after it has been
opened. It is defined by default and removed again for the builds named
in the comment below. */
00635 #undef USE_FILE_LOCK
00636 #define USE_FILE_LOCK
00637 #if defined(UNIV_HOTBACKUP) || defined(__WIN__)
00638 /* InnoDB Hot Backup does not lock the data files.
00639  * On Windows, mandatory locking is used.
00640  */
00641 # undef USE_FILE_LOCK
00642 #endif
00643 #ifdef USE_FILE_LOCK
00644 /****************************************************************/
/** Tries to place a write lock (F_WRLCK) on a whole file. The request is
made with F_SETLK, so the call does not wait if the lock is held by
somebody else. A failure is reported on stderr.
@param fd    descriptor of the file to lock
@param name  name of the file, used in the messages only
@return 0 if the lock was obtained, -1 if not */
static
int
os_file_lock(
/*=========*/
  int   fd,
  const char* name)
{
  struct flock lk;
  int   lock_errno;

  lk.l_type = F_WRLCK;
  lk.l_whence = SEEK_SET;
  /* start 0 with length 0 means: from the beginning to the end of file */
  lk.l_start = lk.l_len = 0;

  if (fcntl(fd, F_SETLK, &lk) == -1) {
    /* Take a copy at once: the fprintf() below is allowed to
    change errno, and the value is needed again afterwards. */
    lock_errno = errno;

    fprintf(stderr,
      "InnoDB: Unable to lock %s, error: %d\n", name,
      lock_errno);

    if (lock_errno == EAGAIN || lock_errno == EACCES) {
      fprintf(stderr,
        "InnoDB: Check that you do not already have"
        " another drizzled process\n"
        "InnoDB: using the same InnoDB data"
        " or log files.\n");
    }

    return(-1);
  }

  return(0);
}
00675 #endif /* USE_FILE_LOCK */
00676 
00677 #ifndef UNIV_HOTBACKUP
00678 /****************************************************************/
00680 UNIV_INTERN
00681 void
00682 os_io_init_simple(void)
00683 /*===================*/
00684 {
00685   ulint i;
00686 
00687   os_file_count_mutex = os_mutex_create();
00688 
00689   for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
00690     os_file_seek_mutexes[i] = os_mutex_create();
00691   }
00692 }
00693 
00694 /***********************************************************************/
00698 UNIV_INTERN
00699 FILE*
00700 os_file_create_tmpfile(void)
00701 /*========================*/
00702 {
00703   FILE* file  = NULL;
00704   int fd  = innobase_mysql_tmpfile();
00705 
00706   if (fd >= 0) {
00707     file = fdopen(fd, "w+b");
00708   }
00709 
00710   if (!file) {
00711     ut_print_timestamp(stderr);
00712     fprintf(stderr,
00713       "  InnoDB: Error: unable to create temporary file;"
00714       " errno: %d\n", errno);
00715     if (fd >= 0) {
00716       close(fd);
00717     }
00718   }
00719 
00720   return(file);
00721 }
00722 #endif /* !UNIV_HOTBACKUP */
00723 
00724 /***********************************************************************/
00730 UNIV_INTERN
00731 os_file_dir_t
00732 os_file_opendir(
00733 /*============*/
00734   const char* dirname,  
00736   ibool   error_is_fatal) 
00741 {
00742   os_file_dir_t   dir;
00743 #ifdef __WIN__
00744   LPWIN32_FIND_DATA lpFindFileData;
00745   char      path[OS_FILE_MAX_PATH + 3];
00746 
00747   ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
00748 
00749   strcpy(path, dirname);
00750   strcpy(path + strlen(path), "\\*");
00751 
00752   /* Note that in Windows opening the 'directory stream' also retrieves
00753   the first entry in the directory. Since it is '.', that is no problem,
00754   as we will skip over the '.' and '..' entries anyway. */
00755 
00756   lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
00757 
00758   dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
00759 
00760   ut_free(lpFindFileData);
00761 
00762   if (dir == INVALID_HANDLE_VALUE) {
00763 
00764     if (error_is_fatal) {
00765       os_file_handle_error(dirname, "opendir");
00766     }
00767 
00768     return(NULL);
00769   }
00770 
00771   return(dir);
00772 #else
00773   dir = opendir(dirname);
00774 
00775   if (dir == NULL && error_is_fatal) {
00776     os_file_handle_error(dirname, "opendir");
00777   }
00778 
00779   return(dir);
00780 #endif
00781 }
00782 
00783 /***********************************************************************/
00786 UNIV_INTERN
00787 int
00788 os_file_closedir(
00789 /*=============*/
00790   os_file_dir_t dir)  
00791 {
00792 #ifdef __WIN__
00793   BOOL    ret;
00794 
00795   ret = FindClose(dir);
00796 
00797   if (!ret) {
00798     os_file_handle_error_no_exit(NULL, "closedir");
00799 
00800     return(-1);
00801   }
00802 
00803   return(0);
00804 #else
00805   int ret;
00806 
00807   ret = closedir(dir);
00808 
00809   if (ret) {
00810     os_file_handle_error_no_exit(NULL, "closedir");
00811   }
00812 
00813   return(ret);
00814 #endif
00815 }
00816 
00817 /***********************************************************************/
00821 UNIV_INTERN
00822 int
00823 os_file_readdir_next_file(
00824 /*======================*/
00825   const char* dirname,
00826   os_file_dir_t dir,  
00827   os_file_stat_t* info) 
00828 {
00829 #ifdef __WIN__
00830   LPWIN32_FIND_DATA lpFindFileData;
00831   BOOL      ret;
00832 
00833   lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
00834 next_file:
00835   ret = FindNextFile(dir, lpFindFileData);
00836 
00837   if (ret) {
00838     ut_a(strlen((char *) lpFindFileData->cFileName)
00839          < OS_FILE_MAX_PATH);
00840 
00841     if (strcmp((char *) lpFindFileData->cFileName, ".") == 0
00842         || strcmp((char *) lpFindFileData->cFileName, "..") == 0) {
00843 
00844       goto next_file;
00845     }
00846 
00847     strcpy(info->name, (char *) lpFindFileData->cFileName);
00848 
00849     info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
00850       + (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
00851          << 32);
00852 
00853     if (lpFindFileData->dwFileAttributes
00854         & FILE_ATTRIBUTE_REPARSE_POINT) {
00855       /* TODO: test Windows symlinks */
00856       /* TODO: MySQL has apparently its own symlink
00857       implementation in Windows, dbname.sym can
00858       redirect a database directory:
00859       REFMAN "windows-symbolic-links.html" */
00860       info->type = OS_FILE_TYPE_LINK;
00861     } else if (lpFindFileData->dwFileAttributes
00862          & FILE_ATTRIBUTE_DIRECTORY) {
00863       info->type = OS_FILE_TYPE_DIR;
00864     } else {
00865       /* It is probably safest to assume that all other
00866       file types are normal. Better to check them rather
00867       than blindly skip them. */
00868 
00869       info->type = OS_FILE_TYPE_FILE;
00870     }
00871   }
00872 
00873   ut_free(lpFindFileData);
00874 
00875   if (ret) {
00876     return(0);
00877   } else if (GetLastError() == ERROR_NO_MORE_FILES) {
00878 
00879     return(1);
00880   } else {
00881     os_file_handle_error_no_exit(dirname,
00882                "readdir_next_file");
00883     return(-1);
00884   }
00885 #else
00886   struct dirent*  ent;
00887   char*   full_path;
00888   int   ret;
00889   struct stat statinfo;
00890 #ifdef HAVE_READDIR_R
00891   char    dirent_buf[sizeof(struct dirent)
00892            + _POSIX_PATH_MAX + 100];
00893   /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
00894   the max file name len; but in most standards, the
00895   length is NAME_MAX; we add 100 to be even safer */
00896 #endif
00897 
00898 next_file:
00899 
00900 #ifdef HAVE_READDIR_R
00901   ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent);
00902 
00903   if (ret != 0
00904 #ifdef UNIV_AIX
00905       /* On AIX, only if we got non-NULL 'ent' (result) value and
00906       a non-zero 'ret' (return) value, it indicates a failed
00907       readdir_r() call. An NULL 'ent' with an non-zero 'ret'
00908       would indicate the "end of the directory" is reached. */
00909       && ent != NULL
00910 #endif
00911      ) {
00912     fprintf(stderr,
00913       "InnoDB: cannot read directory %s, error %lu\n",
00914       dirname, (ulong)ret);
00915 
00916     return(-1);
00917   }
00918 
00919   if (ent == NULL) {
00920     /* End of directory */
00921 
00922     return(1);
00923   }
00924 
00925   ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
00926 #else
00927   ent = readdir(dir);
00928 
00929   if (ent == NULL) {
00930 
00931     return(1);
00932   }
00933 #endif
00934   ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
00935 
00936   if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
00937 
00938     goto next_file;
00939   }
00940 
00941   strcpy(info->name, ent->d_name);
00942 
00943   full_path = static_cast<char* >(ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10));
00944 
00945   sprintf(full_path, "%s/%s", dirname, ent->d_name);
00946 
00947   ret = stat(full_path, &statinfo);
00948 
00949   if (ret) {
00950 
00951     if (errno == ENOENT) {
00952       /* readdir() returned a file that does not exist,
00953       it must have been deleted in the meantime. Do what
00954       would have happened if the file was deleted before
00955       readdir() - ignore and go to the next entry.
00956       If this is the last entry then info->name will still
00957       contain the name of the deleted file when this
00958       function returns, but this is not an issue since the
00959       caller shouldn't be looking at info when end of
00960       directory is returned. */
00961 
00962       ut_free(full_path);
00963 
00964       goto next_file;
00965     }
00966 
00967     os_file_handle_error_no_exit(full_path, "stat");
00968 
00969     ut_free(full_path);
00970 
00971     return(-1);
00972   }
00973 
00974   info->size = (ib_int64_t)statinfo.st_size;
00975 
00976   if (S_ISDIR(statinfo.st_mode)) {
00977     info->type = OS_FILE_TYPE_DIR;
00978   } else if (S_ISLNK(statinfo.st_mode)) {
00979     info->type = OS_FILE_TYPE_LINK;
00980   } else if (S_ISREG(statinfo.st_mode)) {
00981     info->type = OS_FILE_TYPE_FILE;
00982   } else {
00983     info->type = OS_FILE_TYPE_UNKNOWN;
00984   }
00985 
00986   ut_free(full_path);
00987 
00988   return(0);
00989 #endif
00990 }
00991 
00992 /*****************************************************************/
00998 UNIV_INTERN
00999 ibool
01000 os_file_create_directory(
01001 /*=====================*/
01002   const char* pathname, 
01004   ibool   fail_if_exists) 
01006 {
01007 #ifdef __WIN__
01008   BOOL  rcode;
01009 
01010   rcode = CreateDirectory((LPCTSTR) pathname, NULL);
01011   if (!(rcode != 0
01012         || (GetLastError() == ERROR_ALREADY_EXISTS
01013       && !fail_if_exists))) {
01014     /* failure */
01015     os_file_handle_error(pathname, "CreateDirectory");
01016 
01017     return(FALSE);
01018   }
01019 
01020   return (TRUE);
01021 #else
01022   int rcode;
01023 
01024   rcode = mkdir(pathname, 0770);
01025 
01026   if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
01027     /* failure */
01028     os_file_handle_error(pathname, "mkdir");
01029 
01030     return(FALSE);
01031   }
01032 
01033   return (TRUE);
01034 #endif
01035 }
01036 
01037 /****************************************************************/
01043 UNIV_INTERN
01044 os_file_t
01045 os_file_create_simple_func(
01046 /*=======================*/
01047   const char* name, 
01049   ulint   create_mode,
01056   ulint   access_type,
01058   ibool*    success)
01059 {
01060 #ifdef __WIN__
01061   os_file_t file;
01062   DWORD   create_flag;
01063   DWORD   access;
01064   DWORD   attributes  = 0;
01065   ibool   retry;
01066 
01067 try_again:
01068   ut_a(name);
01069 
01070   if (create_mode == OS_FILE_OPEN) {
01071     create_flag = OPEN_EXISTING;
01072   } else if (create_mode == OS_FILE_CREATE) {
01073     create_flag = CREATE_NEW;
01074   } else if (create_mode == OS_FILE_CREATE_PATH) {
01075     /* create subdirs along the path if needed  */
01076     *success = os_file_create_subdirs_if_needed(name);
01077     if (!*success) {
01078       ut_error;
01079     }
01080     create_flag = CREATE_NEW;
01081     create_mode = OS_FILE_CREATE;
01082   } else {
01083     create_flag = 0;
01084     ut_error;
01085   }
01086 
01087   if (access_type == OS_FILE_READ_ONLY) {
01088     access = GENERIC_READ;
01089   } else if (access_type == OS_FILE_READ_WRITE) {
01090     access = GENERIC_READ | GENERIC_WRITE;
01091   } else {
01092     access = 0;
01093     ut_error;
01094   }
01095 
01096   file = CreateFile((LPCTSTR) name,
01097         access,
01098         FILE_SHARE_READ | FILE_SHARE_WRITE,
01099         /* file can be read and written also
01100         by other processes */
01101         NULL, /* default security attributes */
01102         create_flag,
01103         attributes,
01104         NULL);  
01106   if (file == INVALID_HANDLE_VALUE) {
01107     *success = FALSE;
01108 
01109     retry = os_file_handle_error(name,
01110                create_mode == OS_FILE_OPEN ?
01111                "open" : "create");
01112     if (retry) {
01113       goto try_again;
01114     }
01115   } else {
01116     *success = TRUE;
01117   }
01118 
01119   return(file);
01120 #else /* __WIN__ */
01121   os_file_t file;
01122   int   create_flag;
01123   ibool   retry;
01124 
01125 try_again:
01126   ut_a(name);
01127 
01128   if (create_mode == OS_FILE_OPEN) {
01129     if (access_type == OS_FILE_READ_ONLY) {
01130       create_flag = O_RDONLY;
01131     } else {
01132       create_flag = O_RDWR;
01133     }
01134   } else if (create_mode == OS_FILE_CREATE) {
01135     create_flag = O_RDWR | O_CREAT | O_EXCL;
01136   } else if (create_mode == OS_FILE_CREATE_PATH) {
01137     /* create subdirs along the path if needed  */
01138     *success = os_file_create_subdirs_if_needed(name);
01139     if (!*success) {
01140       return (-1);
01141     }
01142     create_flag = O_RDWR | O_CREAT | O_EXCL;
01143     create_mode = OS_FILE_CREATE;
01144   } else {
01145     create_flag = 0;
01146     ut_error;
01147   }
01148 
01149   if (create_mode == OS_FILE_CREATE) {
01150     file = open(name, create_flag, S_IRUSR | S_IWUSR
01151           | S_IRGRP | S_IWGRP);
01152   } else {
01153     file = open(name, create_flag);
01154   }
01155 
01156   if (file == -1) {
01157     *success = FALSE;
01158 
01159     retry = os_file_handle_error(name,
01160                create_mode == OS_FILE_OPEN ?
01161                "open" : "create");
01162     if (retry) {
01163       goto try_again;
01164     }
01165 #ifdef USE_FILE_LOCK
01166   } else if (access_type == OS_FILE_READ_WRITE
01167        && os_file_lock(file, name)) {
01168     *success = FALSE;
01169     close(file);
01170     file = -1;
01171 #endif
01172   } else {
01173     *success = TRUE;
01174   }
01175 
01176   return(file);
01177 #endif /* __WIN__ */
01178 }
01179 
01180 /****************************************************************/
01186 UNIV_INTERN
01187 os_file_t
01188 os_file_create_simple_no_error_handling_func(
01189 /*=========================================*/
01190   const char* name, 
01192   ulint   create_mode,
01196   ulint   access_type,
01200   ibool*    success)
01201 {
01202 #ifdef __WIN__
01203   os_file_t file;
01204   DWORD   create_flag;
01205   DWORD   access;
01206   DWORD   attributes  = 0;
01207   DWORD   share_mode  = FILE_SHARE_READ | FILE_SHARE_WRITE;
01208 
01209   ut_a(name);
01210 
01211   if (create_mode == OS_FILE_OPEN) {
01212     create_flag = OPEN_EXISTING;
01213   } else if (create_mode == OS_FILE_CREATE) {
01214     create_flag = CREATE_NEW;
01215   } else {
01216     create_flag = 0;
01217     ut_error;
01218   }
01219 
01220   if (access_type == OS_FILE_READ_ONLY) {
01221     access = GENERIC_READ;
01222   } else if (access_type == OS_FILE_READ_WRITE) {
01223     access = GENERIC_READ | GENERIC_WRITE;
01224   } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
01225     access = GENERIC_READ;
01226     share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ
01227       | FILE_SHARE_WRITE; 
01231   } else {
01232     access = 0;
01233     ut_error;
01234   }
01235 
01236   file = CreateFile((LPCTSTR) name,
01237         access,
01238         share_mode,
01239         NULL, /* default security attributes */
01240         create_flag,
01241         attributes,
01242         NULL);  
01244   if (file == INVALID_HANDLE_VALUE) {
01245     *success = FALSE;
01246   } else {
01247     *success = TRUE;
01248   }
01249 
01250   return(file);
01251 #else /* __WIN__ */
01252   os_file_t file;
01253   int   create_flag;
01254 
01255   ut_a(name);
01256 
01257   if (create_mode == OS_FILE_OPEN) {
01258     if (access_type == OS_FILE_READ_ONLY) {
01259       create_flag = O_RDONLY;
01260     } else {
01261       create_flag = O_RDWR;
01262     }
01263   } else if (create_mode == OS_FILE_CREATE) {
01264     create_flag = O_RDWR | O_CREAT | O_EXCL;
01265   } else {
01266     create_flag = 0;
01267     ut_error;
01268   }
01269 
01270   if (create_mode == OS_FILE_CREATE) {
01271     file = open(name, create_flag, S_IRUSR | S_IWUSR
01272           | S_IRGRP | S_IWGRP);
01273   } else {
01274     file = open(name, create_flag);
01275   }
01276 
01277   if (file == -1) {
01278     *success = FALSE;
01279 #ifdef USE_FILE_LOCK
01280   } else if (access_type == OS_FILE_READ_WRITE
01281        && os_file_lock(file, name)) {
01282     *success = FALSE;
01283     close(file);
01284     file = -1;
01285 #endif
01286   } else {
01287     *success = TRUE;
01288   }
01289 
01290   return(file);
01291 #endif /* __WIN__ */
01292 }
01293 
01294 /****************************************************************/
01296 UNIV_INTERN
01297 void
01298 os_file_set_nocache(
01299 /*================*/
01300   int   fd,   
01301   const char* file_name,  
01302   const char* operation_name)
01305 {
01306   /* some versions of Solaris may not have DIRECTIO_ON */
01307 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
01308   if (directio(fd, DIRECTIO_ON) == -1) {
01309     int errno_save;
01310     errno_save = (int)errno;
01311     ut_print_timestamp(stderr);
01312     fprintf(stderr,
01313       "  InnoDB: Failed to set DIRECTIO_ON "
01314       "on file %s: %s: %s, continuing anyway\n",
01315       file_name, operation_name, strerror(errno_save));
01316   }
01317 #elif defined(O_DIRECT)
01318   if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
01319     int errno_save;
01320     errno_save = (int)errno;
01321     ut_print_timestamp(stderr);
01322     fprintf(stderr,
01323       "  InnoDB: Failed to set O_DIRECT "
01324       "on file %s: %s: %s, continuing anyway\n",
01325       file_name, operation_name, strerror(errno_save));
01326     if (errno_save == EINVAL) {
01327       ut_print_timestamp(stderr);
01328       fprintf(stderr,
01329         "  InnoDB: O_DIRECT is known to result in "
01330         "'Invalid argument' on Linux on tmpfs, "
01331         "see MySQL Bug#26662\n");
01332     }
01333   }
01334 #else /* Required for OSX */
01335         (void)fd;
01336         (void)file_name;
01337         (void)operation_name;
01338 #endif
01339 }
01340 
01341 /****************************************************************/
      /** Opens an existing file or creates a new one, always with read-write
      access. When the open fails, os_file_handle_error() (or, if
      srv_file_per_table is set, os_file_handle_error_no_exit()) is consulted
      and the whole attempt restarts at try_again if it asks for a retry.
      @param name         path of the file; asserted to be non-NULL
      @param create_mode  OS_FILE_OPEN, OS_FILE_OPEN_RAW or OS_FILE_OPEN_RETRY
                          open an existing file; OS_FILE_CREATE creates a new
                          file and fails if it exists (CREATE_NEW / O_EXCL);
                          OS_FILE_OVERWRITE creates or truncates
                          (CREATE_ALWAYS / O_CREAT | O_TRUNC); any other
                          value ends in ut_error
      @param purpose      OS_FILE_AIO or OS_FILE_NORMAL. On Windows it selects
                          the CreateFile attributes; in the non-Windows branch
                          it is only checked by an assertion
      @param type         OS_LOG_FILE or OS_DATA_FILE (asserted in the
                          non-Windows branch); decides whether O_SYNC or
                          O_DIRECT / unbuffered I/O is applied
      @param success      out: TRUE if the file was opened (and, with
                          USE_FILE_LOCK, locked), FALSE otherwise
      @return the file handle; INVALID_HANDLE_VALUE (Windows) or -1 (other
      platforms) on failure */
01347 UNIV_INTERN
01348 os_file_t
01349 os_file_create_func(
01350 /*================*/
01351   const char* name, 
01353   ulint   create_mode,
01361   ulint   purpose,
01368   ulint   type, 
01369   ibool*    success)
01370 {
01371 #ifdef __WIN__
01372   os_file_t file;
01373   DWORD   share_mode  = FILE_SHARE_READ;
01374   DWORD   create_flag;
01375   DWORD   attributes;
01376   ibool   retry;
        /* Re-entered from the failure path below when the error handler
        asks for another attempt */
01377 try_again:
01378   ut_a(name);
01379 
        /* Raw opens replace the default read sharing with write sharing;
        see the comment at the CreateFile call */
01380   if (create_mode == OS_FILE_OPEN_RAW) {
01381     create_flag = OPEN_EXISTING;
01382     share_mode = FILE_SHARE_WRITE;
01383   } else if (create_mode == OS_FILE_OPEN
01384        || create_mode == OS_FILE_OPEN_RETRY) {
01385     create_flag = OPEN_EXISTING;
01386   } else if (create_mode == OS_FILE_CREATE) {
01387     create_flag = CREATE_NEW;
01388   } else if (create_mode == OS_FILE_OVERWRITE) {
01389     create_flag = CREATE_ALWAYS;
01390   } else {
01391     create_flag = 0;
01392     ut_error;
01393   }
01394 
01395   if (purpose == OS_FILE_AIO) {
01396     /* If specified, use asynchronous (overlapped) io and no
01397     buffering of writes in the OS */
01398     attributes = 0;
01399 #ifdef WIN_ASYNC_IO
01400     if (srv_use_native_aio) {
01401       attributes = attributes | FILE_FLAG_OVERLAPPED;
01402     }
01403 #endif
01404 #ifdef UNIV_NON_BUFFERED_IO
01405 # ifndef UNIV_HOTBACKUP
01406     if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
01407       /* Do not use unbuffered i/o to log files because
01408       value 2 denotes that we do not flush the log at every
01409       commit, but only once per second */
01410     } else if (srv_win_file_flush_method
01411          == SRV_WIN_IO_UNBUFFERED) {
01412       attributes = attributes | FILE_FLAG_NO_BUFFERING;
01413     }
01414 # else /* !UNIV_HOTBACKUP */
01415     attributes = attributes | FILE_FLAG_NO_BUFFERING;
01416 # endif /* !UNIV_HOTBACKUP */
01417 #endif /* UNIV_NON_BUFFERED_IO */
01418   } else if (purpose == OS_FILE_NORMAL) {
          /* Same buffering decision as the AIO case, but the overlapped
          flag is never set here */
01419     attributes = 0;
01420 #ifdef UNIV_NON_BUFFERED_IO
01421 # ifndef UNIV_HOTBACKUP
01422     if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
01423       /* Do not use unbuffered i/o to log files because
01424       value 2 denotes that we do not flush the log at every
01425       commit, but only once per second */
01426     } else if (srv_win_file_flush_method
01427          == SRV_WIN_IO_UNBUFFERED) {
01428       attributes = attributes | FILE_FLAG_NO_BUFFERING;
01429     }
01430 # else /* !UNIV_HOTBACKUP */
01431     attributes = attributes | FILE_FLAG_NO_BUFFERING;
01432 # endif /* !UNIV_HOTBACKUP */
01433 #endif /* UNIV_NON_BUFFERED_IO */
01434   } else {
01435     attributes = 0;
01436     ut_error;
01437   }
01438 
01439   file = CreateFile((LPCTSTR) name,
01440         GENERIC_READ | GENERIC_WRITE, /* read and write
01441               access */
01442         share_mode, /* File can be read also by other
01443           processes; we must give the read
01444           permission because of ibbackup. We do
01445           not give the write permission to
01446           others because if one would succeed to
01447           start 2 instances of mysqld on the
01448           SAME files, that could cause severe
01449           database corruption! When opening
01450           raw disk partitions, Microsoft manuals
01451           say that we must give also the write
01452           permission. */
01453         NULL, /* default security attributes */
01454         create_flag,
01455         attributes,
01456         NULL);  
01458   if (file == INVALID_HANDLE_VALUE) {
01459     *success = FALSE;
01460 
01461     /* When srv_file_per_table is on, file creation failure may not
01462     be critical to the whole instance. Do not crash the server in
01463     case of unknown errors.
01464     Please note "srv_file_per_table" is a global variable with
01465     no explicit synchronization protection. It could be
01466     changed during this execution path. It might not have the
01467     same value as the one when building the table definition */
01468     if (srv_file_per_table) {
01469       retry = os_file_handle_error_no_exit(name,
01470             create_mode == OS_FILE_CREATE ?
01471             "create" : "open");
01472     } else {
01473       retry = os_file_handle_error(name,
01474             create_mode == OS_FILE_CREATE ?
01475             "create" : "open");
01476     }
01477 
01478     if (retry) {
01479       goto try_again;
01480     }
01481   } else {
01482     *success = TRUE;
01483   }
01484 
01485   return(file);
01486 #else /* __WIN__ */
01487   os_file_t file;
01488   int   create_flag;
01489   ibool   retry;
        /* Label handed to os_file_set_nocache() for its diagnostics */
01490   const char* mode_str  = NULL;
01491 
        /* Re-entered from the failure path below when the error handler
        asks for another attempt */
01492 try_again:
01493   ut_a(name);
01494 
01495   if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW
01496       || create_mode == OS_FILE_OPEN_RETRY) {
01497     mode_str = "OPEN";
01498     create_flag = O_RDWR;
01499   } else if (create_mode == OS_FILE_CREATE) {
01500     mode_str = "CREATE";
01501     create_flag = O_RDWR | O_CREAT | O_EXCL;
01502   } else if (create_mode == OS_FILE_OVERWRITE) {
01503     mode_str = "OVERWRITE";
01504     create_flag = O_RDWR | O_CREAT | O_TRUNC;
01505   } else {
01506     create_flag = 0;
01507     ut_error;
01508   }
01509 
01510   ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
01511   ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
01512 
01513 #ifdef O_SYNC
01514   /* We let O_SYNC only affect log files; note that we map O_DSYNC to
01515   O_SYNC because the datasync options seemed to corrupt files in 2001
01516   in both Linux and Solaris */
01517   if (type == OS_LOG_FILE
01518       && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
01519 
01520 # if 0
01521     fprintf(stderr, "Using O_SYNC for file %s\n", name);
01522 # endif
01523 
01524     create_flag = create_flag | O_SYNC;
01525   }
01526 #endif /* O_SYNC */
01527 
        /* os_innodb_umask is passed as the permission argument of open() */
01528   file = open(name, create_flag, os_innodb_umask);
01529 
01530   if (file == -1) {
01531     *success = FALSE;
01532 
01533     /* When srv_file_per_table is on, file creation failure may not
01534     be critical to the whole instance. Do not crash the server in
01535     case of unknown errors.
01536     Please note "srv_file_per_table" is a global variable with
01537     no explicit synchronization protection. It could be
01538     changed during this execution path. It might not have the
01539     same value as the one when building the table definition */
01540     if (srv_file_per_table) {
01541       retry = os_file_handle_error_no_exit(name,
01542             create_mode == OS_FILE_CREATE ?
01543             "create" : "open");
01544     } else {
01545       retry = os_file_handle_error(name,
01546             create_mode == OS_FILE_CREATE ?
01547             "create" : "open");
01548     }
01549 
01550     if (retry) {
01551       goto try_again;
01552     } else {
01553       return(file /* -1 */);
01554     }
01555   }
01556   /* else */
01557 
01558   *success = TRUE;
01559 
01560   /* We disable OS caching (O_DIRECT) only on data files */
01561   if (type != OS_LOG_FILE
01562       && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
01563     
01564     os_file_set_nocache(file, name, mode_str);
01565   }
01566 
01567 #ifdef USE_FILE_LOCK
        /* Raw opens are never locked. A non-zero result from os_file_lock()
        is treated as a locking failure */
01568   if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
01569 
01570     if (create_mode == OS_FILE_OPEN_RETRY) {
01571       int i;
01572       ut_print_timestamp(stderr);
01573       fputs("  InnoDB: Retrying to lock"
01574             " the first data file\n",
01575             stderr);
            /* Up to 100 further attempts, sleeping between them, before
            giving up on the lock */
01576       for (i = 0; i < 100; i++) {
01577         os_thread_sleep(1000000);
01578         if (!os_file_lock(file, name)) {
01579           *success = TRUE;
01580           return(file);
01581         }
01582       }
01583       ut_print_timestamp(stderr);
01584       fputs("  InnoDB: Unable to open the first data file\n",
01585             stderr);
01586     }
01587 
          /* The file could not be locked: report failure and release the
          descriptor */
01588     *success = FALSE;
01589     close(file);
01590     file = -1;
01591   }
01592 #endif /* USE_FILE_LOCK */
01593 
01594   return(file);
01595 #endif /* __WIN__ */
01596 }
01597 
01598 /***********************************************************************/
01601 UNIV_INTERN
01602 ibool
01603 os_file_delete_if_exists(
01604 /*=====================*/
01605   const char* name) 
01606 {
01607 #ifdef __WIN__
01608   BOOL  ret;
01609   ulint count = 0;
01610 loop:
01611   /* In Windows, deleting an .ibd file may fail if ibbackup is copying
01612   it */
01613 
01614   ret = DeleteFile((LPCTSTR)name);
01615 
01616   if (ret) {
01617     return(TRUE);
01618   }
01619 
01620   if (GetLastError() == ERROR_FILE_NOT_FOUND) {
01621     /* the file does not exist, this not an error */
01622 
01623     return(TRUE);
01624   }
01625 
01626   count++;
01627 
01628   if (count > 100 && 0 == (count % 10)) {
01629     fprintf(stderr,
01630       "InnoDB: Warning: cannot delete file %s\n"
01631       "InnoDB: Are you running ibbackup"
01632       " to back up the file?\n", name);
01633 
01634     os_file_get_last_error(TRUE); /* print error information */
01635   }
01636 
01637   os_thread_sleep(1000000); /* sleep for a second */
01638 
01639   if (count > 2000) {
01640 
01641     return(FALSE);
01642   }
01643 
01644   goto loop;
01645 #else
01646   int ret;
01647 
01648   ret = unlink(name);
01649 
01650   if (ret != 0 && errno != ENOENT) {
01651     os_file_handle_error_no_exit(name, "delete");
01652 
01653     return(FALSE);
01654   }
01655 
01656   return(TRUE);
01657 #endif
01658 }
01659 
01660 /***********************************************************************/
01663 UNIV_INTERN
01664 ibool
01665 os_file_delete(
01666 /*===========*/
01667   const char* name) 
01668 {
01669 #ifdef __WIN__
01670   BOOL  ret;
01671   ulint count = 0;
01672 loop:
01673   /* In Windows, deleting an .ibd file may fail if ibbackup is copying
01674   it */
01675 
01676   ret = DeleteFile((LPCTSTR)name);
01677 
01678   if (ret) {
01679     return(TRUE);
01680   }
01681 
01682   if (GetLastError() == ERROR_FILE_NOT_FOUND) {
01683     /* If the file does not exist, we classify this as a 'mild'
01684     error and return */
01685 
01686     return(FALSE);
01687   }
01688 
01689   count++;
01690 
01691   if (count > 100 && 0 == (count % 10)) {
01692     fprintf(stderr,
01693       "InnoDB: Warning: cannot delete file %s\n"
01694       "InnoDB: Are you running ibbackup"
01695       " to back up the file?\n", name);
01696 
01697     os_file_get_last_error(TRUE); /* print error information */
01698   }
01699 
01700   os_thread_sleep(1000000); /* sleep for a second */
01701 
01702   if (count > 2000) {
01703 
01704     return(FALSE);
01705   }
01706 
01707   goto loop;
01708 #else
01709   int ret;
01710 
01711   ret = unlink(name);
01712 
01713   if (ret != 0) {
01714     os_file_handle_error_no_exit(name, "delete");
01715 
01716     return(FALSE);
01717   }
01718 
01719   return(TRUE);
01720 #endif
01721 }
01722 
01723 /***********************************************************************/
01728 UNIV_INTERN
01729 ibool
01730 os_file_rename_func(
01731 /*================*/
01732   const char* oldpath,
01734   const char* newpath)
01735 {
01736 #ifdef __WIN__
01737   BOOL  ret;
01738 
01739   ret = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath);
01740 
01741   if (ret) {
01742     return(TRUE);
01743   }
01744 
01745   os_file_handle_error_no_exit(oldpath, "rename");
01746 
01747   return(FALSE);
01748 #else
01749   int ret;
01750 
01751   ret = rename(oldpath, newpath);
01752 
01753   if (ret != 0) {
01754     os_file_handle_error_no_exit(oldpath, "rename");
01755 
01756     return(FALSE);
01757   }
01758 
01759   return(TRUE);
01760 #endif
01761 }
01762 
01763 /***********************************************************************/
01768 UNIV_INTERN
01769 ibool
01770 os_file_close_func(
01771 /*===============*/
01772   os_file_t file) 
01773 {
01774 #ifdef __WIN__
01775   BOOL  ret;
01776 
01777   ut_a(file);
01778 
01779   ret = CloseHandle(file);
01780 
01781   if (ret) {
01782     return(TRUE);
01783   }
01784 
01785   os_file_handle_error(NULL, "close");
01786 
01787   return(FALSE);
01788 #else
01789   int ret;
01790 
01791   ret = close(file);
01792 
01793   if (ret == -1) {
01794     os_file_handle_error(NULL, "close");
01795 
01796     return(FALSE);
01797   }
01798 
01799   return(TRUE);
01800 #endif
01801 }
01802 
#ifdef UNIV_HOTBACKUP
/** Closes a file handle without reporting a failure anywhere except in
the return value. Only compiled when UNIV_HOTBACKUP is defined.
@param file  handle to close; asserted to be non-zero on Windows
@return TRUE if the handle was closed, FALSE otherwise */
UNIV_INTERN
ibool
os_file_close_no_error_handling(
/*============================*/
  os_file_t file)
{
#ifdef __WIN__
  ut_a(file);

  return(CloseHandle(file) ? TRUE : FALSE);
#else
  return(close(file) == -1 ? FALSE : TRUE);
#endif
}
#endif /* UNIV_HOTBACKUP */
01839 
01840 /***********************************************************************/
01843 UNIV_INTERN
01844 ibool
01845 os_file_get_size(
01846 /*=============*/
01847   os_file_t file, 
01848   ulint*    size, 
01850   ulint*    size_high)
01851 {
01852 #ifdef __WIN__
01853   DWORD high;
01854   DWORD low;
01855 
01856   low = GetFileSize(file, &high);
01857 
01858   if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
01859     return(FALSE);
01860   }
01861 
01862   *size = low;
01863   *size_high = high;
01864 
01865   return(TRUE);
01866 #else
01867   off_t offs;
01868 
01869   offs = lseek(file, 0, SEEK_END);
01870 
01871   if (offs == ((off_t)-1)) {
01872 
01873     return(FALSE);
01874   }
01875 
01876   if (sizeof(off_t) > 4) {
01877     *size = (ulint)(offs & 0xFFFFFFFFUL);
01878     *size_high = (ulint)(offs >> 32);
01879   } else {
01880     *size = (ulint) offs;
01881     *size_high = 0;
01882   }
01883 
01884   return(TRUE);
01885 #endif
01886 }
01887 
01888 /***********************************************************************/
01891 UNIV_INTERN
01892 ib_int64_t
01893 os_file_get_size_as_iblonglong(
01894 /*===========================*/
01895   os_file_t file) 
01896 {
01897   ulint size;
01898   ulint size_high;
01899   ibool success;
01900 
01901   success = os_file_get_size(file, &size, &size_high);
01902 
01903   if (!success) {
01904 
01905     return(-1);
01906   }
01907 
01908   return((((ib_int64_t)size_high) << 32) + (ib_int64_t)size);
01909 }
01910 
01911 /***********************************************************************/
01914 UNIV_INTERN
01915 ibool
01916 os_file_set_size(
01917 /*=============*/
01918   const char* name, 
01920   os_file_t file, 
01921   ulint   size, 
01923   ulint   size_high)
01924 {
01925   ib_int64_t  current_size;
01926   ib_int64_t  desired_size;
01927   ibool   ret;
01928   byte*   buf;
01929   byte*   buf2;
01930   ulint   buf_size;
01931 
01932   ut_a(size == (size & 0xFFFFFFFF));
01933 
01934   current_size = 0;
01935   desired_size = (ib_int64_t)size + (((ib_int64_t)size_high) << 32);
01936 
01937   /* Write up to 1 megabyte at a time. */
01938   buf_size = ut_min(64, (ulint) (desired_size / UNIV_PAGE_SIZE))
01939     * UNIV_PAGE_SIZE;
01940   buf2 = static_cast<unsigned char *>(ut_malloc(buf_size + UNIV_PAGE_SIZE));
01941 
01942   /* Align the buffer for possible raw i/o */
01943   buf = static_cast<unsigned char *>(ut_align(buf2, UNIV_PAGE_SIZE));
01944 
01945   /* Write buffer full of zeros */
01946   memset(buf, 0, buf_size);
01947 
01948   if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
01949 
01950     fprintf(stderr, "InnoDB: Progress in MB:");
01951   }
01952 
01953   while (current_size < desired_size) {
01954     ulint n_bytes;
01955 
01956     if (desired_size - current_size < (ib_int64_t) buf_size) {
01957       n_bytes = (ulint) (desired_size - current_size);
01958     } else {
01959       n_bytes = buf_size;
01960     }
01961 
01962     ret = os_file_write(name, file, buf,
01963             (ulint)(current_size & 0xFFFFFFFF),
01964             (ulint)(current_size >> 32),
01965             n_bytes);
01966     if (!ret) {
01967       ut_free(buf2);
01968       goto error_handling;
01969     }
01970 
01971     /* Print about progress for each 100 MB written */
01972     if ((ib_int64_t) (current_size + n_bytes) / (ib_int64_t)(100 * 1024 * 1024)
01973         != current_size / (ib_int64_t)(100 * 1024 * 1024)) {
01974 
01975       fprintf(stderr, " %lu00",
01976         (ulong) ((current_size + n_bytes)
01977            / (ib_int64_t)(100 * 1024 * 1024)));
01978     }
01979 
01980     current_size += n_bytes;
01981   }
01982 
01983   if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
01984 
01985     fprintf(stderr, "\n");
01986   }
01987 
01988   ut_free(buf2);
01989 
01990   ret = os_file_flush(file);
01991 
01992   if (ret) {
01993     return(TRUE);
01994   }
01995 
01996 error_handling:
01997   return(FALSE);
01998 }
01999 
02000 /***********************************************************************/
02003 UNIV_INTERN
02004 ibool
02005 os_file_set_eof(
02006 /*============*/
02007   FILE*   file) 
02008 {
02009 #ifdef __WIN__
02010   HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
02011   return(SetEndOfFile(h));
02012 #else /* __WIN__ */
02013   return(!ftruncate(fileno(file), ftell(file)));
02014 #endif /* __WIN__ */
02015 }
02016 
02017 #ifndef __WIN__
02018 /***********************************************************************/
02024 static
02025 int
02026 os_file_fsync(
02027 /*==========*/
02028   os_file_t file) 
02029 {
02030   int ret;
02031   int failures;
02032   ibool retry;
02033 
02034   failures = 0;
02035 
02036   do {
02037     ret = fsync(file);
02038 
02039     os_n_fsyncs++;
02040 
02041     if (ret == -1 && errno == ENOLCK) {
02042 
02043       if (failures % 100 == 0) {
02044 
02045         ut_print_timestamp(stderr);
02046         fprintf(stderr,
02047           "  InnoDB: fsync(): "
02048           "No locks available; retrying\n");
02049       }
02050 
02051       os_thread_sleep(200000 /* 0.2 sec */);
02052 
02053       failures++;
02054 
02055       retry = TRUE;
02056     } else {
02057 
02058       retry = FALSE;
02059     }
02060   } while (retry);
02061 
02062   return(ret);
02063 }
02064 #endif /* !__WIN__ */
02065 
02066 /***********************************************************************/
02070 UNIV_INTERN
02071 ibool
02072 os_file_flush_func(
02073 /*===============*/
02074   os_file_t file) 
02075 {
02076 #ifdef __WIN__
02077   BOOL  ret;
02078 
02079   ut_a(file);
02080 
02081   os_n_fsyncs++;
02082 
02083   ret = FlushFileBuffers(file);
02084 
02085   if (ret) {
02086     return(TRUE);
02087   }
02088 
02089   /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
02090   actually a raw device, we choose to ignore that error if we are using
02091   raw disks */
02092 
02093   if (srv_start_raw_disk_in_use && GetLastError()
02094       == ERROR_INVALID_FUNCTION) {
02095     return(TRUE);
02096   }
02097 
02098   os_file_handle_error(NULL, "flush");
02099 
02100   /* It is a fatal error if a file flush does not succeed, because then
02101   the database can get corrupt on disk */
02102   ut_error;
02103 
02104   return(FALSE);
02105 #else
02106   int ret;
02107 
02108 #if defined(HAVE_DARWIN_THREADS)
02109 # ifndef F_FULLFSYNC
02110   /* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
02111 #  define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
02112 # elif F_FULLFSYNC != 51
02113 #  error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
02114 # endif
02115   /* Apple has disabled fsync() for internal disk drives in OS X. That
02116   caused corruption for a user when he tested a power outage. Let us in
02117   OS X use a nonstandard flush method recommended by an Apple
02118   engineer. */
02119 
02120   if (!srv_have_fullfsync) {
02121     /* If we are not on an operating system that supports this,
02122     then fall back to a plain fsync. */
02123 
02124     ret = os_file_fsync(file);
02125   } else {
02126     ret = fcntl(file, F_FULLFSYNC, NULL);
02127 
02128     if (ret) {
02129       /* If we are not on a file system that supports this,
02130       then fall back to a plain fsync. */
02131       ret = os_file_fsync(file);
02132     }
02133   }
02134 #else
02135   ret = os_file_fsync(file);
02136 #endif
02137 
02138   if (ret == 0) {
02139     return(TRUE);
02140   }
02141 
02142   /* Since Linux returns EINVAL if the 'file' is actually a raw device,
02143   we choose to ignore that error if we are using raw disks */
02144 
02145   if (srv_start_raw_disk_in_use && errno == EINVAL) {
02146 
02147     return(TRUE);
02148   }
02149 
02150   ut_print_timestamp(stderr);
02151 
02152   fprintf(stderr,
02153     "  InnoDB: Error: the OS said file flush did not succeed\n");
02154 
02155   os_file_handle_error(NULL, "flush");
02156 
02157   /* It is a fatal error if a file flush does not succeed, because then
02158   the database can get corrupt on disk */
02159   ut_error;
02160 
02161   return(FALSE);
02162 #endif
02163 }
02164 
02165 #ifndef __WIN__
02166 /*******************************************************************/
02169 static
02170 ssize_t
02171 os_file_pread(
02172 /*==========*/
02173   os_file_t file, 
02174   void*   buf,  
02175   ulint   n,  
02176   ulint   offset, 
02178   ulint   offset_high) 
02180 {
02181   off_t offs;
02182 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
02183   ssize_t n_bytes;
02184 #endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */
02185 
02186   ut_a((offset & 0xFFFFFFFFUL) == offset);
02187 
02188   /* If off_t is > 4 bytes in size, then we assume we can pass a
02189   64-bit address */
02190 
02191   if (sizeof(off_t) > 4) {
02192     offs = (off_t)offset + (((off_t)offset_high) << 32);
02193 
02194   } else {
02195     offs = (off_t)offset;
02196 
02197     if (offset_high > 0) {
02198       fprintf(stderr,
02199         "InnoDB: Error: file read at offset > 4 GB\n");
02200     }
02201   }
02202 
02203   os_n_file_reads++;
02204 
02205 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
02206   os_mutex_enter(os_file_count_mutex);
02207   os_file_n_pending_preads++;
02208   os_n_pending_reads++;
02209   os_mutex_exit(os_file_count_mutex);
02210 
02211   n_bytes = pread(file, buf, (ssize_t)n, offs);
02212 
02213   os_mutex_enter(os_file_count_mutex);
02214   os_file_n_pending_preads--;
02215   os_n_pending_reads--;
02216   os_mutex_exit(os_file_count_mutex);
02217 
02218   return(n_bytes);
02219 #else
02220   {
02221     off_t ret_offset;
02222     ssize_t ret;
02223 #ifndef UNIV_HOTBACKUP
02224     ulint i;
02225 #endif /* !UNIV_HOTBACKUP */
02226 
02227     os_mutex_enter(os_file_count_mutex);
02228     os_n_pending_reads++;
02229     os_mutex_exit(os_file_count_mutex);
02230 
02231 #ifndef UNIV_HOTBACKUP
02232     /* Protect the seek / read operation with a mutex */
02233     i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
02234 
02235     os_mutex_enter(os_file_seek_mutexes[i]);
02236 #endif /* !UNIV_HOTBACKUP */
02237 
02238     ret_offset = lseek(file, offs, SEEK_SET);
02239 
02240     if (ret_offset < 0) {
02241       ret = -1;
02242     } else {
02243       ret = read(file, buf, (ssize_t)n);
02244     }
02245 
02246 #ifndef UNIV_HOTBACKUP
02247     os_mutex_exit(os_file_seek_mutexes[i]);
02248 #endif /* !UNIV_HOTBACKUP */
02249 
02250     os_mutex_enter(os_file_count_mutex);
02251     os_n_pending_reads--;
02252     os_mutex_exit(os_file_count_mutex);
02253 
02254     return(ret);
02255   }
02256 #endif
02257 }
02258 
02259 /*******************************************************************/
      /** Writes to a file at a given offset, with pwrite() when
      HAVE_PWRITE is defined and HAVE_BROKEN_PREAD is not, otherwise with
      lseek() followed by write() under one of the seek mutexes. The write
      counter and the pending-write counters are maintained around the
      call. With UNIV_DO_FLUSH the file is additionally flushed after the
      write unless the flush method is SRV_UNIX_LITTLESYNC or
      SRV_UNIX_NOSYNC, or os_do_not_call_flush_at_each_write is set.
      If off_t is not wider than 4 bytes and offset_high is non-zero, a
      message is printed and the write still proceeds using only the low
      32 bits.
      @param file         descriptor of the file
      @param buf          data to write
      @param n            number of bytes to write
      @param offset       least significant 32 bits of the file offset;
                          asserted to fit in 32 bits
      @param offset_high  most significant 32 bits of the file offset
      @return the value returned by pwrite() / write(), or -1 if the seek
      failed */
02262 static
02263 ssize_t
02264 os_file_pwrite(
02265 /*===========*/
02266   os_file_t file, 
02267   const void* buf,  
02268   ulint   n,  
02269   ulint   offset, 
02271   ulint   offset_high) 
02273 {
02274   ssize_t ret;
02275   off_t offs;
02276 
02277   ut_a((offset & 0xFFFFFFFFUL) == offset);
02278 
02279   /* If off_t is > 4 bytes in size, then we assume we can pass a
02280   64-bit address */
02281 
02282   if (sizeof(off_t) > 4) {
02283     offs = (off_t)offset + (((off_t)offset_high) << 32);
02284   } else {
02285     offs = (off_t)offset;
02286 
          /* Only reported: execution continues with the truncated
          offset */
02287     if (offset_high > 0) {
02288       fprintf(stderr,
02289         "InnoDB: Error: file write"
02290         " at offset > 4 GB\n");
02291     }
02292   }
02293 
02294   os_n_file_writes++;
02295 
        /* NOTE(review): the guard tests HAVE_BROKEN_PREAD; no separate
        "broken pwrite" macro is visible here - confirm this is intended */
02296 #if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
02297   os_mutex_enter(os_file_count_mutex);
02298   os_file_n_pending_pwrites++;
02299   os_n_pending_writes++;
02300   os_mutex_exit(os_file_count_mutex);
02301 
02302   ret = pwrite(file, buf, (ssize_t)n, offs);
02303 
02304   os_mutex_enter(os_file_count_mutex);
02305   os_file_n_pending_pwrites--;
02306   os_n_pending_writes--;
02307   os_mutex_exit(os_file_count_mutex);
02308 
02309 # ifdef UNIV_DO_FLUSH
02310   if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
02311       && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
02312       && !os_do_not_call_flush_at_each_write) {
02313 
02314     /* Always do fsync to reduce the probability that when
02315     the OS crashes, a database page is only partially
02316     physically written to disk. */
02317 
02318     ut_a(TRUE == os_file_flush(file));
02319   }
02320 # endif /* UNIV_DO_FLUSH */
02321 
02322   return(ret);
02323 #else
02324   {
02325     off_t ret_offset;
02326 # ifndef UNIV_HOTBACKUP
          /* Index of the seek mutex used for this descriptor */
02327     ulint i;
02328 # endif /* !UNIV_HOTBACKUP */
02329 
02330     os_mutex_enter(os_file_count_mutex);
02331     os_n_pending_writes++;
02332     os_mutex_exit(os_file_count_mutex);
02333 
02334 # ifndef UNIV_HOTBACKUP
02335     /* Protect the seek / write operation with a mutex */
02336     i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
02337 
02338     os_mutex_enter(os_file_seek_mutexes[i]);
02339 # endif /* UNIV_HOTBACKUP */
02340 
02341     ret_offset = lseek(file, offs, SEEK_SET);
02342 
02343     if (ret_offset < 0) {
02344       ret = -1;
02345 
            /* Skip the write and the flush, but still release the seek
            mutex and decrement the pending counter at func_exit */
02346       goto func_exit;
02347     }
02348 
02349     ret = write(file, buf, (ssize_t)n);
02350 
02351 # ifdef UNIV_DO_FLUSH
02352     if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
02353         && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
02354         && !os_do_not_call_flush_at_each_write) {
02355 
02356       /* Always do fsync to reduce the probability that when
02357       the OS crashes, a database page is only partially
02358       physically written to disk. */
02359 
02360       ut_a(TRUE == os_file_flush(file));
02361     }
02362 # endif /* UNIV_DO_FLUSH */
02363 
02364 func_exit:
02365 # ifndef UNIV_HOTBACKUP
02366     os_mutex_exit(os_file_seek_mutexes[i]);
02367 # endif /* !UNIV_HOTBACKUP */
02368 
02369     os_mutex_enter(os_file_count_mutex);
02370     os_n_pending_writes--;
02371     os_mutex_exit(os_file_count_mutex);
02372 
02373     return(ret);
02374   }
02375 #endif
02376 }
02377 #endif
02378 
02379 /*******************************************************************/
02384 UNIV_INTERN
02385 ibool
02386 os_file_read_func(
02387 /*==============*/
02388   os_file_t file, 
02389   void*   buf,  
02390   ulint   offset, 
02392   ulint   offset_high, 
02394   ulint   n)  
02395 {
02396 #ifdef __WIN__
02397   BOOL    ret;
02398   DWORD   len;
02399   DWORD   ret2;
02400   DWORD   low;
02401   DWORD   high;
02402   ibool   retry;
02403 #ifndef UNIV_HOTBACKUP
02404   ulint   i;
02405 #endif /* !UNIV_HOTBACKUP */
02406 
02407   /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
02408   no more than 32 bits. */
02409   ut_a((offset & 0xFFFFFFFFUL) == offset);
02410   ut_a((n & 0xFFFFFFFFUL) == n);
02411 
02412   os_n_file_reads++;
02413   os_bytes_read_since_printout += n;
02414 
02415 try_again:
02416   ut_ad(file);
02417   ut_ad(buf);
02418   ut_ad(n > 0);
02419 
02420   low = (DWORD) offset;
02421   high = (DWORD) offset_high;
02422 
02423   os_mutex_enter(os_file_count_mutex);
02424   os_n_pending_reads++;
02425   os_mutex_exit(os_file_count_mutex);
02426 
02427 #ifndef UNIV_HOTBACKUP
02428   /* Protect the seek / read operation with a mutex */
02429   i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
02430 
02431   os_mutex_enter(os_file_seek_mutexes[i]);
02432 #endif /* !UNIV_HOTBACKUP */
02433 
02434   ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
02435 
02436   if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
02437 
02438 #ifndef UNIV_HOTBACKUP
02439     os_mutex_exit(os_file_seek_mutexes[i]);
02440 #endif /* !UNIV_HOTBACKUP */
02441 
02442     os_mutex_enter(os_file_count_mutex);
02443     os_n_pending_reads--;
02444     os_mutex_exit(os_file_count_mutex);
02445 
02446     goto error_handling;
02447   }
02448 
02449   ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
02450 
02451 #ifndef UNIV_HOTBACKUP
02452   os_mutex_exit(os_file_seek_mutexes[i]);
02453 #endif /* !UNIV_HOTBACKUP */
02454 
02455   os_mutex_enter(os_file_count_mutex);
02456   os_n_pending_reads--;
02457   os_mutex_exit(os_file_count_mutex);
02458 
02459   if (ret && len == n) {
02460     return(TRUE);
02461   }
02462 #else /* __WIN__ */
02463   ibool retry;
02464   ssize_t ret;
02465 
02466   os_bytes_read_since_printout += n;
02467 
02468 try_again:
02469   ret = os_file_pread(file, buf, n, offset, offset_high);
02470 
02471   if ((ulint)ret == n) {
02472 
02473     return(TRUE);
02474   }
02475 
02476   fprintf(stderr,
02477     "InnoDB: Error: tried to read %lu bytes at offset %lu %lu.\n"
02478     "InnoDB: Was only able to read %ld.\n",
02479     (ulong)n, (ulong)offset_high,
02480     (ulong)offset, (long)ret);
02481 #endif /* __WIN__ */
02482 #ifdef __WIN__
02483 error_handling:
02484 #endif
02485   retry = os_file_handle_error(NULL, "read");
02486 
02487   if (retry) {
02488     goto try_again;
02489   }
02490 
02491   fprintf(stderr,
02492     "InnoDB: Fatal error: cannot read from file."
02493     " OS error number %lu.\n",
02494 #ifdef __WIN__
02495     (ulong) GetLastError()
02496 #else
02497     (ulong) errno
02498 #endif
02499     );
02500   fflush(stderr);
02501 
02502   ut_error;
02503 
02504   return(FALSE);
02505 }
02506 
02507 /*******************************************************************/
02513 UNIV_INTERN
02514 ibool
02515 os_file_read_no_error_handling_func(
02516 /*================================*/
02517   os_file_t file, 
02518   void*   buf,  
02519   ulint   offset, 
02521   ulint   offset_high, 
02523   ulint   n)  
02524 {
02525 #ifdef __WIN__
02526   BOOL    ret;
02527   DWORD   len;
02528   DWORD   ret2;
02529   DWORD   low;
02530   DWORD   high;
02531   ibool   retry;
02532 #ifndef UNIV_HOTBACKUP
02533   ulint   i;
02534 #endif /* !UNIV_HOTBACKUP */
02535 
02536   /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
02537   no more than 32 bits. */
02538   ut_a((offset & 0xFFFFFFFFUL) == offset);
02539   ut_a((n & 0xFFFFFFFFUL) == n);
02540 
02541   os_n_file_reads++;
02542   os_bytes_read_since_printout += n;
02543 
02544 try_again:
02545   ut_ad(file);
02546   ut_ad(buf);
02547   ut_ad(n > 0);
02548 
02549   low = (DWORD) offset;
02550   high = (DWORD) offset_high;
02551 
02552   os_mutex_enter(os_file_count_mutex);
02553   os_n_pending_reads++;
02554   os_mutex_exit(os_file_count_mutex);
02555 
02556 #ifndef UNIV_HOTBACKUP
02557   /* Protect the seek / read operation with a mutex */
02558   i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
02559 
02560   os_mutex_enter(os_file_seek_mutexes[i]);
02561 #endif /* !UNIV_HOTBACKUP */
02562 
02563   ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
02564 
02565   if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
02566 
02567 #ifndef UNIV_HOTBACKUP
02568     os_mutex_exit(os_file_seek_mutexes[i]);
02569 #endif /* !UNIV_HOTBACKUP */
02570 
02571     os_mutex_enter(os_file_count_mutex);
02572     os_n_pending_reads--;
02573     os_mutex_exit(os_file_count_mutex);
02574 
02575     goto error_handling;
02576   }
02577 
02578   ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
02579 
02580 #ifndef UNIV_HOTBACKUP
02581   os_mutex_exit(os_file_seek_mutexes[i]);
02582 #endif /* !UNIV_HOTBACKUP */
02583 
02584   os_mutex_enter(os_file_count_mutex);
02585   os_n_pending_reads--;
02586   os_mutex_exit(os_file_count_mutex);
02587 
02588   if (ret && len == n) {
02589     return(TRUE);
02590   }
02591 #else /* __WIN__ */
02592   ibool retry;
02593   ssize_t ret;
02594 
02595   os_bytes_read_since_printout += n;
02596 
02597 try_again:
02598   ret = os_file_pread(file, buf, n, offset, offset_high);
02599 
02600   if ((ulint)ret == n) {
02601 
02602     return(TRUE);
02603   }
02604 #endif /* __WIN__ */
02605 #ifdef __WIN__
02606 error_handling:
02607 #endif
02608   retry = os_file_handle_error_no_exit(NULL, "read");
02609 
02610   if (retry) {
02611     goto try_again;
02612   }
02613 
02614   return(FALSE);
02615 }
02616 
02617 /*******************************************************************/
02621 UNIV_INTERN
02622 void
02623 os_file_read_string(
02624 /*================*/
02625   FILE* file, 
02626   char* str,  
02627   ulint size) 
02628 {
02629   size_t  flen;
02630 
02631   if (size == 0) {
02632     return;
02633   }
02634 
02635   rewind(file);
02636   flen = fread(str, 1, size - 1, file);
02637   str[flen] = '\0';
02638 }
02639 
02640 /*******************************************************************/
02645 UNIV_INTERN
02646 ibool
02647 os_file_write_func(
02648 /*===============*/
02649   const char* name, 
02651   os_file_t file, 
02652   const void* buf,  
02653   ulint   offset, 
02655   ulint   offset_high, 
02657   ulint   n)  
02658 {
02659 #ifdef __WIN__
02660   BOOL    ret;
02661   DWORD   len;
02662   DWORD   ret2;
02663   DWORD   low;
02664   DWORD   high;
02665   ulint   n_retries = 0;
02666   ulint   err;
02667 #ifndef UNIV_HOTBACKUP
02668   ulint   i;
02669 #endif /* !UNIV_HOTBACKUP */
02670 
02671   /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
02672   no more than 32 bits. */
02673   ut_a((offset & 0xFFFFFFFFUL) == offset);
02674   ut_a((n & 0xFFFFFFFFUL) == n);
02675 
02676   os_n_file_writes++;
02677 
02678   ut_ad(file);
02679   ut_ad(buf);
02680   ut_ad(n > 0);
02681 retry:
02682   low = (DWORD) offset;
02683   high = (DWORD) offset_high;
02684 
02685   os_mutex_enter(os_file_count_mutex);
02686   os_n_pending_writes++;
02687   os_mutex_exit(os_file_count_mutex);
02688 
02689 #ifndef UNIV_HOTBACKUP
02690   /* Protect the seek / write operation with a mutex */
02691   i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
02692 
02693   os_mutex_enter(os_file_seek_mutexes[i]);
02694 #endif /* !UNIV_HOTBACKUP */
02695 
02696   ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
02697 
02698   if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
02699 
02700 #ifndef UNIV_HOTBACKUP
02701     os_mutex_exit(os_file_seek_mutexes[i]);
02702 #endif /* !UNIV_HOTBACKUP */
02703 
02704     os_mutex_enter(os_file_count_mutex);
02705     os_n_pending_writes--;
02706     os_mutex_exit(os_file_count_mutex);
02707 
02708     ut_print_timestamp(stderr);
02709 
02710     fprintf(stderr,
02711       "  InnoDB: Error: File pointer positioning to"
02712       " file %s failed at\n"
02713       "InnoDB: offset %lu %lu. Operating system"
02714       " error number %lu.\n"
02715       "InnoDB: Some operating system error numbers"
02716       " are described at\n"
02717       "InnoDB: "
02718       REFMAN "operating-system-error-codes.html\n",
02719       name, (ulong) offset_high, (ulong) offset,
02720       (ulong) GetLastError());
02721 
02722     return(FALSE);
02723   }
02724 
02725   ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
02726 
02727   /* Always do fsync to reduce the probability that when the OS crashes,
02728   a database page is only partially physically written to disk. */
02729 
02730 # ifdef UNIV_DO_FLUSH
02731   if (!os_do_not_call_flush_at_each_write) {
02732     ut_a(TRUE == os_file_flush(file));
02733   }
02734 # endif /* UNIV_DO_FLUSH */
02735 
02736 #ifndef UNIV_HOTBACKUP
02737   os_mutex_exit(os_file_seek_mutexes[i]);
02738 #endif /* !UNIV_HOTBACKUP */
02739 
02740   os_mutex_enter(os_file_count_mutex);
02741   os_n_pending_writes--;
02742   os_mutex_exit(os_file_count_mutex);
02743 
02744   if (ret && len == n) {
02745 
02746     return(TRUE);
02747   }
02748 
02749   /* If some background file system backup tool is running, then, at
02750   least in Windows 2000, we may get here a specific error. Let us
02751   retry the operation 100 times, with 1 second waits. */
02752 
02753   if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
02754 
02755     os_thread_sleep(1000000);
02756 
02757     n_retries++;
02758 
02759     goto retry;
02760   }
02761 
02762   if (!os_has_said_disk_full) {
02763 
02764     err = (ulint)GetLastError();
02765 
02766     ut_print_timestamp(stderr);
02767 
02768     fprintf(stderr,
02769       "  InnoDB: Error: Write to file %s failed"
02770       " at offset %lu %lu.\n"
02771       "InnoDB: %lu bytes should have been written,"
02772       " only %lu were written.\n"
02773       "InnoDB: Operating system error number %lu.\n"
02774       "InnoDB: Check that your OS and file system"
02775       " support files of this size.\n"
02776       "InnoDB: Check also that the disk is not full"
02777       " or a disk quota exceeded.\n",
02778       name, (ulong) offset_high, (ulong) offset,
02779       (ulong) n, (ulong) len, (ulong) err);
02780 
02781     if (strerror((int)err) != NULL) {
02782       fprintf(stderr,
02783         "InnoDB: Error number %lu means '%s'.\n",
02784         (ulong) err, strerror((int)err));
02785     }
02786 
02787     fprintf(stderr,
02788       "InnoDB: Some operating system error numbers"
02789       " are described at\n"
02790       "InnoDB: "
02791       REFMAN "operating-system-error-codes.html\n");
02792 
02793     os_has_said_disk_full = TRUE;
02794   }
02795 
02796   return(FALSE);
02797 #else
02798   ssize_t ret;
02799 
02800   ret = os_file_pwrite(file, buf, n, offset, offset_high);
02801 
02802   if ((ulint)ret == n) {
02803 
02804     return(TRUE);
02805   }
02806 
02807   if (!os_has_said_disk_full) {
02808 
02809     ut_print_timestamp(stderr);
02810 
02811     fprintf(stderr,
02812       "  InnoDB: Error: Write to file %s failed"
02813       " at offset %lu %lu.\n"
02814       "InnoDB: %lu bytes should have been written,"
02815       " only %ld were written.\n"
02816       "InnoDB: Operating system error number %lu.\n"
02817       "InnoDB: Check that your OS and file system"
02818       " support files of this size.\n"
02819       "InnoDB: Check also that the disk is not full"
02820       " or a disk quota exceeded.\n",
02821       name, offset_high, offset, n, (long int)ret,
02822       (ulint)errno);
02823     if (strerror(errno) != NULL) {
02824       fprintf(stderr,
02825         "InnoDB: Error number %lu means '%s'.\n",
02826         (ulint)errno, strerror(errno));
02827     }
02828 
02829     fprintf(stderr,
02830       "InnoDB: Some operating system error numbers"
02831       " are described at\n"
02832       "InnoDB: "
02833       REFMAN "operating-system-error-codes.html\n");
02834 
02835     os_has_said_disk_full = TRUE;
02836   }
02837 
02838   return(FALSE);
02839 #endif
02840 }
02841 
02842 /*******************************************************************/
02845 UNIV_INTERN
02846 ibool
02847 os_file_status(
02848 /*===========*/
02849   const char* path, 
02850   ibool*    exists, 
02851   os_file_type_t* type) 
02852 {
02853 #ifdef __WIN__
02854   int   ret;
02855   struct _stat  statinfo;
02856 
02857   ret = _stat(path, &statinfo);
02858   if (ret && (errno == ENOENT || errno == ENOTDIR)) {
02859     /* file does not exist */
02860     *exists = FALSE;
02861     return(TRUE);
02862   } else if (ret) {
02863     /* file exists, but stat call failed */
02864 
02865     os_file_handle_error_no_exit(path, "stat");
02866 
02867     return(FALSE);
02868   }
02869 
02870   if (_S_IFDIR & statinfo.st_mode) {
02871     *type = OS_FILE_TYPE_DIR;
02872   } else if (_S_IFREG & statinfo.st_mode) {
02873     *type = OS_FILE_TYPE_FILE;
02874   } else {
02875     *type = OS_FILE_TYPE_UNKNOWN;
02876   }
02877 
02878   *exists = TRUE;
02879 
02880   return(TRUE);
02881 #else
02882   int   ret;
02883   struct stat statinfo;
02884 
02885   ret = stat(path, &statinfo);
02886   if (ret && (errno == ENOENT || errno == ENOTDIR)) {
02887     /* file does not exist */
02888     *exists = FALSE;
02889     return(TRUE);
02890   } else if (ret) {
02891     /* file exists, but stat call failed */
02892 
02893     os_file_handle_error_no_exit(path, "stat");
02894 
02895     return(FALSE);
02896   }
02897 
02898   if (S_ISDIR(statinfo.st_mode)) {
02899     *type = OS_FILE_TYPE_DIR;
02900   } else if (S_ISLNK(statinfo.st_mode)) {
02901     *type = OS_FILE_TYPE_LINK;
02902   } else if (S_ISREG(statinfo.st_mode)) {
02903     *type = OS_FILE_TYPE_FILE;
02904   } else {
02905     *type = OS_FILE_TYPE_UNKNOWN;
02906   }
02907 
02908   *exists = TRUE;
02909 
02910   return(TRUE);
02911 #endif
02912 }
02913 
02914 /*******************************************************************/
02917 UNIV_INTERN
02918 ibool
02919 os_file_get_status(
02920 /*===============*/
02921   const char* path,   
02922   os_file_stat_t* stat_info)  
02924 {
02925 #ifdef __WIN__
02926   int   ret;
02927   struct _stat  statinfo;
02928 
02929   ret = _stat(path, &statinfo);
02930   if (ret && (errno == ENOENT || errno == ENOTDIR)) {
02931     /* file does not exist */
02932 
02933     return(FALSE);
02934   } else if (ret) {
02935     /* file exists, but stat call failed */
02936 
02937     os_file_handle_error_no_exit(path, "stat");
02938 
02939     return(FALSE);
02940   }
02941   if (_S_IFDIR & statinfo.st_mode) {
02942     stat_info->type = OS_FILE_TYPE_DIR;
02943   } else if (_S_IFREG & statinfo.st_mode) {
02944     stat_info->type = OS_FILE_TYPE_FILE;
02945   } else {
02946     stat_info->type = OS_FILE_TYPE_UNKNOWN;
02947   }
02948 
02949   stat_info->ctime = statinfo.st_ctime;
02950   stat_info->atime = statinfo.st_atime;
02951   stat_info->mtime = statinfo.st_mtime;
02952   stat_info->size  = statinfo.st_size;
02953 
02954   return(TRUE);
02955 #else
02956   int   ret;
02957   struct stat statinfo;
02958 
02959   ret = stat(path, &statinfo);
02960 
02961   if (ret && (errno == ENOENT || errno == ENOTDIR)) {
02962     /* file does not exist */
02963 
02964     return(FALSE);
02965   } else if (ret) {
02966     /* file exists, but stat call failed */
02967 
02968     os_file_handle_error_no_exit(path, "stat");
02969 
02970     return(FALSE);
02971   }
02972 
02973   if (S_ISDIR(statinfo.st_mode)) {
02974     stat_info->type = OS_FILE_TYPE_DIR;
02975   } else if (S_ISLNK(statinfo.st_mode)) {
02976     stat_info->type = OS_FILE_TYPE_LINK;
02977   } else if (S_ISREG(statinfo.st_mode)) {
02978     stat_info->type = OS_FILE_TYPE_FILE;
02979   } else {
02980     stat_info->type = OS_FILE_TYPE_UNKNOWN;
02981   }
02982 
02983   stat_info->ctime = statinfo.st_ctime;
02984   stat_info->atime = statinfo.st_atime;
02985   stat_info->mtime = statinfo.st_mtime;
02986   stat_info->size  = statinfo.st_size;
02987 
02988   return(TRUE);
02989 #endif
02990 }
02991 
02992 /* path name separator character */
02993 #ifdef __WIN__
02994 #  define OS_FILE_PATH_SEPARATOR  '\\'
02995 #else
02996 #  define OS_FILE_PATH_SEPARATOR  '/'
02997 #endif
02998 
02999 /****************************************************************/
03027 UNIV_INTERN
03028 char*
03029 os_file_dirname(
03030 /*============*/
03031   const char* path) 
03032 {
03033   /* Find the offset of the last slash */
03034   const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
03035   if (!last_slash) {
03036     /* No slash in the path, return "." */
03037 
03038     return(mem_strdup("."));
03039   }
03040 
03041   /* Ok, there is a slash */
03042 
03043   if (last_slash == path) {
03044     /* last slash is the first char of the path */
03045 
03046     return(mem_strdup("/"));
03047   }
03048 
03049   /* Non-trivial directory component */
03050 
03051   return(mem_strdupl(path, last_slash - path));
03052 }
03053 
03054 /****************************************************************/
03057 UNIV_INTERN
03058 ibool
03059 os_file_create_subdirs_if_needed(
03060 /*=============================*/
03061   const char* path) 
03062 {
03063   char*   subdir;
03064   ibool   success, subdir_exists;
03065   os_file_type_t  type;
03066 
03067   subdir = os_file_dirname(path);
03068   if (strlen(subdir) == 1
03069       && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
03070     /* subdir is root or cwd, nothing to do */
03071     mem_free(subdir);
03072 
03073     return(TRUE);
03074   }
03075 
03076   /* Test if subdir exists */
03077   success = os_file_status(subdir, &subdir_exists, &type);
03078   if (success && !subdir_exists) {
03079     /* subdir does not exist, create it */
03080     success = os_file_create_subdirs_if_needed(subdir);
03081     if (!success) {
03082       mem_free(subdir);
03083 
03084       return(FALSE);
03085     }
03086     success = os_file_create_directory(subdir, FALSE);
03087   }
03088 
03089   mem_free(subdir);
03090 
03091   return(success);
03092 }
03093 
03094 #ifndef UNIV_HOTBACKUP
03095 /****************************************************************/
03098 static
03099 os_aio_slot_t*
03100 os_aio_array_get_nth_slot(
03101 /*======================*/
03102   os_aio_array_t*   array,  
03103   ulint     index)  
03104 {
03105   ut_a(index < array->n_slots);
03106 
03107   return((array->slots) + index);
03108 }
03109 
#if defined(LINUX_NATIVE_AIO)
/******************************************************************//**
Zeroes an io_context_t and initializes it with io_setup(). A result of
-EAGAIN is retried up to OS_AIO_IO_SETUP_RETRY_ATTEMPTS times with a
sleep of OS_AIO_IO_SETUP_RETRY_SLEEP between attempts; any other failure
is reported at once.
@return TRUE on success, FALSE if io_setup() ultimately failed */
static
ibool
os_aio_linux_create_io_ctx(
/*=======================*/
  ulint   max_events, /*!< in: number of events passed to
          io_setup() */
  io_context_t* io_ctx)   /*!< out: context to initialize */
{
  ulint n_attempts = 0;
  int rc;

  for (;;) {
    memset(io_ctx, 0x0, sizeof(*io_ctx));

    /* Initialize the io_ctx. Tell it how many pending
    IO requests this context will handle. */
    rc = io_setup(max_events, io_ctx);

    if (rc == 0) {
#if defined(UNIV_AIO_DEBUG)
      fprintf(stderr,
        "InnoDB: Linux native AIO:"
        " initialized io_ctx for segment\n");
#endif
      /* Success. Return now. */
      return(TRUE);
    }

    if (rc != -EAGAIN) {
      /* Only EAGAIN is worth another attempt. */
      break;
    }

    if (n_attempts == 0) {
      /* First time around. */
      ut_print_timestamp(stderr);
      fprintf(stderr,
        "  InnoDB: Warning: io_setup() failed"
        " with EAGAIN. Will make %d attempts"
        " before giving up.\n",
        OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
    }

    if (!(n_attempts < OS_AIO_IO_SETUP_RETRY_ATTEMPTS)) {
      /* Have tried enough. Better call it a day. */
      break;
    }

    ++n_attempts;
    fprintf(stderr,
      "InnoDB: Warning: io_setup() attempt"
      " %lu failed.\n",
      n_attempts);
    os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
  }

  /* Every failure report starts with a timestamp. */
  ut_print_timestamp(stderr);

  if (rc == -EAGAIN) {
    fprintf(stderr,
      "  InnoDB: Error: io_setup() failed"
      " with EAGAIN after %d attempts.\n",
      OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
  } else if (rc == -ENOSYS) {
    fprintf(stderr,
      "  InnoDB: Error: Linux Native AIO interface"
      " is not supported on this platform. Please"
      " check your OS documentation and install"
      " appropriate binary of InnoDB.\n");
  } else {
    fprintf(stderr,
      "  InnoDB: Error: Linux Native AIO setup"
      " returned following error[%d]\n", -rc);
  }

  fprintf(stderr,
    "InnoDB: You can disable Linux Native AIO by"
    " setting innodb_native_aio = off in my.cnf\n");
  return(FALSE);
}
#endif /* LINUX_NATIVE_AIO */
03197 
03198 /******************************************************************/
03203 static
03204 os_aio_array_t*
03205 os_aio_array_create(
03206 /*================*/
03207   ulint n,    
03210   ulint n_segments) 
03211 {
03212   os_aio_array_t* array;
03213   ulint   i;
03214   os_aio_slot_t*  slot;
03215 #ifdef WIN_ASYNC_IO
03216   OVERLAPPED* over;
03217 #elif defined(LINUX_NATIVE_AIO)
03218   struct io_event*  aio_event = NULL;
03219 #endif
03220   ut_a(n > 0);
03221   ut_a(n_segments > 0);
03222 
03223   array = static_cast<os_aio_array_t *>(ut_malloc(sizeof(os_aio_array_t)));
03224 
03225   array->mutex    = os_mutex_create();
03226   array->not_full   = os_event_create(NULL);
03227   array->is_empty   = os_event_create(NULL);
03228 
03229   os_event_set(array->is_empty);
03230 
03231   array->n_slots    = n;
03232   array->n_segments = n_segments;
03233   array->n_reserved = 0;
03234   array->cur_seg    = 0;
03235   array->slots    = static_cast<os_aio_slot_t *>(ut_malloc(n * sizeof(os_aio_slot_t)));
03236 #ifdef __WIN__
03237   array->handles    = ut_malloc(n * sizeof(HANDLE));
03238 #endif
03239 
03240 #if defined(LINUX_NATIVE_AIO)
03241   array->aio_ctx = NULL;
03242   array->aio_events = NULL;
03243 
03244   /* If we are not using native aio interface then skip this
03245   part of initialization. */
03246   if (!srv_use_native_aio) {
03247     goto skip_native_aio;
03248   }
03249 
03250   /* Initialize the io_context array. One io_context
03251   per segment in the array. */
03252 
03253   array->aio_ctx = (io_context**) ut_malloc(n_segments *
03254            sizeof(*array->aio_ctx));
03255   for (i = 0; i < n_segments; ++i) {
03256     if (!os_aio_linux_create_io_ctx(n/n_segments,
03257              &array->aio_ctx[i])) {
03258       /* If something bad happened during aio setup
03259       we should call it a day and return right away.
03260       We don't care about any leaks because a failure
03261       to initialize the io subsystem means that the
03262       server (or atleast the innodb storage engine)
03263       is not going to startup. */
03264       return(NULL);
03265     }
03266   }
03267 
03268   /* Initialize the event array. One event per slot. */
03269   aio_event = (io_event*) ut_malloc(n * sizeof(io_event));
03270   memset(aio_event, 0x0, sizeof(io_event) * n);
03271   array->aio_events = aio_event;
03272 
03273 skip_native_aio:
03274 #endif /* LINUX_NATIVE_AIO */
03275   for (i = 0; i < n; i++) {
03276     slot = os_aio_array_get_nth_slot(array, i);
03277 
03278     slot->pos = i;
03279     slot->reserved = FALSE;
03280 #ifdef WIN_ASYNC_IO
03281     slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);
03282 
03283     over = &(slot->control);
03284 
03285     over->hEvent = slot->handle;
03286 
03287     *((array->handles) + i) = over->hEvent;
03288 
03289 #elif defined(LINUX_NATIVE_AIO)
03290 
03291     memset(&slot->control, 0x0, sizeof(slot->control));
03292     slot->n_bytes = 0;
03293     slot->ret = 0;
03294 #endif
03295   }
03296 
03297   return(array);
03298 }
03299 
03300 /************************************************************************/
03302 static
03303 void
03304 os_aio_array_free(
03305 /*==============*/
03306   os_aio_array_t* array)  
03307 {
03308 #ifdef WIN_ASYNC_IO
03309   ulint i;
03310 
03311   for (i = 0; i < array->n_slots; i++) {
03312     os_aio_slot_t*  slot = os_aio_array_get_nth_slot(array, i);
03313     CloseHandle(slot->handle);
03314   }
03315 #endif /* WIN_ASYNC_IO */
03316 
03317 #ifdef __WIN__
03318   ut_free(array->handles);
03319 #endif /* __WIN__ */
03320   os_mutex_free(array->mutex);
03321   os_event_free(array->not_full);
03322   os_event_free(array->is_empty);
03323 
03324 #if defined(LINUX_NATIVE_AIO)
03325   if (srv_use_native_aio) {
03326     ut_free(array->aio_events);
03327     ut_free(array->aio_ctx);
03328   }
03329 #endif /* LINUX_NATIVE_AIO */
03330 
03331   ut_free(array->slots);
03332   ut_free(array);
03333 }
03334 
03335 /***********************************************************************
03336 Initializes the asynchronous io system. Creates one array each for ibuf
03337 and log i/o. Also creates one array each for read and write where each
03338 array is divided logically into n_read_segs and n_write_segs
03339 respectively. The caller must create an i/o handler thread for each
03340 segment in these arrays. This function also creates the sync array.
03341 No i/o handler thread needs to be created for that */
03342 UNIV_INTERN
03343 ibool
03344 os_aio_init(
03345 /*========*/
03346   ulint n_per_seg,  /*<! in: maximum number of pending aio
03347         operations allowed per segment */
03348   ulint n_read_segs,  /*<! in: number of reader threads */
03349   ulint n_write_segs, /*<! in: number of writer threads */
03350   ulint n_slots_sync) /*<! in: number of slots in the sync aio
03351         array */
03352 {
03353   ulint i;
03354   ulint   n_segments = 2 + n_read_segs + n_write_segs;
03355 
03356   ut_ad(n_segments >= 4);
03357 
03358   os_io_init_simple();
03359 
03360   for (i = 0; i < n_segments; i++) {
03361     srv_set_io_thread_op_info(i, "not started yet");
03362   }
03363 
03364 
03365   /* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
03366 
03367   os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
03368   if (os_aio_ibuf_array == NULL) {
03369     goto err_exit;
03370   }
03371 
03372   srv_io_thread_function[0] = "insert buffer thread";
03373 
03374   os_aio_log_array = os_aio_array_create(n_per_seg, 1);
03375   if (os_aio_log_array == NULL) {
03376     goto err_exit;
03377   }
03378 
03379   srv_io_thread_function[1] = "log thread";
03380 
03381   os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
03382             n_read_segs);
03383   if (os_aio_read_array == NULL) {
03384     goto err_exit;
03385   }
03386 
03387   for (i = 2; i < 2 + n_read_segs; i++) {
03388     ut_a(i < SRV_MAX_N_IO_THREADS);
03389     srv_io_thread_function[i] = "read thread";
03390   }
03391 
03392   os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
03393              n_write_segs);
03394   if (os_aio_write_array == NULL) {
03395     goto err_exit;
03396   }
03397 
03398   for (i = 2 + n_read_segs; i < n_segments; i++) {
03399     ut_a(i < SRV_MAX_N_IO_THREADS);
03400     srv_io_thread_function[i] = "write thread";
03401   }
03402 
03403   os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
03404   if (os_aio_sync_array == NULL) {
03405     goto err_exit;
03406   }
03407 
03408 
03409   os_aio_n_segments = n_segments;
03410 
03411   os_aio_validate();
03412 
03413   os_aio_segment_wait_events = static_cast<os_event_t *>(ut_malloc(n_segments * sizeof(void*)));
03414 
03415   for (i = 0; i < n_segments; i++) {
03416     os_aio_segment_wait_events[i] = os_event_create(NULL);
03417   }
03418 
03419   os_last_printout = time(NULL);
03420 
03421   return(TRUE);
03422 
03423 err_exit:
03424   return(FALSE);
03425 
03426 }
03427 
03428 /***********************************************************************
03429 Frees the asynchronous io system. */
03430 UNIV_INTERN
03431 void
03432 os_aio_free(void)
03433 /*=============*/
03434 {
03435   ulint i;
03436 
03437   os_aio_array_free(os_aio_ibuf_array);
03438   os_aio_ibuf_array = NULL;
03439   os_aio_array_free(os_aio_log_array);
03440   os_aio_log_array = NULL;
03441   os_aio_array_free(os_aio_read_array);
03442   os_aio_read_array = NULL;
03443   os_aio_array_free(os_aio_write_array);
03444   os_aio_write_array = NULL;
03445   os_aio_array_free(os_aio_sync_array);
03446   os_aio_sync_array = NULL;
03447 
03448   for (i = 0; i < os_aio_n_segments; i++) {
03449     os_event_free(os_aio_segment_wait_events[i]);
03450   }
03451 
03452   ut_free(os_aio_segment_wait_events);
03453   os_aio_segment_wait_events = 0;
03454   os_aio_n_segments = 0;
03455 }
03456 
#ifdef WIN_ASYNC_IO
/************************************************************************//**
Signals the event handle of every slot in an aio array. */
static
void
os_aio_array_wake_win_aio_at_shutdown(
/*==================================*/
  os_aio_array_t* array)  /*!< in: aio array whose slot events are
        signaled */
{
  ulint pos = 0;

  while (pos < array->n_slots) {
    SetEvent(array->slots[pos].handle);
    pos++;
  }
}
#endif
03475 
03476 /************************************************************************/
03479 UNIV_INTERN
03480 void
03481 os_aio_wake_all_threads_at_shutdown(void)
03482 /*=====================================*/
03483 {
03484   ulint i;
03485 
03486 #ifdef WIN_ASYNC_IO
03487   /* This code wakes up all ai/o threads in Windows native aio */
03488   os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
03489   os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
03490   os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
03491   os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
03492 
03493 #elif defined(LINUX_NATIVE_AIO)
03494 
03495   /* When using native AIO interface the io helper threads
03496   wait on io_getevents with a timeout value of 500ms. At
03497   each wake up these threads check the server status.
03498   No need to do anything to wake them up. */
03499 
03500   if (srv_use_native_aio) {
03501     return;
03502   }
03503   /* Fall through to simulated AIO handler wakeup if we are
03504   not using native AIO. */
03505 #endif
03506   /* This loop wakes up all simulated ai/o threads */
03507 
03508   for (i = 0; i < os_aio_n_segments; i++) {
03509 
03510     os_event_set(os_aio_segment_wait_events[i]);
03511   }
03512 }
03513 
03514 /************************************************************************/
03517 UNIV_INTERN
03518 void
03519 os_aio_wait_until_no_pending_writes(void)
03520 /*=====================================*/
03521 {
03522   os_event_wait(os_aio_write_array->is_empty);
03523 }
03524 
03525 /**********************************************************************/
03529 static
03530 ulint
03531 os_aio_get_segment_no_from_slot(
03532 /*============================*/
03533   os_aio_array_t* array,  
03534   os_aio_slot_t*  slot) 
03535 {
03536   ulint segment;
03537   ulint seg_len;
03538 
03539   if (array == os_aio_ibuf_array) {
03540     segment = 0;
03541 
03542   } else if (array == os_aio_log_array) {
03543     segment = 1;
03544 
03545   } else if (array == os_aio_read_array) {
03546     seg_len = os_aio_read_array->n_slots
03547       / os_aio_read_array->n_segments;
03548 
03549     segment = 2 + slot->pos / seg_len;
03550   } else {
03551     ut_a(array == os_aio_write_array);
03552     seg_len = os_aio_write_array->n_slots
03553       / os_aio_write_array->n_segments;
03554 
03555     segment = os_aio_read_array->n_segments + 2
03556       + slot->pos / seg_len;
03557   }
03558 
03559   return(segment);
03560 }
03561 
03562 /**********************************************************************/
03565 static
03566 ulint
03567 os_aio_get_array_and_local_segment(
03568 /*===============================*/
03569   os_aio_array_t** array,   
03570   ulint    global_segment)
03571 {
03572   ulint segment;
03573 
03574   ut_a(global_segment < os_aio_n_segments);
03575 
03576   if (global_segment == 0) {
03577     *array = os_aio_ibuf_array;
03578     segment = 0;
03579 
03580   } else if (global_segment == 1) {
03581     *array = os_aio_log_array;
03582     segment = 0;
03583 
03584   } else if (global_segment < os_aio_read_array->n_segments + 2) {
03585     *array = os_aio_read_array;
03586 
03587     segment = global_segment - 2;
03588   } else {
03589     *array = os_aio_write_array;
03590 
03591     segment = global_segment - (os_aio_read_array->n_segments + 2);
03592   }
03593 
03594   return(segment);
03595 }
03596 
03597 /*******************************************************************/
03601 static
03602 os_aio_slot_t*
03603 os_aio_array_reserve_slot(
03604 /*======================*/
03605   ulint   type, 
03606   os_aio_array_t* array,  
03607   fil_node_t* message1,
03609   void*   message2,
03611   os_file_t file, 
03612   const char* name, 
03614   void*   buf,  
03616   ulint   offset, 
03618   ulint   offset_high, 
03620   ulint   len)  
03621 {
03622   os_aio_slot_t*  slot = NULL;
03623 #ifdef WIN_ASYNC_IO
03624   OVERLAPPED* control;
03625 
03626 #elif defined(LINUX_NATIVE_AIO)
03627 
03628   struct iocb*  iocb;
03629   off_t   aio_offset;
03630 
03631 #endif
03632   ulint   i;
03633   ulint   counter;
03634   ulint   slots_per_seg;
03635   ulint   local_seg;
03636 
03637 #ifdef WIN_ASYNC_IO
03638   ut_a((len & 0xFFFFFFFFUL) == len);
03639 #endif
03640 
03641   /* No need of a mutex. Only reading constant fields */
03642   slots_per_seg = array->n_slots / array->n_segments;
03643 
03644   /* We attempt to keep adjacent blocks in the same local
03645   segment. This can help in merging IO requests when we are
03646   doing simulated AIO */
03647   local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
03648         % array->n_segments;
03649 
03650 loop:
03651   os_mutex_enter(array->mutex);
03652 
03653   if (array->n_reserved == array->n_slots) {
03654     os_mutex_exit(array->mutex);
03655 
03656     if (!srv_use_native_aio) {
03657       /* If the handler threads are suspended, wake them
03658       so that we get more slots */
03659 
03660       os_aio_simulated_wake_handler_threads();
03661     }
03662 
03663     os_event_wait(array->not_full);
03664 
03665     goto loop;
03666   }
03667 
03668   /* We start our search for an available slot from our preferred
03669   local segment and do a full scan of the array. We are
03670   guaranteed to find a slot in full scan. */
03671   for (i = local_seg * slots_per_seg, counter = 0;
03672        counter < array->n_slots; i++, counter++) {
03673 
03674     i %= array->n_slots;
03675     slot = os_aio_array_get_nth_slot(array, i);
03676 
03677     if (slot->reserved == FALSE) {
03678       goto found;
03679     }
03680   }
03681 
03682   /* We MUST always be able to get hold of a reserved slot. */
03683   ut_error;
03684 
03685 found:
03686   ut_a(slot->reserved == FALSE);
03687   array->n_reserved++;
03688 
03689   if (array->n_reserved == 1) {
03690     os_event_reset(array->is_empty);
03691   }
03692 
03693   if (array->n_reserved == array->n_slots) {
03694     os_event_reset(array->not_full);
03695   }
03696 
03697   slot->reserved = TRUE;
03698   slot->reservation_time = time(NULL);
03699   slot->message1 = message1;
03700   slot->message2 = message2;
03701   slot->file     = file;
03702   slot->name     = name;
03703   slot->len      = len;
03704   slot->type     = type;
03705   slot->buf      = static_cast<unsigned char *>(buf);
03706   slot->offset   = offset;
03707   slot->offset_high = offset_high;
03708   slot->io_already_done = FALSE;
03709 
03710 #ifdef WIN_ASYNC_IO
03711   control = &(slot->control);
03712   control->Offset = (DWORD)offset;
03713   control->OffsetHigh = (DWORD)offset_high;
03714   ResetEvent(slot->handle);
03715 
03716 #elif defined(LINUX_NATIVE_AIO)
03717 
03718   /* If we are not using native AIO skip this part. */
03719   if (!srv_use_native_aio) {
03720     goto skip_native_aio;
03721   }
03722 
03723   /* Check if we are dealing with 64 bit arch.
03724   If not then make sure that offset fits in 32 bits. */
03725   if (sizeof(aio_offset) == 8) {
03726     aio_offset = offset_high;
03727     aio_offset <<= 32;
03728     aio_offset += offset;
03729   } else {
03730     ut_a(offset_high == 0);
03731     aio_offset = offset;
03732   }
03733 
03734   iocb = &slot->control;
03735 
03736   if (type == OS_FILE_READ) {
03737     io_prep_pread(iocb, file, buf, len, aio_offset);
03738   } else {
03739     ut_a(type == OS_FILE_WRITE);
03740     io_prep_pwrite(iocb, file, buf, len, aio_offset);
03741   }
03742 
03743   iocb->data = (void*)slot;
03744   slot->n_bytes = 0;
03745   slot->ret = 0;
03746   /*fprintf(stderr, "Filled up Linux native iocb.\n");*/
03747   
03748 
03749 skip_native_aio:
03750 #endif /* LINUX_NATIVE_AIO */
03751   os_mutex_exit(array->mutex);
03752 
03753   return(slot);
03754 }
03755 
03756 /*******************************************************************/
03758 static
03759 void
03760 os_aio_array_free_slot(
03761 /*===================*/
03762   os_aio_array_t* array,  
03763   os_aio_slot_t*  slot) 
03764 {
03765   ut_ad(array);
03766   ut_ad(slot);
03767 
03768   os_mutex_enter(array->mutex);
03769 
03770   ut_ad(slot->reserved);
03771 
03772   slot->reserved = FALSE;
03773 
03774   array->n_reserved--;
03775 
03776   if (array->n_reserved == array->n_slots - 1) {
03777     os_event_set(array->not_full);
03778   }
03779 
03780   if (array->n_reserved == 0) {
03781     os_event_set(array->is_empty);
03782   }
03783 
03784 #ifdef WIN_ASYNC_IO
03785 
03786   ResetEvent(slot->handle);
03787 
03788 #elif defined(LINUX_NATIVE_AIO)
03789 
03790   if (srv_use_native_aio) {
03791     memset(&slot->control, 0x0, sizeof(slot->control));
03792     slot->n_bytes = 0;
03793     slot->ret = 0;
03794     /*fprintf(stderr, "Freed up Linux native slot.\n");*/
03795   } else {
03796     /* These fields should not be used if we are not
03797     using native AIO. */
03798     ut_ad(slot->n_bytes == 0);
03799     ut_ad(slot->ret == 0);
03800   }
03801 
03802 #endif
03803   os_mutex_exit(array->mutex);
03804 }
03805 
03806 /**********************************************************************/
03808 static
03809 void
03810 os_aio_simulated_wake_handler_thread(
03811 /*=================================*/
03812   ulint global_segment) 
03814 {
03815   os_aio_array_t* array;
03816   os_aio_slot_t*  slot;
03817   ulint   segment;
03818   ulint   n;
03819   ulint   i;
03820 
03821   ut_ad(!srv_use_native_aio);
03822 
03823   segment = os_aio_get_array_and_local_segment(&array, global_segment);
03824 
03825   n = array->n_slots / array->n_segments;
03826 
03827   /* Look through n slots after the segment * n'th slot */
03828 
03829   os_mutex_enter(array->mutex);
03830 
03831   for (i = 0; i < n; i++) {
03832     slot = os_aio_array_get_nth_slot(array, i + segment * n);
03833 
03834     if (slot->reserved) {
03835       /* Found an i/o request */
03836 
03837       break;
03838     }
03839   }
03840 
03841   os_mutex_exit(array->mutex);
03842 
03843   if (i < n) {
03844     os_event_set(os_aio_segment_wait_events[global_segment]);
03845   }
03846 }
03847 
03848 /**********************************************************************/
03850 UNIV_INTERN
03851 void
03852 os_aio_simulated_wake_handler_threads(void)
03853 /*=======================================*/
03854 {
03855   ulint i;
03856 
03857   if (srv_use_native_aio) {
03858     /* We do not use simulated aio: do nothing */
03859 
03860     return;
03861   }
03862 
03863   os_aio_recommend_sleep_for_read_threads = FALSE;
03864 
03865   for (i = 0; i < os_aio_n_segments; i++) {
03866     os_aio_simulated_wake_handler_thread(i);
03867   }
03868 }
03869 
03870 /**********************************************************************/
03875 UNIV_INTERN
03876 void
03877 os_aio_simulated_put_read_threads_to_sleep(void)
03878 /*============================================*/
03879 {
03880 
03881 /* The idea of putting background IO threads to sleep is only for
03882 Windows when using simulated AIO. Windows XP seems to schedule
03883 background threads too eagerly to allow for coalescing during
03884 readahead requests. */
03885 #ifdef __WIN__
03886   os_aio_array_t* array;
03887   ulint   g;
03888 
03889   if (srv_use_native_aio) {
03890     /* We do not use simulated aio: do nothing */
03891 
03892     return;
03893   }
03894 
03895   os_aio_recommend_sleep_for_read_threads = TRUE;
03896 
03897   for (g = 0; g < os_aio_n_segments; g++) {
03898     os_aio_get_array_and_local_segment(&array, g);
03899 
03900     if (array == os_aio_read_array) {
03901 
03902       os_event_reset(os_aio_segment_wait_events[g]);
03903     }
03904   }
03905 #endif /* __WIN__ */
03906 }
03907 
03908 #if defined(LINUX_NATIVE_AIO)
03909 /*******************************************************************/
03912 static
03913 ibool
03914 os_aio_linux_dispatch(
03915 /*==================*/
03916   os_aio_array_t* array,  
03917   os_aio_slot_t*  slot) 
03918 {
03919   int   ret;
03920   ulint   io_ctx_index;
03921   struct iocb*  iocb;
03922 
03923   ut_ad(slot != NULL);
03924   ut_ad(array);
03925 
03926   ut_a(slot->reserved);
03927 
03928   /* Find out what we are going to work with.
03929   The iocb struct is directly in the slot.
03930   The io_context is one per segment. */
03931 
03932   iocb = &slot->control;
03933   io_ctx_index = (slot->pos * array->n_segments) / array->n_slots;
03934 
03935   ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);
03936 
03937 #if defined(UNIV_AIO_DEBUG)
03938   fprintf(stderr,
03939     "io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
03940     (slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
03941     array->aio_ctx[io_ctx_index], (ulong)io_ctx_index);
03942 #endif
03943 
03944   /* io_submit returns number of successfully
03945   queued requests or -errno. */
03946   if (UNIV_UNLIKELY(ret != 1)) {
03947     errno = -ret;
03948     return(FALSE);
03949   }
03950 
03951   return(TRUE);
03952 }
03953 #endif /* LINUX_NATIVE_AIO */
03954 
03955 
03956 /*******************************************************************/
03960 UNIV_INTERN
03961 ibool
03962 os_aio_func(
03963 /*========*/
03964   ulint   type, 
03965   ulint   mode, 
03978   const char* name, 
03980   os_file_t file, 
03981   void*   buf,  
03983   ulint   offset, 
03985   ulint   offset_high, 
03987   ulint   n,  
03988   fil_node_t* message1,
03992   void*   message2)
03996 {
03997   os_aio_array_t* array;
03998   os_aio_slot_t*  slot;
03999 #ifdef WIN_ASYNC_IO
04000   ibool   retval;
04001   BOOL    ret   = TRUE;
04002   DWORD   len   = (DWORD) n;
04003   struct fil_node_struct * dummy_mess1;
04004   void*   dummy_mess2;
04005   ulint   dummy_type;
04006 #endif /* WIN_ASYNC_IO */
04007 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
04008   ibool   retry;
04009 #endif
04010   ulint   wake_later;
04011 
04012   ut_ad(file);
04013   ut_ad(buf);
04014   ut_ad(n > 0);
04015   ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
04016   ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
04017   ut_ad(os_aio_validate());
04018 #ifdef WIN_ASYNC_IO
04019   ut_ad((n & 0xFFFFFFFFUL) == n);
04020 #endif
04021 
04022   wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
04023   mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
04024 
04025   if (mode == OS_AIO_SYNC
04026 #ifdef WIN_ASYNC_IO
04027       && !srv_use_native_aio
04028 #endif /* WIN_ASYNC_IO */
04029       ) {
04030     /* This is actually an ordinary synchronous read or write:
04031     no need to use an i/o-handler thread. NOTE that if we use
04032     Windows async i/o, Windows does not allow us to use
04033     ordinary synchronous os_file_read etc. on the same file,
04034     therefore we have built a special mechanism for synchronous
04035     wait in the Windows case. */
04036 
04037     if (type == OS_FILE_READ) {
04038       return(os_file_read(file, buf, offset,
04039               offset_high, n));
04040     }
04041 
04042     ut_a(type == OS_FILE_WRITE);
04043 
04044     return(os_file_write(name, file, buf, offset, offset_high, n));
04045   }
04046 
04047 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
04048 try_again:
04049 #endif
04050   if (mode == OS_AIO_NORMAL) {
04051     if (type == OS_FILE_READ) {
04052       array = os_aio_read_array;
04053     } else {
04054       array = os_aio_write_array;
04055     }
04056   } else if (mode == OS_AIO_IBUF) {
04057     ut_ad(type == OS_FILE_READ);
04058     /* Reduce probability of deadlock bugs in connection with ibuf:
04059     do not let the ibuf i/o handler sleep */
04060 
04061     wake_later = FALSE;
04062 
04063     array = os_aio_ibuf_array;
04064   } else if (mode == OS_AIO_LOG) {
04065 
04066     array = os_aio_log_array;
04067   } else if (mode == OS_AIO_SYNC) {
04068     array = os_aio_sync_array;
04069 
04070 #if defined(LINUX_NATIVE_AIO)
04071     /* In Linux native AIO we don't use sync IO array. */
04072     ut_a(!srv_use_native_aio);
04073 #endif /* LINUX_NATIVE_AIO */
04074   } else {
04075     array = NULL; /* Eliminate compiler warning */
04076     ut_error;
04077   }
04078 
04079   slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
04080            name, buf, offset, offset_high, n);
04081   if (type == OS_FILE_READ) {
04082     if (srv_use_native_aio) {
04083       os_n_file_reads++;
04084       os_bytes_read_since_printout += n;
04085 #ifdef WIN_ASYNC_IO
04086       ret = ReadFile(file, buf, (DWORD)n, &len,
04087                &(slot->control));
04088 
04089 #elif defined(LINUX_NATIVE_AIO)
04090       if (!os_aio_linux_dispatch(array, slot)) {
04091         goto err_exit;
04092       }
04093 #endif
04094     } else {
04095       if (!wake_later) {
04096         os_aio_simulated_wake_handler_thread(
04097           os_aio_get_segment_no_from_slot(
04098             array, slot));
04099       }
04100     }
04101   } else if (type == OS_FILE_WRITE) {
04102     if (srv_use_native_aio) {
04103       os_n_file_writes++;
04104 #ifdef WIN_ASYNC_IO
04105       ret = WriteFile(file, buf, (DWORD)n, &len,
04106           &(slot->control));
04107 
04108 #elif defined(LINUX_NATIVE_AIO)
04109       if (!os_aio_linux_dispatch(array, slot)) {
04110         goto err_exit;
04111       }
04112 #endif
04113     } else {
04114       if (!wake_later) {
04115         os_aio_simulated_wake_handler_thread(
04116           os_aio_get_segment_no_from_slot(
04117             array, slot));
04118       }
04119     }
04120   } else {
04121     ut_error;
04122   }
04123 
04124 #ifdef WIN_ASYNC_IO
04125   if (srv_use_native_aio) {
04126     if ((ret && len == n)
04127         || (!ret && GetLastError() == ERROR_IO_PENDING)) {
04128       /* aio was queued successfully! */
04129 
04130       if (mode == OS_AIO_SYNC) {
04131         /* We want a synchronous i/o operation on a
04132         file where we also use async i/o: in Windows
04133         we must use the same wait mechanism as for
04134         async i/o */
04135 
04136         retval = os_aio_windows_handle(ULINT_UNDEFINED,
04137                      slot->pos,
04138                      &dummy_mess1,
04139                      &dummy_mess2,
04140                      &dummy_type);
04141 
04142         return(retval);
04143       }
04144 
04145       return(TRUE);
04146     }
04147 
04148     goto err_exit;
04149   }
04150 #endif /* WIN_ASYNC_IO */
04151   /* aio was queued successfully! */
04152   return(TRUE);
04153 
04154 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
04155 err_exit:
04156   os_aio_array_free_slot(array, slot);
04157 
04158   retry = os_file_handle_error(name,
04159              type == OS_FILE_READ
04160              ? "aio read" : "aio write");
04161   if (retry) {
04162 
04163     goto try_again;
04164   }
04165 
04166   return(FALSE);
04167 #endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
04168 }
04169 
04170 #ifdef WIN_ASYNC_IO
04171 /**********************************************************************/
04179 UNIV_INTERN
04180 ibool
04181 os_aio_windows_handle(
04182 /*==================*/
04183   ulint segment,  
04191   ulint pos,    
04193   fil_node_t**message1, 
04198   void**  message2,
04199   ulint*  type)   
04200 {
04201   ulint   orig_seg  = segment;
04202   os_aio_array_t* array;
04203   os_aio_slot_t*  slot;
04204   ulint   n;
04205   ulint   i;
04206   ibool   ret_val;
04207   BOOL    ret;
04208   DWORD   len;
04209   BOOL    retry   = FALSE;
04210 
04211   if (segment == ULINT_UNDEFINED) {
04212     array = os_aio_sync_array;
04213     segment = 0;
04214   } else {
04215     segment = os_aio_get_array_and_local_segment(&array, segment);
04216   }
04217 
04218   /* NOTE! We only access constant fields in os_aio_array. Therefore
04219   we do not have to acquire the protecting mutex yet */
04220 
04221   ut_ad(os_aio_validate());
04222   ut_ad(segment < array->n_segments);
04223 
04224   n = array->n_slots / array->n_segments;
04225 
04226   if (array == os_aio_sync_array) {
04227     WaitForSingleObject(
04228       os_aio_array_get_nth_slot(array, pos)->handle,
04229       INFINITE);
04230     i = pos;
04231   } else {
04232     srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
04233     i = WaitForMultipleObjects((DWORD) n,
04234              array->handles + segment * n,
04235              FALSE,
04236              INFINITE);
04237   }
04238 
04239   if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
04240     os_thread_exit(NULL);
04241   }
04242 
04243   os_mutex_enter(array->mutex);
04244 
04245   slot = os_aio_array_get_nth_slot(array, i + segment * n);
04246 
04247   ut_a(slot->reserved);
04248 
04249   if (orig_seg != ULINT_UNDEFINED) {
04250     srv_set_io_thread_op_info(orig_seg,
04251             "get windows aio return value");
04252   }
04253 
04254   ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE);
04255 
04256   *message1 = slot->message1;
04257   *message2 = slot->message2;
04258 
04259   *type = slot->type;
04260 
04261   if (ret && len == slot->len) {
04262     ret_val = TRUE;
04263 
04264 #ifdef UNIV_DO_FLUSH
04265     if (slot->type == OS_FILE_WRITE
04266         && !os_do_not_call_flush_at_each_write) {
04267       if (!os_file_flush(slot->file)) {
04268         ut_error;
04269       }
04270     }
04271 #endif /* UNIV_DO_FLUSH */
04272   } else if (os_file_handle_error(slot->name, "Windows aio")) {
04273 
04274     retry = TRUE;
04275   } else {
04276 
04277     ret_val = FALSE;
04278   }
04279 
04280   os_mutex_exit(array->mutex);
04281 
04282   if (retry) {
04283     /* retry failed read/write operation synchronously.
04284     No need to hold array->mutex. */
04285 
04286 #ifdef UNIV_PFS_IO
04287     /* This read/write does not go through os_file_read
04288     and os_file_write APIs, need to register with
04289     performance schema explicitly here. */
04290     struct PSI_file_locker* locker = NULL;
04291     register_pfs_file_io_begin(locker, slot->file, slot->len,
04292              (slot->type == OS_FILE_WRITE)
04293             ? PSI_FILE_WRITE
04294             : PSI_FILE_READ,
04295               __FILE__, __LINE__);
04296 #endif
04297 
04298     ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
04299 
04300     switch (slot->type) {
04301     case OS_FILE_WRITE:
04302       ret = WriteFile(slot->file, slot->buf,
04303           (DWORD) slot->len, &len,
04304           &(slot->control));
04305 
04306       break;
04307     case OS_FILE_READ:
04308       ret = ReadFile(slot->file, slot->buf,
04309                (DWORD) slot->len, &len,
04310                &(slot->control));
04311 
04312       break;
04313     default:
04314       ut_error;
04315     }
04316 
04317 #ifdef UNIV_PFS_IO
04318     register_pfs_file_io_end(locker, len);
04319 #endif
04320 
04321     if (!ret && GetLastError() == ERROR_IO_PENDING) {
04322       /* aio was queued successfully!
04323       We want a synchronous i/o operation on a
04324       file where we also use async i/o: in Windows
04325       we must use the same wait mechanism as for
04326       async i/o */
04327 
04328       ret = GetOverlappedResult(slot->file,
04329               &(slot->control),
04330               &len, TRUE);
04331     }
04332 
04333     ret_val = ret && len == slot->len;
04334   }
04335 
04336   os_aio_array_free_slot(array, slot);
04337 
04338   return(ret_val);
04339 }
04340 #endif
04341 
04342 #if defined(LINUX_NATIVE_AIO)
04343 /******************************************************************/
04354 static
04355 void
04356 os_aio_linux_collect(
04357 /*=================*/
04358   os_aio_array_t* array,    
04359   ulint   segment,  
04360   ulint   seg_size) 
04361 {
04362   int     i;
04363   int     ret;
04364   ulint     start_pos;
04365   ulint     end_pos;
04366   struct timespec   timeout;
04367   struct io_event*  events;
04368   struct io_context*  io_ctx;
04369 
04370   /* sanity checks. */
04371   ut_ad(array != NULL);
04372   ut_ad(seg_size > 0);
04373   ut_ad(segment < array->n_segments);
04374 
04375   /* Which part of event array we are going to work on. */
04376   events = &array->aio_events[segment * seg_size];
04377 
04378   /* Which io_context we are going to use. */
04379   io_ctx = array->aio_ctx[segment];
04380 
04381   /* Starting point of the segment we will be working on. */
04382   start_pos = segment * seg_size;
04383 
04384   /* End point. */
04385   end_pos = start_pos + seg_size;
04386 
04387 retry:
04388 
04389   /* Go down if we are in shutdown mode.
04390   In case of srv_fast_shutdown == 2, there may be pending
04391   IO requests but that should be OK as we essentially treat
04392   that as a crash of InnoDB. */
04393   if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
04394     os_thread_exit(NULL);
04395   }
04396 
04397   /* Initialize the events. The timeout value is arbitrary.
04398   We probably need to experiment with it a little. */
04399   memset(events, 0, sizeof(*events) * seg_size);
04400   timeout.tv_sec = 0;
04401   timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
04402 
04403   ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
04404 
04405   /* This error handling is for any error in collecting the
04406   IO requests. The errors, if any, for any particular IO
04407   request are simply passed on to the calling routine. */
04408 
04409   /* Not enough resources! Try again. */
04410   if (ret == -EAGAIN) {
04411     goto retry;
04412   }
04413 
04414   /* Interrupted! I have tested the behaviour in case of an
04415   interrupt. If we have some completed IOs available then
04416   the return code will be the number of IOs. We get EINTR only
04417   if there are no completed IOs and we have been interrupted. */
04418   if (ret == -EINTR) {
04419     goto retry;
04420   }
04421 
04422   /* No pending request! Go back and check again. */
04423   if (ret == 0) {
04424     goto retry;
04425   }
04426 
04427   /* All other errors! should cause a trap for now. */
04428   if (UNIV_UNLIKELY(ret < 0)) {
04429     ut_print_timestamp(stderr);
04430     fprintf(stderr,
04431       "  InnoDB: unexpected ret_code[%d] from"
04432       " io_getevents()!\n", ret);
04433     ut_error;
04434   }
04435 
04436   ut_a(ret > 0);
04437 
04438   for (i = 0; i < ret; i++) {
04439     os_aio_slot_t*  slot;
04440     struct iocb*  control;
04441 
04442     control = (struct iocb *)events[i].obj;
04443     ut_a(control != NULL);
04444 
04445     slot = (os_aio_slot_t *) control->data;
04446 
04447     /* Some sanity checks. */
04448     ut_a(slot != NULL);
04449     ut_a(slot->reserved);
04450 
04451 #if defined(UNIV_AIO_DEBUG)
04452     fprintf(stderr,
04453       "io_getevents[%c]: slot[%p] ctx[%p]"
04454       " seg[%lu]\n",
04455       (slot->type == OS_FILE_WRITE) ? 'w' : 'r',
04456       slot, io_ctx, segment);
04457 #endif
04458 
04459     /* We are not scribbling previous segment. */
04460     ut_a(slot->pos >= start_pos);
04461 
04462     /* We have not overstepped to next segment. */
04463     ut_a(slot->pos < end_pos);
04464 
04465     /* Mark this request as completed. The error handling
04466     will be done in the calling function. */
04467     os_mutex_enter(array->mutex);
04468     slot->n_bytes = events[i].res;
04469     slot->ret = events[i].res2;
04470     slot->io_already_done = TRUE;
04471     os_mutex_exit(array->mutex);
04472   }
04473 
04474   return;
04475 }
04476 
04477 /**********************************************************************/
04485 UNIV_INTERN
04486 ibool
04487 os_aio_linux_handle(
04488 /*================*/
04489   ulint global_seg, 
04495   fil_node_t**message1, 
04496   void**  message2, 
04500   ulint*  type)   
04501 {
04502   ulint   segment;
04503   os_aio_array_t* array;
04504   os_aio_slot_t*  slot;
04505   ulint   n;
04506   ulint   i;
04507   ibool   ret = FALSE;
04508 
04509   /* Should never be doing Sync IO here. */
04510   ut_a(global_seg != ULINT_UNDEFINED);
04511 
04512   /* Find the array and the local segment. */
04513   segment = os_aio_get_array_and_local_segment(&array, global_seg);
04514   n = array->n_slots / array->n_segments;
04515 
04516   /* Loop until we have found a completed request. */
04517   for (;;) {
04518     os_mutex_enter(array->mutex);
04519     for (i = 0; i < n; ++i) {
04520       slot = os_aio_array_get_nth_slot(
04521           array, i + segment * n);
04522       if (slot->reserved && slot->io_already_done) {
04523         /* Something for us to work on. */
04524         goto found;
04525       }
04526     }
04527 
04528     os_mutex_exit(array->mutex);
04529 
04530     /* We don't have any completed request.
04531     Wait for some request. Note that we return
04532     from wait iff we have found a request. */
04533 
04534     srv_set_io_thread_op_info(global_seg,
04535       "waiting for completed aio requests");
04536     os_aio_linux_collect(array, segment, n);
04537   }
04538 
04539 found:
04540   /* Note that it may be that there are more then one completed
04541   IO requests. We process them one at a time. We may have a case
04542   here to improve the performance slightly by dealing with all
04543   requests in one sweep. */
04544   srv_set_io_thread_op_info(global_seg,
04545         "processing completed aio requests");
04546 
04547   /* Ensure that we are scribbling only our segment. */
04548   ut_a(i < n);
04549 
04550   ut_ad(slot != NULL);
04551   ut_ad(slot->reserved);
04552   ut_ad(slot->io_already_done);
04553 
04554   *message1 = slot->message1;
04555   *message2 = slot->message2;
04556 
04557   *type = slot->type;
04558 
04559   if ((slot->ret == 0) && (slot->n_bytes == (long)slot->len)) {
04560     ret = TRUE;
04561 
04562 #ifdef UNIV_DO_FLUSH
04563     if (slot->type == OS_FILE_WRITE
04564         && !os_do_not_call_flush_at_each_write)
04565         && !os_file_flush(slot->file) {
04566       ut_error;
04567     }
04568 #endif /* UNIV_DO_FLUSH */
04569   } else {
04570     errno = -slot->ret;
04571 
04572     /* os_file_handle_error does tell us if we should retry
04573     this IO. As it stands now, we don't do this retry when
04574     reaping requests from a different context than
04575     the dispatcher. This non-retry logic is the same for
04576     windows and linux native AIO.
04577     We should probably look into this to transparently
04578     re-submit the IO. */
04579     os_file_handle_error(slot->name, "Linux aio");
04580 
04581     ret = FALSE;
04582   }
04583 
04584   os_mutex_exit(array->mutex);
04585 
04586   os_aio_array_free_slot(array, slot);
04587 
04588   return(ret);
04589 }
04590 #endif /* LINUX_NATIVE_AIO */
04591 
04592 /**********************************************************************/
04596 UNIV_INTERN
04597 ibool
04598 os_aio_simulated_handle(
04599 /*====================*/
04600   ulint global_segment, 
04605   fil_node_t**message1, 
04610   void**  message2,
04611   ulint*  type)   
04612 {
04613   os_aio_array_t* array;
04614   ulint   segment;
04615   os_aio_slot_t*  slot;
04616   os_aio_slot_t*  slot2;
04617   os_aio_slot_t*  consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
04618   ulint   n_consecutive;
04619   ulint   total_len;
04620   ulint   offs;
04621   ulint   lowest_offset;
04622   ulint   biggest_age;
04623   ulint   age;
04624   byte*   combined_buf;
04625   byte*   combined_buf2;
04626   ibool   ret;
04627   ulint   n;
04628   ulint   i;
04629 
04630   /* Fix compiler warning */
04631   *consecutive_ios = NULL;
04632 
04633   memset(consecutive_ios, 0, sizeof(os_aio_slot_t*) * OS_AIO_MERGE_N_CONSECUTIVE);
04634   segment = os_aio_get_array_and_local_segment(&array, global_segment);
04635 
04636 restart:
04637   /* NOTE! We only access constant fields in os_aio_array. Therefore
04638   we do not have to acquire the protecting mutex yet */
04639 
04640   srv_set_io_thread_op_info(global_segment,
04641           "looking for i/o requests (a)");
04642   ut_ad(os_aio_validate());
04643   ut_ad(segment < array->n_segments);
04644 
04645   n = array->n_slots / array->n_segments;
04646 
04647   /* Look through n slots after the segment * n'th slot */
04648 
04649   if (array == os_aio_read_array
04650       && os_aio_recommend_sleep_for_read_threads) {
04651 
04652     /* Give other threads chance to add several i/os to the array
04653     at once. */
04654 
04655     goto recommended_sleep;
04656   }
04657 
04658   os_mutex_enter(array->mutex);
04659 
04660   srv_set_io_thread_op_info(global_segment,
04661           "looking for i/o requests (b)");
04662 
04663   /* Check if there is a slot for which the i/o has already been
04664   done */
04665 
04666   for (i = 0; i < n; i++) {
04667     slot = os_aio_array_get_nth_slot(array, i + segment * n);
04668 
04669     if (slot->reserved && slot->io_already_done) {
04670 
04671       if (os_aio_print_debug) {
04672         fprintf(stderr,
04673           "InnoDB: i/o for slot %lu"
04674           " already done, returning\n",
04675           (ulong) i);
04676       }
04677 
04678       ret = TRUE;
04679 
04680       goto slot_io_done;
04681     }
04682   }
04683 
04684   n_consecutive = 0;
04685 
04686   /* If there are at least 2 seconds old requests, then pick the oldest
04687   one to prevent starvation. If several requests have the same age,
04688   then pick the one at the lowest offset. */
04689 
04690   biggest_age = 0;
04691   lowest_offset = ULINT_MAX;
04692 
04693   for (i = 0; i < n; i++) {
04694     slot = os_aio_array_get_nth_slot(array, i + segment * n);
04695 
04696     if (slot->reserved) {
04697       age = (ulint)difftime(time(NULL),
04698                 slot->reservation_time);
04699 
04700       if ((age >= 2 && age > biggest_age)
04701           || (age >= 2 && age == biggest_age
04702         && slot->offset < lowest_offset)) {
04703 
04704         /* Found an i/o request */
04705         consecutive_ios[0] = slot;
04706 
04707         n_consecutive = 1;
04708 
04709         biggest_age = age;
04710         lowest_offset = slot->offset;
04711       }
04712     }
04713   }
04714 
04715   if (n_consecutive == 0) {
04716     /* There were no old requests. Look for an i/o request at the
04717     lowest offset in the array (we ignore the high 32 bits of the
04718     offset in these heuristics) */
04719 
04720     lowest_offset = ULINT_MAX;
04721 
04722     for (i = 0; i < n; i++) {
04723       slot = os_aio_array_get_nth_slot(array,
04724                i + segment * n);
04725 
04726       if (slot->reserved && slot->offset < lowest_offset) {
04727 
04728         /* Found an i/o request */
04729         consecutive_ios[0] = slot;
04730 
04731         n_consecutive = 1;
04732 
04733         lowest_offset = slot->offset;
04734       }
04735     }
04736   }
04737 
04738   if (n_consecutive == 0) {
04739 
04740     /* No i/o requested at the moment */
04741 
04742     goto wait_for_io;
04743   }
04744 
04745   /* if n_consecutive != 0, then we have assigned
04746   something valid to consecutive_ios[0] */
04747   ut_ad(n_consecutive != 0);
04748   ut_ad(consecutive_ios[0] != NULL);
04749 
04750   slot = consecutive_ios[0];
04751 
04752   /* Check if there are several consecutive blocks to read or write */
04753 
04754 consecutive_loop:
04755   for (i = 0; i < n; i++) {
04756     slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
04757 
04758     if (slot2->reserved && slot2 != slot
04759         && slot2->offset == slot->offset + slot->len
04760         /* check that sum does not wrap over */
04761         && slot->offset + slot->len > slot->offset
04762         && slot2->offset_high == slot->offset_high
04763         && slot2->type == slot->type
04764         && slot2->file == slot->file) {
04765 
04766       /* Found a consecutive i/o request */
04767 
04768       consecutive_ios[n_consecutive] = slot2;
04769       n_consecutive++;
04770 
04771       slot = slot2;
04772 
04773       if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
04774 
04775         goto consecutive_loop;
04776       } else {
04777         break;
04778       }
04779     }
04780   }
04781 
04782   srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
04783 
04784   /* We have now collected n_consecutive i/o requests in the array;
04785   allocate a single buffer which can hold all data, and perform the
04786   i/o */
04787 
04788   total_len = 0;
04789   slot = consecutive_ios[0];
04790 
04791   for (i = 0; i < n_consecutive; i++) {
04792     total_len += consecutive_ios[i]->len;
04793   }
04794 
04795   if (n_consecutive == 1) {
04796     /* We can use the buffer of the i/o request */
04797     combined_buf = slot->buf;
04798     combined_buf2 = NULL;
04799   } else {
04800     combined_buf2 = static_cast<unsigned char *>(ut_malloc(total_len + UNIV_PAGE_SIZE));
04801 
04802     ut_a(combined_buf2);
04803 
04804     combined_buf = static_cast<unsigned char *>(ut_align(combined_buf2, UNIV_PAGE_SIZE));
04805   }
04806 
04807   /* We release the array mutex for the time of the i/o: NOTE that
04808   this assumes that there is just one i/o-handler thread serving
04809   a single segment of slots! */
04810 
04811   os_mutex_exit(array->mutex);
04812 
04813   if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
04814     /* Copy the buffers to the combined buffer */
04815     offs = 0;
04816 
04817     for (i = 0; i < n_consecutive; i++) {
04818 
04819       ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
04820           consecutive_ios[i]->len);
04821       offs += consecutive_ios[i]->len;
04822     }
04823   }
04824 
04825   srv_set_io_thread_op_info(global_segment, "doing file i/o");
04826 
04827   if (os_aio_print_debug) {
04828     fprintf(stderr,
04829       "InnoDB: doing i/o of type %lu at offset %lu %lu,"
04830       " length %lu\n",
04831       (ulong) slot->type, (ulong) slot->offset_high,
04832       (ulong) slot->offset, (ulong) total_len);
04833   }
04834 
04835   /* Do the i/o with ordinary, synchronous i/o functions: */
04836   if (slot->type == OS_FILE_WRITE) {
04837     ret = os_file_write(slot->name, slot->file, combined_buf,
04838             slot->offset, slot->offset_high,
04839             total_len);
04840   } else {
04841     ret = os_file_read(slot->file, combined_buf,
04842            slot->offset, slot->offset_high, total_len);
04843   }
04844 
04845   ut_a(ret);
04846   srv_set_io_thread_op_info(global_segment, "file i/o done");
04847 
04848 #if 0
04849   fprintf(stderr,
04850     "aio: %lu consecutive %lu:th segment, first offs %lu blocks\n",
04851     n_consecutive, global_segment, slot->offset / UNIV_PAGE_SIZE);
04852 #endif
04853 
04854   if (slot->type == OS_FILE_READ && n_consecutive > 1) {
04855     /* Copy the combined buffer to individual buffers */
04856     offs = 0;
04857 
04858     for (i = 0; i < n_consecutive; i++) {
04859 
04860       ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
04861           consecutive_ios[i]->len);
04862       offs += consecutive_ios[i]->len;
04863     }
04864   }
04865 
04866   if (combined_buf2) {
04867     ut_free(combined_buf2);
04868   }
04869 
04870   os_mutex_enter(array->mutex);
04871 
04872   /* Mark the i/os done in slots */
04873 
04874   for (i = 0; i < n_consecutive; i++) {
04875     consecutive_ios[i]->io_already_done = TRUE;
04876   }
04877 
04878   /* We return the messages for the first slot now, and if there were
04879   several slots, the messages will be returned with subsequent calls
04880   of this function */
04881 
04882 slot_io_done:
04883 
04884   ut_a(slot->reserved);
04885 
04886   *message1 = slot->message1;
04887   *message2 = slot->message2;
04888 
04889   *type = slot->type;
04890 
04891   os_mutex_exit(array->mutex);
04892 
04893   os_aio_array_free_slot(array, slot);
04894 
04895   return(ret);
04896 
04897 wait_for_io:
04898   srv_set_io_thread_op_info(global_segment, "resetting wait event");
04899 
04900   /* We wait here until there again can be i/os in the segment
04901   of this thread */
04902 
04903   os_event_reset(os_aio_segment_wait_events[global_segment]);
04904 
04905   os_mutex_exit(array->mutex);
04906 
04907 recommended_sleep:
04908   srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
04909 
04910   os_event_wait(os_aio_segment_wait_events[global_segment]);
04911 
04912   if (os_aio_print_debug) {
04913     fprintf(stderr,
04914       "InnoDB: i/o handler thread for i/o"
04915       " segment %lu wakes up\n",
04916       (ulong) global_segment);
04917   }
04918 
04919   goto restart;
04920 }
04921 
04922 /**********************************************************************/
04925 static
04926 ibool
04927 os_aio_array_validate(
04928 /*==================*/
04929   os_aio_array_t* array)  
04930 {
04931   os_aio_slot_t*  slot;
04932   ulint   n_reserved  = 0;
04933   ulint   i;
04934 
04935   ut_a(array);
04936 
04937   os_mutex_enter(array->mutex);
04938 
04939   ut_a(array->n_slots > 0);
04940   ut_a(array->n_segments > 0);
04941 
04942   for (i = 0; i < array->n_slots; i++) {
04943     slot = os_aio_array_get_nth_slot(array, i);
04944 
04945     if (slot->reserved) {
04946       n_reserved++;
04947       ut_a(slot->len > 0);
04948     }
04949   }
04950 
04951   ut_a(array->n_reserved == n_reserved);
04952 
04953   os_mutex_exit(array->mutex);
04954 
04955   return(TRUE);
04956 }
04957 
04958 /**********************************************************************/
04961 UNIV_INTERN
04962 ibool
04963 os_aio_validate(void)
04964 /*=================*/
04965 {
04966   os_aio_array_validate(os_aio_read_array);
04967   os_aio_array_validate(os_aio_write_array);
04968   os_aio_array_validate(os_aio_ibuf_array);
04969   os_aio_array_validate(os_aio_log_array);
04970   os_aio_array_validate(os_aio_sync_array);
04971 
04972   return(TRUE);
04973 }
04974 
04975 /**********************************************************************/
04980 static
04981 void
04982 os_aio_print_segment_info(
04983 /*======================*/
04984   FILE*   file, 
04985   ulint*    n_seg,  
04986   os_aio_array_t* array)  
04987 {
04988   ulint i;
04989 
04990   ut_ad(array);
04991   ut_ad(n_seg);
04992   ut_ad(array->n_segments > 0);
04993 
04994   if (array->n_segments == 1) {
04995     return;
04996   }
04997 
04998   fprintf(file, " [");
04999   for (i = 0; i < array->n_segments; i++) {
05000     if (i != 0) {
05001       fprintf(file, ", ");
05002     }
05003 
05004     fprintf(file, "%lu", n_seg[i]);
05005   }
05006   fprintf(file, "] ");
05007 }
05008 
05009 /**********************************************************************/
05011 UNIV_INTERN
05012 void
05013 os_aio_print(
05014 /*=========*/
05015   FILE* file) 
05016 {
05017   os_aio_array_t* array;
05018   os_aio_slot_t*  slot;
05019   ulint   n_reserved;
05020   ulint   n_res_seg[SRV_MAX_N_IO_THREADS];
05021   time_t    current_time;
05022   double    time_elapsed;
05023   double    avg_bytes_read;
05024   ulint   i;
05025 
05026   for (i = 0; i < srv_n_file_io_threads; i++) {
05027     fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i,
05028       srv_io_thread_op_info[i],
05029       srv_io_thread_function[i]);
05030 
05031 #ifndef __WIN__
05032     if (os_aio_segment_wait_events[i]->is_set) {
05033       fprintf(file, " ev set");
05034     }
05035 #endif
05036 
05037     fprintf(file, "\n");
05038   }
05039 
05040   fputs("Pending normal aio reads:", file);
05041 
05042   array = os_aio_read_array;
05043 loop:
05044   ut_a(array);
05045 
05046   os_mutex_enter(array->mutex);
05047 
05048   ut_a(array->n_slots > 0);
05049   ut_a(array->n_segments > 0);
05050 
05051   n_reserved = 0;
05052 
05053   memset(n_res_seg, 0x0, sizeof(n_res_seg));
05054 
05055   for (i = 0; i < array->n_slots; i++) {
05056     ulint seg_no;
05057 
05058     slot = os_aio_array_get_nth_slot(array, i);
05059 
05060     seg_no = (i * array->n_segments) / array->n_slots;
05061     if (slot->reserved) {
05062       n_reserved++;
05063       n_res_seg[seg_no]++;
05064 #if 0
05065       fprintf(stderr, "Reserved slot, messages %p %p\n",
05066         (void*) slot->message1,
05067         (void*) slot->message2);
05068 #endif
05069       ut_a(slot->len > 0);
05070     }
05071   }
05072 
05073   ut_a(array->n_reserved == n_reserved);
05074 
05075   fprintf(file, " %lu", (ulong) n_reserved);
05076 
05077   os_aio_print_segment_info(file, n_res_seg, array);
05078 
05079   os_mutex_exit(array->mutex);
05080 
05081   if (array == os_aio_read_array) {
05082     fputs(", aio writes:", file);
05083 
05084     array = os_aio_write_array;
05085 
05086     goto loop;
05087   }
05088 
05089   if (array == os_aio_write_array) {
05090     fputs(",\n ibuf aio reads:", file);
05091     array = os_aio_ibuf_array;
05092 
05093     goto loop;
05094   }
05095 
05096   if (array == os_aio_ibuf_array) {
05097     fputs(", log i/o's:", file);
05098     array = os_aio_log_array;
05099 
05100     goto loop;
05101   }
05102 
05103   if (array == os_aio_log_array) {
05104     fputs(", sync i/o's:", file);
05105     array = os_aio_sync_array;
05106 
05107     goto loop;
05108   }
05109 
05110   putc('\n', file);
05111   current_time = time(NULL);
05112   time_elapsed = 0.001 + difftime(current_time, os_last_printout);
05113 
05114   fprintf(file,
05115     "Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
05116     "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
05117     (ulong) fil_n_pending_log_flushes,
05118     (ulong) fil_n_pending_tablespace_flushes,
05119     (ulong) os_n_file_reads, (ulong) os_n_file_writes,
05120     (ulong) os_n_fsyncs);
05121 
05122   if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) {
05123     fprintf(file,
05124       "%lu pending preads, %lu pending pwrites\n",
05125       (ulong) os_file_n_pending_preads,
05126       (ulong) os_file_n_pending_pwrites);
05127   }
05128 
05129   if (os_n_file_reads == os_n_file_reads_old) {
05130     avg_bytes_read = 0.0;
05131   } else {
05132     avg_bytes_read = (double) os_bytes_read_since_printout
05133       / (os_n_file_reads - os_n_file_reads_old);
05134   }
05135 
05136   fprintf(file,
05137     "%.2f reads/s, %lu avg bytes/read,"
05138     " %.2f writes/s, %.2f fsyncs/s\n",
05139     (os_n_file_reads - os_n_file_reads_old)
05140     / time_elapsed,
05141     (ulong)avg_bytes_read,
05142     (os_n_file_writes - os_n_file_writes_old)
05143     / time_elapsed,
05144     (os_n_fsyncs - os_n_fsyncs_old)
05145     / time_elapsed);
05146 
05147   os_n_file_reads_old = os_n_file_reads;
05148   os_n_file_writes_old = os_n_file_writes;
05149   os_n_fsyncs_old = os_n_fsyncs;
05150   os_bytes_read_since_printout = 0;
05151 
05152   os_last_printout = current_time;
05153 }
05154 
05155 /**********************************************************************/
05157 UNIV_INTERN
05158 void
05159 os_aio_refresh_stats(void)
05160 /*======================*/
05161 {
05162   os_n_file_reads_old = os_n_file_reads;
05163   os_n_file_writes_old = os_n_file_writes;
05164   os_n_fsyncs_old = os_n_fsyncs;
05165   os_bytes_read_since_printout = 0;
05166 
05167   os_last_printout = time(NULL);
05168 }
05169 
05170 #ifdef UNIV_DEBUG
05171 /**********************************************************************/
05175 UNIV_INTERN
05176 ibool
05177 os_aio_all_slots_free(void)
05178 /*=======================*/
05179 {
05180   os_aio_array_t* array;
05181   ulint   n_res = 0;
05182 
05183   array = os_aio_read_array;
05184 
05185   os_mutex_enter(array->mutex);
05186 
05187   n_res += array->n_reserved;
05188 
05189   os_mutex_exit(array->mutex);
05190 
05191   array = os_aio_write_array;
05192 
05193   os_mutex_enter(array->mutex);
05194 
05195   n_res += array->n_reserved;
05196 
05197   os_mutex_exit(array->mutex);
05198 
05199   array = os_aio_ibuf_array;
05200 
05201   os_mutex_enter(array->mutex);
05202 
05203   n_res += array->n_reserved;
05204 
05205   os_mutex_exit(array->mutex);
05206 
05207   array = os_aio_log_array;
05208 
05209   os_mutex_enter(array->mutex);
05210 
05211   n_res += array->n_reserved;
05212 
05213   os_mutex_exit(array->mutex);
05214 
05215   array = os_aio_sync_array;
05216 
05217   os_mutex_enter(array->mutex);
05218 
05219   n_res += array->n_reserved;
05220 
05221   os_mutex_exit(array->mutex);
05222 
05223   if (n_res == 0) {
05224 
05225     return(TRUE);
05226   }
05227 
05228   return(FALSE);
05229 }
05230 #endif /* UNIV_DEBUG */
05231 
05232 #endif /* !UNIV_HOTBACKUP */