Drizzled Public API Documentation

buf0flu.cc
00001 /*****************************************************************************
00002 
00003 Copyright (C) 1995, 2010, Innobase Oy. All Rights Reserved.
00004 
00005 This program is free software; you can redistribute it and/or modify it under
00006 the terms of the GNU General Public License as published by the Free Software
00007 Foundation; version 2 of the License.
00008 
00009 This program is distributed in the hope that it will be useful, but WITHOUT
00010 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00011 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
00012 
00013 You should have received a copy of the GNU General Public License along with
00014 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
00015 St, Fifth Floor, Boston, MA 02110-1301 USA
00016 
00017 *****************************************************************************/
00018 
00019 /**************************************************/
00026 #include "buf0flu.h"
00027 
00028 #ifdef UNIV_NONINL
00029 #include "buf0flu.ic"
00030 #endif
00031 
00032 #include "buf0buf.h"
00033 #include "srv0srv.h"
00034 #include "page0zip.h"
00035 #ifndef UNIV_HOTBACKUP
00036 #include "ut0byte.h"
00037 #include "ut0lst.h"
00038 #include "page0page.h"
00039 #include "fil0fil.h"
00040 #include "buf0lru.h"
00041 #include "buf0rea.h"
00042 #include "ibuf0ibuf.h"
00043 #include "log0log.h"
00044 #include "os0file.h"
00045 #include "trx0sys.h"
00046 
00047 /**********************************************************************
00048 These statistics are generated for heuristics used in estimating the
00049 rate at which we should flush the dirty blocks to avoid bursty IO
00050 activity. Note that the rate of flushing depends not only on how many
00051 dirty pages we have in the buffer pool but also on how much redo the
00052 workload is generating and at what rate. */
00053 /* @{ */
00054 
00058 #define BUF_FLUSH_STAT_N_INTERVAL 20
00059 
00062 static buf_flush_stat_t buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL];
00063 
00065 static ulint    buf_flush_stat_arr_ind;
00066 
00069 static buf_flush_stat_t buf_flush_stat_cur;
00070 
00073 static buf_flush_stat_t buf_flush_stat_sum;
00074 
00076 static ulint buf_lru_flush_page_count = 0;
00077 
00078 /* @} */
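
The counters above feed a sliding-window heuristic: buf_flush_stat_cur
accumulates the current interval's activity, the array keeps the last
BUF_FLUSH_STAT_N_INTERVAL completed samples, and buf_flush_stat_sum is
their running total. A minimal sketch of how such a window yields an
average rate (the helper name is hypothetical and it assumes a redo
field in buf_flush_stat_t; the file's real update logic lives in
buf_flush_stat_update(), outside the excerpt shown here):

/* Illustrative sketch only, not part of buf0flu.cc. */
static double
buf_flush_stat_avg_redo_sketch(void)
{
  /* buf_flush_stat_sum totals the last BUF_FLUSH_STAT_N_INTERVAL
  samples; dividing by the window length gives the average amount
  of redo generated per interval, which the flush-rate heuristic
  weighs against the number of dirty pages. */
  return((double) buf_flush_stat_sum.redo
         / BUF_FLUSH_STAT_N_INTERVAL);
}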
00079 
00080 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
00081 /******************************************************************/
00084 static
00085 ibool
00086 buf_flush_validate_low(
00087 /*===================*/
00088   buf_pool_t* buf_pool);  
00089 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
00090 
00091 /******************************************************************/
00096 static
00097 buf_page_t*
00098 buf_flush_insert_in_flush_rbt(
00099 /*==========================*/
00100   buf_page_t* bpage)  
00101 {
00102   const ib_rbt_node_t*  c_node;
00103   const ib_rbt_node_t*  p_node;
00104   buf_page_t*   prev = NULL;
00105   buf_pool_t*   buf_pool = buf_pool_from_bpage(bpage);
00106 
00107   ut_ad(buf_flush_list_mutex_own(buf_pool));
00108 
00109   /* Insert this buffer into the rbt. */
00110   c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
00111   ut_a(c_node != NULL);
00112 
00113   /* Get the predecessor. */
00114   p_node = rbt_prev(buf_pool->flush_rbt, c_node);
00115 
00116   if (p_node != NULL) {
00117     buf_page_t**  value;
00118     value = rbt_value(buf_page_t*, p_node);
00119     prev = *value;
00120     ut_a(prev != NULL);
00121   }
00122 
00123   return(prev);
00124 }
00125 
00126 /*********************************************************/
00128 static
00129 void
00130 buf_flush_delete_from_flush_rbt(
00131 /*============================*/
00132   buf_page_t* bpage)  
00133 {
00134 #ifdef UNIV_DEBUG
00135   ibool   ret = FALSE;
00136 #endif /* UNIV_DEBUG */
00137   buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
00138 
00139   ut_ad(buf_flush_list_mutex_own(buf_pool));
00140 
00141 #ifdef UNIV_DEBUG
00142   ret =
00143 #endif /* UNIV_DEBUG */
00144   rbt_delete(buf_pool->flush_rbt, &bpage);
00145   ut_ad(ret);
00146 }
00147 
00148 /*****************************************************************/
00158 static
00159 int
00160 buf_flush_block_cmp(
00161 /*================*/
00162   const void* p1,   
00163   const void* p2)   
00164 {
00165   int     ret;
00166   const buf_page_t* b1 = *(const buf_page_t**) p1;
00167   const buf_page_t* b2 = *(const buf_page_t**) p2;
00168 #ifdef UNIV_DEBUG
00169   buf_pool_t*   buf_pool = buf_pool_from_bpage(b1);
00170 #endif /* UNIV_DEBUG */
00171 
00172   ut_ad(b1 != NULL);
00173   ut_ad(b2 != NULL);
00174 
00175   ut_ad(buf_flush_list_mutex_own(buf_pool));
00176 
00177   ut_ad(b1->in_flush_list);
00178   ut_ad(b2->in_flush_list);
00179 
00180   if (b2->oldest_modification > b1->oldest_modification) {
00181     return(1);
00182   } else if (b2->oldest_modification < b1->oldest_modification) {
00183     return(-1);
00184   }
00185 
00186   /* If oldest_modification is same then decide on the space. */
00187   ret = (int)(b2->space - b1->space);
00188 
00189   /* Or else decide ordering on the offset field. */
00190   return(ret ? ret : (int)(b2->offset - b1->offset));
00191 }
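
Because the comparator orders first on oldest_modification in
descending order, rbt_prev() of a freshly inserted node is exactly the
block that precedes it in flush_list order, which is what
buf_flush_insert_in_flush_rbt() returns. A small sketch of the
comparator's contract on two hypothetical pages (illustrative only;
under UNIV_DEBUG the comparator additionally asserts flush-list mutex
ownership and the in_flush_list flags, so this would only run in a
non-debug build):

/* Sketch, not part of buf0flu.cc. */
static void
buf_flush_block_cmp_example(void)
{
  buf_page_t a;  /* hypothetical page, oldest_modification 10 */
  buf_page_t b;  /* hypothetical page, oldest_modification 20 */
  const buf_page_t* pa = &a;
  const buf_page_t* pb = &b;

  a.oldest_modification = 10;
  b.oldest_modification = 20;

  /* b carries the larger oldest_modification, so it sorts
  earlier: the comparator returns a positive value, keeping the
  tree in the same newest-to-oldest order as flush_list itself. */
  ut_a(buf_flush_block_cmp(&pa, &pb) > 0);
}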
00192 
00193 /********************************************************************/
00197 UNIV_INTERN
00198 void
00199 buf_flush_init_flush_rbt(void)
00200 /*==========================*/
00201 {
00202   ulint i;
00203 
00204   for (i = 0; i < srv_buf_pool_instances; i++) {
00205     buf_pool_t* buf_pool;
00206 
00207     buf_pool = buf_pool_from_array(i);
00208 
00209     buf_flush_list_mutex_enter(buf_pool);
00210 
00211     /* Create red black tree for speedy insertions in flush list. */
00212     buf_pool->flush_rbt = rbt_create(
00213       sizeof(buf_page_t*), buf_flush_block_cmp);
00214 
00215     buf_flush_list_mutex_exit(buf_pool);
00216   }
00217 }
00218 
00219 /********************************************************************/
00221 UNIV_INTERN
00222 void
00223 buf_flush_free_flush_rbt(void)
00224 /*==========================*/
00225 {
00226   ulint i;
00227 
00228   for (i = 0; i < srv_buf_pool_instances; i++) {
00229     buf_pool_t* buf_pool;
00230 
00231     buf_pool = buf_pool_from_array(i);
00232 
00233     buf_flush_list_mutex_enter(buf_pool);
00234 
00246 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
00247     ut_a(buf_flush_validate_low(buf_pool));
00248 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
00249 
00250     rbt_free(buf_pool->flush_rbt);
00251     buf_pool->flush_rbt = NULL;
00252 
00253     buf_flush_list_mutex_exit(buf_pool);
00254   }
00255 }
00256 
00257 /********************************************************************/
00259 UNIV_INTERN
00260 void
00261 buf_flush_insert_into_flush_list(
00262 /*=============================*/
00263   buf_pool_t* buf_pool, 
00264   buf_block_t*  block,    
00265   ib_uint64_t lsn)    
00266 {
00267   ut_ad(!buf_pool_mutex_own(buf_pool));
00268   ut_ad(log_flush_order_mutex_own());
00269   ut_ad(mutex_own(&block->mutex));
00270 
00271   buf_flush_list_mutex_enter(buf_pool);
00272 
00273   ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
00274         || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
00275       <= lsn));
00276 
00277   /* If we are in recovery, we need to update the flush
00278   red-black tree as well. */
00279   if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
00280     buf_flush_list_mutex_exit(buf_pool);
00281     buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
00282     return;
00283   }
00284 
00285   ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
00286   ut_ad(!block->page.in_flush_list);
00287 
00288   ut_d(block->page.in_flush_list = TRUE);
00289   block->page.oldest_modification = lsn;
00290   UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
00291 
00292 #ifdef UNIV_DEBUG_VALGRIND
00293   {
00294     ulint zip_size = buf_block_get_zip_size(block);
00295 
00296     if (UNIV_UNLIKELY(zip_size)) {
00297       UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
00298     } else {
00299       UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
00300     }
00301   }
00302 #endif /* UNIV_DEBUG_VALGRIND */
00303 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
00304   ut_a(buf_flush_validate_low(buf_pool));
00305 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
00306 
00307   buf_flush_list_mutex_exit(buf_pool);
00308 }
00309 
00310 /********************************************************************/
00314 UNIV_INTERN
00315 void
00316 buf_flush_insert_sorted_into_flush_list(
00317 /*====================================*/
00318   buf_pool_t* buf_pool, 
00319   buf_block_t*  block,    
00320   ib_uint64_t lsn)    
00321 {
00322   buf_page_t* prev_b;
00323   buf_page_t* b;
00324 
00325   ut_ad(!buf_pool_mutex_own(buf_pool));
00326   ut_ad(log_flush_order_mutex_own());
00327   ut_ad(mutex_own(&block->mutex));
00328   ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
00329 
00330   buf_flush_list_mutex_enter(buf_pool);
00331 
00332   /* The field in_LRU_list is protected by buf_pool->mutex, which
00333   we are not holding.  However, while a block is in the flush
00334   list, it is dirty and cannot be discarded from either the
00335   page_hash or the LRU list.  At most, the uncompressed
00336   page frame of a compressed block may be discarded or created
00337   (copying the block->page to or from a buf_page_t that is
00338   dynamically allocated from buf_buddy_alloc()).  Because those
00339   transitions hold block->mutex and the flush list mutex (via
00340   buf_flush_relocate_on_flush_list()), there is no possibility
00341   of a race condition in the assertions below. */
00342   ut_ad(block->page.in_LRU_list);
00343   ut_ad(block->page.in_page_hash);
00344   /* buf_buddy_block_register() will take a block in the
00345   BUF_BLOCK_MEMORY state, not a file page. */
00346   ut_ad(!block->page.in_zip_hash);
00347 
00348   ut_ad(!block->page.in_flush_list);
00349   ut_d(block->page.in_flush_list = TRUE);
00350   block->page.oldest_modification = lsn;
00351 
00352 #ifdef UNIV_DEBUG_VALGRIND
00353   {
00354     ulint zip_size = buf_block_get_zip_size(block);
00355 
00356     if (UNIV_UNLIKELY(zip_size)) {
00357       UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
00358     } else {
00359       UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
00360     }
00361   }
00362 #endif /* UNIV_DEBUG_VALGRIND */
00363 
00376   prev_b = NULL;
00377 
00378   /* For the most part when this function is called the flush_rbt
00379   should not be NULL. In a very rare boundary case it is possible
00380   that the flush_rbt has already been freed by the recovery thread
00381   before the last page was hooked up in the flush_list by the
00382   io-handler thread. In that case we'll just do a simple
00383   linear search in the else block. */
00384   if (buf_pool->flush_rbt) {
00385 
00386     prev_b = buf_flush_insert_in_flush_rbt(&block->page);
00387 
00388   } else {
00389 
00390     b = UT_LIST_GET_FIRST(buf_pool->flush_list);
00391 
00392     while (b && b->oldest_modification
00393            > block->page.oldest_modification) {
00394       ut_ad(b->in_flush_list);
00395       prev_b = b;
00396       b = UT_LIST_GET_NEXT(list, b);
00397     }
00398   }
00399 
00400   if (prev_b == NULL) {
00401     UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
00402   } else {
00403     UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
00404              prev_b, &block->page);
00405   }
00406 
00407 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
00408   ut_a(buf_flush_validate_low(buf_pool));
00409 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
00410 
00411   buf_flush_list_mutex_exit(buf_pool);
00412 }
00413 
00414 /********************************************************************/
00418 UNIV_INTERN
00419 ibool
00420 buf_flush_ready_for_replace(
00421 /*========================*/
00422   buf_page_t* bpage)  
00424 {
00425 #ifdef UNIV_DEBUG
00426   buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
00427   ut_ad(buf_pool_mutex_own(buf_pool));
00428 #endif
00429   ut_ad(mutex_own(buf_page_get_mutex(bpage)));
00430   ut_ad(bpage->in_LRU_list);
00431 
00432   if (UNIV_LIKELY(buf_page_in_file(bpage))) {
00433 
00434     return(bpage->oldest_modification == 0
00435            && buf_page_get_io_fix(bpage) == BUF_IO_NONE
00436            && bpage->buf_fix_count == 0);
00437   }
00438 
00439   ut_print_timestamp(stderr);
00440   fprintf(stderr,
00441     "  InnoDB: Error: buffer block state %lu"
00442     " in the LRU list!\n",
00443     (ulong) buf_page_get_state(bpage));
00444   ut_print_buf(stderr, bpage, sizeof(buf_page_t));
00445   putc('\n', stderr);
00446 
00447   return(FALSE);
00448 }
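
buf_flush_ready_for_replace() is the predicate applied to blocks at
the tail of the LRU list when the buffer pool needs a free frame: a
block is replaceable only when it is clean (oldest_modification == 0),
not io-fixed, and not buffer-fixed. A hedged sketch of such a tail
scan (the real caller is the LRU eviction code in buf0lru.cc; the
helper name here is hypothetical):

/* Sketch, not part of buf0flu.cc. Assumes buf_pool->mutex is held,
as buf_flush_ready_for_replace() requires. */
static buf_page_t*
buf_LRU_find_replaceable_sketch(buf_pool_t* buf_pool)
{
  buf_page_t* bpage;

  for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
       bpage != NULL;
       bpage = UT_LIST_GET_PREV(LRU, bpage)) {

    mutex_t* block_mutex = buf_page_get_mutex(bpage);

    mutex_enter(block_mutex);

    if (buf_flush_ready_for_replace(bpage)) {
      /* The caller would now free the block; the sketch
      just hands it back with its mutex still held. */
      return(bpage);
    }

    mutex_exit(block_mutex);
  }

  return(NULL);
}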
00449 
00450 /********************************************************************/
00453 UNIV_INLINE
00454 ibool
00455 buf_flush_ready_for_flush(
00456 /*======================*/
00457   buf_page_t* bpage,  
00459   enum buf_flush  flush_type)
00460 {
00461 #ifdef UNIV_DEBUG
00462   buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
00463   ut_ad(buf_pool_mutex_own(buf_pool));
00464 #endif
00465   ut_a(buf_page_in_file(bpage));
00466   ut_ad(mutex_own(buf_page_get_mutex(bpage)));
00467   ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
00468 
00469   if (bpage->oldest_modification != 0
00470       && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
00471     ut_ad(bpage->in_flush_list);
00472 
00473     if (flush_type != BUF_FLUSH_LRU) {
00474 
00475       return(TRUE);
00476 
00477     } else if (bpage->buf_fix_count == 0) {
00478 
00479       /* If we are flushing the LRU list, to avoid deadlocks
00480       we require the block not to be bufferfixed, and hence
00481       not latched. */
00482 
00483       return(TRUE);
00484     }
00485   }
00486 
00487   return(FALSE);
00488 }
00489 
00490 /********************************************************************/
00492 UNIV_INTERN
00493 void
00494 buf_flush_remove(
00495 /*=============*/
00496   buf_page_t* bpage)  
00497 {
00498   buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
00499 
00500   ut_ad(buf_pool_mutex_own(buf_pool));
00501   ut_ad(mutex_own(buf_page_get_mutex(bpage)));
00502   ut_ad(bpage->in_flush_list);
00503 
00504   buf_flush_list_mutex_enter(buf_pool);
00505 
00506   switch (buf_page_get_state(bpage)) {
00507   case BUF_BLOCK_ZIP_PAGE:
00508     /* Clean compressed pages should not be on the flush list */
00509   case BUF_BLOCK_ZIP_FREE:
00510   case BUF_BLOCK_NOT_USED:
00511   case BUF_BLOCK_READY_FOR_USE:
00512   case BUF_BLOCK_MEMORY:
00513   case BUF_BLOCK_REMOVE_HASH:
00514     ut_error;
00515     return;
00516   case BUF_BLOCK_ZIP_DIRTY:
00517     buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
00518     UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
00519     buf_LRU_insert_zip_clean(bpage);
00520     break;
00521   case BUF_BLOCK_FILE_PAGE:
00522     UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
00523     break;
00524   }
00525 
00526   /* If the flush_rbt is active then delete from there as well. */
00527   if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
00528     buf_flush_delete_from_flush_rbt(bpage);
00529   }
00530 
00531   /* Must be done after we have removed it from the flush_rbt
00532   because we assert on in_flush_list in comparison function. */
00533   ut_d(bpage->in_flush_list = FALSE);
00534 
00535   bpage->oldest_modification = 0;
00536 
00537 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
00538   ut_a(buf_flush_validate_low(buf_pool));
00539 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
00540 
00541   buf_flush_list_mutex_exit(buf_pool);
00542 }
00543 
00544 /*******************************************************************/
00555 UNIV_INTERN
00556 void
00557 buf_flush_relocate_on_flush_list(
00558 /*=============================*/
00559   buf_page_t* bpage,  
00560   buf_page_t* dpage)  
00561 {
00562   buf_page_t* prev;
00563   buf_page_t*   prev_b = NULL;
00564   buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
00565 
00566   ut_ad(buf_pool_mutex_own(buf_pool));
00567   /* Must reside in the same buffer pool. */
00568   ut_ad(buf_pool == buf_pool_from_bpage(dpage));
00569 
00570   ut_ad(mutex_own(buf_page_get_mutex(bpage)));
00571 
00572   buf_flush_list_mutex_enter(buf_pool);
00573 
00574   /* FIXME: At this point we have both buf_pool and flush_list
00575   mutexes. Theoretically removal of a block from flush list is
00576   only covered by flush_list mutex but currently we do
00577   have buf_pool mutex in buf_flush_remove() therefore this block
00578   is guaranteed to be in the flush list. We need to check if
00579   this will work without the assumption of block removing code
00580   having the buf_pool mutex. */
00581   ut_ad(bpage->in_flush_list);
00582   ut_ad(dpage->in_flush_list);
00583 
00584   /* If recovery is active we must swap the control blocks in
00585   the flush_rbt as well. */
00586   if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
00587     buf_flush_delete_from_flush_rbt(bpage);
00588     prev_b = buf_flush_insert_in_flush_rbt(dpage);
00589   }
00590 
00591   /* Must be done after we have removed it from the flush_rbt
00592   because we assert on in_flush_list in comparison function. */
00593   ut_d(bpage->in_flush_list = FALSE);
00594 
00595   prev = UT_LIST_GET_PREV(list, bpage);
00596   UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
00597 
00598   if (prev) {
00599     ut_ad(prev->in_flush_list);
00600     UT_LIST_INSERT_AFTER(
00601       list,
00602       buf_pool->flush_list,
00603       prev, dpage);
00604   } else {
00605     UT_LIST_ADD_FIRST(
00606       list,
00607       buf_pool->flush_list,
00608       dpage);
00609   }
00610 
00611   /* Just an extra check. Previous in flush_list
00612   should be the same control block as in flush_rbt. */
00613   ut_a(!buf_pool->flush_rbt || prev_b == prev);
00614 
00615 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
00616   ut_a(buf_flush_validate_low(buf_pool));
00617 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
00618 
00619   buf_flush_list_mutex_exit(buf_pool);
00620 }
00621 
00622 /********************************************************************/
00624 UNIV_INTERN
00625 void
00626 buf_flush_write_complete(
00627 /*=====================*/
00628   buf_page_t* bpage)  
00629 {
00630   enum buf_flush  flush_type;
00631   buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
00632 
00633   ut_ad(bpage);
00634 
00635   buf_flush_remove(bpage);
00636 
00637   flush_type = buf_page_get_flush_type(bpage);
00638   buf_pool->n_flush[flush_type]--;
00639 
00640   if (flush_type == BUF_FLUSH_LRU) {
00641     /* Put the block to the end of the LRU list to wait to be
00642     moved to the free list */
00643 
00644     buf_LRU_make_block_old(bpage);
00645 
00646     buf_pool->LRU_flush_ended++;
00647   }
00648 
00649   /* fprintf(stderr, "n pending flush %lu\n",
00650   buf_pool->n_flush[flush_type]); */
00651 
00652   if (buf_pool->n_flush[flush_type] == 0
00653       && buf_pool->init_flush[flush_type] == FALSE) {
00654 
00655     /* The running flush batch has ended */
00656 
00657     os_event_set(buf_pool->no_flush[flush_type]);
00658   }
00659 }
00660 
00661 /********************************************************************/
00664 static
00665 void
00666 buf_flush_sync_datafiles(void)
00667 /*==========================*/
00668 {
00669   /* Wake the possible simulated aio thread to actually post the
00670   writes to the operating system */
00671   os_aio_simulated_wake_handler_threads();
00672 
00673   /* Wait until all async writes to tablespaces have been posted to
00674   the OS */
00675   os_aio_wait_until_no_pending_writes();
00676 
00677   /* Now we flush the data to disk (for example, with fsync) */
00678   fil_flush_file_spaces(FIL_TABLESPACE);
00679 
00680   return;
00681 }
00682 
00683 /********************************************************************/
00689 static
00690 void
00691 buf_flush_buffered_writes(void)
00692 /*===========================*/
00693 {
00694   byte*   write_buf;
00695   ulint   len;
00696   ulint   len2;
00697   ulint   i;
00698 
00699   if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
00700     /* Sync the writes to the disk. */
00701     buf_flush_sync_datafiles();
00702     return;
00703   }
00704 
00705   mutex_enter(&(trx_doublewrite->mutex));
00706 
00707   /* Write first to doublewrite buffer blocks. We use synchronous
00708   aio and thus know that file write has been completed when the
00709   control returns. */
00710 
00711   if (trx_doublewrite->first_free == 0) {
00712 
00713     mutex_exit(&(trx_doublewrite->mutex));
00714 
00715     return;
00716   }
00717 
00718   for (i = 0; i < trx_doublewrite->first_free; i++) {
00719 
00720     const buf_block_t*  block;
00721 
00722     block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];
00723 
00724     if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
00725         || block->page.zip.data) {
00726       /* No simple validate for compressed pages exists. */
00727       continue;
00728     }
00729 
00730     if (UNIV_UNLIKELY
00731         (memcmp(block->frame + (FIL_PAGE_LSN + 4),
00732           block->frame + (UNIV_PAGE_SIZE
00733               - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
00734           4))) {
00735       ut_print_timestamp(stderr);
00736       fprintf(stderr,
00737         "  InnoDB: ERROR: The page to be written"
00738         " seems corrupt!\n"
00739         "InnoDB: The lsn fields do not match!"
00740         " Noticed in the buffer pool\n"
00741         "InnoDB: before posting to the"
00742         " doublewrite buffer.\n");
00743     }
00744 
00745     if (!block->check_index_page_at_flush) {
00746     } else if (page_is_comp(block->frame)) {
00747       if (UNIV_UNLIKELY
00748           (!page_simple_validate_new(block->frame))) {
00749 corrupted_page:
00750         buf_page_print(block->frame, 0);
00751 
00752         ut_print_timestamp(stderr);
00753         fprintf(stderr,
00754           "  InnoDB: Apparent corruption of an"
00755           " index page n:o %lu in space %lu\n"
00756           "InnoDB: to be written to data file."
00757           " We intentionally crash server\n"
00758           "InnoDB: to prevent corrupt data"
00759           " from ending up in data\n"
00760           "InnoDB: files.\n",
00761           (ulong) buf_block_get_page_no(block),
00762           (ulong) buf_block_get_space(block));
00763 
00764         ut_error;
00765       }
00766     } else if (UNIV_UNLIKELY
00767          (!page_simple_validate_old(block->frame))) {
00768 
00769       goto corrupted_page;
00770     }
00771   }
00772 
00773   /* increment the doublewrite flushed pages counter */
00774   srv_dblwr_pages_written+= trx_doublewrite->first_free;
00775   srv_dblwr_writes++;
00776 
00777   len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
00778          trx_doublewrite->first_free) * UNIV_PAGE_SIZE;
00779 
00780   write_buf = trx_doublewrite->write_buf;
00781   i = 0;
00782 
00783   fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
00784          trx_doublewrite->block1, 0, len,
00785          (void*) write_buf, NULL);
00786 
00787   for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
00788        len2 += UNIV_PAGE_SIZE, i++) {
00789     const buf_block_t* block = (buf_block_t*)
00790       trx_doublewrite->buf_block_arr[i];
00791 
00792     if (UNIV_LIKELY(!block->page.zip.data)
00793         && UNIV_LIKELY(buf_block_get_state(block)
00794            == BUF_BLOCK_FILE_PAGE)
00795         && UNIV_UNLIKELY
00796         (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
00797           write_buf + len2
00798           + (UNIV_PAGE_SIZE
00799              - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
00800       ut_print_timestamp(stderr);
00801       fprintf(stderr,
00802         "  InnoDB: ERROR: The page to be written"
00803         " seems corrupt!\n"
00804         "InnoDB: The lsn fields do not match!"
00805         " Noticed in the doublewrite block1.\n");
00806     }
00807   }
00808 
00809   if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
00810     goto flush;
00811   }
00812 
00813   len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
00814     * UNIV_PAGE_SIZE;
00815 
00816   write_buf = trx_doublewrite->write_buf
00817     + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
00818   ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);
00819 
00820   fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
00821          trx_doublewrite->block2, 0, len,
00822          (void*) write_buf, NULL);
00823 
00824   for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
00825        len2 += UNIV_PAGE_SIZE, i++) {
00826     const buf_block_t* block = (buf_block_t*)
00827       trx_doublewrite->buf_block_arr[i];
00828 
00829     if (UNIV_LIKELY(!block->page.zip.data)
00830         && UNIV_LIKELY(buf_block_get_state(block)
00831            == BUF_BLOCK_FILE_PAGE)
00832         && UNIV_UNLIKELY
00833         (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
00834           write_buf + len2
00835           + (UNIV_PAGE_SIZE
00836              - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
00837       ut_print_timestamp(stderr);
00838       fprintf(stderr,
00839         "  InnoDB: ERROR: The page to be"
00840         " written seems corrupt!\n"
00841         "InnoDB: The lsn fields do not match!"
00842         " Noticed in"
00843         " the doublewrite block2.\n");
00844     }
00845   }
00846 
00847 flush:
00848   /* Now flush the doublewrite buffer data to disk */
00849 
00850   fil_flush(TRX_SYS_SPACE);
00851 
00852   /* We know that the writes have been flushed to disk now
00853   and in recovery we will find them in the doublewrite buffer
00854   blocks. Next do the writes to the intended positions. */
00855 
00856   for (i = 0; i < trx_doublewrite->first_free; i++) {
00857     const buf_block_t* block = (buf_block_t*)
00858       trx_doublewrite->buf_block_arr[i];
00859 
00860     ut_a(buf_page_in_file(&block->page));
00861     if (UNIV_LIKELY_NULL(block->page.zip.data)) {
00862       fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
00863              FALSE, buf_page_get_space(&block->page),
00864              buf_page_get_zip_size(&block->page),
00865              buf_page_get_page_no(&block->page), 0,
00866              buf_page_get_zip_size(&block->page),
00867              (void*)block->page.zip.data,
00868              (void*)block);
00869 
00870       /* Increment the counter of I/O operations used
00871       for selecting LRU policy. */
00872       buf_LRU_stat_inc_io();
00873 
00874       continue;
00875     }
00876 
00877     ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
00878 
00879     if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
00880            block->frame
00881            + (UNIV_PAGE_SIZE
00882               - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
00883            4))) {
00884       ut_print_timestamp(stderr);
00885       fprintf(stderr,
00886         "  InnoDB: ERROR: The page to be written"
00887         " seems corrupt!\n"
00888         "InnoDB: The lsn fields do not match!"
00889         " Noticed in the buffer pool\n"
00890         "InnoDB: after posting and flushing"
00891         " the doublewrite buffer.\n"
00892         "InnoDB: Page buf fix count %lu,"
00893         " io fix %lu, state %lu\n",
00894         (ulong)block->page.buf_fix_count,
00895         (ulong)buf_block_get_io_fix(block),
00896         (ulong)buf_block_get_state(block));
00897     }
00898 
00899     fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
00900            FALSE, buf_block_get_space(block), 0,
00901            buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
00902            (void*)block->frame, (void*)block);
00903 
00904     /* Increment the counter of I/O operations used
00905     for selecting LRU policy. */
00906     buf_LRU_stat_inc_io();
00907   }
00908 
00909   /* Sync the writes to the disk. */
00910   buf_flush_sync_datafiles();
00911 
00912   /* We can now reuse the doublewrite memory buffer: */
00913   trx_doublewrite->first_free = 0;
00914 
00915   mutex_exit(&(trx_doublewrite->mutex));
00916 }
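
The function above implements the doublewrite protocol in four strict
steps; a condensed sketch of the ordering it enforces (the validity
checks, compressed-page handling, and error paths are omitted):

/* Sketch, not part of buf0flu.cc. */
static void
buf_flush_doublewrite_order_sketch(void)
{
  /* 1. Write the batched pages sequentially and synchronously
  into the doublewrite area of the system tablespace:
  fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, ...). */

  /* 2. Make that copy durable before touching any real page
  location. */
  fil_flush(TRX_SYS_SPACE);

  /* 3. Issue the scattered asynchronous writes to the pages'
  own positions; a torn write from here on can be repaired in
  recovery from the copy made in step 1. */

  /* 4. fsync the data files, after which the doublewrite slots
  (trx_doublewrite->first_free) may be reused. */
  buf_flush_sync_datafiles();
}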
00917 
00918 /********************************************************************/
00922 static
00923 void
00924 buf_flush_post_to_doublewrite_buf(
00925 /*==============================*/
00926   buf_page_t* bpage)  
00927 {
00928   ulint zip_size;
00929 try_again:
00930   mutex_enter(&(trx_doublewrite->mutex));
00931 
00932   ut_a(buf_page_in_file(bpage));
00933 
00934   if (trx_doublewrite->first_free
00935       >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
00936     mutex_exit(&(trx_doublewrite->mutex));
00937 
00938     buf_flush_buffered_writes();
00939 
00940     goto try_again;
00941   }
00942 
00943   zip_size = buf_page_get_zip_size(bpage);
00944 
00945   if (UNIV_UNLIKELY(zip_size)) {
00946     UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
00947     /* Copy the compressed page and clear the rest. */
00948     memcpy(trx_doublewrite->write_buf
00949            + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
00950            bpage->zip.data, zip_size);
00951     memset(trx_doublewrite->write_buf
00952            + UNIV_PAGE_SIZE * trx_doublewrite->first_free
00953            + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
00954   } else {
00955     ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
00956     UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame,
00957            UNIV_PAGE_SIZE);
00958 
00959     memcpy(trx_doublewrite->write_buf
00960            + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
00961            ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
00962   }
00963 
00964   trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;
00965 
00966   trx_doublewrite->first_free++;
00967 
00968   if (trx_doublewrite->first_free
00969       >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
00970     mutex_exit(&(trx_doublewrite->mutex));
00971 
00972     buf_flush_buffered_writes();
00973 
00974     return;
00975   }
00976 
00977   mutex_exit(&(trx_doublewrite->mutex));
00978 }
00979 #endif /* !UNIV_HOTBACKUP */
00980 
00981 /********************************************************************/
00983 UNIV_INTERN
00984 void
00985 buf_flush_init_for_writing(
00986 /*=======================*/
00987   byte*   page,   
00988   void*   page_zip_,  
00989   ib_uint64_t newest_lsn) 
00991 {
00992   ut_ad(page);
00993 
00994   if (page_zip_) {
00995     page_zip_des_t* page_zip = static_cast<page_zip_des_t *>(page_zip_);
00996     ulint   zip_size = page_zip_get_size(page_zip);
00997     ut_ad(zip_size);
00998     ut_ad(ut_is_2pow(zip_size));
00999     ut_ad(zip_size <= UNIV_PAGE_SIZE);
01000 
01001     switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
01002     case FIL_PAGE_TYPE_ALLOCATED:
01003     case FIL_PAGE_INODE:
01004     case FIL_PAGE_IBUF_BITMAP:
01005     case FIL_PAGE_TYPE_FSP_HDR:
01006     case FIL_PAGE_TYPE_XDES:
01007       /* These are essentially uncompressed pages. */
01008       memcpy(page_zip->data, page, zip_size);
01009       /* fall through */
01010     case FIL_PAGE_TYPE_ZBLOB:
01011     case FIL_PAGE_TYPE_ZBLOB2:
01012     case FIL_PAGE_INDEX:
01013       mach_write_to_8(page_zip->data
01014           + FIL_PAGE_LSN, newest_lsn);
01015       memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
01016       mach_write_to_4(page_zip->data
01017           + FIL_PAGE_SPACE_OR_CHKSUM,
01018           srv_use_checksums
01019           ? page_zip_calc_checksum(
01020             page_zip->data, zip_size)
01021           : BUF_NO_CHECKSUM_MAGIC);
01022       return;
01023     }
01024 
01025     ut_print_timestamp(stderr);
01026     fputs("  InnoDB: ERROR: The compressed page to be written"
01027           " seems corrupt:", stderr);
01028     ut_print_buf(stderr, page, zip_size);
01029     fputs("\nInnoDB: Possibly older version of the page:", stderr);
01030     ut_print_buf(stderr, page_zip->data, zip_size);
01031     putc('\n', stderr);
01032     ut_error;
01033   }
01034 
01035   /* Write the newest modification lsn to the page header and trailer */
01036   mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
01037 
01038   mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
01039       newest_lsn);
01040 
01041   /* Store the new formula checksum */
01042 
01043   mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
01044       srv_use_checksums
01045       ? buf_calc_page_new_checksum(page)
01046       : BUF_NO_CHECKSUM_MAGIC);
01047 
01048   /* We overwrite the first 4 bytes of the end lsn field to store
01049   the old formula checksum. Since it depends also on the field
01050   FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
01051   new formula checksum. */
01052 
01053   mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
01054       srv_use_checksums
01055       ? buf_calc_page_old_checksum(page)
01056       : BUF_NO_CHECKSUM_MAGIC);
01057 }
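
For an uncompressed page the function stamps four fields whose layout
the corruption checks elsewhere in this file rely on; a sketch of the
resulting page image (offsets per the usual fil0fil.h constants) and
of the header/trailer LSN comparison used by
buf_flush_buffered_writes():

/* Sketch, not part of buf0flu.cc.

  byte 0..3    FIL_PAGE_SPACE_OR_CHKSUM   new-formula checksum
  byte 16..23  FIL_PAGE_LSN               newest_lsn
  ...
  last 8 bytes (UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM):
    first 4    old-formula checksum
    last 4     low 32 bits of newest_lsn

The low 32 bits of the LSN therefore appear both at
FIL_PAGE_LSN + 4 and in the last 4 bytes of the page: */
static ibool
buf_page_lsn_fields_match_sketch(const byte* page)
{
  return(!memcmp(page + FIL_PAGE_LSN + 4,
                 page + UNIV_PAGE_SIZE
                 - FIL_PAGE_END_LSN_OLD_CHKSUM + 4,
                 4));
}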
01058 
01059 #ifndef UNIV_HOTBACKUP
01060 /********************************************************************/
01064 static
01065 void
01066 buf_flush_write_block_low(
01067 /*======================*/
01068   buf_page_t* bpage)  
01069 {
01070   ulint zip_size  = buf_page_get_zip_size(bpage);
01071   page_t* frame   = NULL;
01072 
01073 #ifdef UNIV_DEBUG
01074   buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
01075   ut_ad(!buf_pool_mutex_own(buf_pool));
01076 #endif
01077 
01078 #ifdef UNIV_LOG_DEBUG
01079   static ibool univ_log_debug_warned;
01080 #endif /* UNIV_LOG_DEBUG */
01081 
01082   ut_ad(buf_page_in_file(bpage));
01083 
01084   /* We are not holding buf_pool->mutex or block_mutex here.
01085   Nevertheless, it is safe to access bpage, because it is
01086   io_fixed and oldest_modification != 0.  Thus, it cannot be
01087   relocated in the buffer pool or removed from flush_list or
01088   LRU_list. */
01089   ut_ad(!buf_pool_mutex_own(buf_pool));
01090   ut_ad(!buf_flush_list_mutex_own(buf_pool));
01091   ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
01092   ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
01093   ut_ad(bpage->oldest_modification != 0);
01094 
01095 #ifdef UNIV_IBUF_COUNT_DEBUG
01096   ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
01097 #endif
01098   ut_ad(bpage->newest_modification != 0);
01099 
01100 #ifdef UNIV_LOG_DEBUG
01101   if (!univ_log_debug_warned) {
01102     univ_log_debug_warned = TRUE;
01103     fputs("Warning: cannot force log to disk if"
01104           " UNIV_LOG_DEBUG is defined!\n"
01105           "Crash recovery will not work!\n",
01106           stderr);
01107   }
01108 #else
01109   /* Force the log to the disk before writing the modified block */
01110   log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
01111 #endif
01112   switch (buf_page_get_state(bpage)) {
01113   case BUF_BLOCK_ZIP_FREE:
01114   case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
01115   case BUF_BLOCK_NOT_USED:
01116   case BUF_BLOCK_READY_FOR_USE:
01117   case BUF_BLOCK_MEMORY:
01118   case BUF_BLOCK_REMOVE_HASH:
01119     ut_error;
01120     break;
01121   case BUF_BLOCK_ZIP_DIRTY:
01122     frame = bpage->zip.data;
01123     if (UNIV_LIKELY(srv_use_checksums)) {
01124       ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
01125            == page_zip_calc_checksum(frame, zip_size));
01126     }
01127     mach_write_to_8(frame + FIL_PAGE_LSN,
01128         bpage->newest_modification);
01129     memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
01130     break;
01131   case BUF_BLOCK_FILE_PAGE:
01132     frame = bpage->zip.data;
01133     if (!frame) {
01134       frame = ((buf_block_t*) bpage)->frame;
01135     }
01136 
01137     buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
01138              bpage->zip.data
01139              ? &bpage->zip : NULL,
01140              bpage->newest_modification);
01141     break;
01142   }
01143 
01144   if (!srv_use_doublewrite_buf || !trx_doublewrite) {
01145     fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
01146            FALSE, buf_page_get_space(bpage), zip_size,
01147            buf_page_get_page_no(bpage), 0,
01148            zip_size ? zip_size : UNIV_PAGE_SIZE,
01149            frame, bpage);
01150   } else {
01151     buf_flush_post_to_doublewrite_buf(bpage);
01152   }
01153 }
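
Note the write-ahead-log rule enforced above: log_write_up_to() on
bpage->newest_modification runs before the page itself is written, so
a data page can never reach disk ahead of the redo that describes its
latest change. A minimal restatement of the required ordering (sketch
only, helper name hypothetical):

/* Sketch, not part of buf0flu.cc. */
static void
buf_flush_wal_order_sketch(buf_page_t* bpage)
{
  /* First make the covering redo durable... */
  log_write_up_to(bpage->newest_modification,
                  LOG_WAIT_ALL_GROUPS, TRUE);

  /* ...and only then may the page write be issued, which
  buf_flush_write_block_low() does via fil_io() or the
  doublewrite buffer. */
}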
01154 
01155 # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
01156 /********************************************************************/
01162 UNIV_INTERN
01163 ibool
01164 buf_flush_page_try(
01165 /*===============*/
01166   buf_pool_t* buf_pool, 
01167   buf_block_t*  block)    
01168 {
01169   ut_ad(buf_pool_mutex_own(buf_pool));
01170   ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
01171   ut_ad(mutex_own(&block->mutex));
01172 
01173   if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_LRU)) {
01174     return(FALSE);
01175   }
01176 
01177   if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
01178       || buf_pool->init_flush[BUF_FLUSH_LRU]) {
01179     /* There is already a flush batch of the same type running */
01180     return(FALSE);
01181   }
01182 
01183   buf_pool->init_flush[BUF_FLUSH_LRU] = TRUE;
01184 
01185   buf_page_set_io_fix(&block->page, BUF_IO_WRITE);
01186 
01187   buf_page_set_flush_type(&block->page, BUF_FLUSH_LRU);
01188 
01189   if (buf_pool->n_flush[BUF_FLUSH_LRU]++ == 0) {
01190 
01191     os_event_reset(buf_pool->no_flush[BUF_FLUSH_LRU]);
01192   }
01193 
01194   /* VERY IMPORTANT:
01195   Because any thread may call the LRU flush, even when owning
01196   locks on pages, to avoid deadlocks, we must make sure that the
01197   s-lock is acquired on the page without waiting: this is
01198   accomplished because buf_flush_ready_for_flush() must hold,
01199   and that requires the page not to be bufferfixed. */
01200 
01201   rw_lock_s_lock_gen(&block->lock, BUF_IO_WRITE);
01202 
01203   /* Note that the s-latch is acquired before releasing the
01204   buf_pool mutex: this ensures that the latch is acquired
01205   immediately. */
01206 
01207   mutex_exit(&block->mutex);
01208   buf_pool_mutex_exit(buf_pool);
01209 
01210   /* Even though block is not protected by any mutex at this
01211   point, it is safe to access block, because it is io_fixed and
01212   oldest_modification != 0.  Thus, it cannot be relocated in the
01213   buffer pool or removed from flush_list or LRU_list. */
01214 
01215   buf_flush_write_block_low(&block->page);
01216 
01217   buf_pool_mutex_enter(buf_pool);
01218   buf_pool->init_flush[BUF_FLUSH_LRU] = FALSE;
01219 
01220   if (buf_pool->n_flush[BUF_FLUSH_LRU] == 0) {
01221     /* The running flush batch has ended */
01222     os_event_set(buf_pool->no_flush[BUF_FLUSH_LRU]);
01223   }
01224 
01225   buf_pool_mutex_exit(buf_pool);
01226   buf_flush_buffered_writes();
01227 
01228   return(TRUE);
01229 }
01230 # endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
01231 
01232 /********************************************************************/
01239 static
01240 void
01241 buf_flush_page(
01242 /*===========*/
01243   buf_pool_t* buf_pool, 
01244   buf_page_t* bpage,    
01245   enum buf_flush  flush_type) 
01247 {
01248   mutex_t*  block_mutex;
01249   ibool   is_uncompressed;
01250 
01251   ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
01252   ut_ad(buf_pool_mutex_own(buf_pool));
01253   ut_ad(buf_page_in_file(bpage));
01254 
01255   block_mutex = buf_page_get_mutex(bpage);
01256   ut_ad(mutex_own(block_mutex));
01257 
01258   ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
01259 
01260   buf_page_set_io_fix(bpage, BUF_IO_WRITE);
01261 
01262   buf_page_set_flush_type(bpage, flush_type);
01263 
01264   if (buf_pool->n_flush[flush_type] == 0) {
01265 
01266     os_event_reset(buf_pool->no_flush[flush_type]);
01267   }
01268 
01269   buf_pool->n_flush[flush_type]++;
01270 
01271   is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
01272   ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
01273 
01274   switch (flush_type) {
01275     ibool is_s_latched;
01276   case BUF_FLUSH_LIST:
01277     /* If the simulated aio thread is not running, we must
01278     not wait for any latch, as we may end up in a deadlock:
01279     if buf_fix_count == 0, then we know we need not wait */
01280 
01281     is_s_latched = (bpage->buf_fix_count == 0);
01282     if (is_s_latched && is_uncompressed) {
01283       rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
01284              BUF_IO_WRITE);
01285     }
01286 
01287     mutex_exit(block_mutex);
01288     buf_pool_mutex_exit(buf_pool);
01289 
01290     /* Even though bpage is not protected by any mutex at
01291     this point, it is safe to access bpage, because it is
01292     io_fixed and oldest_modification != 0.  Thus, it
01293     cannot be relocated in the buffer pool or removed from
01294     flush_list or LRU_list. */
01295 
01296     if (!is_s_latched) {
01297       buf_flush_buffered_writes();
01298 
01299       if (is_uncompressed) {
01300         rw_lock_s_lock_gen(&((buf_block_t*) bpage)
01301                ->lock, BUF_IO_WRITE);
01302       }
01303     }
01304 
01305     break;
01306 
01307   case BUF_FLUSH_LRU:
01308     /* VERY IMPORTANT:
01309     Because any thread may call the LRU flush, even when owning
01310     locks on pages, to avoid deadlocks, we must make sure that the
01311     s-lock is acquired on the page without waiting: this is
01312     accomplished because buf_flush_ready_for_flush() must hold,
01313     and that requires the page not to be bufferfixed. */
01314 
01315     if (is_uncompressed) {
01316       rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
01317              BUF_IO_WRITE);
01318     }
01319 
01320     /* Note that the s-latch is acquired before releasing the
01321     buf_pool mutex: this ensures that the latch is acquired
01322     immediately. */
01323 
01324     mutex_exit(block_mutex);
01325     buf_pool_mutex_exit(buf_pool);
01326     break;
01327 
01328   default:
01329     ut_error;
01330   }
01331 
01332   /* Even though bpage is not protected by any mutex at this
01333   point, it is safe to access bpage, because it is io_fixed and
01334   oldest_modification != 0.  Thus, it cannot be relocated in the
01335   buffer pool or removed from flush_list or LRU_list. */
01336 
01337 #ifdef UNIV_DEBUG
01338   if (buf_debug_prints) {
01339     fprintf(stderr,
01340       "Flushing %u space %u page %u\n",
01341       flush_type, bpage->space, bpage->offset);
01342   }
01343 #endif /* UNIV_DEBUG */
01344   buf_flush_write_block_low(bpage);
01345 }
01346 
01347 /***********************************************************/
01350 static
01351 ulint
01352 buf_flush_try_neighbors(
01353 /*====================*/
01354   ulint   space,    
01355   ulint   offset,   
01356   enum buf_flush  flush_type, 
01358   ulint   n_flushed,  
01360   ulint   n_to_flush) 
01362 {
01363   ulint   i;
01364   ulint   low;
01365   ulint   high;
01366   ulint   count = 0;
01367   buf_pool_t* buf_pool = buf_pool_get(space, offset);
01368 
01369   ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
01370 
01371   if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
01372     /* If there is little space, it is better not to flush
01373     any block except from the end of the LRU list */
01374 
01375     low = offset;
01376     high = offset + 1;
01377   } else {
01378     /* When flushed, dirty blocks are searched in
01379     neighborhoods of this size, and flushed along with the
01380     original page. */
01381 
01382     ulint buf_flush_area;
01383   
01384     buf_flush_area  = ut_min(
01385       BUF_READ_AHEAD_AREA(buf_pool),
01386       buf_pool->curr_size / 16);
01387 
01388     low = (offset / buf_flush_area) * buf_flush_area;
01389     high = (offset / buf_flush_area + 1) * buf_flush_area;
01390   }
01391 
01392   /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
01393 
01394   if (high > fil_space_get_size(space)) {
01395     high = fil_space_get_size(space);
01396   }
01397 
01398   for (i = low; i < high; i++) {
01399 
01400     buf_page_t* bpage;
01401 
01402     if ((count + n_flushed) >= n_to_flush) {
01403 
01404       /* We have already flushed enough pages and
01405       should call it a day. There is, however, one
01406       exception. If the page whose neighbors we
01407       are flushing has not been flushed yet then
01408       we'll try to flush the victim that we
01409       selected originally. */
01410       if (i <= offset) {
01411         i = offset;
01412       } else {
01413         break;
01414       }
01415     }
01416 
01417     buf_pool = buf_pool_get(space, i);
01418 
01419     buf_pool_mutex_enter(buf_pool);
01420 
01421     /* We only want to flush pages from this buffer pool. */
01422     bpage = buf_page_hash_get(buf_pool, space, i);
01423 
01424     if (!bpage) {
01425 
01426       buf_pool_mutex_exit(buf_pool);
01427       continue;
01428     }
01429 
01430     ut_a(buf_page_in_file(bpage));
01431 
01432     /* We avoid flushing 'non-old' blocks in an LRU flush,
01433     because the flushed blocks are soon freed */
01434 
01435     if (flush_type != BUF_FLUSH_LRU
01436         || i == offset
01437         || buf_page_is_old(bpage)) {
01438       mutex_t* block_mutex = buf_page_get_mutex(bpage);
01439 
01440       mutex_enter(block_mutex);
01441 
01442       if (buf_flush_ready_for_flush(bpage, flush_type)
01443           && (i == offset || !bpage->buf_fix_count)) {
01444         /* We only try to flush those
01445         neighbors != offset where the buf fix
01446         count is zero, as we then know that we
01447         probably can latch the page without a
01448         semaphore wait. Semaphore waits are
01449         expensive because we must flush the
01450         doublewrite buffer before we start
01451         waiting. */
01452 
01453         buf_flush_page(buf_pool, bpage, flush_type);
01454         ut_ad(!mutex_own(block_mutex));
01455         ut_ad(!buf_pool_mutex_own(buf_pool));
01456         count++;
01457         continue;
01458       } else {
01459         mutex_exit(block_mutex);
01460       }
01461     }
01462     buf_pool_mutex_exit(buf_pool);
01463   }
01464 
01465   return(count);
01466 }
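
The neighborhood flushed above is an aligned window of buf_flush_area
pages around the victim. For example, assuming
BUF_READ_AHEAD_AREA(buf_pool) evaluates to 64 on a large pool, a
victim at page offset 200 yields the window [192, 256); a sketch of
the bounds arithmetic:

/* Sketch, not part of buf0flu.cc; the numbers are illustrative. */
static void
buf_flush_area_bounds_example(void)
{
  ulint buf_flush_area = 64;  /* assumed BUF_READ_AHEAD_AREA value */
  ulint offset = 200;
  ulint low = (offset / buf_flush_area) * buf_flush_area;
  ulint high = (offset / buf_flush_area + 1) * buf_flush_area;

  ut_a(low == 192);
  ut_a(high == 256);

  /* The function further caps high at fil_space_get_size(space),
  so the window never runs past the end of the tablespace. */
}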
01467 
01468 /********************************************************************/
01475 static
01476 ibool
01477 buf_flush_page_and_try_neighbors(
01478 /*=============================*/
01479   buf_page_t* bpage,    
01482   enum buf_flush  flush_type, 
01484   ulint   n_to_flush, 
01486   ulint*    count)    
01488 {
01489   mutex_t*  block_mutex;
01490   ibool   flushed = FALSE;
01491 #ifdef UNIV_DEBUG
01492   buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
01493 #endif /* UNIV_DEBUG */
01494 
01495   ut_ad(buf_pool_mutex_own(buf_pool));
01496 
01497   block_mutex = buf_page_get_mutex(bpage);
01498   mutex_enter(block_mutex);
01499 
01500   ut_a(buf_page_in_file(bpage));
01501 
01502   if (buf_flush_ready_for_flush(bpage, flush_type)) {
01503     ulint   space;
01504     ulint   offset;
01505     buf_pool_t* buf_pool;
01506 
01507     buf_pool = buf_pool_from_bpage(bpage);
01508 
01509     buf_pool_mutex_exit(buf_pool);
01510 
01511     /* These fields are protected by both the
01512     buffer pool mutex and block mutex. */
01513     space = buf_page_get_space(bpage);
01514     offset = buf_page_get_page_no(bpage);
01515 
01516     mutex_exit(block_mutex);
01517 
01518     /* Try to flush also all the neighbors */
01519     *count += buf_flush_try_neighbors(space,
01520               offset,
01521               flush_type,
01522               *count,
01523               n_to_flush);
01524 
01525     buf_pool_mutex_enter(buf_pool);
01526     flushed = TRUE;
01527   } else {
01528     mutex_exit(block_mutex);
01529   }
01530 
01531   ut_ad(buf_pool_mutex_own(buf_pool));
01532 
01533   return(flushed);
01534 }
01535 
01536 /*******************************************************************/
01542 static
01543 ulint
01544 buf_flush_LRU_list_batch(
01545 /*=====================*/
01546   buf_pool_t* buf_pool, 
01547   ulint   max)    
01548 {
01549   buf_page_t* bpage;
01550   ulint   count = 0;
01551 
01552   ut_ad(buf_pool_mutex_own(buf_pool));
01553 
01554   do {
01555     /* Start from the end of the list looking for a
01556     suitable block to be flushed. */
01557     bpage = UT_LIST_GET_LAST(buf_pool->LRU);
01558 
01559     /* Iterate backwards over the LRU list, skipping pages that
01560     are not ready for flushing, until one is flushed. */
01561     while (bpage != NULL
01562            && !buf_flush_page_and_try_neighbors(
01563         bpage, BUF_FLUSH_LRU, max, &count)) {
01564 
01565       bpage = UT_LIST_GET_PREV(LRU, bpage);
01566     }
01567   } while (bpage != NULL && count < max);
01568 
01569   /* We keep track of all flushes happening as part of LRU
01570   flush. When estimating the desired rate at which flush_list
01571   should be flushed, we factor in this value. */
01572   buf_lru_flush_page_count += count;
01573 
01574   ut_ad(buf_pool_mutex_own(buf_pool));
01575 
01576   return(count);
01577 }
01578 
01579 /*******************************************************************/
01585 static
01586 ulint
01587 buf_flush_flush_list_batch(
01588 /*=======================*/
01589   buf_pool_t* buf_pool, 
01590   ulint   min_n,    
01594   ib_uint64_t lsn_limit)  
01599 {
01600   ulint   len;
01601   buf_page_t* bpage;
01602   ulint   count = 0;
01603 
01604   ut_ad(buf_pool_mutex_own(buf_pool));
01605 
01606   /* If we have flushed enough, leave the loop */
01607   do {
01608     /* Start from the end of the list looking for a suitable
01609     block to be flushed. */
01610 
01611     buf_flush_list_mutex_enter(buf_pool);
01612 
01613     /* We use len here because theoretically insertions can
01614     happen in the flush_list below while we are traversing
01615     it for a suitable candidate for flushing. We'd like to
01616     set a limit on how far we are willing to traverse
01617     the list. */
01618     len = UT_LIST_GET_LEN(buf_pool->flush_list);
01619     bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
01620 
01621     if (bpage) {
01622       ut_a(bpage->oldest_modification > 0);
01623     }
01624 
01625     if (!bpage || bpage->oldest_modification >= lsn_limit) {
01626 
01627       /* We have flushed enough */
01628       buf_flush_list_mutex_exit(buf_pool);
01629       break;
01630     }
01631 
01632     ut_a(bpage->oldest_modification > 0);
01633 
01634     ut_ad(bpage->in_flush_list);
01635 
01636     buf_flush_list_mutex_exit(buf_pool);
01637 
01638     /* The list may change during the flushing and we cannot
01639     safely preserve within this function a pointer to a
01640     block in the list! */
01641     while (bpage != NULL
01642            && len > 0
01643            && !buf_flush_page_and_try_neighbors(
01644         bpage, BUF_FLUSH_LIST, min_n, &count)) {
01645 
01646       buf_flush_list_mutex_enter(buf_pool);
01647 
01648       /* If we are here that means that buf_pool->mutex
01649        was not released in buf_flush_page_and_try_neighbors()
01650       above and this guarantees that bpage didn't get
01651       relocated since we released the flush_list
01652       mutex above. There is a chance, however, that
01653       the bpage got removed from flush_list (not
01654       currently possible because flush_list_remove()
01655       also obtains buf_pool mutex but that may change
01656       in future). To avoid this scenario we check
01657       the oldest_modification and if it is zero
01658       we start all over again. */
01659       if (bpage->oldest_modification == 0) {
01660         buf_flush_list_mutex_exit(buf_pool);
01661         break;
01662       }
01663 
01664       bpage = UT_LIST_GET_PREV(list, bpage);
01665 
01666       ut_ad(!bpage || bpage->in_flush_list);
01667 
01668       buf_flush_list_mutex_exit(buf_pool);
01669 
01670       --len;
01671     }
01672 
01673   } while (count < min_n && bpage != NULL && len > 0);
01674 
01675   ut_ad(buf_pool_mutex_own(buf_pool));
01676 
01677   return(count);
01678 }
01679 
01680 /*******************************************************************/
01688 static
01689 ulint
01690 buf_flush_batch(
01691 /*============*/
01692   buf_pool_t* buf_pool, 
01693   enum buf_flush  flush_type, 
01697   ulint   min_n,    
01700   ib_uint64_t lsn_limit)  
01705 {
01706   ulint   count = 0;
01707 
01708   ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
01709 #ifdef UNIV_SYNC_DEBUG
01710   ut_ad((flush_type != BUF_FLUSH_LIST)
01711         || sync_thread_levels_empty_gen(TRUE));
01712 #endif /* UNIV_SYNC_DEBUG */
01713 
01714   buf_pool_mutex_enter(buf_pool);
01715 
01716   /* Note: The buffer pool mutex is released and reacquired within
01717   the flush functions. */
01718   switch(flush_type) {
01719   case BUF_FLUSH_LRU:
01720     count = buf_flush_LRU_list_batch(buf_pool, min_n);
01721     break;
01722   case BUF_FLUSH_LIST:
01723     count = buf_flush_flush_list_batch(buf_pool, min_n, lsn_limit);
01724     break;
01725   default:
01726     ut_error;
01727   }
01728 
01729   buf_pool_mutex_exit(buf_pool);
01730 
01731   buf_flush_buffered_writes();
01732 
01733 #ifdef UNIV_DEBUG
01734   if (buf_debug_prints && count > 0) {
01735     fprintf(stderr, flush_type == BUF_FLUSH_LRU
01736       ? "Flushed %lu pages in LRU flush\n"
01737       : "Flushed %lu pages in flush list flush\n",
01738       (ulong) count);
01739   }
01740 #endif /* UNIV_DEBUG */
01741 
01742   srv_buf_pool_flushed += count;
01743 
01744   return(count);
01745 }
01746 
01747 /******************************************************************/
01749 static
01750 void
01751 buf_flush_common(
01752 /*=============*/
01753   enum buf_flush  flush_type, 
01754   ulint   page_count) 
01755 {
01756   buf_flush_buffered_writes();
01757 
01758   ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
01759 
01760 #ifdef UNIV_DEBUG
01761   if (buf_debug_prints && page_count > 0) {
01762     fprintf(stderr, flush_type == BUF_FLUSH_LRU
01763       ? "Flushed %lu pages in LRU flush\n"
01764       : "Flushed %lu pages in flush list flush\n",
01765       (ulong) page_count);
01766   }
01767 #endif /* UNIV_DEBUG */
01768 
01769   srv_buf_pool_flushed += page_count;
01770 
01771   if (flush_type == BUF_FLUSH_LRU) {
01772     /* We keep track of all flushes happening as part of LRU
01773     flush. When estimating the desired rate at which flush_list
01774     should be flushed we factor in this value. */
01775     buf_lru_flush_page_count += page_count;
01776   }
01777 }
01778 
01779 /******************************************************************/
01781 static
01782 ibool
01783 buf_flush_start(
01784 /*============*/
01785   buf_pool_t* buf_pool, 
01786   enum buf_flush  flush_type) 
01788 {
01789   buf_pool_mutex_enter(buf_pool);
01790 
01791   if (buf_pool->n_flush[flush_type] > 0
01792      || buf_pool->init_flush[flush_type] == TRUE) {
01793 
01794     /* There is already a flush batch of the same type running */
01795 
01796     buf_pool_mutex_exit(buf_pool);
01797 
01798     return(FALSE);
01799   }
01800 
01801   buf_pool->init_flush[flush_type] = TRUE;
01802 
01803   buf_pool_mutex_exit(buf_pool);
01804 
01805   return(TRUE);
01806 }
01807 
01808 /******************************************************************/
01810 static
01811 void
01812 buf_flush_end(
01813 /*==========*/
01814   buf_pool_t* buf_pool, 
01815   enum buf_flush  flush_type) 
01817 {
01818   buf_pool_mutex_enter(buf_pool);
01819 
01820   buf_pool->init_flush[flush_type] = FALSE;
01821 
01822   if (buf_pool->n_flush[flush_type] == 0) {
01823 
01824     /* The running flush batch has ended */
01825 
01826     os_event_set(buf_pool->no_flush[flush_type]);
01827   }
01828 
01829   buf_pool_mutex_exit(buf_pool);
01830 }
01831 
01832 /******************************************************************//**
01833 Waits until a flush batch of the given type ends. */
01834 UNIV_INTERN
01835 void
01836 buf_flush_wait_batch_end(
01837 /*=====================*/
01838   buf_pool_t* buf_pool,   /*!< in: buffer pool instance */
01839   enum buf_flush  type)   /*!< in: BUF_FLUSH_LRU or
01840           BUF_FLUSH_LIST */
01841 {
01842   ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
01843 
01844   if (buf_pool == NULL) {
01845     ulint i;
01846 
01847     for (i = 0; i < srv_buf_pool_instances; ++i) {
01848       buf_pool_t* i_buf_pool = buf_pool_from_array(i);
01849 
01850       os_event_wait(i_buf_pool->no_flush[type]);
01851     }
01852   } else {
01853     os_event_wait(buf_pool->no_flush[type]);
01854   }
01855 }
01856 
01857 /*******************************************************************//**
01858 This utility flushes dirty blocks from the end of the LRU list.
01859 NOTE: The calling thread may own latches to pages; to avoid deadlocks,
01860 this function must be written so that it cannot end up waiting for
01861 these latches!
01862 @return number of blocks for which the write request was queued;
01863 ULINT_UNDEFINED if there was a flush of the same type already running */
01864 UNIV_INTERN
01865 ulint
01866 buf_flush_LRU(
01867 /*==========*/
01868   buf_pool_t* buf_pool, /*!< in: buffer pool instance */
01869   ulint   min_n)    /*!< in: wished minimum number of blocks
01870           flushed (it is not guaranteed that the
01871           actual number is that big, though) */
01872 {
01873   ulint   page_count;
01874 
01875   if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) {
01876     return(ULINT_UNDEFINED);
01877   }
01878 
01879   page_count = buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0);
01880 
01881   buf_flush_end(buf_pool, BUF_FLUSH_LRU);
01882 
01883   buf_flush_common(BUF_FLUSH_LRU, page_count);
01884 
01885   return(page_count);
01886 }
01887 
01888 /*******************************************************************//**
01889 This utility flushes dirty blocks from the end of the flush list of
01890 all buffer pool instances.
01891 NOTE: The calling thread is not allowed to own any latches on pages!
01892 @return number of blocks for which the write request was queued;
01893 ULINT_UNDEFINED if there was a flush of the same type already running */
01894 UNIV_INTERN
01895 ulint
01896 buf_flush_list(
01897 /*===========*/
01898   ulint   min_n,    /*!< in: wished minimum number of blocks
01899           flushed (it is not guaranteed that the
01900           actual number is that big, though) */
01901   ib_uint64_t lsn_limit)  /*!< in: in the case BUF_FLUSH_LIST
01902           all blocks whose oldest_modification is
01903           smaller than this should be flushed
01904           (if their number does not exceed
01905           min_n), otherwise ignored */
01906 {
01907   ulint   i;
01908   ulint   total_page_count = 0;
01909   ibool   skipped = FALSE;
01910 
01911   if (min_n != ULINT_MAX) {
01912     /* Ensure that flushing is spread evenly amongst the
01913     buffer pool instances. When min_n is ULINT_MAX we need
01914     to flush everything up to the lsn limit, so no
01915     per-instance cap applies in that case. */
01916     min_n = (min_n + srv_buf_pool_instances - 1)
01917        / srv_buf_pool_instances;
01918   }
01919 
01920   /* Flush to lsn_limit in all buffer pool instances */
01921   for (i = 0; i < srv_buf_pool_instances; i++) {
01922     buf_pool_t* buf_pool;
01923     ulint   page_count = 0;
01924 
01925     buf_pool = buf_pool_from_array(i);
01926 
01927     if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) {
01928       /* We have two choices here. If lsn_limit was
01929       specified, then skipping a buffer pool instance
01930       means we cannot guarantee that all pages up to
01931       lsn_limit have been flushed. We can either
01932       return failure right now, or we can try to
01933       flush the remaining buffer pools up to the
01934       lsn_limit. We attempt to flush the other
01935       buffer pools on the assumption that doing so
01936       will help the retry which will follow the
01937       failure. */
01938       skipped = TRUE;
01939 
01940       continue;
01941     }
01942 
01943     page_count = buf_flush_batch(
01944       buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit);
01945 
01946     buf_flush_end(buf_pool, BUF_FLUSH_LIST);
01947 
01948     buf_flush_common(BUF_FLUSH_LIST, page_count);
01949 
01950     total_page_count += page_count;
01951   }
01952 
01953   return(lsn_limit != IB_ULONGLONG_MAX && skipped
01954          ? ULINT_UNDEFINED : total_page_count);
01955 }
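
The per-instance share of min_n computed above is a ceiling division, so the instance totals still add up to at least the caller's target; for example, min_n = 100 over 8 instances yields 13 pages per pool (8 * 13 = 104 >= 100). A one-function sketch of that arithmetic:

    /* Illustrative sketch of the min_n split used by buf_flush_list(). */
    #include <stdio.h>

    static unsigned long
    per_instance_min(unsigned long min_n, unsigned long n_instances)
    {
        /* ceil(min_n / n_instances) without floating point */
        return (min_n + n_instances - 1) / n_instances;
    }

    int main(void)
    {
        /* 100 pages over 8 pools: 13 each (8 * 13 = 104 >= 100) */
        printf("%lu\n", per_instance_min(100, 8));
        return 0;
    }
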
01956  
01957 /******************************************************************//**
01958 Gives a recommendation of how many blocks should be flushed to establish
01959 a big enough margin of replaceable blocks near the end of the LRU list
01960 and in the free list.
01961 @return number of blocks which should be flushed from the end of the
01962 LRU list */
01963 static
01964 ulint
01965 buf_flush_LRU_recommendation(
01966 /*=========================*/
01967   buf_pool_t* buf_pool)   /*!< in: buffer pool instance */
01968 {
01969   buf_page_t* bpage;
01970   ulint   n_replaceable;
01971   ulint   distance  = 0;
01972 
01973   buf_pool_mutex_enter(buf_pool);
01974 
01975   n_replaceable = UT_LIST_GET_LEN(buf_pool->free);
01976 
01977   bpage = UT_LIST_GET_LAST(buf_pool->LRU);
01978 
01979   while ((bpage != NULL)
01980          && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)
01981        + BUF_FLUSH_EXTRA_MARGIN(buf_pool))
01982          && (distance < BUF_LRU_FREE_SEARCH_LEN(buf_pool))) {
01983 
01984     mutex_t* block_mutex = buf_page_get_mutex(bpage);
01985 
01986     mutex_enter(block_mutex);
01987 
01988     if (buf_flush_ready_for_replace(bpage)) {
01989       n_replaceable++;
01990     }
01991 
01992     mutex_exit(block_mutex);
01993 
01994     distance++;
01995 
01996     bpage = UT_LIST_GET_PREV(LRU, bpage);
01997   }
01998 
01999   buf_pool_mutex_exit(buf_pool);
02000 
02001   if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)) {
02002 
02003     return(0);
02004   }
02005 
02006   return(BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)
02007          + BUF_FLUSH_EXTRA_MARGIN(buf_pool)
02008          - n_replaceable);
02009 }
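
The recommendation above is zero once enough replaceable pages exist, and otherwise asks for enough flushes to restore both margins. With hypothetical margins of 128 (free-block) and 64 (extra), and 40 replaceable pages found, it would recommend 128 + 64 - 40 = 152 flushes. The sketch below mirrors that arithmetic with plain numbers rather than the BUF_FLUSH_* macros:

    /* Illustrative sketch of the margin arithmetic; the margin values
    are hypothetical, not the real BUF_FLUSH_* macro values. */
    #include <stdio.h>

    static unsigned long
    lru_recommendation(unsigned long n_replaceable,
                       unsigned long margin, unsigned long extra)
    {
        if (n_replaceable >= margin)
            return 0;       /* enough slack already, flush nothing */

        return margin + extra - n_replaceable;
    }

    int main(void)
    {
        printf("%lu\n", lru_recommendation(40, 128, 64)); /* 152 */
        return 0;
    }
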
02010 
02011 /*********************************************************************//**
02012 Flushes pages from the end of the LRU list if there is too small a margin
02013 of replaceable pages there or in the free list. VERY IMPORTANT: this
02014 function is called also by threads which have locks on pages. To avoid
02015 deadlocks, we flush only pages such that the s-lock required for
02016 flushing can be acquired immediately, without waiting. */
02017 UNIV_INTERN
02018 void
02019 buf_flush_free_margin(
02020 /*==================*/
02021   buf_pool_t* buf_pool)   /*!< in: buffer pool instance */
02022 {
02023   ulint n_to_flush;
02024 
02025   n_to_flush = buf_flush_LRU_recommendation(buf_pool);
02026 
02027   if (n_to_flush > 0) {
02028     ulint n_flushed;
02029 
02030     n_flushed = buf_flush_LRU(buf_pool, n_to_flush);
02031 
02032     if (n_flushed == ULINT_UNDEFINED) {
02033       /* There was an LRU type flush batch already running;
02034       let us wait for it to end */
02035 
02036       buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
02037     }
02038   }
02039 }
02040 
02041 /*********************************************************************//**
02042 Flushes pages from the end of all the LRU lists. */
02043 UNIV_INTERN
02044 void
02045 buf_flush_free_margins(void)
02046 /*========================*/
02047 {
02048   ulint i;
02049 
02050   for (i = 0; i < srv_buf_pool_instances; i++) {
02051     buf_pool_t* buf_pool;
02052 
02053     buf_pool = buf_pool_from_array(i);
02054 
02055     buf_flush_free_margin(buf_pool);
02056   }
02057 }
02058 
02059 /*********************************************************************
02060 Update the historical stats that we are collecting for flush rate
02061 heuristics at the end of each interval.
02062 The flush rate heuristic depends on (a) the rate of redo log generation
02063 and (b) the rate at which LRU flushing is happening. */
02064 UNIV_INTERN
02065 void
02066 buf_flush_stat_update(void)
02067 /*=======================*/
02068 {
02069   buf_flush_stat_t* item;
02070   ib_uint64_t   lsn_diff;
02071   ib_uint64_t   lsn;
02072   ulint     n_flushed;
02073 
02074   lsn = log_get_lsn();
02075   if (buf_flush_stat_cur.redo == 0) {
02076     /* First time around. Just update the current LSN
02077     and return. */
02078     buf_flush_stat_cur.redo = lsn;
02079     return;
02080   }
02081 
02082   item = &buf_flush_stat_arr[buf_flush_stat_arr_ind];
02083 
02084   /* values for this interval */
02085   lsn_diff = lsn - buf_flush_stat_cur.redo;
02086   n_flushed = buf_lru_flush_page_count
02087         - buf_flush_stat_cur.n_flushed;
02088 
02089   /* add the current value and subtract the obsolete entry. */
02090   buf_flush_stat_sum.redo += lsn_diff - item->redo;
02091   buf_flush_stat_sum.n_flushed += n_flushed - item->n_flushed;
02092 
02093   /* put current entry in the array. */
02094   item->redo = lsn_diff;
02095   item->n_flushed = n_flushed;
02096 
02097   /* update the index */
02098   buf_flush_stat_arr_ind++;
02099   buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL;
02100 
02101   /* reset the current entry. */
02102   buf_flush_stat_cur.redo = lsn;
02103   buf_flush_stat_cur.n_flushed = buf_lru_flush_page_count;
02104 }
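
The update above maintains rolling sums over the last BUF_FLUSH_STAT_N_INTERVAL intervals by adding the newest per-interval delta and subtracting the slot it overwrites in the circular array; unsigned wrap-around makes the add-and-subtract safe even when a delta underflows. A self-contained sketch of the same sliding-window technique (the window size and sample values are made up):

    /* Illustrative sketch: rolling sum over the last N samples using
    a circular array, mirroring the redo/n_flushed bookkeeping above. */
    #include <stdio.h>

    #define N_INTERVAL 4   /* stands in for BUF_FLUSH_STAT_N_INTERVAL */

    static unsigned long window[N_INTERVAL];
    static unsigned long window_sum;
    static unsigned long ind;

    static void
    stat_update(unsigned long sample)
    {
        /* add the current value and subtract the obsolete entry */
        window_sum += sample - window[ind];
        window[ind] = sample;
        ind = (ind + 1) % N_INTERVAL;
    }

    int main(void)
    {
        unsigned long samples[] = { 10, 20, 30, 40, 50, 60 };
        unsigned long i;

        for (i = 0; i < sizeof(samples) / sizeof(*samples); i++) {
            stat_update(samples[i]);
            printf("avg over window = %lu\n",
                   window_sum / N_INTERVAL);
        }
        /* final window holds 30,40,50,60 so the last average is 45 */
        return 0;
    }
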
02105 
02106 /*********************************************************************
02107 Determines the fraction of dirty pages that need to be flushed based
02108 on the speed at which we generate redo log. Note that if redo log
02109 is generated at a significant rate without a corresponding increase
02110 in the number of dirty pages (for example, in an in-memory workload),
02111 this can cause IO bursts of flushing. This function implements heuristics
02112 to avoid this burstiness.
02113 @return number of dirty pages to be flushed / second */
02114 UNIV_INTERN
02115 ulint
02116 buf_flush_get_desired_flush_rate(void)
02117 /*==================================*/
02118 {
02119   ulint   i;
02120   lint    rate;
02121   ulint   redo_avg;
02122   ulint   n_dirty = 0;
02123   ulint   n_flush_req;
02124   ulint   lru_flush_avg;
02125   ib_uint64_t lsn = log_get_lsn();
02126   ulint   log_capacity = log_get_capacity();
02127 
02128   /* log_capacity should never be zero after the initialization
02129   of log subsystem. */
02130   ut_ad(log_capacity != 0);
02131 
02132   /* Get total number of dirty pages. It is OK to access
02133   flush_list without holding any mutex as we are using this
02134   only for heuristics. */
02135   for (i = 0; i < srv_buf_pool_instances; i++) {
02136     buf_pool_t* buf_pool;
02137 
02138     buf_pool = buf_pool_from_array(i);
02139     n_dirty += UT_LIST_GET_LEN(buf_pool->flush_list);
02140   }
02141 
02142   /* An overflow can happen if we generate more than 2^32 bytes
02143   of redo in this interval, i.e. 4GB of redo in 1 second. We can
02144   safely treat this as infinity because if we ever come close
02145   to 4GB we'll start a synchronous flush of dirty pages. */
02146   /* redo_avg below is the average rate at which redo is generated
02147   over the past BUF_FLUSH_STAT_N_INTERVAL intervals, plus the redo
02148   generated in the current interval. */
02149   redo_avg = (ulint) (buf_flush_stat_sum.redo
02150           / BUF_FLUSH_STAT_N_INTERVAL
02151           + (lsn - buf_flush_stat_cur.redo));
02152 
02153   /* An overflow could possibly happen if we flush more than 2^32
02154   pages in BUF_FLUSH_STAT_N_INTERVAL, which is a very
02155   unlikely scenario. Even if it happens, it only means that our
02156   flush rate will be off the mark; it won't affect the correctness
02157   of any subsystem. */
02158   /* lru_flush_avg below is the average rate of LRU flushing over
02159   the past BUF_FLUSH_STAT_N_INTERVAL intervals, plus the number
02160   of pages flushed in the current interval. */
02161   lru_flush_avg = buf_flush_stat_sum.n_flushed
02162       / BUF_FLUSH_STAT_N_INTERVAL
02163       + (buf_lru_flush_page_count
02164          - buf_flush_stat_cur.n_flushed);
02165 
02166   n_flush_req = (n_dirty * redo_avg) / log_capacity;
02167 
02168   /* The number of pages that we want to flush from the flush
02169   list is the difference between the required rate and the
02170   number of pages that we are historically flushing from the
02171   LRU list */
02172   rate = n_flush_req - lru_flush_avg;
02173   return(rate > 0 ? (ulint) rate : 0);
02174 }
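
The formula above scales the dirty-page count by the fraction of log capacity consumed per interval, then subtracts what LRU flushing already contributes. With hypothetical inputs (10000 dirty pages, a redo_avg of 2 MB per interval, a log_capacity of 100 MB, and an LRU flush average of 150 pages), the desired rate is 10000 * 2 / 100 - 150 = 50 pages:

    /* Illustrative sketch of the desired-rate formula; all input
    values below are hypothetical. */
    #include <stdio.h>

    static unsigned long
    desired_flush_rate(unsigned long n_dirty, unsigned long redo_avg,
                       unsigned long log_capacity,
                       unsigned long lru_flush_avg)
    {
        long rate = (long) ((n_dirty * redo_avg) / log_capacity)
                  - (long) lru_flush_avg;

        /* never ask for a negative number of pages */
        return rate > 0 ? (unsigned long) rate : 0;
    }

    int main(void)
    {
        /* 10000 dirty pages, 2 MB redo/interval, 100 MB capacity,
        150 pages/interval already flushed by LRU: prints 50 */
        printf("%lu\n", desired_flush_rate(10000, 2, 100, 150));
        return 0;
    }
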
02175 
02176 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
02177 /******************************************************************//**
02178 Validates the flush list.
02179 @return TRUE if ok */
02180 static
02181 ibool
02182 buf_flush_validate_low(
02183 /*===================*/
02184   buf_pool_t* buf_pool)   /*!< in: buffer pool instance */
02185 {
02186   buf_page_t*   bpage;
02187   const ib_rbt_node_t*  rnode = NULL;
02188 
02189   ut_ad(buf_flush_list_mutex_own(buf_pool));
02190 
02191   UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
02192        ut_ad(ut_list_node_313->in_flush_list));
02193 
02194   bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
02195 
02196   /* If we are in recovery mode, i.e. flush_rbt != NULL,
02197   then each block in the flush_list must also be present
02198   in the flush_rbt. */
02199   if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
02200     rnode = rbt_first(buf_pool->flush_rbt);
02201   }
02202 
02203   while (bpage != NULL) {
02204     const ib_uint64_t om = bpage->oldest_modification;
02205 
02206     ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
02207 
02208     ut_ad(bpage->in_flush_list);
02209 
02210     /* A page in buf_pool->flush_list can be in
02211     BUF_BLOCK_REMOVE_HASH state. This happens when a page
02212     is in the middle of being relocated. In that case the
02213     original descriptor can have this state and still be
02214     in the flush list waiting to acquire the
02215     buf_pool->flush_list_mutex to complete the relocation. */
02216     ut_a(buf_page_in_file(bpage)
02217          || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
02218     ut_a(om > 0);
02219 
02220     if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
02221       buf_page_t** prpage;
02222 
02223       ut_a(rnode);
02224       prpage = rbt_value(buf_page_t*, rnode);
02225 
02226       ut_a(*prpage);
02227       ut_a(*prpage == bpage);
02228       rnode = rbt_next(buf_pool->flush_rbt, rnode);
02229     }
02230 
02231     bpage = UT_LIST_GET_NEXT(list, bpage);
02232 
02233     ut_a(!bpage || om >= bpage->oldest_modification);
02234   }
02235 
02236   /* By this time we must have exhausted the traversal of
02237   flush_rbt (if active) as well. */
02238   ut_a(rnode == NULL);
02239 
02240   return(TRUE);
02241 }
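
The central invariant checked above is that flush_list is ordered by oldest_modification, newest first: each entry's value must be greater than or equal to its successor's. A minimal sketch of that ordering check, over a plain array standing in for buf_pool->flush_list:

    /* Illustrative sketch: the descending-order invariant on
    oldest_modification, checked on an array instead of the list. */
    #include <assert.h>
    #include <stdio.h>

    static int
    flush_order_ok(const unsigned long long* om, size_t n)
    {
        size_t i;

        for (i = 1; i < n; i++) {
            if (om[i - 1] < om[i]) {
                return 0;   /* successor is newer: invariant broken */
            }
        }
        return 1;
    }

    int main(void)
    {
        /* oldest_modification LSNs, newest first; ties are allowed */
        unsigned long long oms[] = { 900, 750, 750, 300 };

        assert(flush_order_ok(oms, 4));
        printf("flush order OK\n");
        return 0;
    }
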
02242 
02243 /******************************************************************//**
02244 Validates the flush list for one buffer pool instance.
02245 @return TRUE if ok */
02246 UNIV_INTERN
02247 ibool
02248 buf_flush_validate(
02249 /*===============*/
02250   buf_pool_t* buf_pool) /*!< in: buffer pool instance */
02251 {
02252   ibool ret;
02253 
02254   buf_flush_list_mutex_enter(buf_pool);
02255 
02256   ret = buf_flush_validate_low(buf_pool);
02257 
02258   buf_flush_list_mutex_exit(buf_pool);
02259 
02260   return(ret);
02261 }
02262 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
02263 #endif /* !UNIV_HOTBACKUP */