1
/* Copyright (C) 2003 MySQL AB
3
This program is free software; you can redistribute it and/or modify
4
it under the terms of the GNU General Public License as published by
5
the Free Software Foundation; version 2 of the License.
7
This program is distributed in the hope that it will be useful,
8
but WITHOUT ANY WARRANTY; without even the implied warranty of
9
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
GNU General Public License for more details.
12
You should have received a copy of the GNU General Public License
13
along with this program; if not, write to the Free Software
14
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
19
#include <SimulatedBlock.hpp>
21
#include <DLCHashTable.hpp>
22
#include <DLCFifoList.hpp>
23
#include <NodeBitmask.hpp>
24
#include <signaldata/LCP.hpp>
28
#include <OutputStream.hpp>
33
* PAGE ENTRIES AND REQUESTS
35
* Central structure is "page entry". It corresponds to a disk page
36
* identified by file and page number (file_no, page_no).
38
* A page entry is created by the first request for the disk page.
39
* Subsequent requests are queued under the same page entry.
41
* There is a limited number of in-memory "cache pages", also called
42
* "buffer pages" or "real pages". These are used by the more numerous
43
* page entries to buffer the disk pages.
45
* A new or non-resident page entry must first be "bound" to an
46
* available cache page. Next the disk page must be "mapped" to the
47
* cache page. If the page is empty (never written) it is considered
48
* mapped trivially. Otherwise the cache page must be updated via
49
* "pagein" from disk. A bound and mapped page is called "resident".
51
* Updating a resident cache page makes it "dirty". A background
52
* clean-up process makes dirty pages "clean" via "pageout" to disk.
53
* Write ahead logging (WAL) of the page is done first i.e. UNDO log is
54
* flushed up to the page log sequence number (LSN) by calling a TSMAN
55
* method. The reason for this is obvious but not relevant to PGMAN.
57
* A local check point (LCP) periodically performs a complete pageout of
58
* dirty pages. It must iterate over a list which will cover all pages
59
* which had been dirty since LCP start.
61
* A clean page is a candidate ("victim") for being "unmapped" and
62
* "evicted" from the cache, to allow another page to become resident.
63
* This process is called "page replacement".
67
* Page replacement uses the LIRS algorithm (Jiang-Zhang).
69
* The "recency" of a page is the time between now and the last request
70
* for the page. The "inter-reference recency" (IRR) of a page is the
71
* time between the last 2 requests for the page. "Time" is advanced by
72
* request for any page.
74
* Page entries are divided into "hot" ("lir") and "cold" ("hir"). Here
75
* lir/hir refers to low/high IRR. Hot pages are always resident but
76
* cold pages need not be.
78
* Number of hot pages is limited to slightly less than number of cache
79
* pages. Until this number is reached, all used cache pages are hot.
80
* Then the algorithm described next is applied. The algorithm avoids
81
* storing any of the actual recency values.
83
* Primary data structure is the "stack". It contains all hot entries
84
* and recently referenced cold entries (resident or not). The stack is
85
* in recency order with most recent (lowest recency) entry on top.
86
* Entries which are less recent than the least recent hot page are
87
* removed ("stack pruning"). So the bottom page is always hot.
89
* The cold entries on the stack are undergoing a "trial period". If
90
* they are referenced soon again (see IRR), they become hot. Otherwise
91
* they fall off the bottom of the stack.
93
* Secondary data structure is the "queue". It contains all resident
94
* cold pages (on stack or not). When a hot page is removed from the
95
* stack it is added to the end of the queue. When page replacement
96
* needs a page it removes it from the front of the queue.
98
* Page requests cause the input entry to be inserted and updated in
99
* LIRS lists. Remember that an entry can be present on both stack and
100
* queue. The rules per type of input entry are:
102
* 1. Hot. Move input entry to stack top. If input entry was at stack
103
* bottom, do stack pruning.
105
* 2. Cold resident. Move input entry to stack top. Then:
107
* 2a. If input entry was on stack, change it to hot, remove it from
108
* queue, change stack bottom entry to cold and move the bottom entry to
109
* queue end, and do stack pruning.
111
* 2b. If input entry was on queue only, leave it cold but move it to
114
* 3. Cold non-resident. Remove entry at queue front and evict it from
115
* the cache. If the evicted entry was on stack, it remains as unbound
116
* entry on stack, to continue its trial period. Map input entry to the
117
* freed cache page. Move input entry to stack top. Then:
119
* 3a. If input entry was on stack, change it to hot, change stack
120
* bottom entry to cold and move the bottom entry to queue end, and do
123
* 3b. If input entry was new, leave it cold but move it to end of
128
* In LIRS the 'resident' requirement is changed as follows:
130
* Stack entries, including hot ones, can have any state. Unbound stack
131
* entries are created by new requests and by pages evicted from queue
132
* front which are still on stack.
134
* Queue entries must be bound. They become resident and evictable
135
* within a finite time. A page is "evictable" if it is mapped, clean,
136
* and has no requests.
138
* An unbound entry which should be on queue is added there at bind
139
* time. Such entries are created when an unbound entry with open
140
* requests is popped (hot) or pruned (cold) from the stack. This can
141
* happen if the cache is too small.
145
* LIRS (and related algorithms) do not address dirty pages. From above
146
* it is obvious that the clean-up process should process dirty queue
147
* entries proceeding from front to end. This also favors pages with
148
* lower LSN numbers which minimizes amount of WAL to write.
150
* In fact the clean-up process holds a permanent pointer into the queue
151
* where all entries strictly towards the front are clean. For such an
152
* entry to become dirty it must be referenced again which moves it to
153
* queue end and past the clean-up pointer. (In practice, until this
154
* works, cleanup recycles back to queue front).
158
* Page entries are put on a number of lists.
160
* 1. Hash table on (file_no, page_no). Used for fast lookup and for
161
* LCP to iterate over.
163
* The other lists are doubly-linked FIFOs. In general entries are
164
* added to the end (last entry) and processed from the front (first
165
* entry). When used as stack, end is top and front is bottom.
167
* 2. The LIRS stack and queue. These control page replacement.
169
* 3. Page entries are divided into disjoint "sublists" based on page
170
* "state" i.e. the set of page properties. Some sublists drive page
171
* processing and have next entry to process at the front.
173
* Current sublists are as follows. Those that drive processing are
174
* marked with a plus (+).
176
* SL_BIND + waiting for available buffer page
177
* SL_MAP + waiting to start pagein from disk
178
* SL_MAP_IO - above in i/o wait (the pagein)
179
* SL_CALLBACK + request done, waiting to invoke callbacks
180
* SL_CALLBACK_IO - above in i/o wait (pageout by cleanup)
181
* SL_BUSY - being written to by PGMAN client
182
* SL_LOCKED - permanently locked to cache
183
* SL_OTHER - default sublist
187
* Page processing uses a number of independent continueB loops.
189
* 1. The "stats loop". Started at node start. Checks lists in debug
190
* mode. In the future could gather statistics and adjust parameters
191
* based on load. Continues via delay signal.
193
* 2. The "busy loop". Started by page request. Each loop does bind,
194
* map, and callback of a number of entries. Continues via no-delay
195
* signal until nothing to do.
197
* 3. The "cleanup loop". Started at node start. Each loop starts
198
* pageout of a number of dirty queue entries. Continues via delay
201
* 4. The "LCP loop". Started periodically by NDB. Each loop starts
202
* pageout of a number of hash list entries. Continues via delay signal
207
* LOCKED pages are not put on stack or queue. They are flushed to disk
208
* by LCP but not by clean-up.
210
* A TUP scan is likely to access a page repeatedly within a short time.
211
* This can make the page hot when it should not be. Such "correlated
212
* requests" are handled by a request flag which modifies default LIRS
213
* processing. [fill in details later]
215
* Also PK operations make 2 rapid page references. The 2nd one is for
216
* commit. This too should be handled as a correlated request.
220
* TSMAN reads "meta" pages such as extent headers. These are currently
221
* "locked" forever in PGMAN cache.
225
* DBTUP works with copy pages (or UNDO buffers) in memory. The real
226
* page is updated only between page request with COMMIT_REQ flag and
227
* a subsequent LSN update. These need not occur in same timeslice
228
* since DBTUP may need to flush UNDO log in-between.
230
* The page is "busy" if any transaction is between COMMIT_REQ and LSN
231
* update. A busy page must be locked in buffer cache. No pageout of
232
* a busy page can be started by clean-up or LCP.
235
class Pgman : public SimulatedBlock
238
Pgman(Block_context& ctx);
240
BLOCK_DEFINES(Pgman);
243
friend class Page_cache_client;
245
struct Page_entry; // CC
246
friend struct Page_entry;
248
struct Page_request {
250
OP_MASK = 0x000F // 4 bits for TUP operation
251
,LOCK_PAGE = 0x0020 // lock page in memory
252
,EMPTY_PAGE = 0x0040 // empty (new) page
253
,ALLOC_REQ = 0x0080 // part of alloc
254
,COMMIT_REQ = 0x0100 // part of commit
255
,DIRTY_REQ = 0x0200 // make page dirty wo/ update_lsn
256
,UNLOCK_PAGE = 0x0400
257
,CORR_REQ = 0x0800 // correlated request (no LIRS update)
259
,DELAY_REQ = 0x1000 // Force request to be delayed
265
SimulatedBlock::Callback m_callback;
268
Uint64 m_delay_until_time;
274
typedef RecordPool<Page_request, WOPool> Page_request_pool;
275
typedef SLFifoListImpl<Page_request_pool, Page_request> Page_request_list;
276
typedef LocalSLFifoListImpl<Page_request_pool, Page_request> Local_page_request_list;
278
struct Page_entry_stack_ptr {
283
struct Page_entry_queue_ptr {
288
struct Page_entry_sublist_ptr {
293
typedef Uint16 Page_state;
295
struct Page_entry : Page_entry_stack_ptr,
296
Page_entry_queue_ptr,
297
Page_entry_sublist_ptr {
299
Page_entry(Uint32 file_no, Uint32 page_no);
303
,REQUEST = 0x0001 // has outstanding request
304
,EMPTY = 0x0002 // empty (never written) page
305
,BOUND = 0x0004 // m_real_page_ptr assigned
306
,MAPPED = 0x0008 // bound, and empty or paged in
307
,DIRTY = 0x0010 // page is modified
308
,USED = 0x0020 // used by some tx (not set currently)
309
,BUSY = 0x0040 // page is being written to
310
,LOCKED = 0x0080 // locked in cache (forever)
311
,PAGEIN = 0x0100 // paging in
312
,PAGEOUT = 0x0200 // paging out
313
,LOGSYNC = 0x0400 // undo WAL as part of pageout
314
,LCP = 0x1000 // page is LCP flushed
315
,HOT = 0x2000 // page is hot
316
,ONSTACK = 0x4000 // page is on LIRS stack
317
,ONQUEUE = 0x8000 // page is on LIRS queue
333
Uint16 m_file_no; // disk page address set at seize
334
Page_state m_state; // flags (0 for new entry)
337
Uint32 m_real_page_i;
341
Uint32 m_dirty_count;
342
Uint32 m_copy_page_i;
344
Uint32 m_busy_count; // non-zero means BUSY
348
Page_request_list::Head m_requests;
353
Uint32 hashValue() const { return m_file_no << 16 | m_page_no; }
354
bool equal(const Page_entry& obj) const {
356
m_file_no == obj.m_file_no && m_page_no == obj.m_page_no;
364
typedef DLCHashTable<Page_entry> Page_hashlist;
365
typedef DLCFifoList<Page_entry, Page_entry_stack_ptr> Page_stack;
366
typedef DLCFifoList<Page_entry, Page_entry_queue_ptr> Page_queue;
367
typedef DLCFifoList<Page_entry, Page_entry_sublist_ptr> Page_sublist;
370
Logfile_client m_lgman;
373
bool m_stats_loop_on;
375
bool m_cleanup_loop_on;
380
Uint32 m_last_lcp_complete;
381
Uint32 m_lcp_curr_bucket;
382
Uint32 m_lcp_outstanding; // remaining i/o waits
383
EndLcpReq m_end_lcp_req;
385
// clean-up variables
386
Ptr<Page_entry> m_cleanup_ptr;
389
typedef DataBuffer<15> File_map;
391
File_map::DataBufferPool m_data_buffer_pool;
393
// page entries and requests
394
Page_request_pool m_page_request_pool;
395
ArrayPool<Page_entry> m_page_entry_pool;
396
Page_hashlist m_page_hashlist;
397
Page_stack m_page_stack;
398
Page_queue m_page_queue;
399
Page_sublist* m_page_sublist[Page_entry::SUBLIST_COUNT];
404
Uint32 m_max_pages; // max number of cache pages
405
Uint32 m_lirs_stack_mult; // in m_max_pages (around 3-10)
406
Uint32 m_max_hot_pages; // max hot cache pages (up to 99%)
407
Uint32 m_max_loop_count; // limit purely local loops
408
Uint32 m_max_io_waits;
409
Uint32 m_stats_loop_delay;
410
Uint32 m_cleanup_loop_delay;
411
Uint32 m_lcp_loop_delay;
414
// runtime sizes and statistics
417
Uint32 m_num_pages; // current number of cache pages
419
Uint32 m_page_faults;
420
Uint32 m_current_io_waits;
424
void execSTTOR(Signal* signal);
425
void sendSTTORRY(Signal*);
426
void execREAD_CONFIG_REQ(Signal* signal);
427
void execCONTINUEB(Signal* signal);
429
void execLCP_FRAG_ORD(Signal*);
430
void execEND_LCP_REQ(Signal*);
432
void execFSREADCONF(Signal*);
433
void execFSREADREF(Signal*);
434
void execFSWRITECONF(Signal*);
435
void execFSWRITEREF(Signal*);
437
void execDUMP_STATE_ORD(Signal* signal);
440
static Uint32 get_sublist_no(Page_state state);
441
void set_page_state(Ptr<Page_entry> ptr, Page_state new_state);
443
bool seize_cache_page(Ptr<GlobalPage>& gptr);
444
void release_cache_page(Uint32 i);
446
bool find_page_entry(Ptr<Page_entry>&, Uint32 file_no, Uint32 page_no);
447
Uint32 seize_page_entry(Ptr<Page_entry>&, Uint32 file_no, Uint32 page_no);
448
bool get_page_entry(Ptr<Page_entry>&, Uint32 file_no, Uint32 page_no);
449
void release_page_entry(Ptr<Page_entry>&);
451
void lirs_stack_prune();
452
void lirs_stack_pop();
453
void lirs_reference(Ptr<Page_entry> ptr);
455
void do_stats_loop(Signal*);
456
void do_busy_loop(Signal*, bool direct = false);
457
void do_cleanup_loop(Signal*);
458
void do_lcp_loop(Signal*, bool direct = false);
460
bool process_bind(Signal*);
461
bool process_bind(Signal*, Ptr<Page_entry> ptr);
462
bool process_map(Signal*);
463
bool process_map(Signal*, Ptr<Page_entry> ptr);
464
bool process_callback(Signal*);
465
bool process_callback(Signal*, Ptr<Page_entry> ptr);
467
bool process_cleanup(Signal*);
468
void move_cleanup_ptr(Ptr<Page_entry> ptr);
470
bool process_lcp(Signal*);
471
void process_lcp_locked(Signal* signal, Ptr<Page_entry> ptr);
472
void process_lcp_locked_fswriteconf(Signal* signal, Ptr<Page_entry> ptr);
474
void pagein(Signal*, Ptr<Page_entry>);
475
void fsreadreq(Signal*, Ptr<Page_entry>);
476
void fsreadconf(Signal*, Ptr<Page_entry>);
477
void pageout(Signal*, Ptr<Page_entry>);
478
void logsync_callback(Signal*, Uint32 ptrI, Uint32 res);
479
void fswritereq(Signal*, Ptr<Page_entry>);
480
void fswriteconf(Signal*, Ptr<Page_entry>);
482
int get_page(Signal*, Ptr<Page_entry>, Page_request page_req);
483
void update_lsn(Ptr<Page_entry>, Uint32 block, Uint64 lsn);
484
Uint32 create_data_file();
485
Uint32 alloc_data_file(Uint32 file_no);
486
void map_file_no(Uint32 file_no, Uint32 fd);
487
void free_data_file(Uint32 file_no, Uint32 fd = RNIL);
488
int drop_page(Ptr<Page_entry>);
493
void verify_page_entry(Ptr<Page_entry> ptr);
494
void verify_page_lists();
496
bool dump_page_lists(Uint32 ptrI = RNIL);
497
void open_debug_file(Uint32 flag);
499
static const char* get_sublist_name(Uint32 list_no);
500
friend class NdbOut& operator<<(NdbOut&, Ptr<Page_request>);
501
friend class NdbOut& operator<<(NdbOut&, Ptr<Page_entry>);
504
class NdbOut& operator<<(NdbOut&, Ptr<Pgman::Page_request>);
505
class NdbOut& operator<<(NdbOut&, Ptr<Pgman::Page_entry>);
507
class Page_cache_client
513
Page_cache_client(SimulatedBlock* block, Pgman*);
517
SimulatedBlock::Callback m_callback;
520
Uint64 m_delay_until_time;
524
Ptr<GlobalPage> m_ptr; // TODO remove
527
LOCK_PAGE = Pgman::Page_request::LOCK_PAGE
528
,EMPTY_PAGE = Pgman::Page_request::EMPTY_PAGE
529
,ALLOC_REQ = Pgman::Page_request::ALLOC_REQ
530
,COMMIT_REQ = Pgman::Page_request::COMMIT_REQ
531
,DIRTY_REQ = Pgman::Page_request::DIRTY_REQ
532
,UNLOCK_PAGE = Pgman::Page_request::UNLOCK_PAGE
533
,CORR_REQ = Pgman::Page_request::CORR_REQ
535
,DELAY_REQ = Pgman::Page_request::DELAY_REQ
541
* @note This request may return true even if previous request
542
* for the same page returned false, and its callback has not been called
543
* @return -1, on error
544
* 0, request is queued
547
int get_page(Signal*, Request&, Uint32 flags);
549
void update_lsn(Local_key, Uint64 lsn);
554
* @return -1 on error
555
* 0 if request is queued
558
int drop_page(Local_key, Uint32 page_id);
563
Uint32 create_data_file();
566
* Alloc datafile record
568
Uint32 alloc_data_file(Uint32 file_no);
571
* Map file_no to m_fd
573
void map_file_no(Uint32 m_file_no, Uint32 m_fd);
578
void free_data_file(Uint32 file_no, Uint32 fd = RNIL);
582
Page_cache_client::get_page(Signal* signal, Request& req, Uint32 flags)
584
Ptr<Pgman::Page_entry> entry_ptr;
585
Uint32 file_no = req.m_page.m_file_no;
586
Uint32 page_no = req.m_page.m_page_no;
590
<< "PGCLI: get_page " << file_no << "," << page_no
591
<< " flags=" << hex << flags << endl;
595
bool ok = m_pgman->get_page_entry(entry_ptr, file_no, page_no);
601
Pgman::Page_request page_req;
602
page_req.m_block = m_block;
603
page_req.m_flags = flags;
604
page_req.m_callback = req.m_callback;
606
page_req.m_delay_until_time = req.m_delay_until_time;
609
int i = m_pgman->get_page(signal, entry_ptr, page_req);
613
m_pgman->m_global_page_pool.getPtr(m_ptr, (Uint32)i);
619
Page_cache_client::update_lsn(Local_key key, Uint64 lsn)
621
Ptr<Pgman::Page_entry> entry_ptr;
622
Uint32 file_no = key.m_file_no;
623
Uint32 page_no = key.m_page_no;
627
<< "PGCLI: update_lsn " << file_no << "," << page_no
628
<< " lsn=" << lsn << endl;
631
bool found = m_pgman->find_page_entry(entry_ptr, file_no, page_no);
634
m_pgman->update_lsn(entry_ptr, m_block, lsn);
639
Page_cache_client::drop_page(Local_key key, Uint32 page_id)
641
Ptr<Pgman::Page_entry> entry_ptr;
642
Uint32 file_no = key.m_file_no;
643
Uint32 page_no = key.m_page_no;
647
<< "PGCLI: drop_page " << file_no << "," << page_no << endl;
650
bool found = m_pgman->find_page_entry(entry_ptr, file_no, page_no);
652
assert(entry_ptr.p->m_real_page_i == page_id);
654
return m_pgman->drop_page(entry_ptr);
658
Page_cache_client::create_data_file()
660
return m_pgman->create_data_file();
664
Page_cache_client::alloc_data_file(Uint32 file_no)
666
return m_pgman->alloc_data_file(file_no);
670
Page_cache_client::map_file_no(Uint32 file_no, Uint32 fd)
672
m_pgman->map_file_no(file_no, fd);
676
Page_cache_client::free_data_file(Uint32 file_no, Uint32 fd)
678
m_pgman->free_data_file(file_no, fd);