4
** The author disclaims copyright to this source code. In place of
5
** a legal notice, here is a blessing:
7
** May you do good and not evil.
8
** May you find forgiveness for yourself and forgive others.
9
** May you share freely, never taking more than you give.
11
*************************************************************************
12
** This is the implementation of the page cache subsystem or "pager".
14
** The pager is used to access a database disk file. It implements
15
** atomic commit and rollback through the use of a journal file that
16
** is separate from the database file. The pager also implements file
17
** locking to prevent two processes from writing the same database
18
** file simultaneously, or one process from reading the database while
19
** another is writing.
21
** @(#) $Id: pager.c,v 1.201 2005/03/28 18:04:28 drh Exp $
23
#include "sqliteInt.h"
30
** Macros for troubleshooting. Normally turned off
33
#define TRACE1(X) sqlite3DebugPrintf(X)
34
#define TRACE2(X,Y) sqlite3DebugPrintf(X,Y)
35
#define TRACE3(X,Y,Z) sqlite3DebugPrintf(X,Y,Z)
36
#define TRACE4(X,Y,Z,W) sqlite3DebugPrintf(X,Y,Z,W)
37
#define TRACE5(X,Y,Z,W,V) sqlite3DebugPrintf(X,Y,Z,W,V)
42
#define TRACE4(X,Y,Z,W)
43
#define TRACE5(X,Y,Z,W,V)
47
** The following two macros are used within the TRACEX() macros above
48
** to print out file-descriptors. They are required so that tracing
49
** can be turned on when using both the regular os_unix.c and os_test.c
52
** PAGERID() takes a pointer to a Pager struct as its argument. The
53
** associated file-descriptor is returned. FILEHANDLEID() takes an OsFile
54
** struct as its argument.
57
#define PAGERID(p) (p->fd->fd.h)
58
#define FILEHANDLEID(fd) (fd->fd.h)
60
#define PAGERID(p) (p->fd.h)
61
#define FILEHANDLEID(fd) (fd.h)
65
** The page cache as a whole is always in one of the following
68
** PAGER_UNLOCK The page cache is not currently reading or
69
** writing the database file. There is no
70
** data held in memory. This is the initial
73
** PAGER_SHARED The page cache is reading the database.
74
** Writing is not permitted. There can be
75
** multiple readers accessing the same database
76
** file at the same time.
78
** PAGER_RESERVED This process has reserved the database for writing
79
** but has not yet made any changes. Only one process
80
** at a time can reserve the database. The original
81
** database file has not been modified so other
82
** processes may still be reading the on-disk
85
** PAGER_EXCLUSIVE The page cache is writing the database.
86
** Access is exclusive. No other processes or
87
** threads can be reading or writing while one
88
** process is writing.
90
** PAGER_SYNCED The pager moves to this state from PAGER_EXCLUSIVE
91
** after all dirty pages have been written to the
92
** database file and the file has been synced to
93
** disk. All that remains to do is to remove the
94
** journal file and the transaction will be
97
** The page cache comes up in PAGER_UNLOCK. The first time a
98
** sqlite3pager_get() occurs, the state transitions to PAGER_SHARED.
99
** After all pages have been released using sqlite3pager_unref(),
100
** the state transitions back to PAGER_UNLOCK. The first time
101
** that sqlite3pager_write() is called, the state transitions to
102
** PAGER_RESERVED. (Note that sqlite3pager_write() can only be
103
** called on an outstanding page which means that the pager must
104
** be in PAGER_SHARED before it transitions to PAGER_RESERVED.)
105
** The transition to PAGER_EXCLUSIVE occurs before any changes
106
** are made to the database file. After an sqlite3pager_rollback()
107
** or sqlite3pager_commit(), the state goes back to PAGER_SHARED.
109
#define PAGER_UNLOCK 0
110
#define PAGER_SHARED 1 /* same as SHARED_LOCK */
111
#define PAGER_RESERVED 2 /* same as RESERVED_LOCK */
112
#define PAGER_EXCLUSIVE 4 /* same as EXCLUSIVE_LOCK */
113
#define PAGER_SYNCED 5
116
** If the SQLITE_BUSY_RESERVED_LOCK macro is set to true at compile-time,
117
** then failed attempts to get a reserved lock will invoke the busy callback.
118
** This is off by default. To see why, consider the following scenario:
120
** Suppose thread A already has a shared lock and wants a reserved lock.
121
** Thread B already has a reserved lock and wants an exclusive lock. If
122
** both threads are using their busy callbacks, it might be a long time
123
** before one of the threads gives up and allows the other to proceed.
124
** But if the thread trying to get the reserved lock gives up quickly
125
** (if it never invokes its busy callback) then the contention will be
128
#ifndef SQLITE_BUSY_RESERVED_LOCK
129
# define SQLITE_BUSY_RESERVED_LOCK 0
133
** This macro rounds values up so that if the value is an address it
134
** is guaranteed to be an address that is aligned to an 8-byte boundary.
136
/* Round the integer X up to the nearest multiple of 8. */
#define FORCE_ALIGNMENT(X)   ((7+(X)) & ~7)
139
** Each in-memory image of a page begins with the following header.
140
** This header is only visible to this pager module. The client
141
** code that calls pager sees only the data that follows the header.
143
** Client code should call sqlite3pager_write() on a page prior to making
144
** any modifications to that page. The first time sqlite3pager_write()
145
** is called, the original page contents are written into the rollback
146
** journal and PgHdr.inJournal and PgHdr.needSync are set. Later, once
147
** the journal page has made it onto the disk surface, PgHdr.needSync
148
** is cleared. The modified page cannot be written back into the original
149
** database file until the journal pages have been synced to disk and the
150
** PgHdr.needSync has been cleared.
152
** The PgHdr.dirty flag is set when sqlite3pager_write() is called and
153
** is cleared again when the page content is written back to the original
156
typedef struct PgHdr PgHdr;
158
Pager *pPager; /* The pager to which this page belongs */
159
Pgno pgno; /* The page number for this page */
160
PgHdr *pNextHash, *pPrevHash; /* Hash collision chain for PgHdr.pgno */
161
PgHdr *pNextFree, *pPrevFree; /* Freelist of pages where nRef==0 */
162
PgHdr *pNextAll; /* A list of all pages */
163
PgHdr *pNextStmt, *pPrevStmt; /* List of pages in the statement journal */
164
u8 inJournal; /* TRUE if has been written to journal */
165
u8 inStmt; /* TRUE if in the statement subjournal */
166
u8 dirty; /* TRUE if we need to write back changes */
167
u8 needSync; /* Sync journal before writing this page */
168
u8 alwaysRollback; /* Disable dont_rollback() for this page */
169
short int nRef; /* Number of users of this page */
170
PgHdr *pDirty; /* Dirty pages sorted by PgHdr.pgno */
171
#ifdef SQLITE_CHECK_PAGES
174
/* pPager->psAligned bytes of page data follow this header */
175
/* Pager.nExtra bytes of local data follow the page data */
179
** For an in-memory only database, some extra information is recorded about
180
** each page so that changes can be rolled back. (Journal files are not
181
** used for in-memory databases.) The following information is added to
182
** the end of every EXTRA block for in-memory databases.
184
** This information could have been added directly to the PgHdr structure.
185
** But then it would take up an extra 8 bytes of storage on every PgHdr
186
** even for disk-based databases. Splitting it out saves 8 bytes. This
187
** is only a savings of 0.8% but those percentages add up.
189
typedef struct PgHistory PgHistory;
191
u8 *pOrig; /* Original page text. Restore to this on a full rollback */
192
u8 *pStmt; /* Text as it was at the beginning of the current statement */
196
** A macro used for invoking the codec if there is one
198
#ifdef SQLITE_HAS_CODEC
199
/* Invoke the codec of pager P, if one is installed, as
** P->xCodec(P->pCodecArg, D, N, X).  Every macro parameter is
** parenthesized so that arbitrary expressions (for example &pager)
** can be passed safely. */
# define CODEC(P,D,N,X)  if( (P)->xCodec ){ (P)->xCodec((P)->pCodecArg,(D),(N),(X)); }
201
# define CODEC(P,D,N,X)
205
** Convert a pointer to a PgHdr into a pointer to its data
208
/*
** Pointer conversions between a page header and the memory that
** immediately follows it:
**
**   PGHDR_TO_DATA(P)      first byte after header P
**   DATA_TO_PGHDR(D)      header that immediately precedes D
**   PGHDR_TO_EXTRA(G,P)   byte P->psAligned past the end of header G
**   PGHDR_TO_HIST(P,PGR)  PgHistory located PGR->psAligned+PGR->nExtra
**                         bytes past the end of header P
*/
#define PGHDR_TO_DATA(P)    ((void*)((P)+1))
#define DATA_TO_PGHDR(D)    (((PgHdr*)(D))-1)
#define PGHDR_TO_EXTRA(G,P) ((void*)(((char*)((G)+1))+(P)->psAligned))
#define PGHDR_TO_HIST(P,PGR)  \
            ((PgHistory*)(((char*)((P)+1))+((PGR)->psAligned+(PGR)->nExtra)))
215
** How big to make the hash table used for locating in-memory pages
216
** by page number. This macro looks a little silly, but is evaluated
217
** at compile-time, not run-time (at least for gcc this is true).
220
(MAX_PAGES>1024)?2048: \
221
(MAX_PAGES>512)?1024: \
222
(MAX_PAGES>256)?512: \
223
(MAX_PAGES>128)?256: \
224
(MAX_PAGES>64)?128:64 \
228
** Hash a page number
230
/* Reduce page number PN to a hash-table slot by masking with
** N_PG_HASH-1 (which relies on N_PG_HASH being a power of two). */
#define pager_hash(PN)  ((N_PG_HASH-1) & (PN))
233
** An open page cache is an instance of the following structure.
236
char *zFilename; /* Name of the database file */
237
char *zJournal; /* Name of the journal file */
238
char *zDirectory; /* Directory hold database and journal files */
239
OsFile fd, jfd; /* File descriptors for database and journal */
240
OsFile stfd; /* File descriptor for the statement subjournal*/
241
int dbSize; /* Number of pages in the file */
242
int origDbSize; /* dbSize before the current change */
243
int stmtSize; /* Size of database (in pages) at stmt_begin() */
244
i64 stmtJSize; /* Size of journal at stmt_begin() */
245
int nRec; /* Number of pages written to the journal */
246
u32 cksumInit; /* Quasi-random value added to every checksum */
247
int stmtNRec; /* Number of records in stmt subjournal */
248
int nExtra; /* Add this many bytes to each in-memory page */
249
void (*xDestructor)(void*,int); /* Call this routine when freeing pages */
250
void (*xReiniter)(void*,int); /* Call this routine when reloading pages */
251
int pageSize; /* Number of bytes in a page */
252
int psAligned; /* pageSize rounded up to a multiple of 8 */
253
int nPage; /* Total number of in-memory pages */
254
int nMaxPage; /* High water mark of nPage */
255
int nRef; /* Number of in-memory pages with PgHdr.nRef>0 */
256
int mxPage; /* Maximum number of pages to hold in cache */
257
int nHit, nMiss, nOvfl; /* Cache hits, missing, and LRU overflows */
258
int nRead,nWrite; /* Database pages read/written */
259
void (*xCodec)(void*,void*,Pgno,int); /* Routine for en/decoding data */
260
void *pCodecArg; /* First argument to xCodec() */
261
u8 journalOpen; /* True if journal file descriptors is valid */
262
u8 journalStarted; /* True if header of journal is synced */
263
u8 useJournal; /* Use a rollback journal on this file */
264
u8 noReadlock; /* Do not bother to obtain readlocks */
265
u8 stmtOpen; /* True if the statement subjournal is open */
266
u8 stmtInUse; /* True we are in a statement subtransaction */
267
u8 stmtAutoopen; /* Open stmt journal when main journal is opened*/
268
u8 noSync; /* Do not sync the journal if true */
269
u8 fullSync; /* Do extra syncs of the journal for robustness */
270
u8 state; /* PAGER_UNLOCK, _SHARED, _RESERVED, etc. */
271
u8 errMask; /* One of several kinds of errors */
272
u8 tempFile; /* zFilename is a temporary file */
273
u8 readOnly; /* True for a read-only database */
274
u8 needSync; /* True if an fsync() is needed on the journal */
275
u8 dirtyCache; /* True if cached pages have changed */
276
u8 alwaysRollback; /* Disable dont_rollback() for all pages */
277
u8 memDb; /* True to inhibit all file I/O */
278
u8 *aInJournal; /* One bit for each page in the database file */
279
u8 *aInStmt; /* One bit for each page in the database */
280
u8 setMaster; /* True if a m-j name has been written to jrnl */
281
BusyHandler *pBusyHandler; /* Pointer to sqlite.busyHandler */
282
PgHdr *pFirst, *pLast; /* List of free pages */
283
PgHdr *pFirstSynced; /* First free page with PgHdr.needSync==0 */
284
PgHdr *pAll; /* List of all pages */
285
PgHdr *pStmt; /* List of pages in the statement subjournal */
286
i64 journalOff; /* Current byte offset in the journal file */
287
i64 journalHdr; /* Byte offset to previous journal header */
288
i64 stmtHdrOff; /* First journal header written this statement */
289
i64 stmtCksum; /* cksumInit when statement was started */
290
int sectorSize; /* Assumed sector size during rollback */
291
PgHdr *aHash[N_PG_HASH]; /* Hash table to map page number to PgHdr */
295
** These are bits that can be set in Pager.errMask.
297
#define PAGER_ERR_FULL 0x01 /* a write() failed */
298
#define PAGER_ERR_MEM 0x02 /* malloc() failed */
299
#define PAGER_ERR_LOCK 0x04 /* error in the locking protocol */
300
#define PAGER_ERR_CORRUPT 0x08 /* database or journal corruption */
301
#define PAGER_ERR_DISK 0x10 /* general disk I/O error - bad hard drive? */
304
** Journal files begin with the following magic string. The data
305
** was obtained from /dev/random. It is used only as a sanity check.
307
** Since version 2.8.0, the journal format contains additional sanity
308
** checking information. If the power fails while the journal is being
309
** written, semi-random garbage data might appear in the journal
310
** file after power is restored. If an attempt is then made
311
** to roll the journal back, the database could be corrupted. The additional
312
** sanity checking data is an attempt to discover the garbage in the
313
** journal and ignore it.
315
** The sanity checking information for the new journal format consists
316
** of a 32-bit checksum on each page of data. The checksum covers both
317
** the page number and the pPager->pageSize bytes of data for the page.
318
** This cksum is initialized to a 32-bit random value that appears in the
319
** journal file right after the header. The random initializer is important,
320
** because garbage data that appears at the end of a journal is likely
321
** data that was once in other files that have now been deleted. If the
322
** garbage data came from an obsolete journal file, the checksums might
323
** be correct. But by initializing the checksum to random value which
324
** is different for every journal, we minimize that risk.
326
static const unsigned char aJournalMagic[] = {
327
0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd7,
331
** The size of the header and of each page in the journal is determined
332
** by the following macros.
334
/* Size in bytes of one page record in the journal: the page data
** (pPager->pageSize bytes) plus 8 bytes of per-record overhead.  The
** argument is parenthesized so any pointer expression may be passed. */
#define JOURNAL_PG_SZ(pPager)  (((pPager)->pageSize) + 8)
337
** The journal header size for this pager. In the future, this could be
338
** set to some value read from the disk controller. The important
339
** characteristic is that it is the same size as a disk sector.
341
/* Size in bytes of a journal header for this pager; equal to the
** pager's assumed sector size.  The argument is parenthesized so any
** pointer expression may be passed. */
#define JOURNAL_HDR_SZ(pPager) ((pPager)->sectorSize)
344
** The macro MEMDB is true if we are dealing with an in-memory database.
345
** We do this as a macro so that if the SQLITE_OMIT_MEMORYDB macro is set,
346
** the value of MEMDB will be a constant and the compiler will optimize
347
** out code that would never execute.
349
#ifdef SQLITE_OMIT_MEMORYDB
352
# define MEMDB pPager->memDb
356
** The default size of a disk sector
358
#define PAGER_SECTOR_SIZE 512
361
** Page number PAGER_MJ_PGNO is never used in an SQLite database (it is
362
** reserved for working around a windows/posix incompatibility). It is
363
** used in the journal to signify that the remainder of the journal file
364
** is devoted to storing a master journal name - there are no more pages to
365
** roll back. See comments for function writeMasterJournal() for details.
367
/* #define PAGER_MJ_PGNO(x) (PENDING_BYTE/((x)->pageSize)) */
368
#define PAGER_MJ_PGNO(x) ((PENDING_BYTE/((x)->pageSize))+1)
371
** The maximum legal page number is (2^31 - 1).
373
#define PAGER_MAX_PGNO 2147483647
376
** Enable reference count tracking (for debugging) here:
379
int pager3_refinfo_enable = 0;
380
static void pager_refinfo(PgHdr *p){
382
if( !pager3_refinfo_enable ) return;
384
"REFCNT: %4d addr=%p nRef=%d\n",
385
p->pgno, PGHDR_TO_DATA(p), p->nRef
387
cnt++; /* Something to set a breakpoint on */
389
# define REFINFO(X) pager_refinfo(X)
395
** Read a 32-bit integer from the given file descriptor. Store the integer
396
** that is read in *pRes. Return SQLITE_OK if everything worked, or an
397
** error code if something goes wrong.
399
** All values are stored on disk as big-endian.
401
/*
** Read a big-endian 32-bit integer from file descriptor fd.
**
** On success the decoded value is stored in *pRes and SQLITE_OK is
** returned.  If the read fails, the error code from sqlite3OsRead() is
** returned and *pRes is left unchanged.
*/
static int read32bits(OsFile *fd, u32 *pRes){
  unsigned char ac[4];    /* The four raw bytes as stored on disk */
  int rc;

  rc = sqlite3OsRead(fd, ac, sizeof(ac));
  if( rc==SQLITE_OK ){
    /* Widen each byte to u32 before shifting.  Shifting a promoted
    ** (signed) int left by 24 is undefined when the byte is >= 0x80. */
    *pRes = ((u32)ac[0]<<24) | ((u32)ac[1]<<16) | ((u32)ac[2]<<8) | (u32)ac[3];
  }
  return rc;
}
415
** Write a 32-bit integer into the given file descriptor. Return SQLITE_OK
416
** on success or an error code if something goes wrong.
418
/*
** Serialize val as four big-endian bytes and write them to fd.
** Returns whatever sqlite3OsWrite() returns.
*/
static int write32bits(OsFile *fd, u32 val){
  unsigned char aBuf[4];
  int k;
  /* Fill from the least significant byte backwards */
  for(k=3; k>=0; k--){
    aBuf[k] = val & 0xff;
    val >>= 8;
  }
  return sqlite3OsWrite(fd, aBuf, 4);
}
428
** Write the 32-bit integer 'val' into the page identified by page header
429
** 'p' at offset 'offset'.
431
/*
** Store val as four big-endian bytes in the data area of page p,
** starting at byte position 'offset'.
*/
static void store32bits(u32 val, PgHdr *p, int offset){
  unsigned char *aOut = (unsigned char*)PGHDR_TO_DATA(p) + offset;
  int k;
  /* Fill from the least significant byte backwards */
  for(k=3; k>=0; k--){
    aOut[k] = val & 0xff;
    val >>= 8;
  }
}
441
** Read a 32-bit integer at offset 'offset' from the page identified by
444
/*
** Decode and return the big-endian 32-bit integer stored at byte
** position 'offset' within the data area of page p.
*/
static u32 retrieve32bits(PgHdr *p, int offset){
  const unsigned char *ac = &((const unsigned char*)PGHDR_TO_DATA(p))[offset];
  /* Widen each byte to u32 before shifting.  Shifting a promoted
  ** (signed) int left by 24 is undefined when the byte is >= 0x80. */
  return ((u32)ac[0]<<24) | ((u32)ac[1]<<16) | ((u32)ac[2]<<8) | (u32)ac[3];
}
452
** Convert the bits in the pPager->errMask into an appropriate
455
/*
** Translate the error bits in pPager->errMask into a single result
** code.  When several bits are set the precedence is, from highest to
** lowest: CORRUPT, MEM, FULL, DISK, LOCK.  SQLITE_OK is returned when
** none of these bits is set.
*/
static int pager_errcode(Pager *pPager){
  const int mask = pPager->errMask;
  if( mask & PAGER_ERR_CORRUPT ) return SQLITE_CORRUPT;
  if( mask & PAGER_ERR_MEM )     return SQLITE_NOMEM;
  if( mask & PAGER_ERR_FULL )    return SQLITE_FULL;
  if( mask & PAGER_ERR_DISK )    return SQLITE_IOERR;
  if( mask & PAGER_ERR_LOCK )    return SQLITE_PROTOCOL;
  return SQLITE_OK;
}
465
#ifdef SQLITE_CHECK_PAGES
467
** Return a 32-bit hash of the page data for pPage.
469
/*
** Compute a 32-bit hash over the pageSize bytes of data belonging to
** pPage.  Each step adds the byte index to the running hash and then
** XORs in the byte value.
*/
static u32 pager_pagehash(PgHdr *pPage){
  const unsigned char *zPage = (const unsigned char*)PGHDR_TO_DATA(pPage);
  u32 h = 0;
  int k = 0;
  while( k<pPage->pPager->pageSize ){
    h = (h + k) ^ zPage[k];
    k++;
  }
  return h;
}
480
** The CHECK_PAGE macro takes a PgHdr* as an argument. If SQLITE_CHECK_PAGES
481
** is defined, and NDEBUG is not defined, an assert() statement checks
482
** that the page is either dirty or still matches the calculated page-hash.
484
#define CHECK_PAGE(x) checkPage(x)
485
/*
** Assert that a clean page still matches its recorded hash.  The check
** is skipped when no hash has been recorded, when the pager has an
** error pending, for in-memory databases, and for dirty pages.
*/
static void checkPage(PgHdr *pPg){
  Pager *pPager = pPg->pPager;
  if( !pPg->pageHash || pPager->errMask || MEMDB || pPg->dirty ){
    return;
  }
  assert( pPg->pageHash==pager_pagehash(pPg) );
}
492
#define CHECK_PAGE(x)
496
** When this is called the journal file for pager pPager must be open.
497
** The master journal file name is read from the end of the file and
498
** written into memory obtained from sqliteMalloc(). *pzMaster is
499
** set to point at the memory and SQLITE_OK returned. The caller must
500
** sqliteFree() *pzMaster.
502
** If no master journal file name is present *pzMaster is set to 0 and
503
** SQLITE_OK returned.
505
/*
** Read the master journal name stored at the end of the open journal
** file pJrnl.
**
** The last 16 bytes of the journal are expected to hold a 4-byte name
** length, a 4-byte name checksum and the 8-byte journal magic, with the
** name itself stored immediately before them.
**
** On success SQLITE_OK is returned and *pzMaster points at a
** nul-terminated copy of the name obtained from sqliteMalloc(); the
** caller must sqliteFree() it.  If the journal holds no (valid) master
** journal name, *pzMaster is set to 0 and SQLITE_OK is still returned.
** An I/O or allocation failure returns the corresponding error code
** with *pzMaster set to 0.
*/
static int readMasterJournal(OsFile *pJrnl, char **pzMaster){
  int rc;
  u32 len;                 /* Length of the master journal name */
  i64 szJ;                 /* Total size of the journal file */
  u32 cksum;               /* Checksum stored with the name */
  u32 i;
  unsigned char aMagic[8]; /* A buffer to hold the magic header */

  *pzMaster = 0;

  rc = sqlite3OsFileSize(pJrnl, &szJ);
  if( rc!=SQLITE_OK || szJ<16 ) return rc;

  rc = sqlite3OsSeek(pJrnl, szJ-16);
  if( rc!=SQLITE_OK ) return rc;

  rc = read32bits(pJrnl, &len);
  if( rc!=SQLITE_OK ) return rc;

  rc = read32bits(pJrnl, &cksum);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3OsRead(pJrnl, aMagic, 8);
  if( rc!=SQLITE_OK || memcmp(aMagic, aJournalMagic, 8) ) return rc;

  /* The length field comes straight from disk and may be garbage.  A
  ** name that cannot fit in the file in front of the 16-byte trailer (or
  ** whose size would overflow the allocation) is treated exactly like a
  ** failed checksum: report "no master journal" so the caller rolls back.
  */
  if( (i64)len>szJ-16 || len>=0x7fffffff ) return SQLITE_OK;

  rc = sqlite3OsSeek(pJrnl, szJ-16-len);
  if( rc!=SQLITE_OK ) return rc;

  *pzMaster = (char *)sqliteMalloc(len+1);
  if( !*pzMaster ){
    return SQLITE_NOMEM;
  }
  rc = sqlite3OsRead(pJrnl, *pzMaster, len);
  if( rc!=SQLITE_OK ){
    sqliteFree(*pzMaster);
    *pzMaster = 0;
    return rc;
  }

  /* See if the checksum matches the master journal name */
  for(i=0; i<len; i++){
    cksum -= (*pzMaster)[i];
  }
  if( cksum ){
    /* If the checksum doesn't add up, then one or more of the disk sectors
    ** containing the master journal filename is corrupted. This means
    ** definitely roll back, so just return SQLITE_OK and report a (nul)
    ** master-journal filename.
    */
    sqliteFree(*pzMaster);
    *pzMaster = 0;
  }else{
    (*pzMaster)[len] = '\0';
  }

  return SQLITE_OK;
}
564
** Seek the journal file descriptor to the next sector boundary where a
565
** journal header may be read or written. Pager.journalOff is updated with
566
** the new seek offset.
568
** i.e for a sector size of 512:
570
** Input Offset Output Offset
571
** ---------------------------------------
578
/*
** Advance Pager.journalOff to the next multiple of JOURNAL_HDR_SZ (an
** offset that is already a multiple, including 0, is left unchanged)
** and seek the journal file descriptor there.  Returns the result of
** sqlite3OsSeek().
*/
static int seekJournalHdr(Pager *pPager){
  const i64 cur = pPager->journalOff;
  i64 next = 0;
  if( cur!=0 ){
    next = (1 + (cur-1)/JOURNAL_HDR_SZ(pPager)) * JOURNAL_HDR_SZ(pPager);
  }
  assert( next%JOURNAL_HDR_SZ(pPager)==0 );
  assert( next>=cur );
  assert( (next-cur)<JOURNAL_HDR_SZ(pPager) );
  pPager->journalOff = next;
  return sqlite3OsSeek(&pPager->jfd, next);
}
592
** The journal file must be open when this routine is called. A journal
593
** header (JOURNAL_HDR_SZ bytes) is written into the journal file at the
596
** The format for the journal header is as follows:
597
** - 8 bytes: Magic identifying journal format.
598
** - 4 bytes: Number of records in journal, or -1 if no-sync mode is on.
599
** - 4 bytes: Random number used for page hash.
600
** - 4 bytes: Initial database page count.
601
** - 4 bytes: Sector size used by the process that wrote this journal.
603
** Followed by (JOURNAL_HDR_SZ - 24) bytes of unused space.
605
/*
** Write a journal header at the next sector boundary of the (open)
** journal file.  The fields written are, in order: the 8-byte journal
** magic, the record count (0, or 0xFFFFFFFF in no-sync mode), a fresh
** random checksum seed, the current database size in pages, and the
** sector size assumed by this process.  The rest of the
** JOURNAL_HDR_SZ-byte header is left unused; a single zero byte is
** written at its last position so the file extends to the end of the
** header sector.
**
** Pager.journalHdr records where the header starts and Pager.journalOff
** is advanced past it.  Returns SQLITE_OK or the first error reported
** by a seek or write.
*/
static int writeJournalHdr(Pager *pPager){

  int rc = seekJournalHdr(pPager);
  if( rc ) return rc;

  pPager->journalHdr = pPager->journalOff;
  if( pPager->stmtHdrOff==0 ){
    pPager->stmtHdrOff = pPager->journalHdr;
  }
  pPager->journalOff += JOURNAL_HDR_SZ(pPager);

  /* FIX ME:
  **
  ** Possibly for a pager not in no-sync mode, the journal magic should not
  ** be written until nRec is filled in as part of next syncJournal().
  **
  ** Actually maybe the whole journal header should be delayed until that
  ** point. Think about this.
  */
  rc = sqlite3OsWrite(&pPager->jfd, aJournalMagic, sizeof(aJournalMagic));

  if( rc==SQLITE_OK ){
    /* The nRec Field. 0xFFFFFFFF for no-sync journals. */
    rc = write32bits(&pPager->jfd, pPager->noSync ? 0xffffffff : 0);
  }
  if( rc==SQLITE_OK ){
    /* The random check-hash initialiser */
    sqlite3Randomness(sizeof(pPager->cksumInit), &pPager->cksumInit);
    rc = write32bits(&pPager->jfd, pPager->cksumInit);
  }
  if( rc==SQLITE_OK ){
    /* The initial database size */
    rc = write32bits(&pPager->jfd, pPager->dbSize);
  }
  if( rc==SQLITE_OK ){
    /* The assumed sector size for this process */
    rc = write32bits(&pPager->jfd, pPager->sectorSize);
  }

  /* The journal header has been written successfully. Seek the journal
  ** file descriptor to the end of the journal header sector.  A failed
  ** seek is reported rather than ignored, so the padding byte is never
  ** written at the wrong position.
  */
  if( rc==SQLITE_OK ){
    rc = sqlite3OsSeek(&pPager->jfd, pPager->journalOff-1);
  }
  if( rc==SQLITE_OK ){
    rc = sqlite3OsWrite(&pPager->jfd, "\000", 1);
  }
  return rc;
}
655
** The journal file must be open when this is called. A journal header file
656
** (JOURNAL_HDR_SZ bytes) is read from the current location in the journal
657
** file. See comments above function writeJournalHdr() for a description of
658
** the journal header format.
660
** If the header is read successfully, *nRec is set to the number of
661
** page records following this header and *dbSize is set to the size of the
662
** database before the transaction began, in pages. Also, pPager->cksumInit
663
** is set to the value read from the journal header. SQLITE_OK is returned
666
** If the journal header file appears to be corrupted, SQLITE_DONE is
667
** returned and *nRec and *dbSize are not set. If JOURNAL_HDR_SZ bytes
668
** cannot be read from the journal file an error code is returned.
670
/* Read one journal header from the journal file of pPager.  The values
** read are stored through pNRec and pDbSize and in pPager->cksumInit
** and pPager->sectorSize; journalOff is then advanced by
** JOURNAL_HDR_SZ.  NOTE(review): the parameter list and the
** early-return statements are not visible here - confirm them against
** the full file. */
static int readJournalHdr(
677
unsigned char aMagic[8]; /* A buffer to hold the magic header */
679
/* Position the journal at the next sector boundary */
rc = seekJournalHdr(pPager);
682
/* Test whether a complete header would extend past journalSize */
if( pPager->journalOff+JOURNAL_HDR_SZ(pPager) > journalSize ){
686
/* Read the 8-byte magic and compare it with aJournalMagic */
rc = sqlite3OsRead(&pPager->jfd, aMagic, sizeof(aMagic));
689
if( memcmp(aMagic, aJournalMagic, sizeof(aMagic))!=0 ){
693
/* Record count, checksum seed and database size, in that order */
rc = read32bits(&pPager->jfd, pNRec);
696
rc = read32bits(&pPager->jfd, &pPager->cksumInit);
699
rc = read32bits(&pPager->jfd, pDbSize);
702
/* Update the assumed sector-size to match the value used by
703
** the process that created this journal. If this journal was
704
** created by a process other than this one, then this routine
705
** is being called from within pager_playback(). The local value
706
** of Pager.sectorSize is restored at the end of that routine.
708
rc = read32bits(&pPager->jfd, (u32 *)&pPager->sectorSize);
711
pPager->journalOff += JOURNAL_HDR_SZ(pPager);
712
rc = sqlite3OsSeek(&pPager->jfd, pPager->journalOff);
718
** Write the supplied master journal name into the journal file for pager
719
** pPager at the current location. The master journal name must be the last
720
** thing written to a journal file. If the pager is in full-sync mode, the
721
** journal file descriptor is advanced to the next sector boundary before
722
** anything is written. The format is:
724
** + 4 bytes: PAGER_MJ_PGNO.
725
** + N bytes: length of master journal name.
727
** + 4 bytes: Master journal name checksum.
728
** + 8 bytes: aJournalMagic[].
730
** The master journal page checksum is the sum of the bytes in the master
733
/*
** Append the master journal name zMaster to the journal of pPager.
**
** Nothing is done (and SQLITE_OK is returned) when zMaster is NULL or
** when a master journal name has already been written.  Otherwise the
** record written is: PAGER_MJ_PGNO, the name, its length, its checksum
** (the sum of the bytes of the name) and the journal magic.  In
** full-sync mode the record starts on the next sector boundary.
**
** Returns SQLITE_OK or the first error encountered.
*/
static int writeMasterJournal(Pager *pPager, const char *zMaster){
  int rc;
  int nName;          /* Number of bytes in zMaster */
  const char *z;      /* Used to walk over zMaster */
  u32 cksum = 0;      /* Sum of the bytes of zMaster */

  if( zMaster==0 || pPager->setMaster ){
    return SQLITE_OK;
  }
  pPager->setMaster = 1;

  nName = strlen(zMaster);
  for(z=zMaster; z<&zMaster[nName]; z++){
    cksum += *z;
  }

  /* In full-sync mode the previous page written to the journal may
  ** already have been synced, so start this record on a fresh sector.
  */
  if( pPager->fullSync ){
    rc = seekJournalHdr(pPager);
    if( rc!=SQLITE_OK ) return rc;
  }
  pPager->journalOff += (nName+20);

  rc = write32bits(&pPager->jfd, PAGER_MJ_PGNO(pPager));
  if( rc==SQLITE_OK ){
    rc = sqlite3OsWrite(&pPager->jfd, zMaster, nName);
  }
  if( rc==SQLITE_OK ){
    rc = write32bits(&pPager->jfd, nName);
  }
  if( rc==SQLITE_OK ){
    rc = write32bits(&pPager->jfd, cksum);
  }
  if( rc==SQLITE_OK ){
    /* needSync is raised once the final write has been attempted,
    ** whether or not that write succeeds. */
    rc = sqlite3OsWrite(&pPager->jfd, aJournalMagic, sizeof(aJournalMagic));
    pPager->needSync = 1;
  }
  return rc;
}
775
** Add or remove a page from the list of all pages that are in the
776
** statement journal.
778
** The Pager keeps a separate list of pages that are currently in
779
** the statement journal. This helps the sqlite3pager_stmt_commit()
780
** routine run MUCH faster for the common case where there are many
781
** pages in memory but only a few are in the statement journal.
783
/*
** Insert pPg at the head of the pager's list of pages that are in the
** statement journal and mark it as being on that list.  A page that is
** already on the list is left alone.
*/
static void page_add_to_stmt_list(PgHdr *pPg){
  Pager *pPager;
  PgHdr *pHead;
  if( pPg->inStmt ) return;
  pPager = pPg->pPager;
  pHead = pPager->pStmt;
  assert( pPg->pPrevStmt==0 && pPg->pNextStmt==0 );
  if( pHead ){
    pHead->pPrevStmt = pPg;
  }
  pPg->pPrevStmt = 0;
  pPg->pNextStmt = pHead;
  pPager->pStmt = pPg;
  pPg->inStmt = 1;
}
795
/*
** Unlink pPg from the pager's list of pages that are in the statement
** journal and clear its list pointers and inStmt flag.  A page that is
** not on the list is left alone.
*/
static void page_remove_from_stmt_list(PgHdr *pPg){
  PgHdr *pBefore, *pAfter;
  if( pPg->inStmt==0 ) return;
  pBefore = pPg->pPrevStmt;
  pAfter = pPg->pNextStmt;
  if( pBefore==0 ){
    /* pPg is the head of the list */
    assert( pPg->pPager->pStmt==pPg );
    pPg->pPager->pStmt = pAfter;
  }else{
    assert( pBefore->pNextStmt==pPg );
    pBefore->pNextStmt = pAfter;
  }
  if( pAfter ){
    assert( pAfter->pPrevStmt==pPg );
    pAfter->pPrevStmt = pBefore;
  }
  pPg->pNextStmt = 0;
  pPg->pPrevStmt = 0;
  pPg->inStmt = 0;
}
814
** Find a page in the hash table given its page number. Return
815
** a pointer to the page or NULL if not found.
817
/*
** Search the hash chain for page number pgno.  Returns the matching
** page header, or NULL if the page is not in the hash table.
*/
static PgHdr *pager_lookup(Pager *pPager, Pgno pgno){
  PgHdr *pHit;
  for(pHit=pPager->aHash[pager_hash(pgno)]; pHit; pHit=pHit->pNextHash){
    if( pHit->pgno==pgno ) break;
  }
  return pHit;
}
826
** Unlock the database and clear the in-memory cache. This routine
827
** sets the state of the pager back to what it was when it was first
828
** opened. Any outstanding pages are invalidated and subsequent attempts
829
** to access those pages will likely result in a coredump.
831
/* Discard the page cache of pPager and drop its file lock, leaving the
** pager in the PAGER_UNLOCK state.  NOTE(review): several statements of
** this routine are not visible here - confirm against the full file. */
static void pager_reset(Pager *pPager){
833
/* Do nothing at all while an error is pending on the pager */
if( pPager->errMask ) return;
834
/* Visit every page on the pAll list.  The successor is fetched before
** the rest of the loop body runs (presumably because that body
** releases pPg - confirm). */
for(pPg=pPager->pAll; pPg; pPg=pNext){
835
pNext = pPg->pNextAll;
839
pPager->pFirstSynced = 0;
842
/* Empty the page-number hash table */
memset(pPager->aHash, 0, sizeof(pPager->aHash));
844
/* A pager holding a RESERVED or stronger lock is rolled back first */
if( pPager->state>=PAGER_RESERVED ){
845
sqlite3pager_rollback(pPager);
847
/* Release the lock on the database file */
sqlite3OsUnlock(&pPager->fd, NO_LOCK);
848
pPager->state = PAGER_UNLOCK;
851
/* The journal must not be open once the reset is complete */
assert( pPager->journalOpen==0 );
855
** This function is used to reset the pager after a malloc() failure. This
856
** doesn't work with in-memory databases. If a malloc() fails when an
857
** in-memory database is in use it is not possible to recover.
859
** If a transaction or statement transaction is active, it is rolled back.
861
** It is an error to call this function if any pages are in use.
863
#ifndef SQLITE_OMIT_GLOBALRECOVER
864
/* Public entry point used to reset the pager after a malloc() failure.
** NOTE(review): most of the body is not visible here - confirm the
** return values against the full file. */
int sqlite3pager_reset(Pager *pPager){
866
/* Special case: pages are still referenced, or the database is
** in-memory */
if( pPager->nRef || MEMDB ){
869
/* Clear the out-of-memory bit from the pager's error mask */
pPager->errMask &= ~(PAGER_ERR_MEM);
878
** When this routine is called, the pager has the journal file open and
879
** a RESERVED or EXCLUSIVE lock on the database. This routine releases
880
** the database lock and acquires a SHARED lock in its place. The journal
881
** file is deleted and closed.
883
** TODO: Consider keeping the journal file open for temporary databases.
884
** This might give a performance improvement on windows where opening
885
** a file is an expensive operation.
887
/* End a write transaction: close the statement journal, close and
** delete the rollback journal, and downgrade the file lock to
** SHARED_LOCK.  NOTE(review): the branch structure (closing braces,
** else clauses, returns) is not visible here - confirm against the
** full file. */
static int pager_unwritelock(Pager *pPager){
891
/* Special case: the pager holds less than a RESERVED lock */
if( pPager->state<PAGER_RESERVED ){
894
/* Finish any statement transaction and close its subjournal */
sqlite3pager_stmt_commit(pPager);
895
if( pPager->stmtOpen ){
896
sqlite3OsClose(&pPager->stfd);
897
pPager->stmtOpen = 0;
899
/* Close and delete the rollback journal, then release the
** in-journal bitmap */
if( pPager->journalOpen ){
900
sqlite3OsClose(&pPager->jfd);
901
pPager->journalOpen = 0;
902
sqlite3OsDelete(pPager->zJournal);
903
sqliteFree( pPager->aInJournal );
904
pPager->aInJournal = 0;
905
/* Visit every page in the cache */
for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
909
#ifdef SQLITE_CHECK_PAGES
910
/* Record a fresh hash of the page content for later CHECK_PAGE() calls */
pPg->pageHash = pager_pagehash(pPg);
913
pPager->dirtyCache = 0;
916
assert( pPager->aInJournal==0 );
917
assert( pPager->dirtyCache==0 || pPager->useJournal==0 );
919
/* Downgrade the file lock to SHARED and reset per-transaction state */
rc = sqlite3OsUnlock(&pPager->fd, SHARED_LOCK);
920
pPager->state = PAGER_SHARED;
921
pPager->origDbSize = 0;
922
pPager->setMaster = 0;
927
** Compute and return a checksum for the page of data.
929
** This is not a real checksum. It is really just the sum of the
930
** random initial value and the page number. We experimented with
931
** a checksum of the entire data, but that was found to be too slow.
933
** Note that the page number is stored at the beginning of data and
934
** the checksum is stored at the end. This is important. If journal
935
** corruption occurs due to a power failure, the most likely scenario
936
** is that one end or the other of the record will be changed. It is
937
** much less likely that the two ends of the journal record will be
938
** correct and the middle be corrupt. Thus, this "checksum" scheme,
939
** though fast and simple, catches the most likely kind of corruption.
941
** FIX ME: Consider adding every 200th (or so) byte of the data to the
942
** checksum. That way if a single page spans 3 or more disk sectors and
943
** only the middle sector is corrupt, we will still have a reasonable
944
** chance of failing the checksum and thus detecting the problem.
946
/* Compute the journal checksum for one page of data.  NOTE(review):
** only the initialisation is visible here - confirm how aData and pgno
** are folded in against the full file. */
static u32 pager_cksum(Pager *pPager, Pgno pgno, const char *aData){
947
/* Start from the per-journal random seed */
u32 cksum = pPager->cksumInit;
948
/* Initial index into the page data: 200 bytes before the end */
int i = pPager->pageSize-200;
957
** Read a single page from the journal file opened on file descriptor
958
** jfd. Playback this one page.
960
** If useCksum==0 it means this journal does not use checksums. Checksums
961
** are not used in statement journals because statement journals do not
962
** need to survive power failures.
964
static int pager_playback_one_page(Pager *pPager, OsFile *jfd, int useCksum){
966
PgHdr *pPg; /* An existing page in the cache */
967
Pgno pgno; /* The page number of a page in journal */
968
u32 cksum; /* Checksum used for sanity checking */
969
u8 aData[SQLITE_MAX_PAGE_SIZE]; /* Temp storage for a page */
971
/* useCksum should be true for the main journal and false for
972
** statement journals. Verify that this is always the case
974
assert( jfd == (useCksum ? &pPager->jfd : &pPager->stfd) );
977
rc = read32bits(jfd, &pgno);
978
if( rc!=SQLITE_OK ) return rc;
979
rc = sqlite3OsRead(jfd, &aData, pPager->pageSize);
980
if( rc!=SQLITE_OK ) return rc;
981
pPager->journalOff += pPager->pageSize + 4;
983
/* Sanity checking on the page. This is more important than I originally
984
** thought. If a power failure occurs while the journal is being written,
985
** it could cause invalid data to be written into the journal. We need to
986
** detect this invalid data (with high probability) and ignore it.
988
if( pgno==0 || pgno==PAGER_MJ_PGNO(pPager) ){
991
if( pgno>(unsigned)pPager->dbSize ){
995
rc = read32bits(jfd, &cksum);
997
pPager->journalOff += 4;
998
if( pager_cksum(pPager, pgno, aData)!=cksum ){
1003
assert( pPager->state==PAGER_RESERVED || pPager->state>=PAGER_EXCLUSIVE );
1005
/* If the pager is in RESERVED state, then there must be a copy of this
1006
** page in the pager cache. In this case just update the pager cache,
1007
** not the database file. The page is left marked dirty in this case.
1009
** If in EXCLUSIVE state, then we update the pager cache if it exists
1010
** and the main file. The page is then marked not dirty.
1012
** Ticket #1171: The statement journal might contain page content that is
1013
** different from the page content at the start of the transaction.
1014
** This occurs when a page is changed prior to the start of a statement
1015
** then changed again within the statement. When rolling back such a
1016
** statement we must not write to the original database unless we know
1017
** for certain that original page contents are in the main rollback
1018
** journal. Otherwise, if a full ROLLBACK occurs after the statement
1019
** rollback the full ROLLBACK will not restore the page to its original
1020
** content. Two conditions must be met before writing to the database
1021
** files. (1) the database must be locked. (2) we know that the original
1022
** page content is in the main journal either because the page is not in
1023
** cache or else it is marked as needSync==0.
1025
pPg = pager_lookup(pPager, pgno);
1026
assert( pPager->state>=PAGER_EXCLUSIVE || pPg!=0 );
1027
TRACE3("PLAYBACK %d page %d\n", PAGERID(pPager), pgno);
1028
if( pPager->state>=PAGER_EXCLUSIVE && (pPg==0 || pPg->needSync==0) ){
1029
sqlite3OsSeek(&pPager->fd, (pgno-1)*(i64)pPager->pageSize);
1030
rc = sqlite3OsWrite(&pPager->fd, aData, pPager->pageSize);
1031
if( pPg ) pPg->dirty = 0;
1034
/* No page should ever be explicitly rolled back that is in use, except
1035
** for page 1 which is held in use in order to keep the lock on the
1036
** database active. However such a page may be rolled back as a result
1037
** of an internal error resulting in an automatic call to
1038
** sqlite3pager_rollback().
1041
/* assert( pPg->nRef==0 || pPg->pgno==1 ); */
1042
pData = PGHDR_TO_DATA(pPg);
1043
memcpy(pData, aData, pPager->pageSize);
1044
if( pPager->xDestructor ){ /*** FIX ME: Should this be xReinit? ***/
1045
pPager->xDestructor(pData, pPager->pageSize);
1047
#ifdef SQLITE_CHECK_PAGES
1048
pPg->pageHash = pager_pagehash(pPg);
1050
CODEC(pPager, pData, pPg->pgno, 3);
1056
** Parameter zMaster is the name of a master journal file. A single journal
1057
** file that referred to the master journal file has just been rolled back.
1058
** This routine checks if it is possible to delete the master journal file,
1059
** and does so if it is.
1061
** The master journal file contains the names of all child journals.
1062
** To tell if a master journal can be deleted, check each of the
1063
** children. If all children are either missing or do not refer to
1064
** this master journal, then it can be deleted.
1066
static int pager_delmaster(const char *zMaster){
1068
int master_open = 0;
1070
char *zMasterJournal = 0; /* Contents of master journal file */
1071
i64 nMasterJournal; /* Size of master journal file */
1073
/* Open the master journal file exclusively in case some other process
1074
** is running this routine also. Not that it makes too much difference.
1076
memset(&master, 0, sizeof(master));
1077
rc = sqlite3OsOpenReadOnly(zMaster, &master);
1078
if( rc!=SQLITE_OK ) goto delmaster_out;
1080
rc = sqlite3OsFileSize(&master, &nMasterJournal);
1081
if( rc!=SQLITE_OK ) goto delmaster_out;
1083
if( nMasterJournal>0 ){
1085
char *zMasterPtr = 0;
1087
/* Load the entire master journal file into space obtained from
1088
** sqliteMalloc() and pointed to by zMasterJournal.
1090
zMasterJournal = (char *)sqliteMalloc(nMasterJournal);
1091
if( !zMasterJournal ){
1095
rc = sqlite3OsRead(&master, zMasterJournal, nMasterJournal);
1096
if( rc!=SQLITE_OK ) goto delmaster_out;
1098
zJournal = zMasterJournal;
1099
while( (zJournal-zMasterJournal)<nMasterJournal ){
1100
if( sqlite3OsFileExists(zJournal) ){
1101
/* One of the journals pointed to by the master journal exists.
1102
** Open it and check if it points at the master journal. If
1103
** so, return without deleting the master journal file.
1108
memset(&journal, 0, sizeof(journal));
1109
rc = sqlite3OsOpenReadOnly(zJournal, &journal);
1110
if( rc!=SQLITE_OK ){
1114
rc = readMasterJournal(&journal, &zMasterPtr);
1115
sqlite3OsClose(&journal);
1116
if( rc!=SQLITE_OK ){
1120
c = zMasterPtr!=0 && strcmp(zMasterPtr, zMaster)==0;
1121
sqliteFree(zMasterPtr);
1123
/* We have a match. Do not delete the master journal file. */
1127
zJournal += (strlen(zJournal)+1);
1131
sqlite3OsDelete(zMaster);
1134
if( zMasterJournal ){
1135
sqliteFree(zMasterJournal);
1138
sqlite3OsClose(&master);
1144
** Make every page in the cache agree with what is on disk. In other words,
1145
** reread the disk to reset the state of the cache.
1147
** This routine is called after a rollback in which some of the dirty cache
1148
** pages had never been written out to disk. We need to roll back the
1149
** cache content and the easiest way to do that is to reread the old content
1150
** back from the disk.
1152
static int pager_reload_cache(Pager *pPager){
1155
for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
1156
char zBuf[SQLITE_MAX_PAGE_SIZE];
1157
if( !pPg->dirty ) continue;
1158
if( (int)pPg->pgno <= pPager->origDbSize ){
1159
sqlite3OsSeek(&pPager->fd, pPager->pageSize*(i64)(pPg->pgno-1));
1160
rc = sqlite3OsRead(&pPager->fd, zBuf, pPager->pageSize);
1161
TRACE3("REFETCH %d page %d\n", PAGERID(pPager), pPg->pgno);
1163
CODEC(pPager, zBuf, pPg->pgno, 2);
1165
memset(zBuf, 0, pPager->pageSize);
1167
if( pPg->nRef==0 || memcmp(zBuf, PGHDR_TO_DATA(pPg), pPager->pageSize) ){
1168
memcpy(PGHDR_TO_DATA(pPg), zBuf, pPager->pageSize);
1169
if( pPager->xReiniter ){
1170
pPager->xReiniter(PGHDR_TO_DATA(pPg), pPager->pageSize);
1172
memset(PGHDR_TO_EXTRA(pPg, pPager), 0, pPager->nExtra);
1177
#ifdef SQLITE_CHECK_PAGES
1178
pPg->pageHash = pager_pagehash(pPg);
1185
** Truncate the main file of the given pager to the number of pages
1188
static int pager_truncate(Pager *pPager, int nPage){
1189
assert( pPager->state>=PAGER_EXCLUSIVE );
1190
return sqlite3OsTruncate(&pPager->fd, pPager->pageSize*(i64)nPage);
1194
** Playback the journal and thus restore the database file to
1195
** the state it was in before we started making changes.
1197
** The journal file format is as follows:
1199
** (1) 8 byte prefix. A copy of aJournalMagic[].
1200
** (2) 4 byte big-endian integer which is the number of valid page records
1201
** in the journal. If this value is 0xffffffff, then compute the
1202
** number of page records from the journal size.
1203
** (3) 4 byte big-endian integer which is the initial value for the
1205
** (4) 4 byte integer which is the number of pages to truncate the
1206
** database to during a rollback.
1207
** (5) 4 byte integer which is the number of bytes in the master journal
1208
** name. The value may be zero (indicating that there is no master
1210
** (6) N bytes of the master journal name. The name will be nul-terminated
1211
** and might be shorter than the value read from (5). If the first byte
1212
** of the name is \000 then there is no master journal. The master
1213
** journal name is stored in UTF-8.
1214
** (7) Zero or more page instances, each as follows:
1215
** + 4 byte page number.
1216
** + pPager->pageSize bytes of data.
1217
** + 4 byte checksum
1219
** When we speak of the journal header, we mean the first 6 items above.
1220
** Each entry in the journal is an instance of the 7th item.
1222
** Call the value from the second bullet "nRec". nRec is the number of
1223
** valid page entries in the journal. In most cases, you can compute the
1224
** value of nRec from the size of the journal file. But if a power
1225
** failure occurred while the journal was being written, it could be the
1226
** case that the size of the journal file had already been increased but
1227
** the extra entries had not yet made it safely to disk. In such a case,
1228
** the value of nRec computed from the file size would be too large. For
1229
** that reason, we always use the nRec value in the header.
1231
** If the nRec value is 0xffffffff it means that nRec should be computed
1232
** from the file size. This value is used when the user selects the
1233
** no-sync option for the journal. A power failure could lead to corruption
1234
** in this case. But for things like temporary tables (which will be
1235
** deleted when the power is restored) we don't care.
1237
** If the file opened as the journal file is not a well-formed
1238
** journal file then all pages up to the first corrupted page are rolled
1239
** back (or no pages if the journal header is corrupted). The journal file
1240
** is then deleted and SQLITE_OK returned, just as if no corruption had
1241
** been encountered.
1243
** If an I/O or malloc() error occurs, the journal-file is not deleted
1244
** and an error code is returned.
1246
static int pager_playback(Pager *pPager){
1247
i64 szJ; /* Size of the journal file in bytes */
1248
u32 nRec; /* Number of Records in the journal */
1249
int i; /* Loop counter */
1250
Pgno mxPg = 0; /* Size of the original file in pages */
1251
int rc; /* Result code of a subroutine */
1252
char *zMaster = 0; /* Name of master journal file if any */
1254
/* Figure out how many records are in the journal. Abort early if
1255
** the journal is empty.
1257
assert( pPager->journalOpen );
1258
rc = sqlite3OsFileSize(&pPager->jfd, &szJ);
1259
if( rc!=SQLITE_OK ){
1263
/* Read the master journal name from the journal, if it is present.
1264
** If a master journal file name is specified, but the file is not
1265
** present on disk, then the journal is not hot and does not need to be
1268
rc = readMasterJournal(&pPager->jfd, &zMaster);
1269
assert( rc!=SQLITE_DONE );
1270
if( rc!=SQLITE_OK || (zMaster && !sqlite3OsFileExists(zMaster)) ){
1271
sqliteFree(zMaster);
1273
if( rc==SQLITE_DONE ) rc = SQLITE_OK;
1276
sqlite3OsSeek(&pPager->jfd, 0);
1277
pPager->journalOff = 0;
1279
/* This loop terminates either when the readJournalHdr() call returns
1280
** SQLITE_DONE or an IO error occurs. */
1283
/* Read the next journal header from the journal file. If there are
1284
** not enough bytes left in the journal file for a complete header, or
1285
** it is corrupted, then a process must have failed while writing it.
1286
** This indicates nothing more needs to be rolled back.
1288
rc = readJournalHdr(pPager, szJ, &nRec, &mxPg);
1289
if( rc!=SQLITE_OK ){
1290
if( rc==SQLITE_DONE ){
1296
/* If nRec is 0xffffffff, then this journal was created by a process
1297
** working in no-sync mode. This means that the rest of the journal
1298
** file consists of pages, there are no more journal headers. Compute
1299
** the value of nRec based on this assumption.
1301
if( nRec==0xffffffff ){
1302
assert( pPager->journalOff==JOURNAL_HDR_SZ(pPager) );
1303
nRec = (szJ - JOURNAL_HDR_SZ(pPager))/JOURNAL_PG_SZ(pPager);
1306
/* If this is the first header read from the journal, truncate the
1307
** database file back to its original size.
1309
if( pPager->state>=PAGER_EXCLUSIVE &&
1310
pPager->journalOff==JOURNAL_HDR_SZ(pPager) ){
1311
assert( pPager->origDbSize==0 || pPager->origDbSize==mxPg );
1312
rc = pager_truncate(pPager, mxPg);
1313
if( rc!=SQLITE_OK ){
1316
pPager->dbSize = mxPg;
1319
/* rc = sqlite3OsSeek(&pPager->jfd, JOURNAL_HDR_SZ(pPager)); */
1320
if( rc!=SQLITE_OK ) goto end_playback;
1322
/* Copy original pages out of the journal and back into the database file.
1324
for(i=0; i<nRec; i++){
1325
rc = pager_playback_one_page(pPager, &pPager->jfd, 1);
1326
if( rc!=SQLITE_OK ){
1327
if( rc==SQLITE_DONE ){
1329
pPager->journalOff = szJ;
1338
/* Pages that have been written to the journal but never synced
1339
** were not restored by the loop above. We have to restore those
1340
** pages by reading them back from the original database.
1342
assert( rc==SQLITE_OK );
1343
pager_reload_cache(pPager);
1346
if( rc==SQLITE_OK ){
1347
rc = pager_unwritelock(pPager);
1350
/* If there was a master journal and this routine will return true,
1351
** see if it is possible to delete the master journal.
1353
if( rc==SQLITE_OK ){
1354
rc = pager_delmaster(zMaster);
1356
sqliteFree(zMaster);
1359
/* The Pager.sectorSize variable may have been updated while rolling
1360
** back a journal created by a process with a different PAGER_SECTOR_SIZE
1361
** value. Reset it to the correct value for this process.
1363
pPager->sectorSize = PAGER_SECTOR_SIZE;
1368
** Playback the statement journal.
1370
** This is similar to playing back the transaction journal but with
1371
** a few extra twists.
1373
** (1) The number of pages in the database file at the start of
1374
** the statement is stored in pPager->stmtSize, not in the
1375
** journal file itself.
1377
** (2) In addition to playing back the statement journal, also
1378
** playback all pages of the transaction journal beginning
1379
** at offset pPager->stmtJSize.
1381
static int pager_stmt_playback(Pager *pPager){
1382
i64 szJ; /* Size of the full journal */
1384
int nRec; /* Number of Records */
1385
int i; /* Loop counter */
1388
szJ = pPager->journalOff;
1392
rc = sqlite3OsFileSize(&pPager->jfd, &os_szJ);
1393
if( rc!=SQLITE_OK ) return rc;
1394
assert( szJ==os_szJ );
1398
/* Set hdrOff to be the offset to the first journal header written
1399
** this statement transaction, or the end of the file if no journal
1400
** header was written.
1402
hdrOff = pPager->stmtHdrOff;
1403
assert( pPager->fullSync || !hdrOff );
1408
/* Truncate the database back to its original size.
1410
if( pPager->state>=PAGER_EXCLUSIVE ){
1411
rc = pager_truncate(pPager, pPager->stmtSize);
1413
pPager->dbSize = pPager->stmtSize;
1415
/* Figure out how many records are in the statement journal.
1417
assert( pPager->stmtInUse && pPager->journalOpen );
1418
sqlite3OsSeek(&pPager->stfd, 0);
1419
nRec = pPager->stmtNRec;
1421
/* Copy original pages out of the statement journal and back into the
1422
** database file. Note that the statement journal omits checksums from
1423
** each record since power-failure recovery is not important to statement
1426
for(i=nRec-1; i>=0; i--){
1427
rc = pager_playback_one_page(pPager, &pPager->stfd, 0);
1428
assert( rc!=SQLITE_DONE );
1429
if( rc!=SQLITE_OK ) goto end_stmt_playback;
1432
/* Now roll some pages back from the transaction journal. Pager.stmtJSize
1433
** was the size of the journal file when this statement was started, so
1434
** everything after that needs to be rolled back, either into the
1435
** database, the memory cache, or both.
1437
** If it is not zero, then Pager.stmtHdrOff is the offset to the start
1438
** of the first journal header written during this statement transaction.
1440
rc = sqlite3OsSeek(&pPager->jfd, pPager->stmtJSize);
1441
if( rc!=SQLITE_OK ){
1442
goto end_stmt_playback;
1444
pPager->journalOff = pPager->stmtJSize;
1445
pPager->cksumInit = pPager->stmtCksum;
1446
assert( JOURNAL_HDR_SZ(pPager)<(pPager->pageSize+8) );
1447
while( pPager->journalOff <= (hdrOff-(pPager->pageSize+8)) ){
1448
rc = pager_playback_one_page(pPager, &pPager->jfd, 1);
1449
assert( rc!=SQLITE_DONE );
1450
if( rc!=SQLITE_OK ) goto end_stmt_playback;
1453
while( pPager->journalOff < szJ ){
1456
rc = readJournalHdr(pPager, szJ, &nRec, &dummy);
1457
if( rc!=SQLITE_OK ){
1458
assert( rc!=SQLITE_DONE );
1459
goto end_stmt_playback;
1462
nRec = (szJ - pPager->journalOff) / (pPager->pageSize+8);
1464
for(i=nRec-1; i>=0 && pPager->journalOff < szJ; i--){
1465
rc = pager_playback_one_page(pPager, &pPager->jfd, 1);
1466
assert( rc!=SQLITE_DONE );
1467
if( rc!=SQLITE_OK ) goto end_stmt_playback;
1471
pPager->journalOff = szJ;
1474
if( rc!=SQLITE_OK ){
1475
pPager->errMask |= PAGER_ERR_CORRUPT;
1476
rc = SQLITE_CORRUPT; /* bkpt-CORRUPT */
1478
pPager->journalOff = szJ;
1479
/* pager_reload_cache(pPager); */
1485
** Change the maximum number of in-memory pages that are allowed.
1487
** The maximum number is the absolute value of the mxPage parameter.
1488
** If mxPage is negative, the noSync flag is also set. noSync bypasses
1489
** calls to sqlite3OsSync(). The pager runs much faster with noSync on,
1490
** but if the operating system crashes or there is an abrupt power
1491
** failure, the database file might be left in an inconsistent and
1492
** unrepairable state.
1494
void sqlite3pager_set_cachesize(Pager *pPager, int mxPage){
1496
pPager->noSync = pPager->tempFile;
1497
if( pPager->noSync ) pPager->needSync = 0;
1503
pPager->mxPage = mxPage;
1505
pPager->mxPage = 10;
1510
** Adjust the robustness of the database to damage due to OS crashes
1511
** or power failures by changing the number of sync()s when writing
1512
** the rollback journal. There are three levels:
1514
** OFF sqlite3OsSync() is never called. This is the default
1515
** for temporary and transient files.
1517
** NORMAL The journal is synced once before writes begin on the
1518
** database. This is normally adequate protection, but
1519
** it is theoretically possible, though very unlikely,
1520
** that an inopportune power failure could leave the journal
1521
** in a state which would cause damage to the database
1522
** when it is rolled back.
1524
** FULL The journal is synced twice before writes begin on the
1525
** database (with some additional information - the nRec field
1526
** of the journal header - being written in between the two
1527
** syncs). If we assume that writing a
1528
** single disk sector is atomic, then this mode provides
1529
** assurance that the journal will not be corrupted to the
1530
** point of causing damage to the database during rollback.
1532
** Numeric values associated with these states are OFF==1, NORMAL=2,
1535
#ifndef SQLITE_OMIT_PAGER_PRAGMAS
1536
void sqlite3pager_set_safety_level(Pager *pPager, int level){
1537
pPager->noSync = level==1 || pPager->tempFile;
1538
pPager->fullSync = level==3 && !pPager->tempFile;
1539
if( pPager->noSync ) pPager->needSync = 0;
1544
** Open a temporary file. Write the name of the file into zName
1545
** (zName must be at least SQLITE_TEMPNAME_SIZE bytes long.) Write
1546
** the file descriptor into *fd. Return SQLITE_OK on success or some
1547
** other error code if we fail.
1549
** The OS will automatically delete the temporary file when it is
1552
static int sqlite3pager_opentemp(char *zFile, OsFile *fd){
1557
sqlite3OsTempFileName(zFile);
1558
rc = sqlite3OsOpenExclusive(zFile, fd, 1);
1559
}while( cnt>0 && rc!=SQLITE_OK && rc!=SQLITE_NOMEM );
1564
** Create a new page cache and put a pointer to the page cache in *ppPager.
1565
** The file to be cached need not exist. The file is not locked until
1566
** the first call to sqlite3pager_get() and is only held open until the
1567
** last page is released using sqlite3pager_unref().
1569
** If zFilename is NULL then a randomly-named temporary file is created
1570
** and used as the file to be cached. The file will be deleted
1571
** automatically when it is closed.
1573
** If zFilename is ":memory:" then all information is held in cache.
1574
** It is never written to disk. This can be used to implement an
1575
** in-memory database.
1577
int sqlite3pager_open(
1578
Pager **ppPager, /* Return the Pager structure here */
1579
const char *zFilename, /* Name of the database file to open */
1580
int nExtra, /* Extra bytes append to each in-memory page */
1581
int flags /* flags controlling this file */
1584
char *zFullPathname = 0;
1592
int useJournal = (flags & PAGER_OMIT_JOURNAL)==0;
1593
int noReadlock = (flags & PAGER_NO_READLOCK)!=0;
1594
char zTemp[SQLITE_TEMPNAME_SIZE];
1597
memset(&fd, 0, sizeof(fd));
1598
if( sqlite3_malloc_failed ){
1599
return SQLITE_NOMEM;
1601
if( zFilename && zFilename[0] ){
1602
#ifndef SQLITE_OMIT_MEMORYDB
1603
if( strcmp(zFilename,":memory:")==0 ){
1605
zFullPathname = sqliteStrDup("");
1610
zFullPathname = sqlite3OsFullPathname(zFilename);
1611
if( zFullPathname ){
1612
rc = sqlite3OsOpenReadWrite(zFullPathname, &fd, &readOnly);
1616
rc = sqlite3pager_opentemp(zTemp, &fd);
1618
zFullPathname = sqlite3OsFullPathname(zFilename);
1619
if( rc==SQLITE_OK ){
1623
if( !zFullPathname ){
1624
sqlite3OsClose(&fd);
1625
return SQLITE_NOMEM;
1627
if( rc!=SQLITE_OK ){
1628
sqlite3OsClose(&fd);
1629
sqliteFree(zFullPathname);
1632
nameLen = strlen(zFullPathname);
1633
pPager = sqliteMalloc( sizeof(*pPager) + nameLen*3 + 30 );
1635
sqlite3OsClose(&fd);
1636
sqliteFree(zFullPathname);
1637
return SQLITE_NOMEM;
1639
TRACE3("OPEN %d %s\n", FILEHANDLEID(fd), zFullPathname);
1640
pPager->zFilename = (char*)&pPager[1];
1641
pPager->zDirectory = &pPager->zFilename[nameLen+1];
1642
pPager->zJournal = &pPager->zDirectory[nameLen+1];
1643
strcpy(pPager->zFilename, zFullPathname);
1644
strcpy(pPager->zDirectory, zFullPathname);
1645
for(i=nameLen; i>0 && pPager->zDirectory[i-1]!='/'; i--){}
1646
if( i>0 ) pPager->zDirectory[i-1] = 0;
1647
strcpy(pPager->zJournal, zFullPathname);
1648
sqliteFree(zFullPathname);
1649
strcpy(&pPager->zJournal[nameLen], "-journal");
1652
pPager->fd.pPager = pPager;
1654
pPager->journalOpen = 0;
1655
pPager->useJournal = useJournal && !memDb;
1656
pPager->noReadlock = noReadlock && readOnly;
1657
pPager->stmtOpen = 0;
1658
pPager->stmtInUse = 0;
1660
pPager->dbSize = memDb-1;
1661
pPager->pageSize = SQLITE_DEFAULT_PAGE_SIZE;
1662
pPager->psAligned = FORCE_ALIGNMENT(pPager->pageSize);
1663
pPager->stmtSize = 0;
1664
pPager->stmtJSize = 0;
1666
pPager->nMaxPage = 0;
1667
pPager->mxPage = 100;
1668
pPager->state = PAGER_UNLOCK;
1669
pPager->errMask = 0;
1670
pPager->tempFile = tempFile;
1671
pPager->memDb = memDb;
1672
pPager->readOnly = readOnly;
1673
pPager->needSync = 0;
1674
pPager->noSync = pPager->tempFile || !useJournal;
1675
pPager->fullSync = (pPager->noSync?0:1);
1677
pPager->pFirstSynced = 0;
1679
pPager->nExtra = FORCE_ALIGNMENT(nExtra);
1680
pPager->sectorSize = PAGER_SECTOR_SIZE;
1681
pPager->pBusyHandler = 0;
1682
memset(pPager->aHash, 0, sizeof(pPager->aHash));
1688
** Set the busy handler function.
1690
void sqlite3pager_set_busyhandler(Pager *pPager, BusyHandler *pBusyHandler){
1691
pPager->pBusyHandler = pBusyHandler;
1695
** Set the destructor for this pager. If not NULL, the destructor is called
1696
** when the reference count on each page reaches zero. The destructor can
1697
** be used to clean up information in the extra segment appended to each page.
1699
** The destructor is not called as a result of sqlite3pager_close().
1700
** Destructors are only called by sqlite3pager_unref().
1702
void sqlite3pager_set_destructor(Pager *pPager, void (*xDesc)(void*,int)){
1703
pPager->xDestructor = xDesc;
1707
** Set the reinitializer for this pager. If not NULL, the reinitializer
1708
** is called when the content of a page in cache is restored to its original
1709
** value as a result of a rollback. The callback gives higher-level code
1710
** an opportunity to restore the EXTRA section to agree with the restored
1713
void sqlite3pager_set_reiniter(Pager *pPager, void (*xReinit)(void*,int)){
1714
pPager->xReiniter = xReinit;
1718
** Set the page size.
1720
** The page size must only be changed when the cache is empty.
1722
void sqlite3pager_set_pagesize(Pager *pPager, int pageSize){
1723
assert( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE );
1724
pPager->pageSize = pageSize;
1725
pPager->psAligned = FORCE_ALIGNMENT(pageSize);
1729
** Read the first N bytes from the beginning of the file into memory
1730
** that pDest points to. No error checking is done.
1732
void sqlite3pager_read_fileheader(Pager *pPager, int N, unsigned char *pDest){
1733
memset(pDest, 0, N);
1735
sqlite3OsSeek(&pPager->fd, 0);
1736
sqlite3OsRead(&pPager->fd, pDest, N);
1741
** Return the total number of pages in the disk file associated with
1744
int sqlite3pager_pagecount(Pager *pPager){
1746
assert( pPager!=0 );
1747
if( pPager->dbSize>=0 ){
1748
return pPager->dbSize;
1750
if( sqlite3OsFileSize(&pPager->fd, &n)!=SQLITE_OK ){
1751
pPager->errMask |= PAGER_ERR_DISK;
1754
n /= pPager->pageSize;
1755
if( !MEMDB && n==PENDING_BYTE/pPager->pageSize ){
1758
if( pPager->state!=PAGER_UNLOCK ){
1765
** Forward declaration
1767
static int syncJournal(Pager*);
1771
** Unlink pPg from its hash chain. Also set the page number to 0 to indicate
1772
** that the page is not part of any hash chain. This is required because the
1773
** sqlite3pager_movepage() routine can leave a page in the
1774
** pNextFree/pPrevFree list that is not a part of any hash-chain.
1776
static void unlinkHashChain(Pager *pPager, PgHdr *pPg){
1778
/* If the page number is zero, then this page is not in any hash chain. */
1781
if( pPg->pNextHash ){
1782
pPg->pNextHash->pPrevHash = pPg->pPrevHash;
1784
if( pPg->pPrevHash ){
1785
assert( pPager->aHash[pager_hash(pPg->pgno)]!=pPg );
1786
pPg->pPrevHash->pNextHash = pPg->pNextHash;
1788
int h = pager_hash(pPg->pgno);
1789
assert( pPager->aHash[h]==pPg );
1790
pPager->aHash[h] = pPg->pNextHash;
1794
pPg->pNextHash = pPg->pPrevHash = 0;
1798
** Unlink a page from the free list (the list of all pages where nRef==0)
1799
** and from its hash collision chain.
1801
static void unlinkPage(PgHdr *pPg){
1802
Pager *pPager = pPg->pPager;
1804
/* Keep the pFirstSynced pointer pointing at the first synchronized page */
1805
if( pPg==pPager->pFirstSynced ){
1806
PgHdr *p = pPg->pNextFree;
1807
while( p && p->needSync ){ p = p->pNextFree; }
1808
pPager->pFirstSynced = p;
1811
/* Unlink from the freelist */
1812
if( pPg->pPrevFree ){
1813
pPg->pPrevFree->pNextFree = pPg->pNextFree;
1815
assert( pPager->pFirst==pPg );
1816
pPager->pFirst = pPg->pNextFree;
1818
if( pPg->pNextFree ){
1819
pPg->pNextFree->pPrevFree = pPg->pPrevFree;
1821
assert( pPager->pLast==pPg );
1822
pPager->pLast = pPg->pPrevFree;
1824
pPg->pNextFree = pPg->pPrevFree = 0;
1826
/* Unlink from the pgno hash table */
1827
unlinkHashChain(pPager, pPg);
1830
#ifndef SQLITE_OMIT_MEMORYDB
1832
** This routine is used to truncate an in-memory database. Delete
1833
** all pages whose pgno is larger than pPager->dbSize and is unreferenced.
1834
** Referenced pages larger than pPager->dbSize are zeroed.
1836
static void memoryTruncate(Pager *pPager){
1839
int dbSize = pPager->dbSize;
1841
ppPg = &pPager->pAll;
1842
while( (pPg = *ppPg)!=0 ){
1843
if( pPg->pgno<=dbSize ){
1844
ppPg = &pPg->pNextAll;
1845
}else if( pPg->nRef>0 ){
1846
memset(PGHDR_TO_DATA(pPg), 0, pPager->pageSize);
1847
ppPg = &pPg->pNextAll;
1849
*ppPg = pPg->pNextAll;
1857
#define memoryTruncate(p)
1861
** Try to obtain a lock on a file. Invoke the busy callback if the lock
1862
** is currently not available. Repeat until the busy callback returns
1863
** false or until the lock succeeds.
1865
** Return SQLITE_OK on success and an error code if we cannot obtain
1868
static int pager_wait_on_lock(Pager *pPager, int locktype){
1870
assert( PAGER_SHARED==SHARED_LOCK );
1871
assert( PAGER_RESERVED==RESERVED_LOCK );
1872
assert( PAGER_EXCLUSIVE==EXCLUSIVE_LOCK );
1873
if( pPager->state>=locktype ){
1879
rc = sqlite3OsLock(&pPager->fd, locktype);
1880
}while( rc==SQLITE_BUSY &&
1881
(pH = pPager->pBusyHandler)!=0 &&
1882
pH->xFunc && pH->xFunc(pH->pArg, busy++)
1884
if( rc==SQLITE_OK ){
1885
pPager->state = locktype;
1892
** Truncate the file to the number of pages specified.
1894
int sqlite3pager_truncate(Pager *pPager, Pgno nPage){
1896
sqlite3pager_pagecount(pPager);
1897
if( pPager->errMask!=0 ){
1898
rc = pager_errcode(pPager);
1901
if( nPage>=(unsigned)pPager->dbSize ){
1905
pPager->dbSize = nPage;
1906
memoryTruncate(pPager);
1909
rc = syncJournal(pPager);
1910
if( rc!=SQLITE_OK ){
1914
/* Get an exclusive lock on the database before truncating. */
1915
rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
1916
if( rc!=SQLITE_OK ){
1920
rc = pager_truncate(pPager, nPage);
1921
if( rc==SQLITE_OK ){
1922
pPager->dbSize = nPage;
1928
** Shutdown the page cache. Free all memory and close all files.
1930
** If a transaction was in progress when this routine is called, that
1931
** transaction is rolled back. All outstanding pages are invalidated
1932
** and their memory is freed. Any attempt to use a page associated
1933
** with this page cache after this function returns will likely
1934
** result in a coredump.
1936
int sqlite3pager_close(Pager *pPager){
1938
switch( pPager->state ){
1939
case PAGER_RESERVED:
1941
case PAGER_EXCLUSIVE: {
1942
/* We ignore any IO errors that occur during the rollback
1943
** operation. So disable IO error simulation so that testing
1944
** works more easily.
1946
#if defined(SQLITE_TEST) && (defined(OS_UNIX) || defined(OS_WIN))
1947
extern int sqlite3_io_error_pending;
1948
int ioerr_cnt = sqlite3_io_error_pending;
1949
sqlite3_io_error_pending = -1;
1951
sqlite3pager_rollback(pPager);
1952
#if defined(SQLITE_TEST) && (defined(OS_UNIX) || defined(OS_WIN))
1953
sqlite3_io_error_pending = ioerr_cnt;
1956
sqlite3OsUnlock(&pPager->fd, NO_LOCK);
1958
assert( pPager->errMask || pPager->journalOpen==0 );
1961
case PAGER_SHARED: {
1963
sqlite3OsUnlock(&pPager->fd, NO_LOCK);
1972
for(pPg=pPager->pAll; pPg; pPg=pNext){
1975
PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
1976
assert( !pPg->alwaysRollback );
1977
assert( !pHist->pOrig );
1978
assert( !pHist->pStmt );
1981
pNext = pPg->pNextAll;
1984
TRACE2("CLOSE %d\n", PAGERID(pPager));
1985
assert( pPager->errMask || (pPager->journalOpen==0 && pPager->stmtOpen==0) );
1986
if( pPager->journalOpen ){
1987
sqlite3OsClose(&pPager->jfd);
1989
sqliteFree(pPager->aInJournal);
1990
if( pPager->stmtOpen ){
1991
sqlite3OsClose(&pPager->stfd);
1993
sqlite3OsClose(&pPager->fd);
1994
/* Temp files are automatically deleted by the OS
1995
** if( pPager->tempFile ){
1996
** sqlite3OsDelete(pPager->zFilename);
2005
** Return the page number for the given page data.
2007
Pgno sqlite3pager_pagenumber(void *pData){
2008
PgHdr *p = DATA_TO_PGHDR(pData);
2013
** The page_ref() function increments the reference count for a page.
2014
** If the page is currently on the freelist (the reference count is zero) then
2015
** remove it from the freelist.
2017
** For non-test systems, page_ref() is a macro that calls _page_ref()
2018
** only if the reference count is zero. For test systems, page_ref()
2019
** is a real function so that we can set breakpoints and trace it.
2021
static void _page_ref(PgHdr *pPg){
2023
/* The page is currently on the freelist. Remove it. */
2024
if( pPg==pPg->pPager->pFirstSynced ){
2025
PgHdr *p = pPg->pNextFree;
2026
while( p && p->needSync ){ p = p->pNextFree; }
2027
pPg->pPager->pFirstSynced = p;
2029
if( pPg->pPrevFree ){
2030
pPg->pPrevFree->pNextFree = pPg->pNextFree;
2032
pPg->pPager->pFirst = pPg->pNextFree;
2034
if( pPg->pNextFree ){
2035
pPg->pNextFree->pPrevFree = pPg->pPrevFree;
2037
pPg->pPager->pLast = pPg->pPrevFree;
2039
pPg->pPager->nRef++;
2045
static void page_ref(PgHdr *pPg){
2054
# define page_ref(P) ((P)->nRef==0?_page_ref(P):(void)(P)->nRef++)
2058
** Increment the reference count for a page. The input pointer is
2059
** a reference to the page data.
2061
int sqlite3pager_ref(void *pData){
2062
PgHdr *pPg = DATA_TO_PGHDR(pData);
2068
** Sync the journal. In other words, make sure all the pages that have
2069
** been written to the journal have actually reached the surface of the
2070
** disk. It is not safe to modify the original database file until after
2071
** the journal has been synced. If the original database is modified before
2072
** the journal is synced and a power failure occurs, the unsynced journal
2073
** data would be lost and we would be unable to completely rollback the
2074
** database changes. Database corruption would occur.
2076
** This routine also updates the nRec field in the header of the journal.
2077
** (See comments on the pager_playback() routine for additional information.)
2078
** If the sync mode is FULL, two syncs will occur. First the whole journal
2079
** is synced, then the nRec field is updated, then a second sync occurs.
2081
** For temporary databases, we do not care if we are able to rollback
2082
** after a power failure, so no sync occurs.
2084
** This routine clears the needSync field of every page currently held in
2087
static int syncJournal(Pager *pPager){
2091
/* Sync the journal before modifying the main database
2092
** (assuming there is a journal and it needs to be synced.)
2094
if( pPager->needSync ){
2095
if( !pPager->tempFile ){
2096
assert( pPager->journalOpen );
2097
/* assert( !pPager->noSync ); // noSync might be set if synchronous
2098
** was turned off after the transaction was started. Ticket #615 */
2101
/* Make sure the pPager->nRec counter we are keeping agrees
2102
** with the nRec computed from the size of the journal file.
2105
rc = sqlite3OsFileSize(&pPager->jfd, &jSz);
2106
if( rc!=0 ) return rc;
2107
assert( pPager->journalOff==jSz );
2111
/* Write the nRec value into the journal file header. If in
2112
** full-synchronous mode, sync the journal first. This ensures that
2113
** all data has really hit the disk before nRec is updated to mark
2114
** it as a candidate for rollback.
2116
if( pPager->fullSync ){
2117
TRACE2("SYNC journal of %d\n", PAGERID(pPager));
2118
rc = sqlite3OsSync(&pPager->jfd);
2119
if( rc!=0 ) return rc;
2121
sqlite3OsSeek(&pPager->jfd, pPager->journalHdr + sizeof(aJournalMagic));
2122
rc = write32bits(&pPager->jfd, pPager->nRec);
2125
sqlite3OsSeek(&pPager->jfd, pPager->journalOff);
2127
TRACE2("SYNC journal of %d\n", PAGERID(pPager));
2128
rc = sqlite3OsSync(&pPager->jfd);
2129
if( rc!=0 ) return rc;
2130
pPager->journalStarted = 1;
2132
pPager->needSync = 0;
2134
/* Erase the needSync flag from every page.
2136
for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
2139
pPager->pFirstSynced = pPager->pFirst;
2143
/* If the Pager.needSync flag is clear then the PgHdr.needSync
2144
** flag must also be clear for all pages. Verify that this
2145
** invariant is true.
2148
for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
2149
assert( pPg->needSync==0 );
2151
assert( pPager->pFirstSynced==pPager->pFirst );
2159
** Given a list of pages (connected by the PgHdr.pDirty pointer) write
2160
** every one of those pages out to the database file and mark them all
2163
static int pager_write_pagelist(PgHdr *pList){
2167
if( pList==0 ) return SQLITE_OK;
2168
pPager = pList->pPager;
2170
/* At this point there may be either a RESERVED or EXCLUSIVE lock on the
2171
** database file. If there is already an EXCLUSIVE lock, the following
2172
** calls to sqlite3OsLock() are no-ops.
2174
** Moving the lock from RESERVED to EXCLUSIVE actually involves going
2175
** through an intermediate state PENDING. A PENDING lock prevents new
2176
** readers from attaching to the database but is insufficient for us to
2177
** write. The idea of a PENDING lock is to prevent new readers from
2178
** coming in while we wait for existing readers to clear.
2180
** While the pager is in the RESERVED state, the original database file
2181
** is unchanged and we can rollback without having to playback the
2182
** journal into the original database file. Once we transition to
2183
** EXCLUSIVE, it means the database file has been changed and any rollback
2184
** will require a journal playback.
2186
rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
2187
if( rc!=SQLITE_OK ){
2192
assert( pList->dirty );
2193
sqlite3OsSeek(&pPager->fd, (pList->pgno-1)*(i64)pPager->pageSize);
2194
/* If there are dirty pages in the page cache with page numbers greater
2195
** than Pager.dbSize, this means sqlite3pager_truncate() was called to
2196
** make the file smaller (presumably by auto-vacuum code). Do not write
2197
** any such pages to the file.
2199
if( pList->pgno<=pPager->dbSize ){
2200
CODEC(pPager, PGHDR_TO_DATA(pList), pList->pgno, 6);
2201
TRACE3("STORE %d page %d\n", PAGERID(pPager), pList->pgno);
2202
rc = sqlite3OsWrite(&pPager->fd, PGHDR_TO_DATA(pList), pPager->pageSize);
2203
CODEC(pPager, PGHDR_TO_DATA(pList), pList->pgno, 0);
2208
TRACE3("NOSTORE %d page %d\n", PAGERID(pPager), pList->pgno);
2213
#ifdef SQLITE_CHECK_PAGES
2214
pList->pageHash = pager_pagehash(pList);
2216
pList = pList->pDirty;
2222
** Collect every dirty page into a dirty list and
2223
** return a pointer to the head of that list. All pages are
2224
** collected even if they are still in use.
2226
static PgHdr *pager_get_all_dirty_pages(Pager *pPager){
2229
for(p=pPager->pAll; p; p=p->pNextAll){
2239
** Return TRUE if there is a hot journal on the given pager.
2240
** A hot journal is one that needs to be played back.
2242
** If the current size of the database file is 0 but a journal file
2243
** exists, that is probably an old journal left over from a prior
2244
** database with the same name. Just delete the journal.
2246
static int hasHotJournal(Pager *pPager){
2247
if( !pPager->useJournal ) return 0;
2248
if( !sqlite3OsFileExists(pPager->zJournal) ) return 0;
2249
if( sqlite3OsCheckReservedLock(&pPager->fd) ) return 0;
2250
if( sqlite3pager_pagecount(pPager)==0 ){
2251
sqlite3OsDelete(pPager->zJournal);
2261
** A read lock on the disk file is obtained when the first page is acquired.
2262
** This read lock is dropped when the last page is released.
2264
** A _get works for any page number greater than 0. If the database
2265
** file is smaller than the requested page, then no actual disk
2266
** read occurs and the memory image of the page is initialized to
2267
** all zeros. The extra data appended to a page is always initialized
2268
** to zeros the first time a page is loaded into memory.
2270
** The acquisition might fail for several reasons. In all cases,
2271
** an appropriate error code is returned and *ppPage is set to NULL.
2273
** See also sqlite3pager_lookup(). Both this routine and _lookup() attempt
2274
** to find a page in the in-memory cache first. If the page is not already
2275
** in memory, this routine goes to disk to read it in whereas _lookup()
2276
** just returns 0. This routine acquires a read-lock the first time it
2277
** has to go to disk, and could also playback an old journal if necessary.
2278
** Since _lookup() never goes to disk, it never has to deal with locks
2279
** or journal files.
2281
int sqlite3pager_get(Pager *pPager, Pgno pgno, void **ppPage){
2285
/* The maximum page number is 2^31. Return SQLITE_CORRUPT if a page
2286
** number greater than this, or zero, is requested.
2288
if( pgno>PAGER_MAX_PGNO || pgno==0 ){
2289
return SQLITE_CORRUPT;
2292
/* Make sure we have not hit any critical errors.
2294
assert( pPager!=0 );
2296
if( pPager->errMask & ~(PAGER_ERR_FULL) ){
2297
return pager_errcode(pPager);
2300
/* If this is the first page accessed, then get a SHARED lock
2301
** on the database file.
2303
if( pPager->nRef==0 && !MEMDB ){
2304
if( !pPager->noReadlock ){
2305
rc = pager_wait_on_lock(pPager, SHARED_LOCK);
2306
if( rc!=SQLITE_OK ){
2311
/* If a journal file exists, and there is no RESERVED lock on the
2312
** database file, then it either needs to be played back or deleted.
2314
if( hasHotJournal(pPager) ){
2317
/* Get an EXCLUSIVE lock on the database file. At this point it is
2318
** important that a RESERVED lock is not obtained on the way to the
2319
** EXCLUSIVE lock. If it were, another process might open the
2320
** database file, detect the RESERVED lock, and conclude that the
2321
** database is safe to read while this process is still rolling it
2324
** Because the intermediate RESERVED lock is not requested, the
2325
** second process will get to this point in the code and fail to
2326
** obtain its own EXCLUSIVE lock on the database file.
2328
rc = sqlite3OsLock(&pPager->fd, EXCLUSIVE_LOCK);
2329
if( rc!=SQLITE_OK ){
2330
sqlite3OsUnlock(&pPager->fd, NO_LOCK);
2331
pPager->state = PAGER_UNLOCK;
2334
pPager->state = PAGER_EXCLUSIVE;
2336
/* Open the journal for reading only. Return SQLITE_BUSY if
2337
** we are unable to open the journal file.
2339
** The journal file does not need to be locked itself. The
2340
** journal file is never open unless the main database file holds
2341
** a write lock, so there is never any chance of two or more
2342
** processes opening the journal at the same time.
2344
rc = sqlite3OsOpenReadOnly(pPager->zJournal, &pPager->jfd);
2345
if( rc!=SQLITE_OK ){
2346
sqlite3OsUnlock(&pPager->fd, NO_LOCK);
2347
pPager->state = PAGER_UNLOCK;
2350
pPager->journalOpen = 1;
2351
pPager->journalStarted = 0;
2352
pPager->journalOff = 0;
2353
pPager->setMaster = 0;
2354
pPager->journalHdr = 0;
2356
/* Playback and delete the journal. Drop the database write
2357
** lock and reacquire the read lock.
2359
rc = pager_playback(pPager);
2360
if( rc!=SQLITE_OK ){
2366
/* Search for page in cache */
2367
pPg = pager_lookup(pPager, pgno);
2368
if( MEMDB && pPager->state==PAGER_UNLOCK ){
2369
pPager->state = PAGER_SHARED;
2373
/* The requested page is not in the page cache. */
2376
if( pPager->nPage<pPager->mxPage || pPager->pFirst==0 || MEMDB ){
2377
/* Create a new page */
2378
pPg = sqliteMallocRaw( sizeof(*pPg) + pPager->psAligned
2379
+ sizeof(u32) + pPager->nExtra
2380
+ MEMDB*sizeof(PgHistory) );
2382
pPager->errMask |= PAGER_ERR_MEM;
2383
return SQLITE_NOMEM;
2385
memset(pPg, 0, sizeof(*pPg));
2387
memset(PGHDR_TO_HIST(pPg, pPager), 0, sizeof(PgHistory));
2389
pPg->pPager = pPager;
2390
pPg->pNextAll = pPager->pAll;
2393
if( pPager->nPage>pPager->nMaxPage ){
2394
assert( pPager->nMaxPage==(pPager->nPage-1) );
2398
/* Find a page to recycle. Try to locate a page that does not
2399
** require us to do an fsync() on the journal.
2401
pPg = pPager->pFirstSynced;
2403
/* If we could not find a page that does not require an fsync()
2404
** on the journal file then fsync the journal file. This is a
2405
** very slow operation, so we work hard to avoid it. But sometimes
2406
** it can't be helped.
2409
int rc = syncJournal(pPager);
2411
sqlite3pager_rollback(pPager);
2412
return SQLITE_IOERR;
2414
if( pPager->fullSync ){
2415
/* If in full-sync mode, write a new journal header into the
2416
** journal file. This is done to avoid ever modifying a journal
2417
** header that is involved in the rollback of pages that have
2418
** already been written to the database (in case the header is
2419
** trashed when the nRec field is updated).
2422
assert( pPager->journalOff > 0 );
2423
rc = writeJournalHdr(pPager);
2425
sqlite3pager_rollback(pPager);
2426
return SQLITE_IOERR;
2429
pPg = pPager->pFirst;
2431
assert( pPg->nRef==0 );
2433
/* Write the page to the database file if it is dirty.
2436
assert( pPg->needSync==0 );
2438
rc = pager_write_pagelist( pPg );
2439
if( rc!=SQLITE_OK ){
2440
sqlite3pager_rollback(pPager);
2441
return SQLITE_IOERR;
2444
assert( pPg->dirty==0 );
2446
/* If the page we are recycling is marked as alwaysRollback, then
2447
** set the global alwaysRollback flag, thus disabling the
2448
** sqlite3pager_dont_rollback() optimization for the rest of this transaction.
2449
** It is necessary to do this because the page marked alwaysRollback
2450
** might be reloaded at a later time but at that point we won't remember
2451
** that it was marked alwaysRollback. This means that all pages must
2452
** be marked as alwaysRollback from here on out.
2454
if( pPg->alwaysRollback ){
2455
pPager->alwaysRollback = 1;
2458
/* Unlink the old page from the free list and the hash table
2464
if( pPager->aInJournal && (int)pgno<=pPager->origDbSize ){
2465
sqlite3CheckMemory(pPager->aInJournal, pgno/8);
2466
assert( pPager->journalOpen );
2467
pPg->inJournal = (pPager->aInJournal[pgno/8] & (1<<(pgno&7)))!=0;
2473
if( pPager->aInStmt && (int)pgno<=pPager->stmtSize
2474
&& (pPager->aInStmt[pgno/8] & (1<<(pgno&7)))!=0 ){
2475
page_add_to_stmt_list(pPg);
2477
page_remove_from_stmt_list(pPg);
2483
h = pager_hash(pgno);
2484
pPg->pNextHash = pPager->aHash[h];
2485
pPager->aHash[h] = pPg;
2486
if( pPg->pNextHash ){
2487
assert( pPg->pNextHash->pPrevHash==0 );
2488
pPg->pNextHash->pPrevHash = pPg;
2490
if( pPager->nExtra>0 ){
2491
memset(PGHDR_TO_EXTRA(pPg, pPager), 0, pPager->nExtra);
2493
if( pPager->errMask!=0 ){
2494
sqlite3pager_unref(PGHDR_TO_DATA(pPg));
2495
rc = pager_errcode(pPager);
2498
if( sqlite3pager_pagecount(pPager)<(int)pgno ){
2499
memset(PGHDR_TO_DATA(pPg), 0, pPager->pageSize);
2503
sqlite3OsSeek(&pPager->fd, (pgno-1)*(i64)pPager->pageSize);
2504
rc = sqlite3OsRead(&pPager->fd, PGHDR_TO_DATA(pPg), pPager->pageSize);
2505
TRACE3("FETCH %d page %d\n", PAGERID(pPager), pPg->pgno);
2506
CODEC(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3);
2507
if( rc!=SQLITE_OK ){
2509
if( sqlite3OsFileSize(&pPager->fd,&fileSize)!=SQLITE_OK
2510
|| fileSize>=pgno*pPager->pageSize ){
2511
sqlite3pager_unref(PGHDR_TO_DATA(pPg));
2514
memset(PGHDR_TO_DATA(pPg), 0, pPager->pageSize);
2520
#ifdef SQLITE_CHECK_PAGES
2521
pPg->pageHash = pager_pagehash(pPg);
2524
/* The requested page is in the page cache. */
2528
*ppPage = PGHDR_TO_DATA(pPg);
2533
** Acquire a page if it is already in the in-memory cache. Do
2534
** not read the page from disk. Return a pointer to the page,
2535
** or 0 if the page is not in cache.
2537
** See also sqlite3pager_get(). The difference between this routine
2538
** and sqlite3pager_get() is that _get() will go to the disk and read
2539
** in the page if the page is not already in cache. This routine
2540
** returns NULL if the page is not in cache or if a disk I/O error
2541
** has ever happened.
2543
void *sqlite3pager_lookup(Pager *pPager, Pgno pgno){
2546
assert( pPager!=0 );
2548
if( pPager->errMask & ~(PAGER_ERR_FULL) ){
2551
pPg = pager_lookup(pPager, pgno);
2552
if( pPg==0 ) return 0;
2554
return PGHDR_TO_DATA(pPg);
2560
** If the number of references to the page drops to zero, then the
2561
** page is added to the LRU list. When all references to all pages
2562
** are released, a rollback occurs and the lock on the database is
2565
int sqlite3pager_unref(void *pData){
2568
/* Decrement the reference count for this page
2570
pPg = DATA_TO_PGHDR(pData);
2571
assert( pPg->nRef>0 );
2577
/* When the number of references to a page reaches 0, call the
2578
** destructor and add the page to the freelist.
2582
pPager = pPg->pPager;
2584
pPg->pPrevFree = pPager->pLast;
2585
pPager->pLast = pPg;
2586
if( pPg->pPrevFree ){
2587
pPg->pPrevFree->pNextFree = pPg;
2589
pPager->pFirst = pPg;
2591
if( pPg->needSync==0 && pPager->pFirstSynced==0 ){
2592
pPager->pFirstSynced = pPg;
2594
if( pPager->xDestructor ){
2595
pPager->xDestructor(pData, pPager->pageSize);
2598
/* When all pages reach the freelist, drop the read lock from
2599
** the database file.
2602
assert( pPager->nRef>=0 );
2603
if( pPager->nRef==0 && !MEMDB ){
2604
pager_reset(pPager);
2611
** Create a journal file for pPager. There should already be a RESERVED
2612
** or EXCLUSIVE lock on the database file when this routine is called.
2614
** Return SQLITE_OK if everything works. Return an error code and release the
2615
** write lock if anything goes wrong.
2617
static int pager_open_journal(Pager *pPager){
2620
assert( pPager->state>=PAGER_RESERVED );
2621
assert( pPager->journalOpen==0 );
2622
assert( pPager->useJournal );
2623
assert( pPager->aInJournal==0 );
2624
sqlite3pager_pagecount(pPager);
2625
pPager->aInJournal = sqliteMalloc( pPager->dbSize/8 + 1 );
2626
if( pPager->aInJournal==0 ){
2628
goto failed_to_open_journal;
2630
rc = sqlite3OsOpenExclusive(pPager->zJournal, &pPager->jfd,pPager->tempFile);
2631
pPager->journalOff = 0;
2632
pPager->setMaster = 0;
2633
pPager->journalHdr = 0;
2634
if( rc!=SQLITE_OK ){
2635
goto failed_to_open_journal;
2637
SET_FULLSYNC(pPager->jfd, pPager->fullSync);
2638
SET_FULLSYNC(pPager->fd, pPager->fullSync);
2639
sqlite3OsOpenDirectory(pPager->zDirectory, &pPager->jfd);
2640
pPager->journalOpen = 1;
2641
pPager->journalStarted = 0;
2642
pPager->needSync = 0;
2643
pPager->alwaysRollback = 0;
2645
if( pPager->errMask!=0 ){
2646
rc = pager_errcode(pPager);
2647
goto failed_to_open_journal;
2649
pPager->origDbSize = pPager->dbSize;
2651
rc = writeJournalHdr(pPager);
2653
if( pPager->stmtAutoopen && rc==SQLITE_OK ){
2654
rc = sqlite3pager_stmt_begin(pPager);
2656
if( rc!=SQLITE_OK ){
2657
rc = pager_unwritelock(pPager);
2658
if( rc==SQLITE_OK ){
2664
failed_to_open_journal:
2665
sqliteFree(pPager->aInJournal);
2666
pPager->aInJournal = 0;
2667
sqlite3OsUnlock(&pPager->fd, NO_LOCK);
2668
pPager->state = PAGER_UNLOCK;
2673
** Acquire a write-lock on the database. The lock is removed when
2674
** any of the following happen:
2676
** * sqlite3pager_commit() is called.
2677
** * sqlite3pager_rollback() is called.
2678
** * sqlite3pager_close() is called.
2679
** * sqlite3pager_unref() is called on every outstanding page.
2681
** The first parameter to this routine is a pointer to any open page of the
2682
** database file. Nothing changes about the page - it is used merely to
2683
** acquire a pointer to the Pager structure and as proof that there is
2684
** already a read-lock on the database.
2686
** The second parameter indicates how much space in bytes to reserve for a
2687
** master journal file-name at the start of the journal when it is created.
2689
** A journal file is opened if this is not a temporary file. For temporary
2690
** files, the opening of the journal file is deferred until there is an
2691
** actual need to write to the journal.
2693
** If the database is already reserved for writing, this routine is a no-op.
2695
** If exFlag is true, go ahead and get an EXCLUSIVE lock on the file
2696
** immediately instead of waiting until we try to flush the cache. The
2697
** exFlag is ignored if a transaction is already active.
2699
int sqlite3pager_begin(void *pData, int exFlag){
2700
PgHdr *pPg = DATA_TO_PGHDR(pData);
2701
Pager *pPager = pPg->pPager;
2703
assert( pPg->nRef>0 );
2704
assert( pPager->state!=PAGER_UNLOCK );
2705
if( pPager->state==PAGER_SHARED ){
2706
assert( pPager->aInJournal==0 );
2708
pPager->state = PAGER_EXCLUSIVE;
2709
pPager->origDbSize = pPager->dbSize;
2711
rc = sqlite3OsLock(&pPager->fd, RESERVED_LOCK);
2712
if( rc==SQLITE_OK ){
2713
pPager->state = PAGER_RESERVED;
2715
rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
2718
if( rc!=SQLITE_OK ){
2721
pPager->dirtyCache = 0;
2722
TRACE2("TRANSACTION %d\n", PAGERID(pPager));
2723
if( pPager->useJournal && !pPager->tempFile ){
2724
rc = pager_open_journal(pPager);
2732
** Mark a data page as writeable. The page is written into the journal
2733
** if it is not there already. This routine must be called before making
2734
** changes to a page.
2736
** The first time this routine is called, the pager creates a new
2737
** journal and acquires a RESERVED lock on the database. If the RESERVED
2738
** lock could not be acquired, this routine returns SQLITE_BUSY. The
2739
** calling routine must check for that return value and be careful not to
2740
** change any page data until this routine returns SQLITE_OK.
2742
** If the journal file could not be written because the disk is full,
2743
** then this routine returns SQLITE_FULL and does an immediate rollback.
2744
** All subsequent write attempts also return SQLITE_FULL until there
2745
** is a call to sqlite3pager_commit() or sqlite3pager_rollback() to
2748
int sqlite3pager_write(void *pData){
2749
PgHdr *pPg = DATA_TO_PGHDR(pData);
2750
Pager *pPager = pPg->pPager;
2755
if( pPager->errMask ){
2756
return pager_errcode(pPager);
2758
if( pPager->readOnly ){
2762
assert( !pPager->setMaster );
2766
/* Mark the page as dirty. If the page has already been written
2767
** to the journal then we can return right away.
2770
if( pPg->inJournal && (pPg->inStmt || pPager->stmtInUse==0) ){
2771
pPager->dirtyCache = 1;
2774
/* If we get this far, it means that the page needs to be
2775
** written to the transaction journal or the checkpoint (statement) journal
2778
** First check to see that the transaction journal exists and
2779
** create it if it does not.
2781
assert( pPager->state!=PAGER_UNLOCK );
2782
rc = sqlite3pager_begin(pData, 0);
2783
if( rc!=SQLITE_OK ){
2786
assert( pPager->state>=PAGER_RESERVED );
2787
if( !pPager->journalOpen && pPager->useJournal ){
2788
rc = pager_open_journal(pPager);
2789
if( rc!=SQLITE_OK ) return rc;
2791
assert( pPager->journalOpen || !pPager->useJournal );
2792
pPager->dirtyCache = 1;
2794
/* The transaction journal now exists and we have a RESERVED or an
2795
** EXCLUSIVE lock on the main database file. Write the current page to
2796
** the transaction journal if it is not there already.
2798
if( !pPg->inJournal && (pPager->useJournal || MEMDB) ){
2799
if( (int)pPg->pgno <= pPager->origDbSize ){
2803
PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
2804
TRACE3("JOURNAL %d page %d\n", PAGERID(pPager), pPg->pgno);
2805
assert( pHist->pOrig==0 );
2806
pHist->pOrig = sqliteMallocRaw( pPager->pageSize );
2808
memcpy(pHist->pOrig, PGHDR_TO_DATA(pPg), pPager->pageSize);
2812
CODEC(pPager, pData, pPg->pgno, 7);
2813
cksum = pager_cksum(pPager, pPg->pgno, pData);
2814
saved = *(u32*)PGHDR_TO_EXTRA(pPg, pPager);
2815
store32bits(cksum, pPg, pPager->pageSize);
2816
szPg = pPager->pageSize+8;
2817
store32bits(pPg->pgno, pPg, -4);
2818
rc = sqlite3OsWrite(&pPager->jfd, &((char*)pData)[-4], szPg);
2819
pPager->journalOff += szPg;
2820
TRACE4("JOURNAL %d page %d needSync=%d\n",
2821
PAGERID(pPager), pPg->pgno, pPg->needSync);
2822
CODEC(pPager, pData, pPg->pgno, 0);
2823
*(u32*)PGHDR_TO_EXTRA(pPg, pPager) = saved;
2824
if( rc!=SQLITE_OK ){
2825
sqlite3pager_rollback(pPager);
2826
pPager->errMask |= PAGER_ERR_FULL;
2830
assert( pPager->aInJournal!=0 );
2831
pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
2832
pPg->needSync = !pPager->noSync;
2833
if( pPager->stmtInUse ){
2834
pPager->aInStmt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
2835
page_add_to_stmt_list(pPg);
2839
pPg->needSync = !pPager->journalStarted && !pPager->noSync;
2840
TRACE4("APPEND %d page %d needSync=%d\n",
2841
PAGERID(pPager), pPg->pgno, pPg->needSync);
2843
if( pPg->needSync ){
2844
pPager->needSync = 1;
2849
/* If the statement journal is open and the page is not in it,
2850
** then write the current page to the statement journal. Note that
2851
** the statement journal format differs from the standard journal format
2852
** in that it omits the checksums and the header.
2854
if( pPager->stmtInUse && !pPg->inStmt && (int)pPg->pgno<=pPager->stmtSize ){
2855
assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
2857
PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
2858
assert( pHist->pStmt==0 );
2859
pHist->pStmt = sqliteMallocRaw( pPager->pageSize );
2861
memcpy(pHist->pStmt, PGHDR_TO_DATA(pPg), pPager->pageSize);
2863
TRACE3("STMT-JOURNAL %d page %d\n", PAGERID(pPager), pPg->pgno);
2865
store32bits(pPg->pgno, pPg, -4);
2866
CODEC(pPager, pData, pPg->pgno, 7);
2867
rc = sqlite3OsWrite(&pPager->stfd,((char*)pData)-4, pPager->pageSize+4);
2868
TRACE3("STMT-JOURNAL %d page %d\n", PAGERID(pPager), pPg->pgno);
2869
CODEC(pPager, pData, pPg->pgno, 0);
2870
if( rc!=SQLITE_OK ){
2871
sqlite3pager_rollback(pPager);
2872
pPager->errMask |= PAGER_ERR_FULL;
2876
assert( pPager->aInStmt!=0 );
2877
pPager->aInStmt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
2879
page_add_to_stmt_list(pPg);
2883
/* Update the database size and return.
2885
if( pPager->dbSize<(int)pPg->pgno ){
2886
pPager->dbSize = pPg->pgno;
2887
if( !MEMDB && pPager->dbSize==PENDING_BYTE/pPager->pageSize ){
2895
** Return TRUE if the page given in the argument was previously passed
2896
** to sqlite3pager_write(). In other words, return TRUE if it is ok
2897
** to change the content of the page.
2899
int sqlite3pager_iswriteable(void *pData){
2900
PgHdr *pPg = DATA_TO_PGHDR(pData);
2904
#ifndef SQLITE_OMIT_VACUUM
2906
** Replace the content of a single page with the information in the third
2909
int sqlite3pager_overwrite(Pager *pPager, Pgno pgno, void *pData){
2913
rc = sqlite3pager_get(pPager, pgno, &pPage);
2914
if( rc==SQLITE_OK ){
2915
rc = sqlite3pager_write(pPage);
2916
if( rc==SQLITE_OK ){
2917
memcpy(pPage, pData, pPager->pageSize);
2919
sqlite3pager_unref(pPage);
2926
** A call to this routine tells the pager that it is not necessary to
2927
** write the information on page "pgno" back to the disk, even though
2928
** that page might be marked as dirty.
2930
** The overlying software layer calls this routine when all of the data
2931
** on the given page is unused. The pager marks the page as clean so
2932
** that it does not get written to disk.
2934
** Tests show that this optimization, together with the
2935
** sqlite3pager_dont_rollback() below, more than double the speed
2936
** of large INSERT operations and quadruple the speed of large DELETEs.
2938
** When this routine is called, set the alwaysRollback flag to true.
2939
** Subsequent calls to sqlite3pager_dont_rollback() for the same page
2940
** will thereafter be ignored. This is necessary to avoid a problem
2941
** where a page with data is added to the freelist during one part of
2942
** a transaction then removed from the freelist during a later part
2943
** of the same transaction and reused for some other purpose. When it
2944
** is first added to the freelist, this routine is called. When reused,
2945
** the dont_rollback() routine is called. But because the page contains
2946
** critical data, we still need to be sure it gets rolled back in spite
2947
** of the dont_rollback() call.
2949
/*
** Tell the pager that page "pgno" does not need to be written back to
** disk, even if it is currently marked dirty.
**
** Parameters:
**   pPager  - pager whose cache is searched for the page.
**   pgno    - number of the page whose content is no longer needed.
**
** A cached page is flagged alwaysRollback so that later calls to
** sqlite3pager_dont_rollback() for it are ignored.  If the page is dirty
** it is then marked clean, unless it is the last page of a file that has
** grown during the current transaction.  A page that is not in the cache
** is left alone: there is no header to flag and nothing to mark clean.
*/
void sqlite3pager_dont_write(Pager *pPager, Pgno pgno){
  PgHdr *pPg;

  /* NOTE(review): early-out for in-memory databases, mirroring the MEMDB
  ** test in sqlite3pager_dont_rollback() - confirm against upstream. */
  if( MEMDB ) return;

  pPg = pager_lookup(pPager, pgno);
  if( pPg==0 ){
    /* The lookup can miss.  Test for that before touching the page header
    ** rather than after the first write through the pointer. */
    return;
  }
  pPg->alwaysRollback = 1;
  if( pPg->dirty ){
    if( pPager->dbSize==(int)pPg->pgno && pPager->origDbSize<pPager->dbSize ){
      /* If this page is the last page in the file and the file has grown
      ** during the current transaction, then do NOT mark the page as clean.
      ** When the database file grows, we must make sure that the last page
      ** gets written at least once so that the disk file will be the correct
      ** size. If you do not write this page and the size of the file
      ** on the disk ends up being too small, that can lead to database
      ** corruption during the next transaction.
      */
    }else{
      TRACE3("DONT_WRITE page %d of %d\n", pgno, PAGERID(pPager));
      /* Mark the page clean so that it is not written to disk. */
      pPg->dirty = 0;
#ifdef SQLITE_CHECK_PAGES
      /* Refresh the stored hash now that the page counts as clean. */
      pPg->pageHash = pager_pagehash(pPg);
#endif
    }
  }
}
2977
** A call to this routine tells the pager that if a rollback occurs,
2978
** it is not necessary to restore the data on the given page. This
2979
** means that the pager does not have to record the given page in the
2980
** rollback journal.
2982
void sqlite3pager_dont_rollback(void *pData){
2983
PgHdr *pPg = DATA_TO_PGHDR(pData);
2984
Pager *pPager = pPg->pPager;
2986
if( pPager->state!=PAGER_EXCLUSIVE || pPager->journalOpen==0 ) return;
2987
if( pPg->alwaysRollback || pPager->alwaysRollback || MEMDB ) return;
2988
if( !pPg->inJournal && (int)pPg->pgno <= pPager->origDbSize ){
2989
assert( pPager->aInJournal!=0 );
2990
pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
2992
if( pPager->stmtInUse ){
2993
pPager->aInStmt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
2994
page_add_to_stmt_list(pPg);
2996
TRACE3("DONT_ROLLBACK page %d of %d\n", pPg->pgno, PAGERID(pPager));
2998
if( pPager->stmtInUse && !pPg->inStmt && (int)pPg->pgno<=pPager->stmtSize ){
2999
assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
3000
assert( pPager->aInStmt!=0 );
3001
pPager->aInStmt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
3002
page_add_to_stmt_list(pPg);
3007
#ifndef SQLITE_OMIT_MEMORYDB
3009
** Clear a PgHistory block
3011
static void clearHistory(PgHistory *pHist){
3012
sqliteFree(pHist->pOrig);
3013
sqliteFree(pHist->pStmt);
3018
#define clearHistory(x)
3022
** Commit all changes to the database and release the write lock.
3024
** If the commit fails for any reason, a rollback attempt is made
3025
** and an error code is returned. If the commit worked, SQLITE_OK
3028
int sqlite3pager_commit(Pager *pPager){
3032
if( pPager->errMask==PAGER_ERR_FULL ){
3033
rc = sqlite3pager_rollback(pPager);
3034
if( rc==SQLITE_OK ){
3039
if( pPager->errMask!=0 ){
3040
rc = pager_errcode(pPager);
3043
if( pPager->state<PAGER_RESERVED ){
3044
return SQLITE_ERROR;
3046
TRACE2("COMMIT %d\n", PAGERID(pPager));
3048
pPg = pager_get_all_dirty_pages(pPager);
3050
clearHistory(PGHDR_TO_HIST(pPg, pPager));
3054
pPg->pPrevStmt = pPg->pNextStmt = 0;
3058
for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
3059
PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
3060
assert( !pPg->alwaysRollback );
3061
assert( !pHist->pOrig );
3062
assert( !pHist->pStmt );
3066
pPager->state = PAGER_SHARED;
3069
if( pPager->dirtyCache==0 ){
3070
/* Exit early (without doing the time-consuming sqlite3OsSync() calls)
3071
** if there have been no changes to the database file. */
3072
assert( pPager->needSync==0 );
3073
rc = pager_unwritelock(pPager);
3074
pPager->dbSize = -1;
3077
assert( pPager->journalOpen );
3078
rc = sqlite3pager_sync(pPager, 0, 0);
3079
if( rc!=SQLITE_OK ){
3082
rc = pager_unwritelock(pPager);
3083
pPager->dbSize = -1;
3086
/* Jump here if anything goes wrong during the commit process.
3089
sqlite3pager_rollback(pPager);
3094
** Rollback all changes. The database falls back to PAGER_SHARED mode.
3095
** All in-memory cache pages revert to their original data contents.
3096
** The journal is deleted.
3098
** This routine cannot fail unless some other process is not following
3099
** the correct locking protocol (SQLITE_PROTOCOL) or unless some other
3100
** process is writing trash into the journal file (SQLITE_CORRUPT) or
3101
** unless a prior malloc() failed (SQLITE_NOMEM). Appropriate error
3102
** codes are returned for all these occasions. Otherwise,
3103
** SQLITE_OK is returned.
3105
int sqlite3pager_rollback(Pager *pPager){
3107
TRACE2("ROLLBACK %d\n", PAGERID(pPager));
3110
for(p=pPager->pAll; p; p=p->pNextAll){
3112
assert( !p->alwaysRollback );
3114
assert( !((PgHistory *)PGHDR_TO_HIST(p, pPager))->pOrig );
3115
assert( !((PgHistory *)PGHDR_TO_HIST(p, pPager))->pStmt );
3119
pHist = PGHDR_TO_HIST(p, pPager);
3121
memcpy(PGHDR_TO_DATA(p), pHist->pOrig, pPager->pageSize);
3122
TRACE3("ROLLBACK-PAGE %d of %d\n", p->pgno, PAGERID(pPager));
3124
TRACE3("PAGE %d is clean on %d\n", p->pgno, PAGERID(pPager));
3126
clearHistory(pHist);
3130
p->pPrevStmt = p->pNextStmt = 0;
3132
if( pPager->xReiniter ){
3133
pPager->xReiniter(PGHDR_TO_DATA(p), pPager->pageSize);
3138
pPager->dbSize = pPager->origDbSize;
3139
memoryTruncate(pPager);
3140
pPager->stmtInUse = 0;
3141
pPager->state = PAGER_SHARED;
3145
if( !pPager->dirtyCache || !pPager->journalOpen ){
3146
rc = pager_unwritelock(pPager);
3147
pPager->dbSize = -1;
3151
if( pPager->errMask!=0 && pPager->errMask!=PAGER_ERR_FULL ){
3152
if( pPager->state>=PAGER_EXCLUSIVE ){
3153
pager_playback(pPager);
3155
return pager_errcode(pPager);
3157
if( pPager->state==PAGER_RESERVED ){
3159
rc = pager_reload_cache(pPager);
3160
rc2 = pager_unwritelock(pPager);
3161
if( rc==SQLITE_OK ){
3165
rc = pager_playback(pPager);
3167
if( rc!=SQLITE_OK ){
3168
rc = SQLITE_CORRUPT; /* bkpt-CORRUPT */
3169
pPager->errMask |= PAGER_ERR_CORRUPT;
3171
pPager->dbSize = -1;
3176
** Return TRUE if the database file is opened read-only. Return FALSE
3177
** if the database is (in theory) writable.
3179
int sqlite3pager_isreadonly(Pager *pPager){
3180
return pPager->readOnly;
3184
** This routine is used for testing and analysis only.
3186
int *sqlite3pager_stats(Pager *pPager){
3188
a[0] = pPager->nRef;
3189
a[1] = pPager->nPage;
3190
a[2] = pPager->mxPage;
3191
a[3] = pPager->dbSize;
3192
a[4] = pPager->state;
3193
a[5] = pPager->errMask;
3194
a[6] = pPager->nHit;
3195
a[7] = pPager->nMiss;
3196
a[8] = pPager->nOvfl;
3197
a[9] = pPager->nRead;
3198
a[10] = pPager->nWrite;
3203
** Set the statement rollback point.
3205
** This routine should be called with the transaction journal already
3206
** open. A new statement journal is created that can be used to rollback
3207
** changes of a single SQL command within a larger transaction.
3209
int sqlite3pager_stmt_begin(Pager *pPager){
3211
char zTemp[SQLITE_TEMPNAME_SIZE];
3212
assert( !pPager->stmtInUse );
3213
assert( pPager->dbSize>=0 );
3214
TRACE2("STMT-BEGIN %d\n", PAGERID(pPager));
3216
pPager->stmtInUse = 1;
3217
pPager->stmtSize = pPager->dbSize;
3220
if( !pPager->journalOpen ){
3221
pPager->stmtAutoopen = 1;
3224
assert( pPager->journalOpen );
3225
pPager->aInStmt = sqliteMalloc( pPager->dbSize/8 + 1 );
3226
if( pPager->aInStmt==0 ){
3227
sqlite3OsLock(&pPager->fd, SHARED_LOCK);
3228
return SQLITE_NOMEM;
3231
rc = sqlite3OsFileSize(&pPager->jfd, &pPager->stmtJSize);
3232
if( rc ) goto stmt_begin_failed;
3233
assert( pPager->stmtJSize == pPager->journalOff );
3235
pPager->stmtJSize = pPager->journalOff;
3236
pPager->stmtSize = pPager->dbSize;
3237
pPager->stmtHdrOff = 0;
3238
pPager->stmtCksum = pPager->cksumInit;
3239
if( !pPager->stmtOpen ){
3240
rc = sqlite3pager_opentemp(zTemp, &pPager->stfd);
3241
if( rc ) goto stmt_begin_failed;
3242
pPager->stmtOpen = 1;
3243
pPager->stmtNRec = 0;
3245
pPager->stmtInUse = 1;
3249
if( pPager->aInStmt ){
3250
sqliteFree(pPager->aInStmt);
3251
pPager->aInStmt = 0;
3257
** Commit a statement.
3259
int sqlite3pager_stmt_commit(Pager *pPager){
3260
if( pPager->stmtInUse ){
3262
TRACE2("STMT-COMMIT %d\n", PAGERID(pPager));
3264
sqlite3OsSeek(&pPager->stfd, 0);
3265
/* sqlite3OsTruncate(&pPager->stfd, 0); */
3266
sqliteFree( pPager->aInStmt );
3267
pPager->aInStmt = 0;
3269
for(pPg=pPager->pStmt; pPg; pPg=pNext){
3270
pNext = pPg->pNextStmt;
3271
assert( pPg->inStmt );
3273
pPg->pPrevStmt = pPg->pNextStmt = 0;
3275
PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
3276
sqliteFree(pHist->pStmt);
3280
pPager->stmtNRec = 0;
3281
pPager->stmtInUse = 0;
3284
pPager->stmtAutoopen = 0;
3289
** Rollback a statement.
3291
int sqlite3pager_stmt_rollback(Pager *pPager){
3293
if( pPager->stmtInUse ){
3294
TRACE2("STMT-ROLLBACK %d\n", PAGERID(pPager));
3297
for(pPg=pPager->pStmt; pPg; pPg=pPg->pNextStmt){
3298
PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
3300
memcpy(PGHDR_TO_DATA(pPg), pHist->pStmt, pPager->pageSize);
3301
sqliteFree(pHist->pStmt);
3305
pPager->dbSize = pPager->stmtSize;
3306
memoryTruncate(pPager);
3309
rc = pager_stmt_playback(pPager);
3311
sqlite3pager_stmt_commit(pPager);
3315
pPager->stmtAutoopen = 0;
3320
** Return the full pathname of the database file.
3322
const char *sqlite3pager_filename(Pager *pPager){
3323
return pPager->zFilename;
3327
** Return the directory of the database file.
3329
const char *sqlite3pager_dirname(Pager *pPager){
3330
return pPager->zDirectory;
3334
** Return the full pathname of the journal file.
3336
const char *sqlite3pager_journalname(Pager *pPager){
3337
return pPager->zJournal;
3341
** Set the codec for this pager
3343
void sqlite3pager_set_codec(
3345
void (*xCodec)(void*,void*,Pgno,int),
3348
pPager->xCodec = xCodec;
3349
pPager->pCodecArg = pCodecArg;
3353
** This routine is called to increment the database file change-counter,
3354
** stored at byte 24 of the pager file.
3356
static int pager_incr_changecounter(Pager *pPager){
3362
/* Open page 1 of the file for writing. */
3363
rc = sqlite3pager_get(pPager, 1, &pPage);
3364
if( rc!=SQLITE_OK ) return rc;
3365
rc = sqlite3pager_write(pPage);
3366
if( rc!=SQLITE_OK ) return rc;
3368
/* Read the current value at byte 24. */
3369
pPgHdr = DATA_TO_PGHDR(pPage);
3370
change_counter = retrieve32bits(pPgHdr, 24);
3372
/* Increment the value just read and write it back to byte 24. */
3374
store32bits(change_counter, pPgHdr, 24);
3376
/* Release the page reference. */
3377
sqlite3pager_unref(pPage);
3382
** Sync the database file for the pager pPager. zMaster points to the name
3383
** of a master journal file that should be written into the individual
3384
** journal file. zMaster may be NULL, which is interpreted as no master
3385
** journal (a single database transaction).
3387
** This routine ensures that the journal is synced, all dirty pages written
3388
** to the database file and the database file synced. The only thing that
3389
** remains to commit the transaction is to delete the journal file (or
3390
** master journal file if specified).
3392
** Note that if zMaster==NULL, this does not overwrite a previous value
3393
** passed to an sqlite3pager_sync() call.
3395
** If parameter nTrunc is non-zero, then the pager file is truncated to
3396
** nTrunc pages (this is used by auto-vacuum databases).
3398
int sqlite3pager_sync(Pager *pPager, const char *zMaster, Pgno nTrunc){
3401
TRACE4("DATABASE SYNC: File=%s zMaster=%s nTrunc=%d\n",
3402
pPager->zFilename, zMaster, nTrunc);
3404
/* If this is an in-memory db, or no pages have been written to, or this
3405
** function has already been called, it is a no-op.
3407
if( pPager->state!=PAGER_SYNCED && !MEMDB && pPager->dirtyCache ){
3409
assert( pPager->journalOpen );
3411
/* If a master journal file name has already been written to the
3412
** journal file, then no sync is required. This happens when it is
3413
** written, then the process fails to upgrade from a RESERVED to an
3414
** EXCLUSIVE lock. The next time the process tries to commit the
3415
** transaction the m-j name will have already been written.
3417
if( !pPager->setMaster ){
3418
rc = pager_incr_changecounter(pPager);
3419
if( rc!=SQLITE_OK ) goto sync_exit;
3420
#ifndef SQLITE_OMIT_AUTOVACUUM
3422
/* If this transaction has made the database smaller, then all pages
3423
** being discarded by the truncation must be written to the journal
3428
for( i=nTrunc+1; i<=pPager->origDbSize; i++ ){
3429
if( !(pPager->aInJournal[i/8] & (1<<(i&7))) ){
3430
rc = sqlite3pager_get(pPager, i, &pPage);
3431
if( rc!=SQLITE_OK ) goto sync_exit;
3432
rc = sqlite3pager_write(pPage);
3433
sqlite3pager_unref(pPage);
3434
if( rc!=SQLITE_OK ) goto sync_exit;
3439
rc = writeMasterJournal(pPager, zMaster);
3440
if( rc!=SQLITE_OK ) goto sync_exit;
3441
rc = syncJournal(pPager);
3442
if( rc!=SQLITE_OK ) goto sync_exit;
3445
#ifndef SQLITE_OMIT_AUTOVACUUM
3447
rc = sqlite3pager_truncate(pPager, nTrunc);
3448
if( rc!=SQLITE_OK ) goto sync_exit;
3452
/* Write all dirty pages to the database file */
3453
pPg = pager_get_all_dirty_pages(pPager);
3454
rc = pager_write_pagelist(pPg);
3455
if( rc!=SQLITE_OK ) goto sync_exit;
3457
/* Sync the database file. */
3458
if( !pPager->noSync ){
3459
rc = sqlite3OsSync(&pPager->fd);
3462
pPager->state = PAGER_SYNCED;
3469
#ifndef SQLITE_OMIT_AUTOVACUUM
3471
** Move the page identified by pData to location pgno in the file.
3473
** There must be no references to the current page pgno. If current page
3474
** pgno is not already in the rollback journal, it is not written there by
3475
** by this routine. The same applies to the page pData refers to on entry to
3478
** References to the page refered to by pData remain valid. Updating any
3479
** meta-data associated with page pData (i.e. data stored in the nExtra bytes
3480
** allocated along with the page) is the responsibility of the caller.
3482
** A transaction must be active when this routine is called. It used to be
3483
** required that a statement transaction was not active, but this restriction
3484
** has been removed (CREATE INDEX needs to move a page when a statement
3485
** transaction is active).
3487
int sqlite3pager_movepage(Pager *pPager, void *pData, Pgno pgno){
3488
PgHdr *pPg = DATA_TO_PGHDR(pData);
3491
Pgno needSyncPgno = 0;
3493
assert( pPg->nRef>0 );
3495
TRACE5("MOVE %d page %d (needSync=%d) moves to %d\n",
3496
PAGERID(pPager), pPg->pgno, pPg->needSync, pgno);
3498
if( pPg->needSync ){
3499
needSyncPgno = pPg->pgno;
3500
assert( pPg->inJournal );
3501
assert( pPg->dirty );
3502
assert( pPager->needSync );
3505
/* Unlink pPg from it's hash-chain */
3506
unlinkHashChain(pPager, pPg);
3508
/* If the cache contains a page with page-number pgno, remove it
3509
** from it's hash chain. Also, if the PgHdr.needSync was set for
3510
** page pgno before the 'move' operation, it needs to be retained
3511
** for the page moved there.
3513
pPgOld = pager_lookup(pPager, pgno);
3515
assert( pPgOld->nRef==0 );
3516
unlinkHashChain(pPager, pPgOld);
3518
if( pPgOld->needSync ){
3519
assert( pPgOld->inJournal );
3522
assert( pPager->needSync );
3526
/* Change the page number for pPg and insert it into the new hash-chain. */
3528
h = pager_hash(pgno);
3529
if( pPager->aHash[h] ){
3530
assert( pPager->aHash[h]->pPrevHash==0 );
3531
pPager->aHash[h]->pPrevHash = pPg;
3533
pPg->pNextHash = pPager->aHash[h];
3534
pPager->aHash[h] = pPg;
3538
pPager->dirtyCache = 1;
3541
/* If needSyncPgno is non-zero, then the journal file needs to be
3542
** sync()ed before any data is written to database file page needSyncPgno.
3543
** Currently, no such page exists in the page-cache and the
3544
** Pager.aInJournal bit has been set. This needs to be remedied by loading
3545
** the page into the pager-cache and setting the PgHdr.needSync flag.
3547
** The sqlite3pager_get() call may cause the journal to sync. So make
3548
** sure the Pager.needSync flag is set too.
3552
assert( pPager->needSync );
3553
rc = sqlite3pager_get(pPager, needSyncPgno, &pNeedSync);
3554
if( rc!=SQLITE_OK ) return rc;
3555
pPager->needSync = 1;
3556
DATA_TO_PGHDR(pNeedSync)->needSync = 1;
3557
DATA_TO_PGHDR(pNeedSync)->inJournal = 1;
3558
DATA_TO_PGHDR(pNeedSync)->dirty = 1;
3559
sqlite3pager_unref(pNeedSync);
#if defined(SQLITE_DEBUG) || defined(SQLITE_TEST)
/*
** Return the current state of the file lock for the given pager.
** The return value is one of NO_LOCK, SHARED_LOCK, RESERVED_LOCK,
** PENDING_LOCK, or EXCLUSIVE_LOCK.
**
** NOTE(review): the damaged listing showed two alternative return
** statements with the preprocessor lines between them missing.  They
** mirror the two PAGERID() definitions at the top of the file (file
** handle reached through a pointer vs. embedded directly); the guard
** macro name OS_TEST is presumed to match the one used there -- confirm.
*/
int sqlite3pager_lockstate(Pager *pPager){
#ifdef OS_TEST
  return pPager->fd->fd.locktype;
#else
  return pPager->fd.locktype;
#endif
}
#endif
3583
** Print a listing of all referenced pages and their ref count.
3585
void sqlite3pager_refdump(Pager *pPager){
3587
for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
3588
if( pPg->nRef<=0 ) continue;
3589
sqlite3DebugPrintf("PAGE %3d addr=%p nRef=%d\n",
3590
pPg->pgno, PGHDR_TO_DATA(pPg), pPg->nRef);