4
** The author disclaims copyright to this source code. In place of
5
** a legal notice, here is a blessing:
7
** May you do good and not evil.
8
** May you find forgiveness for yourself and forgive others.
9
** May you share freely, never taking more than you give.
11
*************************************************************************
12
** $Id: btree.c,v 1.256 2005/03/29 13:17:46 drh Exp $
14
** This file implements a external (disk-based) database using BTrees.
15
** For a detailed discussion of BTrees, refer to
17
** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
18
** "Sorting And Searching", pages 473-480. Addison-Wesley
19
** Publishing Company, Reading, Massachusetts.
21
** The basic idea is that each page of the file contains N database
22
** entries and N+1 pointers to subpages.
24
** ----------------------------------------------------------------
25
** | Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) |
26
** ----------------------------------------------------------------
28
** All of the keys on the page that Ptr(0) points to have values less
29
** than Key(0). All of the keys on page Ptr(1) and its subpages have
30
** values greater than Key(0) and less than Key(1). All of the keys
31
** on Ptr(N+1) and its subpages have values greater than Key(N). And
34
** Finding a particular key requires reading O(log(M)) pages from the
35
** disk where M is the number of entries in the tree.
37
** In this implementation, a single file can hold one or more separate
38
** BTrees. Each BTree is identified by the index of its root page. The
39
** key and data for any entry are combined to form the "payload". A
40
** fixed amount of payload can be carried directly on the database
41
** page. If the payload is larger than the preset amount then surplus
42
** bytes are stored on overflow pages. The payload for an entry
43
** and the preceding pointer are combined to form a "Cell". Each
44
** page has a small header which contains the Ptr(N+1) pointer and other
45
** information such as the size of key and data.
49
** The file is divided into pages. The first page is called page 1,
50
** the second is page 2, and so forth. A page number of zero indicates
51
** "no such page". The page size can be anything between 512 and 65536.
52
** Each page can be either a btree page, a freelist page or an overflow
55
** The first page is always a btree page. The first 100 bytes of the first
56
** page contain a special header (the "file header") that describes the file.
57
** The format of the file header is as follows:
59
** OFFSET SIZE DESCRIPTION
60
** 0 16 Header string: "SQLite format 3\000"
61
** 16 2 Page size in bytes.
62
** 18 1 File format write version
63
** 19 1 File format read version
64
** 20 1 Bytes of unused space at the end of each page
65
** 21 1 Max embedded payload fraction
66
** 22 1 Min embedded payload fraction
67
** 23 1 Min leaf payload fraction
68
** 24 4 File change counter
69
** 28 4 Reserved for future use
70
** 32 4 First freelist page
71
** 36 4 Number of freelist pages in the file
72
** 40 60 15 4-byte meta values passed to higher layers
74
** All of the integer values are big-endian (most significant byte first).
76
** The file change counter is incremented when the database is changed more
77
** than once within the same second. This counter, together with the
78
** modification time of the file, allows other processes to know
79
** when the file has changed and thus when they need to flush their
82
** The max embedded payload fraction is the amount of the total usable
83
** space in a page that can be consumed by a single cell for standard
84
** B-tree (non-LEAFDATA) tables. A value of 255 means 100%. The default
85
** is to limit the maximum cell size so that at least 4 cells will fit
86
** on one page. Thus the default max embedded payload fraction is 64.
88
** If the payload for a cell is larger than the max payload, then extra
89
** payload is spilled to overflow pages. Once an overflow page is allocated,
90
** as many bytes as possible are moved into the overflow pages without letting
91
** the cell size drop below the min embedded payload fraction.
93
** The min leaf payload fraction is like the min embedded payload fraction
94
** except that it applies to leaf nodes in a LEAFDATA tree. The maximum
95
** payload fraction for a LEAFDATA tree is always 100% (or 255) and it
96
** not specified in the header.
98
** Each btree pages is divided into three sections: The header, the
99
** cell pointer array, and the cell area area. Page 1 also has a 100-byte
100
** file header that occurs before the page header.
102
** |----------------|
103
** | file header | 100 bytes. Page 1 only.
104
** |----------------|
105
** | page header | 8 bytes for leaves. 12 bytes for interior nodes
106
** |----------------|
107
** | cell pointer | | 2 bytes per cell. Sorted order.
108
** | array | | Grows downward
110
** |----------------|
113
** |----------------| ^ Grows upwards
114
** | cell content | | Arbitrary order interspersed with freeblocks.
115
** | area | | and free space fragments.
116
** |----------------|
118
** The page headers looks like this:
120
** OFFSET SIZE DESCRIPTION
121
** 0 1 Flags. 1: intkey, 2: zerodata, 4: leafdata, 8: leaf
122
** 1 2 byte offset to the first freeblock
123
** 3 2 number of cells on this page
124
** 5 2 first byte of the cell content area
125
** 7 1 number of fragmented free bytes
126
** 8 4 Right child (the Ptr(N+1) value). Omitted on leaves.
128
** The flags define the format of this btree page. The leaf flag means that
129
** this page has no children. The zerodata flag means that this page carries
130
** only keys and no data. The intkey flag means that the key is a integer
131
** which is stored in the key size entry of the cell header rather than in
134
** The cell pointer array begins on the first byte after the page header.
135
** The cell pointer array contains zero or more 2-byte numbers which are
136
** offsets from the beginning of the page to the cell content in the cell
137
** content area. The cell pointers occur in sorted order. The system strives
138
** to keep free space after the last cell pointer so that new cells can
139
** be easily added without having to defragment the page.
141
** Cell content is stored at the very end of the page and grows toward the
142
** beginning of the page.
144
** Unused space within the cell content area is collected into a linked list of
145
** freeblocks. Each freeblock is at least 4 bytes in size. The byte offset
146
** to the first freeblock is given in the header. Freeblocks occur in
147
** increasing order. Because a freeblock must be at least 4 bytes in size,
148
** any group of 3 or fewer unused bytes in the cell content area cannot
149
** exist on the freeblock chain. A group of 3 or fewer free bytes is called
150
** a fragment. The total number of bytes in all fragments is recorded.
151
** in the page header at offset 7.
154
** 2 Byte offset of the next freeblock
155
** 2 Bytes in this freeblock
157
** Cells are of variable length. Cells are stored in the cell content area at
158
** the end of the page. Pointers to the cells are in the cell pointer array
159
** that immediately follows the page header. Cells is not necessarily
160
** contiguous or in order, but cell pointers are contiguous and in order.
162
** Cell content makes use of variable length integers. A variable
163
** length integer is 1 to 9 bytes where the lower 7 bits of each
164
** byte are used. The integer consists of all bytes that have bit 8 set and
165
** the first byte with bit 8 clear. The most significant byte of the integer
166
** appears first. A variable-length integer may not be more than 9 bytes long.
167
** As a special case, all 8 bytes of the 9th byte are used as data. This
168
** allows a 64-bit integer to be encoded in 9 bytes.
170
** 0x00 becomes 0x00000000
171
** 0x7f becomes 0x0000007f
172
** 0x81 0x00 becomes 0x00000080
173
** 0x82 0x00 becomes 0x00000100
174
** 0x80 0x7f becomes 0x0000007f
175
** 0x8a 0x91 0xd1 0xac 0x78 becomes 0x12345678
176
** 0x81 0x81 0x81 0x81 0x01 becomes 0x10204081
178
** Variable length integers are used for rowids and to hold the number of
179
** bytes of key and data in a btree cell.
181
** The content of a cell looks like this:
184
** 4 Page number of the left child. Omitted if leaf flag is set.
185
** var Number of bytes of data. Omitted if the zerodata flag is set.
186
** var Number of bytes of key. Or the key itself if intkey flag is set.
188
** 4 First page of the overflow chain. Omitted if no overflow
190
** Overflow pages form a linked list. Each page except the last is completely
191
** filled with data (pagesize - 4 bytes). The last page can have as little
192
** as 1 byte of data.
195
** 4 Page number of next overflow page
198
** Freelist pages come in two subtypes: trunk pages and leaf pages. The
199
** file header points to first in a linked list of trunk page. Each trunk
200
** page points to multiple leaf pages. The content of a leaf page is
201
** unspecified. A trunk page looks like this:
204
** 4 Page number of next trunk page
205
** 4 Number of leaf pointers on this page
206
** * zero or more pages numbers of leaves
208
#include "sqliteInt.h"
215
** This macro rounds values up so that if the value is an address it
216
** is guaranteed to be an address that is aligned to an 8-byte boundary.
218
#define FORCE_ALIGNMENT(X) (((X)+7)&~7)
220
/* The following value is the maximum cell size assuming a maximum page
223
#define MX_CELL_SIZE(pBt) (pBt->pageSize-8)
225
/* The maximum number of cells on a single page of the database. This
226
** assumes a minimum cell size of 3 bytes. Such small cells will be
227
** exceedingly rare, but they are possible.
229
#define MX_CELL(pBt) ((pBt->pageSize-8)/3)
231
/* Forward declarations */
232
typedef struct MemPage MemPage;
235
** This is a magic string that appears at the beginning of every
236
** SQLite database in order to identify the file as a real database.
237
** 123456789 123456 */
238
static const char zMagicHeader[] = "SQLite format 3";
241
** Page type flags. An ORed combination of these flags appear as the
242
** first byte of every BTree page.
244
#define PTF_INTKEY 0x01
245
#define PTF_ZERODATA 0x02
246
#define PTF_LEAFDATA 0x04
247
#define PTF_LEAF 0x08
250
** As each page of the file is loaded into memory, an instance of the following
251
** structure is appended and initialized to zero. This structure stores
252
** information about the page that is decoded from the raw file page.
254
** The pParent field points back to the parent page. This allows us to
255
** walk up the BTree from any leaf to the root. Care must be taken to
256
** unref() the parent page pointer when this page is no longer referenced.
257
** The pageDestructor() routine handles that chore.
260
u8 isInit; /* True if previously initialized. MUST BE FIRST! */
261
u8 idxShift; /* True if Cell indices have changed */
262
u8 nOverflow; /* Number of overflow cell bodies in aCell[] */
263
u8 intKey; /* True if intkey flag is set */
264
u8 leaf; /* True if leaf flag is set */
265
u8 zeroData; /* True if table stores keys only */
266
u8 leafData; /* True if tables stores data on leaves only */
267
u8 hasData; /* True if this page stores data */
268
u8 hdrOffset; /* 100 for page 1. 0 otherwise */
269
u8 childPtrSize; /* 0 if leaf==1. 4 if leaf==0 */
270
u16 maxLocal; /* Copy of Btree.maxLocal or Btree.maxLeaf */
271
u16 minLocal; /* Copy of Btree.minLocal or Btree.minLeaf */
272
u16 cellOffset; /* Index in aData of first cell pointer */
273
u16 idxParent; /* Index in parent of this node */
274
u16 nFree; /* Number of free bytes on the page */
275
u16 nCell; /* Number of cells on this page, local and ovfl */
276
struct _OvflCell { /* Cells that will not fit on aData[] */
277
u8 *pCell; /* Pointers to the body of the overflow cell */
278
u16 idx; /* Insert this cell before idx-th non-overflow cell */
280
struct Btree *pBt; /* Pointer back to BTree structure */
281
u8 *aData; /* Pointer back to the start of the page */
282
Pgno pgno; /* Page number for this page */
283
MemPage *pParent; /* The parent of this page. NULL for root */
287
** The in-memory image of a disk page has the auxiliary information appended
288
** to the end. EXTRA_SIZE is the number of bytes of space needed to hold
289
** that extra information.
291
#define EXTRA_SIZE sizeof(MemPage)
294
** Everything we need to know about an open database
297
Pager *pPager; /* The page cache */
298
BtCursor *pCursor; /* A list of all open cursors */
299
MemPage *pPage1; /* First page of the database */
300
u8 inTrans; /* True if a transaction is in progress */
301
u8 inStmt; /* True if we are in a statement subtransaction */
302
u8 readOnly; /* True if the underlying file is readonly */
303
u8 maxEmbedFrac; /* Maximum payload as % of total page size */
304
u8 minEmbedFrac; /* Minimum payload as % of total page size */
305
u8 minLeafFrac; /* Minimum leaf payload as % of total page size */
306
u8 pageSizeFixed; /* True if the page size can no longer be changed */
307
#ifndef SQLITE_OMIT_AUTOVACUUM
308
u8 autoVacuum; /* True if database supports auto-vacuum */
310
u16 pageSize; /* Total number of bytes on a page */
311
u16 psAligned; /* pageSize rounded up to a multiple of 8 */
312
u16 usableSize; /* Number of usable bytes on each page */
313
int maxLocal; /* Maximum local payload in non-LEAFDATA tables */
314
int minLocal; /* Minimum local payload in non-LEAFDATA tables */
315
int maxLeaf; /* Maximum local payload in a LEAFDATA table */
316
int minLeaf; /* Minimum local payload in a LEAFDATA table */
317
BusyHandler *pBusyHandler; /* Callback for when there is lock contention */
322
** Btree.inTrans may take one of the following values.
326
#define TRANS_WRITE 2
329
** An instance of the following structure is used to hold information
330
** about a cell. The parseCellPtr() function fills in this structure
331
** based on information extract from the raw disk page.
333
typedef struct CellInfo CellInfo;
335
u8 *pCell; /* Pointer to the start of cell content */
336
i64 nKey; /* The key for INTKEY tables, or number of bytes in key */
337
u32 nData; /* Number of bytes of data */
338
u16 nHeader; /* Size of the cell content header in bytes */
339
u16 nLocal; /* Amount of payload held locally */
340
u16 iOverflow; /* Offset to overflow page number. Zero if no overflow */
341
u16 nSize; /* Size of the cell content on the main b-tree page */
345
** A cursor is a pointer to a particular entry in the BTree.
346
** The entry is identified by its MemPage and the index in
347
** MemPage.aCell[] of the entry.
350
Btree *pBt; /* The Btree to which this cursor belongs */
351
BtCursor *pNext, *pPrev; /* Forms a linked list of all cursors */
352
int (*xCompare)(void*,int,const void*,int,const void*); /* Key comp func */
353
void *pArg; /* First arg to xCompare() */
354
Pgno pgnoRoot; /* The root page of this tree */
355
MemPage *pPage; /* Page that contains the entry */
356
int idx; /* Index of the entry in pPage->aCell[] */
357
CellInfo info; /* A parse of the cell we are pointing at */
358
u8 wrFlag; /* True if writable */
359
u8 isValid; /* TRUE if points to a valid entry */
363
** The TRACE macro will print high-level status information about the
364
** btree operation when the global variable sqlite3_btree_trace is
368
# define TRACE(X) if( sqlite3_btree_trace )\
369
{ sqlite3DebugPrintf X; fflush(stdout); }
373
int sqlite3_btree_trace=0; /* True to enable tracing */
376
** Forward declaration
378
static int checkReadLocks(Btree*,Pgno,BtCursor*);
381
** Read or write a two- and four-byte big-endian integer values.
383
static u32 get2byte(unsigned char *p){
384
return (p[0]<<8) | p[1];
386
static u32 get4byte(unsigned char *p){
387
return (p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3];
389
static void put2byte(unsigned char *p, u32 v){
393
static void put4byte(unsigned char *p, u32 v){
401
** Routines to read and write variable-length integers. These used to
402
** be defined locally, but now we use the varint routines in the util.c
405
#define getVarint sqlite3GetVarint
406
#define getVarint32 sqlite3GetVarint32
407
#define putVarint sqlite3PutVarint
409
/* The database page the PENDING_BYTE occupies. This page is never used.
410
** TODO: This macro is very similary to PAGER_MJ_PGNO() in pager.c. They
411
** should possibly be consolidated (presumably in pager.h).
413
#define PENDING_BYTE_PAGE(pBt) ((PENDING_BYTE/(pBt)->pageSize)+1)
415
#ifndef SQLITE_OMIT_AUTOVACUUM
417
** These macros define the location of the pointer-map entry for a
418
** database page. The first argument to each is the number of usable
419
** bytes on each page of the database (often 1024). The second is the
420
** page number to look up in the pointer map.
422
** PTRMAP_PAGENO returns the database page number of the pointer-map
423
** page that stores the required pointer. PTRMAP_PTROFFSET returns
424
** the offset of the requested map entry.
426
** If the pgno argument passed to PTRMAP_PAGENO is a pointer-map page,
427
** then pgno is returned. So (pgno==PTRMAP_PAGENO(pgsz, pgno)) can be
428
** used to test if pgno is a pointer-map page. PTRMAP_ISPAGE implements
431
#define PTRMAP_PAGENO(pgsz, pgno) (((pgno-2)/(pgsz/5+1))*(pgsz/5+1)+2)
432
#define PTRMAP_PTROFFSET(pgsz, pgno) (((pgno-2)%(pgsz/5+1)-1)*5)
433
#define PTRMAP_ISPAGE(pgsz, pgno) (PTRMAP_PAGENO(pgsz,pgno)==pgno)
436
** The pointer map is a lookup table that identifies the parent page for
437
** each child page in the database file. The parent page is the page that
438
** contains a pointer to the child. Every page in the database contains
439
** 0 or 1 parent pages. (In this context 'database page' refers
440
** to any page that is not part of the pointer map itself.) Each pointer map
441
** entry consists of a single byte 'type' and a 4 byte parent page number.
442
** The PTRMAP_XXX identifiers below are the valid types.
444
** The purpose of the pointer map is to facility moving pages from one
445
** position in the file to another as part of autovacuum. When a page
446
** is moved, the pointer in its parent must be updated to point to the
447
** new location. The pointer map is used to locate the parent page quickly.
449
** PTRMAP_ROOTPAGE: The database page is a root-page. The page-number is not
450
** used in this case.
452
** PTRMAP_FREEPAGE: The database page is an unused (free) page. The page-number
453
** is not used in this case.
455
** PTRMAP_OVERFLOW1: The database page is the first page in a list of
456
** overflow pages. The page number identifies the page that
457
** contains the cell with a pointer to this overflow page.
459
** PTRMAP_OVERFLOW2: The database page is the second or later page in a list of
460
** overflow pages. The page-number identifies the previous
461
** page in the overflow page list.
463
** PTRMAP_BTREE: The database page is a non-root btree page. The page number
464
** identifies the parent page in the btree.
466
#define PTRMAP_ROOTPAGE 1
467
#define PTRMAP_FREEPAGE 2
468
#define PTRMAP_OVERFLOW1 3
469
#define PTRMAP_OVERFLOW2 4
470
#define PTRMAP_BTREE 5
473
** Write an entry into the pointer map.
475
** This routine updates the pointer map entry for page number 'key'
476
** so that it maps to type 'eType' and parent page number 'pgno'.
477
** An error code is returned if something goes wrong, otherwise SQLITE_OK.
479
static int ptrmapPut(Btree *pBt, Pgno key, u8 eType, Pgno parent){
480
u8 *pPtrmap; /* The pointer map page */
481
Pgno iPtrmap; /* The pointer map page number */
482
int offset; /* Offset in pointer map page */
485
assert( pBt->autoVacuum );
487
return SQLITE_CORRUPT;
489
iPtrmap = PTRMAP_PAGENO(pBt->usableSize, key);
490
rc = sqlite3pager_get(pBt->pPager, iPtrmap, (void **)&pPtrmap);
494
offset = PTRMAP_PTROFFSET(pBt->usableSize, key);
496
if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
497
TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
498
rc = sqlite3pager_write(pPtrmap);
500
pPtrmap[offset] = eType;
501
put4byte(&pPtrmap[offset+1], parent);
505
sqlite3pager_unref(pPtrmap);
510
** Read an entry from the pointer map.
512
** This routine retrieves the pointer map entry for page 'key', writing
513
** the type and parent page number to *pEType and *pPgno respectively.
514
** An error code is returned if something goes wrong, otherwise SQLITE_OK.
516
static int ptrmapGet(Btree *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
517
int iPtrmap; /* Pointer map page index */
518
u8 *pPtrmap; /* Pointer map page data */
519
int offset; /* Offset of entry in pointer map */
522
iPtrmap = PTRMAP_PAGENO(pBt->usableSize, key);
523
rc = sqlite3pager_get(pBt->pPager, iPtrmap, (void **)&pPtrmap);
528
offset = PTRMAP_PTROFFSET(pBt->usableSize, key);
529
if( pEType ) *pEType = pPtrmap[offset];
530
if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
532
sqlite3pager_unref(pPtrmap);
533
if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT;
537
#endif /* SQLITE_OMIT_AUTOVACUUM */
540
** Given a btree page and a cell index (0 means the first cell on
541
** the page, 1 means the second cell, and so forth) return a pointer
542
** to the cell content.
544
** This routine works only for pages that do not contain overflow cells.
546
static u8 *findCell(MemPage *pPage, int iCell){
547
u8 *data = pPage->aData;
549
assert( iCell<get2byte(&data[pPage->hdrOffset+3]) );
550
return data + get2byte(&data[pPage->cellOffset+2*iCell]);
554
** This a more complex version of findCell() that works for
555
** pages that do contain overflow cells. See insert
557
static u8 *findOverflowCell(MemPage *pPage, int iCell){
559
for(i=pPage->nOverflow-1; i>=0; i--){
561
struct _OvflCell *pOvfl;
562
pOvfl = &pPage->aOvfl[i];
571
return findCell(pPage, iCell);
575
** Parse a cell content block and fill in the CellInfo structure. There
576
** are two versions of this function. parseCell() takes a cell index
577
** as the second argument and parseCellPtr() takes a pointer to the
578
** body of the cell as its second argument.
580
static void parseCellPtr(
581
MemPage *pPage, /* Page containing the cell */
582
u8 *pCell, /* Pointer to the cell text. */
583
CellInfo *pInfo /* Fill in this structure */
585
int n; /* Number bytes in cell content header */
586
u32 nPayload; /* Number of bytes of cell payload */
588
pInfo->pCell = pCell;
589
assert( pPage->leaf==0 || pPage->leaf==1 );
590
n = pPage->childPtrSize;
591
assert( n==4-4*pPage->leaf );
592
if( pPage->hasData ){
593
n += getVarint32(&pCell[n], &nPayload);
597
n += getVarint(&pCell[n], (u64 *)&pInfo->nKey);
599
pInfo->nData = nPayload;
600
if( !pPage->intKey ){
601
nPayload += pInfo->nKey;
603
if( nPayload<=pPage->maxLocal ){
604
/* This is the (easy) common case where the entire payload fits
605
** on the local page. No overflow is required.
607
int nSize; /* Total size of cell content in bytes */
608
pInfo->nLocal = nPayload;
609
pInfo->iOverflow = 0;
610
nSize = nPayload + n;
612
nSize = 4; /* Minimum cell size is 4 */
614
pInfo->nSize = nSize;
616
/* If the payload will not fit completely on the local page, we have
617
** to decide how much to store locally and how much to spill onto
618
** overflow pages. The strategy is to minimize the amount of unused
619
** space on overflow pages while keeping the amount of local storage
620
** in between minLocal and maxLocal.
622
** Warning: changing the way overflow payload is distributed in any
623
** way will result in an incompatible file format.
625
int minLocal; /* Minimum amount of payload held locally */
626
int maxLocal; /* Maximum amount of payload held locally */
627
int surplus; /* Overflow payload available for local storage */
629
minLocal = pPage->minLocal;
630
maxLocal = pPage->maxLocal;
631
surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
632
if( surplus <= maxLocal ){
633
pInfo->nLocal = surplus;
635
pInfo->nLocal = minLocal;
637
pInfo->iOverflow = pInfo->nLocal + n;
638
pInfo->nSize = pInfo->iOverflow + 4;
641
static void parseCell(
642
MemPage *pPage, /* Page containing the cell */
643
int iCell, /* The cell index. First cell is 0 */
644
CellInfo *pInfo /* Fill in this structure */
646
parseCellPtr(pPage, findCell(pPage, iCell), pInfo);
650
** Compute the total number of bytes that a Cell needs in the cell
651
** data area of the btree-page. The return number includes the cell
652
** data header and the local payload, but not any overflow page or
653
** the space used by the cell pointer.
656
static int cellSize(MemPage *pPage, int iCell){
658
parseCell(pPage, iCell, &info);
662
static int cellSizePtr(MemPage *pPage, u8 *pCell){
664
parseCellPtr(pPage, pCell, &info);
668
#ifndef SQLITE_OMIT_AUTOVACUUM
670
** If the cell pCell, part of page pPage contains a pointer
671
** to an overflow page, insert an entry into the pointer-map
672
** for the overflow page.
674
static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
677
parseCellPtr(pPage, pCell, &info);
678
if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
679
Pgno ovfl = get4byte(&pCell[info.iOverflow]);
680
return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
686
** If the cell with index iCell on page pPage contains a pointer
687
** to an overflow page, insert an entry into the pointer-map
688
** for the overflow page.
690
static int ptrmapPutOvfl(MemPage *pPage, int iCell){
692
pCell = findOverflowCell(pPage, iCell);
693
return ptrmapPutOvflPtr(pPage, pCell);
699
** Do sanity checking on a page. Throw an exception if anything is
702
** This routine is used for internal error checking only. It is omitted
705
#if defined(BTREE_DEBUG) && !defined(NDEBUG) && 0
706
static void _pageIntegrity(MemPage *pPage){
709
int i, j, idx, c, pc, hdr, nFree;
711
int nCell, cellLimit;
714
used = sqliteMallocRaw( pPage->pBt->pageSize );
715
if( used==0 ) return;
716
usableSize = pPage->pBt->usableSize;
717
assert( pPage->aData==&((unsigned char*)pPage)[-pPage->pBt->psAligned] );
718
hdr = pPage->hdrOffset;
719
assert( hdr==(pPage->pgno==1 ? 100 : 0) );
720
assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
721
c = pPage->aData[hdr];
723
assert( pPage->leaf == ((c & PTF_LEAF)!=0) );
724
assert( pPage->zeroData == ((c & PTF_ZERODATA)!=0) );
725
assert( pPage->leafData == ((c & PTF_LEAFDATA)!=0) );
726
assert( pPage->intKey == ((c & (PTF_INTKEY|PTF_LEAFDATA))!=0) );
727
assert( pPage->hasData ==
728
!(pPage->zeroData || (!pPage->leaf && pPage->leafData)) );
729
assert( pPage->cellOffset==pPage->hdrOffset+12-4*pPage->leaf );
730
assert( pPage->nCell = get2byte(&pPage->aData[hdr+3]) );
733
memset(used, 0, usableSize);
734
for(i=0; i<hdr+10-pPage->leaf*4; i++) used[i] = 1;
736
pc = get2byte(&data[hdr+1]);
739
assert( pc>0 && pc<usableSize-4 );
740
size = get2byte(&data[pc+2]);
741
assert( pc+size<=usableSize );
743
for(i=pc; i<pc+size; i++){
744
assert( used[i]==0 );
747
pc = get2byte(&data[pc]);
750
nCell = get2byte(&data[hdr+3]);
751
cellLimit = get2byte(&data[hdr+5]);
752
assert( pPage->isInit==0
753
|| pPage->nFree==nFree+data[hdr+7]+cellLimit-(cellOffset+2*nCell) );
754
cellOffset = pPage->cellOffset;
755
for(i=0; i<nCell; i++){
757
pc = get2byte(&data[cellOffset+2*i]);
758
assert( pc>0 && pc<usableSize-4 );
759
size = cellSize(pPage, &data[pc]);
760
assert( pc+size<=usableSize );
761
for(j=pc; j<pc+size; j++){
762
assert( used[j]==0 );
766
for(i=cellOffset+2*nCell; i<cellimit; i++){
767
assert( used[i]==0 );
771
for(i=0; i<usableSize; i++){
772
assert( used[i]<=1 );
773
if( used[i]==0 ) nFree++;
775
assert( nFree==data[hdr+7] );
778
#define pageIntegrity(X) _pageIntegrity(X)
780
# define pageIntegrity(X)
784
** Defragment the page given. All Cells are moved to the
785
** beginning of the page and all free space is collected
786
** into one big FreeBlk at the end of the page.
788
static int defragmentPage(MemPage *pPage){
789
int i; /* Loop counter */
790
int pc; /* Address of a i-th cell */
791
int addr; /* Offset of first byte after cell pointer array */
792
int hdr; /* Offset to the page header */
793
int size; /* Size of a cell */
794
int usableSize; /* Number of usable bytes on a page */
795
int cellOffset; /* Offset to the cell pointer array */
796
int brk; /* Offset to the cell content area */
797
int nCell; /* Number of cells on the page */
798
unsigned char *data; /* The page data */
799
unsigned char *temp; /* Temp area for cell content */
801
assert( sqlite3pager_iswriteable(pPage->aData) );
802
assert( pPage->pBt!=0 );
803
assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
804
assert( pPage->nOverflow==0 );
805
temp = sqliteMalloc( pPage->pBt->pageSize );
806
if( temp==0 ) return SQLITE_NOMEM;
808
hdr = pPage->hdrOffset;
809
cellOffset = pPage->cellOffset;
810
nCell = pPage->nCell;
811
assert( nCell==get2byte(&data[hdr+3]) );
812
usableSize = pPage->pBt->usableSize;
813
brk = get2byte(&data[hdr+5]);
814
memcpy(&temp[brk], &data[brk], usableSize - brk);
816
for(i=0; i<nCell; i++){
817
u8 *pAddr; /* The i-th cell pointer */
818
pAddr = &data[cellOffset + i*2];
819
pc = get2byte(pAddr);
820
assert( pc<pPage->pBt->usableSize );
821
size = cellSizePtr(pPage, &temp[pc]);
823
memcpy(&data[brk], &temp[pc], size);
824
put2byte(pAddr, brk);
826
assert( brk>=cellOffset+2*nCell );
827
put2byte(&data[hdr+5], brk);
831
addr = cellOffset+2*nCell;
832
memset(&data[addr], 0, brk-addr);
838
** Allocate nByte bytes of space on a page.
840
** Return the index into pPage->aData[] of the first byte of
841
** the new allocation. Or return 0 if there is not enough free
842
** space on the page to satisfy the allocation request.
844
** If the page contains nBytes of free space but does not contain
845
** nBytes of contiguous free space, then this routine automatically
846
** calls defragementPage() to consolidate all free space before
847
** allocating the new chunk.
849
static int allocateSpace(MemPage *pPage, int nByte){
859
assert( sqlite3pager_iswriteable(data) );
860
assert( pPage->pBt );
861
if( nByte<4 ) nByte = 4;
862
if( pPage->nFree<nByte || pPage->nOverflow>0 ) return 0;
863
pPage->nFree -= nByte;
864
hdr = pPage->hdrOffset;
868
/* Search the freelist looking for a slot big enough to satisfy the
871
while( (pc = get2byte(&data[addr]))>0 ){
872
size = get2byte(&data[pc+2]);
875
memcpy(&data[addr], &data[pc], 2);
876
data[hdr+7] = nFrag + size - nByte;
879
put2byte(&data[pc+2], size-nByte);
880
return pc + size - nByte;
887
/* Allocate memory from the gap in between the cell pointer array
888
** and the cell content area.
890
top = get2byte(&data[hdr+5]);
891
nCell = get2byte(&data[hdr+3]);
892
cellOffset = pPage->cellOffset;
893
if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
894
if( defragmentPage(pPage) ) return 0;
895
top = get2byte(&data[hdr+5]);
898
assert( cellOffset + 2*nCell <= top );
899
put2byte(&data[hdr+5], top);
904
** Return a section of the pPage->aData to the freelist.
905
** The first byte of the new free block is pPage->aDisk[start]
906
** and the size of the block is "size" bytes.
908
** Most of the effort here is involved in coalesing adjacent
909
** free blocks into a single big free block.
911
static void freeSpace(MemPage *pPage, int start, int size){
912
int addr, pbegin, hdr;
913
unsigned char *data = pPage->aData;
915
assert( pPage->pBt!=0 );
916
assert( sqlite3pager_iswriteable(data) );
917
assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
918
assert( (start + size)<=pPage->pBt->usableSize );
919
if( size<4 ) size = 4;
921
/* Add the space back into the linked list of freeblocks */
922
hdr = pPage->hdrOffset;
924
while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
925
assert( pbegin<=pPage->pBt->usableSize-4 );
926
assert( pbegin>addr );
929
assert( pbegin<=pPage->pBt->usableSize-4 );
930
assert( pbegin>addr || pbegin==0 );
931
put2byte(&data[addr], start);
932
put2byte(&data[start], pbegin);
933
put2byte(&data[start+2], size);
934
pPage->nFree += size;
936
/* Coalesce adjacent free blocks */
937
addr = pPage->hdrOffset + 1;
938
while( (pbegin = get2byte(&data[addr]))>0 ){
940
assert( pbegin>addr );
941
assert( pbegin<=pPage->pBt->usableSize-4 );
942
pnext = get2byte(&data[pbegin]);
943
psize = get2byte(&data[pbegin+2]);
944
if( pbegin + psize + 3 >= pnext && pnext>0 ){
945
int frag = pnext - (pbegin+psize);
946
assert( frag<=data[pPage->hdrOffset+7] );
947
data[pPage->hdrOffset+7] -= frag;
948
put2byte(&data[pbegin], get2byte(&data[pnext]));
949
put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
955
/* If the cell content area begins with a freeblock, remove it. */
956
if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
958
pbegin = get2byte(&data[hdr+1]);
959
memcpy(&data[hdr+1], &data[pbegin], 2);
960
top = get2byte(&data[hdr+5]);
961
put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2]));
966
** Decode the flags byte (the first byte of the header) for a page
967
** and initialize fields of the MemPage structure accordingly.
969
static void decodeFlags(MemPage *pPage, int flagByte){
970
Btree *pBt; /* A copy of pPage->pBt */
972
assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
973
pPage->intKey = (flagByte & (PTF_INTKEY|PTF_LEAFDATA))!=0;
974
pPage->zeroData = (flagByte & PTF_ZERODATA)!=0;
975
pPage->leaf = (flagByte & PTF_LEAF)!=0;
976
pPage->childPtrSize = 4*(pPage->leaf==0);
978
if( flagByte & PTF_LEAFDATA ){
980
pPage->maxLocal = pBt->maxLeaf;
981
pPage->minLocal = pBt->minLeaf;
984
pPage->maxLocal = pBt->maxLocal;
985
pPage->minLocal = pBt->minLocal;
987
pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData));
991
** Initialize the auxiliary information for a disk block.
993
** The pParent parameter must be a pointer to the MemPage which
994
** is the parent of the page being initialized. The root of a
995
** BTree has no parent and so for that page, pParent==NULL.
997
** Return SQLITE_OK on success. If we see that the page does
998
** not contain a well-formed database page, then return
999
** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
1000
** guarantee that the page is well-formed. It only shows that
1001
** we failed to detect any corruption.
1003
static int initPage(
1004
MemPage *pPage, /* The page to be initialized */
1005
MemPage *pParent /* The parent. Might be NULL */
1007
int pc; /* Address of a freeblock within pPage->aData[] */
1008
int hdr; /* Offset to beginning of page header */
1009
u8 *data; /* Equal to pPage->aData */
1010
Btree *pBt; /* The main btree structure */
1011
int usableSize; /* Amount of usable space on each page */
1012
int cellOffset; /* Offset from start of page to first cell pointer */
1013
int nFree; /* Number of unused bytes on the page */
1014
int top; /* First byte of the cell content area */
1018
assert( pParent==0 || pParent->pBt==pBt );
1019
assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
1020
assert( pPage->aData == &((unsigned char*)pPage)[-pBt->psAligned] );
1021
if( pPage->pParent!=pParent && (pPage->pParent!=0 || pPage->isInit) ){
1022
/* The parent page should never change unless the file is corrupt */
1023
return SQLITE_CORRUPT; /* bkpt-CORRUPT */
1025
if( pPage->isInit ) return SQLITE_OK;
1026
if( pPage->pParent==0 && pParent!=0 ){
1027
pPage->pParent = pParent;
1028
sqlite3pager_ref(pParent->aData);
1030
hdr = pPage->hdrOffset;
1031
data = pPage->aData;
1032
decodeFlags(pPage, data[hdr]);
1033
pPage->nOverflow = 0;
1034
pPage->idxShift = 0;
1035
usableSize = pBt->usableSize;
1036
pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
1037
top = get2byte(&data[hdr+5]);
1038
pPage->nCell = get2byte(&data[hdr+3]);
1039
if( pPage->nCell>MX_CELL(pBt) ){
1040
/* To many cells for a single page. The page must be corrupt */
1041
return SQLITE_CORRUPT; /* bkpt-CORRUPT */
1043
if( pPage->nCell==0 && pParent!=0 && pParent->pgno!=1 ){
1044
/* All pages must have at least one cell, except for root pages */
1045
return SQLITE_CORRUPT; /* bkpt-CORRUPT */
1048
/* Compute the total free space on the page */
1049
pc = get2byte(&data[hdr+1]);
1050
nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
1053
if( pc>usableSize-4 ){
1054
/* Free block is off the page */
1055
return SQLITE_CORRUPT; /* bkpt-CORRUPT */
1057
next = get2byte(&data[pc]);
1058
size = get2byte(&data[pc+2]);
1059
if( next>0 && next<=pc+size+3 ){
1060
/* Free blocks must be in accending order */
1061
return SQLITE_CORRUPT; /* bkpt-CORRUPT */
1066
pPage->nFree = nFree;
1067
if( nFree>=usableSize ){
1068
/* Free space cannot exceed total page size */
1069
return SQLITE_CORRUPT; /* bkpt-CORRUPT */
1073
pageIntegrity(pPage);
1078
** Set up a raw page so that it looks like a database page holding
1081
static void zeroPage(MemPage *pPage, int flags){
1082
unsigned char *data = pPage->aData;
1083
Btree *pBt = pPage->pBt;
1084
int hdr = pPage->hdrOffset;
1087
assert( sqlite3pager_pagenumber(data)==pPage->pgno );
1088
assert( &data[pBt->psAligned] == (unsigned char*)pPage );
1089
assert( sqlite3pager_iswriteable(data) );
1090
memset(&data[hdr], 0, pBt->usableSize - hdr);
1092
first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
1093
memset(&data[hdr+1], 0, 4);
1095
put2byte(&data[hdr+5], pBt->usableSize);
1096
pPage->nFree = pBt->usableSize - first;
1097
decodeFlags(pPage, flags);
1098
pPage->hdrOffset = hdr;
1099
pPage->cellOffset = first;
1100
pPage->nOverflow = 0;
1101
pPage->idxShift = 0;
1104
pageIntegrity(pPage);
1108
** Get a page from the pager. Initialize the MemPage.pBt and
1109
** MemPage.aData elements if needed.
1111
static int getPage(Btree *pBt, Pgno pgno, MemPage **ppPage){
1113
unsigned char *aData;
1115
rc = sqlite3pager_get(pBt->pPager, pgno, (void**)&aData);
1117
pPage = (MemPage*)&aData[pBt->psAligned];
1118
pPage->aData = aData;
1121
pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1127
** Get a page from the pager and initialize it. This routine
1128
** is just a convenience wrapper around separate calls to
1129
** getPage() and initPage().
1131
static int getAndInitPage(
1132
Btree *pBt, /* The database file */
1133
Pgno pgno, /* Number of the page to get */
1134
MemPage **ppPage, /* Write the page pointer here */
1135
MemPage *pParent /* Parent of the page */
1139
return SQLITE_CORRUPT; /* bkpt-CORRUPT */
1141
rc = getPage(pBt, pgno, ppPage);
1142
if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){
1143
rc = initPage(*ppPage, pParent);
1149
** Release a MemPage. This should be called once for each prior
1152
static void releasePage(MemPage *pPage){
1154
assert( pPage->aData );
1155
assert( pPage->pBt );
1156
assert( &pPage->aData[pPage->pBt->psAligned]==(unsigned char*)pPage );
1157
sqlite3pager_unref(pPage->aData);
1162
** This routine is called when the reference count for a page
1163
** reaches zero. We need to unref the pParent pointer when that
1166
static void pageDestructor(void *pData, int pageSize){
1167
MemPage *pPage = (MemPage*)&((char*)pData)[FORCE_ALIGNMENT(pageSize)];
1168
if( pPage->pParent ){
1169
MemPage *pParent = pPage->pParent;
1171
releasePage(pParent);
1177
** During a rollback, when the pager reloads information into the cache
1178
** so that the cache is restored to its original state at the start of
1179
** the transaction, for each page restored this routine is called.
1181
** This routine needs to reset the extra data section at the end of the
1182
** page to agree with the restored data.
1184
static void pageReinit(void *pData, int pageSize){
1185
MemPage *pPage = (MemPage*)&((char*)pData)[FORCE_ALIGNMENT(pageSize)];
1186
if( pPage->isInit ){
1188
initPage(pPage, pPage->pParent);
1193
** Open a database file.
1195
** zFilename is the name of the database file. If zFilename is NULL
1196
** a new database with a random name is created. This randomly named
1197
** database file will be deleted when sqlite3BtreeClose() is called.
1199
int sqlite3BtreeOpen(
1200
const char *zFilename, /* Name of the file containing the BTree database */
1201
Btree **ppBtree, /* Pointer to new Btree object written here */
1202
int flags /* Options */
1207
unsigned char zDbHeader[100];
1210
** The following asserts make sure that structures used by the btree are
1211
** the right size. This is to guard against size changes that result
1212
** when compiling on a different architecture.
1214
assert( sizeof(i64)==8 );
1215
assert( sizeof(u64)==8 );
1216
assert( sizeof(u32)==4 );
1217
assert( sizeof(u16)==2 );
1218
assert( sizeof(Pgno)==4 );
1219
assert( sizeof(ptr)==sizeof(char*) );
1220
assert( sizeof(uptr)==sizeof(ptr) );
1222
pBt = sqliteMalloc( sizeof(*pBt) );
1225
return SQLITE_NOMEM;
1227
rc = sqlite3pager_open(&pBt->pPager, zFilename, EXTRA_SIZE, flags);
1228
if( rc!=SQLITE_OK ){
1229
if( pBt->pPager ) sqlite3pager_close(pBt->pPager);
1234
sqlite3pager_set_destructor(pBt->pPager, pageDestructor);
1235
sqlite3pager_set_reiniter(pBt->pPager, pageReinit);
1238
pBt->readOnly = sqlite3pager_isreadonly(pBt->pPager);
1239
sqlite3pager_read_fileheader(pBt->pPager, sizeof(zDbHeader), zDbHeader);
1240
pBt->pageSize = get2byte(&zDbHeader[16]);
1241
if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE ){
1242
pBt->pageSize = SQLITE_DEFAULT_PAGE_SIZE;
1243
pBt->maxEmbedFrac = 64; /* 25% */
1244
pBt->minEmbedFrac = 32; /* 12.5% */
1245
pBt->minLeafFrac = 32; /* 12.5% */
1246
#ifndef SQLITE_OMIT_AUTOVACUUM
1247
/* If the magic name ":memory:" will create an in-memory database, then
1248
** do not set the auto-vacuum flag, even if SQLITE_DEFAULT_AUTOVACUUM
1249
** is true. On the other hand, if SQLITE_OMIT_MEMORYDB has been defined,
1250
** then ":memory:" is just a regular file-name. Respect the auto-vacuum
1251
** default in this case.
1253
#ifndef SQLITE_OMIT_MEMORYDB
1254
if( zFilename && strcmp(zFilename,":memory:") ){
1258
pBt->autoVacuum = SQLITE_DEFAULT_AUTOVACUUM;
1263
nReserve = zDbHeader[20];
1264
pBt->maxEmbedFrac = zDbHeader[21];
1265
pBt->minEmbedFrac = zDbHeader[22];
1266
pBt->minLeafFrac = zDbHeader[23];
1267
pBt->pageSizeFixed = 1;
1268
#ifndef SQLITE_OMIT_AUTOVACUUM
1269
pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1272
pBt->usableSize = pBt->pageSize - nReserve;
1273
pBt->psAligned = FORCE_ALIGNMENT(pBt->pageSize);
1274
sqlite3pager_set_pagesize(pBt->pPager, pBt->pageSize);
1280
** Close an open database and invalidate all cursors.
1282
int sqlite3BtreeClose(Btree *pBt){
1283
while( pBt->pCursor ){
1284
sqlite3BtreeCloseCursor(pBt->pCursor);
1286
sqlite3pager_close(pBt->pPager);
1292
** Change the busy handler callback function.
1294
int sqlite3BtreeSetBusyHandler(Btree *pBt, BusyHandler *pHandler){
1295
pBt->pBusyHandler = pHandler;
1296
sqlite3pager_set_busyhandler(pBt->pPager, pHandler);
1301
** Change the limit on the number of pages allowed in the cache.
1303
** The maximum number of cache pages is set to the absolute
1304
** value of mxPage. If mxPage is negative, the pager will
1305
** operate asynchronously - it will not stop to do fsync()s
1306
** to insure data is written to the disk surface before
1307
** continuing. Transactions still work if synchronous is off,
1308
** and the database cannot be corrupted if this program
1309
** crashes. But if the operating system crashes or there is
1310
** an abrupt power failure when synchronous is off, the database
1311
** could be left in an inconsistent and unrecoverable state.
1312
** Synchronous is on by default so database corruption is not
1313
** normally a worry.
1315
int sqlite3BtreeSetCacheSize(Btree *pBt, int mxPage){
1316
sqlite3pager_set_cachesize(pBt->pPager, mxPage);
1321
** Change the way data is synced to disk in order to increase or decrease
1322
** how well the database resists damage due to OS crashes and power
1323
** failures. Level 1 is the same as asynchronous (no syncs() occur and
1324
** there is a high probability of damage) Level 2 is the default. There
1325
** is a very low but non-zero probability of damage. Level 3 reduces the
1326
** probability of damage to near zero but with a write performance reduction.
1328
#ifndef SQLITE_OMIT_PAGER_PRAGMAS
1329
int sqlite3BtreeSetSafetyLevel(Btree *pBt, int level){
1330
sqlite3pager_set_safety_level(pBt->pPager, level);
1335
#if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
1337
** Change the default pages size and the number of reserved bytes per page.
1339
** The page size must be a power of 2 between 512 and 65536. If the page
1340
** size supplied does not meet this constraint then the page size is not
1343
** Page sizes are constrained to be a power of two so that the region
1344
** of the database file used for locking (beginning at PENDING_BYTE,
1345
** the first byte past the 1GB boundary, 0x40000000) needs to occur
1346
** at the beginning of a page.
1348
** If parameter nReserve is less than zero, then the number of reserved
1349
** bytes per page is left unchanged.
1351
int sqlite3BtreeSetPageSize(Btree *pBt, int pageSize, int nReserve){
1352
if( pBt->pageSizeFixed ){
1353
return SQLITE_READONLY;
1356
nReserve = pBt->pageSize - pBt->usableSize;
1358
if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
1359
((pageSize-1)&pageSize)==0 ){
1360
pBt->pageSize = pageSize;
1361
pBt->psAligned = FORCE_ALIGNMENT(pageSize);
1362
sqlite3pager_set_pagesize(pBt->pPager, pageSize);
1364
pBt->usableSize = pBt->pageSize - nReserve;
1369
** Return the currently defined page size
1371
int sqlite3BtreeGetPageSize(Btree *pBt){
1372
return pBt->pageSize;
1374
int sqlite3BtreeGetReserve(Btree *pBt){
1375
return pBt->pageSize - pBt->usableSize;
1377
#endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
1380
** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
1381
** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
1382
** is disabled. The default value for the auto-vacuum property is
1383
** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
1385
int sqlite3BtreeSetAutoVacuum(Btree *pBt, int autoVacuum){
1386
#ifdef SQLITE_OMIT_AUTOVACUUM
1387
return SQLITE_READONLY;
1389
if( pBt->pageSizeFixed ){
1390
return SQLITE_READONLY;
1392
pBt->autoVacuum = (autoVacuum?1:0);
1398
** Return the value of the 'auto-vacuum' property. If auto-vacuum is
1399
** enabled 1 is returned. Otherwise 0.
1401
int sqlite3BtreeGetAutoVacuum(Btree *pBt){
1402
#ifdef SQLITE_OMIT_AUTOVACUUM
1405
return pBt->autoVacuum;
1411
** Get a reference to pPage1 of the database file. This will
1412
** also acquire a readlock on that file.
1414
** SQLITE_OK is returned on success. If the file is not a
1415
** well-formed database file, then SQLITE_CORRUPT is returned.
1416
** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
1417
** is returned if we run out of memory. SQLITE_PROTOCOL is returned
1418
** if there is a locking protocol violation.
1420
static int lockBtree(Btree *pBt){
1423
if( pBt->pPage1 ) return SQLITE_OK;
1424
rc = getPage(pBt, 1, &pPage1);
1425
if( rc!=SQLITE_OK ) return rc;
1428
/* Do some checking to help insure the file we opened really is
1429
** a valid database file.
1432
if( sqlite3pager_pagecount(pBt->pPager)>0 ){
1433
u8 *page1 = pPage1->aData;
1434
if( memcmp(page1, zMagicHeader, 16)!=0 ){
1435
goto page1_init_failed;
1437
if( page1[18]>1 || page1[19]>1 ){
1438
goto page1_init_failed;
1440
pBt->pageSize = get2byte(&page1[16]);
1441
pBt->usableSize = pBt->pageSize - page1[20];
1442
if( pBt->usableSize<500 ){
1443
goto page1_init_failed;
1445
pBt->psAligned = FORCE_ALIGNMENT(pBt->pageSize);
1446
pBt->maxEmbedFrac = page1[21];
1447
pBt->minEmbedFrac = page1[22];
1448
pBt->minLeafFrac = page1[23];
1449
#ifndef SQLITE_OMIT_AUTOVACUUM
1450
pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
1454
/* maxLocal is the maximum amount of payload to store locally for
1455
** a cell. Make sure it is small enough so that at least minFanout
1456
** cells can will fit on one page. We assume a 10-byte page header.
1457
** Besides the payload, the cell must store:
1458
** 2-byte pointer to the cell
1459
** 4-byte child pointer
1460
** 9-byte nKey value
1461
** 4-byte nData value
1462
** 4-byte overflow page pointer
1463
** So a cell consists of a 2-byte poiner, a header which is as much as
1464
** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
1467
pBt->maxLocal = (pBt->usableSize-12)*pBt->maxEmbedFrac/255 - 23;
1468
pBt->minLocal = (pBt->usableSize-12)*pBt->minEmbedFrac/255 - 23;
1469
pBt->maxLeaf = pBt->usableSize - 35;
1470
pBt->minLeaf = (pBt->usableSize-12)*pBt->minLeafFrac/255 - 23;
1471
if( pBt->minLocal>pBt->maxLocal || pBt->maxLocal<0 ){
1472
goto page1_init_failed;
1474
assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
1475
pBt->pPage1 = pPage1;
1479
releasePage(pPage1);
1485
** This routine works like lockBtree() except that it also invokes the
1486
** busy callback if there is lock contention.
1488
static int lockBtreeWithRetry(Btree *pBt){
1490
if( pBt->inTrans==TRANS_NONE ){
1491
rc = sqlite3BtreeBeginTrans(pBt, 0);
1492
pBt->inTrans = TRANS_NONE;
1499
** If there are no outstanding cursors and we are not in the middle
1500
** of a transaction but there is a read lock on the database, then
1501
** this routine unrefs the first page of the database file which
1502
** has the effect of releasing the read lock.
1504
** If there are any outstanding cursors, this routine is a no-op.
1506
** If there is a transaction in progress, this routine is a no-op.
1508
static void unlockBtreeIfUnused(Btree *pBt){
1509
if( pBt->inTrans==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
1510
if( pBt->pPage1->aData==0 ){
1511
MemPage *pPage = pBt->pPage1;
1512
pPage->aData = &((char*)pPage)[-pBt->psAligned];
1516
releasePage(pBt->pPage1);
1523
** Create a new database by initializing the first page of the
1526
static int newDatabase(Btree *pBt){
1528
unsigned char *data;
1530
if( sqlite3pager_pagecount(pBt->pPager)>0 ) return SQLITE_OK;
1534
rc = sqlite3pager_write(data);
1536
memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1537
assert( sizeof(zMagicHeader)==16 );
1538
put2byte(&data[16], pBt->pageSize);
1541
data[20] = pBt->pageSize - pBt->usableSize;
1542
data[21] = pBt->maxEmbedFrac;
1543
data[22] = pBt->minEmbedFrac;
1544
data[23] = pBt->minLeafFrac;
1545
memset(&data[24], 0, 100-24);
1546
zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
1547
pBt->pageSizeFixed = 1;
1548
#ifndef SQLITE_OMIT_AUTOVACUUM
1549
if( pBt->autoVacuum ){
1550
put4byte(&data[36 + 4*4], 1);
1557
** Attempt to start a new transaction. A write-transaction
1558
** is started if the second argument is nonzero, otherwise a read-
1559
** transaction. If the second argument is 2 or more and exclusive
1560
** transaction is started, meaning that no other process is allowed
1561
** to access the database. A preexisting transaction may not be
1562
** upgraded to exclusive by calling this routine a second time - the
1563
** exclusivity flag only works for a new transaction.
1565
** A write-transaction must be started before attempting any
1566
** changes to the database. None of the following routines
1567
** will work unless a transaction is started first:
1569
** sqlite3BtreeCreateTable()
1570
** sqlite3BtreeCreateIndex()
1571
** sqlite3BtreeClearTable()
1572
** sqlite3BtreeDropTable()
1573
** sqlite3BtreeInsert()
1574
** sqlite3BtreeDelete()
1575
** sqlite3BtreeUpdateMeta()
1577
** If an initial attempt to acquire the lock fails because of lock contention
1578
** and the database was previously unlocked, then invoke the busy handler
1579
** if there is one. But if there was previously a read-lock, do not
1580
** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
1581
** returned when there is already a read-lock in order to avoid a deadlock.
1583
** Suppose there are two processes A and B. A has a read lock and B has
1584
** a reserved lock. B tries to promote to exclusive but is blocked because
1585
** of A's read lock. A tries to promote to reserved but is blocked by B.
1586
** One or the other of the two processes must give way or there can be
1587
** no progress. By returning SQLITE_BUSY and not invoking the busy callback
1588
** when A already has a read lock, we encourage A to give up and let B
1591
int sqlite3BtreeBeginTrans(Btree *pBt, int wrflag){
1596
/* If the btree is already in a write-transaction, or it
1597
** is already in a read-transaction and a read-transaction
1598
** is requested, this is a no-op.
1600
if( pBt->inTrans==TRANS_WRITE || (pBt->inTrans==TRANS_READ && !wrflag) ){
1604
/* Write transactions are not possible on a read-only database */
1605
if( pBt->readOnly && wrflag ){
1606
return SQLITE_READONLY;
1610
if( pBt->pPage1==0 ){
1611
rc = lockBtree(pBt);
1614
if( rc==SQLITE_OK && wrflag ){
1615
rc = sqlite3pager_begin(pBt->pPage1->aData, wrflag>1);
1616
if( rc==SQLITE_OK ){
1617
rc = newDatabase(pBt);
1621
if( rc==SQLITE_OK ){
1622
pBt->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
1623
if( wrflag ) pBt->inStmt = 0;
1625
unlockBtreeIfUnused(pBt);
1627
}while( rc==SQLITE_BUSY && pBt->inTrans==TRANS_NONE &&
1628
(pH = pBt->pBusyHandler)!=0 &&
1629
pH->xFunc && pH->xFunc(pH->pArg, busy++)
1634
#ifndef SQLITE_OMIT_AUTOVACUUM
1637
** Set the pointer-map entries for all children of page pPage. Also, if
1638
** pPage contains cells that point to overflow pages, set the pointer
1639
** map entries for the overflow pages as well.
1641
static int setChildPtrmaps(MemPage *pPage){
1642
int i; /* Counter variable */
1643
int nCell; /* Number of cells in page pPage */
1644
int rc = SQLITE_OK; /* Return code */
1645
Btree *pBt = pPage->pBt;
1646
int isInitOrig = pPage->isInit;
1647
Pgno pgno = pPage->pgno;
1650
nCell = pPage->nCell;
1652
for(i=0; i<nCell; i++){
1653
u8 *pCell = findCell(pPage, i);
1655
rc = ptrmapPutOvflPtr(pPage, pCell);
1656
if( rc!=SQLITE_OK ){
1657
goto set_child_ptrmaps_out;
1661
Pgno childPgno = get4byte(pCell);
1662
rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
1663
if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
1668
Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
1669
rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
1672
set_child_ptrmaps_out:
1673
pPage->isInit = isInitOrig;
1678
** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow
1679
** page, is a pointer to page iFrom. Modify this pointer so that it points to
1680
** iTo. Parameter eType describes the type of pointer to be modified, as
1683
** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
1686
** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
1687
** page pointed to by one of the cells on pPage.
1689
** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
1690
** overflow page in the list.
1692
static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
1693
if( eType==PTRMAP_OVERFLOW2 ){
1694
/* The pointer is always the first 4 bytes of the page in this case. */
1695
if( get4byte(pPage->aData)!=iFrom ){
1696
return SQLITE_CORRUPT;
1698
put4byte(pPage->aData, iTo);
1700
int isInitOrig = pPage->isInit;
1705
nCell = pPage->nCell;
1707
for(i=0; i<nCell; i++){
1708
u8 *pCell = findCell(pPage, i);
1709
if( eType==PTRMAP_OVERFLOW1 ){
1711
parseCellPtr(pPage, pCell, &info);
1712
if( info.iOverflow ){
1713
if( iFrom==get4byte(&pCell[info.iOverflow]) ){
1714
put4byte(&pCell[info.iOverflow], iTo);
1719
if( get4byte(pCell)==iFrom ){
1720
put4byte(pCell, iTo);
1727
if( eType!=PTRMAP_BTREE ||
1728
get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
1729
return SQLITE_CORRUPT;
1731
put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
1734
pPage->isInit = isInitOrig;
1741
** Move the open database page pDbPage to location iFreePage in the
1742
** database. The pDbPage reference remains valid.
1744
static int relocatePage(
1745
Btree *pBt, /* Btree */
1746
MemPage *pDbPage, /* Open page to move */
1747
u8 eType, /* Pointer map 'type' entry for pDbPage */
1748
Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
1749
Pgno iFreePage /* The location to move pDbPage to */
1751
MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
1752
Pgno iDbPage = pDbPage->pgno;
1753
Pager *pPager = pBt->pPager;
1756
assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
1757
eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
1759
/* Move page iDbPage from it's current location to page number iFreePage */
1760
TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
1761
iDbPage, iFreePage, iPtrPage, eType));
1762
rc = sqlite3pager_movepage(pPager, pDbPage->aData, iFreePage);
1763
if( rc!=SQLITE_OK ){
1766
pDbPage->pgno = iFreePage;
1768
/* If pDbPage was a btree-page, then it may have child pages and/or cells
1769
** that point to overflow pages. The pointer map entries for all these
1770
** pages need to be changed.
1772
** If pDbPage is an overflow page, then the first 4 bytes may store a
1773
** pointer to a subsequent overflow page. If this is the case, then
1774
** the pointer map needs to be updated for the subsequent overflow page.
1776
if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
1777
rc = setChildPtrmaps(pDbPage);
1778
if( rc!=SQLITE_OK ){
1782
Pgno nextOvfl = get4byte(pDbPage->aData);
1784
rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
1785
if( rc!=SQLITE_OK ){
1791
/* Fix the database pointer on page iPtrPage that pointed at iDbPage so
1792
** that it points at iFreePage. Also fix the pointer map entry for
1795
if( eType!=PTRMAP_ROOTPAGE ){
1796
rc = getPage(pBt, iPtrPage, &pPtrPage);
1797
if( rc!=SQLITE_OK ){
1800
rc = sqlite3pager_write(pPtrPage->aData);
1801
if( rc!=SQLITE_OK ){
1802
releasePage(pPtrPage);
1805
rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
1806
releasePage(pPtrPage);
1807
if( rc==SQLITE_OK ){
1808
rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
1814
/* Forward declaration required by autoVacuumCommit(). */
1815
static int allocatePage(Btree *, MemPage **, Pgno *, Pgno, u8);
1818
** This routine is called prior to sqlite3pager_commit when a transaction
1819
** is commited for an auto-vacuum database.
1821
static int autoVacuumCommit(Btree *pBt, Pgno *nTrunc){
1822
Pager *pPager = pBt->pPager;
1823
Pgno nFreeList; /* Number of pages remaining on the free-list. */
1824
int nPtrMap; /* Number of pointer-map pages deallocated */
1825
Pgno origSize; /* Pages in the database file */
1826
Pgno finSize; /* Pages in the database file after truncation */
1827
int rc; /* Return code */
1829
int pgsz = pBt->pageSize; /* Page size for this database */
1830
Pgno iDbPage; /* The database page to move */
1831
MemPage *pDbMemPage = 0; /* "" */
1832
Pgno iPtrPage; /* The page that contains a pointer to iDbPage */
1833
Pgno iFreePage; /* The free-list page to move iDbPage to */
1834
MemPage *pFreeMemPage = 0; /* "" */
1837
int nRef = *sqlite3pager_stats(pPager);
1840
assert( pBt->autoVacuum );
1841
if( PTRMAP_ISPAGE(pgsz, sqlite3pager_pagecount(pPager)) ){
1842
return SQLITE_CORRUPT;
1845
/* Figure out how many free-pages are in the database. If there are no
1846
** free pages, then auto-vacuum is a no-op.
1848
nFreeList = get4byte(&pBt->pPage1->aData[36]);
1854
origSize = sqlite3pager_pagecount(pPager);
1855
nPtrMap = (nFreeList-origSize+PTRMAP_PAGENO(pgsz, origSize)+pgsz/5)/(pgsz/5);
1856
finSize = origSize - nFreeList - nPtrMap;
1857
if( origSize>PENDING_BYTE_PAGE(pBt) && finSize<=PENDING_BYTE_PAGE(pBt) ){
1859
if( PTRMAP_ISPAGE(pBt->usableSize, finSize) ){
1863
TRACE(("AUTOVACUUM: Begin (db size %d->%d)\n", origSize, finSize));
1865
/* Variable 'finSize' will be the size of the file in pages after
1866
** the auto-vacuum has completed (the current file size minus the number
1867
** of pages on the free list). Loop through the pages that lie beyond
1868
** this mark, and if they are not already on the free list, move them
1869
** to a free page earlier in the file (somewhere before finSize).
1871
for( iDbPage=finSize+1; iDbPage<=origSize; iDbPage++ ){
1872
/* If iDbPage is a pointer map page, or the pending-byte page, skip it. */
1873
if( PTRMAP_ISPAGE(pgsz, iDbPage) || iDbPage==PENDING_BYTE_PAGE(pBt) ){
1877
rc = ptrmapGet(pBt, iDbPage, &eType, &iPtrPage);
1878
if( rc!=SQLITE_OK ) goto autovacuum_out;
1879
if( eType==PTRMAP_ROOTPAGE ){
1880
rc = SQLITE_CORRUPT;
1881
goto autovacuum_out;
1884
/* If iDbPage is free, do not swap it. */
1885
if( eType==PTRMAP_FREEPAGE ){
1888
rc = getPage(pBt, iDbPage, &pDbMemPage);
1889
if( rc!=SQLITE_OK ) goto autovacuum_out;
1891
/* Find the next page in the free-list that is not already at the end
1892
** of the file. A page can be pulled off the free list using the
1893
** allocatePage() routine.
1897
releasePage(pFreeMemPage);
1900
rc = allocatePage(pBt, &pFreeMemPage, &iFreePage, 0, 0);
1901
if( rc!=SQLITE_OK ){
1902
releasePage(pDbMemPage);
1903
goto autovacuum_out;
1905
assert( iFreePage<=origSize );
1906
}while( iFreePage>finSize );
1907
releasePage(pFreeMemPage);
1910
rc = relocatePage(pBt, pDbMemPage, eType, iPtrPage, iFreePage);
1911
releasePage(pDbMemPage);
1912
if( rc!=SQLITE_OK ) goto autovacuum_out;
1915
/* The entire free-list has been swapped to the end of the file. So
1916
** truncate the database file to finSize pages and consider the
1919
rc = sqlite3pager_write(pBt->pPage1->aData);
1920
if( rc!=SQLITE_OK ) goto autovacuum_out;
1921
put4byte(&pBt->pPage1->aData[32], 0);
1922
put4byte(&pBt->pPage1->aData[36], 0);
1923
if( rc!=SQLITE_OK ) goto autovacuum_out;
1927
assert( nRef==*sqlite3pager_stats(pPager) );
1928
if( rc!=SQLITE_OK ){
1929
sqlite3pager_rollback(pPager);
1936
** Commit the transaction currently in progress.
1938
** This will release the write lock on the database file. If there
1939
** are no active cursors, it also releases the read lock.
1941
int sqlite3BtreeCommit(Btree *pBt){
1943
if( pBt->inTrans==TRANS_WRITE ){
1944
rc = sqlite3pager_commit(pBt->pPager);
1946
pBt->inTrans = TRANS_NONE;
1948
unlockBtreeIfUnused(pBt);
1954
** Return the number of write-cursors open on this handle. This is for use
1955
** in assert() expressions, so it is only compiled if NDEBUG is not
1958
static int countWriteCursors(Btree *pBt){
1961
for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
1962
if( pCur->wrFlag ) r++;
1970
** Print debugging information about all cursors to standard output.
1972
void sqlite3BtreeCursorList(Btree *pBt){
1974
for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
1975
MemPage *pPage = pCur->pPage;
1976
char *zMode = pCur->wrFlag ? "rw" : "ro";
1977
sqlite3DebugPrintf("CURSOR %p rooted at %4d(%s) currently at %d.%d%s\n",
1978
pCur, pCur->pgnoRoot, zMode,
1979
pPage ? pPage->pgno : 0, pCur->idx,
1980
pCur->isValid ? "" : " eof"
1987
** Rollback the transaction in progress. All cursors will be
1988
** invalided by this operation. Any attempt to use a cursor
1989
** that was open at the beginning of this operation will result
1992
** This will release the write lock on the database file. If there
1993
** are no active cursors, it also releases the read lock.
1995
int sqlite3BtreeRollback(Btree *pBt){
1998
if( pBt->inTrans==TRANS_WRITE ){
1999
rc = sqlite3pager_rollback(pBt->pPager);
2000
/* The rollback may have destroyed the pPage1->aData value. So
2001
** call getPage() on page 1 again to make sure pPage1->aData is
2002
** set correctly. */
2003
if( getPage(pBt, 1, &pPage1)==SQLITE_OK ){
2004
releasePage(pPage1);
2006
assert( countWriteCursors(pBt)==0 );
2008
pBt->inTrans = TRANS_NONE;
2010
unlockBtreeIfUnused(pBt);
2015
** Start a statement subtransaction. The subtransaction can
2016
** can be rolled back independently of the main transaction.
2017
** You must start a transaction before starting a subtransaction.
2018
** The subtransaction is ended automatically if the main transaction
2019
** commits or rolls back.
2021
** Only one subtransaction may be active at a time. It is an error to try
2022
** to start a new subtransaction if another subtransaction is already active.
2024
** Statement subtransactions are used around individual SQL statements
2025
** that are contained within a BEGIN...COMMIT block. If a constraint
2026
** error occurs within the statement, the effect of that one statement
2027
** can be rolled back without having to rollback the entire transaction.
2029
int sqlite3BtreeBeginStmt(Btree *pBt){
2031
if( (pBt->inTrans!=TRANS_WRITE) || pBt->inStmt ){
2032
return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2034
rc = pBt->readOnly ? SQLITE_OK : sqlite3pager_stmt_begin(pBt->pPager);
2041
** Commit the statment subtransaction currently in progress. If no
2042
** subtransaction is active, this is a no-op.
2044
int sqlite3BtreeCommitStmt(Btree *pBt){
2046
if( pBt->inStmt && !pBt->readOnly ){
2047
rc = sqlite3pager_stmt_commit(pBt->pPager);
2056
** Rollback the active statement subtransaction. If no subtransaction
2057
** is active this routine is a no-op.
2059
** All cursors will be invalidated by this operation. Any attempt
2060
** to use a cursor that was open at the beginning of this operation
2061
** will result in an error.
2063
int sqlite3BtreeRollbackStmt(Btree *pBt){
2065
if( pBt->inStmt==0 || pBt->readOnly ) return SQLITE_OK;
2066
rc = sqlite3pager_stmt_rollback(pBt->pPager);
2067
assert( countWriteCursors(pBt)==0 );
2073
** Default key comparison function to be used if no comparison function
2074
** is specified on the sqlite3BtreeCursor() call.
2076
static int dfltCompare(
2077
void *NotUsed, /* User data is not used */
2078
int n1, const void *p1, /* First key to compare */
2079
int n2, const void *p2 /* Second key to compare */
2082
c = memcmp(p1, p2, n1<n2 ? n1 : n2);
2090
** Create a new cursor for the BTree whose root is on the page
2091
** iTable. The act of acquiring a cursor gets a read lock on
2092
** the database file.
2094
** If wrFlag==0, then the cursor can only be used for reading.
2095
** If wrFlag==1, then the cursor can be used for reading or for
2096
** writing if other conditions for writing are also met. These
2097
** are the conditions that must be met in order for writing to
2100
** 1: The cursor must have been opened with wrFlag==1
2102
** 2: No other cursors may be open with wrFlag==0 on the same table
2104
** 3: The database must be writable (not on read-only media)
2106
** 4: There must be an active transaction.
2108
** Condition 2 warrants further discussion. If any cursor is opened
2109
** on a table with wrFlag==0, that prevents all other cursors from
2110
** writing to that table. This is a kind of "read-lock". When a cursor
2111
** is opened with wrFlag==0 it is guaranteed that the table will not
2112
** change as long as the cursor is open. This allows the cursor to
2113
** do a sequential scan of the table without having to worry about
2114
** entries being inserted or deleted during the scan. Cursors should
2115
** be opened with wrFlag==0 only if this read-lock property is needed.
2116
** That is to say, cursors should be opened with wrFlag==0 only if they
2117
** intend to use the sqlite3BtreeNext() system call. All other cursors
2118
** should be opened with wrFlag==1 even if they never really intend
2121
** No checking is done to make sure that page iTable really is the
2122
** root page of a b-tree. If it is not, then the cursor acquired
2123
** will not work correctly.
2125
** The comparison function must be logically the same for every cursor
2126
** on a particular table. Changing the comparison function will result
2127
** in incorrect operations. If the comparison function is NULL, a
2128
** default comparison function is used. The comparison function is
2129
** always ignored for INTKEY tables.
2131
int sqlite3BtreeCursor(
2132
Btree *pBt, /* The btree */
2133
int iTable, /* Root page of table to open */
2134
int wrFlag, /* 1 to write. 0 read-only */
2135
int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */
2136
void *pArg, /* First arg to xCompare() */
2137
BtCursor **ppCur /* Write new cursor here */
2144
if( pBt->readOnly ){
2145
return SQLITE_READONLY;
2147
if( checkReadLocks(pBt, iTable, 0) ){
2148
return SQLITE_LOCKED;
2151
if( pBt->pPage1==0 ){
2152
rc = lockBtreeWithRetry(pBt);
2153
if( rc!=SQLITE_OK ){
2157
pCur = sqliteMallocRaw( sizeof(*pCur) );
2160
goto create_cursor_exception;
2162
pCur->pgnoRoot = (Pgno)iTable;
2163
pCur->pPage = 0; /* For exit-handler, in case getAndInitPage() fails. */
2164
if( iTable==1 && sqlite3pager_pagecount(pBt->pPager)==0 ){
2166
goto create_cursor_exception;
2168
rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->pPage, 0);
2169
if( rc!=SQLITE_OK ){
2170
goto create_cursor_exception;
2172
pCur->xCompare = xCmp ? xCmp : dfltCompare;
2175
pCur->wrFlag = wrFlag;
2177
memset(&pCur->info, 0, sizeof(pCur->info));
2178
pCur->pNext = pBt->pCursor;
2180
pCur->pNext->pPrev = pCur;
2183
pBt->pCursor = pCur;
2188
create_cursor_exception:
2190
releasePage(pCur->pPage);
2193
unlockBtreeIfUnused(pBt);
2197
#if 0 /* Not Used */
2199
** Change the value of the comparison function used by a cursor.
2201
void sqlite3BtreeSetCompare(
2202
BtCursor *pCur, /* The cursor to whose comparison function is changed */
2203
int(*xCmp)(void*,int,const void*,int,const void*), /* New comparison func */
2204
void *pArg /* First argument to xCmp() */
2206
pCur->xCompare = xCmp ? xCmp : dfltCompare;
2212
** Close a cursor. The read lock on the database file is released
2213
** when the last cursor is closed.
2215
int sqlite3BtreeCloseCursor(BtCursor *pCur){
2216
Btree *pBt = pCur->pBt;
2218
pCur->pPrev->pNext = pCur->pNext;
2220
pBt->pCursor = pCur->pNext;
2223
pCur->pNext->pPrev = pCur->pPrev;
2225
releasePage(pCur->pPage);
2226
unlockBtreeIfUnused(pBt);
2232
** Make a temporary cursor by filling in the fields of pTempCur.
2233
** The temporary cursor is not on the cursor list for the Btree.
2235
static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){
2236
memcpy(pTempCur, pCur, sizeof(*pCur));
2237
pTempCur->pNext = 0;
2238
pTempCur->pPrev = 0;
2239
if( pTempCur->pPage ){
2240
sqlite3pager_ref(pTempCur->pPage->aData);
2245
** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
2248
static void releaseTempCursor(BtCursor *pCur){
2250
sqlite3pager_unref(pCur->pPage->aData);
2255
** Make sure the BtCursor.info field of the given cursor is valid.
2256
** If it is not already valid, call parseCell() to fill it in.
2258
** BtCursor.info is a cache of the information in the current cell.
2259
** Using this cache reduces the number of calls to parseCell().
2261
static void getCellInfo(BtCursor *pCur){
2262
if( pCur->info.nSize==0 ){
2263
parseCell(pCur->pPage, pCur->idx, &pCur->info);
2267
memset(&info, 0, sizeof(info));
2268
parseCell(pCur->pPage, pCur->idx, &info);
2269
assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
2275
** Set *pSize to the size of the buffer needed to hold the value of
2276
** the key for the current entry. If the cursor is not pointing
2277
** to a valid entry, *pSize is set to 0.
2279
** For a table with the INTKEY flag set, this routine returns the key
2280
** itself, not the number of bytes in the key.
2282
int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
2283
if( !pCur->isValid ){
2287
*pSize = pCur->info.nKey;
2293
** Set *pSize to the number of bytes of data in the entry the
2294
** cursor currently points to. Always return SQLITE_OK.
2295
** Failure is not possible. If the cursor is not currently
2296
** pointing to an entry (which can happen, for example, if
2297
** the database is empty) then *pSize is set to 0.
2299
int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
2300
if( !pCur->isValid ){
2301
/* Not pointing at a valid entry - set *pSize to 0. */
2305
*pSize = pCur->info.nData;
2311
** Read payload information from the entry that the pCur cursor is
2312
** pointing to. Begin reading the payload at "offset" and read
2313
** a total of "amt" bytes. Put the result in zBuf.
2315
** This routine does not make a distinction between key and data.
2316
** It just reads bytes from the payload area. Data might appear
2317
** on the main page or be scattered out on multiple overflow pages.
2319
static int getPayload(
2320
BtCursor *pCur, /* Cursor pointing to entry to read from */
2321
int offset, /* Begin reading this far into payload */
2322
int amt, /* Read this many bytes */
2323
unsigned char *pBuf, /* Write the bytes into this buffer */
2324
int skipKey /* offset begins at data if this is true */
2326
unsigned char *aPayload;
2334
assert( pCur!=0 && pCur->pPage!=0 );
2335
assert( pCur->isValid );
2337
pPage = pCur->pPage;
2338
pageIntegrity(pPage);
2339
assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
2341
aPayload = pCur->info.pCell;
2342
aPayload += pCur->info.nHeader;
2343
if( pPage->intKey ){
2346
nKey = pCur->info.nKey;
2348
assert( offset>=0 );
2352
if( offset+amt > nKey+pCur->info.nData ){
2353
return SQLITE_ERROR;
2355
if( offset<pCur->info.nLocal ){
2357
if( a+offset>pCur->info.nLocal ){
2358
a = pCur->info.nLocal - offset;
2360
memcpy(pBuf, &aPayload[offset], a);
2368
offset -= pCur->info.nLocal;
2370
ovflSize = pBt->usableSize - 4;
2372
nextPage = get4byte(&aPayload[pCur->info.nLocal]);
2373
while( amt>0 && nextPage ){
2374
rc = sqlite3pager_get(pBt->pPager, nextPage, (void**)&aPayload);
2378
nextPage = get4byte(aPayload);
2379
if( offset<ovflSize ){
2381
if( a + offset > ovflSize ){
2382
a = ovflSize - offset;
2384
memcpy(pBuf, &aPayload[offset+4], a);
2391
sqlite3pager_unref(aPayload);
2396
return SQLITE_CORRUPT; /* bkpt-CORRUPT */
2402
** Read part of the key associated with cursor pCur. Exactly
2403
** "amt" bytes will be transfered into pBuf[]. The transfer
2404
** begins at "offset".
2406
** Return SQLITE_OK on success or an error code if anything goes
2407
** wrong. An error is returned if "offset+amt" is larger than
2408
** the available payload.
2410
int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
2411
assert( pCur->isValid );
2412
assert( pCur->pPage!=0 );
2413
if( pCur->pPage->intKey ){
2414
return SQLITE_CORRUPT;
2416
assert( pCur->pPage->intKey==0 );
2417
assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
2418
return getPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
2422
** Read part of the data associated with cursor pCur. Exactly
2423
** "amt" bytes will be transfered into pBuf[]. The transfer
2424
** begins at "offset".
2426
** Return SQLITE_OK on success or an error code if anything goes
2427
** wrong. An error is returned if "offset+amt" is larger than
2428
** the available payload.
2430
int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
2431
assert( pCur->isValid );
2432
assert( pCur->pPage!=0 );
2433
assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
2434
return getPayload(pCur, offset, amt, pBuf, 1);
2438
** Return a pointer to payload information from the entry that the
2439
** pCur cursor is pointing to. The pointer is to the beginning of
2440
** the key if skipKey==0 and it points to the beginning of data if
2441
** skipKey==1. The number of bytes of available key/data is written
2442
** into *pAmt. If *pAmt==0, then the value returned will not be
2445
** This routine is an optimization. It is common for the entire key
2446
** and data to fit on the local page and for there to be no overflow
2447
** pages. When that is so, this routine can be used to access the
2448
** key and data without making a copy. If the key and/or data spills
2449
** onto overflow pages, then getPayload() must be used to reassembly
2450
** the key/data and copy it into a preallocated buffer.
2452
** The pointer returned by this routine looks directly into the cached
2453
** page of the database. The data might change or move the next time
2454
** any btree routine is called.
2456
static const unsigned char *fetchPayload(
2457
BtCursor *pCur, /* Cursor pointing to entry to read from */
2458
int *pAmt, /* Write the number of available bytes here */
2459
int skipKey /* read beginning at data if this is true */
2461
unsigned char *aPayload;
2467
assert( pCur!=0 && pCur->pPage!=0 );
2468
assert( pCur->isValid );
2470
pPage = pCur->pPage;
2471
pageIntegrity(pPage);
2472
assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
2474
aPayload = pCur->info.pCell;
2475
aPayload += pCur->info.nHeader;
2476
if( pPage->intKey ){
2479
nKey = pCur->info.nKey;
2483
nLocal = pCur->info.nLocal - nKey;
2485
nLocal = pCur->info.nLocal;
2496
** For the entry that cursor pCur is point to, return as
2497
** many bytes of the key or data as are available on the local
2498
** b-tree page. Write the number of available bytes into *pAmt.
2500
** The pointer returned is ephemeral. The key/data may move
2501
** or be destroyed on the next call to any Btree routine.
2503
** These routines is used to get quick access to key and data
2504
** in the common case where no overflow pages are used.
2506
const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
2507
return (const void*)fetchPayload(pCur, pAmt, 0);
2509
const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
2510
return (const void*)fetchPayload(pCur, pAmt, 1);
2515
** Move the cursor down to a new child page. The newPgno argument is the
2516
** page number of the child page to move to.
2518
static int moveToChild(BtCursor *pCur, u32 newPgno){
2522
Btree *pBt = pCur->pBt;
2524
assert( pCur->isValid );
2525
rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage);
2527
pageIntegrity(pNewPage);
2528
pNewPage->idxParent = pCur->idx;
2529
pOldPage = pCur->pPage;
2530
pOldPage->idxShift = 0;
2531
releasePage(pOldPage);
2532
pCur->pPage = pNewPage;
2534
pCur->info.nSize = 0;
2535
if( pNewPage->nCell<1 ){
2536
return SQLITE_CORRUPT; /* bkpt-CORRUPT */
2542
** Return true if the page is the virtual root of its table.
2544
** The virtual root page is the root page for most tables. But
2545
** for the table rooted on page 1, sometime the real root page
2546
** is empty except for the right-pointer. In such cases the
2547
** virtual root page is the page that the right-pointer of page
2548
** 1 is pointing to.
2550
static int isRootPage(MemPage *pPage){
2551
MemPage *pParent = pPage->pParent;
2552
if( pParent==0 ) return 1;
2553
if( pParent->pgno>1 ) return 0;
2554
if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1;
2559
** Move the cursor up to the parent page.
2561
** pCur->idx is set to the cell index that contains the pointer
2562
** to the page we are coming from. If we are coming from the
2563
** right-most child page then pCur->idx is set to one more than
2564
** the largest cell index.
2566
static void moveToParent(BtCursor *pCur){
2572
assert( pCur->isValid );
2573
pPage = pCur->pPage;
2575
assert( !isRootPage(pPage) );
2576
pageIntegrity(pPage);
2577
pParent = pPage->pParent;
2578
assert( pParent!=0 );
2579
pageIntegrity(pParent);
2580
idxParent = pPage->idxParent;
2581
sqlite3pager_ref(pParent->aData);
2582
oldPgno = pPage->pgno;
2584
pCur->pPage = pParent;
2585
pCur->info.nSize = 0;
2586
assert( pParent->idxShift==0 );
2587
pCur->idx = idxParent;
2591
** Move the cursor to the root page
2593
static int moveToRoot(BtCursor *pCur){
2596
Btree *pBt = pCur->pBt;
2598
rc = getAndInitPage(pBt, pCur->pgnoRoot, &pRoot, 0);
2603
releasePage(pCur->pPage);
2604
pageIntegrity(pRoot);
2605
pCur->pPage = pRoot;
2607
pCur->info.nSize = 0;
2608
if( pRoot->nCell==0 && !pRoot->leaf ){
2610
assert( pRoot->pgno==1 );
2611
subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
2612
assert( subpage>0 );
2614
rc = moveToChild(pCur, subpage);
2616
pCur->isValid = pCur->pPage->nCell>0;
2621
** Move the cursor down to the left-most leaf entry beneath the
2622
** entry to which it is currently pointing.
2624
static int moveToLeftmost(BtCursor *pCur){
2629
assert( pCur->isValid );
2630
while( !(pPage = pCur->pPage)->leaf ){
2631
assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
2632
pgno = get4byte(findCell(pPage, pCur->idx));
2633
rc = moveToChild(pCur, pgno);
2640
** Move the cursor down to the right-most leaf entry beneath the
2641
** page to which it is currently pointing. Notice the difference
2642
** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
2643
** finds the left-most entry beneath the *entry* whereas moveToRightmost()
2644
** finds the right-most entry beneath the *page*.
2646
static int moveToRightmost(BtCursor *pCur){
2651
assert( pCur->isValid );
2652
while( !(pPage = pCur->pPage)->leaf ){
2653
pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2654
pCur->idx = pPage->nCell;
2655
rc = moveToChild(pCur, pgno);
2658
pCur->idx = pPage->nCell - 1;
2659
pCur->info.nSize = 0;
2663
/* Move the cursor to the first entry in the table. Return SQLITE_OK
2664
** on success. Set *pRes to 0 if the cursor actually points to something
2665
** or set *pRes to 1 if the table is empty.
2667
int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
2669
rc = moveToRoot(pCur);
2671
if( pCur->isValid==0 ){
2672
assert( pCur->pPage->nCell==0 );
2676
assert( pCur->pPage->nCell>0 );
2678
rc = moveToLeftmost(pCur);
2682
/* Move the cursor to the last entry in the table. Return SQLITE_OK
2683
** on success. Set *pRes to 0 if the cursor actually points to something
2684
** or set *pRes to 1 if the table is empty.
2686
int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
2688
rc = moveToRoot(pCur);
2690
if( pCur->isValid==0 ){
2691
assert( pCur->pPage->nCell==0 );
2695
assert( pCur->isValid );
2697
rc = moveToRightmost(pCur);
2701
/* Move the cursor so that it points to an entry near pKey/nKey.
2702
** Return a success code.
2704
** For INTKEY tables, only the nKey parameter is used. pKey is
2705
** ignored. For other tables, nKey is the number of bytes of data
2706
** in nKey. The comparison function specified when the cursor was
2707
** created is used to compare keys.
2709
** If an exact match is not found, then the cursor is always
2710
** left pointing at a leaf page which would hold the entry if it
2711
** were present. The cursor might point to an entry that comes
2712
** before or after the key.
2714
** The result of comparing the key with the entry to which the
2715
** cursor is written to *pRes if pRes!=NULL. The meaning of
2716
** this value is as follows:
2718
** *pRes<0 The cursor is left pointing at an entry that
2719
** is smaller than pKey or if the table is empty
2720
** and the cursor is therefore left point to nothing.
2722
** *pRes==0 The cursor is left pointing at an entry that
2723
** exactly matches pKey.
2725
** *pRes>0 The cursor is left pointing at an entry that
2726
** is larger than pKey.
2728
int sqlite3BtreeMoveto(BtCursor *pCur, const void *pKey, i64 nKey, int *pRes){
2730
rc = moveToRoot(pCur);
2732
assert( pCur->pPage );
2733
assert( pCur->pPage->isInit );
2734
if( pCur->isValid==0 ){
2736
assert( pCur->pPage->nCell==0 );
2742
MemPage *pPage = pCur->pPage;
2743
int c = -1; /* pRes return if table is empty must be -1 */
2745
upr = pPage->nCell-1;
2746
if( !pPage->intKey && pKey==0 ){
2747
return SQLITE_CORRUPT;
2749
pageIntegrity(pPage);
2753
pCur->idx = (lwr+upr)/2;
2754
pCur->info.nSize = 0;
2755
sqlite3BtreeKeySize(pCur, &nCellKey);
2756
if( pPage->intKey ){
2757
if( nCellKey<nKey ){
2759
}else if( nCellKey>nKey ){
2766
pCellKey = (void *)fetchPayload(pCur, &available, 0);
2767
if( available>=nCellKey ){
2768
c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
2770
pCellKey = sqliteMallocRaw( nCellKey );
2771
if( pCellKey==0 ) return SQLITE_NOMEM;
2772
rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
2773
c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
2774
sqliteFree(pCellKey);
2779
if( pPage->leafData && !pPage->leaf ){
2784
if( pRes ) *pRes = 0;
2794
assert( lwr==upr+1 );
2795
assert( pPage->isInit );
2798
}else if( lwr>=pPage->nCell ){
2799
chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2801
chldPg = get4byte(findCell(pPage, lwr));
2804
assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
2805
if( pRes ) *pRes = c;
2809
pCur->info.nSize = 0;
2810
rc = moveToChild(pCur, chldPg);
2819
** Return TRUE if the cursor is not pointing at an entry of the table.
2821
** TRUE will be returned after a call to sqlite3BtreeNext() moves
2822
** past the last entry in the table or sqlite3BtreePrev() moves past
2823
** the first entry. TRUE is also returned if the table is empty.
2825
int sqlite3BtreeEof(BtCursor *pCur){
2826
return pCur->isValid==0;
2830
** Advance the cursor to the next entry in the database. If
2831
** successful then set *pRes=0. If the cursor
2832
** was already pointing to the last entry in the database before
2833
** this routine was called, then set *pRes=1.
2835
int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
2837
MemPage *pPage = pCur->pPage;
2840
if( pCur->isValid==0 ){
2844
assert( pPage->isInit );
2845
assert( pCur->idx<pPage->nCell );
2848
pCur->info.nSize = 0;
2849
if( pCur->idx>=pPage->nCell ){
2851
rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
2853
rc = moveToLeftmost(pCur);
2858
if( isRootPage(pPage) ){
2864
pPage = pCur->pPage;
2865
}while( pCur->idx>=pPage->nCell );
2867
if( pPage->leafData ){
2868
rc = sqlite3BtreeNext(pCur, pRes);
2878
rc = moveToLeftmost(pCur);
2883
** Step the cursor to the back to the previous entry in the database. If
2884
** successful then set *pRes=0. If the cursor
2885
** was already pointing to the first entry in the database before
2886
** this routine was called, then set *pRes=1.
2888
int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
2892
if( pCur->isValid==0 ){
2897
pPage = pCur->pPage;
2898
assert( pPage->isInit );
2899
assert( pCur->idx>=0 );
2901
pgno = get4byte( findCell(pPage, pCur->idx) );
2902
rc = moveToChild(pCur, pgno);
2904
rc = moveToRightmost(pCur);
2906
while( pCur->idx==0 ){
2907
if( isRootPage(pPage) ){
2913
pPage = pCur->pPage;
2916
pCur->info.nSize = 0;
2917
if( pPage->leafData && !pPage->leaf ){
2918
rc = sqlite3BtreePrevious(pCur, pRes);
2928
** Allocate a new page from the database file.
2930
** The new page is marked as dirty. (In other words, sqlite3pager_write()
2931
** has already been called on the new page.) The new page has also
2932
** been referenced and the calling routine is responsible for calling
2933
** sqlite3pager_unref() on the new page when it is done.
2935
** SQLITE_OK is returned on success. Any other return value indicates
2936
** an error. *ppPage and *pPgno are undefined in the event of an error.
2937
** Do not invoke sqlite3pager_unref() on *ppPage if an error is returned.
2939
** If the "nearby" parameter is not 0, then a (feeble) effort is made to
2940
** locate a page close to the page number "nearby". This can be used in an
2941
** attempt to keep related pages close to each other in the database file,
2942
** which in turn can make database access faster.
2944
** If the "exact" parameter is not 0, and the page-number nearby exists
2945
** anywhere on the free-list, then it is guarenteed to be returned. This
2946
** is only used by auto-vacuum databases when allocating a new table.
2948
static int allocatePage(
2957
int n; /* Number of pages on the freelist */
2958
int k; /* Number of leaves on the trunk of the freelist */
2960
pPage1 = pBt->pPage1;
2961
n = get4byte(&pPage1->aData[36]);
2963
/* There are pages on the freelist. Reuse one of those pages. */
2964
MemPage *pTrunk = 0;
2966
MemPage *pPrevTrunk = 0;
2967
u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
2969
/* If the 'exact' parameter was true and a query of the pointer-map
2970
** shows that the page 'nearby' is somewhere on the free-list, then
2971
** the entire-list will be searched for that page.
2973
#ifndef SQLITE_OMIT_AUTOVACUUM
2977
assert( pBt->autoVacuum );
2978
rc = ptrmapGet(pBt, nearby, &eType, 0);
2980
if( eType==PTRMAP_FREEPAGE ){
2987
/* Decrement the free-list count by 1. Set iTrunk to the index of the
2988
** first free-list trunk page. iPrevTrunk is initially 1.
2990
rc = sqlite3pager_write(pPage1->aData);
2992
put4byte(&pPage1->aData[36], n-1);
2994
/* The code within this loop is run only once if the 'searchList' variable
2995
** is not true. Otherwise, it runs once for each trunk-page on the
2996
** free-list until the page 'nearby' is located.
2999
pPrevTrunk = pTrunk;
3001
iTrunk = get4byte(&pPrevTrunk->aData[0]);
3003
iTrunk = get4byte(&pPage1->aData[32]);
3005
rc = getPage(pBt, iTrunk, &pTrunk);
3007
releasePage(pPrevTrunk);
3011
/* TODO: This should move to after the loop? */
3012
rc = sqlite3pager_write(pTrunk->aData);
3014
releasePage(pTrunk);
3015
releasePage(pPrevTrunk);
3019
k = get4byte(&pTrunk->aData[4]);
3020
if( k==0 && !searchList ){
3021
/* The trunk has no leaves and the list is not being searched.
3022
** So extract the trunk page itself and use it as the newly
3023
** allocated page */
3024
assert( pPrevTrunk==0 );
3026
memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
3029
TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
3030
}else if( k>pBt->usableSize/4 - 8 ){
3031
/* Value of k is out of range. Database corruption */
3032
return SQLITE_CORRUPT; /* bkpt-CORRUPT */
3033
#ifndef SQLITE_OMIT_AUTOVACUUM
3034
}else if( searchList && nearby==iTrunk ){
3035
/* The list is being searched and this trunk page is the page
3036
** to allocate, regardless of whether it has leaves.
3038
assert( *pPgno==iTrunk );
3043
memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
3045
memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
3048
/* The trunk page is required by the caller but it contains
3049
** pointers to free-list leaves. The first leaf becomes a trunk
3050
** page in this case.
3053
Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
3054
rc = getPage(pBt, iNewTrunk, &pNewTrunk);
3055
if( rc!=SQLITE_OK ){
3056
releasePage(pTrunk);
3057
releasePage(pPrevTrunk);
3060
rc = sqlite3pager_write(pNewTrunk->aData);
3061
if( rc!=SQLITE_OK ){
3062
releasePage(pNewTrunk);
3063
releasePage(pTrunk);
3064
releasePage(pPrevTrunk);
3067
memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
3068
put4byte(&pNewTrunk->aData[4], k-1);
3069
memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
3071
put4byte(&pPage1->aData[32], iNewTrunk);
3073
put4byte(&pPrevTrunk->aData[0], iNewTrunk);
3075
releasePage(pNewTrunk);
3078
TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
3081
/* Extract a leaf from the trunk */
3084
unsigned char *aData = pTrunk->aData;
3088
dist = get4byte(&aData[8]) - nearby;
3089
if( dist<0 ) dist = -dist;
3091
int d2 = get4byte(&aData[8+i*4]) - nearby;
3092
if( d2<0 ) d2 = -d2;
3102
iPage = get4byte(&aData[8+closest*4]);
3103
if( !searchList || iPage==nearby ){
3105
if( *pPgno>sqlite3pager_pagecount(pBt->pPager) ){
3106
/* Free page off the end of the file */
3107
return SQLITE_CORRUPT; /* bkpt-CORRUPT */
3109
TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
3110
": %d more free pages\n",
3111
*pPgno, closest+1, k, pTrunk->pgno, n-1));
3113
memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
3115
put4byte(&aData[4], k-1);
3116
rc = getPage(pBt, *pPgno, ppPage);
3117
if( rc==SQLITE_OK ){
3118
sqlite3pager_dont_rollback((*ppPage)->aData);
3119
rc = sqlite3pager_write((*ppPage)->aData);
3120
if( rc!=SQLITE_OK ){
3121
releasePage(*ppPage);
3127
releasePage(pPrevTrunk);
3128
}while( searchList );
3129
releasePage(pTrunk);
3131
/* There are no pages on the freelist, so create a new page at the
3132
** end of the file */
3133
*pPgno = sqlite3pager_pagecount(pBt->pPager) + 1;
3135
#ifndef SQLITE_OMIT_AUTOVACUUM
3136
if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt->usableSize, *pPgno) ){
3137
/* If *pPgno refers to a pointer-map page, allocate two new pages
3138
** at the end of the file instead of one. The first allocated page
3139
** becomes a new pointer-map page, the second is used by the caller.
3141
TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
3142
assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
3147
assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
3148
rc = getPage(pBt, *pPgno, ppPage);
3150
rc = sqlite3pager_write((*ppPage)->aData);
3151
if( rc!=SQLITE_OK ){
3152
releasePage(*ppPage);
3154
TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
3157
assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
3162
** Add a page of the database file to the freelist.
3164
** sqlite3pager_unref() is NOT called for pPage.
3166
static int freePage(MemPage *pPage){
3167
Btree *pBt = pPage->pBt;
3168
MemPage *pPage1 = pBt->pPage1;
3171
/* Prepare the page for freeing */
3172
assert( pPage->pgno>1 );
3174
releasePage(pPage->pParent);
3177
/* Increment the free page count on pPage1 */
3178
rc = sqlite3pager_write(pPage1->aData);
3180
n = get4byte(&pPage1->aData[36]);
3181
put4byte(&pPage1->aData[36], n+1);
3183
#ifndef SQLITE_OMIT_AUTOVACUUM
3184
/* If the database supports auto-vacuum, write an entry in the pointer-map
3185
** to indicate that the page is free.
3187
if( pBt->autoVacuum ){
3188
rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
3194
/* This is the first free page */
3195
rc = sqlite3pager_write(pPage->aData);
3197
memset(pPage->aData, 0, 8);
3198
put4byte(&pPage1->aData[32], pPage->pgno);
3199
TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
3201
/* Other free pages already exist. Retrive the first trunk page
3202
** of the freelist and find out how many leaves it has. */
3204
rc = getPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk);
3206
k = get4byte(&pTrunk->aData[4]);
3207
if( k>=pBt->usableSize/4 - 8 ){
3208
/* The trunk is full. Turn the page being freed into a new
3209
** trunk page with no leaves. */
3210
rc = sqlite3pager_write(pPage->aData);
3212
put4byte(pPage->aData, pTrunk->pgno);
3213
put4byte(&pPage->aData[4], 0);
3214
put4byte(&pPage1->aData[32], pPage->pgno);
3215
TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
3216
pPage->pgno, pTrunk->pgno));
3218
/* Add the newly freed page as a leaf on the current trunk */
3219
rc = sqlite3pager_write(pTrunk->aData);
3221
put4byte(&pTrunk->aData[4], k+1);
3222
put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
3223
sqlite3pager_dont_write(pBt->pPager, pPage->pgno);
3224
TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
3226
releasePage(pTrunk);
3232
** Free any overflow pages associated with the given Cell.
3234
static int clearCell(MemPage *pPage, unsigned char *pCell){
3235
Btree *pBt = pPage->pBt;
3240
parseCellPtr(pPage, pCell, &info);
3241
if( info.iOverflow==0 ){
3242
return SQLITE_OK; /* No overflow pages. Return without doing anything */
3244
ovflPgno = get4byte(&pCell[info.iOverflow]);
3245
while( ovflPgno!=0 ){
3247
if( ovflPgno>sqlite3pager_pagecount(pBt->pPager) ){
3248
return SQLITE_CORRUPT;
3250
rc = getPage(pBt, ovflPgno, &pOvfl);
3252
ovflPgno = get4byte(pOvfl->aData);
3253
rc = freePage(pOvfl);
3254
sqlite3pager_unref(pOvfl->aData);
3261
** Create the byte sequence used to represent a cell on page pPage
3262
** and write that byte sequence into pCell[]. Overflow pages are
3263
** allocated and filled in as necessary. The calling procedure
3264
** is responsible for making sure sufficient space has been allocated
3267
** Note that pCell does not necessary need to point to the pPage->aData
3268
** area. pCell might point to some temporary storage. The cell will
3269
** be constructed in this temporary area then copied into pPage->aData
3272
static int fillInCell(
3273
MemPage *pPage, /* The page that contains the cell */
3274
unsigned char *pCell, /* Complete text of the cell */
3275
const void *pKey, i64 nKey, /* The key */
3276
const void *pData,int nData, /* The data */
3277
int *pnSize /* Write cell size here */
3284
MemPage *pToRelease = 0;
3285
unsigned char *pPrior;
3286
unsigned char *pPayload;
3287
Btree *pBt = pPage->pBt;
3292
/* Fill in the header. */
3297
if( pPage->hasData ){
3298
nHeader += putVarint(&pCell[nHeader], nData);
3302
nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
3303
parseCellPtr(pPage, pCell, &info);
3304
assert( info.nHeader==nHeader );
3305
assert( info.nKey==nKey );
3306
assert( info.nData==nData );
3308
/* Fill in the payload */
3310
if( pPage->intKey ){
3319
*pnSize = info.nSize;
3320
spaceLeft = info.nLocal;
3321
pPayload = &pCell[nHeader];
3322
pPrior = &pCell[info.iOverflow];
3324
while( nPayload>0 ){
3326
#ifndef SQLITE_OMIT_AUTOVACUUM
3327
Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
3329
rc = allocatePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
3330
#ifndef SQLITE_OMIT_AUTOVACUUM
3331
/* If the database supports auto-vacuum, and the second or subsequent
3332
** overflow page is being allocated, add an entry to the pointer-map
3333
** for that page now. The entry for the first overflow page will be
3334
** added later, by the insertCell() routine.
3336
if( pBt->autoVacuum && pgnoPtrmap!=0 && rc==SQLITE_OK ){
3337
rc = ptrmapPut(pBt, pgnoOvfl, PTRMAP_OVERFLOW2, pgnoPtrmap);
3341
releasePage(pToRelease);
3342
/* clearCell(pPage, pCell); */
3345
put4byte(pPrior, pgnoOvfl);
3346
releasePage(pToRelease);
3348
pPrior = pOvfl->aData;
3349
put4byte(pPrior, 0);
3350
pPayload = &pOvfl->aData[4];
3351
spaceLeft = pBt->usableSize - 4;
3354
if( n>spaceLeft ) n = spaceLeft;
3355
if( n>nSrc ) n = nSrc;
3356
memcpy(pPayload, pSrc, n);
3367
releasePage(pToRelease);
3372
** Change the MemPage.pParent pointer on the page whose number is
3373
** given in the second argument so that MemPage.pParent holds the
3374
** pointer in the third argument.
3376
static int reparentPage(Btree *pBt, Pgno pgno, MemPage *pNewParent, int idx){
3378
unsigned char *aData;
3380
if( pgno==0 ) return SQLITE_OK;
3381
assert( pBt->pPager!=0 );
3382
aData = sqlite3pager_lookup(pBt->pPager, pgno);
3384
pThis = (MemPage*)&aData[pBt->psAligned];
3385
assert( pThis->aData==aData );
3386
if( pThis->isInit ){
3387
if( pThis->pParent!=pNewParent ){
3388
if( pThis->pParent ) sqlite3pager_unref(pThis->pParent->aData);
3389
pThis->pParent = pNewParent;
3390
if( pNewParent ) sqlite3pager_ref(pNewParent->aData);
3392
pThis->idxParent = idx;
3394
sqlite3pager_unref(aData);
3397
#ifndef SQLITE_OMIT_AUTOVACUUM
3398
if( pBt->autoVacuum ){
3399
return ptrmapPut(pBt, pgno, PTRMAP_BTREE, pNewParent->pgno);
3408
** Change the pParent pointer of all children of pPage to point back
3411
** In other words, for every child of pPage, invoke reparentPage()
3412
** to make sure that each child knows that pPage is its parent.
3414
** This routine gets called after you memcpy() one page into
3417
static int reparentChildPages(MemPage *pPage){
3419
Btree *pBt = pPage->pBt;
3422
if( pPage->leaf ) return SQLITE_OK;
3424
for(i=0; i<pPage->nCell; i++){
3425
u8 *pCell = findCell(pPage, i);
3427
rc = reparentPage(pBt, get4byte(pCell), pPage, i);
3428
if( rc!=SQLITE_OK ) return rc;
3432
rc = reparentPage(pBt, get4byte(&pPage->aData[pPage->hdrOffset+8]),
3434
pPage->idxShift = 0;
3440
** Remove the i-th cell from pPage. This routine effects pPage only.
3441
** The cell content is not freed or deallocated. It is assumed that
3442
** the cell content has been copied someplace else. This routine just
3443
** removes the reference to the cell from pPage.
3445
** "sz" must be the number of bytes in the cell.
3447
static void dropCell(MemPage *pPage, int idx, int sz){
3448
int i; /* Loop counter */
3449
int pc; /* Offset to cell content of cell being deleted */
3450
u8 *data; /* pPage->aData */
3451
u8 *ptr; /* Used to move bytes around within data[] */
3453
assert( idx>=0 && idx<pPage->nCell );
3454
assert( sz==cellSize(pPage, idx) );
3455
assert( sqlite3pager_iswriteable(pPage->aData) );
3456
data = pPage->aData;
3457
ptr = &data[pPage->cellOffset + 2*idx];
3459
assert( pc>10 && pc+sz<=pPage->pBt->usableSize );
3460
freeSpace(pPage, pc, sz);
3461
for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
3466
put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
3468
pPage->idxShift = 1;
3472
** Insert a new cell on pPage at cell index "i". pCell points to the
3473
** content of the cell.
3475
** If the cell content will fit on the page, then put it there. If it
3476
** will not fit, then make a copy of the cell content into pTemp if
3477
** pTemp is not null. Regardless of pTemp, allocate a new entry
3478
** in pPage->aOvfl[] and make it point to the cell content (either
3479
** in pTemp or the original pCell) and also record its index.
3480
** Allocating a new entry in pPage->aCell[] implies that
3481
** pPage->nOverflow is incremented.
3483
** If nSkip is non-zero, then do not copy the first nSkip bytes of the
3484
** cell. The caller will overwrite them after this function returns. If
3485
** nSkip is non-zero, then pCell may not point to an invalid memory location
3486
** (but pCell+nSkip is always valid).
3488
static int insertCell(
3489
MemPage *pPage, /* Page into which we are copying */
3490
int i, /* New cell becomes the i-th cell of the page */
3491
u8 *pCell, /* Content of the new cell */
3492
int sz, /* Bytes of content in pCell */
3493
u8 *pTemp, /* Temp storage space for pCell, if needed */
3494
u8 nSkip /* Do not write the first nSkip bytes of the cell */
3496
int idx; /* Where to write new cell content in data[] */
3497
int j; /* Loop counter */
3498
int top; /* First byte of content for any cell in data[] */
3499
int end; /* First byte past the last cell pointer in data[] */
3500
int ins; /* Index in data[] where new cell pointer is inserted */
3501
int hdr; /* Offset into data[] of the page header */
3502
int cellOffset; /* Address of first cell pointer in data[] */
3503
u8 *data; /* The content of the whole page */
3504
u8 *ptr; /* Used for moving information around in data[] */
3506
assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
3507
assert( sz==cellSizePtr(pPage, pCell) );
3508
assert( sqlite3pager_iswriteable(pPage->aData) );
3509
if( pPage->nOverflow || sz+2>pPage->nFree ){
3511
memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
3514
j = pPage->nOverflow++;
3515
assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
3516
pPage->aOvfl[j].pCell = pCell;
3517
pPage->aOvfl[j].idx = i;
3520
data = pPage->aData;
3521
hdr = pPage->hdrOffset;
3522
top = get2byte(&data[hdr+5]);
3523
cellOffset = pPage->cellOffset;
3524
end = cellOffset + 2*pPage->nCell + 2;
3525
ins = cellOffset + 2*i;
3526
if( end > top - sz ){
3527
int rc = defragmentPage(pPage);
3528
if( rc!=SQLITE_OK ) return rc;
3529
top = get2byte(&data[hdr+5]);
3530
assert( end + sz <= top );
3532
idx = allocateSpace(pPage, sz);
3534
assert( end <= get2byte(&data[hdr+5]) );
3537
memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
3538
for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
3542
put2byte(&data[ins], idx);
3543
put2byte(&data[hdr+3], pPage->nCell);
3544
pPage->idxShift = 1;
3545
pageIntegrity(pPage);
3546
#ifndef SQLITE_OMIT_AUTOVACUUM
3547
if( pPage->pBt->autoVacuum ){
3548
/* The cell may contain a pointer to an overflow page. If so, write
3549
** the entry for the overflow page into the pointer map.
3552
parseCellPtr(pPage, pCell, &info);
3553
if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
3554
Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
3555
int rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
3556
if( rc!=SQLITE_OK ) return rc;
3566
** Add a list of cells to a page. The page should be initially empty.
3567
** The cells are guaranteed to fit on the page.
3569
static void assemblePage(
3570
MemPage *pPage, /* The page to be assemblied */
3571
int nCell, /* The number of cells to add to this page */
3572
u8 **apCell, /* Pointers to cell bodies */
3573
int *aSize /* Sizes of the cells */
3575
int i; /* Loop counter */
3576
int totalSize; /* Total size of all cells */
3577
int hdr; /* Index of page header */
3578
int cellptr; /* Address of next cell pointer */
3579
int cellbody; /* Address of next cell body */
3580
u8 *data; /* Data for the page */
3582
assert( pPage->nOverflow==0 );
3584
for(i=0; i<nCell; i++){
3585
totalSize += aSize[i];
3587
assert( totalSize+2*nCell<=pPage->nFree );
3588
assert( pPage->nCell==0 );
3589
cellptr = pPage->cellOffset;
3590
data = pPage->aData;
3591
hdr = pPage->hdrOffset;
3592
put2byte(&data[hdr+3], nCell);
3593
cellbody = allocateSpace(pPage, totalSize);
3594
assert( cellbody>0 );
3595
assert( pPage->nFree >= 2*nCell );
3596
pPage->nFree -= 2*nCell;
3597
for(i=0; i<nCell; i++){
3598
put2byte(&data[cellptr], cellbody);
3599
memcpy(&data[cellbody], apCell[i], aSize[i]);
3601
cellbody += aSize[i];
3603
assert( cellbody==pPage->pBt->usableSize );
3604
pPage->nCell = nCell;
3608
** The following parameters determine how many adjacent pages get involved
3609
** in a balancing operation. NN is the number of neighbors on either side
3610
** of the page that participate in the balancing operation. NB is the
3611
** total number of pages that participate, including the target page and
3612
** NN neighbors on either side.
3614
** The minimum value of NN is 1 (of course). Increasing NN above 1
3615
** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
3616
** in exchange for a larger degradation in INSERT and UPDATE performance.
3617
** The value of NN appears to give the best results overall.
3619
#define NN 1 /* Number of neighbors on either side of pPage */
3620
#define NB (NN*2+1) /* Total pages involved in the balance */
3622
/* Forward reference */
3623
static int balance(MemPage*, int);
3625
#ifndef SQLITE_OMIT_QUICKBALANCE
3627
** This version of balance() handles the common special case where
3628
** a new entry is being inserted on the extreme right-end of the
3629
** tree, in other words, when the new entry will become the largest
3630
** entry in the tree.
3632
** Instead of trying balance the 3 right-most leaf pages, just add
3633
** a new page to the right-hand side and put the one new entry in
3634
** that page. This leaves the right side of the tree somewhat
3635
** unbalanced. But odds are that we will be inserting new entries
3636
** at the end soon afterwards so the nearly empty page will quickly
3637
** fill up. On average.
3639
** pPage is the leaf page which is the right-most page in the tree.
3640
** pParent is its parent. pPage must have a single overflow entry
3641
** which is also the right-most entry on the page.
3643
static int balance_quick(MemPage *pPage, MemPage *pParent){
3650
Btree *pBt = pPage->pBt;
3651
int parentIdx = pParent->nCell; /* pParent new divider cell index */
3652
int parentSize; /* Size of new divider cell */
3653
u8 parentCell[64]; /* Space for the new divider cell */
3655
/* Allocate a new page. Insert the overflow cell from pPage
3656
** into it. Then remove the overflow cell from pPage.
3658
rc = allocatePage(pBt, &pNew, &pgnoNew, 0, 0);
3659
if( rc!=SQLITE_OK ){
3662
pCell = pPage->aOvfl[0].pCell;
3663
szCell = cellSizePtr(pPage, pCell);
3664
zeroPage(pNew, pPage->aData[0]);
3665
assemblePage(pNew, 1, &pCell, &szCell);
3666
pPage->nOverflow = 0;
3668
/* Set the parent of the newly allocated page to pParent. */
3669
pNew->pParent = pParent;
3670
sqlite3pager_ref(pParent->aData);
3672
/* pPage is currently the right-child of pParent. Change this
3673
** so that the right-child is the new page allocated above and
3674
** pPage is the next-to-right child.
3676
assert( pPage->nCell>0 );
3677
parseCellPtr(pPage, findCell(pPage, pPage->nCell-1), &info);
3678
rc = fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, &parentSize);
3679
if( rc!=SQLITE_OK ){
3682
assert( parentSize<64 );
3683
rc = insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
3684
if( rc!=SQLITE_OK ){
3687
put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
3688
put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
3690
#ifndef SQLITE_OMIT_AUTOVACUUM
3691
/* If this is an auto-vacuum database, update the pointer map
3692
** with entries for the new page, and any pointer from the
3693
** cell on the page to an overflow page.
3695
if( pBt->autoVacuum ){
3696
rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
3697
if( rc!=SQLITE_OK ){
3700
rc = ptrmapPutOvfl(pNew, 0);
3701
if( rc!=SQLITE_OK ){
3707
/* Release the reference to the new page and balance the parent page,
3708
** in case the divider cell inserted caused it to become overfull.
3711
return balance(pParent, 0);
3713
#endif /* SQLITE_OMIT_QUICKBALANCE */
3716
** The ISAUTOVACUUM macro is used within balance_nonroot() to determine
3717
** if the database supports auto-vacuum or not. Because it is used
3718
** within an expression that is an argument to another macro
3719
** (sqliteMallocRaw), it is not possible to use conditional compilation.
3720
** So, this macro is defined instead.
3722
#ifndef SQLITE_OMIT_AUTOVACUUM
3723
#define ISAUTOVACUUM (pBt->autoVacuum)
3725
#define ISAUTOVACUUM 0
3729
** This routine redistributes Cells on pPage and up to NN*2 siblings
3730
** of pPage so that all pages have about the same amount of free space.
3731
** Usually NN siblings on either side of pPage is used in the balancing,
3732
** though more siblings might come from one side if pPage is the first
3733
** or last child of its parent. If pPage has fewer than 2*NN siblings
3734
** (something which can only happen if pPage is the root page or a
3735
** child of root) then all available siblings participate in the balancing.
3737
** The number of siblings of pPage might be increased or decreased by one or
3738
** two in an effort to keep pages nearly full but not over full. The root page
3739
** is special and is allowed to be nearly empty. If pPage is
3740
** the root page, then the depth of the tree might be increased
3741
** or decreased by one, as necessary, to keep the root page from being
3742
** overfull or completely empty.
3744
** Note that when this routine is called, some of the Cells on pPage
3745
** might not actually be stored in pPage->aData[]. This can happen
3746
** if the page is overfull. Part of the job of this routine is to
3747
** make sure all Cells for pPage once again fit in pPage->aData[].
3749
** In the course of balancing the siblings of pPage, the parent of pPage
3750
** might become overfull or underfull. If that happens, then this routine
3751
** is called recursively on the parent.
3753
** If this routine fails for any reason, it might leave the database
3754
** in a corrupted state. So if this routine fails, the database should
3757
static int balance_nonroot(MemPage *pPage){
3758
MemPage *pParent; /* The parent of pPage */
3759
Btree *pBt; /* The whole database */
3760
int nCell = 0; /* Number of cells in apCell[] */
3761
int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
3762
int nOld; /* Number of pages in apOld[] */
3763
int nNew; /* Number of pages in apNew[] */
3764
int nDiv; /* Number of cells in apDiv[] */
3765
int i, j, k; /* Loop counters */
3766
int idx; /* Index of pPage in pParent->aCell[] */
3767
int nxDiv; /* Next divider slot in pParent->aCell[] */
3768
int rc; /* The return code */
3769
int leafCorrection; /* 4 if pPage is a leaf. 0 if not */
3770
int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
3771
int usableSpace; /* Bytes in pPage beyond the header */
3772
int pageFlags; /* Value of pPage->aData[0] */
3773
int subtotal; /* Subtotal of bytes in cells on one page */
3774
int iSpace = 0; /* First unused byte of aSpace[] */
3775
MemPage *apOld[NB]; /* pPage and up to two siblings */
3776
Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
3777
MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
3778
MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
3779
Pgno pgnoNew[NB+2]; /* Page numbers for each page in apNew[] */
3780
int idxDiv[NB]; /* Indices of divider cells in pParent */
3781
u8 *apDiv[NB]; /* Divider cells in pParent */
3782
int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
3783
int szNew[NB+2]; /* Combined size of cells place on i-th page */
3784
u8 **apCell = 0; /* All cells begin balanced */
3785
int *szCell; /* Local size of all cells in apCell[] */
3786
u8 *aCopy[NB]; /* Space for holding data of apCopy[] */
3787
u8 *aSpace; /* Space to hold copies of dividers cells */
3788
#ifndef SQLITE_OMIT_AUTOVACUUM
3793
** Find the parent page.
3795
assert( pPage->isInit );
3796
assert( sqlite3pager_iswriteable(pPage->aData) );
3798
pParent = pPage->pParent;
3799
sqlite3pager_write(pParent->aData);
3801
TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
3803
#ifndef SQLITE_OMIT_QUICKBALANCE
3805
** A special case: If a new entry has just been inserted into a
3806
** table (that is, a btree with integer keys and all data at the leaves)
3807
** an the new entry is the right-most entry in the tree (it has the
3808
** largest key) then use the special balance_quick() routine for
3809
** balancing. balance_quick() is much faster and results in a tighter
3810
** packing of data in the common case.
3815
pPage->nOverflow==1 &&
3816
pPage->aOvfl[0].idx==pPage->nCell &&
3817
pPage->pParent->pgno!=1 &&
3818
get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
3821
** TODO: Check the siblings to the left of pPage. It may be that
3822
** they are not full and no new page is required.
3824
return balance_quick(pPage, pParent);
3829
** Find the cell in the parent page whose left child points back
3830
** to pPage. The "idx" variable is the index of that cell. If pPage
3831
** is the rightmost child of pParent then set idx to pParent->nCell
3833
if( pParent->idxShift ){
3836
assert( pgno==sqlite3pager_pagenumber(pPage->aData) );
3837
for(idx=0; idx<pParent->nCell; idx++){
3838
if( get4byte(findCell(pParent, idx))==pgno ){
3842
assert( idx<pParent->nCell
3843
|| get4byte(&pParent->aData[pParent->hdrOffset+8])==pgno );
3845
idx = pPage->idxParent;
3849
** Initialize variables so that it will be safe to jump
3850
** directly to balance_cleanup at any moment.
3853
sqlite3pager_ref(pParent->aData);
3856
** Find sibling pages to pPage and the cells in pParent that divide
3857
** the siblings. An attempt is made to find NN siblings on either
3858
** side of pPage. More siblings are taken from one side, however, if
3859
** pPage there are fewer than NN siblings on the other side. If pParent
3860
** has NB or fewer children then all children of pParent are taken.
3863
if( nxDiv + NB > pParent->nCell ){
3864
nxDiv = pParent->nCell - NB + 1;
3870
for(i=0, k=nxDiv; i<NB; i++, k++){
3871
if( k<pParent->nCell ){
3873
apDiv[i] = findCell(pParent, k);
3875
assert( !pParent->leaf );
3876
pgnoOld[i] = get4byte(apDiv[i]);
3877
}else if( k==pParent->nCell ){
3878
pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
3882
rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i], pParent);
3883
if( rc ) goto balance_cleanup;
3884
apOld[i]->idxParent = k;
3888
nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
3891
/* Make nMaxCells a multiple of 2 in order to preserve 8-byte
3893
nMaxCells = (nMaxCells + 1)&~1;
3896
** Allocate space for memory structures
3898
apCell = sqliteMallocRaw(
3899
nMaxCells*sizeof(u8*) /* apCell */
3900
+ nMaxCells*sizeof(int) /* szCell */
3901
+ sizeof(MemPage)*NB /* aCopy */
3902
+ pBt->psAligned*(5+NB) /* aSpace */
3903
+ (ISAUTOVACUUM ? nMaxCells : 0) /* aFrom */
3907
goto balance_cleanup;
3909
szCell = (int*)&apCell[nMaxCells];
3910
aCopy[0] = (u8*)&szCell[nMaxCells];
3911
for(i=1; i<NB; i++){
3912
aCopy[i] = &aCopy[i-1][pBt->psAligned+sizeof(MemPage)];
3914
aSpace = &aCopy[NB-1][pBt->psAligned+sizeof(MemPage)];
3915
#ifndef SQLITE_OMIT_AUTOVACUUM
3916
if( pBt->autoVacuum ){
3917
aFrom = &aSpace[5*pBt->psAligned];
3922
** Make copies of the content of pPage and its siblings into aOld[].
3923
** The rest of this function will use data from the copies rather
3924
** that the original pages since the original pages will be in the
3925
** process of being overwritten.
3927
for(i=0; i<nOld; i++){
3928
MemPage *p = apCopy[i] = (MemPage*)&aCopy[i][pBt->psAligned];
3929
p->aData = &((u8*)p)[-pBt->psAligned];
3930
memcpy(p->aData, apOld[i]->aData, pBt->psAligned + sizeof(MemPage));
3931
p->aData = &((u8*)p)[-pBt->psAligned];
3935
** Load pointers to all cells on sibling pages and the divider cells
3936
** into the local apCell[] array. Make copies of the divider cells
3937
** into space obtained form aSpace[] and remove the the divider Cells
3940
** If the siblings are on leaf pages, then the child pointers of the
3941
** divider cells are stripped from the cells before they are copied
3942
** into aSpace[]. In this way, all cells in apCell[] are without
3943
** child pointers. If siblings are not leaves, then all cell in
3944
** apCell[] include child pointers. Either way, all cells in apCell[]
3947
** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
3948
** leafData: 1 if pPage holds key+data and pParent holds only keys.
3951
leafCorrection = pPage->leaf*4;
3952
leafData = pPage->leafData && pPage->leaf;
3953
for(i=0; i<nOld; i++){
3954
MemPage *pOld = apCopy[i];
3955
int limit = pOld->nCell+pOld->nOverflow;
3956
for(j=0; j<limit; j++){
3957
assert( nCell<nMaxCells );
3958
apCell[nCell] = findOverflowCell(pOld, j);
3959
szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
3960
#ifndef SQLITE_OMIT_AUTOVACUUM
3961
if( pBt->autoVacuum ){
3964
for(a=0; a<pOld->nOverflow; a++){
3965
if( pOld->aOvfl[a].pCell==apCell[nCell] ){
3966
aFrom[nCell] = 0xFF;
3975
int sz = cellSizePtr(pParent, apDiv[i]);
3977
/* With the LEAFDATA flag, pParent cells hold only INTKEYs that
3978
** are duplicates of keys on the child pages. We need to remove
3979
** the divider cells from pParent, but the dividers cells are not
3980
** added to apCell[] because they are duplicates of child cells.
3982
dropCell(pParent, nxDiv, sz);
3985
assert( nCell<nMaxCells );
3987
pTemp = &aSpace[iSpace];
3989
assert( iSpace<=pBt->psAligned*5 );
3990
memcpy(pTemp, apDiv[i], sz);
3991
apCell[nCell] = pTemp+leafCorrection;
3992
#ifndef SQLITE_OMIT_AUTOVACUUM
3993
if( pBt->autoVacuum ){
3994
aFrom[nCell] = 0xFF;
3997
dropCell(pParent, nxDiv, sz);
3998
szCell[nCell] -= leafCorrection;
3999
assert( get4byte(pTemp)==pgnoOld[i] );
4001
assert( leafCorrection==0 );
4002
/* The right pointer of the child page pOld becomes the left
4003
** pointer of the divider cell */
4004
memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
4006
assert( leafCorrection==4 );
4014
** Figure out the number of pages needed to hold all nCell cells.
4015
** Store this number in "k". Also compute szNew[] which is the total
4016
** size of all cells on the i-th page and cntNew[] which is the index
4017
** in apCell[] of the cell that divides page i from page i+1.
4018
** cntNew[k] should equal nCell.
4020
** Values computed by this block:
4022
** k: The total number of sibling pages
4023
** szNew[i]: Spaced used on the i-th sibling page.
4024
** cntNew[i]: Index in apCell[] and szCell[] for the first cell to
4025
** the right of the i-th sibling page.
4026
** usableSpace: Number of bytes of space available on each sibling.
4029
usableSpace = pBt->usableSize - 12 + leafCorrection;
4030
for(subtotal=k=i=0; i<nCell; i++){
4031
assert( i<nMaxCells );
4032
subtotal += szCell[i] + 2;
4033
if( subtotal > usableSpace ){
4034
szNew[k] = subtotal - szCell[i];
4036
if( leafData ){ i--; }
4041
szNew[k] = subtotal;
4046
** The packing computed by the previous block is biased toward the siblings
4047
** on the left side. The left siblings are always nearly full, while the
4048
** right-most sibling might be nearly empty. This block of code attempts
4049
** to adjust the packing of siblings to get a better balance.
4051
** This adjustment is more than an optimization. The packing above might
4052
** be so out of balance as to be illegal. For example, the right-most
4053
** sibling might be completely empty. This adjustment is not optional.
4055
for(i=k-1; i>0; i--){
4056
int szRight = szNew[i]; /* Size of sibling on the right */
4057
int szLeft = szNew[i-1]; /* Size of sibling on the left */
4058
int r; /* Index of right-most cell in left sibling */
4059
int d; /* Index of first cell to the left of right sibling */
4061
r = cntNew[i-1] - 1;
4062
d = r + 1 - leafData;
4063
assert( d<nMaxCells );
4064
assert( r<nMaxCells );
4065
while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
4066
szRight += szCell[d] + 2;
4067
szLeft -= szCell[r] + 2;
4069
r = cntNew[i-1] - 1;
4070
d = r + 1 - leafData;
4073
szNew[i-1] = szLeft;
4075
assert( cntNew[0]>0 );
4078
** Allocate k new pages. Reuse old pages where possible.
4080
assert( pPage->pgno>1 );
4081
pageFlags = pPage->aData[0];
4085
pNew = apNew[i] = apOld[i];
4086
pgnoNew[i] = pgnoOld[i];
4088
rc = sqlite3pager_write(pNew->aData);
4089
if( rc ) goto balance_cleanup;
4091
rc = allocatePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
4092
if( rc ) goto balance_cleanup;
4096
zeroPage(pNew, pageFlags);
4099
/* Free any old pages that were not reused as new pages.
4102
rc = freePage(apOld[i]);
4103
if( rc ) goto balance_cleanup;
4104
releasePage(apOld[i]);
4110
** Put the new pages in accending order. This helps to
4111
** keep entries in the disk file in order so that a scan
4112
** of the table is a linear scan through the file. That
4113
** in turn helps the operating system to deliver pages
4114
** from the disk more rapidly.
4116
** An O(n^2) insertion sort algorithm is used, but since
4117
** n is never more than NB (a small constant), that should
4118
** not be a problem.
4120
** When NB==3, this one optimization makes the database
4121
** about 25% faster for large insertions and deletions.
4123
for(i=0; i<k-1; i++){
4124
int minV = pgnoNew[i];
4126
for(j=i+1; j<k; j++){
4127
if( pgnoNew[j]<(unsigned)minV ){
4137
pgnoNew[i] = pgnoNew[minI];
4138
apNew[i] = apNew[minI];
4143
TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
4145
nOld>=2 ? pgnoOld[1] : 0,
4146
nOld>=3 ? pgnoOld[2] : 0,
4147
pgnoNew[0], szNew[0],
4148
nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
4149
nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
4150
nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
4151
nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
4154
** Evenly distribute the data in apCell[] across the new pages.
4155
** Insert divider cells into pParent as necessary.
4158
for(i=0; i<nNew; i++){
4159
/* Assemble the new sibling page. */
4160
MemPage *pNew = apNew[i];
4161
assert( j<nMaxCells );
4162
assert( pNew->pgno==pgnoNew[i] );
4163
assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
4164
assert( pNew->nCell>0 );
4165
assert( pNew->nOverflow==0 );
4167
#ifndef SQLITE_OMIT_AUTOVACUUM
4168
/* If this is an auto-vacuum database, update the pointer map entries
4169
** that point to the siblings that were rearranged. These can be: left
4170
** children of cells, the right-child of the page, or overflow pages
4171
** pointed to by cells.
4173
if( pBt->autoVacuum ){
4174
for(k=j; k<cntNew[i]; k++){
4175
assert( k<nMaxCells );
4176
if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
4177
rc = ptrmapPutOvfl(pNew, k-j);
4178
if( rc!=SQLITE_OK ){
4179
goto balance_cleanup;
4188
/* If the sibling page assembled above was not the right-most sibling,
4189
** insert a divider cell into the parent page.
4191
if( i<nNew-1 && j<nCell ){
4196
assert( j<nMaxCells );
4198
sz = szCell[j] + leafCorrection;
4200
memcpy(&pNew->aData[8], pCell, 4);
4202
}else if( leafData ){
4203
/* If the tree is a leaf-data tree, and the siblings are leaves,
4204
** then there is no divider cell in apCell[]. Instead, the divider
4205
** cell consists of the integer key for the right-most cell of
4206
** the sibling-page assembled above only.
4210
parseCellPtr(pNew, apCell[j], &info);
4211
pCell = &aSpace[iSpace];
4212
fillInCell(pParent, pCell, 0, info.nKey, 0, 0, &sz);
4214
assert( iSpace<=pBt->psAligned*5 );
4218
pTemp = &aSpace[iSpace];
4220
assert( iSpace<=pBt->psAligned*5 );
4222
rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
4223
if( rc!=SQLITE_OK ) goto balance_cleanup;
4224
put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
4225
#ifndef SQLITE_OMIT_AUTOVACUUM
4226
/* If this is an auto-vacuum database, and not a leaf-data tree,
4227
** then update the pointer map with an entry for the overflow page
4228
** that the cell just inserted points to (if any).
4230
if( pBt->autoVacuum && !leafData ){
4231
rc = ptrmapPutOvfl(pParent, nxDiv);
4232
if( rc!=SQLITE_OK ){
4233
goto balance_cleanup;
4242
if( (pageFlags & PTF_LEAF)==0 ){
4243
memcpy(&apNew[nNew-1]->aData[8], &apCopy[nOld-1]->aData[8], 4);
4245
if( nxDiv==pParent->nCell+pParent->nOverflow ){
4246
/* Right-most sibling is the right-most child of pParent */
4247
put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
4249
/* Right-most sibling is the left child of the first entry in pParent
4250
** past the right-most divider entry */
4251
put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
4255
** Reparent children of all cells.
4257
for(i=0; i<nNew; i++){
4258
rc = reparentChildPages(apNew[i]);
4259
if( rc!=SQLITE_OK ) goto balance_cleanup;
4261
rc = reparentChildPages(pParent);
4262
if( rc!=SQLITE_OK ) goto balance_cleanup;
4265
** Balance the parent page. Note that the current page (pPage) might
4266
** have been added to the freelist so it might no longer be initialized.
4267
** But the parent page will always be initialized.
4269
assert( pParent->isInit );
4270
/* assert( pPage->isInit ); // No! pPage might have been added to freelist */
4271
/* pageIntegrity(pPage); // No! pPage might have been added to freelist */
4272
rc = balance(pParent, 0);
4275
** Cleanup before returning.
4279
for(i=0; i<nOld; i++){
4280
releasePage(apOld[i]);
4282
for(i=0; i<nNew; i++){
4283
releasePage(apNew[i]);
4285
releasePage(pParent);
4286
TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
4287
pPage->pgno, nOld, nNew, nCell));
4292
** This routine is called for the root page of a btree when the root
4293
** page contains no cells. This is an opportunity to make the tree
4294
** shallower by one level.
4296
static int balance_shallower(MemPage *pPage){
4297
MemPage *pChild; /* The only child page of pPage */
4298
Pgno pgnoChild; /* Page number for pChild */
4299
int rc = SQLITE_OK; /* Return code from subprocedures */
4300
Btree *pBt; /* The main BTree structure */
4301
int mxCellPerPage; /* Maximum number of cells per page */
4302
u8 **apCell; /* All cells from pages being balanced */
4303
int *szCell; /* Local size of all cells */
4305
assert( pPage->pParent==0 );
4306
assert( pPage->nCell==0 );
4308
mxCellPerPage = MX_CELL(pBt);
4309
apCell = sqliteMallocRaw( mxCellPerPage*(sizeof(u8*)+sizeof(int)) );
4310
if( apCell==0 ) return SQLITE_NOMEM;
4311
szCell = (int*)&apCell[mxCellPerPage];
4313
/* The table is completely empty */
4314
TRACE(("BALANCE: empty table %d\n", pPage->pgno));
4316
/* The root page is empty but has one child. Transfer the
4317
** information from that one child into the root page if it
4318
** will fit. This reduces the depth of the tree by one.
4320
** If the root page is page 1, it has less space available than
4321
** its child (due to the 100 byte header that occurs at the beginning
4322
** of the database fle), so it might not be able to hold all of the
4323
** information currently contained in the child. If this is the
4324
** case, then do not do the transfer. Leave page 1 empty except
4325
** for the right-pointer to the child page. The child page becomes
4326
** the virtual root of the tree.
4328
pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4329
assert( pgnoChild>0 );
4330
assert( pgnoChild<=sqlite3pager_pagecount(pPage->pBt->pPager) );
4331
rc = getPage(pPage->pBt, pgnoChild, &pChild);
4332
if( rc ) goto end_shallow_balance;
4333
if( pPage->pgno==1 ){
4334
rc = initPage(pChild, pPage);
4335
if( rc ) goto end_shallow_balance;
4336
assert( pChild->nOverflow==0 );
4337
if( pChild->nFree>=100 ){
4338
/* The child information will fit on the root page, so do the
4341
zeroPage(pPage, pChild->aData[0]);
4342
for(i=0; i<pChild->nCell; i++){
4343
apCell[i] = findCell(pChild,i);
4344
szCell[i] = cellSizePtr(pChild, apCell[i]);
4346
assemblePage(pPage, pChild->nCell, apCell, szCell);
4347
/* Copy the right-pointer of the child to the parent. */
4348
put4byte(&pPage->aData[pPage->hdrOffset+8],
4349
get4byte(&pChild->aData[pChild->hdrOffset+8]));
4351
TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
4353
/* The child has more information that will fit on the root.
4354
** The tree is already balanced. Do nothing. */
4355
TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
4358
memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
4361
rc = initPage(pPage, 0);
4362
assert( rc==SQLITE_OK );
4364
TRACE(("BALANCE: transfer child %d into root %d\n",
4365
pChild->pgno, pPage->pgno));
4367
rc = reparentChildPages(pPage);
4368
assert( pPage->nOverflow==0 );
4369
#ifndef SQLITE_OMIT_AUTOVACUUM
4370
if( pBt->autoVacuum ){
4372
for(i=0; i<pPage->nCell; i++){
4373
rc = ptrmapPutOvfl(pPage, i);
4374
if( rc!=SQLITE_OK ){
4375
goto end_shallow_balance;
4380
if( rc!=SQLITE_OK ) goto end_shallow_balance;
4381
releasePage(pChild);
4383
end_shallow_balance:
4390
** The root page is overfull
4392
** When this happens, Create a new child page and copy the
4393
** contents of the root into the child. Then make the root
4394
** page an empty page with rightChild pointing to the new
4395
** child. Finally, call balance_internal() on the new child
4396
** to cause it to split.
4398
static int balance_deeper(MemPage *pPage){
4399
int rc; /* Return value from subprocedures */
4400
MemPage *pChild; /* Pointer to a new child page */
4401
Pgno pgnoChild; /* Page number of the new child page */
4402
Btree *pBt; /* The BTree */
4403
int usableSize; /* Total usable size of a page */
4404
u8 *data; /* Content of the parent page */
4405
u8 *cdata; /* Content of the child page */
4406
int hdr; /* Offset to page header in parent */
4407
int brk; /* Offset to content of first cell in parent */
4409
assert( pPage->pParent==0 );
4410
assert( pPage->nOverflow>0 );
4412
rc = allocatePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
4414
assert( sqlite3pager_iswriteable(pChild->aData) );
4415
usableSize = pBt->usableSize;
4416
data = pPage->aData;
4417
hdr = pPage->hdrOffset;
4418
brk = get2byte(&data[hdr+5]);
4419
cdata = pChild->aData;
4420
memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
4421
memcpy(&cdata[brk], &data[brk], usableSize-brk);
4422
assert( pChild->isInit==0 );
4423
rc = initPage(pChild, pPage);
4424
if( rc ) goto balancedeeper_out;
4425
memcpy(pChild->aOvfl, pPage->aOvfl, pPage->nOverflow*sizeof(pPage->aOvfl[0]));
4426
pChild->nOverflow = pPage->nOverflow;
4427
if( pChild->nOverflow ){
4430
assert( pChild->nCell==pPage->nCell );
4431
zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
4432
put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
4433
TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
4434
#ifndef SQLITE_OMIT_AUTOVACUUM
4435
if( pBt->autoVacuum ){
4437
rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
4438
if( rc ) goto balancedeeper_out;
4439
for(i=0; i<pChild->nCell; i++){
4440
rc = ptrmapPutOvfl(pChild, i);
4441
if( rc!=SQLITE_OK ){
4447
rc = balance_nonroot(pChild);
4450
releasePage(pChild);
4455
** Decide if the page pPage needs to be balanced. If balancing is
4456
** required, call the appropriate balancing routine.
4458
static int balance(MemPage *pPage, int insert){
4460
if( pPage->pParent==0 ){
4461
if( pPage->nOverflow>0 ){
4462
rc = balance_deeper(pPage);
4464
if( rc==SQLITE_OK && pPage->nCell==0 ){
4465
rc = balance_shallower(pPage);
4468
if( pPage->nOverflow>0 ||
4469
(!insert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
4470
rc = balance_nonroot(pPage);
4477
** This routine checks all cursors that point to table pgnoRoot.
4478
** If any of those cursors other than pExclude were opened with
4479
** wrFlag==0 then this routine returns SQLITE_LOCKED. If all
4480
** cursors that point to pgnoRoot were opened with wrFlag==1
4481
** then this routine returns SQLITE_OK.
4483
** In addition to checking for read-locks (where a read-lock
4484
** means a cursor opened with wrFlag==0) this routine also moves
4485
** all cursors other than pExclude so that they are pointing to the
4486
** first Cell on root page. This is necessary because an insert
4487
** or delete might change the number of cells on a page or delete
4488
** a page entirely and we do not want to leave any cursors
4489
** pointing to non-existant pages or cells.
4491
static int checkReadLocks(Btree *pBt, Pgno pgnoRoot, BtCursor *pExclude){
4493
for(p=pBt->pCursor; p; p=p->pNext){
4494
if( p->pgnoRoot!=pgnoRoot || p==pExclude ) continue;
4495
if( p->wrFlag==0 ) return SQLITE_LOCKED;
4496
if( p->pPage->pgno!=p->pgnoRoot ){
4504
** Insert a new record into the BTree. The key is given by (pKey,nKey)
4505
** and the data is given by (pData,nData). The cursor is used only to
4506
** define what table the record should be inserted into. The cursor
4507
** is left pointing at a random location.
4509
** For an INTKEY table, only the nKey value of the key is used. pKey is
4510
** ignored. For a ZERODATA table, the pData and nData are both ignored.
4512
int sqlite3BtreeInsert(
4513
BtCursor *pCur, /* Insert data into the table of this cursor */
4514
const void *pKey, i64 nKey, /* The key of the new record */
4515
const void *pData, int nData /* The data of the new record */
4521
Btree *pBt = pCur->pBt;
4522
unsigned char *oldCell;
4523
unsigned char *newCell = 0;
4525
if( pBt->inTrans!=TRANS_WRITE ){
4526
/* Must start a transaction before doing an insert */
4527
return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
4529
assert( !pBt->readOnly );
4530
if( !pCur->wrFlag ){
4531
return SQLITE_PERM; /* Cursor not open for writing */
4533
if( checkReadLocks(pBt, pCur->pgnoRoot, pCur) ){
4534
return SQLITE_LOCKED; /* The table pCur points to has a read lock */
4536
rc = sqlite3BtreeMoveto(pCur, pKey, nKey, &loc);
4538
pPage = pCur->pPage;
4539
assert( pPage->intKey || nKey>=0 );
4540
assert( pPage->leaf || !pPage->leafData );
4541
TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
4542
pCur->pgnoRoot, nKey, nData, pPage->pgno,
4543
loc==0 ? "overwrite" : "new entry"));
4544
assert( pPage->isInit );
4545
rc = sqlite3pager_write(pPage->aData);
4547
newCell = sqliteMallocRaw( MX_CELL_SIZE(pBt) );
4548
if( newCell==0 ) return SQLITE_NOMEM;
4549
rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, &szNew);
4550
if( rc ) goto end_insert;
4551
assert( szNew==cellSizePtr(pPage, newCell) );
4552
assert( szNew<=MX_CELL_SIZE(pBt) );
4553
if( loc==0 && pCur->isValid ){
4555
assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
4556
oldCell = findCell(pPage, pCur->idx);
4558
memcpy(newCell, oldCell, 4);
4560
szOld = cellSizePtr(pPage, oldCell);
4561
rc = clearCell(pPage, oldCell);
4562
if( rc ) goto end_insert;
4563
dropCell(pPage, pCur->idx, szOld);
4564
}else if( loc<0 && pPage->nCell>0 ){
4565
assert( pPage->leaf );
4567
pCur->info.nSize = 0;
4569
assert( pPage->leaf );
4571
rc = insertCell(pPage, pCur->idx, newCell, szNew, 0, 0);
4572
if( rc!=SQLITE_OK ) goto end_insert;
4573
rc = balance(pPage, 1);
4574
/* sqlite3BtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */
4575
/* fflush(stdout); */
4576
if( rc==SQLITE_OK ){
4580
sqliteFree(newCell);
4585
** Delete the entry that the cursor is pointing to. The cursor
4586
** is left pointing at a random location.
4588
int sqlite3BtreeDelete(BtCursor *pCur){
4589
MemPage *pPage = pCur->pPage;
4590
unsigned char *pCell;
4593
Btree *pBt = pCur->pBt;
4595
assert( pPage->isInit );
4596
if( pBt->inTrans!=TRANS_WRITE ){
4597
/* Must start a transaction before doing a delete */
4598
return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
4600
assert( !pBt->readOnly );
4601
if( pCur->idx >= pPage->nCell ){
4602
return SQLITE_ERROR; /* The cursor is not pointing to anything */
4604
if( !pCur->wrFlag ){
4605
return SQLITE_PERM; /* Did not open this cursor for writing */
4607
if( checkReadLocks(pBt, pCur->pgnoRoot, pCur) ){
4608
return SQLITE_LOCKED; /* The table pCur points to has a read lock */
4610
rc = sqlite3pager_write(pPage->aData);
4613
/* Locate the cell within it's page and leave pCell pointing to the
4614
** data. The clearCell() call frees any overflow pages associated with the
4615
** cell. The cell itself is still intact.
4617
pCell = findCell(pPage, pCur->idx);
4619
pgnoChild = get4byte(pCell);
4621
rc = clearCell(pPage, pCell);
4626
** The entry we are about to delete is not a leaf so if we do not
4627
** do something we will leave a hole on an internal page.
4628
** We have to fill the hole by moving in a cell from a leaf. The
4629
** next Cell after the one to be deleted is guaranteed to exist and
4630
** to be a leaf so we can use it.
4633
unsigned char *pNext;
4636
unsigned char *tempCell = 0;
4637
assert( !pPage->leafData );
4638
getTempCursor(pCur, &leafCur);
4639
rc = sqlite3BtreeNext(&leafCur, ¬Used);
4640
if( rc!=SQLITE_OK ){
4641
if( rc!=SQLITE_NOMEM ){
4642
rc = SQLITE_CORRUPT; /* bkpt-CORRUPT */
4645
if( rc==SQLITE_OK ){
4646
rc = sqlite3pager_write(leafCur.pPage->aData);
4648
if( rc==SQLITE_OK ){
4649
TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
4650
pCur->pgnoRoot, pPage->pgno, leafCur.pPage->pgno));
4651
dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell));
4652
pNext = findCell(leafCur.pPage, leafCur.idx);
4653
szNext = cellSizePtr(leafCur.pPage, pNext);
4654
assert( MX_CELL_SIZE(pBt)>=szNext+4 );
4655
tempCell = sqliteMallocRaw( MX_CELL_SIZE(pBt) );
4660
if( rc==SQLITE_OK ){
4661
rc = insertCell(pPage, pCur->idx, pNext-4, szNext+4, tempCell, 0);
4663
if( rc==SQLITE_OK ){
4664
put4byte(findOverflowCell(pPage, pCur->idx), pgnoChild);
4665
rc = balance(pPage, 0);
4667
if( rc==SQLITE_OK ){
4668
dropCell(leafCur.pPage, leafCur.idx, szNext);
4669
rc = balance(leafCur.pPage, 0);
4671
sqliteFree(tempCell);
4672
releaseTempCursor(&leafCur);
4674
TRACE(("DELETE: table=%d delete from leaf %d\n",
4675
pCur->pgnoRoot, pPage->pgno));
4676
dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell));
4677
rc = balance(pPage, 0);
4679
if( rc==SQLITE_OK ){
4686
** Create a new BTree table. Write into *piTable the page
4687
** number for the root page of the new table.
4689
** The type of type is determined by the flags parameter. Only the
4690
** following values of flags are currently in use. Other values for
4691
** flags might not work:
4693
** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys
4694
** BTREE_ZERODATA Used for SQL indices
4696
int sqlite3BtreeCreateTable(Btree *pBt, int *piTable, int flags){
4700
if( pBt->inTrans!=TRANS_WRITE ){
4701
/* Must start a transaction first */
4702
return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
4704
assert( !pBt->readOnly );
4706
/* It is illegal to create a table if any cursors are open on the
4707
** database. This is because in auto-vacuum mode the backend may
4708
** need to move a database page to make room for the new root-page.
4709
** If an open cursor was using the page a problem would occur.
4712
return SQLITE_LOCKED;
4715
#ifdef SQLITE_OMIT_AUTOVACUUM
4716
rc = allocatePage(pBt, &pRoot, &pgnoRoot, 1, 0);
4719
if( pBt->autoVacuum ){
4720
Pgno pgnoMove; /* Move a page here to make room for the root-page */
4721
MemPage *pPageMove; /* The page to move to. */
4723
/* Read the value of meta[3] from the database to determine where the
4724
** root page of the new table should go. meta[3] is the largest root-page
4725
** created so far, so the new root-page is (meta[3]+1).
4727
rc = sqlite3BtreeGetMeta(pBt, 4, &pgnoRoot);
4728
if( rc!=SQLITE_OK ) return rc;
4731
/* The new root-page may not be allocated on a pointer-map page, or the
4732
** PENDING_BYTE page.
4734
if( pgnoRoot==PTRMAP_PAGENO(pBt->usableSize, pgnoRoot) ||
4735
pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
4738
assert( pgnoRoot>=3 );
4740
/* Allocate a page. The page that currently resides at pgnoRoot will
4741
** be moved to the allocated page (unless the allocated page happens
4742
** to reside at pgnoRoot).
4744
rc = allocatePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
4745
if( rc!=SQLITE_OK ){
4749
if( pgnoMove!=pgnoRoot ){
4753
releasePage(pPageMove);
4754
rc = getPage(pBt, pgnoRoot, &pRoot);
4755
if( rc!=SQLITE_OK ){
4758
rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
4759
if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
4763
assert( eType!=PTRMAP_ROOTPAGE );
4764
assert( eType!=PTRMAP_FREEPAGE );
4765
rc = sqlite3pager_write(pRoot->aData);
4766
if( rc!=SQLITE_OK ){
4770
rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove);
4772
if( rc!=SQLITE_OK ){
4775
rc = getPage(pBt, pgnoRoot, &pRoot);
4776
if( rc!=SQLITE_OK ){
4779
rc = sqlite3pager_write(pRoot->aData);
4780
if( rc!=SQLITE_OK ){
4788
/* Update the pointer-map and meta-data with the new root-page number. */
4789
rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
4794
rc = sqlite3BtreeUpdateMeta(pBt, 4, pgnoRoot);
4801
rc = allocatePage(pBt, &pRoot, &pgnoRoot, 1, 0);
4805
assert( sqlite3pager_iswriteable(pRoot->aData) );
4806
zeroPage(pRoot, flags | PTF_LEAF);
4807
sqlite3pager_unref(pRoot->aData);
4808
*piTable = (int)pgnoRoot;
4813
** Erase the given database page and all its children. Return
4814
** the page to the freelist.
4816
static int clearDatabasePage(
4817
Btree *pBt, /* The BTree that contains the table */
4818
Pgno pgno, /* Page number to clear */
4819
MemPage *pParent, /* Parent page. NULL for the root */
4820
int freePageFlag /* Deallocate page if true */
4824
unsigned char *pCell;
4827
if( pgno>sqlite3pager_pagecount(pBt->pPager) ){
4828
return SQLITE_CORRUPT;
4831
rc = getAndInitPage(pBt, pgno, &pPage, pParent);
4832
if( rc ) goto cleardatabasepage_out;
4833
rc = sqlite3pager_write(pPage->aData);
4834
if( rc ) goto cleardatabasepage_out;
4835
for(i=0; i<pPage->nCell; i++){
4836
pCell = findCell(pPage, i);
4838
rc = clearDatabasePage(pBt, get4byte(pCell), pPage->pParent, 1);
4839
if( rc ) goto cleardatabasepage_out;
4841
rc = clearCell(pPage, pCell);
4842
if( rc ) goto cleardatabasepage_out;
4845
rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage->pParent, 1);
4846
if( rc ) goto cleardatabasepage_out;
4849
rc = freePage(pPage);
4851
zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
4854
cleardatabasepage_out:
4860
** Delete all information from a single table in the database. iTable is
4861
** the page number of the root of the table. After this routine returns,
4862
** the root page is empty, but still exists.
4864
** This routine will fail with SQLITE_LOCKED if there are any open
4865
** read cursors on the table. Open write cursors are moved to the
4866
** root of the table.
4868
int sqlite3BtreeClearTable(Btree *pBt, int iTable){
4871
if( pBt->inTrans!=TRANS_WRITE ){
4872
return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
4874
for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
4875
if( pCur->pgnoRoot==(Pgno)iTable ){
4876
if( pCur->wrFlag==0 ) return SQLITE_LOCKED;
4880
rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0);
4882
sqlite3BtreeRollback(pBt);
4888
** Erase all information in a table and add the root of the table to
4889
** the freelist. Except, the root of the principle table (the one on
4890
** page 1) is never added to the freelist.
4892
** This routine will fail with SQLITE_LOCKED if there are any open
4893
** cursors on the table.
4895
** If AUTOVACUUM is enabled and the page at iTable is not the last
4896
** root page in the database file, then the last root page
4897
** in the database file is moved into the slot formerly occupied by
4898
** iTable and that last slot formerly occupied by the last root page
4899
** is added to the freelist instead of iTable. In this say, all
4900
** root pages are kept at the beginning of the database file, which
4901
** is necessary for AUTOVACUUM to work right. *piMoved is set to the
4902
** page number that used to be the last root page in the file before
4903
** the move. If no page gets moved, *piMoved is set to 0.
4904
** The last root page is recorded in meta[3] and the value of
4905
** meta[3] is updated by this procedure.
4907
int sqlite3BtreeDropTable(Btree *pBt, int iTable, int *piMoved){
4911
if( pBt->inTrans!=TRANS_WRITE ){
4912
return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
4915
/* It is illegal to drop a table if any cursors are open on the
4916
** database. This is because in auto-vacuum mode the backend may
4917
** need to move another root-page to fill a gap left by the deleted
4918
** root page. If an open cursor was using this page a problem would
4922
return SQLITE_LOCKED;
4925
rc = getPage(pBt, (Pgno)iTable, &pPage);
4927
rc = sqlite3BtreeClearTable(pBt, iTable);
4936
#ifdef SQLITE_OMIT_AUTOVACUUM
4937
rc = freePage(pPage);
4940
if( pBt->autoVacuum ){
4942
rc = sqlite3BtreeGetMeta(pBt, 4, &maxRootPgno);
4943
if( rc!=SQLITE_OK ){
4948
if( iTable==maxRootPgno ){
4949
/* If the table being dropped is the table with the largest root-page
4950
** number in the database, put the root page on the free list.
4952
rc = freePage(pPage);
4954
if( rc!=SQLITE_OK ){
4958
/* The table being dropped does not have the largest root-page
4959
** number in the database. So move the page that does into the
4960
** gap left by the deleted root-page.
4964
rc = getPage(pBt, maxRootPgno, &pMove);
4965
if( rc!=SQLITE_OK ){
4968
rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable);
4970
if( rc!=SQLITE_OK ){
4973
rc = getPage(pBt, maxRootPgno, &pMove);
4974
if( rc!=SQLITE_OK ){
4977
rc = freePage(pMove);
4979
if( rc!=SQLITE_OK ){
4982
*piMoved = maxRootPgno;
4985
/* Set the new 'max-root-page' value in the database header. This
4986
** is the old value less one, less one more if that happens to
4987
** be a root-page number, less one again if that is the
4988
** PENDING_BYTE_PAGE.
4991
if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
4994
if( maxRootPgno==PTRMAP_PAGENO(pBt->usableSize, maxRootPgno) ){
4997
assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
4999
rc = sqlite3BtreeUpdateMeta(pBt, 4, maxRootPgno);
5001
rc = freePage(pPage);
5006
/* If sqlite3BtreeDropTable was called on page 1. */
5007
zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
5015
** Read the meta-information out of a database file. Meta[0]
5016
** is the number of free pages currently in the database. Meta[1]
5017
** through meta[15] are available for use by higher layers. Meta[0]
5018
** is read-only, the others are read/write.
5020
** The schema layer numbers meta values differently. At the schema
5021
** layer (and the SetCookie and ReadCookie opcodes) the number of
5022
** free pages is not visible. So Cookie[0] is the same as Meta[1].
5024
int sqlite3BtreeGetMeta(Btree *pBt, int idx, u32 *pMeta){
5028
assert( idx>=0 && idx<=15 );
5029
rc = sqlite3pager_get(pBt->pPager, 1, (void**)&pP1);
5031
*pMeta = get4byte(&pP1[36 + idx*4]);
5032
sqlite3pager_unref(pP1);
5034
/* If autovacuumed is disabled in this build but we are trying to
5035
** access an autovacuumed database, then make the database readonly.
5037
#ifdef SQLITE_OMIT_AUTOVACUUM
5038
if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
5045
** Write meta-information back into the database. Meta[0] is
5046
** read-only and may not be written.
5048
int sqlite3BtreeUpdateMeta(Btree *pBt, int idx, u32 iMeta){
5051
assert( idx>=1 && idx<=15 );
5052
if( pBt->inTrans!=TRANS_WRITE ){
5053
return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5055
assert( pBt->pPage1!=0 );
5056
pP1 = pBt->pPage1->aData;
5057
rc = sqlite3pager_write(pP1);
5059
put4byte(&pP1[36 + idx*4], iMeta);
5064
** Return the flag byte at the beginning of the page that the cursor
5065
** is currently pointing to.
5067
int sqlite3BtreeFlags(BtCursor *pCur){
5068
MemPage *pPage = pCur->pPage;
5069
return pPage ? pPage->aData[pPage->hdrOffset] : 0;
5074
** Print a disassembly of the given page on standard output. This routine
5075
** is used for debugging and testing only.
5077
static int btreePageDump(Btree *pBt, int pgno, int recursive, MemPage *pParent){
5086
unsigned char *data;
5088
unsigned char payload[20];
5090
rc = getPage(pBt, (Pgno)pgno, &pPage);
5091
isInit = pPage->isInit;
5092
if( pPage->isInit==0 ){
5093
initPage(pPage, pParent);
5098
hdr = pPage->hdrOffset;
5099
data = pPage->aData;
5101
pPage->intKey = (c & (PTF_INTKEY|PTF_LEAFDATA))!=0;
5102
pPage->zeroData = (c & PTF_ZERODATA)!=0;
5103
pPage->leafData = (c & PTF_LEAFDATA)!=0;
5104
pPage->leaf = (c & PTF_LEAF)!=0;
5105
pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData));
5106
nCell = get2byte(&data[hdr+3]);
5107
sqlite3DebugPrintf("PAGE %d: flags=0x%02x frag=%d parent=%d\n", pgno,
5108
data[hdr], data[hdr+7],
5109
(pPage->isInit && pPage->pParent) ? pPage->pParent->pgno : 0);
5110
assert( hdr == (pgno==1 ? 100 : 0) );
5111
idx = hdr + 12 - pPage->leaf*4;
5112
for(i=0; i<nCell; i++){
5115
unsigned char *pCell;
5119
addr = get2byte(&data[idx + 2*i]);
5120
pCell = &data[addr];
5121
parseCellPtr(pPage, pCell, &info);
5123
sprintf(range,"%d..%d", addr, addr+sz-1);
5127
child = get4byte(pCell);
5130
if( !pPage->intKey ) sz += info.nKey;
5131
if( sz>sizeof(payload)-1 ) sz = sizeof(payload)-1;
5132
memcpy(payload, &pCell[info.nHeader], sz);
5133
for(j=0; j<sz; j++){
5134
if( payload[j]<0x20 || payload[j]>0x7f ) payload[j] = '.';
5138
"cell %2d: i=%-10s chld=%-4d nk=%-4lld nd=%-4d payload=%s\n",
5139
i, range, child, info.nKey, info.nData, payload
5143
sqlite3DebugPrintf("right_child: %d\n", get4byte(&data[hdr+8]));
5147
idx = get2byte(&data[hdr+1]);
5148
while( idx>0 && idx<pPage->pBt->usableSize ){
5149
int sz = get2byte(&data[idx+2]);
5150
sprintf(range,"%d..%d", idx, idx+sz-1);
5152
sqlite3DebugPrintf("freeblock %2d: i=%-10s size=%-4d total=%d\n",
5153
i, range, sz, nFree);
5154
idx = get2byte(&data[idx]);
5158
sqlite3DebugPrintf("ERROR: next freeblock index out of range: %d\n", idx);
5160
if( recursive && !pPage->leaf ){
5161
for(i=0; i<nCell; i++){
5162
unsigned char *pCell = findCell(pPage, i);
5163
btreePageDump(pBt, get4byte(pCell), 1, pPage);
5164
idx = get2byte(pCell);
5166
btreePageDump(pBt, get4byte(&data[hdr+8]), 1, pPage);
5168
pPage->isInit = isInit;
5169
sqlite3pager_unref(data);
5173
int sqlite3BtreePageDump(Btree *pBt, int pgno, int recursive){
5174
return btreePageDump(pBt, pgno, recursive, 0);
5180
** Fill aResult[] with information about the entry and page that the
5181
** cursor is pointing to.
5183
** aResult[0] = The page number
5184
** aResult[1] = The entry number
5185
** aResult[2] = Total number of entries on this page
5186
** aResult[3] = Cell size (local payload + header)
5187
** aResult[4] = Number of free bytes on this page
5188
** aResult[5] = Number of free blocks on the page
5189
** aResult[6] = Total payload size (local + overflow)
5190
** aResult[7] = Header size in bytes
5191
** aResult[8] = Local payload size
5192
** aResult[9] = Parent page number
5194
** This routine is used for testing and debugging only.
5196
int sqlite3BtreeCursorInfo(BtCursor *pCur, int *aResult, int upCnt){
5198
MemPage *pPage = pCur->pPage;
5201
pageIntegrity(pPage);
5202
assert( pPage->isInit );
5203
getTempCursor(pCur, &tmpCur);
5205
moveToParent(&tmpCur);
5207
pPage = tmpCur.pPage;
5208
pageIntegrity(pPage);
5209
aResult[0] = sqlite3pager_pagenumber(pPage->aData);
5210
assert( aResult[0]==pPage->pgno );
5211
aResult[1] = tmpCur.idx;
5212
aResult[2] = pPage->nCell;
5213
if( tmpCur.idx>=0 && tmpCur.idx<pPage->nCell ){
5214
getCellInfo(&tmpCur);
5215
aResult[3] = tmpCur.info.nSize;
5216
aResult[6] = tmpCur.info.nData;
5217
aResult[7] = tmpCur.info.nHeader;
5218
aResult[8] = tmpCur.info.nLocal;
5225
aResult[4] = pPage->nFree;
5227
idx = get2byte(&pPage->aData[pPage->hdrOffset+1]);
5228
while( idx>0 && idx<pPage->pBt->usableSize ){
5230
idx = get2byte(&pPage->aData[idx]);
5233
if( pPage->pParent==0 || isRootPage(pPage) ){
5236
aResult[9] = pPage->pParent->pgno;
5238
releaseTempCursor(&tmpCur);
5244
** Return the pager associated with a BTree. This routine is used for
5245
** testing and debugging only.
5247
Pager *sqlite3BtreePager(Btree *pBt){
5252
** This structure is passed around through all the sanity checking routines
5253
** in order to keep track of some global state information.
5255
typedef struct IntegrityCk IntegrityCk;
5256
struct IntegrityCk {
5257
Btree *pBt; /* The tree being checked out */
5258
Pager *pPager; /* The associated pager. Also accessible by pBt->pPager */
5259
int nPage; /* Number of pages in the database */
5260
int *anRef; /* Number of times each page is referenced */
5261
char *zErrMsg; /* An error message. NULL of no errors seen. */
5264
#ifndef SQLITE_OMIT_INTEGRITY_CHECK
5266
** Append a message to the error message string.
5268
static void checkAppendMsg(
5269
IntegrityCk *pCheck,
5271
const char *zFormat,
5276
va_start(ap, zFormat);
5277
zMsg2 = sqlite3VMPrintf(zFormat, ap);
5279
if( zMsg1==0 ) zMsg1 = "";
5280
if( pCheck->zErrMsg ){
5281
char *zOld = pCheck->zErrMsg;
5282
pCheck->zErrMsg = 0;
5283
sqlite3SetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0);
5286
sqlite3SetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0);
5290
#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
5292
#ifndef SQLITE_OMIT_INTEGRITY_CHECK
5294
** Add 1 to the reference count for page iPage. If this is the second
5295
** reference to the page, add an error message to pCheck->zErrMsg.
5296
** Return 1 if there are 2 ore more references to the page and 0 if
5297
** if this is the first reference to the page.
5299
** Also check that the page number is in bounds.
5301
static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
5302
if( iPage==0 ) return 1;
5303
if( iPage>pCheck->nPage || iPage<0 ){
5304
checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
5307
if( pCheck->anRef[iPage]==1 ){
5308
checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
5311
return (pCheck->anRef[iPage]++)>1;
5314
#ifndef SQLITE_OMIT_AUTOVACUUM
5316
** Check that the entry in the pointer-map for page iChild maps to
5317
** page iParent, pointer type ptrType. If not, append an error message
5320
static void checkPtrmap(
5321
IntegrityCk *pCheck, /* Integrity check context */
5322
Pgno iChild, /* Child page number */
5323
u8 eType, /* Expected pointer map type */
5324
Pgno iParent, /* Expected pointer map parent page number */
5325
char *zContext /* Context description (used for error msg) */
5331
rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
5332
if( rc!=SQLITE_OK ){
5333
checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
5337
if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
5338
checkAppendMsg(pCheck, zContext,
5339
"Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
5340
iChild, eType, iParent, ePtrmapType, iPtrmapParent);
5346
** Check the integrity of the freelist or of an overflow page list.
5347
** Verify that the number of pages on the list is N.
5349
static void checkList(
5350
IntegrityCk *pCheck, /* Integrity checking context */
5351
int isFreeList, /* True for a freelist. False for overflow page list */
5352
int iPage, /* Page number for first page in the list */
5353
int N, /* Expected number of pages in the list */
5354
char *zContext /* Context for error messages */
5360
unsigned char *pOvfl;
5362
checkAppendMsg(pCheck, zContext,
5363
"%d of %d pages missing from overflow list starting at %d",
5364
N+1, expected, iFirst);
5367
if( checkRef(pCheck, iPage, zContext) ) break;
5368
if( sqlite3pager_get(pCheck->pPager, (Pgno)iPage, (void**)&pOvfl) ){
5369
checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
5373
int n = get4byte(&pOvfl[4]);
5374
#ifndef SQLITE_OMIT_AUTOVACUUM
5375
if( pCheck->pBt->autoVacuum ){
5376
checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
5379
if( n>pCheck->pBt->usableSize/4-8 ){
5380
checkAppendMsg(pCheck, zContext,
5381
"freelist leaf count too big on page %d", iPage);
5385
Pgno iFreePage = get4byte(&pOvfl[8+i*4]);
5386
#ifndef SQLITE_OMIT_AUTOVACUUM
5387
if( pCheck->pBt->autoVacuum ){
5388
checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
5391
checkRef(pCheck, iFreePage, zContext);
5396
#ifndef SQLITE_OMIT_AUTOVACUUM
5398
/* If this database supports auto-vacuum and iPage is not the last
5399
** page in this overflow list, check that the pointer-map entry for
5400
** the following page matches iPage.
5402
if( pCheck->pBt->autoVacuum && N>0 ){
5403
i = get4byte(pOvfl);
5404
checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
5408
iPage = get4byte(pOvfl);
5409
sqlite3pager_unref(pOvfl);
5412
#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
5414
#ifndef SQLITE_OMIT_INTEGRITY_CHECK
5416
** Do various sanity checks on a single page of a tree. Return
5417
** the tree depth. Root pages return 0. Parents of root pages
5418
** return 1, and so forth.
5420
** These checks are done:
5422
** 1. Make sure that cells and freeblocks do not overlap
5423
** but combine to completely cover the page.
5424
** NO 2. Make sure cell keys are in order.
5425
** NO 3. Make sure no key is less than or equal to zLowerBound.
5426
** NO 4. Make sure no key is greater than or equal to zUpperBound.
5427
** 5. Check the integrity of overflow pages.
5428
** 6. Recursively call checkTreePage on all children.
5429
** 7. Verify that the depth of all children is the same.
5430
** 8. Make sure this page is at least 33% full or else it is
5431
** the root of the tree.
5433
static int checkTreePage(
5434
IntegrityCk *pCheck, /* Context for the sanity check */
5435
int iPage, /* Page number of the page to check */
5436
MemPage *pParent, /* Parent page */
5437
char *zParentContext, /* Parent context */
5438
char *zLowerBound, /* All keys should be greater than this, if not NULL */
5439
int nLower, /* Number of characters in zLowerBound */
5440
char *zUpperBound, /* All keys should be less than this, if not NULL */
5441
int nUpper /* Number of characters in zUpperBound */
5444
int i, rc, depth, d2, pgno, cnt;
5450
int maxLocal, usableSize;
5454
sprintf(zContext, "Page %d: ", iPage);
5456
/* Check that the page exists
5458
cur.pBt = pBt = pCheck->pBt;
5459
usableSize = pBt->usableSize;
5460
if( iPage==0 ) return 0;
5461
if( checkRef(pCheck, iPage, zParentContext) ) return 0;
5462
if( (rc = getPage(pBt, (Pgno)iPage, &pPage))!=0 ){
5463
checkAppendMsg(pCheck, zContext,
5464
"unable to get the page. error code=%d", rc);
5467
maxLocal = pPage->leafData ? pBt->maxLeaf : pBt->maxLocal;
5468
if( (rc = initPage(pPage, pParent))!=0 ){
5469
checkAppendMsg(pCheck, zContext, "initPage() returns error code %d", rc);
5474
/* Check out all the cells.
5478
for(i=0; i<pPage->nCell; i++){
5483
/* Check payload overflow pages
5485
sprintf(zContext, "On tree page %d cell %d: ", iPage, i);
5486
pCell = findCell(pPage,i);
5487
parseCellPtr(pPage, pCell, &info);
5489
if( !pPage->intKey ) sz += info.nKey;
5490
if( sz>info.nLocal ){
5491
int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
5492
Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
5493
#ifndef SQLITE_OMIT_AUTOVACUUM
5494
if( pBt->autoVacuum ){
5495
checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
5498
checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
5501
/* Check sanity of left child page.
5504
pgno = get4byte(pCell);
5505
#ifndef SQLITE_OMIT_AUTOVACUUM
5506
if( pBt->autoVacuum ){
5507
checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
5510
d2 = checkTreePage(pCheck,pgno,pPage,zContext,0,0,0,0);
5511
if( i>0 && d2!=depth ){
5512
checkAppendMsg(pCheck, zContext, "Child page depth differs");
5518
pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5519
sprintf(zContext, "On page %d at right child: ", iPage);
5520
#ifndef SQLITE_OMIT_AUTOVACUUM
5521
if( pBt->autoVacuum ){
5522
checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
5525
checkTreePage(pCheck, pgno, pPage, zContext,0,0,0,0);
5528
/* Check for complete coverage of the page
5530
data = pPage->aData;
5531
hdr = pPage->hdrOffset;
5532
hit = sqliteMalloc( usableSize );
5534
memset(hit, 1, get2byte(&data[hdr+5]));
5535
nCell = get2byte(&data[hdr+3]);
5536
cellStart = hdr + 12 - 4*pPage->leaf;
5537
for(i=0; i<nCell; i++){
5538
int pc = get2byte(&data[cellStart+i*2]);
5539
int size = cellSizePtr(pPage, &data[pc]);
5541
if( (pc+size-1)>=usableSize || pc<0 ){
5542
checkAppendMsg(pCheck, 0,
5543
"Corruption detected in cell %d on page %d",i,iPage,0);
5545
for(j=pc+size-1; j>=pc; j--) hit[j]++;
5548
for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000;
5550
int size = get2byte(&data[i+2]);
5552
if( (i+size-1)>=usableSize || i<0 ){
5553
checkAppendMsg(pCheck, 0,
5554
"Corruption detected in cell %d on page %d",i,iPage,0);
5556
for(j=i+size-1; j>=i; j--) hit[j]++;
5558
i = get2byte(&data[i]);
5560
for(i=cnt=0; i<usableSize; i++){
5563
}else if( hit[i]>1 ){
5564
checkAppendMsg(pCheck, 0,
5565
"Multiple uses for byte %d of page %d", i, iPage);
5569
if( cnt!=data[hdr+7] ){
5570
checkAppendMsg(pCheck, 0,
5571
"Fragmented space is %d byte reported as %d on page %d",
5572
cnt, data[hdr+7], iPage);
5580
#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
5582
#ifndef SQLITE_OMIT_INTEGRITY_CHECK
5584
** This routine does a complete check of the given BTree file. aRoot[] is
5585
** an array of pages numbers were each page number is the root page of
5586
** a table. nRoot is the number of entries in aRoot.
5588
** If everything checks out, this routine returns NULL. If something is
5589
** amiss, an error message is written into memory obtained from malloc()
5590
** and a pointer to that error message is returned. The calling function
5591
** is responsible for freeing the error message when it is done.
5593
char *sqlite3BtreeIntegrityCheck(Btree *pBt, int *aRoot, int nRoot){
5598
nRef = *sqlite3pager_stats(pBt->pPager);
5599
if( lockBtreeWithRetry(pBt)!=SQLITE_OK ){
5600
return sqliteStrDup("Unable to acquire a read lock on the database");
5603
sCheck.pPager = pBt->pPager;
5604
sCheck.nPage = sqlite3pager_pagecount(sCheck.pPager);
5605
if( sCheck.nPage==0 ){
5606
unlockBtreeIfUnused(pBt);
5609
sCheck.anRef = sqliteMallocRaw( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
5610
if( !sCheck.anRef ){
5611
unlockBtreeIfUnused(pBt);
5612
return sqlite3MPrintf("Unable to malloc %d bytes",
5613
(sCheck.nPage+1)*sizeof(sCheck.anRef[0]));
5615
for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
5616
i = PENDING_BYTE_PAGE(pBt);
5617
if( i<=sCheck.nPage ){
5618
sCheck.anRef[i] = 1;
5622
/* Check the integrity of the freelist
5624
checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
5625
get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
5627
/* Check all the tables.
5629
for(i=0; i<nRoot; i++){
5630
if( aRoot[i]==0 ) continue;
5631
#ifndef SQLITE_OMIT_AUTOVACUUM
5632
if( pBt->autoVacuum && aRoot[i]>1 ){
5633
checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
5636
checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ", 0,0,0,0);
5639
/* Make sure every page in the file is referenced
5641
for(i=1; i<=sCheck.nPage; i++){
5642
#ifdef SQLITE_OMIT_AUTOVACUUM
5643
if( sCheck.anRef[i]==0 ){
5644
checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
5647
/* If the database supports auto-vacuum, make sure no tables contain
5648
** references to pointer-map pages.
5650
if( sCheck.anRef[i]==0 &&
5651
(PTRMAP_PAGENO(pBt->usableSize, i)!=i || !pBt->autoVacuum) ){
5652
checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
5654
if( sCheck.anRef[i]!=0 &&
5655
(PTRMAP_PAGENO(pBt->usableSize, i)==i && pBt->autoVacuum) ){
5656
checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
5661
/* Make sure this analysis did not leave any unref() pages
5663
unlockBtreeIfUnused(pBt);
5664
if( nRef != *sqlite3pager_stats(pBt->pPager) ){
5665
checkAppendMsg(&sCheck, 0,
5666
"Outstanding page count goes from %d to %d during this analysis",
5667
nRef, *sqlite3pager_stats(pBt->pPager)
5671
/* Clean up and report errors.
5673
sqliteFree(sCheck.anRef);
5674
return sCheck.zErrMsg;
5676
#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
5679
** Return the full pathname of the underlying database file.
5681
const char *sqlite3BtreeGetFilename(Btree *pBt){
5682
assert( pBt->pPager!=0 );
5683
return sqlite3pager_filename(pBt->pPager);
5687
** Return the pathname of the directory that contains the database file.
5689
const char *sqlite3BtreeGetDirname(Btree *pBt){
5690
assert( pBt->pPager!=0 );
5691
return sqlite3pager_dirname(pBt->pPager);
5695
** Return the pathname of the journal file for this database. The return
5696
** value of this routine is the same regardless of whether the journal file
5697
** has been created or not.
5699
const char *sqlite3BtreeGetJournalname(Btree *pBt){
5700
assert( pBt->pPager!=0 );
5701
return sqlite3pager_journalname(pBt->pPager);
5704
#ifndef SQLITE_OMIT_VACUUM
5706
** Copy the complete content of pBtFrom into pBtTo. A transaction
5707
** must be active for both files.
5709
** The size of file pBtFrom may be reduced by this operation.
5710
** If anything goes wrong, the transaction on pBtFrom is rolled back.
5712
int sqlite3BtreeCopyFile(Btree *pBtTo, Btree *pBtFrom){
5714
Pgno i, nPage, nToPage;
5716
if( pBtTo->inTrans!=TRANS_WRITE || pBtFrom->inTrans!=TRANS_WRITE ){
5717
return SQLITE_ERROR;
5719
if( pBtTo->pCursor ) return SQLITE_BUSY;
5720
nToPage = sqlite3pager_pagecount(pBtTo->pPager);
5721
nPage = sqlite3pager_pagecount(pBtFrom->pPager);
5722
for(i=1; rc==SQLITE_OK && i<=nPage; i++){
5724
rc = sqlite3pager_get(pBtFrom->pPager, i, &pPage);
5726
rc = sqlite3pager_overwrite(pBtTo->pPager, i, pPage);
5728
sqlite3pager_unref(pPage);
5730
for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){
5732
rc = sqlite3pager_get(pBtTo->pPager, i, &pPage);
5734
rc = sqlite3pager_write(pPage);
5735
sqlite3pager_unref(pPage);
5736
sqlite3pager_dont_write(pBtTo->pPager, i);
5738
if( !rc && nPage<nToPage ){
5739
rc = sqlite3pager_truncate(pBtTo->pPager, nPage);
5742
sqlite3BtreeRollback(pBtTo);
5746
#endif /* SQLITE_OMIT_VACUUM */
5749
** Return non-zero if a transaction is active.
5751
int sqlite3BtreeIsInTrans(Btree *pBt){
5752
return (pBt && (pBt->inTrans==TRANS_WRITE));
5756
** Return non-zero if a statement transaction is active.
5758
int sqlite3BtreeIsInStmt(Btree *pBt){
5759
return (pBt && pBt->inStmt);
5763
** This call is a no-op if no write-transaction is currently active on pBt.
5765
** Otherwise, sync the database file for the btree pBt. zMaster points to
5766
** the name of a master journal file that should be written into the
5767
** individual journal file, or is NULL, indicating no master journal file
5768
** (single database transaction).
5770
** When this is called, the master journal should already have been
5771
** created, populated with this journal pointer and synced to disk.
5773
** Once this is routine has returned, the only thing required to commit
5774
** the write-transaction for this database file is to delete the journal.
5776
int sqlite3BtreeSync(Btree *pBt, const char *zMaster){
5777
if( pBt->inTrans==TRANS_WRITE ){
5778
#ifndef SQLITE_OMIT_AUTOVACUUM
5780
if( pBt->autoVacuum ){
5781
int rc = autoVacuumCommit(pBt, &nTrunc);
5782
if( rc!=SQLITE_OK ) return rc;
5784
return sqlite3pager_sync(pBt->pPager, zMaster, nTrunc);
5786
return sqlite3pager_sync(pBt->pPager, zMaster, 0);
5791
#ifndef SQLITE_OMIT_GLOBALRECOVER
5793
** Reset the btree and underlying pager after a malloc() failure. Any
5794
** transaction that was active when malloc() failed is rolled back.
5796
int sqlite3BtreeReset(Btree *pBt){
5797
if( pBt->pCursor ) return SQLITE_BUSY;
5798
pBt->inTrans = TRANS_NONE;
5799
unlockBtreeIfUnused(pBt);
5800
return sqlite3pager_reset(pBt->pPager);