2
* See the file LICENSE for redistribution information.
4
* Copyright (c) 1996-2001
5
* Sleepycat Software. All rights reserved.
7
* $Id: db_page.h,v 11.32 2001/05/22 17:40:35 ubell Exp $
13
#if defined(__cplusplus)
21
* This implementation requires that values within the following structures
22
* NOT be padded -- note, ANSI C permits random padding within structures.
23
* If your compiler pads randomly you can just forget ever making DB run on
24
* your system. In addition, no data type can require larger alignment than
25
* its own size, e.g., a 4-byte data element may not require 8-byte alignment.
27
* Note that key/data lengths are often stored in db_indx_t's -- this is
28
* not accidental, nor does it limit the key/data size. If the key/data
29
* item fits on a page, it's guaranteed to be small enough to fit into a
30
* db_indx_t, and storing it in one saves space.
33
#define PGNO_INVALID 0 /* Invalid page number in any database. */
34
#define PGNO_BASE_MD 0 /* Base database: metadata page number. */
37
#define P_INVALID 0 /* Invalid page type. */
38
#define __P_DUPLICATE 1 /* Duplicate. DEPRECATED in 3.1 */
39
#define P_HASH 2 /* Hash. */
40
#define P_IBTREE 3 /* Btree internal. */
41
#define P_IRECNO 4 /* Recno internal. */
42
#define P_LBTREE 5 /* Btree leaf. */
43
#define P_LRECNO 6 /* Recno leaf. */
44
#define P_OVERFLOW 7 /* Overflow. */
45
#define P_HASHMETA 8 /* Hash metadata page. */
46
#define P_BTREEMETA 9 /* Btree metadata page. */
47
#define P_QAMMETA 10 /* Queue metadata page. */
48
#define P_QAMDATA 11 /* Queue data page. */
49
#define P_LDUP 12 /* Off-page duplicate leaf. */
50
#define P_PAGETYPE_MAX 13
53
* When we create pages in mpool, we ask mpool to clear some number of bytes
54
* in the header. This number must be at least as big as the regular page
55
* headers and cover enough of the btree and hash meta-data pages to obliterate
58
#define DB_PAGE_DB_LEN 32
59
#define DB_PAGE_QUEUE_LEN 0
61
/************************************************************************
62
GENERIC METADATA PAGE HEADER
65
* The magic and version numbers have to be in the same place in all versions
66
* of the metadata page as the application may not have upgraded the database.
67
************************************************************************/
68
typedef struct _dbmeta33 {
69
DB_LSN lsn; /* 00-07: LSN. */
70
db_pgno_t pgno; /* 08-11: Current page number. */
71
u_int32_t magic; /* 12-15: Magic number. */
72
u_int32_t version; /* 16-19: Version. */
73
u_int32_t pagesize; /* 20-23: Pagesize. */
74
u_int8_t unused1[1]; /* 24: Unused. */
75
u_int8_t type; /* 25: Page type. */
76
u_int8_t unused2[2]; /* 26-27: Unused. */
77
u_int32_t free; /* 28-31: Free list page number. */
78
db_pgno_t last_pgno; /* 32-35: Page number of last page in db. */
79
u_int32_t unused3; /* 36-39: Unused. */
80
u_int32_t key_count; /* 40-43: Cached key count. */
81
u_int32_t record_count; /* 44-47: Cached record count. */
82
u_int32_t flags; /* 48-51: Flags: unique to each AM. */
83
/* 52-71: Unique file ID. */
84
u_int8_t uid[DB_FILE_ID_LEN];
87
/************************************************************************
88
BTREE METADATA PAGE LAYOUT
89
************************************************************************/
90
typedef struct _btmeta33 {
91
#define BTM_DUP 0x001 /* Duplicates. */
92
#define BTM_RECNO 0x002 /* Recno tree. */
93
#define BTM_RECNUM 0x004 /* Btree: maintain record count. */
94
#define BTM_FIXEDLEN 0x008 /* Recno: fixed length records. */
95
#define BTM_RENUMBER 0x010 /* Recno: renumber on insert/delete. */
96
#define BTM_SUBDB 0x020 /* Subdatabases. */
97
#define BTM_DUPSORT 0x040 /* Duplicates are sorted. */
98
#define BTM_MASK 0x07f
99
DBMETA dbmeta; /* 00-71: Generic meta-data header. */
101
u_int32_t maxkey; /* 72-75: Btree: Maxkey. */
102
u_int32_t minkey; /* 76-79: Btree: Minkey. */
103
u_int32_t re_len; /* 80-83: Recno: fixed-length record length. */
104
u_int32_t re_pad; /* 84-87: Recno: fixed-length record pad. */
105
u_int32_t root; /* 88-92: Root page. */
108
* Minimum page size is 128.
112
/************************************************************************
113
HASH METADATA PAGE LAYOUT
114
************************************************************************/
115
typedef struct _hashmeta33 {
116
#define DB_HASH_DUP 0x01 /* Duplicates. */
117
#define DB_HASH_SUBDB 0x02 /* Subdatabases. */
118
#define DB_HASH_DUPSORT 0x04 /* Duplicates are sorted. */
119
DBMETA dbmeta; /* 00-71: Generic meta-data page header. */
121
u_int32_t max_bucket; /* 72-75: ID of Maximum bucket in use */
122
u_int32_t high_mask; /* 76-79: Modulo mask into table */
123
u_int32_t low_mask; /* 80-83: Modulo mask into table lower half */
124
u_int32_t ffactor; /* 84-87: Fill factor */
125
u_int32_t nelem; /* 88-91: Number of keys in hash table */
126
u_int32_t h_charkey; /* 92-95: Value of hash(CHARKEY) */
127
#define NCACHED 32 /* number of spare points */
128
/* 96-223: Spare pages for overflow */
129
u_int32_t spares[NCACHED];
132
* Minimum page size is 256.
136
/************************************************************************
137
QUEUE METADATA PAGE LAYOUT
138
************************************************************************/
140
* QAM Meta data page structure
143
typedef struct _qmeta33 {
144
DBMETA dbmeta; /* 00-71: Generic meta-data header. */
146
u_int32_t first_recno; /* 72-75: First not deleted record. */
147
u_int32_t cur_recno; /* 76-79: Last recno allocated. */
148
u_int32_t re_len; /* 80-83: Fixed-length record length. */
149
u_int32_t re_pad; /* 84-87: Fixed-length record pad. */
150
u_int32_t rec_page; /* 88-91: Records Per Page. */
151
u_int32_t page_ext; /* 92-95: Pages per extent */
154
* Minimum page size is 128.
159
* DBMETASIZE is a constant used by __db_file_setup and DB->verify
160
* as a buffer which is guaranteed to be larger than any possible
161
* metadata page size and smaller than any disk sector.
163
#define DBMETASIZE 256
165
/************************************************************************
166
BTREE/HASH MAIN PAGE LAYOUT
167
************************************************************************/
169
* +-----------------------------------+
170
* | lsn | pgno | prev pgno |
171
* +-----------------------------------+
172
* | next pgno | entries | hf offset |
173
* +-----------------------------------+
174
* | level | type | index |
175
* +-----------------------------------+
176
* | index | free --> |
177
* +-----------+-----------------------+
178
* | F R E E A R E A |
179
* +-----------------------------------+
180
* | <-- free | item |
181
* +-----------------------------------+
182
* | item | item | item |
183
* +-----------------------------------+
185
* sizeof(PAGE) == 26 bytes, and the following indices are guaranteed to be
188
* For hash and btree leaf pages, index items are paired, e.g., inp[0] is the
189
* key for inp[1]'s data. All other types of pages only contain single items.
191
typedef struct _db_page {
192
DB_LSN lsn; /* 00-07: Log sequence number. */
193
db_pgno_t pgno; /* 08-11: Current page number. */
194
db_pgno_t prev_pgno; /* 12-15: Previous page number. */
195
db_pgno_t next_pgno; /* 16-19: Next page number. */
196
db_indx_t entries; /* 20-21: Number of items on the page. */
197
db_indx_t hf_offset; /* 22-23: High free byte page offset. */
200
* The btree levels are numbered from the leaf to the root, starting
201
* with 1, so the leaf is level 1, its parent is level 2, and so on.
202
* We maintain this level on all btree pages, but the only place that
203
* we actually need it is on the root page. It would not be difficult
204
* to hide the byte on the root page once it becomes an internal page,
205
* so we could get this byte back if we needed it for something else.
208
#define MAXBTREELEVEL 255
209
u_int8_t level; /* 24: Btree tree level. */
210
u_int8_t type; /* 25: Page type. */
211
db_indx_t inp[1]; /* Variable length index of items. */
214
/* PAGE element macros. */
215
#define LSN(p) (((PAGE *)p)->lsn)
216
#define PGNO(p) (((PAGE *)p)->pgno)
217
#define PREV_PGNO(p) (((PAGE *)p)->prev_pgno)
218
#define NEXT_PGNO(p) (((PAGE *)p)->next_pgno)
219
#define NUM_ENT(p) (((PAGE *)p)->entries)
220
#define HOFFSET(p) (((PAGE *)p)->hf_offset)
221
#define LEVEL(p) (((PAGE *)p)->level)
222
#define TYPE(p) (((PAGE *)p)->type)
224
/************************************************************************
225
QUEUE MAIN PAGE LAYOUT
226
************************************************************************/
227
typedef struct _qpage {
228
DB_LSN lsn; /* 00-07: Log sequence number. */
229
db_pgno_t pgno; /* 08-11: Current page number. */
230
u_int32_t unused0[3]; /* 12-23: Unused. */
231
u_int8_t unused1[1]; /* 24: Unused. */
232
u_int8_t type; /* 25: Page type. */
233
u_int8_t unused2[2]; /* 26-27: Unused. */
238
* The next_pgno and prev_pgno fields are not maintained for btree and recno
239
* internal pages. Doing so only provides a minor performance improvement,
240
* it's hard to do when deleting internal pages, and it increases the chance
241
* of deadlock during deletes and splits because we have to re-link pages at
242
* more than the leaf level.
245
* The btree/recno access method needs db_recno_t bytes of space on the root
246
* page to specify how many records are stored in the tree. (The alternative
247
* is to store the number of records in the meta-data page, which will create
248
* a second hot spot in trees being actively modified, or recalculate it from
249
* the BINTERNAL fields on each access.) Overload the PREV_PGNO field.
252
((TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO) ? PREV_PGNO(p) : \
253
(db_pgno_t)(TYPE(p) == P_LBTREE ? NUM_ENT(p) / 2 : NUM_ENT(p)))
254
#define RE_NREC_ADJ(p, adj) \
256
#define RE_NREC_SET(p, num) \
263
* Don't modify the page's LSN, code depends on it being unchanged after a
266
#define P_INIT(pg, pg_size, n, pg_prev, pg_next, btl, pg_type) do { \
268
PREV_PGNO(pg) = pg_prev; \
269
NEXT_PGNO(pg) = pg_next; \
271
HOFFSET(pg) = pg_size; \
273
TYPE(pg) = pg_type; \
276
/* Page header length (offset to first index). */
277
#define P_OVERHEAD (SSZA(PAGE, inp))
279
/* First free byte. */
280
#define LOFFSET(pg) (P_OVERHEAD + NUM_ENT(pg) * sizeof(db_indx_t))
282
/* Free space on a regular page. */
283
#define P_FREESPACE(pg) (HOFFSET(pg) - LOFFSET(pg))
285
/* Get a pointer to the bytes at a specific index. */
286
#define P_ENTRY(pg, indx) ((u_int8_t *)pg + ((PAGE *)pg)->inp[indx])
288
/************************************************************************
290
************************************************************************/
293
* Overflow items are referenced by HOFFPAGE and BOVERFLOW structures, which
294
* store a page number (the first page of the overflow item) and a length
295
* (the total length of the overflow item). The overflow item consists of
296
* some number of overflow pages, linked by the next_pgno field of the page.
297
* A next_pgno field of PGNO_INVALID flags the end of the overflow item.
299
* Overflow page overloads:
300
* The amount of overflow data stored on each page is stored in the
303
* The implementation reference counts overflow items as it's possible
304
* for them to be promoted onto btree internal pages. The reference
305
* count is stored in the entries field.
307
#define OV_LEN(p) (((PAGE *)p)->hf_offset)
308
#define OV_REF(p) (((PAGE *)p)->entries)
310
/* Maximum number of bytes that you can put on an overflow page. */
311
#define P_MAXSPACE(psize) ((psize) - P_OVERHEAD)
313
/* Free space on an overflow page. */
314
#define P_OVFLSPACE(psize, pg) (P_MAXSPACE(psize) - HOFFSET(pg))
316
/************************************************************************
318
************************************************************************/
320
/* Each index references a group of bytes on the page. */
321
#define H_KEYDATA 1 /* Key/data item. */
322
#define H_DUPLICATE 2 /* Duplicate key/data item. */
323
#define H_OFFPAGE 3 /* Overflow key/data item. */
324
#define H_OFFDUP 4 /* Overflow page of duplicates. */
328
* Items on hash pages are (potentially) unaligned, so we can never cast the
329
* (page + offset) pointer to an HKEYDATA, HOFFPAGE or HOFFDUP structure, as
330
* we do with B+tree on-page structures. Because we frequently want the type
331
* field, it requires no alignment, and it's in the same location in all three
332
* structures, there's a pair of macros.
334
#define HPAGE_PTYPE(p) (*(u_int8_t *)p)
335
#define HPAGE_TYPE(pg, indx) (*P_ENTRY(pg, indx))
338
* The first and second types are H_KEYDATA and H_DUPLICATE, represented
339
* by the HKEYDATA structure:
341
* +-----------------------------------+
342
* | type | key/data ... |
343
* +-----------------------------------+
345
* For duplicates, the data field encodes duplicate elements in the data
348
* +---------------------------------------------------------------+
349
* | type | len1 | element1 | len1 | len2 | element2 | len2 |
350
* +---------------------------------------------------------------+
352
* Thus, by keeping track of the offset in the element, we can do both
353
* backward and forward traversal.
355
typedef struct _hkeydata {
356
u_int8_t type; /* 00: Page type. */
357
u_int8_t data[1]; /* Variable length key/data item. */
359
#define HKEYDATA_DATA(p) (((u_int8_t *)p) + SSZA(HKEYDATA, data))
362
* The length of any HKEYDATA item. Note that indx is an element index,
365
#define LEN_HITEM(pg, pgsize, indx) \
366
(((indx) == 0 ? pgsize : \
367
((PAGE *)(pg))->inp[indx - 1]) - ((PAGE *)(pg))->inp[indx])
369
#define LEN_HKEYDATA(pg, psize, indx) \
370
(LEN_HITEM(pg, psize, indx) - HKEYDATA_SIZE(0))
373
* Page space required to add a new HKEYDATA item to the page, with and
374
* without the index value.
376
#define HKEYDATA_SIZE(len) \
377
((len) + SSZA(HKEYDATA, data))
378
#define HKEYDATA_PSIZE(len) \
379
(HKEYDATA_SIZE(len) + sizeof(db_indx_t))
381
/* Put a HKEYDATA item at the location referenced by a page entry. */
382
#define PUT_HKEYDATA(pe, kd, len, type) { \
383
((HKEYDATA *)pe)->type = type; \
384
memcpy((u_int8_t *)pe + sizeof(u_int8_t), kd, len); \
388
* Macros the describe the page layout in terms of key-data pairs.
390
#define H_NUMPAIRS(pg) (NUM_ENT(pg) / 2)
391
#define H_KEYINDEX(indx) (indx)
392
#define H_DATAINDEX(indx) ((indx) + 1)
393
#define H_PAIRKEY(pg, indx) P_ENTRY(pg, H_KEYINDEX(indx))
394
#define H_PAIRDATA(pg, indx) P_ENTRY(pg, H_DATAINDEX(indx))
395
#define H_PAIRSIZE(pg, psize, indx) \
396
(LEN_HITEM(pg, psize, H_KEYINDEX(indx)) + \
397
LEN_HITEM(pg, psize, H_DATAINDEX(indx)))
398
#define LEN_HDATA(p, psize, indx) LEN_HKEYDATA(p, psize, H_DATAINDEX(indx))
399
#define LEN_HKEY(p, psize, indx) LEN_HKEYDATA(p, psize, H_KEYINDEX(indx))
402
* The third type is the H_OFFPAGE, represented by the HOFFPAGE structure:
404
typedef struct _hoffpage {
405
u_int8_t type; /* 00: Page type and delete flag. */
406
u_int8_t unused[3]; /* 01-03: Padding, unused. */
407
db_pgno_t pgno; /* 04-07: Offpage page number. */
408
u_int32_t tlen; /* 08-11: Total length of item. */
411
#define HOFFPAGE_PGNO(p) (((u_int8_t *)p) + SSZ(HOFFPAGE, pgno))
412
#define HOFFPAGE_TLEN(p) (((u_int8_t *)p) + SSZ(HOFFPAGE, tlen))
415
* Page space required to add a new HOFFPAGE item to the page, with and
416
* without the index value.
418
#define HOFFPAGE_SIZE (sizeof(HOFFPAGE))
419
#define HOFFPAGE_PSIZE (HOFFPAGE_SIZE + sizeof(db_indx_t))
422
* The fourth type is H_OFFDUP represented by the HOFFDUP structure:
424
typedef struct _hoffdup {
425
u_int8_t type; /* 00: Page type and delete flag. */
426
u_int8_t unused[3]; /* 01-03: Padding, unused. */
427
db_pgno_t pgno; /* 04-07: Offpage page number. */
429
#define HOFFDUP_PGNO(p) (((u_int8_t *)p) + SSZ(HOFFDUP, pgno))
432
* Page space required to add a new HOFFDUP item to the page, with and
433
* without the index value.
435
#define HOFFDUP_SIZE (sizeof(HOFFDUP))
437
/************************************************************************
439
************************************************************************/
441
/* Each index references a group of bytes on the page. */
442
#define B_KEYDATA 1 /* Key/data item. */
443
#define B_DUPLICATE 2 /* Duplicate key/data item. */
444
#define B_OVERFLOW 3 /* Overflow key/data item. */
447
* We have to store a deleted entry flag in the page. The reason is complex,
448
* but the simple version is that we can't delete on-page items referenced by
449
* a cursor -- the return order of subsequent insertions might be wrong. The
450
* delete flag is an overload of the top bit of the type byte.
452
#define B_DELETE (0x80)
453
#define B_DCLR(t) (t) &= ~B_DELETE
454
#define B_DSET(t) (t) |= B_DELETE
455
#define B_DISSET(t) ((t) & B_DELETE)
457
#define B_TYPE(t) ((t) & ~B_DELETE)
458
#define B_TSET(t, type, deleted) { \
465
* The first type is B_KEYDATA, represented by the BKEYDATA structure:
467
typedef struct _bkeydata {
468
db_indx_t len; /* 00-01: Key/data item length. */
469
u_int8_t type; /* 02: Page type AND DELETE FLAG. */
470
u_int8_t data[1]; /* Variable length key/data item. */
473
/* Get a BKEYDATA item for a specific index. */
474
#define GET_BKEYDATA(pg, indx) \
475
((BKEYDATA *)P_ENTRY(pg, indx))
478
* Page space required to add a new BKEYDATA item to the page, with and
479
* without the index value.
481
#define BKEYDATA_SIZE(len) \
482
ALIGN((len) + SSZA(BKEYDATA, data), sizeof(u_int32_t))
483
#define BKEYDATA_PSIZE(len) \
484
(BKEYDATA_SIZE(len) + sizeof(db_indx_t))
487
* The second and third types are B_DUPLICATE and B_OVERFLOW, represented
488
* by the BOVERFLOW structure.
490
typedef struct _boverflow {
491
db_indx_t unused1; /* 00-01: Padding, unused. */
492
u_int8_t type; /* 02: Page type AND DELETE FLAG. */
493
u_int8_t unused2; /* 03: Padding, unused. */
494
db_pgno_t pgno; /* 04-07: Next page number. */
495
u_int32_t tlen; /* 08-11: Total length of item. */
498
/* Get a BOVERFLOW item for a specific index. */
499
#define GET_BOVERFLOW(pg, indx) \
500
((BOVERFLOW *)P_ENTRY(pg, indx))
503
* Page space required to add a new BOVERFLOW item to the page, with and
504
* without the index value.
506
#define BOVERFLOW_SIZE \
507
ALIGN(sizeof(BOVERFLOW), sizeof(u_int32_t))
508
#define BOVERFLOW_PSIZE \
509
(BOVERFLOW_SIZE + sizeof(db_indx_t))
512
* Btree leaf and hash page layouts group indices in sets of two, one for the
513
* key and one for the data. Everything else does it in sets of one to save
514
* space. Use the following macros so that it's real obvious what's going on.
519
/************************************************************************
520
BTREE INTERNAL PAGE LAYOUT
521
************************************************************************/
524
* Btree internal entry.
526
typedef struct _binternal {
527
db_indx_t len; /* 00-01: Key/data item length. */
528
u_int8_t type; /* 02: Page type AND DELETE FLAG. */
529
u_int8_t unused; /* 03: Padding, unused. */
530
db_pgno_t pgno; /* 04-07: Page number of referenced page. */
531
db_recno_t nrecs; /* 08-11: Subtree record count. */
532
u_int8_t data[1]; /* Variable length key item. */
535
/* Get a BINTERNAL item for a specific index. */
536
#define GET_BINTERNAL(pg, indx) \
537
((BINTERNAL *)P_ENTRY(pg, indx))
540
* Page space required to add a new BINTERNAL item to the page, with and
541
* without the index value.
543
#define BINTERNAL_SIZE(len) \
544
ALIGN((len) + SSZA(BINTERNAL, data), sizeof(u_int32_t))
545
#define BINTERNAL_PSIZE(len) \
546
(BINTERNAL_SIZE(len) + sizeof(db_indx_t))
548
/************************************************************************
549
RECNO INTERNAL PAGE LAYOUT
550
************************************************************************/
553
* The recno internal entry.
555
typedef struct _rinternal {
556
db_pgno_t pgno; /* 00-03: Page number of referenced page. */
557
db_recno_t nrecs; /* 04-07: Subtree record count. */
560
/* Get a RINTERNAL item for a specific index. */
561
#define GET_RINTERNAL(pg, indx) \
562
((RINTERNAL *)P_ENTRY(pg, indx))
565
* Page space required to add a new RINTERNAL item to the page, with and
566
* without the index value.
568
#define RINTERNAL_SIZE \
569
ALIGN(sizeof(RINTERNAL), sizeof(u_int32_t))
570
#define RINTERNAL_PSIZE \
571
(RINTERNAL_SIZE + sizeof(db_indx_t))
573
#if defined(__cplusplus)
577
#endif /* _DB_PAGE_H_ */