1
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
4
#ifndef TOKU_LEAFENTRY_H
5
#define TOKU_LEAFENTRY_H
9
COPYING CONDITIONS NOTICE:
11
This program is free software; you can redistribute it and/or modify
12
it under the terms of version 2 of the GNU General Public License as
13
published by the Free Software Foundation, and provided that the
14
following conditions are met:
16
* Redistributions of source code must retain this COPYING
17
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
18
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
19
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
22
* Redistributions in binary form must reproduce this COPYING
23
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
24
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
25
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
26
GRANT (below) in the documentation and/or other materials
27
provided with the distribution.
29
You should have received a copy of the GNU General Public License
30
along with this program; if not, write to the Free Software
31
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
36
TokuDB, Tokutek Fractal Tree Indexing Library.
37
Copyright (C) 2007-2013 Tokutek, Inc.
41
This program is distributed in the hope that it will be useful, but
42
WITHOUT ANY WARRANTY; without even the implied warranty of
43
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
44
General Public License for more details.
46
UNIVERSITY PATENT NOTICE:
48
The technology is licensed by the Massachusetts Institute of
49
Technology, Rutgers State University of New Jersey, and the Research
50
Foundation of State University of New York at Stony Brook under
51
United States of America Serial No. 11/760379 and to the patents
52
and/or patent applications resulting from it.
54
PATENT MARKING NOTICE:
56
This software is covered by US Patent No. 8,185,551.
57
This software is covered by US Patent No. 8,489,638.
61
"THIS IMPLEMENTATION" means the copyrightable works distributed by
62
Tokutek as part of the Fractal Tree project.
64
"PATENT CLAIMS" means the claims of patents that are owned or
65
licensable by Tokutek, both currently or in the future; and that in
66
the absence of this license would be infringed by THIS
67
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
69
"PATENT CHALLENGE" shall mean a challenge to the validity,
70
patentability, enforceability and/or non-infringement of any of the
71
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
73
Tokutek hereby grants to you, for the term and geographical scope of
74
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
75
irrevocable (except as stated in this section) patent license to
76
make, have made, use, offer to sell, sell, import, transfer, and
77
otherwise run, modify, and propagate the contents of THIS
78
IMPLEMENTATION, where such license applies only to the PATENT
79
CLAIMS. This grant does not include claims that would be infringed
80
only as a consequence of further modifications of THIS
81
IMPLEMENTATION. If you or your agent or licensee institute or order
82
or agree to the institution of patent litigation against any entity
83
(including a cross-claim or counterclaim in a lawsuit) alleging that
84
THIS IMPLEMENTATION constitutes direct or contributory patent
85
infringement, or inducement of patent infringement, then any rights
86
granted to you under this License shall terminate as of the date
87
such litigation is filed. If you or your agent or exclusive
88
licensee institute or order or agree to the institution of a PATENT
89
CHALLENGE, then Tokutek may terminate any rights granted to you
93
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
94
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
96
#include <toku_portability.h>
98
#include <util/mempool.h>
100
#include "txn_manager.h"
106
Memory format of packed leaf entry
111
voffset of val/vallen??? (for le_any_val) This must be small if it is interpreted as voffset = realoffset_of_val - keylen
112
GOOD performance optimization.
113
ALSO good for simplicity (no having to scan packed version)
118
Memory format of packed dup leaf entry
129
// enum of possible values for LEAFENTRY->type field
130
// LE_CLEAN means that there is a single committed value in a format that saves disk space
131
// LE_MVCC means that there may be multiple committed values or there are provisional values
133
enum { LE_CLEAN = 0, LE_MVCC = 1 };
135
// This is an on-disk format. static_asserts verify everything is packed and aligned correctly.
137
struct leafentry_clean {
139
uint8_t val[0]; //actual val
140
}; // For the case where LEAFENTRY->type is LE_CLEAN
141
static_assert(4 == sizeof(leafentry::leafentry_clean), "leafentry_clean size is wrong");
142
static_assert(4 == __builtin_offsetof(leafentry::leafentry_clean, val), "val is in the wrong place");
143
struct __attribute__ ((__packed__)) leafentry_mvcc {
144
uint32_t num_cxrs; // number of committed transaction records
145
uint8_t num_pxrs; // number of provisional transaction records
146
uint8_t xrs[0]; //then TXNIDs of XRs relevant for reads:
147
// if provisional XRs exist, store OUTERMOST TXNID
148
// store committed TXNIDs, from most recently committed to least recently committed (newest first)
149
//then lengths of XRs relevant for reads (length is at most 1<<31, MSB is 1 for insert, 0 for delete):
150
// if provisional XRs exist (num_pxrs>0), store length and insert/delete flag associated with INNERMOST TXNID
151
// store length and insert/delete flag associated with each committed TXNID, in same order as above (newest first)
152
//then data of XRs relevant for reads
153
// if provisional XRs exist (num_pxrs>0), store data associated with INNERMOST provisional TXNID
154
// store data associated with committed TXNIDs (all committed data, newest committed values first)
155
//if provisional XRs still exist (that is, num_puxrs > 1, so INNERMOST provisional TXNID != OUTERMOST provisional TXNID):
156
// for OUTERMOST provisional XR:
157
// 1 byte: store type (insert/delete/placeholder)
158
// 4 bytes: length (if type is INSERT, no length stored if placeholder or delete)
160
// for rest of provisional stack (if num_pxrs > 2), from second-outermost to second-innermost (outermost is stored above, innermost is stored separately):
162
// 1 byte: store type (insert/delete/placeholder)
163
// 4 bytes: length (if type is INSERT)
165
// for INNERMOST provisional XR:
167
// (innermost data and length with insert/delete flag are stored above, cannot be a placeholder)
168
}; // For the case where LEAFENTRY->type is LE_MVCC
169
static_assert(5 == sizeof(leafentry::leafentry_mvcc), "leafentry_mvcc size is wrong");
170
static_assert(5 == __builtin_offsetof(leafentry::leafentry_mvcc, xrs), "xrs is in the wrong place");
172
uint8_t type; // type is LE_CLEAN or LE_MVCC
174
union __attribute__ ((__packed__)) {
175
struct leafentry_clean clean;
176
struct leafentry_mvcc mvcc;
179
static_assert(6 == sizeof(leafentry), "leafentry size is wrong");
180
static_assert(1 == __builtin_offsetof(leafentry, u), "union is in the wrong place");
182
#define LE_CLEAN_MEMSIZE(_vallen) \
183
(sizeof(((LEAFENTRY)NULL)->type) /* type */ \
184
+sizeof(((LEAFENTRY)NULL)->u.clean.vallen) /* vallen */ \
185
+(_vallen)) /* actual val */
187
#define LE_MVCC_COMMITTED_HEADER_MEMSIZE \
188
(sizeof(((LEAFENTRY)NULL)->type) /* type */ \
189
+sizeof(((LEAFENTRY)NULL)->u.mvcc.num_cxrs) /* committed */ \
190
+sizeof(((LEAFENTRY)NULL)->u.mvcc.num_pxrs) /* provisional */ \
191
+sizeof(TXNID) /* transaction */ \
192
+sizeof(uint32_t) /* length+bit */ \
193
+sizeof(uint32_t)) /* length+bit */
195
#define LE_MVCC_COMMITTED_MEMSIZE(_vallen) \
196
(LE_MVCC_COMMITTED_HEADER_MEMSIZE \
197
+(_vallen)) /* actual val */
200
typedef struct leafentry *LEAFENTRY;
201
typedef struct leafentry_13 *LEAFENTRY_13;
204
// TODO: consistency among names is very poor.
207
// TODO: rename this helper function for deserialization
208
size_t leafentry_rest_memsize(uint32_t num_puxrs, uint32_t num_cuxrs, uint8_t* start);
209
size_t leafentry_memsize (LEAFENTRY le); // the size of a leafentry in memory.
210
size_t leafentry_disksize (LEAFENTRY le); // this is the same as logsizeof_LEAFENTRY. The size of a leafentry on disk.
211
void wbuf_nocrc_LEAFENTRY(struct wbuf *w, LEAFENTRY le);
212
int print_klpair (FILE *outf, const void* key, uint32_t keylen, LEAFENTRY v); // Print a leafentry out in human-readable form.
214
int le_latest_is_del(LEAFENTRY le); // Return true if it is a provisional delete.
215
bool le_is_clean(LEAFENTRY le); //Return how many xids exist (0 does not count)
216
bool le_has_xids(LEAFENTRY le, XIDS xids); // Return true transaction represented by xids is still provisional in this leafentry (le's xid stack is a superset or equal to xids)
217
void* le_latest_val (LEAFENTRY le); // Return the latest val (return NULL for provisional deletes)
218
uint32_t le_latest_vallen (LEAFENTRY le); // Return the latest vallen. Returns 0 for provisional deletes.
219
void* le_latest_val_and_len (LEAFENTRY le, uint32_t *len);
221
uint64_t le_outermost_uncommitted_xid (LEAFENTRY le);
224
// Function checks to see if id is accepted by context.
226
// 0: context ignores this entry, id.
227
// TOKUDB_ACCEPT: context accepts id
228
// r|r!=0&&r!=TOKUDB_ACCEPT: Quit early, return r, because something unexpected went wrong (error case)
229
typedef int(*LE_ITERATE_CALLBACK)(TXNID id, TOKUTXN context);
231
int le_iterate_is_del(LEAFENTRY le, LE_ITERATE_CALLBACK f, bool *is_empty, TOKUTXN context);
233
int le_iterate_val(LEAFENTRY le, LE_ITERATE_CALLBACK f, void** valpp, uint32_t *vallenp, TOKUTXN context);
236
leafentry_disksize_13(LEAFENTRY_13 le);
239
toku_le_upgrade_13_14(LEAFENTRY_13 old_leafentry, // NULL if there was no stored data.
242
size_t *new_leafentry_memorysize,
243
LEAFENTRY *new_leafentry_p);
246
toku_le_apply_msg(FT_MSG msg,
247
LEAFENTRY old_leafentry, // NULL if there was no stored data.
248
bn_data* data_buffer, // bn_data storing leafentry, if NULL, means there is no bn_data
249
uint32_t idx, // index in data_buffer where leafentry is stored (and should be replaced
250
TXNID oldest_referenced_xid,
252
LEAFENTRY *new_leafentry_p,
253
int64_t * numbytes_delta_p);
255
bool toku_le_worth_running_garbage_collection(LEAFENTRY le, TXNID oldest_referenced_xid_known);
258
toku_le_garbage_collect(LEAFENTRY old_leaf_entry,
259
bn_data* data_buffer,
263
LEAFENTRY *new_leaf_entry,
264
const xid_omt_t &snapshot_xids,
265
const rx_omt_t &referenced_xids,
266
const xid_omt_t &live_root_txns,
267
TXNID oldest_referenced_xid_known,
268
int64_t * numbytes_delta_p);
270
#endif /* TOKU_LEAFENTRY_H */