1
/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4
This file is part of systemd.
6
Copyright 2011 Lennart Poettering
8
systemd is free software; you can redistribute it and/or modify it
9
under the terms of the GNU General Public License as published by
10
the Free Software Foundation; either version 2 of the License, or
11
(at your option) any later version.
13
systemd is distributed in the hope that it will be useful, but
14
WITHOUT ANY WARRANTY; without even the implied warranty of
15
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
General Public License for more details.
18
You should have received a copy of the GNU General Public License
19
along with systemd; If not, see <http://www.gnu.org/licenses/>.
26
#include <sys/statvfs.h>
30
#include "journal-def.h"
31
#include "journal-file.h"
35
/* Size in bytes of the item array of a newly created data hash table
 * object; the bucket count is this value divided by sizeof(HashItem). */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)
36
/* Size in bytes of the item array of a newly created field hash table
 * object. */
#define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)
38
/* Mapping requests smaller than this are widened to this many bytes
 * (128 MiB) so that later accesses can reuse the same window. */
#define DEFAULT_WINDOW_SIZE (128ULL*1024ULL*1024ULL)
40
/* Data payloads of at least this many bytes are candidates for
 * compression when they are appended. */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)
42
/* This is the minimum journal file size */
43
#define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL) /* 64 KiB */
45
/* These are the lower and upper bounds if we deduce the max_use value
46
* from the file system size */
47
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
48
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
50
/* This is the upper bound if we deduce max_size from max_use */
51
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
53
/* This is the upper bound if we deduce the keep_free value from the file system size */
55
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57
/* This is the keep_free value when we can't determine the file system size */
59
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
61
/* Eight-byte magic: copied into the signature field of every new header
 * and compared against the first eight bytes of an existing header. */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
63
/* Rounds x up to the next multiple of 8. */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
65
void journal_file_close(JournalFile *f) {
70
if (f->header && f->writable)
71
f->header->state = STATE_OFFLINE;
74
for (t = 0; t < _WINDOW_MAX; t++)
75
if (f->windows[t].ptr)
76
munmap(f->windows[t].ptr, f->windows[t].size);
79
close_nointr_nofail(f->fd);
84
free(f->compress_buffer);
90
static int journal_file_init_header(JournalFile *f, JournalFile *template) {
98
memcpy(h.signature, signature, 8);
99
h.arena_offset = htole64(ALIGN64(sizeof(h)));
101
r = sd_id128_randomize(&h.file_id);
106
h.seqnum_id = template->header->seqnum_id;
107
h.seqnum = template->header->seqnum;
109
h.seqnum_id = h.file_id;
111
k = pwrite(f->fd, &h, sizeof(h), 0);
121
static int journal_file_refresh_header(JournalFile *f) {
127
r = sd_id128_get_machine(&f->header->machine_id);
131
r = sd_id128_get_boot(&boot_id);
135
if (sd_id128_equal(boot_id, f->header->boot_id))
136
f->tail_entry_monotonic_valid = true;
138
f->header->boot_id = boot_id;
140
f->header->state = STATE_ONLINE;
142
__sync_synchronize();
147
static int journal_file_verify_header(JournalFile *f) {
150
if (memcmp(f->header, signature, 8))
154
if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
155
return -EPROTONOSUPPORT;
157
if (f->header->incompatible_flags != 0)
158
return -EPROTONOSUPPORT;
161
if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->arena_offset) + le64toh(f->header->arena_size)))
166
sd_id128_t machine_id;
169
r = sd_id128_get_machine(&machine_id);
173
if (!sd_id128_equal(machine_id, f->header->machine_id))
176
state = f->header->state;
178
if (state == STATE_ONLINE)
179
log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
180
else if (state == STATE_ARCHIVED)
182
else if (state != STATE_OFFLINE)
183
log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
189
/* Makes sure the file is large enough to hold 'size' bytes at 'offset'.
 *
 * The target size is offset + size rounded up to a page boundary and
 * never below the arena offset. It is compared against the current end
 * of the arena (arena_offset + arena_size), against metrics.max_size
 * when that is set, and, above metrics.min_size with keep_free set,
 * against the free space reported by fstatvfs() less keep_free. The
 * file is then extended with posix_fallocate(), last_stat is refreshed
 * and the new arena size is stored in the header. */
static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
190
uint64_t old_size, new_size;
194
/* We assume that this file is not sparse, and we know that
195
* for sure, since we always call posix_fallocate() */
199
le64toh(f->header->arena_offset) +
200
le64toh(f->header->arena_size);
202
new_size = PAGE_ALIGN(offset + size);
203
if (new_size < le64toh(f->header->arena_offset))
204
new_size = le64toh(f->header->arena_offset);
206
if (new_size <= old_size)
209
if (f->metrics.max_size > 0 &&
210
new_size > f->metrics.max_size)
213
if (new_size > f->metrics.min_size &&
214
f->metrics.keep_free > 0) {
217
if (fstatvfs(f->fd, &svfs) >= 0) {
220
available = svfs.f_bfree * svfs.f_bsize;
222
if (available >= f->metrics.keep_free)
223
available -= f->metrics.keep_free;
227
if (new_size - old_size > available)
232
/* Note that the glibc fallocate() fallback is very
233
inefficient, hence we try to minimize the allocation area */
235
/* posix_fallocate() reports failure by returning a positive error
 * number; it never returns a negative value and does not set errno.
 * Test for non-zero and mirror the code into errno so that
 * errno-based error handling sees the real cause. */
if ((errno = posix_fallocate(f->fd, old_size, new_size - old_size)) != 0)
238
if (fstat(f->fd, &f->last_stat) < 0)
241
f->header->arena_size = htole64(new_size - le64toh(f->header->arena_offset));
246
static int journal_file_map(
255
uint64_t woffset, wsize;
262
woffset = offset & ~((uint64_t) page_size() - 1ULL);
263
wsize = size + (offset - woffset);
264
wsize = PAGE_ALIGN(wsize);
266
/* Avoid SIGBUS on invalid accesses */
267
if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
268
return -EADDRNOTAVAIL;
270
window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
271
if (window == MAP_FAILED)
283
*ret = (uint8_t*) window + (offset - woffset);
288
static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
297
assert(wt < _WINDOW_MAX);
299
if (offset + size > (uint64_t) f->last_stat.st_size) {
300
/* Hmm, out of range? Let's refresh the fstat() data
301
* first, before we trust that check. */
303
if (fstat(f->fd, &f->last_stat) < 0 ||
304
offset + size > (uint64_t) f->last_stat.st_size)
305
return -EADDRNOTAVAIL;
310
if (_likely_(w->ptr &&
311
w->offset <= offset &&
312
w->offset + w->size >= offset + size)) {
314
*ret = (uint8_t*) w->ptr + (offset - w->offset);
319
if (munmap(w->ptr, w->size) < 0)
323
w->size = w->offset = 0;
326
if (size < DEFAULT_WINDOW_SIZE) {
327
/* If the default window size is larger than what was
328
* asked for extend the mapping a bit in the hope to
329
* minimize needed remappings later on. We add half
330
* the window space before and half behind the
331
* requested mapping */
333
delta = (DEFAULT_WINDOW_SIZE - size) / 2;
339
size = DEFAULT_WINDOW_SIZE;
343
if (offset + size > (uint64_t) f->last_stat.st_size)
344
size = (uint64_t) f->last_stat.st_size - offset;
347
return -EADDRNOTAVAIL;
349
r = journal_file_map(f,
351
&w->ptr, &w->offset, &w->size,
357
*ret = (uint8_t*) p + delta;
361
static bool verify_hash(Object *o) {
366
if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
367
h1 = le64toh(o->data.hash);
368
h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
369
} else if (o->object.type == OBJECT_FIELD) {
370
h1 = le64toh(o->field.hash);
371
h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
378
int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
386
assert(type < _OBJECT_TYPE_MAX);
388
r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
393
s = le64toh(o->object.size);
395
if (s < sizeof(ObjectHeader))
398
if (type >= 0 && o->object.type != type)
401
if (s > sizeof(ObjectHeader)) {
402
r = journal_file_move_to(f, o->object.type, offset, s, &t);
416
static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
421
r = le64toh(f->header->seqnum) + 1;
424
/* If an external seqnum counter was passed, we update
425
* both the local and the external one, and set it to
426
* the maximum of both */
434
f->header->seqnum = htole64(r);
436
if (f->header->first_seqnum == 0)
437
f->header->first_seqnum = htole64(r);
442
static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
449
assert(size >= sizeof(ObjectHeader));
453
p = le64toh(f->header->tail_object_offset);
455
p = le64toh(f->header->arena_offset);
457
r = journal_file_move_to_object(f, -1, p, &tail);
461
p += ALIGN64(le64toh(tail->object.size));
464
r = journal_file_allocate(f, p, size);
468
r = journal_file_move_to(f, type, p, size, &t);
475
o->object.type = type;
476
o->object.size = htole64(size);
478
f->header->tail_object_offset = htole64(p);
479
f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
487
static int journal_file_setup_data_hash_table(JournalFile *f) {
494
s = DEFAULT_DATA_HASH_TABLE_SIZE;
495
r = journal_file_append_object(f,
496
OBJECT_DATA_HASH_TABLE,
497
offsetof(Object, hash_table.items) + s,
502
memset(o->hash_table.items, 0, s);
504
f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
505
f->header->data_hash_table_size = htole64(s);
510
static int journal_file_setup_field_hash_table(JournalFile *f) {
517
s = DEFAULT_FIELD_HASH_TABLE_SIZE;
518
r = journal_file_append_object(f,
519
OBJECT_FIELD_HASH_TABLE,
520
offsetof(Object, hash_table.items) + s,
525
memset(o->hash_table.items, 0, s);
527
f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
528
f->header->field_hash_table_size = htole64(s);
533
static int journal_file_map_data_hash_table(JournalFile *f) {
540
p = le64toh(f->header->data_hash_table_offset);
541
s = le64toh(f->header->data_hash_table_size);
543
r = journal_file_move_to(f,
544
WINDOW_DATA_HASH_TABLE,
550
f->data_hash_table = t;
554
static int journal_file_map_field_hash_table(JournalFile *f) {
561
p = le64toh(f->header->field_hash_table_offset);
562
s = le64toh(f->header->field_hash_table_size);
564
r = journal_file_move_to(f,
565
WINDOW_FIELD_HASH_TABLE,
571
f->field_hash_table = t;
575
/* Links the data object 'o', located at 'offset', into the data hash
 * table bucket selected by 'hash'.
 *
 * The object's chain and entry bookkeeping fields are reset first. The
 * object is then appended to the end of the bucket's chain: the bucket
 * head is set when the chain is empty, otherwise the current tail
 * object gets its next_hash_offset pointed at the new object. Finally
 * the bucket's tail is updated to the new object. */
static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
582
assert(o->object.type == OBJECT_DATA);
584
/* This might alter the window we are looking at */
586
o->data.next_hash_offset = o->data.next_field_offset = 0;
587
o->data.entry_offset = o->data.entry_array_offset = 0;
588
o->data.n_entries = 0;
590
h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
591
/* Appending means patching the LAST object of the chain. Patching the
 * head instead would overwrite its next pointer and cut off every
 * object already linked behind it. */
p = le64toh(f->data_hash_table[h].tail_hash_offset);
593
/* Only entry in the hash table is easy */
594
f->data_hash_table[h].head_hash_offset = htole64(offset);
596
/* Move back to the previous data object, to patch in its next_hash_offset pointer */
599
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
603
o->data.next_hash_offset = htole64(offset);
606
f->data_hash_table[h].tail_hash_offset = htole64(offset);
611
int journal_file_find_data_object_with_hash(
613
const void *data, uint64_t size, uint64_t hash,
614
Object **ret, uint64_t *offset) {
616
uint64_t p, osize, h;
620
assert(data || size == 0);
622
osize = offsetof(Object, data.payload) + size;
624
if (f->header->data_hash_table_size == 0)
627
h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
628
p = le64toh(f->data_hash_table[h].head_hash_offset);
633
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
637
if (le64toh(o->data.hash) != hash)
640
if (o->object.flags & OBJECT_COMPRESSED) {
644
l = le64toh(o->object.size);
645
if (l <= offsetof(Object, data.payload))
648
l -= offsetof(Object, data.payload);
650
if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
654
memcmp(f->compress_buffer, data, size) == 0) {
665
return -EPROTONOSUPPORT;
668
} else if (le64toh(o->object.size) == osize &&
669
memcmp(o->data.payload, data, size) == 0) {
681
p = le64toh(o->data.next_hash_offset);
687
int journal_file_find_data_object(
689
const void *data, uint64_t size,
690
Object **ret, uint64_t *offset) {
695
assert(data || size == 0);
697
hash = hash64(data, size);
699
return journal_file_find_data_object_with_hash(f,
704
static int journal_file_append_data(
706
const void *data, uint64_t size,
707
Object **ret, uint64_t *offset) {
713
bool compressed = false;
716
assert(data || size == 0);
718
hash = hash64(data, size);
720
r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
734
osize = offsetof(Object, data.payload) + size;
735
r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
739
o->data.hash = htole64(hash);
743
size >= COMPRESSION_SIZE_THRESHOLD) {
746
compressed = compress_blob(data, size, o->data.payload, &rsize);
749
o->object.size = htole64(offsetof(Object, data.payload) + rsize);
750
o->object.flags |= OBJECT_COMPRESSED;
752
f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
754
log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
760
memcpy(o->data.payload, data, size);
762
r = journal_file_link_data(f, o, p, hash);
766
/* The linking might have altered the window, so let's
767
* refresh our pointer */
768
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
781
uint64_t journal_file_entry_n_items(Object *o) {
783
assert(o->object.type == OBJECT_ENTRY);
785
return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
788
static uint64_t journal_file_entry_array_n_items(Object *o) {
790
assert(o->object.type == OBJECT_ENTRY_ARRAY);
792
return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
795
static int link_entry_into_array(JournalFile *f,
800
uint64_t n = 0, ap = 0, q, i, a, hidx;
809
i = hidx = le64toh(*idx);
812
r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
816
n = journal_file_entry_array_n_items(o);
818
o->entry_array.items[i] = htole64(p);
819
*idx = htole64(hidx + 1);
825
a = le64toh(o->entry_array.next_entry_array_offset);
836
r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
837
offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
842
o->entry_array.items[i] = htole64(p);
847
r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
851
o->entry_array.next_entry_array_offset = htole64(q);
854
*idx = htole64(hidx + 1);
859
static int link_entry_into_array_plus_one(JournalFile *f,
878
i = htole64(le64toh(*idx) - 1);
879
r = link_entry_into_array(f, first, &i, p);
884
*idx = htole64(le64toh(*idx) + 1);
888
static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
895
p = le64toh(o->entry.items[i].object_offset);
899
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
903
return link_entry_into_array_plus_one(f,
904
&o->data.entry_offset,
905
&o->data.entry_array_offset,
910
static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
917
assert(o->object.type == OBJECT_ENTRY);
919
__sync_synchronize();
921
/* Link up the entry itself */
922
r = link_entry_into_array(f,
923
&f->header->entry_array_offset,
924
&f->header->n_entries,
929
/* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
931
if (f->header->head_entry_realtime == 0)
932
f->header->head_entry_realtime = o->entry.realtime;
934
f->header->tail_entry_realtime = o->entry.realtime;
935
f->header->tail_entry_monotonic = o->entry.monotonic;
937
f->tail_entry_monotonic_valid = true;
939
/* Link up the items */
940
n = journal_file_entry_n_items(o);
941
for (i = 0; i < n; i++) {
942
r = journal_file_link_entry_item(f, o, offset, i);
950
static int journal_file_append_entry_internal(
952
const dual_timestamp *ts,
954
const EntryItem items[], unsigned n_items,
956
Object **ret, uint64_t *offset) {
963
assert(items || n_items == 0);
966
osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
968
r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
972
o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
973
memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
974
o->entry.realtime = htole64(ts->realtime);
975
o->entry.monotonic = htole64(ts->monotonic);
976
o->entry.xor_hash = htole64(xor_hash);
977
o->entry.boot_id = f->header->boot_id;
979
r = journal_file_link_entry(f, o, np);
992
/* Notifies inotify watchers that the journal file was modified, by
 * truncating the file to the size recorded in last_stat. A failure is
 * only logged. */
void journal_file_post_change(JournalFile *f) {
995
/* inotify() does not receive IN_MODIFY events from file
996
* accesses done via mmap(). After each access we hence
997
* trigger IN_MODIFY by truncating the journal file to its
998
* current size which triggers IN_MODIFY. */
1000
__sync_synchronize();
1002
if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1003
log_error("Failed to truncate file to its own size: %m");
1006
int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1010
uint64_t xor_hash = 0;
1011
struct dual_timestamp _ts;
1014
assert(iovec || n_iovec == 0);
1020
dual_timestamp_get(&_ts);
1024
if (f->tail_entry_monotonic_valid &&
1025
ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1028
items = alloca(sizeof(EntryItem) * n_iovec);
1030
for (i = 0; i < n_iovec; i++) {
1034
r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1038
xor_hash ^= le64toh(o->data.hash);
1039
items[i].object_offset = htole64(p);
1040
items[i].hash = o->data.hash;
1043
r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1045
journal_file_post_change(f);
1050
static int generic_array_get(JournalFile *f,
1053
Object **ret, uint64_t *offset) {
1065
r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1069
n = journal_file_entry_array_n_items(o);
1071
p = le64toh(o->entry_array.items[i]);
1076
a = le64toh(o->entry_array.next_entry_array_offset);
1079
if (a <= 0 || p <= 0)
1082
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1095
static int generic_array_get_plus_one(JournalFile *f,
1099
Object **ret, uint64_t *offset) {
1108
r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1121
return generic_array_get(f, first, i-1, ret, offset);
1130
static int generic_array_bisect(JournalFile *f,
1134
int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1135
direction_t direction,
1140
uint64_t a, p, t = 0, i = 0, last_p = 0;
1141
bool subtract_one = false;
1142
Object *o, *array = NULL;
1146
assert(test_object);
1150
uint64_t left, right, k, lp;
1152
r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1156
k = journal_file_entry_array_n_items(array);
1162
lp = p = le64toh(array->entry_array.items[i]);
1166
r = test_object(f, p, needle);
1170
if (r == TEST_FOUND)
1171
r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1173
if (r == TEST_RIGHT) {
1177
if (left == right) {
1178
if (direction == DIRECTION_UP)
1179
subtract_one = true;
1185
assert(left < right);
1187
i = (left + right) / 2;
1188
p = le64toh(array->entry_array.items[i]);
1192
r = test_object(f, p, needle);
1196
if (r == TEST_FOUND)
1197
r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1199
if (r == TEST_RIGHT)
1213
a = le64toh(array->entry_array.next_entry_array_offset);
1219
if (subtract_one && t == 0 && i == 0)
1222
if (subtract_one && i == 0)
1224
else if (subtract_one)
1225
p = le64toh(array->entry_array.items[i-1]);
1227
p = le64toh(array->entry_array.items[i]);
1229
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1240
*idx = t + i - (subtract_one ? 1 : 0);
1245
static int generic_array_bisect_plus_one(JournalFile *f,
1250
int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1251
direction_t direction,
1259
assert(test_object);
1264
/* This bisects the array in object 'first', but first checks the 'extra' entry */
1266
r = test_object(f, extra, needle);
1269
else if (r == TEST_FOUND) {
1272
r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1286
} else if (r == TEST_RIGHT)
1289
r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1297
static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1304
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1308
if (le64toh(o->entry.seqnum) == needle)
1310
else if (le64toh(o->entry.seqnum) < needle)
1316
int journal_file_move_to_entry_by_seqnum(
1319
direction_t direction,
1323
return generic_array_bisect(f,
1324
le64toh(f->header->entry_array_offset),
1325
le64toh(f->header->n_entries),
1332
static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1339
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1343
if (le64toh(o->entry.realtime) == needle)
1345
else if (le64toh(o->entry.realtime) < needle)
1351
int journal_file_move_to_entry_by_realtime(
1354
direction_t direction,
1358
return generic_array_bisect(f,
1359
le64toh(f->header->entry_array_offset),
1360
le64toh(f->header->n_entries),
1362
test_object_realtime,
1367
static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1374
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1378
if (le64toh(o->entry.monotonic) == needle)
1380
else if (le64toh(o->entry.monotonic) < needle)
1386
/* Seeks to an entry by monotonic timestamp within one boot.
 *
 * Builds the field string "_BOOT_ID=<boot id>", looks up the matching
 * data object, and bisects that object's entry list (entry_offset plus
 * entry_array_offset, n_entries long) using test_object_monotonic. */
int journal_file_move_to_entry_by_monotonic(
1390
direction_t direction,
1394
/* "_BOOT_ID=" is 9 characters; the id string needs 32 hex digits plus
 * the terminating NUL behind it. */
char t[9+32+1] = "_BOOT_ID=";
1398
/* Write the id after the '=' (index 9). Writing at index 8 would
 * overwrite the '=' and produce a key that never matches a stored
 * field. */
sd_id128_to_string(boot_id, t + 9);
1400
r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1406
return generic_array_bisect_plus_one(f,
1407
le64toh(o->data.entry_offset),
1408
le64toh(o->data.entry_array_offset),
1409
le64toh(o->data.n_entries),
1411
test_object_monotonic,
1416
static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1422
else if (p < needle)
1428
int journal_file_next_entry(
1430
Object *o, uint64_t p,
1431
direction_t direction,
1432
Object **ret, uint64_t *offset) {
1438
assert(p > 0 || !o);
1440
n = le64toh(f->header->n_entries);
1445
i = direction == DIRECTION_DOWN ? 0 : n - 1;
1447
if (o->object.type != OBJECT_ENTRY)
1450
r = generic_array_bisect(f,
1451
le64toh(f->header->entry_array_offset),
1452
le64toh(f->header->n_entries),
1461
if (direction == DIRECTION_DOWN) {
1474
/* And jump to it */
1475
return generic_array_get(f,
1476
le64toh(f->header->entry_array_offset),
1481
int journal_file_skip_entry(
1483
Object *o, uint64_t p,
1485
Object **ret, uint64_t *offset) {
1494
if (o->object.type != OBJECT_ENTRY)
1497
r = generic_array_bisect(f,
1498
le64toh(f->header->entry_array_offset),
1499
le64toh(f->header->n_entries),
1508
/* Calculate new index */
1510
if ((uint64_t) -skip >= i)
1513
i = i - (uint64_t) -skip;
1515
i += (uint64_t) skip;
1517
n = le64toh(f->header->n_entries);
1524
return generic_array_get(f,
1525
le64toh(f->header->entry_array_offset),
1530
int journal_file_next_entry_for_data(
1532
Object *o, uint64_t p,
1533
uint64_t data_offset,
1534
direction_t direction,
1535
Object **ret, uint64_t *offset) {
1542
assert(p > 0 || !o);
1544
r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1548
n = le64toh(d->data.n_entries);
1553
i = direction == DIRECTION_DOWN ? 0 : n - 1;
1555
if (o->object.type != OBJECT_ENTRY)
1558
r = generic_array_bisect_plus_one(f,
1559
le64toh(d->data.entry_offset),
1560
le64toh(d->data.entry_array_offset),
1561
le64toh(d->data.n_entries),
1571
if (direction == DIRECTION_DOWN) {
1585
return generic_array_get_plus_one(f,
1586
le64toh(d->data.entry_offset),
1587
le64toh(d->data.entry_array_offset),
1592
int journal_file_move_to_entry_by_seqnum_for_data(
1594
uint64_t data_offset,
1596
direction_t direction,
1597
Object **ret, uint64_t *offset) {
1602
r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1606
return generic_array_bisect_plus_one(f,
1607
le64toh(d->data.entry_offset),
1608
le64toh(d->data.entry_array_offset),
1609
le64toh(d->data.n_entries),
1616
int journal_file_move_to_entry_by_realtime_for_data(
1618
uint64_t data_offset,
1620
direction_t direction,
1621
Object **ret, uint64_t *offset) {
1626
r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1630
return generic_array_bisect_plus_one(f,
1631
le64toh(d->data.entry_offset),
1632
le64toh(d->data.entry_array_offset),
1633
le64toh(d->data.n_entries),
1635
test_object_realtime,
1640
void journal_file_dump(JournalFile *f) {
1641
char a[33], b[33], c[33];
1648
printf("File Path: %s\n"
1652
"Arena size: %llu\n"
1656
sd_id128_to_string(f->header->file_id, a),
1657
sd_id128_to_string(f->header->machine_id, b),
1658
sd_id128_to_string(f->header->boot_id, c),
1659
(unsigned long long) le64toh(f->header->arena_size),
1660
(unsigned long) le64toh(f->header->n_objects),
1661
(unsigned long) le64toh(f->header->n_entries));
1663
p = le64toh(f->header->arena_offset);
1665
r = journal_file_move_to_object(f, -1, p, &o);
1669
switch (o->object.type) {
1672
printf("Type: OBJECT_UNUSED\n");
1676
printf("Type: OBJECT_DATA\n");
1680
printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1681
(unsigned long long) le64toh(o->entry.seqnum),
1682
(unsigned long long) le64toh(o->entry.monotonic),
1683
(unsigned long long) le64toh(o->entry.realtime));
1686
case OBJECT_FIELD_HASH_TABLE:
1687
printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1690
case OBJECT_DATA_HASH_TABLE:
1691
printf("Type: OBJECT_DATA_HASH_TABLE\n");
1694
case OBJECT_ENTRY_ARRAY:
1695
printf("Type: OBJECT_ENTRY_ARRAY\n");
1699
if (o->object.flags & OBJECT_COMPRESSED)
1700
printf("Flags: COMPRESSED\n");
1702
if (p == le64toh(f->header->tail_object_offset))
1705
p = p + ALIGN64(le64toh(o->object.size));
1710
log_error("File corrupt");
1713
int journal_file_open(
1717
JournalFile *template,
1718
JournalFile **ret) {
1722
bool newly_created = false;
1726
if ((flags & O_ACCMODE) != O_RDONLY &&
1727
(flags & O_ACCMODE) != O_RDWR)
1730
if (!endswith(fname, ".journal"))
1733
f = new0(JournalFile, 1);
1740
f->writable = (flags & O_ACCMODE) != O_RDONLY;
1741
f->prot = prot_from_flags(flags);
1744
f->metrics = template->metrics;
1745
f->compress = template->compress;
1748
f->path = strdup(fname);
1754
f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1760
if (fstat(f->fd, &f->last_stat) < 0) {
1765
if (f->last_stat.st_size == 0 && f->writable) {
1766
newly_created = true;
1768
r = journal_file_init_header(f, template);
1772
if (fstat(f->fd, &f->last_stat) < 0) {
1778
if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1783
f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1784
if (f->header == MAP_FAILED) {
1790
if (!newly_created) {
1791
r = journal_file_verify_header(f);
1797
r = journal_file_refresh_header(f);
1802
if (newly_created) {
1804
r = journal_file_setup_field_hash_table(f);
1808
r = journal_file_setup_data_hash_table(f);
1813
r = journal_file_map_field_hash_table(f);
1817
r = journal_file_map_data_hash_table(f);
1827
journal_file_close(f);
1832
int journal_file_rotate(JournalFile **f) {
1835
JournalFile *old_file, *new_file = NULL;
1843
if (!old_file->writable)
1846
if (!endswith(old_file->path, ".journal"))
1849
l = strlen(old_file->path);
1851
p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
1855
memcpy(p, old_file->path, l - 8);
1857
sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1858
snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1859
"-%016llx-%016llx.journal",
1860
(unsigned long long) le64toh((*f)->header->seqnum),
1861
(unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1863
r = rename(old_file->path, p);
1869
old_file->header->state = STATE_ARCHIVED;
1871
r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1872
journal_file_close(old_file);
1878
int journal_file_open_reliably(
1882
JournalFile *template,
1883
JournalFile **ret) {
1889
r = journal_file_open(fname, flags, mode, template, ret);
1893
if ((flags & O_ACCMODE) == O_RDONLY)
1896
if (!(flags & O_CREAT))
1899
/* The file is corrupted. Rotate it away and try it again (but only once) */
1902
if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
1904
(unsigned long long) now(CLOCK_REALTIME),
1908
r = rename(fname, p);
1913
log_warning("File %s corrupted, renaming and replacing.", fname);
1915
return journal_file_open(fname, flags, mode, template, ret);
1918
struct vacuum_info {
1923
sd_id128_t seqnum_id;
1929
static int vacuum_compare(const void *_a, const void *_b) {
1930
const struct vacuum_info *a, *b;
1935
if (a->have_seqnum && b->have_seqnum &&
1936
sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1937
if (a->seqnum < b->seqnum)
1939
else if (a->seqnum > b->seqnum)
1945
if (a->realtime < b->realtime)
1947
else if (a->realtime > b->realtime)
1949
else if (a->have_seqnum && b->have_seqnum)
1950
return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1952
return strcmp(a->filename, b->filename);
1955
int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1958
struct vacuum_info *list = NULL;
1959
unsigned n_list = 0, n_allocated = 0, i;
1967
d = opendir(directory);
1973
struct dirent buf, *de;
1977
unsigned long long seqnum, realtime;
1978
sd_id128_t seqnum_id;
1981
k = readdir_r(d, &buf, &de);
1990
if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
1993
if (!S_ISREG(st.st_mode))
1996
q = strlen(de->d_name);
1998
if (endswith(de->d_name, ".journal")) {
2000
/* Vacuum archived files */
2002
if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2005
if (de->d_name[q-8-16-1] != '-' ||
2006
de->d_name[q-8-16-1-16-1] != '-' ||
2007
de->d_name[q-8-16-1-16-1-32-1] != '@')
2010
p = strdup(de->d_name);
2016
de->d_name[q-8-16-1-16-1] = 0;
2017
if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2022
if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2029
} else if (endswith(de->d_name, ".journal~")) {
2030
unsigned long long tmp;
2032
/* Vacuum corrupted files */
2034
if (q < 1 + 16 + 1 + 16 + 8 + 1)
2037
if (de->d_name[q-1-8-16-1] != '-' ||
2038
de->d_name[q-1-8-16-1-16-1] != '@')
2041
p = strdup(de->d_name);
2047
if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2052
have_seqnum = false;
2056
if (n_list >= n_allocated) {
2057
struct vacuum_info *j;
2059
n_allocated = MAX(n_allocated * 2U, 8U);
2060
j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2070
list[n_list].filename = p;
2071
list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2072
list[n_list].seqnum = seqnum;
2073
list[n_list].realtime = realtime;
2074
list[n_list].seqnum_id = seqnum_id;
2075
list[n_list].have_seqnum = have_seqnum;
2077
sum += list[n_list].usage;
2082
qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2084
for(i = 0; i < n_list; i++) {
2087
if (fstatvfs(dirfd(d), &ss) < 0) {
2092
if (sum <= max_use &&
2093
(uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2096
if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2097
log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2098
sum -= list[i].usage;
2099
} else if (errno != ENOENT)
2100
log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2104
for (i = 0; i < n_list; i++)
2105
free(list[i].filename);
2115
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2117
uint64_t q, xor_hash = 0;
2130
ts.monotonic = le64toh(o->entry.monotonic);
2131
ts.realtime = le64toh(o->entry.realtime);
2133
if (to->tail_entry_monotonic_valid &&
2134
ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2137
if (ts.realtime < le64toh(to->header->tail_entry_realtime))
2140
n = journal_file_entry_n_items(o);
2141
items = alloca(sizeof(EntryItem) * n);
2143
for (i = 0; i < n; i++) {
2144
uint64_t le_hash, l, h;
2149
q = le64toh(o->entry.items[i].object_offset);
2150
le_hash = o->entry.items[i].hash;
2152
r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2156
if (le_hash != o->data.hash)
2159
l = le64toh(o->object.size) - offsetof(Object, data.payload);
2162
/* We hit the limit on 32bit machines */
2163
if ((uint64_t) t != l)
2166
if (o->object.flags & OBJECT_COMPRESSED) {
2170
if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2173
data = from->compress_buffer;
2176
return -EPROTONOSUPPORT;
2179
data = o->data.payload;
2181
r = journal_file_append_data(to, data, l, &u, &h);
2185
xor_hash ^= le64toh(u->data.hash);
2186
items[i].object_offset = htole64(h);
2187
items[i].hash = u->data.hash;
2189
r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2194
return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2197
void journal_default_metrics(JournalMetrics *m, int fd) {
2198
uint64_t fs_size = 0;
2200
char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2205
if (fstatvfs(fd, &ss) >= 0)
2206
fs_size = ss.f_frsize * ss.f_blocks;
2208
if (m->max_use == (uint64_t) -1) {
2211
m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2213
if (m->max_use > DEFAULT_MAX_USE_UPPER)
2214
m->max_use = DEFAULT_MAX_USE_UPPER;
2216
if (m->max_use < DEFAULT_MAX_USE_LOWER)
2217
m->max_use = DEFAULT_MAX_USE_LOWER;
2219
m->max_use = DEFAULT_MAX_USE_LOWER;
2221
m->max_use = PAGE_ALIGN(m->max_use);
2223
if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2224
m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2227
if (m->max_size == (uint64_t) -1) {
2228
m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2230
if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2231
m->max_size = DEFAULT_MAX_SIZE_UPPER;
2233
m->max_size = PAGE_ALIGN(m->max_size);
2235
if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2236
m->max_size = JOURNAL_FILE_SIZE_MIN;
2238
if (m->max_size*2 > m->max_use)
2239
m->max_use = m->max_size*2;
2241
if (m->min_size == (uint64_t) -1)
2242
m->min_size = JOURNAL_FILE_SIZE_MIN;
2244
m->min_size = PAGE_ALIGN(m->min_size);
2246
if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2247
m->min_size = JOURNAL_FILE_SIZE_MIN;
2249
if (m->min_size > m->max_size)
2250
m->max_size = m->min_size;
2253
if (m->keep_free == (uint64_t) -1) {
2256
m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2258
if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2259
m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2262
m->keep_free = DEFAULT_KEEP_FREE;
2265
log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2266
format_bytes(a, sizeof(a), m->max_use),
2267
format_bytes(b, sizeof(b), m->max_size),
2268
format_bytes(c, sizeof(c), m->min_size),
2269
format_bytes(d, sizeof(d), m->keep_free));