2
Copyright (c) 2008-2009 Gluster, Inc. <http://www.gluster.com>
3
This file is part of GlusterFS.
5
GlusterFS is free software; you can redistribute it and/or modify
6
it under the terms of the GNU General Public License as published
7
by the Free Software Foundation; either version 3 of the License,
8
or (at your option) any later version.
10
GlusterFS is distributed in the hope that it will be useful, but
11
WITHOUT ANY WARRANTY; without even the implied warranty of
12
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
General Public License for more details.
15
You should have received a copy of the GNU General Public License
16
along with this program. If not, see
17
<http://www.gnu.org/licenses/>.
25
* implement the procedures to interact with bdb */
27
/****************************************************************
29
* General wrappers and utility procedures for bdb xlator
31
****************************************************************/
34
bdb_inode_transform (ino_t parent,
41
hash = gf_dm_hashfn (name, namelen);
43
ino = (((parent << 32) | 0x00000000ffffffffULL)
44
& (hash | 0xffffffff00000000ULL));
50
bdb_generate_secondary_hash (DB *secondary,
56
uint32_t *hash = NULL;
60
hash = calloc (1, sizeof (uint32_t));
62
*hash = gf_dm_hashfn (primary, pkey->size);
65
skey->size = sizeof (hash);
66
skey->flags = DB_DBT_APPMALLOC;
71
/***********************************************************
73
* bdb storage database utilities
75
**********************************************************/
78
* bdb_db_open - opens a storage db.
80
* @ctx: context specific to the directory for which we are supposed to open db
82
* see, if we have empty slots to open a db.
83
* if (no-empty-slots), then prune open dbs and close as many as possible
84
* if (empty-slot-available), just go ahead and open the db
88
bdb_db_open (bctx_t *bctx)
93
bctx_table_t *table = NULL;
95
GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
98
GF_VALIDATE_OR_GOTO ("bdb-ll", table, out);
100
/* we have to do the following, we can't deny someone of db_open ;) */
101
ret = db_create (&primary, table->dbenv, 0);
103
gf_log ("bdb-ll", GF_LOG_DEBUG,
104
"_BDB_DB_OPEN %s: %s (failed to create database object"
105
" for primary database)",
106
bctx->directory, db_strerror (ret));
111
if (table->page_size) {
112
ret = primary->set_pagesize (primary,
115
gf_log ("bdb-ll", GF_LOG_DEBUG,
116
"_BDB_DB_OPEN %s: %s (failed to set page-size "
118
bctx->directory, db_strerror (ret),
121
gf_log ("bdb-ll", GF_LOG_DEBUG,
122
"_BDB_DB_OPEN %s: page-size set to %"PRIu64,
123
bctx->directory, table->page_size);
127
ret = primary->open (primary, NULL, bctx->db_path, "primary",
128
table->access_mode, table->dbflags, 0);
130
gf_log ("bdb-ll", GF_LOG_ERROR,
131
"_BDB_DB_OPEN %s: %s "
132
"(failed to open primary database)",
133
bctx->directory, db_strerror (ret));
138
ret = db_create (&secondary, table->dbenv, 0);
140
gf_log ("bdb-ll", GF_LOG_DEBUG,
141
"_BDB_DB_OPEN %s: %s (failed to create database object"
142
" for secondary database)",
143
bctx->directory, db_strerror (ret));
148
ret = secondary->open (secondary, NULL, bctx->db_path, "secondary",
149
table->access_mode, table->dbflags, 0);
151
gf_log ("bdb-ll", GF_LOG_ERROR,
152
"_BDB_DB_OPEN %s: %s "
153
"(failed to open secondary database)",
154
bctx->directory, db_strerror (ret));
159
ret = primary->associate (primary, NULL, secondary,
160
bdb_generate_secondary_hash,
161
#ifdef DB_IMMUTABLE_KEY
167
gf_log ("bdb-ll", GF_LOG_ERROR,
168
"_BDB_DB_OPEN %s: %s "
169
"(failed to associate primary database with "
170
"secondary database)",
171
bctx->directory, db_strerror (ret));
177
bctx->primary = primary;
178
bctx->secondary = secondary;
183
primary->close (primary, 0);
185
secondary->close (secondary, 0);
191
bdb_cursor_close (bctx_t *bctx,
196
GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
197
GF_VALIDATE_OR_GOTO ("bdb-ll", cursorp, out);
201
#ifdef HAVE_BDB_CURSOR_GET
202
ret = cursorp->close (cursorp);
204
ret = cursorp->c_close (cursorp);
207
gf_log ("bdb-ll", GF_LOG_DEBUG,
208
"_BDB_CURSOR_CLOSE %s: %s "
209
"(failed to close database cursor)",
210
bctx->directory, db_strerror (ret));
213
UNLOCK (&bctx->lock);
221
bdb_cursor_open (bctx_t *bctx,
226
GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
227
GF_VALIDATE_OR_GOTO ("bdb-ll", cursorpp, out);
231
if (bctx->secondary) {
232
/* do nothing, just continue */
235
ret = bdb_db_open (bctx);
237
gf_log ("bdb-ll", GF_LOG_DEBUG,
238
"_BDB_CURSOR_OPEN %s: ENOMEM "
239
"(failed to open secondary database)",
248
/* all set, open cursor */
249
ret = bctx->secondary->cursor (bctx->secondary,
252
gf_log ("bdb-ll", GF_LOG_DEBUG,
253
"_BDB_CURSOR_OPEN %s: %s "
254
"(failed to open a cursor to database)",
255
bctx->directory, db_strerror (ret));
259
UNLOCK (&bctx->lock);
268
bdb_cache_lookup (bctx_t *bctx,
271
bdb_cache_t *bcache = NULL;
272
bdb_cache_t *trav = NULL;
275
GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
276
GF_VALIDATE_OR_GOTO ("bdb-ll", path, out);
278
MAKE_KEY_FROM_PATH (key, path);
282
list_for_each_entry (trav, &bctx->c_list, c_list) {
283
if (!strcmp (trav->key, key)){
289
UNLOCK (&bctx->lock);
296
bdb_cache_insert (bctx_t *bctx,
300
bdb_cache_t *bcache = NULL;
303
GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
304
GF_VALIDATE_OR_GOTO ("bdb-ll", key, out);
305
GF_VALIDATE_OR_GOTO ("bdb-ll", data, out);
309
if (bctx->c_count > 5) {
310
/* most of the times, we enter here */
311
/* FIXME: ugly, not supposed to disect any of the
312
* 'struct list_head' directly */
313
if (!list_empty (&bctx->c_list)) {
314
bcache = list_entry (bctx->c_list.prev,
315
bdb_cache_t, c_list);
316
list_del_init (&bcache->c_list);
320
bcache->key = calloc (key->size + 1,
322
GF_VALIDATE_OR_GOTO ("bdb-ll",
323
bcache->key, unlock);
324
memcpy (bcache->key, (char *)key->data,
327
/* should never come here */
328
gf_log ("bdb-ll", GF_LOG_DEBUG,
329
"_BDB_CACHE_INSERT %s (%s) "
330
"(found a cache entry with empty key)",
331
bctx->directory, (char *)key->data);
332
} /* if(bcache->key)...else */
335
bcache->data = memdup (data->data, data->size);
336
GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->data,
338
bcache->size = data->size;
340
/* should never come here */
341
gf_log ("bdb-ll", GF_LOG_CRITICAL,
342
"_BDB_CACHE_INSERT %s (%s) "
343
"(found a cache entry with no data)",
344
bctx->directory, (char *)key->data);
345
} /* if(bcache->data)...else */
346
list_add (&bcache->c_list, &bctx->c_list);
349
/* we will be entering here very rarely */
350
bcache = CALLOC (1, sizeof (*bcache));
351
GF_VALIDATE_OR_GOTO ("bdb-ll", bcache, unlock);
353
bcache->key = calloc (key->size + 1, sizeof (char));
354
GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->key, unlock);
355
memcpy (bcache->key, key->data, key->size);
357
bcache->data = memdup (data->data, data->size);
358
GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->data, unlock);
360
bcache->size = data->size;
361
list_add (&bcache->c_list, &bctx->c_list);
364
} /* if(private->c_count < 5)...else */
367
UNLOCK (&bctx->lock);
373
bdb_cache_delete (bctx_t *bctx,
376
bdb_cache_t *bcache = NULL;
377
bdb_cache_t *trav = NULL;
379
GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
380
GF_VALIDATE_OR_GOTO ("bdb-ll", key, out);
384
list_for_each_entry (trav, &bctx->c_list, c_list) {
385
if (!strcmp (trav->key, key)){
393
list_del_init (&bcache->c_list);
399
UNLOCK (&bctx->lock);
406
bdb_db_stat (bctx_t *bctx,
416
if (bctx->primary == NULL) {
417
ret = bdb_db_open (bctx);
418
storage = bctx->primary;
420
/* we are just fine, lets continue */
421
storage = bctx->primary;
422
} /* if(bctx->dbp==NULL)...else */
424
UNLOCK (&bctx->lock);
426
GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out);
428
ret = storage->stat (storage, txnid, &stat, flags);
431
gf_log ("bdb-ll", GF_LOG_DEBUG,
432
"_BDB_DB_STAT %s: %s "
433
"(failed to do stat database)",
434
bctx->directory, db_strerror (ret));
441
/* bdb_db_get - retrieve a key/value pair corresponding to @path from the
442
* corresponding db file.
444
* @bctx: bctx_t * corresponding to the parent directory of @path. (should
445
* always be a valid bctx). bdb_db_get should never be called if
447
* @txnid: NULL if bdb_db_get is not embedded in an explicit transaction
448
* or a valid DB_TXN *, when embedded in an explicit transaction.
449
* @path: path of the file to read from (translated to a database key using
450
* MAKE_KEY_FROM_PATH)
451
* @buf: char ** - pointer to a pointer to char. a read buffer is created in
452
* this procedure and pointer to the buffer is passed through @buf to the
454
* @size: size of the file content to be read.
455
* @offset: offset from which the file content to be read.
457
* NOTE: bdb_db_get tries to open DB, if @bctx->dbp == NULL
458
* (@bctx->dbp == NULL, nobody has opened DB till now or DB was closed by
459
* bdb_table_prune()).
461
* NOTE: if private->cache is set (bdb xlator's internal caching enabled), then
462
* bdb_db_get first looks up the cache for key/value pair. if
463
* bdb_cache_lookup fails, then only DB->get() is called. also, inserts a
464
* newly read key/value pair to cache through bdb_cache_insert.
466
* return: 'number of bytes read' on success or -1 on error.
468
* also see: bdb_cache_lookup, bdb_cache_insert for details about bdb
469
* xlator's internal cache.
472
bdb_db_get (bctx_t *bctx,
483
size_t copy_size = 0;
484
char *key_string = NULL;
485
bdb_cache_t *bcache = NULL;
486
int32_t db_flags = 0;
487
uint8_t need_break = 0;
490
GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out);
491
GF_VALIDATE_OR_GOTO ("bdb-ll", path, out);
493
MAKE_KEY_FROM_PATH (key_string, path);
496
((bcache = bdb_cache_lookup (bctx, key_string)) != NULL)) {
498
copy_size = ((bcache->size - offset) < size)?
499
(bcache->size - offset) : size;
501
memcpy (buf, (bcache->data + offset), copy_size);
512
if (bctx->primary == NULL) {
513
ret = bdb_db_open (bctx);
514
storage = bctx->primary;
516
/* we are just fine, lets continue */
517
storage = bctx->primary;
518
} /* if(bctx->dbp==NULL)...else */
520
UNLOCK (&bctx->lock);
522
GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out);
524
key.data = (char *)key_string;
525
key.size = strlen (key_string);
526
key.flags = DB_DBT_USERMEM;
529
value.flags = DB_DBT_MALLOC;
534
value.flags = DB_DBT_USERMEM | DB_DBT_PARTIAL;
536
value.flags = DB_DBT_MALLOC;
543
/* TODO: we prefer to give our own buffer to value.data
544
* and ask bdb to fill in it */
545
ret = storage->get (storage, txnid, &key, &value,
548
if (ret == DB_NOTFOUND) {
549
gf_log ("bdb-ll", GF_LOG_DEBUG,
550
"_BDB_DB_GET %s - %s: ENOENT"
551
"(specified key not found in database)",
552
bctx->directory, key_string);
555
} else if (ret == DB_LOCK_DEADLOCK) {
557
gf_log ("bdb-ll", GF_LOG_DEBUG,
558
"_BDB_DB_GET %s - %s"
559
"(deadlock detected, retrying for %d "
561
bctx->directory, key_string, retries);
562
} else if (ret == 0) {
563
/* successfully read data, lets set everything
564
* in place and return */
567
copy_size = ((value.size - offset) < size) ?
568
(value.size - offset) : size;
570
memcpy (buf, (value.data + offset),
575
bdb_cache_insert (bctx, &key, &value);
585
gf_log ("bdb-ll", GF_LOG_DEBUG,
586
"_BDB_DB_GET %s - %s: %s"
587
"(failed to retrieve specified key from"
589
bctx->directory, key_string,
594
} while (!need_break);
600
/* TODO: handle errors here and log. propagate only the errno to caller */
602
bdb_db_fread (struct bdb_fd *bfd, char *buf, size_t size, off_t offset)
604
return bdb_db_get (bfd->ctx, NULL, bfd->key, buf, size, offset);
608
bdb_db_iread (struct bdb_ctx *bctx, const char *key, char **bufp)
614
ret = bdb_db_get (bctx, NULL, key, NULL, 0, 0);
618
buf = calloc (size, sizeof (char));
620
ret = bdb_db_get (bctx, NULL, key, buf, size, 0);
626
/* bdb_db_put - insert the specified key/value into the corresponding DB.
628
* @bctx: bctx_t * corresponding to the parent directory of @path.
629
* (should always be a valid bctx). bdb_db_put should never be
630
* called if @bctx = NULL.
631
* @txnid: NULL if bdb_db_put is not embedded in an explicit transaction
632
* or a valid DB_TXN *, when embedded in an explicit transaction.
633
* @key_string: key of the database entry.
634
* @buf: pointer to the buffer data to be written as data for @key_string.
635
* @size: size of @buf.
636
* @offset: offset in the key's data to be modified with provided data.
637
* @flags: valid flags are BDB_TRUNCATE_RECORD (to reduce the data of
638
* @key_string to 0 size).
640
* NOTE: bdb_db_put tries to open DB, if @bctx->dbp == NULL
641
* (@bctx->dbp == NULL, nobody has opened DB till now or DB was closed by
642
* bdb_table_prune()).
644
* NOTE: bdb_db_put deletes the key/value from bdb xlator's internal cache.
646
* return: 0 on success or -1 on error.
648
* also see: bdb_cache_delete for details on how a cached key/value pair is
652
bdb_db_put (bctx_t *bctx,
654
const char *key_string,
661
DBT key = {0,}, value = {0,};
663
int32_t db_flags = DB_AUTO_COMMIT;
664
uint8_t need_break = 0;
669
if (bctx->primary == NULL) {
670
ret = bdb_db_open (bctx);
671
storage = bctx->primary;
673
/* we are just fine, lets continue */
674
storage = bctx->primary;
677
UNLOCK (&bctx->lock);
679
GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out);
682
ret = bdb_cache_delete (bctx, (char *)key_string);
683
GF_VALIDATE_OR_GOTO ("bdb-ll", (ret == 0), out);
686
key.data = (void *)key_string;
687
key.size = strlen (key_string);
689
/* NOTE: bdb lets us expand the file, suppose value.size > value.len,
690
* then value.len bytes from value.doff offset and value.size bytes
691
* will be written from value.doff and data from
692
* value.doff + value.dlen will be pushed value.doff + value.size
694
value.data = (void *)buf;
696
if (flags & BDB_TRUNCATE_RECORD) {
705
value.flags = DB_DBT_PARTIAL;
706
if (buf == NULL && size == 0)
707
/* truncate called us */
711
ret = storage->put (storage, txnid, &key, &value, db_flags);
712
if (ret == DB_LOCK_DEADLOCK) {
714
gf_log ("bdb-ll", GF_LOG_DEBUG,
715
"_BDB_DB_PUT %s - %s"
716
"(deadlock detected, retying for %d time)",
717
bctx->directory, key_string, retries);
720
gf_log ("bdb-ll", GF_LOG_DEBUG,
721
"_BDB_DB_PUT %s - %s: %s"
722
"(failed to put specified entry into database)",
723
bctx->directory, key_string, db_strerror (ret));
726
/* successfully wrote */
730
} while (!need_break);
736
bdb_db_icreate (struct bdb_ctx *bctx, const char *key)
738
return bdb_db_put (bctx, NULL, key, NULL, 0, 0, 0);
741
/* TODO: handle errors here and log. propagate only the errno to caller */
743
bdb_db_fwrite (struct bdb_fd *bfd, char *buf, size_t size, off_t offset)
745
return bdb_db_put (bfd->ctx, NULL, bfd->key, buf, size, offset, 0);
748
/* TODO: handle errors here and log. propagate only the errno to caller */
750
bdb_db_iwrite (struct bdb_ctx *bctx, const char *key, char *buf, size_t size)
752
return bdb_db_put (bctx, NULL, key, buf, size, 0, 0);
756
bdb_db_itruncate (struct bdb_ctx *bctx, const char *key)
758
return bdb_db_put (bctx, NULL, key, NULL, 0, 1, 0);
761
/* bdb_db_del - delete a key/value pair corresponding to @path from
762
* corresponding db file.
764
* @bctx: bctx_t * corresponding to the parent directory of @path.
765
* (should always be a valid bctx). bdb_db_del should never be called
767
* @txnid: NULL if bdb_db_del is not embedded in an explicit transaction
768
* or a valid DB_TXN *, when embedded in an explicit transaction.
769
* @path: path to the file, whose key/value pair has to be deleted.
771
* NOTE: bdb_db_del tries to open DB, if @bctx->dbp == NULL
772
* (@bctx->dbp == NULL, nobody has opened DB till now or DB was closed by
773
* bdb_table_prune()).
775
* return: 0 on success or -1 on error.
778
bdb_db_del (bctx_t *bctx,
780
const char *key_string)
785
int32_t db_flags = 0;
786
uint8_t need_break = 0;
791
if (bctx->primary == NULL) {
792
ret = bdb_db_open (bctx);
793
storage = bctx->primary;
795
/* we are just fine, lets continue */
796
storage = bctx->primary;
799
UNLOCK (&bctx->lock);
801
GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out);
803
ret = bdb_cache_delete (bctx, key_string);
804
GF_VALIDATE_OR_GOTO ("bdb-ll", (ret == 0), out);
806
key.data = (char *)key_string;
807
key.size = strlen (key_string);
808
key.flags = DB_DBT_USERMEM;
811
ret = storage->del (storage, txnid, &key, db_flags);
813
if (ret == DB_NOTFOUND) {
814
gf_log ("bdb-ll", GF_LOG_DEBUG,
815
"_BDB_DB_DEL %s - %s: ENOENT"
816
"(failed to delete entry, could not be "
817
"found in the database)",
818
bctx->directory, key_string);
820
} else if (ret == DB_LOCK_DEADLOCK) {
822
gf_log ("bdb-ll", GF_LOG_DEBUG,
823
"_BDB_DB_DEL %s - %s"
824
"(deadlock detected, retying for %d time)",
825
bctx->directory, key_string, retries);
826
} else if (ret == 0) {
827
/* successfully deleted the entry */
828
gf_log ("bdb-ll", GF_LOG_DEBUG,
829
"_BDB_DB_DEL %s - %s"
830
"(successfully deleted entry from database)",
831
bctx->directory, key_string);
835
gf_log ("bdb-ll", GF_LOG_DEBUG,
836
"_BDB_DB_DEL %s - %s: %s"
837
"(failed to delete entry from database)",
838
bctx->directory, key_string, db_strerror (ret));
842
} while (!need_break);
848
bdb_db_iremove (bctx_t *bctx,
851
return bdb_db_del (bctx, NULL, key);
854
/* NOTE: bdb version compatibility wrapper */
856
bdb_cursor_get (DBC *cursorp,
863
GF_VALIDATE_OR_GOTO ("bdb-ll", cursorp, out);
865
#ifdef HAVE_BDB_CURSOR_GET
866
ret = cursorp->pget (cursorp, sec, pri, val, flags);
868
ret = cursorp->c_pget (cursorp, sec, pri, val, flags);
870
if ((ret != 0) && (ret != DB_NOTFOUND)) {
871
gf_log ("bdb-ll", GF_LOG_DEBUG,
872
"_BDB_CURSOR_GET: %s"
873
"(failed to retrieve entry from database cursor)",
879
}/* bdb_cursor_get */
882
bdb_dirent_size (DBT *key)
884
return ALIGN (24 /* FIX MEEEE!!! */ + key->size);
889
/* bdb_dbenv_init - initialize DB_ENV
891
* initialization includes:
892
* 1. opening DB_ENV (db_env_create(), DB_ENV->open()).
893
* NOTE: see private->envflags for flags used.
894
* 2. DB_ENV->set_lg_dir - set log directory to be used for storing log files
895
* (log files are the files in which transaction logs are written by db).
896
* 3. DB_ENV->set_flags (DB_LOG_AUTOREMOVE) - set DB_ENV to automatically
897
* clear the unwanted log files (flushed at each checkpoint).
898
* 4. DB_ENV->set_errfile - set errfile to be used by db to report detailed
899
* error logs. used only for debugging purposes.
901
* return: returns a valid DB_ENV * on success or NULL on error.
905
bdb_dbenv_init (xlator_t *this,
908
/* Create a DB environment */
909
DB_ENV *dbenv = NULL;
911
bdb_private_t *private = NULL;
912
int32_t fatal_flags = 0;
914
VALIDATE_OR_GOTO (this, err);
915
VALIDATE_OR_GOTO (directory, err);
917
private = this->private;
918
VALIDATE_OR_GOTO (private, err);
920
ret = db_env_create (&dbenv, 0);
921
VALIDATE_OR_GOTO ((ret == 0), err);
923
/* NOTE: set_errpfx returns 'void' */
924
dbenv->set_errpfx(dbenv, this->name);
926
ret = dbenv->set_lk_detect (dbenv, DB_LOCK_DEFAULT);
927
VALIDATE_OR_GOTO ((ret == 0), err);
929
ret = dbenv->open(dbenv, directory,
932
if ((ret != 0) && (ret != DB_RUNRECOVERY)) {
933
gf_log (this->name, GF_LOG_CRITICAL,
934
"failed to join Berkeley DB environment at %s: %s."
935
"please run manual recovery and retry running "
937
directory, db_strerror (ret));
940
} else if (ret == DB_RUNRECOVERY) {
941
fatal_flags = ((private->envflags & (~DB_RECOVER))
943
ret = dbenv->open(dbenv, directory, fatal_flags,
946
gf_log (this->name, GF_LOG_CRITICAL,
947
"failed to join Berkeley DB environment in "
948
"recovery mode at %s: %s. please run manual "
949
"recovery and retry running glusterfs",
950
directory, db_strerror (ret));
958
#if ((DB_VERSION_MAJOR > 4) || \
959
(DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR >= 7))
960
if (private->log_auto_remove) {
961
ret = dbenv->log_set_config (dbenv, DB_LOG_AUTO_REMOVE, 1);
963
ret = dbenv->log_set_config (dbenv, DB_LOG_AUTO_REMOVE, 0);
966
if (private->log_auto_remove) {
967
ret = dbenv->set_flags (dbenv, DB_LOG_AUTOREMOVE, 1);
969
ret = dbenv->set_flags (dbenv, DB_LOG_AUTOREMOVE, 0);
973
gf_log ("bdb-ll", GF_LOG_ERROR,
974
"autoremoval of transactional log files could not be "
975
"configured (%s). you may have to do a manual "
976
"monitoring of transactional log files and remove "
982
if (private->transaction) {
983
ret = dbenv->set_flags(dbenv, DB_AUTO_COMMIT, 1);
986
gf_log ("bdb-ll", GF_LOG_DEBUG,
987
"configuration of auto-commit failed for "
988
"database environment at %s. none of the "
989
"operations will be embedded in transaction "
990
"unless explicitly done so.",
995
if (private->txn_timeout) {
996
ret = dbenv->set_timeout (dbenv, private->txn_timeout,
999
gf_log ("bdb-ll", GF_LOG_ERROR,
1000
"could not configure Berkeley DB "
1001
"transaction timeout to %d (%s). please"
1002
" review 'option transaction-timeout %d"
1004
private->txn_timeout,
1006
private->txn_timeout);
1011
if (private->lock_timeout) {
1012
ret = dbenv->set_timeout(dbenv,
1013
private->txn_timeout,
1014
DB_SET_LOCK_TIMEOUT);
1016
gf_log ("bdb-ll", GF_LOG_ERROR,
1017
"could not configure Berkeley DB "
1018
"lock timeout to %d (%s). please"
1019
" review 'option lock-timeout %d"
1021
private->lock_timeout,
1023
private->lock_timeout);
1028
ret = dbenv->set_lg_dir (dbenv, private->logdir);
1030
gf_log ("bdb-ll", GF_LOG_ERROR,
1031
"failed to configure libdb transaction log "
1032
"directory at %s. please review the "
1033
"'option logdir %s' option.",
1034
db_strerror (ret), private->logdir);
1039
if (private->errfile) {
1040
private->errfp = fopen (private->errfile, "a+");
1041
if (private->errfp) {
1042
dbenv->set_errfile (dbenv, private->errfp);
1044
gf_log ("bdb-ll", GF_LOG_ERROR,
1045
"failed to open error logging file for "
1046
"libdb (Berkeley DB) internal logging (%s)."
1047
"please review the 'option errfile %s' option.",
1048
strerror (errno), private->errfile);
1056
dbenv->close (dbenv, 0);
1062
#define BDB_ENV(this) ((((struct bdb_private *)this->private)->b_table)->dbenv)
1064
/* bdb_checkpoint - during transactional usage, db does not directly write the
1065
* data to db files, instead db writes a 'log' (similar to a journal entry)
1066
* into a log file. db normally clears the log files during opening of an
1067
* environment. since we expect a filesystem server to run for a pretty long
1068
* duration and flushing 'log's during dbenv->open would prove very costly, if
1069
* we accumulate the log entries for one complete run of glusterfs server. to
1070
* flush the logs frequently, db provides a mechanism called 'checkpointing'.
1071
* when we do a checkpoint, db flushes the logs to disk (writes changes to db
1072
* files) and we can also clear the accumulated log files after checkpointing.
1073
* NOTE: removing unwanted log files is not part of dbenv->txn_checkpoint()
1076
* @data: xlator_t of the current instance of bdb xlator.
1078
* bdb_checkpoint is called in a different thread from the main glusterfs
1079
* thread. bdb xlator creates the checkpoint thread after successfully opening
1080
* the db environment.
1081
* NOTE: bdb_checkpoint thread shares the DB_ENV handle with the filesystem
1084
* db environment checkpointing frequency is controlled by
1085
* 'option checkpoint-timeout <time-in-seconds>' in volfile.
1087
* NOTE: checkpointing thread is started only if 'option transaction on'
1088
* specified in volfile. checkpointing is not valid for non-transactional
1093
bdb_checkpoint (void *data)
1095
xlator_t *this = NULL;
1096
struct bdb_private *private = NULL;
1097
DB_ENV *dbenv = NULL;
1099
uint32_t active = 0;
1101
this = (xlator_t *) data;
1102
dbenv = BDB_ENV(this);
1103
private = this->private;
1105
for (;;sleep (private->checkpoint_interval)) {
1106
LOCK (&private->active_lock);
1107
active = private->active;
1108
UNLOCK (&private->active_lock);
1111
ret = dbenv->txn_checkpoint (dbenv, 1024, 0, 0);
1113
gf_log ("bdb-ll", GF_LOG_DEBUG,
1114
"_BDB_CHECKPOINT: %s"
1115
"(failed to checkpoint environment)",
1118
gf_log ("bdb-ll", GF_LOG_DEBUG,
1119
"_BDB_CHECKPOINT: successfully "
1123
ret = dbenv->txn_checkpoint (dbenv, 1024, 0, 0);
1125
gf_log ("bdb-ll", GF_LOG_ERROR,
1126
"_BDB_CHECKPOINT: %s"
1127
"(final checkpointing failed. might "
1128
"need to run recovery tool manually on "
1129
"next usage of this database "
1133
gf_log ("bdb-ll", GF_LOG_DEBUG,
1134
"_BDB_CHECKPOINT: final successfully "
1145
/* bdb_db_init - initialize bdb xlator
1147
* reads the options from @options dictionary and sets appropriate values in
1148
* @this->private. also initializes DB_ENV.
1150
* return: 0 on success or -1 on error
1151
* (with logging the error through gf_log()).
1154
bdb_db_init (xlator_t *this,
1157
/* create a db entry for root */
1159
bdb_private_t *private = NULL;
1160
bctx_table_t *table = NULL;
1162
char *checkpoint_interval_str = NULL;
1163
char *page_size_str = NULL;
1164
char *lru_limit_str = NULL;
1165
char *timeout_str = NULL;
1166
char *access_mode = NULL;
1167
char *endptr = NULL;
1168
char *errfile = NULL;
1169
char *directory = NULL;
1170
char *logdir = NULL;
1172
char *mode_str = NULL;
1175
struct stat stbuf = {0,};
1177
private = this->private;
1179
/* cache is always on */
1180
private->cache = ON;
1182
ret = dict_get_str (options, "access-mode", &access_mode);
1184
&& (!strcmp (access_mode, "btree"))) {
1185
gf_log (this->name, GF_LOG_DEBUG,
1186
"using BTREE access mode to access libdb "
1188
private->access_mode = DB_BTREE;
1190
gf_log (this->name, GF_LOG_DEBUG,
1191
"using HASH access mode to access libdb (Berkeley DB)");
1192
private->access_mode = DB_HASH;
1195
ret = dict_get_str (options, "mode", &mode);
1197
&& (!strcmp (mode, "cache"))) {
1198
gf_log (this->name, GF_LOG_DEBUG,
1199
"cache data mode selected for 'storage/bdb'. filesystem"
1200
" operations are not transactionally protected and "
1201
"system crash does not guarantee recoverability of "
1203
private->envflags = DB_CREATE | DB_INIT_LOG |
1204
DB_INIT_MPOOL | DB_THREAD;
1205
private->dbflags = DB_CREATE | DB_THREAD;
1206
private->transaction = OFF;
1208
gf_log (this->name, GF_LOG_DEBUG,
1209
"persistent data mode selected for 'storage/bdb'. each"
1210
"filesystem operation is guaranteed to be Berkeley DB "
1211
"transaction protected.");
1212
private->transaction = ON;
1213
private->envflags = DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG |
1214
DB_INIT_MPOOL | DB_INIT_TXN | DB_RECOVER | DB_THREAD;
1215
private->dbflags = DB_CREATE | DB_THREAD;
1218
ret = dict_get_str (options, "lock-timeout", &timeout_str);
1221
ret = gf_string2time (timeout_str,
1222
&private->lock_timeout);
1224
if (private->lock_timeout > 4260000) {
1225
/* db allows us to DB_SET_LOCK_TIMEOUT to be
1226
* set to a maximum of 71 mins
1227
* (4260000 milliseconds) */
1228
gf_log (this->name, GF_LOG_DEBUG,
1229
"Berkeley DB lock-timeout parameter "
1230
"(%d) is out of range. please specify"
1231
" a valid timeout value for "
1232
"lock-timeout and retry.",
1233
private->lock_timeout);
1237
ret = dict_get_str (options, "transaction-timeout",
1240
ret = gf_string2time (timeout_str,
1241
&private->txn_timeout);
1243
if (private->txn_timeout > 4260000) {
1244
/* db allows us to DB_SET_TXN_TIMEOUT to be set
1245
* to a maximum of 71 mins
1246
* (4260000 milliseconds) */
1247
gf_log (this->name, GF_LOG_DEBUG,
1248
"Berkeley DB lock-timeout parameter "
1249
"(%d) is out of range. please specify"
1250
" a valid timeout value for "
1251
"lock-timeout and retry.",
1252
private->lock_timeout);
1257
private->checkpoint_interval = BDB_DEFAULT_CHECKPOINT_INTERVAL;
1258
ret = dict_get_str (options, "checkpoint-interval",
1259
&checkpoint_interval_str);
1261
ret = gf_string2time (checkpoint_interval_str,
1262
&private->checkpoint_interval);
1265
gf_log (this->name, GF_LOG_DEBUG,
1266
"'%"PRIu32"' is not a valid parameter "
1267
"for checkpoint-interval option. "
1268
"please specify a valid "
1269
"checkpoint-interval and retry",
1270
private->checkpoint_interval);
1276
ret = dict_get_str (options, "file-mode", &mode_str);
1278
private->file_mode = strtol (mode_str, &endptr, 8);
1281
(!IS_VALID_FILE_MODE(private->file_mode))) {
1282
gf_log (this->name, GF_LOG_DEBUG,
1283
"'%o' is not a valid parameter for file-mode "
1284
"option. please specify a valid parameter for "
1285
"file-mode and retry.",
1286
private->file_mode);
1290
private->file_mode = DEFAULT_FILE_MODE;
1292
private->symlink_mode = private->file_mode | S_IFLNK;
1293
private->file_mode = private->file_mode | S_IFREG;
1295
ret = dict_get_str (options, "dir-mode", &mode_str);
1297
private->dir_mode = strtol (mode_str, &endptr, 8);
1299
(!IS_VALID_FILE_MODE(private->dir_mode))) {
1300
gf_log (this->name, GF_LOG_DEBUG,
1301
"'%o' is not a valid parameter for dir-mode "
1302
"option. please specify a valid parameter for "
1303
"dir-mode and retry.",
1308
private->dir_mode = DEFAULT_DIR_MODE;
1311
private->dir_mode = private->dir_mode | S_IFDIR;
1313
table = CALLOC (1, sizeof (*table));
1314
if (table == NULL) {
1315
gf_log ("bdb-ll", GF_LOG_CRITICAL,
1316
"memory allocation for 'storage/bdb' internal "
1317
"context table failed.");
1321
INIT_LIST_HEAD(&(table->b_lru));
1322
INIT_LIST_HEAD(&(table->active));
1323
INIT_LIST_HEAD(&(table->purge));
1325
LOCK_INIT (&table->lock);
1326
LOCK_INIT (&table->checkpoint_lock);
1328
table->transaction = private->transaction;
1329
table->access_mode = private->access_mode;
1330
table->dbflags = private->dbflags;
1333
ret = dict_get_str (options, "lru-limit",
1336
/* TODO: set max lockers and max txns to accomodate
1337
* for more than lru_limit */
1339
ret = gf_string2uint32 (lru_limit_str,
1341
gf_log ("bdb-ll", GF_LOG_DEBUG,
1342
"setting lru limit of 'storage/bdb' internal context"
1343
"table to %d. maximum of %d unused databases can be "
1344
"open at any given point of time.",
1345
table->lru_limit, table->lru_limit);
1347
table->lru_limit = BDB_DEFAULT_LRU_LIMIT;
1350
ret = dict_get_str (options, "page-size",
1354
ret = gf_string2bytesize (page_size_str,
1357
gf_log ("bdb-ll", GF_LOG_ERROR,
1358
"\"%s\" is an invalid parameter to "
1359
"\"option page-size\". please specify a valid "
1365
if (!PAGE_SIZE_IN_RANGE(table->page_size)) {
1366
gf_log ("bdb-ll", GF_LOG_ERROR,
1367
"\"%s\" is out of range for Berkeley DB "
1368
"page-size. allowed page-size range is %d to "
1369
"%d. please specify a page-size value in the "
1371
page_size_str, BDB_LL_PAGE_SIZE_MIN,
1372
BDB_LL_PAGE_SIZE_MAX);
1376
table->page_size = BDB_LL_PAGE_SIZE_DEFAULT;
1379
table->hash_size = BDB_DEFAULT_HASH_SIZE;
1380
table->b_hash = CALLOC (BDB_DEFAULT_HASH_SIZE,
1381
sizeof (struct list_head));
1383
for (idx = 0; idx < table->hash_size; idx++)
1384
INIT_LIST_HEAD(&(table->b_hash[idx]));
1386
private->b_table = table;
1388
ret = dict_get_str (options, "errfile", &errfile);
1390
private->errfile = strdup (errfile);
1391
gf_log (this->name, GF_LOG_DEBUG,
1392
"using %s as error logging file for libdb (Berkeley DB "
1393
"library) internal logging.", private->errfile);
1396
ret = dict_get_str (options, "directory", &directory);
1399
ret = dict_get_str (options, "logdir", &logdir);
1402
gf_log ("bdb-ll", GF_LOG_DEBUG,
1403
"using the database environment home "
1404
"directory (%s) itself as transaction log "
1405
"directory", directory);
1406
private->logdir = strdup (directory);
1409
private->logdir = strdup (logdir);
1411
op_ret = stat (private->logdir, &stbuf);
1413
|| (!S_ISDIR (stbuf.st_mode))) {
1414
gf_log ("bdb-ll", GF_LOG_ERROR,
1415
"specified logdir %s does not exist. "
1416
"please provide a valid existing "
1417
"directory as parameter to 'option "
1424
private->b_table->dbenv = bdb_dbenv_init (this, directory);
1425
if (private->b_table->dbenv == NULL) {
1426
gf_log ("bdb-ll", GF_LOG_ERROR,
1427
"initialization of database environment "
1431
if (private->transaction) {
1432
/* all well, start the checkpointing thread */
1433
LOCK_INIT (&private->active_lock);
1435
LOCK (&private->active_lock);
1437
private->active = 1;
1439
UNLOCK (&private->active_lock);
1440
pthread_create (&private->checkpoint_thread,
1441
NULL, bdb_checkpoint, this);
1449
FREE (table->b_hash);
1453
if (private->errfile)
1454
FREE (private->errfile);
1456
if (private->logdir)
1457
FREE (private->logdir);