2
* Copyright 2010-2011 Red Hat, Inc.
4
* This copyrighted material is made available to anyone wishing to use,
5
* modify, copy, or redistribute it subject to the terms and conditions
6
* of the GNU General Public License v2 or (at your option) any later version.
22
#include <sys/types.h>
26
#include "sanlock_internal.h"
27
#include "sanlock_sock.h"
30
#include "delta_lease.h"
31
#include "lockspace.h"
37
/* Source of space_id values: add_lockspace assigns the current value to a
   new space and post-increments it.  Starts at 1. */
static uint32_t space_id_counter = 1;
39
/*
 * Look for a space on up to three lists, scanning head1, then head2, then
 * head3.  Each criterion is tested only when it is supplied: name against
 * sp->space_name (strncmp over NAME_ID_SIZE), disk against
 * sp->host_id_disk.path (strncmp over SANLK_PATH_LEN) and .offset, and
 * host_id against sp->host_id.  Callers in this file pass NULL for list
 * heads they do not want searched, and NULL/0 for unused criteria.
 * NOTE(review): the host_id parameter, the statement under each test and the
 * return statements are not visible here; presumably a failed test skips the
 * entry and the first entry passing every test is returned -- confirm.
 */
static struct space *_search_space(char *name,
40
struct sync_disk *disk,
42
struct list_head *head1,
43
struct list_head *head2,
44
struct list_head *head3)
49
/* first list */
list_for_each_entry(sp, head1, list) {
50
if (name && strncmp(sp->space_name, name, NAME_ID_SIZE))
52
if (disk && strncmp(sp->host_id_disk.path, disk->path, SANLK_PATH_LEN))
54
if (disk && sp->host_id_disk.offset != disk->offset)
56
if (host_id && sp->host_id != host_id)
62
/* second list, same tests as the first */
list_for_each_entry(sp, head2, list) {
63
if (name && strncmp(sp->space_name, name, NAME_ID_SIZE))
65
if (disk && strncmp(sp->host_id_disk.path, disk->path, SANLK_PATH_LEN))
67
if (disk && sp->host_id_disk.offset != disk->offset)
69
if (host_id && sp->host_id != host_id)
75
/* third list, same tests again */
list_for_each_entry(sp, head3, list) {
76
if (name && strncmp(sp->space_name, name, NAME_ID_SIZE))
78
if (disk && strncmp(sp->host_id_disk.path, disk->path, SANLK_PATH_LEN))
80
if (disk && sp->host_id_disk.offset != disk->offset)
82
if (host_id && sp->host_id != host_id)
90
/* Look a lockspace up by name only (no disk or host_id criteria) across the
   spaces, spaces_rem and spaces_add lists, returning whatever _search_space
   returns.  No locking is visible in this function. */
struct space *find_lockspace(char *name)
92
return _search_space(name, NULL, 0, &spaces, &spaces_rem, &spaces_add);
95
/* Scan the spaces list for an entry whose space_name matches space_name
   (strncmp over NAME_ID_SIZE) and copy the whole struct space into sp_out.
   It takes no lock itself; lockspace_info and lockspace_disk call it with
   spaces_mutex held.
   NOTE(review): the return values are not visible here -- confirm. */
int _lockspace_info(char *space_name, struct space *sp_out)
99
list_for_each_entry(sp, &spaces, list) {
100
if (strncmp(sp->space_name, space_name, NAME_ID_SIZE))
102
/* sp_out receives a by-value snapshot of the entry */
memcpy(sp_out, sp, sizeof(struct space));
108
/* Locked wrapper: runs _lockspace_info(space_name, sp_out) with spaces_mutex
   held and keeps its result in rv. */
int lockspace_info(char *space_name, struct space *sp_out)
112
pthread_mutex_lock(&spaces_mutex);
113
rv = _lockspace_info(space_name, sp_out);
114
pthread_mutex_unlock(&spaces_mutex);
119
/* With spaces_mutex held, fetch a snapshot of the named space into the local
   `space` via _lockspace_info and copy its host_id_disk into *disk.
   NOTE(review): the condition guarding the copy and the return statement are
   not visible here -- confirm. */
int lockspace_disk(char *space_name, struct sync_disk *disk)
124
pthread_mutex_lock(&spaces_mutex);
125
rv = _lockspace_info(space_name, &space);
127
memcpy(disk, &space.host_id_disk, sizeof(struct sync_disk));
130
pthread_mutex_unlock(&spaces_mutex);
136
/*
 * Clear the bit belonging to host_id in bitmap.
 *
 * host_id is 1-based: host_id N lives in byte (N-1)/8 at bit (N-1)%8, the
 * same layout set_id_bit and test_id_bit use.  The bit index used to be
 * computed as host_id % 8, which disagreed with those two helpers and so
 * addressed the wrong bit.
 *
 * A host_id below 1 is ignored.  The caller must ensure bitmap is large
 * enough for host_id; no upper bound is checked here.
 */
static void clear_bit(int host_id, char *bitmap)
{
	unsigned char *byte;
	unsigned int bit;

	if (host_id < 1)
		return;

	byte = (unsigned char *)bitmap + ((host_id - 1) / 8);
	bit = (unsigned int)(host_id - 1) % 8;

	/* unsigned arithmetic keeps the mask well defined for bit 7 */
	*byte &= (unsigned char)~(1u << bit);
}
145
/*
 * Set the bit belonging to host_id in bitmap.
 *
 * host_id is 1-based: host_id N lives in byte (N-1)/8 at bit (N-1)%8.
 * When c is not NULL it receives the value of the affected byte after the
 * bit has been set (create_bitmap logs it).
 *
 * A host_id below 1 is ignored and *c is left untouched.  The caller must
 * ensure bitmap is large enough for host_id; no upper bound is checked here.
 */
static void set_id_bit(int host_id, char *bitmap, char *c)
{
	unsigned char *byte;
	unsigned int bit;

	if (host_id < 1)
		return;

	byte = (unsigned char *)bitmap + ((host_id - 1) / 8);
	bit = (unsigned int)(host_id - 1) % 8;

	/* unsigned arithmetic keeps the mask well defined for bit 7 */
	*byte |= (unsigned char)(1u << bit);

	if (c)
		*c = (char)*byte;
}
158
/* FIXME: another copy in direct_lib.c */
160
/*
 * Report whether the bit belonging to host_id is set in bitmap.
 *
 * host_id is 1-based: host_id N lives in byte (N-1)/8 at bit (N-1)%8.
 * Returns non-zero when the bit is set and 0 when it is clear; callers
 * should only test the result for truth.  A host_id below 1 yields 0.
 * The caller must ensure bitmap is large enough for host_id.
 */
int test_id_bit(int host_id, char *bitmap)
{
	const unsigned char *byte;
	unsigned int bit;
	unsigned char mask;

	if (host_id < 1)
		return 0;

	byte = (const unsigned char *)bitmap + ((host_id - 1) / 8);
	bit = (unsigned int)(host_id - 1) % 8;
	mask = (unsigned char)(1u << bit);

	return (*byte & mask);
}
171
/*
 * Note the time at which a bit was requested for host_id in the named
 * lockspace.  host_id is checked against 1..DEFAULT_MAX_HOSTS, the space is
 * looked up by name on the spaces list under spaces_mutex, and then, under
 * sp->mutex, the current monotime() is stored in
 * host_status[host_id-1].set_bit_time (the field create_bitmap reads when it
 * builds the bitmap).
 * NOTE(review): the returns for a bad host_id or an unknown space are not
 * visible here -- confirm before relying on specific codes.
 */
int host_status_set_bit(char *space_name, uint64_t host_id)
176
if (!host_id || host_id > DEFAULT_MAX_HOSTS)
179
pthread_mutex_lock(&spaces_mutex);
180
list_for_each_entry(sp, &spaces, list) {
181
if (strncmp(sp->space_name, space_name, NAME_ID_SIZE))
186
pthread_mutex_unlock(&spaces_mutex);
191
/* spaces_mutex has been dropped above; only sp->mutex covers the update */
pthread_mutex_lock(&sp->mutex);
192
sp->host_status[host_id-1].set_bit_time = monotime();
193
pthread_mutex_unlock(&sp->mutex);
197
/*
 * Copy the host_status entry for host_id in the named lockspace into hs_out.
 * host_id is checked against 1..DEFAULT_MAX_HOSTS; the lookup on the spaces
 * list and the copy both happen between the spaces_mutex lock and unlock
 * shown below (no sp->mutex use is visible here).
 * NOTE(review): return values are not visible here -- confirm.
 */
int host_info(char *space_name, uint64_t host_id, struct host_status *hs_out)
202
if (!host_id || host_id > DEFAULT_MAX_HOSTS)
205
pthread_mutex_lock(&spaces_mutex);
206
list_for_each_entry(sp, &spaces, list) {
207
if (strncmp(sp->space_name, space_name, NAME_ID_SIZE))
209
/* host ids are 1-based, the array is 0-based */
memcpy(hs_out, &sp->host_status[host_id-1], sizeof(struct host_status));
213
pthread_mutex_unlock(&spaces_mutex);
220
/*
 * Fill bitmap from the per-host set_bit_time values of sp, under sp->mutex.
 * Every slot 0..DEFAULT_MAX_HOSTS-1 is examined (slot i is host_id i+1):
 * the slot of our own host_id and slots with a zero set_bit_time are tested
 * first; a set_bit_time older than task->request_finish_seconds is logged
 * as "bitmap clear" and reset to 0; set_id_bit() sets the host's bit and the
 * resulting byte is logged.
 * NOTE(review): the statements under the tests (presumably `continue`) and
 * the declarations of now, i and c are not visible here -- confirm the exact
 * control flow.
 */
static void create_bitmap(struct task *task, struct space *sp, char *bitmap)
228
pthread_mutex_lock(&sp->mutex);
229
for (i = 0; i < DEFAULT_MAX_HOSTS; i++) {
230
if (i+1 == sp->host_id)
233
if (!sp->host_status[i].set_bit_time)
236
if (now - sp->host_status[i].set_bit_time > task->request_finish_seconds) {
237
log_space(sp, "bitmap clear host_id %d", i+1);
238
sp->host_status[i].set_bit_time = 0;
240
/* c receives the bitmap byte that now holds this host's bit */
set_id_bit(i+1, bitmap, &c);
241
log_space(sp, "bitmap set host_id %d byte %x", i+1, c);
244
pthread_mutex_unlock(&sp->mutex);
247
/*
 * Walk the leader records of all hosts in buf and refresh sp->host_status.
 * For each slot i (host_id i+1): last_check is stamped with now, the cached
 * owner_id/owner_generation/timestamp are compared with the record and then
 * overwritten from it, and -- for slots other than our own -- the record's
 * bitmap is tested for our host_id bit.  A set bit is treated as a request
 * from that host: it is rate-limited via hs->last_req and
 * task->request_finish_seconds, logged, and set_resource_examine() is called
 * for this lockspace.
 * NOTE(review): the statements under the tests (presumably `continue`), the
 * source of `now` and any update of last_req/last_live are not visible here
 * -- confirm the exact control flow.
 */
void check_other_leases(struct task *task, struct space *sp, char *buf)
249
struct leader_record *leader;
250
struct sync_disk *disk;
251
struct host_status *hs;
256
disk = &sp->host_id_disk;
261
for (i = 0; i < DEFAULT_MAX_HOSTS; i++) {
262
hs = &sp->host_status[i];
263
hs->last_check = now;
265
/* slot i's leader record sits i sectors into buf */
leader = (struct leader_record *)(buf + (i * disk->sector_size));
267
if (hs->owner_id == leader->owner_id &&
268
hs->owner_generation == leader->owner_generation &&
269
hs->timestamp == leader->timestamp) {
273
hs->owner_id = leader->owner_id;
274
hs->owner_generation = leader->owner_generation;
275
hs->timestamp = leader->timestamp;
278
if (i+1 == sp->host_id)
281
/* the bitmap starts HOSTID_BITMAP_OFFSET bytes into the leader record */
bitmap = (char *)leader + HOSTID_BITMAP_OFFSET;
283
if (!test_id_bit(sp->host_id, bitmap))
286
/* this host has made a request for us, we won't take a new
287
request from this host for another request_finish_seconds */
289
if (now - hs->last_req < task->request_finish_seconds)
292
log_space(sp, "request from host_id %d", i+1);
298
set_resource_examine(sp->space_name, NULL);
302
* check if our_host_id_thread has renewed within timeout
305
/*
 * Judge the state of our own lease renewals for sp.
 * Under sp->mutex it snapshots renewal_last_success and corrupt_result and,
 * when renewal_read_count has advanced past renewal_read_check, records the
 * new count and copies renewal_read_buf (align_size bytes) into check_buf.
 * After unlocking it logs an error when corrupt_result is set, computes
 * gap = monotime() - last_success, and logs a failure when gap reaches
 * task->id_renewal_fail_seconds, a warning when it reaches
 * task->id_renewal_warn_seconds, and a debug line when com.debug_renew > 1.
 * NOTE(review): the return values and the assignment to *check_all are not
 * visible here -- confirm.
 */
int check_our_lease(struct task *task, struct space *sp, int *check_all, char *check_buf)
307
uint64_t last_success;
311
pthread_mutex_lock(&sp->mutex);
312
last_success = sp->lease_status.renewal_last_success;
313
corrupt_result = sp->lease_status.corrupt_result;
315
if (sp->lease_status.renewal_read_count > sp->lease_status.renewal_read_check) {
316
/* main loop will pass this buf to check_other_leases next */
317
sp->lease_status.renewal_read_check = sp->lease_status.renewal_read_count;
320
memcpy(check_buf, sp->lease_status.renewal_read_buf, sp->align_size);
322
pthread_mutex_unlock(&sp->mutex);
324
if (corrupt_result) {
325
log_erros(sp, "check_our_lease corrupt %d", corrupt_result);
329
/* seconds since the last successful renewal */
gap = monotime() - last_success;
331
if (gap >= task->id_renewal_fail_seconds) {
332
log_erros(sp, "check_our_lease failed %d", gap);
336
if (gap >= task->id_renewal_warn_seconds) {
337
log_erros(sp, "check_our_lease warning %d last_success %llu",
338
gap, (unsigned long long)last_success);
341
if (com.debug_renew > 1) {
342
log_space(sp, "check_our_lease good %d %llu",
343
gap, (unsigned long long)last_success);
349
/* If a renewal result is one of the listed errors, it means our
350
delta lease has been corrupted/overwritten/reinitialized out from
351
under us, and we should stop using it immediately. There's no
352
point in retrying the renewal. */
354
/* Classify a renewal result.  The cases below are the result codes singled
   out by this function; lockspace_thread stores the return value in
   sp->lease_status.corrupt_result after a failed renewal.
   NOTE(review): the switch header and the return statements are not visible
   here -- confirm what is returned for listed and unlisted codes. */
static int corrupt_result(int result)
357
case SANLK_RENEW_OWNER:
358
case SANLK_RENEW_DIFF:
359
case SANLK_LEADER_MAGIC:
360
case SANLK_LEADER_VERSION:
361
case SANLK_LEADER_SECTORSIZE:
362
case SANLK_LEADER_LOCKSPACE:
363
case SANLK_LEADER_CHECKSUM:
370
/*
 * Thread body for one lockspace; arg_in is the struct space.
 * Statement order visible below: set up a private task (timeouts taken from
 * main_task, aio), open the host_id disk, compute align_size and allocate
 * renewal_read_buf, acquire the delta lease, create the watchdog file,
 * publish the acquire result in sp->lease_status under sp->mutex, then renew
 * the lease (building the bitmap first, publishing each result and updating
 * the watchdog file), and finally close the watchdog file, release the delta
 * lease if the last result was SANLK_OK, close the disk fd and tear down aio.
 * NOTE(review): the loop construct, gotos/labels, sleeps and the return are
 * not visible here; the summary follows statement order only -- confirm.
 */
static void *lockspace_thread(void *arg_in)
372
char bitmap[HOSTID_BITMAP_SIZE];
375
struct leader_record leader;
376
uint64_t delta_begin, last_success;
377
int rv, delta_length, renewal_interval;
378
int acquire_result, delta_result, read_result;
382
sp = (struct space *)arg_in;
384
memset(&task, 0, sizeof(struct task));
385
setup_task_timeouts(&task, main_task.io_timeout_seconds);
386
setup_task_aio(&task, main_task.use_aio, HOSTID_AIO_CB_SIZE);
387
memcpy(task.name, sp->space_name, NAME_ID_SIZE);
389
delta_begin = monotime();
391
rv = open_disk(&sp->host_id_disk);
393
log_erros(sp, "open_disk %s error %d", sp->host_id_disk.path, rv);
394
acquire_result = -ENODEV;
399
sp->align_size = direct_align(&sp->host_id_disk);
400
if (sp->align_size < 0) {
401
log_erros(sp, "direct_align error");
402
acquire_result = sp->align_size;
406
/* buffer later filled from task.iobuf and read by check_our_lease */
sp->lease_status.renewal_read_buf = malloc(sp->align_size);
407
if (!sp->lease_status.renewal_read_buf) {
408
acquire_result = -ENOMEM;
413
* acquire the delta lease
416
delta_begin = monotime();
418
delta_result = delta_lease_acquire(&task, sp, &sp->host_id_disk,
419
sp->space_name, our_host_name_global,
420
sp->host_id, &leader);
421
delta_length = monotime() - delta_begin;
423
if (delta_result == SANLK_OK)
424
last_success = leader.timestamp;
426
acquire_result = delta_result;
428
/* we need to start the watchdog after we acquire the host_id but
429
before we allow any pid's to begin running */
431
if (delta_result == SANLK_OK) {
432
rv = create_watchdog_file(sp, last_success);
434
log_erros(sp, "create_watchdog failed %d", rv);
435
acquire_result = SANLK_ERROR;
440
/* publish the acquire outcome; add_lockspace reads acquire_last_result */
pthread_mutex_lock(&sp->mutex);
441
sp->lease_status.acquire_last_result = acquire_result;
442
sp->lease_status.acquire_last_attempt = delta_begin;
443
if (delta_result == SANLK_OK)
444
sp->lease_status.acquire_last_success = last_success;
445
sp->lease_status.renewal_last_result = acquire_result;
446
sp->lease_status.renewal_last_attempt = delta_begin;
447
if (delta_result == SANLK_OK)
448
sp->lease_status.renewal_last_success = last_success;
449
pthread_mutex_unlock(&sp->mutex);
451
if (acquire_result < 0)
454
sp->host_generation = leader.owner_generation;
457
/* thread_stop is sampled under sp->mutex */
pthread_mutex_lock(&sp->mutex);
458
stop = sp->thread_stop;
459
pthread_mutex_unlock(&sp->mutex);
465
* wait between each renewal
468
if (monotime() - last_success < task.id_renewal_seconds) {
472
/* don't spin too quickly if renew is failing
473
immediately and repeatedly */
479
* do a renewal, measuring length of time spent in renewal,
480
* and the length of time between successful renewals
483
memset(bitmap, 0, sizeof(bitmap));
484
create_bitmap(&task, sp, bitmap);
486
delta_begin = monotime();
488
/* the previous delta_result is passed in as an argument */
delta_result = delta_lease_renew(&task, sp, &sp->host_id_disk,
489
sp->space_name, bitmap,
490
delta_result, &read_result,
492
delta_length = monotime() - delta_begin;
494
if (delta_result == SANLK_OK) {
495
renewal_interval = leader.timestamp - last_success;
496
last_success = leader.timestamp;
501
* publish the results
504
pthread_mutex_lock(&sp->mutex);
505
sp->lease_status.renewal_last_result = delta_result;
506
sp->lease_status.renewal_last_attempt = delta_begin;
508
if (delta_result == SANLK_OK)
509
sp->lease_status.renewal_last_success = last_success;
511
/* corrupt_result is only written while it is still zero */
if (delta_result != SANLK_OK && !sp->lease_status.corrupt_result)
512
sp->lease_status.corrupt_result = corrupt_result(delta_result);
514
if (read_result == SANLK_OK && task.iobuf) {
515
memcpy(sp->lease_status.renewal_read_buf, task.iobuf, sp->align_size);
516
sp->lease_status.renewal_read_count++;
522
* (don't update on thread_stop because it's probably unlinked)
525
if (delta_result == SANLK_OK && !sp->thread_stop)
526
update_watchdog_file(sp, last_success);
528
pthread_mutex_unlock(&sp->mutex);
535
if (delta_result != SANLK_OK) {
536
log_erros(sp, "renewal error %d delta_length %d last_success %llu",
537
delta_result, delta_length, (unsigned long long)last_success);
538
} else if (delta_length > task.id_renewal_seconds) {
539
log_erros(sp, "renewed %llu delta_length %d too long",
540
(unsigned long long)last_success, delta_length);
541
} else if (com.debug_renew) {
542
log_space(sp, "renewed %llu delta_length %d interval %d",
543
(unsigned long long)last_success, delta_length, renewal_interval);
547
/* watchdog unlink was done in main_loop when thread_stop was set, to
548
get it done as quickly as possible in case the wd is about to fire. */
550
close_watchdog_file(sp);
552
/* &leader is passed twice to delta_lease_release */
if (delta_result == SANLK_OK)
553
delta_lease_release(&task, sp, &sp->host_id_disk,
554
sp->space_name, &leader, &leader);
557
close(sp->host_id_disk.fd);
559
close_task_aio(&task);
563
/* Release memory owned by sp: renewal_read_buf is freed when it is set.
   NOTE(review): any further frees (e.g. of sp itself) are not visible here. */
static void free_sp(struct space *sp)
565
if (sp->lease_status.renewal_read_buf)
566
free(sp->lease_status.renewal_read_buf);
571
* When this function returns, it needs to be safe to begin processing lease
572
* requests and allowing pid's to run, so we need to own our host_id, and the
573
* watchdog needs to be active watching our host_id renewals.
576
/*
 * Add the lockspace described by ls and start its lockspace_thread.
 * Statement order visible below: reject an empty name, zero host_id or empty
 * disk path; allocate and zero a struct space and fill it from ls (fd -1,
 * sector_size 0); under spaces_mutex search for an identical lockspace on
 * spaces, then spaces_add, then spaces_rem, then for one with the same name
 * and one with the same host_id_disk on any list; assign a space_id and put
 * sp on spaces_add; log the identity; create the thread; read
 * acquire_last_result under sp->mutex; join the thread when the result is
 * not SANLK_OK; otherwise move sp to the spaces list unless external_remove
 * or external_shutdown is set.
 * NOTE(review): the tests on sp2, the error codes, gotos/labels, the wait
 * for the acquire result and the cleanup path are not visible here --
 * confirm before relying on any of them.
 */
int add_lockspace(struct sanlk_lockspace *ls)
578
struct space *sp, *sp2;
581
if (!ls->name[0] || !ls->host_id || !ls->host_id_disk.path[0]) {
582
log_error("add_lockspace bad args id %llu name %zu path %zu",
583
(unsigned long long)ls->host_id,
584
strlen(ls->name), strlen(ls->host_id_disk.path));
588
sp = malloc(sizeof(struct space));
591
memset(sp, 0, sizeof(struct space));
593
memcpy(sp->space_name, ls->name, NAME_ID_SIZE);
594
/* copies sizeof(struct sanlk_disk) bytes into the struct sync_disk member */
memcpy(&sp->host_id_disk, &ls->host_id_disk, sizeof(struct sanlk_disk));
595
sp->host_id_disk.sector_size = 0;
596
sp->host_id_disk.fd = -1;
597
sp->host_id = ls->host_id;
598
pthread_mutex_init(&sp->mutex, NULL);
600
pthread_mutex_lock(&spaces_mutex);
602
/* search all lists for an identical lockspace */
604
sp2 = _search_space(sp->space_name, &sp->host_id_disk, sp->host_id,
605
&spaces, NULL, NULL);
607
pthread_mutex_unlock(&spaces_mutex);
612
sp2 = _search_space(sp->space_name, &sp->host_id_disk, sp->host_id,
613
&spaces_add, NULL, NULL);
615
pthread_mutex_unlock(&spaces_mutex);
620
sp2 = _search_space(sp->space_name, &sp->host_id_disk, sp->host_id,
621
&spaces_rem, NULL, NULL);
623
pthread_mutex_unlock(&spaces_mutex);
628
/* search all lists for a lockspace with the same name */
630
sp2 = _search_space(sp->space_name, NULL, 0,
631
&spaces, &spaces_add, &spaces_rem);
633
pthread_mutex_unlock(&spaces_mutex);
638
/* search all lists for a lockspace with the same host_id_disk */
640
sp2 = _search_space(NULL, &sp->host_id_disk, 0,
641
&spaces, &spaces_add, &spaces_rem);
643
pthread_mutex_unlock(&spaces_mutex);
648
/* new spaces start out on spaces_add */
sp->space_id = space_id_counter++;
649
list_add(&sp->list, &spaces_add);
650
pthread_mutex_unlock(&spaces_mutex);
652
/* save a record of what this space_id is for later debugging */
653
log_level(sp->space_id, 0, NULL, LOG_WARNING,
654
"lockspace %.48s:%llu:%.256s:%llu",
656
(unsigned long long)sp->host_id,
657
sp->host_id_disk.path,
658
(unsigned long long)sp->host_id_disk.offset);
660
rv = pthread_create(&sp->thread, NULL, lockspace_thread, sp);
662
log_erros(sp, "add_lockspace create thread failed");
667
/* acquire_last_result is written by lockspace_thread under sp->mutex */
pthread_mutex_lock(&sp->mutex);
668
result = sp->lease_status.acquire_last_result;
669
pthread_mutex_unlock(&sp->mutex);
675
if (result != SANLK_OK) {
676
/* the thread exits right away if acquire fails */
677
pthread_join(sp->thread, NULL);
682
/* once we move sp to spaces list, tokens can begin using it,
683
and the main loop will begin monitoring its renewals */
685
pthread_mutex_lock(&spaces_mutex);
686
if (sp->external_remove || external_shutdown) {
688
pthread_mutex_unlock(&spaces_mutex);
691
list_move(&sp->list, &spaces);
692
pthread_mutex_unlock(&spaces_mutex);
696
pthread_mutex_lock(&spaces_mutex);
698
pthread_mutex_unlock(&spaces_mutex);
704
/*
 * Inquire about the lockspace described by ls.  Under spaces_mutex it first
 * searches the spaces list for an entry matching name, host_id_disk and
 * host_id, and then searches spaces_add and spaces_rem with the same
 * criteria.  ls->host_id_disk is cast to struct sync_disk * for the search.
 * NOTE(review): the tests on sp and the return codes for each case are not
 * visible here -- confirm.
 */
int inq_lockspace(struct sanlk_lockspace *ls)
709
pthread_mutex_lock(&spaces_mutex);
711
sp = _search_space(ls->name, (struct sync_disk *)&ls->host_id_disk, ls->host_id,
712
&spaces, NULL, NULL);
721
sp = _search_space(ls->name, (struct sync_disk *)&ls->host_id_disk, ls->host_id,
722
&spaces_add, &spaces_rem, NULL);
728
pthread_mutex_unlock(&spaces_mutex);
732
/*
 * Request removal of the lockspace described by ls.  Under spaces_mutex it
 * searches, in order, spaces_rem, spaces_add and spaces for an entry
 * matching name, host_id_disk and host_id; external_remove is set on the
 * entry in the spaces_add and spaces cases.  Later it re-takes spaces_mutex
 * and searches spaces and spaces_rem again, comparing the space_id of what
 * it finds with the saved `id`.
 * NOTE(review): the tests on sp, the return codes, where `id` is saved and
 * the loop/sleep around the final search are not visible here -- confirm.
 */
int rem_lockspace(struct sanlk_lockspace *ls)
734
struct space *sp, *sp2;
738
pthread_mutex_lock(&spaces_mutex);
740
sp = _search_space(ls->name, (struct sync_disk *)&ls->host_id_disk, ls->host_id,
741
&spaces_rem, NULL, NULL);
743
pthread_mutex_unlock(&spaces_mutex);
748
sp = _search_space(ls->name, (struct sync_disk *)&ls->host_id_disk, ls->host_id,
749
&spaces_add, NULL, NULL);
751
/* add_lockspace checks this flag before moving sp to the spaces list */
sp->external_remove = 1;
752
pthread_mutex_unlock(&spaces_mutex);
757
sp = _search_space(ls->name, (struct sync_disk *)&ls->host_id_disk, ls->host_id,
758
&spaces, NULL, NULL);
760
pthread_mutex_unlock(&spaces_mutex);
765
sp->external_remove = 1;
767
pthread_mutex_unlock(&spaces_mutex);
770
pthread_mutex_lock(&spaces_mutex);
771
sp2 = _search_space(ls->name, (struct sync_disk *)&ls->host_id_disk, ls->host_id,
772
&spaces, &spaces_rem, NULL);
773
/* space_id distinguishes this instance from a later one with equal identity */
if (sp2 && sp2->space_id == id)
777
pthread_mutex_unlock(&spaces_mutex);
789
* we call stop_host_id() when all pids are gone and we're in a safe state, so
790
* it's safe to unlink the watchdog right away here. We want to do the unlink
791
* as soon as it's safe, so we can reduce the chance we get killed by the
792
* watchdog (we could actually call this in main_loop just before the break).
793
* Getting this unlink done quickly is more important than doing at the more
794
* "logical" point commented above in host_id_thread.
797
/*
 * Join the lockspace thread of sp.  thread_stop is sampled under sp->mutex;
 * a zero value is logged as an error ("should never happen").  The thread is
 * then joined with pthread_join or with the non-blocking pthread_tryjoin_np.
 * NOTE(review): presumably `wait` selects between the two joins, and other
 * statements between the lock and unlock are not visible here -- confirm,
 * along with the return value.
 */
static int stop_lockspace_thread(struct space *sp, int wait)
801
pthread_mutex_lock(&sp->mutex);
802
stop = sp->thread_stop;
804
pthread_mutex_unlock(&sp->mutex);
807
/* should never happen */
808
log_erros(sp, "stop_lockspace_thread zero thread_stop");
813
rv = pthread_join(sp->thread, NULL);
815
rv = pthread_tryjoin_np(sp->thread, NULL);
820
/*
 * Walk the spaces_rem list under spaces_mutex and call
 * stop_lockspace_thread(sp, wait) on each entry, logging "free lockspace".
 * The _safe list iterator is used, which allows entries to be unlinked while
 * iterating.
 * NOTE(review): the test on rv and the unlink/free of sp are not visible
 * here -- confirm what happens when the thread has not yet exited.
 */
void free_lockspaces(int wait)
822
struct space *sp, *safe;
825
pthread_mutex_lock(&spaces_mutex);
826
list_for_each_entry_safe(sp, safe, &spaces_rem, list) {
827
rv = stop_lockspace_thread(sp, wait);
829
log_space(sp, "free lockspace");
834
pthread_mutex_unlock(&spaces_mutex);