2
* See the file LICENSE for redistribution information.
4
* Copyright (c) 2001-2002
5
* Sleepycat Software. All rights reserved.
10
#include <sys/types.h>
12
#include <netinet/in.h>
13
#include <sys/socket.h>
27
#include <dbinc/queue.h> /* !!!: for the LIST_XXX macros. */
29
#include "ex_repquote.h"
31
int machtab_add __P((machtab_t *, int, u_int32_t, int, int *));
32
ssize_t readn __P((int, void *, size_t));
35
* This file defines the communication infrastructure for the ex_repquote
38
* This application uses TCP/IP for its communication. In an N-site
39
* replication group, this means that there are N * N communication
40
* channels so that every site can communicate with every other site
41
* (this allows elections to be held when the master fails). We do
42
* not require that anyone know about all sites when the application
43
* starts up. In order to communicate, the application should know
44
* about someone, else it has no idea how to ever get in the game.
46
* Communication is handled via a number of different threads. These
47
* thread functions are implemented in rep_util.c. In this file, we
48
* define the data structures that maintain the state that describes
49
* the comm infrastructure, the functions that manipulate this state
50
* and the routines used to actually send and receive data over the
55
* The communication infrastructure is represented by a machine table,
56
* machtab_t, which is essentially a mutex-protected linked list of members
57
* of the group. The machtab also contains the parameters that are needed
58
* to call for an election. We hardwire values for these parameters in the
59
* init function, but these could be set via some configuration setup in a
60
* real application. We reserve the machine-id 1 to refer to ourselves and
61
* make the machine-id 0 be invalid.
64
#define MACHID_INVALID 0
68
LIST_HEAD(__machlist, __member) machlist;
70
pthread_mutex_t mtmutex;
71
u_int32_t timeout_time;
78
/* Data structure that describes each entry in the machtab. */
80
u_int32_t hostaddr; /* Host IP address. */
81
int port; /* Port number. */
82
int eid; /* Application-specific machine id. */
83
int fd; /* File descriptor for the socket. */
84
LIST_ENTRY(__member) links;
85
/* For linked list of all members we know of. */
88
static int quote_send_broadcast __P((machtab_t *,
89
const DBT *, const DBT *, u_int32_t));
90
static int quote_send_one __P((const DBT *, const DBT *, int, u_int32_t));
94
* Initialize the machine ID table.
95
* XXX Right now we treat the number of sites as the maximum
96
* number we've ever had on the list at one time. We probably
97
* want to make that smarter.
100
machtab_init(machtabp, pri, nsites)
101
machtab_t **machtabp;
107
if ((machtab = malloc(sizeof(machtab_t))) == NULL)
110
LIST_INIT(&machtab->machlist);
112
/* Reserve eid's 0 and 1. */
114
machtab->timeout_time = 2 * 1000000; /* 2 seconds. */
115
machtab->current = machtab->max = 0;
116
machtab->priority = pri;
117
machtab->nsites = nsites;
119
ret = pthread_mutex_init(&machtab->mtmutex, NULL);
128
* Add a file descriptor to the table of machines, returning
132
machtab_add(machtab, fd, hostaddr, port, idp)
139
member_t *m, *member;
141
if ((member = malloc(sizeof(member_t))) == NULL)
145
member->hostaddr = hostaddr;
148
if ((ret = pthread_mutex_lock(&machtab->mtmutex)) != 0)
151
for (m = LIST_FIRST(&machtab->machlist);
152
m != NULL; m = LIST_NEXT(m, links))
153
if (m->hostaddr == hostaddr && m->port == port)
157
member->eid = machtab->nextid++;
158
LIST_INSERT_HEAD(&machtab->machlist, member, links);
160
member->eid = m->eid;
162
ret = pthread_mutex_unlock(&machtab->mtmutex);
168
if (++machtab->current > machtab->max)
169
machtab->max = machtab->current;
179
* Return host and port information for a particular machine id.
182
machtab_getinfo(machtab, eid, hostp, portp)
191
if ((ret = pthread_mutex_lock(&machtab->mtmutex)) != 0)
194
for (member = LIST_FIRST(&machtab->machlist);
196
member = LIST_NEXT(member, links))
197
if (member->eid == eid) {
198
*hostp = member->hostaddr;
199
*portp = member->port;
203
if ((ret = pthread_mutex_unlock(&machtab->mtmutex)) != 0)
206
return (member != NULL ? 0 : EINVAL);
211
* Remove a mapping from the table of machines. Lock indicates
212
* whether we need to lock the machtab or not (0 indicates we do not
213
* need to lock; non-zero indicates that we do need to lock).
216
machtab_rem(machtab, eid, lock)
225
if (lock && (ret = pthread_mutex_lock(&machtab->mtmutex)) != 0)
228
for (found = 0, member = LIST_FIRST(&machtab->machlist);
230
member = LIST_NEXT(member, links))
231
if (member->eid == eid) {
233
LIST_REMOVE(member, links);
234
(void)close(member->fd);
240
if (LIST_FIRST(&machtab->machlist) == NULL)
244
ret = pthread_mutex_unlock(&machtab->mtmutex);
250
machtab_parm(machtab, nump, prip, timeoutp)
255
if (machtab->nsites == 0)
256
*nump = machtab->max;
258
*nump = machtab->nsites;
259
*prip = machtab->priority;
260
*timeoutp = machtab->timeout_time;
264
* listen_socket_init --
265
* Initialize a socket for listening on the specified port. Returns
266
* a file descriptor for the socket, ready for an accept() call
267
* in a thread that we're happy to let block.
270
listen_socket_init(progname, port)
271
const char *progname;
275
struct protoent *proto;
276
struct sockaddr_in si;
278
if ((proto = getprotobyname("tcp")) == NULL)
281
if ((s = socket(AF_INET, SOCK_STREAM, proto->p_proto)) < 0)
284
memset(&si, 0, sizeof(si));
285
si.sin_family = AF_INET;
286
si.sin_addr.s_addr = htonl(INADDR_ANY);
287
si.sin_port = htons(port);
289
if (bind(s, (struct sockaddr *)&si, sizeof(si)) != 0)
292
if (listen(s, 5) != 0)
297
err: fprintf(stderr, "%s: %s", progname, strerror(errno));
303
* listen_socket_accept --
304
* Accept a connection on a socket. This is essentially just a wrapper
308
listen_socket_accept(machtab, progname, s, eidp)
310
const char *progname;
313
struct sockaddr_in si;
315
int host, ns, port, ret;
317
COMPQUIET(progname, NULL);
319
wait: memset(&si, 0, sizeof(si));
321
ns = accept(s, (struct sockaddr *)&si, &si_len);
322
host = ntohl(si.sin_addr.s_addr);
323
port = ntohs(si.sin_port);
324
ret = machtab_add(machtab, ns, host, port, eidp);
338
* get_accepted_socket --
339
* Listen on the specified port, and return a file descriptor
340
* when we have accepted a connection on it.
343
get_accepted_socket(progname, port)
344
const char *progname;
347
struct protoent *proto;
348
struct sockaddr_in si;
352
if ((proto = getprotobyname("tcp")) == NULL)
355
if ((s = socket(AF_INET, SOCK_STREAM, proto->p_proto)) < 0)
358
memset(&si, 0, sizeof(si));
359
si.sin_family = AF_INET;
360
si.sin_addr.s_addr = htonl(INADDR_ANY);
361
si.sin_port = htons(port);
363
if (bind(s, (struct sockaddr *)&si, sizeof(si)) != 0)
366
if (listen(s, 5) != 0)
369
memset(&si, 0, sizeof(si));
371
ns = accept(s, (struct sockaddr *)&si, &si_len);
375
err: fprintf(stderr, "%s: %s", progname, strerror(errno));
381
* get_connected_socket --
382
* Connect to the specified port of the specified remote machine,
383
* and return a file descriptor when we have established a connection to it.
384
* Add this connection to the machtab. If we already have a connection
385
* open to this machine, then don't create another one, return the eid
386
* of the connection (in *eidp) and set *is_open to 1. Return 0.
389
get_connected_socket(machtab, progname, remotehost, port, is_open, eidp)
391
const char *progname, *remotehost;
392
int port, *is_open, *eidp;
396
struct protoent *proto;
397
struct sockaddr_in si;
402
if ((proto = getprotobyname("tcp")) == NULL)
405
if ((hp = gethostbyname(remotehost)) == NULL) {
406
fprintf(stderr, "%s: host not found: %s\n", progname,
411
if ((s = socket(AF_INET, SOCK_STREAM, proto->p_proto)) < 0)
413
memset(&si, 0, sizeof(si));
414
memcpy((char *)&si.sin_addr, hp->h_addr, hp->h_length);
415
addr = ntohl(si.sin_addr.s_addr);
416
ret = machtab_add(machtab, s, addr, port, eidp);
421
} else if (ret != 0) {
426
si.sin_family = AF_INET;
427
si.sin_port = htons(port);
428
if (connect(s, (struct sockaddr *)&si, sizeof(si)) < 0) {
429
fprintf(stderr, "%s: connection failed: %s",
430
progname, strerror(errno));
431
(void)machtab_rem(machtab, *eidp, 1);
439
* get_next_message --
440
* Read a single message from the specified file descriptor, and
441
* return it in the format used by rep functions (two DBTs and a type).
443
* This function is called in a loop by both clients and masters, and
444
* the resulting DBTs are manually dispatched to DB_ENV->rep_process_message().
447
get_next_message(fd, rec, control)
452
u_int32_t rsize, csize;
453
u_int8_t *recbuf, *controlbuf;
456
* The protocol we use on the wire is dead simple:
458
* 4 bytes - rec->size
459
* (# read above) - rec->data
460
* 4 bytes - control->size
461
* (# read above) - control->data
464
/* Read rec->size. */
465
nr = readn(fd, &rsize, 4);
469
/* Read the record itself. */
471
if (rec->size < rsize)
472
rec->data = realloc(rec->data, rsize);
474
nr = readn(fd, recbuf, rsize);
476
if (rec->data != NULL)
482
/* Read control->size. */
483
nr = readn(fd, &csize, 4);
487
/* Read the control struct itself. */
489
controlbuf = control->data;
490
if (control->size < csize)
491
controlbuf = realloc(controlbuf, csize);
492
nr = readn(fd, controlbuf, csize);
496
if (control->data != NULL)
500
control->data = controlbuf;
501
control->size = csize;
508
* Read a full n characters from a file descriptor, unless we get an error
524
if ( (nread = read(fd, ptr, nleft)) < 0) {
526
* Call read() again on interrupted system call;
527
* on other errors, bail.
533
} else if (nread == 0)
545
* The f_send function for DB_ENV->set_rep_transport.
548
quote_send(dbenv, control, rec, eid, flags)
550
const DBT *control, *rec;
554
int fd, n, ret, t_ret;
558
machtab = (machtab_t *)dbenv->app_private;
560
if (eid == DB_EID_BROADCAST) {
562
* Right now, we do not require successful transmission.
563
* I'd like to move this to requiring at least one successful
564
* transmission on PERMANENT requests.
566
n = quote_send_broadcast(machtab, rec, control, flags);
567
if (n < 0 /*|| (n == 0 && LF_ISSET(DB_REP_PERMANENT))*/)
568
return (DB_REP_UNAVAIL);
572
if ((ret = pthread_mutex_lock(&machtab->mtmutex)) != 0)
576
for (m = LIST_FIRST(&machtab->machlist); m != NULL;
577
m = LIST_NEXT(m, links)) {
585
dbenv->err(dbenv, DB_REP_UNAVAIL,
586
"quote_send: cannot find machine ID %d", eid);
587
return (DB_REP_UNAVAIL);
590
ret = quote_send_one(rec, control, fd, flags);
592
if ((t_ret = (pthread_mutex_unlock(&machtab->mtmutex))) != 0 &&
600
* quote_send_broadcast --
601
* Send a message to everybody.
602
* Returns the number of sites to which this message was successfully
603
* communicated. A -1 indicates a fatal error.
606
quote_send_broadcast(machtab, rec, control, flags)
608
const DBT *rec, *control;
614
if ((ret = pthread_mutex_lock(&machtab->mtmutex)) != 0)
618
for (m = LIST_FIRST(&machtab->machlist); m != NULL; m = next) {
619
next = LIST_NEXT(m, links);
620
if ((ret = quote_send_one(rec, control, m->fd, flags)) != 0) {
621
(void)machtab_rem(machtab, m->eid, 0);
626
if (pthread_mutex_unlock(&machtab->mtmutex) != 0)
634
* Send a message to a single machine, given that machine's file
638
* Note that the machtab mutex should be held through this call.
639
* It doubles as a synchronizer to make sure that two threads don't
640
* intersperse writes that are part of two single messages.
643
quote_send_one(rec, control, fd, flags)
644
const DBT *rec, *control;
650
ssize_t bytes_left, nw;
656
* The protocol is simply: write rec->size, write rec->data,
657
* write control->size, write control->data.
659
nw = write(fd, &rec->size, 4);
661
return (DB_REP_UNAVAIL);
664
nw = write(fd, rec->data, rec->size);
666
return (DB_REP_UNAVAIL);
667
if (nw != (ssize_t)rec->size) {
668
/* Try a couple of times to finish the write. */
669
wp = (u_int8_t *)rec->data + nw;
670
bytes_left = rec->size - nw;
671
for (retry = 0; bytes_left > 0 && retry < 3; retry++) {
672
nw = write(fd, wp, bytes_left);
674
return (DB_REP_UNAVAIL);
679
return (DB_REP_UNAVAIL);
683
nw = write(fd, &control->size, 4);
685
return (DB_REP_UNAVAIL);
686
if (control->size > 0) {
687
nw = write(fd, control->data, control->size);
688
if (nw != (ssize_t)control->size)
689
return (DB_REP_UNAVAIL);