~codership/galera/2.x

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
/*
 * Copyright (C) 2008-2013 Codership Oy <info@codership.com>
 *
 * $Id: gcs.h 3446 2014-01-13 01:33:30Z teemu $
 */

/*!
 * @file gcs.h Public GCS API
 */

#ifndef _gcs_h_
#define _gcs_h_

#ifdef __cplusplus
extern "C" {
#endif

#include <stdint.h>
#include <stdbool.h>
#include <unistd.h>
#include <stdio.h>

#include "gu_config.h"
#include "gcache.h"
#include "gu_errno.h"

/*! @brief Sequence number type (signed 64-bit integer). */
typedef int64_t gcs_seqno_t;

/*! @brief Illegal sequence number: the action was not serialized. */
static const gcs_seqno_t GCS_SEQNO_ILL = -1;

/*! @brief Empty state: no actions have been applied. */
static const gcs_seqno_t GCS_SEQNO_NIL = 0;

/*! @brief Start of the sequence. */
static const gcs_seqno_t GCS_SEQNO_FIRST = 1;

/*! @brief Length of the history UUID. */
#define GCS_UUID_LEN 16

/*! @brief Maximum supported size of an action (2GB - 1). */
#define GCS_MAX_ACT_SIZE 0x7FFFFFFF

/*! Connection handle type.
 *  struct gcs_conn is only forward-declared in this header, so the handle is
 *  opaque here and is always passed around by pointer. */
typedef struct gcs_conn gcs_conn_t;

/*! @brief Creates a GCS connection handle.
 *
 * @param conf           configuration object (gu_config_t*), may be NULL.
 * @param cache          pointer to the gcache object.
 * @param node_name      human-readable name of the node, may be NULL.
 * @param inc_addr       address at which the application accepts incoming
 *                       requests. Used for load balancing, may be NULL.
 * @param repl_proto_ver maximum replicator protocol version.
 * @param appl_proto_ver maximum application protocol version.
 * @return pointer to the GCS connection handle, NULL in case of failure.
 */
extern gcs_conn_t*
gcs_create (gu_config_t* conf,
            gcache_t*    cache,
            const char*  node_name,
            const char*  inc_addr,
            int          repl_proto_ver,
            int          appl_proto_ver);

/*! @brief Initializes group history values (optional).
 *
 * Provides group history persistence across process restarts, for the case
 * when these data were saved on persistent storage or the like. The supplied
 * values are only a hint for the group because they might be outdated: the
 * actual seqno and UUID are determined by quorum and returned in the
 * GCS_ACT_CONF action (see below).
 *
 * This function must be called before gcs_open() or after gcs_close().
 *
 * @param conn  connection handle.
 * @param seqno sequence number of the application state (last action
 *              applied). Should be negative for an undefined state.
 * @param uuid  UUID of the sequence (group ID).
 *              Should be all zeroes for an undefined state.
 *
 * @return 0 in case of success, -EBUSY if the connection is already opened,
 *         -EBADFD if the connection object is being destroyed.
 */
extern long
gcs_init (gcs_conn_t*   conn,
          gcs_seqno_t   seqno,
          const uint8_t uuid[GCS_UUID_LEN]);

/*! @brief Opens a connection to the group (joins a channel).
 *
 * @param conn    connection object.
 * @param channel name of the channel to join. It must uniquely identify the
 *                channel. If no channel with this name exists, it is created.
 *                Processes that joined the same channel receive the same
 *                actions.
 * @param url     URL-like string that selects the backend communication
 *                driver, in the form "TYPE://ADDRESS?options". For the gcomm
 *                backend it can be "gcomm://localhost:4567"; for the dummy
 *                backend the ADDRESS field is ignored.
 *                Currently supported backend types: "dummy", "vsbes", "gcomm".
 *
 * @return negative error code, 0 in case of success.
 */
extern long
gcs_open (gcs_conn_t* conn, const char* channel, const char* url);

/*! @brief Closes the connection to the group.
 *
 * @param conn connection handle.
 * @return negative error code, or 0 in case of success.
 */
extern long
gcs_close (gcs_conn_t* conn);

/*! @brief Frees the resources associated with a connection handle.
 *
 * @param conn connection handle.
 * @return negative error code, or 0 in case of success.
 */
extern long
gcs_destroy (gcs_conn_t* conn);

/*! @brief Deprecated. Waits until the group catches up.
 *
 * Checks whether any member of the group (including this one) has a long
 * slave queue. Should be called before gcs_repl(), gcs_send().
 *
 * @param conn connection handle.
 * @return negative error code, 1 if a wait is required, 0 otherwise.
 */
extern long
gcs_wait (gcs_conn_t* conn);

/*! @brief Action types.
 *
 * "Messages" and "actions" are conceptually different. Messages are
 * ELEMENTARY pieces of information atomically delivered by group
 * communication. They are typically limited in size to a single IP packet.
 * Events generated by the group communication layer must be delivered as a
 * single message.
 *
 * An "action" is a higher level concept introduced to overcome the message
 * size limitation. The application replicates information in actions of
 * ARBITRARY size, which are fragmented into as many messages as needed.
 * Consequently actions can be delivered only in a primary configuration,
 * when the total order of the underlying messages is established.
 * The best analogy for the action/message concept is word/letter.
 *
 * The purpose of the GCS library is to hide message handling from the
 * application, so the application deals only with "actions".
 * The application can only send actions of types GCS_ACT_TORDERED,
 * GCS_ACT_COMMIT_CUT and GCS_ACT_STATE_REQ.
 * Actions of type GCS_ACT_SYNC, GCS_ACT_CONF are generated by the library.
 */
typedef enum gcs_act_type
{
    /* ordered actions */
    GCS_ACT_TORDERED,   /*!< action representing state change, will be
                         *   assigned global seqno */
    GCS_ACT_COMMIT_CUT, /*!< group-wide action commit cut */
    GCS_ACT_STATE_REQ,  /*!< request for state transfer */
    GCS_ACT_CONF,       /*!< new configuration */
    GCS_ACT_JOIN,       /*!< joined group (received all state data) */
    GCS_ACT_SYNC,       /*!< synchronized with group */
    GCS_ACT_FLOW,       /*!< flow control */
    GCS_ACT_SERVICE,    /*!< service action, sent by GCS */
    GCS_ACT_ERROR,      /*!< error happened while receiving the action */
    GCS_ACT_UNKNOWN     /*!< undefined/unknown action type */
} gcs_act_type_t;

/*! String representations of action types */
extern const char* gcs_act_type_to_str(gcs_act_type_t);

/*! Action object passed to gcs_recv() and gcs_repl().
 *  NOTE(review): the field descriptions are inferred from the field names and
 *  from the gcs_send()/gcs_recv() documentation in this header - confirm
 *  against the implementation. */
struct gcs_action
{
    const void*    buf;     /*!< action buffer */
    ssize_t        size;    /*!< action size */
    gcs_seqno_t    seqno_g; /*!< global seqno (presumably the global action ID) */
    gcs_seqno_t    seqno_l; /*!< local seqno (presumably the local action ID) */
    gcs_act_type_t type;    /*!< action type */
};

/*! @brief Sends an action to the group and returns.
 *
 * A copy of the action will be returned through a gcs_recv() call, or
 * discarded if the group does not deliver it.
 * gcs_repl() is a better means to replicate an action. @see gcs_repl()
 *
 * @param conn      group connection handle
 * @param act_buf   action buffer
 * @param act_size  action size
 * @param act_type  action type
 * @param scheduled whether the call was scheduled by gcs_schedule()
 * @return          negative error code, action size in case of success
 * @retval -EINTR   thread was interrupted while waiting to enter the monitor
 */
extern long
gcs_send (gcs_conn_t*    conn,
          const void*    act_buf,
          size_t         act_size,
          gcs_act_type_t act_type,
          bool           scheduled);

/*! @brief Receives an action from the group.
 *
 * Blocks if no actions are available. The action buffer is allocated by GCS
 * and must be freed by the application once the action is no longer needed.
 *
 * The call also sets the global and local action IDs. The global action ID
 * uniquely identifies the action in the history of the group and can be used
 * to identify the state of the application for state snapshot purposes. The
 * local action ID is a monotonic gapless number sequence starting with 1,
 * which can be used to serialize access to critical sections.
 *
 * @param conn   group connection handle
 * @param action action object
 * @return       negative error code, action size in case of success
 * @retval 0     on connection close
 */
extern long
gcs_recv (gcs_conn_t* conn, struct gcs_action* action);

/*! @brief Replicates an action.
 *
 * Sends the action to the group and blocks until it is received. Upon return
 * the global and local IDs are set. Arguments are the same as in gcs_recv().
 * @see gcs_recv()
 *
 * @param conn      group connection handle
 * @param action    action object
 * @param scheduled whether the call was preceded by gcs_schedule()
 * @return          negative error code, action size in case of success
 * @retval -EINTR   thread was interrupted while waiting to enter the monitor
 */
extern long
gcs_repl (gcs_conn_t* conn, struct gcs_action* action, bool scheduled);

/*!
 * @brief Schedules entry to the GCS send monitor.
 *
 * Locks the send monitor, so it should be quickly followed by
 * gcs_repl()/gcs_send().
 *
 * @param conn GCS connection
 *
 * @retval 0       won't queue
 * @retval >0      queue handle
 * @retval -EAGAIN too many queued threads
 * @retval -EBADFD connection is closed
 */
extern long
gcs_schedule (gcs_conn_t* conn);

/*!
 * @brief Interrupts a thread waiting to enter the send monitor.
 *
 * @param conn   GCS connection
 * @param handle queue handle returned by gcs_schedule(). Must be > 0
 *
 * @retval 0      success
 * @retval -ESRCH no such thread/already interrupted
 */
extern long
gcs_interrupt (gcs_conn_t* conn, long handle);

/*!
 * @brief Resumes receiving from the group.
 *
 * @param conn GCS connection
 *
 * @retval 0       success
 * @retval -EBADFD connection is in closed state
 */
extern long
gcs_resume_recv (gcs_conn_t* conn);

/*!
 * @brief Returns the seqno that establishes causality for this thread.
 *
 * After the action with the returned seqno is applied, this thread is
 * guaranteed to see all the changes made by the client, even on other nodes.
 *
 * @param conn GCS connection
 * @return global sequence number or negative error code
 */
extern gcs_seqno_t
gcs_caused (gcs_conn_t* conn);

/*! @brief Sends a state transfer request.
 *
 * Broadcasts a state transfer request, which will be passed to one of the
 * suitable group members.
 *
 * @param conn  connection to group
 * @param req   opaque byte array containing the data required for the state
 *              transfer (application dependent)
 * @param size  request size
 * @param donor name of the desired state transfer donor. Supply an empty
 *              string to choose automatically.
 * @param seqno response to request was ordered with this seqno.
 *              Must be skipped in local queues.
 * @return negative error code, index of the state transfer donor in case of
 *         success (notably, -EAGAIN means try later, -EHOSTUNREACH means the
 *         desired donor is unavailable)
 */
extern long
gcs_request_state_transfer (gcs_conn_t*  conn,
                            const void*  req,
                            size_t       size,
                            const char*  donor,
                            gcs_seqno_t* seqno);

/*! @brief Turns off flow control on the node.
 *
 * Effectively desynchronizes the node from the cluster, while the node keeps
 * on receiving all the actions. gcs_join() is required to return to normal.
 *
 * @param conn  connection to group
 * @param seqno response to request was ordered with this seqno.
 *              Must be skipped in local queues.
 * @return negative error code, 0 in case of success.
 */
extern long
gcs_desync (gcs_conn_t* conn, gcs_seqno_t* seqno);

/*! @brief Informs the group, on behalf of the donor, that state transfer is over.
 *
 * If status is non-negative, the joiner will be considered fully joined to
 * the group.
 *
 * @param conn   opened connection to group
 * @param status negative error code in case of state transfer failure,
 *               0 or (optional) seqno corresponding to the transferred state.
 * @return negative error code, 0 in case of success
 */
extern long
gcs_join (gcs_conn_t* conn, gcs_seqno_t status);

/*! @brief Allocates a local seqno for accessing local resources.
 *
 * @param conn connection to group
 * @return local seqno, negative error code in case of error
 */
extern gcs_seqno_t
gcs_local_sequence (gcs_conn_t* conn);


///////////////////////////////////////////////////////////////////////////////

/* Service functions */

/*! Informs the group about the last action applied on this node.
 *
 * @param conn  GCS connection
 * @param seqno seqno of the last applied action
 */
extern long
gcs_set_last_applied (gcs_conn_t* conn, gcs_seqno_t seqno);

/* GCS Configuration */

/*! Sets the key to the given value.
 *
 * @param conn  GCS connection
 * @param key   name of the parameter to set
 * @param value value to assign
 * @return 0 in case of success, 1 if key not found, or negative error code
 */
extern long
gcs_param_set (gcs_conn_t* conn, const char* key, const char* value);

/*! Returns the value of the key.
 *
 * @param conn GCS connection
 * @param key  name of the parameter to look up
 * @return value of the key, NULL if key not found
 */
extern const char*
gcs_param_get (gcs_conn_t* conn, const char* key);

/* Logging options.
 *
 * The parameterless functions are declared with an explicit (void): in C an
 * empty list "()" leaves the parameters unspecified, so the compiler would
 * silently accept calls with stray arguments. In C++ both forms are identical.
 *
 * NOTE(review): this header does not document the semantics or the return
 * values of these functions; the summary below is inferred from their names
 * only - confirm against the implementation.
 *   gcs_conf_set_log_file()      - presumably selects the stream to log to
 *   gcs_conf_set_log_callback()  - presumably installs a custom logger that
 *                                  is passed an int and a message string
 *   gcs_conf_self_tstamp_on/off  - presumably toggle log timestamps
 *   gcs_conf_debug_on/off        - presumably toggle debug logging
 */
extern long gcs_conf_set_log_file     (FILE *file);
extern long gcs_conf_set_log_callback (void (*logger) (int, const char*));
extern long gcs_conf_self_tstamp_on   (void);
extern long gcs_conf_self_tstamp_off  (void);
extern long gcs_conf_debug_on         (void);
extern long gcs_conf_debug_off        (void);

/* Sending options (deprecated, use gcs_param_set instead) */

/*! Sets the maximum DESIRED network packet size.
 *  For best results it should be a multiple of the MTU.
 *
 * @param conn     GCS connection
 * @param pkt_size desired packet size
 */
extern long gcs_conf_set_pkt_size (gcs_conn_t* conn, long pkt_size);

/*! Default packet size.
 *  NOTE(review): presumably the default for the desired network packet size
 *  set via gcs_conf_set_pkt_size() - confirm against the implementation. */
#define GCS_DEFAULT_PKT_SIZE 64500 /* 43 Eth. frames to carry max IP packet */

/*
 * Configuration action
 */

/*! Possible node states.
 *  GCS_NODE_STATE_MAX is the last enumerator and so equals the number of
 *  states listed before it. */
typedef enum gcs_node_state
{
    GCS_NODE_STATE_NON_PRIM, /*!< in non-primary configuration, outdated state */
    GCS_NODE_STATE_PRIM,     /*!< in primary conf, needs state transfer */
    GCS_NODE_STATE_JOINER,   /*!< in primary conf, receiving state transfer */
    GCS_NODE_STATE_DONOR,    /*!< joined, donating state transfer */
    GCS_NODE_STATE_JOINED,   /*!< contains full state */
    GCS_NODE_STATE_SYNCED,   /*!< synchronized with group */
    GCS_NODE_STATE_MAX
} gcs_node_state_t;

/*! Converts a state code to a null-terminated string.
 *
 * @param state node state code
 */
extern const char* gcs_node_state_to_str (gcs_node_state_t state);

/*! New configuration action.
 *  The trailing data[] member is declared with a single element; per its
 *  description it holds the member array, so the object is presumably
 *  allocated larger than sizeof(gcs_act_conf_t) - confirm with the
 *  implementation. */
typedef struct gcs_act_conf
{
    gcs_seqno_t      seqno;              /*!< last global seqno applied by this group */
    gcs_seqno_t      conf_id;            /*!< configuration ID (-1 if non-primary) */
    uint8_t          uuid[GCS_UUID_LEN]; /*!< group UUID */
    long             memb_num;           /*!< number of members in configuration */
    long             my_idx;             /*!< index of this node in the configuration */
    gcs_node_state_t my_state;           /*!< current node state */
    int              repl_proto_ver;     /*!< replicator  protocol version to use */
    int              appl_proto_ver;     /*!< application protocol version to use */
    char             data[1];            /*!< member array (null-terminated ID,
                                          *   name, incoming address) */
}
gcs_act_conf_t;

/*! Statistics snapshot filled in by gcs_get_stats(). */
struct gcs_stats
{
    double send_q_len_avg; /*!< average send queue length per send call */
    double recv_q_len_avg; /*!< average recv queue length per queued action */
    double fc_paused;      /*!< fraction of time paused due to flow control */
    size_t recv_q_size;    /*!< current recv queue size */
    long   recv_q_len;     /*!< current recv queue length */
    long   send_q_len;     /*!< current send queue length */
    long   fc_sent;        /*!< flow control stops sent */
    long   fc_received;    /*!< flow control stops received */
};

/*! Fills the stats struct and resets the stats counters.
 *
 * @param conn  GCS connection
 * @param stats struct to fill
 */
extern void
gcs_get_stats (gcs_conn_t* conn, struct gcs_stats* stats);

/*! Reserved node name: a node with this name will be treated as a stateless
 *  arbitrator. */
#define GCS_ARBITRATOR_NAME "garb"

#ifdef __cplusplus
}
#endif

#endif // _gcs_h_