1
/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
4
* The main memcached header holding commonly used data
5
* structures and function prototypes.
14
#include <sys/types.h>
15
#include <sys/socket.h>
17
#include <netinet/in.h>
23
#include "protocol_binary.h"
26
#include "sasl_defs.h"
28
/** Maximum length of a key. */
29
#define KEY_MAX_LENGTH 250
31
/** Size of an incr buf. */
32
#define INCR_MAX_STORAGE_LEN 24
34
#define DATA_BUFFER_SIZE 2048
35
#define UDP_READ_BUFFER_SIZE 65536
36
#define UDP_MAX_PAYLOAD_SIZE 1400
37
#define UDP_HEADER_SIZE 8
38
#define MAX_SENDBUF_SIZE (256 * 1024 * 1024)
39
/* I'm told the max length of a 64-bit num converted to string is 20 bytes.
40
* Plus a few for spaces, \r\n, \0 */
41
#define SUFFIX_SIZE 24
43
/** Initial size of list of items being returned by "get". */
44
#define ITEM_LIST_INITIAL 200
46
/** Initial size of list of CAS suffixes appended to "gets" lines. */
47
#define SUFFIX_LIST_INITIAL 20
49
/** Initial size of the sendmsg() scatter/gather array. */
50
#define IOV_LIST_INITIAL 400
52
/** Initial number of sendmsg() argument structures to allocate. */
53
#define MSG_LIST_INITIAL 10
55
/** High water marks for buffer shrinking */
56
#define READ_BUFFER_HIGHWAT 8192
57
#define ITEM_LIST_HIGHWAT 400
58
#define IOV_LIST_HIGHWAT 600
59
#define MSG_LIST_HIGHWAT 100
61
/* Binary protocol stuff */
62
#define MIN_BIN_PKT_LENGTH 16
63
#define BIN_PKT_HDR_WORDS (MIN_BIN_PKT_LENGTH/sizeof(uint32_t))
65
/* Initial power multiplier for the hash table */
66
#define HASHPOWER_DEFAULT 16
68
/* unistd.h is here */
73
/* Slab sizing definitions. */
74
#define POWER_SMALLEST 1
75
#define POWER_LARGEST 200
76
#define CHUNK_ALIGN_BYTES 8
77
#define DONT_PREALLOC_SLABS
78
#define MAX_NUMBER_OF_SLAB_CLASSES (POWER_LARGEST + 1)
80
/** How long an object can reasonably be assumed to be locked before
81
harvesting it on a low memory condition. */
82
#define TAIL_REPAIR_TIME (3 * 3600)
84
/* Warning: never pass a function call (or any expression with side effects)
 * to these macros; the argument is evaluated twice. */
85
/**
 * Read the CAS value of item i.
 *
 * Yields (i)->data->cas when ITEM_CAS is set in (i)->it_flags and
 * (uint64_t)0 otherwise.  The argument is expanded twice, so it must be
 * free of side effects.
 */
#define ITEM_get_cas(i) (((i)->it_flags & ITEM_CAS) ? \
        (i)->data->cas : (uint64_t)0)
88
#define ITEM_set_cas(i,v) { \
89
if ((i)->it_flags & ITEM_CAS) { \
94
/**
 * Address of the item's key.
 *
 * The key begins at the `data` member, or sizeof(uint64_t) bytes past it
 * when ITEM_CAS is set in it_flags (those bytes then hold the CAS value).
 * `item` is expanded more than once; pass an expression without side
 * effects.
 */
#define ITEM_key(item) (((char*)&((item)->data)) \
         + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
97
/**
 * Address of the item's flags-and-length suffix.
 *
 * The suffix follows the optional 8-byte CAS value, the nkey bytes of the
 * key and one byte for the key's terminating null.  `item` is expanded
 * more than once; pass an expression without side effects.
 */
#define ITEM_suffix(item) ((char*) &((item)->data) + (item)->nkey + 1 \
         + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
100
/**
 * Address of the item's value bytes.
 *
 * The value is stored after the optional 8-byte CAS value, the key
 * (nkey bytes plus its terminating null) and the nsuffix-byte
 * flags-and-length suffix.  Without the nsuffix term this macro would
 * alias ITEM_suffix and point at the suffix instead of the data.
 * `item` is expanded more than once; pass an expression without side
 * effects.
 */
#define ITEM_data(item) ((char*) &((item)->data) + (item)->nkey + 1 \
         + (item)->nsuffix \
         + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
104
/**
 * Total number of bytes occupied by an item: the fixed struct _stritem
 * header, the key plus its terminating null, the suffix, the value, and
 * the 8-byte CAS slot when ITEM_CAS is set in it_flags.
 * `item` is expanded more than once; pass an expression without side
 * effects.
 */
#define ITEM_ntotal(item) (sizeof(struct _stritem) + (item)->nkey + 1 \
         + (item)->nsuffix + (item)->nbytes \
         + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
108
#define STAT_KEY_LEN 128
109
#define STAT_VAL_LEN 128
111
/**
 * Append a simple stat with a stat name, value format and value.
 *
 * Expands to a call to append_stat() and picks up two identifiers from the
 * call site: `add_stats` and `c`.  The expansion already ends with a
 * semicolon, so a further `;` written by the caller is an empty statement.
 */
#define APPEND_STAT(name, fmt, val) \
    append_stat(name, add_stats, c, fmt, val);
115
/**
 * Append an indexed stat with a stat name (with format), value format
 * and value.
 *
 * Formats the key from name_fmt/num/name into key_str and the value from
 * fmt/val into val_str, then passes both buffers and the lengths returned
 * by snprintf() to add_stats().  The call site must have key_str, val_str,
 * klen, vlen, add_stats and c in scope.
 *
 * The three statements are grouped in do/while(0) so the macro behaves as
 * a single statement under an unbraced `if` or loop.  The trailing
 * semicolon is kept deliberately: the expansion has always ended in one,
 * and dropping it could break call sites (or wrapper macros) that rely on
 * it.
 */
#define APPEND_NUM_FMT_STAT(name_fmt, num, name, fmt, val)           \
    do {                                                             \
        klen = snprintf(key_str, STAT_KEY_LEN, name_fmt, num, name); \
        vlen = snprintf(val_str, STAT_VAL_LEN, fmt, val);            \
        add_stats(key_str, klen, val_str, vlen, c);                  \
    } while (0);
122
/**
 * Common APPEND_NUM_FMT_STAT format: forwards to APPEND_NUM_FMT_STAT with
 * the key format "%d:%s", i.e. the key is built from num and name
 * separated by a colon.
 */
#define APPEND_NUM_STAT(num, name, fmt, val) \
    APPEND_NUM_FMT_STAT("%d:%s", num, name, fmt, val)
127
* Callback for any function producing stats.
129
* @param key the stat's key
130
* @param klen length of the key
131
* @param val the stat's value in an ascii form (e.g. text form of a number)
132
* @param vlen length of the value
133
* @param cookie magic callback cookie
135
typedef void (*ADD_STAT)(const char *key, const uint16_t klen,
136
const char *val, const uint32_t vlen,
140
* NOTE: If you modify this table you _MUST_ update the function state_text
143
* Possible states of a connection.
146
conn_listening, /**< the socket which listens for connections */
147
conn_new_cmd, /**< Prepare connection for next command */
148
conn_waiting, /**< waiting for a readable socket */
149
conn_read, /**< reading in a command line */
150
conn_parse_cmd, /**< try to parse a command from the input buffer */
151
conn_write, /**< writing out a simple response */
152
conn_nread, /**< reading in a fixed number of bytes */
153
conn_swallow, /**< swallowing unnecessary bytes w/o storing */
154
conn_closing, /**< closing this connection */
155
conn_mwrite, /**< writing out many items sequentially */
156
conn_max_state /**< Max state value (used for assertion) */
161
bin_reading_set_header,
162
bin_reading_cas_header,
166
bin_reading_del_header,
167
bin_reading_incr_header,
168
bin_read_flush_exptime,
169
bin_reading_sasl_auth,
170
bin_reading_sasl_auth_data,
171
bin_reading_touch_key,
175
ascii_prot = 3, /* arbitrary value. */
177
negotiating_prot /* Discovering the protocol */
180
enum network_transport {
181
local_transport, /* Unix sockets*/
186
/* True when transport x equals udp_transport.  The argument is
 * parenthesized so that compound expressions (conditional, bitwise, ...)
 * are compared as a whole. */
#define IS_UDP(x) ((x) == udp_transport)
190
#define NREAD_REPLACE 3
191
#define NREAD_APPEND 4
192
#define NREAD_PREPEND 5
195
enum store_item_type {
196
NOT_STORED=0, STORED, EXISTS, NOT_FOUND
199
enum delta_result_type {
200
OK, NON_NUMERIC, EOM, DELTA_ITEM_NOT_FOUND, DELTA_ITEM_CAS_MISMATCH
203
/** Time relative to server start. Smaller than time_t on 64-bit systems. */
204
typedef unsigned int rel_time_t;
206
/** Stats stored per slab (and per thread). */
211
uint64_t delete_hits;
219
* Stats stored per-thread.
221
struct thread_stats {
222
pthread_mutex_t mutex;
226
uint64_t touch_misses;
227
uint64_t delete_misses;
228
uint64_t incr_misses;
229
uint64_t decr_misses;
232
uint64_t bytes_written;
234
uint64_t conn_yields; /* # of yields for connections (-R option)*/
236
uint64_t auth_errors;
237
struct slab_stats slab_stats[MAX_NUMBER_OF_SLAB_CLASSES];
244
pthread_mutex_t mutex;
245
unsigned int curr_items;
246
unsigned int total_items;
248
unsigned int curr_conns;
249
unsigned int total_conns;
250
uint64_t rejected_conns;
251
unsigned int reserved_fds;
252
unsigned int conn_structs;
259
uint64_t touch_misses;
262
time_t started; /* when the process was started */
263
bool accepting_conns; /* whether we are currently accepting */
264
uint64_t listen_disabled_num;
265
unsigned int hash_power_level; /* Better hope it's not over 9000 */
266
uint64_t hash_bytes; /* size used for hash tables */
267
bool hash_is_expanding; /* If the hash table is being expanded */
268
uint64_t expired_unfetched; /* items reclaimed but never touched */
269
uint64_t evicted_unfetched; /* items evicted but never touched */
270
bool slab_reassign_running; /* slab reassign in progress */
271
uint64_t slabs_moved; /* times slabs were moved around */
274
#define MAX_VERBOSITY_LEVEL 2
276
/* When adding a setting, be sure to update process_stat_settings */
278
* Globally accessible settings as derived from the commandline.
287
rel_time_t oldest_live; /* ignore existing items older than this */
289
char *socketpath; /* path to unix socket if using local socket */
290
int access; /* access mask (a la chmod) for unix domain socket */
291
double factor; /* chunk size growth factor */
293
int num_threads; /* number of worker (without dispatcher) libevent threads to run */
294
int num_threads_per_udp; /* number of worker threads serving each udp socket */
295
char prefix_delimiter; /* character that marks a key prefix (for stats) */
296
int detail_enabled; /* nonzero if we're collecting detailed stats */
297
int reqs_per_event; /* Maximum number of io to process on each
300
enum protocol binding_protocol;
302
int item_size_max; /* Maximum item size, and upper end for slabs */
303
bool sasl; /* SASL on/off */
304
bool maxconns_fast; /* Whether or not to early close connections */
305
bool slab_reassign; /* Whether or not slab reassignment is allowed */
306
bool slab_automove; /* Whether or not to automatically move slabs */
307
int hashpower_init; /* Starting hash power level */
310
#ifndef __INTEL_COMPILER
311
#pragma GCC diagnostic ignored "-Wshadow"
313
extern struct stats stats;
314
extern time_t process_started;
315
extern struct settings settings;
317
#define ITEM_LINKED 1
321
#define ITEM_SLABBED 4
323
#define ITEM_FETCHED 8
325
#ifndef __INTEL_COMPILER
326
#pragma GCC diagnostic ignored "-Wshadow"
329
* Structure for storing items within memcached.
331
typedef struct _stritem {
332
struct _stritem *next;
333
struct _stritem *prev;
334
struct _stritem *h_next; /* hash chain next */
335
rel_time_t time; /* least recent access */
336
rel_time_t exptime; /* expire time */
337
int nbytes; /* size of data */
338
unsigned short refcount;
339
uint8_t nsuffix; /* length of flags-and-length string */
340
uint8_t it_flags; /* ITEM_* above */
341
uint8_t slabs_clsid;/* which slab class we're in */
342
uint8_t nkey; /* key length, w/terminating null and padding */
343
/* this odd type prevents type-punning issues when we do
344
* the little shuffle to save space when not using CAS. */
349
/* if it_flags & ITEM_CAS we have 8 bytes CAS */
350
/* then null-terminated key */
351
/* then " flags length\r\n" (no terminating null) */
352
/* then data with terminating \r\n (no terminating null; it's binary!) */
356
pthread_t thread_id; /* unique ID of this thread */
357
struct event_base *base; /* libevent handle this thread uses */
358
struct event notify_event; /* listen event for notify pipe */
359
int notify_receive_fd; /* receiving end of notify pipe */
360
int notify_send_fd; /* sending end of notify pipe */
361
struct thread_stats stats; /* Stats generated by this thread */
362
struct conn_queue *new_conn_queue; /* queue of new connections to handle */
363
cache_t *suffix_cache; /* suffix cache */
367
pthread_t thread_id; /* unique ID of this thread */
368
struct event_base *base; /* libevent handle this thread uses */
369
} LIBEVENT_DISPATCHER_THREAD;
372
* The structure representing a connection into memcached.
374
typedef struct conn conn;
377
sasl_conn_t *sasl_conn;
378
enum conn_states state;
379
enum bin_substates substate;
382
short which; /** which events were just triggered */
384
char *rbuf; /** buffer to read commands into */
385
char *rcurr; /** but if we parsed some already, this is where we stopped */
386
int rsize; /** total allocated size of rbuf */
387
int rbytes; /** how much data, starting from rcur, do we have unparsed */
393
/** which state to go into after finishing current write */
394
enum conn_states write_and_go;
395
void *write_and_free; /** free this memory after finishing writing */
397
char *ritem; /** when we read in an item's value, it goes here */
400
/* data for the nread state */
403
* item is used to hold an item structure created after reading the command
404
* line of set/add/replace commands, but before we finished reading the actual
405
* data. The data is read into ITEM_data(item) to avoid extra copying.
408
void *item; /* for commands set/add/replace */
410
/* data for the swallow state */
411
int sbytes; /* how many bytes to swallow */
413
/* data for the mwrite state */
415
int iovsize; /* number of elements allocated in iov[] */
416
int iovused; /* number of elements used in iov[] */
418
struct msghdr *msglist;
419
int msgsize; /* number of elements allocated in msglist[] */
420
int msgused; /* number of elements used in msglist[] */
421
int msgcurr; /* element in msglist[] being transmitted now */
422
int msgbytes; /* number of bytes in current msg */
424
item **ilist; /* list of items to write out */
434
enum protocol protocol; /* which protocol this connection speaks */
435
enum network_transport transport; /* what transport is used by this connection */
437
/* data for UDP clients */
438
int request_id; /* Incoming UDP request ID, if this is a UDP "connection" */
439
struct sockaddr request_addr; /* Who sent the most recent request */
440
socklen_t request_addr_size;
441
unsigned char *hdrbuf; /* udp packet headers */
442
int hdrsize; /* number of headers' worth of space is allocated */
444
bool noreply; /* True if the reply should not be sent. */
445
/* current stats command */
452
/* Binary protocol stuff */
453
/* This is where the binary header goes */
454
protocol_binary_request_header binary_header;
455
uint64_t cas; /* the cas to return */
456
short cmd; /* current command being processed */
459
conn *next; /* Used for generating a list of conn structures */
460
LIBEVENT_THREAD *thread; /* Pointer to the thread object serving this connection */
464
/* current time of day (updated periodically) */
465
extern volatile rel_time_t current_time;
467
/* TODO: Move to slabs.h? */
468
extern volatile int slab_rebalance_signal;
470
struct slab_rebalance {
480
extern struct slab_rebalance slab_rebal;
485
void do_accept_new_conns(const bool do_accept);
486
enum delta_result_type do_add_delta(conn *c, const char *key,
487
const size_t nkey, const bool incr,
488
const int64_t delta, char *buf,
489
uint64_t *cas, const uint32_t hv);
490
enum store_item_type do_store_item(item *item, int comm, conn* c, const uint32_t hv);
491
conn *conn_new(const int sfd, const enum conn_states init_state, const int event_flags, const int read_buffer_size, enum network_transport transport, struct event_base *base);
492
extern int daemonize(int nochdir, int noclose);
494
static inline int mutex_lock(pthread_mutex_t *mutex)
496
while (pthread_mutex_trylock(mutex));
500
#define mutex_unlock(x) pthread_mutex_unlock(x)
511
* Functions such as the libevent-related calls that need to do cross-thread
512
* communication in multithreaded mode (rather than actually doing the work
513
* in the current thread) are called via "dispatch_" frontends, which are
514
* also #define-d to directly call the underlying code in singlethreaded mode.
517
void thread_init(int nthreads, struct event_base *main_base);
518
int dispatch_event_add(int thread, conn *c);
519
void dispatch_conn_new(int sfd, enum conn_states init_state, int event_flags, int read_buffer_size, enum network_transport transport);
521
/* Lock wrappers for cache functions that are called from main loop. */
522
enum delta_result_type add_delta(conn *c, const char *key,
523
const size_t nkey, const int incr,
524
const int64_t delta, char *buf,
526
void accept_new_conns(const bool do_accept);
527
conn *conn_from_freelist(void);
528
bool conn_add_to_freelist(conn *c);
529
int is_listen_thread(void);
530
item *item_alloc(char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes);
531
char *item_cachedump(const unsigned int slabs_clsid, const unsigned int limit, unsigned int *bytes);
532
void item_flush_expired(void);
533
item *item_get(const char *key, const size_t nkey);
534
item *item_touch(const char *key, const size_t nkey, uint32_t exptime);
535
int item_link(item *it);
536
void item_remove(item *it);
537
int item_replace(item *it, item *new_it, const uint32_t hv);
538
void item_stats(ADD_STAT add_stats, void *c);
539
void item_stats_sizes(ADD_STAT add_stats, void *c);
540
void item_unlink(item *it);
541
void item_update(item *it);
543
void item_lock(uint32_t hv);
544
void item_unlock(uint32_t hv);
545
unsigned short refcount_incr(unsigned short *refcount);
546
unsigned short refcount_decr(unsigned short *refcount);
547
void STATS_LOCK(void);
548
void STATS_UNLOCK(void);
549
void threadlocal_stats_reset(void);
550
void threadlocal_stats_aggregate(struct thread_stats *stats);
551
void slab_stats_aggregate(struct thread_stats *stats, struct slab_stats *out);
553
/* Stat processing functions */
554
void append_stat(const char *name, ADD_STAT add_stats, conn *c,
555
const char *fmt, ...);
557
enum store_item_type store_item(item *item, int comm, conn *c);
559
#if defined(HAVE_DROP_PRIVILEGES) && HAVE_DROP_PRIVILEGES
560
extern void drop_privileges(void);
562
#define drop_privileges()
565
/* If supported, give compiler hints for branch prediction. */
566
#if !defined(__GNUC__) || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
567
#define __builtin_expect(x, expected_value) (x)
570
/* Branch hint: x is expected to be true.  !! reduces any truthy value
 * (including pointers) to 1 so that it matches the expected value handed
 * to __builtin_expect; the result is therefore 0 or 1. */
#define likely(x)       __builtin_expect(!!(x), 1)
571
/* Branch hint: x is expected to be false.  !! reduces any truthy value
 * (including pointers) to 1 so the argument is always a plain 0 or 1;
 * the result is therefore 0 or 1. */
#define unlikely(x)     __builtin_expect(!!(x), 0)