2
* Fast Weighted Round Robin load balancing algorithm.
4
* Copyright 2000-2009 Willy Tarreau <w@1wt.eu>
6
* This program is free software; you can redistribute it and/or
7
* modify it under the terms of the GNU General Public License
8
* as published by the Free Software Foundation; either version
9
* 2 of the License, or (at your option) any later version.
13
#include <common/compat.h>
14
#include <common/config.h>
15
#include <common/debug.h>
18
#include <types/global.h>
19
#include <types/server.h>
21
#include <proto/backend.h>
22
#include <proto/queue.h>
24
static inline void fwrr_remove_from_tree(struct server *s);
25
static inline void fwrr_queue_by_weight(struct eb_root *root, struct server *s);
26
static inline void fwrr_dequeue_srv(struct server *s);
27
static void fwrr_get_srv(struct server *s);
28
static void fwrr_queue_srv(struct server *s);
31
/* This function updates the server trees according to server <srv>'s new
32
* state. It should be called when server <srv>'s status changes to down.
33
* It is not important whether the server was already down or not. It is not
34
* important either that the new state is completely down (the caller may not
35
* know all the variables of a server's state).
37
static void fwrr_set_server_status_down(struct server *srv)
39
struct proxy *p = srv->proxy;
40
struct fwrr_group *grp;
42
if (srv->state == srv->prev_state &&
43
srv->eweight == srv->prev_eweight)
46
if (srv_is_usable(srv->state, srv->eweight))
47
goto out_update_state;
49
if (!srv_is_usable(srv->prev_state, srv->prev_eweight))
50
/* server was already down */
51
goto out_update_backend;
53
grp = (srv->state & SRV_BACKUP) ? &p->lbprm.fwrr.bck : &p->lbprm.fwrr.act;
54
grp->next_weight -= srv->prev_eweight;
56
if (srv->state & SRV_BACKUP) {
57
p->lbprm.tot_wbck = p->lbprm.fwrr.bck.next_weight;
60
if (srv == p->lbprm.fbck) {
61
/* we lost the first backup server in a single-backup
62
* configuration, we must search another one.
64
struct server *srv2 = p->lbprm.fbck;
68
!((srv2->state & SRV_BACKUP) &&
69
srv_is_usable(srv2->state, srv2->eweight)));
73
p->lbprm.tot_wact = p->lbprm.fwrr.act.next_weight;
77
fwrr_dequeue_srv(srv);
78
fwrr_remove_from_tree(srv);
81
/* check/update tot_used, tot_weight */
82
update_backend_weight(p);
84
srv->prev_state = srv->state;
85
srv->prev_eweight = srv->eweight;
88
/* This function updates the server trees according to server <srv>'s new
89
* state. It should be called when server <srv>'s status changes to up.
90
* It is not important whether the server was already down or not. It is not
91
* important either that the new state is completely UP (the caller may not
92
* know all the variables of a server's state). This function will not change
93
* the weight of a server which was already up.
95
static void fwrr_set_server_status_up(struct server *srv)
97
struct proxy *p = srv->proxy;
98
struct fwrr_group *grp;
100
if (srv->state == srv->prev_state &&
101
srv->eweight == srv->prev_eweight)
104
if (!srv_is_usable(srv->state, srv->eweight))
105
goto out_update_state;
107
if (srv_is_usable(srv->prev_state, srv->prev_eweight))
108
/* server was already up */
109
goto out_update_backend;
111
grp = (srv->state & SRV_BACKUP) ? &p->lbprm.fwrr.bck : &p->lbprm.fwrr.act;
112
grp->next_weight += srv->eweight;
114
if (srv->state & SRV_BACKUP) {
115
p->lbprm.tot_wbck = p->lbprm.fwrr.bck.next_weight;
118
if (!(p->options & PR_O_USE_ALL_BK)) {
119
if (!p->lbprm.fbck) {
120
/* there was no backup server anymore */
123
/* we may have restored a backup server prior to fbck,
124
* in which case it should replace it.
126
struct server *srv2 = srv;
129
} while (srv2 && (srv2 != p->lbprm.fbck));
135
p->lbprm.tot_wact = p->lbprm.fwrr.act.next_weight;
139
/* note that eweight cannot be 0 here */
141
srv->npos = grp->curr_pos + (grp->next_weight + grp->curr_weight - grp->curr_pos) / srv->eweight;
145
/* check/update tot_used, tot_weight */
146
update_backend_weight(p);
148
srv->prev_state = srv->state;
149
srv->prev_eweight = srv->eweight;
152
/* This function must be called after an update to server <srv>'s effective
153
* weight. It may be called after a state change too.
155
static void fwrr_update_server_weight(struct server *srv)
157
int old_state, new_state;
158
struct proxy *p = srv->proxy;
159
struct fwrr_group *grp;
161
if (srv->state == srv->prev_state &&
162
srv->eweight == srv->prev_eweight)
165
/* If changing the server's weight changes its state, we simply apply
166
* the procedures we already have for status change. If the state
167
* remains down, the server is not in any tree, so it's as easy as
168
* updating its values. If the state remains up with different weights,
169
* there are some computations to perform to find a new place and
170
* possibly a new tree for this server.
173
old_state = srv_is_usable(srv->prev_state, srv->prev_eweight);
174
new_state = srv_is_usable(srv->state, srv->eweight);
176
if (!old_state && !new_state) {
177
srv->prev_state = srv->state;
178
srv->prev_eweight = srv->eweight;
181
else if (!old_state && new_state) {
182
fwrr_set_server_status_up(srv);
185
else if (old_state && !new_state) {
186
fwrr_set_server_status_down(srv);
190
grp = (srv->state & SRV_BACKUP) ? &p->lbprm.fwrr.bck : &p->lbprm.fwrr.act;
191
grp->next_weight = grp->next_weight - srv->prev_eweight + srv->eweight;
193
p->lbprm.tot_wact = p->lbprm.fwrr.act.next_weight;
194
p->lbprm.tot_wbck = p->lbprm.fwrr.bck.next_weight;
196
if (srv->lb_tree == grp->init) {
197
fwrr_dequeue_srv(srv);
198
fwrr_queue_by_weight(grp->init, srv);
200
else if (!srv->lb_tree) {
201
/* FIXME: server was down. This is not possible right now but
202
* may be needed soon for slowstart or graceful shutdown.
204
fwrr_dequeue_srv(srv);
206
srv->npos = grp->curr_pos + (grp->next_weight + grp->curr_weight - grp->curr_pos) / srv->eweight;
209
/* The server is either active or in the next queue. If it's
210
* still in the active queue and it has not consumed all of its
211
* places, let's adjust its next position.
215
if (srv->eweight > 0) {
216
int prev_next = srv->npos;
217
int step = grp->next_weight / srv->eweight;
219
srv->npos = srv->lpos + step;
222
if (srv->npos > prev_next)
223
srv->npos = prev_next;
224
if (srv->npos < grp->curr_pos + 2)
225
srv->npos = grp->curr_pos + step;
227
/* push it into the next tree */
228
srv->npos = grp->curr_pos + grp->curr_weight;
231
fwrr_dequeue_srv(srv);
235
update_backend_weight(p);
236
srv->prev_state = srv->state;
237
srv->prev_eweight = srv->eweight;
240
/* Remove a server from a tree. It must have previously been dequeued. This
241
* function is meant to be called when a server is going down or has its
244
static inline void fwrr_remove_from_tree(struct server *s)
249
/* Queue a server in the weight tree <root>, assuming the weight is >0.
250
* We want to sort them by inverted weights, because we need to place
251
* heavy servers first in order to get a smooth distribution.
253
static inline void fwrr_queue_by_weight(struct eb_root *root, struct server *s)
255
s->lb_node.key = SRV_EWGHT_MAX - s->eweight;
256
eb32_insert(root, &s->lb_node);
260
/* This function is responsible for building the weight trees in case of fast
261
* weighted round-robin. It also sets p->lbprm.wdiv to the eweight to uweight
262
* ratio. Both active and backup groups are initialized.
264
void fwrr_init_server_groups(struct proxy *p)
267
struct eb_root init_head = EB_ROOT;
269
p->lbprm.set_server_status_up = fwrr_set_server_status_up;
270
p->lbprm.set_server_status_down = fwrr_set_server_status_down;
271
p->lbprm.update_server_eweight = fwrr_update_server_weight;
273
p->lbprm.wdiv = BE_WEIGHT_SCALE;
274
for (srv = p->srv; srv; srv = srv->next) {
275
srv->prev_eweight = srv->eweight = srv->uweight * BE_WEIGHT_SCALE;
276
srv->prev_state = srv->state;
280
update_backend_weight(p);
282
/* prepare the active servers group */
283
p->lbprm.fwrr.act.curr_pos = p->lbprm.fwrr.act.curr_weight =
284
p->lbprm.fwrr.act.next_weight = p->lbprm.tot_wact;
285
p->lbprm.fwrr.act.curr = p->lbprm.fwrr.act.t0 =
286
p->lbprm.fwrr.act.t1 = init_head;
287
p->lbprm.fwrr.act.init = &p->lbprm.fwrr.act.t0;
288
p->lbprm.fwrr.act.next = &p->lbprm.fwrr.act.t1;
290
/* prepare the backup servers group */
291
p->lbprm.fwrr.bck.curr_pos = p->lbprm.fwrr.bck.curr_weight =
292
p->lbprm.fwrr.bck.next_weight = p->lbprm.tot_wbck;
293
p->lbprm.fwrr.bck.curr = p->lbprm.fwrr.bck.t0 =
294
p->lbprm.fwrr.bck.t1 = init_head;
295
p->lbprm.fwrr.bck.init = &p->lbprm.fwrr.bck.t0;
296
p->lbprm.fwrr.bck.next = &p->lbprm.fwrr.bck.t1;
298
/* queue active and backup servers in two distinct groups */
299
for (srv = p->srv; srv; srv = srv->next) {
300
if (!srv_is_usable(srv->state, srv->eweight))
302
fwrr_queue_by_weight((srv->state & SRV_BACKUP) ?
303
p->lbprm.fwrr.bck.init :
304
p->lbprm.fwrr.act.init,
309
/* simply removes a server from a weight tree */
310
static inline void fwrr_dequeue_srv(struct server *s)
312
eb32_delete(&s->lb_node);
315
/* queues a server into the appropriate group and tree depending on its
316
* backup status, and ->npos. If the server is disabled, simply assign
317
* it to the NULL tree.
319
static void fwrr_queue_srv(struct server *s)
321
struct proxy *p = s->proxy;
322
struct fwrr_group *grp;
324
grp = (s->state & SRV_BACKUP) ? &p->lbprm.fwrr.bck : &p->lbprm.fwrr.act;
326
/* Delay everything which does not fit into the window and everything
327
* which does not fit into the theorical new window.
329
if (!srv_is_usable(s->state, s->eweight)) {
330
fwrr_remove_from_tree(s);
332
else if (s->eweight <= 0 ||
333
s->npos >= 2 * grp->curr_weight ||
334
s->npos >= grp->curr_weight + grp->next_weight) {
335
/* put into next tree, and readjust npos in case we could
336
* finally take this back to current. */
337
s->npos -= grp->curr_weight;
338
fwrr_queue_by_weight(grp->next, s);
341
/* The sorting key is stored in units of s->npos * user_weight
342
* in order to avoid overflows. As stated in backend.h, the
343
* lower the scale, the rougher the weights modulation, and the
344
* higher the scale, the lower the number of servers without
345
* overflow. With this formula, the result is always positive,
346
* so we can use eb3ļæ½_insert().
348
s->lb_node.key = SRV_UWGHT_RANGE * s->npos +
349
(unsigned)(SRV_EWGHT_MAX + s->rweight - s->eweight) / BE_WEIGHT_SCALE;
351
eb32_insert(&grp->curr, &s->lb_node);
352
s->lb_tree = &grp->curr;
356
/* prepares a server when extracting it from the "init" tree */
357
static inline void fwrr_get_srv_init(struct server *s)
359
s->npos = s->rweight = 0;
362
/* prepares a server when extracting it from the "next" tree */
363
static inline void fwrr_get_srv_next(struct server *s)
365
struct fwrr_group *grp = (s->state & SRV_BACKUP) ?
366
&s->proxy->lbprm.fwrr.bck :
367
&s->proxy->lbprm.fwrr.act;
369
s->npos += grp->curr_weight;
372
/* prepares a server when it was marked down */
373
static inline void fwrr_get_srv_down(struct server *s)
375
struct fwrr_group *grp = (s->state & SRV_BACKUP) ?
376
&s->proxy->lbprm.fwrr.bck :
377
&s->proxy->lbprm.fwrr.act;
379
s->npos = grp->curr_pos;
382
/* prepares a server when extracting it from its tree */
383
static void fwrr_get_srv(struct server *s)
385
struct proxy *p = s->proxy;
386
struct fwrr_group *grp = (s->state & SRV_BACKUP) ?
390
if (s->lb_tree == grp->init) {
391
fwrr_get_srv_init(s);
393
else if (s->lb_tree == grp->next) {
394
fwrr_get_srv_next(s);
396
else if (s->lb_tree == NULL) {
397
fwrr_get_srv_down(s);
401
/* switches trees "init" and "next" for FWRR group <grp>. "init" should be empty
402
* when this happens, and "next" filled with servers sorted by weights.
404
static inline void fwrr_switch_trees(struct fwrr_group *grp)
406
struct eb_root *swap;
408
grp->init = grp->next;
410
grp->curr_weight = grp->next_weight;
411
grp->curr_pos = grp->curr_weight;
414
/* return next server from the current tree in FWRR group <grp>, or a server
415
* from the "init" tree if appropriate. If both trees are empty, return NULL.
417
static struct server *fwrr_get_server_from_group(struct fwrr_group *grp)
419
struct eb32_node *node;
422
node = eb32_first(&grp->curr);
423
s = eb32_entry(node, struct server, lb_node);
425
if (!node || s->npos > grp->curr_pos) {
426
/* either we have no server left, or we have a hole */
427
struct eb32_node *node2;
428
node2 = eb32_first(grp->init);
431
s = eb32_entry(node, struct server, lb_node);
432
fwrr_get_srv_init(s);
433
if (s->eweight == 0) /* FIXME: is it possible at all ? */
443
/* Computes next position of server <s> in the group. It is mandatory for <s>
444
* to have a non-zero, positive eweight.
446
static inline void fwrr_update_position(struct fwrr_group *grp, struct server *s)
449
/* first time ever for this server */
450
s->lpos = grp->curr_pos;
451
s->npos = grp->curr_pos + grp->next_weight / s->eweight;
452
s->rweight += grp->next_weight % s->eweight;
454
if (s->rweight >= s->eweight) {
455
s->rweight -= s->eweight;
460
s->npos += grp->next_weight / s->eweight;
461
s->rweight += grp->next_weight % s->eweight;
463
if (s->rweight >= s->eweight) {
464
s->rweight -= s->eweight;
470
/* Return next server from the current tree in backend <p>, or a server from
471
* the init tree if appropriate. If both trees are empty, return NULL.
472
* Saturated servers are skipped and requeued.
474
struct server *fwrr_get_next_server(struct proxy *p, struct server *srvtoavoid)
476
struct server *srv, *full, *avoided;
477
struct fwrr_group *grp;
481
grp = &p->lbprm.fwrr.act;
482
else if (p->lbprm.fbck)
483
return p->lbprm.fbck;
485
grp = &p->lbprm.fwrr.bck;
491
full = NULL; /* NULL-terminated list of saturated servers */
493
/* if we see an empty group, let's first try to collect weights
494
* which might have recently changed.
496
if (!grp->curr_weight)
497
grp->curr_pos = grp->curr_weight = grp->next_weight;
499
/* get first server from the "current" tree. When the end of
500
* the tree is reached, we may have to switch, but only once.
503
srv = fwrr_get_server_from_group(grp);
511
goto requeue_servers;
514
fwrr_switch_trees(grp);
518
/* OK, we have a server. However, it may be saturated, in which
519
* case we don't want to reconsider it for now. We'll update
520
* its position and dequeue it anyway, so that we can move it
521
* to a better place afterwards.
523
fwrr_update_position(grp, srv);
524
fwrr_dequeue_srv(srv);
526
if (!srv->maxconn || (!srv->nbpend && srv->served < srv_dynamic_maxconn(srv))) {
527
/* make sure it is not the server we are trying to exclude... */
528
if (srv != srvtoavoid || avoided)
531
avoided = srv; /* ...but remember that is was selected yet avoided */
534
/* the server is saturated or avoided, let's chain it for later reinsertion */
535
srv->next_full = full;
539
/* OK, we got the best server, let's update it */
543
/* Requeue all extracted servers. If full==srv then it was
544
* avoided (unsucessfully) and chained, omit it now.
546
if (unlikely(full != NULL)) {
548
/* the tree has switched, requeue all extracted servers
549
* into "init", because their place was lost, and only
550
* their weight matters.
553
if (likely(full != srv))
554
fwrr_queue_by_weight(grp->init, full);
555
full = full->next_full;
558
/* requeue all extracted servers just as if they were consumed
559
* so that they regain their expected place.
562
if (likely(full != srv))
563
fwrr_queue_srv(full);
564
full = full->next_full;