~ubuntu-branches/ubuntu/precise/linux-ti-omap4/precise

Viewing changes to net/ipv4/inetpeer.c

  • Committer: Bazaar Package Importer
  • Author(s): Paolo Pisati
  • Date: 2011-06-29 15:23:51 UTC
  • mfrom: (26.1.1 natty-proposed)
  • Revision ID: james.westby@ubuntu.com-20110629152351-xs96tm303d95rpbk
Tags: 3.0.0-1200.2
* Rebased against 3.0.0-6.7
* BSP from TI based on 3.0.0

@@ -81 +81 @@
 
 struct inet_peer_base {
 	struct inet_peer __rcu *root;
-	spinlock_t	lock;
+	seqlock_t	lock;
 	int		total;
 };
 
 static struct inet_peer_base v4_peers = {
 	.root		= peer_avl_empty_rcu,
-	.lock		= __SPIN_LOCK_UNLOCKED(v4_peers.lock),
+	.lock		= __SEQLOCK_UNLOCKED(v4_peers.lock),
 	.total		= 0,
 };
 
 static struct inet_peer_base v6_peers = {
 	.root		= peer_avl_empty_rcu,
-	.lock		= __SPIN_LOCK_UNLOCKED(v6_peers.lock),
+	.lock		= __SEQLOCK_UNLOCKED(v6_peers.lock),
 	.total		= 0,
 };
 
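The hunk above replaces the per-base spinlock with a seqlock: writers still serialize, but readers can sample the tree locklessly and simply retry if the sequence count tells them a writer got in the way. A minimal userspace sketch of that read/retry idea, using C11 atomics (illustrative only, not the kernel seqlock implementation):

/* Userspace sketch of the seqlock read/retry idea (illustrative only). */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;     /* even = stable, odd = a writer is mid-update */
static int shared_total;    /* the data a reader wants a consistent view of */

static void writer_update(int v)
{
    /* Real writers also take a spinlock so they do not race each other,
     * and the kernel primitives add the required memory barriers. */
    atomic_fetch_add(&seq, 1);   /* sequence becomes odd */
    shared_total = v;
    atomic_fetch_add(&seq, 1);   /* sequence becomes even again */
}

static int reader_read(void)
{
    unsigned int s;
    int v;

    do {
        s = atomic_load(&seq);   /* like read_seqbegin() */
        v = shared_total;
    } while ((s & 1) || atomic_load(&seq) != s);   /* like read_seqretry() */
    return v;
}

int main(void)
{
    writer_update(42);
    printf("reader saw %d\n", reader_read());
    return 0;
}

The kernel's read_seqbegin()/read_seqretry() and write_seqlock_bh()/write_sequnlock_bh() additionally handle memory ordering and bottom-half disabling; the sketch only shows the retry shape that lets lookups run without taking the pool lock.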
@@ -154 +154 @@
 /* Called with or without local BH being disabled. */
 static void unlink_from_unused(struct inet_peer *p)
 {
-	if (!list_empty(&p->unused)) {
-		spin_lock_bh(&unused_peers.lock);
-		list_del_init(&p->unused);
-		spin_unlock_bh(&unused_peers.lock);
-	}
+	spin_lock_bh(&unused_peers.lock);
+	list_del_init(&p->unused);
+	spin_unlock_bh(&unused_peers.lock);
 }
 
 static int addr_compare(const struct inetpeer_addr *a,
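In the unlink_from_unused() hunk the unlocked list_empty() pre-check is gone and the entry is always deleted under the lock. One reason that is safe: list_del_init() leaves the entry linked to itself, so deleting an entry that is no longer on a list is a harmless no-op. A small userspace sketch of that property (hand-rolled helpers mirroring the <linux/list.h> semantics, illustrative only):

/* Userspace sketch: "deleting" an already self-linked node is a no-op. */
#include <assert.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h; h->prev = h; }

static void list_add(struct list_head *new, struct list_head *head)
{
    new->next = head->next;
    new->prev = head;
    head->next->prev = new;
    head->next = new;
}

static void list_del_init(struct list_head *e)
{
    e->prev->next = e->next;   /* unlink from whatever list it is on ... */
    e->next->prev = e->prev;
    INIT_LIST_HEAD(e);         /* ... then point the entry back at itself */
}

static int list_empty(const struct list_head *h) { return h->next == h; }

int main(void)
{
    struct list_head head, node;

    INIT_LIST_HEAD(&head);
    INIT_LIST_HEAD(&node);
    list_add(&node, &head);

    list_del_init(&node);   /* really unlinks the node */
    list_del_init(&node);   /* only relinks the node to itself: harmless */
    assert(list_empty(&head));
    puts("repeated list_del_init is safe");
    return 0;
}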
@@ -167 +165 @@
 	int i, n = (a->family == AF_INET ? 1 : 4);
 
 	for (i = 0; i < n; i++) {
-		if (a->a6[i] == b->a6[i])
+		if (a->addr.a6[i] == b->addr.a6[i])
 			continue;
-		if (a->a6[i] < b->a6[i])
+		if (a->addr.a6[i] < b->addr.a6[i])
 			return -1;
 		return 1;
 	}
@@ -177 +175 @@
 	return 0;
 }
 
+#define rcu_deref_locked(X, BASE)				\
+	rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
+
 /*
  * Called with local BH disabled and the pool lock held.
  */
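The new rcu_deref_locked() helper is just rcu_dereference_protected() plus a lockdep annotation saying the update-side lock is held. The double '.lock.lock' appears because a seqlock_t bundles a sequence counter with an inner spinlock, and it is that inner spinlock which lockdep tracks. A simplified, compilable sketch of the shape of the types (illustrative, not the actual kernel headers):

/* Simplified shape of the types involved (illustrative, not the kernel headers). */
#include <stdio.h>

typedef struct { int locked; } spinlock_t;   /* stand-in for the real spinlock */

typedef struct {
    unsigned int sequence;   /* read_seqbegin()/read_seqretry() sample this */
    spinlock_t   lock;       /* the write side serializes on this inner spinlock */
} seqlock_t;

struct inet_peer_base_like {
    void     *root;
    seqlock_t lock;          /* so &base->lock.lock names the inner spinlock */
    int       total;
};

int main(void)
{
    struct inet_peer_base_like base = { 0 };

    /* rcu_deref_locked(X, base) asserts, via lockdep_is_held(&base->lock.lock),
     * that this inner spinlock is held while X is dereferenced. */
    printf("inner spinlock lives at %p\n", (void *)&base.lock.lock);
    return 0;
}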
@@ -187 +188 @@
 								\
 	stackptr = _stack;					\
 	*stackptr++ = &_base->root;				\
-	for (u = rcu_dereference_protected(_base->root,		\
-			lockdep_is_held(&_base->lock));		\
+	for (u = rcu_deref_locked(_base->root, _base);		\
 	     u != peer_avl_empty; ) {				\
 		int cmp = addr_compare(_daddr, &u->daddr);	\
 		if (cmp == 0)					\
@@ -198 +198 @@
 		else						\
 			v = &u->avl_right;			\
 		*stackptr++ = v;				\
-		u = rcu_dereference_protected(*v,		\
-			lockdep_is_held(&_base->lock));		\
+		u = rcu_deref_locked(*v, _base);		\
 	}							\
 	u;							\
 })
 
+static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv)
+{
+	int cur, old = atomic_read(ptr);
+
+	while (old != u) {
+		*newv = old + a;
+		cur = atomic_cmpxchg(ptr, old, *newv);
+		if (cur == old)
+			return true;
+		old = cur;
+	}
+	return false;
+}
+
 /*
- * Called with rcu_read_lock_bh()
+ * Called with rcu_read_lock()
  * Because we hold no lock against a writer, its quite possible we fall
  * in an endless loop.
  * But every pointer we follow is guaranteed to be valid thanks to RCU.
  * We exit from this function if number of links exceeds PEER_MAXDEPTH
  */
-static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
-				       struct inet_peer_base *base)
+static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
+				    struct inet_peer_base *base,
+				    int *newrefcnt)
 {
-	struct inet_peer *u = rcu_dereference_bh(base->root);
+	struct inet_peer *u = rcu_dereference(base->root);
 	int count = 0;
 
 	while (u != peer_avl_empty) {
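atomic_add_unless_return() above is the familiar compare-and-swap loop of atomic_add_unless(), extended to report the post-add value through *newv; the caller later uses newrefcnt == 1 to decide whether the entry still has to be pulled off the unused list. The same loop in userspace C11 atomics (a sketch, not the kernel helper):

/* Userspace sketch of "add unless the value equals u, and report the new value". */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool add_unless_return(atomic_int *ptr, int a, int u, int *newv)
{
    int old = atomic_load(ptr);

    while (old != u) {
        *newv = old + a;
        /* On failure, compare_exchange refreshes 'old' with the current value,
         * so the loop retries with an up-to-date snapshot. */
        if (atomic_compare_exchange_weak(ptr, &old, *newv))
            return true;
    }
    return false;
}

int main(void)
{
    atomic_int refcnt = 0;
    int newv = 0;

    if (add_unless_return(&refcnt, 1, -1, &newv))
        printf("took a reference, refcnt is now %d\n", newv);   /* prints 1 */

    atomic_store(&refcnt, -1);   /* -1 marks an entry being deleted */
    if (!add_unless_return(&refcnt, 1, -1, &newv))
        puts("entry already deleted, no reference taken");
    return 0;
}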
@@ -225 +239 @@
 			 * distinction between an unused entry (refcnt=0) and
 			 * a freed one.
 			 */
-			if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1)))
+			if (!atomic_add_unless_return(&u->refcnt, 1, -1, newrefcnt))
 				u = NULL;
 			return u;
 		}
 		if (cmp == -1)
-			u = rcu_dereference_bh(u->avl_left);
+			u = rcu_dereference(u->avl_left);
 		else
-			u = rcu_dereference_bh(u->avl_right);
+			u = rcu_dereference(u->avl_right);
 		if (unlikely(++count == PEER_MAXDEPTH))
 			break;
 	}
@@ -246 +260 @@
 	struct inet_peer __rcu **v;				\
 	*stackptr++ = &start->avl_left;				\
 	v = &start->avl_left;					\
-	for (u = rcu_dereference_protected(*v,			\
-			lockdep_is_held(&base->lock));		\
+	for (u = rcu_deref_locked(*v, base);			\
 	     u->avl_right != peer_avl_empty_rcu; ) {		\
 		v = &u->avl_right;				\
 		*stackptr++ = v;				\
-		u = rcu_dereference_protected(*v,		\
-			lockdep_is_held(&base->lock));		\
+		u = rcu_deref_locked(*v, base);			\
 	}							\
 	u;							\
 })
@@ -271 +283 @@
 
 	while (stackend > stack) {
 		nodep = *--stackend;
-		node = rcu_dereference_protected(*nodep,
-				lockdep_is_held(&base->lock));
-		l = rcu_dereference_protected(node->avl_left,
-				lockdep_is_held(&base->lock));
-		r = rcu_dereference_protected(node->avl_right,
-				lockdep_is_held(&base->lock));
+		node = rcu_deref_locked(*nodep, base);
+		l = rcu_deref_locked(node->avl_left, base);
+		r = rcu_deref_locked(node->avl_right, base);
 		lh = node_height(l);
 		rh = node_height(r);
 		if (lh > rh + 1) { /* l: RH+2 */
 			struct inet_peer *ll, *lr, *lrl, *lrr;
 			int lrh;
-			ll = rcu_dereference_protected(l->avl_left,
-				lockdep_is_held(&base->lock));
-			lr = rcu_dereference_protected(l->avl_right,
-				lockdep_is_held(&base->lock));
+			ll = rcu_deref_locked(l->avl_left, base);
+			lr = rcu_deref_locked(l->avl_right, base);
 			lrh = node_height(lr);
 			if (lrh <= node_height(ll)) {	/* ll: RH+1 */
 				RCU_INIT_POINTER(node->avl_left, lr);	/* lr: RH or RH+1 */
@@ -296 +303 @@
 				l->avl_height = node->avl_height + 1;
 				RCU_INIT_POINTER(*nodep, l);
 			} else { /* ll: RH, lr: RH+1 */
-				lrl = rcu_dereference_protected(lr->avl_left,
-					lockdep_is_held(&base->lock));	/* lrl: RH or RH-1 */
-				lrr = rcu_dereference_protected(lr->avl_right,
-					lockdep_is_held(&base->lock));	/* lrr: RH or RH-1 */
+				lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
+				lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
 				RCU_INIT_POINTER(node->avl_left, lrr);	/* lrr: RH or RH-1 */
 				RCU_INIT_POINTER(node->avl_right, r);	/* r: RH */
 				node->avl_height = rh + 1; /* node: RH+1 */
@@ -314 +319 @@
 		} else if (rh > lh + 1) { /* r: LH+2 */
 			struct inet_peer *rr, *rl, *rlr, *rll;
 			int rlh;
-			rr = rcu_dereference_protected(r->avl_right,
-				lockdep_is_held(&base->lock));
-			rl = rcu_dereference_protected(r->avl_left,
-				lockdep_is_held(&base->lock));
+			rr = rcu_deref_locked(r->avl_right, base);
+			rl = rcu_deref_locked(r->avl_left, base);
 			rlh = node_height(rl);
 			if (rlh <= node_height(rr)) {	/* rr: LH+1 */
 				RCU_INIT_POINTER(node->avl_right, rl);	/* rl: LH or LH+1 */
@@ -328 +331 @@
 				r->avl_height = node->avl_height + 1;
 				RCU_INIT_POINTER(*nodep, r);
 			} else { /* rr: RH, rl: RH+1 */
-				rlr = rcu_dereference_protected(rl->avl_right,
-					lockdep_is_held(&base->lock));	/* rlr: LH or LH-1 */
-				rll = rcu_dereference_protected(rl->avl_left,
-					lockdep_is_held(&base->lock));	/* rll: LH or LH-1 */
+				rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
+				rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
 				RCU_INIT_POINTER(node->avl_right, rll);	/* rll: LH or LH-1 */
 				RCU_INIT_POINTER(node->avl_left, l);	/* l: LH */
 				node->avl_height = lh + 1; /* node: LH+1 */
@@ -366 +367 @@
 }
 
 /* May be called with local BH enabled. */
-static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
+static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
+			     struct inet_peer __rcu **stack[PEER_MAXDEPTH])
 {
 	int do_free;
 
 	do_free = 0;
 
-	spin_lock_bh(&base->lock);
+	write_seqlock_bh(&base->lock);
 	/* Check the reference counter.  It was artificially incremented by 1
 	 * in cleanup() function to prevent sudden disappearing.  If we can
 	 * atomically (because of lockless readers) take this last reference,
@@ -380 +382 @@
 	 * We use refcnt=-1 to alert lockless readers this entry is deleted.
 	 */
 	if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
-		struct inet_peer __rcu **stack[PEER_MAXDEPTH];
 		struct inet_peer __rcu ***stackptr, ***delp;
 		if (lookup(&p->daddr, stack, base) != p)
 			BUG();
@@ -392 +393 @@
 			/* look for a node to insert instead of p */
 			struct inet_peer *t;
 			t = lookup_rightempty(p, base);
-			BUG_ON(rcu_dereference_protected(*stackptr[-1],
-					lockdep_is_held(&base->lock)) != t);
+			BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
 			**--stackptr = t->avl_left;
 			/* t is removed, t->daddr > x->daddr for any
 			 * x in p->avl_left subtree.
@@ -409 +409 @@
 		base->total--;
 		do_free = 1;
 	}
-	spin_unlock_bh(&base->lock);
+	write_sequnlock_bh(&base->lock);
 
 	if (do_free)
-		call_rcu_bh(&p->rcu, inetpeer_free_rcu);
+		call_rcu(&p->rcu, inetpeer_free_rcu);
 	else
 		/* The node is used again.  Decrease the reference counter
 		 * back.  The loop "cleanup -> unlink_from_unused
@@ -435 +435 @@
 }
 
 /* May be called with local BH enabled. */
-static int cleanup_once(unsigned long ttl)
+static int cleanup_once(unsigned long ttl, struct inet_peer __rcu **stack[PEER_MAXDEPTH])
 {
 	struct inet_peer *p = NULL;
 
@@ -467 +467 @@
 		 * happen because of entry limits in route cache. */
 		return -1;
 
-	unlink_from_pool(p, peer_to_base(p));
+	unlink_from_pool(p, peer_to_base(p), stack);
 	return 0;
 }
 
@@ -477 +477 @@
 	struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
 	struct inet_peer_base *base = family_to_base(daddr->family);
 	struct inet_peer *p;
+	unsigned int sequence;
+	int invalidated, newrefcnt = 0;
 
 	/* Look up for the address quickly, lockless.
 	 * Because of a concurrent writer, we might not find an existing entry.
 	 */
-	rcu_read_lock_bh();
-	p = lookup_rcu_bh(daddr, base);
-	rcu_read_unlock_bh();
+	rcu_read_lock();
+	sequence = read_seqbegin(&base->lock);
+	p = lookup_rcu(daddr, base, &newrefcnt);
+	invalidated = read_seqretry(&base->lock, sequence);
+	rcu_read_unlock();
 
 	if (p) {
-		/* The existing node has been found.
+found:		/* The existing node has been found.
 		 * Remove the entry from unused list if it was there.
 		 */
-		unlink_from_unused(p);
+		if (newrefcnt == 1)
+			unlink_from_unused(p);
 		return p;
 	}
 
+	/* If no writer did a change during our lookup, we can return early. */
+	if (!create && !invalidated)
+		return NULL;
+
 	/* retry an exact lookup, taking the lock before.
 	 * At least, nodes should be hot in our cache.
 	 */
-	spin_lock_bh(&base->lock);
+	write_seqlock_bh(&base->lock);
 	p = lookup(daddr, stack, base);
 	if (p != peer_avl_empty) {
-		atomic_inc(&p->refcnt);
-		spin_unlock_bh(&base->lock);
-		/* Remove the entry from unused list if it was there. */
-		unlink_from_unused(p);
-		return p;
+		newrefcnt = atomic_inc_return(&p->refcnt);
+		write_sequnlock_bh(&base->lock);
+		goto found;
 	}
 	p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
 	if (p) {
 		p->daddr = *daddr;
 		atomic_set(&p->refcnt, 1);
 		atomic_set(&p->rid, 0);
-		atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4));
+		atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4));
 		p->tcp_ts_stamp = 0;
+		p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
+		p->rate_tokens = 0;
+		p->rate_last = 0;
+		p->pmtu_expires = 0;
+		p->pmtu_orig = 0;
+		memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
 		INIT_LIST_HEAD(&p->unused);
 
 
@@ -519 +532 @@
 		link_to_pool(p, base);
 		base->total++;
 	}
-	spin_unlock_bh(&base->lock);
+	write_sequnlock_bh(&base->lock);
 
 	if (base->total >= inet_peer_threshold)
 		/* Remove one less-recently-used entry. */
-		cleanup_once(0);
+		cleanup_once(0, stack);
 
 	return p;
 }
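Taken together, inet_getpeer() now attempts a lockless RCU lookup first and validates it against the base seqlock; only when that lookup was invalidated by a concurrent writer, or a new entry must be created, does it fall back to the write-locked path. A compact control-flow sketch with stand-in helpers (all fake_* names are hypothetical; only the decision logic mirrors the diff):

/* Control-flow sketch of the "lockless first, locked fallback" lookup (stand-in code). */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct peer { int key; };

/* Stand-ins for the real primitives and tree lookups. */
static unsigned int fake_read_seqbegin(void)               { return 0; }
static bool fake_read_seqretry(unsigned int s)             { (void)s; return false; }
static struct peer *fake_lookup_rcu(int key, int *newref)  { (void)key; *newref = 0; return NULL; }
static struct peer *fake_lookup_locked(int key)            { (void)key; return NULL; }
static struct peer *fake_create(int key)                   { static struct peer p; p.key = key; return &p; }

static struct peer *getpeer(int key, bool create)
{
    unsigned int seq;
    int newrefcnt = 0;
    bool invalidated;
    struct peer *p;

    /* 1. Lockless attempt under RCU, validated by the seqlock sequence. */
    seq = fake_read_seqbegin();
    p = fake_lookup_rcu(key, &newrefcnt);
    invalidated = fake_read_seqretry(seq);
    if (p)
        return p;                  /* found without taking any lock */

    /* 2. If no writer interfered and we are not asked to create, stop here. */
    if (!create && !invalidated)
        return NULL;

    /* 3. Otherwise retry under the write side of the seqlock, creating if needed. */
    p = fake_lookup_locked(key);
    if (!p && create)
        p = fake_create(key);
    return p;
}

int main(void)
{
    struct peer *p = getpeer(42, true);
    printf("got peer with key %d\n", p ? p->key : -1);
    return 0;
}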
@@ -539 +552 @@
 {
 	unsigned long now = jiffies;
 	int ttl, total;
+	struct inet_peer __rcu **stack[PEER_MAXDEPTH];
 
 	total = compute_total();
 	if (total >= inet_peer_threshold)
@@ -547 +561 @@
 		ttl = inet_peer_maxttl
 				- (inet_peer_maxttl - inet_peer_minttl) / HZ *
 					total / inet_peer_threshold * HZ;
-	while (!cleanup_once(ttl)) {
+	while (!cleanup_once(ttl, stack)) {
 		if (jiffies != now)
 			break;
 	}
@@ -579 +593 @@
 	local_bh_enable();
 }
 EXPORT_SYMBOL_GPL(inet_putpeer);
+
+/*
+ *	Check transmit rate limitation for given message.
+ *	The rate information is held in the inet_peer entries now.
+ *	This function is generic and could be used for other purposes
+ *	too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
+ *
+ *	Note that the same inet_peer fields are modified by functions in
+ *	route.c too, but these work for packet destinations while xrlim_allow
+ *	works for icmp destinations. This means the rate limiting information
+ *	for one "ip object" is shared - and these ICMPs are twice limited:
+ *	by source and by destination.
+ *
+ *	RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
+ *	                  SHOULD allow setting of rate limits
+ *
+ *	Shared between ICMPv4 and ICMPv6.
+ */
+#define XRLIM_BURST_FACTOR 6
+bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
+{
+	unsigned long now, token;
+	bool rc = false;
+
+	if (!peer)
+		return true;
+
+	token = peer->rate_tokens;
+	now = jiffies;
+	token += now - peer->rate_last;
+	peer->rate_last = now;
+	if (token > XRLIM_BURST_FACTOR * timeout)
+		token = XRLIM_BURST_FACTOR * timeout;
+	if (token >= timeout) {
+		token -= timeout;
+		rc = true;
+	}
+	peer->rate_tokens = token;
+	return rc;
+}
+EXPORT_SYMBOL(inet_peer_xrlim_allow);
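The newly added inet_peer_xrlim_allow() is a plain token bucket: credit accumulates with elapsed time, is capped at XRLIM_BURST_FACTOR * timeout, and each allowed transmission spends one timeout's worth of credit. The same arithmetic in a standalone userspace form, with a fake clock in place of jiffies (illustrative only):

/* Userspace sketch of the token bucket used for ICMP rate limiting (fake clock). */
#include <stdbool.h>
#include <stdio.h>

#define XRLIM_BURST_FACTOR 6

struct bucket {
    unsigned long rate_tokens;
    unsigned long rate_last;
};

static bool xrlim_allow(struct bucket *b, unsigned long now, unsigned long timeout)
{
    unsigned long token = b->rate_tokens;
    bool rc = false;

    token += now - b->rate_last;               /* tokens accrue with elapsed time */
    b->rate_last = now;
    if (token > XRLIM_BURST_FACTOR * timeout)  /* cap the burst */
        token = XRLIM_BURST_FACTOR * timeout;
    if (token >= timeout) {                    /* enough budget: spend one timeout */
        token -= timeout;
        rc = true;
    }
    b->rate_tokens = token;
    return rc;
}

int main(void)
{
    struct bucket b = { 0, 0 };
    unsigned long now;

    /* With timeout = 100 "ticks", back-to-back sends get throttled, spaced ones pass. */
    for (now = 0; now <= 500; now += 50)
        printf("t=%3lu -> %s\n", now, xrlim_allow(&b, now, 100) ? "send" : "drop");
    return 0;
}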