~ubuntu-branches/debian/wheezy/linux-2.6/wheezy

Viewing changes to kernel/trace/ring_buffer.c

  • Committer: Bazaar Package Importer
  • Author(s): Ben Hutchings, Aurelien Jarno, Martin Michlmayr
  • Date: 2011-04-06 13:53:30 UTC
  • mfrom: (43.1.5 sid)
  • Revision ID: james.westby@ubuntu.com-20110406135330-wjufxhd0tvn3zx4z
Tags: 2.6.38-3
[ Ben Hutchings ]
* [ppc64] Add to linux-tools package architectures (Closes: #620124)
* [amd64] Save cr4 to mmu_cr4_features at boot time (Closes: #620284)
* appletalk: Fix bugs introduced when removing use of BKL
* ALSA: Fix yet another race in disconnection
* cciss: Fix lost command issue
* ath9k: Fix kernel panic in AR2427
* ses: Avoid kernel panic when lun 0 is not mapped
* PCI/ACPI: Report ASPM support to BIOS if not disabled from command line

[ Aurelien Jarno ]
* rtlwifi: fix build when PCI is not enabled.

[ Martin Michlmayr ]
* rtlwifi: Eliminate udelay calls with too large values (Closes: #620204)

14
14
#include <linux/module.h>
15
15
#include <linux/percpu.h>
16
16
#include <linux/mutex.h>
 
17
#include <linux/slab.h>
17
18
#include <linux/init.h>
18
19
#include <linux/hash.h>
19
20
#include <linux/list.h>
20
21
#include <linux/cpu.h>
21
22
#include <linux/fs.h>
22
23
 
 
24
#include <asm/local.h>
23
25
#include "trace.h"
24
26
 
25
27
/*
206
208
#define RB_MAX_SMALL_DATA       (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
207
209
#define RB_EVNT_MIN_SIZE        8U      /* two 32bit words */
208
210
 
 
211
#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
 
212
# define RB_FORCE_8BYTE_ALIGNMENT       0
 
213
# define RB_ARCH_ALIGNMENT              RB_ALIGNMENT
 
214
#else
 
215
# define RB_FORCE_8BYTE_ALIGNMENT       1
 
216
# define RB_ARCH_ALIGNMENT              8U
 
217
#endif
 
218
 
209
219
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
210
220
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
211
221
 
214
224
        RB_LEN_TIME_STAMP = 16,
215
225
};
216
226
 
 
227
#define skip_time_extend(event) \
 
228
        ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
 
229
 
217
230
static inline int rb_null_event(struct ring_buffer_event *event)
218
231
{
219
232
        return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
238
251
        return length + RB_EVNT_HDR_SIZE;
239
252
}
240
253
 
241
 
/* inline for ring buffer fast paths */
242
 
static unsigned
 
254
/*
 
255
 * Return the length of the given event. Will return
 
256
 * the length of the time extend if the event is a
 
257
 * time extend.
 
258
 */
 
259
static inline unsigned
243
260
rb_event_length(struct ring_buffer_event *event)
244
261
{
245
262
        switch (event->type_len) {
264
281
        return 0;
265
282
}
266
283
 
 
284
/*
 
285
 * Return total length of time extend and data,
 
286
 *   or just the event length for all other events.
 
287
 */
 
288
static inline unsigned
 
289
rb_event_ts_length(struct ring_buffer_event *event)
 
290
{
 
291
        unsigned len = 0;
 
292
 
 
293
        if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
 
294
                /* time extends include the data event after it */
 
295
                len = RB_LEN_TIME_EXTEND;
 
296
                event = skip_time_extend(event);
 
297
        }
 
298
        return len + rb_event_length(event);
 
299
}
 
300
 
267
301
/**
268
302
 * ring_buffer_event_length - return the length of the event
269
303
 * @event: the event to get the length of
 
304
 *
 
305
 * Returns the size of the data load of a data event.
 
306
 * If the event is something other than a data event, it
 
307
 * returns the size of the event itself. With the exception
 
308
 * of a TIME EXTEND, where it still returns the size of the
 
309
 * data load of the data event after it.
270
310
 */
271
311
unsigned ring_buffer_event_length(struct ring_buffer_event *event)
272
312
{
273
 
        unsigned length = rb_event_length(event);
 
313
        unsigned length;
 
314
 
 
315
        if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
 
316
                event = skip_time_extend(event);
 
317
 
 
318
        length = rb_event_length(event);
274
319
        if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
275
320
                return length;
276
321
        length -= RB_EVNT_HDR_SIZE;
284
329
static void *
285
330
rb_event_data(struct ring_buffer_event *event)
286
331
{
 
332
        if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
 
333
                event = skip_time_extend(event);
287
334
        BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
288
335
        /* If length is in len field, then array[0] has the data */
289
336
        if (event->type_len)
309
356
#define TS_MASK         ((1ULL << TS_SHIFT) - 1)
310
357
#define TS_DELTA_TEST   (~TS_MASK)
311
358
 
 
359
/* Flag when events were overwritten */
 
360
#define RB_MISSED_EVENTS        (1 << 31)
 
361
/* Missed count stored at end */
 
362
#define RB_MISSED_STORED        (1 << 30)
 
363
 
312
364
struct buffer_data_page {
313
365
        u64              time_stamp;    /* page time stamp */
314
366
        local_t          commit;        /* write committed index */
328
380
        local_t          write;         /* index for next write */
329
381
        unsigned         read;          /* index for next read */
330
382
        local_t          entries;       /* entries on this page */
 
383
        unsigned long    real_end;      /* real end of data */
331
384
        struct buffer_data_page *page;  /* Actual data page */
332
385
};
333
386
 
388
441
/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
389
442
#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
390
443
 
391
 
/* Max number of timestamps that can fit on a page */
392
 
#define RB_TIMESTAMPS_PER_PAGE  (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
393
 
 
394
444
int ring_buffer_print_page_header(struct trace_seq *s)
395
445
{
396
446
        struct buffer_data_page field;
397
447
        int ret;
398
448
 
399
449
        ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
400
 
                               "offset:0;\tsize:%u;\n",
401
 
                               (unsigned int)sizeof(field.time_stamp));
 
450
                               "offset:0;\tsize:%u;\tsigned:%u;\n",
 
451
                               (unsigned int)sizeof(field.time_stamp),
 
452
                               (unsigned int)is_signed_type(u64));
402
453
 
403
454
        ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
404
 
                               "offset:%u;\tsize:%u;\n",
405
 
                               (unsigned int)offsetof(typeof(field), commit),
406
 
                               (unsigned int)sizeof(field.commit));
 
455
                               "offset:%u;\tsize:%u;\tsigned:%u;\n",
 
456
                               (unsigned int)offsetof(typeof(field), commit),
 
457
                               (unsigned int)sizeof(field.commit),
 
458
                               (unsigned int)is_signed_type(long));
 
459
 
 
460
        ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
 
461
                               "offset:%u;\tsize:%u;\tsigned:%u;\n",
 
462
                               (unsigned int)offsetof(typeof(field), commit),
 
463
                               1,
 
464
                               (unsigned int)is_signed_type(long));
407
465
 
408
466
        ret = trace_seq_printf(s, "\tfield: char data;\t"
409
 
                               "offset:%u;\tsize:%u;\n",
 
467
                               "offset:%u;\tsize:%u;\tsigned:%u;\n",
410
468
                               (unsigned int)offsetof(typeof(field), data),
411
 
                               (unsigned int)BUF_PAGE_SIZE);
 
469
                               (unsigned int)BUF_PAGE_SIZE,
 
470
                               (unsigned int)is_signed_type(char));
412
471
 
413
472
        return ret;
414
473
}
418
477
 */
419
478
struct ring_buffer_per_cpu {
420
479
        int                             cpu;
 
480
        atomic_t                        record_disabled;
421
481
        struct ring_buffer              *buffer;
422
482
        spinlock_t                      reader_lock;    /* serialize readers */
423
 
        raw_spinlock_t                  lock;
 
483
        arch_spinlock_t                 lock;
424
484
        struct lock_class_key           lock_key;
425
485
        struct list_head                *pages;
426
486
        struct buffer_page              *head_page;     /* read from head */
427
487
        struct buffer_page              *tail_page;     /* write to tail */
428
488
        struct buffer_page              *commit_page;   /* committed pages */
429
489
        struct buffer_page              *reader_page;
 
490
        unsigned long                   lost_events;
 
491
        unsigned long                   last_overrun;
430
492
        local_t                         commit_overrun;
431
493
        local_t                         overrun;
432
494
        local_t                         entries;
435
497
        unsigned long                   read;
436
498
        u64                             write_stamp;
437
499
        u64                             read_stamp;
438
 
        atomic_t                        record_disabled;
439
500
};
440
501
 
441
502
struct ring_buffer {
461
522
        struct ring_buffer_per_cpu      *cpu_buffer;
462
523
        unsigned long                   head;
463
524
        struct buffer_page              *head_page;
 
525
        struct buffer_page              *cache_reader_page;
 
526
        unsigned long                   cache_read;
464
527
        u64                             read_stamp;
465
528
};
466
529
 
995
1058
        cpu_buffer->buffer = buffer;
996
1059
        spin_lock_init(&cpu_buffer->reader_lock);
997
1060
        lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
998
 
        cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 
1061
        cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
999
1062
 
1000
1063
        bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1001
1064
                            GFP_KERNEL, cpu_to_node(cpu));
1190
1253
        struct list_head *p;
1191
1254
        unsigned i;
1192
1255
 
1193
 
        atomic_inc(&cpu_buffer->record_disabled);
1194
 
        synchronize_sched();
1195
 
 
1196
1256
        spin_lock_irq(&cpu_buffer->reader_lock);
1197
1257
        rb_head_page_deactivate(cpu_buffer);
1198
1258
 
1199
1259
        for (i = 0; i < nr_pages; i++) {
1200
1260
                if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1201
 
                        return;
 
1261
                        goto out;
1202
1262
                p = cpu_buffer->pages->next;
1203
1263
                bpage = list_entry(p, struct buffer_page, list);
1204
1264
                list_del_init(&bpage->list);
1205
1265
                free_buffer_page(bpage);
1206
1266
        }
1207
1267
        if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1208
 
                return;
 
1268
                goto out;
1209
1269
 
1210
1270
        rb_reset_cpu(cpu_buffer);
 
1271
        rb_check_pages(cpu_buffer);
 
1272
 
 
1273
out:
1211
1274
        spin_unlock_irq(&cpu_buffer->reader_lock);
1212
 
 
1213
 
        rb_check_pages(cpu_buffer);
1214
 
 
1215
 
        atomic_dec(&cpu_buffer->record_disabled);
1216
 
 
1217
1275
}
1218
1276
 
1219
1277
static void
1224
1282
        struct list_head *p;
1225
1283
        unsigned i;
1226
1284
 
1227
 
        atomic_inc(&cpu_buffer->record_disabled);
1228
 
        synchronize_sched();
1229
 
 
1230
1285
        spin_lock_irq(&cpu_buffer->reader_lock);
1231
1286
        rb_head_page_deactivate(cpu_buffer);
1232
1287
 
1233
1288
        for (i = 0; i < nr_pages; i++) {
1234
1289
                if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
1235
 
                        return;
 
1290
                        goto out;
1236
1291
                p = pages->next;
1237
1292
                bpage = list_entry(p, struct buffer_page, list);
1238
1293
                list_del_init(&bpage->list);
1239
1294
                list_add_tail(&bpage->list, cpu_buffer->pages);
1240
1295
        }
1241
1296
        rb_reset_cpu(cpu_buffer);
 
1297
        rb_check_pages(cpu_buffer);
 
1298
 
 
1299
out:
1242
1300
        spin_unlock_irq(&cpu_buffer->reader_lock);
1243
 
 
1244
 
        rb_check_pages(cpu_buffer);
1245
 
 
1246
 
        atomic_dec(&cpu_buffer->record_disabled);
1247
1301
}
1248
1302
 
1249
1303
/**
1251
1305
 * @buffer: the buffer to resize.
1252
1306
 * @size: the new size.
1253
1307
 *
1254
 
 * The tracer is responsible for making sure that the buffer is
1255
 
 * not being used while changing the size.
1256
 
 * Note: We may be able to change the above requirement by using
1257
 
 *  RCU synchronizations.
1258
 
 *
1259
1308
 * Minimum size is 2 * BUF_PAGE_SIZE.
1260
1309
 *
1261
1310
 * Returns -1 on failure.
1287
1336
        if (size == buffer_size)
1288
1337
                return size;
1289
1338
 
 
1339
        atomic_inc(&buffer->record_disabled);
 
1340
 
 
1341
        /* Make sure all writers are done with this buffer. */
 
1342
        synchronize_sched();
 
1343
 
1290
1344
        mutex_lock(&buffer->mutex);
1291
1345
        get_online_cpus();
1292
1346
 
1349
1403
        put_online_cpus();
1350
1404
        mutex_unlock(&buffer->mutex);
1351
1405
 
 
1406
        atomic_dec(&buffer->record_disabled);
 
1407
 
1352
1408
        return size;
1353
1409
 
1354
1410
 free_pages:
1358
1414
        }
1359
1415
        put_online_cpus();
1360
1416
        mutex_unlock(&buffer->mutex);
 
1417
        atomic_dec(&buffer->record_disabled);
1361
1418
        return -ENOMEM;
1362
1419
 
1363
1420
        /*
1367
1424
 out_fail:
1368
1425
        put_online_cpus();
1369
1426
        mutex_unlock(&buffer->mutex);
 
1427
        atomic_dec(&buffer->record_disabled);
1370
1428
        return -1;
1371
1429
}
1372
1430
EXPORT_SYMBOL_GPL(ring_buffer_resize);
1522
1580
        iter->head = 0;
1523
1581
}
1524
1582
 
 
1583
/* Slow path, do not inline */
 
1584
static noinline struct ring_buffer_event *
 
1585
rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
 
1586
{
 
1587
        event->type_len = RINGBUF_TYPE_TIME_EXTEND;
 
1588
 
 
1589
        /* Not the first event on the page? */
 
1590
        if (rb_event_index(event)) {
 
1591
                event->time_delta = delta & TS_MASK;
 
1592
                event->array[0] = delta >> TS_SHIFT;
 
1593
        } else {
 
1594
                /* nope, just zero it */
 
1595
                event->time_delta = 0;
 
1596
                event->array[0] = 0;
 
1597
        }
 
1598
 
 
1599
        return skip_time_extend(event);
 
1600
}
 
1601
 
1525
1602
/**
1526
1603
 * ring_buffer_update_event - update event type and data
1527
1604
 * @event: the even to update
1534
1611
 * data field.
1535
1612
 */
1536
1613
static void
1537
 
rb_update_event(struct ring_buffer_event *event,
1538
 
                         unsigned type, unsigned length)
 
1614
rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
 
1615
                struct ring_buffer_event *event, unsigned length,
 
1616
                int add_timestamp, u64 delta)
1539
1617
{
1540
 
        event->type_len = type;
1541
 
 
1542
 
        switch (type) {
1543
 
 
1544
 
        case RINGBUF_TYPE_PADDING:
1545
 
        case RINGBUF_TYPE_TIME_EXTEND:
1546
 
        case RINGBUF_TYPE_TIME_STAMP:
1547
 
                break;
1548
 
 
1549
 
        case 0:
1550
 
                length -= RB_EVNT_HDR_SIZE;
1551
 
                if (length > RB_MAX_SMALL_DATA)
1552
 
                        event->array[0] = length;
1553
 
                else
1554
 
                        event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
1555
 
                break;
1556
 
        default:
1557
 
                BUG();
 
1618
        /* Only a commit updates the timestamp */
 
1619
        if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
 
1620
                delta = 0;
 
1621
 
 
1622
        /*
 
1623
         * If we need to add a timestamp, then we
 
1624
         * add it to the start of the resevered space.
 
1625
         */
 
1626
        if (unlikely(add_timestamp)) {
 
1627
                event = rb_add_time_stamp(event, delta);
 
1628
                length -= RB_LEN_TIME_EXTEND;
 
1629
                delta = 0;
1558
1630
        }
 
1631
 
 
1632
        event->time_delta = delta;
 
1633
        length -= RB_EVNT_HDR_SIZE;
 
1634
        if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
 
1635
                event->type_len = 0;
 
1636
                event->array[0] = length;
 
1637
        } else
 
1638
                event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
1559
1639
}
1560
1640
 
1561
1641
/*
1723
1803
        if (!length)
1724
1804
                length = 1;
1725
1805
 
1726
 
        if (length > RB_MAX_SMALL_DATA)
 
1806
        if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
1727
1807
                length += sizeof(event.array[0]);
1728
1808
 
1729
1809
        length += RB_EVNT_HDR_SIZE;
1730
 
        length = ALIGN(length, RB_ALIGNMENT);
 
1810
        length = ALIGN(length, RB_ARCH_ALIGNMENT);
1731
1811
 
1732
1812
        return length;
1733
1813
}
1744
1824
         * must fill the old tail_page with padding.
1745
1825
         */
1746
1826
        if (tail >= BUF_PAGE_SIZE) {
 
1827
                /*
 
1828
                 * If the page was filled, then we still need
 
1829
                 * to update the real_end. Reset it to zero
 
1830
                 * and the reader will ignore it.
 
1831
                 */
 
1832
                if (tail == BUF_PAGE_SIZE)
 
1833
                        tail_page->real_end = 0;
 
1834
 
1747
1835
                local_sub(length, &tail_page->write);
1748
1836
                return;
1749
1837
        }
1752
1840
        kmemcheck_annotate_bitfield(event, bitfield);
1753
1841
 
1754
1842
        /*
 
1843
         * Save the original length to the meta data.
 
1844
         * This will be used by the reader to add lost event
 
1845
         * counter.
 
1846
         */
 
1847
        tail_page->real_end = tail;
 
1848
 
 
1849
        /*
1755
1850
         * If this event is bigger than the minimum size, then
1756
1851
         * we need to be careful that we don't subtract the
1757
1852
         * write counter enough to allow another writer to slip
1784
1879
        local_sub(length, &tail_page->write);
1785
1880
}
1786
1881
 
1787
 
static struct ring_buffer_event *
 
1882
/*
 
1883
 * This is the slow path, force gcc not to inline it.
 
1884
 */
 
1885
static noinline struct ring_buffer_event *
1788
1886
rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1789
1887
             unsigned long length, unsigned long tail,
1790
 
             struct buffer_page *commit_page,
1791
 
             struct buffer_page *tail_page, u64 *ts)
 
1888
             struct buffer_page *tail_page, u64 ts)
1792
1889
{
 
1890
        struct buffer_page *commit_page = cpu_buffer->commit_page;
1793
1891
        struct ring_buffer *buffer = cpu_buffer->buffer;
1794
1892
        struct buffer_page *next_page;
1795
1893
        int ret;
1870
1968
                 * Nested commits always have zero deltas, so
1871
1969
                 * just reread the time stamp
1872
1970
                 */
1873
 
                *ts = rb_time_stamp(buffer);
1874
 
                next_page->page->time_stamp = *ts;
 
1971
                ts = rb_time_stamp(buffer);
 
1972
                next_page->page->time_stamp = ts;
1875
1973
        }
1876
1974
 
1877
1975
 out_again:
1890
1988
 
1891
1989
static struct ring_buffer_event *
1892
1990
__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1893
 
                  unsigned type, unsigned long length, u64 *ts)
 
1991
                  unsigned long length, u64 ts,
 
1992
                  u64 delta, int add_timestamp)
1894
1993
{
1895
 
        struct buffer_page *tail_page, *commit_page;
 
1994
        struct buffer_page *tail_page;
1896
1995
        struct ring_buffer_event *event;
1897
1996
        unsigned long tail, write;
1898
1997
 
1899
 
        commit_page = cpu_buffer->commit_page;
1900
 
        /* we just need to protect against interrupts */
1901
 
        barrier();
 
1998
        /*
 
1999
         * If the time delta since the last event is too big to
 
2000
         * hold in the time field of the event, then we append a
 
2001
         * TIME EXTEND event ahead of the data event.
 
2002
         */
 
2003
        if (unlikely(add_timestamp))
 
2004
                length += RB_LEN_TIME_EXTEND;
 
2005
 
1902
2006
        tail_page = cpu_buffer->tail_page;
1903
2007
        write = local_add_return(length, &tail_page->write);
1904
2008
 
1907
2011
        tail = write - length;
1908
2012
 
1909
2013
        /* See if we shot pass the end of this buffer page */
1910
 
        if (write > BUF_PAGE_SIZE)
 
2014
        if (unlikely(write > BUF_PAGE_SIZE))
1911
2015
                return rb_move_tail(cpu_buffer, length, tail,
1912
 
                                    commit_page, tail_page, ts);
 
2016
                                    tail_page, ts);
1913
2017
 
1914
2018
        /* We reserved something on the buffer */
1915
2019
 
1916
2020
        event = __rb_page_index(tail_page, tail);
1917
2021
        kmemcheck_annotate_bitfield(event, bitfield);
1918
 
        rb_update_event(event, type, length);
 
2022
        rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
1919
2023
 
1920
 
        /* The passed in type is zero for DATA */
1921
 
        if (likely(!type))
1922
 
                local_inc(&tail_page->entries);
 
2024
        local_inc(&tail_page->entries);
1923
2025
 
1924
2026
        /*
1925
2027
         * If this is the first commit on the page, then update
1926
2028
         * its timestamp.
1927
2029
         */
1928
2030
        if (!tail)
1929
 
                tail_page->page->time_stamp = *ts;
 
2031
                tail_page->page->time_stamp = ts;
1930
2032
 
1931
2033
        return event;
1932
2034
}
1941
2043
        unsigned long addr;
1942
2044
 
1943
2045
        new_index = rb_event_index(event);
1944
 
        old_index = new_index + rb_event_length(event);
 
2046
        old_index = new_index + rb_event_ts_length(event);
1945
2047
        addr = (unsigned long)event;
1946
2048
        addr &= PAGE_MASK;
1947
2049
 
1967
2069
        return 0;
1968
2070
}
1969
2071
 
1970
 
static int
1971
 
rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1972
 
                  u64 *ts, u64 *delta)
1973
 
{
1974
 
        struct ring_buffer_event *event;
1975
 
        static int once;
1976
 
        int ret;
1977
 
 
1978
 
        if (unlikely(*delta > (1ULL << 59) && !once++)) {
1979
 
                printk(KERN_WARNING "Delta way too big! %llu"
1980
 
                       " ts=%llu write stamp = %llu\n",
1981
 
                       (unsigned long long)*delta,
1982
 
                       (unsigned long long)*ts,
1983
 
                       (unsigned long long)cpu_buffer->write_stamp);
1984
 
                WARN_ON(1);
1985
 
        }
1986
 
 
1987
 
        /*
1988
 
         * The delta is too big, we to add a
1989
 
         * new timestamp.
1990
 
         */
1991
 
        event = __rb_reserve_next(cpu_buffer,
1992
 
                                  RINGBUF_TYPE_TIME_EXTEND,
1993
 
                                  RB_LEN_TIME_EXTEND,
1994
 
                                  ts);
1995
 
        if (!event)
1996
 
                return -EBUSY;
1997
 
 
1998
 
        if (PTR_ERR(event) == -EAGAIN)
1999
 
                return -EAGAIN;
2000
 
 
2001
 
        /* Only a commited time event can update the write stamp */
2002
 
        if (rb_event_is_commit(cpu_buffer, event)) {
2003
 
                /*
2004
 
                 * If this is the first on the page, then it was
2005
 
                 * updated with the page itself. Try to discard it
2006
 
                 * and if we can't just make it zero.
2007
 
                 */
2008
 
                if (rb_event_index(event)) {
2009
 
                        event->time_delta = *delta & TS_MASK;
2010
 
                        event->array[0] = *delta >> TS_SHIFT;
2011
 
                } else {
2012
 
                        /* try to discard, since we do not need this */
2013
 
                        if (!rb_try_to_discard(cpu_buffer, event)) {
2014
 
                                /* nope, just zero it */
2015
 
                                event->time_delta = 0;
2016
 
                                event->array[0] = 0;
2017
 
                        }
2018
 
                }
2019
 
                cpu_buffer->write_stamp = *ts;
2020
 
                /* let the caller know this was the commit */
2021
 
                ret = 1;
2022
 
        } else {
2023
 
                /* Try to discard the event */
2024
 
                if (!rb_try_to_discard(cpu_buffer, event)) {
2025
 
                        /* Darn, this is just wasted space */
2026
 
                        event->time_delta = 0;
2027
 
                        event->array[0] = 0;
2028
 
                }
2029
 
                ret = 0;
2030
 
        }
2031
 
 
2032
 
        *delta = 0;
2033
 
 
2034
 
        return ret;
2035
 
}
2036
 
 
2037
2072
static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
2038
2073
{
2039
2074
        local_inc(&cpu_buffer->committing);
2040
2075
        local_inc(&cpu_buffer->commits);
2041
2076
}
2042
2077
 
2043
 
static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
 
2078
static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2044
2079
{
2045
2080
        unsigned long commits;
2046
2081
 
2078
2113
                      unsigned long length)
2079
2114
{
2080
2115
        struct ring_buffer_event *event;
2081
 
        u64 ts, delta = 0;
2082
 
        int commit = 0;
 
2116
        u64 ts, delta;
2083
2117
        int nr_loops = 0;
 
2118
        int add_timestamp;
 
2119
        u64 diff;
2084
2120
 
2085
2121
        rb_start_commit(cpu_buffer);
2086
2122
 
2101
2137
 
2102
2138
        length = rb_calculate_event_length(length);
2103
2139
 again:
 
2140
        add_timestamp = 0;
 
2141
        delta = 0;
 
2142
 
2104
2143
        /*
2105
2144
         * We allow for interrupts to reenter here and do a trace.
2106
2145
         * If one does, it will cause this original code to loop
2114
2153
                goto out_fail;
2115
2154
 
2116
2155
        ts = rb_time_stamp(cpu_buffer->buffer);
2117
 
 
2118
 
        /*
2119
 
         * Only the first commit can update the timestamp.
2120
 
         * Yes there is a race here. If an interrupt comes in
2121
 
         * just after the conditional and it traces too, then it
2122
 
         * will also check the deltas. More than one timestamp may
2123
 
         * also be made. But only the entry that did the actual
2124
 
         * commit will be something other than zero.
2125
 
         */
2126
 
        if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
2127
 
                   rb_page_write(cpu_buffer->tail_page) ==
2128
 
                   rb_commit_index(cpu_buffer))) {
2129
 
                u64 diff;
2130
 
 
2131
 
                diff = ts - cpu_buffer->write_stamp;
2132
 
 
2133
 
                /* make sure this diff is calculated here */
2134
 
                barrier();
2135
 
 
2136
 
                /* Did the write stamp get updated already? */
2137
 
                if (unlikely(ts < cpu_buffer->write_stamp))
2138
 
                        goto get_event;
2139
 
 
 
2156
        diff = ts - cpu_buffer->write_stamp;
 
2157
 
 
2158
        /* make sure this diff is calculated here */
 
2159
        barrier();
 
2160
 
 
2161
        /* Did the write stamp get updated already? */
 
2162
        if (likely(ts >= cpu_buffer->write_stamp)) {
2140
2163
                delta = diff;
2141
2164
                if (unlikely(test_time_stamp(delta))) {
2142
 
 
2143
 
                        commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
2144
 
                        if (commit == -EBUSY)
2145
 
                                goto out_fail;
2146
 
 
2147
 
                        if (commit == -EAGAIN)
2148
 
                                goto again;
2149
 
 
2150
 
                        RB_WARN_ON(cpu_buffer, commit < 0);
 
2165
                        WARN_ONCE(delta > (1ULL << 59),
 
2166
                                  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
 
2167
                                  (unsigned long long)delta,
 
2168
                                  (unsigned long long)ts,
 
2169
                                  (unsigned long long)cpu_buffer->write_stamp);
 
2170
                        add_timestamp = 1;
2151
2171
                }
2152
2172
        }
2153
2173
 
2154
 
 get_event:
2155
 
        event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
 
2174
        event = __rb_reserve_next(cpu_buffer, length, ts,
 
2175
                                  delta, add_timestamp);
2156
2176
        if (unlikely(PTR_ERR(event) == -EAGAIN))
2157
2177
                goto again;
2158
2178
 
2159
2179
        if (!event)
2160
2180
                goto out_fail;
2161
2181
 
2162
 
        if (!rb_event_is_commit(cpu_buffer, event))
2163
 
                delta = 0;
2164
 
 
2165
 
        event->time_delta = delta;
2166
 
 
2167
2182
        return event;
2168
2183
 
2169
2184
 out_fail:
2175
2190
 
2176
2191
#define TRACE_RECURSIVE_DEPTH 16
2177
2192
 
2178
 
static int trace_recursive_lock(void)
 
2193
/* Keep this code out of the fast path cache */
 
2194
static noinline void trace_recursive_fail(void)
2179
2195
{
2180
 
        current->trace_recursion++;
2181
 
 
2182
 
        if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2183
 
                return 0;
2184
 
 
2185
2196
        /* Disable all tracing before we do anything else */
2186
2197
        tracing_off_permanent();
2187
2198
 
2193
2204
                    in_nmi());
2194
2205
 
2195
2206
        WARN_ON_ONCE(1);
 
2207
}
 
2208
 
 
2209
static inline int trace_recursive_lock(void)
 
2210
{
 
2211
        current->trace_recursion++;
 
2212
 
 
2213
        if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
 
2214
                return 0;
 
2215
 
 
2216
        trace_recursive_fail();
 
2217
 
2196
2218
        return -1;
2197
2219
}
2198
2220
 
2199
 
static void trace_recursive_unlock(void)
 
2221
static inline void trace_recursive_unlock(void)
2200
2222
{
2201
2223
        WARN_ON_ONCE(!current->trace_recursion);
2202
2224
 
2210
2232
 
2211
2233
#endif
2212
2234
 
2213
 
static DEFINE_PER_CPU(int, rb_need_resched);
2214
 
 
2215
2235
/**
2216
2236
 * ring_buffer_lock_reserve - reserve a part of the buffer
2217
2237
 * @buffer: the ring buffer to reserve from
2232
2252
{
2233
2253
        struct ring_buffer_per_cpu *cpu_buffer;
2234
2254
        struct ring_buffer_event *event;
2235
 
        int cpu, resched;
 
2255
        int cpu;
2236
2256
 
2237
2257
        if (ring_buffer_flags != RB_BUFFERS_ON)
2238
2258
                return NULL;
2239
2259
 
2240
 
        if (atomic_read(&buffer->record_disabled))
2241
 
                return NULL;
2242
 
 
2243
2260
        /* If we are tracing schedule, we don't want to recurse */
2244
 
        resched = ftrace_preempt_disable();
 
2261
        preempt_disable_notrace();
 
2262
 
 
2263
        if (atomic_read(&buffer->record_disabled))
 
2264
                goto out_nocheck;
2245
2265
 
2246
2266
        if (trace_recursive_lock())
2247
2267
                goto out_nocheck;
2263
2283
        if (!event)
2264
2284
                goto out;
2265
2285
 
2266
 
        /*
2267
 
         * Need to store resched state on this cpu.
2268
 
         * Only the first needs to.
2269
 
         */
2270
 
 
2271
 
        if (preempt_count() == 1)
2272
 
                per_cpu(rb_need_resched, cpu) = resched;
2273
 
 
2274
2286
        return event;
2275
2287
 
2276
2288
 out:
2277
2289
        trace_recursive_unlock();
2278
2290
 
2279
2291
 out_nocheck:
2280
 
        ftrace_preempt_enable(resched);
 
2292
        preempt_enable_notrace();
2281
2293
        return NULL;
2282
2294
}
2283
2295
EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
2286
2298
rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2287
2299
                      struct ring_buffer_event *event)
2288
2300
{
 
2301
        u64 delta;
 
2302
 
2289
2303
        /*
2290
2304
         * The event first in the commit queue updates the
2291
2305
         * time stamp.
2292
2306
         */
2293
 
        if (rb_event_is_commit(cpu_buffer, event))
2294
 
                cpu_buffer->write_stamp += event->time_delta;
 
2307
        if (rb_event_is_commit(cpu_buffer, event)) {
 
2308
                /*
 
2309
                 * A commit event that is first on a page
 
2310
                 * updates the write timestamp with the page stamp
 
2311
                 */
 
2312
                if (!rb_event_index(event))
 
2313
                        cpu_buffer->write_stamp =
 
2314
                                cpu_buffer->commit_page->page->time_stamp;
 
2315
                else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
 
2316
                        delta = event->array[0];
 
2317
                        delta <<= TS_SHIFT;
 
2318
                        delta += event->time_delta;
 
2319
                        cpu_buffer->write_stamp += delta;
 
2320
                } else
 
2321
                        cpu_buffer->write_stamp += event->time_delta;
 
2322
        }
2295
2323
}
2296
2324
 
2297
2325
static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2323
2351
 
2324
2352
        trace_recursive_unlock();
2325
2353
 
2326
 
        /*
2327
 
         * Only the last preempt count needs to restore preemption.
2328
 
         */
2329
 
        if (preempt_count() == 1)
2330
 
                ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2331
 
        else
2332
 
                preempt_enable_no_resched_notrace();
 
2354
        preempt_enable_notrace();
2333
2355
 
2334
2356
        return 0;
2335
2357
}
2337
2359
 
2338
2360
static inline void rb_event_discard(struct ring_buffer_event *event)
2339
2361
{
 
2362
        if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
 
2363
                event = skip_time_extend(event);
 
2364
 
2340
2365
        /* array[0] holds the actual length for the discarded event */
2341
2366
        event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
2342
2367
        event->type_len = RINGBUF_TYPE_PADDING;
2437
2462
 
2438
2463
        trace_recursive_unlock();
2439
2464
 
2440
 
        /*
2441
 
         * Only the last preempt count needs to restore preemption.
2442
 
         */
2443
 
        if (preempt_count() == 1)
2444
 
                ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2445
 
        else
2446
 
                preempt_enable_no_resched_notrace();
 
2465
        preempt_enable_notrace();
2447
2466
 
2448
2467
}
2449
2468
EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
2469
2488
        struct ring_buffer_event *event;
2470
2489
        void *body;
2471
2490
        int ret = -EBUSY;
2472
 
        int cpu, resched;
 
2491
        int cpu;
2473
2492
 
2474
2493
        if (ring_buffer_flags != RB_BUFFERS_ON)
2475
2494
                return -EBUSY;
2476
2495
 
 
2496
        preempt_disable_notrace();
 
2497
 
2477
2498
        if (atomic_read(&buffer->record_disabled))
2478
 
                return -EBUSY;
2479
 
 
2480
 
        resched = ftrace_preempt_disable();
 
2499
                goto out;
2481
2500
 
2482
2501
        cpu = raw_smp_processor_id();
2483
2502
 
2504
2523
 
2505
2524
        ret = 0;
2506
2525
 out:
2507
 
        ftrace_preempt_enable(resched);
 
2526
        preempt_enable_notrace();
2508
2527
 
2509
2528
        return ret;
2510
2529
}
2546
2565
 * @buffer: The ring buffer to enable writes
2547
2566
 *
2548
2567
 * Note, multiple disables will need the same number of enables
2549
 
 * to truely enable the writing (much like preempt_disable).
 
2568
 * to truly enable the writing (much like preempt_disable).
2550
2569
 */
2551
2570
void ring_buffer_record_enable(struct ring_buffer *buffer)
2552
2571
{
2582
2601
 * @cpu: The CPU to enable.
2583
2602
 *
2584
2603
 * Note, multiple disables will need the same number of enables
2585
 
 * to truely enable the writing (much like preempt_disable).
 
2604
 * to truly enable the writing (much like preempt_disable).
2586
2605
 */
2587
2606
void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2588
2607
{
2596
2615
}
2597
2616
EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2598
2617
 
 
2618
/*
 
2619
 * The total entries in the ring buffer is the running counter
 
2620
 * of entries entered into the ring buffer, minus the sum of
 
2621
 * the entries read from the ring buffer and the number of
 
2622
 * entries that were overwritten.
 
2623
 */
 
2624
static inline unsigned long
 
2625
rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
 
2626
{
 
2627
        return local_read(&cpu_buffer->entries) -
 
2628
                (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
 
2629
}
 
2630
 
2599
2631
/**
2600
2632
 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2601
2633
 * @buffer: The ring buffer
2604
2636
unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
2605
2637
{
2606
2638
        struct ring_buffer_per_cpu *cpu_buffer;
2607
 
        unsigned long ret;
2608
2639
 
2609
2640
        if (!cpumask_test_cpu(cpu, buffer->cpumask))
2610
2641
                return 0;
2611
2642
 
2612
2643
        cpu_buffer = buffer->buffers[cpu];
2613
 
        ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
2614
 
                - cpu_buffer->read;
2615
2644
 
2616
 
        return ret;
 
2645
        return rb_num_of_entries(cpu_buffer);
2617
2646
}
2618
2647
EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
2619
2648
 
2674
2703
        /* if you care about this being correct, lock the buffer */
2675
2704
        for_each_buffer_cpu(buffer, cpu) {
2676
2705
                cpu_buffer = buffer->buffers[cpu];
2677
 
                entries += (local_read(&cpu_buffer->entries) -
2678
 
                            local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
 
2706
                entries += rb_num_of_entries(cpu_buffer);
2679
2707
        }
2680
2708
 
2681
2709
        return entries;
2723
2751
                iter->read_stamp = cpu_buffer->read_stamp;
2724
2752
        else
2725
2753
                iter->read_stamp = iter->head_page->page->time_stamp;
 
2754
        iter->cache_reader_page = cpu_buffer->reader_page;
 
2755
        iter->cache_read = cpu_buffer->read;
2726
2756
}
2727
2757
 
2728
2758
/**
2829
2859
rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2830
2860
{
2831
2861
        struct buffer_page *reader = NULL;
 
2862
        unsigned long overwrite;
2832
2863
        unsigned long flags;
2833
2864
        int nr_loops = 0;
2834
2865
        int ret;
2835
2866
 
2836
2867
        local_irq_save(flags);
2837
 
        __raw_spin_lock(&cpu_buffer->lock);
 
2868
        arch_spin_lock(&cpu_buffer->lock);
2838
2869
 
2839
2870
 again:
2840
2871
        /*
2870
2901
        local_set(&cpu_buffer->reader_page->write, 0);
2871
2902
        local_set(&cpu_buffer->reader_page->entries, 0);
2872
2903
        local_set(&cpu_buffer->reader_page->page->commit, 0);
 
2904
        cpu_buffer->reader_page->real_end = 0;
2873
2905
 
2874
2906
 spin:
2875
2907
        /*
2876
2908
         * Splice the empty reader page into the list around the head.
2877
2909
         */
2878
2910
        reader = rb_set_head_page(cpu_buffer);
2879
 
        cpu_buffer->reader_page->list.next = reader->list.next;
 
2911
        cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
2880
2912
        cpu_buffer->reader_page->list.prev = reader->list.prev;
2881
2913
 
2882
2914
        /*
2890
2922
        rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2891
2923
 
2892
2924
        /*
 
2925
         * We want to make sure we read the overruns after we set up our
 
2926
         * pointers to the next object. The writer side does a
 
2927
         * cmpxchg to cross pages which acts as the mb on the writer
 
2928
         * side. Note, the reader will constantly fail the swap
 
2929
         * while the writer is updating the pointers, so this
 
2930
         * guarantees that the overwrite recorded here is the one we
 
2931
         * want to compare with the last_overrun.
 
2932
         */
 
2933
        smp_mb();
 
2934
        overwrite = local_read(&(cpu_buffer->overrun));
 
2935
 
 
2936
        /*
2893
2937
         * Here's the tricky part.
2894
2938
         *
2895
2939
         * We need to move the pointer past the header page.
2913
2957
         *
2914
2958
         * Now make the new head point back to the reader page.
2915
2959
         */
2916
 
        reader->list.next->prev = &cpu_buffer->reader_page->list;
 
2960
        rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
2917
2961
        rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2918
2962
 
2919
2963
        /* Finally update the reader page to the new head */
2920
2964
        cpu_buffer->reader_page = reader;
2921
2965
        rb_reset_reader_page(cpu_buffer);
2922
2966
 
 
2967
        if (overwrite != cpu_buffer->last_overrun) {
 
2968
                cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
 
2969
                cpu_buffer->last_overrun = overwrite;
 
2970
        }
 
2971
 
2923
2972
        goto again;
2924
2973
 
2925
2974
 out:
2926
 
        __raw_spin_unlock(&cpu_buffer->lock);
 
2975
        arch_spin_unlock(&cpu_buffer->lock);
2927
2976
        local_irq_restore(flags);
2928
2977
 
2929
2978
        return reader;
2954
3003
 
2955
3004
static void rb_advance_iter(struct ring_buffer_iter *iter)
2956
3005
{
2957
 
        struct ring_buffer *buffer;
2958
3006
        struct ring_buffer_per_cpu *cpu_buffer;
2959
3007
        struct ring_buffer_event *event;
2960
3008
        unsigned length;
2961
3009
 
2962
3010
        cpu_buffer = iter->cpu_buffer;
2963
 
        buffer = cpu_buffer->buffer;
2964
3011
 
2965
3012
        /*
2966
3013
         * Check if we are at the end of the buffer.
2996
3043
                rb_advance_iter(iter);
2997
3044
}
2998
3045
 
 
3046
static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
 
3047
{
 
3048
        return cpu_buffer->lost_events;
 
3049
}
 
3050
 
2999
3051
static struct ring_buffer_event *
3000
 
rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
 
3052
rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
 
3053
               unsigned long *lost_events)
3001
3054
{
3002
3055
        struct ring_buffer_event *event;
3003
3056
        struct buffer_page *reader;
3005
3058
 
3006
3059
 again:
3007
3060
        /*
3008
 
         * We repeat when a timestamp is encountered. It is possible
3009
 
         * to get multiple timestamps from an interrupt entering just
3010
 
         * as one timestamp is about to be written, or from discarded
3011
 
         * commits. The most that we can have is the number on a single page.
 
3061
         * We repeat when a time extend is encountered.
 
3062
         * Since the time extend is always attached to a data event,
 
3063
         * we should never loop more than once.
 
3064
         * (We never hit the following condition more than twice).
3012
3065
         */
3013
 
        if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
 
3066
        if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3014
3067
                return NULL;
3015
3068
 
3016
3069
        reader = rb_get_reader_page(cpu_buffer);
3049
3102
                        ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3050
3103
                                                         cpu_buffer->cpu, ts);
3051
3104
                }
 
3105
                if (lost_events)
 
3106
                        *lost_events = rb_lost_events(cpu_buffer);
3052
3107
                return event;
3053
3108
 
3054
3109
        default:
3067
3122
        struct ring_buffer_event *event;
3068
3123
        int nr_loops = 0;
3069
3124
 
 
3125
        cpu_buffer = iter->cpu_buffer;
 
3126
        buffer = cpu_buffer->buffer;
 
3127
 
 
3128
        /*
 
3129
         * Check if someone performed a consuming read to
 
3130
         * the buffer. A consuming read invalidates the iterator
 
3131
         * and we need to reset the iterator in this case.
 
3132
         */
 
3133
        if (unlikely(iter->cache_read != cpu_buffer->read ||
 
3134
                     iter->cache_reader_page != cpu_buffer->reader_page))
 
3135
                rb_iter_reset(iter);
 
3136
 
 
3137
 again:
3070
3138
        if (ring_buffer_iter_empty(iter))
3071
3139
                return NULL;
3072
3140
 
3073
 
        cpu_buffer = iter->cpu_buffer;
3074
 
        buffer = cpu_buffer->buffer;
3075
 
 
3076
 
 again:
3077
3141
        /*
3078
 
         * We repeat when a timestamp is encountered.
3079
 
         * We can get multiple timestamps by nested interrupts or also
3080
 
         * if filtering is on (discarding commits). Since discarding
3081
 
         * commits can be frequent we can get a lot of timestamps.
3082
 
         * But we limit them by not adding timestamps if they begin
3083
 
         * at the start of a page.
 
3142
         * We repeat when a time extend is encountered.
 
3143
         * Since the time extend is always attached to a data event,
 
3144
         * we should never loop more than once.
 
3145
         * (We never hit the following condition more than twice).
3084
3146
         */
3085
 
        if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
 
3147
        if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3086
3148
                return NULL;
3087
3149
 
3088
3150
        if (rb_per_cpu_empty(cpu_buffer))
3089
3151
                return NULL;
3090
3152
 
 
3153
        if (iter->head >= local_read(&iter->head_page->page->commit)) {
 
3154
                rb_inc_iter(iter);
 
3155
                goto again;
 
3156
        }
 
3157
 
3091
3158
        event = rb_iter_head_event(iter);
3092
3159
 
3093
3160
        switch (event->type_len) {
3145
3212
 * @buffer: The ring buffer to read
3146
3213
 * @cpu: The cpu to peak at
3147
3214
 * @ts: The timestamp counter of this event.
 
3215
 * @lost_events: a variable to store if events were lost (may be NULL)
3148
3216
 *
3149
3217
 * This will return the event that will be read next, but does
3150
3218
 * not consume the data.
3151
3219
 */
3152
3220
struct ring_buffer_event *
3153
 
ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 
3221
ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
 
3222
                 unsigned long *lost_events)
3154
3223
{
3155
3224
        struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
3156
3225
        struct ring_buffer_event *event;
3165
3234
        local_irq_save(flags);
3166
3235
        if (dolock)
3167
3236
                spin_lock(&cpu_buffer->reader_lock);
3168
 
        event = rb_buffer_peek(cpu_buffer, ts);
 
3237
        event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3169
3238
        if (event && event->type_len == RINGBUF_TYPE_PADDING)
3170
3239
                rb_advance_reader(cpu_buffer);
3171
3240
        if (dolock)
3207
3276
/**
3208
3277
 * ring_buffer_consume - return an event and consume it
3209
3278
 * @buffer: The ring buffer to get the next event from
 
3279
 * @cpu: the cpu to read the buffer from
 
3280
 * @ts: a variable to store the timestamp (may be NULL)
 
3281
 * @lost_events: a variable to store if events were lost (may be NULL)
3210
3282
 *
3211
3283
 * Returns the next event in the ring buffer, and that event is consumed.
3212
3284
 * Meaning, that sequential reads will keep returning a different event,
3213
3285
 * and eventually empty the ring buffer if the producer is slower.
3214
3286
 */
3215
3287
struct ring_buffer_event *
3216
 
ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 
3288
ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
 
3289
                    unsigned long *lost_events)
3217
3290
{
3218
3291
        struct ring_buffer_per_cpu *cpu_buffer;
3219
3292
        struct ring_buffer_event *event = NULL;
3234
3307
        if (dolock)
3235
3308
                spin_lock(&cpu_buffer->reader_lock);
3236
3309
 
3237
 
        event = rb_buffer_peek(cpu_buffer, ts);
3238
 
        if (event)
 
3310
        event = rb_buffer_peek(cpu_buffer, ts, lost_events);
 
3311
        if (event) {
 
3312
                cpu_buffer->lost_events = 0;
3239
3313
                rb_advance_reader(cpu_buffer);
 
3314
        }
3240
3315
 
3241
3316
        if (dolock)
3242
3317
                spin_unlock(&cpu_buffer->reader_lock);
3253
3328
EXPORT_SYMBOL_GPL(ring_buffer_consume);
3254
3329
 
3255
3330
/**
3256
 
 * ring_buffer_read_start - start a non consuming read of the buffer
 
3331
 * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
3257
3332
 * @buffer: The ring buffer to read from
3258
3333
 * @cpu: The cpu buffer to iterate over
3259
3334
 *
3260
 
 * This starts up an iteration through the buffer. It also disables
3261
 
 * the recording to the buffer until the reading is finished.
3262
 
 * This prevents the reading from being corrupted. This is not
3263
 
 * a consuming read, so a producer is not expected.
3264
 
 *
3265
 
 * Must be paired with ring_buffer_finish.
 
3335
 * This performs the initial preparations necessary to iterate
 
3336
 * through the buffer.  Memory is allocated, buffer recording
 
3337
 * is disabled, and the iterator pointer is returned to the caller.
 
3338
 *
 
3339
 * Disabling buffer recordng prevents the reading from being
 
3340
 * corrupted. This is not a consuming read, so a producer is not
 
3341
 * expected.
 
3342
 *
 
3343
 * After a sequence of ring_buffer_read_prepare calls, the user is
 
3344
 * expected to make at least one call to ring_buffer_prepare_sync.
 
3345
 * Afterwards, ring_buffer_read_start is invoked to get things going
 
3346
 * for real.
 
3347
 *
 
3348
 * This overall must be paired with ring_buffer_finish.
3266
3349
 */
3267
3350
struct ring_buffer_iter *
3268
 
ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
 
3351
ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
3269
3352
{
3270
3353
        struct ring_buffer_per_cpu *cpu_buffer;
3271
3354
        struct ring_buffer_iter *iter;
3272
 
        unsigned long flags;
3273
3355
 
3274
3356
        if (!cpumask_test_cpu(cpu, buffer->cpumask))
3275
3357
                return NULL;
3283
3365
        iter->cpu_buffer = cpu_buffer;
3284
3366
 
3285
3367
        atomic_inc(&cpu_buffer->record_disabled);
3286
 
        synchronize_sched();
3287
 
 
3288
 
        spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3289
 
        __raw_spin_lock(&cpu_buffer->lock);
3290
 
        rb_iter_reset(iter);
3291
 
        __raw_spin_unlock(&cpu_buffer->lock);
3292
 
        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3293
3368
 
3294
3369
        return iter;
3295
3370
}
 
3371
EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
 
3372
 
 
3373
/**
 
3374
 * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
 
3375
 *
 
3376
 * All previously invoked ring_buffer_read_prepare calls to prepare
 
3377
 * iterators will be synchronized.  Afterwards, read_buffer_read_start
 
3378
 * calls on those iterators are allowed.
 
3379
 */
 
3380
void
 
3381
ring_buffer_read_prepare_sync(void)
 
3382
{
 
3383
        synchronize_sched();
 
3384
}
 
3385
EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
 
3386
 
 
3387
/**
 
3388
 * ring_buffer_read_start - start a non consuming read of the buffer
 
3389
 * @iter: The iterator returned by ring_buffer_read_prepare
 
3390
 *
 
3391
 * This finalizes the startup of an iteration through the buffer.
 
3392
 * The iterator comes from a call to ring_buffer_read_prepare and
 
3393
 * an intervening ring_buffer_read_prepare_sync must have been
 
3394
 * performed.
 
3395
 *
 
3396
 * Must be paired with ring_buffer_finish.
 
3397
 */
 
3398
void
 
3399
ring_buffer_read_start(struct ring_buffer_iter *iter)
 
3400
{
 
3401
        struct ring_buffer_per_cpu *cpu_buffer;
 
3402
        unsigned long flags;
 
3403
 
 
3404
        if (!iter)
 
3405
                return;
 
3406
 
 
3407
        cpu_buffer = iter->cpu_buffer;
 
3408
 
 
3409
        spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
3410
        arch_spin_lock(&cpu_buffer->lock);
 
3411
        rb_iter_reset(iter);
 
3412
        arch_spin_unlock(&cpu_buffer->lock);
 
3413
        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
3414
}
3296
3415
EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3297
3416
 
3298
3417
/**
3385
3504
        cpu_buffer->write_stamp = 0;
3386
3505
        cpu_buffer->read_stamp = 0;
3387
3506
 
 
3507
        cpu_buffer->lost_events = 0;
 
3508
        cpu_buffer->last_overrun = 0;
 
3509
 
3388
3510
        rb_head_page_activate(cpu_buffer);
3389
3511
}
3390
3512
 
3408
3530
        if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3409
3531
                goto out;
3410
3532
 
3411
 
        __raw_spin_lock(&cpu_buffer->lock);
 
3533
        arch_spin_lock(&cpu_buffer->lock);
3412
3534
 
3413
3535
        rb_reset_cpu(cpu_buffer);
3414
3536
 
3415
 
        __raw_spin_unlock(&cpu_buffer->lock);
 
3537
        arch_spin_unlock(&cpu_buffer->lock);
3416
3538
 
3417
3539
 out:
3418
3540
        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3660
3782
        struct ring_buffer_event *event;
3661
3783
        struct buffer_data_page *bpage;
3662
3784
        struct buffer_page *reader;
 
3785
        unsigned long missed_events;
3663
3786
        unsigned long flags;
3664
3787
        unsigned int commit;
3665
3788
        unsigned int read;
3696
3819
        read = reader->read;
3697
3820
        commit = rb_page_commit(reader);
3698
3821
 
 
3822
        /* Check if any events were dropped */
 
3823
        missed_events = cpu_buffer->lost_events;
 
3824
 
3699
3825
        /*
3700
3826
         * If this page has been partially read or
3701
3827
         * if len is not big enough to read the rest of the page or
3716
3842
                if (len > (commit - read))
3717
3843
                        len = (commit - read);
3718
3844
 
3719
 
                size = rb_event_length(event);
 
3845
                /* Always keep the time extend and data together */
 
3846
                size = rb_event_ts_length(event);
3720
3847
 
3721
3848
                if (len < size)
3722
3849
                        goto out_unlock;
3726
3853
 
3727
3854
                /* Need to copy one event at a time */
3728
3855
                do {
 
3856
                        /* We need the size of one event, because
 
3857
                         * rb_advance_reader only advances by one event,
 
3858
                         * whereas rb_event_ts_length may include the size of
 
3859
                         * one or two events.
 
3860
                         * We have already ensured there's enough space if this
 
3861
                         * is a time extend. */
 
3862
                        size = rb_event_length(event);
3729
3863
                        memcpy(bpage->data + pos, rpage->data + rpos, size);
3730
3864
 
3731
3865
                        len -= size;
3734
3868
                        rpos = reader->read;
3735
3869
                        pos += size;
3736
3870
 
 
3871
                        if (rpos >= commit)
 
3872
                                break;
 
3873
 
3737
3874
                        event = rb_reader_event(cpu_buffer);
3738
 
                        size = rb_event_length(event);
3739
 
                } while (len > size);
 
3875
                        /* Always keep the time extend and data together */
 
3876
                        size = rb_event_ts_length(event);
 
3877
                } while (len >= size);
3740
3878
 
3741
3879
                /* update bpage */
3742
3880
                local_set(&bpage->commit, pos);
3756
3894
                local_set(&reader->entries, 0);
3757
3895
                reader->read = 0;
3758
3896
                *data_page = bpage;
 
3897
 
 
3898
                /*
 
3899
                 * Use the real_end for the data size,
 
3900
                 * This gives us a chance to store the lost events
 
3901
                 * on the page.
 
3902
                 */
 
3903
                if (reader->real_end)
 
3904
                        local_set(&bpage->commit, reader->real_end);
3759
3905
        }
3760
3906
        ret = read;
3761
3907
 
 
3908
        cpu_buffer->lost_events = 0;
 
3909
 
 
3910
        commit = local_read(&bpage->commit);
 
3911
        /*
 
3912
         * Set a flag in the commit field if we lost events
 
3913
         */
 
3914
        if (missed_events) {
 
3915
                /* If there is room at the end of the page to save the
 
3916
                 * missed events, then record it there.
 
3917
                 */
 
3918
                if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
 
3919
                        memcpy(&bpage->data[commit], &missed_events,
 
3920
                               sizeof(missed_events));
 
3921
                        local_add(RB_MISSED_STORED, &bpage->commit);
 
3922
                        commit += sizeof(missed_events);
 
3923
                }
 
3924
                local_add(RB_MISSED_EVENTS, &bpage->commit);
 
3925
        }
 
3926
 
 
3927
        /*
 
3928
         * This page may be off to user land. Zero it out here.
 
3929
         */
 
3930
        if (commit < BUF_PAGE_SIZE)
 
3931
                memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
 
3932
 
3762
3933
 out_unlock:
3763
3934
        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3764
3935
 
3819
3990
        .open           = tracing_open_generic,
3820
3991
        .read           = rb_simple_read,
3821
3992
        .write          = rb_simple_write,
 
3993
        .llseek         = default_llseek,
3822
3994
};
3823
3995
 
3824
3996