9
$ perf stat ~/hackbench 10
12
Performance counter stats for '/home/mingo/hackbench':
14
1255.538611 task clock ticks # 10.143 CPU utilization factor
15
54011 context switches # 0.043 M/sec
16
385 CPU migrations # 0.000 M/sec
17
17755 pagefaults # 0.014 M/sec
18
3808323185 CPU cycles # 3033.219 M/sec
19
1575111190 instructions # 1254.530 M/sec
20
17367895 cache references # 13.833 M/sec
21
7674421 cache misses # 6.112 M/sec
23
Wall-clock time elapsed: 123.786620 msecs
9
$ perf stat ./hackbench 10
13
Performance counter stats for './hackbench 10':
15
1708.761321 task-clock # 11.037 CPUs utilized
16
41,190 context-switches # 0.024 M/sec
17
6,735 CPU-migrations # 0.004 M/sec
18
17,318 page-faults # 0.010 M/sec
19
5,205,202,243 cycles # 3.046 GHz
20
3,856,436,920 stalled-cycles-frontend # 74.09% frontend cycles idle
21
1,600,790,871 stalled-cycles-backend # 30.75% backend cycles idle
22
2,603,501,247 instructions # 0.50 insns per cycle
23
# 1.48 stalled cycles per insn
24
484,357,498 branches # 283.455 M/sec
25
6,388,934 branch-misses # 1.32% of all branches
27
0.154822978 seconds time elapsed
26
* Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
30
* Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
28
32
* Improvements and fixes by:
65
70
{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS },
67
72
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
73
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
74
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
68
75
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
69
76
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
70
77
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },
71
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES },
72
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES },
82
* Detailed stats (-d), covering the L1 and last level data caches:
84
static struct perf_event_attr detailed_attrs[] = {
86
{ .type = PERF_TYPE_HW_CACHE,
88
PERF_COUNT_HW_CACHE_L1D << 0 |
89
(PERF_COUNT_HW_CACHE_OP_READ << 8) |
90
(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
92
{ .type = PERF_TYPE_HW_CACHE,
94
PERF_COUNT_HW_CACHE_L1D << 0 |
95
(PERF_COUNT_HW_CACHE_OP_READ << 8) |
96
(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
98
{ .type = PERF_TYPE_HW_CACHE,
100
PERF_COUNT_HW_CACHE_LL << 0 |
101
(PERF_COUNT_HW_CACHE_OP_READ << 8) |
102
(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
104
{ .type = PERF_TYPE_HW_CACHE,
106
PERF_COUNT_HW_CACHE_LL << 0 |
107
(PERF_COUNT_HW_CACHE_OP_READ << 8) |
108
(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
112
* Very detailed stats (-d -d), covering the instruction cache and the TLB caches:
114
static struct perf_event_attr very_detailed_attrs[] = {
116
{ .type = PERF_TYPE_HW_CACHE,
118
PERF_COUNT_HW_CACHE_L1I << 0 |
119
(PERF_COUNT_HW_CACHE_OP_READ << 8) |
120
(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
122
{ .type = PERF_TYPE_HW_CACHE,
124
PERF_COUNT_HW_CACHE_L1I << 0 |
125
(PERF_COUNT_HW_CACHE_OP_READ << 8) |
126
(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
128
{ .type = PERF_TYPE_HW_CACHE,
130
PERF_COUNT_HW_CACHE_DTLB << 0 |
131
(PERF_COUNT_HW_CACHE_OP_READ << 8) |
132
(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
134
{ .type = PERF_TYPE_HW_CACHE,
136
PERF_COUNT_HW_CACHE_DTLB << 0 |
137
(PERF_COUNT_HW_CACHE_OP_READ << 8) |
138
(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
140
{ .type = PERF_TYPE_HW_CACHE,
142
PERF_COUNT_HW_CACHE_ITLB << 0 |
143
(PERF_COUNT_HW_CACHE_OP_READ << 8) |
144
(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
146
{ .type = PERF_TYPE_HW_CACHE,
148
PERF_COUNT_HW_CACHE_ITLB << 0 |
149
(PERF_COUNT_HW_CACHE_OP_READ << 8) |
150
(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
155
* Very, very detailed stats (-d -d -d), adding prefetch events:
157
static struct perf_event_attr very_very_detailed_attrs[] = {
159
{ .type = PERF_TYPE_HW_CACHE,
161
PERF_COUNT_HW_CACHE_L1D << 0 |
162
(PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
163
(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
165
{ .type = PERF_TYPE_HW_CACHE,
167
PERF_COUNT_HW_CACHE_L1D << 0 |
168
(PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
169
(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
76
174
struct perf_evlist *evsel_list;
304
* Update various tracking values we maintain to print
305
* more semantic information such as miss/hit ratios,
306
* instruction rates, etc:
308
static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
310
if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
311
update_stats(&runtime_nsecs_stats[0], count[0]);
312
else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
313
update_stats(&runtime_cycles_stats[0], count[0]);
314
else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
315
update_stats(&runtime_stalled_cycles_front_stats[0], count[0]);
316
else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
317
update_stats(&runtime_stalled_cycles_back_stats[0], count[0]);
318
else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
319
update_stats(&runtime_branches_stats[0], count[0]);
320
else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
321
update_stats(&runtime_cacherefs_stats[0], count[0]);
322
else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
323
update_stats(&runtime_l1_dcache_stats[0], count[0]);
324
else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
325
update_stats(&runtime_l1_icache_stats[0], count[0]);
326
else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
327
update_stats(&runtime_ll_cache_stats[0], count[0]);
328
else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
329
update_stats(&runtime_dtlb_cache_stats[0], count[0]);
330
else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
331
update_stats(&runtime_itlb_cache_stats[0], count[0]);
196
335
* Read out the results of a single counter:
197
336
* aggregate counts across CPUs in system-wide mode
406
549
if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
407
fprintf(stderr, " # %10.3f CPUs ",
408
avg / avg_stats(&walltime_nsecs_stats));
550
fprintf(stderr, " # %8.3f CPUs utilized ", avg / avg_stats(&walltime_nsecs_stats));
553
static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __used, double avg)
555
double total, ratio = 0.0;
558
total = avg_stats(&runtime_cycles_stats[cpu]);
561
ratio = avg / total * 100.0;
563
color = PERF_COLOR_NORMAL;
565
color = PERF_COLOR_RED;
566
else if (ratio > 30.0)
567
color = PERF_COLOR_MAGENTA;
568
else if (ratio > 10.0)
569
color = PERF_COLOR_YELLOW;
571
fprintf(stderr, " # ");
572
color_fprintf(stderr, color, "%6.2f%%", ratio);
573
fprintf(stderr, " frontend cycles idle ");
576
static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __used, double avg)
578
double total, ratio = 0.0;
581
total = avg_stats(&runtime_cycles_stats[cpu]);
584
ratio = avg / total * 100.0;
586
color = PERF_COLOR_NORMAL;
588
color = PERF_COLOR_RED;
589
else if (ratio > 50.0)
590
color = PERF_COLOR_MAGENTA;
591
else if (ratio > 20.0)
592
color = PERF_COLOR_YELLOW;
594
fprintf(stderr, " # ");
595
color_fprintf(stderr, color, "%6.2f%%", ratio);
596
fprintf(stderr, " backend cycles idle ");
599
static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double avg)
601
double total, ratio = 0.0;
604
total = avg_stats(&runtime_branches_stats[cpu]);
607
ratio = avg / total * 100.0;
609
color = PERF_COLOR_NORMAL;
611
color = PERF_COLOR_RED;
612
else if (ratio > 10.0)
613
color = PERF_COLOR_MAGENTA;
614
else if (ratio > 5.0)
615
color = PERF_COLOR_YELLOW;
617
fprintf(stderr, " # ");
618
color_fprintf(stderr, color, "%6.2f%%", ratio);
619
fprintf(stderr, " of all branches ");
622
static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
624
double total, ratio = 0.0;
627
total = avg_stats(&runtime_l1_dcache_stats[cpu]);
630
ratio = avg / total * 100.0;
632
color = PERF_COLOR_NORMAL;
634
color = PERF_COLOR_RED;
635
else if (ratio > 10.0)
636
color = PERF_COLOR_MAGENTA;
637
else if (ratio > 5.0)
638
color = PERF_COLOR_YELLOW;
640
fprintf(stderr, " # ");
641
color_fprintf(stderr, color, "%6.2f%%", ratio);
642
fprintf(stderr, " of all L1-dcache hits ");
645
static void print_l1_icache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
647
double total, ratio = 0.0;
650
total = avg_stats(&runtime_l1_icache_stats[cpu]);
653
ratio = avg / total * 100.0;
655
color = PERF_COLOR_NORMAL;
657
color = PERF_COLOR_RED;
658
else if (ratio > 10.0)
659
color = PERF_COLOR_MAGENTA;
660
else if (ratio > 5.0)
661
color = PERF_COLOR_YELLOW;
663
fprintf(stderr, " # ");
664
color_fprintf(stderr, color, "%6.2f%%", ratio);
665
fprintf(stderr, " of all L1-icache hits ");
668
static void print_dtlb_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
670
double total, ratio = 0.0;
673
total = avg_stats(&runtime_dtlb_cache_stats[cpu]);
676
ratio = avg / total * 100.0;
678
color = PERF_COLOR_NORMAL;
680
color = PERF_COLOR_RED;
681
else if (ratio > 10.0)
682
color = PERF_COLOR_MAGENTA;
683
else if (ratio > 5.0)
684
color = PERF_COLOR_YELLOW;
686
fprintf(stderr, " # ");
687
color_fprintf(stderr, color, "%6.2f%%", ratio);
688
fprintf(stderr, " of all dTLB cache hits ");
691
static void print_itlb_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
693
double total, ratio = 0.0;
696
total = avg_stats(&runtime_itlb_cache_stats[cpu]);
699
ratio = avg / total * 100.0;
701
color = PERF_COLOR_NORMAL;
703
color = PERF_COLOR_RED;
704
else if (ratio > 10.0)
705
color = PERF_COLOR_MAGENTA;
706
else if (ratio > 5.0)
707
color = PERF_COLOR_YELLOW;
709
fprintf(stderr, " # ");
710
color_fprintf(stderr, color, "%6.2f%%", ratio);
711
fprintf(stderr, " of all iTLB cache hits ");
714
static void print_ll_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
716
double total, ratio = 0.0;
719
total = avg_stats(&runtime_ll_cache_stats[cpu]);
722
ratio = avg / total * 100.0;
724
color = PERF_COLOR_NORMAL;
726
color = PERF_COLOR_RED;
727
else if (ratio > 10.0)
728
color = PERF_COLOR_MAGENTA;
729
else if (ratio > 5.0)
730
color = PERF_COLOR_YELLOW;
732
fprintf(stderr, " # ");
733
color_fprintf(stderr, color, "%6.2f%%", ratio);
734
fprintf(stderr, " of all LL-cache hits ");
411
737
static void abs_printout(int cpu, struct perf_evsel *evsel, double avg)
443
769
ratio = avg / total;
445
fprintf(stderr, " # %10.3f IPC ", ratio);
771
fprintf(stderr, " # %5.2f insns per cycle ", ratio);
773
total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
774
total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));
778
fprintf(stderr, "\n # %5.2f stalled cycles per insn", ratio);
446
781
} else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
447
782
runtime_branches_stats[cpu].n != 0) {
448
total = avg_stats(&runtime_branches_stats[cpu]);
783
print_branch_misses(cpu, evsel, avg);
785
evsel->attr.type == PERF_TYPE_HW_CACHE &&
786
evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D |
787
((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
788
((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
789
runtime_l1_dcache_stats[cpu].n != 0) {
790
print_l1_dcache_misses(cpu, evsel, avg);
792
evsel->attr.type == PERF_TYPE_HW_CACHE &&
793
evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1I |
794
((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
795
((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
796
runtime_l1_icache_stats[cpu].n != 0) {
797
print_l1_icache_misses(cpu, evsel, avg);
799
evsel->attr.type == PERF_TYPE_HW_CACHE &&
800
evsel->attr.config == ( PERF_COUNT_HW_CACHE_DTLB |
801
((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
802
((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
803
runtime_dtlb_cache_stats[cpu].n != 0) {
804
print_dtlb_cache_misses(cpu, evsel, avg);
806
evsel->attr.type == PERF_TYPE_HW_CACHE &&
807
evsel->attr.config == ( PERF_COUNT_HW_CACHE_ITLB |
808
((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
809
((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
810
runtime_itlb_cache_stats[cpu].n != 0) {
811
print_itlb_cache_misses(cpu, evsel, avg);
813
evsel->attr.type == PERF_TYPE_HW_CACHE &&
814
evsel->attr.config == ( PERF_COUNT_HW_CACHE_LL |
815
((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
816
((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
817
runtime_ll_cache_stats[cpu].n != 0) {
818
print_ll_cache_misses(cpu, evsel, avg);
819
} else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
820
runtime_cacherefs_stats[cpu].n != 0) {
821
total = avg_stats(&runtime_cacherefs_stats[cpu]);
451
824
ratio = avg * 100 / total;
453
fprintf(stderr, " # %10.3f %% ", ratio);
826
fprintf(stderr, " # %8.3f %% of all cache refs ", ratio);
828
} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
829
print_stalled_cycles_frontend(cpu, evsel, avg);
830
} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
831
print_stalled_cycles_backend(cpu, evsel, avg);
832
} else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
833
total = avg_stats(&runtime_nsecs_stats[cpu]);
836
ratio = 1.0 * avg / total;
838
fprintf(stderr, " # %8.3f GHz ", ratio);
455
839
} else if (runtime_nsecs_stats[cpu].n != 0) {
456
840
total = avg_stats(&runtime_nsecs_stats[cpu]);
459
843
ratio = 1000.0 * avg / total;
461
fprintf(stderr, " # %10.3f M/sec", ratio);
845
fprintf(stderr, " # %8.3f M/sec ", ratio);
847
fprintf(stderr, " ");
1066
* Add default attributes, if there were no attributes specified or
1067
* if -d/--detailed, -d -d or -d -d -d is used:
1069
static int add_default_attributes(void)
1071
struct perf_evsel *pos;
1075
/* Set attrs if no event is selected and !null_run: */
1079
if (!evsel_list->nr_entries) {
1080
for (c = 0; c < ARRAY_SIZE(default_attrs); c++) {
1081
pos = perf_evsel__new(default_attrs + c, c + attr_nr);
1084
perf_evlist__add(evsel_list, pos);
1089
/* Detailed events get appended to the event list: */
1091
if (detailed_run < 1)
1094
/* Append detailed run extra attributes: */
1095
for (c = 0; c < ARRAY_SIZE(detailed_attrs); c++) {
1096
pos = perf_evsel__new(detailed_attrs + c, c + attr_nr);
1099
perf_evlist__add(evsel_list, pos);
1103
if (detailed_run < 2)
1106
/* Append very detailed run extra attributes: */
1107
for (c = 0; c < ARRAY_SIZE(very_detailed_attrs); c++) {
1108
pos = perf_evsel__new(very_detailed_attrs + c, c + attr_nr);
1111
perf_evlist__add(evsel_list, pos);
1114
if (detailed_run < 3)
1117
/* Append very, very detailed run extra attributes: */
1118
for (c = 0; c < ARRAY_SIZE(very_very_detailed_attrs); c++) {
1119
pos = perf_evsel__new(very_very_detailed_attrs + c, c + attr_nr);
1122
perf_evlist__add(evsel_list, pos);
677
1129
int cmd_stat(int argc, const char **argv, const char *prefix __used)
679
1131
struct perf_evsel *pos;