21
21
* it under the terms of the GNU General Public License as published by
22
22
* the Free Software Foundation; either version 2 of the License, or
23
23
* (at your option) any later version.
25
25
* This program is distributed in the hope that it will be useful,
26
26
* but WITHOUT ANY WARRANTY; without even the implied warranty of
27
27
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28
28
* GNU General Public License for more details.
30
30
* You should have received a copy of the GNU General Public License
31
31
* along with this program; if not, write to the Free Software
32
32
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
37
37
* CPU Profiling implementation.
39
39
* This library implements the Probes needed to gather stats.
40
40
* See src/ProfStats.c which implements historical recording and
41
41
* presentation in CacheMgr.cgi.
43
43
* For timing we prefer on-CPU ops that retrieve cpu ticks counter.
44
44
* For Intel, this is "rdtsc", which is 64-bit counter that virtually
45
45
* never wraps. For alpha, this is "rpcc" which is 32-bit counter and
46
46
* wraps every few seconds. Currently, no handling of wrapping counters
47
47
* is implemented. Other CPU's are also not covered. Potentially all
48
48
* modern CPU's has similar counters.
51
51
* Insert macro PROF_state(probename) in strategic places in code.
52
52
* PROF_start(probename);
53
53
* ... section of code measured ...
54
54
* PROF_stop(probename);
56
56
* probename must be added to profiling.h into xprof_type enum list
57
57
* with prepended "XPROF_" string.
60
60
* PROF gathers stats per probename into structures. It indexes these
61
61
* structures by enum type index in an array.
63
63
* PROF records best, best, average and worst values for delta time,
64
64
* also, if UNACCED is defined, it measures "empty" time during which
65
65
* no probes are in measuring state. This allows to see time "unaccounted"
66
66
* for. If OVERHEAD is defined, additional calculations are made at every
67
67
* probe to measure approximate overhead of the probe code itself.
69
69
* Probe data is stored in linked-list, so the more probes you define,
70
70
* the more overhead is added to find the deepest nested probe. To reduce
71
71
* average overhead, linked list is manipulated each time PR_start is
76
76
* ticks, which on the scale of submicroseconds. Yet, to optimise really
77
77
* fast and frequent sections of code, we want to reduce this overhead
78
78
* to absolute minimum possible.
80
80
* For actual measurements, probe overhead cancels out mostly. Still,
81
81
* do not take the measured times as facts, they should be viewed in
82
82
* relative comparison to overall CPU time and on the same platform.
84
84
* Every 1 second, Event within squid is called that parses gathered
85
85
* statistics of every probe, and accumulates that into historical
86
86
* structures for last 1,5,30 secs, 1,5,30 mins, and 1,5 and 24 hours.
87
87
* Each second active probe stats are reset, and only historical data
88
88
* is presented in cachemgr output.
91
91
* "Worst case" may be misleading. Anything can happen at any section
92
92
* of code that could delay reaching to probe stop. For eg. system may
98
98
* time in given timeframe, which really show percentage of time spent
99
99
* in given section of code, and its average completion time. This data
100
100
* could be used to detect bottlenecks withing squid and optimise them.
102
102
* TOTALS are quite off reality. Its there just to summarise cumulative
103
103
* times and percent column. Percent values over 100% shows that there
104
104
* have been some probes nested into each other.
108
108
#include "profiling.h"
131
131
#define MAXSTACKDEPTH 512
133
133
struct _callstack_entry {
134
int timer; /* index into timers array */
136
hrtime_t start, stop, accum;
134
int timer; /* index into timers array */
136
hrtime_t start, stop, accum;
139
139
struct _callstack_entry cstack[MAXSTACKDEPTH];
147
147
xprof_update(xprof_stats_data * head)
149
149
if (head->delta < head->best)
150
head->best = head->delta;
150
head->best = head->delta;
151
151
if (head->worst < head->delta)
152
head->worst = head->delta;
152
head->worst = head->delta;
153
153
head->summ += head->delta;
178
178
hrtime_t tt = get_tick();
179
179
if (!xprof_inited)
182
182
/* If nested, stop current stack frame */
183
183
if (cstack_head > 0) {
184
cstack[cstack_head - 1].accum += get_tick() - cstack[cstack_head - 1].start;
185
cstack[cstack_head - 1].start = -1;
184
cstack[cstack_head - 1].accum += get_tick() - cstack[cstack_head - 1].start;
185
cstack[cstack_head - 1].start = -1;
188
188
/* Are we at the first level? If so; stop the unaccounted timer */
189
189
if (cstack_head == 0) {
190
assert(xp_UNACCOUNTED->start != -1);
191
xp_UNACCOUNTED->delta = tt - xp_UNACCOUNTED->start;
192
xp_UNACCOUNTED->start = -1;
193
xprof_update(xp_UNACCOUNTED);
190
assert(xp_UNACCOUNTED->start != -1);
191
xp_UNACCOUNTED->delta = tt - xp_UNACCOUNTED->start;
192
xp_UNACCOUNTED->start = -1;
193
xprof_update(xp_UNACCOUNTED);
196
196
/* Allocate new stack frame */
223
223
/* Restart previous timer if we're not at the top level */
224
224
if (cstack_head > 0) {
225
cstack[cstack_head - 1].start = tt;
226
cstack[cstack_head - 1].stop = 0;
225
cstack[cstack_head - 1].start = tt;
226
cstack[cstack_head - 1].stop = 0;
229
229
/* Get here? We're at the top level; unaccounted */
230
230
xp_UNACCOUNTED->start = tt;