1
// Copyright 2009 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
11
#define Ureg Ureg_amd64
12
#include <ureg_amd64.h>
23
// Register snapshots, one per supported architecture.
// The amd64 register printer and the amd64 PC/SP accessors read ureg_amd64
// (ip and sp respectively); x86_getregs fills ureg_x86 one 32-bit word at a
// time with get4, and the x86 PC/SP accessors read ureg_x86.pc / ureg_x86.sp.
struct Ureg_amd64 ureg_amd64;
24
struct Ureg_x86 ureg_x86;
30
// pprof data, stored as sequences of N followed by N PC values.
31
// See http://code.google.com/p/google-perftools .
32
uvlong *ppdata; // traces
33
Biobuf* pproffd; // file descriptor to write trace info
34
long ppstart; // start position of current trace
35
long nppdata; // length of data
36
long ppalloc; // size of allocated data
37
char ppmapdata[10*1024]; // the map information for the output file
40
int pprof; // print pprof output to named file
41
int functions; // print functions
42
int histograms; // print histograms
43
int linenums; // print file and line numbers rather than function names
44
int registers; // print registers
45
int stacks; // print stack traces
47
int pid; // main process pid
49
int nthread; // number of threads
50
int thread[32]; // thread pids
51
Map *map[32]; // thread maps
56
fprint(2, "Usage: prof -p pid [-t total_secs] [-d delta_msec]\n");
57
fprint(2, " prof [-t total_secs] [-d delta_msec] 6.out args ...\n");
58
fprint(2, "\tformats (default -h):\n");
59
fprint(2, "\t\t-P file.prof: write [c]pprof output to file.prof\n");
60
fprint(2, "\t\t-h: histograms\n");
61
fprint(2, "\t\t-f: dynamic functions\n");
62
fprint(2, "\t\t-l: dynamic file and line numbers\n");
63
fprint(2, "\t\t-r: dynamic registers\n");
64
fprint(2, "\t\t-s: dynamic function stack traces\n");
65
fprint(2, "\t\t-hs: include stack info in histograms\n");
81
// Hash table of per-(pc, callerpc) sample counters. addtohistogram selects the
// bucket as (pc + callerpc*101) % Ncounters and chains entries through next.
PC *counters[Ncounters];
83
// Set up by setarch() to make most of the code architecture-independent.
84
typedef struct Arch Arch;
87
void (*regprint)(void);
91
uvlong (*uregPC)(void);
92
uvlong (*uregSP)(void);
93
void (*ppword)(uvlong w);
99
fprint(2, "ax\t0x%llux\n", ureg_amd64.ax);
100
fprint(2, "bx\t0x%llux\n", ureg_amd64.bx);
101
fprint(2, "cx\t0x%llux\n", ureg_amd64.cx);
102
fprint(2, "dx\t0x%llux\n", ureg_amd64.dx);
103
fprint(2, "si\t0x%llux\n", ureg_amd64.si);
104
fprint(2, "di\t0x%llux\n", ureg_amd64.di);
105
fprint(2, "bp\t0x%llux\n", ureg_amd64.bp);
106
fprint(2, "r8\t0x%llux\n", ureg_amd64.r8);
107
fprint(2, "r9\t0x%llux\n", ureg_amd64.r9);
108
fprint(2, "r10\t0x%llux\n", ureg_amd64.r10);
109
fprint(2, "r11\t0x%llux\n", ureg_amd64.r11);
110
fprint(2, "r12\t0x%llux\n", ureg_amd64.r12);
111
fprint(2, "r13\t0x%llux\n", ureg_amd64.r13);
112
fprint(2, "r14\t0x%llux\n", ureg_amd64.r14);
113
fprint(2, "r15\t0x%llux\n", ureg_amd64.r15);
114
fprint(2, "ds\t0x%llux\n", ureg_amd64.ds);
115
fprint(2, "es\t0x%llux\n", ureg_amd64.es);
116
fprint(2, "fs\t0x%llux\n", ureg_amd64.fs);
117
fprint(2, "gs\t0x%llux\n", ureg_amd64.gs);
118
fprint(2, "type\t0x%llux\n", ureg_amd64.type);
119
fprint(2, "error\t0x%llux\n", ureg_amd64.error);
120
fprint(2, "pc\t0x%llux\n", ureg_amd64.ip);
121
fprint(2, "cs\t0x%llux\n", ureg_amd64.cs);
122
fprint(2, "flags\t0x%llux\n", ureg_amd64.flags);
123
fprint(2, "sp\t0x%llux\n", ureg_amd64.sp);
124
fprint(2, "ss\t0x%llux\n", ureg_amd64.ss);
128
amd64_getregs(Map *map)
133
struct Ureg_amd64 ureg;
136
for(i = 0; i < sizeof ureg_amd64; i+=8) {
137
if(get8(map, (uvlong)i, &u.regs[i/8]) < 0)
145
amd64_getPC(Map *map)
150
r = get8(map, offsetof(struct Ureg_amd64, ip), &x);
156
amd64_getSP(Map *map)
161
r = get8(map, offsetof(struct Ureg_amd64, sp), &x);
169
return ureg_amd64.ip;
174
return ureg_amd64.sp;
178
amd64_ppword(uvlong w)
190
Bwrite(pproffd, buf, 8);
196
fprint(2, "ax\t0x%ux\n", ureg_x86.ax);
197
fprint(2, "bx\t0x%ux\n", ureg_x86.bx);
198
fprint(2, "cx\t0x%ux\n", ureg_x86.cx);
199
fprint(2, "dx\t0x%ux\n", ureg_x86.dx);
200
fprint(2, "si\t0x%ux\n", ureg_x86.si);
201
fprint(2, "di\t0x%ux\n", ureg_x86.di);
202
fprint(2, "bp\t0x%ux\n", ureg_x86.bp);
203
fprint(2, "ds\t0x%ux\n", ureg_x86.ds);
204
fprint(2, "es\t0x%ux\n", ureg_x86.es);
205
fprint(2, "fs\t0x%ux\n", ureg_x86.fs);
206
fprint(2, "gs\t0x%ux\n", ureg_x86.gs);
207
fprint(2, "cs\t0x%ux\n", ureg_x86.cs);
208
fprint(2, "flags\t0x%ux\n", ureg_x86.flags);
209
fprint(2, "pc\t0x%ux\n", ureg_x86.pc);
210
fprint(2, "sp\t0x%ux\n", ureg_x86.sp);
211
fprint(2, "ss\t0x%ux\n", ureg_x86.ss);
215
x86_getregs(Map *map)
219
for(i = 0; i < sizeof ureg_x86; i+=4) {
220
if(get4(map, (uvlong)i, &((uint32*)&ureg_x86)[i/4]) < 0)
229
return get4(map, offsetof(struct Ureg_x86, pc), &ureg_x86.pc);
235
return get4(map, offsetof(struct Ureg_x86, sp), &ureg_x86.sp);
241
return (uvlong)ureg_x86.pc;
247
return (uvlong)ureg_x86.sp;
259
Bwrite(pproffd, buf, 4);
296
for(i = 0; archtab[i].name != nil; i++) {
297
if (strcmp(mach->name, archtab[i].name) == 0) {
309
int i, j, curn, found;
310
Map *curmap[nelem(map)];
311
int curthread[nelem(map)];
312
static int complained = 0;
314
curn = procthreadpids(pid, curthread, nelem(curthread));
318
if(curn > nelem(map)) {
319
if(complained == 0) {
320
fprint(2, "prof: too many threads; limiting to %d\n", nthread, nelem(map));
325
if(curn == nthread && memcmp(thread, curthread, curn*sizeof(*thread)) == 0)
326
return curn; // no changes
328
// Number of threads has changed (might be the init case).
329
// A bit expensive but rare enough not to bother being clever.
330
for(i = 0; i < curn; i++) {
332
for(j = 0; j < nthread; j++) {
333
if(curthread[i] == thread[j]) {
344
curmap[i] = attachproc(curthread[i], &fhdr);
345
if(curmap[i] == nil) {
346
fprint(2, "prof: can't attach to %d: %r\n", curthread[i]);
351
for(j = 0; j < nthread; j++)
356
memmove(thread, curthread, nthread*sizeof thread[0]);
357
memmove(map, curmap, sizeof map);
368
if(arch->getregs(map) < 0)
371
// we need only two registers
372
if(arch->getPC(map) < 0)
374
if(arch->getSP(map) < 0)
380
fprint(2, "prof: can't read registers: %r\n");
385
addtohistogram(uvlong pc, uvlong callerpc, uvlong sp)
390
h = (pc + callerpc*101) % Ncounters;
391
for(x = counters[h]; x != NULL; x = x->next) {
392
if(x->pc == pc && x->callerpc == callerpc) {
397
x = malloc(sizeof(PC));
399
x->callerpc = callerpc;
401
x->next = counters[h];
411
if(nppdata == ppalloc) {
412
ppalloc = (1000+nppdata)*2;
413
ppdata = realloc(ppdata, ppalloc * sizeof ppdata[0]);
415
fprint(2, "prof: realloc failed: %r\n");
419
ppdata[nppdata++] = pc;
432
ppdata[ppstart] = nppdata-ppstart-1;
438
xptrace(Map *map, uvlong pc, uvlong sp, Symbol *sym)
446
addtohistogram(nextpc, pc, sp);
447
if(!histograms || stacks > 1 || pprof) {
451
fprint(2, "%s(", sym->name);
453
if(nextpc != sym->value)
454
fprint(2, "+%#llux ", nextpc - sym->value);
455
if(have_syms && linenums && fileline(buf, sizeof buf, pc)) {
456
fprint(2, " %s", buf);
468
stacktracepcsp(Map *map, uvlong pc, uvlong sp)
474
if(machdata->ctrace==nil)
475
fprint(2, "no machdata->ctrace\n");
476
else if(machdata->ctrace(map, pc, sp, 0, xptrace) <= 0)
477
// pc and sp are uvlong, so use the 64-bit hex verb (as xptrace does for
// offsets) rather than %#p, which consumes a pointer-sized argument.
fprint(2, "no stack frame: pc=%#llux sp=%#llux\n", pc, sp);
479
addtohistogram(nextpc, 0, sp);
489
printpc(Map *map, uvlong pc, uvlong sp)
494
if(have_syms > 0 && linenums && fileline(buf, sizeof buf, pc))
495
fprint(2, "%s\n", buf);
496
if(have_syms > 0 && functions) {
497
symoff(buf, sizeof(buf), pc, CANY);
498
fprint(2, "%s\n", buf);
501
stacktracepcsp(map, pc, sp);
504
addtohistogram(pc, 0, sp);
515
// If it's Linux, the info is in /proc/$pid/maps
516
snprint(tmp, sizeof tmp, "/proc/%d/maps", pid);
519
n = read(fd, ppmapdata, sizeof ppmapdata - 1);
522
fprint(2, "prof: can't read %s: %r\n", tmp);
529
// It's probably a mac. Synthesize an entry for the text file.
530
// The register segment may come first but it has a zero offset, so grab the first non-zero offset segment.
531
for(n = 0; n < 3; n++){
532
seg = &map[0]->seg[n];
536
snprint(ppmapdata, sizeof ppmapdata,
537
"%.16x-%.16x r-xp %d 00:00 34968549 %s\n",
538
// Name the profiled binary itself in the synthesized mapping entry.
seg->b, seg->e, seg->f, file
542
fprint(2, "prof: no text segment in maps for %s\n", file);
553
req.tv_sec = delta_msec/1000;
554
req.tv_nsec = 1000000*(delta_msec % 1000);
558
for(msec = 0; total_sec <= 0 || msec < 1000*total_sec; msec += delta_msec) {
560
nsamplethread += nthread;
561
for(i = 0; i < nthread; i++) {
563
if(ctlproc(pid, "stop") < 0)
565
if(!sample(map[i])) {
566
ctlproc(pid, "start");
569
printpc(map[i], arch->uregPC(), arch->uregSP());
570
ctlproc(pid, "start");
572
nanosleep(&req, NULL);
583
typedef struct Func Func;
605
if(!findsym(pc, CTEXT, &s))
608
h = s.value % nelem(func);
609
for(f = func[h]; f != NULL; f = f->next)
610
if(f->s.value == s.value)
613
f = malloc(sizeof *f);
614
memset(f, 0, sizeof *f);
623
compareleaf(const void *va, const void *vb)
629
if(a->leaf != b->leaf)
630
return b->leaf - a->leaf;
631
if(a->onstack != b->onstack)
632
return b->onstack - a->onstack;
633
return strcmp(a->s.name, b->s.name);
646
// assign counts to functions.
647
for(h = 0; h < Ncounters; h++) {
648
for(x = counters[h]; x != NULL; x = x->next) {
651
f->onstack += x->count;
654
f = findfunc(x->callerpc);
661
ff = malloc(nfunc*sizeof ff[0]);
663
for(h = 0; h < nelem(func); h++)
664
for(f = func[h]; f != NULL; f = f->next)
667
// sort by leaf counts
668
qsort(ff, nfunc, sizeof ff[0], compareleaf);
671
fprint(2, "%d samples (avg %.1g threads)\n", nsample, (double)nsamplethread/nsample);
672
for(i = 0; i < nfunc; i++) {
674
fprint(2, "%6.2f%%\t", 100.0*(double)f->leaf/nsample);
676
fprint(2, "%6.2f%%\t", 100.0*(double)f->onstack/nsample);
677
fprint(2, "%s\n", f->s.name);
681
typedef struct Trace Trace;
694
Trace *trace, *tp, *up, *prev;
698
e = ppdata + nppdata;
699
// Create list of traces. First, count the traces
701
for(p = ppdata; p < e;) {
710
// Allocate and link the traces together.
711
trace = malloc(ntrace * sizeof(Trace));
713
for(p = ppdata; p < e;) {
724
trace[ntrace-1].next = nil;
725
// Eliminate duplicates. Lousy algorithm, although not as bad as it looks because
726
// the list collapses fast.
727
for(tp = trace; tp != nil; tp = tp->next) {
729
for(up = tp->next; up != nil; up = up->next) {
730
if(up->npc == tp->npc && memcmp(up->pc, tp->pc, up->npc*sizeof up->pc[0]) == 0) {
732
prev->next = up->next;
739
// See http://code.google.com/p/google-perftools/source/browse/trunk/doc/cpuprofile-fileformat.html
741
arch->ppword(0); // must be zero
742
arch->ppword(3); // 3 words follow in header
743
arch->ppword(0); // must be zero
744
arch->ppword(delta_msec * 1000); // sampling period in microseconds
745
arch->ppword(0); // must be zero (padding)
746
// 2) One record for each trace.
747
for(tp = trace; tp != nil; tp = tp->next) {
748
arch->ppword(tp->count);
749
arch->ppword(tp->npc);
750
for(i = 0; i < tp->npc; i++) {
751
arch->ppword(tp->pc[i]);
755
arch->ppword(0); // must be zero
756
arch->ppword(1); // must be one
757
arch->ppword(0); // must be zero
758
// 4) Mapped objects.
759
Bwrite(pproffd, ppmapdata, strlen(ppmapdata));
765
startprocess(char **argv)
769
if((pid = fork()) == 0) {
771
if(ctlproc(pid, "hang") < 0){
772
fprint(2, "prof: child process could not hang\n");
775
execv(argv[0], argv);
776
fprint(2, "prof: could not exec %s: %r\n", argv[0]);
781
fprint(2, "prof: could not fork\n");
784
if(ctlproc(pid, "attached") < 0 || ctlproc(pid, "waitstop") < 0) {
785
fprint(2, "prof: could not attach to child process: %r\n");
796
for(i = 0; i < nthread; i++)
801
main(int argc, char *argv[])
809
ppfile = EARGF(Usage());
810
pproffd = Bopen(ppfile, OWRITE);
812
fprint(2, "prof: cannot open %s: %r\n", ppfile);
817
delta_msec = atoi(EARGF(Usage()));
820
total_sec = atoi(EARGF(Usage()));
823
pid = atoi(EARGF(Usage()));
843
if(pid <= 0 && argc == 0)
845
if(functions+linenums+registers+stacks+pprof == 0)
847
if(!machbyname("amd64")) {
848
// The message has no conversions, so it takes no extra arguments.
fprint(2, "prof: no amd64 support\n");
854
file = proctextfile(pid);
856
fprint(2, "prof: can't find file for pid %d: %r\n", pid);
857
fprint(2, "prof: on Darwin, need to provide file name explicitly\n");
863
fprint(2, "prof: can't open %s: %r\n", file);
866
if(crackhdr(fd, &fhdr)) {
867
have_syms = syminit(fd, &fhdr);
869
fprint(2, "prof: no symbols for %s: %r\n", file);
872
fprint(2, "prof: crack header for %s: %r\n", file);
876
pid = startprocess(argv);
877
attachproc(pid, &fhdr); // initializes thread list
880
fprint(2, "prof: can't identify binary architecture for pid %d\n", pid);
883
if(getthreads() <= 0) {
885
fprint(2, "prof: can't find threads for pid %d\n", pid);
888
for(i = 0; i < nthread; i++)
889
ctlproc(thread[i], "start");