2
* @file operf_utils.cpp
3
* Helper methods for perf_events-based OProfile.
5
* @remark Copyright 2011 OProfile authors
6
* @remark Read the file COPYING
8
* Created on: Dec 7, 2011
9
* @author Maynard Johnson
10
* (C) Copyright IBM Corp. 2011
12
* Modified by Maynard Johnson <maynardj@us.ibm.com>
13
* (C) Copyright IBM Corporation 2012
25
#include "operf_counter.h"
26
#include "operf_utils.h"
28
#include <perfmon/pfmlib.h>
31
#include "operf_process_info.h"
32
#include "file_manip.h"
33
#include "operf_kernel.h"
34
#include "operf_sfile.h"
35
#include "op_fileio.h"
36
#include "op_libiberty.h"
37
#include "operf_stats.h"
41
// Symbols that are only declared here; their definitions are not part of this file.
extern volatile bool quit;
42
extern volatile bool read_quit;
43
extern operf_read operfRead;
44
extern int sample_reads;
45
extern unsigned int pagesize;
46
extern char * app_name;
48
extern verbose vrecord;
49
extern verbose vconvert;
53
// Per-process bookkeeping objects, looked up by the pid carried in FORK, COMM, MMAP and sample records.
map<pid_t, operf_process_info *> process_map;
54
// User-space mappings created by __handle_mmap_event(), keyed by the basename of the mapped image.
multimap<string, struct operf_mmap *> all_images_map;
55
// Kernel module mappings, keyed by their start address.
map<u64, struct operf_mmap *> kernel_modules;
56
// The kernel mapping whose filename matched operf_get_vmlinux_name().
struct operf_mmap * kernel_mmap;
57
// While true, sample events that cannot be finished yet are copied onto unresolved_events.
bool first_time_processing;
62
// Copies of sample events waiting for op_reprocess_unresolved_events().
static list<event_t *> unresolved_events;
63
// Context of the sample being converted; it is kept between samples so a matching follow-up sample can reuse it.
static struct operf_transient trans;
64
// Set to true by op_write_event() when it handles its first COMM record.
static bool sfile_init_done;
66
/* The handling of mmap's for a process was a bit tricky to get right, in particular,
67
* the handling of what I refer to as "deferred mmap's" -- i.e., when we receive an
68
* mmap event for which we've not yet received a comm event (so we don't know app name
69
* for the process). I have left in some debugging code here (compiled out via #ifdef)
70
* so we can easily test and validate any changes we ever may need to make to this code.
72
//#define _TEST_DEFERRED_MAPPING
73
// Test-only state: do_comm_event gates the COMM handler, and comm_event is the event the MMAP handler passes to it.
#ifdef _TEST_DEFERRED_MAPPING
74
static bool do_comm_event;
75
static event_t comm_event;
79
/* Some architectures (e.g., ppc64) do not use the same event value (code) for oprofile
80
* and for perf_events. The operf-record process requires event values that perf_events
81
* understands, but the operf-read process requires oprofile event values. The purpose of
82
* the following method is to map the operf-record event value to a value that
83
* opreport can understand.
85
#if (defined(__powerpc__) || defined(__powerpc64__))
89
// Stores the libpfm event code into every element of evt_vec whose name matches `name`.
//
// pfm_idx  index passed to pfm_get_event_code() to obtain the code
// name     event name that was reported for pfm_idx
// evt_vec  events to convert; a converted element is written back with evt_code set
//
// Returns true when events_converted equals the number of elements in evt_vec.
// Throws runtime_error when pfm_get_event_code() does not return PFMLIB_SUCCESS.
static bool _get_codes_for_match(unsigned int pfm_idx, const char name[],
90
vector<operf_event_t> * evt_vec)
92
unsigned int num_events = evt_vec->size();
94
char evt_name[OP_MAX_EVT_NAME_LEN];
96
unsigned int events_converted = 0;
97
for (unsigned int i = 0; i < num_events; i++) {
98
operf_event_t event = (*evt_vec)[i];
99
// Elements whose code is no longer NIL_CODE take this branch (its body is not visible in this excerpt).
if (event.evt_code != NIL_CODE) {
103
// Build the name to compare against: "CYCLES" is replaced by "PM_CYC", and a name containing
// "_GRP" is cut off right before that suffix.
memset(evt_name, 0, OP_MAX_EVT_NAME_LEN);
104
if (!strcmp(event.name, "CYCLES")) {
105
strcpy(evt_name ,"PM_CYC") ;
106
} else if ((grp_name = strstr(event.name, "_GRP"))) {
107
// NOTE(review): both strncpy lengths below come from event.name, not from the size of evt_name;
// confirm event names can never exceed OP_MAX_EVT_NAME_LEN.
strncpy(evt_name, event.name, grp_name - event.name);
109
strncpy(evt_name, event.name, strlen(event.name));
111
// A non-zero strncmp result means the names differ (the statement guarded by this test is not visible here).
if (strncmp(name, evt_name, OP_MAX_EVT_NAME_LEN))
113
ret = pfm_get_event_code(pfm_idx, &tmp_code);
114
if (ret != PFMLIB_SUCCESS) {
115
string evt_name_str = event.name;
116
string msg = "libpfm cannot find event code for " + evt_name_str +
118
throw runtime_error(msg);
120
// The loop works on a copy, so the updated element has to be stored back into the vector.
event.evt_code = tmp_code;
121
(*evt_vec)[i] = event;
123
cverb << vrecord << "Successfully converted " << event.name << " to perf_event code "
124
<< hex << tmp_code << endl;
126
return (events_converted == num_events);
129
// Converts every event in evt_vec to a perf_events code by asking libpfm for the event's OS encoding.
//
// evt_vec  events to convert; each element is written back with evt_code set to raw.codes[0]
//
// Returns true when events_converted equals the number of elements in evt_vec.
// Throws runtime_error when pfm_initialize() or pfm_get_os_event_encoding() does not return PFM_SUCCESS.
static bool _op_get_event_codes(vector<operf_event_t> * evt_vec)
132
unsigned int num_events = evt_vec->size();
133
char evt_name[OP_MAX_EVT_NAME_LEN];
135
unsigned int events_converted = 0;
144
} pfm_raw_pmu_encode_t;
146
pfm_raw_pmu_encode_t raw;
151
if (pfm_initialize() != PFM_SUCCESS)
152
throw runtime_error("Unable to initialize libpfm; cannot continue");
154
for (unsigned int i = 0; i < num_events; i++) {
155
operf_event_t event = (*evt_vec)[i];
156
// Build the name handed to libpfm: "CYCLES" is replaced by "PM_CYC", and a name containing
// "_GRP" is cut off right before that suffix.
memset(evt_name, 0, OP_MAX_EVT_NAME_LEN);
157
if (!strcmp(event.name, "CYCLES")) {
158
strcpy(evt_name ,"PM_CYC") ;
159
} else if ((grp_name = strstr(event.name, "_GRP"))) {
160
// NOTE(review): both strncpy lengths below come from event.name, not from the size of evt_name;
// confirm event names can never exceed OP_MAX_EVT_NAME_LEN.
strncpy(evt_name, event.name, grp_name - event.name);
162
strncpy(evt_name, event.name, strlen(event.name));
165
// Start from a zeroed result structure for each lookup.
memset(&raw, 0, sizeof(raw));
166
ret = pfm_get_os_event_encoding(evt_name, PFM_PLM3, PFM_OS_NONE, &raw);
167
if (ret != PFM_SUCCESS) {
168
string evt_name_str = event.name;
169
string msg = "libpfm cannot find event code for " + evt_name_str +
171
throw runtime_error(msg);
174
// Only the first returned code is used; the loop works on a copy, so store the element back.
event.evt_code = raw.codes[0];
175
(*evt_vec)[i] = event;
177
cverb << vrecord << "Successfully converted " << event.name << " to perf_event code "
178
<< hex << event.evt_code << endl;
180
return (events_converted == num_events);
184
// Entry point for converting the event codes in evt_vec to values that perf_events understands.
//
// Two strategies are visible below: an index-by-index walk over the libpfm event table that calls
// _get_codes_for_match(), and a direct hand-off to _op_get_event_codes().
// NOTE(review): the conditional compilation that selects between them is not visible in this excerpt.
//
// Throws runtime_error when libpfm cannot be initialized or cannot report its number of events.
bool OP_perf_utils::op_convert_event_vals(vector<operf_event_t> * evt_vec)
186
unsigned int i, count;
189
// Mark every event as not yet converted before any matching is attempted.
for (unsigned int i = 0; i < evt_vec->size(); i++) {
190
operf_event_t event = (*evt_vec)[i];
191
event.evt_code = NIL_CODE;
192
(*evt_vec)[i] = event;
196
if (pfm_initialize() != PFMLIB_SUCCESS)
197
throw runtime_error("Unable to initialize libpfm; cannot continue");
199
ret = pfm_get_num_events(&count);
200
if (ret != PFMLIB_SUCCESS)
201
throw runtime_error("Unable to use libpfm to obtain event code; cannot continue");
202
// Offer each libpfm event name to _get_codes_for_match(), which reports true once all events are converted.
for(i =0 ; i < count; i++)
204
ret = pfm_get_event_name(i, name, 256);
205
if (ret != PFMLIB_SUCCESS)
207
if (_get_codes_for_match(i, name, evt_vec))
212
return _op_get_event_codes(evt_vec);
219
// Copies the current/pc fields of the given context into its last/last_pc fields.
static inline void update_trans_last(struct operf_transient * trans)
221
trans->last = trans->current;
222
trans->last_pc = trans->pc;
225
// Resets the given context; of the resets performed, only the one for cur_procinfo is visible in this excerpt.
static inline void clear_trans(struct operf_transient * trans)
228
trans->cur_procinfo = NULL;
231
// Handles a PERF_RECORD_FORK record.
//
// Makes sure process_map has an entry for the parent (fork.ppid) and for the child (fork.pid).
// A newly created child is linked to its parent; an already known child is linked only when it
// is still marked invalid.
static void __handle_fork_event(event_t * event)
233
if (cverb << vconvert)
234
cout << "PERF_RECORD_FORK for tgid/tid = " << event->fork.pid
235
<< "/" << event->fork.tid << endl;
237
map<pid_t, operf_process_info *>::iterator it;
238
operf_process_info * parent = NULL;
239
operf_process_info * forked_proc = NULL;
241
// Step 1: locate the parent (the body of the "found" branch is not visible in this excerpt).
it = process_map.find(event->fork.ppid);
242
if (it != process_map.end()) {
245
// Create a new proc info object for the parent, but mark it invalid since we have
246
// not yet received a COMM event for this PID.
247
parent = new operf_process_info(event->fork.ppid, app_name ? app_name : NULL,
248
app_name != NULL, false);
249
if (cverb << vconvert)
250
cout << "Adding new proc info to collection for PID " << event->fork.ppid << endl;
251
process_map[event->fork.ppid] = parent;
254
// Step 2: locate or create the child.
it = process_map.find(event->fork.pid);
255
if (it == process_map.end()) {
256
// Unknown child: it starts out with the parent's app name and validity flags.
forked_proc = new operf_process_info(event->fork.pid,
257
parent->get_app_name().c_str(),
258
parent->is_appname_valid(), parent->is_valid());
259
if (cverb << vconvert)
260
cout << "Adding new proc info to collection for PID " << event->fork.pid << endl;
261
process_map[event->fork.pid] = forked_proc;
262
// The association is recorded on both objects.
forked_proc->connect_forked_process_to_parent(parent);
263
parent->add_forked_pid_association(forked_proc);
264
if (cverb << vconvert)
265
cout << "Connecting forked proc " << event->fork.pid << " to parent" << endl;
267
/* There are two ways that we may get to this point. One way is if
268
* we've received a COMM event for the forked process before the FORK event.
269
* Normally, if parent process A forks child process B which then does an exec, we
270
* first see a FORK event, followed by a COMM event. But apparently there's no
271
* guarantee in what order these events may be seen by userspace. No matter -- since
272
* the exec'ed process is now a standalone process (which will get MMAP events
273
* for all of its mmappings, there's no need to re-associate it back to the parent
274
* as we do for a non-exec'ed forked process. So we'll just ignore it.
276
* But the second way that there may be an existing operf_process_info object is if
277
* a new mmap event (a real MMAP event or a synthesized event (e.g. for hypervisor
278
* mmapping) occurred for the forked process before a COMM event was received for it.
279
* In this case, the forked process will be marked invalid until the COMM event
280
* is received. But if this process does *not* do an exec, there will never be a
281
* COMM event for it. Such forked processes should be tightly connected to their
282
* parent, so we'll go ahead and associate the forked process with its parent.
283
* If a COMM event comes later for the forked process, we'll disassociate them.
285
// Known child: link it to the parent only while it is still marked invalid.
forked_proc = it->second;
286
if (!forked_proc->is_valid()) {
287
forked_proc->connect_forked_process_to_parent(parent);
288
parent->add_forked_pid_association(forked_proc);
289
if (cverb << vconvert)
290
cout << "Connecting existing incomplete forked proc " << event->fork.pid
291
<< " to parent" << endl;
296
// Handles a PERF_RECORD_COMM record.
//
// For a pid that is not yet in process_map, a new, valid operf_process_info is created.
// For a known pid the visible code does one of three things: it detaches a valid, forked process
// from its parent, it reports the event as extraneous, or it processes the deferred mappings.
// NOTE(review): the else lines that separate these branches are not visible in this excerpt.
static void __handle_comm_event(event_t * event)
298
// Test hook: the handler consults do_comm_event first (the body of this test is not visible here).
#ifdef _TEST_DEFERRED_MAPPING
299
if (!do_comm_event) {
304
if (cverb << vconvert)
305
cout << "PERF_RECORD_COMM for " << event->comm.comm << ", tgid/tid = "
306
<< event->comm.pid << "/" << event->comm.tid << endl;
308
map<pid_t, operf_process_info *>::iterator it;
309
it = process_map.find(event->comm.pid);
310
if (it == process_map.end()) {
311
/* TODO: Handle system housekeeping tasks. For certain kinds of processes,
312
* we will get a COMM event, but never get an MMAP event (e.g, kpsmoused).
313
* Without receiving an MMAP event, we have no clue whether the name given
314
* with the COMM event is a full "appname" or not, so the operf_process_info
315
* is marked invalid. We end up dropping all samples for such tasks when
316
* doing a system-wide profile.
319
/* A COMM event can occur as the result of the app doing a fork/exec,
320
* where the COMM event is for the forked process. In that case, we
321
* pass the event->comm field as the appname argument to the ctor.
323
const char * appname_arg;
324
bool is_complete_appname;
325
// Only the process that matches app_PID gets app_name and is flagged as having a complete app name.
if (app_name && (app_PID == event->comm.pid)) {
326
appname_arg = app_name;
327
is_complete_appname = true;
329
appname_arg = event->comm.comm;
330
is_complete_appname = false;
332
operf_process_info * proc = new operf_process_info(event->comm.pid,appname_arg,
333
is_complete_appname, true);
334
if (cverb << vconvert)
335
cout << "Adding new proc info to collection for PID " << event->comm.pid << endl;
336
process_map[event->comm.pid] = proc;
338
if (it->second->is_valid()) {
339
if (it->second->is_forked()) {
340
/* If the operf_process_info object we found was created as a result of
341
* a FORK event, then it was associated with the parent process and contains
342
* the parent's appname. But now we're getting a COMM event for this forked
343
* process, which means it did an exec, so we need to change the appname
344
* to the executable associated with this COMM event, which is done via
345
* calling disassociate_from_parent().
347
if (cverb << vconvert)
348
cout << "Disassociating forked proc " << event->comm.pid
349
<< " from parent" << endl;
350
it->second->disassociate_from_parent(event->comm.comm);
352
if (cverb << vconvert)
353
cout << "Received extraneous COMM event for " << event->comm.comm
354
<< ", PID " << event->comm.pid << endl;
357
if (cverb << vconvert)
358
cout << "Processing deferred mappings" << endl;
359
it->second->process_deferred_mappings(event->comm.comm);
364
// Handles a PERF_RECORD_MMAP record.
//
// Finds or creates the operf_mmap that describes the mapped image. A kernel-space mapping is
// remembered as kernel_mmap or in kernel_modules. A mapping tied to a process is handed to that
// process's operf_process_info: deferred while the process is not valid yet, processed right away otherwise.
static void __handle_mmap_event(event_t * event)
366
static bool kptr_restrict_warning_displayed_already = false;
367
string image_basename = op_basename(event->mmap.filename);
368
struct operf_mmap * mapping = NULL;
369
multimap<string, struct operf_mmap *>::iterator it;
370
pair<multimap<string, struct operf_mmap *>::iterator,
371
multimap<string, struct operf_mmap *>::iterator> range;
373
// Look for a mapping recorded earlier with the same name and start address.
// NOTE(review): the stored filename is copied from the full event->mmap.filename below, but it is
// compared against the basename here; confirm that a path containing directories can ever match.
range = all_images_map.equal_range(image_basename);
374
for (it = range.first; it != range.second; it++) {
375
if (((strcmp((*it).second->filename, image_basename.c_str())) == 0)
376
&& ((*it).second->start_addr == event->mmap.start)) {
377
mapping = (*it).second;
382
// Build a fresh, zero-initialized mapping from the event.
mapping = new struct operf_mmap;
383
memset(mapping, 0, sizeof(struct operf_mmap));
384
mapping->start_addr = event->mmap.start;
385
// NOTE(review): unbounded copy; the size of mapping->filename is not visible in this excerpt.
strcpy(mapping->filename, event->mmap.filename);
386
/* Mappings starting with "/" are for either a file or shared memory object.
387
* From the kernel's perf_events subsystem, anon maps have labels like:
388
* [heap], [stack], [vdso], //anon
390
if (mapping->filename[0] == '[') {
391
mapping->is_anon_mapping = true;
392
} else if ((strncmp(mapping->filename, "//anon",
393
strlen("//anon")) == 0)) {
394
mapping->is_anon_mapping = true;
395
strcpy(mapping->filename, "anon");
397
// end_addr is the last address inside the mapping; a zero length yields an end address of zero.
mapping->end_addr = (event->mmap.len == 0ULL)? 0ULL : mapping->start_addr + event->mmap.len - 1;
398
mapping->pgoff = event->mmap.pgoff;
400
if (cverb << vconvert) {
401
cout << "PERF_RECORD_MMAP for " << event->mmap.filename << endl;
402
cout << "\tstart_addr: " << hex << mapping->start_addr;
403
cout << "; end addr: " << mapping->end_addr << endl;
406
// Only user-space mappings are remembered in all_images_map.
if (event->header.misc & PERF_RECORD_MISC_USER)
407
all_images_map.insert(pair<string, struct operf_mmap *>(image_basename, mapping));
410
if (event->header.misc & PERF_RECORD_MISC_KERNEL) {
411
// The comparison length is that of the mapping's filename, so this is a prefix test against the vmlinux name.
if (!strncmp(mapping->filename, operf_get_vmlinux_name(),
412
strlen(mapping->filename))) {
413
/* The kernel_mmap is just a convenience variable
414
* for use when mapping samples to kernel space, since
415
* most of the kernel samples will be attributable to
416
* the vmlinux file versus kernel modules.
418
kernel_mmap = mapping;
420
// The warning below is printed at most once, guarded by the function-local static flag.
if ((kptr_restrict == 1) && !no_vmlinux && (my_uid != 0)) {
421
if (!kptr_restrict_warning_displayed_already) {
422
kptr_restrict_warning_displayed_already = true;
423
cerr << endl << "< < < WARNING > > >" << endl;
424
cerr << "Samples for vmlinux kernel will be recorded, but kernel module profiling"
425
<< endl << "is not possible with current system config." << endl;
426
cerr << "Set /proc/sys/kernel/kptr_restrict to 0 to see samples for kernel modules."
427
<< endl << "< < < < < > > > > >" << endl << endl;
430
operf_create_module(mapping->filename,
433
kernel_modules[mapping->start_addr] = mapping;
437
// Hand the mapping to the owning process.
map<pid_t, operf_process_info *>::iterator it;
438
it = process_map.find(event->mmap.pid);
439
if (it == process_map.end()) {
440
/* Create a new proc info object, but mark it invalid since we have
441
* not yet received a COMM event for this PID. This MMAP event may
442
* be on behalf of a process created as a result of a fork/exec.
443
* The order of delivery of events is not guaranteed so we may see
444
* this MMAP event before getting the COMM event for that process.
445
* If this is the case here, we just pass NULL for appname arg.
446
* It will get fixed up later when the COMM event occurs.
448
const char * appname_arg;
449
bool is_complete_appname;
450
if (app_name && (app_PID == event->mmap.pid)) {
451
appname_arg = app_name;
452
is_complete_appname = true;
455
is_complete_appname = false;
458
operf_process_info * proc = new operf_process_info(event->mmap.pid, appname_arg,
459
is_complete_appname, false);
460
proc->add_deferred_mapping(mapping);
461
if (cverb << vconvert)
462
cout << "Added deferred mapping " << event->mmap.filename
463
<< " for new process_info object" << endl;
464
process_map[event->mmap.pid] = proc;
465
// NOTE(review): this test-only call passes two arguments, but __handle_comm_event() in this file
// takes a single event_t pointer; the test path probably no longer compiles.
#ifdef _TEST_DEFERRED_MAPPING
466
if (!do_comm_event) {
467
do_comm_event = true;
468
__handle_comm_event(comm_event, out);
471
} else if (!it->second->is_valid()) {
472
it->second->add_deferred_mapping(mapping);
473
if (cverb << vconvert)
474
cout << "Added deferred mapping " << event->mmap.filename
475
<< " for existing but incomplete process_info object" << endl;
477
if (cverb << vconvert)
478
cout << "Process mapping for " << event->mmap.filename << " on behalf of "
479
<< event->mmap.pid << endl;
480
it->second->process_new_mapping(mapping);
485
// Resolves a sample to its process and to the mapping that contains data->ip, then loads the
// file-level `trans` object with the result.
//
// data               the decoded sample (pid, tid, ip, cpu and id are read here)
// hypervisor_domain  whether the sample was taken in the hypervisor domain
// (the rest of the parameter list is not visible in this excerpt; the body also reads kernel_mode)
//
// Callers test the returned pointer for non-NULL; retval starts out as NULL.
// NOTE(review): the return statements themselves are not visible in this excerpt.
static struct operf_transient * __get_operf_trans(struct sample_data * data, bool hypervisor_domain,
488
operf_process_info * proc = NULL;
489
const struct operf_mmap * op_mmap = NULL;
490
struct operf_transient * retval = NULL;
492
// Reuse the process cached in trans when the pid matches; otherwise look it up in process_map.
if (trans.tgid == data->pid) {
493
proc = trans.cur_procinfo;
494
if (cverb << vconvert)
495
cout << "trans.tgid == data->pid : " << data->pid << endl;
498
// Find operf_process info for data.tgid.
499
std::map<pid_t, operf_process_info *>::const_iterator it = process_map.find(data->pid);
500
if (it != process_map.end() && (it->second->is_appname_valid())) {
503
/* This can happen for the following reasons:
504
* - We get a sample before getting a COMM or MMAP
505
* event for the process being profiled
506
* - The COMM event has been processed, but since that
507
* only gives 16 chars of the app name, we don't
508
* have a valid app name yet
509
* - The kernel incorrectly records a sample for a
510
* process other than the one we requested (not
511
* likely -- this would be a kernel bug if it did)
514
if ((cverb << vconvert) && !first_time_processing) {
515
cerr << "Dropping sample -- process info unavailable" << endl;
517
// One of the two counters below is bumped (the test that chooses between them is not visible here).
operf_stats[OPERF_NO_APP_KERNEL_SAMPLE]++;
519
operf_stats[OPERF_NO_APP_USER_SAMPLE]++;
525
// Now find mmapping that contains the data.ip address.
526
// Use that mmapping to set fields in trans.
528
// Kernel path: try the vmlinux mapping first, then scan the kernel modules.
// NOTE(review): kernel_mmap is dereferenced without a visible NULL check; confirm it is always set by now.
if (data->ip >= kernel_mmap->start_addr &&
529
data->ip <= kernel_mmap->end_addr) {
530
op_mmap = kernel_mmap;
532
map<u64, struct operf_mmap *>::iterator it;
533
it = kernel_modules.begin();
534
while (it != kernel_modules.end()) {
535
if (data->ip >= it->second->start_addr &&
536
data->ip <= it->second->end_addr) {
537
op_mmap = it->second;
543
// A kernel mapping with start and end both zero is accepted as the mapping for the sample.
if ((kernel_mmap->start_addr == 0ULL) &&
544
(kernel_mmap->end_addr == 0ULL))
545
op_mmap = kernel_mmap;
548
/* This can happen if a kernel module is loaded after profiling
549
* starts, and then we get samples for that kernel module.
554
// User path: ask the process object for the mapping that holds the address.
op_mmap = proc->find_mapping_for_sample(data->ip);
555
if (op_mmap && op_mmap->is_hypervisor && !hypervisor_domain) {
556
cverb << vconvert << "Invalid sample: Address falls within hypervisor address range, but is not a hypervisor domain sample." << endl;
557
operf_stats[OPERF_INVALID_CTX]++;
562
// Mapping found: publish everything about this sample through trans.
if (cverb << vconvert)
563
cout << "Found mmap for sample; image_name is " << op_mmap->filename <<
564
" and app name is " << proc->get_app_name() << endl;
565
trans.image_name = op_mmap->filename;
566
// NOTE(review): this keeps a pointer obtained from c_str(); confirm get_app_name() returns a
// reference to storage that outlives trans rather than a temporary string.
trans.app_filename = proc->get_app_name().c_str();
567
trans.image_len = strlen(trans.image_name);
568
trans.app_len = strlen(trans.app_filename);
569
trans.start_addr = op_mmap->start_addr;
570
trans.end_addr = op_mmap->end_addr;
571
trans.tgid = data->pid;
572
trans.tid = data->tid;
573
trans.cur_procinfo = proc;
574
trans.cpu = data->cpu;
575
trans.is_anon = op_mmap->is_anon_mapping;
576
trans.in_kernel = kernel_mode;
577
// Two ways of computing trans.pc exist; only the offset-from-start form is visible in this excerpt.
if (trans.in_kernel || trans.is_anon)
580
trans.pc = data->ip - trans.start_addr;
582
trans.sample_id = data->id;
585
// No mapping found: the sample is counted as lost.
if ((cverb << vconvert) && !first_time_processing) {
586
string domain = trans.in_kernel ? "kernel" : "userspace";
587
cerr << "Discarding " << domain << " sample for process " << data->pid
588
<< " where no appropriate mapping was found. (pc=0x"
589
<< hex << data->ip <<")" << endl;
590
operf_stats[OPERF_LOST_NO_MAPPING]++;
598
// Walks the callchain attached to a sample and logs an arc for each resolvable entry.
//
// array  location inside the sample record where the callchain starts
// data   sample being processed; data->callchain and data->ip are overwritten here
static void __handle_callchain(u64 * array, struct sample_data * data)
600
bool in_kernel = false;
601
data->callchain = (struct ip_callchain *) array;
602
if (data->callchain->nr) {
603
if (cverb << vconvert)
604
cout << "Processing callchain" << endl;
605
for (int i = 0; i < data->callchain->nr; i++) {
606
data->ip = data->callchain->ips[i];
607
// Values at or above PERF_CONTEXT_MAX are context markers, handled by the cases below, not addresses.
if (data->ip >= PERF_CONTEXT_MAX) {
609
case PERF_CONTEXT_HV:
610
// hypervisor samples are not supported for callgraph
611
// TODO: log lost callgraph arc
613
case PERF_CONTEXT_KERNEL:
616
case PERF_CONTEXT_USER:
624
// Resolve the entry like a regular sample (never as a hypervisor one) and log the arc when an sfile is found.
if (data->ip && __get_operf_trans(data, false, in_kernel)) {
625
if ((trans.current = operf_sfile_find(&trans))) {
626
operf_sfile_log_arc(&trans);
627
update_trans_last(&trans);
631
operf_stats[OPERF_BT_LOST_NO_MAPPING]++;
637
// Registers a hypervisor sample address with the process it belongs to.
//
// ip   sampled address
// pid  process the sample is attributed to; an invalid operf_process_info is created when the pid is unknown
static void __map_hypervisor_sample(u64 ip, u32 pid)
639
operf_process_info * proc;
640
map<pid_t, operf_process_info *>::iterator it;
641
it = process_map.find(pid);
642
if (it == process_map.end()) {
643
/* Create a new proc info object, but mark it invalid since we have
644
* not yet received a COMM event for this PID. This sample may be
645
* on behalf of a process created as a result of a fork/exec.
646
* The order of delivery of events is not guaranteed so we may see
647
* this sample event before getting the COMM event for that process.
648
* If this is the case here, we just pass NULL for appname arg.
649
* It will get fixed up later when the COMM event occurs.
651
const char * appname_arg;
652
bool is_complete_appname;
653
if (app_name && (app_PID == pid)) {
654
appname_arg = app_name;
655
is_complete_appname = true;
658
is_complete_appname = false;
661
proc = new operf_process_info(pid, appname_arg,
662
is_complete_appname, false);
664
if (cverb << vconvert)
665
cout << "Adding new proc info to collection for PID " << pid << endl;
666
process_map[pid] = proc;
671
// The process object receives the address (how proc is set for an already known pid is not visible here).
proc->process_hypervisor_mapping(ip);
674
// Handles a PERF_RECORD_SAMPLE record.
//
// Decodes the fields selected by sample_type, classifies the sample by domain, resolves it to an
// sfile through the cached `trans` context or __get_operf_trans(), and logs it together with any
// callchain. During first_time_processing, hypervisor samples and samples that cannot be logged are
// copied onto unresolved_events for a later pass.
//
// event        the raw perf record
// sample_type  PERF_SAMPLE_* bit mask describing which fields the record carries
static void __handle_sample_event(event_t * event, u64 sample_type)
676
struct sample_data data;
677
bool found_trans = false;
679
const struct operf_mmap * op_mmap = NULL;
680
bool hypervisor = (event->header.misc == PERF_RECORD_MISC_HYPERVISOR);
681
u64 *array = event->sample.array;
683
// Decode the optional fields one by one (most of the decoding statements are not visible in this excerpt).
if (sample_type & PERF_SAMPLE_IP) {
684
data.ip = event->ip.ip;
688
if (sample_type & PERF_SAMPLE_TID) {
689
u_int32_t *p = (u_int32_t *)array;
696
if (sample_type & PERF_SAMPLE_ID) {
701
if (sample_type & PERF_SAMPLE_CPU) {
702
u_int32_t *p = (u_int32_t *)array;
706
// Classify the sample by domain.
// NOTE(review): header.misc is compared with == here, while __handle_mmap_event() tests it with &;
// confirm that no additional misc bits can be set on sample records.
if (event->header.misc == PERF_RECORD_MISC_KERNEL) {
708
} else if (event->header.misc == PERF_RECORD_MISC_USER) {
711
#if (defined(__powerpc__) || defined(__powerpc64__))
712
else if (event->header.misc == PERF_RECORD_MISC_HYPERVISOR) {
713
#define MAX_HYPERVISOR_ADDRESS 0xfffffffULL
714
if (data.ip > MAX_HYPERVISOR_ADDRESS) {
715
cverb << vconvert << "Discarding out-of-range hypervisor sample: "
716
<< hex << data.ip << endl;
717
operf_stats[OPERF_LOST_INVALID_HYPERV_ADDR]++;
721
// On the first pass the hypervisor address is only recorded with its process.
if (first_time_processing) {
722
__map_hypervisor_sample(data.ip, data.pid);
727
// TODO: Unhandled types are the guest kernel and guest user samples.
728
// We should at least log what we're throwing away.
729
if (cverb << vconvert) {
731
switch (event->header.misc) {
732
case PERF_RECORD_MISC_HYPERVISOR:
733
domain = "hypervisor";
735
case PERF_RECORD_MISC_GUEST_KERNEL:
738
case PERF_RECORD_MISC_GUEST_USER:
739
domain = "guest user";
745
cerr << "Discarding sample from " << domain << " domain: "
746
<< hex << data.ip << endl;
751
/* If the static variable trans.tgid is still holding its initial value of 0,
752
* then we would incorrectly find trans.tgid and data.pid matching, and
753
* make wrong assumptions from that match -- ending in a seg fault. So we
754
* will bail out early if we see a sample for PID 0 coming in and trans.image_name
755
* is NULL (implying the trans object is still in its initial state).
757
if (!trans.image_name && (data.pid == 0)) {
758
cverb << vconvert << "Discarding sample for PID 0" << endl;
762
if (cverb << vconvert)
763
cout << "(IP, " << event->header.misc << "): " << dec << data.pid << "/"
764
<< data.tid << ": " << hex << (unsigned long long)data.ip
765
<< endl << "\tdata ID: " << data.id << endl;
767
// Verify the sample.
768
trans.event = operfRead.get_eventnum_by_perf_event_id(data.id);
769
if (trans.event < 0) {
770
cerr << "Event num " << trans.event << " for id " << data.id
771
<< " is invalid. Skipping sample." << endl;
775
/* Only need to check for "no_user" since "no_kernel" is done by
778
if ((operfRead.get_event_by_counter(trans.event)->no_user) &&
779
(event->header.misc == PERF_RECORD_MISC_USER)) {
780
// Dropping user domain sample by user request in event spec.
784
if ((event->header.misc == PERF_RECORD_MISC_HYPERVISOR) && first_time_processing) {
785
/* We defer processing hypervisor samples until all the samples
786
* are processed. We do this because we synthesize an mmapping
787
* for hypervisor samples and need to modify it (start_addr and/or
788
* end_addr) as new hypervisor samples arrive. If we completely
789
* processed the hypervisor samples during "first_time_processing",
790
* we would end up (usually) with multiple "[hypervisor_bucket]" sample files,
791
* each with a unique address range. So we'll stick the event on
792
* the unresolved_events list to be re-processed later.
794
// The queued event is a private copy of header.size bytes.
event_t * ev = (event_t *)xmalloc(event->header.size);
795
memcpy(ev, event, event->header.size);
796
unresolved_events.push_back(ev);
797
if (cverb << vconvert)
798
cout << "Deferring processing of hypervisor sample." << endl;
801
/* Check for the common case first -- i.e., where the current sample is from
802
* the same context as the previous sample. For the "no-vmlinux" case, start_addr
803
* and end_addr will be zero, so need to make sure we detect that.
804
* The last resort (and most expensive) is to call __get_operf_trans() if the
805
* sample cannot be matched up with a previous tran object.
808
if (trans.image_name && trans.tgid == data.pid) {
809
// For the no-vmlinux case . . .
810
if ((trans.start_addr == 0ULL) && (trans.end_addr == 0ULL)) {
813
// For samples in vmlinux or kernel module
814
} else if (data.ip >= trans.start_addr && data.ip <= trans.end_addr) {
819
} else if (trans.tgid == data.pid && data.ip >= trans.start_addr && data.ip <= trans.end_addr) {
820
trans.tid = data.tid;
824
trans.pc = data.ip - trans.start_addr;
828
// Slow path: a full lookup of process and mapping, followed by the sfile lookup.
if (!found_trans && __get_operf_trans(&data, hypervisor, in_kernel)) {
829
trans.current = operf_sfile_find(&trans);
834
* trans.current may be NULL if a kernel sample falls through
835
* the cracks, or if it's a sample from an anon region we couldn't find
837
if (found_trans && trans.current) {
838
/* log the sample or arc */
839
operf_sfile_log_sample(&trans);
841
update_trans_last(&trans);
842
if (sample_type & PERF_SAMPLE_CALLCHAIN)
843
__handle_callchain(array, &data);
847
// On the first pass, keep a private copy of the event so it can be retried later.
if (first_time_processing) {
848
event_t * ev = (event_t *)xmalloc(event->header.size);
849
memcpy(ev, event, event->header.size);
850
unresolved_events.push_back(ev);
860
/* This function is used by operf_read::convertPerfData() to convert perf-formatted
861
* data to oprofile sample data files. After the header information in the perf sample data,
862
* the next piece of data is typically the PERF_RECORD_COMM record which tells us the name of the
863
* application/command being profiled. This is followed by PERF_RECORD_MMAP records
864
* which indicate what binary executables and libraries were mmap'ed into process memory
865
* when profiling began. Additional PERF_RECORD_MMAP records may appear later in the data
866
* stream (e.g., dlopen for single-process profiling or new process startup for system-wide
869
// Dispatches one perf record to the handler that matches its header type.
//
// event        the perf record to process
// sample_type  PERF_SAMPLE_* bit mask, forwarded to the sample handler
//
// SAMPLE, MMAP, COMM and FORK records go to their __handle_*_event() function, and LOST records add
// to the lost-sample statistic. The case bodies for THROTTLE and EXIT are not visible in this excerpt.
void OP_perf_utils::op_write_event(event_t * event, u64 sample_type)
872
if (event->header.type < PERF_RECORD_MAX) {
873
cverb << vconvert << "PERF_RECORD type " << hex << event->header.type << endl;
877
switch (event->header.type) {
878
case PERF_RECORD_SAMPLE:
879
__handle_sample_event(event, sample_type);
881
case PERF_RECORD_MMAP:
882
__handle_mmap_event(event);
884
case PERF_RECORD_COMM:
885
// One-time setup tied to the first COMM record (the setup call itself is not visible in this excerpt).
if (!sfile_init_done) {
887
sfile_init_done = true;
889
__handle_comm_event(event);
891
case PERF_RECORD_FORK:
892
__handle_fork_event(event);
894
case PERF_RECORD_THROTTLE:
897
case PERF_RECORD_LOST:
898
operf_stats[OPERF_RECORD_LOST_SAMPLE] += event->lost.lost;
900
case PERF_RECORD_EXIT:
903
// OK, ignore all other header types.
904
cverb << vconvert << "No matching event type for " << hex << event->header.type << endl;
909
// Runs every event queued on unresolved_events through the sample handler again.
//
// sample_type  PERF_SAMPLE_* bit mask, forwarded to __handle_sample_event()
void OP_perf_utils::op_reprocess_unresolved_events(u64 sample_type)
911
cverb << vconvert << "Reprocessing samples" << endl;
912
list<event_t *>::const_iterator it = unresolved_events.begin();
913
for (; it != unresolved_events.end(); it++) {
914
event_t * evt = (*it);
915
// This is just a sanity check, since all events in this list
916
// are unresolved sample events.
917
if (evt->header.type == PERF_RECORD_SAMPLE) {
918
__handle_sample_event(evt, sample_type);
924
// Releases the bookkeeping built up while converting perf data.
//
// Walks process_map (the loop body is not visible in this excerpt), deletes every operf_mmap held
// in all_images_map and empties that map, then calls operf_sfile_close_files() and
// operf_free_modules_list().
void OP_perf_utils::op_release_resources(void)
926
map<pid_t, operf_process_info *>::iterator it = process_map.begin();
927
while (it != process_map.end())
931
multimap<string, struct operf_mmap *>::iterator images_it = all_images_map.begin();
932
while (images_it != all_images_map.end())
933
// Post-increment moves the iterator on while the mapping it pointed at is deleted.
delete images_it++->second;
934
all_images_map.clear();
937
operf_sfile_close_files();
938
operf_free_modules_list();
942
// Signal handler taking the three-argument (signal number, siginfo, context) form; all three
// arguments are marked unused. The body is not visible in this excerpt.
void OP_perf_utils::op_perfrecord_sigusr1_handler(int sig __attribute__((unused)),
943
siginfo_t * siginfo __attribute__((unused)),
944
void *u_context __attribute__((unused)))
949
// Signal handler taking the three-argument (signal number, siginfo, context) form; all three
// arguments are marked unused. The body is not visible in this excerpt.
void OP_perf_utils::op_perfread_sigusr1_handler(int sig __attribute__((unused)),
950
siginfo_t * siginfo __attribute__((unused)),
951
void *u_context __attribute__((unused)))
956
// Reads from the given input stream into buf (the read call and the return statements are not
// visible in this excerpt).
//
// is   stream to read from
// buf  destination buffer
// sz   byte count requested by the caller
int OP_perf_utils::op_read_from_stream(ifstream & is, char * buf, streamsize sz)
960
// A failure that is not simply end-of-file is reported as an internal error.
if (!is.eof() && is.fail()) {
961
cerr << "Internal error: Failed to read from input file." << endl;
970
// Maps mmap_size bytes of the trace file read-only, starting at info.offset.
//
// info  supplies traceFD and offset; info.buf receives the address of the mapping
//
// A failed mmap is reported on cerr with the errno text (the return statements are not visible in
// this excerpt).
static int __mmap_trace_file(struct mmap_info & info)
972
int mmap_prot = PROT_READ;
973
int mmap_flags = MAP_SHARED;
975
info.buf = (char *) mmap(NULL, mmap_size, mmap_prot,
976
mmap_flags, info.traceFD, info.offset);
977
if (info.buf == MAP_FAILED) {
978
cerr << "Error: mmap failed with errno:\n\t" << strerror(errno) << endl;
982
cverb << vconvert << hex << "mmap with the following parameters" << endl
983
<< "\tinfo.head: " << info.head << endl
984
<< "\tinfo.offset: " << info.offset << endl;
990
// Chooses the window size and offset for mapping the trace file, then performs the mapping.
//
// info  mapping state; head and offset are updated here before the actual mmap
// init  NOTE(review): how this flag is used is not visible in this excerpt
//
// Returns whatever __mmap_trace_file() returns.
int OP_perf_utils::op_mmap_trace_file(struct mmap_info & info, bool init)
995
pg_sz = sysconf(_SC_PAGESIZE);
997
// The window is never larger than the amount of data in the file.
if (MMAP_WINDOW_SZ > info.file_data_size) {
998
mmap_size = info.file_data_size;
1000
mmap_size = MMAP_WINDOW_SZ;
1004
info.head = info.file_data_offset;
1005
// Round head down to a multiple of the page size and advance the mapping offset by that amount.
shift = pg_sz * (info.head / pg_sz);
1006
info.offset += shift;
1009
return __mmap_trace_file(info);
1013
// Writes size bytes from buf to the file descriptor `output`.
//
// Throws runtime_error carrying the errno text when the write fails (the failure test is not
// visible in this excerpt).
int OP_perf_utils::op_write_output(int output, void *buf, size_t size)
1017
int ret = write(output, buf, size);
1020
string errmsg = "Internal error: Failed to write sample data to pipe. errno is ";
1021
errmsg += strerror(errno);
1022
throw runtime_error(errmsg);
1026
// Step past the bytes already written (the enclosing loop is not visible in this excerpt).
buf = (char *)buf + ret;
1033
// Synthesizes PERF_RECORD_MMAP events for the executable mappings listed in /proc/<tgid>/maps and
// writes them to output_fd.
//
// pid        NOTE(review): no use of this argument is visible in this excerpt
// tgid       process whose maps file is read
// output_fd  descriptor the synthesized events are written to
// pr         recorder whose running total is increased by the number of bytes written
static void op_record_process_exec_mmaps(pid_t pid, pid_t tgid, int output_fd, operf_record * pr)
1035
char fname[PATH_MAX];
1038
snprintf(fname, sizeof(fname), "/proc/%d/maps", tgid);
1040
fp = fopen(fname, "r");
1042
// Process must have exited already or invalid pid.
1043
cverb << vrecord << "couldn't open " << fname << endl;
1048
char line_buffer[BUFSIZ];
1049
char perms[5], pathname[PATH_MAX], dev[16];
1050
unsigned long long start_addr, end_addr, offset;
1053
memset(pathname, '\0', sizeof(pathname));
1054
struct mmap_event mmap;
1056
memset(&mmap, 0, sizeof(mmap));
1058
mmap.header.type = PERF_RECORD_MMAP;
1059
mmap.header.misc = PERF_RECORD_MISC_USER;
1061
if (fgets(line_buffer, sizeof(line_buffer), fp) == NULL)
1064
// NOTE(review): the sscanf result is not checked and the %s conversions carry no field width,
// although perms, dev and pathname are fixed-size buffers.
sscanf(line_buffer, "%llx-%llx %s %llx %s %d %s",
1065
&start_addr, &end_addr, perms, &offset, dev, &inode, pathname);
1066
// Only mappings whose third permission character is 'x' are considered.
if (perms[2] == 'x') {
1067
// The image name starts at the first '/' of the path; otherwise a "[vdso]" entry is looked for.
char *imagename = strchr(pathname, '/');
1069
if (imagename == NULL)
1070
imagename = strstr(pathname, "[vdso]");
1072
if (imagename == NULL)
1075
size = strlen(imagename) + 1;
1076
// NOTE(review): unbounded copy; the size of mmap.filename is not visible in this excerpt.
strcpy(mmap.filename, imagename);
1077
size = align_64bit(size);
1078
mmap.start = start_addr;
1079
mmap.len = end_addr - mmap.start;
1082
// The record size excludes the unused tail of the filename array.
mmap.header.size = (sizeof(mmap) -
1083
(sizeof(mmap.filename) - size));
1084
int num = OP_perf_utils::op_write_output(output_fd, &mmap, mmap.header.size);
1085
if (cverb << vrecord)
1086
cout << "Created MMAP event for " << imagename << endl;
1087
pr->add_to_total(num);
1095
// Records identity information for one process:
//   1. reads /proc/<pid>/status, taking the command name from the Name: line
//      and the thread-group id from the Tgid: line;
//   2. writes a PERF_RECORD_COMM event built from those values to output_fd;
//   3. walks /proc/<pid>/task, parsing each entry name as a numeric id;
//   4. calls op_record_process_exec_mmaps for the process.
// Every op_write_output result is added to the running total kept by `pr`.
// NOTE(review): the return statements are not visible in these lines; the caller
// in this file treats a negative result as failure -- confirm the exact values.
static int _record_one_process_info(pid_t pid, bool sys_wide, operf_record * pr,
1098
struct comm_event comm;
1099
char fname[PATH_MAX];
1105
struct dirent dirent, *next;
1108
snprintf(fname, sizeof(fname), "/proc/%d/status", pid);
1109
fp = fopen(fname, "r");
1111
/* Process must have finished or invalid PID passed into us.
1112
* If we're doing system-wide profiling, this case can naturally
1113
* occur, and it's not an error. But if profiling on a single
1114
* application, we can't continue after this, so we'll bail out now.
1117
cerr << "Unable to find process information for process " << pid << "." << endl;
1118
cverb << vrecord << "couldn't open " << fname << endl;
1125
memset(&comm, 0, sizeof(comm));
1126
// comm was zeroed above, so this loop keeps reading status lines until both
// the command name and the pid field have been filled in.
while (!comm.comm[0] || !comm.pid) {
1127
if (fgets(buff, sizeof(buff), fp) == NULL) {
1129
cverb << vrecord << "Did not find Name or PID field in status file." << endl;
1132
if (!strncmp(buff, "Name:", 5)) {
1133
// Skip the 5-character field label, then any whitespace after it.
char *name = buff + 5;
1134
while (*name && isspace(*name))
1136
// One character is dropped from the length, presumably the newline that
// fgets keeps at the end of the line -- confirm.
// NOTE(review): if name is empty this subtracts 1 from 0; check the type of size.
size = strlen(name) - 1;
1137
// The "Name" field in /proc/pid/status currently only allows for 16 characters,
1138
// but I'm not going to count on that being stable. We'll ensure we copy no more
1139
// than 16 chars since the comm.comm char array only holds 16.
1140
size = size > 16 ? 16 : size;
1141
// The post-increment makes size cover a terminating NUL after the copy.
// NOTE(review): when size is 16 the copy fills all of comm.comm and leaves no
// terminator inside the array, and size becomes 17 before align_64bit is
// applied below -- confirm header.size cannot then exceed sizeof(comm).
memcpy(comm.comm, name, size++);
1142
} else if (memcmp(buff, "Tgid:", 5) == 0) {
1143
char *tgids = buff + 5;
1144
while (*tgids && isspace(*tgids))
1146
tgid = comm.pid = atoi(tgids);
1150
comm.header.type = PERF_RECORD_COMM;
1151
// align_64bit presumably rounds the name length up to a multiple of 8 -- confirm.
size = align_64bit(size);
1152
// Event size = whole struct minus the unused tail of the comm buffer.
comm.header.size = sizeof(comm) - (sizeof(comm.comm) - size);
1154
// passed pid must have been a secondary thread
1156
int num = OP_perf_utils::op_write_output(output_fd, &comm, comm.header.size);
1157
pr->add_to_total(num);
1161
snprintf(fname, sizeof(fname), "/proc/%d/task", pid);
1162
tids = opendir(fname);
1164
// process must have exited
1166
cverb << vrecord << "opendir returned NULL" << endl;
1170
// NOTE(review): readdir_r is deprecated in current glibc in favour of readdir.
while (!readdir_r(tids, &dirent, &next) && next) {
1172
// The pid parameter is reused to hold the id parsed from each task entry name.
pid = strtol(dirent.d_name, &end, 10);
1178
int num = OP_perf_utils::op_write_output(output_fd, &comm, comm.header.size);
1179
pr->add_to_total(num);
1182
if (cverb << vrecord)
1183
cout << "Created COMM event for " << comm.comm << endl;
1186
op_record_process_exec_mmaps(pid, tgid, output_fd, pr);
1190
cverb << vrecord << "couldn't get app name and tgid for pid "
1191
<< dec << pid << " from /proc fs." << endl;
1197
/* Obtain process information for an active process (where the user has
1198
* passed in a process ID via the --pid option) or all active processes
1199
* (where system_wide==true). Then generate the necessary PERF_RECORD_COMM
1200
* and PERF_RECORD_MMAP entries into the profile data stream.
1202
// Entry point for recording process information into the profile data stream.
// Two paths are visible below: a single call to _record_one_process_info for the
// given pid, and a walk over the entries of /proc that calls it once per entry
// whose name parses as a number. Entries that do not parse are reported through
// the verbose channel as not being a PID.
// NOTE(review): the branch selecting between the two paths is not shown in these
// lines; presumably it is the system_wide flag -- confirm.
int OP_perf_utils::op_record_process_info(bool system_wide, pid_t pid, operf_record * pr,
1206
if (cverb << vrecord)
1207
cout << "op_record_process_info" << endl;
1209
ret = _record_one_process_info(pid, system_wide, pr, output_fd);
1215
struct dirent dirent, *next;
1217
pids = opendir("/proc");
1219
cerr << "Unable to open /proc." << endl;
1223
// NOTE(review): readdir_r is deprecated in current glibc in favour of readdir.
while (!readdir_r(pids, &dirent, &next) && next) {
1225
// The pid parameter is reused as the loop variable for each /proc entry.
pid = strtol(dirent.d_name, &end, 10);
1226
// Reject the entry when strtol reported a range error, reported some other
// error with a zero result, or consumed no characters at all.
// NOTE(review): this test is only meaningful if errno was cleared before the
// strtol call (not visible here). Also, the long result is stored in a pid_t
// before being compared with LONG_MAX and LONG_MIN -- confirm the widths match.
if (((errno == ERANGE && (pid == LONG_MAX || pid == LONG_MIN))
1227
|| (errno != 0 && pid == 0)) || (end == dirent.d_name)) {
1228
cverb << vmisc << "/proc entry " << dirent.d_name << " is not a PID" << endl;
1231
// A negative result from the per-process helper is tested for here.
if ((ret = _record_one_process_info(pid, system_wide, pr, output_fd)) < 0)
1241
* each line is in the format:
1243
* module_name 16480 1 dependencies Live 0xe091e000
1245
* without any blank space in each field
1247
// Reads /proc/modules and, for a parsed module line, builds a PERF_RECORD_MMAP
// event (misc = PERF_RECORD_MISC_KERNEL) whose filename is the module name,
// whose start is the module load address and whose len is the module size.
// The event is written to output_fd through op_write_output and the result is
// added to the running total kept by `pr`.
// If /proc/modules cannot be opened, an error and strerror(errno) go to cerr.
static void _record_module_info(int output_fd, operf_record * pr)
1249
const char * fname = "/proc/modules";
1252
struct operf_kernel_image * image;
1254
// Each buffer is one byte larger than the matching field width in the sscanf
// format below, leaving room for the terminating NUL.
char ref_count[32+1];
1256
char module_name[256+1];
1257
char live_info[32+1];
1258
char dependencies[4096+1];
1259
unsigned long long start_address;
1261
fp = fopen(fname, "r");
1263
cerr << "Error opening /proc/modules. Unable to process module samples" << endl;
1264
cerr << strerror(errno) << endl;
1269
struct mmap_event mmap;
1271
memset(&mmap, 0, sizeof(mmap));
1273
line = op_get_line(fp);
1278
// An empty line gets separate handling (the body of this branch is not shown).
if (line[0] == '\0') {
1283
// Six fields are parsed: name, size, reference count, dependencies, state,
// load address.
ret = sscanf(line, "%256s %u %32s %4096s %32s %llx",
1284
module_name, &module_size, ref_count,
1285
dependencies, live_info, &start_address);
1287
// NOTE(review): the diagnostic below names op_record_kernel_info although it
// is emitted from _record_module_info.
cerr << "op_record_kernel_info: Bad /proc/modules entry: \n\t" << line << endl;
1292
mmap.header.type = PERF_RECORD_MMAP;
1293
mmap.header.misc = PERF_RECORD_MISC_KERNEL;
1294
// size counts the terminating NUL, so the strncpy below copies it as well.
size = strlen(module_name) + 1;
1295
// NOTE(review): the bound passed to strncpy is the source length, not the
// capacity of mmap.filename; this relies on filename holding at least 257 bytes.
strncpy(mmap.filename, module_name, size);
1296
// align_64bit presumably rounds the name length up to a multiple of 8 -- confirm.
size = align_64bit(size);
1297
mmap.start = start_address;
1298
mmap.len = module_size;
1301
// Event size = whole struct minus the unused tail of the filename buffer.
mmap.header.size = (sizeof(mmap) -
1302
(sizeof(mmap.filename) - size));
1303
int num = OP_perf_utils::op_write_output(output_fd, &mmap, mmap.header.size);
1304
if (cverb << vrecord)
1305
cout << "Created MMAP event for " << module_name << ". Size: "
1306
<< module_size << "; start addr: " << start_address << endl;
1307
pr->add_to_total(num);
1314
// Writes a PERF_RECORD_MMAP event (misc = PERF_RECORD_MISC_KERNEL) describing
// the kernel image to output_fd, adds the op_write_output result to the running
// total kept by `pr`, and then calls _record_module_info to emit the events for
// loaded kernel modules.
//   vmlinux_file  name stored in the event; when empty, the placeholder
//                 no-vmlinux is stored instead
//   start_addr    stored as mmap.start
//   end_addr      used to derive mmap.len as end_addr - start
void OP_perf_utils::op_record_kernel_info(string vmlinux_file, u64 start_addr, u64 end_addr,
1315
int output_fd, operf_record * pr)
1317
struct mmap_event mmap;
1319
memset(&mmap, 0, sizeof(mmap));
1321
mmap.header.type = PERF_RECORD_MMAP;
1322
mmap.header.misc = PERF_RECORD_MISC_KERNEL;
1323
if (vmlinux_file.empty()) {
1324
// NOTE(review): the length is measured on no_vmlinux (underscore) while the
// text copied is no-vmlinux (hyphen). Both are ten characters, so size is
// still correct, but the two literals should agree.
size = strlen( "no_vmlinux") + 1;
1325
strncpy(mmap.filename, "no-vmlinux", size);
1329
// size counts the terminating NUL, so the strncpy below copies it as well.
size = vmlinux_file.length() + 1;
1330
// NOTE(review): the bound is the source length, not the capacity of
// mmap.filename -- confirm a long vmlinux path cannot overrun the buffer.
strncpy(mmap.filename, vmlinux_file.c_str(), size);
1331
mmap.start = start_addr;
1332
mmap.len = end_addr - mmap.start;
1334
// align_64bit presumably rounds the name length up to a multiple of 8 -- confirm.
size = align_64bit(size);
1337
// Event size = whole struct minus the unused tail of the filename buffer.
mmap.header.size = (sizeof(mmap) -
1338
(sizeof(mmap.filename) - size));
1339
int num = op_write_output(output_fd, &mmap, mmap.header.size);
1340
if (cverb << vrecord)
1341
cout << "Created MMAP event of size " << mmap.header.size << " for " <<mmap.filename << ". length: "
1342
<< hex << mmap.len << "; start addr: " << mmap.start << endl;
1343
pr->add_to_total(num);
1344
_record_module_info(output_fd, pr);
1347
// Copies pending event data from the perf_events mmap area described by `md`
// to the output descriptor of `pr`, adding each op_write_output result to the
// running total kept by `pr`.
// md->base is read as a struct perf_event_mmap_page; the sample data begins one
// page (pagesize bytes) after it. Positions are reduced with md->mask before
// indexing the data area. When the pending span does not end where the masked
// head is, the part running up to the end of the data area is written first and
// a second write follows. The function finishes by storing `old` into
// data_tail of the control page.
// NOTE(review): the computation of `size`, the updates to `old` between the two
// writes and the wrap test guarding the throw are not visible in these lines.
void OP_perf_utils::op_get_kernel_event_data(struct mmap_data *md, operf_record * pr)
1349
struct perf_event_mmap_page *pc = (struct perf_event_mmap_page *)md->base;
1350
int out_fd = pr->out_fd();
1352
uint64_t head = pc->data_head;
1353
// Comment in perf_event.h says "User-space reading the @data_head value should issue
1354
// an rmb(), on SMP capable platforms, after reading this value."
1357
// md->prev is the position this reader recorded previously -- presumably the
// point up to which data has already been consumed; confirm.
uint64_t old = md->prev;
1358
// Skip the control page to reach the first byte of event data.
unsigned char *data = ((unsigned char *)md->base) + pagesize;
1365
throw runtime_error("ERROR: event buffer wrapped, which should NEVER happen.");
1373
// True when the pending span would run past the end of the data area.
if ((old & md->mask) + size != (head & md->mask)) {
1374
buf = &data[old & md->mask];
1375
// md->mask + 1 is the size of the data area; this is the byte count from the
// current offset to its end.
size = md->mask + 1 - (old & md->mask);
1377
pr->add_to_total(op_write_output(out_fd, buf, size));
1380
buf = &data[old & md->mask];
1383
pr->add_to_total(op_write_output(out_fd, buf, size));
1385
// Publish the new read position through the control page.
pc->data_tail = old;
1389
int OP_perf_utils::op_get_next_online_cpu(DIR * dir, struct dirent *entry)
1391
#define OFFLINE 0x30
1392
unsigned int cpu_num;
1393
char cpu_online_pathname[40];
1398
entry = readdir(dir);
1401
} while (entry->d_type != DT_DIR);
1403
res = sscanf(entry->d_name, "cpu%u", &cpu_num);
1408
snprintf(cpu_online_pathname, 40, "/sys/devices/system/cpu/cpu%u/online", cpu_num);
1409
if ((online = fopen(cpu_online_pathname, "r")) == NULL) {
1410
cerr << "Unable to open " << cpu_online_pathname << endl;
1412
cerr << strerror(errno) << endl;
1415
res = fgetc(online);