blob: 8ae01d51f2914dbc8f148f8970d8b14c9de562da [file] [log] [blame]
/*
 * kerneltop.c: show top kernel functions - performance counters showcase

   Build with:

     cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt

   Sample output:

------------------------------------------------------------------------------
 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
------------------------------------------------------------------------------

 weight RIP kernel function
 ______ ________________ _______________

 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
 33.00 - ffffffff804cb740 : sock_alloc_send_skb
 31.26 - ffffffff804ce808 : skb_push
 22.43 - ffffffff80510004 : tcp_established_options
 19.00 - ffffffff8027d250 : find_get_page
 15.76 - ffffffff804e4fc9 : eth_type_trans
 15.20 - ffffffff804d8baa : dst_release
 14.86 - ffffffff804cf5d8 : skb_release_head_state
 14.00 - ffffffff802217d5 : read_hpet
 12.00 - ffffffff804ffb7f : __ip_local_out
 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
 8.54 - ffffffff805001a3 : ip_queue_xmit
*/
30
/*
 * perfstat: /usr/bin/time -alike performance counter statistics utility

   It summarizes the counter events of all tasks (and child tasks),
   covering all CPUs that the command (or workload) executes on.
   It only counts the per-task events of the workload started,
   independent of how many other tasks run on those CPUs.

   Sample output:

   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null

   Performance counter stats for 'ls':

   163516953 instructions
   2295 cache-misses
   2855182 branch-misses
*/
49
/*
 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
 *
 * Improvements and fixes by:
 *
 *   Arjan van de Ven <arjan@linux.intel.com>
 *   Yanmin Zhang <yanmin.zhang@intel.com>
 *   Wu Fengguang <fengguang.wu@intel.com>
 *   Mike Galbraith <efault@gmx.de>
 *   Paul Mackerras <paulus@samba.org>
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */
63
Peter Zijlstra1a482f32009-05-23 18:28:58 +020064#include "perf.h"
Ingo Molnar148be2c2009-04-27 08:02:14 +020065#include "util/util.h"
Ingo Molnarddcacfa2009-04-20 15:37:32 +020066
67#include <getopt.h>
68#include <assert.h>
69#include <fcntl.h>
70#include <stdio.h>
71#include <errno.h>
Ingo Molnarddcacfa2009-04-20 15:37:32 +020072#include <time.h>
73#include <sched.h>
74#include <pthread.h>
75
76#include <sys/syscall.h>
77#include <sys/ioctl.h>
78#include <sys/poll.h>
79#include <sys/prctl.h>
80#include <sys/wait.h>
81#include <sys/uio.h>
82#include <sys/mman.h>
83
84#include <linux/unistd.h>
85#include <linux/types.h>
86
/*
 * Per-event modifier bits kept in event_mask[].  match_event_symbols() sets
 * them from the 'k' / 'u' letters of a "type:id:mask" event string, and
 * create_perfstat_counter() uses them to fill exclude_kernel / exclude_user.
 */
#define EVENT_MASK_KERNEL	1	/* bit 0: 'k' modifier seen */
#define EVENT_MASK_USER		2	/* bit 1: 'u' modifier seen */
89
Ingo Molnarddcacfa2009-04-20 15:37:32 +020090static int system_wide = 0;
91
92static int nr_counters = 0;
93static __u64 event_id[MAX_COUNTERS] = {
94 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
95 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
96 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
97 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
98
99 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
100 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
101 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
102 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
103};
104static int default_interval = 100000;
105static int event_count[MAX_COUNTERS];
106static int fd[MAX_NR_CPUS][MAX_COUNTERS];
Peter Zijlstra16c8a102009-05-05 17:50:27 +0200107static int event_mask[MAX_COUNTERS];
Ingo Molnarddcacfa2009-04-20 15:37:32 +0200108
109static int tid = -1;
110static int profile_cpu = -1;
111static int nr_cpus = 0;
112static int nmi = 1;
113static int group = 0;
114static unsigned int page_size;
115
116static int zero;
117
Ingo Molnar66cf7822009-04-30 13:53:33 +0200118static int scale = 1;
Ingo Molnarddcacfa2009-04-20 15:37:32 +0200119
/*
 * Table of default counts, one entry per slot.
 * NOTE(review): nothing in the visible part of this file reads this table.
 */
static const unsigned int default_count[] = {
	[0] = 1000000,
	[1] = 1000000,
	[2] =   10000,
	[3] =   10000,
	[4] = 1000000,
	[5] =   10000,
};
128
/*
 * Printable names of the hardware events; event_name() indexes this table
 * with the event id extracted from the counter config.
 */
static char *hw_event_names[] = {
	[0] = "CPU cycles",
	[1] = "instructions",
	[2] = "cache references",
	[3] = "cache misses",
	[4] = "branches",
	[5] = "branch misses",
	[6] = "bus cycles",
};
138
/*
 * Printable names of the software events; event_name() indexes this table
 * with the event id extracted from the counter config.
 */
static char *sw_event_names[] = {
	[0] = "cpu clock ticks",
	[1] = "task clock ticks",
	[2] = "pagefaults",
	[3] = "context switches",
	[4] = "CPU migrations",
	[5] = "minor faults",
	[6] = "major faults",
};
148
149struct event_symbol {
150 __u64 event;
151 char *symbol;
152};
153
154static struct event_symbol event_symbols[] = {
155 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
156 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
157 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
158 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
159 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
160 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
161 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
162 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
163 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
164
165 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
166 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
167 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
168 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
169 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
170 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
171 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
172 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
173 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
174 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
175};
176
/*
 * Extract one field from a counter config word.  For a field called NAME,
 * PERF_COUNTER_NAME_MASK selects its bits and PERF_COUNTER_NAME_SHIFT moves
 * them down to bit 0 (both are defined outside this file).
 *
 * The argument is parenthesized so that an expression such as `a | b` is
 * masked as a whole instead of only its right-hand operand.
 */
#define __PERF_COUNTER_FIELD(config, name) \
	(((config) & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)

#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
184
185static void display_events_help(void)
186{
187 unsigned int i;
188 __u64 e;
189
190 printf(
191 " -e EVENT --event=EVENT # symbolic-name abbreviations");
192
193 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
194 int type, id;
195
196 e = event_symbols[i].event;
197 type = PERF_COUNTER_TYPE(e);
198 id = PERF_COUNTER_ID(e);
199
200 printf("\n %d:%d: %-20s",
201 type, id, event_symbols[i].symbol);
202 }
203
204 printf("\n"
205 " rNNN: raw PMU events (eventsel+umask)\n\n");
206}
207
208static void display_help(void)
209{
210 printf(
211 "Usage: perfstat [<events...>] <cmd...>\n\n"
212 "PerfStat Options (up to %d event types can be specified):\n\n",
213 MAX_COUNTERS);
214
215 display_events_help();
216
217 printf(
218 " -l # scale counter values\n"
219 " -a # system-wide collection\n");
220 exit(0);
221}
222
223static char *event_name(int ctr)
224{
225 __u64 config = event_id[ctr];
226 int type = PERF_COUNTER_TYPE(config);
227 int id = PERF_COUNTER_ID(config);
228 static char buf[32];
229
230 if (PERF_COUNTER_RAW(config)) {
231 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
232 return buf;
233 }
234
235 switch (type) {
236 case PERF_TYPE_HARDWARE:
237 if (id < PERF_HW_EVENTS_MAX)
238 return hw_event_names[id];
239 return "unknown-hardware";
240
241 case PERF_TYPE_SOFTWARE:
242 if (id < PERF_SW_EVENTS_MAX)
243 return sw_event_names[id];
244 return "unknown-software";
245
246 default:
247 break;
248 }
249
250 return "unknown";
251}
252
253/*
254 * Each event can have multiple symbolic names.
255 * Symbolic names are (almost) exactly matched.
256 */
257static __u64 match_event_symbols(char *str)
258{
259 __u64 config, id;
260 int type;
261 unsigned int i;
Peter Zijlstra16c8a102009-05-05 17:50:27 +0200262 char mask_str[4];
Ingo Molnarddcacfa2009-04-20 15:37:32 +0200263
264 if (sscanf(str, "r%llx", &config) == 1)
265 return config | PERF_COUNTER_RAW_MASK;
266
Peter Zijlstra16c8a102009-05-05 17:50:27 +0200267 switch (sscanf(str, "%d:%llu:%2s", &type, &id, mask_str)) {
268 case 3:
269 if (strchr(mask_str, 'u'))
270 event_mask[nr_counters] |= EVENT_MASK_USER;
271 if (strchr(mask_str, 'k'))
272 event_mask[nr_counters] |= EVENT_MASK_KERNEL;
273 case 2:
274 return EID(type, id);
275
276 default:
277 break;
278 }
Ingo Molnarddcacfa2009-04-20 15:37:32 +0200279
280 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
281 if (!strncmp(str, event_symbols[i].symbol,
282 strlen(event_symbols[i].symbol)))
283 return event_symbols[i].event;
284 }
285
286 return ~0ULL;
287}
288
289static int parse_events(char *str)
290{
291 __u64 config;
292
293again:
294 if (nr_counters == MAX_COUNTERS)
295 return -1;
296
297 config = match_event_symbols(str);
298 if (config == ~0ULL)
299 return -1;
300
301 event_id[nr_counters] = config;
302 nr_counters++;
303
304 str = strstr(str, ",");
305 if (str) {
306 str++;
307 goto again;
308 }
309
310 return 0;
311}
312
313
/*
 * perfstat
 */

/*
 * NOTE(review): nothing in the visible part of this file references this
 * buffer.  It has external linkage, so confirm no other translation unit
 * uses it before removing it.
 */
char fault_here[1000000];
319
320static void create_perfstat_counter(int counter)
321{
322 struct perf_counter_hw_event hw_event;
323
324 memset(&hw_event, 0, sizeof(hw_event));
325 hw_event.config = event_id[counter];
326 hw_event.record_type = 0;
327 hw_event.nmi = 0;
Peter Zijlstra16c8a102009-05-05 17:50:27 +0200328 hw_event.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
329 hw_event.exclude_user = event_mask[counter] & EVENT_MASK_USER;
330
Ingo Molnarddcacfa2009-04-20 15:37:32 +0200331 if (scale)
332 hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
333 PERF_FORMAT_TOTAL_TIME_RUNNING;
334
335 if (system_wide) {
336 int cpu;
337 for (cpu = 0; cpu < nr_cpus; cpu ++) {
338 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
339 if (fd[cpu][counter] < 0) {
340 printf("perfstat error: syscall returned with %d (%s)\n",
341 fd[cpu][counter], strerror(errno));
342 exit(-1);
343 }
344 }
345 } else {
346 hw_event.inherit = 1;
347 hw_event.disabled = 1;
348
349 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
350 if (fd[0][counter] < 0) {
351 printf("perfstat error: syscall returned with %d (%s)\n",
352 fd[0][counter], strerror(errno));
353 exit(-1);
354 }
355 }
356}
357
358int do_perfstat(int argc, char *argv[])
359{
360 unsigned long long t0, t1;
361 int counter;
362 ssize_t res;
363 int status;
364 int pid;
365
366 if (!system_wide)
367 nr_cpus = 1;
368
369 for (counter = 0; counter < nr_counters; counter++)
370 create_perfstat_counter(counter);
371
372 argc -= optind;
373 argv += optind;
374
375 if (!argc)
376 display_help();
377
378 /*
379 * Enable counters and exec the command:
380 */
381 t0 = rdclock();
382 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
383
384 if ((pid = fork()) < 0)
385 perror("failed to fork");
386 if (!pid) {
387 if (execvp(argv[0], argv)) {
388 perror(argv[0]);
389 exit(-1);
390 }
391 }
392 while (wait(&status) >= 0)
393 ;
394 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
395 t1 = rdclock();
396
397 fflush(stdout);
398
399 fprintf(stderr, "\n");
400 fprintf(stderr, " Performance counter stats for \'%s\':\n",
401 argv[0]);
402 fprintf(stderr, "\n");
403
404 for (counter = 0; counter < nr_counters; counter++) {
405 int cpu, nv;
406 __u64 count[3], single_count[3];
407 int scaled;
408
409 count[0] = count[1] = count[2] = 0;
410 nv = scale ? 3 : 1;
411 for (cpu = 0; cpu < nr_cpus; cpu ++) {
412 res = read(fd[cpu][counter],
413 single_count, nv * sizeof(__u64));
414 assert(res == nv * sizeof(__u64));
415
416 count[0] += single_count[0];
417 if (scale) {
418 count[1] += single_count[1];
419 count[2] += single_count[2];
420 }
421 }
422
423 scaled = 0;
424 if (scale) {
425 if (count[2] == 0) {
426 fprintf(stderr, " %14s %-20s\n",
427 "<not counted>", event_name(counter));
428 continue;
429 }
430 if (count[2] < count[1]) {
431 scaled = 1;
432 count[0] = (unsigned long long)
433 ((double)count[0] * count[1] / count[2] + 0.5);
434 }
435 }
436
437 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
438 event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
439
440 double msecs = (double)count[0] / 1000000;
441
442 fprintf(stderr, " %14.6f %-20s (msecs)",
443 msecs, event_name(counter));
444 } else {
445 fprintf(stderr, " %14Ld %-20s (events)",
446 count[0], event_name(counter));
447 }
448 if (scaled)
449 fprintf(stderr, " (scaled from %.2f%%)",
450 (double) count[2] / count[1] * 100);
451 fprintf(stderr, "\n");
452 }
453 fprintf(stderr, "\n");
454 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
455 (double)(t1-t0)/1e6);
456 fprintf(stderr, "\n");
457
458 return 0;
459}
460
461static void process_options(int argc, char **argv)
462{
463 int error = 0, counter;
464
465 for (;;) {
466 int option_index = 0;
467 /** Options for getopt */
468 static struct option long_options[] = {
469 {"count", required_argument, NULL, 'c'},
470 {"cpu", required_argument, NULL, 'C'},
471 {"delay", required_argument, NULL, 'd'},
472 {"dump_symtab", no_argument, NULL, 'D'},
473 {"event", required_argument, NULL, 'e'},
474 {"filter", required_argument, NULL, 'f'},
475 {"group", required_argument, NULL, 'g'},
476 {"help", no_argument, NULL, 'h'},
477 {"nmi", required_argument, NULL, 'n'},
478 {"munmap_info", no_argument, NULL, 'U'},
479 {"pid", required_argument, NULL, 'p'},
480 {"realtime", required_argument, NULL, 'r'},
481 {"scale", no_argument, NULL, 'l'},
482 {"symbol", required_argument, NULL, 's'},
483 {"stat", no_argument, NULL, 'S'},
484 {"vmlinux", required_argument, NULL, 'x'},
485 {"zero", no_argument, NULL, 'z'},
486 {NULL, 0, NULL, 0 }
487 };
488 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
489 long_options, &option_index);
490 if (c == -1)
491 break;
492
493 switch (c) {
494 case 'a': system_wide = 1; break;
495 case 'c': default_interval = atoi(optarg); break;
496 case 'C':
497 /* CPU and PID are mutually exclusive */
498 if (tid != -1) {
499 printf("WARNING: CPU switch overriding PID\n");
500 sleep(1);
501 tid = -1;
502 }
503 profile_cpu = atoi(optarg); break;
504
505 case 'e': error = parse_events(optarg); break;
506
507 case 'g': group = atoi(optarg); break;
508 case 'h': display_help(); break;
509 case 'l': scale = 1; break;
510 case 'n': nmi = atoi(optarg); break;
511 case 'p':
512 /* CPU and PID are mutually exclusive */
513 if (profile_cpu != -1) {
514 printf("WARNING: PID switch overriding CPU\n");
515 sleep(1);
516 profile_cpu = -1;
517 }
518 tid = atoi(optarg); break;
519 case 'z': zero = 1; break;
520 default: error = 1; break;
521 }
522 }
523 if (error)
524 display_help();
525
526 if (!nr_counters) {
527 nr_counters = 8;
528 }
529
530 for (counter = 0; counter < nr_counters; counter++) {
531 if (event_count[counter])
532 continue;
533
534 event_count[counter] = default_interval;
535 }
536}
537
/*
 * Signal handler that deliberately does nothing.  cmd_stat() installs it so
 * that the signal is delivered to, but has no effect on, this process.
 */
static void skip_signal(int signo)
{
	(void)signo;
}
541
Ingo Molnarddcacfa2009-04-20 15:37:32 +0200542int cmd_stat(int argc, char **argv, const char *prefix)
543{
Ingo Molnar58d7e992009-05-15 11:03:23 +0200544 sigset_t blocked;
545
Ingo Molnarddcacfa2009-04-20 15:37:32 +0200546 page_size = sysconf(_SC_PAGE_SIZE);
547
548 process_options(argc, argv);
549
550 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
551 assert(nr_cpus <= MAX_NR_CPUS);
552 assert(nr_cpus >= 0);
553
Ingo Molnar58d7e992009-05-15 11:03:23 +0200554 /*
555 * We dont want to block the signals - that would cause
556 * child tasks to inherit that and Ctrl-C would not work.
557 * What we want is for Ctrl-C to work in the exec()-ed
558 * task, but being ignored by perf stat itself:
559 */
560 signal(SIGINT, skip_signal);
561 signal(SIGALRM, skip_signal);
562 signal(SIGABRT, skip_signal);
563
Ingo Molnarddcacfa2009-04-20 15:37:32 +0200564 return do_perfstat(argc, argv);
565}