blob: d7ace631fc4ff10a9b43152335b9cb958fb6f38d [file] [log] [blame]
Ingo Molnarddcacfa2009-04-20 15:37:32 +02001/*
2 * kerneltop.c: show top kernel functions - performance counters showcase
3
4 Build with:
5
6 cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
7
8 Sample output:
9
10------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12------------------------------------------------------------------------------
13
14 weight RIP kernel function
15 ______ ________________ _______________
16
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
29 */
30
31/*
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
33
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
38
39 Sample output:
40
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43 Performance counter stats for 'ls':
44
45 163516953 instructions
46 2295 cache-misses
47 2855182 branch-misses
48 */
49
50 /*
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52 *
53 * Improvements and fixes by:
54 *
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 * Paul Mackerras <paulus@samba.org>
60 *
61 * Released under the GPL v2. (and only v2, not any later version)
62 */
63
Ingo Molnar148be2c2009-04-27 08:02:14 +020064#include "util/util.h"
Ingo Molnarddcacfa2009-04-20 15:37:32 +020065
66#include <getopt.h>
67#include <assert.h>
68#include <fcntl.h>
69#include <stdio.h>
70#include <errno.h>
71#include <ctype.h>
72#include <time.h>
73#include <sched.h>
74#include <pthread.h>
75
76#include <sys/syscall.h>
77#include <sys/ioctl.h>
78#include <sys/poll.h>
79#include <sys/prctl.h>
80#include <sys/wait.h>
81#include <sys/uio.h>
82#include <sys/mman.h>
83
84#include <linux/unistd.h>
85#include <linux/types.h>
86
87#include "../../include/linux/perf_counter.h"
88
89
90/*
91 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
92 * counters in the current task.
93 */
94#define PR_TASK_PERF_COUNTERS_DISABLE 31
95#define PR_TASK_PERF_COUNTERS_ENABLE 32
96
97#define rdclock() \
98({ \
99 struct timespec ts; \
100 \
101 clock_gettime(CLOCK_MONOTONIC, &ts); \
102 ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
103})
104
105/*
106 * Pick up some kernel type conventions:
107 */
108#define __user
109#define asmlinkage
110
111#ifdef __x86_64__
112#define __NR_perf_counter_open 295
113#define rmb() asm volatile("lfence" ::: "memory")
114#define cpu_relax() asm volatile("rep; nop" ::: "memory");
115#endif
116
117#ifdef __i386__
118#define __NR_perf_counter_open 333
119#define rmb() asm volatile("lfence" ::: "memory")
120#define cpu_relax() asm volatile("rep; nop" ::: "memory");
121#endif
122
123#ifdef __powerpc__
124#define __NR_perf_counter_open 319
125#define rmb() asm volatile ("sync" ::: "memory")
126#define cpu_relax() asm volatile ("" ::: "memory");
127#endif
128
129#define unlikely(x) __builtin_expect(!!(x), 0)
130#define min(x, y) ({ \
131 typeof(x) _min1 = (x); \
132 typeof(y) _min2 = (y); \
133 (void) (&_min1 == &_min2); \
134 _min1 < _min2 ? _min1 : _min2; })
135
136extern asmlinkage int sys_perf_counter_open(
137 struct perf_counter_hw_event *hw_event_uptr __user,
138 pid_t pid,
139 int cpu,
140 int group_fd,
141 unsigned long flags);
142
143#define MAX_COUNTERS 64
144#define MAX_NR_CPUS 256
145
146#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
147
148static int system_wide = 0;
149
150static int nr_counters = 0;
151static __u64 event_id[MAX_COUNTERS] = {
152 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
153 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
154 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
155 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
156
157 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
158 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
159 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
160 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
161};
162static int default_interval = 100000;
163static int event_count[MAX_COUNTERS];
164static int fd[MAX_NR_CPUS][MAX_COUNTERS];
165
166static int tid = -1;
167static int profile_cpu = -1;
168static int nr_cpus = 0;
169static int nmi = 1;
170static int group = 0;
171static unsigned int page_size;
172
173static int zero;
174
175static int scale;
176
177static const unsigned int default_count[] = {
178 1000000,
179 1000000,
180 10000,
181 10000,
182 1000000,
183 10000,
184};
185
186static char *hw_event_names[] = {
187 "CPU cycles",
188 "instructions",
189 "cache references",
190 "cache misses",
191 "branches",
192 "branch misses",
193 "bus cycles",
194};
195
196static char *sw_event_names[] = {
197 "cpu clock ticks",
198 "task clock ticks",
199 "pagefaults",
200 "context switches",
201 "CPU migrations",
202 "minor faults",
203 "major faults",
204};
205
206struct event_symbol {
207 __u64 event;
208 char *symbol;
209};
210
211static struct event_symbol event_symbols[] = {
212 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
213 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
214 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
215 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
216 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
217 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
218 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
219 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
220 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
221
222 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
223 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
224 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
225 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
226 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
227 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
228 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
229 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
230 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
231 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
232};
233
234#define __PERF_COUNTER_FIELD(config, name) \
235 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
236
237#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
238#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
239#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
240#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
241
242static void display_events_help(void)
243{
244 unsigned int i;
245 __u64 e;
246
247 printf(
248 " -e EVENT --event=EVENT # symbolic-name abbreviations");
249
250 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
251 int type, id;
252
253 e = event_symbols[i].event;
254 type = PERF_COUNTER_TYPE(e);
255 id = PERF_COUNTER_ID(e);
256
257 printf("\n %d:%d: %-20s",
258 type, id, event_symbols[i].symbol);
259 }
260
261 printf("\n"
262 " rNNN: raw PMU events (eventsel+umask)\n\n");
263}
264
265static void display_help(void)
266{
267 printf(
268 "Usage: perfstat [<events...>] <cmd...>\n\n"
269 "PerfStat Options (up to %d event types can be specified):\n\n",
270 MAX_COUNTERS);
271
272 display_events_help();
273
274 printf(
275 " -l # scale counter values\n"
276 " -a # system-wide collection\n");
277 exit(0);
278}
279
280static char *event_name(int ctr)
281{
282 __u64 config = event_id[ctr];
283 int type = PERF_COUNTER_TYPE(config);
284 int id = PERF_COUNTER_ID(config);
285 static char buf[32];
286
287 if (PERF_COUNTER_RAW(config)) {
288 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
289 return buf;
290 }
291
292 switch (type) {
293 case PERF_TYPE_HARDWARE:
294 if (id < PERF_HW_EVENTS_MAX)
295 return hw_event_names[id];
296 return "unknown-hardware";
297
298 case PERF_TYPE_SOFTWARE:
299 if (id < PERF_SW_EVENTS_MAX)
300 return sw_event_names[id];
301 return "unknown-software";
302
303 default:
304 break;
305 }
306
307 return "unknown";
308}
309
310/*
311 * Each event can have multiple symbolic names.
312 * Symbolic names are (almost) exactly matched.
313 */
314static __u64 match_event_symbols(char *str)
315{
316 __u64 config, id;
317 int type;
318 unsigned int i;
319
320 if (sscanf(str, "r%llx", &config) == 1)
321 return config | PERF_COUNTER_RAW_MASK;
322
323 if (sscanf(str, "%d:%llu", &type, &id) == 2)
324 return EID(type, id);
325
326 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
327 if (!strncmp(str, event_symbols[i].symbol,
328 strlen(event_symbols[i].symbol)))
329 return event_symbols[i].event;
330 }
331
332 return ~0ULL;
333}
334
335static int parse_events(char *str)
336{
337 __u64 config;
338
339again:
340 if (nr_counters == MAX_COUNTERS)
341 return -1;
342
343 config = match_event_symbols(str);
344 if (config == ~0ULL)
345 return -1;
346
347 event_id[nr_counters] = config;
348 nr_counters++;
349
350 str = strstr(str, ",");
351 if (str) {
352 str++;
353 goto again;
354 }
355
356 return 0;
357}
358
359
360/*
361 * perfstat
362 */
363
364char fault_here[1000000];
365
366static void create_perfstat_counter(int counter)
367{
368 struct perf_counter_hw_event hw_event;
369
370 memset(&hw_event, 0, sizeof(hw_event));
371 hw_event.config = event_id[counter];
372 hw_event.record_type = 0;
373 hw_event.nmi = 0;
374 if (scale)
375 hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
376 PERF_FORMAT_TOTAL_TIME_RUNNING;
377
378 if (system_wide) {
379 int cpu;
380 for (cpu = 0; cpu < nr_cpus; cpu ++) {
381 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
382 if (fd[cpu][counter] < 0) {
383 printf("perfstat error: syscall returned with %d (%s)\n",
384 fd[cpu][counter], strerror(errno));
385 exit(-1);
386 }
387 }
388 } else {
389 hw_event.inherit = 1;
390 hw_event.disabled = 1;
391
392 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
393 if (fd[0][counter] < 0) {
394 printf("perfstat error: syscall returned with %d (%s)\n",
395 fd[0][counter], strerror(errno));
396 exit(-1);
397 }
398 }
399}
400
401int do_perfstat(int argc, char *argv[])
402{
403 unsigned long long t0, t1;
404 int counter;
405 ssize_t res;
406 int status;
407 int pid;
408
409 if (!system_wide)
410 nr_cpus = 1;
411
412 for (counter = 0; counter < nr_counters; counter++)
413 create_perfstat_counter(counter);
414
415 argc -= optind;
416 argv += optind;
417
418 if (!argc)
419 display_help();
420
421 /*
422 * Enable counters and exec the command:
423 */
424 t0 = rdclock();
425 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
426
427 if ((pid = fork()) < 0)
428 perror("failed to fork");
429 if (!pid) {
430 if (execvp(argv[0], argv)) {
431 perror(argv[0]);
432 exit(-1);
433 }
434 }
435 while (wait(&status) >= 0)
436 ;
437 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
438 t1 = rdclock();
439
440 fflush(stdout);
441
442 fprintf(stderr, "\n");
443 fprintf(stderr, " Performance counter stats for \'%s\':\n",
444 argv[0]);
445 fprintf(stderr, "\n");
446
447 for (counter = 0; counter < nr_counters; counter++) {
448 int cpu, nv;
449 __u64 count[3], single_count[3];
450 int scaled;
451
452 count[0] = count[1] = count[2] = 0;
453 nv = scale ? 3 : 1;
454 for (cpu = 0; cpu < nr_cpus; cpu ++) {
455 res = read(fd[cpu][counter],
456 single_count, nv * sizeof(__u64));
457 assert(res == nv * sizeof(__u64));
458
459 count[0] += single_count[0];
460 if (scale) {
461 count[1] += single_count[1];
462 count[2] += single_count[2];
463 }
464 }
465
466 scaled = 0;
467 if (scale) {
468 if (count[2] == 0) {
469 fprintf(stderr, " %14s %-20s\n",
470 "<not counted>", event_name(counter));
471 continue;
472 }
473 if (count[2] < count[1]) {
474 scaled = 1;
475 count[0] = (unsigned long long)
476 ((double)count[0] * count[1] / count[2] + 0.5);
477 }
478 }
479
480 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
481 event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
482
483 double msecs = (double)count[0] / 1000000;
484
485 fprintf(stderr, " %14.6f %-20s (msecs)",
486 msecs, event_name(counter));
487 } else {
488 fprintf(stderr, " %14Ld %-20s (events)",
489 count[0], event_name(counter));
490 }
491 if (scaled)
492 fprintf(stderr, " (scaled from %.2f%%)",
493 (double) count[2] / count[1] * 100);
494 fprintf(stderr, "\n");
495 }
496 fprintf(stderr, "\n");
497 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
498 (double)(t1-t0)/1e6);
499 fprintf(stderr, "\n");
500
501 return 0;
502}
503
504static void process_options(int argc, char **argv)
505{
506 int error = 0, counter;
507
508 for (;;) {
509 int option_index = 0;
510 /** Options for getopt */
511 static struct option long_options[] = {
512 {"count", required_argument, NULL, 'c'},
513 {"cpu", required_argument, NULL, 'C'},
514 {"delay", required_argument, NULL, 'd'},
515 {"dump_symtab", no_argument, NULL, 'D'},
516 {"event", required_argument, NULL, 'e'},
517 {"filter", required_argument, NULL, 'f'},
518 {"group", required_argument, NULL, 'g'},
519 {"help", no_argument, NULL, 'h'},
520 {"nmi", required_argument, NULL, 'n'},
521 {"munmap_info", no_argument, NULL, 'U'},
522 {"pid", required_argument, NULL, 'p'},
523 {"realtime", required_argument, NULL, 'r'},
524 {"scale", no_argument, NULL, 'l'},
525 {"symbol", required_argument, NULL, 's'},
526 {"stat", no_argument, NULL, 'S'},
527 {"vmlinux", required_argument, NULL, 'x'},
528 {"zero", no_argument, NULL, 'z'},
529 {NULL, 0, NULL, 0 }
530 };
531 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
532 long_options, &option_index);
533 if (c == -1)
534 break;
535
536 switch (c) {
537 case 'a': system_wide = 1; break;
538 case 'c': default_interval = atoi(optarg); break;
539 case 'C':
540 /* CPU and PID are mutually exclusive */
541 if (tid != -1) {
542 printf("WARNING: CPU switch overriding PID\n");
543 sleep(1);
544 tid = -1;
545 }
546 profile_cpu = atoi(optarg); break;
547
548 case 'e': error = parse_events(optarg); break;
549
550 case 'g': group = atoi(optarg); break;
551 case 'h': display_help(); break;
552 case 'l': scale = 1; break;
553 case 'n': nmi = atoi(optarg); break;
554 case 'p':
555 /* CPU and PID are mutually exclusive */
556 if (profile_cpu != -1) {
557 printf("WARNING: PID switch overriding CPU\n");
558 sleep(1);
559 profile_cpu = -1;
560 }
561 tid = atoi(optarg); break;
562 case 'z': zero = 1; break;
563 default: error = 1; break;
564 }
565 }
566 if (error)
567 display_help();
568
569 if (!nr_counters) {
570 nr_counters = 8;
571 }
572
573 for (counter = 0; counter < nr_counters; counter++) {
574 if (event_count[counter])
575 continue;
576
577 event_count[counter] = default_interval;
578 }
579}
580
581int cmd_stat(int argc, char **argv, const char *prefix)
582{
583 page_size = sysconf(_SC_PAGE_SIZE);
584
585 process_options(argc, argv);
586
587 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
588 assert(nr_cpus <= MAX_NR_CPUS);
589 assert(nr_cpus >= 0);
590
591 return do_perfstat(argc, argv);
592}