Merge tag 'perf-core-for-mingo-20161005' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/urgent

Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo:

- Intel PT timestamp fixes (Adrian Hunter)

- Fix Intel JSON fixed counter conversions (Andi Kleen)

- Sync memcpy, cpufeatures and bpf headers with the kernel (Arnaldo Carvalho de Melo)

- Add some more tool tips (Donghyun Kim, Kim SeonYoung, Nambong Ha)

- Fix libtraceevent's kbuffer_read_at_offset() handling of offsets before or
  equal the first event (Namhyung Kim)

- Fix uretprobe probe placement on ppc64le (Ravi Bangoria)

- Support building C++ source files and add feature detection for g++,
  prep work for supporting a builtin clang/llvm, to remove the need for having
  that toolchain installed to automagically build BPF scriptlets that then
  gets uploaded to the kernel via sys_bpf() (Wang Nan)

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 92a8308..1188bc8 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -106,7 +106,6 @@
 #define X86_FEATURE_APERFMPERF	( 3*32+28) /* APERFMPERF */
 #define X86_FEATURE_EAGER_FPU	( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
 #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
-#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	( 4*32+ 0) /* "pni" SSE-3 */
diff --git a/tools/arch/x86/lib/memcpy_64.S b/tools/arch/x86/lib/memcpy_64.S
index 2ec0b0abb..49e6eba 100644
--- a/tools/arch/x86/lib/memcpy_64.S
+++ b/tools/arch/x86/lib/memcpy_64.S
@@ -181,11 +181,11 @@
 
 #ifndef CONFIG_UML
 /*
- * memcpy_mcsafe - memory copy with machine check exception handling
+ * memcpy_mcsafe_unrolled - memory copy with machine check exception handling
  * Note that we only catch machine checks when reading the source addresses.
  * Writes to target are posted and don't generate machine checks.
  */
-ENTRY(memcpy_mcsafe)
+ENTRY(memcpy_mcsafe_unrolled)
 	cmpl $8, %edx
 	/* Less than 8 bytes? Go to byte copy loop */
 	jb .L_no_whole_words
@@ -273,7 +273,7 @@
 .L_done_memcpy_trap:
 	xorq %rax, %rax
 	ret
-ENDPROC(memcpy_mcsafe)
+ENDPROC(memcpy_mcsafe_unrolled)
 
 	.section .fixup, "ax"
 	/* Return -EFAULT for any failure */
diff --git a/tools/build/Build.include b/tools/build/Build.include
index 0248938..1dcb95e 100644
--- a/tools/build/Build.include
+++ b/tools/build/Build.include
@@ -90,6 +90,7 @@
 # - per object C flags
 # - BUILD_STR macro to allow '-D"$(variable)"' constructs
 c_flags = -Wp,-MD,$(depfile),-MT,$@ $(CFLAGS) -D"BUILD_STR(s)=\#s" $(CFLAGS_$(basetarget).o) $(CFLAGS_$(obj))
+cxx_flags = -Wp,-MD,$(depfile),-MT,$@ $(CXXFLAGS) -D"BUILD_STR(s)=\#s" $(CXXFLAGS_$(basetarget).o) $(CXXFLAGS_$(obj))
 
 ###
 ## HOSTCC C flags
diff --git a/tools/build/Makefile.build b/tools/build/Makefile.build
index 190519a..99c0ccd 100644
--- a/tools/build/Makefile.build
+++ b/tools/build/Makefile.build
@@ -61,6 +61,9 @@
 quiet_cmd_host_cc_o_c = HOSTCC   $@
       cmd_host_cc_o_c = $(HOSTCC) $(host_c_flags) -c -o $@ $<
 
+quiet_cmd_cxx_o_c = CXX      $@
+      cmd_cxx_o_c = $(CXX) $(cxx_flags) -c -o $@ $<
+
 quiet_cmd_cpp_i_c = CPP      $@
       cmd_cpp_i_c = $(CC) $(c_flags) -E -o $@ $<
 
@@ -88,6 +91,10 @@
 	$(call rule_mkdir)
 	$(call if_changed_dep,$(host)cc_o_c)
 
+$(OUTPUT)%.o: %.cpp FORCE
+	$(call rule_mkdir)
+	$(call if_changed_dep,cxx_o_c)
+
 $(OUTPUT)%.o: %.S FORCE
 	$(call rule_mkdir)
 	$(call if_changed_dep,$(host)cc_o_c)
diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index a120c6b..ae52e02 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -7,7 +7,7 @@
 
 feature_check = $(eval $(feature_check_code))
 define feature_check_code
-  feature-$(1) := $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CFLAGS="$(EXTRA_CFLAGS) $(FEATURE_CHECK_CFLAGS-$(1))" LDFLAGS="$(LDFLAGS) $(FEATURE_CHECK_LDFLAGS-$(1))" -C $(feature_dir) $(OUTPUT_FEATURES)test-$1.bin >/dev/null 2>/dev/null && echo 1 || echo 0)
+  feature-$(1) := $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CFLAGS="$(EXTRA_CFLAGS) $(FEATURE_CHECK_CFLAGS-$(1))" CXXFLAGS="$(EXTRA_CXXFLAGS) $(FEATURE_CHECK_CXXFLAGS-$(1))" LDFLAGS="$(LDFLAGS) $(FEATURE_CHECK_LDFLAGS-$(1))" -C $(feature_dir) $(OUTPUT_FEATURES)test-$1.bin >/dev/null 2>/dev/null && echo 1 || echo 0)
 endef
 
 feature_set = $(eval $(feature_set_code))
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index a0b29a3..ac9c477 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -46,11 +46,13 @@
 	test-lzma.bin			\
 	test-bpf.bin			\
 	test-get_cpuid.bin		\
-	test-sdt.bin
+	test-sdt.bin			\
+	test-cxx.bin
 
 FILES := $(addprefix $(OUTPUT),$(FILES))
 
 CC := $(CROSS_COMPILE)gcc -MD
+CXX := $(CROSS_COMPILE)g++ -MD
 PKG_CONFIG := $(CROSS_COMPILE)pkg-config
 
 all: $(FILES)
@@ -58,6 +60,9 @@
 __BUILD = $(CC) $(CFLAGS) -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(LDFLAGS)
   BUILD = $(__BUILD) > $(@:.bin=.make.output) 2>&1
 
+__BUILDXX = $(CXX) $(CXXFLAGS) -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(@F)) $(LDFLAGS)
+  BUILDXX = $(__BUILDXX) > $(@:.bin=.make.output) 2>&1
+
 ###############################
 
 $(OUTPUT)test-all.bin:
@@ -217,6 +222,9 @@
 $(OUTPUT)test-sdt.bin:
 	$(BUILD)
 
+$(OUTPUT)test-cxx.bin:
+	$(BUILDXX) -std=gnu++11
+
 -include $(OUTPUT)*.d
 
 ###############################
diff --git a/tools/build/feature/test-cxx.cpp b/tools/build/feature/test-cxx.cpp
new file mode 100644
index 0000000..b1dee9a3
--- /dev/null
+++ b/tools/build/feature/test-cxx.cpp
@@ -0,0 +1,15 @@
+#include <iostream>
+#include <memory>
+
+static void print_str(std::string s)
+{
+	std::cout << s << std::endl;
+}
+
+int main()
+{
+	std::string s("Hello World!");
+	print_str(std::move(s));
+	std::cout << "|" << s << "|" << std::endl;
+	return 0;
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index da218fe..9e5fc16 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -339,7 +339,7 @@
 	BPF_FUNC_skb_change_type,
 
 	/**
-	 * bpf_skb_in_cgroup(skb, map, index) - Check cgroup2 membership of skb
+	 * bpf_skb_under_cgroup(skb, map, index) - Check cgroup2 membership of skb
 	 * @skb: pointer to skb
 	 * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
 	 * @index: index of the cgroup in the bpf_map
@@ -348,7 +348,7 @@
 	 *   == 1 skb succeeded the cgroup2 descendant test
 	 *    < 0 error
 	 */
-	BPF_FUNC_skb_in_cgroup,
+	BPF_FUNC_skb_under_cgroup,
 
 	/**
 	 * bpf_get_hash_recalc(skb)
diff --git a/tools/lib/traceevent/kbuffer-parse.c b/tools/lib/traceevent/kbuffer-parse.c
index 3bcada3..65984f1 100644
--- a/tools/lib/traceevent/kbuffer-parse.c
+++ b/tools/lib/traceevent/kbuffer-parse.c
@@ -622,6 +622,7 @@
 
 	/* Reset the buffer */
 	kbuffer_load_subbuffer(kbuf, kbuf->subbuffer);
+	data = kbuffer_read_event(kbuf, ts);
 
 	while (kbuf->curr < offset) {
 		data = kbuffer_next_event(kbuf, ts);
diff --git a/tools/perf/Documentation/tips.txt b/tools/perf/Documentation/tips.txt
index 5950b5a2..8a6479c 100644
--- a/tools/perf/Documentation/tips.txt
+++ b/tools/perf/Documentation/tips.txt
@@ -28,3 +28,7 @@
 See assembly instructions with percentage: perf annotate <symbol>
 If you prefer Intel style assembly, try: perf annotate -M intel
 For hierarchical output, try: perf report --hierarchy
+Order by the overhead of source file name and line number: perf report -s srcline
+System-wide collection from all CPUs: perf record -a
+Show current config key-value pairs: perf config --list
+Show user configuration overrides: perf config --user --list
diff --git a/tools/perf/arch/powerpc/util/sym-handling.c b/tools/perf/arch/powerpc/util/sym-handling.c
index ed9d5d1..1030a6e 100644
--- a/tools/perf/arch/powerpc/util/sym-handling.c
+++ b/tools/perf/arch/powerpc/util/sym-handling.c
@@ -82,7 +82,8 @@
 	 *
 	 * In addition, we shouldn't specify an offset for kretprobes.
 	 */
-	if (pev->point.offset || pev->point.retprobe || !map || !sym)
+	if (pev->point.offset || (!pev->uprobes && pev->point.retprobe) ||
+	    !map || !sym)
 		return;
 
 	lep_offset = PPC64_LOCAL_ENTRY_OFFSET(sym->arch_sym);
diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c
index 79c2133..41611d7 100644
--- a/tools/perf/pmu-events/jevents.c
+++ b/tools/perf/pmu-events/jevents.c
@@ -312,6 +312,8 @@
 	const char *event;
 } fixed[] = {
 	{ "inst_retired.any", "event=0xc0" },
+	{ "inst_retired.any_p", "event=0xc0" },
+	{ "cpu_clk_unhalted.ref", "event=0x0,umask=0x03" },
 	{ "cpu_clk_unhalted.thread", "event=0x3c" },
 	{ "cpu_clk_unhalted.thread_any", "event=0x3c,any=1" },
 	{ NULL, NULL},
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
index 7591a0c..16c06d3a 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
@@ -90,6 +90,7 @@
 	bool pge;
 	bool have_tma;
 	bool have_cyc;
+	bool fixup_last_mtc;
 	uint64_t pos;
 	uint64_t last_ip;
 	uint64_t ip;
@@ -586,10 +587,31 @@
 	uint64_t        tsc_timestamp;
 	uint64_t        timestamp;
 	bool            have_tma;
+	bool            fixup_last_mtc;
 	bool            from_mtc;
 	double          cbr_cyc_to_tsc;
 };
 
+/*
+ * MTC provides a 8-bit slice of CTC but the TMA packet only provides the lower
+ * 16 bits of CTC. If mtc_shift > 8 then some of the MTC bits are not in the CTC
+ * provided by the TMA packet. Fix-up the last_mtc calculated from the TMA
+ * packet by copying the missing bits from the current MTC assuming the least
+ * difference between the two, and that the current MTC comes after last_mtc.
+ */
+static void intel_pt_fixup_last_mtc(uint32_t mtc, int mtc_shift,
+				    uint32_t *last_mtc)
+{
+	uint32_t first_missing_bit = 1U << (16 - mtc_shift);
+	uint32_t mask = ~(first_missing_bit - 1);
+
+	*last_mtc |= mtc & mask;
+	if (*last_mtc >= mtc) {
+		*last_mtc -= first_missing_bit;
+		*last_mtc &= 0xff;
+	}
+}
+
 static int intel_pt_calc_cyc_cb(struct intel_pt_pkt_info *pkt_info)
 {
 	struct intel_pt_decoder *decoder = pkt_info->decoder;
@@ -619,6 +641,11 @@
 			return 0;
 
 		mtc = pkt_info->packet.payload;
+		if (decoder->mtc_shift > 8 && data->fixup_last_mtc) {
+			data->fixup_last_mtc = false;
+			intel_pt_fixup_last_mtc(mtc, decoder->mtc_shift,
+						&data->last_mtc);
+		}
 		if (mtc > data->last_mtc)
 			mtc_delta = mtc - data->last_mtc;
 		else
@@ -687,6 +714,7 @@
 
 		data->ctc_delta = 0;
 		data->have_tma = true;
+		data->fixup_last_mtc = true;
 
 		return 0;
 
@@ -753,6 +781,7 @@
 		.tsc_timestamp  = decoder->tsc_timestamp,
 		.timestamp      = decoder->timestamp,
 		.have_tma       = decoder->have_tma,
+		.fixup_last_mtc = decoder->fixup_last_mtc,
 		.from_mtc       = from_mtc,
 		.cbr_cyc_to_tsc = 0,
 	};
@@ -1271,6 +1300,7 @@
 	}
 	decoder->ctc_delta = 0;
 	decoder->have_tma = true;
+	decoder->fixup_last_mtc = true;
 	intel_pt_log("CTC timestamp " x64_fmt " last MTC %#x  CTC rem %#x\n",
 		     decoder->ctc_timestamp, decoder->last_mtc, ctc_rem);
 }
@@ -1285,6 +1315,12 @@
 
 	mtc = decoder->packet.payload;
 
+	if (decoder->mtc_shift > 8 && decoder->fixup_last_mtc) {
+		decoder->fixup_last_mtc = false;
+		intel_pt_fixup_last_mtc(mtc, decoder->mtc_shift,
+					&decoder->last_mtc);
+	}
+
 	if (mtc > decoder->last_mtc)
 		mtc_delta = mtc - decoder->last_mtc;
 	else
@@ -1353,6 +1389,8 @@
 			     timestamp, decoder->timestamp);
 	else
 		decoder->timestamp = timestamp;
+
+	decoder->timestamp_insn_cnt = 0;
 }
 
 /* Walk PSB+ packets when already in sync. */