Merge tag 'perf-core-for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core

Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo:

User visible changes:

  - IPC and cycle accounting in 'perf annotate'. (Andi Kleen)

  - Display cycles in branch sort mode in 'perf report'. (Andi Kleen)

  - Add total time column to 'perf trace' syscall stats summary. (Milian Woff)

Infrastructure changes:

  - PMU helpers to use in Intel PT. (Adrian Hunter)

  - Fix perf-with-kcore script not to split args with spaces. (Adrian Hunter)

  - Add empty Build files for some more architectures. (Ben Hutchings)

  - Move 'perf stat' config variables to a struct to allow using some
    of its functions in more places. (Jiri Olsa)

  - Add DWARF register names for 'xtensa' arch. (Max Filippov)

  - Implement BPF programs attached to uprobes. (Wang Nan)

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 1063c85..ed27917 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -243,6 +243,7 @@
 	TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
 	TRACE_EVENT_FL_TRACEPOINT_BIT,
 	TRACE_EVENT_FL_KPROBE_BIT,
+	TRACE_EVENT_FL_UPROBE_BIT,
 };
 
 /*
@@ -257,6 +258,7 @@
  *  USE_CALL_FILTER - For trace internal events, don't use file filter
  *  TRACEPOINT    - Event is a tracepoint
  *  KPROBE        - Event is a kprobe
+ *  UPROBE        - Event is a uprobe
  */
 enum {
 	TRACE_EVENT_FL_FILTERED		= (1 << TRACE_EVENT_FL_FILTERED_BIT),
@@ -267,8 +269,11 @@
 	TRACE_EVENT_FL_USE_CALL_FILTER	= (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
 	TRACE_EVENT_FL_TRACEPOINT	= (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
 	TRACE_EVENT_FL_KPROBE		= (1 << TRACE_EVENT_FL_KPROBE_BIT),
+	TRACE_EVENT_FL_UPROBE		= (1 << TRACE_EVENT_FL_UPROBE_BIT),
 };
 
+#define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE)
+
 struct trace_event_call {
 	struct list_head	list;
 	struct trace_event_class *class;
@@ -542,7 +547,7 @@
 		event_triggers_post_call(file, tt);
 }
 
-#ifdef CONFIG_BPF_SYSCALL
+#ifdef CONFIG_BPF_EVENTS
 unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
 #else
 static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bdea129..77f9e5d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6846,8 +6846,8 @@
 	if (event->tp_event->prog)
 		return -EEXIST;
 
-	if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
-		/* bpf programs can only be attached to kprobes */
+	if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
+		/* bpf programs can only be attached to u/kprobes */
 		return -EINVAL;
 
 	prog = bpf_prog_get(prog_fd);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 3b9a48a..1153c43 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -434,7 +434,7 @@
 
 config BPF_EVENTS
 	depends on BPF_SYSCALL
-	depends on KPROBE_EVENT
+	depends on KPROBE_EVENT || UPROBE_EVENT
 	bool
 	default y
 	help
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index aa1ea7b..f97479f 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1095,11 +1095,15 @@
 {
 	struct trace_event_call *call = &tu->tp.call;
 	struct uprobe_trace_entry_head *entry;
+	struct bpf_prog *prog = call->prog;
 	struct hlist_head *head;
 	void *data;
 	int size, esize;
 	int rctx;
 
+	if (prog && !trace_call_bpf(prog, regs))
+		return;
+
 	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
 
 	size = esize + tu->tp.size + dsize;
@@ -1289,6 +1293,7 @@
 		return -ENODEV;
 	}
 
+	call->flags = TRACE_EVENT_FL_UPROBE;
 	call->class->reg = trace_uprobe_register;
 	call->data = tu;
 	ret = trace_add_event_call(call);
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index c33b69f..960da20 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -109,6 +109,7 @@
 	- mispredict: "N" for predicted branch, "Y" for mispredicted branch
 	- in_tx: branch in TSX transaction
 	- abort: TSX transaction abort.
+	- cycles: Cycles in basic block
 
 	And default sort keys are changed to comm, dso_from, symbol_from, dso_to
 	and symbol_to, see '--branch-stack'.
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index 776aec4..f6a23eb 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -208,6 +208,27 @@
 	This option sets the time out limit. The default value is 500 ms.
 
 
+-b::
+--branch-any::
+	Enable taken branch stack sampling. Any type of taken branch may be sampled.
+	This is a shortcut for --branch-filter any. See --branch-filter for more infos.
+
+-j::
+--branch-filter::
+	Enable taken branch stack sampling. Each sample captures a series of consecutive
+	taken branches. The number of branches captured with each sample depends on the
+	underlying hardware, the type of branches of interest, and the executed code.
+	It is possible to select the types of branches captured by enabling filters.
+	For a full list of modifiers please see the perf record manpage.
+
+	The option requires at least one branch type among any, any_call, any_ret, ind_call, cond.
+	The privilege levels may be omitted, in which case, the privilege levels of the associated
+	event are applied to the branch filter. Both kernel (k) and hypervisor (hv) privilege
+	levels are subject to permissions.  When sampling on multiple events, branch stack sampling
+	is enabled for all the sampling events. The sampled branch type is the same for all events.
+	The various filters must be specified as a comma separated list: --branch-filter any_ret,u,k
+	Note that this feature may not be available on all processors.
+
 INTERACTIVE PROMPTING KEYS
 --------------------------
 
diff --git a/tools/perf/arch/alpha/Build b/tools/perf/arch/alpha/Build
new file mode 100644
index 0000000..1bb8bf6
--- /dev/null
+++ b/tools/perf/arch/alpha/Build
@@ -0,0 +1 @@
+# empty
diff --git a/tools/perf/arch/mips/Build b/tools/perf/arch/mips/Build
new file mode 100644
index 0000000..1bb8bf6
--- /dev/null
+++ b/tools/perf/arch/mips/Build
@@ -0,0 +1 @@
+# empty
diff --git a/tools/perf/arch/parisc/Build b/tools/perf/arch/parisc/Build
new file mode 100644
index 0000000..1bb8bf6
--- /dev/null
+++ b/tools/perf/arch/parisc/Build
@@ -0,0 +1 @@
+# empty
diff --git a/tools/perf/arch/xtensa/Build b/tools/perf/arch/xtensa/Build
new file mode 100644
index 0000000..54afe4a
--- /dev/null
+++ b/tools/perf/arch/xtensa/Build
@@ -0,0 +1 @@
+libperf-y += util/
diff --git a/tools/perf/arch/xtensa/Makefile b/tools/perf/arch/xtensa/Makefile
new file mode 100644
index 0000000..7fbca17
--- /dev/null
+++ b/tools/perf/arch/xtensa/Makefile
@@ -0,0 +1,3 @@
+ifndef NO_DWARF
+PERF_HAVE_DWARF_REGS := 1
+endif
diff --git a/tools/perf/arch/xtensa/util/Build b/tools/perf/arch/xtensa/util/Build
new file mode 100644
index 0000000..954e287
--- /dev/null
+++ b/tools/perf/arch/xtensa/util/Build
@@ -0,0 +1 @@
+libperf-$(CONFIG_DWARF) += dwarf-regs.o
diff --git a/tools/perf/arch/xtensa/util/dwarf-regs.c b/tools/perf/arch/xtensa/util/dwarf-regs.c
new file mode 100644
index 0000000..4dba76b
--- /dev/null
+++ b/tools/perf/arch/xtensa/util/dwarf-regs.c
@@ -0,0 +1,25 @@
+/*
+ * Mapping of DWARF debug register numbers into register names.
+ *
+ * Copyright (c) 2015 Cadence Design Systems Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <stddef.h>
+#include <dwarf-regs.h>
+
+#define XTENSA_MAX_REGS 16
+
+const char *xtensa_regs_table[XTENSA_MAX_REGS] = {
+	"a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7",
+	"a8", "a9", "a10", "a11", "a12", "a13", "a14", "a15",
+};
+
+const char *get_arch_regstr(unsigned int n)
+{
+	return n < XTENSA_MAX_REGS ? xtensa_regs_table[n] : NULL;
+}
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 2c1bec3..467a23b 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -187,6 +187,7 @@
 			 * symbol, free he->ms.sym->src to signal we already
 			 * processed this symbol.
 			 */
+			zfree(&notes->src->cycles_hist);
 			zfree(&notes->src);
 		}
 	}
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 95a4771..3a9d1b6 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -53,6 +53,7 @@
 	bool			mem_mode;
 	bool			header;
 	bool			header_only;
+	bool			nonany_branch_mode;
 	int			max_stack;
 	struct perf_read_values	show_threads_values;
 	const char		*pretty_printing_style;
@@ -102,6 +103,9 @@
 	if (!ui__has_annotation())
 		return 0;
 
+	hist__account_cycles(iter->sample->branch_stack, al, iter->sample,
+			     rep->nonany_branch_mode);
+
 	if (sort__mode == SORT_MODE__BRANCH) {
 		bi = he->branch_info;
 		err = addr_map_symbol__inc_samples(&bi->from, evsel->idx);
@@ -258,6 +262,12 @@
 		else
 			callchain_param.record_mode = CALLCHAIN_FP;
 	}
+
+	/* ??? handle more cases than just ANY? */
+	if (!(perf_evlist__combined_branch_type(session->evlist) &
+				PERF_SAMPLE_BRANCH_ANY))
+		rep->nonany_branch_mode = true;
+
 	return 0;
 }
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index d99d850..a054ddc 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -101,8 +101,6 @@
 
 static int			run_count			=  1;
 static bool			no_inherit			= false;
-static bool			scale				=  true;
-static enum aggr_mode		aggr_mode			= AGGR_GLOBAL;
 static volatile pid_t		child_pid			= -1;
 static bool			null_run			=  false;
 static int			detailed_run			=  0;
@@ -112,11 +110,9 @@
 static const char		*csv_sep			= NULL;
 static bool			csv_output			= false;
 static bool			group				= false;
-static FILE			*output				= NULL;
 static const char		*pre_cmd			= NULL;
 static const char		*post_cmd			= NULL;
 static bool			sync_run			= false;
-static unsigned int		interval			= 0;
 static unsigned int		initial_delay			= 0;
 static unsigned int		unit_width			= 4; /* strlen("unit") */
 static bool			forever				= false;
@@ -126,6 +122,11 @@
 
 static volatile int done = 0;
 
+static struct perf_stat_config stat_config = {
+	.aggr_mode	= AGGR_GLOBAL,
+	.scale		= true,
+};
+
 static inline void diff_timespec(struct timespec *r, struct timespec *a,
 				 struct timespec *b)
 {
@@ -148,7 +149,7 @@
 {
 	struct perf_event_attr *attr = &evsel->attr;
 
-	if (scale)
+	if (stat_config.scale)
 		attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
 				    PERF_FORMAT_TOTAL_TIME_RUNNING;
 
@@ -178,142 +179,6 @@
 	return 0;
 }
 
-static void zero_per_pkg(struct perf_evsel *counter)
-{
-	if (counter->per_pkg_mask)
-		memset(counter->per_pkg_mask, 0, MAX_NR_CPUS);
-}
-
-static int check_per_pkg(struct perf_evsel *counter, int cpu, bool *skip)
-{
-	unsigned long *mask = counter->per_pkg_mask;
-	struct cpu_map *cpus = perf_evsel__cpus(counter);
-	int s;
-
-	*skip = false;
-
-	if (!counter->per_pkg)
-		return 0;
-
-	if (cpu_map__empty(cpus))
-		return 0;
-
-	if (!mask) {
-		mask = zalloc(MAX_NR_CPUS);
-		if (!mask)
-			return -ENOMEM;
-
-		counter->per_pkg_mask = mask;
-	}
-
-	s = cpu_map__get_socket(cpus, cpu);
-	if (s < 0)
-		return -1;
-
-	*skip = test_and_set_bit(s, mask) == 1;
-	return 0;
-}
-
-static int
-process_counter_values(struct perf_evsel *evsel, int cpu, int thread,
-		       struct perf_counts_values *count)
-{
-	struct perf_counts_values *aggr = &evsel->counts->aggr;
-	static struct perf_counts_values zero;
-	bool skip = false;
-
-	if (check_per_pkg(evsel, cpu, &skip)) {
-		pr_err("failed to read per-pkg counter\n");
-		return -1;
-	}
-
-	if (skip)
-		count = &zero;
-
-	switch (aggr_mode) {
-	case AGGR_THREAD:
-	case AGGR_CORE:
-	case AGGR_SOCKET:
-	case AGGR_NONE:
-		if (!evsel->snapshot)
-			perf_evsel__compute_deltas(evsel, cpu, thread, count);
-		perf_counts_values__scale(count, scale, NULL);
-		if (aggr_mode == AGGR_NONE)
-			perf_stat__update_shadow_stats(evsel, count->values, cpu);
-		break;
-	case AGGR_GLOBAL:
-		aggr->val += count->val;
-		if (scale) {
-			aggr->ena += count->ena;
-			aggr->run += count->run;
-		}
-	default:
-		break;
-	}
-
-	return 0;
-}
-
-static int process_counter_maps(struct perf_evsel *counter)
-{
-	int nthreads = thread_map__nr(counter->threads);
-	int ncpus = perf_evsel__nr_cpus(counter);
-	int cpu, thread;
-
-	if (counter->system_wide)
-		nthreads = 1;
-
-	for (thread = 0; thread < nthreads; thread++) {
-		for (cpu = 0; cpu < ncpus; cpu++) {
-			if (process_counter_values(counter, cpu, thread,
-						   perf_counts(counter->counts, cpu, thread)))
-				return -1;
-		}
-	}
-
-	return 0;
-}
-
-static int process_counter(struct perf_evsel *counter)
-{
-	struct perf_counts_values *aggr = &counter->counts->aggr;
-	struct perf_stat *ps = counter->priv;
-	u64 *count = counter->counts->aggr.values;
-	int i, ret;
-
-	aggr->val = aggr->ena = aggr->run = 0;
-	init_stats(ps->res_stats);
-
-	if (counter->per_pkg)
-		zero_per_pkg(counter);
-
-	ret = process_counter_maps(counter);
-	if (ret)
-		return ret;
-
-	if (aggr_mode != AGGR_GLOBAL)
-		return 0;
-
-	if (!counter->snapshot)
-		perf_evsel__compute_deltas(counter, -1, -1, aggr);
-	perf_counts_values__scale(aggr, scale, &counter->counts->scaled);
-
-	for (i = 0; i < 3; i++)
-		update_stats(&ps->res_stats[i], count[i]);
-
-	if (verbose) {
-		fprintf(output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
-			perf_evsel__name(counter), count[0], count[1], count[2]);
-	}
-
-	/*
-	 * Save the full runtime - to allow normalization during printout:
-	 */
-	perf_stat__update_shadow_stats(counter, count, 0);
-
-	return 0;
-}
-
 /*
  * Read out the results of a single counter:
  * do not aggregate counts across CPUs in system-wide mode
@@ -351,7 +216,7 @@
 		if (read_counter(counter))
 			pr_warning("failed to read counter %s\n", counter->name);
 
-		if (process_counter(counter))
+		if (perf_stat_process_counter(&stat_config, counter))
 			pr_warning("failed to process counter %s\n", counter->name);
 
 		if (close_counters) {
@@ -402,6 +267,7 @@
 
 static int __run_perf_stat(int argc, const char **argv)
 {
+	int interval = stat_config.interval;
 	char msg[512];
 	unsigned long long t0, t1;
 	struct perf_evsel *counter;
@@ -545,13 +411,13 @@
 static void print_running(u64 run, u64 ena)
 {
 	if (csv_output) {
-		fprintf(output, "%s%" PRIu64 "%s%.2f",
+		fprintf(stat_config.output, "%s%" PRIu64 "%s%.2f",
 					csv_sep,
 					run,
 					csv_sep,
 					ena ? 100.0 * run / ena : 100.0);
 	} else if (run != ena) {
-		fprintf(output, "  (%.2f%%)", 100.0 * run / ena);
+		fprintf(stat_config.output, "  (%.2f%%)", 100.0 * run / ena);
 	}
 }
 
@@ -560,9 +426,9 @@
 	double pct = rel_stddev_stats(total, avg);
 
 	if (csv_output)
-		fprintf(output, "%s%.2f%%", csv_sep, pct);
+		fprintf(stat_config.output, "%s%.2f%%", csv_sep, pct);
 	else if (pct)
-		fprintf(output, "  ( +-%6.2f%% )", pct);
+		fprintf(stat_config.output, "  ( +-%6.2f%% )", pct);
 }
 
 static void print_noise(struct perf_evsel *evsel, double avg)
@@ -578,9 +444,9 @@
 
 static void aggr_printout(struct perf_evsel *evsel, int id, int nr)
 {
-	switch (aggr_mode) {
+	switch (stat_config.aggr_mode) {
 	case AGGR_CORE:
-		fprintf(output, "S%d-C%*d%s%*d%s",
+		fprintf(stat_config.output, "S%d-C%*d%s%*d%s",
 			cpu_map__id_to_socket(id),
 			csv_output ? 0 : -8,
 			cpu_map__id_to_cpu(id),
@@ -590,7 +456,7 @@
 			csv_sep);
 		break;
 	case AGGR_SOCKET:
-		fprintf(output, "S%*d%s%*d%s",
+		fprintf(stat_config.output, "S%*d%s%*d%s",
 			csv_output ? 0 : -5,
 			id,
 			csv_sep,
@@ -599,12 +465,12 @@
 			csv_sep);
 			break;
 	case AGGR_NONE:
-		fprintf(output, "CPU%*d%s",
+		fprintf(stat_config.output, "CPU%*d%s",
 			csv_output ? 0 : -4,
 			perf_evsel__cpus(evsel)->map[id], csv_sep);
 		break;
 	case AGGR_THREAD:
-		fprintf(output, "%*s-%*d%s",
+		fprintf(stat_config.output, "%*s-%*d%s",
 			csv_output ? 0 : 16,
 			thread_map__comm(evsel->threads, id),
 			csv_output ? 0 : -8,
@@ -619,6 +485,7 @@
 
 static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 {
+	FILE *output = stat_config.output;
 	double msecs = avg / 1e6;
 	const char *fmt_v, *fmt_n;
 	char name[25];
@@ -643,7 +510,7 @@
 	if (evsel->cgrp)
 		fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
 
-	if (csv_output || interval)
+	if (csv_output || stat_config.interval)
 		return;
 
 	if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
@@ -655,6 +522,7 @@
 
 static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 {
+	FILE *output = stat_config.output;
 	double sc =  evsel->scale;
 	const char *fmt;
 	int cpu = cpu_map__id_to_cpu(id);
@@ -670,7 +538,7 @@
 
 	aggr_printout(evsel, id, nr);
 
-	if (aggr_mode == AGGR_GLOBAL)
+	if (stat_config.aggr_mode == AGGR_GLOBAL)
 		cpu = 0;
 
 	fprintf(output, fmt, avg, csv_sep);
@@ -685,14 +553,16 @@
 	if (evsel->cgrp)
 		fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
 
-	if (csv_output || interval)
+	if (csv_output || stat_config.interval)
 		return;
 
-	perf_stat__print_shadow_stats(output, evsel, avg, cpu, aggr_mode);
+	perf_stat__print_shadow_stats(output, evsel, avg, cpu,
+				      stat_config.aggr_mode);
 }
 
 static void print_aggr(char *prefix)
 {
+	FILE *output = stat_config.output;
 	struct perf_evsel *counter;
 	int cpu, cpu2, s, s2, id, nr;
 	double uval;
@@ -761,6 +631,7 @@
 
 static void print_aggr_thread(struct perf_evsel *counter, char *prefix)
 {
+	FILE *output = stat_config.output;
 	int nthreads = thread_map__nr(counter->threads);
 	int ncpus = cpu_map__nr(counter->cpus);
 	int cpu, thread;
@@ -799,6 +670,7 @@
  */
 static void print_counter_aggr(struct perf_evsel *counter, char *prefix)
 {
+	FILE *output = stat_config.output;
 	struct perf_stat *ps = counter->priv;
 	double avg = avg_stats(&ps->res_stats[0]);
 	int scaled = counter->counts->scaled;
@@ -850,6 +722,7 @@
  */
 static void print_counter(struct perf_evsel *counter, char *prefix)
 {
+	FILE *output = stat_config.output;
 	u64 ena, run, val;
 	double uval;
 	int cpu;
@@ -904,12 +777,13 @@
 
 static void print_interval(char *prefix, struct timespec *ts)
 {
+	FILE *output = stat_config.output;
 	static int num_print_interval;
 
 	sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, csv_sep);
 
 	if (num_print_interval == 0 && !csv_output) {
-		switch (aggr_mode) {
+		switch (stat_config.aggr_mode) {
 		case AGGR_SOCKET:
 			fprintf(output, "#           time socket cpus             counts %*s events\n", unit_width, "unit");
 			break;
@@ -934,6 +808,7 @@
 
 static void print_header(int argc, const char **argv)
 {
+	FILE *output = stat_config.output;
 	int i;
 
 	fflush(stdout);
@@ -963,6 +838,8 @@
 
 static void print_footer(void)
 {
+	FILE *output = stat_config.output;
+
 	if (!null_run)
 		fprintf(output, "\n");
 	fprintf(output, " %17.9f seconds time elapsed",
@@ -977,6 +854,7 @@
 
 static void print_counters(struct timespec *ts, int argc, const char **argv)
 {
+	int interval = stat_config.interval;
 	struct perf_evsel *counter;
 	char buf[64], *prefix = NULL;
 
@@ -985,7 +863,7 @@
 	else
 		print_header(argc, argv);
 
-	switch (aggr_mode) {
+	switch (stat_config.aggr_mode) {
 	case AGGR_CORE:
 	case AGGR_SOCKET:
 		print_aggr(prefix);
@@ -1009,14 +887,14 @@
 	if (!interval && !csv_output)
 		print_footer();
 
-	fflush(output);
+	fflush(stat_config.output);
 }
 
 static volatile int signr = -1;
 
 static void skip_signal(int signo)
 {
-	if ((child_pid == -1) || interval)
+	if ((child_pid == -1) || stat_config.interval)
 		done = 1;
 
 	signr = signo;
@@ -1064,7 +942,7 @@
 
 static int perf_stat_init_aggr_mode(void)
 {
-	switch (aggr_mode) {
+	switch (stat_config.aggr_mode) {
 	case AGGR_SOCKET:
 		if (cpu_map__build_socket_map(evsel_list->cpus, &aggr_map)) {
 			perror("cannot build socket map");
@@ -1270,7 +1148,7 @@
 		    "system-wide collection from all CPUs"),
 	OPT_BOOLEAN('g', "group", &group,
 		    "put the counters into a counter group"),
-	OPT_BOOLEAN('c', "scale", &scale, "scale/normalize counters"),
+	OPT_BOOLEAN('c', "scale", &stat_config.scale, "scale/normalize counters"),
 	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show counter open errors, etc)"),
 	OPT_INTEGER('r', "repeat", &run_count,
@@ -1286,7 +1164,7 @@
 			   stat__set_big_num),
 	OPT_STRING('C', "cpu", &target.cpu_list, "cpu",
 		    "list of cpus to monitor in system-wide"),
-	OPT_SET_UINT('A', "no-aggr", &aggr_mode,
+	OPT_SET_UINT('A', "no-aggr", &stat_config.aggr_mode,
 		    "disable CPU count aggregation", AGGR_NONE),
 	OPT_STRING('x', "field-separator", &csv_sep, "separator",
 		   "print counts with custom separator"),
@@ -1300,13 +1178,13 @@
 			"command to run prior to the measured command"),
 	OPT_STRING(0, "post", &post_cmd, "command",
 			"command to run after to the measured command"),
-	OPT_UINTEGER('I', "interval-print", &interval,
+	OPT_UINTEGER('I', "interval-print", &stat_config.interval,
 		    "print counts at regular interval in ms (>= 100)"),
-	OPT_SET_UINT(0, "per-socket", &aggr_mode,
+	OPT_SET_UINT(0, "per-socket", &stat_config.aggr_mode,
 		     "aggregate counts per processor socket", AGGR_SOCKET),
-	OPT_SET_UINT(0, "per-core", &aggr_mode,
+	OPT_SET_UINT(0, "per-core", &stat_config.aggr_mode,
 		     "aggregate counts per physical processor core", AGGR_CORE),
-	OPT_SET_UINT(0, "per-thread", &aggr_mode,
+	OPT_SET_UINT(0, "per-thread", &stat_config.aggr_mode,
 		     "aggregate counts per thread", AGGR_THREAD),
 	OPT_UINTEGER('D', "delay", &initial_delay,
 		     "ms to wait before starting measurement after program start"),
@@ -1318,6 +1196,8 @@
 	};
 	int status = -EINVAL, run_idx;
 	const char *mode;
+	FILE *output = stderr;
+	unsigned int interval;
 
 	setlocale(LC_ALL, "");
 
@@ -1328,7 +1208,8 @@
 	argc = parse_options(argc, argv, options, stat_usage,
 		PARSE_OPT_STOP_AT_NON_OPTION);
 
-	output = stderr;
+	interval = stat_config.interval;
+
 	if (output_name && strcmp(output_name, "-"))
 		output = NULL;
 
@@ -1365,6 +1246,8 @@
 		}
 	}
 
+	stat_config.output = output;
+
 	if (csv_sep) {
 		csv_output = true;
 		if (!strcmp(csv_sep, "\\t"))
@@ -1399,7 +1282,7 @@
 		run_count = 1;
 	}
 
-	if ((aggr_mode == AGGR_THREAD) && !target__has_task(&target)) {
+	if ((stat_config.aggr_mode == AGGR_THREAD) && !target__has_task(&target)) {
 		fprintf(stderr, "The --per-thread option is only available "
 			"when monitoring via -p -t options.\n");
 		parse_options_usage(NULL, options, "p", 1);
@@ -1411,7 +1294,8 @@
 	 * no_aggr, cgroup are for system-wide only
 	 * --per-thread is aggregated per thread, we dont mix it with cpu mode
 	 */
-	if (((aggr_mode != AGGR_GLOBAL && aggr_mode != AGGR_THREAD) || nr_cgroups) &&
+	if (((stat_config.aggr_mode != AGGR_GLOBAL &&
+	      stat_config.aggr_mode != AGGR_THREAD) || nr_cgroups) &&
 	    !target__has_cpu(&target)) {
 		fprintf(stderr, "both cgroup and no-aggregation "
 			"modes only available in system-wide mode\n");
@@ -1444,7 +1328,7 @@
 	 * Initialize thread_map with comm names,
 	 * so we could print it out on output.
 	 */
-	if (aggr_mode == AGGR_THREAD)
+	if (stat_config.aggr_mode == AGGR_THREAD)
 		thread_map__read_comms(evsel_list->threads);
 
 	if (interval && interval < 100) {
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index ecf3197..bfe24f1 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -40,6 +40,7 @@
 #include "util/xyarray.h"
 #include "util/sort.h"
 #include "util/intlist.h"
+#include "util/parse-branch-options.h"
 #include "arch/common.h"
 
 #include "util/debug.h"
@@ -695,6 +696,8 @@
 		perf_top__record_precise_ip(top, he, evsel->idx, ip);
 	}
 
+	hist__account_cycles(iter->sample->branch_stack, al, iter->sample,
+		     !(top->record_opts.branch_stack & PERF_SAMPLE_BRANCH_ANY));
 	return 0;
 }
 
@@ -1171,6 +1174,12 @@
 		   "don't try to adjust column width, use these fixed values"),
 	OPT_UINTEGER(0, "proc-map-timeout", &opts->proc_map_timeout,
 			"per thread proc mmap processing timeout in ms"),
+	OPT_CALLBACK_NOOPT('b', "branch-any", &opts->branch_stack,
+		     "branch any", "sample any taken branches",
+		     parse_branch_stack),
+	OPT_CALLBACK('j', "branch-filter", &opts->branch_stack,
+		     "branch filter mask", "branch stack filter modes",
+		     parse_branch_stack),
 	OPT_END()
 	};
 	const char * const top_usage[] = {
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index a474970..a25048c 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -2773,9 +2773,9 @@
 
 	printed += fprintf(fp, "\n");
 
-	printed += fprintf(fp, "   syscall            calls      min       avg       max      stddev\n");
-	printed += fprintf(fp, "                               (msec)    (msec)    (msec)        (%%)\n");
-	printed += fprintf(fp, "   --------------- -------- --------- --------- ---------     ------\n");
+	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
+	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
+	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
 
 	/* each int_node is a syscall */
 	while (inode) {
@@ -2792,8 +2792,8 @@
 
 			sc = &trace->syscalls.table[inode->i];
 			printed += fprintf(fp, "   %-15s", sc->name);
-			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f",
-					   n, min, avg);
+			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
+					   n, avg * n, min, avg);
 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
 		}
 
diff --git a/tools/perf/perf-with-kcore.sh b/tools/perf/perf-with-kcore.sh
index c7ff90a..7e47a7c 100644
--- a/tools/perf/perf-with-kcore.sh
+++ b/tools/perf/perf-with-kcore.sh
@@ -50,7 +50,7 @@
 	fi
 
 	rm -f perf.data.junk
-	("$PERF" record -o perf.data.junk $PERF_OPTIONS -- sleep 60) >/dev/null 2>/dev/null &
+	("$PERF" record -o perf.data.junk "${PERF_OPTIONS[@]}" -- sleep 60) >/dev/null 2>/dev/null &
 	PERF_PID=$!
 
 	# Need to make sure that perf has started
@@ -160,18 +160,18 @@
 			echo "*** WARNING *** /proc/sys/kernel/kptr_restrict prevents access to kernel addresses" >&2
 		fi
 
-		if echo "$PERF_OPTIONS" | grep -q ' -a \|^-a \| -a$\|^-a$\| --all-cpus \|^--all-cpus \| --all-cpus$\|^--all-cpus$' ; then
+		if echo "${PERF_OPTIONS[@]}" | grep -q ' -a \|^-a \| -a$\|^-a$\| --all-cpus \|^--all-cpus \| --all-cpus$\|^--all-cpus$' ; then
 			echo "*** WARNING *** system-wide tracing without root access will not be able to read all necessary information from /proc" >&2
 		fi
 
-		if echo "$PERF_OPTIONS" | grep -q 'intel_pt\|intel_bts\| -I\|^-I' ; then
+		if echo "${PERF_OPTIONS[@]}" | grep -q 'intel_pt\|intel_bts\| -I\|^-I' ; then
 			if [ "$(cat /proc/sys/kernel/perf_event_paranoid)" -gt -1 ] ; then
 				echo "*** WARNING *** /proc/sys/kernel/perf_event_paranoid restricts buffer size and tracepoint (sched_switch) use" >&2
 			fi
 
-			if echo "$PERF_OPTIONS" | grep -q ' --per-thread \|^--per-thread \| --per-thread$\|^--per-thread$' ; then
+			if echo "${PERF_OPTIONS[@]}" | grep -q ' --per-thread \|^--per-thread \| --per-thread$\|^--per-thread$' ; then
 				true
-			elif echo "$PERF_OPTIONS" | grep -q ' -t \|^-t \| -t$\|^-t$' ; then
+			elif echo "${PERF_OPTIONS[@]}" | grep -q ' -t \|^-t \| -t$\|^-t$' ; then
 				true
 			elif [ ! -r /sys/kernel/debug -o ! -x /sys/kernel/debug ] ; then
 				echo "*** WARNING *** /sys/kernel/debug permissions prevent tracepoint (sched_switch) use" >&2
@@ -193,8 +193,8 @@
 
 	mkdir "$PERF_DATA_DIR"
 
-	echo "$PERF record -o $PERF_DATA_DIR/perf.data $PERF_OPTIONS -- $*"
-	"$PERF" record -o "$PERF_DATA_DIR/perf.data" $PERF_OPTIONS -- $* || true
+	echo "$PERF record -o $PERF_DATA_DIR/perf.data ${PERF_OPTIONS[@]} -- $@"
+	"$PERF" record -o "$PERF_DATA_DIR/perf.data" "${PERF_OPTIONS[@]}" -- "$@" || true
 
 	if rmdir "$PERF_DATA_DIR" > /dev/null 2>/dev/null ; then
 		exit 1
@@ -209,8 +209,8 @@
 {
 	find_perf
 	check_buildid_cache_permissions
-	echo "$PERF $PERF_SUB_COMMAND -i $PERF_DATA_DIR/perf.data --kallsyms=$PERF_DATA_DIR/kcore_dir/kallsyms $*"
-	"$PERF" $PERF_SUB_COMMAND -i "$PERF_DATA_DIR/perf.data" "--kallsyms=$PERF_DATA_DIR/kcore_dir/kallsyms" $*
+	echo "$PERF $PERF_SUB_COMMAND -i $PERF_DATA_DIR/perf.data --kallsyms=$PERF_DATA_DIR/kcore_dir/kallsyms $@"
+	"$PERF" $PERF_SUB_COMMAND -i "$PERF_DATA_DIR/perf.data" "--kallsyms=$PERF_DATA_DIR/kcore_dir/kallsyms" "$@"
 }
 
 if [ "$1" = "fix_buildid_cache_permissions" ] ; then
@@ -234,7 +234,7 @@
 case "$PERF_SUB_COMMAND" in
 "record")
 	while [ "$1" != "--" ] ; do
-		PERF_OPTIONS+="$1 "
+		PERF_OPTIONS+=("$1")
 		shift || break
 	done
 	if [ "$1" != "--" ] ; then
@@ -242,16 +242,16 @@
 		usage
 	fi
 	shift
-	record $*
+	record "$@"
 ;;
 "script")
-	subcommand $*
+	subcommand "$@"
 ;;
 "report")
-	subcommand $*
+	subcommand "$@"
 ;;
 "inject")
-	subcommand $*
+	subcommand "$@"
 ;;
 *)
 	usage
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index 5995a8b..b5fc847 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -16,6 +16,9 @@
 	u64		nr;
 };
 
+#define IPC_WIDTH 6
+#define CYCLES_WIDTH 6
+
 struct browser_disasm_line {
 	struct rb_node			rb_node;
 	u32				idx;
@@ -53,6 +56,7 @@
 	int		    max_jump_sources;
 	int		    nr_jumps;
 	bool		    searching_backwards;
+	bool		    have_cycles;
 	u8		    addr_width;
 	u8		    jumps_width;
 	u8		    target_width;
@@ -96,6 +100,15 @@
 	 return ui_browser__set_color(&browser->b, color);
 }
 
+static int annotate_browser__pcnt_width(struct annotate_browser *ab)
+{
+	int w = 7 * ab->nr_events;
+
+	if (ab->have_cycles)
+		w += IPC_WIDTH + CYCLES_WIDTH;
+	return w;
+}
+
 static void annotate_browser__write(struct ui_browser *browser, void *entry, int row)
 {
 	struct annotate_browser *ab = container_of(browser, struct annotate_browser, b);
@@ -106,7 +119,7 @@
 			     (!current_entry || (browser->use_navkeypressed &&
 					         !browser->navkeypressed)));
 	int width = browser->width, printed;
-	int i, pcnt_width = 7 * ab->nr_events;
+	int i, pcnt_width = annotate_browser__pcnt_width(ab);
 	double percent_max = 0.0;
 	char bf[256];
 
@@ -116,19 +129,34 @@
 	}
 
 	if (dl->offset != -1 && percent_max != 0.0) {
-		for (i = 0; i < ab->nr_events; i++) {
-			ui_browser__set_percent_color(browser,
-						      bdl->samples[i].percent,
-						      current_entry);
-			if (annotate_browser__opts.show_total_period)
-				slsmg_printf("%6" PRIu64 " ",
-					     bdl->samples[i].nr);
-			else
-				slsmg_printf("%6.2f ", bdl->samples[i].percent);
+		if (percent_max != 0.0) {
+			for (i = 0; i < ab->nr_events; i++) {
+				ui_browser__set_percent_color(browser,
+							bdl->samples[i].percent,
+							current_entry);
+				if (annotate_browser__opts.show_total_period)
+					slsmg_printf("%6" PRIu64 " ",
+						     bdl->samples[i].nr);
+				else
+					slsmg_printf("%6.2f ", bdl->samples[i].percent);
+			}
+		} else {
+			slsmg_write_nstring(" ", 7 * ab->nr_events);
 		}
 	} else {
 		ui_browser__set_percent_color(browser, 0, current_entry);
-		slsmg_write_nstring(" ", pcnt_width);
+		slsmg_write_nstring(" ", 7 * ab->nr_events);
+	}
+	if (ab->have_cycles) {
+		if (dl->ipc)
+			slsmg_printf("%*.2f ", IPC_WIDTH - 1, dl->ipc);
+		else
+			slsmg_write_nstring(" ", IPC_WIDTH);
+		if (dl->cycles)
+			slsmg_printf("%*" PRIu64 " ",
+				     CYCLES_WIDTH - 1, dl->cycles);
+		else
+			slsmg_write_nstring(" ", CYCLES_WIDTH);
 	}
 
 	SLsmg_write_char(' ');
@@ -231,7 +259,7 @@
 	unsigned int from, to;
 	struct map_symbol *ms = ab->b.priv;
 	struct symbol *sym = ms->sym;
-	u8 pcnt_width = 7;
+	u8 pcnt_width = annotate_browser__pcnt_width(ab);
 
 	/* PLT symbols contain external offsets */
 	if (strstr(sym->name, "@plt"))
@@ -255,8 +283,6 @@
 		to = (u64)btarget->idx;
 	}
 
-	pcnt_width *= ab->nr_events;
-
 	ui_browser__set_color(browser, HE_COLORSET_CODE);
 	__ui_browser__line_arrow(browser, pcnt_width + 2 + ab->addr_width,
 				 from, to);
@@ -266,9 +292,7 @@
 {
 	struct annotate_browser *ab = container_of(browser, struct annotate_browser, b);
 	int ret = ui_browser__list_head_refresh(browser);
-	int pcnt_width;
-
-	pcnt_width = 7 * ab->nr_events;
+	int pcnt_width = annotate_browser__pcnt_width(ab);
 
 	if (annotate_browser__opts.jump_arrows)
 		annotate_browser__draw_current_jump(browser);
@@ -390,7 +414,7 @@
 				max_percent = bpos->samples[i].percent;
 		}
 
-		if (max_percent < 0.01) {
+		if (max_percent < 0.01 && pos->ipc == 0) {
 			RB_CLEAR_NODE(&bpos->rb_node);
 			continue;
 		}
@@ -869,6 +893,75 @@
 	return map_symbol__tui_annotate(&he->ms, evsel, hbt);
 }
 
+
+static unsigned count_insn(struct annotate_browser *browser, u64 start, u64 end)
+{
+	unsigned n_insn = 0;
+	u64 offset;
+
+	for (offset = start; offset <= end; offset++) {
+		if (browser->offsets[offset])
+			n_insn++;
+	}
+	return n_insn;
+}
+
+static void count_and_fill(struct annotate_browser *browser, u64 start, u64 end,
+			   struct cyc_hist *ch)
+{
+	unsigned n_insn;
+	u64 offset;
+
+	n_insn = count_insn(browser, start, end);
+	if (n_insn && ch->num && ch->cycles) {
+		float ipc = n_insn / ((double)ch->cycles / (double)ch->num);
+
+		/* Hide data when there are too many overlaps. */
+		if (ch->reset >= 0x7fff || ch->reset >= ch->num / 2)
+			return;
+
+		for (offset = start; offset <= end; offset++) {
+			struct disasm_line *dl = browser->offsets[offset];
+
+			if (dl)
+				dl->ipc = ipc;
+		}
+	}
+}
+
+/*
+ * This should probably be in util/annotate.c to share with the tty
+ * annotate, but right now we need the per byte offsets arrays,
+ * which are only here.
+ */
+static void annotate__compute_ipc(struct annotate_browser *browser, size_t size,
+			   struct symbol *sym)
+{
+	u64 offset;
+	struct annotation *notes = symbol__annotation(sym);
+
+	if (!notes->src || !notes->src->cycles_hist)
+		return;
+
+	pthread_mutex_lock(&notes->lock);
+	for (offset = 0; offset < size; ++offset) {
+		struct cyc_hist *ch;
+
+		ch = &notes->src->cycles_hist[offset];
+		if (ch && ch->cycles) {
+			struct disasm_line *dl;
+
+			if (ch->have_start)
+				count_and_fill(browser, ch->start, offset, ch);
+			dl = browser->offsets[offset];
+			if (dl && ch->num_aggr)
+				dl->cycles = ch->cycles_aggr / ch->num_aggr;
+			browser->have_cycles = true;
+		}
+	}
+	pthread_mutex_unlock(&notes->lock);
+}
+
 static void annotate_browser__mark_jump_targets(struct annotate_browser *browser,
 						size_t size)
 {
@@ -991,6 +1084,7 @@
 	}
 
 	annotate_browser__mark_jump_targets(&browser, size);
+	annotate__compute_ipc(&browser, size, sym);
 
 	browser.addr_width = browser.target_width = browser.min_addr_width = hex_width(size);
 	browser.max_addr_width = hex_width(sym->end);
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 03b7bc70..e0b6146 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -473,17 +473,73 @@
 	return 0;
 }
 
+/* The cycles histogram is lazily allocated. */
+static int symbol__alloc_hist_cycles(struct symbol *sym)
+{
+	struct annotation *notes = symbol__annotation(sym);
+	const size_t size = symbol__size(sym);
+
+	notes->src->cycles_hist = calloc(size, sizeof(struct cyc_hist));
+	if (notes->src->cycles_hist == NULL)
+		return -1;
+	return 0;
+}
+
 void symbol__annotate_zero_histograms(struct symbol *sym)
 {
 	struct annotation *notes = symbol__annotation(sym);
 
 	pthread_mutex_lock(&notes->lock);
-	if (notes->src != NULL)
+	if (notes->src != NULL) {
 		memset(notes->src->histograms, 0,
 		       notes->src->nr_histograms * notes->src->sizeof_sym_hist);
+		if (notes->src->cycles_hist)
+			memset(notes->src->cycles_hist, 0,
+				symbol__size(sym) * sizeof(struct cyc_hist));
+	}
 	pthread_mutex_unlock(&notes->lock);
 }
 
+static int __symbol__account_cycles(struct annotation *notes,
+				    u64 start,
+				    unsigned offset, unsigned cycles,
+				    unsigned have_start)
+{
+	struct cyc_hist *ch;
+
+	ch = notes->src->cycles_hist;
+	/*
+	 * For now we can only account one basic block per
+	 * final jump. But multiple could be overlapping.
+	 * Always account the longest one. So when
+	 * a shorter one has been already seen throw it away.
+	 *
+	 * We separately always account the full cycles.
+	 */
+	ch[offset].num_aggr++;
+	ch[offset].cycles_aggr += cycles;
+
+	if (!have_start && ch[offset].have_start)
+		return 0;
+	if (ch[offset].num) {
+		if (have_start && (!ch[offset].have_start ||
+				   ch[offset].start > start)) {
+			ch[offset].have_start = 0;
+			ch[offset].cycles = 0;
+			ch[offset].num = 0;
+			if (ch[offset].reset < 0xffff)
+				ch[offset].reset++;
+		} else if (have_start &&
+			   ch[offset].start < start)
+			return 0;
+	}
+	ch[offset].have_start = have_start;
+	ch[offset].start = start;
+	ch[offset].cycles += cycles;
+	ch[offset].num++;
+	return 0;
+}
+
 static int __symbol__inc_addr_samples(struct symbol *sym, struct map *map,
 				      struct annotation *notes, int evidx, u64 addr)
 {
@@ -506,7 +562,7 @@
 	return 0;
 }
 
-static struct annotation *symbol__get_annotation(struct symbol *sym)
+static struct annotation *symbol__get_annotation(struct symbol *sym, bool cycles)
 {
 	struct annotation *notes = symbol__annotation(sym);
 
@@ -514,6 +570,10 @@
 		if (symbol__alloc_hist(sym) < 0)
 			return NULL;
 	}
+	if (!notes->src->cycles_hist && cycles) {
+		if (symbol__alloc_hist_cycles(sym) < 0)
+			return NULL;
+	}
 	return notes;
 }
 
@@ -524,12 +584,73 @@
 
 	if (sym == NULL)
 		return 0;
-	notes = symbol__get_annotation(sym);
+	notes = symbol__get_annotation(sym, false);
 	if (notes == NULL)
 		return -ENOMEM;
 	return __symbol__inc_addr_samples(sym, map, notes, evidx, addr);
 }
 
+static int symbol__account_cycles(u64 addr, u64 start,
+				  struct symbol *sym, unsigned cycles)
+{
+	struct annotation *notes;
+	unsigned offset;
+
+	if (sym == NULL)
+		return 0;
+	notes = symbol__get_annotation(sym, true);
+	if (notes == NULL)
+		return -ENOMEM;
+	if (addr < sym->start || addr >= sym->end)
+		return -ERANGE;
+
+	if (start) {
+		if (start < sym->start || start >= sym->end)
+			return -ERANGE;
+		if (start >= addr)
+			start = 0;
+	}
+	offset = addr - sym->start;
+	return __symbol__account_cycles(notes,
+					start ? start - sym->start : 0,
+					offset, cycles,
+					!!start);
+}
+
+int addr_map_symbol__account_cycles(struct addr_map_symbol *ams,
+				    struct addr_map_symbol *start,
+				    unsigned cycles)
+{
+	unsigned long saddr = 0;
+	int err;
+
+	if (!cycles)
+		return 0;
+
+	/*
+	 * Only set start when IPC can be computed. We can only
+	 * compute it when the basic block is completely in a single
+	 * function.
+	 * Special case the case when the jump is elsewhere, but
+	 * it starts on the function start.
+	 */
+	if (start &&
+		(start->sym == ams->sym ||
+		 (ams->sym &&
+		   start->addr == ams->sym->start + ams->map->start)))
+		saddr = start->al_addr;
+	if (saddr == 0)
+		pr_debug2("BB with bad start: addr %lx start %lx sym %lx saddr %lx\n",
+			ams->addr,
+			start ? start->addr : 0,
+			ams->sym ? ams->sym->start + ams->map->start : 0,
+			saddr);
+	err = symbol__account_cycles(ams->al_addr, saddr, ams->sym, cycles);
+	if (err)
+		pr_debug2("account_cycles failed %d\n", err);
+	return err;
+}
+
 int addr_map_symbol__inc_samples(struct addr_map_symbol *ams, int evidx)
 {
 	return symbol__inc_addr_samples(ams->sym, ams->map, evidx, ams->al_addr);
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 7e78e6c..e999609 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -59,6 +59,8 @@
 	char		    *name;
 	struct ins	    *ins;
 	int		    line_nr;
+	float		    ipc;
+	u64		    cycles;
 	struct ins_operands ops;
 };
 
@@ -79,6 +81,17 @@
 	u64		addr[0];
 };
 
+struct cyc_hist {
+	u64	start;
+	u64	cycles;
+	u64	cycles_aggr;
+	u32	num;
+	u32	num_aggr;
+	u8	have_start;
+	/* 1 byte padding */
+	u16	reset;
+};
+
 struct source_line_samples {
 	double		percent;
 	double		percent_sum;
@@ -97,6 +110,7 @@
  * @histogram: Array of addr hit histograms per event being monitored
  * @lines: If 'print_lines' is specified, per source code line percentages
  * @source: source parsed from a disassembler like objdump -dS
+ * @cyc_hist: Average cycles per basic block
  *
  * lines is allocated, percentages calculated and all sorted by percentage
  * when the annotation is about to be presented, so the percentages are for
@@ -109,6 +123,7 @@
 	struct source_line *lines;
 	int    		   nr_histograms;
 	int    		   sizeof_sym_hist;
+	struct cyc_hist	   *cycles_hist;
 	struct sym_hist	   histograms[0];
 };
 
@@ -130,6 +145,10 @@
 
 int addr_map_symbol__inc_samples(struct addr_map_symbol *ams, int evidx);
 
+int addr_map_symbol__account_cycles(struct addr_map_symbol *ams,
+				    struct addr_map_symbol *start,
+				    unsigned cycles);
+
 int hist_entry__inc_addr_samples(struct hist_entry *he, int evidx, u64 addr);
 
 int symbol__alloc_hist(struct symbol *sym);
diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
index 83d9dd9..a25b360 100644
--- a/tools/perf/util/auxtrace.c
+++ b/tools/perf/util/auxtrace.c
@@ -942,6 +942,7 @@
 	struct itrace_synth_opts *synth_opts = opt->value;
 	const char *p;
 	char *endptr;
+	bool period_type_set = false;
 
 	synth_opts->set = true;
 
@@ -970,10 +971,12 @@
 				case 'i':
 					synth_opts->period_type =
 						PERF_ITRACE_PERIOD_INSTRUCTIONS;
+					period_type_set = true;
 					break;
 				case 't':
 					synth_opts->period_type =
 						PERF_ITRACE_PERIOD_TICKS;
+					period_type_set = true;
 					break;
 				case 'm':
 					synth_opts->period *= 1000;
@@ -986,6 +989,7 @@
 						goto out_err;
 					synth_opts->period_type =
 						PERF_ITRACE_PERIOD_NANOSECS;
+					period_type_set = true;
 					break;
 				case '\0':
 					goto out;
@@ -1039,7 +1043,7 @@
 	}
 out:
 	if (synth_opts->instructions) {
-		if (!synth_opts->period_type)
+		if (!period_type_set)
 			synth_opts->period_type =
 					PERF_ITRACE_DEFAULT_PERIOD_TYPE;
 		if (!synth_opts->period)
diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c
index 2da5581..86d9c73 100644
--- a/tools/perf/util/debug.c
+++ b/tools/perf/util/debug.c
@@ -36,6 +36,11 @@
 	return ret;
 }
 
+int veprintf(int level, int var, const char *fmt, va_list args)
+{
+	return _eprintf(level, var, fmt, args);
+}
+
 int eprintf(int level, int var, const char *fmt, ...)
 {
 	va_list args;
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h
index caac2fd..8b9a088 100644
--- a/tools/perf/util/debug.h
+++ b/tools/perf/util/debug.h
@@ -50,6 +50,7 @@
 
 int eprintf(int level, int var, const char *fmt, ...) __attribute__((format(printf, 3, 4)));
 int eprintf_time(int level, int var, u64 t, const char *fmt, ...) __attribute__((format(printf, 4, 5)));
+int veprintf(int level, int var, const char *fmt, va_list args);
 
 int perf_debug_option(const char *str);
 
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 4bb2ae8..f729df5 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -134,7 +134,8 @@
 	u64 predicted:1;
 	u64 in_tx:1;
 	u64 abort:1;
-	u64 reserved:60;
+	u64 cycles:16;
+	u64 reserved:44;
 };
 
 struct branch_entry {
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 3b9f411..373f65b 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1273,6 +1273,16 @@
 	return __perf_evlist__combined_sample_type(evlist);
 }
 
+u64 perf_evlist__combined_branch_type(struct perf_evlist *evlist)
+{
+	struct perf_evsel *evsel;
+	u64 branch_type = 0;
+
+	evlist__for_each(evlist, evsel)
+		branch_type |= evsel->attr.branch_sample_type;
+	return branch_type;
+}
+
 bool perf_evlist__valid_read_format(struct perf_evlist *evlist)
 {
 	struct perf_evsel *first = perf_evlist__first(evlist), *pos = first;
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index a8930b68..3977570 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -165,6 +165,7 @@
 u64 perf_evlist__read_format(struct perf_evlist *evlist);
 u64 __perf_evlist__combined_sample_type(struct perf_evlist *evlist);
 u64 perf_evlist__combined_sample_type(struct perf_evlist *evlist);
+u64 perf_evlist__combined_branch_type(struct perf_evlist *evlist);
 bool perf_evlist__sample_id_all(struct perf_evlist *evlist);
 u16 perf_evlist__id_hdr_size(struct perf_evlist *evlist);
 
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 6f28d53..a6e9ddd 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -618,7 +618,8 @@
 	 * and not events sampled. Thus we use a pseudo period of 1.
 	 */
 	he = __hists__add_entry(hists, al, iter->parent, &bi[i], NULL,
-				1, 1, 0, true);
+				1, bi->flags.cycles ? bi->flags.cycles : 1,
+				0, true);
 	if (he == NULL)
 		return -ENOMEM;
 
@@ -1414,6 +1415,39 @@
 	return 0;
 }
 
+void hist__account_cycles(struct branch_stack *bs, struct addr_location *al,
+			  struct perf_sample *sample, bool nonany_branch_mode)
+{
+	struct branch_info *bi;
+
+	/* If we have branch cycles always annotate them. */
+	if (bs && bs->nr && bs->entries[0].flags.cycles) {
+		int i;
+
+		bi = sample__resolve_bstack(sample, al);
+		if (bi) {
+			struct addr_map_symbol *prev = NULL;
+
+			/*
+			 * Ignore errors, still want to process the
+			 * other entries.
+			 *
+			 * For non standard branch modes always
+			 * force no IPC (prev == NULL)
+			 *
+			 * Note that perf stores branches reversed from
+			 * program order!
+			 */
+			for (i = bs->nr - 1; i >= 0; i--) {
+				addr_map_symbol__account_cycles(&bi[i].from,
+					nonany_branch_mode ? NULL : prev,
+					bi[i].flags.cycles);
+				prev = &bi[i].to;
+			}
+			free(bi);
+		}
+	}
+}
 
 size_t perf_evlist__fprintf_nr_events(struct perf_evlist *evlist, FILE *fp)
 {
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 5ed8d9c..e2f712f 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -47,6 +47,7 @@
 	HISTC_MEM_SNOOP,
 	HISTC_MEM_DCACHELINE,
 	HISTC_TRANSACTION,
+	HISTC_CYCLES,
 	HISTC_NR_COLS, /* Last entry */
 };
 
@@ -349,6 +350,9 @@
 
 unsigned int hists__sort_list_width(struct hists *hists);
 
+void hist__account_cycles(struct branch_stack *bs, struct addr_location *al,
+			  struct perf_sample *sample, bool nonany_branch_mode);
+
 struct option;
 int parse_filter_percentage(const struct option *opt __maybe_unused,
 			    const char *arg, int unset __maybe_unused);
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index a6cb9af..828936d 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1168,7 +1168,7 @@
 		 * Maximum error index indent, we will cut
 		 * the event string if it's bigger.
 		 */
-		int max_err_idx = 10;
+		int max_err_idx = 13;
 
 		/*
 		 * Let's be specific with the message when
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index b615cdf..d4b0e64 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -542,7 +542,7 @@
 }
 
 static struct perf_pmu_format *
-pmu_find_format(struct list_head *formats, char *name)
+pmu_find_format(struct list_head *formats, const char *name)
 {
 	struct perf_pmu_format *format;
 
@@ -553,6 +553,21 @@
 	return NULL;
 }
 
+__u64 perf_pmu__format_bits(struct list_head *formats, const char *name)
+{
+	struct perf_pmu_format *format = pmu_find_format(formats, name);
+	__u64 bits = 0;
+	int fbit;
+
+	if (!format)
+		return 0;
+
+	for_each_set_bit(fbit, format->bits, PERF_PMU_FORMAT_BITS)
+		bits |= 1ULL << fbit;
+
+	return bits;
+}
+
 /*
  * Sets value based on the format definition (format parameter)
  * and unformated value (value parameter).
@@ -574,6 +589,18 @@
 	}
 }
 
+static __u64 pmu_format_max_value(const unsigned long *format)
+{
+	int w;
+
+	w = bitmap_weight(format, PERF_PMU_FORMAT_BITS);
+	if (!w)
+		return 0;
+	if (w < 64)
+		return (1ULL << w) - 1;
+	return -1;
+}
+
 /*
  * Term is a string term, and might be a param-term. Try to look up it's value
  * in the remaining terms.
@@ -647,7 +674,7 @@
 {
 	struct perf_pmu_format *format;
 	__u64 *vp;
-	__u64 val;
+	__u64 val, max_val;
 
 	/*
 	 * If this is a parameter we've already used for parameterized-eval,
@@ -713,6 +740,22 @@
 	} else
 		return -EINVAL;
 
+	max_val = pmu_format_max_value(format->bits);
+	if (val > max_val) {
+		if (err) {
+			err->idx = term->err_val;
+			if (asprintf(&err->str,
+				     "value too big for format, maximum is %llu",
+				     (unsigned long long)max_val) < 0)
+				err->str = strdup("value too big for format");
+			return -EINVAL;
+		}
+		/*
+		 * Assume we don't care if !err, in which case the value will be
+		 * silently truncated.
+		 */
+	}
+
 	pmu_format_value(format->bits, val, vp, zero);
 	return 0;
 }
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 7b9c8cf..5d7e844 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -54,6 +54,7 @@
 			   struct perf_event_attr *attr,
 			   struct list_head *head_terms,
 			   bool zero, struct parse_events_error *error);
+__u64 perf_pmu__format_bits(struct list_head *formats, const char *name);
 int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms,
 			  struct perf_pmu_info *info);
 struct list_head *perf_pmu__alias(struct perf_pmu *pmu,
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index 20f555d..83ee95e 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -106,6 +106,8 @@
 	struct strlist			*vars;	/* Available variables */
 };
 
+struct map;
+
 /* Command string to events */
 extern int parse_perf_probe_command(const char *cmd,
 				    struct perf_probe_event *pev);
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index f51eb54..18722e7 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -784,10 +784,18 @@
 
 	printf("... branch stack: nr:%" PRIu64 "\n", sample->branch_stack->nr);
 
-	for (i = 0; i < sample->branch_stack->nr; i++)
-		printf("..... %2"PRIu64": %016" PRIx64 " -> %016" PRIx64 "\n",
-			i, sample->branch_stack->entries[i].from,
-			sample->branch_stack->entries[i].to);
+	for (i = 0; i < sample->branch_stack->nr; i++) {
+		struct branch_entry *e = &sample->branch_stack->entries[i];
+
+		printf("..... %2"PRIu64": %016" PRIx64 " -> %016" PRIx64 " %hu cycles %s%s%s%s %x\n",
+			i, e->from, e->to,
+			e->flags.cycles,
+			e->flags.mispred ? "M" : " ",
+			e->flags.predicted ? "P" : " ",
+			e->flags.abort ? "A" : " ",
+			e->flags.in_tx ? "T" : " ",
+			(unsigned)e->flags.reserved);
+	}
 }
 
 static void regs_dump__printf(u64 mask, u64 *regs)
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 4c65a14..5177088 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -9,7 +9,7 @@
 const char	default_parent_pattern[] = "^sys_|^do_page_fault";
 const char	*parent_pattern = default_parent_pattern;
 const char	default_sort_order[] = "comm,dso,symbol";
-const char	default_branch_sort_order[] = "comm,dso_from,symbol_from,dso_to,symbol_to";
+const char	default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cycles";
 const char	default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked";
 const char	default_top_sort_order[] = "dso,symbol";
 const char	default_diff_sort_order[] = "dso,symbol";
@@ -526,6 +526,29 @@
 	return repsep_snprintf(bf, size, "%-*.*s", width, width, out);
 }
 
+static int64_t
+sort__cycles_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return left->branch_info->flags.cycles -
+		right->branch_info->flags.cycles;
+}
+
+static int hist_entry__cycles_snprintf(struct hist_entry *he, char *bf,
+				    size_t size, unsigned int width)
+{
+	if (he->branch_info->flags.cycles == 0)
+		return repsep_snprintf(bf, size, "%-*s", width, "-");
+	return repsep_snprintf(bf, size, "%-*hd", width,
+			       he->branch_info->flags.cycles);
+}
+
+struct sort_entry sort_cycles = {
+	.se_header	= "Basic Block Cycles",
+	.se_cmp		= sort__cycles_cmp,
+	.se_snprintf	= hist_entry__cycles_snprintf,
+	.se_width_idx	= HISTC_CYCLES,
+};
+
 /* --sort daddr_sym */
 static int64_t
 sort__daddr_cmp(struct hist_entry *left, struct hist_entry *right)
@@ -1190,6 +1213,7 @@
 	DIM(SORT_MISPREDICT, "mispredict", sort_mispredict),
 	DIM(SORT_IN_TX, "in_tx", sort_in_tx),
 	DIM(SORT_ABORT, "abort", sort_abort),
+	DIM(SORT_CYCLES, "cycles", sort_cycles),
 };
 
 #undef DIM
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index e97cd47..bc6c87a 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -185,6 +185,7 @@
 	SORT_MISPREDICT,
 	SORT_ABORT,
 	SORT_IN_TX,
+	SORT_CYCLES,
 
 	/* memory mode specific sort keys */
 	__SORT_MEMORY_MODE,
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index f2a0d15..c5c709c 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -238,3 +238,142 @@
 		perf_evsel__reset_counts(evsel);
 	}
 }
+
+static void zero_per_pkg(struct perf_evsel *counter)
+{
+	if (counter->per_pkg_mask)
+		memset(counter->per_pkg_mask, 0, MAX_NR_CPUS);
+}
+
+static int check_per_pkg(struct perf_evsel *counter, int cpu, bool *skip)
+{
+	unsigned long *mask = counter->per_pkg_mask;
+	struct cpu_map *cpus = perf_evsel__cpus(counter);
+	int s;
+
+	*skip = false;
+
+	if (!counter->per_pkg)
+		return 0;
+
+	if (cpu_map__empty(cpus))
+		return 0;
+
+	if (!mask) {
+		mask = zalloc(MAX_NR_CPUS);
+		if (!mask)
+			return -ENOMEM;
+
+		counter->per_pkg_mask = mask;
+	}
+
+	s = cpu_map__get_socket(cpus, cpu);
+	if (s < 0)
+		return -1;
+
+	*skip = test_and_set_bit(s, mask) == 1;
+	return 0;
+}
+
+static int
+process_counter_values(struct perf_stat_config *config, struct perf_evsel *evsel,
+		       int cpu, int thread,
+		       struct perf_counts_values *count)
+{
+	struct perf_counts_values *aggr = &evsel->counts->aggr;
+	static struct perf_counts_values zero;
+	bool skip = false;
+
+	if (check_per_pkg(evsel, cpu, &skip)) {
+		pr_err("failed to read per-pkg counter\n");
+		return -1;
+	}
+
+	if (skip)
+		count = &zero;
+
+	switch (config->aggr_mode) {
+	case AGGR_THREAD:
+	case AGGR_CORE:
+	case AGGR_SOCKET:
+	case AGGR_NONE:
+		if (!evsel->snapshot)
+			perf_evsel__compute_deltas(evsel, cpu, thread, count);
+		perf_counts_values__scale(count, config->scale, NULL);
+		if (config->aggr_mode == AGGR_NONE)
+			perf_stat__update_shadow_stats(evsel, count->values, cpu);
+		break;
+	case AGGR_GLOBAL:
+		aggr->val += count->val;
+		if (config->scale) {
+			aggr->ena += count->ena;
+			aggr->run += count->run;
+		}
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static int process_counter_maps(struct perf_stat_config *config,
+				struct perf_evsel *counter)
+{
+	int nthreads = thread_map__nr(counter->threads);
+	int ncpus = perf_evsel__nr_cpus(counter);
+	int cpu, thread;
+
+	if (counter->system_wide)
+		nthreads = 1;
+
+	for (thread = 0; thread < nthreads; thread++) {
+		for (cpu = 0; cpu < ncpus; cpu++) {
+			if (process_counter_values(config, counter, cpu, thread,
+						   perf_counts(counter->counts, cpu, thread)))
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+int perf_stat_process_counter(struct perf_stat_config *config,
+			      struct perf_evsel *counter)
+{
+	struct perf_counts_values *aggr = &counter->counts->aggr;
+	struct perf_stat *ps = counter->priv;
+	u64 *count = counter->counts->aggr.values;
+	int i, ret;
+
+	aggr->val = aggr->ena = aggr->run = 0;
+	init_stats(ps->res_stats);
+
+	if (counter->per_pkg)
+		zero_per_pkg(counter);
+
+	ret = process_counter_maps(config, counter);
+	if (ret)
+		return ret;
+
+	if (config->aggr_mode != AGGR_GLOBAL)
+		return 0;
+
+	if (!counter->snapshot)
+		perf_evsel__compute_deltas(counter, -1, -1, aggr);
+	perf_counts_values__scale(aggr, config->scale, &counter->counts->scaled);
+
+	for (i = 0; i < 3; i++)
+		update_stats(&ps->res_stats[i], count[i]);
+
+	if (verbose) {
+		fprintf(config->output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
+			perf_evsel__name(counter), count[0], count[1], count[2]);
+	}
+
+	/*
+	 * Save the full runtime - to allow normalization during printout:
+	 */
+	perf_stat__update_shadow_stats(counter, count, 0);
+
+	return 0;
+}
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 1cfbe0a..0b897b0 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -50,6 +50,13 @@
 	struct xyarray		  *values;
 };
 
+struct perf_stat_config {
+	enum aggr_mode	aggr_mode;
+	bool		scale;
+	FILE		*output;
+	unsigned int	interval;
+};
+
 static inline struct perf_counts_values*
 perf_counts(struct perf_counts *counts, int cpu, int thread)
 {
@@ -109,4 +116,7 @@
 int perf_evlist__alloc_stats(struct perf_evlist *evlist, bool alloc_raw);
 void perf_evlist__free_stats(struct perf_evlist *evlist);
 void perf_evlist__reset_stats(struct perf_evlist *evlist);
+
+int perf_stat_process_counter(struct perf_stat_config *config,
+			      struct perf_evsel *counter);
 #endif