tools/perf/util/intel-tpebs.c - linux - Git at Google

 // SPDX-License-Identifier: GPL-2.0-only
 /*
  * intel-tpebs.c: Intel TPEBS support
  */


 #include <sys/param.h>
 #include <subcmd/run-command.h>
 #include <thread.h>
 #include "intel-tpebs.h"
 #include <linux/list.h>
 #include <linux/zalloc.h>
 #include <linux/err.h>
 #include "sample.h"
 #include "debug.h"
 #include "evlist.h"
 #include "evsel.h"
 #include "session.h"
 #include "tool.h"
 #include "cpumap.h"
 #include "metricgroup.h"
 #include <sys/stat.h>
 #include <sys/file.h>
 #include <poll.h>
 #include <math.h>

 /*
  * Name given to the forked perf record through -o and used as the perf_data
  * path by the sample reader thread.
  */
 #define PERF_DATA		"-"

 /* tpebs_start() returns immediately unless this is set. */
 bool tpebs_recording;
 /* pid of the forked perf record; -1 while no child is being tracked. */
 static pid_t tpebs_pid = -1;
 /* Number of entries that tpebs_start() has added to tpebs_results. */
 static size_t tpebs_event_size;
 /* List of struct tpebs_retire_lat, one node per retire_latency event. */
 static LIST_HEAD(tpebs_results);
 /* Thread running __sample_reader(); joined in tpebs_stop(). */
 static pthread_t tpebs_reader_thread;
 /* Descriptor of the forked perf record, allocated by prepare_run_command(). */
 static struct child_process *tpebs_cmd;

 /*
  * Per-event accumulator for retire_latency samples. One node is linked into
  * tpebs_results for every retire_latency event found by tpebs_start().
  */
 struct tpebs_retire_lat {
 	struct list_head nd;
 	/* Event name handed to perf record (modifier R replaced by p) */
 	const char *name;
 	/* Event name with the TPEBS modifier R */
 	const char *tpebs_name;
 	/* Count of retire_latency values found in sample data */
 	size_t count;
 	/*
 	 * Sum of all the retire_latency values in sample data. Kept as an
 	 * unsigned 64-bit quantity so that a long run with a large number of
 	 * samples cannot overflow the accumulator.
 	 */
 	unsigned long long sum;
 	/* Average of retire_latency, val = sum / count */
 	double val;
 };

 /*
  * get_perf_record_args - build the argument vector of the perf record child.
  * @record_argv: caller-allocated array that receives the arguments
  * @buf: the already formatted --control option
  * @cpumap_buf: CPU list as a string; must not be NULL
  *
  * Nothing is written past the last argument: the terminating NULL entry is
  * whatever the caller put in the array beforehand.
  *
  * Returns 0 on success, -ECANCELED when @cpumap_buf is NULL.
  */
 static int get_perf_record_args(const char **record_argv, char buf[],
 				const char *cpumap_buf)
 {
 	const char **arg = record_argv;
 	struct tpebs_retire_lat *entry;

 	pr_debug("tpebs: Prepare perf record for retire_latency\n");

 	*arg++ = "perf";
 	*arg++ = "record";
 	*arg++ = "-W";
 	*arg++ = "--synth=no";
 	*arg++ = buf;

 	if (cpumap_buf == NULL) {
 		pr_err("tpebs: Require cpumap list to run sampling\n");
 		return -ECANCELED;
 	}

 	/* -C is only passed when the CPU list is something other than "-1". */
 	if (strcmp(cpumap_buf, "-1") != 0) {
 		*arg++ = "-C";
 		*arg++ = cpumap_buf;
 	}

 	/* One "-e <name>" pair for every event queued on tpebs_results. */
 	list_for_each_entry(entry, &tpebs_results, nd) {
 		*arg++ = "-e";
 		*arg++ = entry->name;
 	}

 	*arg++ = "-o";
 	*arg++ = PERF_DATA;

 	return 0;
 }

 static int prepare_run_command(const char **argv)
 {
 	tpebs_cmd = zalloc(sizeof(struct child_process));
 	if (!tpebs_cmd)
 		return -ENOMEM;
 	tpebs_cmd->argv = argv;
 	tpebs_cmd->out = -1;
 	return 0;
 }

 /*
  * start_perf_record - fork the perf record child that samples retire_latency.
  * @control_fd: control pipe; the read end is handed to the child
  * @ack_fd: ack pipe; the write end is handed to the child
  * @cpumap_buf: CPU list as a string, forwarded to get_perf_record_args()
  *
  * On success tpebs_cmd describes the running child. On failure tpebs_cmd is
  * left NULL so that no half-initialised descriptor stays behind.
  *
  * Returns 0 on success, a non-zero error code otherwise.
  */
 static int start_perf_record(int control_fd[], int ack_fd[],
 				const char *cpumap_buf)
 {
 	const char **record_argv;
 	int ret;
 	char buf[32];

 	scnprintf(buf, sizeof(buf), "--control=fd:%d,%d", control_fd[0], ack_fd[1]);

 	/*
 	 * Up to 9 fixed arguments plus "-e <name>" per event; calloc() keeps
 	 * the remaining slots NULL, which terminates the vector.
 	 */
 	record_argv = calloc(12 + 2 * tpebs_event_size, sizeof(char *));
 	if (!record_argv)
 		return -ENOMEM;

 	ret = get_perf_record_args(record_argv, buf, cpumap_buf);
 	if (ret)
 		goto out;

 	ret = prepare_run_command(record_argv);
 	if (ret)
 		goto out;

 	ret = start_command(tpebs_cmd);
 	if (ret) {
 		/* The child never started, so nobody else will release this. */
 		free(tpebs_cmd);
 		tpebs_cmd = NULL;
 	} else {
 		/* record_argv is freed below; do not keep a dangling pointer. */
 		tpebs_cmd->argv = NULL;
 	}
 out:
 	free(record_argv);
 	return ret;
 }

 /*
  * process_sample_event - sample callback of the reader thread's perf_tool.
  *
  * Looks up the entry in tpebs_results whose name equals the name of @evsel
  * and folds the sample's retire_lat into its count, sum and running average.
  * Samples of events that are not on the list are ignored.
  *
  * Need to handle per core results? For now only the average retire latency
  * over all samples is kept.
  *
  * Always returns 0.
  */
 static int process_sample_event(const struct perf_tool *tool __maybe_unused,
 				union perf_event *event __maybe_unused,
 				struct perf_sample *sample,
 				struct evsel *evsel,
 				struct machine *machine __maybe_unused)
 {
 	const char *sampled = evsel__name(evsel);
 	struct tpebs_retire_lat *entry;

 	list_for_each_entry(entry, &tpebs_results, nd) {
 		if (strcmp(sampled, entry->name) != 0)
 			continue;

 		entry->count += 1;
 		entry->sum += sample->retire_lat;
 		entry->val = (double) entry->sum / entry->count;
 		break;
 	}

 	return 0;
 }

 /*
  * process_feature_event - feature callback of the reader thread's perf_tool.
  *
  * Feature ids below HEADER_LAST_FEATURE are forwarded to
  * perf_event__process_feature(); anything else is accepted and ignored.
  */
 static int process_feature_event(struct perf_session *session,
 				 union perf_event *event)
 {
 	if (event->feat.feat_id >= HEADER_LAST_FEATURE)
 		return 0;

 	return perf_event__process_feature(session, event);
 }

 static void *__sample_reader(void *arg)
 {
 	struct child_process *child = arg;
 	struct perf_session *session;
 	struct perf_data data = {
 		.mode = PERF_DATA_MODE_READ,
 		.path = PERF_DATA,
 		.file.fd = child->out,
 	};
 	struct perf_tool tool;

 	perf_tool__init(&tool, /*ordered_events=*/false);
 	tool.sample = process_sample_event;
 	tool.feature = process_feature_event;
 	tool.attr = perf_event__process_attr;

 	session = perf_session__new(&data, &tool);
 	if (IS_ERR(session))
 		return NULL;
 	perf_session__process_events(session);
 	perf_session__delete(session);

 	return NULL;
 }

 /*
  * tpebs_stop - stop the sample data read thread and the perf record process.
  *
  * Returns 0 when no child is being tracked or when the child terminated
  * because of a signal, otherwise the result of finish_command().
  */
 static int tpebs_stop(void)
 {
 	int err;

 	/* Like tpebs_start, tpebs_stop should only do its work once. */
 	if (tpebs_pid == -1)
 		return 0;

 	kill(tpebs_cmd->pid, SIGTERM);
 	tpebs_pid = -1;
 	pthread_join(tpebs_reader_thread, NULL);
 	close(tpebs_cmd->out);

 	err = finish_command(tpebs_cmd);
 	/* The SIGTERM was sent above, so death by signal is not an error. */
 	return err == -ERR_RUN_COMMAND_WAITPID_SIGNAL ? 0 : err;
 }

 /*
  * tpebs_start - start tpebs execution.
  * @evsel_list: retire_latency evsels in this list will be selected and sampled
  * to get the average retire_latency value.
  *
  * Does nothing and returns 0 when tpebs_recording is not set or when a perf
  * record child is already being tracked. Otherwise every retire_latency evsel
  * is queued on tpebs_results, perf record is forked, the reader thread is
  * started and the child is told to enable its events through the control
  * pipe; the function then waits up to 3000ms for the ack.
  *
  * Returns 0 on success and non-zero on failure; on failure tpebs_delete() has
  * been called before returning.
  *
  * This function will be called from evlist level later when evlist__open() is
  * called consistently.
  */
 int tpebs_start(struct evlist *evsel_list)
 {
 	int ret = 0;
 	struct evsel *evsel;
 	char cpumap_buf[50];

 	/*
 	 * We should only run tpebs_start when tpebs_recording is enabled.
 	 * And we should only run it once with all the required events.
 	 */
 	if (tpebs_pid != -1 || !tpebs_recording)
 		return 0;

 	cpu_map__snprint(evsel_list->core.user_requested_cpus, cpumap_buf, sizeof(cpumap_buf));
 	/*
 	 * Prepare perf record for sampling event retire_latency before fork and
 	 * prepare workload
 	 */
 	evlist__for_each_entry(evsel_list, evsel) {
 		int i;
 		char *name;
 		struct tpebs_retire_lat *new;

 		if (!evsel->retire_lat)
 			continue;

 		pr_debug("tpebs: Retire_latency of event %s is required\n", evsel->name);
 		/* Find the last 'R' at an index above 0. */
 		for (i = strlen(evsel->name) - 1; i > 0; i--) {
 			if (evsel->name[i] == 'R')
 				break;
 		}
 		if (i <= 0 || evsel->name[i] != 'R') {
 			ret = -1;
 			goto err;
 		}

 		name = strdup(evsel->name);
 		if (!name) {
 			ret = -ENOMEM;
 			goto err;
 		}
 		/* The sampled event is the same name with 'p' in place of 'R'. */
 		name[i] = 'p';

 		new = zalloc(sizeof(*new));
 		if (!new) {
 			ret = -1;
 			/*
 			 * Plain free(): zfree() expects the address of the
 			 * pointer, not the pointer itself.
 			 */
 			free(name);
 			goto err;
 		}
 		new->name = name;
 		new->tpebs_name = evsel->name;
 		list_add_tail(&new->nd, &tpebs_results);
 		tpebs_event_size += 1;
 	}

 	if (tpebs_event_size > 0) {
 		struct pollfd pollfd = { .events = POLLIN, };
 		/* -1 marks "not opened" so the cleanup only closes real fds. */
 		int control_fd[2] = { -1, -1 };
 		int ack_fd[2] = { -1, -1 };
 		int len;
 		char ack_buf[8];

 		/* Create control and ack fd for --control */
 		if (pipe(control_fd) < 0) {
 			pr_err("tpebs: Failed to create control fifo\n");
 			ret = -1;
 			goto out;
 		}
 		if (pipe(ack_fd) < 0) {
 			pr_err("tpebs: Failed to create ack fifo\n");
 			ret = -1;
 			goto out;
 		}

 		ret = start_perf_record(control_fd, ack_fd, cpumap_buf);
 		if (ret)
 			goto out;

 		if (pthread_create(&tpebs_reader_thread, NULL, __sample_reader, tpebs_cmd)) {
 			/*
 			 * There is no reader thread to join, so the child is
 			 * torn down here and tpebs_pid stays -1; tpebs_stop()
 			 * must not run for this child.
 			 */
 			kill(tpebs_cmd->pid, SIGTERM);
 			close(tpebs_cmd->out);
 			finish_command(tpebs_cmd);
 			free(tpebs_cmd);
 			tpebs_cmd = NULL;
 			pr_err("Could not create thread to process sample data.\n");
 			ret = -1;
 			goto out;
 		}
 		/* From here on tpebs_stop() owns the child and the thread. */
 		tpebs_pid = tpebs_cmd->pid;

 		/* Wait for perf record initialization.*/
 		len = strlen(EVLIST_CTL_CMD_ENABLE_TAG);
 		ret = write(control_fd[1], EVLIST_CTL_CMD_ENABLE_TAG, len);
 		if (ret != len) {
 			pr_err("perf record control write control message failed\n");
 			/* Never hand a byte count back as the result. */
 			ret = -1;
 			goto out;
 		}

 		/* wait for an ack */
 		pollfd.fd = ack_fd[0];

 		/*
 		 * We need this poll to ensure the ack_fd PIPE will not hang
 		 * when perf record failed for any reason. The timeout value
 		 * 3000ms is an empirical selection.
 		 */
 		if (!poll(&pollfd, 1, 3000)) {
 			pr_err("tpebs failed: perf record ack timeout\n");
 			ret = -1;
 			goto out;
 		}

 		if (!(pollfd.revents & POLLIN)) {
 			pr_err("tpebs failed: did not received an ack\n");
 			ret = -1;
 			goto out;
 		}

 		/* Leave room for a terminator so strcmp() stays in bounds. */
 		ret = read(ack_fd[0], ack_buf, sizeof(ack_buf) - 1);
 		if (ret > 0) {
 			ack_buf[ret] = '\0';
 			ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG);
 		} else {
 			pr_err("tpebs: perf record control ack failed\n");
 			ret = -1;
 			goto out;
 		}
 out:
 		if (control_fd[0] >= 0)
 			close(control_fd[0]);
 		if (control_fd[1] >= 0)
 			close(control_fd[1]);
 		if (ack_fd[0] >= 0)
 			close(ack_fd[0]);
 		if (ack_fd[1] >= 0)
 			close(ack_fd[1]);
 	}
 err:
 	if (ret)
 		tpebs_delete();
 	return ret;
 }


 /*
  * tpebs_set_evsel - store the measured retire_latency in an evsel's counts.
  * @evsel: a retire_latency evsel
  * @cpu_map_idx: CPU map index of the count slot to fill
  * @thread: thread index of the count slot to fill
  *
  * Stops the forked perf record first, then writes the count slot: ena and run
  * become 1, lost becomes 0, and val is the rounded average latency for slot
  * (0, 0) of a known event and 0 in every other case.
  *
  * Returns -1 when @evsel is not a retire_latency evsel, 0 otherwise.
  */
 int tpebs_set_evsel(struct evsel *evsel, int cpu_map_idx, int thread)
 {
 	struct perf_counts_values *count;
 	struct tpebs_retire_lat *entry;
 	struct tpebs_retire_lat *match = NULL;
 	__u64 val = 0;

 	/* Non retire_latency evsel should never enter this function. */
 	if (!evsel__is_retire_lat(evsel))
 		return -1;

 	/*
 	 * Need to stop the forked record to ensure get sampled data from the
 	 * PIPE to process and get non-zero retire_lat value for hybrid.
 	 */
 	tpebs_stop();
 	count = perf_counts(evsel->counts, cpu_map_idx, thread);

 	/* Match on the very same name pointer, or on the metric id string. */
 	list_for_each_entry(entry, &tpebs_results, nd) {
 		if (entry->tpebs_name == evsel->name ||
 		    (evsel->metric_id && !strcmp(entry->tpebs_name, evsel->metric_id))) {
 			match = entry;
 			break;
 		}
 	}

 	/* Set ena and run to non-zero */
 	count->ena = count->run = 1;
 	count->lost = 0;

 	/*
 	 * The value stays 0 when the event is not on the list, and also for
 	 * every slot other than the first CPU and thread.
 	 */
 	if (match && cpu_map_idx == 0 && thread == 0)
 		val = rint(match->val);

 	count->val = val;
 	return 0;
 }

 /*
  * tpebs_retire_lat__delete - free one tpebs_results entry.
  *
  * Only name is released with the entry: it is the strdup()ed copy made in
  * tpebs_start(). tpebs_name points at the evsel's own name and is left alone.
  * The caller unlinks the entry from the list beforehand.
  */
 static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
 {
 	zfree(&r->name);
 	free(r);
 }


 /*
  * tpebs_delete - delete tpebs related data and stop the created thread and
  * process by calling tpebs_stop().
  *
  * This function is called from evlist_delete() and also from builtin-stat
  * stat_handle_error(). If tpebs_start() is called from places other then perf
  * stat, need to ensure tpebs_delete() is also called to safely free mem and
  * close the data read thread and the forked perf record process.
  *
  * This function is also called in evsel__close() to be symmetric with
  * tpebs_start() being called in evsel__open(). We will update this call site
  * when move tpebs_start() to evlist level.
  */
 void tpebs_delete(void)
 {
 	struct tpebs_retire_lat *r, *rtmp;

 	if (tpebs_pid == -1)
 		return;

 	tpebs_stop();

 	list_for_each_entry_safe(r, rtmp, &tpebs_results, nd) {
 		list_del_init(&r->nd);
 		tpebs_retire_lat__delete(r);
 	}

 	if (tpebs_cmd) {
 		free(tpebs_cmd);
 		tpebs_cmd = NULL;
 	}
 }
	// SPDX-License-Identifier: GPL-2.0-only
	/*
	 * intel-tpebs.c: Intel TPEBS support
	*/


	#include <sys/param.h>
	#include <subcmd/run-command.h>
	#include <thread.h>
	#include "intel-tpebs.h"
	#include <linux/list.h>
	#include <linux/zalloc.h>
	#include <linux/err.h>
	#include "sample.h"
	#include "debug.h"
	#include "evlist.h"
	#include "evsel.h"
	#include "session.h"
	#include "tool.h"
	#include "cpumap.h"
	#include "metricgroup.h"
	#include <sys/stat.h>
	#include <sys/file.h>
	#include <poll.h>
	#include <math.h>

	/*
	 * Name given to the forked perf record through -o and used as the perf_data
	 * path by the sample reader thread.
	 */
	#define PERF_DATA "-"

	/* tpebs_start() returns immediately unless this is set. */
	bool tpebs_recording;
	/* pid of the forked perf record; -1 while no child is being tracked. */
	static pid_t tpebs_pid = -1;
	/* Number of entries that tpebs_start() has added to tpebs_results. */
	static size_t tpebs_event_size;
	/* List of struct tpebs_retire_lat, one node per retire_latency event. */
	static LIST_HEAD(tpebs_results);
	/* Thread running __sample_reader(); joined in tpebs_stop(). */
	static pthread_t tpebs_reader_thread;
	/* Descriptor of the forked perf record, allocated by prepare_run_command(). */
	static struct child_process *tpebs_cmd;

	/*
	 * Per-event accumulator for retire_latency samples. One node is linked into
	 * tpebs_results for every retire_latency event found by tpebs_start().
	 */
	struct tpebs_retire_lat {
		struct list_head nd;
		/* Event name handed to perf record (modifier R replaced by p) */
		const char *name;
		/* Event name with the TPEBS modifier R */
		const char *tpebs_name;
		/* Count of retire_latency values found in sample data */
		size_t count;
		/*
		 * Sum of all the retire_latency values in sample data. Kept as an
		 * unsigned 64-bit quantity so that a long run with a large number
		 * of samples cannot overflow the accumulator.
		 */
		unsigned long long sum;
		/* Average of retire_latency, val = sum / count */
		double val;
	};

	/*
	 * get_perf_record_args - build the argument vector of the perf record child.
	 * @record_argv: caller-allocated array that receives the arguments
	 * @buf: the already formatted --control option
	 * @cpumap_buf: CPU list as a string; must not be NULL
	 *
	 * Nothing is written past the last argument: the terminating NULL entry is
	 * whatever the caller put in the array beforehand.
	 *
	 * Returns 0 on success, -ECANCELED when @cpumap_buf is NULL.
	 */
	static int get_perf_record_args(const char **record_argv, char buf[],
					const char *cpumap_buf)
	{
		const char **arg = record_argv;
		struct tpebs_retire_lat *entry;

		pr_debug("tpebs: Prepare perf record for retire_latency\n");

		*arg++ = "perf";
		*arg++ = "record";
		*arg++ = "-W";
		*arg++ = "--synth=no";
		*arg++ = buf;

		if (cpumap_buf == NULL) {
			pr_err("tpebs: Require cpumap list to run sampling\n");
			return -ECANCELED;
		}

		/* -C is only passed when the CPU list is something other than "-1". */
		if (strcmp(cpumap_buf, "-1") != 0) {
			*arg++ = "-C";
			*arg++ = cpumap_buf;
		}

		/* One "-e <name>" pair for every event queued on tpebs_results. */
		list_for_each_entry(entry, &tpebs_results, nd) {
			*arg++ = "-e";
			*arg++ = entry->name;
		}

		*arg++ = "-o";
		*arg++ = PERF_DATA;

		return 0;
	}

	static int prepare_run_command(const char **argv)
	{
	tpebs_cmd = zalloc(sizeof(struct child_process));
	if (!tpebs_cmd)
	return -ENOMEM;
	tpebs_cmd->argv = argv;
	tpebs_cmd->out = -1;
	return 0;
	}

	/*
	 * start_perf_record - fork the perf record child that samples retire_latency.
	 * @control_fd: control pipe; the read end is handed to the child
	 * @ack_fd: ack pipe; the write end is handed to the child
	 * @cpumap_buf: CPU list as a string, forwarded to get_perf_record_args()
	 *
	 * On success tpebs_cmd describes the running child. On failure tpebs_cmd is
	 * left NULL so that no half-initialised descriptor stays behind.
	 *
	 * Returns 0 on success, a non-zero error code otherwise.
	 */
	static int start_perf_record(int control_fd[], int ack_fd[],
					const char *cpumap_buf)
	{
		const char **record_argv;
		int ret;
		char buf[32];

		scnprintf(buf, sizeof(buf), "--control=fd:%d,%d", control_fd[0], ack_fd[1]);

		/*
		 * Up to 9 fixed arguments plus "-e <name>" per event; calloc() keeps
		 * the remaining slots NULL, which terminates the vector.
		 */
		record_argv = calloc(12 + 2 * tpebs_event_size, sizeof(char *));
		if (!record_argv)
			return -ENOMEM;

		ret = get_perf_record_args(record_argv, buf, cpumap_buf);
		if (ret)
			goto out;

		ret = prepare_run_command(record_argv);
		if (ret)
			goto out;

		ret = start_command(tpebs_cmd);
		if (ret) {
			/* The child never started, so nobody else will release this. */
			free(tpebs_cmd);
			tpebs_cmd = NULL;
		} else {
			/* record_argv is freed below; do not keep a dangling pointer. */
			tpebs_cmd->argv = NULL;
		}
	out:
		free(record_argv);
		return ret;
	}

	/*
	 * process_sample_event - sample callback of the reader thread's perf_tool.
	 *
	 * Looks up the entry in tpebs_results whose name equals the name of @evsel
	 * and folds the sample's retire_lat into its count, sum and running average.
	 * Samples of events that are not on the list are ignored.
	 *
	 * Need to handle per core results? For now only the average retire latency
	 * over all samples is kept.
	 *
	 * Always returns 0.
	 */
	static int process_sample_event(const struct perf_tool *tool __maybe_unused,
					union perf_event *event __maybe_unused,
					struct perf_sample *sample,
					struct evsel *evsel,
					struct machine *machine __maybe_unused)
	{
		const char *sampled = evsel__name(evsel);
		struct tpebs_retire_lat *entry;

		list_for_each_entry(entry, &tpebs_results, nd) {
			if (strcmp(sampled, entry->name) != 0)
				continue;

			entry->count += 1;
			entry->sum += sample->retire_lat;
			entry->val = (double) entry->sum / entry->count;
			break;
		}

		return 0;
	}

	/*
	 * process_feature_event - feature callback of the reader thread's perf_tool.
	 *
	 * Feature ids below HEADER_LAST_FEATURE are forwarded to
	 * perf_event__process_feature(); anything else is accepted and ignored.
	 */
	static int process_feature_event(struct perf_session *session,
					 union perf_event *event)
	{
		if (event->feat.feat_id >= HEADER_LAST_FEATURE)
			return 0;

		return perf_event__process_feature(session, event);
	}

	/*
	 * __sample_reader - thread body that consumes the output of perf record.
	 * @arg: the struct child_process of the forked perf record
	 *
	 * Opens a perf session on the child's out descriptor and processes events
	 * with the sample, feature and attr callbacks set up below. A failure to
	 * create the session is not reported. Always returns NULL.
	 *
	 * The signature is the void *(*)(void *) form that pthread_create() takes.
	 */
	static void *__sample_reader(void *arg)
	{
		struct child_process *child = arg;
		struct perf_session *session;
		struct perf_data data = {
			.mode = PERF_DATA_MODE_READ,
			.path = PERF_DATA,
			.file.fd = child->out,
		};
		struct perf_tool tool;

		perf_tool__init(&tool, /*ordered_events=*/false);
		tool.sample = process_sample_event;
		tool.feature = process_feature_event;
		tool.attr = perf_event__process_attr;

		session = perf_session__new(&data, &tool);
		if (IS_ERR(session))
			return NULL;
		perf_session__process_events(session);
		perf_session__delete(session);

		return NULL;
	}

	/*
	 * tpebs_stop - stop the sample data read thread and the perf record process.
	 *
	 * Returns 0 when no child is being tracked or when the child terminated
	 * because of a signal, otherwise the result of finish_command().
	 */
	static int tpebs_stop(void)
	{
		int err;

		/* Like tpebs_start, tpebs_stop should only do its work once. */
		if (tpebs_pid == -1)
			return 0;

		kill(tpebs_cmd->pid, SIGTERM);
		tpebs_pid = -1;
		pthread_join(tpebs_reader_thread, NULL);
		close(tpebs_cmd->out);

		err = finish_command(tpebs_cmd);
		/* The SIGTERM was sent above, so death by signal is not an error. */
		return err == -ERR_RUN_COMMAND_WAITPID_SIGNAL ? 0 : err;
	}

	/*
	* tpebs_start - start tpebs execution.
	* @evsel_list: retire_latency evsels in this list will be selected and sampled
	* to get the average retire_latency value.
	*
	* This function will be called from evlist level later when evlist__open() is
	* called consistently.
	*/
	int tpebs_start(struct evlist *evsel_list)
	{
	int ret = 0;
	struct evsel *evsel;
	char cpumap_buf[50];

	/*
	* We should only run tpebs_start when tpebs_recording is enabled.
	* And we should only run it once with all the required events.
	*/
	if (tpebs_pid != -1 \|\| !tpebs_recording)
	return 0;

	cpu_map__snprint(evsel_list->core.user_requested_cpus, cpumap_buf, sizeof(cpumap_buf));
	/*
	* Prepare perf record for sampling event retire_latency before fork and
	* prepare workload
	*/
	evlist__for_each_entry(evsel_list, evsel) {
	int i;
	char *name;
	struct tpebs_retire_lat *new;

	if (!evsel->retire_lat)
	continue;

	pr_debug("tpebs: Retire_latency of event %s is required\n", evsel->name);
	for (i = strlen(evsel->name) - 1; i > 0; i--) {
	if (evsel->name[i] == 'R')
	break;
	}
	if (i <= 0 \|\| evsel->name[i] != 'R') {
	ret = -1;
	goto err;
	}

	name = strdup(evsel->name);
	if (!name) {
	ret = -ENOMEM;
	goto err;
	}
	name[i] = 'p';

	new = zalloc(sizeof(*new));
	if (!new) {
	ret = -1;
	zfree(name);
	goto err;
	}
	new->name = name;
	new->tpebs_name = evsel->name;
	list_add_tail(&new->nd, &tpebs_results);
	tpebs_event_size += 1;
	}

	if (tpebs_event_size > 0) {
	struct pollfd pollfd = { .events = POLLIN, };
	int control_fd[2], ack_fd[2], len;
	char ack_buf[8];

	/Create control and ack fd for --control/
	if (pipe(control_fd) < 0) {
	pr_err("tpebs: Failed to create control fifo");
	ret = -1;
	goto out;
	}
	if (pipe(ack_fd) < 0) {
	pr_err("tpebs: Failed to create control fifo");
	ret = -1;
	goto out;
	}

	ret = start_perf_record(control_fd, ack_fd, cpumap_buf);
	if (ret)
	goto out;
	tpebs_pid = tpebs_cmd->pid;
	if (pthread_create(&tpebs_reader_thread, NULL, __sample_reader, tpebs_cmd)) {
	kill(tpebs_cmd->pid, SIGTERM);
	close(tpebs_cmd->out);
	pr_err("Could not create thread to process sample data.\n");
	ret = -1;
	goto out;
	}
	/* Wait for perf record initialization.*/
	len = strlen(EVLIST_CTL_CMD_ENABLE_TAG);
	ret = write(control_fd[1], EVLIST_CTL_CMD_ENABLE_TAG, len);
	if (ret != len) {
	pr_err("perf record control write control message failed\n");
	goto out;
	}

	/* wait for an ack */
	pollfd.fd = ack_fd[0];

	/*
	* We need this poll to ensure the ack_fd PIPE will not hang
	* when perf record failed for any reason. The timeout value
	* 3000ms is an empirical selection.
	*/
	if (!poll(&pollfd, 1, 3000)) {
	pr_err("tpebs failed: perf record ack timeout\n");
	ret = -1;
	goto out;
	}

	if (!(pollfd.revents & POLLIN)) {
	pr_err("tpebs failed: did not received an ack\n");
	ret = -1;
	goto out;
	}

	ret = read(ack_fd[0], ack_buf, sizeof(ack_buf));
	if (ret > 0)
	ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG);
	else {
	pr_err("tpebs: perf record control ack failed\n");
	goto out;
	}
	out:
	close(control_fd[0]);
	close(control_fd[1]);
	close(ack_fd[0]);
	close(ack_fd[1]);
	}
	err:
	if (ret)
	tpebs_delete();
	return ret;
	}


	/*
	 * tpebs_set_evsel - store the measured retire_latency in an evsel's counts.
	 * @evsel: a retire_latency evsel
	 * @cpu_map_idx: CPU map index of the count slot to fill
	 * @thread: thread index of the count slot to fill
	 *
	 * Stops the forked perf record first, then writes the count slot: ena and run
	 * become 1, lost becomes 0, and val is the rounded average latency for slot
	 * (0, 0) of a known event and 0 in every other case.
	 *
	 * Returns -1 when @evsel is not a retire_latency evsel, 0 otherwise.
	 */
	int tpebs_set_evsel(struct evsel *evsel, int cpu_map_idx, int thread)
	{
		__u64 val;
		bool found = false;
		struct tpebs_retire_lat *t;
		struct perf_counts_values *count;

		/* Non retire_latency evsel should never enter this function. */
		if (!evsel__is_retire_lat(evsel))
			return -1;

		/*
		 * Need to stop the forked record to ensure get sampled data from the
		 * PIPE to process and get non-zero retire_lat value for hybrid.
		 */
		tpebs_stop();
		count = perf_counts(evsel->counts, cpu_map_idx, thread);

		/* Match on the very same name pointer, or on the metric id string. */
		list_for_each_entry(t, &tpebs_results, nd) {
			if (t->tpebs_name == evsel->name ||
			    (evsel->metric_id && !strcmp(t->tpebs_name, evsel->metric_id))) {
				found = true;
				break;
			}
		}

		/* Set ena and run to non-zero */
		count->ena = count->run = 1;
		count->lost = 0;

		if (!found) {
			/*
			 * Set default value or 0 when retire_latency for this event is
			 * not found from sampling data (record_tpebs not set or 0
			 * sample recorded).
			 */
			count->val = 0;
			return 0;
		}

		/*
		 * Only set retire_latency value to the first CPU and thread.
		 */
		if (cpu_map_idx == 0 && thread == 0)
			val = rint(t->val);
		else
			val = 0;

		count->val = val;
		return 0;
	}

	/*
	 * tpebs_retire_lat__delete - free one tpebs_results entry.
	 *
	 * Only name is released with the entry: it is the strdup()ed copy made in
	 * tpebs_start(). tpebs_name points at the evsel's own name and is left alone.
	 * The caller unlinks the entry from the list beforehand.
	 */
	static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
	{
	zfree(&r->name);
	free(r);
	}


	/*
	* tpebs_delete - delete tpebs related data and stop the created thread and
	* process by calling tpebs_stop().
	*
	* This function is called from evlist_delete() and also from builtin-stat
	* stat_handle_error(). If tpebs_start() is called from places other then perf
	* stat, need to ensure tpebs_delete() is also called to safely free mem and
	* close the data read thread and the forked perf record process.
	*
	* This function is also called in evsel__close() to be symmetric with
	* tpebs_start() being called in evsel__open(). We will update this call site
	* when move tpebs_start() to evlist level.
	*/
	void tpebs_delete(void)
	{
	struct tpebs_retire_lat r, rtmp;

	if (tpebs_pid == -1)
	return;

	tpebs_stop();

	list_for_each_entry_safe(r, rtmp, &tpebs_results, nd) {
	list_del_init(&r->nd);
	tpebs_retire_lat__delete(r);
	}

	if (tpebs_cmd) {
	free(tpebs_cmd);
	tpebs_cmd = NULL;
	}
	}