// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2024
 */

#define KMSG_COMPONENT "hd"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

/*
 * Hiperdispatch:
 * Dynamically calculates the optimum number of high capacity COREs
 * by considering the state the system is in. When hiperdispatch decides
 * that a capacity update is necessary, it schedules a topology update.
 * During topology updates the CPU capacities are always re-adjusted.
 *
 * There are two places where CPU capacities are accessed within
 * hiperdispatch:
 * -> hiperdispatch's recurring work function reads CPU capacities to
 *    determine the high capacity CPU count.
 * -> during a topology update hiperdispatch's adjustment function
 *    updates CPU capacities.
 * These two can run on different CPUs in parallel, which can cause
 * hiperdispatch to make wrong decisions. This can potentially cause
 * some overhead by leading to extra rebuild_sched_domains() calls
 * for correction. Access to capacities within hiperdispatch has to be
 * serialized to prevent this overhead.
 *
 * Hiperdispatch decision making revolves around steal time.
 * The HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time
 * crosses the threshold value, hiperdispatch falls back to giving high
 * capacity only to the entitled CPUs. When steal time drops below the
 * threshold boundary, hiperdispatch utilizes all CPUs by giving all
 * of them high capacity.
 *
 * The theory behind HD_STEAL_THRESHOLD is related to SMP thread
 * performance. Comparing the throughput of:
 * - a single CORE, with N threads, running N tasks
 * - N separate COREs running N tasks,
 * using individual COREs for individual tasks yields better
 * performance. This performance difference is roughly ~30% (it can change
 * between machine generations).
 *
 * Hiperdispatch tries to hint the scheduler to use individual COREs for
 * each task, as long as the steal time on those COREs is less than 30%,
 * thereby delaying the throughput loss caused by using SMP threads.
 */

#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/kernel_stat.h>
#include <linux/kstrtox.h>
#include <linux/ktime.h>
#include <linux/sysctl.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <asm/hiperdispatch.h>
#include <asm/setup.h>
#include <asm/smp.h>
#include <asm/topology.h>

#define CREATE_TRACE_POINTS
#include <asm/trace/hiperdispatch.h>

#define HD_DELAY_FACTOR		(4)
#define HD_DELAY_INTERVAL	(HZ / 4)
#define HD_STEAL_THRESHOLD	30
#define HD_STEAL_AVG_WEIGHT	16

static cpumask_t hd_vl_coremask;	/* Mask containing all vertical low COREs */
static cpumask_t hd_vmvl_cpumask;	/* Mask containing vertical medium and low CPUs */
static int hd_high_capacity_cores;	/* Current CORE count with high capacity */
static int hd_entitled_cores;		/* Total vertical high and medium CORE count */
static int hd_online_cores;		/* Current online CORE count */

static unsigned long hd_previous_steal;	/* Previous iteration's CPU steal timer total */
static unsigned long hd_high_time;	/* Total time spent while all cpus have high capacity */
static unsigned long hd_low_time;	/* Total time spent while vl cpus have low capacity */
static atomic64_t hd_adjustments;	/* Total occurrence count of hiperdispatch adjustments */

static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD;
static unsigned int hd_delay_factor = HD_DELAY_FACTOR;
static int hd_enabled;

static void hd_capacity_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);

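/*
 * Set the hiperdispatch mode. Hiperdispatch is forced off on machines
 * without topology support. Returns 1 if the mode changed, 0 otherwise.
 */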
static int hd_set_hiperdispatch_mode(int enable)
{
	if (!MACHINE_HAS_TOPOLOGY)
		enable = 0;
	if (hd_enabled == enable)
		return 0;
	hd_enabled = enable;
	return 1;
}

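/* Clear the core masks and counters so they can be rebuilt via hd_add_core(). */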
void hd_reset_state(void)
{
	cpumask_clear(&hd_vl_coremask);
	cpumask_clear(&hd_vmvl_cpumask);
	hd_entitled_cores = 0;
	hd_online_cores = 0;
}

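/*
 * Account for an online CORE: count entitled (vertical high and medium)
 * COREs and collect vertical medium and low CPUs into the masks used for
 * steal time sampling and capacity adjustment.
 */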
void hd_add_core(int cpu)
{
	const struct cpumask *siblings;
	int polarization;

	hd_online_cores++;
	polarization = smp_cpu_get_polarization(cpu);
	siblings = topology_sibling_cpumask(cpu);
	switch (polarization) {
	case POLARIZATION_VH:
		hd_entitled_cores++;
		break;
	case POLARIZATION_VM:
		hd_entitled_cores++;
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
		break;
	case POLARIZATION_VL:
		cpumask_set_cpu(cpu, &hd_vl_coremask);
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
		break;
	}
}

/* Serialize update and read operations of debug counters. */
static DEFINE_MUTEX(hd_counter_mutex);

static void hd_update_times(void)
{
	static ktime_t prev;
	ktime_t now;

	/*
	 * Check if hiperdispatch is active; if not, set prev to 0.
	 * This makes it possible to differentiate the first update iteration
	 * after hiperdispatch has been enabled.
	 */
	if (hd_entitled_cores == 0 || hd_enabled == 0) {
		prev = ktime_set(0, 0);
		return;
	}
	now = ktime_get();
	if (ktime_after(prev, 0)) {
		if (hd_high_capacity_cores == hd_online_cores)
			hd_high_time += ktime_ms_delta(now, prev);
		else
			hd_low_time += ktime_ms_delta(now, prev);
	}
	prev = now;
}

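/*
 * Re-distribute CORE capacities: give high capacity to as many vertical
 * low COREs as the last decision requested on top of the entitled COREs,
 * and low capacity to the remaining vertical low COREs.
 */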
static void hd_update_capacities(void)
{
	int cpu, upscaling_cores;
	unsigned long capacity;

	upscaling_cores = hd_high_capacity_cores - hd_entitled_cores;
	capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW;
	hd_high_capacity_cores = hd_entitled_cores;
	for_each_cpu(cpu, &hd_vl_coremask) {
		smp_set_core_capacity(cpu, capacity);
		if (capacity != CPU_CAPACITY_HIGH)
			continue;
		hd_high_capacity_cores++;
		upscaling_cores--;
		if (upscaling_cores == 0)
			capacity = CPU_CAPACITY_LOW;
	}
}

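/*
 * Stop the recurring capacity work, account all online COREs as high
 * capacity and reset the steal time reference.
 */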
void hd_disable_hiperdispatch(void)
{
	cancel_delayed_work_sync(&hd_capacity_work);
	hd_high_capacity_cores = hd_online_cores;
	hd_previous_steal = 0;
}

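/*
 * Update the debug time counters and, if hiperdispatch is enabled and there
 * are vertical low COREs to manage, (re)arm the capacity work with an initial
 * delay and adjust the CORE capacities. Returns 1 if capacities were updated.
 */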
int hd_enable_hiperdispatch(void)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	mutex_unlock(&hd_counter_mutex);
	if (hd_enabled == 0)
		return 0;
	if (hd_entitled_cores == 0)
		return 0;
	if (hd_online_cores <= hd_entitled_cores)
		return 0;
	mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor);
	hd_update_capacities();
	return 1;
}

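/*
 * Exponentially weighted moving average of the steal time percentage;
 * a new sample contributes with a weight of 1/HD_STEAL_AVG_WEIGHT.
 */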
static unsigned long hd_steal_avg(unsigned long new)
{
	static unsigned long steal;

	steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT;
	return steal;
}

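/*
 * Return the average steal time percentage of the vertical medium and low
 * CPUs since the previous invocation.
 */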
static unsigned long hd_calculate_steal_percentage(void)
{
	unsigned long time_delta, steal_delta, steal, percentage;
	static ktime_t prev;
	int cpus, cpu;
	ktime_t now;

	cpus = 0;
	steal = 0;
	percentage = 0;
	for_each_cpu(cpu, &hd_vmvl_cpumask) {
		steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
		cpus++;
	}
	/*
	 * If there are no vertical medium or low CPUs, steal time
	 * is 0, as vertical high CPUs shouldn't experience steal time.
	 */
	if (cpus == 0)
		return percentage;
	now = ktime_get();
	time_delta = ktime_to_ns(ktime_sub(now, prev));
	if (steal > hd_previous_steal && hd_previous_steal != 0) {
		steal_delta = (steal - hd_previous_steal) * 100 / time_delta;
		percentage = steal_delta / cpus;
	}
	hd_previous_steal = steal;
	prev = now;
	return percentage;
}

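/*
 * Recurring work function: sample the averaged steal time and decide whether
 * all online COREs or only the entitled COREs should have high capacity.
 * A topology update is scheduled whenever the decision changes.
 */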
static void hd_capacity_work_fn(struct work_struct *work)
{
	unsigned long steal_percentage, new_cores;

	mutex_lock(&smp_cpu_state_mutex);
	/*
	 * If the online core count is less than or equal to the entitled
	 * core count, hiperdispatch does not need to make any adjustments;
	 * schedule a topology update to disable hiperdispatch.
	 * Normally this check is handled on topology update, but during CPU
	 * unplug, topology and CPU mask updates are done in reverse order,
	 * causing hd_enable_hiperdispatch() to see stale data.
	 */
	if (hd_online_cores <= hd_entitled_cores) {
		topology_schedule_update();
		mutex_unlock(&smp_cpu_state_mutex);
		return;
	}
	steal_percentage = hd_steal_avg(hd_calculate_steal_percentage());
	if (steal_percentage < hd_steal_threshold)
		new_cores = hd_online_cores;
	else
		new_cores = hd_entitled_cores;
	if (hd_high_capacity_cores != new_cores) {
		trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores);
		hd_high_capacity_cores = new_cores;
		atomic64_inc(&hd_adjustments);
		topology_schedule_update();
	}
	trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores);
	mutex_unlock(&smp_cpu_state_mutex);
	schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
}

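/*
 * Handler for the s390.hiperdispatch sysctl; a write toggles hiperdispatch
 * mode and schedules a topology update if the mode changed.
 */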
static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	int hiperdispatch;
	int rc;
	struct ctl_table ctl_entry = {
		.procname = ctl->procname,
		.data = &hiperdispatch,
		.maxlen = sizeof(int),
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE,
	};

	hiperdispatch = hd_enabled;
	rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
	if (rc < 0 || !write)
		return rc;
	mutex_lock(&smp_cpu_state_mutex);
	if (hd_set_hiperdispatch_mode(hiperdispatch))
		topology_schedule_update();
	mutex_unlock(&smp_cpu_state_mutex);
	return 0;
}

static struct ctl_table hiperdispatch_ctl_table[] = {
	{
		.procname = "hiperdispatch",
		.mode = 0644,
		.proc_handler = hiperdispatch_ctl_handler,
	},
};

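/*
 * hd_steal_threshold sysfs attribute: steal time percentage (0-100) above
 * which only the entitled COREs keep high capacity.
 */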
static ssize_t hd_steal_threshold_show(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_steal_threshold);
}

static ssize_t hd_steal_threshold_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf,
					size_t count)
{
	unsigned int val;
	int rc;

	rc = kstrtouint(buf, 0, &val);
	if (rc)
		return rc;
	if (val > 100)
		return -ERANGE;
	hd_steal_threshold = val;
	return count;
}

static DEVICE_ATTR_RW(hd_steal_threshold);

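/*
 * hd_delay_factor sysfs attribute: multiplier applied to HD_DELAY_INTERVAL
 * for the initial delay after hiperdispatch has been (re)enabled.
 */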
static ssize_t hd_delay_factor_show(struct device *dev,
				    struct device_attribute *attr,
				    char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_delay_factor);
}

static ssize_t hd_delay_factor_store(struct device *dev,
				     struct device_attribute *attr,
				     const char *buf,
				     size_t count)
{
	unsigned int val;
	int rc;

	rc = kstrtouint(buf, 0, &val);
	if (rc)
		return rc;
	if (!val)
		return -ERANGE;
	hd_delay_factor = val;
	return count;
}

static DEVICE_ATTR_RW(hd_delay_factor);

static struct attribute *hd_attrs[] = {
	&dev_attr_hd_steal_threshold.attr,
	&dev_attr_hd_delay_factor.attr,
	NULL,
};

static const struct attribute_group hd_attr_group = {
	.name = "hiperdispatch",
	.attrs = hd_attrs,
};

static int hd_greedy_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_high_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n");

static int hd_conservative_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_low_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n");

static int hd_adjustment_count_get(void *unused, u64 *val)
{
	*val = atomic64_read(&hd_adjustments);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n");

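/* Expose the time and adjustment counters via debugfs. */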
static void __init hd_create_debugfs_counters(void)
{
	struct dentry *dir;

	dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir);
	debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops);
	debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops);
	debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops);
}

static void __init hd_create_attributes(void)
{
	struct device *dev;

	dev = bus_get_dev_root(&cpu_subsys);
	if (!dev)
		return;
	if (sysfs_create_group(&dev->kobj, &hd_attr_group))
		pr_warn("Unable to create hiperdispatch attribute group\n");
	put_device(dev);
}

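/*
 * Enable hiperdispatch at boot if configured and register the sysctl,
 * debugfs and sysfs interfaces.
 */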
static int __init hd_init(void)
{
	if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) {
		hd_set_hiperdispatch_mode(1);
		topology_schedule_update();
	}
	if (!register_sysctl("s390", hiperdispatch_ctl_table))
		pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n");
	hd_create_debugfs_counters();
	hd_create_attributes();
	return 0;
}
late_initcall(hd_init);