| // SPDX-License-Identifier: GPL-2.0-only |
| /* |
| * (C) 2010,2011 Thomas Renninger <trenn@suse.de>, Novell Inc. |
| */ |
| |
| #if defined(__i386__) || defined(__x86_64__) |
| |
| #include <stdio.h> |
| #include <stdint.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <limits.h> |
| |
| #include <cpufreq.h> |
| |
| #include "helpers/helpers.h" |
| #include "idle_monitor/cpupower-monitor.h" |
| |
| #define MSR_APERF 0xE8 |
| #define MSR_MPERF 0xE7 |
| |
| #define MSR_TSC 0x10 |
| |
| #define MSR_AMD_HWCR 0xc0010015 |
| |
| enum mperf_id { C0 = 0, Cx, AVG_FREQ, MPERF_CSTATE_COUNT }; |
| |
| static int mperf_get_count_percent(unsigned int self_id, double *percent, |
| unsigned int cpu); |
| static int mperf_get_count_freq(unsigned int id, unsigned long long *count, |
| unsigned int cpu); |
| static struct timespec time_start, time_end; |
| |
| static cstate_t mperf_cstates[MPERF_CSTATE_COUNT] = { |
| { |
| .name = "C0", |
| .desc = N_("Processor Core not idle"), |
| .id = C0, |
| .range = RANGE_THREAD, |
| .get_count_percent = mperf_get_count_percent, |
| }, |
| { |
| .name = "Cx", |
| .desc = N_("Processor Core in an idle state"), |
| .id = Cx, |
| .range = RANGE_THREAD, |
| .get_count_percent = mperf_get_count_percent, |
| }, |
| |
| { |
| .name = "Freq", |
| .desc = N_("Average Frequency (including boost) in MHz"), |
| .id = AVG_FREQ, |
| .range = RANGE_THREAD, |
| .get_count = mperf_get_count_freq, |
| }, |
| }; |
| |
| enum MAX_FREQ_MODE { MAX_FREQ_SYSFS, MAX_FREQ_TSC_REF }; |
| static int max_freq_mode; |
| /* |
| * The max frequency mperf is ticking at (in C0), either retrieved via: |
| * 1) calculated after measurements if we know TSC ticks at mperf/P0 frequency |
| * 2) cpufreq /sys/devices/.../cpu0/cpufreq/cpuinfo_max_freq at init time |
| * 1. Is preferred as it also works without cpufreq subsystem (e.g. on Xen) |
| */ |
| static unsigned long max_frequency; |
| |
| static unsigned long long tsc_at_measure_start; |
| static unsigned long long tsc_at_measure_end; |
| static unsigned long long *mperf_previous_count; |
| static unsigned long long *aperf_previous_count; |
| static unsigned long long *mperf_current_count; |
| static unsigned long long *aperf_current_count; |
| |
| /* valid flag for all CPUs. If a MSR read failed it will be zero */ |
| static int *is_valid; |
| |
| static int mperf_get_tsc(unsigned long long *tsc) |
| { |
| int ret; |
| |
| ret = read_msr(base_cpu, MSR_TSC, tsc); |
| if (ret) |
| dprint("Reading TSC MSR failed, returning %llu\n", *tsc); |
| return ret; |
| } |
| |
| static int mperf_init_stats(unsigned int cpu) |
| { |
| unsigned long long val; |
| int ret; |
| |
| ret = read_msr(cpu, MSR_APERF, &val); |
| aperf_previous_count[cpu] = val; |
| ret |= read_msr(cpu, MSR_MPERF, &val); |
| mperf_previous_count[cpu] = val; |
| is_valid[cpu] = !ret; |
| |
| return 0; |
| } |
| |
| static int mperf_measure_stats(unsigned int cpu) |
| { |
| unsigned long long val; |
| int ret; |
| |
| ret = read_msr(cpu, MSR_APERF, &val); |
| aperf_current_count[cpu] = val; |
| ret |= read_msr(cpu, MSR_MPERF, &val); |
| mperf_current_count[cpu] = val; |
| is_valid[cpu] = !ret; |
| |
| return 0; |
| } |
| |
| static int mperf_get_count_percent(unsigned int id, double *percent, |
| unsigned int cpu) |
| { |
| unsigned long long aperf_diff, mperf_diff, tsc_diff; |
| unsigned long long timediff; |
| |
| if (!is_valid[cpu]) |
| return -1; |
| |
| if (id != C0 && id != Cx) |
| return -1; |
| |
| mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu]; |
| aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu]; |
| |
| if (max_freq_mode == MAX_FREQ_TSC_REF) { |
| tsc_diff = tsc_at_measure_end - tsc_at_measure_start; |
| *percent = 100.0 * mperf_diff / tsc_diff; |
| dprint("%s: TSC Ref - mperf_diff: %llu, tsc_diff: %llu\n", |
| mperf_cstates[id].name, mperf_diff, tsc_diff); |
| } else if (max_freq_mode == MAX_FREQ_SYSFS) { |
| timediff = max_frequency * timespec_diff_us(time_start, time_end); |
| *percent = 100.0 * mperf_diff / timediff; |
| dprint("%s: MAXFREQ - mperf_diff: %llu, time_diff: %llu\n", |
| mperf_cstates[id].name, mperf_diff, timediff); |
| } else |
| return -1; |
| |
| if (id == Cx) |
| *percent = 100.0 - *percent; |
| |
| dprint("%s: previous: %llu - current: %llu - (%u)\n", |
| mperf_cstates[id].name, mperf_diff, aperf_diff, cpu); |
| dprint("%s: %f\n", mperf_cstates[id].name, *percent); |
| return 0; |
| } |
| |
| static int mperf_get_count_freq(unsigned int id, unsigned long long *count, |
| unsigned int cpu) |
| { |
| unsigned long long aperf_diff, mperf_diff, time_diff, tsc_diff; |
| |
| if (id != AVG_FREQ) |
| return 1; |
| |
| if (!is_valid[cpu]) |
| return -1; |
| |
| mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu]; |
| aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu]; |
| |
| if (max_freq_mode == MAX_FREQ_TSC_REF) { |
| /* Calculate max_freq from TSC count */ |
| tsc_diff = tsc_at_measure_end - tsc_at_measure_start; |
| time_diff = timespec_diff_us(time_start, time_end); |
| max_frequency = tsc_diff / time_diff; |
| } |
| |
| *count = max_frequency * ((double)aperf_diff / mperf_diff); |
| dprint("%s: Average freq based on %s maximum frequency:\n", |
| mperf_cstates[id].name, |
| (max_freq_mode == MAX_FREQ_TSC_REF) ? "TSC calculated" : "sysfs read"); |
| dprint("max_frequency: %lu\n", max_frequency); |
| dprint("aperf_diff: %llu\n", aperf_diff); |
| dprint("mperf_diff: %llu\n", mperf_diff); |
| dprint("avg freq: %llu\n", *count); |
| return 0; |
| } |
| |
| static int mperf_start(void) |
| { |
| int cpu; |
| unsigned long long dbg; |
| |
| clock_gettime(CLOCK_REALTIME, &time_start); |
| mperf_get_tsc(&tsc_at_measure_start); |
| |
| for (cpu = 0; cpu < cpu_count; cpu++) |
| mperf_init_stats(cpu); |
| |
| mperf_get_tsc(&dbg); |
| dprint("TSC diff: %llu\n", dbg - tsc_at_measure_start); |
| return 0; |
| } |
| |
| static int mperf_stop(void) |
| { |
| unsigned long long dbg; |
| int cpu; |
| |
| for (cpu = 0; cpu < cpu_count; cpu++) |
| mperf_measure_stats(cpu); |
| |
| mperf_get_tsc(&tsc_at_measure_end); |
| clock_gettime(CLOCK_REALTIME, &time_end); |
| |
| mperf_get_tsc(&dbg); |
| dprint("TSC diff: %llu\n", dbg - tsc_at_measure_end); |
| |
| return 0; |
| } |
| |
| /* |
| * Mperf register is defined to tick at P0 (maximum) frequency |
| * |
| * Instead of reading out P0 which can be tricky to read out from HW, |
| * we use TSC counter if it reliably ticks at P0/mperf frequency. |
| * |
| * Still try to fall back to: |
| * /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq |
| * on older Intel HW without invariant TSC feature. |
| * Or on AMD machines where TSC does not tick at P0 (do not exist yet, but |
| * it's still double checked (MSR_AMD_HWCR)). |
| * |
| * On these machines the user would still get useful mperf |
| * stats when acpi-cpufreq driver is loaded. |
| */ |
| static int init_maxfreq_mode(void) |
| { |
| int ret; |
| unsigned long long hwcr; |
| unsigned long min; |
| |
| if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_INV_TSC)) |
| goto use_sysfs; |
| |
| if (cpupower_cpu_info.vendor == X86_VENDOR_AMD || |
| cpupower_cpu_info.vendor == X86_VENDOR_HYGON) { |
| /* MSR_AMD_HWCR tells us whether TSC runs at P0/mperf |
| * freq. |
| * A test whether hwcr is accessable/available would be: |
| * (cpupower_cpu_info.family > 0x10 || |
| * cpupower_cpu_info.family == 0x10 && |
| * cpupower_cpu_info.model >= 0x2)) |
| * This should be the case for all aperf/mperf |
| * capable AMD machines and is therefore safe to test here. |
| * Compare with Linus kernel git commit: acf01734b1747b1ec4 |
| */ |
| ret = read_msr(0, MSR_AMD_HWCR, &hwcr); |
| /* |
| * If the MSR read failed, assume a Xen system that did |
| * not explicitly provide access to it and assume TSC works |
| */ |
| if (ret != 0) { |
| dprint("TSC read 0x%x failed - assume TSC working\n", |
| MSR_AMD_HWCR); |
| return 0; |
| } else if (1 & (hwcr >> 24)) { |
| max_freq_mode = MAX_FREQ_TSC_REF; |
| return 0; |
| } else { /* Use sysfs max frequency if available */ } |
| } else if (cpupower_cpu_info.vendor == X86_VENDOR_INTEL) { |
| /* |
| * On Intel we assume mperf (in C0) is ticking at same |
| * rate than TSC |
| */ |
| max_freq_mode = MAX_FREQ_TSC_REF; |
| return 0; |
| } |
| use_sysfs: |
| if (cpufreq_get_hardware_limits(0, &min, &max_frequency)) { |
| dprint("Cannot retrieve max freq from cpufreq kernel " |
| "subsystem\n"); |
| return -1; |
| } |
| max_freq_mode = MAX_FREQ_SYSFS; |
| max_frequency /= 1000; /* Default automatically to MHz value */ |
| return 0; |
| } |
| |
| /* |
| * This monitor provides: |
| * |
| * 1) Average frequency a CPU resided in |
| * This always works if the CPU has aperf/mperf capabilities |
| * |
| * 2) C0 and Cx (any sleep state) time a CPU resided in |
| * Works if mperf timer stops ticking in sleep states which |
| * seem to be the case on all current HW. |
| * Both is directly retrieved from HW registers and is independent |
| * from kernel statistics. |
| */ |
| struct cpuidle_monitor mperf_monitor; |
| struct cpuidle_monitor *mperf_register(void) |
| { |
| if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_APERF)) |
| return NULL; |
| |
| if (init_maxfreq_mode()) |
| return NULL; |
| |
| /* Free this at program termination */ |
| is_valid = calloc(cpu_count, sizeof(int)); |
| mperf_previous_count = calloc(cpu_count, sizeof(unsigned long long)); |
| aperf_previous_count = calloc(cpu_count, sizeof(unsigned long long)); |
| mperf_current_count = calloc(cpu_count, sizeof(unsigned long long)); |
| aperf_current_count = calloc(cpu_count, sizeof(unsigned long long)); |
| |
| mperf_monitor.name_len = strlen(mperf_monitor.name); |
| return &mperf_monitor; |
| } |
| |
| void mperf_unregister(void) |
| { |
| free(mperf_previous_count); |
| free(aperf_previous_count); |
| free(mperf_current_count); |
| free(aperf_current_count); |
| free(is_valid); |
| } |
| |
| struct cpuidle_monitor mperf_monitor = { |
| .name = "Mperf", |
| .hw_states_num = MPERF_CSTATE_COUNT, |
| .hw_states = mperf_cstates, |
| .start = mperf_start, |
| .stop = mperf_stop, |
| .do_register = mperf_register, |
| .unregister = mperf_unregister, |
| .needs_root = 1, |
| .overflow_s = 922000000 /* 922337203 seconds TSC overflow |
| at 20GHz */ |
| }; |
| #endif /* #if defined(__i386__) || defined(__x86_64__) */ |