Merge tag 'pm-5.11-rc8' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm

Pull power management fixes from Rafael Wysocki:
"Address a performance regression related to scale-invariance on x86
that may prevent turbo CPU frequencies from being used in certain
workloads on systems using acpi-cpufreq as the CPU performance scaling
driver and schedutil as the scaling governor"

* tag 'pm-5.11-rc8' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm:
cpufreq: ACPI: Update arch scale-invariance max perf ratio if CPPC is not there
cpufreq: ACPI: Extend frequency tables to cover boost frequencies

+105 -13
+1
arch/x86/kernel/smpboot.c
··· 1833 1833 arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : 1834 1834 arch_turbo_freq_ratio; 1835 1835 } 1836 + EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); 1836 1837 1837 1838 static bool turbo_disabled(void) 1838 1839 {
+104 -13
drivers/cpufreq/acpi-cpufreq.c
··· 26 26 #include <linux/uaccess.h> 27 27 28 28 #include <acpi/processor.h> 29 + #include <acpi/cppc_acpi.h> 29 30 30 31 #include <asm/msr.h> 31 32 #include <asm/processor.h> ··· 54 53 unsigned int resume; 55 54 unsigned int cpu_feature; 56 55 unsigned int acpi_perf_cpu; 56 + unsigned int first_perf_state; 57 57 cpumask_var_t freqdomain_cpus; 58 58 void (*cpu_freq_write)(struct acpi_pct_register *reg, u32 val); 59 59 u32 (*cpu_freq_read)(struct acpi_pct_register *reg); ··· 223 221 224 222 perf = to_perf_data(data); 225 223 226 - cpufreq_for_each_entry(pos, policy->freq_table) 224 + cpufreq_for_each_entry(pos, policy->freq_table + data->first_perf_state) 227 225 if (msr == perf->states[pos->driver_data].status) 228 226 return pos->frequency; 229 - return policy->freq_table[0].frequency; 227 + return policy->freq_table[data->first_perf_state].frequency; 230 228 } 231 229 232 230 static unsigned extract_freq(struct cpufreq_policy *policy, u32 val) ··· 365 363 struct cpufreq_policy *policy; 366 364 unsigned int freq; 367 365 unsigned int cached_freq; 366 + unsigned int state; 368 367 369 368 pr_debug("%s (%d)\n", __func__, cpu); 370 369 ··· 377 374 if (unlikely(!data || !policy->freq_table)) 378 375 return 0; 379 376 380 - cached_freq = policy->freq_table[to_perf_data(data)->state].frequency; 377 + state = to_perf_data(data)->state; 378 + if (state < data->first_perf_state) 379 + state = data->first_perf_state; 380 + 381 + cached_freq = policy->freq_table[state].frequency; 381 382 freq = extract_freq(policy, get_cur_val(cpumask_of(cpu), data)); 382 383 if (freq != cached_freq) { 383 384 /* ··· 635 628 } 636 629 #endif 637 630 631 + #ifdef CONFIG_ACPI_CPPC_LIB 632 + static u64 get_max_boost_ratio(unsigned int cpu) 633 + { 634 + struct cppc_perf_caps perf_caps; 635 + u64 highest_perf, nominal_perf; 636 + int ret; 637 + 638 + if (acpi_pstate_strict) 639 + return 0; 640 + 641 + ret = cppc_get_perf_caps(cpu, &perf_caps); 642 + if (ret) {
643 + pr_debug("CPU%d: Unable to get performance capabilities (%d)\n", 644 + cpu, ret); 645 + return 0; 646 + } 647 + 648 + highest_perf = perf_caps.highest_perf; 649 + nominal_perf = perf_caps.nominal_perf; 650 + 651 + if (!highest_perf || !nominal_perf) { 652 + pr_debug("CPU%d: highest or nominal performance missing\n", cpu); 653 + return 0; 654 + } 655 + 656 + if (highest_perf < nominal_perf) { 657 + pr_debug("CPU%d: nominal performance above highest\n", cpu); 658 + return 0; 659 + } 660 + 661 + return div_u64(highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf); 662 + } 663 + #else 664 + static inline u64 get_max_boost_ratio(unsigned int cpu) { return 0; } 665 + #endif 666 + 638 667 static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) 639 668 { 640 - unsigned int i; 641 - unsigned int valid_states = 0; 642 - unsigned int cpu = policy->cpu; 643 - struct acpi_cpufreq_data *data; 644 - unsigned int result = 0; 645 - struct cpuinfo_x86 *c = &cpu_data(policy->cpu); 646 - struct acpi_processor_performance *perf; 647 669 struct cpufreq_frequency_table *freq_table; 670 + struct acpi_processor_performance *perf; 671 + struct acpi_cpufreq_data *data; 672 + unsigned int cpu = policy->cpu; 673 + struct cpuinfo_x86 *c = &cpu_data(cpu); 674 + unsigned int valid_states = 0; 675 + unsigned int result = 0; 676 + unsigned int state_count; 677 + u64 max_boost_ratio; 678 + unsigned int i; 648 679 #ifdef CONFIG_SMP 649 680 static int blacklisted; 650 681 #endif ··· 795 750 goto err_unreg; 796 751 } 797 752 798 - freq_table = kcalloc(perf->state_count + 1, sizeof(*freq_table), 799 - GFP_KERNEL); 753 + state_count = perf->state_count + 1; 754 + 755 + max_boost_ratio = get_max_boost_ratio(cpu); 756 + if (max_boost_ratio) { 757 + /* 758 + * Make a room for one more entry to represent the highest 759 + * available "boost" frequency. 
760 + */ 761 + state_count++; 762 + valid_states++; 763 + data->first_perf_state = valid_states; 764 + } else { 765 + /* 766 + * If the maximum "boost" frequency is unknown, ask the arch 767 + * scale-invariance code to use the "nominal" performance for 768 + * CPU utilization scaling so as to prevent the schedutil 769 + * governor from selecting inadequate CPU frequencies. 770 + */ 771 + arch_set_max_freq_ratio(true); 772 + } 773 + 774 + freq_table = kcalloc(state_count, sizeof(*freq_table), GFP_KERNEL); 800 775 if (!freq_table) { 801 776 result = -ENOMEM; 802 777 goto err_unreg; ··· 850 785 valid_states++; 851 786 } 852 787 freq_table[valid_states].frequency = CPUFREQ_TABLE_END; 788 + 789 + if (max_boost_ratio) { 790 + unsigned int state = data->first_perf_state; 791 + unsigned int freq = freq_table[state].frequency; 792 + 793 + /* 794 + * Because the loop above sorts the freq_table entries in the 795 + * descending order, freq is the maximum frequency in the table. 796 + * Assume that it corresponds to the CPPC nominal frequency and 797 + * use it to populate the frequency field of the extra "boost" 798 + * frequency entry. 799 + */ 800 + freq_table[0].frequency = freq * max_boost_ratio >> SCHED_CAPACITY_SHIFT; 801 + /* 802 + * The purpose of the extra "boost" frequency entry is to make 803 + * the rest of cpufreq aware of the real maximum frequency, but 804 + * the way to request it is the same as for the first_perf_state 805 + * entry that is expected to cover the entire range of "boost" 806 + * frequencies of the CPU, so copy the driver_data value from 807 + * that entry. 
808 + */ 809 + freq_table[0].driver_data = freq_table[state].driver_data; 810 + } 811 + 853 812 policy->freq_table = freq_table; 854 813 perf->state = 0; 855 814 ··· 947 858 { 948 859 struct acpi_processor_performance *perf = per_cpu_ptr(acpi_perf_data, 949 860 policy->cpu); 861 + struct acpi_cpufreq_data *data = policy->driver_data; 862 + unsigned int freq = policy->freq_table[data->first_perf_state].frequency; 950 863 951 - if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq) 864 + if (perf->states[0].core_frequency * 1000 != freq) 952 865 pr_warn(FW_WARN "P-state 0 is not max freq\n"); 953 866 } 954 867