Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branches 'pm-sleep', 'pm-domains' and 'pm-docs'

Merge changes related to system sleep, PM domains changes and power
management documentation changes for 5.18-rc1:

- Fix load_image_and_restore() error path (Ye Bin).

- Fix typos in comments in the system wakeup handling code (Tom Rix).

- Clean up non-kernel-doc comments in hibernation code (Jiapeng
Chong).

- Fix __setup handler error handling in system-wide suspend and
hibernation core code (Randy Dunlap).

- Add device name to suspend_report_result() (Youngjin Jang).

- Make virtual guests honour ACPI S4 hardware signature by
default (David Woodhouse).

- Block power off of a parent PM domain unless child is in deepest
state (Ulf Hansson).

- Use dev_err_probe() to simplify error handling for generic PM
domains (Ahmad Fatoum).

- Fix sleep-in-atomic bug caused by genpd_debug_remove() (Shawn Guo).

- Document Intel uncore frequency scaling (Srinivas Pandruvada).

* pm-sleep:
PM: hibernate: Honour ACPI hardware signature by default for virtual guests
PM: sleep: Add device name to suspend_report_result()
PM: suspend: fix return value of __setup handler
PM: hibernate: fix __setup handler error handling
PM: hibernate: Clean up non-kernel-doc comments
PM: sleep: wakeup: Fix typos in comments
PM: hibernate: fix load_image_and_restore() error path

* pm-domains:
PM: domains: Fix sleep-in-atomic bug caused by genpd_debug_remove()
PM: domains: use dev_err_probe() to simplify error handling
PM: domains: Prevent power off for parent unless child is in deepest state

* pm-docs:
Documentation: admin-guide: pm: Document uncore frequency scaling

+146 -59
+60
Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst
··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + .. include:: <isonum.txt> 3 + 4 + ============================== 5 + Intel Uncore Frequency Scaling 6 + ============================== 7 + 8 + :Copyright: |copy| 2022 Intel Corporation 9 + 10 + :Author: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com> 11 + 12 + Introduction 13 + ------------ 14 + 15 + The uncore can consume a significant amount of power in Intel's Xeon servers based 16 + on the workload characteristics. To optimize the total power and improve overall 17 + performance, SoCs have internal algorithms for scaling uncore frequency. These 18 + algorithms monitor workload usage of uncore and set a desirable frequency. 19 + 20 + It is possible that users have different expectations of uncore performance and 21 + want to have control over it. The objective is similar to allowing users to set 22 + the scaling min/max frequencies via cpufreq sysfs to improve CPU performance. 23 + Users may have some latency sensitive workloads where they do not want any 24 + change to uncore frequency. Also, users may have workloads which require 25 + different core and uncore performance at distinct phases and they may want to 26 + use both cpufreq and the uncore scaling interface to distribute power and 27 + improve overall performance. 28 + 29 + Sysfs Interface 30 + --------------- 31 + 32 + To control uncore frequency, a sysfs interface is provided in the directory: 33 + `/sys/devices/system/cpu/intel_uncore_frequency/`. 34 + 35 + There is one directory for each package and die combination as the scope of 36 + uncore scaling control is per die in multiple die/package SoCs or per 37 + package for single die per package SoCs. The name represents the 38 + scope of control. For example: 'package_00_die_00' is for package id 0 and 39 + die 0. 40 + 41 + Each package_*_die_* contains the following attributes: 42 + 43 + ``initial_max_freq_khz`` 44 + Out of reset, this attribute represents the maximum possible frequency. 
45 + This is a read-only attribute. If users adjust max_freq_khz, 46 + they can always go back to maximum using the value from this attribute. 47 + 48 + ``initial_min_freq_khz`` 49 + Out of reset, this attribute represents the minimum possible frequency. 50 + This is a read-only attribute. If users adjust min_freq_khz, 51 + they can always go back to minimum using the value from this attribute. 52 + 53 + ``max_freq_khz`` 54 + This attribute is used to set the maximum uncore frequency. 55 + 56 + ``min_freq_khz`` 57 + This attribute is used to set the minimum uncore frequency. 58 + 59 + ``current_freq_khz`` 60 + This attribute is used to get the current uncore frequency.
+1
Documentation/admin-guide/pm/working-state.rst
··· 15 15 cpufreq_drivers 16 16 intel_epb 17 17 intel-speed-select 18 + intel_uncore_frequency_scaling
+21 -2
arch/x86/kernel/acpi/sleep.c
··· 15 15 #include <asm/desc.h> 16 16 #include <asm/cacheflush.h> 17 17 #include <asm/realmode.h> 18 + #include <asm/hypervisor.h> 18 19 19 20 #include <linux/ftrace.h> 20 21 #include "../../realmode/rm/wakeup.h" ··· 141 140 acpi_realmode_flags |= 4; 142 141 #ifdef CONFIG_HIBERNATION 143 142 if (strncmp(str, "s4_hwsig", 8) == 0) 144 - acpi_check_s4_hw_signature(1); 143 + acpi_check_s4_hw_signature = 1; 145 144 if (strncmp(str, "s4_nohwsig", 10) == 0) 146 - acpi_check_s4_hw_signature(0); 145 + acpi_check_s4_hw_signature = 0; 147 146 #endif 148 147 if (strncmp(str, "nonvs", 5) == 0) 149 148 acpi_nvs_nosave(); ··· 161 160 } 162 161 163 162 __setup("acpi_sleep=", acpi_sleep_setup); 163 + 164 + #if defined(CONFIG_HIBERNATION) && defined(CONFIG_HYPERVISOR_GUEST) 165 + static int __init init_s4_sigcheck(void) 166 + { 167 + /* 168 + * If running on a hypervisor, honour the ACPI specification 169 + * by default and trigger a clean reboot when the hardware 170 + * signature in FACS is changed after hibernation. 171 + */ 172 + if (acpi_check_s4_hw_signature == -1 && 173 + !hypervisor_is_type(X86_HYPER_NATIVE)) 174 + acpi_check_s4_hw_signature = 1; 175 + 176 + return 0; 177 + } 178 + /* This must happen before acpi_init() which is a subsys initcall */ 179 + arch_initcall(init_s4_sigcheck); 180 + #endif
+3 -8
drivers/acpi/sleep.c
··· 869 869 #ifdef CONFIG_HIBERNATION 870 870 static unsigned long s4_hardware_signature; 871 871 static struct acpi_table_facs *facs; 872 - static int sigcheck = -1; /* Default behaviour is just to warn */ 873 - 874 - void __init acpi_check_s4_hw_signature(int check) 875 - { 876 - sigcheck = check; 877 - } 872 + int acpi_check_s4_hw_signature = -1; /* Default behaviour is just to warn */ 878 873 879 874 static int acpi_hibernation_begin(pm_message_t stage) 880 875 { ··· 994 999 hibernation_set_ops(old_suspend_ordering ? 995 1000 &acpi_hibernation_ops_old : &acpi_hibernation_ops); 996 1001 sleep_states[ACPI_STATE_S4] = 1; 997 - if (!sigcheck) 1002 + if (!acpi_check_s4_hw_signature) 998 1003 return; 999 1004 1000 1005 acpi_get_table(ACPI_SIG_FACS, 1, (struct acpi_table_header **)&facs); ··· 1006 1011 */ 1007 1012 s4_hardware_signature = facs->hardware_signature; 1008 1013 1009 - if (sigcheck > 0) { 1014 + if (acpi_check_s4_hw_signature > 0) { 1010 1015 /* 1011 1016 * If we're actually obeying the ACPI specification 1012 1017 * then the signature is written out as part of the
+26 -16
drivers/base/power/domain.c
··· 636 636 atomic_read(&genpd->sd_count) > 0) 637 637 return -EBUSY; 638 638 639 + /* 640 + * The children must be in their deepest (powered-off) states to allow 641 + * the parent to be powered off. Note that, there's no need for 642 + * additional locking, as powering on a child, requires the parent's 643 + * lock to be acquired first. 644 + */ 645 + list_for_each_entry(link, &genpd->parent_links, parent_node) { 646 + struct generic_pm_domain *child = link->child; 647 + if (child->state_idx < child->state_count - 1) 648 + return -EBUSY; 649 + } 650 + 639 651 list_for_each_entry(pdd, &genpd->dev_list, list_node) { 640 652 enum pm_qos_flags_status stat; 641 653 ··· 1084 1072 if (genpd->suspended_count != genpd->device_count 1085 1073 || atomic_read(&genpd->sd_count) > 0) 1086 1074 return; 1075 + 1076 + /* Check that the children are in their deepest (powered-off) state. */ 1077 + list_for_each_entry(link, &genpd->parent_links, parent_node) { 1078 + struct generic_pm_domain *child = link->child; 1079 + if (child->state_idx < child->state_count - 1) 1080 + return; 1081 + } 1087 1082 1088 1083 /* Choose the deepest state when suspending */ 1089 1084 genpd->state_idx = genpd->state_count - 1; ··· 2077 2058 kfree(link); 2078 2059 } 2079 2060 2080 - genpd_debug_remove(genpd); 2081 2061 list_del(&genpd->gpd_list_node); 2082 2062 genpd_unlock(genpd); 2063 + genpd_debug_remove(genpd); 2083 2064 cancel_work_sync(&genpd->power_off_work); 2084 2065 if (genpd_is_cpu_domain(genpd)) 2085 2066 free_cpumask_var(genpd->cpus); ··· 2267 2248 /* Parse genpd OPP table */ 2268 2249 if (genpd->set_performance_state) { 2269 2250 ret = dev_pm_opp_of_add_table(&genpd->dev); 2270 - if (ret) { 2271 - if (ret != -EPROBE_DEFER) 2272 - dev_err(&genpd->dev, "Failed to add OPP table: %d\n", 2273 - ret); 2274 - return ret; 2275 - } 2251 + if (ret) 2252 + return dev_err_probe(&genpd->dev, ret, "Failed to add OPP table\n"); 2276 2253 2277 2254 /* 2278 2255 * Save table for faster processing while 
setting performance ··· 2327 2312 if (genpd->set_performance_state) { 2328 2313 ret = dev_pm_opp_of_add_table_indexed(&genpd->dev, i); 2329 2314 if (ret) { 2330 - if (ret != -EPROBE_DEFER) 2331 - dev_err(&genpd->dev, "Failed to add OPP table for index %d: %d\n", 2332 - i, ret); 2315 + dev_err_probe(&genpd->dev, ret, 2316 + "Failed to add OPP table for index %d\n", i); 2333 2317 goto error; 2334 2318 } 2335 2319 ··· 2686 2672 ret = genpd_add_device(pd, dev, base_dev); 2687 2673 mutex_unlock(&gpd_list_lock); 2688 2674 2689 - if (ret < 0) { 2690 - if (ret != -EPROBE_DEFER) 2691 - dev_err(dev, "failed to add to PM domain %s: %d", 2692 - pd->name, ret); 2693 - return ret; 2694 - } 2675 + if (ret < 0) 2676 + return dev_err_probe(dev, ret, "failed to add to PM domain %s\n", pd->name); 2695 2677 2696 2678 dev->pm_domain->detach = genpd_dev_pm_detach; 2697 2679 dev->pm_domain->sync = genpd_dev_pm_sync;
+5 -5
drivers/base/power/main.c
··· 485 485 trace_device_pm_callback_start(dev, info, state.event); 486 486 error = cb(dev); 487 487 trace_device_pm_callback_end(dev, error); 488 - suspend_report_result(cb, error); 488 + suspend_report_result(dev, cb, error); 489 489 490 490 initcall_debug_report(dev, calltime, cb, error); 491 491 ··· 1568 1568 trace_device_pm_callback_start(dev, info, state.event); 1569 1569 error = cb(dev, state); 1570 1570 trace_device_pm_callback_end(dev, error); 1571 - suspend_report_result(cb, error); 1571 + suspend_report_result(dev, cb, error); 1572 1572 1573 1573 initcall_debug_report(dev, calltime, cb, error); 1574 1574 ··· 1855 1855 device_unlock(dev); 1856 1856 1857 1857 if (ret < 0) { 1858 - suspend_report_result(callback, ret); 1858 + suspend_report_result(dev, callback, ret); 1859 1859 pm_runtime_put(dev); 1860 1860 return ret; 1861 1861 } ··· 1960 1960 } 1961 1961 EXPORT_SYMBOL_GPL(dpm_suspend_start); 1962 1962 1963 - void __suspend_report_result(const char *function, void *fn, int ret) 1963 + void __suspend_report_result(const char *function, struct device *dev, void *fn, int ret) 1964 1964 { 1965 1965 if (ret) 1966 - pr_err("%s(): %pS returns %d\n", function, fn, ret); 1966 + dev_err(dev, "%s(): %pS returns %d\n", function, fn, ret); 1967 1967 } 1968 1968 EXPORT_SYMBOL_GPL(__suspend_report_result); 1969 1969
+1 -1
drivers/base/power/wakeirq.c
··· 289 289 * 290 290 * Enables wakeirq conditionally. We need to enable wake-up interrupt 291 291 * lazily on the first rpm_suspend(). This is needed as the consumer device 292 - * starts in RPM_SUSPENDED state, and the the first pm_runtime_get() would 292 + * starts in RPM_SUSPENDED state, and the first pm_runtime_get() would 293 293 * otherwise try to disable already disabled wakeirq. The wake-up interrupt 294 294 * starts disabled with IRQ_NOAUTOEN set. 295 295 *
+2 -2
drivers/base/power/wakeup.c
··· 587 587 * @ws: Wakeup source to handle. 588 588 * 589 589 * Update the @ws' statistics and, if @ws has just been activated, notify the PM 590 - * core of the event by incrementing the counter of of wakeup events being 590 + * core of the event by incrementing the counter of the wakeup events being 591 591 * processed. 592 592 */ 593 593 static void wakeup_source_activate(struct wakeup_source *ws) ··· 733 733 734 734 /* 735 735 * Increment the counter of registered wakeup events and decrement the 736 - * couter of wakeup events in progress simultaneously. 736 + * counter of wakeup events in progress simultaneously. 737 737 */ 738 738 cec = atomic_add_return(MAX_IN_PROGRESS, &combined_event_count); 739 739 trace_wakeup_source_deactivate(ws->name, cec);
+7 -7
drivers/pci/pci-driver.c
··· 596 596 int error; 597 597 598 598 error = drv->suspend(pci_dev, state); 599 - suspend_report_result(drv->suspend, error); 599 + suspend_report_result(dev, drv->suspend, error); 600 600 if (error) 601 601 return error; 602 602 ··· 775 775 int error; 776 776 777 777 error = pm->suspend(dev); 778 - suspend_report_result(pm->suspend, error); 778 + suspend_report_result(dev, pm->suspend, error); 779 779 if (error) 780 780 return error; 781 781 ··· 821 821 int error; 822 822 823 823 error = pm->suspend_noirq(dev); 824 - suspend_report_result(pm->suspend_noirq, error); 824 + suspend_report_result(dev, pm->suspend_noirq, error); 825 825 if (error) 826 826 return error; 827 827 ··· 1010 1010 int error; 1011 1011 1012 1012 error = pm->freeze(dev); 1013 - suspend_report_result(pm->freeze, error); 1013 + suspend_report_result(dev, pm->freeze, error); 1014 1014 if (error) 1015 1015 return error; 1016 1016 } ··· 1030 1030 int error; 1031 1031 1032 1032 error = pm->freeze_noirq(dev); 1033 - suspend_report_result(pm->freeze_noirq, error); 1033 + suspend_report_result(dev, pm->freeze_noirq, error); 1034 1034 if (error) 1035 1035 return error; 1036 1036 } ··· 1116 1116 int error; 1117 1117 1118 1118 error = pm->poweroff(dev); 1119 - suspend_report_result(pm->poweroff, error); 1119 + suspend_report_result(dev, pm->poweroff, error); 1120 1120 if (error) 1121 1121 return error; 1122 1122 } ··· 1154 1154 int error; 1155 1155 1156 1156 error = pm->poweroff_noirq(dev); 1157 - suspend_report_result(pm->poweroff_noirq, error); 1157 + suspend_report_result(dev, pm->poweroff_noirq, error); 1158 1158 if (error) 1159 1159 return error; 1160 1160 }
+1 -1
drivers/pnp/driver.c
··· 171 171 172 172 if (pnp_drv->driver.pm && pnp_drv->driver.pm->suspend) { 173 173 error = pnp_drv->driver.pm->suspend(dev); 174 - suspend_report_result(pnp_drv->driver.pm->suspend, error); 174 + suspend_report_result(dev, pnp_drv->driver.pm->suspend, error); 175 175 if (error) 176 176 return error; 177 177 }
+2 -2
drivers/usb/core/hcd-pci.c
··· 446 446 HCD_WAKEUP_PENDING(hcd->shared_hcd)) 447 447 return -EBUSY; 448 448 retval = hcd->driver->pci_suspend(hcd, do_wakeup); 449 - suspend_report_result(hcd->driver->pci_suspend, retval); 449 + suspend_report_result(dev, hcd->driver->pci_suspend, retval); 450 450 451 451 /* Check again in case wakeup raced with pci_suspend */ 452 452 if ((retval == 0 && do_wakeup && HCD_WAKEUP_PENDING(hcd)) || ··· 556 556 dev_dbg(dev, "--> PCI %s\n", 557 557 pci_power_name(pci_dev->current_state)); 558 558 } else { 559 - suspend_report_result(pci_prepare_to_sleep, retval); 559 + suspend_report_result(dev, pci_prepare_to_sleep, retval); 560 560 return retval; 561 561 } 562 562
+1 -1
include/linux/acpi.h
··· 526 526 int acpi_resources_are_enforced(void); 527 527 528 528 #ifdef CONFIG_HIBERNATION 529 - void __init acpi_check_s4_hw_signature(int check); 529 + extern int acpi_check_s4_hw_signature; 530 530 #endif 531 531 532 532 #ifdef CONFIG_PM_SLEEP
+4 -4
include/linux/pm.h
··· 770 770 extern int dpm_suspend(pm_message_t state); 771 771 extern int dpm_prepare(pm_message_t state); 772 772 773 - extern void __suspend_report_result(const char *function, void *fn, int ret); 773 + extern void __suspend_report_result(const char *function, struct device *dev, void *fn, int ret); 774 774 775 - #define suspend_report_result(fn, ret) \ 775 + #define suspend_report_result(dev, fn, ret) \ 776 776 do { \ 777 - __suspend_report_result(__func__, fn, ret); \ 777 + __suspend_report_result(__func__, dev, fn, ret); \ 778 778 } while (0) 779 779 780 780 extern int device_pm_wait_for_dev(struct device *sub, struct device *dev); ··· 814 814 return 0; 815 815 } 816 816 817 - #define suspend_report_result(fn, ret) do {} while (0) 817 + #define suspend_report_result(dev, fn, ret) do {} while (0) 818 818 819 819 static inline int device_pm_wait_for_dev(struct device *a, struct device *b) 820 820 {
+4 -2
kernel/power/hibernate.c
··· 689 689 690 690 lock_device_hotplug(); 691 691 error = create_basic_memory_bitmaps(); 692 - if (error) 692 + if (error) { 693 + swsusp_close(FMODE_READ | FMODE_EXCL); 693 694 goto Unlock; 695 + } 694 696 695 697 error = swsusp_read(&flags); 696 698 swsusp_close(FMODE_READ | FMODE_EXCL); ··· 1330 1328 int rc = kstrtouint(str, 0, &resume_delay); 1331 1329 1332 1330 if (rc) 1333 - return rc; 1331 + pr_warn("resumedelay: bad option string '%s'\n", str); 1334 1332 return 1; 1335 1333 } 1336 1334
+4 -4
kernel/power/suspend_test.c
··· 157 157 value++; 158 158 suspend_type = strsep(&value, ","); 159 159 if (!suspend_type) 160 - return 0; 160 + return 1; 161 161 162 162 repeat = strsep(&value, ","); 163 163 if (repeat) { 164 164 if (kstrtou32(repeat, 0, &test_repeat_count_max)) 165 - return 0; 165 + return 1; 166 166 } 167 167 168 168 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) 169 169 if (!strcmp(pm_labels[i], suspend_type)) { 170 170 test_state_label = pm_labels[i]; 171 - return 0; 171 + return 1; 172 172 } 173 173 174 174 printk(warn_bad_state, suspend_type); 175 - return 0; 175 + return 1; 176 176 } 177 177 __setup("test_suspend", setup_test_suspend); 178 178
+4 -4
kernel/power/swap.c
··· 89 89 struct swap_map_page_list *next; 90 90 }; 91 91 92 - /** 92 + /* 93 93 * The swap_map_handle structure is used for handling swap in 94 94 * a file-alike way 95 95 */ ··· 117 117 118 118 static struct swsusp_header *swsusp_header; 119 119 120 - /** 120 + /* 121 121 * The following functions are used for tracing the allocated 122 122 * swap pages, so that they can be freed in case of an error. 123 123 */ ··· 171 171 return 0; 172 172 } 173 173 174 - /** 174 + /* 175 175 * alloc_swapdev_block - allocate a swap page and register that it has 176 176 * been allocated, so that it can be freed in case of an error. 177 177 */ ··· 190 190 return 0; 191 191 } 192 192 193 - /** 193 + /* 194 194 * free_all_swap_pages - free swap pages allocated for saving image data. 195 195 * It also frees the extents used to register which swap entries had been 196 196 * allocated.