Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

arm64: trans_pgd: hibernate: idmap the single page that holds the copy page routines

To resume from hibernate, the contents of memory are restored from
the swap image. This may overwrite any page, including the running
kernel and its page tables.

Hibernate copies the code it uses to do the restore into a single
page that it knows won't be overwritten, and maps it with page tables
built from pages that won't be overwritten.

Today the address it uses for this mapping is arbitrary, but to allow
kexec to reuse this code, it needs to be idmapped. To idmap the page
we must avoid the kernel helpers that have VA_BITS baked in.

Convert create_single_mapping() to take a single PA, and idmap it.
The page tables are built in the reverse order to normal using
pfn_pte() to stir in any bits between 52:48. T0SZ is always increased
to cover 48 bits, or 52 if the copy code has bits 52:48 in its PA.

Signed-off-by: James Morse <james.morse@arm.com>

[Adopted the original patch from James to trans_pgd interface, so it can be
commonly used by both Kexec and Hibernate. Some minor clean-ups.]

Signed-off-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Link: https://lore.kernel.org/linux-arm-kernel/20200115143322.214247-4-james.morse@arm.com/
Link: https://lore.kernel.org/r/20210125191923.1060122-9-pasha.tatashin@soleen.com
Signed-off-by: Will Deacon <will@kernel.org>

authored by

James Morse and committed by
Will Deacon
7018d467 1401bef7

+63 -21
+3
arch/arm64/include/asm/trans_pgd.h
··· 33 33 int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, 34 34 void *page, unsigned long dst_addr, pgprot_t pgprot); 35 35 36 + int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0, 37 + unsigned long *t0sz, void *page); 38 + 36 39 #endif /* _ASM_TRANS_TABLE_H */
+11 -21
arch/arm64/kernel/hibernate.c
··· 194 194 * page system. 195 195 */ 196 196 static int create_safe_exec_page(void *src_start, size_t length, 197 - unsigned long dst_addr, 198 197 phys_addr_t *phys_dst_addr) 199 198 { 200 199 struct trans_pgd_info trans_info = { ··· 202 203 }; 203 204 204 205 void *page = (void *)get_safe_page(GFP_ATOMIC); 205 - pgd_t *trans_pgd; 206 + phys_addr_t trans_ttbr0; 207 + unsigned long t0sz; 206 208 int rc; 207 209 208 210 if (!page) ··· 211 211 212 212 memcpy(page, src_start, length); 213 213 __flush_icache_range((unsigned long)page, (unsigned long)page + length); 214 - 215 - trans_pgd = (void *)get_safe_page(GFP_ATOMIC); 216 - if (!trans_pgd) 217 - return -ENOMEM; 218 - 219 - rc = trans_pgd_map_page(&trans_info, trans_pgd, page, dst_addr, 220 - PAGE_KERNEL_EXEC); 214 + rc = trans_pgd_idmap_page(&trans_info, &trans_ttbr0, &t0sz, page); 221 215 if (rc) 222 216 return rc; 223 217 ··· 224 230 * page, but TLBs may contain stale ASID-tagged entries (e.g. for EFI 225 231 * runtime services), while for a userspace-driven test_resume cycle it 226 232 * points to userspace page tables (and we must point it at a zero page 227 - * ourselves). Elsewhere we only (un)install the idmap with preemption 228 - * disabled, so T0SZ should be as required regardless. 233 + * ourselves). 234 + * 235 + * We change T0SZ as part of installing the idmap. This is undone by 236 + * cpu_uninstall_idmap() in __cpu_suspend_exit(). 
229 237 */ 230 238 cpu_set_reserved_ttbr0(); 231 239 local_flush_tlb_all(); 232 - write_sysreg(phys_to_ttbr(virt_to_phys(trans_pgd)), ttbr0_el1); 240 + __cpu_set_tcr_t0sz(t0sz); 241 + write_sysreg(trans_ttbr0, ttbr0_el1); 233 242 isb(); 234 243 235 244 *phys_dst_addr = virt_to_phys(page); ··· 431 434 void *zero_page; 432 435 size_t exit_size; 433 436 pgd_t *tmp_pg_dir; 434 - phys_addr_t phys_hibernate_exit; 435 437 void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *, 436 438 void *, phys_addr_t, phys_addr_t); 437 439 struct trans_pgd_info trans_info = { ··· 458 462 return -ENOMEM; 459 463 } 460 464 461 - /* 462 - * Locate the exit code in the bottom-but-one page, so that *NULL 463 - * still has disastrous affects. 464 - */ 465 - hibernate_exit = (void *)PAGE_SIZE; 466 465 exit_size = __hibernate_exit_text_end - __hibernate_exit_text_start; 467 466 /* 468 467 * Copy swsusp_arch_suspend_exit() to a safe page. This will generate 469 468 * a new set of ttbr0 page tables and load them. 470 469 */ 471 470 rc = create_safe_exec_page(__hibernate_exit_text_start, exit_size, 472 - (unsigned long)hibernate_exit, 473 - &phys_hibernate_exit); 471 + (phys_addr_t *)&hibernate_exit); 474 472 if (rc) { 475 473 pr_err("Failed to create safe executable page for hibernate_exit code.\n"); 476 474 return rc; ··· 483 493 * We can skip this step if we booted at EL1, or are running with VHE. 484 494 */ 485 495 if (el2_reset_needed()) { 486 - phys_addr_t el2_vectors = phys_hibernate_exit; /* base */ 496 + phys_addr_t el2_vectors = (phys_addr_t)hibernate_exit; 487 497 el2_vectors += hibernate_el2_vectors - 488 498 __hibernate_exit_text_start; /* offset */ 489 499
+49
arch/arm64/mm/trans_pgd.c
··· 273 273 274 274 return 0; 275 275 } 276 + 277 + /* 278 + * The page we want to idmap may be outside the range covered by VA_BITS that 279 + * can be built using the kernel's p?d_populate() helpers. As a one off, for a 280 + * single page, we build these page tables bottom up and just assume that will 281 + * need the maximum T0SZ. 282 + * 283 + * Returns 0 on success, and -ENOMEM on failure. 284 + * On success trans_ttbr0 contains page table with idmapped page, t0sz is set to 285 + * maximum T0SZ for this page. 286 + */ 287 + int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0, 288 + unsigned long *t0sz, void *page) 289 + { 290 + phys_addr_t dst_addr = virt_to_phys(page); 291 + unsigned long pfn = __phys_to_pfn(dst_addr); 292 + int max_msb = (dst_addr & GENMASK(52, 48)) ? 51 : 47; 293 + int bits_mapped = PAGE_SHIFT - 4; 294 + unsigned long level_mask, prev_level_entry, *levels[4]; 295 + int this_level, index, level_lsb, level_msb; 296 + 297 + dst_addr &= PAGE_MASK; 298 + prev_level_entry = pte_val(pfn_pte(pfn, PAGE_KERNEL_EXEC)); 299 + 300 + for (this_level = 3; this_level >= 0; this_level--) { 301 + levels[this_level] = trans_alloc(info); 302 + if (!levels[this_level]) 303 + return -ENOMEM; 304 + 305 + level_lsb = ARM64_HW_PGTABLE_LEVEL_SHIFT(this_level); 306 + level_msb = min(level_lsb + bits_mapped, max_msb); 307 + level_mask = GENMASK_ULL(level_msb, level_lsb); 308 + 309 + index = (dst_addr & level_mask) >> level_lsb; 310 + *(levels[this_level] + index) = prev_level_entry; 311 + 312 + pfn = virt_to_pfn(levels[this_level]); 313 + prev_level_entry = pte_val(pfn_pte(pfn, 314 + __pgprot(PMD_TYPE_TABLE))); 315 + 316 + if (level_msb == max_msb) 317 + break; 318 + } 319 + 320 + *trans_ttbr0 = phys_to_ttbr(__pfn_to_phys(pfn)); 321 + *t0sz = TCR_T0SZ(max_msb + 1); 322 + 323 + return 0; 324 + }