Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch kvm-arm64/s1ptw-write-fault into kvmarm-master/fixes

* kvm-arm64/s1ptw-write-fault:
: .
: Fix S1PTW fault handling that was until then always taken
: as a write. From the cover letter:
:
: "Recent developments on the EFI front have resulted in guests that
: simply won't boot if the page tables are in a read-only memslot and
: that you're a bit unlucky in the way S2 gets paged in... The core
: issue is related to the fact that we treat a S1PTW as a write, which
: is close enough to what needs to be done. Until you get to RO memslots.
:
: The first patch fixes this and is definitely a stable candidate. It
: splits the faulting of page tables in two steps (RO translation fault,
: followed by a writable permission fault -- should it even happen).
: The second one documents the slightly odd behaviour of PTW writes to
: RO memslot, which do not result in a KVM_MMIO exit. The last patch is
: totally optional, only tangentially related, and randomly repainting
: stuff (maybe that's contagious, who knows)."
:
: .
KVM: arm64: Convert FSC_* over to ESR_ELx_FSC_*
KVM: arm64: Document the behaviour of S1PTW faults on RO memslots
KVM: arm64: Fix S1PTW handling on RO memslots

Signed-off-by: Marc Zyngier <maz@kernel.org>

+61 -38
+8
Documentation/virt/kvm/api.rst
··· 1354 1354 mmap() that affects the region will be made visible immediately. Another 1355 1355 example is madvise(MADV_DROP). 1356 1356 1357 + Note: On arm64, a write generated by the page-table walker (to update 1358 + the Access and Dirty flags, for example) never results in a 1359 + KVM_EXIT_MMIO exit when the slot has the KVM_MEM_READONLY flag. This 1360 + is because KVM cannot provide the data that would be written by the 1361 + page-table walker, making it impossible to emulate the access. 1362 + Instead, an abort (data abort if the cause of the page-table update 1363 + was a load or a store, instruction abort if it was an instruction 1364 + fetch) is injected in the guest. 1357 1365 1358 1366 4.36 KVM_SET_TSS_ADDR 1359 1367 ---------------------
+9
arch/arm64/include/asm/esr.h
··· 114 114 #define ESR_ELx_FSC_ACCESS (0x08) 115 115 #define ESR_ELx_FSC_FAULT (0x04) 116 116 #define ESR_ELx_FSC_PERM (0x0C) 117 + #define ESR_ELx_FSC_SEA_TTW0 (0x14) 118 + #define ESR_ELx_FSC_SEA_TTW1 (0x15) 119 + #define ESR_ELx_FSC_SEA_TTW2 (0x16) 120 + #define ESR_ELx_FSC_SEA_TTW3 (0x17) 121 + #define ESR_ELx_FSC_SECC (0x18) 122 + #define ESR_ELx_FSC_SECC_TTW0 (0x1c) 123 + #define ESR_ELx_FSC_SECC_TTW1 (0x1d) 124 + #define ESR_ELx_FSC_SECC_TTW2 (0x1e) 125 + #define ESR_ELx_FSC_SECC_TTW3 (0x1f) 117 126 118 127 /* ISS field definitions for Data Aborts */ 119 128 #define ESR_ELx_ISV_SHIFT (24)
-15
arch/arm64/include/asm/kvm_arm.h
··· 319 319 BIT(18) | \ 320 320 GENMASK(16, 15)) 321 321 322 - /* For compatibility with fault code shared with 32-bit */ 323 - #define FSC_FAULT ESR_ELx_FSC_FAULT 324 - #define FSC_ACCESS ESR_ELx_FSC_ACCESS 325 - #define FSC_PERM ESR_ELx_FSC_PERM 326 - #define FSC_SEA ESR_ELx_FSC_EXTABT 327 - #define FSC_SEA_TTW0 (0x14) 328 - #define FSC_SEA_TTW1 (0x15) 329 - #define FSC_SEA_TTW2 (0x16) 330 - #define FSC_SEA_TTW3 (0x17) 331 - #define FSC_SECC (0x18) 332 - #define FSC_SECC_TTW0 (0x1c) 333 - #define FSC_SECC_TTW1 (0x1d) 334 - #define FSC_SECC_TTW2 (0x1e) 335 - #define FSC_SECC_TTW3 (0x1f) 336 - 337 322 /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ 338 323 #define HPFAR_MASK (~UL(0xf)) 339 324 /*
+30 -12
arch/arm64/include/asm/kvm_emulate.h
··· 349 349 static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu) 350 350 { 351 351 switch (kvm_vcpu_trap_get_fault(vcpu)) { 352 - case FSC_SEA: 353 - case FSC_SEA_TTW0: 354 - case FSC_SEA_TTW1: 355 - case FSC_SEA_TTW2: 356 - case FSC_SEA_TTW3: 357 - case FSC_SECC: 358 - case FSC_SECC_TTW0: 359 - case FSC_SECC_TTW1: 360 - case FSC_SECC_TTW2: 361 - case FSC_SECC_TTW3: 352 + case ESR_ELx_FSC_EXTABT: 353 + case ESR_ELx_FSC_SEA_TTW0: 354 + case ESR_ELx_FSC_SEA_TTW1: 355 + case ESR_ELx_FSC_SEA_TTW2: 356 + case ESR_ELx_FSC_SEA_TTW3: 357 + case ESR_ELx_FSC_SECC: 358 + case ESR_ELx_FSC_SECC_TTW0: 359 + case ESR_ELx_FSC_SECC_TTW1: 360 + case ESR_ELx_FSC_SECC_TTW2: 361 + case ESR_ELx_FSC_SECC_TTW3: 362 362 return true; 363 363 default: 364 364 return false; ··· 373 373 374 374 static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu) 375 375 { 376 - if (kvm_vcpu_abt_iss1tw(vcpu)) 377 - return true; 376 + if (kvm_vcpu_abt_iss1tw(vcpu)) { 377 + /* 378 + * Only a permission fault on a S1PTW should be 379 + * considered as a write. Otherwise, page tables baked 380 + * in a read-only memslot will result in an exception 381 + * being delivered in the guest. 382 + * 383 + * The drawback is that we end-up faulting twice if the 384 + * guest is using any of HW AF/DB: a translation fault 385 + * to map the page containing the PT (read only at 386 + * first), then a permission fault to allow the flags 387 + * to be set. 388 + */ 389 + switch (kvm_vcpu_trap_get_fault_type(vcpu)) { 390 + case ESR_ELx_FSC_PERM: 391 + return true; 392 + default: 393 + return false; 394 + } 395 + } 378 396 379 397 if (kvm_vcpu_trap_is_iabt(vcpu)) 380 398 return false;
+1 -1
arch/arm64/kvm/hyp/include/hyp/fault.h
··· 60 60 */ 61 61 if (!(esr & ESR_ELx_S1PTW) && 62 62 (cpus_have_final_cap(ARM64_WORKAROUND_834220) || 63 - (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) { 63 + (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM)) { 64 64 if (!__translate_far_to_hpfar(far, &hpfar)) 65 65 return false; 66 66 } else {
+1 -1
arch/arm64/kvm/hyp/include/hyp/switch.h
··· 367 367 if (static_branch_unlikely(&vgic_v2_cpuif_trap)) { 368 368 bool valid; 369 369 370 - valid = kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT && 370 + valid = kvm_vcpu_trap_get_fault_type(vcpu) == ESR_ELx_FSC_FAULT && 371 371 kvm_vcpu_dabt_isvalid(vcpu) && 372 372 !kvm_vcpu_abt_issea(vcpu) && 373 373 !kvm_vcpu_abt_iss1tw(vcpu);
+12 -9
arch/arm64/kvm/mmu.c
··· 1212 1212 exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); 1213 1213 VM_BUG_ON(write_fault && exec_fault); 1214 1214 1215 - if (fault_status == FSC_PERM && !write_fault && !exec_fault) { 1215 + if (fault_status == ESR_ELx_FSC_PERM && !write_fault && !exec_fault) { 1216 1216 kvm_err("Unexpected L2 read permission error\n"); 1217 1217 return -EFAULT; 1218 1218 } ··· 1277 1277 * only exception to this is when dirty logging is enabled at runtime 1278 1278 * and a write fault needs to collapse a block entry into a table. 1279 1279 */ 1280 - if (fault_status != FSC_PERM || (logging_active && write_fault)) { 1280 + if (fault_status != ESR_ELx_FSC_PERM || 1281 + (logging_active && write_fault)) { 1281 1282 ret = kvm_mmu_topup_memory_cache(memcache, 1282 1283 kvm_mmu_cache_min_pages(kvm)); 1283 1284 if (ret) ··· 1343 1342 * backed by a THP and thus use block mapping if possible. 1344 1343 */ 1345 1344 if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) { 1346 - if (fault_status == FSC_PERM && fault_granule > PAGE_SIZE) 1345 + if (fault_status == ESR_ELx_FSC_PERM && 1346 + fault_granule > PAGE_SIZE) 1347 1347 vma_pagesize = fault_granule; 1348 1348 else 1349 1349 vma_pagesize = transparent_hugepage_adjust(kvm, memslot, ··· 1352 1350 &fault_ipa); 1353 1351 } 1354 1352 1355 - if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) { 1353 + if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) { 1356 1354 /* Check the VMM hasn't introduced a new disallowed VMA */ 1357 1355 if (kvm_vma_mte_allowed(vma)) { 1358 1356 sanitise_mte_tags(kvm, pfn, vma_pagesize); ··· 1378 1376 * permissions only if vma_pagesize equals fault_granule. Otherwise, 1379 1377 * kvm_pgtable_stage2_map() should be called to change block size. 
1380 1378 */ 1381 - if (fault_status == FSC_PERM && vma_pagesize == fault_granule) 1379 + if (fault_status == ESR_ELx_FSC_PERM && vma_pagesize == fault_granule) 1382 1380 ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); 1383 1381 else 1384 1382 ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, ··· 1443 1441 fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); 1444 1442 is_iabt = kvm_vcpu_trap_is_iabt(vcpu); 1445 1443 1446 - if (fault_status == FSC_FAULT) { 1444 + if (fault_status == ESR_ELx_FSC_FAULT) { 1447 1445 /* Beyond sanitised PARange (which is the IPA limit) */ 1448 1446 if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { 1449 1447 kvm_inject_size_fault(vcpu); ··· 1478 1476 kvm_vcpu_get_hfar(vcpu), fault_ipa); 1479 1477 1480 1478 /* Check the stage-2 fault is trans. fault or write fault */ 1481 - if (fault_status != FSC_FAULT && fault_status != FSC_PERM && 1482 - fault_status != FSC_ACCESS) { 1479 + if (fault_status != ESR_ELx_FSC_FAULT && 1480 + fault_status != ESR_ELx_FSC_PERM && 1481 + fault_status != ESR_ELx_FSC_ACCESS) { 1483 1482 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", 1484 1483 kvm_vcpu_trap_get_class(vcpu), 1485 1484 (unsigned long)kvm_vcpu_trap_get_fault(vcpu), ··· 1542 1539 /* Userspace should not be able to register out-of-bounds IPAs */ 1543 1540 VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm)); 1544 1541 1545 - if (fault_status == FSC_ACCESS) { 1542 + if (fault_status == ESR_ELx_FSC_ACCESS) { 1546 1543 handle_access_fault(vcpu, fault_ipa); 1547 1544 ret = 1; 1548 1545 goto out_unlock;