Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/irq: KVM: Add helper for harvesting PIR to deduplicate KVM and posted MSIs

Now that posted MSI and KVM harvesting of PIR are identical, extract the
code (and posted MSI's wonderful comment) into a common helper.

No functional change intended.

Link: https://lore.kernel.org/r/20250401163447.846608-9-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>

+69 -61
+64
arch/x86/include/asm/posted_intr.h
··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 2 #ifndef _X86_POSTED_INTR_H 3 3 #define _X86_POSTED_INTR_H 4 + 5 + #include <asm/cmpxchg.h> 6 + #include <asm/rwonce.h> 4 7 #include <asm/irq_vectors.h> 8 + 9 + #include <linux/bitmap.h> 5 10 6 11 #define POSTED_INTR_ON 0 7 12 #define POSTED_INTR_SN 1 ··· 30 25 }; 31 26 u32 rsvd[6]; 32 27 } __aligned(64); 28 + 29 + /* 30 + * De-multiplexing posted interrupts is on the performance path, the code 31 + * below is written to optimize the cache performance based on the following 32 + * considerations: 33 + * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently 34 + * accessed by both CPU and IOMMU. 35 + * 2.During software processing of posted interrupts, the CPU needs to do 36 + * natural width read and xchg for checking and clearing posted interrupt 37 + * request (PIR), a 256 bit field within the PID. 38 + * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache 39 + * line when posting interrupts and setting control bits. 40 + * 4.The CPU can access the cache line a magnitude faster than the IOMMU. 41 + * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID 42 + * cache line. The cache line states after each operation are as follows, 43 + * assuming a 64-bit kernel: 44 + * CPU IOMMU PID Cache line state 45 + * --------------------------------------------------------------- 46 + *...read64 exclusive 47 + *...lock xchg64 modified 48 + *... post/atomic swap invalid 49 + *...------------------------------------------------------------- 50 + * 51 + * To reduce L1 data cache miss, it is important to avoid contention with 52 + * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used 53 + * when processing posted interrupts in software, e.g. to dispatch interrupt 54 + * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR 55 + * in KVM. 
56 + * 57 + * In addition, the code is trying to keep the cache line state consistent 58 + * as much as possible. e.g. when making a copy and clearing the PIR 59 + * (assuming non-zero PIR bits are present in the entire PIR), it does: 60 + * read, read, read, read, xchg, xchg, xchg, xchg 61 + * instead of: 62 + * read, xchg, read, xchg, read, xchg, read, xchg 63 + */ 64 + static __always_inline bool pi_harvest_pir(unsigned long *pir, 65 + unsigned long *pir_vals) 66 + { 67 + unsigned long pending = 0; 68 + int i; 69 + 70 + for (i = 0; i < NR_PIR_WORDS; i++) { 71 + pir_vals[i] = READ_ONCE(pir[i]); 72 + pending |= pir_vals[i]; 73 + } 74 + 75 + if (!pending) 76 + return false; 77 + 78 + for (i = 0; i < NR_PIR_WORDS; i++) { 79 + if (!pir_vals[i]) 80 + continue; 81 + 82 + pir_vals[i] = arch_xchg(&pir[i], 0); 83 + } 84 + 85 + return true; 86 + } 33 87 34 88 static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) 35 89 {
+3 -47
arch/x86/kernel/irq.c
··· 380 380 this_cpu_write(posted_msi_pi_desc.ndst, destination); 381 381 } 382 382 383 - /* 384 - * De-multiplexing posted interrupts is on the performance path, the code 385 - * below is written to optimize the cache performance based on the following 386 - * considerations: 387 - * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently 388 - * accessed by both CPU and IOMMU. 389 - * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg 390 - * for checking and clearing posted interrupt request (PIR), a 256 bit field 391 - * within the PID. 392 - * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache 393 - * line when posting interrupts and setting control bits. 394 - * 4.The CPU can access the cache line a magnitude faster than the IOMMU. 395 - * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID 396 - * cache line. The cache line states after each operation are as follows: 397 - * CPU IOMMU PID Cache line state 398 - * --------------------------------------------------------------- 399 - *...read64 exclusive 400 - *...lock xchg64 modified 401 - *... post/atomic swap invalid 402 - *...------------------------------------------------------------- 403 - * 404 - * To reduce L1 data cache miss, it is important to avoid contention with 405 - * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used 406 - * to dispatch interrupt handlers. 407 - * 408 - * In addition, the code is trying to keep the cache line state consistent 409 - * as much as possible. e.g. 
when making a copy and clearing the PIR 410 - * (assuming non-zero PIR bits are present in the entire PIR), it does: 411 - * read, read, read, read, xchg, xchg, xchg, xchg 412 - * instead of: 413 - * read, xchg, read, xchg, read, xchg, read, xchg 414 - */ 415 383 static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs) 416 384 { 417 - unsigned long pir_copy[NR_PIR_WORDS], pending = 0; 418 - int i, vec = FIRST_EXTERNAL_VECTOR; 385 + unsigned long pir_copy[NR_PIR_WORDS]; 386 + int vec = FIRST_EXTERNAL_VECTOR; 419 387 420 - for (i = 0; i < NR_PIR_WORDS; i++) { 421 - pir_copy[i] = READ_ONCE(pir[i]); 422 - pending |= pir_copy[i]; 423 - } 424 - 425 - if (!pending) 388 + if (!pi_harvest_pir(pir, pir_copy)) 426 389 return false; 427 - 428 - for (i = 0; i < NR_PIR_WORDS; i++) { 429 - if (!pir_copy[i]) 430 - continue; 431 - 432 - pir_copy[i] = arch_xchg(&pir[i], 0); 433 - } 434 390 435 391 for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) 436 392 call_irq_handler(vec, regs);
+2 -14
arch/x86/kvm/lapic.c
··· 657 657 658 658 bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) 659 659 { 660 - unsigned long pir_vals[NR_PIR_WORDS], pending = 0; 660 + unsigned long pir_vals[NR_PIR_WORDS]; 661 661 u32 *__pir = (void *)pir_vals; 662 662 u32 i, vec; 663 663 u32 irr_val, prev_irr_val; ··· 666 666 max_updated_irr = -1; 667 667 *max_irr = -1; 668 668 669 - for (i = 0; i < NR_PIR_WORDS; i++) { 670 - pir_vals[i] = READ_ONCE(pir[i]); 671 - pending |= pir_vals[i]; 672 - } 673 - 674 - if (!pending) 669 + if (!pi_harvest_pir(pir, pir_vals)) 675 670 return false; 676 - 677 - for (i = 0; i < NR_PIR_WORDS; i++) { 678 - if (!pir_vals[i]) 679 - continue; 680 - 681 - pir_vals[i] = arch_xchg(&pir[i], 0); 682 - } 683 671 684 672 for (i = vec = 0; i <= 7; i++, vec += 32) { 685 673 u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);