x86/irq: KVM: Add helper for harvesting PIR to deduplicate KVM and posted MSIs

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

Now that the posted MSI and KVM code for harvesting the PIR is identical,
extract that code (and posted MSI's wonderful comment) into a common helper.

No functional change intended.

Link: https://lore.kernel.org/r/20250401163447.846608-9-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>

Sean Christopherson 11 months ago edaf3ede baf68a0e

+69 -61

3 changed files

expand all

arch

x86

include

asm

posted_intr.h

kernel

irq.c

kvm

lapic.c

+64

arch/x86/include/asm/posted_intr.h

··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 2 #ifndef _X86_POSTED_INTR_H 3 3 #define _X86_POSTED_INTR_H 4 + 5 + #include <asm/cmpxchg.h> 6 + #include <asm/rwonce.h> 4 7 #include <asm/irq_vectors.h> 8 + 9 + #include <linux/bitmap.h> 5 10 6 11 #define POSTED_INTR_ON 0 7 12 #define POSTED_INTR_SN 1 ··· 30 25 }; 31 26 u32 rsvd[6]; 32 27 } __aligned(64); 28 + 29 + /* 30 + * De-multiplexing posted interrupts is on the performance path, the code 31 + * below is written to optimize the cache performance based on the following 32 + * considerations: 33 + * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently 34 + * accessed by both CPU and IOMMU. 35 + * 2.During software processing of posted interrupts, the CPU needs to do 36 + * natural width read and xchg for checking and clearing posted interrupt 37 + * request (PIR), a 256 bit field within the PID. 38 + * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache 39 + * line when posting interrupts and setting control bits. 40 + * 4.The CPU can access the cache line a magnitude faster than the IOMMU. 41 + * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID 42 + * cache line. The cache line states after each operation are as follows, 43 + * assuming a 64-bit kernel: 44 + * CPU IOMMU PID Cache line state 45 + * --------------------------------------------------------------- 46 + *...read64 exclusive 47 + *...lock xchg64 modified 48 + *... post/atomic swap invalid 49 + *...------------------------------------------------------------- 50 + * 51 + * To reduce L1 data cache miss, it is important to avoid contention with 52 + * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used 53 + * when processing posted interrupts in software, e.g. to dispatch interrupt 54 + * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR 55 + * in KVM. 
56 + * 57 + * In addition, the code is trying to keep the cache line state consistent 58 + * as much as possible. e.g. when making a copy and clearing the PIR 59 + * (assuming non-zero PIR bits are present in the entire PIR), it does: 60 + * read, read, read, read, xchg, xchg, xchg, xchg 61 + * instead of: 62 + * read, xchg, read, xchg, read, xchg, read, xchg 63 + */ 64 + static __always_inline bool pi_harvest_pir(unsigned long *pir, 65 + unsigned long *pir_vals) 66 + { 67 + unsigned long pending = 0; 68 + int i; 69 + 70 + for (i = 0; i < NR_PIR_WORDS; i++) { 71 + pir_vals[i] = READ_ONCE(pir[i]); 72 + pending |= pir_vals[i]; 73 + } 74 + 75 + if (!pending) 76 + return false; 77 + 78 + for (i = 0; i < NR_PIR_WORDS; i++) { 79 + if (!pir_vals[i]) 80 + continue; 81 + 82 + pir_vals[i] = arch_xchg(&pir[i], 0); 83 + } 84 + 85 + return true; 86 + } 33 87 34 88 static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) 35 89 {

+3 -47

arch/x86/kernel/irq.c

··· 380 380 this_cpu_write(posted_msi_pi_desc.ndst, destination); 381 381 } 382 382 383 - /* 384 - * De-multiplexing posted interrupts is on the performance path, the code 385 - * below is written to optimize the cache performance based on the following 386 - * considerations: 387 - * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently 388 - * accessed by both CPU and IOMMU. 389 - * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg 390 - * for checking and clearing posted interrupt request (PIR), a 256 bit field 391 - * within the PID. 392 - * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache 393 - * line when posting interrupts and setting control bits. 394 - * 4.The CPU can access the cache line a magnitude faster than the IOMMU. 395 - * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID 396 - * cache line. The cache line states after each operation are as follows: 397 - * CPU IOMMU PID Cache line state 398 - * --------------------------------------------------------------- 399 - *...read64 exclusive 400 - *...lock xchg64 modified 401 - *... post/atomic swap invalid 402 - *...------------------------------------------------------------- 403 - * 404 - * To reduce L1 data cache miss, it is important to avoid contention with 405 - * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used 406 - * to dispatch interrupt handlers. 407 - * 408 - * In addition, the code is trying to keep the cache line state consistent 409 - * as much as possible. e.g. 
when making a copy and clearing the PIR 410 - * (assuming non-zero PIR bits are present in the entire PIR), it does: 411 - * read, read, read, read, xchg, xchg, xchg, xchg 412 - * instead of: 413 - * read, xchg, read, xchg, read, xchg, read, xchg 414 - */ 415 383 static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs) 416 384 { 417 - unsigned long pir_copy[NR_PIR_WORDS], pending = 0; 418 - int i, vec = FIRST_EXTERNAL_VECTOR; 385 + unsigned long pir_copy[NR_PIR_WORDS]; 386 + int vec = FIRST_EXTERNAL_VECTOR; 419 387 420 - for (i = 0; i < NR_PIR_WORDS; i++) { 421 - pir_copy[i] = READ_ONCE(pir[i]); 422 - pending |= pir_copy[i]; 423 - } 424 - 425 - if (!pending) 388 + if (!pi_harvest_pir(pir, pir_copy)) 426 389 return false; 427 - 428 - for (i = 0; i < NR_PIR_WORDS; i++) { 429 - if (!pir_copy[i]) 430 - continue; 431 - 432 - pir_copy[i] = arch_xchg(&pir[i], 0); 433 - } 434 390 435 391 for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) 436 392 call_irq_handler(vec, regs);

+2 -14

arch/x86/kvm/lapic.c

··· 657 657 658 658 bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) 659 659 { 660 - unsigned long pir_vals[NR_PIR_WORDS], pending = 0; 660 + unsigned long pir_vals[NR_PIR_WORDS]; 661 661 u32 *__pir = (void *)pir_vals; 662 662 u32 i, vec; 663 663 u32 irr_val, prev_irr_val; ··· 666 666 max_updated_irr = -1; 667 667 *max_irr = -1; 668 668 669 - for (i = 0; i < NR_PIR_WORDS; i++) { 670 - pir_vals[i] = READ_ONCE(pir[i]); 671 - pending |= pir_vals[i]; 672 - } 673 - 674 - if (!pending) 669 + if (!pi_harvest_pir(pir, pir_vals)) 675 670 return false; 676 - 677 - for (i = 0; i < NR_PIR_WORDS; i++) { 678 - if (!pir_vals[i]) 679 - continue; 680 - 681 - pir_vals[i] = arch_xchg(&pir[i], 0); 682 - } 683 671 684 672 for (i = vec = 0; i <= 7; i++, vec += 32) { 685 673 u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);