Merge tag 'kvm-x86-pir-6.16' of https://github.com/kvm-x86/linux into HEAD

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

KVM x86 posted interrupt changes for 6.16:

Refine and optimize KVM's software processing of the PIR, and ultimately share
PIR harvesting code between KVM and the kernel's Posted MSI handler.

Paolo Bonzini 10 months ago db44dcbd 5d816c13

+95 -72

5 changed files

expand all

arch

x86

include

asm

posted_intr.h

kernel

irq.c

kvm

lapic.c

lapic.h

vmx

posted_intr.h

+71 -7

arch/x86/include/asm/posted_intr.h

··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 2 #ifndef _X86_POSTED_INTR_H 3 3 #define _X86_POSTED_INTR_H 4 + 5 + #include <asm/cmpxchg.h> 6 + #include <asm/rwonce.h> 4 7 #include <asm/irq_vectors.h> 8 + 9 + #include <linux/bitmap.h> 5 10 6 11 #define POSTED_INTR_ON 0 7 12 #define POSTED_INTR_SN 1 8 13 9 14 #define PID_TABLE_ENTRY_VALID 1 10 15 16 + #define NR_PIR_VECTORS 256 17 + #define NR_PIR_WORDS (NR_PIR_VECTORS / BITS_PER_LONG) 18 + 11 19 /* Posted-Interrupt Descriptor */ 12 20 struct pi_desc { 13 - union { 14 - u32 pir[8]; /* Posted interrupt requested */ 15 - u64 pir64[4]; 16 - }; 21 + unsigned long pir[NR_PIR_WORDS]; /* Posted interrupt requested */ 17 22 union { 18 23 struct { 19 24 u16 notifications; /* Suppress and outstanding bits */ ··· 30 25 }; 31 26 u32 rsvd[6]; 32 27 } __aligned(64); 28 + 29 + /* 30 + * De-multiplexing posted interrupts is on the performance path, the code 31 + * below is written to optimize the cache performance based on the following 32 + * considerations: 33 + * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently 34 + * accessed by both CPU and IOMMU. 35 + * 2.During software processing of posted interrupts, the CPU needs to do 36 + * natural width read and xchg for checking and clearing posted interrupt 37 + * request (PIR), a 256 bit field within the PID. 38 + * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache 39 + * line when posting interrupts and setting control bits. 40 + * 4.The CPU can access the cache line a magnitude faster than the IOMMU. 41 + * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID 42 + * cache line. The cache line states after each operation are as follows, 43 + * assuming a 64-bit kernel: 44 + * CPU IOMMU PID Cache line state 45 + * --------------------------------------------------------------- 46 + *...read64 exclusive 47 + *...lock xchg64 modified 48 + *... 
post/atomic swap invalid 49 + *...------------------------------------------------------------- 50 + * 51 + * To reduce L1 data cache miss, it is important to avoid contention with 52 + * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used 53 + * when processing posted interrupts in software, e.g. to dispatch interrupt 54 + * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR 55 + * in KVM. 56 + * 57 + * In addition, the code is trying to keep the cache line state consistent 58 + * as much as possible. e.g. when making a copy and clearing the PIR 59 + * (assuming non-zero PIR bits are present in the entire PIR), it does: 60 + * read, read, read, read, xchg, xchg, xchg, xchg 61 + * instead of: 62 + * read, xchg, read, xchg, read, xchg, read, xchg 63 + */ 64 + static __always_inline bool pi_harvest_pir(unsigned long *pir, 65 + unsigned long *pir_vals) 66 + { 67 + unsigned long pending = 0; 68 + int i; 69 + 70 + for (i = 0; i < NR_PIR_WORDS; i++) { 71 + pir_vals[i] = READ_ONCE(pir[i]); 72 + pending |= pir_vals[i]; 73 + } 74 + 75 + if (!pending) 76 + return false; 77 + 78 + for (i = 0; i < NR_PIR_WORDS; i++) { 79 + if (!pir_vals[i]) 80 + continue; 81 + 82 + pir_vals[i] = arch_xchg(&pir[i], 0); 83 + } 84 + 85 + return true; 86 + } 33 87 34 88 static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) 35 89 { ··· 107 43 108 44 static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) 109 45 { 110 - return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); 46 + return test_and_set_bit(vector, pi_desc->pir); 111 47 } 112 48 113 49 static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) 114 50 { 115 - return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); 51 + return bitmap_empty(pi_desc->pir, NR_VECTORS); 116 52 } 117 53 118 54 static inline void pi_set_sn(struct pi_desc *pi_desc) ··· 174 110 if (WARN_ON_ONCE(vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR)) 175 111 return false; 176 
112 177 - return test_bit(vector, (unsigned long *)pid->pir); 113 + return test_bit(vector, pid->pir); 178 114 } 179 115 180 116 extern void intel_posted_msi_init(void);

+10 -53

arch/x86/kernel/irq.c

··· 380 380 this_cpu_write(posted_msi_pi_desc.ndst, destination); 381 381 } 382 382 383 - /* 384 - * De-multiplexing posted interrupts is on the performance path, the code 385 - * below is written to optimize the cache performance based on the following 386 - * considerations: 387 - * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently 388 - * accessed by both CPU and IOMMU. 389 - * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg 390 - * for checking and clearing posted interrupt request (PIR), a 256 bit field 391 - * within the PID. 392 - * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache 393 - * line when posting interrupts and setting control bits. 394 - * 4.The CPU can access the cache line a magnitude faster than the IOMMU. 395 - * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID 396 - * cache line. The cache line states after each operation are as follows: 397 - * CPU IOMMU PID Cache line state 398 - * --------------------------------------------------------------- 399 - *...read64 exclusive 400 - *...lock xchg64 modified 401 - *... post/atomic swap invalid 402 - *...------------------------------------------------------------- 403 - * 404 - * To reduce L1 data cache miss, it is important to avoid contention with 405 - * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used 406 - * to dispatch interrupt handlers. 407 - * 408 - * In addition, the code is trying to keep the cache line state consistent 409 - * as much as possible. e.g. 
when making a copy and clearing the PIR 410 - * (assuming non-zero PIR bits are present in the entire PIR), it does: 411 - * read, read, read, read, xchg, xchg, xchg, xchg 412 - * instead of: 413 - * read, xchg, read, xchg, read, xchg, read, xchg 414 - */ 415 - static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs) 383 + static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs) 416 384 { 417 - int i, vec = FIRST_EXTERNAL_VECTOR; 418 - unsigned long pir_copy[4]; 419 - bool handled = false; 385 + unsigned long pir_copy[NR_PIR_WORDS]; 386 + int vec = FIRST_EXTERNAL_VECTOR; 420 387 421 - for (i = 0; i < 4; i++) 422 - pir_copy[i] = pir[i]; 388 + if (!pi_harvest_pir(pir, pir_copy)) 389 + return false; 423 390 424 - for (i = 0; i < 4; i++) { 425 - if (!pir_copy[i]) 426 - continue; 391 + for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) 392 + call_irq_handler(vec, regs); 427 393 428 - pir_copy[i] = arch_xchg(&pir[i], 0); 429 - handled = true; 430 - } 431 - 432 - if (handled) { 433 - for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) 434 - call_irq_handler(vec, regs); 435 - } 436 - 437 - return handled; 394 + return true; 438 395 } 439 396 440 397 /* ··· 421 464 * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here. 422 465 */ 423 466 while (++i < MAX_POSTED_MSI_COALESCING_LOOP) { 424 - if (!handle_pending_pir(pid->pir64, regs)) 467 + if (!handle_pending_pir(pid->pir, regs)) 425 468 break; 426 469 } 427 470 ··· 436 479 * process PIR bits one last time such that handling the new interrupts 437 480 * are not delayed until the next IRQ. 438 481 */ 439 - handle_pending_pir(pid->pir64, regs); 482 + handle_pending_pir(pid->pir, regs); 440 483 441 484 apic_eoi(); 442 485 irq_exit();

+11 -9

arch/x86/kvm/lapic.c

··· 655 655 return count; 656 656 } 657 657 658 - bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) 658 + bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) 659 659 { 660 + unsigned long pir_vals[NR_PIR_WORDS]; 661 + u32 *__pir = (void *)pir_vals; 660 662 u32 i, vec; 661 - u32 pir_val, irr_val, prev_irr_val; 663 + u32 irr_val, prev_irr_val; 662 664 int max_updated_irr; 663 665 664 666 max_updated_irr = -1; 665 667 *max_irr = -1; 666 668 669 + if (!pi_harvest_pir(pir, pir_vals)) 670 + return false; 671 + 667 672 for (i = vec = 0; i <= 7; i++, vec += 32) { 668 673 u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10); 669 674 670 - irr_val = *p_irr; 671 - pir_val = READ_ONCE(pir[i]); 675 + irr_val = READ_ONCE(*p_irr); 672 676 673 - if (pir_val) { 674 - pir_val = xchg(&pir[i], 0); 675 - 677 + if (__pir[i]) { 676 678 prev_irr_val = irr_val; 677 679 do { 678 - irr_val = prev_irr_val | pir_val; 680 + irr_val = prev_irr_val | __pir[i]; 679 681 } while (prev_irr_val != irr_val && 680 682 !try_cmpxchg(p_irr, &prev_irr_val, irr_val)); 681 683 ··· 693 691 } 694 692 EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); 695 693 696 - bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr) 694 + bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr) 697 695 { 698 696 struct kvm_lapic *apic = vcpu->arch.apic; 699 697 bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr);

+2 -2

arch/x86/kvm/lapic.h

··· 103 103 int shorthand, unsigned int dest, int dest_mode); 104 104 int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); 105 105 void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec); 106 - bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); 107 - bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); 106 + bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr); 107 + bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr); 108 108 void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); 109 109 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, 110 110 struct dest_map *dest_map);

+1 -1

arch/x86/kvm/vmx/posted_intr.h

··· 20 20 { 21 21 int vec; 22 22 23 - vec = find_last_bit((unsigned long *)pi_desc->pir, 256); 23 + vec = find_last_bit(pi_desc->pir, 256); 24 24 return vec < 256 ? vec : -1; 25 25 } 26 26