Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/irq: Install posted MSI notification handler

All MSI vectors are multiplexed into a single notification vector when
posted MSI is enabled. It is the responsibility of the notification vector
handler to demultiplex MSI vectors. In the handler, the individual MSI vector
handlers are dispatched without IDT delivery for each pending MSI interrupt.

For example, the interrupt flow will change as follows:
(3 MSIs of different vectors arrive in a high frequency burst)

BEFORE:
interrupt(MSI)
irq_enter()
handler() /* EOI */
irq_exit()
process_softirq()
interrupt(MSI)
irq_enter()
handler() /* EOI */
irq_exit()
process_softirq()
interrupt(MSI)
irq_enter()
handler() /* EOI */
irq_exit()
process_softirq()

AFTER:
interrupt /* Posted MSI notification vector */
irq_enter()
atomic_xchg(PIR)
handler()
handler()
handler()
pi_clear_on()
apic_eoi()
irq_exit()
process_softirq()

Except for the leading MSI, CPU notifications are skipped/coalesced.

For MSIs which arrive at a low frequency, the demultiplexing loop does not
wait for more interrupts to coalesce. Therefore, there's no additional
latency other than the processing time.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20240423174114.526704-9-jacob.jun.pan@linux.intel.com

authored by

Jacob Pan and committed by
Thomas Gleixner
1b03d82b 6087c7f3

+135 -4
+2
arch/x86/entry/entry_fred.c
··· 117 117 SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), 118 118 SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), 119 119 SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), 120 + 121 + SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, posted_msi_notification), 120 122 }; 121 123 122 124 static bool fred_setup_done __initdata;
+3
arch/x86/include/asm/hardirq.h
··· 44 44 unsigned int irq_hv_reenlightenment_count; 45 45 unsigned int hyperv_stimer0_count; 46 46 #endif 47 + #ifdef CONFIG_X86_POSTED_MSI 48 + unsigned int posted_msi_notification_count; 49 + #endif 47 50 } ____cacheline_aligned irq_cpustat_t; 48 51 49 52 DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+6
arch/x86/include/asm/idtentry.h
··· 751 751 # define fred_sysvec_kvm_posted_intr_nested_ipi NULL 752 752 #endif 753 753 754 + # ifdef CONFIG_X86_POSTED_MSI 755 + DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, sysvec_posted_msi_notification); 756 + #else 757 + # define fred_sysvec_posted_msi_notification NULL 758 + # endif 759 + 754 760 #if IS_ENABLED(CONFIG_HYPERV) 755 761 DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); 756 762 DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment);
+3
arch/x86/kernel/idt.c
··· 163 163 # endif 164 164 INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), 165 165 INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), 166 + # ifdef CONFIG_X86_POSTED_MSI 167 + INTG(POSTED_MSI_NOTIFICATION_VECTOR, asm_sysvec_posted_msi_notification), 168 + # endif 166 169 #endif 167 170 }; 168 171
+121 -4
arch/x86/kernel/irq.c
··· 184 184 irq_stats(j)->kvm_posted_intr_wakeup_ipis); 185 185 seq_puts(p, " Posted-interrupt wakeup event\n"); 186 186 #endif 187 + #ifdef CONFIG_X86_POSTED_MSI 188 + seq_printf(p, "%*s: ", prec, "PMN"); 189 + for_each_online_cpu(j) 190 + seq_printf(p, "%10u ", 191 + irq_stats(j)->posted_msi_notification_count); 192 + seq_puts(p, " Posted MSI notification event\n"); 193 + #endif 187 194 return 0; 188 195 } 189 196 ··· 249 242 __handle_irq(desc, regs); 250 243 } 251 244 252 - static __always_inline void call_irq_handler(int vector, struct pt_regs *regs) 245 + static __always_inline int call_irq_handler(int vector, struct pt_regs *regs) 253 246 { 254 247 struct irq_desc *desc; 248 + int ret = 0; 255 249 256 250 desc = __this_cpu_read(vector_irq[vector]); 257 251 if (likely(!IS_ERR_OR_NULL(desc))) { 258 252 handle_irq(desc, regs); 259 253 } else { 260 - apic_eoi(); 261 - 254 + ret = -EINVAL; 262 255 if (desc == VECTOR_UNUSED) { 263 256 pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n", 264 257 __func__, smp_processor_id(), ··· 267 260 __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); 268 261 } 269 262 } 263 + 264 + return ret; 270 265 } 271 266 272 267 /* ··· 282 273 /* entry code tells RCU that we're not quiescent. Check it. */ 283 274 RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); 284 275 285 - call_irq_handler(vector, regs); 276 + if (unlikely(call_irq_handler(vector, regs))) 277 + apic_eoi(); 278 + 286 279 set_irq_regs(old_regs); 287 280 } 288 281 ··· 371 360 apic_id = this_cpu_read(x86_cpu_to_apicid); 372 361 destination = x2apic_enabled() ? 
apic_id : apic_id << 8; 373 362 this_cpu_write(posted_msi_pi_desc.ndst, destination); 363 + } 364 + 365 + /* 366 + * De-multiplexing posted interrupts is on the performance path, the code 367 + * below is written to optimize the cache performance based on the following 368 + * considerations: 369 + * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently 370 + * accessed by both CPU and IOMMU. 371 + * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg 372 + * for checking and clearing posted interrupt request (PIR), a 256 bit field 373 + * within the PID. 374 + * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache 375 + * line when posting interrupts and setting control bits. 376 + * 4.The CPU can access the cache line a magnitude faster than the IOMMU. 377 + * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID 378 + * cache line. The cache line states after each operation are as follows: 379 + * CPU IOMMU PID Cache line state 380 + * --------------------------------------------------------------- 381 + *...read64 exclusive 382 + *...lock xchg64 modified 383 + *... post/atomic swap invalid 384 + *...------------------------------------------------------------- 385 + * 386 + * To reduce L1 data cache miss, it is important to avoid contention with 387 + * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used 388 + * to dispatch interrupt handlers. 389 + * 390 + * In addition, the code is trying to keep the cache line state consistent 391 + * as much as possible. e.g. 
when making a copy and clearing the PIR 392 + * (assuming non-zero PIR bits are present in the entire PIR), it does: 393 + * read, read, read, read, xchg, xchg, xchg, xchg 394 + * instead of: 395 + * read, xchg, read, xchg, read, xchg, read, xchg 396 + */ 397 + static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs) 398 + { 399 + int i, vec = FIRST_EXTERNAL_VECTOR; 400 + unsigned long pir_copy[4]; 401 + bool handled = false; 402 + 403 + for (i = 0; i < 4; i++) 404 + pir_copy[i] = pir[i]; 405 + 406 + for (i = 0; i < 4; i++) { 407 + if (!pir_copy[i]) 408 + continue; 409 + 410 + pir_copy[i] = arch_xchg(&pir[i], 0); 411 + handled = true; 412 + } 413 + 414 + if (handled) { 415 + for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) 416 + call_irq_handler(vec, regs); 417 + } 418 + 419 + return handled; 420 + } 421 + 422 + /* 423 + * Performance data shows that 3 is good enough to harvest 90+% of the benefit 424 + * on high IRQ rate workload. 425 + */ 426 + #define MAX_POSTED_MSI_COALESCING_LOOP 3 427 + 428 + /* 429 + * For MSIs that are delivered as posted interrupts, the CPU notifications 430 + * can be coalesced if the MSIs arrive in high frequency bursts. 431 + */ 432 + DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) 433 + { 434 + struct pt_regs *old_regs = set_irq_regs(regs); 435 + struct pi_desc *pid; 436 + int i = 0; 437 + 438 + pid = this_cpu_ptr(&posted_msi_pi_desc); 439 + 440 + inc_irq_stat(posted_msi_notification_count); 441 + irq_enter(); 442 + 443 + /* 444 + * Max coalescing count includes the extra round of handle_pending_pir 445 + * after clearing the outstanding notification bit. Hence, at most 446 + * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here. 
447 + */ 448 + while (++i < MAX_POSTED_MSI_COALESCING_LOOP) { 449 + if (!handle_pending_pir(pid->pir64, regs)) 450 + break; 451 + } 452 + 453 + /* 454 + * Clear outstanding notification bit to allow new IRQ notifications, 455 + * do this last to maximize the window of interrupt coalescing. 456 + */ 457 + pi_clear_on(pid); 458 + 459 + /* 460 + * There could be a race of PI notification and the clearing of ON bit, 461 + * process PIR bits one last time such that handling the new interrupts 462 + * are not delayed until the next IRQ. 463 + */ 464 + handle_pending_pir(pid->pir64, regs); 465 + 466 + apic_eoi(); 467 + irq_exit(); 468 + set_irq_regs(old_regs); 374 469 } 375 470 #endif /* X86_POSTED_MSI */ 376 471