gpu: host1x: Syncpoint interrupt performance optimization

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

Optimize performance of syncpoint interrupt handling by reading
the interrupt status registers in 64-bit chunks (two adjacent 32-bit
registers per read) when possible, and skipping processing when the
read value is zero.

Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
Link: https://patch.msgid.link/20250917-host1x-syncpt-irq-perf-v2-1-736ef69b1347@nvidia.com

authored by

Mikko Perttunen and committed by

Thierry Reding 4 months ago bfe68975 ca258341

+59 -9

3 changed files

expand all

drivers

gpu

host1x

dev.c

dev.h

intr_hw.c

drivers/gpu/host1x/dev.c

··· 71 71 return readl(sync_regs + r); 72 72 } 73 73 74 + #ifdef CONFIG_64BIT 75 + u64 host1x_sync_readq(struct host1x *host1x, u32 r) 76 + { 77 + void __iomem *sync_regs = host1x->regs + host1x->info->sync_offset; 78 + 79 + return readq(sync_regs + r); 80 + } 81 + #endif 82 + 74 83 void host1x_ch_writel(struct host1x_channel *ch, u32 v, u32 r) 75 84 { 76 85 writel(v, ch->regs + r);

drivers/gpu/host1x/dev.h

··· 179 179 u32 host1x_hypervisor_readl(struct host1x *host1x, u32 r); 180 180 void host1x_sync_writel(struct host1x *host1x, u32 v, u32 r); 181 181 u32 host1x_sync_readl(struct host1x *host1x, u32 r); 182 + #ifdef CONFIG_64BIT 183 + u64 host1x_sync_readq(struct host1x *host1x, u32 r); 184 + #endif 182 185 void host1x_ch_writel(struct host1x_channel *ch, u32 v, u32 r); 183 186 u32 host1x_ch_readl(struct host1x_channel *ch, u32 r); 184 187

+47 -9

drivers/gpu/host1x/hw/intr_hw.c

··· 11 11 #include "../intr.h" 12 12 #include "../dev.h" 13 13 14 + static void process_32_syncpts(struct host1x *host, unsigned long val, u32 reg_offset) 15 + { 16 + unsigned int id; 17 + 18 + if (!val) 19 + return; 20 + 21 + host1x_sync_writel(host, val, HOST1X_SYNC_SYNCPT_THRESH_INT_DISABLE(reg_offset)); 22 + host1x_sync_writel(host, val, HOST1X_SYNC_SYNCPT_THRESH_CPU0_INT_STATUS(reg_offset)); 23 + 24 + for_each_set_bit(id, &val, 32) 25 + host1x_intr_handle_interrupt(host, reg_offset * 32 + id); 26 + } 27 + 14 28 static irqreturn_t syncpt_thresh_isr(int irq, void *dev_id) 15 29 { 16 30 struct host1x_intr_irq_data *irq_data = dev_id; 17 31 struct host1x *host = irq_data->host; 18 32 unsigned long reg; 19 - unsigned int i, id; 33 + unsigned int i; 20 34 35 + #if !defined(CONFIG_64BIT) 21 36 for (i = irq_data->offset; i < DIV_ROUND_UP(host->info->nb_pts, 32); 22 37 i += host->num_syncpt_irqs) { 23 38 reg = host1x_sync_readl(host, 24 39 HOST1X_SYNC_SYNCPT_THRESH_CPU0_INT_STATUS(i)); 25 40 26 - host1x_sync_writel(host, reg, 27 - HOST1X_SYNC_SYNCPT_THRESH_INT_DISABLE(i)); 28 - host1x_sync_writel(host, reg, 41 + process_32_syncpts(host, reg, i); 42 + } 43 + #elif HOST1X_HW == 6 || HOST1X_HW == 7 44 + /* 45 + * Tegra186 and Tegra194 have the first INT_STATUS register not 64-bit aligned, 46 + * and only have one interrupt line. 
47 + */ 48 + reg = host1x_sync_readl(host, HOST1X_SYNC_SYNCPT_THRESH_CPU0_INT_STATUS(0)); 49 + process_32_syncpts(host, reg, 0); 50 + 51 + for (i = 1; i < (host->info->nb_pts / 32) - 1; i += 2) { 52 + reg = host1x_sync_readq(host, 29 53 HOST1X_SYNC_SYNCPT_THRESH_CPU0_INT_STATUS(i)); 30 54 31 - for_each_set_bit(id, &reg, 32) 32 - host1x_intr_handle_interrupt(host, i * 32 + id); 55 + process_32_syncpts(host, lower_32_bits(reg), i); 56 + process_32_syncpts(host, upper_32_bits(reg), i + 1); 33 57 } 58 + 59 + reg = host1x_sync_readl(host, HOST1X_SYNC_SYNCPT_THRESH_CPU0_INT_STATUS(i)); 60 + process_32_syncpts(host, reg, i); 61 + #else 62 + /* All 64-bit capable SoCs have number of syncpoints divisible by 64 */ 63 + for (i = irq_data->offset; i < DIV_ROUND_UP(host->info->nb_pts, 64); 64 + i += host->num_syncpt_irqs) { 65 + reg = host1x_sync_readq(host, 66 + HOST1X_SYNC_SYNCPT_THRESH_CPU0_INT_STATUS(i * 2)); 67 + 68 + process_32_syncpts(host, lower_32_bits(reg), i * 2 + 0); 69 + process_32_syncpts(host, upper_32_bits(reg), i * 2 + 1); 70 + } 71 + #endif 34 72 35 73 return IRQ_HANDLED; 36 74 } ··· 106 68 107 69 /* 108 70 * Program threshold interrupt destination among 8 lines per VM, 109 - * per syncpoint. For each group of 32 syncpoints (corresponding to one 110 - * interrupt status register), direct to one interrupt line, going 71 + * per syncpoint. For each group of 64 syncpoints (corresponding to two 72 + * interrupt status registers), direct to one interrupt line, going 111 73 * around in a round robin fashion. 112 74 */ 113 75 for (id = 0; id < host->info->nb_pts; id++) { 114 - u32 reg_offset = id / 32; 76 + u32 reg_offset = id / 64; 115 77 u32 irq_index = reg_offset % host->num_syncpt_irqs; 116 78 117 79 host1x_sync_writel(host, irq_index, HOST1X_SYNC_SYNCPT_INTR_DEST(id));