Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mtip32xx: minor performance enhancements

This patch makes the following changes:

1) Add compiler hinting (likely()/unlikely()) in the fast path.
2) Add a prefetch of port->flags to eliminate moderate CPU stalling later
in mtip_hw_submit_io().
3) Eliminate a redundant rq_data_dir() call.
4) Reorder members of driver_data to eliminate false cacheline sharing
between irq_workers_active and unal_qdepth.

With some workload and topology configurations, I'm seeing a ~1.5%
throughput improvement in small-block random-read benchmarks, as well
as a lower latency standard deviation.

Signed-off-by: Sam Bradshaw <sbradshaw@micron.com>

Add include of <linux/prefetch.h>

Signed-off-by: Jens Axboe <axboe@fb.com>

Authored by Sam Bradshaw and committed by Jens Axboe
f45c40a9 f6be4fb4

+13 -10
+9 -6
drivers/block/mtip32xx/mtip32xx.c
··· 39 39 #include <../drivers/ata/ahci.h> 40 40 #include <linux/export.h> 41 41 #include <linux/debugfs.h> 42 + #include <linux/prefetch.h> 42 43 #include "mtip32xx.h" 43 44 44 45 #define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32) ··· 2381 2380 /* Map the scatter list for DMA access */ 2382 2381 nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); 2383 2382 2383 + prefetch(&port->flags); 2384 + 2384 2385 command->scatter_ents = nents; 2385 2386 2386 2387 /* ··· 2395 2392 fis = command->command; 2396 2393 fis->type = 0x27; 2397 2394 fis->opts = 1 << 7; 2398 - if (rq_data_dir(rq) == READ) 2395 + if (dma_dir == DMA_FROM_DEVICE) 2399 2396 fis->command = ATA_CMD_FPDMA_READ; 2400 2397 else 2401 2398 fis->command = ATA_CMD_FPDMA_WRITE; ··· 2415 2412 fis->res3 = 0; 2416 2413 fill_command_sg(dd, command, nents); 2417 2414 2418 - if (command->unaligned) 2415 + if (unlikely(command->unaligned)) 2419 2416 fis->device |= 1 << 7; 2420 2417 2421 2418 /* Populate the command header */ ··· 2436 2433 * To prevent this command from being issued 2437 2434 * if an internal command is in progress or error handling is active. 
2438 2435 */ 2439 - if (port->flags & MTIP_PF_PAUSE_IO) { 2436 + if (unlikely(port->flags & MTIP_PF_PAUSE_IO)) { 2440 2437 set_bit(rq->tag, port->cmds_to_issue); 2441 2438 set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); 2442 2439 return; ··· 3757 3754 struct driver_data *dd = hctx->queue->queuedata; 3758 3755 struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); 3759 3756 3760 - if (!dd->unal_qdepth || rq_data_dir(rq) == READ) 3757 + if (rq_data_dir(rq) == READ || !dd->unal_qdepth) 3761 3758 return false; 3762 3759 3763 3760 /* ··· 3779 3776 { 3780 3777 int ret; 3781 3778 3782 - if (mtip_check_unal_depth(hctx, rq)) 3779 + if (unlikely(mtip_check_unal_depth(hctx, rq))) 3783 3780 return BLK_MQ_RQ_QUEUE_BUSY; 3784 3781 3785 3782 ret = mtip_submit_request(hctx, rq); 3786 - if (!ret) 3783 + if (likely(!ret)) 3787 3784 return BLK_MQ_RQ_QUEUE_OK; 3788 3785 3789 3786 rq->errors = ret;
+4 -4
drivers/block/mtip32xx/mtip32xx.h
··· 493 493 494 494 struct workqueue_struct *isr_workq; 495 495 496 - struct mtip_work work[MTIP_MAX_SLOT_GROUPS]; 497 - 498 496 atomic_t irq_workers_active; 497 + 498 + struct mtip_work work[MTIP_MAX_SLOT_GROUPS]; 499 499 500 500 int isr_binding; 501 501 502 502 struct block_device *bdev; 503 503 504 - int unal_qdepth; /* qdepth of unaligned IO queue */ 505 - 506 504 struct list_head online_list; /* linkage for online list */ 507 505 508 506 struct list_head remove_list; /* linkage for removing list */ 507 + 508 + int unal_qdepth; /* qdepth of unaligned IO queue */ 509 509 }; 510 510 511 511 #endif