Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'p2pdma-mmio-6.19.v5' into for-6.19/block

Merge MMIO P2P DMA series from Leon:

"This patch series improves block layer and NVMe driver support for MMIO
memory regions, particularly for peer-to-peer (P2P) DMA transfers that
go through the host bridge.

The series addresses a critical gap where P2P transfers through the
host bridge (PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) were not properly marked
as MMIO memory, leading to potential issues with:

- Inappropriate CPU cache synchronization operations on MMIO regions
- Incorrect DMA mapping/unmapping that doesn't respect MMIO semantics
- Missing IOMMU configuration for MMIO memory handling

This work is extracted from the larger DMA physical API improvement
series [1] and focuses specifically on block layer and NVMe
requirements for MMIO memory support.

[1] https://lore.kernel.org/all/cover.1757423202.git.leonro@nvidia.com/"

Link: https://lore.kernel.org/linux-block/20251114-block-with-mmio-v5-0-69d00f73d766@nvidia.com/
Signed-off-by: Jens Axboe <axboe@kernel.dk>

* p2pdma-mmio-6.19.v5:
block-dma: properly take MMIO path
nvme-pci: migrate to dma_map_phys instead of map_page
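The common thread in both patches is that the DMA attributes are now derived from the P2P mapping type: transfers that cross the host bridge target MMIO memory and are mapped with DMA_ATTR_MMIO, so the DMA API skips CPU cache maintenance and applies MMIO semantics, while PCI_P2PDMA_MAP_BUS_ADDR traffic bypasses the DMA API entirely. A minimal sketch of that pattern, using only identifiers that appear in the diffs below (the helper itself is hypothetical):

/*
 * Illustrative only: map one physical range while honouring the P2P
 * mapping type reported by pci_p2pdma_state().  The real logic lives in
 * blk_dma_map_direct() below; PCI_P2PDMA_MAP_BUS_ADDR segments never
 * reach the DMA API and are handled by blk_dma_map_bus() instead.
 */
static dma_addr_t example_map_range(struct device *dma_dev, phys_addr_t paddr,
		size_t len, enum dma_data_direction dir,
		enum pci_p2pdma_map_type map)
{
	unsigned int attrs = 0;

	if (map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
		attrs |= DMA_ATTR_MMIO;	/* MMIO: no CPU cache syncs */

	return dma_map_phys(dma_dev, paddr, len, dir, attrs);
}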

+100 -55
+13 -7
block/blk-mq-dma.c
···
 static bool blk_dma_map_direct(struct request *req, struct device *dma_dev,
 		struct blk_dma_iter *iter, struct phys_vec *vec)
 {
-	iter->addr = dma_map_page(dma_dev, phys_to_page(vec->paddr),
-			offset_in_page(vec->paddr), vec->len, rq_dma_dir(req));
+	unsigned int attrs = 0;
+
+	if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
+		attrs |= DMA_ATTR_MMIO;
+
+	iter->addr = dma_map_phys(dma_dev, vec->paddr, vec->len,
+			rq_dma_dir(req), attrs);
 	if (dma_mapping_error(dma_dev, iter->addr)) {
 		iter->status = BLK_STS_RESOURCE;
 		return false;
···
 {
 	enum dma_data_direction dir = rq_dma_dir(req);
 	unsigned int mapped = 0;
+	unsigned int attrs = 0;
 	int error;
 
 	iter->addr = state->addr;
 	iter->len = dma_iova_size(state);
 
+	if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
+		attrs |= DMA_ATTR_MMIO;
+
 	do {
 		error = dma_iova_link(dma_dev, state, vec->paddr, mapped,
-				vec->len, dir, 0);
+				vec->len, dir, attrs);
 		if (error)
 			break;
 		mapped += vec->len;
···
 
 	memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
 	iter->status = BLK_STS_OK;
+	iter->p2pdma.map = PCI_P2PDMA_MAP_NONE;
 
 	/*
 	 * Grab the first segment ASAP because we'll need it to check for P2P
···
 	switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
 				 phys_to_page(vec.paddr))) {
 	case PCI_P2PDMA_MAP_BUS_ADDR:
-		if (iter->iter.is_integrity)
-			bio_integrity(req->bio)->bip_flags |= BIP_P2P_DMA;
-		else
-			req->cmd_flags |= REQ_P2PDMA;
 		return blk_dma_map_bus(iter, &vec);
 	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
 		/*
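With REQ_P2PDMA and BIP_P2P_DMA gone (see the header changes below), drivers learn the mapping type straight from the iterator. A rough sketch of a consumer of this API, assuming the usual blk_rq_dma_map_iter_start()/blk_rq_dma_map_iter_next() loop; the function is hypothetical and descriptor programming is elided, while the NVMe patch below records the same information in its IOD_* flags:

/*
 * Sketch of a driver map path after this change.  Error handling and the
 * actual hardware descriptor setup are left out.
 */
static blk_status_t example_driver_map(struct request *req,
		struct device *dma_dev, struct dma_iova_state *state)
{
	struct blk_dma_iter iter;
	enum pci_p2pdma_map_type map;

	if (!blk_rq_dma_map_iter_start(req, dma_dev, state, &iter))
		return iter.status;

	/*
	 * The mapping type is now part of the iterator's public output;
	 * remember it so the completion path can pass it to
	 * blk_rq_dma_unmap().
	 */
	map = iter.p2pdma.map;

	do {
		/* program iter.addr / iter.len into the device here */
	} while (blk_rq_dma_map_iter_next(req, dma_dev, state, &iter));

	/* BLK_STS_OK when the iterator is simply exhausted */
	return iter.status;
}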
+74 -16
drivers/nvme/host/pci.c
···
 	/* single segment dma mapping */
 	IOD_SINGLE_SEGMENT = 1U << 2,
 
+	/* Data payload contains p2p memory */
+	IOD_DATA_P2P = 1U << 3,
+
+	/* Metadata contains p2p memory */
+	IOD_META_P2P = 1U << 4,
+
+	/* Data payload contains MMIO memory */
+	IOD_DATA_MMIO = 1U << 5,
+
+	/* Metadata contains MMIO memory */
+	IOD_META_MMIO = 1U << 6,
+
 	/* Metadata using non-coalesced MPTR */
-	IOD_SINGLE_META_SEGMENT = 1U << 5,
+	IOD_SINGLE_META_SEGMENT = 1U << 7,
 };
 
 struct nvme_dma_vec {
···
 	}
 }
 
-static void nvme_free_prps(struct request *req)
+static void nvme_free_prps(struct request *req, unsigned int attrs)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
 	unsigned int i;
 
 	for (i = 0; i < iod->nr_dma_vecs; i++)
-		dma_unmap_page(nvmeq->dev->dev, iod->dma_vecs[i].addr,
-				iod->dma_vecs[i].len, rq_dma_dir(req));
+		dma_unmap_phys(nvmeq->dev->dev, iod->dma_vecs[i].addr,
+				iod->dma_vecs[i].len, rq_dma_dir(req), attrs);
 	mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool);
 }
 
 static void nvme_free_sgls(struct request *req, struct nvme_sgl_desc *sge,
-		struct nvme_sgl_desc *sg_list)
+		struct nvme_sgl_desc *sg_list, unsigned int attrs)
 {
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
 	enum dma_data_direction dir = rq_dma_dir(req);
···
 	unsigned int i;
 
 	if (sge->type == (NVME_SGL_FMT_DATA_DESC << 4)) {
-		dma_unmap_page(dma_dev, le64_to_cpu(sge->addr), len, dir);
+		dma_unmap_phys(dma_dev, le64_to_cpu(sge->addr), len, dir,
+				attrs);
 		return;
 	}
 
 	for (i = 0; i < len / sizeof(*sg_list); i++)
-		dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr),
-				le32_to_cpu(sg_list[i].length), dir);
+		dma_unmap_phys(dma_dev, le64_to_cpu(sg_list[i].addr),
+				le32_to_cpu(sg_list[i].length), dir, attrs);
 }
 
 static void nvme_unmap_metadata(struct request *req)
 {
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+	enum pci_p2pdma_map_type map = PCI_P2PDMA_MAP_NONE;
 	enum dma_data_direction dir = rq_dma_dir(req);
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct device *dma_dev = nvmeq->dev->dev;
 	struct nvme_sgl_desc *sge = iod->meta_descriptor;
+	unsigned int attrs = 0;
 
 	if (iod->flags & IOD_SINGLE_META_SEGMENT) {
 		dma_unmap_page(dma_dev, iod->meta_dma,
···
 		return;
 	}
 
-	if (!blk_rq_integrity_dma_unmap(req, dma_dev, &iod->meta_dma_state,
-			iod->meta_total_len)) {
+	if (iod->flags & IOD_META_P2P)
+		map = PCI_P2PDMA_MAP_BUS_ADDR;
+	else if (iod->flags & IOD_META_MMIO) {
+		map = PCI_P2PDMA_MAP_THRU_HOST_BRIDGE;
+		attrs |= DMA_ATTR_MMIO;
+	}
+
+	if (!blk_rq_dma_unmap(req, dma_dev, &iod->meta_dma_state,
+			iod->meta_total_len, map)) {
 		if (nvme_pci_cmd_use_meta_sgl(&iod->cmd))
-			nvme_free_sgls(req, sge, &sge[1]);
+			nvme_free_sgls(req, sge, &sge[1], attrs);
 		else
-			dma_unmap_page(dma_dev, iod->meta_dma,
-					iod->meta_total_len, dir);
+			dma_unmap_phys(dma_dev, iod->meta_dma,
+					iod->meta_total_len, dir, attrs);
 	}
 
 	if (iod->meta_descriptor)
···
 
 static void nvme_unmap_data(struct request *req)
 {
+	enum pci_p2pdma_map_type map = PCI_P2PDMA_MAP_NONE;
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
 	struct device *dma_dev = nvmeq->dev->dev;
+	unsigned int attrs = 0;
 
 	if (iod->flags & IOD_SINGLE_SEGMENT) {
 		static_assert(offsetof(union nvme_data_ptr, prp1) ==
···
 		return;
 	}
 
-	if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) {
+	if (iod->flags & IOD_DATA_P2P)
+		map = PCI_P2PDMA_MAP_BUS_ADDR;
+	else if (iod->flags & IOD_DATA_MMIO) {
+		map = PCI_P2PDMA_MAP_THRU_HOST_BRIDGE;
+		attrs |= DMA_ATTR_MMIO;
+	}
+
+	if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len,
+			map)) {
 		if (nvme_pci_cmd_use_sgl(&iod->cmd))
 			nvme_free_sgls(req, iod->descriptors[0],
-					&iod->cmd.common.dptr.sgl);
+					&iod->cmd.common.dptr.sgl, attrs);
 		else
-			nvme_free_prps(req);
+			nvme_free_prps(req, attrs);
 	}
 
 	if (iod->nr_descriptors)
···
 	if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
 		return iter.status;
 
+	switch (iter.p2pdma.map) {
+	case PCI_P2PDMA_MAP_BUS_ADDR:
+		iod->flags |= IOD_DATA_P2P;
+		break;
+	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+		iod->flags |= IOD_DATA_MMIO;
+		break;
+	case PCI_P2PDMA_MAP_NONE:
+		break;
+	default:
+		return BLK_STS_RESOURCE;
+	}
+
 	if (use_sgl == SGL_FORCED ||
 	    (use_sgl == SGL_SUPPORTED &&
 	     (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold)))
···
 	if (!blk_rq_integrity_dma_map_iter_start(req, dev->dev,
 			&iod->meta_dma_state, &iter))
 		return iter.status;
+
+	switch (iter.p2pdma.map) {
+	case PCI_P2PDMA_MAP_BUS_ADDR:
+		iod->flags |= IOD_META_P2P;
+		break;
+	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+		iod->flags |= IOD_META_MMIO;
+		break;
+	case PCI_P2PDMA_MAP_NONE:
+		break;
+	default:
+		return BLK_STS_RESOURCE;
+	}
 
 	if (blk_rq_dma_map_coalesce(&iod->meta_dma_state))
 		entries = 1;
-1
include/linux/bio-integrity.h
···
 	BIP_CHECK_GUARD		= 1 << 5, /* guard check */
 	BIP_CHECK_REFTAG	= 1 << 6, /* reftag check */
 	BIP_CHECK_APPTAG	= 1 << 7, /* apptag check */
-	BIP_P2P_DMA		= 1 << 8, /* using P2P address */
 
 	BIP_MEMPOOL		= 1 << 15, /* buffer backed by mempool */
 };
-14
include/linux/blk-integrity.h
···
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
 
-static inline bool blk_rq_integrity_dma_unmap(struct request *req,
-		struct device *dma_dev, struct dma_iova_state *state,
-		size_t mapped_len)
-{
-	return blk_dma_unmap(req, dma_dev, state, mapped_len,
-			bio_integrity(req->bio)->bip_flags & BIP_P2P_DMA);
-}
-
 int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
 int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
 		ssize_t bytes);
···
 		struct scatterlist *s)
 {
 	return 0;
 }
-static inline bool blk_rq_integrity_dma_unmap(struct request *req,
-		struct device *dma_dev, struct dma_iova_state *state,
-		size_t mapped_len)
-{
-	return false;
-}
 static inline int blk_rq_integrity_map_user(struct request *rq,
 		void __user *ubuf,
+13 -15
include/linux/blk-mq-dma.h
···
 	/* Output address range for this iteration */
 	dma_addr_t addr;
 	u32 len;
+	struct pci_p2pdma_map_state p2pdma;
 
 	/* Status code. Only valid when blk_rq_dma_map_iter_* returned false */
 	blk_status_t status;
 
 	/* Internal to blk_rq_dma_map_iter_* */
 	struct blk_map_iter iter;
-	struct pci_p2pdma_map_state p2pdma;
 };
 
 bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
···
 }
 
 /**
- * blk_dma_unmap - try to DMA unmap a request
+ * blk_rq_dma_unmap - try to DMA unmap a request
  * @req: request to unmap
  * @dma_dev: device to unmap from
  * @state: DMA IOVA state
  * @mapped_len: number of bytes to unmap
- * @is_p2p: true if mapped with PCI_P2PDMA_MAP_BUS_ADDR
+ * @map: peer-to-peer mapping type
  *
  * Returns %false if the callers need to manually unmap every DMA segment
  * mapped using @iter or %true if no work is left to be done.
  */
-static inline bool blk_dma_unmap(struct request *req, struct device *dma_dev,
-		struct dma_iova_state *state, size_t mapped_len, bool is_p2p)
+static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev,
+		struct dma_iova_state *state, size_t mapped_len,
+		enum pci_p2pdma_map_type map)
 {
-	if (is_p2p)
+	if (map == PCI_P2PDMA_MAP_BUS_ADDR)
 		return true;
 
 	if (dma_use_iova(state)) {
+		unsigned int attrs = 0;
+
+		if (map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
+			attrs |= DMA_ATTR_MMIO;
+
 		dma_iova_destroy(dma_dev, state, mapped_len, rq_dma_dir(req),
-				0);
+				attrs);
 		return true;
 	}
 
 	return !dma_need_unmap(dma_dev);
 }
-
-static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev,
-		struct dma_iova_state *state, size_t mapped_len)
-{
-	return blk_dma_unmap(req, dma_dev, state, mapped_len,
-			req->cmd_flags & REQ_P2PDMA);
-}
-
 #endif /* BLK_MQ_DMA_H */
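On completion the driver feeds the remembered mapping type back in, mirroring what the NVMe changes above do with their IOD_* flags. A hedged sketch of the matching unmap path (hypothetical function, same assumptions as the map-side sketch further up):

/*
 * Sketch of the unmap path: "map" is whatever the driver recorded from
 * iter.p2pdma.map at mapping time.
 */
static void example_driver_unmap(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, size_t mapped_len,
		enum pci_p2pdma_map_type map)
{
	unsigned int attrs = 0;

	if (map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
		attrs |= DMA_ATTR_MMIO;

	/* True means nothing is left to do (bus-address P2P or IOVA teardown). */
	if (blk_rq_dma_unmap(req, dma_dev, state, mapped_len, map))
		return;

	/*
	 * Otherwise every individually mapped segment must be released with
	 * dma_unmap_phys(..., attrs), as nvme_free_prps() and
	 * nvme_free_sgls() do above.
	 */
}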
-2
include/linux/blk_types.h
···
 	__REQ_DRV,		/* for driver use */
 	__REQ_FS_PRIVATE,	/* for file system (submitter) use */
 	__REQ_ATOMIC,		/* for atomic write operations */
-	__REQ_P2PDMA,		/* contains P2P DMA pages */
 	/*
 	 * Command specific flags, keep last:
 	 */
···
 #define REQ_DRV			(__force blk_opf_t)(1ULL << __REQ_DRV)
 #define REQ_FS_PRIVATE		(__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
 #define REQ_ATOMIC		(__force blk_opf_t)(1ULL << __REQ_ATOMIC)
-#define REQ_P2PDMA		(__force blk_opf_t)(1ULL << __REQ_P2PDMA)
 
 #define REQ_NOUNMAP		(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)