Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

nd_blk: change aperture mapping from WC to WB

This should result in a pretty sizeable performance gain for reads. For
rough comparison I did some simple read testing using PMEM to compare
reads of write combining (WC) mappings vs write-back (WB). This was
done on a random lab machine.

PMEM reads from a write combining mapping:
# dd of=/dev/null if=/dev/pmem0 bs=4096 count=100000
100000+0 records in
100000+0 records out
409600000 bytes (410 MB) copied, 9.2855 s, 44.1 MB/s

PMEM reads from a write-back mapping:
# dd of=/dev/null if=/dev/pmem0 bs=4096 count=1000000
1000000+0 records in
1000000+0 records out
4096000000 bytes (4.1 GB) copied, 3.44034 s, 1.2 GB/s

To be able to safely support a write-back aperture I needed to add
support for the "read flush" _DSM flag, as outlined in the DSM spec:

http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf

This flag tells the ND BLK driver that it needs to flush the cache lines
associated with the aperture after the aperture is moved but before any
new data is read. This ensures that any stale cache lines from the
previous contents of the aperture will be discarded from the processor
cache, and the new data will be read properly from the DIMM. We know
that the cache lines are clean and will be discarded without any
writeback because either a) the previous aperture operation was a read,
and we never modified the contents of the aperture, or b) the previous
aperture operation was a write and we must have written back the dirtied
contents of the aperture to the DIMM before the I/O was completed.

In order to add support for the "read flush" flag I needed to add a
generic routine to invalidate cache lines, mmio_flush_range(). This is
protected by the ARCH_HAS_MMIO_FLUSH Kconfig variable, and is currently
only supported on x86.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

Authored by Ross Zwisler; committed by Dan Williams.
67a3e8fe e2e05394

+88 -36
+1
arch/x86/Kconfig
··· 28 28 select ARCH_HAS_FAST_MULTIPLIER 29 29 select ARCH_HAS_GCOV_PROFILE_ALL 30 30 select ARCH_HAS_PMEM_API 31 + select ARCH_HAS_MMIO_FLUSH 31 32 select ARCH_HAS_SG_CHAIN 32 33 select ARCH_HAVE_NMI_SAFE_CMPXCHG 33 34 select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
+2
arch/x86/include/asm/cacheflush.h
··· 89 89 90 90 void clflush_cache_range(void *addr, unsigned int size); 91 91 92 + #define mmio_flush_range(addr, size) clflush_cache_range(addr, size) 93 + 92 94 #ifdef CONFIG_DEBUG_RODATA 93 95 void mark_rodata_ro(void); 94 96 extern const int rodata_test_data;
-2
arch/x86/include/asm/io.h
··· 248 248 #endif 249 249 } 250 250 251 - #define ARCH_MEMREMAP_PMEM MEMREMAP_WB 252 - 253 251 #endif /* __KERNEL__ */ 254 252 255 253 extern void native_io_delay(void);
+2
arch/x86/include/asm/pmem.h
··· 18 18 #include <asm/cpufeature.h> 19 19 #include <asm/special_insns.h> 20 20 21 + #define ARCH_MEMREMAP_PMEM MEMREMAP_WB 22 + 21 23 #ifdef CONFIG_ARCH_HAS_PMEM_API 22 24 /** 23 25 * arch_memcpy_to_pmem - copy data to persistent memory
+1
drivers/acpi/Kconfig
··· 410 410 tristate "ACPI NVDIMM Firmware Interface Table (NFIT)" 411 411 depends on PHYS_ADDR_T_64BIT 412 412 depends on BLK_DEV 413 + depends on ARCH_HAS_MMIO_FLUSH 413 414 select LIBNVDIMM 414 415 help 415 416 Infrastructure to probe ACPI 6 compliant platforms for
+31 -24
drivers/acpi/nfit.c
··· 1017 1017 if (mmio->num_lines) 1018 1018 offset = to_interleave_offset(offset, mmio); 1019 1019 1020 - return readq(mmio->base + offset); 1020 + return readq(mmio->addr.base + offset); 1021 1021 } 1022 1022 1023 1023 static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw, ··· 1042 1042 if (mmio->num_lines) 1043 1043 offset = to_interleave_offset(offset, mmio); 1044 1044 1045 - writeq(cmd, mmio->base + offset); 1045 + writeq(cmd, mmio->addr.base + offset); 1046 1046 wmb_blk(nfit_blk); 1047 1047 1048 1048 if (nfit_blk->dimm_flags & ND_BLK_DCR_LATCH) 1049 - readq(mmio->base + offset); 1049 + readq(mmio->addr.base + offset); 1050 1050 } 1051 1051 1052 1052 static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk, ··· 1078 1078 } 1079 1079 1080 1080 if (rw) 1081 - memcpy_to_pmem(mmio->aperture + offset, 1081 + memcpy_to_pmem(mmio->addr.aperture + offset, 1082 1082 iobuf + copied, c); 1083 - else 1083 + else { 1084 + if (nfit_blk->dimm_flags & ND_BLK_READ_FLUSH) 1085 + mmio_flush_range((void __force *) 1086 + mmio->addr.aperture + offset, c); 1087 + 1084 1088 memcpy_from_pmem(iobuf + copied, 1085 - mmio->aperture + offset, c); 1089 + mmio->addr.aperture + offset, c); 1090 + } 1086 1091 1087 1092 copied += c; 1088 1093 len -= c; ··· 1134 1129 1135 1130 WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex)); 1136 1131 dev_dbg(acpi_desc->dev, "%s: SPA%d\n", __func__, spa->range_index); 1137 - iounmap(spa_map->iomem); 1132 + if (spa_map->type == SPA_MAP_APERTURE) 1133 + memunmap((void __force *)spa_map->addr.aperture); 1134 + else 1135 + iounmap(spa_map->addr.base); 1138 1136 release_mem_region(spa->address, spa->length); 1139 1137 list_del(&spa_map->list); 1140 1138 kfree(spa_map); ··· 1183 1175 spa_map = find_spa_mapping(acpi_desc, spa); 1184 1176 if (spa_map) { 1185 1177 kref_get(&spa_map->kref); 1186 - return spa_map->iomem; 1178 + return spa_map->addr.base; 1187 1179 } 1188 1180 1189 1181 spa_map = kzalloc(sizeof(*spa_map), GFP_KERNEL); ··· 1199 1191 
if (!res) 1200 1192 goto err_mem; 1201 1193 1202 - if (type == SPA_MAP_APERTURE) { 1203 - /* 1204 - * TODO: memremap_pmem() support, but that requires cache 1205 - * flushing when the aperture is moved. 1206 - */ 1207 - spa_map->iomem = ioremap_wc(start, n); 1208 - } else 1209 - spa_map->iomem = ioremap_nocache(start, n); 1194 + spa_map->type = type; 1195 + if (type == SPA_MAP_APERTURE) 1196 + spa_map->addr.aperture = (void __pmem *)memremap(start, n, 1197 + ARCH_MEMREMAP_PMEM); 1198 + else 1199 + spa_map->addr.base = ioremap_nocache(start, n); 1210 1200 1211 - if (!spa_map->iomem) 1201 + 1202 + if (!spa_map->addr.base) 1212 1203 goto err_map; 1213 1204 1214 1205 list_add_tail(&spa_map->list, &acpi_desc->spa_maps); 1215 - return spa_map->iomem; 1206 + return spa_map->addr.base; 1216 1207 1217 1208 err_map: 1218 1209 release_mem_region(start, n); ··· 1274 1267 nfit_blk->dimm_flags = flags.flags; 1275 1268 else if (rc == -ENOTTY) { 1276 1269 /* fall back to a conservative default */ 1277 - nfit_blk->dimm_flags = ND_BLK_DCR_LATCH; 1270 + nfit_blk->dimm_flags = ND_BLK_DCR_LATCH | ND_BLK_READ_FLUSH; 1278 1271 rc = 0; 1279 1272 } else 1280 1273 rc = -ENXIO; ··· 1314 1307 /* map block aperture memory */ 1315 1308 nfit_blk->bdw_offset = nfit_mem->bdw->offset; 1316 1309 mmio = &nfit_blk->mmio[BDW]; 1317 - mmio->base = nfit_spa_map(acpi_desc, nfit_mem->spa_bdw, 1310 + mmio->addr.base = nfit_spa_map(acpi_desc, nfit_mem->spa_bdw, 1318 1311 SPA_MAP_APERTURE); 1319 - if (!mmio->base) { 1312 + if (!mmio->addr.base) { 1320 1313 dev_dbg(dev, "%s: %s failed to map bdw\n", __func__, 1321 1314 nvdimm_name(nvdimm)); 1322 1315 return -ENOMEM; ··· 1337 1330 nfit_blk->cmd_offset = nfit_mem->dcr->command_offset; 1338 1331 nfit_blk->stat_offset = nfit_mem->dcr->status_offset; 1339 1332 mmio = &nfit_blk->mmio[DCR]; 1340 - mmio->base = nfit_spa_map(acpi_desc, nfit_mem->spa_dcr, 1333 + mmio->addr.base = nfit_spa_map(acpi_desc, nfit_mem->spa_dcr, 1341 1334 SPA_MAP_CONTROL); 1342 - if 
(!mmio->base) { 1335 + if (!mmio->addr.base) { 1343 1336 dev_dbg(dev, "%s: %s failed to map dcr\n", __func__, 1344 1337 nvdimm_name(nvdimm)); 1345 1338 return -ENOMEM; ··· 1406 1399 for (i = 0; i < 2; i++) { 1407 1400 struct nfit_blk_mmio *mmio = &nfit_blk->mmio[i]; 1408 1401 1409 - if (mmio->base) 1402 + if (mmio->addr.base) 1410 1403 nfit_spa_unmap(acpi_desc, mmio->spa); 1411 1404 } 1412 1405 nd_blk_region_set_provider_data(ndbr, NULL);
+11 -5
drivers/acpi/nfit.h
··· 41 41 }; 42 42 43 43 enum { 44 + ND_BLK_READ_FLUSH = 1, 44 45 ND_BLK_DCR_LATCH = 2, 45 46 }; 46 47 ··· 118 117 DCR, 119 118 }; 120 119 120 + struct nd_blk_addr { 121 + union { 122 + void __iomem *base; 123 + void __pmem *aperture; 124 + }; 125 + }; 126 + 121 127 struct nfit_blk { 122 128 struct nfit_blk_mmio { 123 - union { 124 - void __iomem *base; 125 - void __pmem *aperture; 126 - }; 129 + struct nd_blk_addr addr; 127 130 u64 size; 128 131 u64 base_offset; 129 132 u32 line_size; ··· 154 149 struct acpi_nfit_system_address *spa; 155 150 struct list_head list; 156 151 struct kref kref; 157 - void __iomem *iomem; 152 + enum spa_map_type type; 153 + struct nd_blk_addr addr; 158 154 }; 159 155 160 156 static inline struct nfit_spa_mapping *to_spa_map(struct kref *kref)
+3
lib/Kconfig
··· 531 531 config ARCH_HAS_PMEM_API 532 532 bool 533 533 534 + config ARCH_HAS_MMIO_FLUSH 535 + bool 536 + 534 537 endmenu
+2
tools/testing/nvdimm/Kbuild
··· 1 1 ldflags-y += --wrap=ioremap_wc 2 + ldflags-y += --wrap=memremap 2 3 ldflags-y += --wrap=devm_ioremap_nocache 3 4 ldflags-y += --wrap=devm_memremap 4 5 ldflags-y += --wrap=ioremap_nocache 5 6 ldflags-y += --wrap=iounmap 7 + ldflags-y += --wrap=memunmap 6 8 ldflags-y += --wrap=__devm_request_region 7 9 ldflags-y += --wrap=__request_region 8 10 ldflags-y += --wrap=__release_region
+28 -2
tools/testing/nvdimm/test/iomap.c
··· 89 89 nfit_res = get_nfit_res(offset); 90 90 rcu_read_unlock(); 91 91 if (nfit_res) 92 - return (void __iomem *) nfit_res->buf + offset 93 - - nfit_res->res->start; 92 + return nfit_res->buf + offset - nfit_res->res->start; 94 93 return devm_memremap(dev, offset, size, flags); 95 94 } 96 95 EXPORT_SYMBOL(__wrap_devm_memremap); 96 + 97 + void *__wrap_memremap(resource_size_t offset, size_t size, 98 + unsigned long flags) 99 + { 100 + struct nfit_test_resource *nfit_res; 101 + 102 + rcu_read_lock(); 103 + nfit_res = get_nfit_res(offset); 104 + rcu_read_unlock(); 105 + if (nfit_res) 106 + return nfit_res->buf + offset - nfit_res->res->start; 107 + return memremap(offset, size, flags); 108 + } 109 + EXPORT_SYMBOL(__wrap_memremap); 97 110 98 111 void __iomem *__wrap_ioremap_nocache(resource_size_t offset, unsigned long size) 99 112 { ··· 132 119 return iounmap(addr); 133 120 } 134 121 EXPORT_SYMBOL(__wrap_iounmap); 122 + 123 + void __wrap_memunmap(void *addr) 124 + { 125 + struct nfit_test_resource *nfit_res; 126 + 127 + rcu_read_lock(); 128 + nfit_res = get_nfit_res((unsigned long) addr); 129 + rcu_read_unlock(); 130 + if (nfit_res) 131 + return; 132 + return memunmap(addr); 133 + } 134 + EXPORT_SYMBOL(__wrap_memunmap); 135 135 136 136 static struct resource *nfit_test_request_region(struct device *dev, 137 137 struct resource *parent, resource_size_t start,
+7 -3
tools/testing/nvdimm/test/nfit.c
··· 1029 1029 1030 1030 lane = nd_region_acquire_lane(nd_region); 1031 1031 if (rw) 1032 - memcpy(mmio->base + dpa, iobuf, len); 1033 - else 1034 - memcpy(iobuf, mmio->base + dpa, len); 1032 + memcpy(mmio->addr.base + dpa, iobuf, len); 1033 + else { 1034 + memcpy(iobuf, mmio->addr.base + dpa, len); 1035 + 1036 + /* give us some coverage of the mmio_flush_range() API */ 1037 + mmio_flush_range(mmio->addr.base + dpa, len); 1038 + } 1035 1039 nd_region_release_lane(nd_region, lane); 1036 1040 1037 1041 return 0;