Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'libnvdimm-for-5.19' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull libnvdimm and DAX updates from Dan Williams:
"New support for clearing memory errors when a file is in DAX mode,
alongside with some other fixes and cleanups.

Previously it was only possible to clear these errors by using a
truncate or hole-punch operation to trigger the filesystem to
reallocate the block. Now, any page-aligned write can opportunistically
clear errors as well.

This change spans x86/mm, nvdimm, and fs/dax, and has received the
appropriate sign-offs. Thanks to Jane for her work on this.

Summary:

- Add support for clearing memory errors via pwrite(2) on DAX

- Fix 'security overwrite' support in the presence of media errors

- Miscellaneous cleanups and fixes for nfit_test (nvdimm unit tests)"
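
To make the described pwrite(2) path concrete, here is a minimal userspace sketch (an illustration, not part of this pull; the mount point, file name, and fill pattern are hypothetical). It assumes a file on an fsdax-mounted filesystem whose first page is known to carry a media error; because the offset and length are page aligned, the kernel can take the recovery path and clear the poison as part of the write:

/* Hypothetical example: overwrite a poisoned page with a page-aligned
 * pwrite(2) so the pmem driver can clear the error while writing.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        void *buf;
        int fd;

        fd = open("/mnt/pmem0/data", O_WRONLY);     /* fsdax mount assumed */
        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (posix_memalign(&buf, page, page)) {
                close(fd);
                return 1;
        }
        memset(buf, 0xff, page);    /* replacement data for the bad page */

        /* Offset and length are both page aligned, so the write can be
         * turned into a recovery write that clears the poison first.
         */
        if (pwrite(fd, buf, page, 0) != page)
                perror("pwrite");

        free(buf);
        close(fd);
        return 0;
}

Per the pull message, only page-aligned writes get this opportunistic clearing; an unaligned write over poisoned media is still expected to fail.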

* tag 'libnvdimm-for-5.19' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
pmem: implement pmem_recovery_write()
pmem: refactor pmem_clear_poison()
dax: add .recovery_write dax_operation
dax: introduce DAX_RECOVERY_WRITE dax access mode
mce: fix set_mce_nospec to always unmap the whole page
x86/mce: relocate set{clear}_mce_nospec() functions
acpi/nfit: rely on mce->misc to determine poison granularity
testing: nvdimm: asm/mce.h is not needed in nfit.c
testing: nvdimm: iomap: make __nfit_test_ioremap a macro
nvdimm: Allow overwrite in the presence of disabled dimms
tools/testing/nvdimm: remove unneeded flush_workqueue

+360 -172
-52
arch/x86/include/asm/set_memory.h
···
 extern int kernel_set_to_readonly;
 
-#ifdef CONFIG_X86_64
-/*
- * Prevent speculative access to the page by either unmapping
- * it (if we do not require access to any part of the page) or
- * marking it uncacheable (if we want to try to retrieve data
- * from non-poisoned lines in the page).
- */
-static inline int set_mce_nospec(unsigned long pfn, bool unmap)
-{
-        unsigned long decoy_addr;
-        int rc;
-
-        /* SGX pages are not in the 1:1 map */
-        if (arch_is_platform_page(pfn << PAGE_SHIFT))
-                return 0;
-        /*
-         * We would like to just call:
-         * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
-         * but doing that would radically increase the odds of a
-         * speculative access to the poison page because we'd have
-         * the virtual address of the kernel 1:1 mapping sitting
-         * around in registers.
-         * Instead we get tricky. We create a non-canonical address
-         * that looks just like the one we want, but has bit 63 flipped.
-         * This relies on set_memory_XX() properly sanitizing any __pa()
-         * results with __PHYSICAL_MASK or PTE_PFN_MASK.
-         */
-        decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
-
-        if (unmap)
-                rc = set_memory_np(decoy_addr, 1);
-        else
-                rc = set_memory_uc(decoy_addr, 1);
-        if (rc)
-                pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
-        return rc;
-}
-#define set_mce_nospec set_mce_nospec
-
-/* Restore full speculative operation to the pfn. */
-static inline int clear_mce_nospec(unsigned long pfn)
-{
-        return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1);
-}
-#define clear_mce_nospec clear_mce_nospec
-#else
-/*
- * Few people would run a 32-bit kernel on a machine that supports
- * recoverable errors because they have too much memory to boot 32-bit.
- */
-#endif
-
 #endif /* _ASM_X86_SET_MEMORY_H */
+3 -3
arch/x86/kernel/cpu/mce/core.c
···
 
         pfn = mce->addr >> PAGE_SHIFT;
         if (!memory_failure(pfn, 0)) {
-                set_mce_nospec(pfn, whole_page(mce));
+                set_mce_nospec(pfn);
                 mce->kflags |= MCE_HANDLED_UC;
         }
 
···
 
         ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
         if (!ret) {
-                set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
+                set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
                 sync_core();
                 return;
         }
···
         p->mce_count = 0;
         pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
         if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
-                set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
+                set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
 }
 
 static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
+47 -2
arch/x86/mm/pat/set_memory.c
···
 #include <linux/vmstat.h>
 #include <linux/kernel.h>
 #include <linux/cc_platform.h>
+#include <linux/set_memory.h>
 
 #include <asm/e820/api.h>
 #include <asm/processor.h>
···
 #include <asm/pgalloc.h>
 #include <asm/proto.h>
 #include <asm/memtype.h>
-#include <asm/set_memory.h>
 #include <asm/hyperv-tlfs.h>
 #include <asm/mshyperv.h>
···
 }
 
 /*
- * _set_memory_prot is an internal helper for callers that have been passed
+ * __set_memory_prot is an internal helper for callers that have been passed
  * a pgprot_t value from upper layers and a reservation has already been taken.
  * If you want to set the pgprot to a specific page protocol, use the
  * set_memory_xx() functions.
···
         return 0;
 }
 EXPORT_SYMBOL(set_memory_wb);
+
+/* Prevent speculative access to a page by marking it not-present */
+#ifdef CONFIG_X86_64
+int set_mce_nospec(unsigned long pfn)
+{
+        unsigned long decoy_addr;
+        int rc;
+
+        /* SGX pages are not in the 1:1 map */
+        if (arch_is_platform_page(pfn << PAGE_SHIFT))
+                return 0;
+        /*
+         * We would like to just call:
+         * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
+         * but doing that would radically increase the odds of a
+         * speculative access to the poison page because we'd have
+         * the virtual address of the kernel 1:1 mapping sitting
+         * around in registers.
+         * Instead we get tricky. We create a non-canonical address
+         * that looks just like the one we want, but has bit 63 flipped.
+         * This relies on set_memory_XX() properly sanitizing any __pa()
+         * results with __PHYSICAL_MASK or PTE_PFN_MASK.
+         */
+        decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
+
+        rc = set_memory_np(decoy_addr, 1);
+        if (rc)
+                pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
+        return rc;
+}
+
+static int set_memory_present(unsigned long *addr, int numpages)
+{
+        return change_page_attr_set(addr, numpages, __pgprot(_PAGE_PRESENT), 0);
+}
+
+/* Restore full speculative operation to the pfn. */
+int clear_mce_nospec(unsigned long pfn)
+{
+        unsigned long addr = (unsigned long) pfn_to_kaddr(pfn);
+
+        return set_memory_present(&addr, 1);
+}
+EXPORT_SYMBOL_GPL(clear_mce_nospec);
+#endif /* CONFIG_X86_64 */
 
 int set_memory_x(unsigned long addr, int numpages)
 {
+2 -2
drivers/acpi/nfit/mce.c
···
          */
         mutex_lock(&acpi_desc_lock);
         list_for_each_entry(acpi_desc, &acpi_descs, list) {
+                unsigned int align = 1UL << MCI_MISC_ADDR_LSB(mce->misc);
                 struct device *dev = acpi_desc->dev;
                 int found_match = 0;
···
 
                 /* If this fails due to an -ENOMEM, there is little we can do */
                 nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus,
-                                ALIGN(mce->addr, L1_CACHE_BYTES),
-                                L1_CACHE_BYTES);
+                                ALIGN_DOWN(mce->addr, align), align);
                 nvdimm_region_notify(nfit_spa->nd_region,
                                 NVDIMM_REVALIDATE_POISON);
 
+12 -2
drivers/dax/super.c
···
  * @dax_dev: a dax_device instance representing the logical memory range
  * @pgoff: offset in pages from the start of the device to translate
  * @nr_pages: number of consecutive pages caller can handle relative to @pfn
+ * @mode: indicator on normal access or recovery write
  * @kaddr: output parameter that returns a virtual address mapping of pfn
  * @pfn: output parameter that returns an absolute pfn translation of @pgoff
  *
···
  * pages accessible at the device relative @pgoff.
  */
 long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
-                void **kaddr, pfn_t *pfn)
+                enum dax_access_mode mode, void **kaddr, pfn_t *pfn)
 {
         long avail;
 
···
                 return -EINVAL;
 
         avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages,
-                        kaddr, pfn);
+                        mode, kaddr, pfn);
         if (!avail)
                 return -ERANGE;
         return min(avail, nr_pages);
···
         return dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages);
 }
 EXPORT_SYMBOL_GPL(dax_zero_page_range);
+
+size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
+                void *addr, size_t bytes, struct iov_iter *iter)
+{
+        if (!dax_dev->ops->recovery_write)
+                return 0;
+        return dax_dev->ops->recovery_write(dax_dev, pgoff, addr, bytes, iter);
+}
+EXPORT_SYMBOL_GPL(dax_recovery_write);
 
 #ifdef CONFIG_ARCH_HAS_PMEM_API
 void arch_wb_cache_pmem(void *addr, size_t size);
+13 -2
drivers/md/dm-linear.c
···
 }
 
 static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
-                long nr_pages, void **kaddr, pfn_t *pfn)
+                long nr_pages, enum dax_access_mode mode, void **kaddr,
+                pfn_t *pfn)
 {
         struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff);
 
-        return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
+        return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn);
 }
 
 static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
···
         return dax_zero_page_range(dax_dev, pgoff, nr_pages);
 }
 
+static size_t linear_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff,
+                void *addr, size_t bytes, struct iov_iter *i)
+{
+        struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff);
+
+        return dax_recovery_write(dax_dev, pgoff, addr, bytes, i);
+}
+
 #else
 #define linear_dax_direct_access NULL
 #define linear_dax_zero_page_range NULL
+#define linear_dax_recovery_write NULL
 #endif
 
···
         .iterate_devices = linear_iterate_devices,
         .direct_access = linear_dax_direct_access,
         .dax_zero_page_range = linear_dax_zero_page_range,
+        .dax_recovery_write = linear_dax_recovery_write,
 };
 
 int __init dm_linear_init(void)
+13 -2
drivers/md/dm-log-writes.c
···
 }
 
 static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
-                long nr_pages, void **kaddr, pfn_t *pfn)
+                long nr_pages, enum dax_access_mode mode, void **kaddr,
+                pfn_t *pfn)
 {
         struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);
 
-        return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
+        return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn);
 }
 
 static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
···
         return dax_zero_page_range(dax_dev, pgoff, nr_pages << PAGE_SHIFT);
 }
 
+static size_t log_writes_dax_recovery_write(struct dm_target *ti,
+                pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
+{
+        struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);
+
+        return dax_recovery_write(dax_dev, pgoff, addr, bytes, i);
+}
+
 #else
 #define log_writes_dax_direct_access NULL
 #define log_writes_dax_zero_page_range NULL
+#define log_writes_dax_recovery_write NULL
 #endif
 
 static struct target_type log_writes_target = {
···
         .io_hints = log_writes_io_hints,
         .direct_access = log_writes_dax_direct_access,
         .dax_zero_page_range = log_writes_dax_zero_page_range,
+        .dax_recovery_write = log_writes_dax_recovery_write,
 };
 
 static int __init dm_log_writes_init(void)
+13 -2
drivers/md/dm-stripe.c
···
 }
 
 static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
-                long nr_pages, void **kaddr, pfn_t *pfn)
+                long nr_pages, enum dax_access_mode mode, void **kaddr,
+                pfn_t *pfn)
 {
         struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff);
 
-        return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
+        return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn);
 }
 
 static int stripe_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
···
         return dax_zero_page_range(dax_dev, pgoff, nr_pages);
 }
 
+static size_t stripe_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff,
+                void *addr, size_t bytes, struct iov_iter *i)
+{
+        struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff);
+
+        return dax_recovery_write(dax_dev, pgoff, addr, bytes, i);
+}
+
 #else
 #define stripe_dax_direct_access NULL
 #define stripe_dax_zero_page_range NULL
+#define stripe_dax_recovery_write NULL
 #endif
 
 /*
···
         .io_hints = stripe_io_hints,
         .direct_access = stripe_dax_direct_access,
         .dax_zero_page_range = stripe_dax_zero_page_range,
+        .dax_recovery_write = stripe_dax_recovery_write,
 };
 
 int __init dm_stripe_init(void)
+3 -1
drivers/md/dm-target.c
···
 #include <linux/init.h>
 #include <linux/kmod.h>
 #include <linux/bio.h>
+#include <linux/dax.h>
 
 #define DM_MSG_PREFIX "target"
 
···
 }
 
 static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
-                long nr_pages, void **kaddr, pfn_t *pfn)
+                long nr_pages, enum dax_access_mode mode, void **kaddr,
+                pfn_t *pfn)
 {
         return -EIO;
 }
+4 -3
drivers/md/dm-writecache.c
···
 
         id = dax_read_lock();
 
-        da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, &wc->memory_map, &pfn);
+        da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS,
+                        &wc->memory_map, &pfn);
         if (da < 0) {
                 wc->memory_map = NULL;
                 r = da;
···
                 i = 0;
                 do {
                         long daa;
-                        daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i, p - i,
-                                        NULL, &pfn);
+                        daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i,
+                                        p - i, DAX_ACCESS, NULL, &pfn);
                         if (daa <= 0) {
                                 r = daa ? daa : -EINVAL;
                                 goto err3;
+23 -2
drivers/md/dm.c
···
 }
 
 static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
-                long nr_pages, void **kaddr, pfn_t *pfn)
+                long nr_pages, enum dax_access_mode mode, void **kaddr,
+                pfn_t *pfn)
 {
         struct mapped_device *md = dax_get_private(dax_dev);
         sector_t sector = pgoff * PAGE_SECTORS;
···
         if (len < 1)
                 goto out;
         nr_pages = min(len, nr_pages);
-        ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
+        ret = ti->type->direct_access(ti, pgoff, nr_pages, mode, kaddr, pfn);
 
 out:
         dm_put_live_table(md, srcu_idx);
···
 out:
         dm_put_live_table(md, srcu_idx);
 
+        return ret;
+}
+
+static size_t dm_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
+                void *addr, size_t bytes, struct iov_iter *i)
+{
+        struct mapped_device *md = dax_get_private(dax_dev);
+        sector_t sector = pgoff * PAGE_SECTORS;
+        struct dm_target *ti;
+        int srcu_idx;
+        long ret = 0;
+
+        ti = dm_dax_get_live_target(md, sector, &srcu_idx);
+        if (!ti || !ti->type->dax_recovery_write)
+                goto out;
+
+        ret = ti->type->dax_recovery_write(ti, pgoff, addr, bytes, i);
+out:
+        dm_put_live_table(md, srcu_idx);
         return ret;
 }
 
···
 static const struct dax_operations dm_dax_ops = {
         .direct_access = dm_dax_direct_access,
         .zero_page_range = dm_dax_zero_page_range,
+        .recovery_write = dm_dax_recovery_write,
 };
 
 /*
+146 -59
drivers/nvdimm/pmem.c
···
         return to_nd_region(to_dev(pmem)->parent);
 }
 
-static void hwpoison_clear(struct pmem_device *pmem,
-                phys_addr_t phys, unsigned int len)
+static phys_addr_t to_phys(struct pmem_device *pmem, phys_addr_t offset)
 {
+        return pmem->phys_addr + offset;
+}
+
+static sector_t to_sect(struct pmem_device *pmem, phys_addr_t offset)
+{
+        return (offset - pmem->data_offset) >> SECTOR_SHIFT;
+}
+
+static phys_addr_t to_offset(struct pmem_device *pmem, sector_t sector)
+{
+        return (sector << SECTOR_SHIFT) + pmem->data_offset;
+}
+
+static void pmem_mkpage_present(struct pmem_device *pmem, phys_addr_t offset,
+                unsigned int len)
+{
+        phys_addr_t phys = to_phys(pmem, offset);
         unsigned long pfn_start, pfn_end, pfn;
 
         /* only pmem in the linear map supports HWPoison */
···
         }
 }
 
+static void pmem_clear_bb(struct pmem_device *pmem, sector_t sector, long blks)
+{
+        if (blks == 0)
+                return;
+        badblocks_clear(&pmem->bb, sector, blks);
+        if (pmem->bb_state)
+                sysfs_notify_dirent(pmem->bb_state);
+}
+
+static long __pmem_clear_poison(struct pmem_device *pmem,
+                phys_addr_t offset, unsigned int len)
+{
+        phys_addr_t phys = to_phys(pmem, offset);
+        long cleared = nvdimm_clear_poison(to_dev(pmem), phys, len);
+
+        if (cleared > 0) {
+                pmem_mkpage_present(pmem, offset, cleared);
+                arch_invalidate_pmem(pmem->virt_addr + offset, len);
+        }
+        return cleared;
+}
+
 static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
                 phys_addr_t offset, unsigned int len)
 {
-        struct device *dev = to_dev(pmem);
-        sector_t sector;
-        long cleared;
-        blk_status_t rc = BLK_STS_OK;
+        long cleared = __pmem_clear_poison(pmem, offset, len);
 
-        sector = (offset - pmem->data_offset) / 512;
+        if (cleared < 0)
+                return BLK_STS_IOERR;
 
-        cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
+        pmem_clear_bb(pmem, to_sect(pmem, offset), cleared >> SECTOR_SHIFT);
         if (cleared < len)
-                rc = BLK_STS_IOERR;
-        if (cleared > 0 && cleared / 512) {
-                hwpoison_clear(pmem, pmem->phys_addr + offset, cleared);
-                cleared /= 512;
-                dev_dbg(dev, "%#llx clear %ld sector%s\n",
-                                (unsigned long long) sector, cleared,
-                                cleared > 1 ? "s" : "");
-                badblocks_clear(&pmem->bb, sector, cleared);
-                if (pmem->bb_state)
-                        sysfs_notify_dirent(pmem->bb_state);
-        }
-
-        arch_invalidate_pmem(pmem->virt_addr + offset, len);
-
-        return rc;
+                return BLK_STS_IOERR;
+        return BLK_STS_OK;
 }
 
 static void write_pmem(void *pmem_addr, struct page *page,
···
                 sector_t sector, unsigned int len)
 {
         blk_status_t rc;
-        phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
+        phys_addr_t pmem_off = to_offset(pmem, sector);
         void *pmem_addr = pmem->virt_addr + pmem_off;
 
         if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
···
                 struct page *page, unsigned int page_off,
                 sector_t sector, unsigned int len)
 {
-        blk_status_t rc = BLK_STS_OK;
-        bool bad_pmem = false;
-        phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
+        phys_addr_t pmem_off = to_offset(pmem, sector);
         void *pmem_addr = pmem->virt_addr + pmem_off;
 
-        if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
-                bad_pmem = true;
+        if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) {
+                blk_status_t rc = pmem_clear_poison(pmem, pmem_off, len);
 
-        /*
-         * Note that we write the data both before and after
-         * clearing poison. The write before clear poison
-         * handles situations where the latest written data is
-         * preserved and the clear poison operation simply marks
-         * the address range as valid without changing the data.
-         * In this case application software can assume that an
-         * interrupted write will either return the new good
-         * data or an error.
-         *
-         * However, if pmem_clear_poison() leaves the data in an
-         * indeterminate state we need to perform the write
-         * after clear poison.
-         */
-        flush_dcache_page(page);
-        write_pmem(pmem_addr, page, page_off, len);
-        if (unlikely(bad_pmem)) {
-                rc = pmem_clear_poison(pmem, pmem_off, len);
-                write_pmem(pmem_addr, page, page_off, len);
+                if (rc != BLK_STS_OK)
+                        return rc;
         }
 
-        return rc;
+        flush_dcache_page(page);
+        write_pmem(pmem_addr, page, page_off, len);
+
+        return BLK_STS_OK;
 }
 
 static void pmem_submit_bio(struct bio *bio)
···
 
 /* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
 __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
-                long nr_pages, void **kaddr, pfn_t *pfn)
+                long nr_pages, enum dax_access_mode mode, void **kaddr,
+                pfn_t *pfn)
 {
         resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
-
-        if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
-                                        PFN_PHYS(nr_pages))))
-                return -EIO;
+        sector_t sector = PFN_PHYS(pgoff) >> SECTOR_SHIFT;
+        unsigned int num = PFN_PHYS(nr_pages) >> SECTOR_SHIFT;
+        struct badblocks *bb = &pmem->bb;
+        sector_t first_bad;
+        int num_bad;
 
         if (kaddr)
                 *kaddr = pmem->virt_addr + offset;
         if (pfn)
                 *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
 
+        if (bb->count &&
+            badblocks_check(bb, sector, num, &first_bad, &num_bad)) {
+                long actual_nr;
+
+                if (mode != DAX_RECOVERY_WRITE)
+                        return -EIO;
+
+                /*
+                 * Set the recovery stride is set to kernel page size because
+                 * the underlying driver and firmware clear poison functions
+                 * don't appear to handle large chunk(such as 2MiB) reliably.
+                 */
+                actual_nr = PHYS_PFN(
+                        PAGE_ALIGN((first_bad - sector) << SECTOR_SHIFT));
+                dev_dbg(pmem->bb.dev, "start sector(%llu), nr_pages(%ld), first_bad(%llu), actual_nr(%ld)\n",
+                                sector, nr_pages, first_bad, actual_nr);
+                if (actual_nr)
+                        return actual_nr;
+                return 1;
+        }
+
         /*
-         * If badblocks are present, limit known good range to the
-         * requested range.
+         * If badblocks are present but not in the range, limit known good range
+         * to the requested range.
          */
-        if (unlikely(pmem->bb.count))
+        if (bb->count)
                 return nr_pages;
         return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);
···
 }
 
 static long pmem_dax_direct_access(struct dax_device *dax_dev,
-                pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
+                pgoff_t pgoff, long nr_pages, enum dax_access_mode mode,
+                void **kaddr, pfn_t *pfn)
 {
         struct pmem_device *pmem = dax_get_private(dax_dev);
 
-        return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn);
+        return __pmem_direct_access(pmem, pgoff, nr_pages, mode, kaddr, pfn);
+}
+
+/*
+ * The recovery write thread started out as a normal pwrite thread and
+ * when the filesystem was told about potential media error in the
+ * range, filesystem turns the normal pwrite to a dax_recovery_write.
+ *
+ * The recovery write consists of clearing media poison, clearing page
+ * HWPoison bit, reenable page-wide read-write permission, flush the
+ * caches and finally write. A competing pread thread will be held
+ * off during the recovery process since data read back might not be
+ * valid, and this is achieved by clearing the badblock records after
+ * the recovery write is complete. Competing recovery write threads
+ * are already serialized by writer lock held by dax_iomap_rw().
+ */
+static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
+                void *addr, size_t bytes, struct iov_iter *i)
+{
+        struct pmem_device *pmem = dax_get_private(dax_dev);
+        size_t olen, len, off;
+        phys_addr_t pmem_off;
+        struct device *dev = pmem->bb.dev;
+        long cleared;
+
+        off = offset_in_page(addr);
+        len = PFN_PHYS(PFN_UP(off + bytes));
+        if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) >> SECTOR_SHIFT, len))
+                return _copy_from_iter_flushcache(addr, bytes, i);
+
+        /*
+         * Not page-aligned range cannot be recovered. This should not
+         * happen unless something else went wrong.
+         */
+        if (off || !PAGE_ALIGNED(bytes)) {
+                dev_dbg(dev, "Found poison, but addr(%p) or bytes(%#zx) not page aligned\n",
+                                addr, bytes);
+                return 0;
+        }
+
+        pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
+        cleared = __pmem_clear_poison(pmem, pmem_off, len);
+        if (cleared > 0 && cleared < len) {
+                dev_dbg(dev, "poison cleared only %ld out of %zu bytes\n",
+                                cleared, len);
+                return 0;
+        }
+        if (cleared < 0) {
+                dev_dbg(dev, "poison clear failed: %ld\n", cleared);
+                return 0;
+        }
+
+        olen = _copy_from_iter_flushcache(addr, bytes, i);
+        pmem_clear_bb(pmem, to_sect(pmem, pmem_off), cleared >> SECTOR_SHIFT);
+
+        return olen;
 }
 
 static const struct dax_operations pmem_dax_ops = {
         .direct_access = pmem_dax_direct_access,
         .zero_page_range = pmem_dax_zero_page_range,
+        .recovery_write = pmem_recovery_write,
 };
 
 static ssize_t write_cache_show(struct device *dev,
+4 -1
drivers/nvdimm/pmem.h
···
 #include <linux/pfn_t.h>
 #include <linux/fs.h>
 
+enum dax_access_mode;
+
 /* this definition is in it's own header for tools/testing/nvdimm to consume */
 struct pmem_device {
         /* One contiguous memory region per device */
···
 };
 
 long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
-                long nr_pages, void **kaddr, pfn_t *pfn);
+                long nr_pages, enum dax_access_mode mode, void **kaddr,
+                pfn_t *pfn);
 
 #ifdef CONFIG_MEMORY_FAILURE
 static inline bool test_and_clear_pmem_poison(struct page *page)
-5
drivers/nvdimm/security.c
···
                 || !nvdimm->sec.flags)
                 return -EOPNOTSUPP;
 
-        if (dev->driver == NULL) {
-                dev_dbg(dev, "Unable to overwrite while DIMM active.\n");
-                return -EINVAL;
-        }
-
         rc = check_security_state(nvdimm);
         if (rc)
                 return rc;
+6 -3
drivers/s390/block/dcssblk.c
···
 static void dcssblk_release(struct gendisk *disk, fmode_t mode);
 static void dcssblk_submit_bio(struct bio *bio);
 static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
-                long nr_pages, void **kaddr, pfn_t *pfn);
+                long nr_pages, enum dax_access_mode mode, void **kaddr,
+                pfn_t *pfn);
 
 static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
 
···
         long rc;
         void *kaddr;
 
-        rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL);
+        rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS,
+                        &kaddr, NULL);
         if (rc < 0)
                 return rc;
         memset(kaddr, 0, nr_pages << PAGE_SHIFT);
···
 
 static long
 dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
-                long nr_pages, void **kaddr, pfn_t *pfn)
+                long nr_pages, enum dax_access_mode mode, void **kaddr,
+                pfn_t *pfn)
 {
         struct dcssblk_dev_info *dev_info = dax_get_private(dax_dev);
 
+17 -5
fs/dax.c
···
         int id;
 
         id = dax_read_lock();
-        rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL);
+        rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
+                        &kaddr, NULL);
         if (rc < 0) {
                 dax_read_unlock(id);
                 return rc;
···
 
         id = dax_read_lock();
         length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
-                        NULL, pfnp);
+                        DAX_ACCESS, NULL, pfnp);
         if (length < 0) {
                 rc = length;
                 goto out;
···
         void *kaddr;
         long ret;
 
-        ret = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
+        ret = dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL);
         if (ret > 0) {
                 memset(kaddr + offset, 0, size);
                 dax_flush(dax_dev, kaddr + offset, size);
···
                 const size_t size = ALIGN(length + offset, PAGE_SIZE);
                 pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
                 ssize_t map_len;
+                bool recovery = false;
                 void *kaddr;
 
                 if (fatal_signal_pending(current)) {
···
                 }
 
                 map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
-                                &kaddr, NULL);
+                                DAX_ACCESS, &kaddr, NULL);
+                if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
+                        map_len = dax_direct_access(dax_dev, pgoff,
+                                        PHYS_PFN(size), DAX_RECOVERY_WRITE,
+                                        &kaddr, NULL);
+                        if (map_len > 0)
+                                recovery = true;
+                }
                 if (map_len < 0) {
                         ret = map_len;
                         break;
···
                 if (map_len > end - pos)
                         map_len = end - pos;
 
-                if (iov_iter_rw(iter) == WRITE)
+                if (recovery)
+                        xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
+                                        map_len, iter);
+                else if (iov_iter_rw(iter) == WRITE)
                         xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
                                         map_len, iter);
                 else
+2 -2
fs/fuse/dax.c
···
         INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker);
 
         id = dax_read_lock();
-        nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), NULL,
-                        NULL);
+        nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size),
+                        DAX_ACCESS, NULL, NULL);
         dax_read_unlock(id);
         if (nr_pages < 0) {
                 pr_debug("dax_direct_access() returned %ld\n", nr_pages);
+4 -2
fs/fuse/virtio_fs.c
···
  * offset.
  */
 static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
-                long nr_pages, void **kaddr, pfn_t *pfn)
+                long nr_pages, enum dax_access_mode mode,
+                void **kaddr, pfn_t *pfn)
 {
         struct virtio_fs *fs = dax_get_private(dax_dev);
         phys_addr_t offset = PFN_PHYS(pgoff);
···
         long rc;
         void *kaddr;
 
-        rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL);
+        rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS, &kaddr,
+                        NULL);
         if (rc < 0)
                 return rc;
         memset(kaddr, 0, nr_pages << PAGE_SHIFT);
+20 -2
include/linux/dax.h
···
 struct iomap_iter;
 struct iomap;
 
+enum dax_access_mode {
+        DAX_ACCESS,
+        DAX_RECOVERY_WRITE,
+};
+
 struct dax_operations {
         /*
          * direct_access: translate a device-relative
···
          * number of pages available for DAX at that pfn.
          */
         long (*direct_access)(struct dax_device *, pgoff_t, long,
-                        void **, pfn_t *);
+                        enum dax_access_mode, void **, pfn_t *);
         /*
          * Validate whether this device is usable as an fsdax backing
          * device.
···
                         sector_t, sector_t);
         /* zero_page_range: required operation. Zero page range */
         int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
+        /*
+         * recovery_write: recover a poisoned range by DAX device driver
+         * capable of clearing poison.
+         */
+        size_t (*recovery_write)(struct dax_device *dax_dev, pgoff_t pgoff,
+                        void *addr, size_t bytes, struct iov_iter *iter);
 };
 
 #if IS_ENABLED(CONFIG_DAX)
···
 bool dax_write_cache_enabled(struct dax_device *dax_dev);
 bool dax_synchronous(struct dax_device *dax_dev);
 void set_dax_synchronous(struct dax_device *dax_dev);
+size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
+                void *addr, size_t bytes, struct iov_iter *i);
 /*
  * Check if given mapping is supported by the file / underlying device.
  */
···
                 struct dax_device *dax_dev)
 {
         return !(vma->vm_flags & VM_SYNC);
+}
+static inline size_t dax_recovery_write(struct dax_device *dax_dev,
+                pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
+{
+        return 0;
 }
 #endif
 
···
 bool dax_alive(struct dax_device *dax_dev);
 void *dax_get_private(struct dax_device *dax_dev);
 long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
-                void **kaddr, pfn_t *pfn);
+                enum dax_access_mode mode, void **kaddr, pfn_t *pfn);
 size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
                 size_t bytes, struct iov_iter *i);
 size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
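
As a driver-side sketch of the new hooks above (hypothetical driver names; only the operation signatures come from this header): a device with no mechanism for clearing poison can either omit .recovery_write or return 0 from it, and dax_recovery_write() reports the recovery as failed to the caller either way.

/* Hypothetical driver glue; example_direct_access() and
 * example_zero_page_range() stand in for a real driver's required ops.
 */
#include <linux/dax.h>
#include <linux/pfn_t.h>
#include <linux/uio.h>

long example_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
                long nr_pages, enum dax_access_mode mode, void **kaddr,
                pfn_t *pfn);
int example_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
                size_t nr_pages);

static size_t example_recovery_write(struct dax_device *dax_dev,
                pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
{
        /* This device cannot clear poison: returning 0 fails the recovery. */
        return 0;
}

static const struct dax_operations example_dax_ops = {
        .direct_access = example_direct_access,
        .zero_page_range = example_zero_page_range,
        .recovery_write = example_recovery_write,
};
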
+12 -1
include/linux/device-mapper.h
···
 struct dm_report_zones_args;
 struct mapped_device;
 struct bio_vec;
+enum dax_access_mode;
 
 /*
  * Type of table, mapped_device's mempool and request_queue
···
  * >= 0 : the number of bytes accessible at the address
  */
 typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff,
-                long nr_pages, void **kaddr, pfn_t *pfn);
+                long nr_pages, enum dax_access_mode node, void **kaddr,
+                pfn_t *pfn);
 typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff,
                 size_t nr_pages);
+
+/*
+ * Returns:
+ * != 0 : number of bytes transferred
+ * 0    : recovery write failed
+ */
+typedef size_t (*dm_dax_recovery_write_fn)(struct dm_target *ti, pgoff_t pgoff,
+                void *addr, size_t bytes, struct iov_iter *i);
 
 void dm_error(const char *message);
 
···
         dm_io_hints_fn io_hints;
         dm_dax_direct_access_fn direct_access;
         dm_dax_zero_page_range_fn dax_zero_page_range;
+        dm_dax_recovery_write_fn dax_recovery_write;
 
         /* For internal device-mapper use. */
         struct list_head list;
+5 -5
include/linux/set_memory.h
···
 #endif
 #endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */
 
-#ifndef set_mce_nospec
-static inline int set_mce_nospec(unsigned long pfn, bool unmap)
+#ifdef CONFIG_X86_64
+int set_mce_nospec(unsigned long pfn);
+int clear_mce_nospec(unsigned long pfn);
+#else
+static inline int set_mce_nospec(unsigned long pfn)
 {
         return 0;
 }
-#endif
-
-#ifndef clear_mce_nospec
 static inline int clear_mce_nospec(unsigned long pfn)
 {
         return 0;
+3 -1
tools/testing/nvdimm/pmem-dax.c
···
  */
 #include "test/nfit_test.h"
 #include <linux/blkdev.h>
+#include <linux/dax.h>
 #include <pmem.h>
 #include <nd.h>
 
 long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
-                long nr_pages, void **kaddr, pfn_t *pfn)
+                long nr_pages, enum dax_access_mode mode, void **kaddr,
+                pfn_t *pfn)
 {
         resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
 
+8 -10
tools/testing/nvdimm/test/iomap.c
···
 }
 EXPORT_SYMBOL(get_nfit_res);
 
-static void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size,
-                void __iomem *(*fallback_fn)(resource_size_t, unsigned long))
-{
-        struct nfit_test_resource *nfit_res = get_nfit_res(offset);
-
-        if (nfit_res)
-                return (void __iomem *) nfit_res->buf + offset
-                        - nfit_res->res.start;
-        return fallback_fn(offset, size);
-}
+#define __nfit_test_ioremap(offset, size, fallback_fn) ({              \
+        struct nfit_test_resource *nfit_res = get_nfit_res(offset);    \
+        nfit_res ?                                                      \
+                (void __iomem *) nfit_res->buf + (offset)               \
+                        - nfit_res->res.start                           \
+                :                                                       \
+                fallback_fn((offset), (size)) ;                         \
+})
 
 void __iomem *__wrap_devm_ioremap(struct device *dev,
                 resource_size_t offset, unsigned long size)
-3
tools/testing/nvdimm/test/nfit.c
···
 #include "nfit_test.h"
 #include "../watermark.h"
 
-#include <asm/mce.h>
-
 /*
  * Generate an NFIT table to describe the following topology:
  *
···
 {
         int i;
 
-        flush_workqueue(nfit_wq);
         destroy_workqueue(nfit_wq);
         for (i = 0; i < NUM_NFITS; i++)
                 platform_device_unregister(&instances[i]->pdev);