Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mce: fix set_mce_nospec to always unmap the whole page

The set_memory_uc() approach doesn't work well in all cases.
As Dan pointed out when "The VMM unmapped the bad page from
guest physical space and passed the machine check to the guest."
"The guest gets virtual #MC on an access to that page. When
the guest tries to do set_memory_uc() and instructs cpa_flush()
to do clean caches that results in taking another fault / exception
perhaps because the VMM unmapped the page from the guest."

Since the driver has special knowledge of how to handle NP or UC,
mark the poisoned page with NP and let the driver handle it when
it comes down to repair.

Please refer to discussions here for more details.
https://lore.kernel.org/all/CAPcyv4hrXPb1tASBZUg-GgdVs0OOFKXMXLiHmktg_kFi7YBMyQ@mail.gmail.com/

Now, since the poisoned page is marked as not-present, also fix
pmem_do_write() in order to avoid writing to a not-present page
and triggering a kernel Oops.

Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()")
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jane Chu <jane.chu@oracle.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/165272615484.103830.2563950688772226611.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

authored by

Jane Chu and committed by
Dan Williams
5898b43a b3fdf939

+24 -41
+3 -3
arch/x86/kernel/cpu/mce/core.c
··· 579 579 580 580 pfn = mce->addr >> PAGE_SHIFT; 581 581 if (!memory_failure(pfn, 0)) { 582 - set_mce_nospec(pfn, whole_page(mce)); 582 + set_mce_nospec(pfn); 583 583 mce->kflags |= MCE_HANDLED_UC; 584 584 } 585 585 ··· 1316 1316 1317 1317 ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); 1318 1318 if (!ret) { 1319 - set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); 1319 + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); 1320 1320 sync_core(); 1321 1321 return; 1322 1322 } ··· 1342 1342 p->mce_count = 0; 1343 1343 pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); 1344 1344 if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) 1345 - set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); 1345 + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); 1346 1346 } 1347 1347 1348 1348 static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
+11 -12
arch/x86/mm/pat/set_memory.c
··· 1925 1925 } 1926 1926 EXPORT_SYMBOL(set_memory_wb); 1927 1927 1928 - /* 1929 - * Prevent speculative access to the page by either unmapping 1930 - * it (if we do not require access to any part of the page) or 1931 - * marking it uncacheable (if we want to try to retrieve data 1932 - * from non-poisoned lines in the page). 1933 - */ 1928 + /* Prevent speculative access to a page by marking it not-present */ 1934 1929 #ifdef CONFIG_X86_64 1935 - int set_mce_nospec(unsigned long pfn, bool unmap) 1930 + int set_mce_nospec(unsigned long pfn) 1936 1931 { 1937 1932 unsigned long decoy_addr; 1938 1933 int rc; ··· 1949 1954 */ 1950 1955 decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); 1951 1956 1952 - if (unmap) 1953 - rc = set_memory_np(decoy_addr, 1); 1954 - else 1955 - rc = set_memory_uc(decoy_addr, 1); 1957 + rc = set_memory_np(decoy_addr, 1); 1956 1958 if (rc) 1957 1959 pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); 1958 1960 return rc; 1959 1961 } 1960 1962 1963 + static int set_memory_present(unsigned long *addr, int numpages) 1964 + { 1965 + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_PRESENT), 0); 1966 + } 1967 + 1961 1968 /* Restore full speculative operation to the pfn. */ 1962 1969 int clear_mce_nospec(unsigned long pfn) 1963 1970 { 1964 - return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1); 1971 + unsigned long addr = (unsigned long) pfn_to_kaddr(pfn); 1972 + 1973 + return set_memory_present(&addr, 1); 1965 1974 } 1966 1975 EXPORT_SYMBOL_GPL(clear_mce_nospec); 1967 1976 #endif /* CONFIG_X86_64 */
+8 -24
drivers/nvdimm/pmem.c
··· 158 158 struct page *page, unsigned int page_off, 159 159 sector_t sector, unsigned int len) 160 160 { 161 - blk_status_t rc = BLK_STS_OK; 162 - bool bad_pmem = false; 163 161 phys_addr_t pmem_off = sector * 512 + pmem->data_offset; 164 162 void *pmem_addr = pmem->virt_addr + pmem_off; 165 163 166 - if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) 167 - bad_pmem = true; 164 + if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) { 165 + blk_status_t rc = pmem_clear_poison(pmem, pmem_off, len); 168 166 169 - /* 170 - * Note that we write the data both before and after 171 - * clearing poison. The write before clear poison 172 - * handles situations where the latest written data is 173 - * preserved and the clear poison operation simply marks 174 - * the address range as valid without changing the data. 175 - * In this case application software can assume that an 176 - * interrupted write will either return the new good 177 - * data or an error. 178 - * 179 - * However, if pmem_clear_poison() leaves the data in an 180 - * indeterminate state we need to perform the write 181 - * after clear poison. 182 - */ 183 - flush_dcache_page(page); 184 - write_pmem(pmem_addr, page, page_off, len); 185 - if (unlikely(bad_pmem)) { 186 - rc = pmem_clear_poison(pmem, pmem_off, len); 187 - write_pmem(pmem_addr, page, page_off, len); 167 + if (rc != BLK_STS_OK) 168 + return rc; 188 169 } 189 170 190 - return rc; 171 + flush_dcache_page(page); 172 + write_pmem(pmem_addr, page, page_off, len); 173 + 174 + return BLK_STS_OK; 191 175 } 192 176 193 177 static void pmem_submit_bio(struct bio *bio)
+2 -2
include/linux/set_memory.h
··· 43 43 #endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */ 44 44 45 45 #ifdef CONFIG_X86_64 46 - int set_mce_nospec(unsigned long pfn, bool unmap); 46 + int set_mce_nospec(unsigned long pfn); 47 47 int clear_mce_nospec(unsigned long pfn); 48 48 #else 49 - static inline int set_mce_nospec(unsigned long pfn, bool unmap) 49 + static inline int set_mce_nospec(unsigned long pfn) 50 50 { 51 51 return 0; 52 52 }