
mm/devm_memremap_pages: fix final page put race

Logan noticed that devm_memremap_pages_release() kills the percpu_ref,
drops all the page references that were acquired at init, and then
immediately proceeds to unplug, via arch_remove_memory(), the backing
pages for the pagemap. If for some reason device shutdown actually
collides with a busy / elevated-ref-count page, then arch_remove_memory()
should be deferred until after that reference is dropped.

As it stands the "wait for last page ref drop" happens *after*
devm_memremap_pages_release() returns, which is obviously too late and
can lead to crashes.

Fix this situation by moving the responsibility to wait for the
percpu_ref to go idle into devm_memremap_pages() itself, via a new
->cleanup() callback. Implement the new cleanup callback for all
devm_memremap_pages() users: pmem, devdax, hmm, and p2pdma.
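
Concretely, every devm_memremap_pages() user now wires up the same trio
of fields before mapping. Below is a condensed sketch of the new
contract, modeled on the devdax conversion further down; struct my_dev
and the my_* names are illustrative, not identifiers from the tree, and
the percpu_ref release callback (which would signal the completion) is
elided:

	/*
	 * Sketch only: my_dev / my_* are hypothetical. The ref's release
	 * callback is assumed to call complete(&md->cmp).
	 */
	struct my_dev {
		struct percpu_ref ref;
		struct completion cmp;
		struct dev_pagemap pgmap;
	};

	static void my_percpu_kill(struct percpu_ref *ref)
	{
		percpu_ref_kill(ref);	/* no new references after this */
	}

	static void my_percpu_exit(struct percpu_ref *ref)
	{
		struct my_dev *md = container_of(ref, struct my_dev, ref);

		/*
		 * Wait for the last page reference to drop, then reap the
		 * ref. devm_memremap_pages_release() now invokes this
		 * *before* it unplugs the backing pages.
		 */
		wait_for_completion(&md->cmp);
		percpu_ref_exit(ref);
	}

	static int my_probe(struct device *dev, struct my_dev *md)
	{
		void *addr;

		md->pgmap.ref = &md->ref;
		md->pgmap.kill = my_percpu_kill;
		md->pgmap.cleanup = my_percpu_exit;	/* new, now mandatory */
		addr = devm_memremap_pages(dev, &md->pgmap);
		if (IS_ERR(addr))
			return PTR_ERR(addr);	/* kill + cleanup already ran */
		return 0;
	}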

Link: http://lkml.kernel.org/r/155727339156.292046.5432007428235387859.stgit@dwillia2-desk3.amr.corp.intel.com
Fixes: 41e94a851304 ("add devm_memremap_pages")
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Dan Williams; committed by Linus Torvalds
50f44ee7 1570175a

7 files changed, +38 -44
+3 -10
drivers/dax/device.c
···
 	complete(&dev_dax->cmp);
 }
 
-static void dev_dax_percpu_exit(void *data)
+static void dev_dax_percpu_exit(struct percpu_ref *ref)
 {
-	struct percpu_ref *ref = data;
 	struct dev_dax *dev_dax = ref_to_dev_dax(ref);
 
 	dev_dbg(&dev_dax->dev, "%s\n", __func__);
···
 	if (rc)
 		return rc;
 
-	rc = devm_add_action_or_reset(dev, dev_dax_percpu_exit, &dev_dax->ref);
-	if (rc)
-		return rc;
-
 	dev_dax->pgmap.ref = &dev_dax->ref;
 	dev_dax->pgmap.kill = dev_dax_percpu_kill;
+	dev_dax->pgmap.cleanup = dev_dax_percpu_exit;
 	addr = devm_memremap_pages(dev, &dev_dax->pgmap);
-	if (IS_ERR(addr)) {
-		devm_remove_action(dev, dev_dax_percpu_exit, &dev_dax->ref);
-		percpu_ref_exit(&dev_dax->ref);
+	if (IS_ERR(addr))
 		return PTR_ERR(addr);
-	}
 
 	inode = dax_inode(dax_dev);
 	cdev = inode->i_cdev;
+13 -4
drivers/nvdimm/pmem.c
···
 	NULL,
 };
 
-static void pmem_release_queue(void *q)
+static void __pmem_release_queue(struct percpu_ref *ref)
 {
+	struct request_queue *q;
+
+	q = container_of(ref, typeof(*q), q_usage_counter);
 	blk_cleanup_queue(q);
+}
+
+static void pmem_release_queue(void *ref)
+{
+	__pmem_release_queue(ref);
 }
 
 static void pmem_freeze_queue(struct percpu_ref *ref)
···
 	if (!q)
 		return -ENOMEM;
 
-	if (devm_add_action_or_reset(dev, pmem_release_queue, q))
-		return -ENOMEM;
-
 	pmem->pfn_flags = PFN_DEV;
 	pmem->pgmap.ref = &q->q_usage_counter;
 	pmem->pgmap.kill = pmem_freeze_queue;
+	pmem->pgmap.cleanup = __pmem_release_queue;
 	if (is_nd_pfn(dev)) {
 		if (setup_pagemap_fsdax(dev, &pmem->pgmap))
 			return -ENOMEM;
···
 		pmem->pfn_flags |= PFN_MAP;
 		memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
 	} else {
+		if (devm_add_action_or_reset(dev, pmem_release_queue,
+					&q->q_usage_counter))
+			return -ENOMEM;
 		addr = devm_memremap(dev, pmem->phys_addr,
 				pmem->size, ARCH_MEMREMAP_PMEM);
 		memcpy(&bb_res, &nsio->res, sizeof(bb_res));
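
Note that pmem differs from the other conversions: its plain
devm_memremap() branch has no pagemap and therefore no ->cleanup()
invocation, so the queue release is still registered as a devm action on
that path, with a void * wrapper letting one implementation serve both.
The container_of() step that recovers the request_queue from its
embedded q_usage_counter can be demonstrated as ordinary userspace C
(the stand-in types below are illustrative, not kernel code):

	#include <stddef.h>
	#include <stdio.h>

	/* Userspace stand-ins for the kernel types involved. */
	struct percpu_ref { long count; };

	struct request_queue {
		int id;
		struct percpu_ref q_usage_counter;	/* embedded, as in the kernel */
	};

	/* Same pointer arithmetic the kernel macro does, minus type checks. */
	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	/* Mirrors __pmem_release_queue(): recover the queue from its ref. */
	static void release_queue(struct percpu_ref *ref)
	{
		struct request_queue *q =
			container_of(ref, struct request_queue, q_usage_counter);

		printf("releasing queue %d\n", q->id);
	}

	/* Mirrors pmem_release_queue(): the void * devm-action wrapper. */
	static void release_queue_action(void *ref)
	{
		release_queue(ref);
	}

	int main(void)
	{
		struct request_queue q = { .id = 7 };

		release_queue(&q.q_usage_counter);		/* ->cleanup() path */
		release_queue_action(&q.q_usage_counter);	/* devm action path */
		return 0;
	}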
+3 -14
drivers/pci/p2pdma.c
···
 	percpu_ref_kill(ref);
 }
 
-static void pci_p2pdma_percpu_cleanup(void *ref)
+static void pci_p2pdma_percpu_cleanup(struct percpu_ref *ref)
 {
 	struct p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(ref);
 
···
 	if (error)
 		goto pgmap_free;
 
-	/*
-	 * FIXME: the percpu_ref_exit needs to be coordinated internal
-	 * to devm_memremap_pages_release(). Duplicate the same ordering
-	 * as other devm_memremap_pages() users for now.
-	 */
-	error = devm_add_action(&pdev->dev, pci_p2pdma_percpu_cleanup,
-			&p2p_pgmap->ref);
-	if (error)
-		goto ref_cleanup;
-
 	pgmap = &p2p_pgmap->pgmap;
 
 	pgmap->res.start = pci_resource_start(pdev, bar) + offset;
···
 	pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) -
 		pci_resource_start(pdev, bar);
 	pgmap->kill = pci_p2pdma_percpu_kill;
+	pgmap->cleanup = pci_p2pdma_percpu_cleanup;
 
 	addr = devm_memremap_pages(&pdev->dev, pgmap);
 	if (IS_ERR(addr)) {
 		error = PTR_ERR(addr);
-		goto ref_exit;
+		goto pgmap_free;
 	}
 
 	error = gen_pool_add_owner(pdev->p2pdma->pool, (unsigned long)addr,
···
 
 pages_free:
 	devm_memunmap_pages(&pdev->dev, pgmap);
-ref_cleanup:
-	percpu_ref_exit(&p2p_pgmap->ref);
 pgmap_free:
 	devm_kfree(&pdev->dev, p2p_pgmap);
 	return error;
+2
include/linux/memremap.h
···
  * @res: physical address range covered by @ref
  * @ref: reference count that pins the devm_memremap_pages() mapping
  * @kill: callback to transition @ref to the dead state
+ * @cleanup: callback to wait for @ref to be idle and reap it
  * @dev: host device of the mapping for debug
  * @data: private data pointer for page_free()
  * @type: memory type: see MEMORY_* in memory_hotplug.h
···
 	struct resource res;
 	struct percpu_ref *ref;
 	void (*kill)(struct percpu_ref *ref);
+	void (*cleanup)(struct percpu_ref *ref);
 	struct device *dev;
 	void *data;
 	enum memory_type type;
+12 -5
kernel/memremap.c
···
 	pgmap->kill(pgmap->ref);
 	for_each_device_pfn(pfn, pgmap)
 		put_page(pfn_to_page(pfn));
+	pgmap->cleanup(pgmap->ref);
 
 	/* pages are dead and unused, undo the arch mapping */
 	align_start = res->start & ~(SECTION_SIZE - 1);
···
  * 2/ The altmap field may optionally be initialized, in which case altmap_valid
  *    must be set to true
  *
- * 3/ pgmap->ref must be 'live' on entry and will be killed at
- *    devm_memremap_pages_release() time, or if this routine fails.
+ * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped
+ *    at devm_memremap_pages_release() time, or if this routine fails.
  *
  * 4/ res is expected to be a host memory range that could feasibly be
  *    treated as a "System RAM" range, i.e. not a device mmio range, but
···
 	pgprot_t pgprot = PAGE_KERNEL;
 	int error, nid, is_ram;
 
-	if (!pgmap->ref || !pgmap->kill)
+	if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) {
+		WARN(1, "Missing reference count teardown definition\n");
 		return ERR_PTR(-EINVAL);
+	}
 
 	align_start = res->start & ~(SECTION_SIZE - 1);
 	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
···
 	if (conflict_pgmap) {
 		dev_WARN(dev, "Conflicting mapping in same section\n");
 		put_dev_pagemap(conflict_pgmap);
-		return ERR_PTR(-ENOMEM);
+		error = -ENOMEM;
+		goto err_array;
 	}
 
 	conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL);
 	if (conflict_pgmap) {
 		dev_WARN(dev, "Conflicting mapping in same section\n");
 		put_dev_pagemap(conflict_pgmap);
-		return ERR_PTR(-ENOMEM);
+		error = -ENOMEM;
+		goto err_array;
 	}
 
 	is_ram = region_intersects(align_start, align_size,
···
 	pgmap_array_delete(res);
 err_array:
 	pgmap->kill(pgmap->ref);
+	pgmap->cleanup(pgmap->ref);
+
 	return ERR_PTR(error);
 }
 EXPORT_SYMBOL_GPL(devm_memremap_pages);
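
Taken together, the core of the fix is a strict ordering in the release
path: kill the ref, drop the init-time page references, wait for any
remaining users, and only then unplug the memory. A condensed sketch of
devm_memremap_pages_release() after this patch (devres boilerplate and
the arch teardown details are elided, so this is not the verbatim
function):

	static void devm_memremap_pages_release(void *data)
	{
		struct dev_pagemap *pgmap = data;
		unsigned long pfn;

		/* 1/ forbid new references */
		pgmap->kill(pgmap->ref);

		/* 2/ drop the references taken at init */
		for_each_device_pfn(pfn, pgmap)
			put_page(pfn_to_page(pfn));

		/* 3/ wait out any busy / elevated-ref-count pages (the fix) */
		pgmap->cleanup(pgmap->ref);

		/* 4/ pages are dead and unused, undo the arch mapping */
		/* ... align_start/align_size, arch_remove_memory(), ... */
	}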
+3 -11
mm/hmm.c
···
 	complete(&devmem->completion);
 }
 
-static void hmm_devmem_ref_exit(void *data)
+static void hmm_devmem_ref_exit(struct percpu_ref *ref)
 {
-	struct percpu_ref *ref = data;
 	struct hmm_devmem *devmem;
 
 	devmem = container_of(ref, struct hmm_devmem, ref);
···
 	if (ret)
 		return ERR_PTR(ret);
 
-	ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, &devmem->ref);
-	if (ret)
-		return ERR_PTR(ret);
-
 	size = ALIGN(size, PA_SECTION_SIZE);
 	addr = min((unsigned long)iomem_resource.end,
 		   (1UL << MAX_PHYSMEM_BITS) - 1);
···
 	devmem->pagemap.ref = &devmem->ref;
 	devmem->pagemap.data = devmem;
 	devmem->pagemap.kill = hmm_devmem_ref_kill;
+	devmem->pagemap.cleanup = hmm_devmem_ref_exit;
 
 	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
 	if (IS_ERR(result))
···
 	if (ret)
 		return ERR_PTR(ret);
 
-	ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit,
-			&devmem->ref);
-	if (ret)
-		return ERR_PTR(ret);
-
 	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
 	devmem->pfn_last = devmem->pfn_first +
 			   (resource_size(devmem->resource) >> PAGE_SHIFT);
···
 	devmem->pagemap.ref = &devmem->ref;
 	devmem->pagemap.data = devmem;
 	devmem->pagemap.kill = hmm_devmem_ref_kill;
+	devmem->pagemap.cleanup = hmm_devmem_ref_exit;
 
 	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
 	if (IS_ERR(result))
+2
tools/testing/nvdimm/test/iomap.c
···
 {
 	struct dev_pagemap *pgmap = _pgmap;
 
+	WARN_ON(!pgmap || !pgmap->ref || !pgmap->kill || !pgmap->cleanup);
 	pgmap->kill(pgmap->ref);
+	pgmap->cleanup(pgmap->ref);
 }
 
 void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)