Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

acpi/nfit, device-dax: Identify differentiated memory with a unique numa-node

Persistent memory, as described by the ACPI NFIT (NVDIMM Firmware
Interface Table), is the first known instance of a memory range
described by a unique "target" proximity domain. Where "initiator" and
"target" proximity domains are an approach that the ACPI HMAT
(Heterogeneous Memory Attributes Table) uses to describe the unique
performance properties of a memory range relative to a given initiator
(e.g. CPU or DMA device).

Currently the numa-node for a /dev/pmemX block-device or /dev/daxX.Y
char-device follows the traditional notion of 'numa-node' where the
attribute conveys the closest online numa-node. That numa-node attribute
is useful for cpu-binding and memory-binding processes *near* the
device. However, when the memory range backing a 'pmem', or 'dax' device
is onlined (memory hot-add) the memory-only-numa-node representing that
address needs to be differentiated from the set of online nodes. In
other words, the numa-node association of the device depends on whether
you can bind processes *near* the cpu-numa-node in the offline
device-case, or bind processes *on* the memory-range directly after the
backing address range is onlined.

Allow for the case that platform firmware describes persistent memory
with a unique proximity domain, i.e. when it is distinct from the
proximity of DRAM and CPUs that are on the same socket. Plumb the Linux
numa-node translation of that proximity through the libnvdimm region
device to namespaces that are in device-dax mode. With this in place the
proposed kmem driver [1] can optionally discover a unique numa-node
number for the address range as it transitions the memory from an
offline state managed by a device-driver to an online memory range
managed by the core-mm.

[1]: https://lore.kernel.org/lkml/20181022201317.8558C1D8@viggo.jf.intel.com

Reported-by: Fan Du <fan.du@intel.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Oliver O'Halloran" <oohall@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Reviewed-by: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

+30 -6
+1
arch/powerpc/platforms/pseries/papr_scm.c
··· 236 236 memset(&ndr_desc, 0, sizeof(ndr_desc)); 237 237 ndr_desc.attr_groups = region_attr_groups; 238 238 ndr_desc.numa_node = dev_to_node(&p->pdev->dev); 239 + ndr_desc.target_node = ndr_desc.numa_node; 239 240 ndr_desc.res = &p->res; 240 241 ndr_desc.of_node = p->dn; 241 242 ndr_desc.provider_data = p;
+6 -2
drivers/acpi/nfit/core.c
··· 2869 2869 ndr_desc->res = &res; 2870 2870 ndr_desc->provider_data = nfit_spa; 2871 2871 ndr_desc->attr_groups = acpi_nfit_region_attribute_groups; 2872 - if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) 2872 + if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) { 2873 2873 ndr_desc->numa_node = acpi_map_pxm_to_online_node( 2874 2874 spa->proximity_domain); 2875 - else 2875 + ndr_desc->target_node = acpi_map_pxm_to_node( 2876 + spa->proximity_domain); 2877 + } else { 2876 2878 ndr_desc->numa_node = NUMA_NO_NODE; 2879 + ndr_desc->target_node = NUMA_NO_NODE; 2880 + } 2877 2881 2878 2882 /* 2879 2883 * Persistence domain bits are hierarchical, if
+1
drivers/acpi/numa.c
··· 84 84 85 85 return node; 86 86 } 87 + EXPORT_SYMBOL(acpi_map_pxm_to_node); 87 88 88 89 /** 89 90 * acpi_map_pxm_to_online_node - Map proximity ID to online node
+3 -1
drivers/dax/bus.c
··· 214 214 } 215 215 216 216 struct dax_region *alloc_dax_region(struct device *parent, int region_id, 217 - struct resource *res, unsigned int align, 217 + struct resource *res, int target_node, unsigned int align, 218 218 unsigned long pfn_flags) 219 219 { 220 220 struct dax_region *dax_region; ··· 244 244 dax_region->id = region_id; 245 245 dax_region->align = align; 246 246 dax_region->dev = parent; 247 + dax_region->target_node = target_node; 247 248 if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { 248 249 kfree(dax_region); 249 250 return NULL; ··· 349 348 350 349 dev_dax->dax_dev = dax_dev; 351 350 dev_dax->region = dax_region; 351 + dev_dax->target_node = dax_region->target_node; 352 352 kref_get(&dax_region->kref); 353 353 354 354 inode = dax_inode(dax_dev);
+2 -1
drivers/dax/bus.h
··· 10 10 struct dax_region; 11 11 void dax_region_put(struct dax_region *dax_region); 12 12 struct dax_region *alloc_dax_region(struct device *parent, int region_id, 13 - struct resource *res, unsigned int align, unsigned long flags); 13 + struct resource *res, int target_node, unsigned int align, 14 + unsigned long flags); 14 15 15 16 enum dev_dax_subsys { 16 17 DEV_DAX_BUS,
+4
drivers/dax/dax-private.h
··· 26 26 /** 27 27 * struct dax_region - mapping infrastructure for dax devices 28 28 * @id: kernel-wide unique region for a memory range 29 + * @target_node: effective numa node if this memory range is onlined 29 30 * @kref: to pin while other agents have a need to do lookups 30 31 * @dev: parent device backing this region 31 32 * @align: allocation and mapping alignment for child dax devices ··· 35 34 */ 36 35 struct dax_region { 37 36 int id; 37 + int target_node; 38 38 struct kref kref; 39 39 struct device *dev; 40 40 unsigned int align; ··· 48 46 * data while the device is activated in the driver. 49 47 * @region - parent region 50 48 * @dax_dev - core dax functionality 49 + * @target_node: effective numa node if dev_dax memory range is onlined 51 50 * @dev - device core 52 51 * @pgmap - pgmap for memmap setup / lifetime (driver owned) 53 52 * @ref: pgmap reference count (driver owned) ··· 57 54 struct dev_dax { 58 55 struct dax_region *region; 59 56 struct dax_device *dax_dev; 57 + int target_node; 60 58 struct device dev; 61 59 struct dev_pagemap pgmap; 62 60 struct percpu_ref ref;
+3 -1
drivers/dax/pmem/core.c
··· 20 20 struct nd_namespace_common *ndns; 21 21 struct nd_dax *nd_dax = to_nd_dax(dev); 22 22 struct nd_pfn *nd_pfn = &nd_dax->nd_pfn; 23 + struct nd_region *nd_region = to_nd_region(dev->parent); 23 24 24 25 ndns = nvdimm_namespace_common_probe(dev); 25 26 if (IS_ERR(ndns)) ··· 53 52 memcpy(&res, &pgmap.res, sizeof(res)); 54 53 res.start += offset; 55 54 dax_region = alloc_dax_region(dev, region_id, &res, 56 - le32_to_cpu(pfn_sb->align), PFN_DEV|PFN_MAP); 55 + nd_region->target_node, le32_to_cpu(pfn_sb->align), 56 + PFN_DEV|PFN_MAP); 57 57 if (!dax_region) 58 58 return ERR_PTR(-ENOMEM); 59 59
+1
drivers/nvdimm/e820.c
··· 47 47 ndr_desc.res = res; 48 48 ndr_desc.attr_groups = e820_pmem_region_attribute_groups; 49 49 ndr_desc.numa_node = e820_range_to_nid(res->start); 50 + ndr_desc.target_node = ndr_desc.numa_node; 50 51 set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); 51 52 if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) 52 53 return -ENXIO;
+1 -1
drivers/nvdimm/nd.h
··· 153 153 u16 ndr_mappings; 154 154 u64 ndr_size; 155 155 u64 ndr_start; 156 - int id, num_lanes, ro, numa_node; 156 + int id, num_lanes, ro, numa_node, target_node; 157 157 void *provider_data; 158 158 struct kernfs_node *bb_state; 159 159 struct badblocks bb;
+1
drivers/nvdimm/of_pmem.c
··· 68 68 memset(&ndr_desc, 0, sizeof(ndr_desc)); 69 69 ndr_desc.attr_groups = region_attr_groups; 70 70 ndr_desc.numa_node = dev_to_node(&pdev->dev); 71 + ndr_desc.target_node = ndr_desc.numa_node; 71 72 ndr_desc.res = &pdev->resource[i]; 72 73 ndr_desc.of_node = np; 73 74 set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
+1
drivers/nvdimm/region_devs.c
··· 1065 1065 nd_region->flags = ndr_desc->flags; 1066 1066 nd_region->ro = ro; 1067 1067 nd_region->numa_node = ndr_desc->numa_node; 1068 + nd_region->target_node = ndr_desc->target_node; 1068 1069 ida_init(&nd_region->ns_ida); 1069 1070 ida_init(&nd_region->btt_ida); 1070 1071 ida_init(&nd_region->pfn_ida);
+5
include/linux/acpi.h
··· 400 400 401 401 #ifdef CONFIG_ACPI_NUMA 402 402 int acpi_map_pxm_to_online_node(int pxm); 403 + int acpi_map_pxm_to_node(int pxm); 403 404 int acpi_get_node(acpi_handle handle); 404 405 #else 405 406 static inline int acpi_map_pxm_to_online_node(int pxm) 407 + { 408 + return 0; 409 + } 410 + static inline int acpi_map_pxm_to_node(int pxm) 406 411 { 407 412 return 0; 408 413 }
+1
include/linux/libnvdimm.h
··· 128 128 void *provider_data; 129 129 int num_lanes; 130 130 int numa_node; 131 + int target_node; 131 132 unsigned long flags; 132 133 struct device_node *of_node; 133 134 };