Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc/pseries/iommu: memory notifier incorrectly adds TCEs for pmemory

iommu_mem_notifier() is invoked when RAM is dynamically added/removed. This
notifier is responsible for adding TCEs to, and removing them from, the
Dynamic DMA Window (DDW) when TCEs are pre-mapped. TCEs are pre-mapped only
for RAM and not for persistent memory (pmemory). For DMA buffers in pmemory,
TCEs are dynamically mapped when the device driver requests it.

The issue is that the 'daxctl' command is capable of adding pmemory as
"System RAM" after LPAR boot. The command to do so is:

daxctl reconfigure-device --mode=system-ram dax0.0 --force

This dynamically adds the pmemory range to LPAR RAM, eventually invoking
iommu_mem_notifier(). The address range of pmemory is way beyond the Max
RAM that the LPAR can have, which means this range is beyond the DDW
created for the device at device initialization time.

As a result, when iommu_mem_notifier() tries to pre-map TCEs for the
pmemory range, the PHYP HCALL returns H_PARAMETER. This causes the daxctl
command to fail to add pmemory as RAM.

The solution is to not pre-map TCEs for pmemory.

Signed-off-by: Gaurav Batra <gbatra@linux.ibm.com>
Tested-by: Donet Tom <donettom@linux.ibm.com>
Reviewed-by: Donet Tom <donettom@linux.ibm.com>
Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com>
Link: https://patch.msgid.link/20250130183854.92258-1-gbatra@linux.ibm.com

authored by

Gaurav Batra and committed by
Madhavan Srinivasan
6aa989ab 67dfc119

+18 -14
+1
arch/powerpc/include/asm/mmzone.h
··· 29 29 #ifdef CONFIG_MEMORY_HOTPLUG 30 30 extern unsigned long max_pfn; 31 31 u64 memory_hotplug_max(void); 32 + u64 hot_add_drconf_memory_max(void); 32 33 #else 33 34 #define memory_hotplug_max() memblock_end_of_DRAM() 34 35 #endif
+1 -1
arch/powerpc/mm/numa.c
··· 1336 1336 return nid; 1337 1337 } 1338 1338 1339 - static u64 hot_add_drconf_memory_max(void) 1339 + u64 hot_add_drconf_memory_max(void) 1340 1340 { 1341 1341 struct device_node *memory = NULL; 1342 1342 struct device_node *dn = NULL;
+16 -13
arch/powerpc/platforms/pseries/iommu.c
··· 1285 1285 1286 1286 static phys_addr_t ddw_memory_hotplug_max(void) 1287 1287 { 1288 - resource_size_t max_addr = memory_hotplug_max(); 1289 - struct device_node *memory; 1288 + resource_size_t max_addr; 1290 1289 1291 - for_each_node_by_type(memory, "memory") { 1292 - struct resource res; 1293 - 1294 - if (of_address_to_resource(memory, 0, &res)) 1295 - continue; 1296 - 1297 - max_addr = max_t(resource_size_t, max_addr, res.end + 1); 1298 - } 1290 + #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) 1291 + max_addr = hot_add_drconf_memory_max(); 1292 + #else 1293 + max_addr = memblock_end_of_DRAM(); 1294 + #endif 1299 1295 1300 1296 return max_addr; 1301 1297 } ··· 1668 1672 1669 1673 if (direct_mapping) { 1670 1674 /* DDW maps the whole partition, so enable direct DMA mapping */ 1671 - ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, 1675 + ret = walk_system_ram_range(0, ddw_memory_hotplug_max() >> PAGE_SHIFT, 1672 1676 win64->value, tce_setrange_multi_pSeriesLP_walk); 1673 1677 if (ret) { 1674 1678 dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n", ··· 2423 2427 struct memory_notify *arg = data; 2424 2428 int ret = 0; 2425 2429 2430 + /* This notifier can get called when onlining persistent memory as well. 2431 + * TCEs are not pre-mapped for persistent memory. 
Persistent memory will 2432 + * always be above ddw_memory_hotplug_max() 2433 + */ 2434 + 2426 2435 switch (action) { 2427 2436 case MEM_GOING_ONLINE: 2428 2437 spin_lock(&dma_win_list_lock); 2429 2438 list_for_each_entry(window, &dma_win_list, list) { 2430 - if (window->direct) { 2439 + if (window->direct && (arg->start_pfn << PAGE_SHIFT) < 2440 + ddw_memory_hotplug_max()) { 2431 2441 ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn, 2432 2442 arg->nr_pages, window->prop); 2433 2443 } ··· 2445 2443 case MEM_OFFLINE: 2446 2444 spin_lock(&dma_win_list_lock); 2447 2445 list_for_each_entry(window, &dma_win_list, list) { 2448 - if (window->direct) { 2446 + if (window->direct && (arg->start_pfn << PAGE_SHIFT) < 2447 + ddw_memory_hotplug_max()) { 2449 2448 ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn, 2450 2449 arg->nr_pages, window->prop); 2451 2450 }