Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

IB/mlx5: Reduce IMR KSM size when 5-level paging is enabled

Enabling 5-level paging (LA57) increases TASK_SIZE on x86_64 from 2^47
to 2^56. This affects implicit ODP, which uses TASK_SIZE to calculate
the number of IMR KSM entries.

As a result, the number of entries and the memory usage for KSM mkeys
increase drastically:

- With 2^47 TASK_SIZE: 0x20000 entries (~2MB)
- With 2^56 TASK_SIZE: 0x4000000 entries (~1GB)
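
The arithmetic behind these numbers, as a standalone sketch (illustrative
userspace code, not part of the patch; the 16-byte entry size matches
struct mlx5_klm/mlx5_ksm in include/linux/mlx5/qp.h):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t mtt_size = 1ULL << 30;         /* 1GB covered per entry */
		uint64_t e47 = (1ULL << 47) / mtt_size; /* 0x20000 entries */
		uint64_t e56 = (1ULL << 56) / mtt_size; /* 0x4000000 entries */

		/* 16 bytes per entry: ~2MB vs ~1GB of KSM mkey memory */
		printf("2^47: %#llx entries, %llu MB\n",
		       (unsigned long long)e47,
		       (unsigned long long)(e47 * 16 >> 20));
		printf("2^56: %#llx entries, %llu MB\n",
		       (unsigned long long)e56,
		       (unsigned long long)(e56 * 16 >> 20));
		return 0;
	}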

This issue could previously happen on systems with LA57 manually enabled,
but commit 7212b58d6d71 ("x86/mm/64: Make 5-level paging support
unconditional") now enables LA57 by default on all supported systems,
making the impact widespread.

To mitigate this, increase the size each child MTT maps from 1GB to 16GB
when 5-level paging is enabled. This reduces the number of KSM entries and
lowers the memory usage per IMR on LA57 systems from 1GB to 64MB.
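
For example, with the 16GB step on an LA57 system the same arithmetic
gives:

	2^56 / 2^34 = 2^22 = 0x400000 KSM entries
	0x400000 entries * 16 bytes = 64MB per IMR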

Since 'mlx5_imr_mtt_size' no longer fits in 32 bits, switch from int to
u64 in populate_klm() to prevent overflow of the 'step' variable.
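
A minimal illustration of the hazard (a sketch only, not the actual
kernel code):

	/* 2^34 (16GB) no longer fits in a 32-bit int: the conversion
	 * truncates (to 0 here), so 'va += step' would stop advancing. */
	int old_step = (int)(1ULL << 34);	/* truncated */
	u64 new_step = 1ULL << 34;		/* 0x400000000, as intended */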

In addition, since populate_klm() actually handles KSM rather than KLM
(it is used only by implicit ODP), rename it and its internal structures
accordingly, and drop the byte_count handling, which is not relevant for
KSM: the page size in KSM is fixed for all entries and comes from the
mkey's log_page_size.
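
For reference, the two entry layouts as defined in include/linux/mlx5/qp.h
(KSM entries carry no per-entry byte count, which is why that handling can
go away):

	struct mlx5_klm {
		__be32		bcount;
		__be32		key;
		__be64		va;
	};

	struct mlx5_ksm {
		__be32		reserved;
		__be32		key;
		__be64		va;
	};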

Note:
On platforms where the calculated 'mlx5_imr_ksm_page_shift' exceeds the
maximum the firmware allows to be changed over UMR, or where the
calculated 'log_va_pages' is higher than expected, the implicit ODP
capability is simply turned off.
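
For instance, on x86-64 with LA57 (PAGE_SHIFT = 12), log_va_pages falls in
the "<= 56 - PAGE_SHIFT" bucket, so 'mlx5_imr_ksm_page_shift' becomes 34;
if the firmware's maximum KSM entity size settable over UMR is below 2^34,
IB_ODP_SUPPORT_IMPLICIT is simply not advertised. Likewise, if
log_va_pages exceeds 56 - PAGE_SHIFT, mlx5_ib_odp_init() returns early
with 'mlx5_imr_ksm_entries' left at 0 and the capability check fails.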

Co-developed-by: Or Har-Toov <ohartoov@nvidia.com>
Signed-off-by: Or Har-Toov <ohartoov@nvidia.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
Signed-off-by: Edward Srouji <edwards@nvidia.com>
Link: https://patch.msgid.link/20251120-reduce-ksm-v1-1-6864bfc814dc@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>

Authored by Yishai Hadas, committed by Leon Romanovsky
6dbd547a a26c4c7c

+52 -39
drivers/infiniband/hw/mlx5/odp.c
···
 	 * a pagefault. */
 #define MMU_NOTIFIER_TIMEOUT 1000
 
-#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
-#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
-#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
-#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
-#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))
-
-#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT
-
 static u64 mlx5_imr_ksm_entries;
+static u64 mlx5_imr_mtt_entries;
+static u64 mlx5_imr_mtt_size;
+static u8 mlx5_imr_mtt_shift;
+static u8 mlx5_imr_ksm_page_shift;
 
-static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
+static void populate_ksm(struct mlx5_ksm *pksm, size_t idx, size_t nentries,
 			 struct mlx5_ib_mr *imr, int flags)
 {
 	struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev;
-	struct mlx5_klm *end = pklm + nentries;
-	int step = MLX5_CAP_ODP(dev, mem_page_fault) ? MLX5_IMR_MTT_SIZE : 0;
+	struct mlx5_ksm *end = pksm + nentries;
+	u64 step = MLX5_CAP_ODP(dev, mem_page_fault) ? mlx5_imr_mtt_size : 0;
 	__be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ?
 			     cpu_to_be32(imr->null_mmkey.key) :
 			     mr_to_mdev(imr)->mkeys.null_mkey;
 	u64 va =
-		MLX5_CAP_ODP(dev, mem_page_fault) ? idx * MLX5_IMR_MTT_SIZE : 0;
+		MLX5_CAP_ODP(dev, mem_page_fault) ? idx * mlx5_imr_mtt_size : 0;
 
 	if (flags & MLX5_IB_UPD_XLT_ZAP) {
-		for (; pklm != end; pklm++, idx++, va += step) {
-			pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
-			pklm->key = key;
-			pklm->va = cpu_to_be64(va);
+		for (; pksm != end; pksm++, idx++, va += step) {
+			pksm->key = key;
+			pksm->va = cpu_to_be64(va);
 		}
 		return;
 	}
···
 	 */
 	lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
 
-	for (; pklm != end; pklm++, idx++, va += step) {
+	for (; pksm != end; pksm++, idx++, va += step) {
 		struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
 
-		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
 		if (mtt) {
-			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
-			pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
+			pksm->key = cpu_to_be32(mtt->ibmr.lkey);
+			pksm->va = cpu_to_be64(idx * mlx5_imr_mtt_size);
 		} else {
-			pklm->key = key;
-			pklm->va = cpu_to_be64(va);
+			pksm->key = key;
+			pksm->va = cpu_to_be64(va);
 		}
 	}
 }
···
 			struct mlx5_ib_mr *mr, int flags)
 {
 	if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
-		populate_klm(xlt, idx, nentries, mr, flags);
+		populate_ksm(xlt, idx, nentries, mr, flags);
 		return 0;
 	} else {
 		return populate_mtt(xlt, idx, nentries, mr, flags);
···
 
 	mutex_lock(&odp_imr->umem_mutex);
 	mlx5r_umr_update_xlt(mr->parent,
-			     ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT, 1, 0,
+			     ib_umem_start(odp) >> mlx5_imr_mtt_shift, 1, 0,
 			     MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC);
 	mutex_unlock(&odp_imr->umem_mutex);
 	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
···
 static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
 {
 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
-	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
+	unsigned long idx = ib_umem_start(odp) >> mlx5_imr_mtt_shift;
 	struct mlx5_ib_mr *imr = mr->parent;
 
 	/*
···
 	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
 	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
 	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
-	    !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
+	    !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled) &&
+	    mlx5_imr_ksm_entries != 0 &&
+	    !(mlx5_imr_ksm_page_shift >
+	      get_max_log_entity_size_cap(dev, MLX5_MKC_ACCESS_MODE_KSM)))
 		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
 }
 
···
 	int err;
 
 	odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
-				      idx * MLX5_IMR_MTT_SIZE,
-				      MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
+				      idx * mlx5_imr_mtt_size,
+				      mlx5_imr_mtt_size, &mlx5_mn_ops);
 	if (IS_ERR(odp))
 		return ERR_CAST(odp);
 
 	mr = mlx5_mr_cache_alloc(dev, imr->access_flags,
 				 MLX5_MKC_ACCESS_MODE_MTT,
-				 MLX5_IMR_MTT_ENTRIES);
+				 mlx5_imr_mtt_entries);
 	if (IS_ERR(mr)) {
 		ib_umem_odp_release(odp);
 		return mr;
···
 	mr->umem = &odp->umem;
 	mr->ibmr.lkey = mr->mmkey.key;
 	mr->ibmr.rkey = mr->mmkey.key;
-	mr->ibmr.iova = idx * MLX5_IMR_MTT_SIZE;
+	mr->ibmr.iova = idx * mlx5_imr_mtt_size;
 	mr->parent = imr;
 	odp->private = mr;
···
 	refcount_set(&mr->mmkey.usecount, 2);
 
 	err = mlx5r_umr_update_xlt(mr, 0,
-				   MLX5_IMR_MTT_ENTRIES,
+				   mlx5_imr_mtt_entries,
 				   PAGE_SHIFT,
 				   MLX5_IB_UPD_XLT_ZAP |
 				   MLX5_IB_UPD_XLT_ENABLE);
···
 	struct mlx5_ib_mr *imr;
 	int err;
 
-	if (!mlx5r_umr_can_load_pas(dev, MLX5_IMR_MTT_ENTRIES * PAGE_SIZE))
+	if (!mlx5r_umr_can_load_pas(dev, mlx5_imr_mtt_entries * PAGE_SIZE))
 		return ERR_PTR(-EOPNOTSUPP);
 
 	umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
···
 
 	err = mlx5r_umr_update_xlt(imr, 0,
 				   mlx5_imr_ksm_entries,
-				   MLX5_KSM_PAGE_SHIFT,
+				   mlx5_imr_ksm_page_shift,
 				   MLX5_IB_UPD_XLT_INDIRECT |
 				   MLX5_IB_UPD_XLT_ZAP |
 				   MLX5_IB_UPD_XLT_ENABLE);
···
 			 struct ib_umem_odp *odp_imr, u64 user_va,
 			 size_t bcnt, u32 *bytes_mapped, u32 flags)
 {
-	unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
+	unsigned long end_idx = (user_va + bcnt - 1) >> mlx5_imr_mtt_shift;
 	unsigned long upd_start_idx = end_idx + 1;
 	unsigned long upd_len = 0;
 	unsigned long npages = 0;
 	int err;
 	int ret;
 
-	if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
-		     mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
+	if (unlikely(user_va >= mlx5_imr_ksm_entries * mlx5_imr_mtt_size ||
+		     mlx5_imr_ksm_entries * mlx5_imr_mtt_size - user_va < bcnt))
 		return -EFAULT;
 
 	/* Fault each child mr that intersects with our interval. */
 	while (bcnt) {
-		unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
+		unsigned long idx = user_va >> mlx5_imr_mtt_shift;
 		struct ib_umem_odp *umem_odp;
 		struct mlx5_ib_mr *mtt;
 		u64 len;
···
 
 int mlx5_ib_odp_init(void)
 {
-	mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
-				       MLX5_IMR_MTT_BITS);
+	u32 log_va_pages = ilog2(TASK_SIZE) - PAGE_SHIFT;
+	u8 mlx5_imr_mtt_bits;
 
+	/* 48 is default ARM64 VA space and covers X86 4-level paging which is 47 */
+	if (log_va_pages <= 48 - PAGE_SHIFT)
+		mlx5_imr_mtt_shift = 30;
+	/* 56 is x86-64, 5-level paging */
+	else if (log_va_pages <= 56 - PAGE_SHIFT)
+		mlx5_imr_mtt_shift = 34;
+	else
+		return 0;
+
+	mlx5_imr_mtt_size = BIT_ULL(mlx5_imr_mtt_shift);
+	mlx5_imr_mtt_bits = mlx5_imr_mtt_shift - PAGE_SHIFT;
+	mlx5_imr_mtt_entries = BIT_ULL(mlx5_imr_mtt_bits);
+	mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
+				       mlx5_imr_mtt_bits);
+
+	mlx5_imr_ksm_page_shift = mlx5_imr_mtt_shift;
 	return 0;
 }