
IB/mlx5: Page faults handling infrastructure

* Refactor MR registration and cleanup, and fix reg_pages accounting.
* Create a work queue to handle page fault events in a kthread context.
* Register a fault handler to get events from the core for each QP.

The fault handler registered here is an empty stub in this patch; it is
implemented by a later patch in the series.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>

Authored by Haggai Eran, committed by Roland Dreier
6aec21f6 832a6b06
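For orientation, here is a condensed, self-contained sketch of the dispatch
pattern this patch introduces: the interrupt-context event handler copies the
page-fault event and queues a work item, and the work item later runs the (for
now empty) handler in kthread context. The names below (pagefault_event,
qp_odp_state, pfault_setup_qp, etc.) are simplified stand-ins for illustration
only; the driver's real definitions are in the mlx5_ib.h and odp.c hunks below.

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

/* Simplified stand-ins for the mlx5 structures (illustration only). */
struct pagefault_event {
	u32 qpn;
	u32 flags;
};

struct qp_pfault_work {
	struct work_struct	work;	/* runs in kthread context */
	struct pagefault_event	event;	/* copy of the hardware event */
};

struct qp_odp_state {
	spinlock_t		lock;			/* protects disable_page_faults */
	int			disable_page_faults;	/* set while the QP is in RESET/ERR */
	struct qp_pfault_work	pfault;			/* one per fault context in the real driver */
};

static struct workqueue_struct *pfault_wq;

/* Kthread context: resolve the fault; empty in this patch, filled in later. */
static void pfault_work_fn(struct work_struct *work)
{
	struct qp_pfault_work *w =
		container_of(work, struct qp_pfault_work, work);

	(void)w;	/* later patches page in the memory and resume the QP */
}

/* Interrupt context: copy the event and hand it off to the work queue. */
static void pfault_event_handler(struct qp_odp_state *qp,
				 const struct pagefault_event *ev)
{
	spin_lock(&qp->lock);
	if (!qp->disable_page_faults) {
		qp->pfault.event = *ev;
		queue_work(pfault_wq, &qp->pfault.work);
	}
	spin_unlock(&qp->lock);
}

/* Per-QP setup, analogous to mlx5_ib_odp_create_qp() in the diff below. */
static void pfault_setup_qp(struct qp_odp_state *qp)
{
	spin_lock_init(&qp->lock);
	qp->disable_page_faults = 1;	/* enabled once the QP leaves RESET */
	INIT_WORK(&qp->pfault.work, pfault_work_fn);
}

/* Module init/cleanup would create and destroy the single-threaded queue:
 *	pfault_wq = create_singlethread_workqueue("pfault_demo");
 *	destroy_workqueue(pfault_wq);
 */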

6 files changed, +295 -23
drivers/infiniband/hw/mlx5/main.c (+27 -4)

··· 864 864
 	struct mlx5_ib_dev *dev =
 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
 
-	return sprintf(buf, "%d\n", dev->mdev->priv.reg_pages);
+	return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
 }
 
 static ssize_t show_hca(struct device *device, struct device_attribute *attr,

··· 1389 1389
 		goto err_eqs;
 
 	mutex_init(&dev->cap_mask_mutex);
-	spin_lock_init(&dev->mr_lock);
 
 	err = create_dev_resources(&dev->devr);
 	if (err)
 		goto err_eqs;
 
-	err = ib_register_device(&dev->ib_dev, NULL);
+	err = mlx5_ib_odp_init_one(dev);
 	if (err)
 		goto err_rsrc;
+
+	err = ib_register_device(&dev->ib_dev, NULL);
+	if (err)
+		goto err_odp;
 
 	err = create_umr_res(dev);
 	if (err)

··· 1423 1420
 err_dev:
 	ib_unregister_device(&dev->ib_dev);
 
+err_odp:
+	mlx5_ib_odp_remove_one(dev);
+
 err_rsrc:
 	destroy_dev_resources(&dev->devr);
 

··· 1441 1435
 static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
 {
 	struct mlx5_ib_dev *dev = context;
+
 	ib_unregister_device(&dev->ib_dev);
 	destroy_umrc_res(dev);
+	mlx5_ib_odp_remove_one(dev);
 	destroy_dev_resources(&dev->devr);
 	free_comp_eqs(dev);
 	ib_dealloc_device(&dev->ib_dev);

··· 1458 1450
 
 static int __init mlx5_ib_init(void)
 {
+	int err;
+
 	if (deprecated_prof_sel != 2)
 		pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
 
-	return mlx5_register_interface(&mlx5_ib_interface);
+	err = mlx5_ib_odp_init();
+	if (err)
+		return err;
+
+	err = mlx5_register_interface(&mlx5_ib_interface);
+	if (err)
+		goto clean_odp;
+
+	return err;
+
+clean_odp:
+	mlx5_ib_odp_cleanup();
+	return err;
 }
 
 static void __exit mlx5_ib_cleanup(void)
 {
 	mlx5_unregister_interface(&mlx5_ib_interface);
+	mlx5_ib_odp_cleanup();
 }
 
 module_init(mlx5_ib_init);
drivers/infiniband/hw/mlx5/mlx5_ib.h (+65 -2)

··· 149 149
 	MLX5_QP_EMPTY
 };
 
+/*
+ * Connect-IB can trigger up to four concurrent pagefaults
+ * per-QP.
+ */
+enum mlx5_ib_pagefault_context {
+	MLX5_IB_PAGEFAULT_RESPONDER_READ,
+	MLX5_IB_PAGEFAULT_REQUESTOR_READ,
+	MLX5_IB_PAGEFAULT_RESPONDER_WRITE,
+	MLX5_IB_PAGEFAULT_REQUESTOR_WRITE,
+	MLX5_IB_PAGEFAULT_CONTEXTS
+};
+
+static inline enum mlx5_ib_pagefault_context
+	mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault)
+{
+	return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE);
+}
+
+struct mlx5_ib_pfault {
+	struct work_struct	work;
+	struct mlx5_pagefault	mpfault;
+};
+
 struct mlx5_ib_qp {
 	struct ib_qp		ibqp;
 	struct mlx5_core_qp	mqp;

··· 217 194
 
 	/* Store signature errors */
 	bool			signature_en;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	/*
+	 * A flag that is true for QP's that are in a state that doesn't
+	 * allow page faults, and shouldn't schedule any more faults.
+	 */
+	int			disable_page_faults;
+	/*
+	 * The disable_page_faults_lock protects a QP's disable_page_faults
+	 * field, allowing for a thread to atomically check whether the QP
+	 * allows page faults, and if so schedule a page fault.
+	 */
+	spinlock_t		disable_page_faults_lock;
+	struct mlx5_ib_pfault	pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS];
+#endif
 };
 
 struct mlx5_ib_cq_buf {

··· 430 392
 	struct umr_common	umrc;
 	/* sync used page count stats
 	 */
-	spinlock_t		mr_lock;
 	struct mlx5_ib_resources	devr;
 	struct mlx5_mr_cache	cache;
 	struct timer_list	delay_timer;
 	int			fill_delay;
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 	struct ib_odp_caps	odp_caps;
+	/*
+	 * Sleepable RCU that prevents destruction of MRs while they are still
+	 * being used by a page fault handler.
+	 */
+	struct srcu_struct	mr_srcu;
 #endif
 };
 

··· 617 575
 			    struct ib_mr_status *mr_status);
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+extern struct workqueue_struct *mlx5_ib_page_fault_wq;
+
 int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev);
-#else
+void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
+			       struct mlx5_ib_pfault *pfault);
+void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp);
+int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
+void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev);
+int __init mlx5_ib_odp_init(void);
+void mlx5_ib_odp_cleanup(void);
+void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp);
+void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp);
+
+#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
 {
 	return 0;
 }
+
+static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {}
+static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
+static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {}
+static inline int mlx5_ib_odp_init(void) { return 0; }
+static inline void mlx5_ib_odp_cleanup(void) {}
+static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {}
+static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {}
+
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
 static inline void init_query_mad(struct ib_smp *mad)
drivers/infiniband/hw/mlx5/mr.c (+32 -15)

··· 52 52
 static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
 #endif
 
+static int clean_mr(struct mlx5_ib_mr *mr);
+
 static int order2idx(struct mlx5_ib_dev *dev, int order)
 {
 	struct mlx5_mr_cache *cache = &dev->cache;

··· 1051 1049
 			mlx5_ib_dbg(dev, "cache empty for order %d", order);
 			mr = NULL;
 		}
+	} else if (access_flags & IB_ACCESS_ON_DEMAND) {
+		err = -EINVAL;
+		pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB");
+		goto error;
 	}
 
 	if (!mr)

··· 1070 1064
 
 	mr->umem = umem;
 	mr->npages = npages;
-	spin_lock(&dev->mr_lock);
-	dev->mdev->priv.reg_pages += npages;
-	spin_unlock(&dev->mr_lock);
+	atomic_add(npages, &dev->mdev->priv.reg_pages);
 	mr->ibmr.lkey = mr->mmr.key;
 	mr->ibmr.rkey = mr->mmr.key;
 

··· 1114 1110
 	return err;
 }
 
-int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
+static int clean_mr(struct mlx5_ib_mr *mr)
 {
-	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
-	struct mlx5_ib_mr *mr = to_mmr(ibmr);
-	struct ib_umem *umem = mr->umem;
-	int npages = mr->npages;
+	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
 	int umred = mr->umred;
 	int err;
 

··· 1136 1135
 		free_cached_mr(dev, mr);
 	}
 
-	if (umem) {
-		ib_umem_release(umem);
-		spin_lock(&dev->mr_lock);
-		dev->mdev->priv.reg_pages -= npages;
-		spin_unlock(&dev->mr_lock);
-	}
-
 	if (!umred)
 		kfree(mr);
+
+	return 0;
+}
+
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
+	struct mlx5_ib_mr *mr = to_mmr(ibmr);
+	int npages = mr->npages;
+	struct ib_umem *umem = mr->umem;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (umem)
+		/* Wait for all running page-fault handlers to finish. */
+		synchronize_srcu(&dev->mr_srcu);
+#endif
+
+	clean_mr(mr);
+
+	if (umem) {
+		ib_umem_release(umem);
+		atomic_sub(npages, &dev->mdev->priv.reg_pages);
+	}
 
 	return 0;
 }
drivers/infiniband/hw/mlx5/odp.c (+145)

··· 32 32
 
 #include "mlx5_ib.h"
 
+struct workqueue_struct *mlx5_ib_page_fault_wq;
+
 #define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do {	\
 	if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name)	\
 		ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name;	\

··· 59 57
 	 * such capabilities are supported so far. */
 out:
 	return err;
+}
+
+static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
+						   u32 key)
+{
+	u32 base_key = mlx5_base_mkey(key);
+	struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key);
+
+	if (!mmr || mmr->key != key)
+		return NULL;
+
+	return container_of(mmr, struct mlx5_ib_mr, mmr);
+}
+
+static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp,
+				      struct mlx5_ib_pfault *pfault,
+				      int error) {
+	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
+	int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn,
+					      pfault->mpfault.flags,
+					      error);
+	if (ret)
+		pr_err("Failed to resolve the page fault on QP 0x%x\n",
+		       qp->mqp.qpn);
+}
+
+void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
+			       struct mlx5_ib_pfault *pfault)
+{
+	u8 event_subtype = pfault->mpfault.event_subtype;
+
+	switch (event_subtype) {
+	default:
+		pr_warn("Invalid page fault event subtype: 0x%x\n",
+			event_subtype);
+		mlx5_ib_page_fault_resume(qp, pfault, 1);
+		break;
+	}
+}
+
+static void mlx5_ib_qp_pfault_action(struct work_struct *work)
+{
+	struct mlx5_ib_pfault *pfault = container_of(work,
+						     struct mlx5_ib_pfault,
+						     work);
+	enum mlx5_ib_pagefault_context context =
+		mlx5_ib_get_pagefault_context(&pfault->mpfault);
+	struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp,
+					     pagefaults[context]);
+	mlx5_ib_mr_pfault_handler(qp, pfault);
+}
+
+void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
+	qp->disable_page_faults = 1;
+	spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
+
+	/*
+	 * Note that at this point, we are guarenteed that no more
+	 * work queue elements will be posted to the work queue with
+	 * the QP we are closing.
+	 */
+	flush_workqueue(mlx5_ib_page_fault_wq);
+}
+
+void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
+	qp->disable_page_faults = 0;
+	spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
+}
+
+static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp,
+				   struct mlx5_pagefault *pfault)
+{
+	/*
+	 * Note that we will only get one fault event per QP per context
+	 * (responder/initiator, read/write), until we resolve the page fault
+	 * with the mlx5_ib_page_fault_resume command. Since this function is
+	 * called from within the work element, there is no risk of missing
+	 * events.
+	 */
+	struct mlx5_ib_qp *mibqp = to_mibqp(qp);
+	enum mlx5_ib_pagefault_context context =
+		mlx5_ib_get_pagefault_context(pfault);
+	struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context];
+
+	qp_pfault->mpfault = *pfault;
+
+	/* No need to stop interrupts here since we are in an interrupt */
+	spin_lock(&mibqp->disable_page_faults_lock);
+	if (!mibqp->disable_page_faults)
+		queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work);
+	spin_unlock(&mibqp->disable_page_faults_lock);
+}
+
+void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)
+{
+	int i;
+
+	qp->disable_page_faults = 1;
+	spin_lock_init(&qp->disable_page_faults_lock);
+
+	qp->mqp.pfault_handler = mlx5_ib_pfault_handler;
+
+	for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i)
+		INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action);
+}
+
+int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev)
+{
+	int ret;
+
+	ret = init_srcu_struct(&ibdev->mr_srcu);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)
+{
+	cleanup_srcu_struct(&ibdev->mr_srcu);
+}
+
+int __init mlx5_ib_odp_init(void)
+{
+	mlx5_ib_page_fault_wq =
+		create_singlethread_workqueue("mlx5_ib_page_faults");
+	if (!mlx5_ib_page_fault_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx5_ib_odp_cleanup(void)
+{
+	destroy_workqueue(mlx5_ib_page_fault_wq);
 }
drivers/infiniband/hw/mlx5/qp.c (+25 -1)

··· 876 876
 	int inlen = sizeof(*in);
 	int err;
 
+	mlx5_ib_odp_create_qp(qp);
+
 	gen = &dev->mdev->caps.gen;
 	mutex_init(&qp->mutex);
 	spin_lock_init(&qp->sq.lock);

··· 1162 1160
 	in = kzalloc(sizeof(*in), GFP_KERNEL);
 	if (!in)
 		return;
-	if (qp->state != IB_QPS_RESET)
+	if (qp->state != IB_QPS_RESET) {
+		mlx5_ib_qp_disable_pagefaults(qp);
 		if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state),
 				MLX5_QP_STATE_RST, in, sizeof(*in), &qp->mqp))
 			mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n",
 				     qp->mqp.qpn);
+	}
 
 	get_cqs(qp, &send_cq, &recv_cq);
 

··· 1716 1712
 	if (mlx5_st < 0)
 		goto out;
 
+	/* If moving to a reset or error state, we must disable page faults on
+	 * this QP and flush all current page faults. Otherwise a stale page
+	 * fault may attempt to work on this QP after it is reset and moved
+	 * again to RTS, and may cause the driver and the device to get out of
+	 * sync. */
+	if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
+	    (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
+		mlx5_ib_qp_disable_pagefaults(qp);
+
 	optpar = ib_mask_to_mlx5_opt(attr_mask);
 	optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];
 	in->optparam = cpu_to_be32(optpar);

··· 1733 1720
 				  &qp->mqp);
 	if (err)
 		goto out;
+
+	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+		mlx5_ib_qp_enable_pagefaults(qp);
 
 	qp->state = new_state;
 

··· 3041 3025
 	struct mlx5_qp_context *context;
 	int mlx5_state;
 	int err = 0;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	/*
+	 * Wait for any outstanding page faults, in case the user frees memory
+	 * based upon this query's result.
+	 */
+	flush_workqueue(mlx5_ib_page_fault_wq);
+#endif
 
 	mutex_lock(&qp->mutex);
 	outb = kzalloc(sizeof(*outb), GFP_KERNEL);
include/linux/mlx5/driver.h (+1 -1)

··· 474 474
 	struct workqueue_struct *pg_wq;
 	struct rb_root		page_root;
 	int			fw_pages;
-	int			reg_pages;
+	atomic_t		reg_pages;
 	struct list_head	free_list;
 
 	struct mlx5_core_health health;