Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

IB/core: Implement support for MMU notifiers regarding on demand paging regions

* Add an interval tree implementation for ODP umems. Create an
interval tree for each ucontext (including a count of the number of
ODP MRs in this context, semaphore, etc.), and register ODP umems in
the interval tree.
* Add MMU notifiers handling functions, using the interval tree to
notify only the relevant umems and underlying MRs.
* Register to receive MMU notifier events from the MM subsystem upon
ODP MR registration (and unregister accordingly).
* Add a completion object to synchronize the destruction of ODP umems.
* Add mechanism to abort page faults when there's a concurrent invalidation.

The way we synchronize between concurrent invalidations and page
faults is by keeping a counter of currently running invalidations, and
a sequence number that is incremented whenever an invalidation is
caught. The page fault code checks the counter and also verifies that
the sequence number hasn't progressed before it updates the umem's
page tables. This is similar to what the kvm module does.

In order to prevent the case where we register a umem in the middle of
an ongoing notifier, we also keep a per ucontext counter of the total
number of active mmu notifiers. We only enable new umems when all the
running notifiers complete.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Yuval Dagan <yuvalda@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>

authored by

Haggai Eran and committed by
Roland Dreier
882214e2 8ada2c1c

+566 -13
+1
drivers/infiniband/Kconfig
··· 41 41 config INFINIBAND_ON_DEMAND_PAGING 42 42 bool "InfiniBand on-demand paging support" 43 43 depends on INFINIBAND_USER_MEM 44 + select MMU_NOTIFIER 44 45 default y 45 46 ---help--- 46 47 On demand paging support for the InfiniBand subsystem.
+1 -1
drivers/infiniband/core/Makefile
··· 11 11 ib_core-y := packer.o ud_header.o verbs.o sysfs.o \ 12 12 device.o fmr_pool.o cache.o netlink.o 13 13 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o 14 - ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o 14 + ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o 15 15 16 16 ib_mad-y := mad.o smi.o agent.o mad_rmpp.o 17 17
+1 -1
drivers/infiniband/core/umem.c
··· 72 72 * ib_umem_get - Pin and DMA map userspace memory. 73 73 * 74 74 * If access flags indicate ODP memory, avoid pinning. Instead, stores 75 - * the mm for future page fault handling. 75 + * the mm for future page fault handling in conjunction with MMU notifiers. 76 76 * 77 77 * @context: userspace context to pin memory for 78 78 * @addr: userspace virtual address to start at
+369 -10
drivers/infiniband/core/umem_odp.c
··· 41 41 #include <rdma/ib_umem.h> 42 42 #include <rdma/ib_umem_odp.h> 43 43 44 + static void ib_umem_notifier_start_account(struct ib_umem *item) 45 + { 46 + mutex_lock(&item->odp_data->umem_mutex); 47 + 48 + /* Only update private counters for this umem if it has them. 49 + * Otherwise skip it. All page faults will be delayed for this umem. */ 50 + if (item->odp_data->mn_counters_active) { 51 + int notifiers_count = item->odp_data->notifiers_count++; 52 + 53 + if (notifiers_count == 0) 54 + /* Initialize the completion object for waiting on 55 + * notifiers. Since notifier_count is zero, no one 56 + * should be waiting right now. */ 57 + reinit_completion(&item->odp_data->notifier_completion); 58 + } 59 + mutex_unlock(&item->odp_data->umem_mutex); 60 + } 61 + 62 + static void ib_umem_notifier_end_account(struct ib_umem *item) 63 + { 64 + mutex_lock(&item->odp_data->umem_mutex); 65 + 66 + /* Only update private counters for this umem if it has them. 67 + * Otherwise skip it. All page faults will be delayed for this umem. */ 68 + if (item->odp_data->mn_counters_active) { 69 + /* 70 + * This sequence increase will notify the QP page fault that 71 + * the page that is going to be mapped in the spte could have 72 + * been freed. 73 + */ 74 + ++item->odp_data->notifiers_seq; 75 + if (--item->odp_data->notifiers_count == 0) 76 + complete_all(&item->odp_data->notifier_completion); 77 + } 78 + mutex_unlock(&item->odp_data->umem_mutex); 79 + } 80 + 81 + /* Account for a new mmu notifier in an ib_ucontext. */ 82 + static void ib_ucontext_notifier_start_account(struct ib_ucontext *context) 83 + { 84 + atomic_inc(&context->notifier_count); 85 + } 86 + 87 + /* Account for a terminating mmu notifier in an ib_ucontext. 88 + * 89 + * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since 90 + * the function takes the semaphore itself. 
*/ 91 + static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) 92 + { 93 + int zero_notifiers = atomic_dec_and_test(&context->notifier_count); 94 + 95 + if (zero_notifiers && 96 + !list_empty(&context->no_private_counters)) { 97 + /* No currently running mmu notifiers. Now is the chance to 98 + * add private accounting to all previously added umems. */ 99 + struct ib_umem_odp *odp_data, *next; 100 + 101 + /* Prevent concurrent mmu notifiers from working on the 102 + * no_private_counters list. */ 103 + down_write(&context->umem_rwsem); 104 + 105 + /* Read the notifier_count again, with the umem_rwsem 106 + * semaphore taken for write. */ 107 + if (!atomic_read(&context->notifier_count)) { 108 + list_for_each_entry_safe(odp_data, next, 109 + &context->no_private_counters, 110 + no_private_counters) { 111 + mutex_lock(&odp_data->umem_mutex); 112 + odp_data->mn_counters_active = true; 113 + list_del(&odp_data->no_private_counters); 114 + complete_all(&odp_data->notifier_completion); 115 + mutex_unlock(&odp_data->umem_mutex); 116 + } 117 + } 118 + 119 + up_write(&context->umem_rwsem); 120 + } 121 + } 122 + 123 + static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start, 124 + u64 end, void *cookie) { 125 + /* 126 + * Increase the number of notifiers running, to 127 + * prevent any further fault handling on this MR. 128 + */ 129 + ib_umem_notifier_start_account(item); 130 + item->odp_data->dying = 1; 131 + /* Make sure that the fact the umem is dying is out before we release 132 + * all pending page faults. 
*/ 133 + smp_wmb(); 134 + complete_all(&item->odp_data->notifier_completion); 135 + item->context->invalidate_range(item, ib_umem_start(item), 136 + ib_umem_end(item)); 137 + return 0; 138 + } 139 + 140 + static void ib_umem_notifier_release(struct mmu_notifier *mn, 141 + struct mm_struct *mm) 142 + { 143 + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); 144 + 145 + if (!context->invalidate_range) 146 + return; 147 + 148 + ib_ucontext_notifier_start_account(context); 149 + down_read(&context->umem_rwsem); 150 + rbt_ib_umem_for_each_in_range(&context->umem_tree, 0, 151 + ULLONG_MAX, 152 + ib_umem_notifier_release_trampoline, 153 + NULL); 154 + up_read(&context->umem_rwsem); 155 + } 156 + 157 + static int invalidate_page_trampoline(struct ib_umem *item, u64 start, 158 + u64 end, void *cookie) 159 + { 160 + ib_umem_notifier_start_account(item); 161 + item->context->invalidate_range(item, start, start + PAGE_SIZE); 162 + ib_umem_notifier_end_account(item); 163 + return 0; 164 + } 165 + 166 + static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn, 167 + struct mm_struct *mm, 168 + unsigned long address) 169 + { 170 + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); 171 + 172 + if (!context->invalidate_range) 173 + return; 174 + 175 + ib_ucontext_notifier_start_account(context); 176 + down_read(&context->umem_rwsem); 177 + rbt_ib_umem_for_each_in_range(&context->umem_tree, address, 178 + address + PAGE_SIZE, 179 + invalidate_page_trampoline, NULL); 180 + up_read(&context->umem_rwsem); 181 + ib_ucontext_notifier_end_account(context); 182 + } 183 + 184 + static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, 185 + u64 end, void *cookie) 186 + { 187 + ib_umem_notifier_start_account(item); 188 + item->context->invalidate_range(item, start, end); 189 + return 0; 190 + } 191 + 192 + static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, 193 + struct mm_struct *mm, 194 + 
unsigned long start, 195 + unsigned long end) 196 + { 197 + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); 198 + 199 + if (!context->invalidate_range) 200 + return; 201 + 202 + ib_ucontext_notifier_start_account(context); 203 + down_read(&context->umem_rwsem); 204 + rbt_ib_umem_for_each_in_range(&context->umem_tree, start, 205 + end, 206 + invalidate_range_start_trampoline, NULL); 207 + up_read(&context->umem_rwsem); 208 + } 209 + 210 + static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start, 211 + u64 end, void *cookie) 212 + { 213 + ib_umem_notifier_end_account(item); 214 + return 0; 215 + } 216 + 217 + static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, 218 + struct mm_struct *mm, 219 + unsigned long start, 220 + unsigned long end) 221 + { 222 + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); 223 + 224 + if (!context->invalidate_range) 225 + return; 226 + 227 + down_read(&context->umem_rwsem); 228 + rbt_ib_umem_for_each_in_range(&context->umem_tree, start, 229 + end, 230 + invalidate_range_end_trampoline, NULL); 231 + up_read(&context->umem_rwsem); 232 + ib_ucontext_notifier_end_account(context); 233 + } 234 + 235 + static struct mmu_notifier_ops ib_umem_notifiers = { 236 + .release = ib_umem_notifier_release, 237 + .invalidate_page = ib_umem_notifier_invalidate_page, 238 + .invalidate_range_start = ib_umem_notifier_invalidate_range_start, 239 + .invalidate_range_end = ib_umem_notifier_invalidate_range_end, 240 + }; 241 + 44 242 int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) 45 243 { 46 244 int ret_val; 47 245 struct pid *our_pid; 246 + struct mm_struct *mm = get_task_mm(current); 247 + 248 + if (!mm) 249 + return -EINVAL; 48 250 49 251 /* Prevent creating ODP MRs in child processes */ 50 252 rcu_read_lock(); 51 253 our_pid = get_task_pid(current->group_leader, PIDTYPE_PID); 52 254 rcu_read_unlock(); 53 255 put_pid(our_pid); 54 - if (context->tgid 
!= our_pid) 55 - return -EINVAL; 256 + if (context->tgid != our_pid) { 257 + ret_val = -EINVAL; 258 + goto out_mm; 259 + } 56 260 57 261 umem->hugetlb = 0; 58 262 umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL); 59 - if (!umem->odp_data) 60 - return -ENOMEM; 263 + if (!umem->odp_data) { 264 + ret_val = -ENOMEM; 265 + goto out_mm; 266 + } 267 + umem->odp_data->umem = umem; 61 268 62 269 mutex_init(&umem->odp_data->umem_mutex); 270 + 271 + init_completion(&umem->odp_data->notifier_completion); 63 272 64 273 umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) * 65 274 sizeof(*umem->odp_data->page_list)); ··· 284 75 goto out_page_list; 285 76 } 286 77 78 + /* 79 + * When using MMU notifiers, we will get a 80 + * notification before the "current" task (and MM) is 81 + * destroyed. We use the umem_rwsem semaphore to synchronize. 82 + */ 83 + down_write(&context->umem_rwsem); 84 + context->odp_mrs_count++; 85 + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) 86 + rbt_ib_umem_insert(&umem->odp_data->interval_tree, 87 + &context->umem_tree); 88 + if (likely(!atomic_read(&context->notifier_count))) 89 + umem->odp_data->mn_counters_active = true; 90 + else 91 + list_add(&umem->odp_data->no_private_counters, 92 + &context->no_private_counters); 93 + downgrade_write(&context->umem_rwsem); 94 + 95 + if (context->odp_mrs_count == 1) { 96 + /* 97 + * Note that at this point, no MMU notifier is running 98 + * for this context! 99 + */ 100 + atomic_set(&context->notifier_count, 0); 101 + INIT_HLIST_NODE(&context->mn.hlist); 102 + context->mn.ops = &ib_umem_notifiers; 103 + /* 104 + * Lock-dep detects a false positive for mmap_sem vs. 105 + * umem_rwsem, due to not grasping downgrade_write correctly. 
106 + */ 107 + lockdep_off(); 108 + ret_val = mmu_notifier_register(&context->mn, mm); 109 + lockdep_on(); 110 + if (ret_val) { 111 + pr_err("Failed to register mmu_notifier %d\n", ret_val); 112 + ret_val = -EBUSY; 113 + goto out_mutex; 114 + } 115 + } 116 + 117 + up_read(&context->umem_rwsem); 118 + 119 + /* 120 + * Note that doing an mmput can cause a notifier for the relevant mm. 121 + * If the notifier is called while we hold the umem_rwsem, this will 122 + * cause a deadlock. Therefore, we release the reference only after we 123 + * released the semaphore. 124 + */ 125 + mmput(mm); 287 126 return 0; 288 127 128 + out_mutex: 129 + up_read(&context->umem_rwsem); 130 + vfree(umem->odp_data->dma_list); 289 131 out_page_list: 290 132 vfree(umem->odp_data->page_list); 291 133 out_odp_data: 292 134 kfree(umem->odp_data); 135 + out_mm: 136 + mmput(mm); 293 137 return ret_val; 294 138 } 295 139 296 140 void ib_umem_odp_release(struct ib_umem *umem) 297 141 { 142 + struct ib_ucontext *context = umem->context; 143 + 298 144 /* 299 145 * Ensure that no more pages are mapped in the umem. 300 146 * ··· 358 94 */ 359 95 ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem), 360 96 ib_umem_end(umem)); 97 + 98 + down_write(&context->umem_rwsem); 99 + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) 100 + rbt_ib_umem_remove(&umem->odp_data->interval_tree, 101 + &context->umem_tree); 102 + context->odp_mrs_count--; 103 + if (!umem->odp_data->mn_counters_active) { 104 + list_del(&umem->odp_data->no_private_counters); 105 + complete_all(&umem->odp_data->notifier_completion); 106 + } 107 + 108 + /* 109 + * Downgrade the lock to a read lock. This ensures that the notifiers 110 + * (who lock the mutex for reading) will be able to finish, and we 111 + * will be able to eventually obtain the mmu notifiers SRCU. Note 112 + * that since we are doing it atomically, no other user could register 113 + * and unregister while we do the check. 
114 + */ 115 + downgrade_write(&context->umem_rwsem); 116 + if (!context->odp_mrs_count) { 117 + struct task_struct *owning_process = NULL; 118 + struct mm_struct *owning_mm = NULL; 119 + 120 + owning_process = get_pid_task(context->tgid, 121 + PIDTYPE_PID); 122 + if (owning_process == NULL) 123 + /* 124 + * The process is already dead, notifiers were removed 125 + * already. 126 + */ 127 + goto out; 128 + 129 + owning_mm = get_task_mm(owning_process); 130 + if (owning_mm == NULL) 131 + /* 132 + * The process' mm is already dead, notifiers were 133 + * removed already. 134 + */ 135 + goto out_put_task; 136 + mmu_notifier_unregister(&context->mn, owning_mm); 137 + 138 + mmput(owning_mm); 139 + 140 + out_put_task: 141 + put_task_struct(owning_process); 142 + } 143 + out: 144 + up_read(&context->umem_rwsem); 361 145 362 146 vfree(umem->odp_data->dma_list); 363 147 vfree(umem->odp_data->page_list); ··· 424 112 * the sequence number is taken from 425 113 * umem->odp_data->notifiers_seq. 426 114 * 427 - * The function returns -EFAULT if the DMA mapping operation fails. 115 + * The function returns -EFAULT if the DMA mapping operation fails. It returns 116 + * -EAGAIN if a concurrent invalidation prevents us from updating the page. 428 117 * 429 118 * The page is released via put_page even if the operation failed. For 430 119 * on-demand pinning, the page is released whenever it isn't stored in the ··· 434 121 static int ib_umem_odp_map_dma_single_page( 435 122 struct ib_umem *umem, 436 123 int page_index, 124 + u64 base_virt_addr, 437 125 struct page *page, 438 126 u64 access_mask, 439 127 unsigned long current_seq) ··· 442 128 struct ib_device *dev = umem->context->device; 443 129 dma_addr_t dma_addr; 444 130 int stored_page = 0; 131 + int remove_existing_mapping = 0; 445 132 int ret = 0; 446 133 447 134 mutex_lock(&umem->odp_data->umem_mutex); 135 + /* 136 + * Note: we avoid writing if seq is different from the initial seq, to 137 + * handle case of a racing notifier. 
This check also allows us to bail 138 + * early if we have a notifier running in parallel with us. 139 + */ 140 + if (ib_umem_mmu_notifier_retry(umem, current_seq)) { 141 + ret = -EAGAIN; 142 + goto out; 143 + } 448 144 if (!(umem->odp_data->dma_list[page_index])) { 449 145 dma_addr = ib_dma_map_page(dev, 450 146 page, ··· 472 148 } else { 473 149 pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", 474 150 umem->odp_data->page_list[page_index], page); 151 + /* Better remove the mapping now, to prevent any further 152 + * damage. */ 153 + remove_existing_mapping = 1; 475 154 } 476 155 477 156 out: 478 157 mutex_unlock(&umem->odp_data->umem_mutex); 479 158 480 - if (!stored_page) 159 + /* On Demand Paging - avoid pinning the page */ 160 + if (umem->context->invalidate_range || !stored_page) 481 161 put_page(page); 162 + 163 + if (remove_existing_mapping && umem->context->invalidate_range) { 164 + invalidate_page_trampoline( 165 + umem, 166 + base_virt_addr + (page_index * PAGE_SIZE), 167 + base_virt_addr + ((page_index+1)*PAGE_SIZE), 168 + NULL); 169 + ret = -EAGAIN; 170 + } 482 171 483 172 return ret; 484 173 } ··· 505 168 * 506 169 * Returns the number of pages mapped in success, negative error code 507 170 * for failure. 171 + * An -EAGAIN error code is returned when a concurrent mmu notifier prevents 172 + * the function from completing its task. 508 173 * 509 174 * @umem: the umem to map and pin 510 175 * @user_virt: the address from which we need to map. ··· 528 189 struct page **local_page_list = NULL; 529 190 u64 off; 530 191 int j, k, ret = 0, start_idx, npages = 0; 192 + u64 base_virt_addr; 531 193 532 194 if (access_mask == 0) 533 195 return -EINVAL; ··· 543 203 544 204 off = user_virt & (~PAGE_MASK); 545 205 user_virt = user_virt & PAGE_MASK; 206 + base_virt_addr = user_virt; 546 207 bcnt += off; /* Charge for the first page offset as well. 
*/ 547 208 548 209 owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID); ··· 587 246 user_virt += npages << PAGE_SHIFT; 588 247 for (j = 0; j < npages; ++j) { 589 248 ret = ib_umem_odp_map_dma_single_page( 590 - umem, k, local_page_list[j], access_mask, 591 - current_seq); 249 + umem, k, base_virt_addr, local_page_list[j], 250 + access_mask, current_seq); 592 251 if (ret < 0) 593 252 break; 594 253 k++; ··· 627 286 628 287 virt = max_t(u64, virt, ib_umem_start(umem)); 629 288 bound = min_t(u64, bound, ib_umem_end(umem)); 289 + /* Note that during the run of this function, the 290 + * notifiers_count of the MR is > 0, preventing any racing 291 + * faults from completion. We might be racing with other 292 + * invalidations, so we must make sure we free each page only 293 + * once. */ 630 294 for (addr = virt; addr < bound; addr += (u64)umem->page_size) { 631 295 idx = (addr - ib_umem_start(umem)) / PAGE_SIZE; 632 296 mutex_lock(&umem->odp_data->umem_mutex); ··· 646 300 ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE, 647 301 DMA_BIDIRECTIONAL); 648 302 if (dma & ODP_WRITE_ALLOWED_BIT) 649 - set_page_dirty_lock(head_page); 650 - put_page(page); 303 + /* 304 + * set_page_dirty prefers being called with 305 + * the page lock. However, MMU notifiers are 306 + * called sometimes with and sometimes without 307 + * the lock. We rely on the umem_mutex instead 308 + * to prevent other mmu notifiers from 309 + * continuing and allowing the page mapping to 310 + * be removed. 311 + */ 312 + set_page_dirty(head_page); 313 + /* on demand pinning support */ 314 + if (!umem->context->invalidate_range) 315 + put_page(page); 316 + umem->odp_data->page_list[idx] = NULL; 317 + umem->odp_data->dma_list[idx] = 0; 651 318 } 652 319 mutex_unlock(&umem->odp_data->umem_mutex); 653 320 }
+94
drivers/infiniband/core/umem_rbtree.c
··· 1 + /* 2 + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. 3 + * 4 + * This software is available to you under a choice of one of two 5 + * licenses. You may choose to be licensed under the terms of the GNU 6 + * General Public License (GPL) Version 2, available from the file 7 + * COPYING in the main directory of this source tree, or the 8 + * OpenIB.org BSD license below: 9 + * 10 + * Redistribution and use in source and binary forms, with or 11 + * without modification, are permitted provided that the following 12 + * conditions are met: 13 + * 14 + * - Redistributions of source code must retain the above 15 + * copyright notice, this list of conditions and the following 16 + * disclaimer. 17 + * 18 + * - Redistributions in binary form must reproduce the above 19 + * copyright notice, this list of conditions and the following 20 + * disclaimer in the documentation and/or other materials 21 + * provided with the distribution. 22 + * 23 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 + * SOFTWARE. 31 + */ 32 + 33 + #include <linux/kernel.h> 34 + #include <linux/module.h> 35 + #include <linux/interval_tree_generic.h> 36 + #include <linux/sched.h> 37 + #include <linux/gfp.h> 38 + #include <rdma/ib_umem_odp.h> 39 + 40 + /* 41 + * The ib_umem list keeps track of memory regions for which the HW 42 + * device request to receive notification when the related memory 43 + * mapping is changed. 44 + * 45 + * ib_umem_lock protects the list. 
46 + */ 47 + 48 + static inline u64 node_start(struct umem_odp_node *n) 49 + { 50 + struct ib_umem_odp *umem_odp = 51 + container_of(n, struct ib_umem_odp, interval_tree); 52 + 53 + return ib_umem_start(umem_odp->umem); 54 + } 55 + 56 + /* Note that the representation of the intervals in the interval tree 57 + * considers the ending point as contained in the interval, while the 58 + * function ib_umem_end returns the first address which is not contained 59 + * in the umem. 60 + */ 61 + static inline u64 node_last(struct umem_odp_node *n) 62 + { 63 + struct ib_umem_odp *umem_odp = 64 + container_of(n, struct ib_umem_odp, interval_tree); 65 + 66 + return ib_umem_end(umem_odp->umem) - 1; 67 + } 68 + 69 + INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, 70 + node_start, node_last, , rbt_ib_umem) 71 + 72 + /* @last is not a part of the interval. See comment for function 73 + * node_last. 74 + */ 75 + int rbt_ib_umem_for_each_in_range(struct rb_root *root, 76 + u64 start, u64 last, 77 + umem_call_back cb, 78 + void *cookie) 79 + { 80 + int ret_val = 0; 81 + struct umem_odp_node *node; 82 + struct ib_umem_odp *umem; 83 + 84 + if (unlikely(start == last)) 85 + return ret_val; 86 + 87 + for (node = rbt_ib_umem_iter_first(root, start, last - 1); node; 88 + node = rbt_ib_umem_iter_next(node, start, last - 1)) { 89 + umem = container_of(node, struct ib_umem_odp, interval_tree); 90 + ret_val = cb(umem->umem, start, last, cookie) || ret_val; 91 + } 92 + 93 + return ret_val; 94 + }
+17
drivers/infiniband/core/uverbs_cmd.c
··· 289 289 struct ib_uverbs_get_context_resp resp; 290 290 struct ib_udata udata; 291 291 struct ib_device *ibdev = file->device->ib_dev; 292 + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING 293 + struct ib_device_attr dev_attr; 294 + #endif 292 295 struct ib_ucontext *ucontext; 293 296 struct file *filp; 294 297 int ret; ··· 333 330 ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); 334 331 rcu_read_unlock(); 335 332 ucontext->closing = 0; 333 + 334 + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING 335 + ucontext->umem_tree = RB_ROOT; 336 + init_rwsem(&ucontext->umem_rwsem); 337 + ucontext->odp_mrs_count = 0; 338 + INIT_LIST_HEAD(&ucontext->no_private_counters); 339 + 340 + ret = ib_query_device(ibdev, &dev_attr); 341 + if (ret) 342 + goto err_free; 343 + if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) 344 + ucontext->invalidate_range = NULL; 345 + 346 + #endif 336 347 337 348 resp.num_comp_vectors = file->device->num_comp_vectors; 338 349
+64 -1
include/rdma/ib_umem_odp.h
··· 34 34 #define IB_UMEM_ODP_H 35 35 36 36 #include <rdma/ib_umem.h> 37 + #include <rdma/ib_verbs.h> 38 + #include <linux/interval_tree.h> 39 + 40 + struct umem_odp_node { 41 + u64 __subtree_last; 42 + struct rb_node rb; 43 + }; 37 44 38 45 struct ib_umem_odp { 39 46 /* ··· 58 51 dma_addr_t *dma_list; 59 52 /* 60 53 * The umem_mutex protects the page_list and dma_list fields of an ODP 61 - * umem, allowing only a single thread to map/unmap pages. 54 + * umem, allowing only a single thread to map/unmap pages. The mutex 55 + * also protects access to the mmu notifier counters. 62 56 */ 63 57 struct mutex umem_mutex; 64 58 void *private; /* for the HW driver to use. */ 59 + 60 + /* When false, use the notifier counter in the ucontext struct. */ 61 + bool mn_counters_active; 62 + int notifiers_seq; 63 + int notifiers_count; 64 + 65 + /* A linked list of umems that don't have private mmu notifier 66 + * counters yet. */ 67 + struct list_head no_private_counters; 68 + struct ib_umem *umem; 69 + 70 + /* Tree tracking */ 71 + struct umem_odp_node interval_tree; 72 + 73 + struct completion notifier_completion; 74 + int dying; 65 75 }; 66 76 67 77 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING ··· 105 81 106 82 void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset, 107 83 u64 bound); 84 + 85 + void rbt_ib_umem_insert(struct umem_odp_node *node, struct rb_root *root); 86 + void rbt_ib_umem_remove(struct umem_odp_node *node, struct rb_root *root); 87 + typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, 88 + void *cookie); 89 + /* 90 + * Call the callback on each ib_umem in the range. Returns the logical or of 91 + * the return values of the functions called. 
92 + */ 93 + int rbt_ib_umem_for_each_in_range(struct rb_root *root, u64 start, u64 end, 94 + umem_call_back cb, void *cookie); 95 + 96 + struct umem_odp_node *rbt_ib_umem_iter_first(struct rb_root *root, 97 + u64 start, u64 last); 98 + struct umem_odp_node *rbt_ib_umem_iter_next(struct umem_odp_node *node, 99 + u64 start, u64 last); 100 + 101 + static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, 102 + unsigned long mmu_seq) 103 + { 104 + /* 105 + * This code is strongly based on the KVM code from 106 + * mmu_notifier_retry. Should be called with 107 + * the relevant locks taken (item->odp_data->umem_mutex 108 + * and the ucontext umem_mutex semaphore locked for read). 109 + */ 110 + 111 + /* Do not allow page faults while the new ib_umem hasn't seen a state 112 + * with zero notifiers yet, and doesn't have its own valid set of 113 + * private counters. */ 114 + if (!item->odp_data->mn_counters_active) 115 + return 1; 116 + 117 + if (unlikely(item->odp_data->notifiers_count)) 118 + return 1; 119 + if (item->odp_data->notifiers_seq != mmu_seq) 120 + return 1; 121 + return 0; 122 + } 108 123 109 124 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ 110 125
+19
include/rdma/ib_verbs.h
··· 51 51 #include <uapi/linux/if_ether.h> 52 52 53 53 #include <linux/atomic.h> 54 + #include <linux/mmu_notifier.h> 54 55 #include <asm/uaccess.h> 55 56 56 57 extern struct workqueue_struct *ib_wq; ··· 1140 1139 u8 page_shift; 1141 1140 }; 1142 1141 1142 + struct ib_umem; 1143 + 1143 1144 struct ib_ucontext { 1144 1145 struct ib_device *device; 1145 1146 struct list_head pd_list; ··· 1156 1153 int closing; 1157 1154 1158 1155 struct pid *tgid; 1156 + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING 1157 + struct rb_root umem_tree; 1158 + /* 1159 + * Protects .umem_rbroot and tree, as well as odp_mrs_count and 1160 + * mmu notifiers registration. 1161 + */ 1162 + struct rw_semaphore umem_rwsem; 1163 + void (*invalidate_range)(struct ib_umem *umem, 1164 + unsigned long start, unsigned long end); 1165 + 1166 + struct mmu_notifier mn; 1167 + atomic_t notifier_count; 1168 + /* A list of umems that don't have private mmu notifier counters yet. */ 1169 + struct list_head no_private_counters; 1170 + int odp_mrs_count; 1171 + #endif 1159 1172 }; 1160 1173 1161 1174 struct ib_uobject {