Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

IB/core: Implement support for MMU notifiers regarding on demand paging regions

* Add an interval tree implementation for ODP umems. Create an
interval tree for each ucontext (including a count of the number of
ODP MRs in this context, semaphore, etc.), and register ODP umems in
the interval tree.
* Add MMU notifiers handling functions, using the interval tree to
notify only the relevant umems and underlying MRs.
* Register to receive MMU notifier events from the MM subsystem upon
ODP MR registration (and unregister accordingly).
* Add a completion object to synchronize the destruction of ODP umems.
* Add mechanism to abort page faults when there's a concurrent invalidation.

The way we synchronize between concurrent invalidations and page
faults is by keeping a counter of currently running invalidations, and
a sequence number that is incremented whenever an invalidation is
caught. The page fault code checks the counter and also verifies that
the sequence number hasn't progressed before it updates the umem's
page tables. This is similar to what the kvm module does.

In order to prevent the case where we register a umem in the middle of
an ongoing notifier, we also keep a per ucontext counter of the total
number of active mmu notifiers. We only enable new umems when all the
running notifiers complete.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Yuval Dagan <yuvalda@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>

authored by

Haggai Eran and committed by
Roland Dreier
882214e2 8ada2c1c

+566 -13
+1
drivers/infiniband/Kconfig
··· 41 41 config INFINIBAND_ON_DEMAND_PAGING 42 42 bool "InfiniBand on-demand paging support" 43 43 depends on INFINIBAND_USER_MEM 44 + select MMU_NOTIFIER 44 45 default y 45 46 ---help--- 46 47 On demand paging support for the InfiniBand subsystem.
+1 -1
drivers/infiniband/core/Makefile
··· 11 11 ib_core-y := packer.o ud_header.o verbs.o sysfs.o \ 12 12 device.o fmr_pool.o cache.o netlink.o 13 13 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o 14 - ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o 14 + ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o 15 15 16 16 ib_mad-y := mad.o smi.o agent.o mad_rmpp.o 17 17
+1 -1
drivers/infiniband/core/umem.c
··· 72 72 * ib_umem_get - Pin and DMA map userspace memory. 73 73 * 74 74 * If access flags indicate ODP memory, avoid pinning. Instead, stores 75 - * the mm for future page fault handling. 75 + * the mm for future page fault handling in conjunction with MMU notifiers. 76 76 * 77 77 * @context: userspace context to pin memory for 78 78 * @addr: userspace virtual address to start at
+369 -10
drivers/infiniband/core/umem_odp.c
··· 41 41 #include <rdma/ib_umem.h> 42 42 #include <rdma/ib_umem_odp.h> 43 43 44 + static void ib_umem_notifier_start_account(struct ib_umem *item) 45 + { 46 + mutex_lock(&item->odp_data->umem_mutex); 47 + 48 + /* Only update private counters for this umem if it has them. 49 + * Otherwise skip it. All page faults will be delayed for this umem. */ 50 + if (item->odp_data->mn_counters_active) { 51 + int notifiers_count = item->odp_data->notifiers_count++; 52 + 53 + if (notifiers_count == 0) 54 + /* Initialize the completion object for waiting on 55 + * notifiers. Since notifier_count is zero, no one 56 + * should be waiting right now. */ 57 + reinit_completion(&item->odp_data->notifier_completion); 58 + } 59 + mutex_unlock(&item->odp_data->umem_mutex); 60 + } 61 + 62 + static void ib_umem_notifier_end_account(struct ib_umem *item) 63 + { 64 + mutex_lock(&item->odp_data->umem_mutex); 65 + 66 + /* Only update private counters for this umem if it has them. 67 + * Otherwise skip it. All page faults will be delayed for this umem. */ 68 + if (item->odp_data->mn_counters_active) { 69 + /* 70 + * This sequence increase will notify the QP page fault that 71 + * the page that is going to be mapped in the spte could have 72 + * been freed. 73 + */ 74 + ++item->odp_data->notifiers_seq; 75 + if (--item->odp_data->notifiers_count == 0) 76 + complete_all(&item->odp_data->notifier_completion); 77 + } 78 + mutex_unlock(&item->odp_data->umem_mutex); 79 + } 80 + 81 + /* Account for a new mmu notifier in an ib_ucontext. */ 82 + static void ib_ucontext_notifier_start_account(struct ib_ucontext *context) 83 + { 84 + atomic_inc(&context->notifier_count); 85 + } 86 + 87 + /* Account for a terminating mmu notifier in an ib_ucontext. 88 + * 89 + * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since 90 + * the function takes the semaphore itself. 
*/ 91 + static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) 92 + { 93 + int zero_notifiers = atomic_dec_and_test(&context->notifier_count); 94 + 95 + if (zero_notifiers && 96 + !list_empty(&context->no_private_counters)) { 97 + /* No currently running mmu notifiers. Now is the chance to 98 + * add private accounting to all previously added umems. */ 99 + struct ib_umem_odp *odp_data, *next; 100 + 101 + /* Prevent concurrent mmu notifiers from working on the 102 + * no_private_counters list. */ 103 + down_write(&context->umem_rwsem); 104 + 105 + /* Read the notifier_count again, with the umem_rwsem 106 + * semaphore taken for write. */ 107 + if (!atomic_read(&context->notifier_count)) { 108 + list_for_each_entry_safe(odp_data, next, 109 + &context->no_private_counters, 110 + no_private_counters) { 111 + mutex_lock(&odp_data->umem_mutex); 112 + odp_data->mn_counters_active = true; 113 + list_del(&odp_data->no_private_counters); 114 + complete_all(&odp_data->notifier_completion); 115 + mutex_unlock(&odp_data->umem_mutex); 116 + } 117 + } 118 + 119 + up_write(&context->umem_rwsem); 120 + } 121 + } 122 + 123 + static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start, 124 + u64 end, void *cookie) { 125 + /* 126 + * Increase the number of notifiers running, to 127 + * prevent any further fault handling on this MR. 128 + */ 129 + ib_umem_notifier_start_account(item); 130 + item->odp_data->dying = 1; 131 + /* Make sure that the fact the umem is dying is out before we release 132 + * all pending page faults. 
*/ 133 + smp_wmb(); 134 + complete_all(&item->odp_data->notifier_completion); 135 + item->context->invalidate_range(item, ib_umem_start(item), 136 + ib_umem_end(item)); 137 + return 0; 138 + } 139 + 140 + static void ib_umem_notifier_release(struct mmu_notifier *mn, 141 + struct mm_struct *mm) 142 + { 143 + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); 144 + 145 + if (!context->invalidate_range) 146 + return; 147 + 148 + ib_ucontext_notifier_start_account(context); 149 + down_read(&context->umem_rwsem); 150 + rbt_ib_umem_for_each_in_range(&context->umem_tree, 0, 151 + ULLONG_MAX, 152 + ib_umem_notifier_release_trampoline, 153 + NULL); 154 + up_read(&context->umem_rwsem); 155 + } 156 + 157 + static int invalidate_page_trampoline(struct ib_umem *item, u64 start, 158 + u64 end, void *cookie) 159 + { 160 + ib_umem_notifier_start_account(item); 161 + item->context->invalidate_range(item, start, start + PAGE_SIZE); 162 + ib_umem_notifier_end_account(item); 163 + return 0; 164 + } 165 + 166 + static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn, 167 + struct mm_struct *mm, 168 + unsigned long address) 169 + { 170 + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); 171 + 172 + if (!context->invalidate_range) 173 + return; 174 + 175 + ib_ucontext_notifier_start_account(context); 176 + down_read(&context->umem_rwsem); 177 + rbt_ib_umem_for_each_in_range(&context->umem_tree, address, 178 + address + PAGE_SIZE, 179 + invalidate_page_trampoline, NULL); 180 + up_read(&context->umem_rwsem); 181 + ib_ucontext_notifier_end_account(context); 182 + } 183 + 184 + static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, 185 + u64 end, void *cookie) 186 + { 187 + ib_umem_notifier_start_account(item); 188 + item->context->invalidate_range(item, start, end); 189 + return 0; 190 + } 191 + 192 + static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, 193 + struct mm_struct *mm, 194 + 
unsigned long start, 195 + unsigned long end) 196 + { 197 + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); 198 + 199 + if (!context->invalidate_range) 200 + return; 201 + 202 + ib_ucontext_notifier_start_account(context); 203 + down_read(&context->umem_rwsem); 204 + rbt_ib_umem_for_each_in_range(&context->umem_tree, start, 205 + end, 206 + invalidate_range_start_trampoline, NULL); 207 + up_read(&context->umem_rwsem); 208 + } 209 + 210 + static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start, 211 + u64 end, void *cookie) 212 + { 213 + ib_umem_notifier_end_account(item); 214 + return 0; 215 + } 216 + 217 + static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, 218 + struct mm_struct *mm, 219 + unsigned long start, 220 + unsigned long end) 221 + { 222 + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); 223 + 224 + if (!context->invalidate_range) 225 + return; 226 + 227 + down_read(&context->umem_rwsem); 228 + rbt_ib_umem_for_each_in_range(&context->umem_tree, start, 229 + end, 230 + invalidate_range_end_trampoline, NULL); 231 + up_read(&context->umem_rwsem); 232 + ib_ucontext_notifier_end_account(context); 233 + } 234 + 235 + static struct mmu_notifier_ops ib_umem_notifiers = { 236 + .release = ib_umem_notifier_release, 237 + .invalidate_page = ib_umem_notifier_invalidate_page, 238 + .invalidate_range_start = ib_umem_notifier_invalidate_range_start, 239 + .invalidate_range_end = ib_umem_notifier_invalidate_range_end, 240 + }; 241 + 44 242 int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) 45 243 { 46 244 int ret_val; 47 245 struct pid *our_pid; 246 + struct mm_struct *mm = get_task_mm(current); 247 + 248 + if (!mm) 249 + return -EINVAL; 48 250 49 251 /* Prevent creating ODP MRs in child processes */ 50 252 rcu_read_lock(); 51 253 our_pid = get_task_pid(current->group_leader, PIDTYPE_PID); 52 254 rcu_read_unlock(); 53 255 put_pid(our_pid); 54 - if (context->tgid 
!= our_pid) 55 - return -EINVAL; 256 + if (context->tgid != our_pid) { 257 + ret_val = -EINVAL; 258 + goto out_mm; 259 + } 56 260 57 261 umem->hugetlb = 0; 58 262 umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL); 59 - if (!umem->odp_data) 60 - return -ENOMEM; 263 + if (!umem->odp_data) { 264 + ret_val = -ENOMEM; 265 + goto out_mm; 266 + } 267 + umem->odp_data->umem = umem; 61 268 62 269 mutex_init(&umem->odp_data->umem_mutex); 270 + 271 + init_completion(&umem->odp_data->notifier_completion); 63 272 64 273 umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) * 65 274 sizeof(*umem->odp_data->page_list)); ··· 284 75 goto out_page_list; 285 76 } 286 77 78 + /* 79 + * When using MMU notifiers, we will get a 80 + * notification before the "current" task (and MM) is 81 + * destroyed. We use the umem_rwsem semaphore to synchronize. 82 + */ 83 + down_write(&context->umem_rwsem); 84 + context->odp_mrs_count++; 85 + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) 86 + rbt_ib_umem_insert(&umem->odp_data->interval_tree, 87 + &context->umem_tree); 88 + if (likely(!atomic_read(&context->notifier_count))) 89 + umem->odp_data->mn_counters_active = true; 90 + else 91 + list_add(&umem->odp_data->no_private_counters, 92 + &context->no_private_counters); 93 + downgrade_write(&context->umem_rwsem); 94 + 95 + if (context->odp_mrs_count == 1) { 96 + /* 97 + * Note that at this point, no MMU notifier is running 98 + * for this context! 99 + */ 100 + atomic_set(&context->notifier_count, 0); 101 + INIT_HLIST_NODE(&context->mn.hlist); 102 + context->mn.ops = &ib_umem_notifiers; 103 + /* 104 + * Lock-dep detects a false positive for mmap_sem vs. 105 + * umem_rwsem, due to not grasping downgrade_write correctly. 
106 + */ 107 + lockdep_off(); 108 + ret_val = mmu_notifier_register(&context->mn, mm); 109 + lockdep_on(); 110 + if (ret_val) { 111 + pr_err("Failed to register mmu_notifier %d\n", ret_val); 112 + ret_val = -EBUSY; 113 + goto out_mutex; 114 + } 115 + } 116 + 117 + up_read(&context->umem_rwsem); 118 + 119 + /* 120 + * Note that doing an mmput can cause a notifier for the relevant mm. 121 + * If the notifier is called while we hold the umem_rwsem, this will 122 + * cause a deadlock. Therefore, we release the reference only after we 123 + * released the semaphore. 124 + */ 125 + mmput(mm); 287 126 return 0; 288 127 128 + out_mutex: 129 + up_read(&context->umem_rwsem); 130 + vfree(umem->odp_data->dma_list); 289 131 out_page_list: 290 132 vfree(umem->odp_data->page_list); 291 133 out_odp_data: 292 134 kfree(umem->odp_data); 135 + out_mm: 136 + mmput(mm); 293 137 return ret_val; 294 138 } 295 139 296 140 void ib_umem_odp_release(struct ib_umem *umem) 297 141 { 142 + struct ib_ucontext *context = umem->context; 143 + 298 144 /* 299 145 * Ensure that no more pages are mapped in the umem. 300 146 * ··· 358 94 */ 359 95 ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem), 360 96 ib_umem_end(umem)); 97 + 98 + down_write(&context->umem_rwsem); 99 + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) 100 + rbt_ib_umem_remove(&umem->odp_data->interval_tree, 101 + &context->umem_tree); 102 + context->odp_mrs_count--; 103 + if (!umem->odp_data->mn_counters_active) { 104 + list_del(&umem->odp_data->no_private_counters); 105 + complete_all(&umem->odp_data->notifier_completion); 106 + } 107 + 108 + /* 109 + * Downgrade the lock to a read lock. This ensures that the notifiers 110 + * (who lock the mutex for reading) will be able to finish, and we 111 + * will be able to eventually obtain the mmu notifiers SRCU. Note 112 + * that since we are doing it atomically, no other user could register 113 + * and unregister while we do the check. 
114 + */ 115 + downgrade_write(&context->umem_rwsem); 116 + if (!context->odp_mrs_count) { 117 + struct task_struct *owning_process = NULL; 118 + struct mm_struct *owning_mm = NULL; 119 + 120 + owning_process = get_pid_task(context->tgid, 121 + PIDTYPE_PID); 122 + if (owning_process == NULL) 123 + /* 124 + * The process is already dead, notifiers were removed 125 + * already. 126 + */ 127 + goto out; 128 + 129 + owning_mm = get_task_mm(owning_process); 130 + if (owning_mm == NULL) 131 + /* 132 + * The process' mm is already dead, notifiers were 133 + * removed already. 134 + */ 135 + goto out_put_task; 136 + mmu_notifier_unregister(&context->mn, owning_mm); 137 + 138 + mmput(owning_mm); 139 + 140 + out_put_task: 141 + put_task_struct(owning_process); 142 + } 143 + out: 144 + up_read(&context->umem_rwsem); 361 145 362 146 vfree(umem->odp_data->dma_list); 363 147 vfree(umem->odp_data->page_list); ··· 424 112 * the sequence number is taken from 425 113 * umem->odp_data->notifiers_seq. 426 114 * 427 - * The function returns -EFAULT if the DMA mapping operation fails. 115 + * The function returns -EFAULT if the DMA mapping operation fails. It returns 116 + * -EAGAIN if a concurrent invalidation prevents us from updating the page. 428 117 * 429 118 * The page is released via put_page even if the operation failed. For 430 119 * on-demand pinning, the page is released whenever it isn't stored in the ··· 434 121 static int ib_umem_odp_map_dma_single_page( 435 122 struct ib_umem *umem, 436 123 int page_index, 124 + u64 base_virt_addr, 437 125 struct page *page, 438 126 u64 access_mask, 439 127 unsigned long current_seq) ··· 442 128 struct ib_device *dev = umem->context->device; 443 129 dma_addr_t dma_addr; 444 130 int stored_page = 0; 131 + int remove_existing_mapping = 0; 445 132 int ret = 0; 446 133 447 134 mutex_lock(&umem->odp_data->umem_mutex); 135 + /* 136 + * Note: we avoid writing if seq is different from the initial seq, to 137 + * handle case of a racing notifier. 
This check also allows us to bail 138 + * early if we have a notifier running in parallel with us. 139 + */ 140 + if (ib_umem_mmu_notifier_retry(umem, current_seq)) { 141 + ret = -EAGAIN; 142 + goto out; 143 + } 448 144 if (!(umem->odp_data->dma_list[page_index])) { 449 145 dma_addr = ib_dma_map_page(dev, 450 146 page, ··· 472 148 } else { 473 149 pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", 474 150 umem->odp_data->page_list[page_index], page); 151 + /* Better remove the mapping now, to prevent any further 152 + * damage. */ 153 + remove_existing_mapping = 1; 475 154 } 476 155 477 156 out: 478 157 mutex_unlock(&umem->odp_data->umem_mutex); 479 158 480 - if (!stored_page) 159 + /* On Demand Paging - avoid pinning the page */ 160 + if (umem->context->invalidate_range || !stored_page) 481 161 put_page(page); 162 + 163 + if (remove_existing_mapping && umem->context->invalidate_range) { 164 + invalidate_page_trampoline( 165 + umem, 166 + base_virt_addr + (page_index * PAGE_SIZE), 167 + base_virt_addr + ((page_index+1)*PAGE_SIZE), 168 + NULL); 169 + ret = -EAGAIN; 170 + } 482 171 483 172 return ret; 484 173 } ··· 505 168 * 506 169 * Returns the number of pages mapped in success, negative error code 507 170 * for failure. 171 + * An -EAGAIN error code is returned when a concurrent mmu notifier prevents 172 + * the function from completing its task. 508 173 * 509 174 * @umem: the umem to map and pin 510 175 * @user_virt: the address from which we need to map. ··· 528 189 struct page **local_page_list = NULL; 529 190 u64 off; 530 191 int j, k, ret = 0, start_idx, npages = 0; 192 + u64 base_virt_addr; 531 193 532 194 if (access_mask == 0) 533 195 return -EINVAL; ··· 543 203 544 204 off = user_virt & (~PAGE_MASK); 545 205 user_virt = user_virt & PAGE_MASK; 206 + base_virt_addr = user_virt; 546 207 bcnt += off; /* Charge for the first page offset as well. 
*/ 547 208 548 209 owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID); ··· 587 246 user_virt += npages << PAGE_SHIFT; 588 247 for (j = 0; j < npages; ++j) { 589 248 ret = ib_umem_odp_map_dma_single_page( 590 - umem, k, local_page_list[j], access_mask, 591 - current_seq); 249 + umem, k, base_virt_addr, local_page_list[j], 250 + access_mask, current_seq); 592 251 if (ret < 0) 593 252 break; 594 253 k++; ··· 627 286 628 287 virt = max_t(u64, virt, ib_umem_start(umem)); 629 288 bound = min_t(u64, bound, ib_umem_end(umem)); 289 + /* Note that during the run of this function, the 290 + * notifiers_count of the MR is > 0, preventing any racing 291 + * faults from completion. We might be racing with other 292 + * invalidations, so we must make sure we free each page only 293 + * once. */ 630 294 for (addr = virt; addr < bound; addr += (u64)umem->page_size) { 631 295 idx = (addr - ib_umem_start(umem)) / PAGE_SIZE; 632 296 mutex_lock(&umem->odp_data->umem_mutex); ··· 646 300 ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE, 647 301 DMA_BIDIRECTIONAL); 648 302 if (dma & ODP_WRITE_ALLOWED_BIT) 649 - set_page_dirty_lock(head_page); 650 - put_page(page); 303 + /* 304 + * set_page_dirty prefers being called with 305 + * the page lock. However, MMU notifiers are 306 + * called sometimes with and sometimes without 307 + * the lock. We rely on the umem_mutex instead 308 + * to prevent other mmu notifiers from 309 + * continuing and allowing the page mapping to 310 + * be removed. 311 + */ 312 + set_page_dirty(head_page); 313 + /* on demand pinning support */ 314 + if (!umem->context->invalidate_range) 315 + put_page(page); 316 + umem->odp_data->page_list[idx] = NULL; 317 + umem->odp_data->dma_list[idx] = 0; 651 318 } 652 319 mutex_unlock(&umem->odp_data->umem_mutex); 653 320 }
+94
drivers/infiniband/core/umem_rbtree.c
··· 1 + /* 2 + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. 3 + * 4 + * This software is available to you under a choice of one of two 5 + * licenses. You may choose to be licensed under the terms of the GNU 6 + * General Public License (GPL) Version 2, available from the file 7 + * COPYING in the main directory of this source tree, or the 8 + * OpenIB.org BSD license below: 9 + * 10 + * Redistribution and use in source and binary forms, with or 11 + * without modification, are permitted provided that the following 12 + * conditions are met: 13 + * 14 + * - Redistributions of source code must retain the above 15 + * copyright notice, this list of conditions and the following 16 + * disclaimer. 17 + * 18 + * - Redistributions in binary form must reproduce the above 19 + * copyright notice, this list of conditions and the following 20 + * disclaimer in the documentation and/or other materials 21 + * provided with the distribution. 22 + * 23 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 + * SOFTWARE. 31 + */ 32 + 33 + #include <linux/kernel.h> 34 + #include <linux/module.h> 35 + #include <linux/interval_tree_generic.h> 36 + #include <linux/sched.h> 37 + #include <linux/gfp.h> 38 + #include <rdma/ib_umem_odp.h> 39 + 40 + /* 41 + * The ib_umem list keeps track of memory regions for which the HW 42 + * device request to receive notification when the related memory 43 + * mapping is changed. 44 + * 45 + * ib_umem_lock protects the list. 
46 + */ 47 + 48 + static inline u64 node_start(struct umem_odp_node *n) 49 + { 50 + struct ib_umem_odp *umem_odp = 51 + container_of(n, struct ib_umem_odp, interval_tree); 52 + 53 + return ib_umem_start(umem_odp->umem); 54 + } 55 + 56 + /* Note that the representation of the intervals in the interval tree 57 + * considers the ending point as contained in the interval, while the 58 + * function ib_umem_end returns the first address which is not contained 59 + * in the umem. 60 + */ 61 + static inline u64 node_last(struct umem_odp_node *n) 62 + { 63 + struct ib_umem_odp *umem_odp = 64 + container_of(n, struct ib_umem_odp, interval_tree); 65 + 66 + return ib_umem_end(umem_odp->umem) - 1; 67 + } 68 + 69 + INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, 70 + node_start, node_last, , rbt_ib_umem) 71 + 72 + /* @last is not a part of the interval. See comment for function 73 + * node_last. 74 + */ 75 + int rbt_ib_umem_for_each_in_range(struct rb_root *root, 76 + u64 start, u64 last, 77 + umem_call_back cb, 78 + void *cookie) 79 + { 80 + int ret_val = 0; 81 + struct umem_odp_node *node; 82 + struct ib_umem_odp *umem; 83 + 84 + if (unlikely(start == last)) 85 + return ret_val; 86 + 87 + for (node = rbt_ib_umem_iter_first(root, start, last - 1); node; 88 + node = rbt_ib_umem_iter_next(node, start, last - 1)) { 89 + umem = container_of(node, struct ib_umem_odp, interval_tree); 90 + ret_val = cb(umem->umem, start, last, cookie) || ret_val; 91 + } 92 + 93 + return ret_val; 94 + }
+17
drivers/infiniband/core/uverbs_cmd.c
··· 289 289 struct ib_uverbs_get_context_resp resp; 290 290 struct ib_udata udata; 291 291 struct ib_device *ibdev = file->device->ib_dev; 292 + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING 293 + struct ib_device_attr dev_attr; 294 + #endif 292 295 struct ib_ucontext *ucontext; 293 296 struct file *filp; 294 297 int ret; ··· 333 330 ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); 334 331 rcu_read_unlock(); 335 332 ucontext->closing = 0; 333 + 334 + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING 335 + ucontext->umem_tree = RB_ROOT; 336 + init_rwsem(&ucontext->umem_rwsem); 337 + ucontext->odp_mrs_count = 0; 338 + INIT_LIST_HEAD(&ucontext->no_private_counters); 339 + 340 + ret = ib_query_device(ibdev, &dev_attr); 341 + if (ret) 342 + goto err_free; 343 + if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) 344 + ucontext->invalidate_range = NULL; 345 + 346 + #endif 336 347 337 348 resp.num_comp_vectors = file->device->num_comp_vectors; 338 349
+64 -1
include/rdma/ib_umem_odp.h
··· 34 34 #define IB_UMEM_ODP_H 35 35 36 36 #include <rdma/ib_umem.h> 37 + #include <rdma/ib_verbs.h> 38 + #include <linux/interval_tree.h> 39 + 40 + struct umem_odp_node { 41 + u64 __subtree_last; 42 + struct rb_node rb; 43 + }; 37 44 38 45 struct ib_umem_odp { 39 46 /* ··· 58 51 dma_addr_t *dma_list; 59 52 /* 60 53 * The umem_mutex protects the page_list and dma_list fields of an ODP 61 - * umem, allowing only a single thread to map/unmap pages. 54 + * umem, allowing only a single thread to map/unmap pages. The mutex 55 + * also protects access to the mmu notifier counters. 62 56 */ 63 57 struct mutex umem_mutex; 64 58 void *private; /* for the HW driver to use. */ 59 + 60 + /* When false, use the notifier counter in the ucontext struct. */ 61 + bool mn_counters_active; 62 + int notifiers_seq; 63 + int notifiers_count; 64 + 65 + /* A linked list of umems that don't have private mmu notifier 66 + * counters yet. */ 67 + struct list_head no_private_counters; 68 + struct ib_umem *umem; 69 + 70 + /* Tree tracking */ 71 + struct umem_odp_node interval_tree; 72 + 73 + struct completion notifier_completion; 74 + int dying; 65 75 }; 66 76 67 77 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING ··· 105 81 106 82 void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset, 107 83 u64 bound); 84 + 85 + void rbt_ib_umem_insert(struct umem_odp_node *node, struct rb_root *root); 86 + void rbt_ib_umem_remove(struct umem_odp_node *node, struct rb_root *root); 87 + typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, 88 + void *cookie); 89 + /* 90 + * Call the callback on each ib_umem in the range. Returns the logical or of 91 + * the return values of the functions called. 
92 + */ 93 + int rbt_ib_umem_for_each_in_range(struct rb_root *root, u64 start, u64 end, 94 + umem_call_back cb, void *cookie); 95 + 96 + struct umem_odp_node *rbt_ib_umem_iter_first(struct rb_root *root, 97 + u64 start, u64 last); 98 + struct umem_odp_node *rbt_ib_umem_iter_next(struct umem_odp_node *node, 99 + u64 start, u64 last); 100 + 101 + static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, 102 + unsigned long mmu_seq) 103 + { 104 + /* 105 + * This code is strongly based on the KVM code from 106 + * mmu_notifier_retry. Should be called with 107 + * the relevant locks taken (item->odp_data->umem_mutex 108 + * and the ucontext umem_mutex semaphore locked for read). 109 + */ 110 + 111 + /* Do not allow page faults while the new ib_umem hasn't seen a state 112 + * with zero notifiers yet, and doesn't have its own valid set of 113 + * private counters. */ 114 + if (!item->odp_data->mn_counters_active) 115 + return 1; 116 + 117 + if (unlikely(item->odp_data->notifiers_count)) 118 + return 1; 119 + if (item->odp_data->notifiers_seq != mmu_seq) 120 + return 1; 121 + return 0; 122 + } 108 123 109 124 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ 110 125
+19
include/rdma/ib_verbs.h
··· 51 51 #include <uapi/linux/if_ether.h> 52 52 53 53 #include <linux/atomic.h> 54 + #include <linux/mmu_notifier.h> 54 55 #include <asm/uaccess.h> 55 56 56 57 extern struct workqueue_struct *ib_wq; ··· 1140 1139 u8 page_shift; 1141 1140 }; 1142 1141 1142 + struct ib_umem; 1143 + 1143 1144 struct ib_ucontext { 1144 1145 struct ib_device *device; 1145 1146 struct list_head pd_list; ··· 1156 1153 int closing; 1157 1154 1158 1155 struct pid *tgid; 1156 + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING 1157 + struct rb_root umem_tree; 1158 + /* 1159 + * Protects .umem_rbroot and tree, as well as odp_mrs_count and 1160 + * mmu notifiers registration. 1161 + */ 1162 + struct rw_semaphore umem_rwsem; 1163 + void (*invalidate_range)(struct ib_umem *umem, 1164 + unsigned long start, unsigned long end); 1165 + 1166 + struct mmu_notifier mn; 1167 + atomic_t notifier_count; 1168 + /* A list of umems that don't have private mmu notifier counters yet. */ 1169 + struct list_head no_private_counters; 1170 + int odp_mrs_count; 1171 + #endif 1159 1172 }; 1160 1173 1161 1174 struct ib_uobject {