powerpc/mm_iommu: Fix potential deadlock

Currently mm_iommu_do_alloc() is called in 2 cases:
- VFIO_IOMMU_SPAPR_REGISTER_MEMORY ioctl() for normal memory:
this locks &mem_list_mutex and then locks mm::mmap_sem
several times when adjusting locked_vm or pinning pages;
- vfio_pci_nvgpu_regops::mmap() for GPU memory:
this is called with mm::mmap_sem held already and it locks
&mem_list_mutex.

So one can craft a userspace program which does the ioctl and the mmap in
2 threads concurrently and causes the deadlock that lockdep warns about
(below).

We have not hit this yet because QEMU constructs the machine in a single
thread.
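
A sketch of such a two-thread reproducer is below: thread 1 keeps taking
mem_list_mutex and then mmap_sem via the REGISTER_MEMORY ioctl, thread 2
keeps taking mmap_sem and then mem_list_mutex via mmap() of the GPU memory
region. It assumes a VFIO container with the sPAPR TCE IOMMU already
configured and an opened vfio-pci NVLink2 GPU device; container_fd,
device_fd and gpu_region_offset are placeholders and all of that setup is
elided. Lockdep should flag the inverted ordering once both threads have
run, even if they never actually block on each other.

#include <linux/vfio.h>
#include <pthread.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>

static int container_fd;        /* VFIO container, sPAPR TCE IOMMU enabled */
static int device_fd;           /* vfio-pci fd of the NVLink2 GPU */
static off_t gpu_region_offset; /* mmap offset of the GPU memory region */

static void *register_thread(void *arg)
{
        /* kernel side: mem_list_mutex, then mmap_sem */
        size_t len = 16UL << 20;
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        struct vfio_iommu_spapr_register_memory reg = {
                .argsz = sizeof(reg),
                .vaddr = (uintptr_t)buf,
                .size = len,
        };

        for (;;) {
                ioctl(container_fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
                ioctl(container_fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
        }
        return NULL;
}

static void *mmap_thread(void *arg)
{
        /* kernel side: mmap_sem, then mem_list_mutex */
        for (;;) {
                void *p = mmap(NULL, 64UL << 10, PROT_READ | PROT_WRITE,
                               MAP_SHARED, device_fd, gpu_region_offset);
                if (p != MAP_FAILED)
                        munmap(p, 64UL << 10);
        }
        return NULL;
}

int main(void)
{
        pthread_t t1, t2;

        /* ... open the container/group/device and find the GPU region ... */

        pthread_create(&t1, NULL, register_thread, NULL);
        pthread_create(&t2, NULL, mmap_thread, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        return 0;
}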

This moves the overlap check next to where the new entry is added and
reduces the amount of time spent with &mem_list_mutex held.

This moves the locked_vm adjustment out from under &mem_list_mutex.

This relies on mm_iommu_adjust_locked_vm() doing nothing when entries==0.
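
That helper already returns before taking any lock when asked to adjust by
zero pages, so calling it unconditionally on the exit path is safe; roughly
(a trimmed sketch of the existing helper, not the full accounting logic):

static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
                unsigned long npages, bool incr)
{
        long ret = 0;

        if (!npages)    /* entries == 0: nothing to account, no locks taken */
                return 0;

        down_write(&mm->mmap_sem);
        /* ... check RLIMIT_MEMLOCK and adjust mm->locked_vm by npages ... */
        up_write(&mm->mmap_sem);

        return ret;
}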

This is one of the lockdep warnings:

======================================================
WARNING: possible circular locking dependency detected
5.1.0-rc2-le_nv2_aikATfstn1-p1 #363 Not tainted
------------------------------------------------------
qemu-system-ppc/8038 is trying to acquire lock:
000000002ec6c453 (mem_list_mutex){+.+.}, at: mm_iommu_do_alloc+0x70/0x490

but task is already holding lock:
00000000fd7da97f (&mm->mmap_sem){++++}, at: vm_mmap_pgoff+0xf0/0x160

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (&mm->mmap_sem){++++}:
lock_acquire+0xf8/0x260
down_write+0x44/0xa0
mm_iommu_adjust_locked_vm.part.1+0x4c/0x190
mm_iommu_do_alloc+0x310/0x490
tce_iommu_ioctl.part.9+0xb84/0x1150 [vfio_iommu_spapr_tce]
vfio_fops_unl_ioctl+0x94/0x430 [vfio]
do_vfs_ioctl+0xe4/0x930
ksys_ioctl+0xc4/0x110
sys_ioctl+0x28/0x80
system_call+0x5c/0x70

-> #0 (mem_list_mutex){+.+.}:
__lock_acquire+0x1484/0x1900
lock_acquire+0xf8/0x260
__mutex_lock+0x88/0xa70
mm_iommu_do_alloc+0x70/0x490
vfio_pci_nvgpu_mmap+0xc0/0x130 [vfio_pci]
vfio_pci_mmap+0x198/0x2a0 [vfio_pci]
vfio_device_fops_mmap+0x44/0x70 [vfio]
mmap_region+0x5d4/0x770
do_mmap+0x42c/0x650
vm_mmap_pgoff+0x124/0x160
ksys_mmap_pgoff+0xdc/0x2f0
sys_mmap+0x40/0x80
system_call+0x5c/0x70

other info that might help us debug this:

Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&mm->mmap_sem);
                               lock(mem_list_mutex);
                               lock(&mm->mmap_sem);
  lock(mem_list_mutex);

*** DEADLOCK ***

1 lock held by qemu-system-ppc/8038:
#0: 00000000fd7da97f (&mm->mmap_sem){++++}, at: vm_mmap_pgoff+0xf0/0x160

Fixes: c10c21efa4bc ("powerpc/vfio/iommu/kvm: Do not pin device memory", 2018-12-19)
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>


Changed files: arch/powerpc/mm/mmu_context_iommu.c (+40 -37)
···
 		unsigned long entries, unsigned long dev_hpa,
 		struct mm_iommu_table_group_mem_t **pmem)
 {
-	struct mm_iommu_table_group_mem_t *mem;
-	long i, ret, locked_entries = 0;
+	struct mm_iommu_table_group_mem_t *mem, *mem2;
+	long i, ret, locked_entries = 0, pinned = 0;
 	unsigned int pageshift;
-
-	mutex_lock(&mem_list_mutex);
-
-	list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list,
-			next) {
-		/* Overlap? */
-		if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
-				(ua < (mem->ua +
-				       (mem->entries << PAGE_SHIFT)))) {
-			ret = -EINVAL;
-			goto unlock_exit;
-		}
-
-	}

 	if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
 		ret = mm_iommu_adjust_locked_vm(mm, entries, true);
 		if (ret)
-			goto unlock_exit;
+			return ret;

 		locked_entries = entries;
 	}
···
 	down_read(&mm->mmap_sem);
 	ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL);
 	up_read(&mm->mmap_sem);
+	pinned = ret > 0 ? ret : 0;
 	if (ret != entries) {
-		/* free the reference taken */
-		for (i = 0; i < ret; i++)
-			put_page(mem->hpages[i]);
-
-		vfree(mem->hpas);
-		kfree(mem);
 		ret = -EFAULT;
-		goto unlock_exit;
+		goto free_exit;
 	}

 	pageshift = PAGE_SHIFT;
···
 	}

 good_exit:
-	ret = 0;
 	atomic64_set(&mem->mapped, 1);
 	mem->used = 1;
 	mem->ua = ua;
 	mem->entries = entries;
-	*pmem = mem;
+
+	mutex_lock(&mem_list_mutex);
+
+	list_for_each_entry_rcu(mem2, &mm->context.iommu_group_mem_list, next) {
+		/* Overlap? */
+		if ((mem2->ua < (ua + (entries << PAGE_SHIFT))) &&
+				(ua < (mem2->ua +
+				       (mem2->entries << PAGE_SHIFT)))) {
+			ret = -EINVAL;
+			mutex_unlock(&mem_list_mutex);
+			goto free_exit;
+		}
+	}

 	list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list);

-unlock_exit:
-	if (locked_entries && ret)
-		mm_iommu_adjust_locked_vm(mm, locked_entries, false);
-
 	mutex_unlock(&mem_list_mutex);
+
+	*pmem = mem;
+
+	return 0;
+
+free_exit:
+	/* free the reference taken */
+	for (i = 0; i < pinned; i++)
+		put_page(mem->hpages[i]);
+
+	vfree(mem->hpas);
+	kfree(mem);
+
+unlock_exit:
+	mm_iommu_adjust_locked_vm(mm, locked_entries, false);

 	return ret;
 }
···
 long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
 {
 	long ret = 0;
-	unsigned long entries, dev_hpa;
+	unsigned long unlock_entries = 0;

 	mutex_lock(&mem_list_mutex);

···
 		goto unlock_exit;
 	}

-	/* @mapped became 0 so now mappings are disabled, release the region */
-	entries = mem->entries;
-	dev_hpa = mem->dev_hpa;
-	mm_iommu_release(mem);
+	if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
+		unlock_entries = mem->entries;

-	if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
-		mm_iommu_adjust_locked_vm(mm, entries, false);
+	/* @mapped became 0 so now mappings are disabled, release the region */
+	mm_iommu_release(mem);

 unlock_exit:
 	mutex_unlock(&mem_list_mutex);
+
+	mm_iommu_adjust_locked_vm(mm, unlock_entries, false);

 	return ret;
 }