// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>
#include <asm/shmparam.h>

#include "memmap.h"
#include "kbuf.h"
#include "rsrc.h"
#include "zcrx.h"

static bool io_mem_alloc_compound(struct page **pages, int nr_pages,
                                  size_t size, gfp_t gfp)
{
        struct page *page;
        int i, order;

        order = get_order(size);
        if (order > MAX_PAGE_ORDER)
                return false;
        else if (order)
                gfp |= __GFP_COMP;

        page = alloc_pages(gfp, order);
        if (!page)
                return false;

        for (i = 0; i < nr_pages; i++)
                pages[i] = page + i;

        return true;
}

struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
{
        unsigned long start, end, nr_pages;
        struct page **pages;
        int ret;

        if (check_add_overflow(uaddr, len, &end))
                return ERR_PTR(-EOVERFLOW);
        if (check_add_overflow(end, PAGE_SIZE - 1, &end))
                return ERR_PTR(-EOVERFLOW);

        end = end >> PAGE_SHIFT;
        start = uaddr >> PAGE_SHIFT;
        nr_pages = end - start;
        if (WARN_ON_ONCE(!nr_pages))
                return ERR_PTR(-EINVAL);
        if (WARN_ON_ONCE(nr_pages > INT_MAX))
                return ERR_PTR(-EOVERFLOW);

        pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
        if (!pages)
                return ERR_PTR(-ENOMEM);

        ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
                                  pages);
        /* success, mapped all pages */
        if (ret == nr_pages) {
                *npages = nr_pages;
                return pages;
        }

        /* partial map, or didn't map anything */
        if (ret >= 0) {
                /* if we did partial map, release any pages we did get */
                if (ret)
                        unpin_user_pages(pages, ret);
                ret = -EFAULT;
        }
        kvfree(pages);
        return ERR_PTR(ret);
}

enum {
        /* memory was vmap'ed for the kernel, freeing the region vunmap's it */
        IO_REGION_F_VMAP                = 1,
        /* memory is provided by user and pinned by the kernel */
        IO_REGION_F_USER_PROVIDED       = 2,
        /* only the first page in the array is ref'ed */
        IO_REGION_F_SINGLE_REF          = 4,
};

void io_free_region(struct user_struct *user, struct io_mapped_region *mr)
{
        if (mr->pages) {
                long nr_refs = mr->nr_pages;

                if (mr->flags & IO_REGION_F_SINGLE_REF)
                        nr_refs = 1;

                if (mr->flags & IO_REGION_F_USER_PROVIDED)
                        unpin_user_pages(mr->pages, nr_refs);
                else
                        release_pages(mr->pages, nr_refs);

                kvfree(mr->pages);
        }
        if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr)
                vunmap(mr->ptr);
        if (mr->nr_pages && user)
                __io_unaccount_mem(user, mr->nr_pages);

        memset(mr, 0, sizeof(*mr));
}

static int io_region_init_ptr(struct io_mapped_region *mr)
{
        struct io_imu_folio_data ifd;
        void *ptr;

        if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) {
                if (ifd.nr_folios == 1 && !PageHighMem(mr->pages[0])) {
                        mr->ptr = page_address(mr->pages[0]);
                        return 0;
                }
        }
        ptr = vmap(mr->pages, mr->nr_pages, VM_MAP, PAGE_KERNEL);
        if (!ptr)
                return -ENOMEM;

        mr->ptr = ptr;
        mr->flags |= IO_REGION_F_VMAP;
        return 0;
}

static int io_region_pin_pages(struct io_mapped_region *mr,
                               struct io_uring_region_desc *reg)
{
        size_t size = io_region_size(mr);
        struct page **pages;
        int nr_pages;

        pages = io_pin_pages(reg->user_addr, size, &nr_pages);
        if (IS_ERR(pages))
                return PTR_ERR(pages);
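        /*
         * io_create_region() only accepts page-aligned user_addr and size,
         * so a successful pin must cover exactly mr->nr_pages pages; a
         * mismatch here indicates a bug in the caller.
         */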
        if (WARN_ON_ONCE(nr_pages != mr->nr_pages))
                return -EFAULT;

        mr->pages = pages;
        mr->flags |= IO_REGION_F_USER_PROVIDED;
        return 0;
}

static int io_region_allocate_pages(struct io_mapped_region *mr,
                                    struct io_uring_region_desc *reg,
                                    unsigned long mmap_offset)
{
        gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
        size_t size = io_region_size(mr);
        unsigned long nr_allocated;
        struct page **pages;

        pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp);
        if (!pages)
                return -ENOMEM;

        if (io_mem_alloc_compound(pages, mr->nr_pages, size, gfp)) {
                mr->flags |= IO_REGION_F_SINGLE_REF;
                goto done;
        }

        nr_allocated = alloc_pages_bulk_node(gfp, NUMA_NO_NODE,
                                             mr->nr_pages, pages);
        if (nr_allocated != mr->nr_pages) {
                if (nr_allocated)
                        release_pages(pages, nr_allocated);
                kvfree(pages);
                return -ENOMEM;
        }
done:
        reg->mmap_offset = mmap_offset;
        mr->pages = pages;
        return 0;
}

int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
                     struct io_uring_region_desc *reg,
                     unsigned long mmap_offset)
{
        int nr_pages, ret;
        u64 end;

        if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages))
                return -EFAULT;
        if (memchr_inv(&reg->__resv, 0, sizeof(reg->__resv)))
                return -EINVAL;
        if (reg->flags & ~IORING_MEM_REGION_TYPE_USER)
                return -EINVAL;
        /* user_addr should be set IFF it's a user memory backed region */
        if ((reg->flags & IORING_MEM_REGION_TYPE_USER) != !!reg->user_addr)
                return -EFAULT;
        if (!reg->size || reg->mmap_offset || reg->id)
                return -EINVAL;
        if ((reg->size >> PAGE_SHIFT) > INT_MAX)
                return -E2BIG;
        if ((reg->user_addr | reg->size) & ~PAGE_MASK)
                return -EINVAL;
        if (check_add_overflow(reg->user_addr, reg->size, &end))
                return -EOVERFLOW;

        nr_pages = reg->size >> PAGE_SHIFT;
        if (ctx->user) {
                ret = __io_account_mem(ctx->user, nr_pages);
                if (ret)
                        return ret;
        }
        mr->nr_pages = nr_pages;

        if (reg->flags & IORING_MEM_REGION_TYPE_USER)
                ret = io_region_pin_pages(mr, reg);
        else
                ret = io_region_allocate_pages(mr, reg, mmap_offset);
        if (ret)
                goto out_free;

        ret = io_region_init_ptr(mr);
        if (ret)
                goto out_free;
        return 0;
out_free:
        io_free_region(ctx->user, mr);
        return ret;
}

static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
                                                   loff_t pgoff)
{
        loff_t offset = pgoff << PAGE_SHIFT;
        unsigned int id;

        switch (offset & IORING_OFF_MMAP_MASK) {
        case IORING_OFF_SQ_RING:
        case IORING_OFF_CQ_RING:
                return &ctx->ring_region;
        case IORING_OFF_SQES:
                return &ctx->sq_region;
        case IORING_OFF_PBUF_RING:
                id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
                return io_pbuf_get_region(ctx, id);
        case IORING_MAP_OFF_PARAM_REGION:
                return &ctx->param_region;
        case IORING_MAP_OFF_ZCRX_REGION:
                id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_ZCRX_SHIFT;
                return io_zcrx_get_region(ctx, id);
        }
        return NULL;
}

static void *io_region_validate_mmap(struct io_ring_ctx *ctx,
                                     struct io_mapped_region *mr)
{
        lockdep_assert_held(&ctx->mmap_lock);

        if (!io_region_is_set(mr))
                return ERR_PTR(-EINVAL);
        if (mr->flags & IO_REGION_F_USER_PROVIDED)
                return ERR_PTR(-EINVAL);

        return io_region_get_ptr(mr);
}

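/*
 * Resolve an mmap offset to its backing region and return the kernel
 * address that may be exposed; user-provided (pinned) regions are
 * rejected here and are never handed back out via mmap.
 */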
static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff)
{
        struct io_ring_ctx *ctx = file->private_data;
        struct io_mapped_region *region;

        region = io_mmap_get_region(ctx, pgoff);
        if (!region)
                return ERR_PTR(-EINVAL);
        return io_region_validate_mmap(ctx, region);
}

#ifdef CONFIG_MMU

static int io_region_mmap(struct io_ring_ctx *ctx,
                          struct io_mapped_region *mr,
                          struct vm_area_struct *vma,
                          unsigned max_pages)
{
        unsigned long nr_pages = min(mr->nr_pages, max_pages);

        vm_flags_set(vma, VM_DONTEXPAND);
        return vm_insert_pages(vma, vma->vm_start, mr->pages, &nr_pages);
}

__cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct io_ring_ctx *ctx = file->private_data;
        size_t sz = vma->vm_end - vma->vm_start;
        long offset = vma->vm_pgoff << PAGE_SHIFT;
        unsigned int page_limit = UINT_MAX;
        struct io_mapped_region *region;
        void *ptr;

        guard(mutex)(&ctx->mmap_lock);

        ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff);
        if (IS_ERR(ptr))
                return PTR_ERR(ptr);

        switch (offset & IORING_OFF_MMAP_MASK) {
        case IORING_OFF_SQ_RING:
        case IORING_OFF_CQ_RING:
                page_limit = (sz + PAGE_SIZE - 1) >> PAGE_SHIFT;
                break;
        }

        region = io_mmap_get_region(ctx, vma->vm_pgoff);
        return io_region_mmap(ctx, region, vma, page_limit);
}

unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
                                         unsigned long len, unsigned long pgoff,
                                         unsigned long flags)
{
        struct io_ring_ctx *ctx = filp->private_data;
        void *ptr;

        /*
         * Do not allow mapping to a user-provided address, to avoid breaking
         * the aliasing rules. Userspace is not able to guess the address of
         * a kernel kmalloc()ed memory area.
         */
        if (addr)
                return -EINVAL;

        guard(mutex)(&ctx->mmap_lock);

        ptr = io_uring_validate_mmap_request(filp, pgoff);
        if (IS_ERR(ptr))
                return -ENOMEM;

        /*
         * Some architectures have strong cache aliasing requirements.
         * For such architectures we need a coherent mapping which aliases
         * kernel memory *and* userspace memory. To achieve that:
         * - use a NULL file pointer to reference physical memory, and
         * - use the kernel virtual address of the shared io_uring context
         *   (instead of the userspace-provided address, which has to be 0UL
         *   anyway).
         * - use the same pgoff which the get_unmapped_area() uses to
         *   calculate the page colouring.
         * For architectures without such aliasing requirements, the
         * architecture will return any suitable mapping because addr is 0.
         */
        filp = NULL;
        flags |= MAP_SHARED;
        pgoff = 0;      /* has been translated to ptr above */
#ifdef SHM_COLOUR
        addr = (uintptr_t) ptr;
        pgoff = addr >> PAGE_SHIFT;
#else
        addr = 0UL;
#endif
        return mm_get_unmapped_area(filp, addr, len, pgoff, flags);
}

#else /* !CONFIG_MMU */

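/*
 * Without an MMU the region cannot be remapped: get_unmapped_area() below
 * hands userspace the kernel address directly, so only shared mappings
 * make sense here.
 */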
int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
        return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
}

unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
{
        return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
}

unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
                                         unsigned long len, unsigned long pgoff,
                                         unsigned long flags)
{
        struct io_ring_ctx *ctx = file->private_data;
        void *ptr;

        guard(mutex)(&ctx->mmap_lock);

        ptr = io_uring_validate_mmap_request(file, pgoff);
        if (IS_ERR(ptr))
                return PTR_ERR(ptr);

        return (unsigned long) ptr;
}

#endif /* !CONFIG_MMU */