Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kvack.org/~bcrl/aio-next

Pull aio changes from Ben LaHaise:
"First off, sorry for this pull request being late in the merge window.
Al had raised a couple of concerns about 2 items in the series below.
I addressed the first issue (the race introduced by Gu's use of
mm_populate()), but he has not provided any further details on how he
wants to rework the anon_inode.c changes (which were sent out months
ago but have yet to be commented on).

The bulk of the changes have been sitting in the -next tree for a few
months, and all of the issues raised have been addressed."

* git://git.kvack.org/~bcrl/aio-next: (22 commits)
aio: rcu_read_lock protection for new rcu_dereference calls
aio: fix race in ring buffer page lookup introduced by page migration support
aio: fix rcu sparse warnings introduced by ioctx table lookup patch
aio: remove unnecessary debugging from aio_free_ring()
aio: table lookup: verify ctx pointer
staging/lustre: kiocb->ki_left is removed
aio: fix error handling and rcu usage in "convert the ioctx list to table lookup v3"
aio: be defensive to ensure request batching is non-zero instead of BUG_ON()
aio: convert the ioctx list to table lookup v3
aio: double aio_max_nr in calculations
aio: Kill ki_dtor
aio: Kill ki_users
aio: Kill unneeded kiocb members
aio: Kill aio_rw_vect_retry()
aio: Don't use ctx->tail unnecessarily
aio: io_cancel() no longer returns the io_event
aio: percpu ioctx refcount
aio: percpu reqs_available
aio: reqs_active -> reqs_available
aio: fix build when migration is disabled
...

+562 -313
+1 -1
drivers/staging/android/logger.c
··· 481 481 header.sec = now.tv_sec; 482 482 header.nsec = now.tv_nsec; 483 483 header.euid = current_euid(); 484 - header.len = min_t(size_t, iocb->ki_left, LOGGER_ENTRY_MAX_PAYLOAD); 484 + header.len = min_t(size_t, iocb->ki_nbytes, LOGGER_ENTRY_MAX_PAYLOAD); 485 485 header.hdr_size = sizeof(struct logger_entry); 486 486 487 487 /* null writes succeed, return zero */
+2 -2
drivers/staging/lustre/lustre/llite/file.c
··· 1009 1009 local_iov->iov_len = count; 1010 1010 init_sync_kiocb(kiocb, file); 1011 1011 kiocb->ki_pos = *ppos; 1012 - kiocb->ki_left = count; 1012 + kiocb->ki_nbytes = count; 1013 1013 1014 1014 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos); 1015 1015 *ppos = kiocb->ki_pos; ··· 1068 1068 local_iov->iov_len = count; 1069 1069 init_sync_kiocb(kiocb, file); 1070 1070 kiocb->ki_pos = *ppos; 1071 - kiocb->ki_left = count; 1071 + kiocb->ki_nbytes = count; 1072 1072 1073 1073 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos); 1074 1074 *ppos = kiocb->ki_pos;
+4 -5
drivers/usb/gadget/inode.c
··· 524 524 unsigned actual; 525 525 }; 526 526 527 - static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e) 527 + static int ep_aio_cancel(struct kiocb *iocb) 528 528 { 529 529 struct kiocb_priv *priv = iocb->private; 530 530 struct ep_data *epdata; ··· 540 540 // spin_unlock(&epdata->dev->lock); 541 541 local_irq_enable(); 542 542 543 - aio_put_req(iocb); 544 543 return value; 545 544 } 546 545 ··· 708 709 if (unlikely(usb_endpoint_dir_in(&epdata->desc))) 709 710 return -EINVAL; 710 711 711 - buf = kmalloc(iocb->ki_left, GFP_KERNEL); 712 + buf = kmalloc(iocb->ki_nbytes, GFP_KERNEL); 712 713 if (unlikely(!buf)) 713 714 return -ENOMEM; 714 715 715 - return ep_aio_rwtail(iocb, buf, iocb->ki_left, epdata, iov, nr_segs); 716 + return ep_aio_rwtail(iocb, buf, iocb->ki_nbytes, epdata, iov, nr_segs); 716 717 } 717 718 718 719 static ssize_t ··· 727 728 if (unlikely(!usb_endpoint_dir_in(&epdata->desc))) 728 729 return -EINVAL; 729 730 730 - buf = kmalloc(iocb->ki_left, GFP_KERNEL); 731 + buf = kmalloc(iocb->ki_nbytes, GFP_KERNEL); 731 732 if (unlikely(!buf)) 732 733 return -ENOMEM; 733 734
+467 -261
fs/aio.c
··· 26 26 #include <linux/mm.h> 27 27 #include <linux/mman.h> 28 28 #include <linux/mmu_context.h> 29 + #include <linux/percpu.h> 29 30 #include <linux/slab.h> 30 31 #include <linux/timer.h> 31 32 #include <linux/aio.h> ··· 36 35 #include <linux/eventfd.h> 37 36 #include <linux/blkdev.h> 38 37 #include <linux/compat.h> 38 + #include <linux/anon_inodes.h> 39 + #include <linux/migrate.h> 40 + #include <linux/ramfs.h> 41 + #include <linux/percpu-refcount.h> 39 42 40 43 #include <asm/kmap_types.h> 41 44 #include <asm/uaccess.h> ··· 66 61 67 62 #define AIO_RING_PAGES 8 68 63 64 + struct kioctx_table { 65 + struct rcu_head rcu; 66 + unsigned nr; 67 + struct kioctx *table[]; 68 + }; 69 + 70 + struct kioctx_cpu { 71 + unsigned reqs_available; 72 + }; 73 + 69 74 struct kioctx { 70 - atomic_t users; 75 + struct percpu_ref users; 71 76 atomic_t dead; 72 77 73 - /* This needs improving */ 74 78 unsigned long user_id; 75 - struct hlist_node list; 76 79 80 + struct __percpu kioctx_cpu *cpu; 81 + 82 + /* 83 + * For percpu reqs_available, number of slots we move to/from global 84 + * counter at a time: 85 + */ 86 + unsigned req_batch; 77 87 /* 78 88 * This is what userspace passed to io_setup(), it's not used for 79 89 * anything but counting against the global max_reqs quota. ··· 108 88 long nr_pages; 109 89 110 90 struct rcu_head rcu_head; 111 - struct work_struct rcu_work; 91 + struct work_struct free_work; 112 92 113 93 struct { 114 - atomic_t reqs_active; 94 + /* 95 + * This counts the number of available slots in the ringbuffer, 96 + * so we avoid overflowing it: it's decremented (if positive) 97 + * when allocating a kiocb and incremented when the resulting 98 + * io_event is pulled off the ringbuffer. 99 + * 100 + * We batch accesses to it with a percpu version. 
101 + */ 102 + atomic_t reqs_available; 115 103 } ____cacheline_aligned_in_smp; 116 104 117 105 struct { ··· 138 110 } ____cacheline_aligned_in_smp; 139 111 140 112 struct page *internal_pages[AIO_RING_PAGES]; 113 + struct file *aio_ring_file; 114 + 115 + unsigned id; 141 116 }; 142 117 143 118 /*------ sysctl variables----*/ ··· 169 138 170 139 static void aio_free_ring(struct kioctx *ctx) 171 140 { 172 - long i; 141 + int i; 142 + struct file *aio_ring_file = ctx->aio_ring_file; 173 143 174 - for (i = 0; i < ctx->nr_pages; i++) 144 + for (i = 0; i < ctx->nr_pages; i++) { 145 + pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, 146 + page_count(ctx->ring_pages[i])); 175 147 put_page(ctx->ring_pages[i]); 148 + } 176 149 177 150 if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) 178 151 kfree(ctx->ring_pages); 152 + 153 + if (aio_ring_file) { 154 + truncate_setsize(aio_ring_file->f_inode, 0); 155 + fput(aio_ring_file); 156 + ctx->aio_ring_file = NULL; 157 + } 179 158 } 159 + 160 + static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) 161 + { 162 + vma->vm_ops = &generic_file_vm_ops; 163 + return 0; 164 + } 165 + 166 + static const struct file_operations aio_ring_fops = { 167 + .mmap = aio_ring_mmap, 168 + }; 169 + 170 + static int aio_set_page_dirty(struct page *page) 171 + { 172 + return 0; 173 + } 174 + 175 + #if IS_ENABLED(CONFIG_MIGRATION) 176 + static int aio_migratepage(struct address_space *mapping, struct page *new, 177 + struct page *old, enum migrate_mode mode) 178 + { 179 + struct kioctx *ctx = mapping->private_data; 180 + unsigned long flags; 181 + unsigned idx = old->index; 182 + int rc; 183 + 184 + /* Writeback must be complete */ 185 + BUG_ON(PageWriteback(old)); 186 + put_page(old); 187 + 188 + rc = migrate_page_move_mapping(mapping, new, old, NULL, mode); 189 + if (rc != MIGRATEPAGE_SUCCESS) { 190 + get_page(old); 191 + return rc; 192 + } 193 + 194 + get_page(new); 195 + 196 + 
spin_lock_irqsave(&ctx->completion_lock, flags); 197 + migrate_page_copy(new, old); 198 + ctx->ring_pages[idx] = new; 199 + spin_unlock_irqrestore(&ctx->completion_lock, flags); 200 + 201 + return rc; 202 + } 203 + #endif 204 + 205 + static const struct address_space_operations aio_ctx_aops = { 206 + .set_page_dirty = aio_set_page_dirty, 207 + #if IS_ENABLED(CONFIG_MIGRATION) 208 + .migratepage = aio_migratepage, 209 + #endif 210 + }; 180 211 181 212 static int aio_setup_ring(struct kioctx *ctx) 182 213 { ··· 247 154 struct mm_struct *mm = current->mm; 248 155 unsigned long size, populate; 249 156 int nr_pages; 157 + int i; 158 + struct file *file; 250 159 251 160 /* Compensate for the ring buffer's head/tail overlap entry */ 252 161 nr_events += 2; /* 1 is required, 2 for good luck */ 253 162 254 163 size = sizeof(struct aio_ring); 255 164 size += sizeof(struct io_event) * nr_events; 256 - nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; 257 165 166 + nr_pages = PFN_UP(size); 258 167 if (nr_pages < 0) 259 168 return -EINVAL; 260 169 261 - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); 170 + file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR); 171 + if (IS_ERR(file)) { 172 + ctx->aio_ring_file = NULL; 173 + return -EAGAIN; 174 + } 262 175 263 - ctx->nr_events = 0; 176 + file->f_inode->i_mapping->a_ops = &aio_ctx_aops; 177 + file->f_inode->i_mapping->private_data = ctx; 178 + file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages; 179 + 180 + for (i = 0; i < nr_pages; i++) { 181 + struct page *page; 182 + page = find_or_create_page(file->f_inode->i_mapping, 183 + i, GFP_HIGHUSER | __GFP_ZERO); 184 + if (!page) 185 + break; 186 + pr_debug("pid(%d) page[%d]->count=%d\n", 187 + current->pid, i, page_count(page)); 188 + SetPageUptodate(page); 189 + SetPageDirty(page); 190 + unlock_page(page); 191 + } 192 + ctx->aio_ring_file = file; 193 + nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) 194 + / 
sizeof(struct io_event); 195 + 264 196 ctx->ring_pages = ctx->internal_pages; 265 197 if (nr_pages > AIO_RING_PAGES) { 266 198 ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), ··· 296 178 297 179 ctx->mmap_size = nr_pages * PAGE_SIZE; 298 180 pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); 181 + 299 182 down_write(&mm->mmap_sem); 300 - ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size, 301 - PROT_READ|PROT_WRITE, 302 - MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate); 183 + ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, 184 + PROT_READ | PROT_WRITE, 185 + MAP_SHARED | MAP_POPULATE, 0, &populate); 303 186 if (IS_ERR((void *)ctx->mmap_base)) { 304 187 up_write(&mm->mmap_sem); 305 188 ctx->mmap_size = 0; ··· 309 190 } 310 191 311 192 pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); 193 + 194 + /* We must do this while still holding mmap_sem for write, as we 195 + * need to be protected against userspace attempting to mremap() 196 + * or munmap() the ring buffer. 197 + */ 312 198 ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, 313 199 1, 0, ctx->ring_pages, NULL); 200 + 201 + /* Dropping the reference here is safe as the page cache will hold 202 + * onto the pages for us. It is also required so that page migration 203 + * can unmap the pages and get the right reference count. 
204 + */ 205 + for (i = 0; i < ctx->nr_pages; i++) 206 + put_page(ctx->ring_pages[i]); 207 + 314 208 up_write(&mm->mmap_sem); 315 209 316 210 if (unlikely(ctx->nr_pages != nr_pages)) { 317 211 aio_free_ring(ctx); 318 212 return -EAGAIN; 319 213 } 320 - if (populate) 321 - mm_populate(ctx->mmap_base, populate); 322 214 323 215 ctx->user_id = ctx->mmap_base; 324 216 ctx->nr_events = nr_events; /* trusted copy */ 325 217 326 218 ring = kmap_atomic(ctx->ring_pages[0]); 327 219 ring->nr = nr_events; /* user copy */ 328 - ring->id = ctx->user_id; 220 + ring->id = ~0U; 329 221 ring->head = ring->tail = 0; 330 222 ring->magic = AIO_RING_MAGIC; 331 223 ring->compat_features = AIO_RING_COMPAT_FEATURES; ··· 368 238 } 369 239 EXPORT_SYMBOL(kiocb_set_cancel_fn); 370 240 371 - static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, 372 - struct io_event *res) 241 + static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb) 373 242 { 374 243 kiocb_cancel_fn *old, *cancel; 375 - int ret = -EINVAL; 376 244 377 245 /* 378 246 * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it ··· 380 252 cancel = ACCESS_ONCE(kiocb->ki_cancel); 381 253 do { 382 254 if (!cancel || cancel == KIOCB_CANCELLED) 383 - return ret; 255 + return -EINVAL; 384 256 385 257 old = cancel; 386 258 cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); 387 259 } while (cancel != old); 388 260 389 - atomic_inc(&kiocb->ki_users); 390 - spin_unlock_irq(&ctx->ctx_lock); 391 - 392 - memset(res, 0, sizeof(*res)); 393 - res->obj = (u64)(unsigned long)kiocb->ki_obj.user; 394 - res->data = kiocb->ki_user_data; 395 - ret = cancel(kiocb, res); 396 - 397 - spin_lock_irq(&ctx->ctx_lock); 398 - 399 - return ret; 261 + return cancel(kiocb); 400 262 } 401 263 402 264 static void free_ioctx_rcu(struct rcu_head *head) 403 265 { 404 266 struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); 267 + 268 + free_percpu(ctx->cpu); 405 269 kmem_cache_free(kioctx_cachep, ctx); 406 270 } 407 271 ··· 
402 282 * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - 403 283 * now it's safe to cancel any that need to be. 404 284 */ 405 - static void free_ioctx(struct kioctx *ctx) 285 + static void free_ioctx(struct work_struct *work) 406 286 { 287 + struct kioctx *ctx = container_of(work, struct kioctx, free_work); 407 288 struct aio_ring *ring; 408 - struct io_event res; 409 289 struct kiocb *req; 410 - unsigned head, avail; 290 + unsigned cpu, avail; 291 + DEFINE_WAIT(wait); 411 292 412 293 spin_lock_irq(&ctx->ctx_lock); 413 294 ··· 417 296 struct kiocb, ki_list); 418 297 419 298 list_del_init(&req->ki_list); 420 - kiocb_cancel(ctx, req, &res); 299 + kiocb_cancel(ctx, req); 421 300 } 422 301 423 302 spin_unlock_irq(&ctx->ctx_lock); 424 303 425 - ring = kmap_atomic(ctx->ring_pages[0]); 426 - head = ring->head; 427 - kunmap_atomic(ring); 304 + for_each_possible_cpu(cpu) { 305 + struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu); 428 306 429 - while (atomic_read(&ctx->reqs_active) > 0) { 430 - wait_event(ctx->wait, 431 - head != ctx->tail || 432 - atomic_read(&ctx->reqs_active) <= 0); 433 - 434 - avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; 435 - 436 - atomic_sub(avail, &ctx->reqs_active); 437 - head += avail; 438 - head %= ctx->nr_events; 307 + atomic_add(kcpu->reqs_available, &ctx->reqs_available); 308 + kcpu->reqs_available = 0; 439 309 } 440 310 441 - WARN_ON(atomic_read(&ctx->reqs_active) < 0); 311 + while (1) { 312 + prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE); 313 + 314 + ring = kmap_atomic(ctx->ring_pages[0]); 315 + avail = (ring->head <= ring->tail) 316 + ? 
ring->tail - ring->head 317 + : ctx->nr_events - ring->head + ring->tail; 318 + 319 + atomic_add(avail, &ctx->reqs_available); 320 + ring->head = ring->tail; 321 + kunmap_atomic(ring); 322 + 323 + if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1) 324 + break; 325 + 326 + schedule(); 327 + } 328 + finish_wait(&ctx->wait, &wait); 329 + 330 + WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1); 442 331 443 332 aio_free_ring(ctx); 444 333 ··· 464 333 call_rcu(&ctx->rcu_head, free_ioctx_rcu); 465 334 } 466 335 467 - static void put_ioctx(struct kioctx *ctx) 336 + static void free_ioctx_ref(struct percpu_ref *ref) 468 337 { 469 - if (unlikely(atomic_dec_and_test(&ctx->users))) 470 - free_ioctx(ctx); 338 + struct kioctx *ctx = container_of(ref, struct kioctx, users); 339 + 340 + INIT_WORK(&ctx->free_work, free_ioctx); 341 + schedule_work(&ctx->free_work); 342 + } 343 + 344 + static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) 345 + { 346 + unsigned i, new_nr; 347 + struct kioctx_table *table, *old; 348 + struct aio_ring *ring; 349 + 350 + spin_lock(&mm->ioctx_lock); 351 + rcu_read_lock(); 352 + table = rcu_dereference(mm->ioctx_table); 353 + 354 + while (1) { 355 + if (table) 356 + for (i = 0; i < table->nr; i++) 357 + if (!table->table[i]) { 358 + ctx->id = i; 359 + table->table[i] = ctx; 360 + rcu_read_unlock(); 361 + spin_unlock(&mm->ioctx_lock); 362 + 363 + ring = kmap_atomic(ctx->ring_pages[0]); 364 + ring->id = ctx->id; 365 + kunmap_atomic(ring); 366 + return 0; 367 + } 368 + 369 + new_nr = (table ? 
table->nr : 1) * 4; 370 + 371 + rcu_read_unlock(); 372 + spin_unlock(&mm->ioctx_lock); 373 + 374 + table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * 375 + new_nr, GFP_KERNEL); 376 + if (!table) 377 + return -ENOMEM; 378 + 379 + table->nr = new_nr; 380 + 381 + spin_lock(&mm->ioctx_lock); 382 + rcu_read_lock(); 383 + old = rcu_dereference(mm->ioctx_table); 384 + 385 + if (!old) { 386 + rcu_assign_pointer(mm->ioctx_table, table); 387 + } else if (table->nr > old->nr) { 388 + memcpy(table->table, old->table, 389 + old->nr * sizeof(struct kioctx *)); 390 + 391 + rcu_assign_pointer(mm->ioctx_table, table); 392 + kfree_rcu(old, rcu); 393 + } else { 394 + kfree(table); 395 + table = old; 396 + } 397 + } 471 398 } 472 399 473 400 /* ioctx_alloc ··· 537 348 struct kioctx *ctx; 538 349 int err = -ENOMEM; 539 350 351 + /* 352 + * We keep track of the number of available ringbuffer slots, to prevent 353 + * overflow (reqs_available), and we also use percpu counters for this. 354 + * 355 + * So since up to half the slots might be on other cpu's percpu counters 356 + * and unavailable, double nr_events so userspace sees what they 357 + * expected: additionally, we move req_batch slots to/from percpu 358 + * counters at a time, so make sure that isn't 0: 359 + */ 360 + nr_events = max(nr_events, num_possible_cpus() * 4); 361 + nr_events *= 2; 362 + 540 363 /* Prevent overflows */ 541 364 if ((nr_events > (0x10000000U / sizeof(struct io_event))) || 542 365 (nr_events > (0x10000000U / sizeof(struct kiocb)))) { ··· 556 355 return ERR_PTR(-EINVAL); 557 356 } 558 357 559 - if (!nr_events || (unsigned long)nr_events > aio_max_nr) 358 + if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL)) 560 359 return ERR_PTR(-EAGAIN); 561 360 562 361 ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); ··· 565 364 566 365 ctx->max_reqs = nr_events; 567 366 568 - atomic_set(&ctx->users, 2); 569 - atomic_set(&ctx->dead, 0); 367 + if (percpu_ref_init(&ctx->users, free_ioctx_ref)) 368 
+ goto out_freectx; 369 + 570 370 spin_lock_init(&ctx->ctx_lock); 571 371 spin_lock_init(&ctx->completion_lock); 572 372 mutex_init(&ctx->ring_lock); ··· 575 373 576 374 INIT_LIST_HEAD(&ctx->active_reqs); 577 375 376 + ctx->cpu = alloc_percpu(struct kioctx_cpu); 377 + if (!ctx->cpu) 378 + goto out_freeref; 379 + 578 380 if (aio_setup_ring(ctx) < 0) 579 - goto out_freectx; 381 + goto out_freepcpu; 382 + 383 + atomic_set(&ctx->reqs_available, ctx->nr_events - 1); 384 + ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); 385 + if (ctx->req_batch < 1) 386 + ctx->req_batch = 1; 580 387 581 388 /* limit the number of system wide aios */ 582 389 spin_lock(&aio_nr_lock); 583 - if (aio_nr + nr_events > aio_max_nr || 390 + if (aio_nr + nr_events > (aio_max_nr * 2UL) || 584 391 aio_nr + nr_events < aio_nr) { 585 392 spin_unlock(&aio_nr_lock); 586 393 goto out_cleanup; ··· 597 386 aio_nr += ctx->max_reqs; 598 387 spin_unlock(&aio_nr_lock); 599 388 600 - /* now link into global list. */ 601 - spin_lock(&mm->ioctx_lock); 602 - hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); 603 - spin_unlock(&mm->ioctx_lock); 389 + percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ 390 + 391 + err = ioctx_add_table(ctx, mm); 392 + if (err) 393 + goto out_cleanup_put; 604 394 605 395 pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", 606 396 ctx, ctx->user_id, mm, ctx->nr_events); 607 397 return ctx; 608 398 399 + out_cleanup_put: 400 + percpu_ref_put(&ctx->users); 609 401 out_cleanup: 610 402 err = -EAGAIN; 611 403 aio_free_ring(ctx); 404 + out_freepcpu: 405 + free_percpu(ctx->cpu); 406 + out_freeref: 407 + free_percpu(ctx->users.pcpu_count); 612 408 out_freectx: 409 + if (ctx->aio_ring_file) 410 + fput(ctx->aio_ring_file); 613 411 kmem_cache_free(kioctx_cachep, ctx); 614 412 pr_debug("error allocating ioctx %d\n", err); 615 413 return ERR_PTR(err); 616 - } 617 - 618 - static void kill_ioctx_work(struct work_struct *work) 619 - { 620 - struct kioctx *ctx = 
container_of(work, struct kioctx, rcu_work); 621 - 622 - wake_up_all(&ctx->wait); 623 - put_ioctx(ctx); 624 - } 625 - 626 - static void kill_ioctx_rcu(struct rcu_head *head) 627 - { 628 - struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); 629 - 630 - INIT_WORK(&ctx->rcu_work, kill_ioctx_work); 631 - schedule_work(&ctx->rcu_work); 632 414 } 633 415 634 416 /* kill_ioctx ··· 629 425 * when the processes owning a context have all exited to encourage 630 426 * the rapid destruction of the kioctx. 631 427 */ 632 - static void kill_ioctx(struct kioctx *ctx) 428 + static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) 633 429 { 634 430 if (!atomic_xchg(&ctx->dead, 1)) { 635 - hlist_del_rcu(&ctx->list); 431 + struct kioctx_table *table; 432 + 433 + spin_lock(&mm->ioctx_lock); 434 + rcu_read_lock(); 435 + table = rcu_dereference(mm->ioctx_table); 436 + 437 + WARN_ON(ctx != table->table[ctx->id]); 438 + table->table[ctx->id] = NULL; 439 + rcu_read_unlock(); 440 + spin_unlock(&mm->ioctx_lock); 441 + 442 + /* percpu_ref_kill() will do the necessary call_rcu() */ 443 + wake_up_all(&ctx->wait); 636 444 637 445 /* 638 446 * It'd be more correct to do this in free_ioctx(), after all ··· 661 445 if (ctx->mmap_size) 662 446 vm_munmap(ctx->mmap_base, ctx->mmap_size); 663 447 664 - /* Between hlist_del_rcu() and dropping the initial ref */ 665 - call_rcu(&ctx->rcu_head, kill_ioctx_rcu); 448 + percpu_ref_kill(&ctx->users); 666 449 } 667 450 } 668 451 669 452 /* wait_on_sync_kiocb: 670 453 * Waits on the given sync kiocb to complete. 
671 454 */ 672 - ssize_t wait_on_sync_kiocb(struct kiocb *iocb) 455 + ssize_t wait_on_sync_kiocb(struct kiocb *req) 673 456 { 674 - while (atomic_read(&iocb->ki_users)) { 457 + while (!req->ki_ctx) { 675 458 set_current_state(TASK_UNINTERRUPTIBLE); 676 - if (!atomic_read(&iocb->ki_users)) 459 + if (req->ki_ctx) 677 460 break; 678 461 io_schedule(); 679 462 } 680 463 __set_current_state(TASK_RUNNING); 681 - return iocb->ki_user_data; 464 + return req->ki_user_data; 682 465 } 683 466 EXPORT_SYMBOL(wait_on_sync_kiocb); 684 467 ··· 691 476 */ 692 477 void exit_aio(struct mm_struct *mm) 693 478 { 479 + struct kioctx_table *table; 694 480 struct kioctx *ctx; 695 - struct hlist_node *n; 481 + unsigned i = 0; 696 482 697 - hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) { 698 - if (1 != atomic_read(&ctx->users)) 699 - printk(KERN_DEBUG 700 - "exit_aio:ioctx still alive: %d %d %d\n", 701 - atomic_read(&ctx->users), 702 - atomic_read(&ctx->dead), 703 - atomic_read(&ctx->reqs_active)); 483 + while (1) { 484 + rcu_read_lock(); 485 + table = rcu_dereference(mm->ioctx_table); 486 + 487 + do { 488 + if (!table || i >= table->nr) { 489 + rcu_read_unlock(); 490 + rcu_assign_pointer(mm->ioctx_table, NULL); 491 + if (table) 492 + kfree(table); 493 + return; 494 + } 495 + 496 + ctx = table->table[i++]; 497 + } while (!ctx); 498 + 499 + rcu_read_unlock(); 500 + 704 501 /* 705 502 * We don't need to bother with munmap() here - 706 503 * exit_mmap(mm) is coming and it'll unmap everything. 
··· 723 496 */ 724 497 ctx->mmap_size = 0; 725 498 726 - kill_ioctx(ctx); 499 + kill_ioctx(mm, ctx); 727 500 } 728 501 } 729 502 503 + static void put_reqs_available(struct kioctx *ctx, unsigned nr) 504 + { 505 + struct kioctx_cpu *kcpu; 506 + 507 + preempt_disable(); 508 + kcpu = this_cpu_ptr(ctx->cpu); 509 + 510 + kcpu->reqs_available += nr; 511 + while (kcpu->reqs_available >= ctx->req_batch * 2) { 512 + kcpu->reqs_available -= ctx->req_batch; 513 + atomic_add(ctx->req_batch, &ctx->reqs_available); 514 + } 515 + 516 + preempt_enable(); 517 + } 518 + 519 + static bool get_reqs_available(struct kioctx *ctx) 520 + { 521 + struct kioctx_cpu *kcpu; 522 + bool ret = false; 523 + 524 + preempt_disable(); 525 + kcpu = this_cpu_ptr(ctx->cpu); 526 + 527 + if (!kcpu->reqs_available) { 528 + int old, avail = atomic_read(&ctx->reqs_available); 529 + 530 + do { 531 + if (avail < ctx->req_batch) 532 + goto out; 533 + 534 + old = avail; 535 + avail = atomic_cmpxchg(&ctx->reqs_available, 536 + avail, avail - ctx->req_batch); 537 + } while (avail != old); 538 + 539 + kcpu->reqs_available += ctx->req_batch; 540 + } 541 + 542 + ret = true; 543 + kcpu->reqs_available--; 544 + out: 545 + preempt_enable(); 546 + return ret; 547 + } 548 + 730 549 /* aio_get_req 731 - * Allocate a slot for an aio request. Increments the ki_users count 732 - * of the kioctx so that the kioctx stays around until all requests are 733 - * complete. Returns NULL if no requests are free. 734 - * 735 - * Returns with kiocb->ki_users set to 2. The io submit code path holds 736 - * an extra reference while submitting the i/o. 737 - * This prevents races between the aio code path referencing the 738 - * req (after submitting it) and aio_complete() freeing the req. 550 + * Allocate a slot for an aio request. 551 + * Returns NULL if no requests are free. 
739 552 */ 740 553 static inline struct kiocb *aio_get_req(struct kioctx *ctx) 741 554 { 742 555 struct kiocb *req; 743 556 744 - if (atomic_read(&ctx->reqs_active) >= ctx->nr_events) 557 + if (!get_reqs_available(ctx)) 745 558 return NULL; 746 - 747 - if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1) 748 - goto out_put; 749 559 750 560 req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); 751 561 if (unlikely(!req)) 752 562 goto out_put; 753 563 754 - atomic_set(&req->ki_users, 2); 755 564 req->ki_ctx = ctx; 756 - 757 565 return req; 758 566 out_put: 759 - atomic_dec(&ctx->reqs_active); 567 + put_reqs_available(ctx, 1); 760 568 return NULL; 761 569 } 762 570 ··· 801 539 fput(req->ki_filp); 802 540 if (req->ki_eventfd != NULL) 803 541 eventfd_ctx_put(req->ki_eventfd); 804 - if (req->ki_dtor) 805 - req->ki_dtor(req); 806 - if (req->ki_iovec != &req->ki_inline_vec) 807 - kfree(req->ki_iovec); 808 542 kmem_cache_free(kiocb_cachep, req); 809 543 } 810 544 811 - void aio_put_req(struct kiocb *req) 812 - { 813 - if (atomic_dec_and_test(&req->ki_users)) 814 - kiocb_free(req); 815 - } 816 - EXPORT_SYMBOL(aio_put_req); 817 - 818 545 static struct kioctx *lookup_ioctx(unsigned long ctx_id) 819 546 { 547 + struct aio_ring __user *ring = (void __user *)ctx_id; 820 548 struct mm_struct *mm = current->mm; 821 549 struct kioctx *ctx, *ret = NULL; 550 + struct kioctx_table *table; 551 + unsigned id; 552 + 553 + if (get_user(id, &ring->id)) 554 + return NULL; 822 555 823 556 rcu_read_lock(); 557 + table = rcu_dereference(mm->ioctx_table); 824 558 825 - hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { 826 - if (ctx->user_id == ctx_id) { 827 - atomic_inc(&ctx->users); 828 - ret = ctx; 829 - break; 830 - } 559 + if (!table || id >= table->nr) 560 + goto out; 561 + 562 + ctx = table->table[id]; 563 + if (ctx && ctx->user_id == ctx_id) { 564 + percpu_ref_get(&ctx->users); 565 + ret = ctx; 831 566 } 832 - 567 + out: 833 568 rcu_read_unlock(); 834 569 return ret; 
835 570 } ··· 850 591 * - the sync task helpfully left a reference to itself in the iocb 851 592 */ 852 593 if (is_sync_kiocb(iocb)) { 853 - BUG_ON(atomic_read(&iocb->ki_users) != 1); 854 594 iocb->ki_user_data = res; 855 - atomic_set(&iocb->ki_users, 0); 595 + smp_wmb(); 596 + iocb->ki_ctx = ERR_PTR(-EXDEV); 856 597 wake_up_process(iocb->ki_obj.tsk); 857 598 return; 858 599 } 859 600 860 601 /* 861 602 * Take rcu_read_lock() in case the kioctx is being destroyed, as we 862 - * need to issue a wakeup after decrementing reqs_active. 603 + * need to issue a wakeup after incrementing reqs_available. 863 604 */ 864 605 rcu_read_lock(); 865 606 ··· 869 610 spin_lock_irqsave(&ctx->ctx_lock, flags); 870 611 list_del(&iocb->ki_list); 871 612 spin_unlock_irqrestore(&ctx->ctx_lock, flags); 872 - } 873 - 874 - /* 875 - * cancelled requests don't get events, userland was given one 876 - * when the event got cancelled. 877 - */ 878 - if (unlikely(xchg(&iocb->ki_cancel, 879 - KIOCB_CANCELLED) == KIOCB_CANCELLED)) { 880 - atomic_dec(&ctx->reqs_active); 881 - /* Still need the wake_up in case free_ioctx is waiting */ 882 - goto put_rq; 883 613 } 884 614 885 615 /* ··· 923 675 if (iocb->ki_eventfd != NULL) 924 676 eventfd_signal(iocb->ki_eventfd, 1); 925 677 926 - put_rq: 927 678 /* everything turned out well, dispose of the aiocb. 
*/ 928 - aio_put_req(iocb); 679 + kiocb_free(iocb); 929 680 930 681 /* 931 682 * We have to order our ring_info tail store above and test ··· 949 702 struct io_event __user *event, long nr) 950 703 { 951 704 struct aio_ring *ring; 952 - unsigned head, pos; 705 + unsigned head, tail, pos; 953 706 long ret = 0; 954 707 int copy_ret; 955 708 ··· 957 710 958 711 ring = kmap_atomic(ctx->ring_pages[0]); 959 712 head = ring->head; 713 + tail = ring->tail; 960 714 kunmap_atomic(ring); 961 715 962 - pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events); 716 + pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events); 963 717 964 - if (head == ctx->tail) 718 + if (head == tail) 965 719 goto out; 966 720 967 721 while (ret < nr) { ··· 970 722 struct io_event *ev; 971 723 struct page *page; 972 724 973 - avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; 974 - if (head == ctx->tail) 725 + avail = (head <= tail ? tail : ctx->nr_events) - head; 726 + if (head == tail) 975 727 break; 976 728 977 729 avail = min(avail, nr - ret); ··· 1002 754 kunmap_atomic(ring); 1003 755 flush_dcache_page(ctx->ring_pages[0]); 1004 756 1005 - pr_debug("%li h%u t%u\n", ret, head, ctx->tail); 757 + pr_debug("%li h%u t%u\n", ret, head, tail); 1006 758 1007 - atomic_sub(ret, &ctx->reqs_active); 759 + put_reqs_available(ctx, ret); 1008 760 out: 1009 761 mutex_unlock(&ctx->ring_lock); 1010 762 ··· 1102 854 if (!IS_ERR(ioctx)) { 1103 855 ret = put_user(ioctx->user_id, ctxp); 1104 856 if (ret) 1105 - kill_ioctx(ioctx); 1106 - put_ioctx(ioctx); 857 + kill_ioctx(current->mm, ioctx); 858 + percpu_ref_put(&ioctx->users); 1107 859 } 1108 860 1109 861 out: ··· 1120 872 { 1121 873 struct kioctx *ioctx = lookup_ioctx(ctx); 1122 874 if (likely(NULL != ioctx)) { 1123 - kill_ioctx(ioctx); 1124 - put_ioctx(ioctx); 875 + kill_ioctx(current->mm, ioctx); 876 + percpu_ref_put(&ioctx->users); 1125 877 return 0; 1126 878 } 1127 879 pr_debug("EINVAL: io_destroy: invalid context id\n"); 1128 880 return 
-EINVAL; 1129 881 } 1130 882 1131 - static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) 1132 - { 1133 - struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg]; 1134 - 1135 - BUG_ON(ret <= 0); 1136 - 1137 - while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) { 1138 - ssize_t this = min((ssize_t)iov->iov_len, ret); 1139 - iov->iov_base += this; 1140 - iov->iov_len -= this; 1141 - iocb->ki_left -= this; 1142 - ret -= this; 1143 - if (iov->iov_len == 0) { 1144 - iocb->ki_cur_seg++; 1145 - iov++; 1146 - } 1147 - } 1148 - 1149 - /* the caller should not have done more io than what fit in 1150 - * the remaining iovecs */ 1151 - BUG_ON(ret > 0 && iocb->ki_left == 0); 1152 - } 1153 - 1154 883 typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, 1155 884 unsigned long, loff_t); 1156 885 1157 - static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op) 1158 - { 1159 - struct file *file = iocb->ki_filp; 1160 - struct address_space *mapping = file->f_mapping; 1161 - struct inode *inode = mapping->host; 1162 - ssize_t ret = 0; 1163 - 1164 - /* This matches the pread()/pwrite() logic */ 1165 - if (iocb->ki_pos < 0) 1166 - return -EINVAL; 1167 - 1168 - if (rw == WRITE) 1169 - file_start_write(file); 1170 - do { 1171 - ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], 1172 - iocb->ki_nr_segs - iocb->ki_cur_seg, 1173 - iocb->ki_pos); 1174 - if (ret > 0) 1175 - aio_advance_iovec(iocb, ret); 1176 - 1177 - /* retry all partial writes. retry partial reads as long as its a 1178 - * regular file. 
*/ 1179 - } while (ret > 0 && iocb->ki_left > 0 && 1180 - (rw == WRITE || 1181 - (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); 1182 - if (rw == WRITE) 1183 - file_end_write(file); 1184 - 1185 - /* This means we must have transferred all that we could */ 1186 - /* No need to retry anymore */ 1187 - if ((ret == 0) || (iocb->ki_left == 0)) 1188 - ret = iocb->ki_nbytes - iocb->ki_left; 1189 - 1190 - /* If we managed to write some out we return that, rather than 1191 - * the eventual error. */ 1192 - if (rw == WRITE 1193 - && ret < 0 && ret != -EIOCBQUEUED 1194 - && iocb->ki_nbytes - iocb->ki_left) 1195 - ret = iocb->ki_nbytes - iocb->ki_left; 1196 - 1197 - return ret; 1198 - } 1199 - 1200 - static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) 886 + static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, 887 + int rw, char __user *buf, 888 + unsigned long *nr_segs, 889 + struct iovec **iovec, 890 + bool compat) 1201 891 { 1202 892 ssize_t ret; 1203 893 1204 - kiocb->ki_nr_segs = kiocb->ki_nbytes; 894 + *nr_segs = kiocb->ki_nbytes; 1205 895 1206 896 #ifdef CONFIG_COMPAT 1207 897 if (compat) 1208 898 ret = compat_rw_copy_check_uvector(rw, 1209 - (struct compat_iovec __user *)kiocb->ki_buf, 1210 - kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, 1211 - &kiocb->ki_iovec); 899 + (struct compat_iovec __user *)buf, 900 + *nr_segs, 1, *iovec, iovec); 1212 901 else 1213 902 #endif 1214 903 ret = rw_copy_check_uvector(rw, 1215 - (struct iovec __user *)kiocb->ki_buf, 1216 - kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, 1217 - &kiocb->ki_iovec); 904 + (struct iovec __user *)buf, 905 + *nr_segs, 1, *iovec, iovec); 1218 906 if (ret < 0) 1219 907 return ret; 1220 908 ··· 1159 975 return 0; 1160 976 } 1161 977 1162 - static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) 978 + static ssize_t aio_setup_single_vector(struct kiocb *kiocb, 979 + int rw, char __user *buf, 980 + unsigned long *nr_segs, 981 + struct iovec *iovec) 1163 982 { 1164 
- if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes))) 983 + if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes))) 1165 984 return -EFAULT; 1166 985 1167 - kiocb->ki_iovec = &kiocb->ki_inline_vec; 1168 - kiocb->ki_iovec->iov_base = kiocb->ki_buf; 1169 - kiocb->ki_iovec->iov_len = kiocb->ki_nbytes; 1170 - kiocb->ki_nr_segs = 1; 986 + iovec->iov_base = buf; 987 + iovec->iov_len = kiocb->ki_nbytes; 988 + *nr_segs = 1; 1171 989 return 0; 1172 990 } 1173 991 ··· 1178 992 * Performs the initial checks and aio retry method 1179 993 * setup for the kiocb at the time of io submission. 1180 994 */ 1181 - static ssize_t aio_run_iocb(struct kiocb *req, bool compat) 995 + static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, 996 + char __user *buf, bool compat) 1182 997 { 1183 998 struct file *file = req->ki_filp; 1184 999 ssize_t ret; 1000 + unsigned long nr_segs; 1185 1001 int rw; 1186 1002 fmode_t mode; 1187 1003 aio_rw_op *rw_op; 1004 + struct iovec inline_vec, *iovec = &inline_vec; 1188 1005 1189 - switch (req->ki_opcode) { 1006 + switch (opcode) { 1190 1007 case IOCB_CMD_PREAD: 1191 1008 case IOCB_CMD_PREADV: 1192 1009 mode = FMODE_READ; ··· 1210 1021 if (!rw_op) 1211 1022 return -EINVAL; 1212 1023 1213 - ret = (req->ki_opcode == IOCB_CMD_PREADV || 1214 - req->ki_opcode == IOCB_CMD_PWRITEV) 1215 - ? aio_setup_vectored_rw(rw, req, compat) 1216 - : aio_setup_single_vector(rw, req); 1024 + ret = (opcode == IOCB_CMD_PREADV || 1025 + opcode == IOCB_CMD_PWRITEV) 1026 + ? 
aio_setup_vectored_rw(req, rw, buf, &nr_segs, 1027 + &iovec, compat) 1028 + : aio_setup_single_vector(req, rw, buf, &nr_segs, 1029 + iovec); 1217 1030 if (ret) 1218 1031 return ret; 1219 1032 1220 1033 ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); 1221 - if (ret < 0) 1034 + if (ret < 0) { 1035 + if (iovec != &inline_vec) 1036 + kfree(iovec); 1222 1037 return ret; 1038 + } 1223 1039 1224 1040 req->ki_nbytes = ret; 1225 - req->ki_left = ret; 1226 1041 1227 - ret = aio_rw_vect_retry(req, rw, rw_op); 1042 + /* XXX: move/kill - rw_verify_area()? */ 1043 + /* This matches the pread()/pwrite() logic */ 1044 + if (req->ki_pos < 0) { 1045 + ret = -EINVAL; 1046 + break; 1047 + } 1048 + 1049 + if (rw == WRITE) 1050 + file_start_write(file); 1051 + 1052 + ret = rw_op(req, iovec, nr_segs, req->ki_pos); 1053 + 1054 + if (rw == WRITE) 1055 + file_end_write(file); 1228 1056 break; 1229 1057 1230 1058 case IOCB_CMD_FDSYNC: ··· 1262 1056 pr_debug("EINVAL: no operation provided\n"); 1263 1057 return -EINVAL; 1264 1058 } 1059 + 1060 + if (iovec != &inline_vec) 1061 + kfree(iovec); 1265 1062 1266 1063 if (ret != -EIOCBQUEUED) { 1267 1064 /* ··· 1337 1128 req->ki_obj.user = user_iocb; 1338 1129 req->ki_user_data = iocb->aio_data; 1339 1130 req->ki_pos = iocb->aio_offset; 1131 + req->ki_nbytes = iocb->aio_nbytes; 1340 1132 1341 - req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf; 1342 - req->ki_left = req->ki_nbytes = iocb->aio_nbytes; 1343 - req->ki_opcode = iocb->aio_lio_opcode; 1344 - 1345 - ret = aio_run_iocb(req, compat); 1133 + ret = aio_run_iocb(req, iocb->aio_lio_opcode, 1134 + (char __user *)(unsigned long)iocb->aio_buf, 1135 + compat); 1346 1136 if (ret) 1347 1137 goto out_put_req; 1348 1138 1349 - aio_put_req(req); /* drop extra ref to req */ 1350 1139 return 0; 1351 1140 out_put_req: 1352 - atomic_dec(&ctx->reqs_active); 1353 - aio_put_req(req); /* drop extra ref to req */ 1354 - aio_put_req(req); /* drop i/o ref to req */ 1141 + 
put_reqs_available(ctx, 1); 1142 + kiocb_free(req); 1355 1143 return ret; 1356 1144 } 1357 1145 ··· 1401 1195 } 1402 1196 blk_finish_plug(&plug); 1403 1197 1404 - put_ioctx(ctx); 1198 + percpu_ref_put(&ctx->users); 1405 1199 return i ? i : ret; 1406 1200 } 1407 1201 ··· 1458 1252 SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, 1459 1253 struct io_event __user *, result) 1460 1254 { 1461 - struct io_event res; 1462 1255 struct kioctx *ctx; 1463 1256 struct kiocb *kiocb; 1464 1257 u32 key; ··· 1475 1270 1476 1271 kiocb = lookup_kiocb(ctx, iocb, key); 1477 1272 if (kiocb) 1478 - ret = kiocb_cancel(ctx, kiocb, &res); 1273 + ret = kiocb_cancel(ctx, kiocb); 1479 1274 else 1480 1275 ret = -EINVAL; 1481 1276 1482 1277 spin_unlock_irq(&ctx->ctx_lock); 1483 1278 1484 1279 if (!ret) { 1485 - /* Cancellation succeeded -- copy the result 1486 - * into the user's buffer. 1280 + /* 1281 + * The result argument is no longer used - the io_event is 1282 + * always delivered via the ring buffer. -EINPROGRESS indicates 1283 + * cancellation is progress: 1487 1284 */ 1488 - if (copy_to_user(result, &res, sizeof(res))) 1489 - ret = -EFAULT; 1285 + ret = -EINPROGRESS; 1490 1286 } 1491 1287 1492 - put_ioctx(ctx); 1288 + percpu_ref_put(&ctx->users); 1493 1289 1494 1290 return ret; 1495 1291 } ··· 1519 1313 if (likely(ioctx)) { 1520 1314 if (likely(min_nr <= nr && min_nr >= 0)) 1521 1315 ret = read_events(ioctx, min_nr, nr, events, timeout); 1522 - put_ioctx(ioctx); 1316 + percpu_ref_put(&ioctx->users); 1523 1317 } 1524 1318 return ret; 1525 1319 }
+66
fs/anon_inodes.c
··· 109 109 }; 110 110 111 111 /** 112 + * anon_inode_getfile_private - creates a new file instance by hooking it up to an 113 + * anonymous inode, and a dentry that describe the "class" 114 + * of the file 115 + * 116 + * @name: [in] name of the "class" of the new file 117 + * @fops: [in] file operations for the new file 118 + * @priv: [in] private data for the new file (will be file's private_data) 119 + * @flags: [in] flags 120 + * 121 + * 122 + * Similar to anon_inode_getfile, but each file holds a single inode. 123 + * 124 + */ 125 + struct file *anon_inode_getfile_private(const char *name, 126 + const struct file_operations *fops, 127 + void *priv, int flags) 128 + { 129 + struct qstr this; 130 + struct path path; 131 + struct file *file; 132 + struct inode *inode; 133 + 134 + if (fops->owner && !try_module_get(fops->owner)) 135 + return ERR_PTR(-ENOENT); 136 + 137 + inode = anon_inode_mkinode(anon_inode_mnt->mnt_sb); 138 + if (IS_ERR(inode)) { 139 + file = ERR_PTR(-ENOMEM); 140 + goto err_module; 141 + } 142 + 143 + /* 144 + * Link the inode to a directory entry by creating a unique name 145 + * using the inode sequence number. 
146 + */ 147 + file = ERR_PTR(-ENOMEM); 148 + this.name = name; 149 + this.len = strlen(name); 150 + this.hash = 0; 151 + path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this); 152 + if (!path.dentry) 153 + goto err_module; 154 + 155 + path.mnt = mntget(anon_inode_mnt); 156 + 157 + d_instantiate(path.dentry, inode); 158 + 159 + file = alloc_file(&path, OPEN_FMODE(flags), fops); 160 + if (IS_ERR(file)) 161 + goto err_dput; 162 + 163 + file->f_mapping = inode->i_mapping; 164 + file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); 165 + file->private_data = priv; 166 + 167 + return file; 168 + 169 + err_dput: 170 + path_put(&path); 171 + err_module: 172 + module_put(fops->owner); 173 + return file; 174 + } 175 + EXPORT_SYMBOL_GPL(anon_inode_getfile_private); 176 + 177 + /** 112 178 * anon_inode_getfile - creates a new file instance by hooking it up to an 113 179 * anonymous inode, and a dentry that describe the "class" 114 180 * of the file
+1 -1
fs/block_dev.c
··· 1542 1542 return 0; 1543 1543 1544 1544 size -= pos; 1545 - if (size < iocb->ki_left) 1545 + if (size < iocb->ki_nbytes) 1546 1546 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); 1547 1547 return generic_file_aio_read(iocb, iov, nr_segs, pos); 1548 1548 }
-1
fs/nfs/direct.c
··· 130 130 131 131 return -EINVAL; 132 132 #else 133 - VM_BUG_ON(iocb->ki_left != PAGE_SIZE); 134 133 VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); 135 134 136 135 if (rw == READ || rw == KERNEL_READ)
+3 -3
fs/ocfs2/file.c
··· 2242 2242 file->f_path.dentry->d_name.name, 2243 2243 (unsigned int)nr_segs); 2244 2244 2245 - if (iocb->ki_left == 0) 2245 + if (iocb->ki_nbytes == 0) 2246 2246 return 0; 2247 2247 2248 2248 appending = file->f_flags & O_APPEND ? 1 : 0; ··· 2293 2293 2294 2294 can_do_direct = direct_io; 2295 2295 ret = ocfs2_prepare_inode_for_write(file, ppos, 2296 - iocb->ki_left, appending, 2296 + iocb->ki_nbytes, appending, 2297 2297 &can_do_direct, &has_refcount); 2298 2298 if (ret < 0) { 2299 2299 mlog_errno(ret); ··· 2301 2301 } 2302 2302 2303 2303 if (direct_io && !is_sync_kiocb(iocb)) 2304 - unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left, 2304 + unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes, 2305 2305 *ppos); 2306 2306 2307 2307 /*
-3
fs/read_write.c
··· 367 367 368 368 init_sync_kiocb(&kiocb, filp); 369 369 kiocb.ki_pos = *ppos; 370 - kiocb.ki_left = len; 371 370 kiocb.ki_nbytes = len; 372 371 373 372 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); ··· 416 417 417 418 init_sync_kiocb(&kiocb, filp); 418 419 kiocb.ki_pos = *ppos; 419 - kiocb.ki_left = len; 420 420 kiocb.ki_nbytes = len; 421 421 422 422 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); ··· 597 599 598 600 init_sync_kiocb(&kiocb, filp); 599 601 kiocb.ki_pos = *ppos; 600 - kiocb.ki_left = len; 601 602 kiocb.ki_nbytes = len; 602 603 603 604 ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
+1 -1
fs/udf/file.c
··· 141 141 struct file *file = iocb->ki_filp; 142 142 struct inode *inode = file_inode(file); 143 143 int err, pos; 144 - size_t count = iocb->ki_left; 144 + size_t count = iocb->ki_nbytes; 145 145 struct udf_inode_info *iinfo = UDF_I(inode); 146 146 147 147 down_write(&iinfo->i_data_sem);
+3 -18
include/linux/aio.h
··· 27 27 */ 28 28 #define KIOCB_CANCELLED ((void *) (~0ULL)) 29 29 30 - typedef int (kiocb_cancel_fn)(struct kiocb *, struct io_event *); 30 + typedef int (kiocb_cancel_fn)(struct kiocb *); 31 31 32 32 struct kiocb { 33 - atomic_t ki_users; 34 - 35 33 struct file *ki_filp; 36 34 struct kioctx *ki_ctx; /* NULL for sync ops */ 37 35 kiocb_cancel_fn *ki_cancel; 38 - void (*ki_dtor)(struct kiocb *); 36 + void *private; 39 37 40 38 union { 41 39 void __user *user; ··· 42 44 43 45 __u64 ki_user_data; /* user's data for completion */ 44 46 loff_t ki_pos; 45 - 46 - void *private; 47 - /* State that we remember to be able to restart/retry */ 48 - unsigned short ki_opcode; 49 - size_t ki_nbytes; /* copy of iocb->aio_nbytes */ 50 - char __user *ki_buf; /* remaining iocb->aio_buf */ 51 - size_t ki_left; /* remaining bytes */ 52 - struct iovec ki_inline_vec; /* inline vector */ 53 - struct iovec *ki_iovec; 54 - unsigned long ki_nr_segs; 55 - unsigned long ki_cur_seg; 47 + size_t ki_nbytes; /* copy of iocb->aio_nbytes */ 56 48 57 49 struct list_head ki_list; /* the aio core uses this 58 50 * for cancellation */ ··· 62 74 static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) 63 75 { 64 76 *kiocb = (struct kiocb) { 65 - .ki_users = ATOMIC_INIT(1), 66 77 .ki_ctx = NULL, 67 78 .ki_filp = filp, 68 79 .ki_obj.tsk = current, ··· 71 84 /* prototypes */ 72 85 #ifdef CONFIG_AIO 73 86 extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb); 74 - extern void aio_put_req(struct kiocb *iocb); 75 87 extern void aio_complete(struct kiocb *iocb, long res, long res2); 76 88 struct mm_struct; 77 89 extern void exit_aio(struct mm_struct *mm); ··· 79 93 void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); 80 94 #else 81 95 static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; } 82 - static inline void aio_put_req(struct kiocb *iocb) { } 83 96 static inline void aio_complete(struct kiocb *iocb, long res, long res2) { } 84 97 struct mm_struct; 85 
98 static inline void exit_aio(struct mm_struct *mm) { }
+3
include/linux/anon_inodes.h
··· 13 13 struct file *anon_inode_getfile(const char *name, 14 14 const struct file_operations *fops, 15 15 void *priv, int flags); 16 + struct file *anon_inode_getfile_private(const char *name, 17 + const struct file_operations *fops, 18 + void *priv, int flags); 16 19 int anon_inode_getfd(const char *name, const struct file_operations *fops, 17 20 void *priv, int flags); 18 21
+3
include/linux/migrate.h
··· 53 53 extern void migrate_page_copy(struct page *newpage, struct page *page); 54 54 extern int migrate_huge_page_move_mapping(struct address_space *mapping, 55 55 struct page *newpage, struct page *page); 56 + extern int migrate_page_move_mapping(struct address_space *mapping, 57 + struct page *newpage, struct page *page, 58 + struct buffer_head *head, enum migrate_mode mode); 56 59 #else 57 60 58 61 static inline void putback_lru_pages(struct list_head *l) {}
+3 -2
include/linux/mm_types.h
··· 322 322 atomic_long_t count[NR_MM_COUNTERS]; 323 323 }; 324 324 325 + struct kioctx_table; 325 326 struct mm_struct { 326 327 struct vm_area_struct * mmap; /* list of VMAs */ 327 328 struct rb_root mm_rb; ··· 384 383 385 384 struct core_state *core_state; /* coredumping support */ 386 385 #ifdef CONFIG_AIO 387 - spinlock_t ioctx_lock; 388 - struct hlist_head ioctx_list; 386 + spinlock_t ioctx_lock; 387 + struct kioctx_table __rcu *ioctx_table; 389 388 #endif 390 389 #ifdef CONFIG_MM_OWNER 391 390 /*
+1 -1
kernel/fork.c
··· 519 519 { 520 520 #ifdef CONFIG_AIO 521 521 spin_lock_init(&mm->ioctx_lock); 522 - INIT_HLIST_HEAD(&mm->ioctx_list); 522 + mm->ioctx_table = NULL; 523 523 #endif 524 524 } 525 525
+1 -1
mm/migrate.c
··· 311 311 * 2 for pages with a mapping 312 312 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. 313 313 */ 314 - static int migrate_page_move_mapping(struct address_space *mapping, 314 + int migrate_page_move_mapping(struct address_space *mapping, 315 315 struct page *newpage, struct page *page, 316 316 struct buffer_head *head, enum migrate_mode mode) 317 317 {
-1
mm/page_io.c
··· 266 266 267 267 init_sync_kiocb(&kiocb, swap_file); 268 268 kiocb.ki_pos = page_file_offset(page); 269 - kiocb.ki_left = PAGE_SIZE; 270 269 kiocb.ki_nbytes = PAGE_SIZE; 271 270 272 271 set_page_writeback(page);
+3 -12
net/socket.c
··· 854 854 } 855 855 EXPORT_SYMBOL(kernel_recvmsg); 856 856 857 - static void sock_aio_dtor(struct kiocb *iocb) 858 - { 859 - kfree(iocb->private); 860 - } 861 - 862 857 static ssize_t sock_sendpage(struct file *file, struct page *page, 863 858 int offset, size_t size, loff_t *ppos, int more) 864 859 { ··· 884 889 static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb, 885 890 struct sock_iocb *siocb) 886 891 { 887 - if (!is_sync_kiocb(iocb)) { 888 - siocb = kmalloc(sizeof(*siocb), GFP_KERNEL); 889 - if (!siocb) 890 - return NULL; 891 - iocb->ki_dtor = sock_aio_dtor; 892 - } 892 + if (!is_sync_kiocb(iocb)) 893 + BUG(); 893 894 894 895 siocb->kiocb = iocb; 895 896 iocb->private = siocb; ··· 922 931 if (pos != 0) 923 932 return -ESPIPE; 924 933 925 - if (iocb->ki_left == 0) /* Match SYS5 behaviour */ 934 + if (iocb->ki_nbytes == 0) /* Match SYS5 behaviour */ 926 935 return 0; 927 936 928 937