1// SPDX-License-Identifier: GPL-2.0-only
2#include <linux/export.h>
3#include <linux/bvec.h>
4#include <linux/fault-inject-usercopy.h>
5#include <linux/uio.h>
6#include <linux/pagemap.h>
7#include <linux/highmem.h>
8#include <linux/slab.h>
9#include <linux/vmalloc.h>
10#include <linux/splice.h>
11#include <linux/compat.h>
12#include <linux/scatterlist.h>
13#include <linux/instrumented.h>
14#include <linux/iov_iter.h>
15
16static __always_inline
17size_t copy_to_user_iter(void __user *iter_to, size_t progress,
18 size_t len, void *from, void *priv2)
19{
20 if (should_fail_usercopy())
21 return len;
22 if (access_ok(iter_to, len)) {
23 from += progress;
24 instrument_copy_to_user(iter_to, from, len);
25 len = raw_copy_to_user(iter_to, from, len);
26 }
27 return len;
28}
29
30static __always_inline
31size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress,
32 size_t len, void *from, void *priv2)
33{
34 ssize_t res;
35
36 if (should_fail_usercopy())
37 return len;
38
39 from += progress;
40 res = copy_to_user_nofault(iter_to, from, len);
41 return res < 0 ? len : res;
42}
43
44static __always_inline
45size_t copy_from_user_iter(void __user *iter_from, size_t progress,
46 size_t len, void *to, void *priv2)
47{
48 size_t res = len;
49
50 if (should_fail_usercopy())
51 return len;
52 if (can_do_masked_user_access()) {
53 iter_from = mask_user_address(iter_from);
54 } else {
55 if (!access_ok(iter_from, len))
56 return res;
57
58 /*
59 * Ensure that bad access_ok() speculation will not
60 * lead to nasty side effects *after* the copy is
61 * finished:
62 */
63 barrier_nospec();
64 }
65 to += progress;
66 instrument_copy_from_user_before(to, iter_from, len);
67 res = raw_copy_from_user(to, iter_from, len);
68 instrument_copy_from_user_after(to, iter_from, len, res);
69
70 return res;
71}
72
73static __always_inline
74size_t memcpy_to_iter(void *iter_to, size_t progress,
75 size_t len, void *from, void *priv2)
76{
77 memcpy(iter_to, from + progress, len);
78 return 0;
79}
80
81static __always_inline
82size_t memcpy_from_iter(void *iter_from, size_t progress,
83 size_t len, void *to, void *priv2)
84{
85 memcpy(to + progress, iter_from, len);
86 return 0;
87}
88
89/*
90 * fault_in_iov_iter_readable - fault in iov iterator for reading
91 * @i: iterator
92 * @size: maximum length
93 *
94 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
95 * @size. For each iovec, fault in each page that constitutes the iovec.
96 *
97 * Returns the number of bytes not faulted in (like copy_to_user() and
98 * copy_from_user()).
99 *
100 * Always returns 0 for non-userspace iterators.
101 */
102size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
103{
104 if (iter_is_ubuf(i)) {
105 size_t n = min(size, iov_iter_count(i));
106 n -= fault_in_readable(i->ubuf + i->iov_offset, n);
107 return size - n;
108 } else if (iter_is_iovec(i)) {
109 size_t count = min(size, iov_iter_count(i));
110 const struct iovec *p;
111 size_t skip;
112
113 size -= count;
114 for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
115 size_t len = min(count, p->iov_len - skip);
116 size_t ret;
117
118 if (unlikely(!len))
119 continue;
120 ret = fault_in_readable(p->iov_base + skip, len);
121 count -= len - ret;
122 if (ret)
123 break;
124 }
125 return count + size;
126 }
127 return 0;
128}
129EXPORT_SYMBOL(fault_in_iov_iter_readable);
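
/*
 * A minimal usage sketch of the prefault-and-retry pattern that write paths
 * (e.g. generic_perform_write()) build on fault_in_iov_iter_readable(). The
 * example_*() helper below is hypothetical and only shows the calling
 * convention; real callers interleave this with page-cache locking.
 */
static __maybe_unused ssize_t example_copy_in_with_prefault(void *dst,
                size_t len, struct iov_iter *from)
{
        size_t copied = 0;

        while (iov_iter_count(from) && copied < len) {
                size_t n;

                /* Nothing of the remaining range could be faulted in? */
                if (fault_in_iov_iter_readable(from, len - copied) ==
                    len - copied)
                        return copied ? copied : -EFAULT;

                n = copy_from_iter(dst + copied, len - copied, from);
                if (!n)
                        break;  /* raced with e.g. munmap(), give up */
                copied += n;
        }
        return copied;
}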
130
131/*
132 * fault_in_iov_iter_writeable - fault in iov iterator for writing
133 * @i: iterator
134 * @size: maximum length
135 *
136 * Faults in the iterator using get_user_pages(), i.e., without triggering
137 * hardware page faults. This is primarily useful when we already know that
138 * some or all of the pages in @i aren't in memory.
139 *
140 * Returns the number of bytes not faulted in, like copy_to_user() and
141 * copy_from_user().
142 *
143 * Always returns 0 for non-user-space iterators.
144 */
145size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
146{
147 if (iter_is_ubuf(i)) {
148 size_t n = min(size, iov_iter_count(i));
149 n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n);
150 return size - n;
151 } else if (iter_is_iovec(i)) {
152 size_t count = min(size, iov_iter_count(i));
153 const struct iovec *p;
154 size_t skip;
155
156 size -= count;
157 for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
158 size_t len = min(count, p->iov_len - skip);
159 size_t ret;
160
161 if (unlikely(!len))
162 continue;
163 ret = fault_in_safe_writeable(p->iov_base + skip, len);
164 count -= len - ret;
165 if (ret)
166 break;
167 }
168 return count + size;
169 }
170 return 0;
171}
172EXPORT_SYMBOL(fault_in_iov_iter_writeable);
173
174void iov_iter_init(struct iov_iter *i, unsigned int direction,
175 const struct iovec *iov, unsigned long nr_segs,
176 size_t count)
177{
178 WARN_ON(direction & ~(READ | WRITE));
179 *i = (struct iov_iter) {
180 .iter_type = ITER_IOVEC,
181 .nofault = false,
182 .data_source = direction,
183 .__iov = iov,
184 .nr_segs = nr_segs,
185 .iov_offset = 0,
186 .count = count
187 };
188}
189EXPORT_SYMBOL(iov_iter_init);
190
191size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
192{
193 if (WARN_ON_ONCE(i->data_source))
194 return 0;
195 if (user_backed_iter(i))
196 might_fault();
197 return iterate_and_advance(i, bytes, (void *)addr,
198 copy_to_user_iter, memcpy_to_iter);
199}
200EXPORT_SYMBOL(_copy_to_iter);
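
/*
 * A minimal usage sketch: copy_to_iter() is the inline wrapper around
 * _copy_to_iter() and returns the number of bytes actually copied, which may
 * be short if a user page cannot be faulted in. The example_*() helper is
 * hypothetical.
 */
static __maybe_unused ssize_t example_fill_read_request(const void *kbuf,
                size_t len, struct iov_iter *to)
{
        size_t copied = copy_to_iter(kbuf, len, to);

        /* A read(2)-style caller turns a completely failed copy into -EFAULT. */
        if (!copied && len)
                return -EFAULT;
        return copied;
}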
201
202#ifdef CONFIG_ARCH_HAS_COPY_MC
203static __always_inline
204size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress,
205 size_t len, void *from, void *priv2)
206{
207 if (access_ok(iter_to, len)) {
208 from += progress;
209 instrument_copy_to_user(iter_to, from, len);
210 len = copy_mc_to_user(iter_to, from, len);
211 }
212 return len;
213}
214
215static __always_inline
216size_t memcpy_to_iter_mc(void *iter_to, size_t progress,
217 size_t len, void *from, void *priv2)
218{
219 return copy_mc_to_kernel(iter_to, from + progress, len);
220}
221
222/**
223 * _copy_mc_to_iter - copy to iter with source memory error exception handling
224 * @addr: source kernel address
225 * @bytes: total transfer length
226 * @i: destination iterator
227 *
228 * The pmem driver deploys this for the dax operation
229 * (dax_copy_to_iter()) for dax reads (bypassing the page cache and the
230 * block layer). Upon a machine check (#MC), read(2) aborts and returns
231 * EIO or the number of bytes successfully copied.
232 *
233 * The main differences between this and the typical _copy_to_iter() are:
234 *
235 * * Typical tail/residue handling after a fault retries the copy
236 * byte-by-byte until the fault happens again. Re-triggering machine
237 * checks is potentially fatal so the implementation uses source
238 * alignment and poison alignment assumptions to avoid re-triggering
239 * hardware exceptions.
240 *
241 * * ITER_KVEC and ITER_BVEC can return short copies. Compare to
242 * copy_to_iter() where only ITER_IOVEC attempts might return a short copy.
243 *
244 * Return: number of bytes copied (may be %0)
245 */
246size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
247{
248 if (WARN_ON_ONCE(i->data_source))
249 return 0;
250 if (user_backed_iter(i))
251 might_fault();
252 return iterate_and_advance(i, bytes, (void *)addr,
253 copy_to_user_iter_mc, memcpy_to_iter_mc);
254}
255EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
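
/*
 * A minimal usage sketch (guarded by CONFIG_ARCH_HAS_COPY_MC like the
 * function itself): a dax-style read path turns a wholly failed
 * machine-check copy into -EIO and otherwise reports the short count.
 * The example_*() helper is hypothetical.
 */
static __maybe_unused ssize_t example_mc_read(const void *kaddr, size_t len,
                struct iov_iter *to)
{
        size_t copied = _copy_mc_to_iter(kaddr, len, to);

        if (!copied && len)
                return -EIO;    /* poison consumed before any bytes copied */
        return copied;          /* possibly short: bytes copied before the #MC */
}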
256#endif /* CONFIG_ARCH_HAS_COPY_MC */
257
258static __always_inline
259size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
260{
261 return iterate_and_advance(i, bytes, addr,
262 copy_from_user_iter, memcpy_from_iter);
263}
264
265size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
266{
267 if (WARN_ON_ONCE(!i->data_source))
268 return 0;
269
270 if (user_backed_iter(i))
271 might_fault();
272 return __copy_from_iter(addr, bytes, i);
273}
274EXPORT_SYMBOL(_copy_from_iter);
275
276static __always_inline
277size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress,
278 size_t len, void *to, void *priv2)
279{
280 return __copy_from_user_inatomic_nocache(to + progress, iter_from, len);
281}
282
283size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
284{
285 if (WARN_ON_ONCE(!i->data_source))
286 return 0;
287
288 return iterate_and_advance(i, bytes, addr,
289 copy_from_user_iter_nocache,
290 memcpy_from_iter);
291}
292EXPORT_SYMBOL(_copy_from_iter_nocache);
293
294#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
295static __always_inline
296size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress,
297 size_t len, void *to, void *priv2)
298{
299 return __copy_from_user_flushcache(to + progress, iter_from, len);
300}
301
302static __always_inline
303size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress,
304 size_t len, void *to, void *priv2)
305{
306 memcpy_flushcache(to + progress, iter_from, len);
307 return 0;
308}
309
310/**
311 * _copy_from_iter_flushcache - write destination through cpu cache
312 * @addr: destination kernel address
313 * @bytes: total transfer length
314 * @i: source iterator
315 *
316 * The pmem driver arranges for filesystem-dax to use this facility via
317 * dax_copy_from_iter() for ensuring that writes to persistent memory
318 * are flushed through the CPU cache. It differs from
319 * _copy_from_iter_nocache() in that it guarantees all data is flushed
320 * for all iterator types. _copy_from_iter_nocache() only attempts to
321 * bypass the cache for the ITER_IOVEC case, and on some archs may use
322 * instructions that strand dirty data in the cache.
323 *
324 * Return: number of bytes copied (may be %0)
325 */
326size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
327{
328 if (WARN_ON_ONCE(!i->data_source))
329 return 0;
330
331 return iterate_and_advance(i, bytes, addr,
332 copy_from_user_iter_flushcache,
333 memcpy_from_iter_flushcache);
334}
335EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
336#endif
337
338static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
339{
340 struct page *head;
341 size_t v = n + offset;
342
343 /*
344 * The general case needs to access the page order in order
345 * to compute the page size.
346 * However, we mostly deal with order-0 pages and thus can
347 * avoid a possible cache line miss for requests that fit all
348 * page orders.
349 */
350 if (n <= v && v <= PAGE_SIZE)
351 return true;
352
353 head = compound_head(page);
354 v += (page - head) << PAGE_SHIFT;
355
356 if (WARN_ON(n > v || v > page_size(head)))
357 return false;
358 return true;
359}
360
361size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
362 struct iov_iter *i)
363{
364 size_t res = 0;
365 if (!page_copy_sane(page, offset, bytes))
366 return 0;
367 if (WARN_ON_ONCE(i->data_source))
368 return 0;
369 page += offset / PAGE_SIZE; // first subpage
370 offset %= PAGE_SIZE;
371 while (1) {
372 void *kaddr = kmap_local_page(page);
373 size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
374 n = _copy_to_iter(kaddr + offset, n, i);
375 kunmap_local(kaddr);
376 res += n;
377 bytes -= n;
378 if (!bytes || !n)
379 break;
380 offset += n;
381 if (offset == PAGE_SIZE) {
382 page++;
383 offset = 0;
384 }
385 }
386 return res;
387}
388EXPORT_SYMBOL(copy_page_to_iter);
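
/*
 * A minimal usage sketch: hand part of a page to whatever buffers the
 * iterator describes, clamped to what the iterator still has room for.
 * The example_*() helper is hypothetical.
 */
static __maybe_unused size_t example_send_page(struct page *page, size_t offset,
                size_t len, struct iov_iter *to)
{
        len = min(len, iov_iter_count(to));
        /* May return a short count if a user page cannot be faulted in. */
        return copy_page_to_iter(page, offset, len, to);
}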
389
390size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes,
391 struct iov_iter *i)
392{
393 size_t res = 0;
394
395 if (!page_copy_sane(page, offset, bytes))
396 return 0;
397 if (WARN_ON_ONCE(i->data_source))
398 return 0;
399 page += offset / PAGE_SIZE; // first subpage
400 offset %= PAGE_SIZE;
401 while (1) {
402 void *kaddr = kmap_local_page(page);
403 size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
404
405 n = iterate_and_advance(i, n, kaddr + offset,
406 copy_to_user_iter_nofault,
407 memcpy_to_iter);
408 kunmap_local(kaddr);
409 res += n;
410 bytes -= n;
411 if (!bytes || !n)
412 break;
413 offset += n;
414 if (offset == PAGE_SIZE) {
415 page++;
416 offset = 0;
417 }
418 }
419 return res;
420}
421EXPORT_SYMBOL(copy_page_to_iter_nofault);
422
423size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
424 struct iov_iter *i)
425{
426 size_t res = 0;
427 if (!page_copy_sane(page, offset, bytes))
428 return 0;
429 page += offset / PAGE_SIZE; // first subpage
430 offset %= PAGE_SIZE;
431 while (1) {
432 void *kaddr = kmap_local_page(page);
433 size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
434 n = _copy_from_iter(kaddr + offset, n, i);
435 kunmap_local(kaddr);
436 res += n;
437 bytes -= n;
438 if (!bytes || !n)
439 break;
440 offset += n;
441 if (offset == PAGE_SIZE) {
442 page++;
443 offset = 0;
444 }
445 }
446 return res;
447}
448EXPORT_SYMBOL(copy_page_from_iter);
449
450static __always_inline
451size_t zero_to_user_iter(void __user *iter_to, size_t progress,
452 size_t len, void *priv, void *priv2)
453{
454 return clear_user(iter_to, len);
455}
456
457static __always_inline
458size_t zero_to_iter(void *iter_to, size_t progress,
459 size_t len, void *priv, void *priv2)
460{
461 memset(iter_to, 0, len);
462 return 0;
463}
464
465size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
466{
467 return iterate_and_advance(i, bytes, NULL,
468 zero_to_user_iter, zero_to_iter);
469}
470EXPORT_SYMBOL(iov_iter_zero);
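
/*
 * A minimal usage sketch: when reading from a hole there is no source page,
 * so the next chunk of the destination iterator is simply zero-filled.
 * The example_*() helper is hypothetical.
 */
static __maybe_unused size_t example_read_hole(size_t len, struct iov_iter *to)
{
        return iov_iter_zero(min(len, iov_iter_count(to)), to);
}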
471
472size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset,
473 size_t bytes, struct iov_iter *i)
474{
475 size_t n, copied = 0;
476
477 if (!page_copy_sane(&folio->page, offset, bytes))
478 return 0;
479 if (WARN_ON_ONCE(!i->data_source))
480 return 0;
481
482 do {
483 char *to = kmap_local_folio(folio, offset);
484
485 n = bytes - copied;
486 if (folio_test_partial_kmap(folio) &&
487 n > PAGE_SIZE - offset_in_page(offset))
488 n = PAGE_SIZE - offset_in_page(offset);
489
490 pagefault_disable();
491 n = __copy_from_iter(to, n, i);
492 pagefault_enable();
493 kunmap_local(to);
494 copied += n;
495 offset += n;
496 } while (copied != bytes && n > 0);
497
498 return copied;
499}
500EXPORT_SYMBOL(copy_folio_from_iter_atomic);
501
502static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
503{
504 const struct bio_vec *bvec, *end;
505
506 if (!i->count)
507 return;
508 i->count -= size;
509
510 size += i->iov_offset;
511
512 for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) {
513 if (likely(size < bvec->bv_len))
514 break;
515 size -= bvec->bv_len;
516 }
517 i->iov_offset = size;
518 i->nr_segs -= bvec - i->bvec;
519 i->bvec = bvec;
520}
521
522static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
523{
524 const struct iovec *iov, *end;
525
526 if (!i->count)
527 return;
528 i->count -= size;
529
530 size += i->iov_offset; // from beginning of current segment
531 for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) {
532 if (likely(size < iov->iov_len))
533 break;
534 size -= iov->iov_len;
535 }
536 i->iov_offset = size;
537 i->nr_segs -= iov - iter_iov(i);
538 i->__iov = iov;
539}
540
541static void iov_iter_folioq_advance(struct iov_iter *i, size_t size)
542{
543 const struct folio_queue *folioq = i->folioq;
544 unsigned int slot = i->folioq_slot;
545
546 if (!i->count)
547 return;
548 i->count -= size;
549
550 if (slot >= folioq_nr_slots(folioq)) {
551 folioq = folioq->next;
552 slot = 0;
553 }
554
555 size += i->iov_offset; /* From beginning of current segment. */
556 do {
557 size_t fsize = folioq_folio_size(folioq, slot);
558
559 if (likely(size < fsize))
560 break;
561 size -= fsize;
562 slot++;
563 if (slot >= folioq_nr_slots(folioq) && folioq->next) {
564 folioq = folioq->next;
565 slot = 0;
566 }
567 } while (size);
568
569 i->iov_offset = size;
570 i->folioq_slot = slot;
571 i->folioq = folioq;
572}
573
574void iov_iter_advance(struct iov_iter *i, size_t size)
575{
576 if (unlikely(i->count < size))
577 size = i->count;
578 if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) {
579 i->iov_offset += size;
580 i->count -= size;
581 } else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
582 /* iovec and kvec have identical layouts */
583 iov_iter_iovec_advance(i, size);
584 } else if (iov_iter_is_bvec(i)) {
585 iov_iter_bvec_advance(i, size);
586 } else if (iov_iter_is_folioq(i)) {
587 iov_iter_folioq_advance(i, size);
588 } else if (iov_iter_is_discard(i)) {
589 i->count -= size;
590 }
591}
592EXPORT_SYMBOL(iov_iter_advance);
593
594static void iov_iter_folioq_revert(struct iov_iter *i, size_t unroll)
595{
596 const struct folio_queue *folioq = i->folioq;
597 unsigned int slot = i->folioq_slot;
598
599 for (;;) {
600 size_t fsize;
601
602 if (slot == 0) {
603 folioq = folioq->prev;
604 slot = folioq_nr_slots(folioq);
605 }
606 slot--;
607
608 fsize = folioq_folio_size(folioq, slot);
609 if (unroll <= fsize) {
610 i->iov_offset = fsize - unroll;
611 break;
612 }
613 unroll -= fsize;
614 }
615
616 i->folioq_slot = slot;
617 i->folioq = folioq;
618}
619
620void iov_iter_revert(struct iov_iter *i, size_t unroll)
621{
622 if (!unroll)
623 return;
624 if (WARN_ON(unroll > MAX_RW_COUNT))
625 return;
626 i->count += unroll;
627 if (unlikely(iov_iter_is_discard(i)))
628 return;
629 if (unroll <= i->iov_offset) {
630 i->iov_offset -= unroll;
631 return;
632 }
633 unroll -= i->iov_offset;
634 if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
635 BUG(); /* We should never go beyond the start of the specified
636 * range since we might then be straying into pages that
637 * aren't pinned.
638 */
639 } else if (iov_iter_is_bvec(i)) {
640 const struct bio_vec *bvec = i->bvec;
641 while (1) {
642 size_t n = (--bvec)->bv_len;
643 i->nr_segs++;
644 if (unroll <= n) {
645 i->bvec = bvec;
646 i->iov_offset = n - unroll;
647 return;
648 }
649 unroll -= n;
650 }
651 } else if (iov_iter_is_folioq(i)) {
652 i->iov_offset = 0;
653 iov_iter_folioq_revert(i, unroll);
654 } else { /* same logic for iovec and kvec */
655 const struct iovec *iov = iter_iov(i);
656 while (1) {
657 size_t n = (--iov)->iov_len;
658 i->nr_segs++;
659 if (unroll <= n) {
660 i->__iov = iov;
661 i->iov_offset = n - unroll;
662 return;
663 }
664 unroll -= n;
665 }
666 }
667}
668EXPORT_SYMBOL(iov_iter_revert);
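
/*
 * A minimal usage sketch: iov_iter_advance() and iov_iter_revert() are
 * commonly paired so that bytes consumed by a failed attempt can be handed
 * back before retrying. The example_*() helper is hypothetical; @ok stands
 * in for the outcome of whatever operation consumed the bytes.
 */
static __maybe_unused int example_consume_or_rewind(struct iov_iter *i,
                size_t chunk, bool ok)
{
        chunk = min(chunk, iov_iter_count(i));
        iov_iter_advance(i, chunk);
        if (!ok) {
                /* Hand the bytes back so the caller can retry or bail out. */
                iov_iter_revert(i, chunk);
                return -EAGAIN;
        }
        return 0;
}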
669
670/*
671 * Return the count of just the current iov_iter segment.
672 */
673size_t iov_iter_single_seg_count(const struct iov_iter *i)
674{
675 if (i->nr_segs > 1) {
676 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
677 return min(i->count, iter_iov(i)->iov_len - i->iov_offset);
678 if (iov_iter_is_bvec(i))
679 return min(i->count, i->bvec->bv_len - i->iov_offset);
680 }
681 if (unlikely(iov_iter_is_folioq(i)))
682 return !i->count ? 0 :
683 umin(folioq_folio_size(i->folioq, i->folioq_slot), i->count);
684 return i->count;
685}
686EXPORT_SYMBOL(iov_iter_single_seg_count);
687
688void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
689 const struct kvec *kvec, unsigned long nr_segs,
690 size_t count)
691{
692 WARN_ON(direction & ~(READ | WRITE));
693 *i = (struct iov_iter){
694 .iter_type = ITER_KVEC,
695 .data_source = direction,
696 .kvec = kvec,
697 .nr_segs = nr_segs,
698 .iov_offset = 0,
699 .count = count
700 };
701}
702EXPORT_SYMBOL(iov_iter_kvec);
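
/*
 * A minimal usage sketch: wrap one kernel buffer in an ITER_SOURCE kvec
 * iterator before handing it to a ->write_iter()-style consumer. The kvec
 * must stay alive for as long as the iterator is used. The example_*()
 * helper is hypothetical.
 */
static __maybe_unused void example_kvec_source(struct iov_iter *iter,
                void *buf, size_t len, struct kvec *kv)
{
        kv->iov_base = buf;
        kv->iov_len = len;
        iov_iter_kvec(iter, ITER_SOURCE, kv, 1, len);
}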
703
704void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
705 const struct bio_vec *bvec, unsigned long nr_segs,
706 size_t count)
707{
708 WARN_ON(direction & ~(READ | WRITE));
709 *i = (struct iov_iter){
710 .iter_type = ITER_BVEC,
711 .data_source = direction,
712 .bvec = bvec,
713 .nr_segs = nr_segs,
714 .iov_offset = 0,
715 .count = count
716 };
717}
718EXPORT_SYMBOL(iov_iter_bvec);
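
/*
 * A minimal usage sketch: a single-page ITER_DEST bvec iterator. As with the
 * kvec case, the bio_vec must outlive the iterator. The example_*() helper
 * is hypothetical.
 */
static __maybe_unused void example_bvec_dest(struct iov_iter *iter,
                struct page *page, unsigned int offset, unsigned int len,
                struct bio_vec *bv)
{
        bvec_set_page(bv, page, len, offset);
        iov_iter_bvec(iter, ITER_DEST, bv, 1, len);
}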
719
720/**
721 * iov_iter_folio_queue - Initialise an I/O iterator to use the folios in a folio queue
722 * @i: The iterator to initialise.
723 * @direction: The direction of the transfer.
724 * @folioq: The starting point in the folio queue.
725 * @first_slot: The first slot in the folio queue to use
726 * @offset: The offset into the folio in the first slot to start at
727 * @count: The size of the I/O buffer in bytes.
728 *
729 * Set up an I/O iterator to either draw data out of the pages attached to an
730 * inode or to inject data into those pages. The pages *must* be prevented
731 * from evaporation, either by taking a ref on them or locking them by the
732 * caller.
733 */
734void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
735 const struct folio_queue *folioq, unsigned int first_slot,
736 unsigned int offset, size_t count)
737{
738 BUG_ON(direction & ~1);
739 *i = (struct iov_iter) {
740 .iter_type = ITER_FOLIOQ,
741 .data_source = direction,
742 .folioq = folioq,
743 .folioq_slot = first_slot,
744 .count = count,
745 .iov_offset = offset,
746 };
747}
748EXPORT_SYMBOL(iov_iter_folio_queue);
749
750/**
751 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
752 * @i: The iterator to initialise.
753 * @direction: The direction of the transfer.
754 * @xarray: The xarray to access.
755 * @start: The start file position.
756 * @count: The size of the I/O buffer in bytes.
757 *
758 * Set up an I/O iterator to either draw data out of the pages attached to an
759 * inode or to inject data into those pages. The pages *must* be prevented
760 * from evaporation, either by taking a ref on them or by the caller
761 * locking them.
762 */
763void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
764 struct xarray *xarray, loff_t start, size_t count)
765{
766 BUG_ON(direction & ~1);
767 *i = (struct iov_iter) {
768 .iter_type = ITER_XARRAY,
769 .data_source = direction,
770 .xarray = xarray,
771 .xarray_start = start,
772 .count = count,
773 .iov_offset = 0
774 };
775}
776EXPORT_SYMBOL(iov_iter_xarray);
777
778/**
779 * iov_iter_discard - Initialise an I/O iterator that discards data
780 * @i: The iterator to initialise.
781 * @direction: The direction of the transfer.
782 * @count: The size of the I/O buffer in bytes.
783 *
784 * Set up an I/O iterator that just discards everything that's written to it.
785 * It's only available as a READ iterator.
786 */
787void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
788{
789 BUG_ON(direction != READ);
790 *i = (struct iov_iter){
791 .iter_type = ITER_DISCARD,
792 .data_source = false,
793 .count = count,
794 .iov_offset = 0
795 };
796}
797EXPORT_SYMBOL(iov_iter_discard);
798
799static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
800{
801 const struct iovec *iov = iter_iov(i);
802 unsigned long res = 0;
803 size_t size = i->count;
804 size_t skip = i->iov_offset;
805
806 do {
807 size_t len = iov->iov_len - skip;
808 if (len) {
809 res |= (unsigned long)iov->iov_base + skip;
810 if (len > size)
811 len = size;
812 res |= len;
813 size -= len;
814 }
815 iov++;
816 skip = 0;
817 } while (size);
818 return res;
819}
820
821static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
822{
823 const struct bio_vec *bvec = i->bvec;
824 unsigned res = 0;
825 size_t size = i->count;
826 unsigned skip = i->iov_offset;
827
828 do {
829 size_t len = bvec->bv_len - skip;
830 res |= (unsigned long)bvec->bv_offset + skip;
831 if (len > size)
832 len = size;
833 res |= len;
834 bvec++;
835 size -= len;
836 skip = 0;
837 } while (size);
838
839 return res;
840}
841
842unsigned long iov_iter_alignment(const struct iov_iter *i)
843{
844 if (likely(iter_is_ubuf(i))) {
845 size_t size = i->count;
846 if (size)
847 return ((unsigned long)i->ubuf + i->iov_offset) | size;
848 return 0;
849 }
850
851 /* iovec and kvec have identical layouts */
852 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
853 return iov_iter_alignment_iovec(i);
854
855 if (iov_iter_is_bvec(i))
856 return iov_iter_alignment_bvec(i);
857
858 /* With both xarray and folioq types, we're dealing with whole folios. */
859 if (iov_iter_is_folioq(i))
860 return i->iov_offset | i->count;
861 if (iov_iter_is_xarray(i))
862 return (i->xarray_start + i->iov_offset) | i->count;
863
864 return 0;
865}
866EXPORT_SYMBOL(iov_iter_alignment);
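
/*
 * A minimal usage sketch: direct I/O paths typically reject an iterator
 * whose base addresses or lengths are not aligned to the device's logical
 * block size; because iov_iter_alignment() ORs together every address and
 * length, a single mask test is enough. The example_*() helper and the
 * 512-byte block size are hypothetical.
 */
static __maybe_unused bool example_dio_aligned(const struct iov_iter *i)
{
        return !(iov_iter_alignment(i) & (512 - 1));
}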
867
868unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
869{
870 unsigned long res = 0;
871 unsigned long v = 0;
872 size_t size = i->count;
873 unsigned k;
874
875 if (iter_is_ubuf(i))
876 return 0;
877
878 if (WARN_ON(!iter_is_iovec(i)))
879 return ~0U;
880
881 for (k = 0; k < i->nr_segs; k++) {
882 const struct iovec *iov = iter_iov(i) + k;
883 if (iov->iov_len) {
884 unsigned long base = (unsigned long)iov->iov_base;
885 if (v) // if not the first one
886 res |= base | v; // this start | previous end
887 v = base + iov->iov_len;
888 if (size <= iov->iov_len)
889 break;
890 size -= iov->iov_len;
891 }
892 }
893 return res;
894}
895EXPORT_SYMBOL(iov_iter_gap_alignment);
896
897static int want_pages_array(struct page ***res, size_t size,
898 size_t start, unsigned int maxpages)
899{
900 unsigned int count = DIV_ROUND_UP(size + start, PAGE_SIZE);
901
902 if (count > maxpages)
903 count = maxpages;
904 WARN_ON(!count); // caller should've prevented that
905 if (!*res) {
906 *res = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
907 if (!*res)
908 return 0;
909 }
910 return count;
911}
912
913static ssize_t iter_folioq_get_pages(struct iov_iter *iter,
914 struct page ***ppages, size_t maxsize,
915 unsigned maxpages, size_t *_start_offset)
916{
917 const struct folio_queue *folioq = iter->folioq;
918 struct page **pages;
919 unsigned int slot = iter->folioq_slot;
920 size_t extracted = 0, count = iter->count, iov_offset = iter->iov_offset;
921
922 if (slot >= folioq_nr_slots(folioq)) {
923 folioq = folioq->next;
924 slot = 0;
925 if (WARN_ON(iov_offset != 0))
926 return -EIO;
927 }
928
929 maxpages = want_pages_array(ppages, maxsize, iov_offset & ~PAGE_MASK, maxpages);
930 if (!maxpages)
931 return -ENOMEM;
932 *_start_offset = iov_offset & ~PAGE_MASK;
933 pages = *ppages;
934
935 for (;;) {
936 struct folio *folio = folioq_folio(folioq, slot);
937 size_t offset = iov_offset, fsize = folioq_folio_size(folioq, slot);
938 size_t part = PAGE_SIZE - offset % PAGE_SIZE;
939
940 if (offset < fsize) {
941 part = umin(part, umin(maxsize - extracted, fsize - offset));
942 count -= part;
943 iov_offset += part;
944 extracted += part;
945
946 *pages = folio_page(folio, offset / PAGE_SIZE);
947 get_page(*pages);
948 pages++;
949 maxpages--;
950 }
951
952 if (maxpages == 0 || extracted >= maxsize)
953 break;
954
955 if (iov_offset >= fsize) {
956 iov_offset = 0;
957 slot++;
958 if (slot == folioq_nr_slots(folioq) && folioq->next) {
959 folioq = folioq->next;
960 slot = 0;
961 }
962 }
963 }
964
965 iter->count = count;
966 iter->iov_offset = iov_offset;
967 iter->folioq = folioq;
968 iter->folioq_slot = slot;
969 return extracted;
970}
971
972static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
973 pgoff_t index, unsigned int nr_pages)
974{
975 XA_STATE(xas, xa, index);
976 struct folio *folio;
977 unsigned int ret = 0;
978
979 rcu_read_lock();
980 for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
981 if (xas_retry(&xas, folio))
982 continue;
983
984 /* Has the folio moved or been split? */
985 if (unlikely(folio != xas_reload(&xas))) {
986 xas_reset(&xas);
987 continue;
988 }
989
990 pages[ret] = folio_file_page(folio, xas.xa_index);
991 folio_get(folio);
992 if (++ret == nr_pages)
993 break;
994 }
995 rcu_read_unlock();
996 return ret;
997}
998
999static ssize_t iter_xarray_get_pages(struct iov_iter *i,
1000 struct page ***pages, size_t maxsize,
1001 unsigned maxpages, size_t *_start_offset)
1002{
1003 unsigned nr, offset, count;
1004 pgoff_t index;
1005 loff_t pos;
1006
1007 pos = i->xarray_start + i->iov_offset;
1008 index = pos >> PAGE_SHIFT;
1009 offset = pos & ~PAGE_MASK;
1010 *_start_offset = offset;
1011
1012 count = want_pages_array(pages, maxsize, offset, maxpages);
1013 if (!count)
1014 return -ENOMEM;
1015 nr = iter_xarray_populate_pages(*pages, i->xarray, index, count);
1016 if (nr == 0)
1017 return 0;
1018
1019 maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
1020 i->iov_offset += maxsize;
1021 i->count -= maxsize;
1022 return maxsize;
1023}
1024
1025/* must be called on a non-empty ITER_UBUF or ITER_IOVEC iterator */
1026static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
1027{
1028 size_t skip;
1029 long k;
1030
1031 if (iter_is_ubuf(i))
1032 return (unsigned long)i->ubuf + i->iov_offset;
1033
1034 for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
1035 const struct iovec *iov = iter_iov(i) + k;
1036 size_t len = iov->iov_len - skip;
1037
1038 if (unlikely(!len))
1039 continue;
1040 if (*size > len)
1041 *size = len;
1042 return (unsigned long)iov->iov_base + skip;
1043 }
1044 BUG(); // if it had been empty, we wouldn't get called
1045}
1046
1047/* must be called on a non-empty ITER_BVEC iterator */
1048static struct page *first_bvec_segment(const struct iov_iter *i,
1049 size_t *size, size_t *start)
1050{
1051 struct page *page;
1052 size_t skip = i->iov_offset, len;
1053
1054 len = i->bvec->bv_len - skip;
1055 if (*size > len)
1056 *size = len;
1057 skip += i->bvec->bv_offset;
1058 page = i->bvec->bv_page + skip / PAGE_SIZE;
1059 *start = skip % PAGE_SIZE;
1060 return page;
1061}
1062
1063static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
1064 struct page ***pages, size_t maxsize,
1065 unsigned int maxpages, size_t *start)
1066{
1067 unsigned int n, gup_flags = 0;
1068
1069 if (maxsize > i->count)
1070 maxsize = i->count;
1071 if (!maxsize)
1072 return 0;
1073 if (maxsize > MAX_RW_COUNT)
1074 maxsize = MAX_RW_COUNT;
1075
1076 if (likely(user_backed_iter(i))) {
1077 unsigned long addr;
1078 int res;
1079
1080 if (iov_iter_rw(i) != WRITE)
1081 gup_flags |= FOLL_WRITE;
1082 if (i->nofault)
1083 gup_flags |= FOLL_NOFAULT;
1084
1085 addr = first_iovec_segment(i, &maxsize);
1086 *start = addr % PAGE_SIZE;
1087 addr &= PAGE_MASK;
1088 n = want_pages_array(pages, maxsize, *start, maxpages);
1089 if (!n)
1090 return -ENOMEM;
1091 res = get_user_pages_fast(addr, n, gup_flags, *pages);
1092 if (unlikely(res <= 0))
1093 return res;
1094 maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start);
1095 iov_iter_advance(i, maxsize);
1096 return maxsize;
1097 }
1098 if (iov_iter_is_bvec(i)) {
1099 struct page **p;
1100 struct page *page;
1101
1102 page = first_bvec_segment(i, &maxsize, start);
1103 n = want_pages_array(pages, maxsize, *start, maxpages);
1104 if (!n)
1105 return -ENOMEM;
1106 p = *pages;
1107 for (int k = 0; k < n; k++) {
1108 struct folio *folio = page_folio(page + k);
1109 p[k] = page + k;
1110 if (!folio_test_slab(folio))
1111 folio_get(folio);
1112 }
1113 maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start);
1114 i->count -= maxsize;
1115 i->iov_offset += maxsize;
1116 if (i->iov_offset == i->bvec->bv_len) {
1117 i->iov_offset = 0;
1118 i->bvec++;
1119 i->nr_segs--;
1120 }
1121 return maxsize;
1122 }
1123 if (iov_iter_is_folioq(i))
1124 return iter_folioq_get_pages(i, pages, maxsize, maxpages, start);
1125 if (iov_iter_is_xarray(i))
1126 return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
1127 return -EFAULT;
1128}
1129
1130ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
1131 size_t maxsize, unsigned maxpages, size_t *start)
1132{
1133 if (!maxpages)
1134 return 0;
1135 BUG_ON(!pages);
1136
1137 return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start);
1138}
1139EXPORT_SYMBOL(iov_iter_get_pages2);
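
/*
 * A minimal usage sketch: iov_iter_get_pages2() takes a page reference on
 * every page it returns and advances the iterator, so the caller drops one
 * reference per page when it is done. The example_*() helper is
 * hypothetical.
 */
static __maybe_unused ssize_t example_with_pages(struct iov_iter *i,
                struct page **pages, unsigned int maxpages)
{
        size_t offset;
        ssize_t bytes;
        int k, npages;

        bytes = iov_iter_get_pages2(i, pages, SIZE_MAX, maxpages, &offset);
        if (bytes <= 0)
                return bytes;
        npages = DIV_ROUND_UP(offset + bytes, PAGE_SIZE);

        /* ... use pages[0..npages - 1]; the data starts at @offset ... */

        for (k = 0; k < npages; k++)
                put_page(pages[k]);
        return bytes;
}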
1140
1141ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
1142 struct page ***pages, size_t maxsize, size_t *start)
1143{
1144 ssize_t len;
1145
1146 *pages = NULL;
1147
1148 len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start);
1149 if (len <= 0) {
1150 kvfree(*pages);
1151 *pages = NULL;
1152 }
1153 return len;
1154}
1155EXPORT_SYMBOL(iov_iter_get_pages_alloc2);
1156
1157static int iov_npages(const struct iov_iter *i, int maxpages)
1158{
1159 size_t skip = i->iov_offset, size = i->count;
1160 const struct iovec *p;
1161 int npages = 0;
1162
1163 for (p = iter_iov(i); size; skip = 0, p++) {
1164 unsigned offs = offset_in_page(p->iov_base + skip);
1165 size_t len = min(p->iov_len - skip, size);
1166
1167 if (len) {
1168 size -= len;
1169 npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1170 if (unlikely(npages > maxpages))
1171 return maxpages;
1172 }
1173 }
1174 return npages;
1175}
1176
1177static int bvec_npages(const struct iov_iter *i, int maxpages)
1178{
1179 size_t skip = i->iov_offset, size = i->count;
1180 const struct bio_vec *p;
1181 int npages = 0;
1182
1183 for (p = i->bvec; size; skip = 0, p++) {
1184 unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
1185 size_t len = min(p->bv_len - skip, size);
1186
1187 size -= len;
1188 npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1189 if (unlikely(npages > maxpages))
1190 return maxpages;
1191 }
1192 return npages;
1193}
1194
1195int iov_iter_npages(const struct iov_iter *i, int maxpages)
1196{
1197 if (unlikely(!i->count))
1198 return 0;
1199 if (likely(iter_is_ubuf(i))) {
1200 unsigned offs = offset_in_page(i->ubuf + i->iov_offset);
1201 int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE);
1202 return min(npages, maxpages);
1203 }
1204 /* iovec and kvec have identical layouts */
1205 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1206 return iov_npages(i, maxpages);
1207 if (iov_iter_is_bvec(i))
1208 return bvec_npages(i, maxpages);
1209 if (iov_iter_is_folioq(i)) {
1210 unsigned offset = i->iov_offset % PAGE_SIZE;
1211 int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
1212 return min(npages, maxpages);
1213 }
1214 if (iov_iter_is_xarray(i)) {
1215 unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
1216 int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
1217 return min(npages, maxpages);
1218 }
1219 return 0;
1220}
1221EXPORT_SYMBOL(iov_iter_npages);
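
/*
 * A minimal usage sketch: size a page-pointer array from what the next part
 * of the iterator will actually need. The example_*() helper is
 * hypothetical.
 */
static __maybe_unused struct page **example_alloc_page_array(
                const struct iov_iter *i, int maxpages, int *npages)
{
        *npages = iov_iter_npages(i, maxpages);
        if (!*npages)
                return NULL;
        return kcalloc(*npages, sizeof(struct page *), GFP_KERNEL);
}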
1222
1223const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
1224{
1225 *new = *old;
1226 if (iov_iter_is_bvec(new))
1227 return new->bvec = kmemdup(new->bvec,
1228 new->nr_segs * sizeof(struct bio_vec),
1229 flags);
1230 else if (iov_iter_is_kvec(new) || iter_is_iovec(new))
1231 /* iovec and kvec have identical layout */
1232 return new->__iov = kmemdup(new->__iov,
1233 new->nr_segs * sizeof(struct iovec),
1234 flags);
1235 return NULL;
1236}
1237EXPORT_SYMBOL(dup_iter);
1238
1239static __noclone int copy_compat_iovec_from_user(struct iovec *iov,
1240 const struct iovec __user *uvec, u32 nr_segs)
1241{
1242 const struct compat_iovec __user *uiov =
1243 (const struct compat_iovec __user *)uvec;
1244 int ret = -EFAULT;
1245 u32 i;
1246
1247 if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1248 return -EFAULT;
1249
1250 for (i = 0; i < nr_segs; i++) {
1251 compat_uptr_t buf;
1252 compat_ssize_t len;
1253
1254 unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
1255 unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
1256
1257 /* check for compat_size_t not fitting in compat_ssize_t .. */
1258 if (len < 0) {
1259 ret = -EINVAL;
1260 goto uaccess_end;
1261 }
1262 iov[i].iov_base = compat_ptr(buf);
1263 iov[i].iov_len = len;
1264 }
1265
1266 ret = 0;
1267uaccess_end:
1268 user_access_end();
1269 return ret;
1270}
1271
1272static __noclone int copy_iovec_from_user(struct iovec *iov,
1273 const struct iovec __user *uiov, unsigned long nr_segs)
1274{
1275 int ret = -EFAULT;
1276
1277 if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1278 return -EFAULT;
1279
1280 do {
1281 void __user *buf;
1282 ssize_t len;
1283
1284 unsafe_get_user(len, &uiov->iov_len, uaccess_end);
1285 unsafe_get_user(buf, &uiov->iov_base, uaccess_end);
1286
1287 /* check for size_t not fitting in ssize_t .. */
1288 if (unlikely(len < 0)) {
1289 ret = -EINVAL;
1290 goto uaccess_end;
1291 }
1292 iov->iov_base = buf;
1293 iov->iov_len = len;
1294
1295 uiov++; iov++;
1296 } while (--nr_segs);
1297
1298 ret = 0;
1299uaccess_end:
1300 user_access_end();
1301 return ret;
1302}
1303
1304struct iovec *iovec_from_user(const struct iovec __user *uvec,
1305 unsigned long nr_segs, unsigned long fast_segs,
1306 struct iovec *fast_iov, bool compat)
1307{
1308 struct iovec *iov = fast_iov;
1309 int ret;
1310
1311 /*
1312 * SuS says "The readv() function *may* fail if the iovcnt argument was
1313 * less than or equal to 0, or greater than {IOV_MAX}." Linux has
1314 * traditionally returned zero for zero segments, so...
1315 */
1316 if (nr_segs == 0)
1317 return iov;
1318 if (nr_segs > UIO_MAXIOV)
1319 return ERR_PTR(-EINVAL);
1320 if (nr_segs > fast_segs) {
1321 iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
1322 if (!iov)
1323 return ERR_PTR(-ENOMEM);
1324 }
1325
1326 if (unlikely(compat))
1327 ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
1328 else
1329 ret = copy_iovec_from_user(iov, uvec, nr_segs);
1330 if (ret) {
1331 if (iov != fast_iov)
1332 kfree(iov);
1333 return ERR_PTR(ret);
1334 }
1335
1336 return iov;
1337}
1338
1339/*
1340 * Single segment iovec supplied by the user, import it as ITER_UBUF.
1341 */
1342static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec,
1343 struct iovec **iovp, struct iov_iter *i,
1344 bool compat)
1345{
1346 struct iovec *iov = *iovp;
1347 ssize_t ret;
1348
1349 *iovp = NULL;
1350
1351 if (compat)
1352 ret = copy_compat_iovec_from_user(iov, uvec, 1);
1353 else
1354 ret = copy_iovec_from_user(iov, uvec, 1);
1355 if (unlikely(ret))
1356 return ret;
1357
1358 ret = import_ubuf(type, iov->iov_base, iov->iov_len, i);
1359 if (unlikely(ret))
1360 return ret;
1361 return i->count;
1362}
1363
1364ssize_t __import_iovec(int type, const struct iovec __user *uvec,
1365 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
1366 struct iov_iter *i, bool compat)
1367{
1368 ssize_t total_len = 0;
1369 unsigned long seg;
1370 struct iovec *iov;
1371
1372 if (nr_segs == 1)
1373 return __import_iovec_ubuf(type, uvec, iovp, i, compat);
1374
1375 iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
1376 if (IS_ERR(iov)) {
1377 *iovp = NULL;
1378 return PTR_ERR(iov);
1379 }
1380
1381 /*
1382 * According to the Single Unix Specification we should return EINVAL if
1383 * an element length is < 0 when cast to ssize_t or if the total length
1384 * would overflow the ssize_t return value of the system call.
1385 *
1386 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
1387 * overflow case.
1388 */
1389 for (seg = 0; seg < nr_segs; seg++) {
1390 ssize_t len = (ssize_t)iov[seg].iov_len;
1391
1392 if (!access_ok(iov[seg].iov_base, len)) {
1393 if (iov != *iovp)
1394 kfree(iov);
1395 *iovp = NULL;
1396 return -EFAULT;
1397 }
1398
1399 if (len > MAX_RW_COUNT - total_len) {
1400 len = MAX_RW_COUNT - total_len;
1401 iov[seg].iov_len = len;
1402 }
1403 total_len += len;
1404 }
1405
1406 iov_iter_init(i, type, iov, nr_segs, total_len);
1407 if (iov == *iovp)
1408 *iovp = NULL;
1409 else
1410 *iovp = iov;
1411 return total_len;
1412}
1413
1414/**
1415 * import_iovec() - Copy an array of &struct iovec from userspace
1416 * into the kernel, check that it is valid, and initialize a new
1417 * &struct iov_iter iterator to access it.
1418 *
1419 * @type: One of %READ or %WRITE.
1420 * @uvec: Pointer to the userspace array.
1421 * @nr_segs: Number of elements in userspace array.
1422 * @fast_segs: Number of elements in *@iovp.
1423 * @iovp: (input and output parameter) Pointer to pointer to (usually small
1424 * on-stack) kernel array.
1425 * @i: Pointer to iterator that will be initialized on success.
1426 *
1427 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
1428 * then this function places %NULL in *@iovp on return. Otherwise, a new
1429 * array will be allocated and the result placed in *@iovp. This means that
1430 * the caller may call kfree() on *@iovp regardless of whether the small
1431 * on-stack array was used or not (and regardless of whether this function
1432 * returns an error or not).
1433 *
1434 * Return: Negative error code on error, bytes imported on success
1435 */
1436ssize_t import_iovec(int type, const struct iovec __user *uvec,
1437 unsigned nr_segs, unsigned fast_segs,
1438 struct iovec **iovp, struct iov_iter *i)
1439{
1440 return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
1441 in_compat_syscall());
1442}
1443EXPORT_SYMBOL(import_iovec);
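
/*
 * A minimal usage sketch of the canonical readv()-style pattern: import the
 * user iovec array, do the transfer through the iterator, then kfree()
 * whatever import_iovec() left in @iov (NULL when the on-stack array was
 * used, so the kfree() is always safe). The example_*() helper is
 * hypothetical.
 */
static __maybe_unused ssize_t example_readv(const struct iovec __user *uvec,
                unsigned long nr_segs, const void *kbuf, size_t kbuf_len)
{
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        ssize_t ret;

        ret = import_iovec(ITER_DEST, uvec, nr_segs, ARRAY_SIZE(iovstack),
                           &iov, &iter);
        if (ret < 0)
                return ret;

        ret = copy_to_iter(kbuf, min(kbuf_len, iov_iter_count(&iter)), &iter);
        kfree(iov);
        return ret;
}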
1444
1445int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
1446{
1447 if (len > MAX_RW_COUNT)
1448 len = MAX_RW_COUNT;
1449 if (unlikely(!access_ok(buf, len)))
1450 return -EFAULT;
1451
1452 iov_iter_ubuf(i, rw, buf, len);
1453 return 0;
1454}
1455EXPORT_SYMBOL_GPL(import_ubuf);
1456
1457/**
1458 * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
1459 * iov_iter_save_state() was called.
1460 *
1461 * @i: &struct iov_iter to restore
1462 * @state: state to restore from
1463 *
1464 * Used after iov_iter_save_state() to restore @i, if operations may
1465 * have advanced it.
1466 *
1467 * Note: only works on ITER_IOVEC, ITER_KVEC, ITER_BVEC and ITER_UBUF
1468 */
1469void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
1470{
1471 if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
1472 !iter_is_ubuf(i) && !iov_iter_is_kvec(i)))
1473 return;
1474 i->iov_offset = state->iov_offset;
1475 i->count = state->count;
1476 if (iter_is_ubuf(i))
1477 return;
1478 /*
1479 * For the *vec iters, nr_segs + iov is constant - if we increment
1480 * the vec, then we also decrement the nr_segs count. Hence we don't
1481 * need to track both of these, just one is enough and we can deduce
1482 * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
1483 * size, so we can just increment the iov pointer as they are unionized.
1484 * ITER_BVEC _may_ be the same size on some archs, but on others it is
1485 * not. Be safe and handle it separately.
1486 */
1487 BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
1488 if (iov_iter_is_bvec(i))
1489 i->bvec -= state->nr_segs - i->nr_segs;
1490 else
1491 i->__iov -= state->nr_segs - i->nr_segs;
1492 i->nr_segs = state->nr_segs;
1493}
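
/*
 * A minimal usage sketch: iov_iter_save_state() and iov_iter_restore() let a
 * caller rewind an iterator that a failed attempt may have partially
 * consumed before retrying. Both example_*() names are hypothetical;
 * example_do_io() stands in for whatever advances the iterator.
 */
static __maybe_unused ssize_t example_retry_once(struct iov_iter *i,
                ssize_t (*example_do_io)(struct iov_iter *))
{
        struct iov_iter_state state;
        ssize_t ret;

        iov_iter_save_state(i, &state);
        ret = example_do_io(i);
        if (ret == -EAGAIN) {
                /* Undo any partial advance before the second attempt. */
                iov_iter_restore(i, &state);
                ret = example_do_io(i);
        }
        return ret;
}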
1494
1495/*
1496 * Extract a list of contiguous pages from an ITER_FOLIOQ iterator. This does
1497 * not get references on the pages, nor does it get a pin on them.
1498 */
1499static ssize_t iov_iter_extract_folioq_pages(struct iov_iter *i,
1500 struct page ***pages, size_t maxsize,
1501 unsigned int maxpages,
1502 iov_iter_extraction_t extraction_flags,
1503 size_t *offset0)
1504{
1505 const struct folio_queue *folioq = i->folioq;
1506 struct page **p;
1507 unsigned int nr = 0;
1508 size_t extracted = 0, offset, slot = i->folioq_slot;
1509
1510 if (slot >= folioq_nr_slots(folioq)) {
1511 folioq = folioq->next;
1512 slot = 0;
1513 if (WARN_ON(i->iov_offset != 0))
1514 return -EIO;
1515 }
1516
1517 offset = i->iov_offset & ~PAGE_MASK;
1518 *offset0 = offset;
1519
1520 maxpages = want_pages_array(pages, maxsize, offset, maxpages);
1521 if (!maxpages)
1522 return -ENOMEM;
1523 p = *pages;
1524
1525 for (;;) {
1526 struct folio *folio = folioq_folio(folioq, slot);
1527 size_t offset = i->iov_offset, fsize = folioq_folio_size(folioq, slot);
1528 size_t part = PAGE_SIZE - offset % PAGE_SIZE;
1529
1530 if (offset < fsize) {
1531 part = umin(part, umin(maxsize - extracted, fsize - offset));
1532 i->count -= part;
1533 i->iov_offset += part;
1534 extracted += part;
1535
1536 p[nr++] = folio_page(folio, offset / PAGE_SIZE);
1537 }
1538
1539 if (nr >= maxpages || extracted >= maxsize)
1540 break;
1541
1542 if (i->iov_offset >= fsize) {
1543 i->iov_offset = 0;
1544 slot++;
1545 if (slot == folioq_nr_slots(folioq) && folioq->next) {
1546 folioq = folioq->next;
1547 slot = 0;
1548 }
1549 }
1550 }
1551
1552 i->folioq = folioq;
1553 i->folioq_slot = slot;
1554 return extracted;
1555}
1556
1557/*
1558 * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not
1559 * get references on the pages, nor does it get a pin on them.
1560 */
1561static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i,
1562 struct page ***pages, size_t maxsize,
1563 unsigned int maxpages,
1564 iov_iter_extraction_t extraction_flags,
1565 size_t *offset0)
1566{
1567 struct page **p;
1568 struct folio *folio;
1569 unsigned int nr = 0, offset;
1570 loff_t pos = i->xarray_start + i->iov_offset;
1571 XA_STATE(xas, i->xarray, pos >> PAGE_SHIFT);
1572
1573 offset = pos & ~PAGE_MASK;
1574 *offset0 = offset;
1575
1576 maxpages = want_pages_array(pages, maxsize, offset, maxpages);
1577 if (!maxpages)
1578 return -ENOMEM;
1579 p = *pages;
1580
1581 rcu_read_lock();
1582 for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
1583 if (xas_retry(&xas, folio))
1584 continue;
1585
1586 /* Has the folio moved or been split? */
1587 if (unlikely(folio != xas_reload(&xas))) {
1588 xas_reset(&xas);
1589 continue;
1590 }
1591
1592 p[nr++] = folio_file_page(folio, xas.xa_index);
1593 if (nr == maxpages)
1594 break;
1595 }
1596 rcu_read_unlock();
1597
1598 maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
1599 iov_iter_advance(i, maxsize);
1600 return maxsize;
1601}
1602
1603/*
1604 * Extract a list of virtually contiguous pages from an ITER_BVEC iterator.
1605 * This does not get references on the pages, nor does it get a pin on them.
1606 */
1607static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i,
1608 struct page ***pages, size_t maxsize,
1609 unsigned int maxpages,
1610 iov_iter_extraction_t extraction_flags,
1611 size_t *offset0)
1612{
1613 size_t skip = i->iov_offset, size = 0;
1614 struct bvec_iter bi;
1615 int k = 0;
1616
1617 if (i->nr_segs == 0)
1618 return 0;
1619
1620 if (i->iov_offset == i->bvec->bv_len) {
1621 i->iov_offset = 0;
1622 i->nr_segs--;
1623 i->bvec++;
1624 skip = 0;
1625 }
1626 bi.bi_idx = 0;
1627 bi.bi_size = maxsize;
1628 bi.bi_bvec_done = skip;
1629
1630 maxpages = want_pages_array(pages, maxsize, skip, maxpages);
1631
1632 while (bi.bi_size && bi.bi_idx < i->nr_segs) {
1633 struct bio_vec bv = bvec_iter_bvec(i->bvec, bi);
1634
1635 /*
1636 * The iov_iter_extract_pages interface only allows an offset
1637 * into the first page. Break out of the loop if we see an
1638 * offset into subsequent pages, the caller will have to call
1639 * iov_iter_extract_pages again for the remainder.
1640 */
1641 if (k) {
1642 if (bv.bv_offset)
1643 break;
1644 } else {
1645 *offset0 = bv.bv_offset;
1646 }
1647
1648 (*pages)[k++] = bv.bv_page;
1649 size += bv.bv_len;
1650
1651 if (k >= maxpages)
1652 break;
1653
1654 /*
1655 * We are done when the end of the bvec doesn't align to a page
1656 * boundary as that would create a hole in the returned space.
1657 * The caller will handle this with another call to
1658 * iov_iter_extract_pages.
1659 */
1660 if (bv.bv_offset + bv.bv_len != PAGE_SIZE)
1661 break;
1662
1663 bvec_iter_advance_single(i->bvec, &bi, bv.bv_len);
1664 }
1665
1666 iov_iter_advance(i, size);
1667 return size;
1668}
1669
1670/*
1671 * Extract a list of virtually contiguous pages from an ITER_KVEC iterator.
1672 * This does not get references on the pages, nor does it get a pin on them.
1673 */
1674static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i,
1675 struct page ***pages, size_t maxsize,
1676 unsigned int maxpages,
1677 iov_iter_extraction_t extraction_flags,
1678 size_t *offset0)
1679{
1680 struct page **p, *page;
1681 const void *kaddr;
1682 size_t skip = i->iov_offset, offset, len, size;
1683 int k;
1684
1685 for (;;) {
1686 if (i->nr_segs == 0)
1687 return 0;
1688 size = min(maxsize, i->kvec->iov_len - skip);
1689 if (size)
1690 break;
1691 i->iov_offset = 0;
1692 i->nr_segs--;
1693 i->kvec++;
1694 skip = 0;
1695 }
1696
1697 kaddr = i->kvec->iov_base + skip;
1698 offset = (unsigned long)kaddr & ~PAGE_MASK;
1699 *offset0 = offset;
1700
1701 maxpages = want_pages_array(pages, size, offset, maxpages);
1702 if (!maxpages)
1703 return -ENOMEM;
1704 p = *pages;
1705
1706 kaddr -= offset;
1707 len = offset + size;
1708 for (k = 0; k < maxpages; k++) {
1709 size_t seg = min_t(size_t, len, PAGE_SIZE);
1710
1711 if (is_vmalloc_or_module_addr(kaddr))
1712 page = vmalloc_to_page(kaddr);
1713 else
1714 page = virt_to_page(kaddr);
1715
1716 p[k] = page;
1717 len -= seg;
1718 kaddr += PAGE_SIZE;
1719 }
1720
1721 size = min_t(size_t, size, maxpages * PAGE_SIZE - offset);
1722 iov_iter_advance(i, size);
1723 return size;
1724}
1725
1726/*
1727 * Extract a list of contiguous pages from a user iterator and get a pin on
1728 * each of them. This should only be used if the iterator is user-backed
1729 * (ITER_IOVEC/ITER_UBUF).
1730 *
1731 * It does not get refs on the pages, but the pages must be unpinned by the
1732 * caller once the transfer is complete.
1733 *
1734 * This is safe to use where background IO/DMA *is* going to be modifying
1735 * the buffer; using a pin rather than a ref forces fork() to give the
1736 * child a copy of the page.
1737 */
1738static ssize_t iov_iter_extract_user_pages(struct iov_iter *i,
1739 struct page ***pages,
1740 size_t maxsize,
1741 unsigned int maxpages,
1742 iov_iter_extraction_t extraction_flags,
1743 size_t *offset0)
1744{
1745 unsigned long addr;
1746 unsigned int gup_flags = 0;
1747 size_t offset;
1748 int res;
1749
1750 if (i->data_source == ITER_DEST)
1751 gup_flags |= FOLL_WRITE;
1752 if (extraction_flags & ITER_ALLOW_P2PDMA)
1753 gup_flags |= FOLL_PCI_P2PDMA;
1754 if (i->nofault)
1755 gup_flags |= FOLL_NOFAULT;
1756
1757 addr = first_iovec_segment(i, &maxsize);
1758 *offset0 = offset = addr % PAGE_SIZE;
1759 addr &= PAGE_MASK;
1760 maxpages = want_pages_array(pages, maxsize, offset, maxpages);
1761 if (!maxpages)
1762 return -ENOMEM;
1763 res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages);
1764 if (unlikely(res <= 0))
1765 return res;
1766 maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset);
1767 iov_iter_advance(i, maxsize);
1768 return maxsize;
1769}
1770
1771/**
1772 * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator
1773 * @i: The iterator to extract from
1774 * @pages: Where to return the list of pages
1775 * @maxsize: The maximum amount of iterator to extract
1776 * @maxpages: The maximum size of the list of pages
1777 * @extraction_flags: Flags to qualify request
1778 * @offset0: Where to return the starting offset into (*@pages)[0]
1779 *
1780 * Extract a list of contiguous pages from the current point of the iterator,
1781 * advancing the iterator. The maximum number of pages and the maximum amount
1782 * of page contents can be set.
1783 *
1784 * If *@pages is NULL, a page list will be allocated to the required size and
1785 * *@pages will be set to its base. If *@pages is not NULL, it will be assumed
1786 * that the caller allocated a page list at least @maxpages in size and this
1787 * will be filled in.
1788 *
1789 * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA
1790 * be allowed on the pages extracted.
1791 *
1792 * The iov_iter_extract_will_pin() function can be used to query how cleanup
1793 * should be performed.
1794 *
1795 * Extra refs or pins on the pages may be obtained as follows:
1796 *
1797 * (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be
1798 * added to the pages, but refs will not be taken.
1799 * iov_iter_extract_will_pin() will return true.
1800 *
1801 * (*) If the iterator is ITER_KVEC, ITER_BVEC, ITER_FOLIOQ or ITER_XARRAY, the
1802 * pages are merely listed; no extra refs or pins are obtained.
1803 * iov_iter_extract_will_pin() will return false.
1804 *
1805 * Note also:
1806 *
1807 * (*) Use with ITER_DISCARD is not supported as that has no content.
1808 *
1809 * On success, the function sets *@pages to the new pagelist, if allocated, and
1810 * sets *offset0 to the offset into the first page.
1811 *
1812 * It may also return -ENOMEM and -EFAULT.
1813 */
1814ssize_t iov_iter_extract_pages(struct iov_iter *i,
1815 struct page ***pages,
1816 size_t maxsize,
1817 unsigned int maxpages,
1818 iov_iter_extraction_t extraction_flags,
1819 size_t *offset0)
1820{
1821 maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT);
1822 if (!maxsize)
1823 return 0;
1824
1825 if (likely(user_backed_iter(i)))
1826 return iov_iter_extract_user_pages(i, pages, maxsize,
1827 maxpages, extraction_flags,
1828 offset0);
1829 if (iov_iter_is_kvec(i))
1830 return iov_iter_extract_kvec_pages(i, pages, maxsize,
1831 maxpages, extraction_flags,
1832 offset0);
1833 if (iov_iter_is_bvec(i))
1834 return iov_iter_extract_bvec_pages(i, pages, maxsize,
1835 maxpages, extraction_flags,
1836 offset0);
1837 if (iov_iter_is_folioq(i))
1838 return iov_iter_extract_folioq_pages(i, pages, maxsize,
1839 maxpages, extraction_flags,
1840 offset0);
1841 if (iov_iter_is_xarray(i))
1842 return iov_iter_extract_xarray_pages(i, pages, maxsize,
1843 maxpages, extraction_flags,
1844 offset0);
1845 return -EFAULT;
1846}
1847EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
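
/*
 * A minimal usage sketch: after the transfer completes, pages extracted from
 * a user-backed iterator must be unpinned, while pages from kernel-backed
 * iterators need no cleanup; iov_iter_extract_will_pin() says which case
 * applies. The example_*() helper is hypothetical.
 */
static __maybe_unused ssize_t example_extract_for_dma(struct iov_iter *i,
                struct page **pages, unsigned int maxpages)
{
        size_t offset;
        ssize_t len;
        int k, npages;

        len = iov_iter_extract_pages(i, &pages, SIZE_MAX, maxpages, 0, &offset);
        if (len <= 0)
                return len;
        npages = DIV_ROUND_UP(offset + len, PAGE_SIZE);

        /* ... hand pages[] to the device and wait for the I/O to finish ... */

        if (iov_iter_extract_will_pin(i))
                for (k = 0; k < npages; k++)
                        unpin_user_page(pages[k]);
        return len;
}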