// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2025 Christoph Hellwig
 */
#include <linux/blk-integrity.h>
#include <linux/blk-mq-dma.h>
#include "blk.h"

struct phys_vec {
	phys_addr_t paddr;
	u32 len;
};

static bool __blk_map_iter_next(struct blk_map_iter *iter)
{
	if (iter->iter.bi_size)
		return true;
	if (!iter->bio || !iter->bio->bi_next)
		return false;

	iter->bio = iter->bio->bi_next;
	if (iter->is_integrity) {
		iter->iter = bio_integrity(iter->bio)->bip_iter;
		iter->bvecs = bio_integrity(iter->bio)->bip_vec;
	} else {
		iter->iter = iter->bio->bi_iter;
		iter->bvecs = iter->bio->bi_io_vec;
	}
	return true;
}

static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter,
			      struct phys_vec *vec)
{
	unsigned int max_size;
	struct bio_vec bv;

	if (!iter->iter.bi_size)
		return false;

	bv = mp_bvec_iter_bvec(iter->bvecs, iter->iter);
	vec->paddr = bvec_phys(&bv);
	max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
	bv.bv_len = min(bv.bv_len, max_size);
	bvec_iter_advance_single(iter->bvecs, &iter->iter, bv.bv_len);

	/*
	 * If we are entirely done with this bi_io_vec entry, check if the next
	 * one could be merged into it. This typically happens when moving to
	 * the next bio, but some callers also don't pack bvecs tight.
	 */
	while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
		struct bio_vec next;

		if (!__blk_map_iter_next(iter))
			break;

		next = mp_bvec_iter_bvec(iter->bvecs, iter->iter);
		if (bv.bv_len + next.bv_len > max_size ||
		    !biovec_phys_mergeable(req->q, &bv, &next))
			break;

		bv.bv_len += next.bv_len;
		bvec_iter_advance_single(iter->bvecs, &iter->iter, next.bv_len);
	}

	vec->len = bv.bv_len;
	return true;
}

/*
 * The IOVA-based DMA API wants to be able to coalesce at the minimal IOMMU page
 * size granularity (which is guaranteed to be <= PAGE_SIZE and usually 4k), so
 * we need to ensure our segments are aligned to this as well.
 *
 * Note that there is no point in using the slightly more complicated IOVA based
 * path for single segment mappings.
 */
static inline bool blk_can_dma_map_iova(struct request *req,
		struct device *dma_dev)
{
	return !(req_phys_gap_mask(req) & dma_get_merge_boundary(dma_dev));
}

static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
{
	iter->addr = pci_p2pdma_bus_addr_map(iter->p2pdma.mem, vec->paddr);
	iter->len = vec->len;
	return true;
}

static bool blk_dma_map_direct(struct request *req, struct device *dma_dev,
		struct blk_dma_iter *iter, struct phys_vec *vec)
{
	unsigned int attrs = 0;

	if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
		attrs |= DMA_ATTR_MMIO;

	iter->addr = dma_map_phys(dma_dev, vec->paddr, vec->len,
			rq_dma_dir(req), attrs);
	if (dma_mapping_error(dma_dev, iter->addr)) {
		iter->status = BLK_STS_RESOURCE;
		return false;
	}
	iter->len = vec->len;
	return true;
}

static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, struct blk_dma_iter *iter,
		struct phys_vec *vec)
{
	enum dma_data_direction dir = rq_dma_dir(req);
	unsigned int mapped = 0;
	unsigned int attrs = 0;
	int error;

	iter->addr = state->addr;
	iter->len = dma_iova_size(state);

	if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
		attrs |= DMA_ATTR_MMIO;

	do {
		error = dma_iova_link(dma_dev, state, vec->paddr, mapped,
				vec->len, dir, attrs);
		if (error)
			break;
		mapped += vec->len;
	} while (blk_map_iter_next(req, &iter->iter, vec));

	error = dma_iova_sync(dma_dev, state, 0, mapped);
	if (error) {
		iter->status = errno_to_blk_status(error);
		return false;
	}

	return true;
}

static inline void blk_rq_map_iter_init(struct request *rq,
		struct blk_map_iter *iter)
{
	struct bio *bio = rq->bio;

	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
		*iter = (struct blk_map_iter) {
			.bvecs = &rq->special_vec,
			.iter = {
				.bi_size = rq->special_vec.bv_len,
			}
		};
	} else if (bio) {
		*iter = (struct blk_map_iter) {
			.bio = bio,
			.bvecs = bio->bi_io_vec,
			.iter = bio->bi_iter,
		};
	} else {
		/* the internal flush request may not have bio attached */
		*iter = (struct blk_map_iter) {};
	}
}

static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, struct blk_dma_iter *iter,
		unsigned int total_len)
{
	struct phys_vec vec;

	memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
	iter->status = BLK_STS_OK;
	iter->p2pdma.map = PCI_P2PDMA_MAP_NONE;

	/*
	 * Grab the first segment ASAP because we'll need it to check for P2P
	 * transfers.
	 */
	if (!blk_map_iter_next(req, &iter->iter, &vec))
		return false;

	switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
				 phys_to_page(vec.paddr))) {
	case PCI_P2PDMA_MAP_BUS_ADDR:
		return blk_dma_map_bus(iter, &vec);
	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
		/*
		 * P2P transfers through the host bridge are treated the
		 * same as non-P2P transfers below and during unmap.
		 */
	case PCI_P2PDMA_MAP_NONE:
		break;
	default:
		iter->status = BLK_STS_INVAL;
		return false;
	}

	if (blk_can_dma_map_iova(req, dma_dev) &&
	    dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len))
		return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec);
	memset(state, 0, sizeof(*state));
	return blk_dma_map_direct(req, dma_dev, iter, &vec);
}

/**
 * blk_rq_dma_map_iter_start - map the first DMA segment for a request
 * @req: request to map
 * @dma_dev: device to map to
 * @state: DMA IOVA state
 * @iter: block layer DMA iterator
 *
 * Start DMA mapping @req to @dma_dev. @state and @iter are provided by the
 * caller and don't need to be initialized. @state needs to be stored for use
 * at unmap time, @iter is only needed at map time.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr and
 * the length in @iter.len. If no segment was mapped the status code is
 * returned in @iter.status.
 *
 * The caller can call blk_rq_dma_map_coalesce() to check if further segments
 * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
 * to try to map the following segments.
 */
bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, struct blk_dma_iter *iter)
{
	blk_rq_map_iter_init(req, &iter->iter);
	return blk_dma_map_iter_start(req, dma_dev, state, iter,
			blk_rq_payload_bytes(req));
}
EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);

/**
 * blk_rq_dma_map_iter_next - map the next DMA segment for a request
 * @req: request to map
 * @dma_dev: device to map to
 * @state: DMA IOVA state
 * @iter: block layer DMA iterator
 *
 * Iterate to the next mapping after a previous call to
 * blk_rq_dma_map_iter_start(). See there for a detailed description of the
 * arguments.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr and
 * the length in @iter.len. If no segment was mapped the status code is
 * returned in @iter.status.
 */
bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, struct blk_dma_iter *iter)
{
	struct phys_vec vec;

	if (!blk_map_iter_next(req, &iter->iter, &vec))
		return false;

	if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
		return blk_dma_map_bus(iter, &vec);
	return blk_dma_map_direct(req, dma_dev, iter, &vec);
}
EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next);
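
/*
 * Illustrative usage sketch (not part of the original file): a driver's
 * queue_rq path could consume the iterator API roughly like this. The
 * nvmeish_iod structure and nvmeish_add_segment() helper are hypothetical
 * placeholders for driver-private state; only the blk_rq_dma_map_iter_*()
 * calls and the iter.addr/iter.len/iter.status fields come from this file.
 *
 *	static blk_status_t nvmeish_map_data(struct device *dma_dev,
 *			struct request *req, struct nvmeish_iod *iod)
 *	{
 *		struct blk_dma_iter iter;
 *
 *		if (!blk_rq_dma_map_iter_start(req, dma_dev, &iod->dma_state,
 *					       &iter))
 *			return iter.status;
 *
 *		do {
 *			// hand each mapped segment to the hardware descriptor
 *			nvmeish_add_segment(iod, iter.addr, iter.len);
 *		} while (blk_rq_dma_map_iter_next(req, dma_dev,
 *						  &iod->dma_state, &iter));
 *
 *		// a false return either means "no more segments" (status is
 *		// BLK_STS_OK) or a mapping failure reported in iter.status
 *		return iter.status;
 *	}
 *
 * The dma_iova_state stored in the hypothetical iod must be kept around until
 * the request is unmapped, as noted in the kernel-doc above.
 */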

static inline struct scatterlist *
blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
{
	if (!*sg)
		return sglist;

	/*
	 * If the driver previously mapped a shorter list, we could see a
	 * termination bit prematurely unless it fully inits the sg table
	 * on each mapping. We KNOW that there must be more entries here
	 * or the driver would be buggy, so force clear the termination bit
	 * to avoid doing a full sg_init_table() in drivers for each command.
	 */
	sg_unmark_end(*sg);
	return sg_next(*sg);
}

/*
 * Map a request to scatterlist, return number of sg entries setup. Caller
 * must make sure sg can hold rq->nr_phys_segments entries.
 */
int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
		    struct scatterlist **last_sg)
{
	struct blk_map_iter iter;
	struct phys_vec vec;
	int nsegs = 0;

	blk_rq_map_iter_init(rq, &iter);
	while (blk_map_iter_next(rq, &iter, &vec)) {
		*last_sg = blk_next_sg(last_sg, sglist);
		sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
			    offset_in_page(vec.paddr));
		nsegs++;
	}

	if (*last_sg)
		sg_mark_end(*last_sg);

	/*
	 * Something must have gone wrong if the computed number of segments
	 * is bigger than the request's number of physical segments.
	 */
	WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));

	return nsegs;
}
EXPORT_SYMBOL(__blk_rq_map_sg);
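
/*
 * Illustrative usage sketch (not part of the original file): a scatterlist
 * based driver could map a request like this. The sgl buffer is a hypothetical
 * driver-side name; it only needs sg_init_table() once at allocation time,
 * because blk_next_sg() above clears any stale end marker on reuse.
 *
 *	struct scatterlist *last_sg = NULL;
 *	int nsegs, dma_nents;
 *
 *	// sgl was allocated large enough for blk_rq_nr_phys_segments(rq)
 *	// entries
 *	nsegs = __blk_rq_map_sg(rq, sgl, &last_sg);
 *	if (nsegs)
 *		dma_nents = dma_map_sg(dma_dev, sgl, nsegs, rq_dma_dir(rq));
 */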

#ifdef CONFIG_BLK_DEV_INTEGRITY
/**
 * blk_rq_integrity_dma_map_iter_start - map the first integrity DMA segment
 *	for a request
 * @req: request to map
 * @dma_dev: device to map to
 * @state: DMA IOVA state
 * @iter: block layer DMA iterator
 *
 * Start DMA mapping @req integrity data to @dma_dev. @state and @iter are
 * provided by the caller and don't need to be initialized. @state needs to be
 * stored for use at unmap time, @iter is only needed at map time.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr
 * and the length in @iter.len. If no segment was mapped the status code is
 * returned in @iter.status.
 *
 * The caller can call blk_rq_dma_map_coalesce() to check if further segments
 * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
 * to try to map the following segments.
 */
bool blk_rq_integrity_dma_map_iter_start(struct request *req,
		struct device *dma_dev, struct dma_iova_state *state,
		struct blk_dma_iter *iter)
{
	unsigned len = bio_integrity_bytes(&req->q->limits.integrity,
					   blk_rq_sectors(req));
	struct bio *bio = req->bio;

	iter->iter = (struct blk_map_iter) {
		.bio = bio,
		.iter = bio_integrity(bio)->bip_iter,
		.bvecs = bio_integrity(bio)->bip_vec,
		.is_integrity = true,
	};
	return blk_dma_map_iter_start(req, dma_dev, state, iter, len);
}
EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start);

/**
 * blk_rq_integrity_dma_map_iter_next - map the next integrity DMA segment for
 *	a request
 * @req: request to map
 * @dma_dev: device to map to
 * @iter: block layer DMA iterator
 *
 * Iterate to the next integrity mapping after a previous call to
 * blk_rq_integrity_dma_map_iter_start(). See there for a detailed description
 * of the arguments.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr and
 * the length in @iter.len. If no segment was mapped the status code is
 * returned in @iter.status.
 */
bool blk_rq_integrity_dma_map_iter_next(struct request *req,
		struct device *dma_dev, struct blk_dma_iter *iter)
{
	struct phys_vec vec;

	if (!blk_map_iter_next(req, &iter->iter, &vec))
		return false;

	if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
		return blk_dma_map_bus(iter, &vec);
	return blk_dma_map_direct(req, dma_dev, iter, &vec);
}
EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next);
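
/*
 * Illustrative usage sketch (not part of the original file): mapping integrity
 * metadata follows the same pattern as the data path above, except that
 * blk_rq_integrity_dma_map_iter_next() does not take the dma_iova_state. The
 * iod structure and nvmeish_add_meta_segment() helper are hypothetical.
 *
 *	struct blk_dma_iter iter;
 *
 *	if (!blk_rq_integrity_dma_map_iter_start(req, dma_dev,
 *			&iod->meta_dma_state, &iter))
 *		return iter.status;
 *	do {
 *		nvmeish_add_meta_segment(iod, iter.addr, iter.len);
 *	} while (blk_rq_integrity_dma_map_iter_next(req, dma_dev, &iter));
 *	return iter.status;
 */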

/**
 * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
 * @rq: request to map
 * @sglist: target scatterlist
 *
 * Description: Map the integrity vectors in request into a
 * scatterlist. The scatterlist must be big enough to hold all
 * elements. I.e. sized using blk_rq_count_integrity_sg() or
 * rq->nr_integrity_segments.
 */
int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
{
	struct request_queue *q = rq->q;
	struct scatterlist *sg = NULL;
	struct bio *bio = rq->bio;
	unsigned int segments = 0;
	struct phys_vec vec;

	struct blk_map_iter iter = {
		.bio = bio,
		.iter = bio_integrity(bio)->bip_iter,
		.bvecs = bio_integrity(bio)->bip_vec,
		.is_integrity = true,
	};

	while (blk_map_iter_next(rq, &iter, &vec)) {
		sg = blk_next_sg(&sg, sglist);
		sg_set_page(sg, phys_to_page(vec.paddr), vec.len,
			    offset_in_page(vec.paddr));
		segments++;
	}

	if (sg)
		sg_mark_end(sg);

	/*
	 * Something must have gone wrong if the computed number of segments
	 * is bigger than the request's number of physical integrity segments.
	 */
	BUG_ON(segments > rq->nr_integrity_segments);
	BUG_ON(segments > queue_max_integrity_segments(q));
	return segments;
}
EXPORT_SYMBOL(blk_rq_map_integrity_sg);
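
/*
 * Illustrative usage sketch (not part of the original file): the scatterlist
 * variant mirrors __blk_rq_map_sg() above. meta_sgl is a hypothetical driver
 * buffer sized for rq->nr_integrity_segments entries.
 *
 *	int nsegs = blk_rq_map_integrity_sg(rq, meta_sgl);
 *
 *	if (nsegs)
 *		dma_map_sg(dma_dev, meta_sgl, nsegs, rq_dma_dir(rq));
 */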
#endif