Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[POWERPC] Use 4kB iommu pages even on 64kB-page systems

The 10Gigabit ethernet device drivers appear to be able to chew
up all 256MB of TCE mappings on pSeries systems, as evidenced by
numerous error messages:

iommu_alloc failed, tbl c0000000010d5c48 vaddr c0000000d875eff0 npages 1

Some experimentation indicates that this is essentially because
one 1500 byte ethernet MTU gets mapped as a 64K DMA region when
the large 64K pages are enabled. Thus, it doesn't take much to
exhaust all of the available DMA mappings for a high-speed card.

This patch changes the iommu allocator to work with its own
unique, distinct page size. Although the patch is long, it's
actually quite simple: it just #defines a distinct IOMMU_PAGE_SIZE
and then uses this in all the places that matter.

As a side effect, it also dramatically improves network performance
on platforms with H-calls on iommu translation inserts/removes (since
we no longer call it 16 times for a 1500-byte packet when the iommu HW
is still 4k).

In the future, we might want to make the IOMMU_PAGE_SIZE a variable
in the iommu_table instance, thus allowing support for different HW
page sizes in the iommu itself.

Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Olof Johansson <olof@lixom.net>
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Paul Mackerras <paulus@samba.org>

authored by

Linas Vepstas and committed by
Paul Mackerras
5d2efba6 dd6c89f6

+80 -81
+45 -32
arch/powerpc/kernel/iommu.c
··· 47 47 static int novmerge = 1; 48 48 #endif 49 49 50 + static inline unsigned long iommu_num_pages(unsigned long vaddr, 51 + unsigned long slen) 52 + { 53 + unsigned long npages; 54 + 55 + npages = IOMMU_PAGE_ALIGN(vaddr + slen) - (vaddr & IOMMU_PAGE_MASK); 56 + npages >>= IOMMU_PAGE_SHIFT; 57 + 58 + return npages; 59 + } 60 + 50 61 static int __init setup_iommu(char *str) 51 62 { 52 63 if (!strcmp(str, "novmerge")) ··· 189 178 } 190 179 191 180 entry += tbl->it_offset; /* Offset into real TCE table */ 192 - ret = entry << PAGE_SHIFT; /* Set the return dma address */ 181 + ret = entry << IOMMU_PAGE_SHIFT; /* Set the return dma address */ 193 182 194 183 /* Put the TCEs in the HW table */ 195 - ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & PAGE_MASK, 184 + ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & IOMMU_PAGE_MASK, 196 185 direction); 197 186 198 187 ··· 214 203 unsigned long entry, free_entry; 215 204 unsigned long i; 216 205 217 - entry = dma_addr >> PAGE_SHIFT; 206 + entry = dma_addr >> IOMMU_PAGE_SHIFT; 218 207 free_entry = entry - tbl->it_offset; 219 208 220 209 if (((free_entry + npages) > tbl->it_size) || ··· 281 270 /* Init first segment length for backout at failure */ 282 271 outs->dma_length = 0; 283 272 284 - DBG("mapping %d elements:\n", nelems); 273 + DBG("sg mapping %d elements:\n", nelems); 285 274 286 275 spin_lock_irqsave(&(tbl->it_lock), flags); 287 276 ··· 296 285 } 297 286 /* Allocate iommu entries for that segment */ 298 287 vaddr = (unsigned long)page_address(s->page) + s->offset; 299 - npages = PAGE_ALIGN(vaddr + slen) - (vaddr & PAGE_MASK); 300 - npages >>= PAGE_SHIFT; 301 - entry = iommu_range_alloc(tbl, npages, &handle, mask >> PAGE_SHIFT, 0); 288 + npages = iommu_num_pages(vaddr, slen); 289 + entry = iommu_range_alloc(tbl, npages, &handle, mask >> IOMMU_PAGE_SHIFT, 0); 302 290 303 291 DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen); 304 292 ··· 311 301 312 302 /* Convert entry to a dma_addr_t */ 313 303 
entry += tbl->it_offset; 314 - dma_addr = entry << PAGE_SHIFT; 315 - dma_addr |= s->offset; 304 + dma_addr = entry << IOMMU_PAGE_SHIFT; 305 + dma_addr |= (s->offset & ~IOMMU_PAGE_MASK); 316 306 317 - DBG(" - %lx pages, entry: %lx, dma_addr: %lx\n", 307 + DBG(" - %lu pages, entry: %lx, dma_addr: %lx\n", 318 308 npages, entry, dma_addr); 319 309 320 310 /* Insert into HW table */ 321 - ppc_md.tce_build(tbl, entry, npages, vaddr & PAGE_MASK, direction); 311 + ppc_md.tce_build(tbl, entry, npages, vaddr & IOMMU_PAGE_MASK, direction); 322 312 323 313 /* If we are in an open segment, try merging */ 324 314 if (segstart != s) { ··· 333 323 DBG(" can't merge, new segment.\n"); 334 324 } else { 335 325 outs->dma_length += s->length; 336 - DBG(" merged, new len: %lx\n", outs->dma_length); 326 + DBG(" merged, new len: %ux\n", outs->dma_length); 337 327 } 338 328 } 339 329 ··· 377 367 if (s->dma_length != 0) { 378 368 unsigned long vaddr, npages; 379 369 380 - vaddr = s->dma_address & PAGE_MASK; 381 - npages = (PAGE_ALIGN(s->dma_address + s->dma_length) - vaddr) 382 - >> PAGE_SHIFT; 370 + vaddr = s->dma_address & IOMMU_PAGE_MASK; 371 + npages = iommu_num_pages(s->dma_address, s->dma_length); 383 372 __iommu_free(tbl, vaddr, npages); 384 373 s->dma_address = DMA_ERROR_CODE; 385 374 s->dma_length = 0; ··· 407 398 408 399 if (sglist->dma_length == 0) 409 400 break; 410 - npages = (PAGE_ALIGN(dma_handle + sglist->dma_length) 411 - - (dma_handle & PAGE_MASK)) >> PAGE_SHIFT; 401 + npages = iommu_num_pages(dma_handle,sglist->dma_length); 412 402 __iommu_free(tbl, dma_handle, npages); 413 403 sglist++; 414 404 } ··· 540 532 BUG_ON(direction == DMA_NONE); 541 533 542 534 uaddr = (unsigned long)vaddr; 543 - npages = PAGE_ALIGN(uaddr + size) - (uaddr & PAGE_MASK); 544 - npages >>= PAGE_SHIFT; 535 + npages = iommu_num_pages(uaddr, size); 545 536 546 537 if (tbl) { 547 538 dma_handle = iommu_alloc(tbl, vaddr, npages, direction, 548 - mask >> PAGE_SHIFT, 0); 539 + mask >> IOMMU_PAGE_SHIFT, 
0); 549 540 if (dma_handle == DMA_ERROR_CODE) { 550 541 if (printk_ratelimit()) { 551 542 printk(KERN_INFO "iommu_alloc failed, " ··· 552 545 tbl, vaddr, npages); 553 546 } 554 547 } else 555 - dma_handle |= (uaddr & ~PAGE_MASK); 548 + dma_handle |= (uaddr & ~IOMMU_PAGE_MASK); 556 549 } 557 550 558 551 return dma_handle; ··· 561 554 void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle, 562 555 size_t size, enum dma_data_direction direction) 563 556 { 557 + unsigned int npages; 558 + 564 559 BUG_ON(direction == DMA_NONE); 565 560 566 - if (tbl) 567 - iommu_free(tbl, dma_handle, (PAGE_ALIGN(dma_handle + size) - 568 - (dma_handle & PAGE_MASK)) >> PAGE_SHIFT); 561 + if (tbl) { 562 + npages = iommu_num_pages(dma_handle, size); 563 + iommu_free(tbl, dma_handle, npages); 564 + } 569 565 } 570 566 571 567 /* Allocates a contiguous real buffer and creates mappings over it. ··· 580 570 { 581 571 void *ret = NULL; 582 572 dma_addr_t mapping; 583 - unsigned int npages, order; 573 + unsigned int order; 574 + unsigned int nio_pages, io_order; 584 575 struct page *page; 585 576 586 577 size = PAGE_ALIGN(size); 587 - npages = size >> PAGE_SHIFT; 588 578 order = get_order(size); 589 579 590 580 /* ··· 608 598 memset(ret, 0, size); 609 599 610 600 /* Set up tces to cover the allocated range */ 611 - mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL, 612 - mask >> PAGE_SHIFT, order); 601 + nio_pages = size >> IOMMU_PAGE_SHIFT; 602 + io_order = get_iommu_order(size); 603 + mapping = iommu_alloc(tbl, ret, nio_pages, DMA_BIDIRECTIONAL, 604 + mask >> IOMMU_PAGE_SHIFT, io_order); 613 605 if (mapping == DMA_ERROR_CODE) { 614 606 free_pages((unsigned long)ret, order); 615 607 return NULL; ··· 623 611 void iommu_free_coherent(struct iommu_table *tbl, size_t size, 624 612 void *vaddr, dma_addr_t dma_handle) 625 613 { 626 - unsigned int npages; 627 - 628 614 if (tbl) { 615 + unsigned int nio_pages; 616 + 629 617 size = PAGE_ALIGN(size); 630 - npages = size >> 
PAGE_SHIFT; 631 - iommu_free(tbl, dma_handle, npages); 618 + nio_pages = size >> IOMMU_PAGE_SHIFT; 619 + iommu_free(tbl, dma_handle, nio_pages); 620 + size = PAGE_ALIGN(size); 632 621 free_pages((unsigned long)vaddr, get_order(size)); 633 622 } 634 623 }
+2 -2
arch/powerpc/kernel/vio.c
··· 92 92 &tbl->it_index, &offset, &size); 93 93 94 94 /* TCE table size - measured in tce entries */ 95 - tbl->it_size = size >> PAGE_SHIFT; 95 + tbl->it_size = size >> IOMMU_PAGE_SHIFT; 96 96 /* offset for VIO should always be 0 */ 97 - tbl->it_offset = offset >> PAGE_SHIFT; 97 + tbl->it_offset = offset >> IOMMU_PAGE_SHIFT; 98 98 tbl->it_busno = 0; 99 99 tbl->it_type = TCE_VB; 100 100
+2 -9
arch/powerpc/platforms/iseries/iommu.c
··· 43 43 u64 rc; 44 44 u64 tce, rpn; 45 45 46 - index <<= TCE_PAGE_FACTOR; 47 - npages <<= TCE_PAGE_FACTOR; 48 - 49 46 while (npages--) { 50 47 rpn = virt_to_abs(uaddr) >> TCE_SHIFT; 51 48 tce = (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; ··· 71 74 static void tce_free_iSeries(struct iommu_table *tbl, long index, long npages) 72 75 { 73 76 u64 rc; 74 - 75 - npages <<= TCE_PAGE_FACTOR; 76 - index <<= TCE_PAGE_FACTOR; 77 77 78 78 while (npages--) { 79 79 rc = HvCallXm_setTce((u64)tbl->it_index, (u64)index, 0); ··· 130 136 panic("PCI_DMA: parms->size is zero, parms is 0x%p", parms); 131 137 132 138 /* itc_size is in pages worth of table, it_size is in # of entries */ 133 - tbl->it_size = ((parms->itc_size * TCE_PAGE_SIZE) / 134 - TCE_ENTRY_SIZE) >> TCE_PAGE_FACTOR; 139 + tbl->it_size = (parms->itc_size * TCE_PAGE_SIZE) / TCE_ENTRY_SIZE; 135 140 tbl->it_busno = parms->itc_busno; 136 - tbl->it_offset = parms->itc_offset >> TCE_PAGE_FACTOR; 141 + tbl->it_offset = parms->itc_offset; 137 142 tbl->it_index = parms->itc_index; 138 143 tbl->it_blocksize = 1; 139 144 tbl->it_type = virtbus ? TCE_VB : TCE_PCI;
+8 -27
arch/powerpc/platforms/pseries/iommu.c
··· 57 57 u64 *tcep; 58 58 u64 rpn; 59 59 60 - index <<= TCE_PAGE_FACTOR; 61 - npages <<= TCE_PAGE_FACTOR; 62 - 63 60 proto_tce = TCE_PCI_READ; // Read allowed 64 61 65 62 if (direction != DMA_TO_DEVICE) ··· 79 82 { 80 83 u64 *tcep; 81 84 82 - npages <<= TCE_PAGE_FACTOR; 83 - index <<= TCE_PAGE_FACTOR; 84 - 85 85 tcep = ((u64 *)tbl->it_base) + index; 86 86 87 87 while (npages--) ··· 89 95 { 90 96 u64 *tcep; 91 97 92 - index <<= TCE_PAGE_FACTOR; 93 98 tcep = ((u64 *)tbl->it_base) + index; 94 99 95 100 return *tcep; ··· 101 108 u64 rc; 102 109 u64 proto_tce, tce; 103 110 u64 rpn; 104 - 105 - tcenum <<= TCE_PAGE_FACTOR; 106 - npages <<= TCE_PAGE_FACTOR; 107 111 108 112 rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; 109 113 proto_tce = TCE_PCI_READ; ··· 136 146 u64 rpn; 137 147 long l, limit; 138 148 139 - if (TCE_PAGE_FACTOR == 0 && npages == 1) 149 + if (npages == 1) 140 150 return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, 141 151 direction); 142 152 ··· 153 163 uaddr, direction); 154 164 __get_cpu_var(tce_page) = tcep; 155 165 } 156 - 157 - tcenum <<= TCE_PAGE_FACTOR; 158 - npages <<= TCE_PAGE_FACTOR; 159 166 160 167 rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; 161 168 proto_tce = TCE_PCI_READ; ··· 194 207 { 195 208 u64 rc; 196 209 197 - tcenum <<= TCE_PAGE_FACTOR; 198 - npages <<= TCE_PAGE_FACTOR; 199 - 200 210 while (npages--) { 201 211 rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, 0); 202 212 ··· 213 229 { 214 230 u64 rc; 215 231 216 - tcenum <<= TCE_PAGE_FACTOR; 217 - npages <<= TCE_PAGE_FACTOR; 218 - 219 232 rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages); 220 233 221 234 if (rc && printk_ratelimit()) { ··· 229 248 u64 rc; 230 249 unsigned long tce_ret; 231 250 232 - tcenum <<= TCE_PAGE_FACTOR; 233 251 rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret); 234 252 235 253 if (rc && printk_ratelimit()) { ··· 269 289 tbl->it_busno = phb->bus->number; 270 290 271 291 /* Units of tce entries */ 272 - tbl->it_offset = 
phb->dma_window_base_cur >> PAGE_SHIFT; 292 + tbl->it_offset = phb->dma_window_base_cur >> IOMMU_PAGE_SHIFT; 273 293 274 294 /* Test if we are going over 2GB of DMA space */ 275 295 if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) { ··· 280 300 phb->dma_window_base_cur += phb->dma_window_size; 281 301 282 302 /* Set the tce table size - measured in entries */ 283 - tbl->it_size = phb->dma_window_size >> PAGE_SHIFT; 303 + tbl->it_size = phb->dma_window_size >> IOMMU_PAGE_SHIFT; 284 304 285 305 tbl->it_index = 0; 286 306 tbl->it_blocksize = 16; ··· 305 325 tbl->it_base = 0; 306 326 tbl->it_blocksize = 16; 307 327 tbl->it_type = TCE_PCI; 308 - tbl->it_offset = offset >> PAGE_SHIFT; 309 - tbl->it_size = size >> PAGE_SHIFT; 328 + tbl->it_offset = offset >> IOMMU_PAGE_SHIFT; 329 + tbl->it_size = size >> IOMMU_PAGE_SHIFT; 310 330 } 311 331 312 332 static void iommu_bus_setup_pSeries(struct pci_bus *bus) ··· 502 522 const void *dma_window = NULL; 503 523 struct pci_dn *pci; 504 524 505 - DBG("iommu_dev_setup_pSeriesLP, dev %p (%s)\n", dev, pci_name(dev)); 506 - 507 525 /* dev setup for LPAR is a little tricky, since the device tree might 508 526 * contain the dma-window properties per-device and not neccesarily 509 527 * for the bus. So we need to search upwards in the tree until we ··· 509 531 * already allocated. 510 532 */ 511 533 dn = pci_device_to_OF_node(dev); 534 + 535 + DBG("iommu_dev_setup_pSeriesLP, dev %p (%s) %s\n", 536 + dev, pci_name(dev), dn->full_name); 512 537 513 538 for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table; 514 539 pdn = pdn->parent) {
-1
arch/powerpc/sysdev/dart.h
··· 72 72 73 73 #define DART_PAGE_SHIFT 12 74 74 #define DART_PAGE_SIZE (1 << DART_PAGE_SHIFT) 75 - #define DART_PAGE_FACTOR (PAGE_SHIFT - DART_PAGE_SHIFT) 76 75 77 76 78 77 #endif /* _POWERPC_SYSDEV_DART_H */
+1 -7
arch/powerpc/sysdev/dart_iommu.c
··· 156 156 157 157 DBG("dart: build at: %lx, %lx, addr: %x\n", index, npages, uaddr); 158 158 159 - index <<= DART_PAGE_FACTOR; 160 - npages <<= DART_PAGE_FACTOR; 161 - 162 159 dp = ((unsigned int*)tbl->it_base) + index; 163 160 164 161 /* On U3, all memory is contigous, so we can move this ··· 195 198 */ 196 199 197 200 DBG("dart: free at: %lx, %lx\n", index, npages); 198 - 199 - index <<= DART_PAGE_FACTOR; 200 - npages <<= DART_PAGE_FACTOR; 201 201 202 202 dp = ((unsigned int *)tbl->it_base) + index; 203 203 ··· 275 281 iommu_table_dart.it_busno = 0; 276 282 iommu_table_dart.it_offset = 0; 277 283 /* it_size is in number of entries */ 278 - iommu_table_dart.it_size = (dart_tablesize / sizeof(u32)) >> DART_PAGE_FACTOR; 284 + iommu_table_dart.it_size = dart_tablesize / sizeof(u32); 279 285 280 286 /* Initialize the common IOMMU code */ 281 287 iommu_table_dart.it_base = (unsigned long)dart_vbase;
+20 -2
include/asm-powerpc/iommu.h
··· 22 22 #define _ASM_IOMMU_H 23 23 #ifdef __KERNEL__ 24 24 25 - #include <asm/types.h> 25 + #include <linux/compiler.h> 26 26 #include <linux/spinlock.h> 27 27 #include <linux/device.h> 28 28 #include <linux/dma-mapping.h> 29 + #include <asm/types.h> 30 + #include <asm/bitops.h> 31 + 32 + #define IOMMU_PAGE_SHIFT 12 33 + #define IOMMU_PAGE_SIZE (ASM_CONST(1) << IOMMU_PAGE_SHIFT) 34 + #define IOMMU_PAGE_MASK (~((1 << IOMMU_PAGE_SHIFT) - 1)) 35 + #define IOMMU_PAGE_ALIGN(addr) _ALIGN_UP(addr, IOMMU_PAGE_SIZE) 36 + 37 + #ifndef __ASSEMBLY__ 38 + 39 + /* Pure 2^n version of get_order */ 40 + static __inline__ __attribute_const__ int get_iommu_order(unsigned long size) 41 + { 42 + return __ilog2((size - 1) >> IOMMU_PAGE_SHIFT) + 1; 43 + } 44 + 45 + #endif /* __ASSEMBLY__ */ 46 + 29 47 30 48 /* 31 49 * IOMAP_MAX_ORDER defines the largest contiguous block 32 50 * of dma space we can get. IOMAP_MAX_ORDER = 13 33 51 * allows up to 2**12 pages (4096 * 4096) = 16 MB 34 52 */ 35 - #define IOMAP_MAX_ORDER 13 53 + #define IOMAP_MAX_ORDER 13 36 54 37 55 struct iommu_table { 38 56 unsigned long it_busno; /* Bus number this table belongs to */
+2 -1
include/asm-powerpc/tce.h
··· 22 22 #define _ASM_POWERPC_TCE_H 23 23 #ifdef __KERNEL__ 24 24 25 + #include <asm/iommu.h> 26 + 25 27 /* 26 28 * Tces come in two formats, one for the virtual bus and a different 27 29 * format for PCI ··· 35 33 36 34 #define TCE_SHIFT 12 37 35 #define TCE_PAGE_SIZE (1 << TCE_SHIFT) 38 - #define TCE_PAGE_FACTOR (PAGE_SHIFT - TCE_SHIFT) 39 36 40 37 #define TCE_ENTRY_SIZE 8 /* each TCE is 64 bits */ 41 38