Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[PATCH] x86_64: Add 4GB DMA32 zone

Add a new 4GB GFP_DMA32 zone between the GFP_DMA and GFP_NORMAL zones.

As a bit of historical background: when the x86-64 port
was originally designed we had some discussion if we should
use a 16MB DMA zone like i386 or a 4GB DMA zone like IA64 or
both. Both were ruled out at that point because it was in early
2.4, when the VM was still quite shaky and had bad trouble even
dealing with one DMA zone. We settled on the 16MB DMA zone mainly
because we worried about older soundcards and the floppy.

But this has always caused problems since then because
device drivers had trouble getting enough DMA-able memory. These days
the VM works much better and the wide use of NUMA has proven
it can deal with many zones successfully.

So this patch adds both zones.

This helps drivers who need a lot of memory below 4GB because
their hardware is not accessing more (graphic drivers - proprietary
and free ones, video frame buffer drivers, sound drivers etc.).
Previously they could only use IOMMU+16MB GFP_DMA, which
was not enough memory.

Another common problem is that hardware that has full memory
addressing for >4GB misses it for some control structures in memory
(like transmit rings or other metadata). They tended to allocate memory
in the 16MB GFP_DMA or the IOMMU/swiotlb then using pci_alloc_consistent,
but that can tie up a lot of precious 16MB GFP_DMA/IOMMU/swiotlb memory
(even on AMD systems the IOMMU tends to be quite small) especially if you have
many devices. With the new zone pci_alloc_consistent can just put
this stuff into memory below 4GB which works better.

One argument was still if the zone should be 4GB or 2GB. The main
motivation for 2GB would be an unnamed not so unpopular hardware
raid controller (mostly found in older machines from a particular four-letter
company) that has a strange 2GB restriction in firmware. But
that one works ok with swiotlb/IOMMU anyways, so it doesn't really
need GFP_DMA32. I chose 4GB to be compatible with IA64 and because
it seems to be the most common restriction.

The new zone is so far added only for x86-64.

For other architectures that don't set up this
new zone nothing changes. Architectures can set a compatibility
define in Kconfig CONFIG_DMA_IS_DMA32 that will define GFP_DMA32
as GFP_DMA. Otherwise it's a nop because on 32-bit architectures
it's normally not needed because GFP_NORMAL (=0) is DMA-able
enough.

One problem is still that GFP_DMA means different things on different
architectures. e.g. some drivers used to have #ifdef ia64 use GFP_DMA
(trusting it to be 4GB) #elif __x86_64__ (use other hacks like
the swiotlb because 16MB is not enough) ... . This was quite
ugly and is now obsolete.

These should now be converted to use GFP_DMA32 unconditionally. I haven't done
this yet. Or best only use pci_alloc_consistent/dma_alloc_coherent
which will use GFP_DMA32 transparently.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

authored by

Andi Kleen and committed by
Linus Torvalds
a2f1b424 56720367

+90 -55
+42 -23
arch/x86_64/mm/init.c
··· 318 318 flush_tlb_all(); 319 319 } 320 320 321 + /* Compute zone sizes for the DMA and DMA32 zones in a node. */ 322 + __init void 323 + size_zones(unsigned long *z, unsigned long *h, 324 + unsigned long start_pfn, unsigned long end_pfn) 325 + { 326 + int i; 327 + unsigned long w; 328 + 329 + for (i = 0; i < MAX_NR_ZONES; i++) 330 + z[i] = 0; 331 + 332 + if (start_pfn < MAX_DMA_PFN) 333 + z[ZONE_DMA] = MAX_DMA_PFN - start_pfn; 334 + if (start_pfn < MAX_DMA32_PFN) { 335 + unsigned long dma32_pfn = MAX_DMA32_PFN; 336 + if (dma32_pfn > end_pfn) 337 + dma32_pfn = end_pfn; 338 + z[ZONE_DMA32] = dma32_pfn - start_pfn; 339 + } 340 + z[ZONE_NORMAL] = end_pfn - start_pfn; 341 + 342 + /* Remove lower zones from higher ones. */ 343 + w = 0; 344 + for (i = 0; i < MAX_NR_ZONES; i++) { 345 + if (z[i]) 346 + z[i] -= w; 347 + w += z[i]; 348 + } 349 + 350 + /* Compute holes */ 351 + w = 0; 352 + for (i = 0; i < MAX_NR_ZONES; i++) { 353 + unsigned long s = w; 354 + w += z[i]; 355 + h[i] = e820_hole_size(s, w); 356 + } 357 + } 358 + 321 359 #ifndef CONFIG_NUMA 322 360 void __init paging_init(void) 323 361 { 324 - { 325 - unsigned long zones_size[MAX_NR_ZONES]; 326 - unsigned long holes[MAX_NR_ZONES]; 327 - unsigned int max_dma; 328 - 329 - memset(zones_size, 0, sizeof(zones_size)); 330 - memset(holes, 0, sizeof(holes)); 331 - 332 - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; 333 - 334 - if (end_pfn < max_dma) { 335 - zones_size[ZONE_DMA] = end_pfn; 336 - holes[ZONE_DMA] = e820_hole_size(0, end_pfn); 337 - } else { 338 - zones_size[ZONE_DMA] = max_dma; 339 - holes[ZONE_DMA] = e820_hole_size(0, max_dma); 340 - zones_size[ZONE_NORMAL] = end_pfn - max_dma; 341 - holes[ZONE_NORMAL] = e820_hole_size(max_dma, end_pfn); 342 - } 343 - free_area_init_node(0, NODE_DATA(0), zones_size, 344 - __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes); 345 - } 346 - return; 362 + unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES]; 363 + size_zones(zones, holes, 0, end_pfn); 364 + 
free_area_init_node(0, NODE_DATA(0), zones, 365 + __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes); 347 366 } 348 367 #endif 349 368
+5 -20
arch/x86_64/mm/numa.c
··· 132 132 unsigned long start_pfn, end_pfn; 133 133 unsigned long zones[MAX_NR_ZONES]; 134 134 unsigned long holes[MAX_NR_ZONES]; 135 - unsigned long dma_end_pfn; 136 135 137 - memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES); 138 - memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES); 136 + start_pfn = node_start_pfn(nodeid); 137 + end_pfn = node_end_pfn(nodeid); 139 138 140 - start_pfn = node_start_pfn(nodeid); 141 - end_pfn = node_end_pfn(nodeid); 139 + Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", 140 + nodeid, start_pfn, end_pfn); 142 141 143 - Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn); 144 - 145 - /* All nodes > 0 have a zero length zone DMA */ 146 - dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; 147 - if (start_pfn < dma_end_pfn) { 148 - zones[ZONE_DMA] = dma_end_pfn - start_pfn; 149 - holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn); 150 - zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; 151 - holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn); 152 - 153 - } else { 154 - zones[ZONE_NORMAL] = end_pfn - start_pfn; 155 - holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn); 156 - } 157 - 142 + size_zones(zones, holes, start_pfn, end_pfn); 158 143 free_area_init_node(nodeid, NODE_DATA(nodeid), zones, 159 144 start_pfn, holes); 160 145 }
+9 -2
include/asm-x86_64/dma.h
··· 72 72 73 73 #define MAX_DMA_CHANNELS 8 74 74 75 - /* The maximum address that we can perform a DMA transfer to on this platform */ 76 - #define MAX_DMA_ADDRESS (PAGE_OFFSET+0x1000000) 75 + 76 + /* 16MB ISA DMA zone */ 77 + #define MAX_DMA_PFN ((16*1024*1024) >> PAGE_SHIFT) 78 + 79 + /* 4GB broken PCI/AGP hardware bus master zone */ 80 + #define MAX_DMA32_PFN ((4UL*1024*1024*1024) >> PAGE_SHIFT) 81 + 82 + /* Compat define for old dma zone */ 83 + #define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT)) 77 84 78 85 /* 8237 DMA controllers */ 79 86 #define IO_DMA1_BASE 0x00 /* 8 bit slave DMA, channels 0..3 */
+2
include/asm-x86_64/proto.h
··· 22 22 #define mtrr_bp_init() do {} while (0) 23 23 #endif 24 24 extern void init_memory_mapping(unsigned long start, unsigned long end); 25 + extern void size_zones(unsigned long *z, unsigned long *h, 26 + unsigned long start_pfn, unsigned long end_pfn); 25 27 26 28 extern void system_call(void); 27 29 extern int kernel_syscall(void);
+11
include/linux/gfp.h
··· 14 14 /* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits) */ 15 15 #define __GFP_DMA ((__force gfp_t)0x01u) 16 16 #define __GFP_HIGHMEM ((__force gfp_t)0x02u) 17 + #ifdef CONFIG_DMA_IS_DMA32 18 + #define __GFP_DMA32 ((__force gfp_t)0x01) /* ZONE_DMA is ZONE_DMA32 */ 19 + #elif BITS_PER_LONG < 64 20 + #define __GFP_DMA32 ((__force gfp_t)0x00) /* ZONE_NORMAL is ZONE_DMA32 */ 21 + #else 22 + #define __GFP_DMA32 ((__force gfp_t)0x04) /* Has own ZONE_DMA32 */ 23 + #endif 17 24 18 25 /* 19 26 * Action modifiers - doesn't change the zoning ··· 70 63 platforms, used as appropriate on others */ 71 64 72 65 #define GFP_DMA __GFP_DMA 66 + 67 + /* 4GB DMA on some platforms */ 68 + #define GFP_DMA32 __GFP_DMA32 69 + 73 70 74 71 #define gfp_zone(mask) ((__force int)((mask) & (__force gfp_t)GFP_ZONEMASK)) 75 72
+9 -7
include/linux/mmzone.h
··· 71 71 #endif 72 72 73 73 #define ZONE_DMA 0 74 - #define ZONE_NORMAL 1 75 - #define ZONE_HIGHMEM 2 74 + #define ZONE_DMA32 1 75 + #define ZONE_NORMAL 2 76 + #define ZONE_HIGHMEM 3 76 77 77 - #define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */ 78 + #define MAX_NR_ZONES 4 /* Sync this with ZONES_SHIFT */ 78 79 #define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */ 79 80 80 81 ··· 109 108 110 109 /* 111 110 * On machines where it is needed (eg PCs) we divide physical memory 112 - * into multiple physical zones. On a PC we have 3 zones: 111 + * into multiple physical zones. On a PC we have 4 zones: 113 112 * 114 113 * ZONE_DMA < 16 MB ISA DMA capable memory 114 + * ZONE_DMA32 0 MB Empty 115 115 * ZONE_NORMAL 16-896 MB direct mapped by the kernel 116 116 * ZONE_HIGHMEM > 896 MB only page cache and user processes 117 117 */ ··· 457 455 458 456 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) 459 457 /* 460 - * with 32 bit page->flags field, we reserve 8 bits for node/zone info. 461 - * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes. 458 + * with 32 bit page->flags field, we reserve 9 bits for node/zone info. 459 + * there are 4 zones (3 bits) and this leaves 9-3=6 bits for nodes. 462 460 */ 463 - #define FLAGS_RESERVED 8 461 + #define FLAGS_RESERVED 9 464 462 465 463 #elif BITS_PER_LONG == 64 466 464 /*
+12 -3
mm/page_alloc.c
··· 60 60 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 61 61 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 62 62 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 63 + * 64 + * TBD: should special case ZONE_DMA32 machines here - in those we normally 65 + * don't need any ZONE_NORMAL reservation 63 66 */ 64 - int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; 67 + int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 }; 65 68 66 69 EXPORT_SYMBOL(totalram_pages); 67 70 EXPORT_SYMBOL(nr_swap_pages); ··· 76 73 struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; 77 74 EXPORT_SYMBOL(zone_table); 78 75 79 - static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; 76 + static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; 80 77 int min_free_kbytes = 1024; 81 78 82 79 unsigned long __initdata nr_kernel_pages; ··· 1445 1442 zone = pgdat->node_zones + ZONE_NORMAL; 1446 1443 if (zone->present_pages) 1447 1444 zonelist->zones[j++] = zone; 1445 + case ZONE_DMA32: 1446 + zone = pgdat->node_zones + ZONE_DMA32; 1447 + if (zone->present_pages) 1448 + zonelist->zones[j++] = zone; 1448 1449 case ZONE_DMA: 1449 1450 zone = pgdat->node_zones + ZONE_DMA; 1450 1451 if (zone->present_pages) ··· 1463 1456 int res = ZONE_NORMAL; 1464 1457 if (zone_bits & (__force int)__GFP_HIGHMEM) 1465 1458 res = ZONE_HIGHMEM; 1459 + if (zone_bits & (__force int)__GFP_DMA32) 1460 + res = ZONE_DMA32; 1466 1461 if (zone_bits & (__force int)__GFP_DMA) 1467 1462 res = ZONE_DMA; 1468 1463 return res; ··· 1985 1976 if (zholes_size) 1986 1977 realsize -= zholes_size[j]; 1987 1978 1988 - if (j == ZONE_DMA || j == ZONE_NORMAL) 1979 + if (j < ZONE_HIGHMEM) 1989 1980 nr_kernel_pages += realsize; 1990 1981 nr_all_pages += realsize; 1991 1982