Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'x86-pat-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'x86-pat-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
x86, pat: Fix cacheflush address in change_page_attr_set_clr()
mm: remove !NUMA condition from PAGEFLAGS_EXTENDED condition set
x86: Fix earlyprintk=dbgp for machines without NX
x86, pat: Sanity check remap_pfn_range for RAM region
x86, pat: Lookup the protection from memtype list on vm_insert_pfn()
x86, pat: Add lookup_memtype to get the current memtype of a paddr
x86, pat: Use page flags to track memtypes of RAM pages
x86, pat: Generalize the use of page flag PG_uncached
x86, pat: Add rbtree to do quick lookup in memtype tracking
x86, pat: Add PAT reserve free to io_mapping* APIs
x86, pat: New i/f for driver to request memtype for IO regions
x86, pat: ioremap to follow same PAT restrictions as other PAT users
x86, pat: Keep identity maps consistent with mmaps even when pat_disabled
x86, mtrr: make mtrr_aps_delayed_init static bool
x86, pat/mtrr: Rendezvous all the cpus for MTRR/PAT init
generic-ipi: Allow cpus not yet online to call smp_call_function with irqs disabled
x86: Fix an incorrect argument of reserve_bootmem()
x86: Fix system crash when loading with "reservetop" parameter

+510 -143
+4
arch/ia64/Kconfig
··· 112 112 bool 113 113 select GENERIC_ALLOCATOR 114 114 115 + config ARCH_USES_PG_UNCACHED 116 + def_bool y 117 + depends on IA64_UNCACHED_ALLOCATOR 118 + 115 119 config AUDIT_ARCH 116 120 bool 117 121 default y
+4
arch/x86/Kconfig
··· 1417 1417 1418 1418 If unsure, say Y. 1419 1419 1420 + config ARCH_USES_PG_UNCACHED 1421 + def_bool y 1422 + depends on X86_PAT 1423 + 1420 1424 config EFI 1421 1425 bool "EFI runtime service support" 1422 1426 depends on ACPI
+52 -2
arch/x86/include/asm/cacheflush.h
··· 43 43 memcpy(dst, src, len); 44 44 } 45 45 46 - #define PG_non_WB PG_arch_1 47 - PAGEFLAG(NonWB, non_WB) 46 + #define PG_WC PG_arch_1 47 + PAGEFLAG(WC, WC) 48 + 49 + #ifdef CONFIG_X86_PAT 50 + /* 51 + * X86 PAT uses page flags WC and Uncached together to keep track of 52 + * memory type of pages that have backing page struct. X86 PAT supports 3 53 + * different memory types, _PAGE_CACHE_WB, _PAGE_CACHE_WC and 54 + * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not 55 + * been changed from its default (value of -1 used to denote this). 56 + * Note we do not support _PAGE_CACHE_UC here. 57 + * 58 + * Caller must hold memtype_lock for atomicity. 59 + */ 60 + static inline unsigned long get_page_memtype(struct page *pg) 61 + { 62 + if (!PageUncached(pg) && !PageWC(pg)) 63 + return -1; 64 + else if (!PageUncached(pg) && PageWC(pg)) 65 + return _PAGE_CACHE_WC; 66 + else if (PageUncached(pg) && !PageWC(pg)) 67 + return _PAGE_CACHE_UC_MINUS; 68 + else 69 + return _PAGE_CACHE_WB; 70 + } 71 + 72 + static inline void set_page_memtype(struct page *pg, unsigned long memtype) 73 + { 74 + switch (memtype) { 75 + case _PAGE_CACHE_WC: 76 + ClearPageUncached(pg); 77 + SetPageWC(pg); 78 + break; 79 + case _PAGE_CACHE_UC_MINUS: 80 + SetPageUncached(pg); 81 + ClearPageWC(pg); 82 + break; 83 + case _PAGE_CACHE_WB: 84 + SetPageUncached(pg); 85 + SetPageWC(pg); 86 + break; 87 + default: 88 + case -1: 89 + ClearPageUncached(pg); 90 + ClearPageWC(pg); 91 + break; 92 + } 93 + } 94 + #else 95 + static inline unsigned long get_page_memtype(struct page *pg) { return -1; } 96 + static inline void set_page_memtype(struct page *pg, unsigned long memtype) { } 97 + #endif 48 98 49 99 /* 50 100 * The set_memory_* API can be used to change various attributes of a virtual
+6 -3
arch/x86/include/asm/iomap.h
··· 26 26 #include <asm/pgtable.h> 27 27 #include <asm/tlbflush.h> 28 28 29 - int 30 - is_io_mapping_possible(resource_size_t base, unsigned long size); 31 - 32 29 void * 33 30 iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); 34 31 35 32 void 36 33 iounmap_atomic(void *kvaddr, enum km_type type); 34 + 35 + int 36 + iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); 37 + 38 + void 39 + iomap_free(resource_size_t base, unsigned long size); 37 40 38 41 #endif /* _ASM_X86_IOMAP_H */
+6
arch/x86/include/asm/mtrr.h
··· 121 121 extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); 122 122 extern void mtrr_ap_init(void); 123 123 extern void mtrr_bp_init(void); 124 + extern void set_mtrr_aps_delayed_init(void); 125 + extern void mtrr_aps_init(void); 126 + extern void mtrr_bp_restore(void); 124 127 extern int mtrr_trim_uncached_memory(unsigned long end_pfn); 125 128 extern int amd_special_default_mtrr(void); 126 129 # else ··· 164 161 165 162 #define mtrr_ap_init() do {} while (0) 166 163 #define mtrr_bp_init() do {} while (0) 164 + #define set_mtrr_aps_delayed_init() do {} while (0) 165 + #define mtrr_aps_init() do {} while (0) 166 + #define mtrr_bp_restore() do {} while (0) 167 167 # endif 168 168 169 169 #ifdef CONFIG_COMPAT
+5
arch/x86/include/asm/pat.h
··· 19 19 extern int kernel_map_sync_memtype(u64 base, unsigned long size, 20 20 unsigned long flag); 21 21 22 + int io_reserve_memtype(resource_size_t start, resource_size_t end, 23 + unsigned long *type); 24 + 25 + void io_free_memtype(resource_size_t start, resource_size_t end); 26 + 22 27 #endif /* _ASM_X86_PAT_H */
+37 -9
arch/x86/kernel/cpu/mtrr/main.c
··· 58 58 static DEFINE_MUTEX(mtrr_mutex); 59 59 60 60 u64 size_or_mask, size_and_mask; 61 + static bool mtrr_aps_delayed_init; 61 62 62 63 static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; 63 64 ··· 164 163 if (data->smp_reg != ~0U) { 165 164 mtrr_if->set(data->smp_reg, data->smp_base, 166 165 data->smp_size, data->smp_type); 167 - } else { 166 + } else if (mtrr_aps_delayed_init) { 167 + /* 168 + * Initialize the MTRRs in addition to the synchronisation. 169 + */ 168 170 mtrr_if->set_all(); 169 171 } 170 172 ··· 269 265 */ 270 266 if (reg != ~0U) 271 267 mtrr_if->set(reg, base, size, type); 268 + else if (!mtrr_aps_delayed_init) 269 + mtrr_if->set_all(); 272 270 273 271 /* Wait for the others */ 274 272 while (atomic_read(&data.count)) ··· 727 721 728 722 void mtrr_ap_init(void) 729 723 { 730 - unsigned long flags; 731 - 732 - if (!mtrr_if || !use_intel()) 724 + if (!use_intel() || mtrr_aps_delayed_init) 733 725 return; 734 726 /* 735 727 * Ideally we should hold mtrr_mutex here to avoid mtrr entries ··· 742 738 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug 743 739 * lock to prevent mtrr entry changes 744 740 */ 745 - local_irq_save(flags); 746 - 747 - mtrr_if->set_all(); 748 - 749 - local_irq_restore(flags); 741 + set_mtrr(~0U, 0, 0, 0); 750 742 } 751 743 752 744 /** ··· 751 751 void mtrr_save_state(void) 752 752 { 753 753 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); 754 + } 755 + 756 + void set_mtrr_aps_delayed_init(void) 757 + { 758 + if (!use_intel()) 759 + return; 760 + 761 + mtrr_aps_delayed_init = true; 762 + } 763 + 764 + /* 765 + * MTRR initialization for all AP's 766 + */ 767 + void mtrr_aps_init(void) 768 + { 769 + if (!use_intel()) 770 + return; 771 + 772 + set_mtrr(~0U, 0, 0, 0); 773 + mtrr_aps_delayed_init = false; 774 + } 775 + 776 + void mtrr_bp_restore(void) 777 + { 778 + if (!use_intel()) 779 + return; 780 + 781 + mtrr_if->set_all(); 754 782 } 755 783 756 784 static int __init mtrr_init_finialize(void)
+15 -5
arch/x86/kernel/setup.c
··· 712 712 printk(KERN_INFO "Command line: %s\n", boot_command_line); 713 713 #endif 714 714 715 + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 716 + *cmdline_p = command_line; 717 + 718 + #ifdef CONFIG_X86_64 719 + /* 720 + * Must call this twice: Once just to detect whether hardware doesn't 721 + * support NX (so that the early EHCI debug console setup can safely 722 + * call set_fixmap()), and then again after parsing early parameters to 723 + * honor the respective command line option. 724 + */ 725 + check_efer(); 726 + #endif 727 + 728 + parse_early_param(); 729 + 715 730 /* VMI may relocate the fixmap; do this before touching ioremap area */ 716 731 vmi_init(); 717 732 ··· 808 793 } 809 794 #endif 810 795 #endif 811 - 812 - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 813 - *cmdline_p = command_line; 814 - 815 - parse_early_param(); 816 796 817 797 #ifdef CONFIG_X86_64 818 798 check_efer();
+14
arch/x86/kernel/smpboot.c
··· 1118 1118 1119 1119 if (is_uv_system()) 1120 1120 uv_system_init(); 1121 + 1122 + set_mtrr_aps_delayed_init(); 1121 1123 out: 1122 1124 preempt_enable(); 1123 1125 } 1126 + 1127 + void arch_enable_nonboot_cpus_begin(void) 1128 + { 1129 + set_mtrr_aps_delayed_init(); 1130 + } 1131 + 1132 + void arch_enable_nonboot_cpus_end(void) 1133 + { 1134 + mtrr_aps_init(); 1135 + } 1136 + 1124 1137 /* 1125 1138 * Early setup to make printk work. 1126 1139 */ ··· 1155 1142 setup_ioapic_dest(); 1156 1143 #endif 1157 1144 check_nmi_watchdog(); 1145 + mtrr_aps_init(); 1158 1146 } 1159 1147 1160 1148 static int __initdata setup_possible_cpus = -1;
+25 -2
arch/x86/mm/iomap_32.c
··· 21 21 #include <linux/module.h> 22 22 #include <linux/highmem.h> 23 23 24 - int is_io_mapping_possible(resource_size_t base, unsigned long size) 24 + static int is_io_mapping_possible(resource_size_t base, unsigned long size) 25 25 { 26 26 #if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT) 27 27 /* There is no way to map greater than 1 << 32 address without PAE */ ··· 30 30 #endif 31 31 return 1; 32 32 } 33 - EXPORT_SYMBOL_GPL(is_io_mapping_possible); 33 + 34 + int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) 35 + { 36 + unsigned long flag = _PAGE_CACHE_WC; 37 + int ret; 38 + 39 + if (!is_io_mapping_possible(base, size)) 40 + return -EINVAL; 41 + 42 + ret = io_reserve_memtype(base, base + size, &flag); 43 + if (ret) 44 + return ret; 45 + 46 + *prot = __pgprot(__PAGE_KERNEL | flag); 47 + return 0; 48 + } 49 + EXPORT_SYMBOL_GPL(iomap_create_wc); 50 + 51 + void 52 + iomap_free(resource_size_t base, unsigned long size) 53 + { 54 + io_free_memtype(base, base + size); 55 + } 56 + EXPORT_SYMBOL_GPL(iomap_free); 34 57 35 58 void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) 36 59 {
+4 -14
arch/x86/mm/ioremap.c
··· 158 158 retval = reserve_memtype(phys_addr, (u64)phys_addr + size, 159 159 prot_val, &new_prot_val); 160 160 if (retval) { 161 - pr_debug("Warning: reserve_memtype returned %d\n", retval); 161 + printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval); 162 162 return NULL; 163 163 } 164 164 165 165 if (prot_val != new_prot_val) { 166 - /* 167 - * Do not fallback to certain memory types with certain 168 - * requested type: 169 - * - request is uc-, return cannot be write-back 170 - * - request is uc-, return cannot be write-combine 171 - * - request is write-combine, return cannot be write-back 172 - */ 173 - if ((prot_val == _PAGE_CACHE_UC_MINUS && 174 - (new_prot_val == _PAGE_CACHE_WB || 175 - new_prot_val == _PAGE_CACHE_WC)) || 176 - (prot_val == _PAGE_CACHE_WC && 177 - new_prot_val == _PAGE_CACHE_WB)) { 178 - pr_debug( 166 + if (!is_new_memtype_allowed(phys_addr, size, 167 + prot_val, new_prot_val)) { 168 + printk(KERN_ERR 179 169 "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", 180 170 (unsigned long long)phys_addr, 181 171 (unsigned long long)(phys_addr + size),
+7 -1
arch/x86/mm/pageattr.c
··· 822 822 { 823 823 struct cpa_data cpa; 824 824 int ret, cache, checkalias; 825 + unsigned long baddr = 0; 825 826 826 827 /* 827 828 * Check, if we are requested to change a not supported ··· 854 853 */ 855 854 WARN_ON_ONCE(1); 856 855 } 856 + /* 857 + * Save address for cache flush. *addr is modified in the call 858 + * to __change_page_attr_set_clr() below. 859 + */ 860 + baddr = *addr; 857 861 } 858 862 859 863 /* Must avoid aliasing mappings in the highmem code */ ··· 906 900 cpa_flush_array(addr, numpages, cache, 907 901 cpa.flags, pages); 908 902 } else 909 - cpa_flush_range(*addr, numpages, cache); 903 + cpa_flush_range(baddr, numpages, cache); 910 904 } else 911 905 cpa_flush_all(cache); 912 906
+267 -92
arch/x86/mm/pat.c
··· 15 15 #include <linux/gfp.h> 16 16 #include <linux/mm.h> 17 17 #include <linux/fs.h> 18 + #include <linux/rbtree.h> 18 19 19 20 #include <asm/cacheflush.h> 20 21 #include <asm/processor.h> ··· 149 148 * areas). All the aliases have the same cache attributes of course. 150 149 * Zero attributes are represented as holes. 151 150 * 152 - * Currently the data structure is a list because the number of mappings 153 - * are expected to be relatively small. If this should be a problem 154 - * it could be changed to a rbtree or similar. 151 + * The data structure is a list that is also organized as an rbtree 152 + * sorted on the start address of memtype range. 155 153 * 156 - * memtype_lock protects the whole list. 154 + * memtype_lock protects both the linear list and rbtree. 157 155 */ 158 156 159 157 struct memtype { ··· 160 160 u64 end; 161 161 unsigned long type; 162 162 struct list_head nd; 163 + struct rb_node rb; 163 164 }; 164 165 166 + static struct rb_root memtype_rbroot = RB_ROOT; 165 167 static LIST_HEAD(memtype_list); 166 168 static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ 169 + 170 + static struct memtype *memtype_rb_search(struct rb_root *root, u64 start) 171 + { 172 + struct rb_node *node = root->rb_node; 173 + struct memtype *last_lower = NULL; 174 + 175 + while (node) { 176 + struct memtype *data = container_of(node, struct memtype, rb); 177 + 178 + if (data->start < start) { 179 + last_lower = data; 180 + node = node->rb_right; 181 + } else if (data->start > start) { 182 + node = node->rb_left; 183 + } else 184 + return data; 185 + } 186 + 187 + /* Will return NULL if there is no entry with its start <= start */ 188 + return last_lower; 189 + } 190 + 191 + static void memtype_rb_insert(struct rb_root *root, struct memtype *data) 192 + { 193 + struct rb_node **new = &(root->rb_node); 194 + struct rb_node *parent = NULL; 195 + 196 + while (*new) { 197 + struct memtype *this = container_of(*new, struct memtype, rb); 198 + 199 + parent 
= *new; 200 + if (data->start <= this->start) 201 + new = &((*new)->rb_left); 202 + else if (data->start > this->start) 203 + new = &((*new)->rb_right); 204 + } 205 + 206 + rb_link_node(&data->rb, parent, new); 207 + rb_insert_color(&data->rb, root); 208 + } 167 209 168 210 /* 169 211 * Does intersection of PAT memory type and MTRR memory type and returns ··· 260 218 return -EBUSY; 261 219 } 262 220 263 - static struct memtype *cached_entry; 264 - static u64 cached_start; 265 - 266 221 static int pat_pagerange_is_ram(unsigned long start, unsigned long end) 267 222 { 268 223 int ram_page = 0, not_rampage = 0; ··· 288 249 } 289 250 290 251 /* 291 - * For RAM pages, mark the pages as non WB memory type using 292 - * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or 293 - * set_memory_wc() on a RAM page at a time before marking it as WB again. 294 - * This is ok, because only one driver will be owning the page and 295 - * doing set_memory_*() calls. 252 + * For RAM pages, we use page flags to mark the pages with appropriate type. 253 + * Here we do two pass: 254 + * - Find the memtype of all the pages in the range, look for any conflicts 255 + * - In case of no conflicts, set the new memtype for pages in the range 296 256 * 297 - * For now, we use PageNonWB to track that the RAM page is being mapped 298 - * as non WB. In future, we will have to use one more flag 299 - * (or some other mechanism in page_struct) to distinguish between 300 - * UC and WC mapping. 257 + * Caller must hold memtype_lock for atomicity. 
301 258 */ 302 259 static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, 303 260 unsigned long *new_type) 304 261 { 305 262 struct page *page; 306 - u64 pfn, end_pfn; 263 + u64 pfn; 264 + 265 + if (req_type == _PAGE_CACHE_UC) { 266 + /* We do not support strong UC */ 267 + WARN_ON_ONCE(1); 268 + req_type = _PAGE_CACHE_UC_MINUS; 269 + } 270 + 271 + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { 272 + unsigned long type; 273 + 274 + page = pfn_to_page(pfn); 275 + type = get_page_memtype(page); 276 + if (type != -1) { 277 + printk(KERN_INFO "reserve_ram_pages_type failed " 278 + "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n", 279 + start, end, type, req_type); 280 + if (new_type) 281 + *new_type = type; 282 + 283 + return -EBUSY; 284 + } 285 + } 286 + 287 + if (new_type) 288 + *new_type = req_type; 307 289 308 290 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { 309 291 page = pfn_to_page(pfn); 310 - if (page_mapped(page) || PageNonWB(page)) 311 - goto out; 312 - 313 - SetPageNonWB(page); 292 + set_page_memtype(page, req_type); 314 293 } 315 294 return 0; 316 - 317 - out: 318 - end_pfn = pfn; 319 - for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { 320 - page = pfn_to_page(pfn); 321 - ClearPageNonWB(page); 322 - } 323 - 324 - return -EINVAL; 325 295 } 326 296 327 297 static int free_ram_pages_type(u64 start, u64 end) 328 298 { 329 299 struct page *page; 330 - u64 pfn, end_pfn; 300 + u64 pfn; 331 301 332 302 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { 333 303 page = pfn_to_page(pfn); 334 - if (page_mapped(page) || !PageNonWB(page)) 335 - goto out; 336 - 337 - ClearPageNonWB(page); 304 + set_page_memtype(page, -1); 338 305 } 339 306 return 0; 340 - 341 - out: 342 - end_pfn = pfn; 343 - for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { 344 - page = pfn_to_page(pfn); 345 - SetPageNonWB(page); 346 - } 347 - return -EINVAL; 348 307 } 349 308 350 309 /* ··· 376 339 if 
(new_type) { 377 340 if (req_type == -1) 378 341 *new_type = _PAGE_CACHE_WB; 342 + else if (req_type == _PAGE_CACHE_WC) 343 + *new_type = _PAGE_CACHE_UC_MINUS; 379 344 else 380 345 *new_type = req_type & _PAGE_CACHE_MASK; 381 346 } ··· 403 364 *new_type = actual_type; 404 365 405 366 is_range_ram = pat_pagerange_is_ram(start, end); 406 - if (is_range_ram == 1) 407 - return reserve_ram_pages_type(start, end, req_type, 408 - new_type); 409 - else if (is_range_ram < 0) 367 + if (is_range_ram == 1) { 368 + 369 + spin_lock(&memtype_lock); 370 + err = reserve_ram_pages_type(start, end, req_type, new_type); 371 + spin_unlock(&memtype_lock); 372 + 373 + return err; 374 + } else if (is_range_ram < 0) { 410 375 return -EINVAL; 376 + } 411 377 412 378 new = kmalloc(sizeof(struct memtype), GFP_KERNEL); 413 379 if (!new) ··· 424 380 425 381 spin_lock(&memtype_lock); 426 382 427 - if (cached_entry && start >= cached_start) 428 - entry = cached_entry; 429 - else 383 + entry = memtype_rb_search(&memtype_rbroot, new->start); 384 + if (likely(entry != NULL)) { 385 + /* To work correctly with list_for_each_entry_continue */ 386 + entry = list_entry(entry->nd.prev, struct memtype, nd); 387 + } else { 430 388 entry = list_entry(&memtype_list, struct memtype, nd); 389 + } 431 390 432 391 /* Search for existing mapping that overlaps the current range */ 433 392 where = NULL; 434 393 list_for_each_entry_continue(entry, &memtype_list, nd) { 435 394 if (end <= entry->start) { 436 395 where = entry->nd.prev; 437 - cached_entry = list_entry(where, struct memtype, nd); 438 396 break; 439 397 } else if (start <= entry->start) { /* end > entry->start */ 440 398 err = chk_conflict(new, entry, new_type); ··· 444 398 dprintk("Overlap at 0x%Lx-0x%Lx\n", 445 399 entry->start, entry->end); 446 400 where = entry->nd.prev; 447 - cached_entry = list_entry(where, 448 - struct memtype, nd); 449 401 } 450 402 break; 451 403 } else if (start < entry->end) { /* start > entry->start */ ··· 451 407 if (!err) { 
452 408 dprintk("Overlap at 0x%Lx-0x%Lx\n", 453 409 entry->start, entry->end); 454 - cached_entry = list_entry(entry->nd.prev, 455 - struct memtype, nd); 456 410 457 411 /* 458 412 * Move to right position in the linked ··· 478 436 return err; 479 437 } 480 438 481 - cached_start = start; 482 - 483 439 if (where) 484 440 list_add(&new->nd, where); 485 441 else 486 442 list_add_tail(&new->nd, &memtype_list); 443 + 444 + memtype_rb_insert(&memtype_rbroot, new); 487 445 488 446 spin_unlock(&memtype_lock); 489 447 ··· 496 454 497 455 int free_memtype(u64 start, u64 end) 498 456 { 499 - struct memtype *entry; 457 + struct memtype *entry, *saved_entry; 500 458 int err = -EINVAL; 501 459 int is_range_ram; 502 460 ··· 508 466 return 0; 509 467 510 468 is_range_ram = pat_pagerange_is_ram(start, end); 511 - if (is_range_ram == 1) 512 - return free_ram_pages_type(start, end); 513 - else if (is_range_ram < 0) 469 + if (is_range_ram == 1) { 470 + 471 + spin_lock(&memtype_lock); 472 + err = free_ram_pages_type(start, end); 473 + spin_unlock(&memtype_lock); 474 + 475 + return err; 476 + } else if (is_range_ram < 0) { 514 477 return -EINVAL; 478 + } 515 479 516 480 spin_lock(&memtype_lock); 481 + 482 + entry = memtype_rb_search(&memtype_rbroot, start); 483 + if (unlikely(entry == NULL)) 484 + goto unlock_ret; 485 + 486 + /* 487 + * Saved entry points to an entry with start same or less than what 488 + * we searched for. 
Now go through the list in both directions to look 489 + * for the entry that matches with both start and end, with list stored 490 + * in sorted start address 491 + */ 492 + saved_entry = entry; 517 493 list_for_each_entry(entry, &memtype_list, nd) { 518 494 if (entry->start == start && entry->end == end) { 519 - if (cached_entry == entry || cached_start == start) 520 - cached_entry = NULL; 521 - 495 + rb_erase(&entry->rb, &memtype_rbroot); 522 496 list_del(&entry->nd); 523 497 kfree(entry); 524 498 err = 0; 525 499 break; 500 + } else if (entry->start > start) { 501 + break; 526 502 } 527 503 } 504 + 505 + if (!err) 506 + goto unlock_ret; 507 + 508 + entry = saved_entry; 509 + list_for_each_entry_reverse(entry, &memtype_list, nd) { 510 + if (entry->start == start && entry->end == end) { 511 + rb_erase(&entry->rb, &memtype_rbroot); 512 + list_del(&entry->nd); 513 + kfree(entry); 514 + err = 0; 515 + break; 516 + } else if (entry->start < start) { 517 + break; 518 + } 519 + } 520 + unlock_ret: 528 521 spin_unlock(&memtype_lock); 529 522 530 523 if (err) { ··· 572 495 return err; 573 496 } 574 497 498 + 499 + /** 500 + * lookup_memtype - Looksup the memory type for a physical address 501 + * @paddr: physical address of which memory type needs to be looked up 502 + * 503 + * Only to be called when PAT is enabled 504 + * 505 + * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or 506 + * _PAGE_CACHE_UC 507 + */ 508 + static unsigned long lookup_memtype(u64 paddr) 509 + { 510 + int rettype = _PAGE_CACHE_WB; 511 + struct memtype *entry; 512 + 513 + if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) 514 + return rettype; 515 + 516 + if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { 517 + struct page *page; 518 + spin_lock(&memtype_lock); 519 + page = pfn_to_page(paddr >> PAGE_SHIFT); 520 + rettype = get_page_memtype(page); 521 + spin_unlock(&memtype_lock); 522 + /* 523 + * -1 from get_page_memtype() implies RAM page is in its 524 + * default state and not 
reserved, and hence of type WB 525 + */ 526 + if (rettype == -1) 527 + rettype = _PAGE_CACHE_WB; 528 + 529 + return rettype; 530 + } 531 + 532 + spin_lock(&memtype_lock); 533 + 534 + entry = memtype_rb_search(&memtype_rbroot, paddr); 535 + if (entry != NULL) 536 + rettype = entry->type; 537 + else 538 + rettype = _PAGE_CACHE_UC_MINUS; 539 + 540 + spin_unlock(&memtype_lock); 541 + return rettype; 542 + } 543 + 544 + /** 545 + * io_reserve_memtype - Request a memory type mapping for a region of memory 546 + * @start: start (physical address) of the region 547 + * @end: end (physical address) of the region 548 + * @type: A pointer to memtype, with requested type. On success, requested 549 + * or any other compatible type that was available for the region is returned 550 + * 551 + * On success, returns 0 552 + * On failure, returns non-zero 553 + */ 554 + int io_reserve_memtype(resource_size_t start, resource_size_t end, 555 + unsigned long *type) 556 + { 557 + resource_size_t size = end - start; 558 + unsigned long req_type = *type; 559 + unsigned long new_type; 560 + int ret; 561 + 562 + WARN_ON_ONCE(iomem_map_sanity_check(start, size)); 563 + 564 + ret = reserve_memtype(start, end, req_type, &new_type); 565 + if (ret) 566 + goto out_err; 567 + 568 + if (!is_new_memtype_allowed(start, size, req_type, new_type)) 569 + goto out_free; 570 + 571 + if (kernel_map_sync_memtype(start, size, new_type) < 0) 572 + goto out_free; 573 + 574 + *type = new_type; 575 + return 0; 576 + 577 + out_free: 578 + free_memtype(start, end); 579 + ret = -EBUSY; 580 + out_err: 581 + return ret; 582 + } 583 + 584 + /** 585 + * io_free_memtype - Release a memory type mapping for a region of memory 586 + * @start: start (physical address) of the region 587 + * @end: end (physical address) of the region 588 + */ 589 + void io_free_memtype(resource_size_t start, resource_size_t end) 590 + { 591 + free_memtype(start, end); 592 + } 575 593 576 594 pgprot_t phys_mem_access_prot(struct file *file, 
unsigned long pfn, 577 595 unsigned long size, pgprot_t vma_prot) ··· 749 577 { 750 578 unsigned long id_sz; 751 579 752 - if (!pat_enabled || base >= __pa(high_memory)) 580 + if (base >= __pa(high_memory)) 753 581 return 0; 754 582 755 583 id_sz = (__pa(high_memory) < base + size) ? ··· 784 612 is_ram = pat_pagerange_is_ram(paddr, paddr + size); 785 613 786 614 /* 787 - * reserve_pfn_range() doesn't support RAM pages. Maintain the current 788 - * behavior with RAM pages by returning success. 615 + * reserve_pfn_range() for RAM pages. We do not refcount to keep 616 + * track of number of mappings of RAM pages. We can assert that 617 + * the type requested matches the type of first page in the range. 789 618 */ 790 - if (is_ram != 0) 619 + if (is_ram) { 620 + if (!pat_enabled) 621 + return 0; 622 + 623 + flags = lookup_memtype(paddr); 624 + if (want_flags != flags) { 625 + printk(KERN_WARNING 626 + "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n", 627 + current->comm, current->pid, 628 + cattr_name(want_flags), 629 + (unsigned long long)paddr, 630 + (unsigned long long)(paddr + size), 631 + cattr_name(flags)); 632 + *vma_prot = __pgprot((pgprot_val(*vma_prot) & 633 + (~_PAGE_CACHE_MASK)) | 634 + flags); 635 + } 791 636 return 0; 637 + } 792 638 793 639 ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); 794 640 if (ret) ··· 868 678 unsigned long vma_size = vma->vm_end - vma->vm_start; 869 679 pgprot_t pgprot; 870 680 871 - if (!pat_enabled) 872 - return 0; 873 - 874 - /* 875 - * For now, only handle remap_pfn_range() vmas where 876 - * is_linear_pfn_mapping() == TRUE. Handling of 877 - * vm_insert_pfn() is TBD. 878 - */ 879 681 if (is_linear_pfn_mapping(vma)) { 880 682 /* 881 683 * reserve the whole chunk covered by vma. 
We need the ··· 895 713 int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, 896 714 unsigned long pfn, unsigned long size) 897 715 { 716 + unsigned long flags; 898 717 resource_size_t paddr; 899 718 unsigned long vma_size = vma->vm_end - vma->vm_start; 900 719 901 - if (!pat_enabled) 902 - return 0; 903 - 904 - /* 905 - * For now, only handle remap_pfn_range() vmas where 906 - * is_linear_pfn_mapping() == TRUE. Handling of 907 - * vm_insert_pfn() is TBD. 908 - */ 909 720 if (is_linear_pfn_mapping(vma)) { 910 721 /* reserve the whole chunk starting from vm_pgoff */ 911 722 paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; 912 723 return reserve_pfn_range(paddr, vma_size, prot, 0); 913 724 } 725 + 726 + if (!pat_enabled) 727 + return 0; 728 + 729 + /* for vm_insert_pfn and friends, we set prot based on lookup */ 730 + flags = lookup_memtype(pfn << PAGE_SHIFT); 731 + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | 732 + flags); 914 733 915 734 return 0; 916 735 } ··· 927 744 resource_size_t paddr; 928 745 unsigned long vma_size = vma->vm_end - vma->vm_start; 929 746 930 - if (!pat_enabled) 931 - return; 932 - 933 - /* 934 - * For now, only handle remap_pfn_range() vmas where 935 - * is_linear_pfn_mapping() == TRUE. Handling of 936 - * vm_insert_pfn() is TBD. 937 - */ 938 747 if (is_linear_pfn_mapping(vma)) { 939 748 /* free the whole chunk starting from vm_pgoff */ 940 749 paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
+1 -1
arch/x86/power/cpu.c
··· 242 242 fix_processor_context(); 243 243 244 244 do_fpu_end(); 245 - mtrr_ap_init(); 245 + mtrr_bp_restore(); 246 246 247 247 #ifdef CONFIG_X86_OLD_MCE 248 248 mcheck_init(&boot_cpu_data);
+12 -5
include/linux/io-mapping.h
··· 49 49 io_mapping_create_wc(resource_size_t base, unsigned long size) 50 50 { 51 51 struct io_mapping *iomap; 52 - 53 - if (!is_io_mapping_possible(base, size)) 54 - return NULL; 52 + pgprot_t prot; 55 53 56 54 iomap = kmalloc(sizeof(*iomap), GFP_KERNEL); 57 55 if (!iomap) 58 - return NULL; 56 + goto out_err; 57 + 58 + if (iomap_create_wc(base, size, &prot)) 59 + goto out_free; 59 60 60 61 iomap->base = base; 61 62 iomap->size = size; 62 - iomap->prot = pgprot_writecombine(__pgprot(__PAGE_KERNEL)); 63 + iomap->prot = prot; 63 64 return iomap; 65 + 66 + out_free: 67 + kfree(iomap); 68 + out_err: 69 + return NULL; 64 70 } 65 71 66 72 static inline void 67 73 io_mapping_free(struct io_mapping *mapping) 68 74 { 75 + iomap_free(mapping->base, mapping->size); 69 76 kfree(mapping); 70 77 } 71 78
+2 -2
include/linux/page-flags.h
··· 99 99 #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT 100 100 PG_mlocked, /* Page is vma mlocked */ 101 101 #endif 102 - #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR 102 + #ifdef CONFIG_ARCH_USES_PG_UNCACHED 103 103 PG_uncached, /* Page has been mapped as uncached */ 104 104 #endif 105 105 __NR_PAGEFLAGS, ··· 257 257 SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked) 258 258 #endif 259 259 260 - #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR 260 + #ifdef CONFIG_ARCH_USES_PG_UNCACHED 261 261 PAGEFLAG(Uncached, uncached) 262 262 #else 263 263 PAGEFLAG_FALSE(Uncached)
+14
kernel/cpu.c
··· 414 414 return error; 415 415 } 416 416 417 + void __weak arch_enable_nonboot_cpus_begin(void) 418 + { 419 + } 420 + 421 + void __weak arch_enable_nonboot_cpus_end(void) 422 + { 423 + } 424 + 417 425 void __ref enable_nonboot_cpus(void) 418 426 { 419 427 int cpu, error; ··· 433 425 goto out; 434 426 435 427 printk("Enabling non-boot CPUs ...\n"); 428 + 429 + arch_enable_nonboot_cpus_begin(); 430 + 436 431 for_each_cpu(cpu, frozen_cpus) { 437 432 error = _cpu_up(cpu, 1); 438 433 if (!error) { ··· 444 433 } 445 434 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 446 435 } 436 + 437 + arch_enable_nonboot_cpus_end(); 438 + 447 439 cpumask_clear(frozen_cpus); 448 440 out: 449 441 cpu_maps_update_done();
+34 -6
kernel/smp.c
··· 177 177 int cpu = get_cpu(); 178 178 179 179 /* 180 + * Shouldn't receive this interrupt on a cpu that is not yet online. 181 + */ 182 + WARN_ON_ONCE(!cpu_online(cpu)); 183 + 184 + /* 180 185 * Ensure entry is visible on call_function_queue after we have 181 186 * entered the IPI. See comment in smp_call_function_many. 182 187 * If we don't have this, then we may miss an entry on the list ··· 234 229 struct call_single_queue *q = &__get_cpu_var(call_single_queue); 235 230 unsigned int data_flags; 236 231 LIST_HEAD(list); 232 + 233 + /* 234 + * Shouldn't receive this interrupt on a cpu that is not yet online. 235 + */ 236 + WARN_ON_ONCE(!cpu_online(smp_processor_id())); 237 237 238 238 spin_lock(&q->lock); 239 239 list_replace_init(&q->list, &list); ··· 295 285 */ 296 286 this_cpu = get_cpu(); 297 287 298 - /* Can deadlock when called with interrupts disabled */ 299 - WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 288 + /* 289 + * Can deadlock when called with interrupts disabled. 290 + * We allow cpu's that are not yet online though, as no one else can 291 + * send smp call function interrupt to this cpu and as such deadlocks 292 + * can't happen. 293 + */ 294 + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 295 + && !oops_in_progress); 300 296 301 297 if (cpu == this_cpu) { 302 298 local_irq_save(flags); ··· 345 329 { 346 330 csd_lock(data); 347 331 348 - /* Can deadlock when called with interrupts disabled */ 349 - WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); 332 + /* 333 + * Can deadlock when called with interrupts disabled. 334 + * We allow cpu's that are not yet online though, as no one else can 335 + * send smp call function interrupt to this cpu and as such deadlocks 336 + * can't happen. 
337 + */ 338 + WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() 339 + && !oops_in_progress); 350 340 351 341 generic_exec_single(cpu, data, wait); 352 342 } ··· 387 365 unsigned long flags; 388 366 int cpu, next_cpu, this_cpu = smp_processor_id(); 389 367 390 - /* Can deadlock when called with interrupts disabled */ 391 - WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 368 + /* 369 + * Can deadlock when called with interrupts disabled. 370 + * We allow cpu's that are not yet online though, as no one else can 371 + * send smp call function interrupt to this cpu and as such deadlocks 372 + * can't happen. 373 + */ 374 + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 375 + && !oops_in_progress); 392 376 393 377 /* So, what's a CPU they want? Ignoring this one. */ 394 378 cpu = cpumask_first_and(mask, cpu_online_mask);
+1 -1
mm/Kconfig
··· 153 153 # 154 154 config PAGEFLAGS_EXTENDED 155 155 def_bool y 156 - depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM 156 + depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM 157 157 158 158 # Heavily threaded applications may benefit from splitting the mm-wide 159 159 # page_table_lock, so that faults on different parts of the user address