x86, pat: Migrate to rbtree only backend for pat memtype management

Move the PAT backend from the existing rbtree and linked-list hybrid to a
fully rbtree-based implementation.

The new rbtree-based solution uses an interval tree (an augmented rbtree) to
store the PAT ranges. The new code separates the PAT backend out into
pat_rbtree.c, which makes it cleaner. The change also makes the PAT lookup,
reserve and free operations more efficient, since we no longer have to
traverse a linear linked list of a few tens of entries in the common case.
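
For illustration, the heart of the new lookup is a "lowest overlapping range"
search that prunes subtrees using a cached subtree_max_end, as pat_rbtree.c
does below. The following is a minimal user-space C sketch of that idea only;
the struct layout, the hand-built three-node tree and the names (node,
lowest_overlap) are illustrative assumptions, not the kernel's rbtree API:

#include <stdio.h>
#include <stdint.h>

/* Toy node: each node caches the max 'end' over its whole subtree. */
struct node {
    uint64_t start, end;          /* half-open range [start, end) */
    uint64_t subtree_max_end;     /* max end of this node and its children */
    struct node *left, *right;    /* ordered by 'start', as in the patch */
};

static int overlaps(struct node *n, uint64_t start, uint64_t end)
{
    return !(n->start >= end || n->end <= start);
}

/* Find the overlapping range with the lowest start, or NULL. */
static struct node *lowest_overlap(struct node *n, uint64_t start, uint64_t end)
{
    while (n) {
        if (n->left && n->left->subtree_max_end > start)
            n = n->left;          /* lowest overlap, if any, is on the left */
        else if (overlaps(n, start, end))
            return n;
        else if (start >= n->start)
            n = n->right;         /* lowest overlap, if any, is on the right */
        else
            break;
    }
    return NULL;
}

int main(void)
{
    /* Hand-built tree ordered by start: [0x1000,0x2000), [0x3000,0x5000), [0x6000,0x7000) */
    struct node a = { 0x1000, 0x2000, 0x2000, NULL, NULL };
    struct node c = { 0x6000, 0x7000, 0x7000, NULL, NULL };
    struct node b = { 0x3000, 0x5000, 0x7000, &a, &c };   /* root: subtree max end is 0x7000 */

    struct node *hit = lowest_overlap(&b, 0x4000, 0x6800);
    if (hit)
        printf("lowest overlap starts at 0x%llx\n", (unsigned long long)hit->start);
    else
        printf("no overlap\n");
    return 0;
}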

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
LKML-Reference: <20100210232607.GB11465@linux-os.sc.intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

Authored by Venkatesh Pallipadi, committed by H. Peter Anvin (9e41a49a be5a0c12)

+296 -205
+1
arch/x86/mm/Makefile
···
 CFLAGS_physaddr.o := $(nostackp)
 CFLAGS_setup_nx.o := $(nostackp)
 
+obj-$(CONFIG_X86_PAT)    += pat_rbtree.o
 obj-$(CONFIG_SMP)        += tlb.o
 
 obj-$(CONFIG_X86_32)     += pgtable_32.o iomap_32.o
+5 -204
arch/x86/mm/pat.c
···
 
 #undef PAT
 
-/*
- * The global memtype list keeps track of memory type for specific
- * physical memory areas. Conflicting memory types in different
- * mappings can cause CPU cache corruption. To avoid this we keep track.
- *
- * The list is sorted based on starting address and can contain multiple
- * entries for each address (this allows reference counting for overlapping
- * areas). All the aliases have the same cache attributes of course.
- * Zero attributes are represented as holes.
- *
- * The data structure is a list that is also organized as an rbtree
- * sorted on the start address of memtype range.
- *
- * memtype_lock protects both the linear list and rbtree.
- */
-
-static struct rb_root memtype_rbroot = RB_ROOT;
-static LIST_HEAD(memtype_list);
-static DEFINE_SPINLOCK(memtype_lock);    /* protects memtype list */
-
-static struct memtype *memtype_rb_search(struct rb_root *root, u64 start)
-{
-    struct rb_node *node = root->rb_node;
-    struct memtype *last_lower = NULL;
-
-    while (node) {
-        struct memtype *data = container_of(node, struct memtype, rb);
-
-        if (data->start < start) {
-            last_lower = data;
-            node = node->rb_right;
-        } else if (data->start > start) {
-            node = node->rb_left;
-        } else
-            return data;
-    }
-
-    /* Will return NULL if there is no entry with its start <= start */
-    return last_lower;
-}
-
-static void memtype_rb_insert(struct rb_root *root, struct memtype *data)
-{
-    struct rb_node **new = &(root->rb_node);
-    struct rb_node *parent = NULL;
-
-    while (*new) {
-        struct memtype *this = container_of(*new, struct memtype, rb);
-
-        parent = *new;
-        if (data->start <= this->start)
-            new = &((*new)->rb_left);
-        else if (data->start > this->start)
-            new = &((*new)->rb_right);
-    }
-
-    rb_link_node(&data->rb, parent, new);
-    rb_insert_color(&data->rb, root);
-}
+static DEFINE_SPINLOCK(memtype_lock);    /* protects memtype accesses */
 
 /*
  * Does intersection of PAT memory type and MTRR memory type and returns
···
     }
 
     return req_type;
-}
-
-static int
-chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
-{
-    if (new->type != entry->type) {
-        if (type) {
-            new->type = entry->type;
-            *type = entry->type;
-        } else
-            goto conflict;
-    }
-
-    /* check overlaps with more than one entry in the list */
-    list_for_each_entry_continue(entry, &memtype_list, nd) {
-        if (new->end <= entry->start)
-            break;
-        else if (new->type != entry->type)
-            goto conflict;
-    }
-    return 0;
-
- conflict:
-    printk(KERN_INFO "%s:%d conflicting memory types "
-           "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
-           new->end, cattr_name(new->type), cattr_name(entry->type));
-    return -EBUSY;
 }
 
 static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
···
     return 0;
 }
 
-static int memtype_check_insert(struct memtype *new, unsigned long *new_type)
-{
-    struct memtype *entry;
-    u64 start, end;
-    unsigned long actual_type;
-    struct list_head *where;
-    int err = 0;
-
-    start = new->start;
-    end = new->end;
-    actual_type = new->type;
-
-    /* Search for existing mapping that overlaps the current range */
-    where = NULL;
-    list_for_each_entry(entry, &memtype_list, nd) {
-        if (end <= entry->start) {
-            where = entry->nd.prev;
-            break;
-        } else if (start <= entry->start) { /* end > entry->start */
-            err = chk_conflict(new, entry, new_type);
-            if (!err) {
-                dprintk("Overlap at 0x%Lx-0x%Lx\n",
-                    entry->start, entry->end);
-                where = entry->nd.prev;
-            }
-            break;
-        } else if (start < entry->end) { /* start > entry->start */
-            err = chk_conflict(new, entry, new_type);
-            if (!err) {
-                dprintk("Overlap at 0x%Lx-0x%Lx\n",
-                    entry->start, entry->end);
-
-                /*
-                 * Move to right position in the linked
-                 * list to add this new entry
-                 */
-                list_for_each_entry_continue(entry,
-                            &memtype_list, nd) {
-                    if (start <= entry->start) {
-                        where = entry->nd.prev;
-                        break;
-                    }
-                }
-            }
-            break;
-        }
-    }
-    if (!err) {
-        if (where)
-            list_add(&new->nd, where);
-        else
-            list_add_tail(&new->nd, &memtype_list);
-
-        memtype_rb_insert(&memtype_rbroot, new);
-    }
-    return err;
-}
-
 /*
  * req_type typically has one of the:
  * - _PAGE_CACHE_WB
···
 
     spin_lock(&memtype_lock);
 
-    err = memtype_check_insert(new, new_type);
+    err = rbt_memtype_check_insert(new, new_type);
     if (err) {
         printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
                "track %s, req %s\n",
···
 
 int free_memtype(u64 start, u64 end)
 {
-    struct memtype *entry, *saved_entry;
     int err = -EINVAL;
     int is_range_ram;
 
···
     }
 
     spin_lock(&memtype_lock);
-
-    entry = memtype_rb_search(&memtype_rbroot, start);
-    if (unlikely(entry == NULL))
-        goto unlock_ret;
-
-    /*
-     * Saved entry points to an entry with start same or less than what
-     * we searched for. Now go through the list in both directions to look
-     * for the entry that matches with both start and end, with list stored
-     * in sorted start address
-     */
-    saved_entry = entry;
-    list_for_each_entry_from(entry, &memtype_list, nd) {
-        if (entry->start == start && entry->end == end) {
-            rb_erase(&entry->rb, &memtype_rbroot);
-            list_del(&entry->nd);
-            kfree(entry);
-            err = 0;
-            break;
-        } else if (entry->start > start) {
-            break;
-        }
-    }
-
-    if (!err)
-        goto unlock_ret;
-
-    entry = saved_entry;
-    list_for_each_entry_reverse(entry, &memtype_list, nd) {
-        if (entry->start == start && entry->end == end) {
-            rb_erase(&entry->rb, &memtype_rbroot);
-            list_del(&entry->nd);
-            kfree(entry);
-            err = 0;
-            break;
-        } else if (entry->start < start) {
-            break;
-        }
-    }
- unlock_ret:
+    err = rbt_memtype_erase(start, end);
     spin_unlock(&memtype_lock);
 
     if (err) {
···
 
     spin_lock(&memtype_lock);
 
-    entry = memtype_rb_search(&memtype_rbroot, paddr);
+    entry = rbt_memtype_lookup(paddr);
     if (entry != NULL)
         rettype = entry->type;
     else
···
 
 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
 
-/* get Nth element of the linked list */
-static int copy_memtype_nth_element(struct memtype *out, loff_t pos)
-{
-    struct memtype *list_node;
-    int i = 1;
-
-    list_for_each_entry(list_node, &memtype_list, nd) {
-        if (pos == i) {
-            *out = *list_node;
-            return 0;
-        }
-        ++i;
-    }
-    return 1;
-}
-
 static struct memtype *memtype_get_idx(loff_t pos)
 {
     struct memtype *print_entry;
···
         return NULL;
 
     spin_lock(&memtype_lock);
-    ret = copy_memtype_nth_element(print_entry, pos);
+    ret = rbt_memtype_copy_nth_element(print_entry, pos);
     spin_unlock(&memtype_lock);
 
     if (!ret) {
+19 -1
arch/x86/mm/pat_internal.h
···
 struct memtype {
     u64              start;
     u64              end;
+    u64              subtree_max_end;
     unsigned long    type;
-    struct list_head nd;
     struct rb_node   rb;
 };
···
     default: return "broken";
     }
 }
+
+#ifdef CONFIG_X86_PAT
+extern int rbt_memtype_check_insert(struct memtype *new,
+                    unsigned long *new_type);
+extern int rbt_memtype_erase(u64 start, u64 end);
+extern struct memtype *rbt_memtype_lookup(u64 addr);
+extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos);
+#else
+static inline int rbt_memtype_check_insert(struct memtype *new,
+                    unsigned long *new_type)
+{ return 0; }
+static inline int rbt_memtype_erase(u64 start, u64 end)
+{ return 0; }
+static inline struct memtype *rbt_memtype_lookup(u64 addr)
+{ return NULL; }
+static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
+{ return 0; }
+#endif
 
 #endif /* __PAT_INTERNAL_H_ */
+271
arch/x86/mm/pat_rbtree.c
+/*
+ * Handle caching attributes in page tables (PAT)
+ *
+ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ *          Suresh B Siddha <suresh.b.siddha@intel.com>
+ *
+ * Interval tree (augmented rbtree) used to store the PAT memory type
+ * reservations.
+ */
+
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+
+#include <asm/pgtable.h>
+#include <asm/pat.h>
+
+#include "pat_internal.h"
+
+/*
+ * The memtype tree keeps track of memory type for specific
+ * physical memory areas. Without proper tracking, conflicting memory
+ * types in different mappings can cause CPU cache corruption.
+ *
+ * The tree is an interval tree (augmented rbtree) with tree ordered
+ * on starting address. Tree can contain multiple entries for
+ * different regions which overlap. All the aliases have the same
+ * cache attributes of course.
+ *
+ * memtype_lock protects the rbtree.
+ */
+
+static void memtype_rb_augment_cb(struct rb_node *node);
+static struct rb_root memtype_rbroot = RB_AUGMENT_ROOT(&memtype_rb_augment_cb);
+
+static int is_node_overlap(struct memtype *node, u64 start, u64 end)
+{
+    if (node->start >= end || node->end <= start)
+        return 0;
+
+    return 1;
+}
+
+static u64 get_subtree_max_end(struct rb_node *node)
+{
+    u64 ret = 0;
+    if (node) {
+        struct memtype *data = container_of(node, struct memtype, rb);
+        ret = data->subtree_max_end;
+    }
+    return ret;
+}
+
+/* Update 'subtree_max_end' for a node, based on node and its children */
+static void update_node_max_end(struct rb_node *node)
+{
+    struct memtype *data;
+    u64 max_end, child_max_end;
+
+    if (!node)
+        return;
+
+    data = container_of(node, struct memtype, rb);
+    max_end = data->end;
+
+    child_max_end = get_subtree_max_end(node->rb_right);
+    if (child_max_end > max_end)
+        max_end = child_max_end;
+
+    child_max_end = get_subtree_max_end(node->rb_left);
+    if (child_max_end > max_end)
+        max_end = child_max_end;
+
+    data->subtree_max_end = max_end;
+}
+
+/* Update 'subtree_max_end' for a node and all its ancestors */
+static void update_path_max_end(struct rb_node *node)
+{
+    u64 old_max_end, new_max_end;
+
+    while (node) {
+        struct memtype *data = container_of(node, struct memtype, rb);
+
+        old_max_end = data->subtree_max_end;
+        update_node_max_end(node);
+        new_max_end = data->subtree_max_end;
+
+        if (new_max_end == old_max_end)
+            break;
+
+        node = rb_parent(node);
+    }
+}
+
+/* Find the first (lowest start addr) overlapping range from rb tree */
+static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
+                u64 start, u64 end)
+{
+    struct rb_node *node = root->rb_node;
+    struct memtype *last_lower = NULL;
+
+    while (node) {
+        struct memtype *data = container_of(node, struct memtype, rb);
+
+        if (get_subtree_max_end(node->rb_left) > start) {
+            /* Lowest overlap if any must be on left side */
+            node = node->rb_left;
+        } else if (is_node_overlap(data, start, end)) {
+            last_lower = data;
+            break;
+        } else if (start >= data->start) {
+            /* Lowest overlap if any must be on right side */
+            node = node->rb_right;
+        } else {
+            break;
+        }
+    }
+    return last_lower; /* Returns NULL if there is no overlap */
+}
+
+static struct memtype *memtype_rb_exact_match(struct rb_root *root,
+                u64 start, u64 end)
+{
+    struct memtype *match;
+
+    match = memtype_rb_lowest_match(root, start, end);
+    while (match != NULL && match->start < end) {
+        struct rb_node *node;
+
+        if (match->start == start && match->end == end)
+            return match;
+
+        node = rb_next(&match->rb);
+        if (node)
+            match = container_of(node, struct memtype, rb);
+        else
+            match = NULL;
+    }
+
+    return NULL; /* Returns NULL if there is no exact match */
+}
+
+static int memtype_rb_check_conflict(struct rb_root *root,
+                u64 start, u64 end,
+                unsigned long reqtype, unsigned long *newtype)
+{
+    struct rb_node *node;
+    struct memtype *match;
+    int found_type = reqtype;
+
+    match = memtype_rb_lowest_match(&memtype_rbroot, start, end);
+    if (match == NULL)
+        goto success;
+
+    if (match->type != found_type && newtype == NULL)
+        goto failure;
+
+    dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end);
+    found_type = match->type;
+
+    node = rb_next(&match->rb);
+    while (node) {
+        match = container_of(node, struct memtype, rb);
+
+        if (match->start >= end) /* Checked all possible matches */
+            goto success;
+
+        if (is_node_overlap(match, start, end) &&
+            match->type != found_type) {
+            goto failure;
+        }
+
+        node = rb_next(&match->rb);
+    }
+success:
+    if (newtype)
+        *newtype = found_type;
+
+    return 0;
+
+failure:
+    printk(KERN_INFO "%s:%d conflicting memory types "
+        "%Lx-%Lx %s<->%s\n", current->comm, current->pid, start,
+        end, cattr_name(found_type), cattr_name(match->type));
+    return -EBUSY;
+}
+
+static void memtype_rb_augment_cb(struct rb_node *node)
+{
+    if (node)
+        update_path_max_end(node);
+}
+
+static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
+{
+    struct rb_node **node = &(root->rb_node);
+    struct rb_node *parent = NULL;
+
+    while (*node) {
+        struct memtype *data = container_of(*node, struct memtype, rb);
+
+        parent = *node;
+        if (newdata->start <= data->start)
+            node = &((*node)->rb_left);
+        else if (newdata->start > data->start)
+            node = &((*node)->rb_right);
+    }
+
+    rb_link_node(&newdata->rb, parent, node);
+    rb_insert_color(&newdata->rb, root);
+}
+
+int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
+{
+    int err = 0;
+
+    err = memtype_rb_check_conflict(&memtype_rbroot, new->start, new->end,
+                        new->type, ret_type);
+
+    if (!err) {
+        new->type = *ret_type;
+        memtype_rb_insert(&memtype_rbroot, new);
+    }
+    return err;
+}
+
+int rbt_memtype_erase(u64 start, u64 end)
+{
+    struct memtype *data;
+
+    data = memtype_rb_exact_match(&memtype_rbroot, start, end);
+    if (!data)
+        return -EINVAL;
+
+    rb_erase(&data->rb, &memtype_rbroot);
+    return 0;
+}
+
+struct memtype *rbt_memtype_lookup(u64 addr)
+{
+    struct memtype *data;
+    data = memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE);
+    return data;
+}
+
+#if defined(CONFIG_DEBUG_FS)
+int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
+{
+    struct rb_node *node;
+    int i = 1;
+
+    node = rb_first(&memtype_rbroot);
+    while (node && pos != i) {
+        node = rb_next(node);
+        i++;
+    }
+
+    if (node) { /* pos == i */
+        struct memtype *this = container_of(node, struct memtype, rb);
+        *out = *this;
+        return 0;
+    } else {
+        return 1;
+    }
+}
+#endif
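
As a complement to the diff above, the reserve-side policy enforced by
memtype_rb_check_conflict when a new_type pointer is supplied (overlapping
reservations are allowed only if every overlapping entry has the same cache
type, and the caller is handed back the existing type) can be modeled in a
short user-space sketch. The array standing in for the in-order tree walk,
the enum values and the function names below are illustrative assumptions,
not kernel code:

#include <stdio.h>
#include <stdint.h>

/* Illustrative cache types standing in for the _PAGE_CACHE_* values. */
enum cache_type { WB, WC, UC_MINUS };

struct range { uint64_t start, end; enum cache_type type; };

/*
 * Model of the conflict rule: a new reservation may overlap existing ones
 * only if every overlapping entry has the same type; on success the type
 * of the first overlapping entry is returned through *newtype.
 */
static int check_conflict(const struct range *tab, int n,
                          uint64_t start, uint64_t end,
                          enum cache_type reqtype, enum cache_type *newtype)
{
    enum cache_type found = reqtype;
    int seen_overlap = 0;

    for (int i = 0; i < n; i++) {            /* tab[] plays the in-order walk */
        if (tab[i].start >= end || tab[i].end <= start)
            continue;                        /* no overlap with this entry */
        if (!seen_overlap) {
            found = tab[i].type;             /* adopt the existing type */
            seen_overlap = 1;
        } else if (tab[i].type != found) {
            return -1;                       /* conflicting aliases: -EBUSY in the kernel */
        }
    }
    *newtype = found;
    return 0;
}

int main(void)
{
    struct range tab[] = {
        { 0x1000, 0x2000, WB },
        { 0x3000, 0x5000, WC },
    };
    enum cache_type t;

    if (!check_conflict(tab, 2, 0x3800, 0x4800, WB, &t))
        printf("reserve ok, effective type %d\n", t);  /* prints 1, i.e. WC */
    else
        printf("reserve rejected\n");
    return 0;
}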