Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

slab: fix oops when reading /proc/slab_allocators

Commit b1cb0982bdd6 ("change the management method of free objects of
the slab") introduced a bug on slab leak detector
('/proc/slab_allocators'). This detector works as described
below.

1. traverse all objects on all the slabs.
2. determine whether it is active or not.
3. if active, print who allocated this object.

But that commit changed the way free objects are managed, so the logic
for determining whether an object is active also changed. Previously, we
regarded an object in the cpu caches as inactive, but, with this commit,
we mistakenly regard an object in the cpu caches as active.

This introduces a kernel oops if DEBUG_PAGEALLOC is enabled. If
DEBUG_PAGEALLOC is enabled, kernel_map_pages() is used to detect who
corrupts free memory in the slab. It unmaps the page table mapping if an
object is free and maps it if the object is active. When the slab leak
detector checks an object in the cpu caches, it mistakenly thinks this
object is active, so it tries to access the object's memory to retrieve
the caller of the allocation. At this point, no page table mapping to
this object exists, so an oops occurs.

Following is oops message reported from Dave.

It blew up when something tried to read /proc/slab_allocators
(Just cat it, and you should see the oops below)

Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
Modules linked in:
[snip...]
CPU: 1 PID: 9386 Comm: trinity-c33 Not tainted 3.14.0-rc5+ #131
task: ffff8801aa46e890 ti: ffff880076924000 task.ti: ffff880076924000
RIP: 0010:[<ffffffffaa1a8f4a>] [<ffffffffaa1a8f4a>] handle_slab+0x8a/0x180
RSP: 0018:ffff880076925de0 EFLAGS: 00010002
RAX: 0000000000001000 RBX: 0000000000000000 RCX: 000000005ce85ce7
RDX: ffffea00079be100 RSI: 0000000000001000 RDI: ffff880107458000
RBP: ffff880076925e18 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: 000000000000000f R12: ffff8801e6f84000
R13: ffffea00079be100 R14: ffff880107458000 R15: ffff88022bb8d2c0
FS: 00007fb769e45740(0000) GS:ffff88024d040000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffff8801e6f84ff8 CR3: 00000000a22db000 CR4: 00000000001407e0
DR0: 0000000002695000 DR1: 0000000002695000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000070602
Call Trace:
leaks_show+0xce/0x240
seq_read+0x28e/0x490
proc_reg_read+0x3d/0x80
vfs_read+0x9b/0x160
SyS_read+0x58/0xb0
tracesys+0xd4/0xd9
Code: f5 00 00 00 0f 1f 44 00 00 48 63 c8 44 3b 0c 8a 0f 84 e3 00 00 00 83 c0 01 44 39 c0 72 eb 41 f6 47 1a 01 0f 84 e9 00 00 00 89 f0 <4d> 8b 4c 04 f8 4d 85 c9 0f 84 88 00 00 00 49 8b 7e 08 4d 8d 46
RIP handle_slab+0x8a/0x180

To fix the problem, I introduce an object status buffer on each slab.
With this, we can track object status precisely, so slab leak detector
would not access an active object and no kernel oops would occur. The
memory overhead caused by this fix is only imposed when
CONFIG_DEBUG_SLAB_LEAK is enabled, which is mainly used for debugging,
so the memory overhead isn't a big problem.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Reported-by: Dave Jones <davej@redhat.com>
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Joonsoo Kim and committed by
Linus Torvalds
03787301 f00cdc6d

+71 -19
+71 -19
mm/slab.c
··· 386 386 387 387 #endif 388 388 389 + #define OBJECT_FREE (0) 390 + #define OBJECT_ACTIVE (1) 391 + 392 + #ifdef CONFIG_DEBUG_SLAB_LEAK 393 + 394 + static void set_obj_status(struct page *page, int idx, int val) 395 + { 396 + int freelist_size; 397 + char *status; 398 + struct kmem_cache *cachep = page->slab_cache; 399 + 400 + freelist_size = cachep->num * sizeof(freelist_idx_t); 401 + status = (char *)page->freelist + freelist_size; 402 + status[idx] = val; 403 + } 404 + 405 + static inline unsigned int get_obj_status(struct page *page, int idx) 406 + { 407 + int freelist_size; 408 + char *status; 409 + struct kmem_cache *cachep = page->slab_cache; 410 + 411 + freelist_size = cachep->num * sizeof(freelist_idx_t); 412 + status = (char *)page->freelist + freelist_size; 413 + 414 + return status[idx]; 415 + } 416 + 417 + #else 418 + static inline void set_obj_status(struct page *page, int idx, int val) {} 419 + 420 + #endif 421 + 389 422 /* 390 423 * Do not go above this order unless 0 objects fit into the slab or 391 424 * overridden on the command line. ··· 609 576 return cachep->array[smp_processor_id()]; 610 577 } 611 578 579 + static size_t calculate_freelist_size(int nr_objs, size_t align) 580 + { 581 + size_t freelist_size; 582 + 583 + freelist_size = nr_objs * sizeof(freelist_idx_t); 584 + if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) 585 + freelist_size += nr_objs * sizeof(char); 586 + 587 + if (align) 588 + freelist_size = ALIGN(freelist_size, align); 589 + 590 + return freelist_size; 591 + } 592 + 612 593 static int calculate_nr_objs(size_t slab_size, size_t buffer_size, 613 594 size_t idx_size, size_t align) 614 595 { 615 596 int nr_objs; 597 + size_t remained_size; 616 598 size_t freelist_size; 599 + int extra_space = 0; 617 600 601 + if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) 602 + extra_space = sizeof(char); 618 603 /* 619 604 * Ignore padding for the initial guess. 
The padding 620 605 * is at most @align-1 bytes, and @buffer_size is at ··· 641 590 * into the memory allocation when taking the padding 642 591 * into account. 643 592 */ 644 - nr_objs = slab_size / (buffer_size + idx_size); 593 + nr_objs = slab_size / (buffer_size + idx_size + extra_space); 645 594 646 595 /* 647 596 * This calculated number will be either the right 648 597 * amount, or one greater than what we want. 649 598 */ 650 - freelist_size = slab_size - nr_objs * buffer_size; 651 - if (freelist_size < ALIGN(nr_objs * idx_size, align)) 599 + remained_size = slab_size - nr_objs * buffer_size; 600 + freelist_size = calculate_freelist_size(nr_objs, align); 601 + if (remained_size < freelist_size) 652 602 nr_objs--; 653 603 654 604 return nr_objs; ··· 687 635 } else { 688 636 nr_objs = calculate_nr_objs(slab_size, buffer_size, 689 637 sizeof(freelist_idx_t), align); 690 - mgmt_size = ALIGN(nr_objs * sizeof(freelist_idx_t), align); 638 + mgmt_size = calculate_freelist_size(nr_objs, align); 691 639 } 692 640 *num = nr_objs; 693 641 *left_over = slab_size - nr_objs*buffer_size - mgmt_size; ··· 2093 2041 break; 2094 2042 2095 2043 if (flags & CFLGS_OFF_SLAB) { 2044 + size_t freelist_size_per_obj = sizeof(freelist_idx_t); 2096 2045 /* 2097 2046 * Max number of objs-per-slab for caches which 2098 2047 * use off-slab slabs. Needed to avoid a possible 2099 2048 * looping condition in cache_grow(). 
2100 2049 */ 2050 + if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) 2051 + freelist_size_per_obj += sizeof(char); 2101 2052 offslab_limit = size; 2102 - offslab_limit /= sizeof(freelist_idx_t); 2053 + offslab_limit /= freelist_size_per_obj; 2103 2054 2104 2055 if (num > offslab_limit) 2105 2056 break; ··· 2349 2294 if (!cachep->num) 2350 2295 return -E2BIG; 2351 2296 2352 - freelist_size = 2353 - ALIGN(cachep->num * sizeof(freelist_idx_t), cachep->align); 2297 + freelist_size = calculate_freelist_size(cachep->num, cachep->align); 2354 2298 2355 2299 /* 2356 2300 * If the slab has been placed off-slab, and we have enough space then ··· 2362 2308 2363 2309 if (flags & CFLGS_OFF_SLAB) { 2364 2310 /* really off slab. No need for manual alignment */ 2365 - freelist_size = cachep->num * sizeof(freelist_idx_t); 2311 + freelist_size = calculate_freelist_size(cachep->num, 0); 2366 2312 2367 2313 #ifdef CONFIG_PAGE_POISONING 2368 2314 /* If we're going to use the generic kernel_map_pages() ··· 2666 2612 if (cachep->ctor) 2667 2613 cachep->ctor(objp); 2668 2614 #endif 2615 + set_obj_status(page, i, OBJECT_FREE); 2669 2616 set_free_obj(page, i, i); 2670 2617 } 2671 2618 } ··· 2875 2820 BUG_ON(objnr >= cachep->num); 2876 2821 BUG_ON(objp != index_to_obj(cachep, page, objnr)); 2877 2822 2823 + set_obj_status(page, objnr, OBJECT_FREE); 2878 2824 if (cachep->flags & SLAB_POISON) { 2879 2825 #ifdef CONFIG_DEBUG_PAGEALLOC 2880 2826 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { ··· 3009 2953 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 3010 2954 gfp_t flags, void *objp, unsigned long caller) 3011 2955 { 2956 + struct page *page; 2957 + 3012 2958 if (!objp) 3013 2959 return objp; 3014 2960 if (cachep->flags & SLAB_POISON) { ··· 3041 2983 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 3042 2984 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 3043 2985 } 2986 + 2987 + page = virt_to_head_page(objp); 2988 + set_obj_status(page, obj_to_index(cachep, page, objp), 
OBJECT_ACTIVE); 3044 2989 objp += obj_offset(cachep); 3045 2990 if (cachep->ctor && cachep->flags & SLAB_POISON) 3046 2991 cachep->ctor(objp); ··· 4280 4219 struct page *page) 4281 4220 { 4282 4221 void *p; 4283 - int i, j; 4222 + int i; 4284 4223 4285 4224 if (n[0] == n[1]) 4286 4225 return; 4287 4226 for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { 4288 - bool active = true; 4289 - 4290 - for (j = page->active; j < c->num; j++) { 4291 - /* Skip freed item */ 4292 - if (get_free_obj(page, j) == i) { 4293 - active = false; 4294 - break; 4295 - } 4296 - } 4297 - if (!active) 4227 + if (get_obj_status(page, i) != OBJECT_ACTIVE) 4298 4228 continue; 4299 4229 4300 4230 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))