
mm, truncate: remove all exceptional entries from pagevec under one lock

During truncate, each entry in a pagevec is checked to see if it is an
exceptional entry and, if so, the shadow entry is cleaned up. This is
potentially expensive because every exceptional entry for a mapping
locks and unlocks the tree lock separately. This patch batches the
operation so that any exceptional entries removed from a pagevec
acquire the mapping tree lock only once. The corner case where this is
more expensive is when there is only one exceptional entry, but that is
unlikely due to temporal locality and how it affects LRU ordering. Note
that for truncations of small files created recently, this patch should
show no gain because it only batches the handling of exceptional
entries.
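[Editor's note] To illustrate the batching pattern outside the kernel, here is
a minimal userspace sketch of the same idea. A pthread mutex stands in for the
mapping tree lock, and names such as struct batch, entry_is_exceptional() and
clear_one() are illustrative stand-ins, not kernel APIs:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

#define BATCH	15	/* pagevecs held up to 15 pages in this era */

struct batch {
	size_t nr;
	void *entries[BATCH];
};

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for radix_tree_exceptional_entry(): low bit used as a tag. */
static bool entry_is_exceptional(void *p)
{
	return ((unsigned long)p & 1) != 0;
}

/* Stand-in for __clear_shadow_entry(); assumes the lock is already held. */
static void clear_one(void *p)
{
	(void)p;
}

/* Remove exceptional entries, compacting the rest, under one lock hold. */
static void clear_exceptional_batch(struct batch *b)
{
	size_t i, j;

	/* Cheap scan first: the common case takes no lock at all. */
	for (j = 0; j < b->nr; j++)
		if (entry_is_exceptional(b->entries[j]))
			break;
	if (j == b->nr)
		return;

	pthread_mutex_lock(&tree_lock);
	for (i = j; i < b->nr; i++) {
		void *p = b->entries[i];

		if (!entry_is_exceptional(p)) {
			b->entries[j++] = p;	/* keep regular entries */
			continue;
		}
		clear_one(p);
	}
	pthread_mutex_unlock(&tree_lock);
	b->nr = j;	/* batch now holds only the regular entries */
}

int main(void)
{
	struct batch b = { .nr = 3, .entries = {
		(void *)0x1000, (void *)0x2001, (void *)0x3000 } };

	clear_exceptional_batch(&b);	/* b.nr == 2 afterwards */
	return 0;
}

The up-front scan mirrors the j loop in truncate_exceptional_pvec_entries()
in the diff below, where the lock is additionally skipped for DAX mappings,
which are handled by dax_delete_mapping_entry() instead.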

sparsetruncate (large)
4.14.0-rc4 4.14.0-rc4
pickhelper-v1r1 batchshadow-v1r1
Min Time 38.00 ( 0.00%) 27.00 ( 28.95%)
1st-qrtle Time 40.00 ( 0.00%) 28.00 ( 30.00%)
2nd-qrtle Time 44.00 ( 0.00%) 41.00 ( 6.82%)
3rd-qrtle Time 146.00 ( 0.00%) 147.00 ( -0.68%)
Max-90% Time 153.00 ( 0.00%) 153.00 ( 0.00%)
Max-95% Time 155.00 ( 0.00%) 156.00 ( -0.65%)
Max-99% Time 181.00 ( 0.00%) 171.00 ( 5.52%)
Amean Time 93.04 ( 0.00%) 88.43 ( 4.96%)
Best99%Amean Time 92.08 ( 0.00%) 86.13 ( 6.46%)
Best95%Amean Time 89.19 ( 0.00%) 83.13 ( 6.80%)
Best90%Amean Time 85.60 ( 0.00%) 79.15 ( 7.53%)
Best75%Amean Time 72.95 ( 0.00%) 65.09 ( 10.78%)
Best50%Amean Time 39.86 ( 0.00%) 28.20 ( 29.25%)
Best25%Amean Time 39.44 ( 0.00%) 27.70 ( 29.77%)

bonnie
4.14.0-rc4 4.14.0-rc4
pickhelper-v1r1 batchshadow-v1r1
Hmean SeqCreate ops 71.92 ( 0.00%) 76.78 ( 6.76%)
Hmean SeqCreate read 42.42 ( 0.00%) 45.01 ( 6.10%)
Hmean SeqCreate del 26519.88 ( 0.00%) 27191.87 ( 2.53%)
Hmean RandCreate ops 71.92 ( 0.00%) 76.95 ( 7.00%)
Hmean RandCreate read 44.44 ( 0.00%) 49.23 ( 10.78%)
Hmean RandCreate del 24948.62 ( 0.00%) 24764.97 ( -0.74%)

Truncation of a large number of files shows a substantial gain, with
99% of files being truncated 6.46% faster. bonnie shows a modest gain
of 2.53%.
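
[Editor's note] As a side note on the mmtests metrics above: BestN%Amean is,
as far as I can tell, the arithmetic mean over the best N% of samples, i.e. a
trimmed mean that drops the worst outliers. A minimal sketch of that
computation (illustrative, not mmtests code), assuming lower times are better:

#include <stdio.h>
#include <stdlib.h>

/* Ascending sort puts the best (lowest) times first. */
static int cmp_double(const void *a, const void *b)
{
	double x = *(const double *)a, y = *(const double *)b;
	return (x > y) - (x < y);
}

/* Arithmetic mean of the best pct% of n samples (a trimmed mean). */
static double best_pct_amean(double *samples, size_t n, double pct)
{
	size_t keep = (size_t)(n * pct / 100.0);
	double sum = 0.0;

	if (keep == 0)
		keep = 1;
	qsort(samples, n, sizeof(*samples), cmp_double);
	for (size_t i = 0; i < keep; i++)
		sum += samples[i];
	return sum / keep;
}

int main(void)
{
	/* Toy data; the real metrics are computed over all iterations. */
	double t[] = { 38, 40, 44, 146, 153, 155, 181 };

	printf("Best50%%Amean: %.2f\n",
	       best_pct_amean(t, sizeof(t) / sizeof(t[0]), 50.0));
	return 0;
}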

[jack@suse.cz: fix truncate_exceptional_pvec_entries()]
Link: http://lkml.kernel.org/r/20171108164226.26788-1-jack@suse.cz
Link: http://lkml.kernel.org/r/20171018075952.10627-4-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Mel Gorman, committed by Linus Torvalds
f2187599 c7df8ad2

+63 -28
mm/truncate.c
···
 #include <linux/rmap.h>
 #include "internal.h"
 
-static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
-			       void *entry)
+/*
+ * Regular page slots are stabilized by the page lock even without the tree
+ * itself locked. These unlocked entries need verification under the tree
+ * lock.
+ */
+static inline void __clear_shadow_entry(struct address_space *mapping,
+				pgoff_t index, void *entry)
 {
 	struct radix_tree_node *node;
 	void **slot;
 
-	spin_lock_irq(&mapping->tree_lock);
-	/*
-	 * Regular page slots are stabilized by the page lock even
-	 * without the tree itself locked. These unlocked entries
-	 * need verification under the tree lock.
-	 */
 	if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
-		goto unlock;
+		return;
 	if (*slot != entry)
-		goto unlock;
+		return;
 	__radix_tree_replace(&mapping->page_tree, node, slot, NULL,
 			     workingset_update_node);
 	mapping->nrexceptional--;
-unlock:
+}
+
+static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
+			       void *entry)
+{
+	spin_lock_irq(&mapping->tree_lock);
+	__clear_shadow_entry(mapping, index, entry);
 	spin_unlock_irq(&mapping->tree_lock);
 }
 
 /*
- * Unconditionally remove exceptional entry. Usually called from truncate path.
+ * Unconditionally remove exceptional entries. Usually called from truncate
+ * path. Note that the pagevec may be altered by this function by removing
+ * exceptional entries similar to what pagevec_remove_exceptionals does.
  */
-static void truncate_exceptional_entry(struct address_space *mapping,
-				       pgoff_t index, void *entry)
+static void truncate_exceptional_pvec_entries(struct address_space *mapping,
+				struct pagevec *pvec, pgoff_t *indices,
+				pgoff_t end)
 {
+	int i, j;
+	bool dax, lock;
+
 	/* Handled by shmem itself */
 	if (shmem_mapping(mapping))
 		return;
 
-	if (dax_mapping(mapping)) {
-		dax_delete_mapping_entry(mapping, index);
+	for (j = 0; j < pagevec_count(pvec); j++)
+		if (radix_tree_exceptional_entry(pvec->pages[j]))
+			break;
+
+	if (j == pagevec_count(pvec))
 		return;
+
+	dax = dax_mapping(mapping);
+	lock = !dax && indices[j] < end;
+	if (lock)
+		spin_lock_irq(&mapping->tree_lock);
+
+	for (i = j; i < pagevec_count(pvec); i++) {
+		struct page *page = pvec->pages[i];
+		pgoff_t index = indices[i];
+
+		if (!radix_tree_exceptional_entry(page)) {
+			pvec->pages[j++] = page;
+			continue;
+		}
+
+		if (index >= end)
+			continue;
+
+		if (unlikely(dax)) {
+			dax_delete_mapping_entry(mapping, index);
+			continue;
+		}
+
+		__clear_shadow_entry(mapping, index, page);
 	}
-	clear_shadow_entry(mapping, index, entry);
+
+	if (lock)
+		spin_unlock_irq(&mapping->tree_lock);
+	pvec->nr = j;
 }
 
 /*
···
 			if (index >= end)
 				break;
 
-			if (radix_tree_exceptional_entry(page)) {
-				truncate_exceptional_entry(mapping, index,
-							   page);
+			if (radix_tree_exceptional_entry(page))
 				continue;
-			}
 
 			if (!trylock_page(page))
 				continue;
···
 		delete_from_page_cache_batch(mapping, &locked_pvec);
 		for (i = 0; i < pagevec_count(&locked_pvec); i++)
 			unlock_page(locked_pvec.pages[i]);
-		pagevec_remove_exceptionals(&pvec);
+		truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
 		pagevec_release(&pvec);
 		cond_resched();
 		index++;
 	}
-
 	if (partial_start) {
 		struct page *page = find_lock_page(mapping, start - 1);
 		if (page) {
···
 			pagevec_release(&pvec);
 			break;
 		}
+
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
···
 				break;
 			}
 
-			if (radix_tree_exceptional_entry(page)) {
-				truncate_exceptional_entry(mapping, index,
-							   page);
+			if (radix_tree_exceptional_entry(page))
 				continue;
-			}
 
 			lock_page(page);
 			WARN_ON(page_to_index(page) != index);
···
 			truncate_inode_page(mapping, page);
 			unlock_page(page);
 		}
-		pagevec_remove_exceptionals(&pvec);
+		truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
 		pagevec_release(&pvec);
 		index++;
 	}