Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
Topics: kernel, os, linux

Merge branch 'for-5.14-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu

Pull percpu fix from Dennis Zhou:
"This is just a single change to fix percpu depopulation. The code
relied on depopulation code written specifically for the free path and
relied on vmalloc to do the tlb flush lazily. As we're modifying the
backing pages during the lifetime of a chunk, we need to also flush
the tlb accordingly.

Guenter Roeck reported this issue in [1] on mips. I believe we just
happen to be lucky given the much larger chunk sizes on x86 and
consequently less churning of this memory"

Link: https://lore.kernel.org/lkml/20210702191140.GA3166599@roeck-us.net/ [1]

* 'for-5.14-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu:
percpu: flush tlb in pcpu_reclaim_populated()

Diffstat: 3 files changed, 35 insertions(+), 8 deletions(-)

mm/percpu-km.c (+6 -0)
@@ -32,6 +32,12 @@

 #include <linux/log2.h>

+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+				      int page_start, int page_end)
+{
+	/* nothing */
+}
+
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
 			       int page_start, int page_end, gfp_t gfp)
 {
mm/percpu-vm.c (+3 -2)
@@ -303,6 +303,9 @@
  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
  * from @chunk.
  *
+ * Caller is required to call pcpu_post_unmap_tlb_flush() if not returning the
+ * region back to vmalloc() which will lazily flush the tlb.
+ *
  * CONTEXT:
  * pcpu_alloc_mutex.
  */
@@ -326,8 +323,6 @@
 	pcpu_pre_unmap_flush(chunk, page_start, page_end);

 	pcpu_unmap_pages(chunk, pages, page_start, page_end);
-
-	/* no need to flush tlb, vmalloc will handle it lazily */

 	pcpu_free_pages(chunk, pages, page_start, page_end);
 }
mm/percpu.c (+26 -6)
@@ -1572,6 +1572,7 @@
  *
  * pcpu_populate_chunk - populate the specified range of a chunk
  * pcpu_depopulate_chunk - depopulate the specified range of a chunk
+ * pcpu_post_unmap_tlb_flush - flush tlb for the specified range of a chunk
  * pcpu_create_chunk - create a new chunk
  * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop
  * pcpu_addr_to_page - translate address to physical address
@@ -1582,6 +1581,8 @@
 			       int page_start, int page_end, gfp_t gfp);
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
 				  int page_start, int page_end);
+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+				      int page_start, int page_end);
 static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
 static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
 static struct page *pcpu_addr_to_page(void *addr);
@@ -2140,11 +2137,12 @@
 {
 	struct pcpu_chunk *chunk;
 	struct pcpu_block_md *block;
+	int freed_page_start, freed_page_end;
 	int i, end;
+	bool reintegrate;

 	lockdep_assert_held(&pcpu_lock);

-restart:
 	/*
 	 * Once a chunk is isolated to the to_depopulate list, the chunk is no
 	 * longer discoverable to allocations whom may populate pages.  The only
@@ -2161,6 +2157,9 @@
 		 * Scan chunk's pages in the reverse order to keep populated
 		 * pages close to the beginning of the chunk.
 		 */
+		freed_page_start = chunk->nr_pages;
+		freed_page_end = 0;
+		reintegrate = false;
 		for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
 			/* no more work to do */
 			if (chunk->nr_empty_pop_pages == 0)
@@ -2171,8 +2164,8 @@

 			/* reintegrate chunk to prevent atomic alloc failures */
 			if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
-				pcpu_reintegrate_chunk(chunk);
-				goto restart;
+				reintegrate = true;
+				goto end_chunk;
 			}

 			/*
@@ -2201,16 +2194,29 @@
 			spin_lock_irq(&pcpu_lock);

 			pcpu_chunk_depopulated(chunk, i + 1, end + 1);
+			freed_page_start = min(freed_page_start, i + 1);
+			freed_page_end = max(freed_page_end, end + 1);

 			/* reset the range and continue */
 			end = -1;
 		}

-		if (chunk->free_bytes == pcpu_unit_size)
+end_chunk:
+		/* batch tlb flush per chunk to amortize cost */
+		if (freed_page_start < freed_page_end) {
+			spin_unlock_irq(&pcpu_lock);
+			pcpu_post_unmap_tlb_flush(chunk,
+						  freed_page_start,
+						  freed_page_end);
+			cond_resched();
+			spin_lock_irq(&pcpu_lock);
+		}
+
+		if (reintegrate || chunk->free_bytes == pcpu_unit_size)
 			pcpu_reintegrate_chunk(chunk);
 		else
-			list_move(&chunk->list,
-				  &pcpu_chunk_lists[pcpu_sidelined_slot]);
+			list_move_tail(&chunk->list,
+				       &pcpu_chunk_lists[pcpu_sidelined_slot]);
 	}
 }