Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

um: refactor TLB update handling

Conceptually, we want the memory mappings to always be up to date and
represent whatever is in the TLB. To ensure that, we need to sync them
over in the userspace case and for the kernel we need to process the
mappings.

The kernel will call flush_tlb_* if page table entries that were valid
before become invalid. Unfortunately, this is not the case if entries
are added.

As such, change both flush_tlb_* and set_ptes to track the memory range
that has to be synchronized. For the kernel, we need to execute a
flush_tlb_kern_* immediately but we can wait for the first page fault in
case of set_ptes. For userspace in contrast we only store that a range
of memory needs to be synced and do so whenever we switch to that
process.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20240703134536.1161108-13-benjamin@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>

authored by

Benjamin Berg and committed by
Johannes Berg
bcf3d957 573a446f

+110 -132
-2
arch/um/drivers/ubd_kern.c
··· 36 36 #include <linux/vmalloc.h> 37 37 #include <linux/platform_device.h> 38 38 #include <linux/scatterlist.h> 39 - #include <asm/tlbflush.h> 40 39 #include <kern_util.h> 41 40 #include "mconsole_kern.h" 42 41 #include <init.h> ··· 769 770 printk(KERN_ERR "Failed to vmalloc COW bitmap\n"); 770 771 goto error; 771 772 } 772 - flush_tlb_kernel_vm(); 773 773 774 774 err = read_cow_bitmap(ubd_dev->fd, ubd_dev->cow.bitmap, 775 775 ubd_dev->cow.bitmap_offset,
+4
arch/um/include/asm/mmu.h
··· 10 10 11 11 typedef struct mm_context { 12 12 struct mm_id id; 13 + 14 + /* Address range in need of a TLB sync */ 15 + unsigned long sync_tlb_range_from; 16 + unsigned long sync_tlb_range_to; 13 17 } mm_context_t; 14 18 15 19 #endif
+32
arch/um/include/asm/pgtable.h
··· 244 244 245 245 #define PFN_PTE_SHIFT PAGE_SHIFT 246 246 247 + static inline void um_tlb_mark_sync(struct mm_struct *mm, unsigned long start, 248 + unsigned long end) 249 + { 250 + if (!mm->context.sync_tlb_range_to) { 251 + mm->context.sync_tlb_range_from = start; 252 + mm->context.sync_tlb_range_to = end; 253 + } else { 254 + if (start < mm->context.sync_tlb_range_from) 255 + mm->context.sync_tlb_range_from = start; 256 + if (end > mm->context.sync_tlb_range_to) 257 + mm->context.sync_tlb_range_to = end; 258 + } 259 + } 260 + 261 + #define set_ptes set_ptes 262 + static inline void set_ptes(struct mm_struct *mm, unsigned long addr, 263 + pte_t *ptep, pte_t pte, int nr) 264 + { 265 + /* Basically the default implementation */ 266 + size_t length = nr * PAGE_SIZE; 267 + 268 + for (;;) { 269 + set_pte(ptep, pte); 270 + if (--nr == 0) 271 + break; 272 + ptep++; 273 + pte = __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); 274 + } 275 + 276 + um_tlb_mark_sync(mm, addr, addr + length); 277 + } 278 + 247 279 #define __HAVE_ARCH_PTE_SAME 248 280 static inline int pte_same(pte_t pte_a, pte_t pte_b) 249 281 {
+37 -9
arch/um/include/asm/tlbflush.h
··· 9 9 #include <linux/mm.h> 10 10 11 11 /* 12 - * TLB flushing: 12 + * In UML, we need to sync the TLB over by using mmap/munmap/mprotect syscalls 13 + * from the process handling the MM (which can be the kernel itself). 13 14 * 14 - * - flush_tlb() flushes the current mm struct TLBs 15 + * To track updates, we can hook into set_ptes and flush_tlb_*. With set_ptes 16 + * we catch all PTE transitions where memory that was unusable becomes usable. 17 + * While with flush_tlb_* we can track any memory that becomes unusable and 18 + * even if a higher layer of the page table was modified. 19 + * 20 + * So, we simply track updates using both methods and mark the memory area to 21 + * be synced later on. The only special case is that flush_tlb_kern_* needs to 22 + * be executed immediately as there is no good synchronization point in that 23 + * case. In contrast, in the set_ptes case we can wait for the next kernel 24 + * segfault before we do the synchronization. 25 + * 15 26 * - flush_tlb_all() flushes all processes TLBs 16 27 * - flush_tlb_mm(mm) flushes the specified mm context TLB's 17 28 * - flush_tlb_page(vma, vmaddr) flushes one page 18 - * - flush_tlb_kernel_vm() flushes the kernel vm area 19 29 * - flush_tlb_range(vma, start, end) flushes a range of pages 30 + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages 20 31 */ 32 + 33 + extern int um_tlb_sync(struct mm_struct *mm); 21 34 22 35 extern void flush_tlb_all(void); 23 36 extern void flush_tlb_mm(struct mm_struct *mm); 24 - extern void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, 25 - unsigned long end); 26 - extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long address); 27 - extern void flush_tlb_kernel_vm(void); 28 - extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); 29 - extern void __flush_tlb_one(unsigned long addr); 37 + 38 + static inline void flush_tlb_page(struct vm_area_struct *vma, 39 + unsigned long address) 40 + { 
41 + um_tlb_mark_sync(vma->vm_mm, address, address + PAGE_SIZE); 42 + } 43 + 44 + static inline void flush_tlb_range(struct vm_area_struct *vma, 45 + unsigned long start, unsigned long end) 46 + { 47 + um_tlb_mark_sync(vma->vm_mm, start, end); 48 + } 49 + 50 + static inline void flush_tlb_kernel_range(unsigned long start, 51 + unsigned long end) 52 + { 53 + um_tlb_mark_sync(&init_mm, start, end); 54 + 55 + /* Kernel needs to be synced immediately */ 56 + um_tlb_sync(&init_mm); 57 + } 30 58 31 59 #endif
+1
arch/um/include/shared/skas/skas.h
··· 16 16 extern long execute_syscall_skas(void *r); 17 17 extern unsigned long current_stub_stack(void); 18 18 extern struct mm_id *current_mm_id(void); 19 + extern void current_mm_sync(void); 19 20 20 21 #endif
+10
arch/um/kernel/skas/process.c
··· 8 8 #include <linux/sched/task_stack.h> 9 9 #include <linux/sched/task.h> 10 10 11 + #include <asm/tlbflush.h> 12 + 11 13 #include <as-layout.h> 12 14 #include <kern.h> 13 15 #include <os.h> ··· 59 57 return NULL; 60 58 61 59 return &current->mm->context.id; 60 + } 61 + 62 + void current_mm_sync(void) 63 + { 64 + if (current->mm == NULL) 65 + return; 66 + 67 + um_tlb_sync(current->mm); 62 68 }
+12 -118
arch/um/kernel/tlb.c
··· 170 170 return ret; 171 171 } 172 172 173 - static int fix_range_common(struct mm_struct *mm, unsigned long start_addr, 174 - unsigned long end_addr) 173 + int um_tlb_sync(struct mm_struct *mm) 175 174 { 176 175 pgd_t *pgd; 177 176 struct vm_ops ops; 178 - unsigned long addr = start_addr, next; 177 + unsigned long addr = mm->context.sync_tlb_range_from, next; 179 178 int ret = 0; 179 + 180 + if (mm->context.sync_tlb_range_to == 0) 181 + return 0; 180 182 181 183 ops.mm_idp = &mm->context.id; 182 184 if (mm == &init_mm) { ··· 193 191 194 192 pgd = pgd_offset(mm, addr); 195 193 do { 196 - next = pgd_addr_end(addr, end_addr); 194 + next = pgd_addr_end(addr, mm->context.sync_tlb_range_to); 197 195 if (!pgd_present(*pgd)) { 198 196 if (pgd_newpage(*pgd)) { 199 197 ret = ops.unmap(ops.mm_idp, addr, ··· 202 200 } 203 201 } else 204 202 ret = update_p4d_range(pgd, addr, next, &ops); 205 - } while (pgd++, addr = next, ((addr < end_addr) && !ret)); 203 + } while (pgd++, addr = next, 204 + ((addr < mm->context.sync_tlb_range_to) && !ret)); 206 205 207 206 if (ret == -ENOMEM) 208 207 report_enomem(); 209 208 209 + mm->context.sync_tlb_range_from = 0; 210 + mm->context.sync_tlb_range_to = 0; 211 + 210 212 return ret; 211 - } 212 - 213 - static void flush_tlb_kernel_range_common(unsigned long start, unsigned long end) 214 - { 215 - int err; 216 - 217 - err = fix_range_common(&init_mm, start, end); 218 - 219 - if (err) 220 - panic("flush_tlb_kernel failed, errno = %d\n", err); 221 - } 222 - 223 - void flush_tlb_page(struct vm_area_struct *vma, unsigned long address) 224 - { 225 - pgd_t *pgd; 226 - p4d_t *p4d; 227 - pud_t *pud; 228 - pmd_t *pmd; 229 - pte_t *pte; 230 - struct mm_struct *mm = vma->vm_mm; 231 - int r, w, x, prot; 232 - struct mm_id *mm_id; 233 - 234 - address &= PAGE_MASK; 235 - 236 - pgd = pgd_offset(mm, address); 237 - if (!pgd_present(*pgd)) 238 - goto kill; 239 - 240 - p4d = p4d_offset(pgd, address); 241 - if (!p4d_present(*p4d)) 242 - goto kill; 243 - 244 - 
pud = pud_offset(p4d, address); 245 - if (!pud_present(*pud)) 246 - goto kill; 247 - 248 - pmd = pmd_offset(pud, address); 249 - if (!pmd_present(*pmd)) 250 - goto kill; 251 - 252 - pte = pte_offset_kernel(pmd, address); 253 - 254 - r = pte_read(*pte); 255 - w = pte_write(*pte); 256 - x = pte_exec(*pte); 257 - if (!pte_young(*pte)) { 258 - r = 0; 259 - w = 0; 260 - } else if (!pte_dirty(*pte)) { 261 - w = 0; 262 - } 263 - 264 - mm_id = &mm->context.id; 265 - prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) | 266 - (x ? UM_PROT_EXEC : 0)); 267 - if (pte_newpage(*pte)) { 268 - if (pte_present(*pte)) { 269 - unsigned long long offset; 270 - int fd; 271 - 272 - fd = phys_mapping(pte_val(*pte) & PAGE_MASK, &offset); 273 - map(mm_id, address, PAGE_SIZE, prot, fd, offset); 274 - } else 275 - unmap(mm_id, address, PAGE_SIZE); 276 - } else if (pte_newprot(*pte)) 277 - protect(mm_id, address, PAGE_SIZE, prot); 278 - 279 - *pte = pte_mkuptodate(*pte); 280 - 281 - return; 282 - 283 - kill: 284 - printk(KERN_ERR "Failed to flush page for address 0x%lx\n", address); 285 - force_sig(SIGKILL); 286 213 } 287 214 288 215 void flush_tlb_all(void) ··· 226 295 flush_tlb_mm(current->mm); 227 296 } 228 297 229 - void flush_tlb_kernel_range(unsigned long start, unsigned long end) 230 - { 231 - flush_tlb_kernel_range_common(start, end); 232 - } 233 - 234 - void flush_tlb_kernel_vm(void) 235 - { 236 - flush_tlb_kernel_range_common(start_vm, end_vm); 237 - } 238 - 239 - void __flush_tlb_one(unsigned long addr) 240 - { 241 - flush_tlb_kernel_range_common(addr, addr + PAGE_SIZE); 242 - } 243 - 244 - static void fix_range(struct mm_struct *mm, unsigned long start_addr, 245 - unsigned long end_addr) 246 - { 247 - /* 248 - * Don't bother flushing if this address space is about to be 249 - * destroyed. 
250 - */ 251 - if (atomic_read(&mm->mm_users) == 0) 252 - return; 253 - 254 - fix_range_common(mm, start_addr, end_addr); 255 - } 256 - 257 - void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, 258 - unsigned long end) 259 - { 260 - if (vma->vm_mm == NULL) 261 - flush_tlb_kernel_range_common(start, end); 262 - else fix_range(vma->vm_mm, start, end); 263 - } 264 - EXPORT_SYMBOL(flush_tlb_range); 265 - 266 298 void flush_tlb_mm(struct mm_struct *mm) 267 299 { 268 300 struct vm_area_struct *vma; 269 301 VMA_ITERATOR(vmi, mm, 0); 270 302 271 303 for_each_vma(vmi, vma) 272 - fix_range(mm, vma->vm_start, vma->vm_end); 304 + um_tlb_mark_sync(mm, vma->vm_start, vma->vm_end); 273 305 }
+12 -3
arch/um/kernel/trap.c
··· 113 113 #if 0 114 114 WARN_ON(!pte_young(*pte) || (is_write && !pte_dirty(*pte))); 115 115 #endif 116 - flush_tlb_page(vma, address); 116 + 117 117 out: 118 118 mmap_read_unlock(mm); 119 119 out_nosemaphore: ··· 210 210 if (!is_user && regs) 211 211 current->thread.segv_regs = container_of(regs, struct pt_regs, regs); 212 212 213 - if (!is_user && (address >= start_vm) && (address < end_vm)) { 214 - flush_tlb_kernel_vm(); 213 + if (!is_user && init_mm.context.sync_tlb_range_to) { 214 + /* 215 + * Kernel has pending updates from set_ptes that were not 216 + * flushed yet. Syncing them should fix the pagefault (if not 217 + * we'll get here again and panic). 218 + */ 219 + err = um_tlb_sync(&init_mm); 220 + if (err == -ENOMEM) 221 + report_enomem(); 222 + if (err) 223 + panic("Failed to sync kernel TLBs: %d", err); 215 224 goto out; 216 225 } 217 226 else if (current->mm == NULL) {
+2
arch/um/os-Linux/skas/process.c
··· 347 347 while (1) { 348 348 time_travel_print_bc_msg(); 349 349 350 + current_mm_sync(); 351 + 350 352 /* Flush out any pending syscalls */ 351 353 err = syscall_stub_flush(current_mm_id()); 352 354 if (err) {