Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

s390/mm: use full 4KB page for 2KB PTE

Cease using 4KB pages to host two 2KB PTEs. That greatly
simplifies the memory management code at the expense of an
increased page table memory footprint.

Instead of two PTEs per 4KB page use only upper half of
the parent page for a single PTE. With that the list of
half-used pages pgtable_list becomes unneeded.

Further, the upper byte of the parent page _refcount
counter does not need to be used for fragments tracking
and could be left alone.

Commit 8211dad62798 ("s390: add pte_free_defer() for
pgtables sharing page") introduced the use of PageActive
flag to coordinate a deferred free with 2KB page table
fragments tracking. Since there is no tracking anymore,
there is no need for using PageActive flag.

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>

authored by

Alexander Gordeev and committed by
Vasily Gorbik
d08d4e7c a51324c4

+29 -260
-2
arch/s390/include/asm/mmu.h
··· 11 11 cpumask_t cpu_attach_mask; 12 12 atomic_t flush_count; 13 13 unsigned int flush_mm; 14 - struct list_head pgtable_list; 15 14 struct list_head gmap_list; 16 15 unsigned long gmap_asce; 17 16 unsigned long asce; ··· 38 39 39 40 #define INIT_MM_CONTEXT(name) \ 40 41 .context.lock = __SPIN_LOCK_UNLOCKED(name.context.lock), \ 41 - .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ 42 42 .context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list), 43 43 44 44 #endif
-1
arch/s390/include/asm/mmu_context.h
··· 22 22 unsigned long asce_type, init_entry; 23 23 24 24 spin_lock_init(&mm->context.lock); 25 - INIT_LIST_HEAD(&mm->context.pgtable_list); 26 25 INIT_LIST_HEAD(&mm->context.gmap_list); 27 26 cpumask_clear(&mm->context.cpu_attach_mask); 28 27 atomic_set(&mm->context.flush_count, 0);
-5
arch/s390/include/asm/tlb.h
··· 69 69 tlb->mm->context.flush_mm = 1; 70 70 tlb->freed_tables = 1; 71 71 tlb->cleared_pmds = 1; 72 - /* 73 - * page_table_free_rcu takes care of the allocation bit masks 74 - * of the 2K table fragments in the 4K page table page, 75 - * then calls tlb_remove_table. 76 - */ 77 72 page_table_free_rcu(tlb, (unsigned long *) pte, address); 78 73 } 79 74
+29 -252
arch/s390/mm/pgalloc.c
··· 133 133 return -ENOMEM; 134 134 } 135 135 136 - static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) 137 - { 138 - return atomic_fetch_xor(bits, v) ^ bits; 139 - } 140 - 141 136 #ifdef CONFIG_PGSTE 142 137 143 138 struct page *page_table_alloc_pgste(struct mm_struct *mm) ··· 157 162 158 163 #endif /* CONFIG_PGSTE */ 159 164 160 - /* 161 - * A 2KB-pgtable is either upper or lower half of a normal page. 162 - * The second half of the page may be unused or used as another 163 - * 2KB-pgtable. 164 - * 165 - * Whenever possible the parent page for a new 2KB-pgtable is picked 166 - * from the list of partially allocated pages mm_context_t::pgtable_list. 167 - * In case the list is empty a new parent page is allocated and added to 168 - * the list. 169 - * 170 - * When a parent page gets fully allocated it contains 2KB-pgtables in both 171 - * upper and lower halves and is removed from mm_context_t::pgtable_list. 172 - * 173 - * When 2KB-pgtable is freed from to fully allocated parent page that 174 - * page turns partially allocated and added to mm_context_t::pgtable_list. 175 - * 176 - * If 2KB-pgtable is freed from the partially allocated parent page that 177 - * page turns unused and gets removed from mm_context_t::pgtable_list. 178 - * Furthermore, the unused parent page is released. 179 - * 180 - * As follows from the above, no unallocated or fully allocated parent 181 - * pages are contained in mm_context_t::pgtable_list. 
182 - * 183 - * The upper byte (bits 24-31) of the parent page _refcount is used 184 - * for tracking contained 2KB-pgtables and has the following format: 185 - * 186 - * PP AA 187 - * 01234567 upper byte (bits 24-31) of struct page::_refcount 188 - * || || 189 - * || |+--- upper 2KB-pgtable is allocated 190 - * || +---- lower 2KB-pgtable is allocated 191 - * |+------- upper 2KB-pgtable is pending for removal 192 - * +-------- lower 2KB-pgtable is pending for removal 193 - * 194 - * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why 195 - * using _refcount is possible). 196 - * 197 - * When 2KB-pgtable is allocated the corresponding AA bit is set to 1. 198 - * The parent page is either: 199 - * - added to mm_context_t::pgtable_list in case the second half of the 200 - * parent page is still unallocated; 201 - * - removed from mm_context_t::pgtable_list in case both hales of the 202 - * parent page are allocated; 203 - * These operations are protected with mm_context_t::lock. 204 - * 205 - * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0 206 - * and the corresponding PP bit is set to 1 in a single atomic operation. 207 - * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually 208 - * exclusive and may never be both set to 1! 209 - * The parent page is either: 210 - * - added to mm_context_t::pgtable_list in case the second half of the 211 - * parent page is still allocated; 212 - * - removed from mm_context_t::pgtable_list in case the second half of 213 - * the parent page is unallocated; 214 - * These operations are protected with mm_context_t::lock. 215 - * 216 - * It is important to understand that mm_context_t::lock only protects 217 - * mm_context_t::pgtable_list and AA bits, but not the parent page itself 218 - * and PP bits. 219 - * 220 - * Releasing the parent page happens whenever the PP bit turns from 1 to 0, 221 - * while both AA bits and the second PP bit are already unset. 
Then the 222 - * parent page does not contain any 2KB-pgtable fragment anymore, and it has 223 - * also been removed from mm_context_t::pgtable_list. It is safe to release 224 - * the page therefore. 225 - * 226 - * PGSTE memory spaces use full 4KB-pgtables and do not need most of the 227 - * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable 228 - * while the PP bits are never used, nor such a page is added to or removed 229 - * from mm_context_t::pgtable_list. 230 - * 231 - * pte_free_defer() overrides those rules: it takes the page off pgtable_list, 232 - * and prevents both 2K fragments from being reused. pte_free_defer() has to 233 - * guarantee that its pgtable cannot be reused before the RCU grace period 234 - * has elapsed (which page_table_free_rcu() does not actually guarantee). 235 - * But for simplicity, because page->rcu_head overlays page->lru, and because 236 - * the RCU callback might not be called before the mm_context_t has been freed, 237 - * pte_free_defer() in this implementation prevents both fragments from being 238 - * reused, and delays making the call to RCU until both fragments are freed. 239 - */ 240 165 unsigned long *page_table_alloc(struct mm_struct *mm) 241 166 { 242 - unsigned long *table; 243 167 struct ptdesc *ptdesc; 244 - unsigned int mask, bit; 168 + unsigned long *table; 245 169 246 - /* Try to get a fragment of a 4K page as a 2K page table */ 247 - if (!mm_alloc_pgste(mm)) { 248 - table = NULL; 249 - spin_lock_bh(&mm->context.lock); 250 - if (!list_empty(&mm->context.pgtable_list)) { 251 - ptdesc = list_first_entry(&mm->context.pgtable_list, 252 - struct ptdesc, pt_list); 253 - mask = atomic_read(&ptdesc->_refcount) >> 24; 254 - /* 255 - * The pending removal bits must also be checked. 256 - * Failure to do so might lead to an impossible 257 - * value of (i.e 0x13 or 0x23) written to _refcount. 
258 - * Such values violate the assumption that pending and 259 - * allocation bits are mutually exclusive, and the rest 260 - * of the code unrails as result. That could lead to 261 - * a whole bunch of races and corruptions. 262 - */ 263 - mask = (mask | (mask >> 4)) & 0x03U; 264 - if (mask != 0x03U) { 265 - table = (unsigned long *) ptdesc_to_virt(ptdesc); 266 - bit = mask & 1; /* =1 -> second 2K */ 267 - if (bit) 268 - table += PTRS_PER_PTE; 269 - atomic_xor_bits(&ptdesc->_refcount, 270 - 0x01U << (bit + 24)); 271 - list_del_init(&ptdesc->pt_list); 272 - } 273 - } 274 - spin_unlock_bh(&mm->context.lock); 275 - if (table) 276 - return table; 277 - } 278 - /* Allocate a fresh page */ 279 170 ptdesc = pagetable_alloc(GFP_KERNEL, 0); 280 171 if (!ptdesc) 281 172 return NULL; ··· 169 288 pagetable_free(ptdesc); 170 289 return NULL; 171 290 } 172 - /* Initialize page table */ 173 291 table = ptdesc_to_virt(ptdesc); 174 292 __arch_set_page_dat(table, 1); 175 - if (mm_alloc_pgste(mm)) { 176 - /* Return 4K page table with PGSTEs */ 177 - INIT_LIST_HEAD(&ptdesc->pt_list); 178 - atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); 179 - memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); 180 - memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); 181 - } else { 182 - /* Return the first 2K fragment of the page */ 183 - atomic_xor_bits(&ptdesc->_refcount, 0x01U << 24); 184 - memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); 185 - spin_lock_bh(&mm->context.lock); 186 - list_add(&ptdesc->pt_list, &mm->context.pgtable_list); 187 - spin_unlock_bh(&mm->context.lock); 188 - } 293 + /* pt_list is used by gmap only */ 294 + INIT_LIST_HEAD(&ptdesc->pt_list); 295 + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); 296 + memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); 189 297 return table; 190 298 } 191 299 192 - static void page_table_release_check(struct page *page, void *table, 193 - unsigned int half, unsigned int mask) 300 + static void 
pagetable_pte_dtor_free(struct ptdesc *ptdesc) 194 301 { 195 - char msg[128]; 196 - 197 - if (!IS_ENABLED(CONFIG_DEBUG_VM)) 198 - return; 199 - if (!mask && list_empty(&page->lru)) 200 - return; 201 - snprintf(msg, sizeof(msg), 202 - "Invalid pgtable %p release half 0x%02x mask 0x%02x", 203 - table, half, mask); 204 - dump_page(page, msg); 205 - } 206 - 207 - static void pte_free_now(struct rcu_head *head) 208 - { 209 - struct ptdesc *ptdesc; 210 - 211 - ptdesc = container_of(head, struct ptdesc, pt_rcu_head); 212 302 pagetable_pte_dtor(ptdesc); 213 303 pagetable_free(ptdesc); 214 304 } 215 305 216 306 void page_table_free(struct mm_struct *mm, unsigned long *table) 217 307 { 218 - unsigned int mask, bit, half; 219 308 struct ptdesc *ptdesc = virt_to_ptdesc(table); 220 309 221 - if (!mm_alloc_pgste(mm)) { 222 - /* Free 2K page table fragment of a 4K page */ 223 - bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); 224 - spin_lock_bh(&mm->context.lock); 225 - /* 226 - * Mark the page for delayed release. The actual release 227 - * will happen outside of the critical section from this 228 - * function or from __tlb_remove_table() 229 - */ 230 - mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24)); 231 - mask >>= 24; 232 - if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) { 233 - /* 234 - * Other half is allocated, and neither half has had 235 - * its free deferred: add page to head of list, to make 236 - * this freed half available for immediate reuse. 237 - */ 238 - list_add(&ptdesc->pt_list, &mm->context.pgtable_list); 239 - } else { 240 - /* If page is on list, now remove it. 
*/ 241 - list_del_init(&ptdesc->pt_list); 242 - } 243 - spin_unlock_bh(&mm->context.lock); 244 - mask = atomic_xor_bits(&ptdesc->_refcount, 0x10U << (bit + 24)); 245 - mask >>= 24; 246 - if (mask != 0x00U) 247 - return; 248 - half = 0x01U << bit; 249 - } else { 250 - half = 0x03U; 251 - mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); 252 - mask >>= 24; 253 - } 254 - 255 - page_table_release_check(ptdesc_page(ptdesc), table, half, mask); 256 - if (folio_test_clear_active(ptdesc_folio(ptdesc))) 257 - call_rcu(&ptdesc->pt_rcu_head, pte_free_now); 258 - else 259 - pte_free_now(&ptdesc->pt_rcu_head); 310 + pagetable_pte_dtor_free(ptdesc); 260 311 } 261 312 262 313 void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, 263 314 unsigned long vmaddr) 264 315 { 265 316 struct mm_struct *mm; 266 - unsigned int bit, mask; 267 - struct ptdesc *ptdesc = virt_to_ptdesc(table); 268 317 269 318 mm = tlb->mm; 270 - if (mm_alloc_pgste(mm)) { 319 + if (mm_alloc_pgste(mm)) 271 320 gmap_unlink(mm, table, vmaddr); 272 - table = (unsigned long *) ((unsigned long)table | 0x03U); 273 - tlb_remove_ptdesc(tlb, table); 274 - return; 275 - } 276 - bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); 277 - spin_lock_bh(&mm->context.lock); 278 - /* 279 - * Mark the page for delayed release. The actual release will happen 280 - * outside of the critical section from __tlb_remove_table() or from 281 - * page_table_free() 282 - */ 283 - mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24)); 284 - mask >>= 24; 285 - if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) { 286 - /* 287 - * Other half is allocated, and neither half has had 288 - * its free deferred: add page to end of list, to make 289 - * this freed half available for reuse once its pending 290 - * bit has been cleared by __tlb_remove_table(). 
291 - */ 292 - list_add_tail(&ptdesc->pt_list, &mm->context.pgtable_list); 293 - } else { 294 - /* If page is on list, now remove it. */ 295 - list_del_init(&ptdesc->pt_list); 296 - } 297 - spin_unlock_bh(&mm->context.lock); 298 - table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); 321 + table = (unsigned long *)((unsigned long)table | 0x01U); 299 322 tlb_remove_ptdesc(tlb, table); 300 323 } 301 324 302 325 void __tlb_remove_table(void *_table) 303 326 { 304 - unsigned int mask = (unsigned long) _table & 0x03U, half = mask; 305 - void *table = (void *)((unsigned long) _table ^ mask); 306 - struct ptdesc *ptdesc = virt_to_ptdesc(table); 327 + struct ptdesc *ptdesc; 328 + unsigned int mask; 329 + void *table; 307 330 308 - switch (half) { 309 - case 0x00U: /* pmd, pud, or p4d */ 331 + mask = (unsigned long)_table & 0x01U; 332 + table = (void *)((unsigned long)_table ^ mask); 333 + ptdesc = virt_to_ptdesc(table); 334 + if (!mask) { 335 + /* pmd, pud, or p4d */ 310 336 pagetable_free(ptdesc); 311 337 return; 312 - case 0x01U: /* lower 2K of a 4K page table */ 313 - case 0x02U: /* higher 2K of a 4K page table */ 314 - mask = atomic_xor_bits(&ptdesc->_refcount, mask << (4 + 24)); 315 - mask >>= 24; 316 - if (mask != 0x00U) 317 - return; 318 - break; 319 - case 0x03U: /* 4K page table with pgstes */ 320 - mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); 321 - mask >>= 24; 322 - break; 323 338 } 324 - 325 - page_table_release_check(ptdesc_page(ptdesc), table, half, mask); 326 - if (folio_test_clear_active(ptdesc_folio(ptdesc))) 327 - call_rcu(&ptdesc->pt_rcu_head, pte_free_now); 328 - else 329 - pte_free_now(&ptdesc->pt_rcu_head); 339 + pagetable_pte_dtor_free(ptdesc); 330 340 } 331 341 332 342 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 343 + static void pte_free_now(struct rcu_head *head) 344 + { 345 + struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head); 346 + 347 + pagetable_pte_dtor_free(ptdesc); 348 + } 349 + 333 350 void 
pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) 334 351 { 335 - struct page *page; 352 + struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); 336 353 337 - page = virt_to_page(pgtable); 338 - SetPageActive(page); 339 - page_table_free(mm, (unsigned long *)pgtable); 354 + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); 340 355 /* 341 - * page_table_free() does not do the pgste gmap_unlink() which 342 - * page_table_free_rcu() does: warn us if pgste ever reaches here. 356 + * THPs are not allowed for KVM guests. Warn if pgste ever reaches here. 357 + * Turn to the generic pte_free_defer() version once gmap is removed. 343 358 */ 344 359 WARN_ON_ONCE(mm_has_pgste(mm)); 345 360 }