at master 12 kB view raw
1// SPDX-License-Identifier: GPL-2.0 2 3/* 4 * Copyright (c) 2025, Google LLC. 5 * Pasha Tatashin <pasha.tatashin@soleen.com> 6 * 7 * Copyright (C) 2025 Amazon.com Inc. or its affiliates. 8 * Pratyush Yadav <ptyadav@amazon.de> 9 */ 10 11/** 12 * DOC: Memfd Preservation via LUO 13 * 14 * Overview 15 * ======== 16 * 17 * Memory file descriptors (memfd) can be preserved over a kexec using the Live 18 * Update Orchestrator (LUO) file preservation. This allows userspace to 19 * transfer its memory contents to the next kernel after a kexec. 20 * 21 * The preservation is not intended to be transparent. Only select properties of 22 * the file are preserved. All others are reset to default. The preserved 23 * properties are described below. 24 * 25 * .. note:: 26 * The LUO API is not stabilized yet, so the preserved properties of a memfd 27 * are also not stable and are subject to backwards incompatible changes. 28 * 29 * .. note:: 30 * Currently a memfd backed by Hugetlb is not supported. Memfds created 31 * with ``MFD_HUGETLB`` will be rejected. 32 * 33 * Preserved Properties 34 * ==================== 35 * 36 * The following properties of the memfd are preserved across kexec: 37 * 38 * File Contents 39 * All data stored in the file is preserved. 40 * 41 * File Size 42 * The size of the file is preserved. Holes in the file are filled by 43 * allocating pages for them during preservation. 44 * 45 * File Position 46 * The current file position is preserved, allowing applications to continue 47 * reading/writing from their last position. 48 * 49 * File Status Flags 50 * memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property 51 * is maintained. 52 * 53 * Non-Preserved Properties 54 * ======================== 55 * 56 * All properties which are not preserved must be assumed to be reset to 57 * default. This section describes some of those properties which may be more of 58 * note. 59 * 60 * ``FD_CLOEXEC`` flag 61 * A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the 62 * ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set 63 * again after restore via ``fcntl()``. 64 * 65 * Seals 66 * File seals are not preserved. The file is unsealed on restore and if 67 * needed, must be sealed again via ``fcntl()``. 68 */ 69 70#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 71 72#include <linux/bits.h> 73#include <linux/err.h> 74#include <linux/file.h> 75#include <linux/io.h> 76#include <linux/kexec_handover.h> 77#include <linux/kho/abi/memfd.h> 78#include <linux/liveupdate.h> 79#include <linux/shmem_fs.h> 80#include <linux/vmalloc.h> 81#include "internal.h" 82 83static int memfd_luo_preserve_folios(struct file *file, 84 struct kho_vmalloc *kho_vmalloc, 85 struct memfd_luo_folio_ser **out_folios_ser, 86 u64 *nr_foliosp) 87{ 88 struct inode *inode = file_inode(file); 89 struct memfd_luo_folio_ser *folios_ser; 90 unsigned int max_folios; 91 long i, size, nr_pinned; 92 struct folio **folios; 93 int err = -EINVAL; 94 pgoff_t offset; 95 u64 nr_folios; 96 97 size = i_size_read(inode); 98 /* 99 * If the file has zero size, then the folios and nr_folios properties 100 * are not set. 101 */ 102 if (!size) { 103 *nr_foliosp = 0; 104 *out_folios_ser = NULL; 105 memset(kho_vmalloc, 0, sizeof(*kho_vmalloc)); 106 return 0; 107 } 108 109 /* 110 * Guess the number of folios based on inode size. Real number might end 111 * up being smaller if there are higher order folios. 112 */ 113 max_folios = PAGE_ALIGN(size) / PAGE_SIZE; 114 folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL); 115 if (!folios) 116 return -ENOMEM; 117 118 /* 119 * Pin the folios so they don't move around behind our back. This also 120 * ensures none of the folios are in CMA -- which ensures they don't 121 * fall in KHO scratch memory. It also moves swapped out folios back to 122 * memory. 123 * 124 * A side effect of doing this is that it allocates a folio for all 125 * indices in the file. This might waste memory on sparse memfds. If 126 * that is really a problem in the future, we can have a 127 * memfd_pin_folios() variant that does not allocate a page on empty 128 * slots. 129 */ 130 nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios, 131 &offset); 132 if (nr_pinned < 0) { 133 err = nr_pinned; 134 pr_err("failed to pin folios: %d\n", err); 135 goto err_free_folios; 136 } 137 nr_folios = nr_pinned; 138 139 folios_ser = vcalloc(nr_folios, sizeof(*folios_ser)); 140 if (!folios_ser) { 141 err = -ENOMEM; 142 goto err_unpin; 143 } 144 145 for (i = 0; i < nr_folios; i++) { 146 struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; 147 struct folio *folio = folios[i]; 148 unsigned int flags = 0; 149 150 err = kho_preserve_folio(folio); 151 if (err) 152 goto err_unpreserve; 153 154 if (folio_test_dirty(folio)) 155 flags |= MEMFD_LUO_FOLIO_DIRTY; 156 if (folio_test_uptodate(folio)) 157 flags |= MEMFD_LUO_FOLIO_UPTODATE; 158 159 pfolio->pfn = folio_pfn(folio); 160 pfolio->flags = flags; 161 pfolio->index = folio->index; 162 } 163 164 err = kho_preserve_vmalloc(folios_ser, kho_vmalloc); 165 if (err) 166 goto err_unpreserve; 167 168 kvfree(folios); 169 *nr_foliosp = nr_folios; 170 *out_folios_ser = folios_ser; 171 172 /* 173 * Note: folios_ser is purposely not freed here. It is preserved 174 * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer 175 * that is passed via private_data. 176 */ 177 return 0; 178 179err_unpreserve: 180 for (i = i - 1; i >= 0; i--) 181 kho_unpreserve_folio(folios[i]); 182 vfree(folios_ser); 183err_unpin: 184 unpin_folios(folios, nr_folios); 185err_free_folios: 186 kvfree(folios); 187 188 return err; 189} 190 191static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc, 192 struct memfd_luo_folio_ser *folios_ser, 193 u64 nr_folios) 194{ 195 long i; 196 197 if (!nr_folios) 198 return; 199 200 kho_unpreserve_vmalloc(kho_vmalloc); 201 202 for (i = 0; i < nr_folios; i++) { 203 const struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; 204 struct folio *folio; 205 206 if (!pfolio->pfn) 207 continue; 208 209 folio = pfn_folio(pfolio->pfn); 210 211 kho_unpreserve_folio(folio); 212 unpin_folio(folio); 213 } 214 215 vfree(folios_ser); 216} 217 218static int memfd_luo_preserve(struct liveupdate_file_op_args *args) 219{ 220 struct inode *inode = file_inode(args->file); 221 struct memfd_luo_folio_ser *folios_ser; 222 struct memfd_luo_ser *ser; 223 u64 nr_folios; 224 int err = 0; 225 226 inode_lock(inode); 227 shmem_freeze(inode, true); 228 229 /* Allocate the main serialization structure in preserved memory */ 230 ser = kho_alloc_preserve(sizeof(*ser)); 231 if (IS_ERR(ser)) { 232 err = PTR_ERR(ser); 233 goto err_unlock; 234 } 235 236 ser->pos = args->file->f_pos; 237 ser->size = i_size_read(inode); 238 239 err = memfd_luo_preserve_folios(args->file, &ser->folios, 240 &folios_ser, &nr_folios); 241 if (err) 242 goto err_free_ser; 243 244 ser->nr_folios = nr_folios; 245 inode_unlock(inode); 246 247 args->private_data = folios_ser; 248 args->serialized_data = virt_to_phys(ser); 249 250 return 0; 251 252err_free_ser: 253 kho_unpreserve_free(ser); 254err_unlock: 255 shmem_freeze(inode, false); 256 inode_unlock(inode); 257 return err; 258} 259 260static int memfd_luo_freeze(struct liveupdate_file_op_args *args) 261{ 262 struct memfd_luo_ser *ser; 263 264 if (WARN_ON_ONCE(!args->serialized_data)) 265 return -EINVAL; 266 267 ser = phys_to_virt(args->serialized_data); 268 269 /* 270 * The pos might have changed since prepare. Everything else stays the 271 * same. 272 */ 273 ser->pos = args->file->f_pos; 274 275 return 0; 276} 277 278static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args) 279{ 280 struct inode *inode = file_inode(args->file); 281 struct memfd_luo_ser *ser; 282 283 if (WARN_ON_ONCE(!args->serialized_data)) 284 return; 285 286 inode_lock(inode); 287 shmem_freeze(inode, false); 288 289 ser = phys_to_virt(args->serialized_data); 290 291 memfd_luo_unpreserve_folios(&ser->folios, args->private_data, 292 ser->nr_folios); 293 294 kho_unpreserve_free(ser); 295 inode_unlock(inode); 296} 297 298static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser, 299 u64 nr_folios) 300{ 301 u64 i; 302 303 for (i = 0; i < nr_folios; i++) { 304 const struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; 305 struct folio *folio; 306 phys_addr_t phys; 307 308 if (!pfolio->pfn) 309 continue; 310 311 phys = PFN_PHYS(pfolio->pfn); 312 folio = kho_restore_folio(phys); 313 if (!folio) { 314 pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n", 315 phys); 316 continue; 317 } 318 319 folio_put(folio); 320 } 321} 322 323static void memfd_luo_finish(struct liveupdate_file_op_args *args) 324{ 325 struct memfd_luo_folio_ser *folios_ser; 326 struct memfd_luo_ser *ser; 327 328 if (args->retrieved) 329 return; 330 331 ser = phys_to_virt(args->serialized_data); 332 if (!ser) 333 return; 334 335 if (ser->nr_folios) { 336 folios_ser = kho_restore_vmalloc(&ser->folios); 337 if (!folios_ser) 338 goto out; 339 340 memfd_luo_discard_folios(folios_ser, ser->nr_folios); 341 vfree(folios_ser); 342 } 343 344out: 345 kho_restore_free(ser); 346} 347 348static int memfd_luo_retrieve_folios(struct file *file, 349 struct memfd_luo_folio_ser *folios_ser, 350 u64 nr_folios) 351{ 352 struct inode *inode = file_inode(file); 353 struct address_space *mapping = inode->i_mapping; 354 struct folio *folio; 355 int err = -EIO; 356 long i; 357 358 for (i = 0; i < nr_folios; i++) { 359 const struct memfd_luo_folio_ser *pfolio = &folios_ser[i]; 360 phys_addr_t phys; 361 u64 index; 362 int flags; 363 364 if (!pfolio->pfn) 365 continue; 366 367 phys = PFN_PHYS(pfolio->pfn); 368 folio = kho_restore_folio(phys); 369 if (!folio) { 370 pr_err("Unable to restore folio at physical address: %llx\n", 371 phys); 372 goto put_folios; 373 } 374 index = pfolio->index; 375 flags = pfolio->flags; 376 377 /* Set up the folio for insertion. */ 378 __folio_set_locked(folio); 379 __folio_set_swapbacked(folio); 380 381 err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping)); 382 if (err) { 383 pr_err("shmem: failed to charge folio index %ld: %d\n", 384 i, err); 385 goto unlock_folio; 386 } 387 388 err = shmem_add_to_page_cache(folio, mapping, index, NULL, 389 mapping_gfp_mask(mapping)); 390 if (err) { 391 pr_err("shmem: failed to add to page cache folio index %ld: %d\n", 392 i, err); 393 goto unlock_folio; 394 } 395 396 if (flags & MEMFD_LUO_FOLIO_UPTODATE) 397 folio_mark_uptodate(folio); 398 if (flags & MEMFD_LUO_FOLIO_DIRTY) 399 folio_mark_dirty(folio); 400 401 err = shmem_inode_acct_blocks(inode, 1); 402 if (err) { 403 pr_err("shmem: failed to account folio index %ld: %d\n", 404 i, err); 405 goto unlock_folio; 406 } 407 408 shmem_recalc_inode(inode, 1, 0); 409 folio_add_lru(folio); 410 folio_unlock(folio); 411 folio_put(folio); 412 } 413 414 return 0; 415 416unlock_folio: 417 folio_unlock(folio); 418 folio_put(folio); 419put_folios: 420 /* 421 * Note: don't free the folios already added to the file. They will be 422 * freed when the file is freed. Free the ones not added yet here. 423 */ 424 for (long j = i + 1; j < nr_folios; j++) { 425 const struct memfd_luo_folio_ser *pfolio = &folios_ser[j]; 426 427 folio = kho_restore_folio(pfolio->pfn); 428 if (folio) 429 folio_put(folio); 430 } 431 432 return err; 433} 434 435static int memfd_luo_retrieve(struct liveupdate_file_op_args *args) 436{ 437 struct memfd_luo_folio_ser *folios_ser; 438 struct memfd_luo_ser *ser; 439 struct file *file; 440 int err; 441 442 ser = phys_to_virt(args->serialized_data); 443 if (!ser) 444 return -EINVAL; 445 446 file = shmem_file_setup("", 0, VM_NORESERVE); 447 448 if (IS_ERR(file)) { 449 pr_err("failed to setup file: %pe\n", file); 450 return PTR_ERR(file); 451 } 452 453 vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE); 454 file->f_inode->i_size = ser->size; 455 456 if (ser->nr_folios) { 457 folios_ser = kho_restore_vmalloc(&ser->folios); 458 if (!folios_ser) { 459 err = -EINVAL; 460 goto put_file; 461 } 462 463 err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios); 464 vfree(folios_ser); 465 if (err) 466 goto put_file; 467 } 468 469 args->file = file; 470 kho_restore_free(ser); 471 472 return 0; 473 474put_file: 475 fput(file); 476 477 return err; 478} 479 480static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler, 481 struct file *file) 482{ 483 struct inode *inode = file_inode(file); 484 485 return shmem_file(file) && !inode->i_nlink; 486} 487 488static const struct liveupdate_file_ops memfd_luo_file_ops = { 489 .freeze = memfd_luo_freeze, 490 .finish = memfd_luo_finish, 491 .retrieve = memfd_luo_retrieve, 492 .preserve = memfd_luo_preserve, 493 .unpreserve = memfd_luo_unpreserve, 494 .can_preserve = memfd_luo_can_preserve, 495 .owner = THIS_MODULE, 496}; 497 498static struct liveupdate_file_handler memfd_luo_handler = { 499 .ops = &memfd_luo_file_ops, 500 .compatible = MEMFD_LUO_FH_COMPATIBLE, 501}; 502 503static int __init memfd_luo_init(void) 504{ 505 int err = liveupdate_register_file_handler(&memfd_luo_handler); 506 507 if (err && err != -EOPNOTSUPP) { 508 pr_err("Could not register luo filesystem handler: %pe\n", 509 ERR_PTR(err)); 510 511 return err; 512 } 513 514 return 0; 515} 516late_initcall(memfd_luo_init);