···130130Douglas Gilbert <dougg@torque.net>131131Ed L. Cashin <ecashin@coraid.com>132132Erik Kaneda <erik.kaneda@intel.com> <erik.schmauss@intel.com>133133+Eugen Hristev <eugen.hristev@collabora.com> <eugen.hristev@microchip.com>133134Evgeniy Polyakov <johnpol@2ka.mipt.ru>134135Ezequiel Garcia <ezequiel@vanguardiasur.com.ar> <ezequiel@collabora.com>135136Felipe W Damasio <felipewd@terra.com.br>
+6-9
Documentation/admin-guide/cgroup-v2.rst
···12451245 This is a simple interface to trigger memory reclaim in the12461246 target cgroup.1247124712481248- This file accepts a string which contains the number of bytes to12491249- reclaim.12481248+ This file accepts a single key, the number of bytes to reclaim.12491249+ No nested keys are currently supported.1250125012511251 Example::1252125212531253 echo "1G" > memory.reclaim12541254+12551255+ The interface can be later extended with nested keys to12561256+ configure the reclaim behavior. For example, specify the12571257+ type of memory to reclaim from (anon, file, ..).1254125812551259 Please note that the kernel can over or under reclaim from12561260 the target cgroup. If less bytes are reclaimed than the···12661262 the memory reclaim normally is not exercised in this case.12671263 This means that the networking layer will not adapt based on12681264 reclaim induced by memory.reclaim.12691269-12701270- This file also allows the user to specify the nodes to reclaim from,12711271- via the 'nodes=' key, for example::12721272-12731273- echo "1G nodes=0,1" > memory.reclaim12741274-12751275- The above instructs the kernel to reclaim memory from nodes 0,1.1276126512771266 memory.peak12781267 A read-only single value file which exists on non-root
+5-2
arch/ia64/kernel/sys_ia64.c
···170170asmlinkage long171171ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user *tp)172172{173173+ struct timespec64 rtn_tp;174174+ s64 tick_ns;175175+173176 /*174177 * ia64's clock_gettime() syscall is implemented as a vdso call175178 * fsys_clock_gettime(). Currently it handles only···188185 switch (which_clock) {189186 case CLOCK_REALTIME:190187 case CLOCK_MONOTONIC:191191- s64 tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq);192192- struct timespec64 rtn_tp = ns_to_timespec64(tick_ns);188188+ tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq);189189+ rtn_tp = ns_to_timespec64(tick_ns);193190 return put_timespec64(&rtn_tp, tp);194191 }195192
+1
arch/sh/kernel/vmlinux.lds.S
···44 * Written by Niibe Yutaka and Paul Mundt55 */66OUTPUT_ARCH(sh)77+#define RUNTIME_DISCARD_EXIT78#include <asm/thread_info.h>89#include <asm/cache.h>910#include <asm/vmlinux.lds.h>
+1-5
drivers/of/fdt.c
···2626#include <linux/serial_core.h>2727#include <linux/sysfs.h>2828#include <linux/random.h>2929-#include <linux/kmemleak.h>30293130#include <asm/setup.h> /* for COMMAND_LINE_SIZE */3231#include <asm/page.h>···524525 size = dt_mem_next_cell(dt_root_size_cells, &prop);525526526527 if (size &&527527- early_init_dt_reserve_memory(base, size, nomap) == 0) {528528+ early_init_dt_reserve_memory(base, size, nomap) == 0)528529 pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n",529530 uname, &base, (unsigned long)(size / SZ_1M));530530- if (!nomap)531531- kmemleak_alloc_phys(base, size, 0);532532- }533531 else534532 pr_err("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n",535533 uname, &base, (unsigned long)(size / SZ_1M));
+1-1
fs/freevxfs/Kconfig
···88 of SCO UnixWare (and possibly others) and optionally available99 for Sunsoft Solaris, HP-UX and many other operating systems. However1010 these particular OS implementations of vxfs may differ in on-disk1111- data endianess and/or superblock offset. The vxfs module has been1111+ data endianness and/or superblock offset. The vxfs module has been1212 tested with SCO UnixWare and HP-UX B.10.20 (pa-risc 1.1 arch.)1313 Currently only readonly access is supported and VxFX versions1414 2, 3 and 4. Tests were performed with HP-UX VxFS version 3.
+1-3
fs/proc/task_mmu.c
···745745 page = pfn_swap_entry_to_page(swpent);746746 }747747 if (page) {748748- int mapcount = page_mapcount(page);749749-750750- if (mapcount >= 2)748748+ if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte))751749 mss->shared_hugetlb += huge_page_size(hstate_vma(vma));752750 else753751 mss->private_hugetlb += huge_page_size(hstate_vma(vma));
···6363 long long bytes_used;6464 unsigned int inodes;6565 unsigned int fragments;6666- int xattr_ids;6666+ unsigned int xattr_ids;6767 unsigned int ids;6868 bool panic_on_errors;6969 const struct squashfs_decompressor_thread_ops *thread_ops;
+2-2
fs/squashfs/xattr.h
···10101111#ifdef CONFIG_SQUASHFS_XATTR1212extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,1313- u64 *, int *);1313+ u64 *, unsigned int *);1414extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,1515 unsigned int *, unsigned long long *);1616#else1717static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,1818- u64 start, u64 *xattr_table_start, int *xattr_ids)1818+ u64 start, u64 *xattr_table_start, unsigned int *xattr_ids)1919{2020 struct squashfs_xattr_id_table *id_table;2121
+2-2
fs/squashfs/xattr_id.c
···5656 * Read uncompressed xattr id lookup table indexes from disk into memory5757 */5858__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start,5959- u64 *xattr_table_start, int *xattr_ids)5959+ u64 *xattr_table_start, unsigned int *xattr_ids)6060{6161 struct squashfs_sb_info *msblk = sb->s_fs_info;6262 unsigned int len, indexes;···7676 /* Sanity check values */77777878 /* there is always at least one xattr id */7979- if (*xattr_ids == 0)7979+ if (*xattr_ids <= 0)8080 return ERR_PTR(-EINVAL);81818282 len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
···16881688static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,16891689 struct bdi_writeback *wb)16901690{16911691+ struct mem_cgroup *memcg;16921692+16911693 if (mem_cgroup_disabled())16921694 return;1693169516941694- if (unlikely(&folio_memcg(folio)->css != wb->memcg_css))16961696+ memcg = folio_memcg(folio);16971697+ if (unlikely(memcg && &memcg->css != wb->memcg_css))16951698 mem_cgroup_track_foreign_dirty_slowpath(folio, wb);16961699}16971700
+1-2
include/linux/swap.h
···418418extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,419419 unsigned long nr_pages,420420 gfp_t gfp_mask,421421- unsigned int reclaim_options,422422- nodemask_t *nodemask);421421+ unsigned int reclaim_options);423422extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,424423 gfp_t gfp_mask, bool noswap,425424 pg_data_t *pgdat,
+2-1
lib/Kconfig.debug
···754754 select KALLSYMS755755 select CRC32756756 select STACKDEPOT757757+ select STACKDEPOT_ALWAYS_INIT if !DEBUG_KMEMLEAK_DEFAULT_OFF757758 help758759 Say Y here if you want to enable the memory leak759760 detector. The memory allocation/freeing is traced in a way···12081207 depends on DEBUG_KERNEL && PROC_FS12091208 default y12101209 help12111211- If you say Y here, the /proc/sched_debug file will be provided12101210+ If you say Y here, the /sys/kernel/debug/sched file will be provided12121211 that can help debug the scheduler. The runtime overhead of this12131212 option is minimal.12141213
+11-11
lib/maple_tree.c
···667667 unsigned char piv)668668{669669 struct maple_node *node = mte_to_node(mn);670670+ enum maple_type type = mte_node_type(mn);670671671671- if (piv >= mt_pivots[piv]) {672672+ if (piv >= mt_pivots[type]) {672673 WARN_ON(1);673674 return 0;674675 }675675- switch (mte_node_type(mn)) {676676+ switch (type) {676677 case maple_arange_64:677678 return node->ma64.pivot[piv];678679 case maple_range_64:···48774876 unsigned long *pivots, *gaps;48784877 void __rcu **slots;48794878 unsigned long gap = 0;48804880- unsigned long max, min, index;48794879+ unsigned long max, min;48814880 unsigned char offset;4882488148834882 if (unlikely(mas_is_err(mas)))···48994898 min = mas_safe_min(mas, pivots, --offset);4900489949014900 max = mas_safe_pivot(mas, pivots, offset, type);49024902- index = mas->index;49034903- while (index <= max) {49014901+ while (mas->index <= max) {49044902 gap = 0;49054903 if (gaps)49064904 gap = gaps[offset];···49304930 min = mas_safe_min(mas, pivots, offset);49314931 }4932493249334933- if (unlikely(index > max)) {49344934- mas_set_err(mas, -EBUSY);49354935- return false;49364936- }49334933+ if (unlikely((mas->index > max) || (size - 1 > max - mas->index)))49344934+ goto no_space;4937493549384936 if (unlikely(ma_is_leaf(type))) {49394937 mas->offset = offset;···49484950 return false;4949495149504952ascend:49514951- if (mte_is_root(mas->node))49524952- mas_set_err(mas, -EBUSY);49534953+ if (!mte_is_root(mas->node))49544954+ return false;4953495549564956+no_space:49574957+ mas_set_err(mas, -EBUSY);49544958 return false;49554959}49564960
+89
lib/test_maple_tree.c
···25172517 mt_set_non_kernel(0);25182518}2519251925202520+static noinline void check_empty_area_window(struct maple_tree *mt)25212521+{25222522+ unsigned long i, nr_entries = 20;25232523+ MA_STATE(mas, mt, 0, 0);25242524+25252525+ for (i = 1; i <= nr_entries; i++)25262526+ mtree_store_range(mt, i*10, i*10 + 9,25272527+ xa_mk_value(i), GFP_KERNEL);25282528+25292529+ /* Create another hole besides the one at 0 */25302530+ mtree_store_range(mt, 160, 169, NULL, GFP_KERNEL);25312531+25322532+ /* Check lower bounds that don't fit */25332533+ rcu_read_lock();25342534+ MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 90, 10) != -EBUSY);25352535+25362536+ mas_reset(&mas);25372537+ MT_BUG_ON(mt, mas_empty_area_rev(&mas, 6, 90, 5) != -EBUSY);25382538+25392539+ /* Check lower bound that does fit */25402540+ mas_reset(&mas);25412541+ MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 90, 5) != 0);25422542+ MT_BUG_ON(mt, mas.index != 5);25432543+ MT_BUG_ON(mt, mas.last != 9);25442544+ rcu_read_unlock();25452545+25462546+ /* Check one gap that doesn't fit and one that does */25472547+ rcu_read_lock();25482548+ mas_reset(&mas);25492549+ MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 217, 9) != 0);25502550+ MT_BUG_ON(mt, mas.index != 161);25512551+ MT_BUG_ON(mt, mas.last != 169);25522552+25532553+ /* Check one gap that does fit above the min */25542554+ mas_reset(&mas);25552555+ MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 218, 3) != 0);25562556+ MT_BUG_ON(mt, mas.index != 216);25572557+ MT_BUG_ON(mt, mas.last != 218);25582558+25592559+ /* Check size that doesn't fit any gap */25602560+ mas_reset(&mas);25612561+ MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 218, 16) != -EBUSY);25622562+25632563+ /*25642564+ * Check size that doesn't fit the lower end of the window but25652565+ * does fit the gap25662566+ */25672567+ mas_reset(&mas);25682568+ MT_BUG_ON(mt, mas_empty_area_rev(&mas, 167, 200, 4) != -EBUSY);25692569+25702570+ /*25712571+ * Check size that doesn't fit the upper end of the window but25722572+ * does fit the gap25732573+ */25742574+ mas_reset(&mas);25752575+ MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 162, 4) != -EBUSY);25762576+25772577+ /* Check mas_empty_area forward */25782578+ mas_reset(&mas);25792579+ MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 9) != 0);25802580+ MT_BUG_ON(mt, mas.index != 0);25812581+ MT_BUG_ON(mt, mas.last != 8);25822582+25832583+ mas_reset(&mas);25842584+ MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 4) != 0);25852585+ MT_BUG_ON(mt, mas.index != 0);25862586+ MT_BUG_ON(mt, mas.last != 3);25872587+25882588+ mas_reset(&mas);25892589+ MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 11) != -EBUSY);25902590+25912591+ mas_reset(&mas);25922592+ MT_BUG_ON(mt, mas_empty_area(&mas, 5, 100, 6) != -EBUSY);25932593+25942594+ mas_reset(&mas);25952595+ MT_BUG_ON(mt, mas_empty_area(&mas, 0, 8, 10) != -EBUSY);25962596+25972597+ mas_reset(&mas);25982598+ mas_empty_area(&mas, 100, 165, 3);25992599+26002600+ mas_reset(&mas);26012601+ MT_BUG_ON(mt, mas_empty_area(&mas, 100, 163, 6) != -EBUSY);26022602+ rcu_read_unlock();26032603+}26042604+25202605static DEFINE_MTREE(tree);25212606static int maple_tree_seed(void)25222607{···2848276328492764 mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);28502765 check_bnode_min_spanning(&tree);27662766+ mtree_destroy(&tree);27672767+27682768+ mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);27692769+ check_empty_area_window(&tree);28512770 mtree_destroy(&tree);2852277128532772#if defined(BENCH)
+21-1
mm/khugepaged.c
···847847 return SCAN_SUCCEED;848848}849849850850+/*851851+ * See pmd_trans_unstable() for how the result may change out from852852+ * underneath us, even if we hold mmap_lock in read.853853+ */850854static int find_pmd_or_thp_or_none(struct mm_struct *mm,851855 unsigned long address,852856 pmd_t **pmd)···869865#endif870866 if (pmd_none(pmde))871867 return SCAN_PMD_NONE;868868+ if (!pmd_present(pmde))869869+ return SCAN_PMD_NULL;872870 if (pmd_trans_huge(pmde))873871 return SCAN_PMD_MAPPED;872872+ if (pmd_devmap(pmde))873873+ return SCAN_PMD_NULL;874874 if (pmd_bad(pmde))875875 return SCAN_PMD_NULL;876876 return SCAN_SUCCEED;···16501642 * has higher cost too. It would also probably require locking16511643 * the anon_vma.16521644 */16531653- if (vma->anon_vma) {16451645+ if (READ_ONCE(vma->anon_vma)) {16541646 result = SCAN_PAGE_ANON;16551647 goto next;16561648 }···16781670 result = SCAN_PTE_MAPPED_HUGEPAGE;16791671 if ((cc->is_khugepaged || is_target) &&16801672 mmap_write_trylock(mm)) {16731673+ /*16741674+ * Re-check whether we have an ->anon_vma, because16751675+ * collapse_and_free_pmd() requires that either no16761676+ * ->anon_vma exists or the anon_vma is locked.16771677+ * We already checked ->anon_vma above, but that check16781678+ * is racy because ->anon_vma can be populated under the16791679+ * mmap lock in read mode.16801680+ */16811681+ if (vma->anon_vma) {16821682+ result = SCAN_PAGE_ANON;16831683+ goto unlock_next;16841684+ }16811685 /*16821686 * When a vma is registered with uffd-wp, we can't16831687 * recycle the pmd pgtable because there can be pte
···600600601601 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */602602 if (flags & (MPOL_MF_MOVE_ALL) ||603603- (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {603603+ (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 &&604604+ !hugetlb_pmd_shared(pte))) {604605 if (isolate_hugetlb(page, qp->pagelist) &&605606 (flags & MPOL_MF_STRICT))606607 /*
+19-6
mm/mremap.c
···10271027 }1028102810291029 /*10301030- * Function vma_merge() is called on the extension we are adding to10311031- * the already existing vma, vma_merge() will merge this extension with10321032- * the already existing vma (expand operation itself) and possibly also10331033- * with the next vma if it becomes adjacent to the expanded vma and10341034- * otherwise compatible.10301030+ * Function vma_merge() is called on the extension we10311031+ * are adding to the already existing vma, vma_merge()10321032+ * will merge this extension with the already existing10331033+ * vma (expand operation itself) and possibly also with10341034+ * the next vma if it becomes adjacent to the expanded10351035+ * vma and otherwise compatible.10361036+ *10371037+ * However, vma_merge() can currently fail due to10381038+ * is_mergeable_vma() check for vm_ops->close (see the10391039+ * comment there). Yet this should not prevent vma10401040+ * expanding, so perform a simple expand for such vma.10411041+ * Ideally the check for close op should be only done10421042+ * when a vma would be actually removed due to a merge.10351043 */10361036- vma = vma_merge(mm, vma, extension_start, extension_end,10441044+ if (!vma->vm_ops || !vma->vm_ops->close) {10451045+ vma = vma_merge(mm, vma, extension_start, extension_end,10371046 vma->vm_flags, vma->anon_vma, vma->vm_file,10381047 extension_pgoff, vma_policy(vma),10391048 vma->vm_userfaultfd_ctx, anon_vma_name(vma));10491049+ } else if (vma_adjust(vma, vma->vm_start, addr + new_len,10501050+ vma->vm_pgoff, NULL)) {10511051+ vma = NULL;10521052+ }10401053 if (!vma) {10411054 vm_unacct_memory(pages);10421055 ret = -ENOMEM;
···33353335 if (mem_cgroup_disabled())33363336 return;3337333733383338+ /* migration can happen before addition */33393339+ if (!mm->lru_gen.memcg)33403340+ return;33413341+33383342 rcu_read_lock();33393343 memcg = mem_cgroup_from_task(task);33403344 rcu_read_unlock();33413345 if (memcg == mm->lru_gen.memcg)33423346 return;3343334733443344- VM_WARN_ON_ONCE(!mm->lru_gen.memcg);33453348 VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));3346334933473350 lru_gen_del_mm(mm);···70257022unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,70267023 unsigned long nr_pages,70277024 gfp_t gfp_mask,70287028- unsigned int reclaim_options,70297029- nodemask_t *nodemask)70257025+ unsigned int reclaim_options)70307026{70317027 unsigned long nr_reclaimed;70327028 unsigned int noreclaim_flag;···70407038 .may_unmap = 1,70417039 .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),70427040 .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),70437043- .nodemask = nodemask,70447041 };70457042 /*70467043 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
+205-32
mm/zsmalloc.c
···113113 * have room for two bit at least.114114 */115115#define OBJ_ALLOCATED_TAG 1116116-#define OBJ_TAG_BITS 1116116+117117+#ifdef CONFIG_ZPOOL118118+/*119119+ * The second least-significant bit in the object's header identifies if the120120+ * value stored at the header is a deferred handle from the last reclaim121121+ * attempt.122122+ *123123+ * As noted above, this is valid because we have room for two bits.124124+ */125125+#define OBJ_DEFERRED_HANDLE_TAG 2126126+#define OBJ_TAG_BITS 2127127+#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)128128+#else129129+#define OBJ_TAG_BITS 1130130+#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG131131+#endif /* CONFIG_ZPOOL */132132+117133#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)118134#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)119135···238222 * Handle of allocated object.239223 */240224 unsigned long handle;225225+#ifdef CONFIG_ZPOOL226226+ /*227227+ * Deferred handle of a reclaimed object.228228+ */229229+ unsigned long deferred_handle;230230+#endif241231 };242232};243233···294272 /* links the zspage to the lru list in the pool */295273 struct list_head lru;296274 bool under_reclaim;297297- /* list of unfreed handles whose objects have been reclaimed */298298- unsigned long *deferred_handles;299275#endif300276301277 struct zs_pool *pool;···917897 return *(unsigned long *)handle;918898}919899920920-static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)900900+static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle,901901+ int tag)921902{922903 unsigned long handle;923904 struct zspage *zspage = get_zspage(page);···929908 } else930909 handle = *(unsigned long *)obj;931910932932- if (!(handle & OBJ_ALLOCATED_TAG))911911+ if (!(handle & tag))933912 return false;934913935935- *phandle = handle & ~OBJ_ALLOCATED_TAG;914914+ /* Clear all tags before returning the handle */915915+ *phandle = handle & ~OBJ_TAG_MASK;936916 return true;937917}918918+919919+static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)920920+{921921+ return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG);922922+}923923+924924+#ifdef CONFIG_ZPOOL925925+static bool obj_stores_deferred_handle(struct page *page, void *obj,926926+ unsigned long *phandle)927927+{928928+ return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG);929929+}930930+#endif938931939932static void reset_page(struct page *page)940933{···981946}982947983948#ifdef CONFIG_ZPOOL949949+static unsigned long find_deferred_handle_obj(struct size_class *class,950950+ struct page *page, int *obj_idx);951951+984952/*985953 * Free all the deferred handles whose objects are freed in zs_free.986954 */987987-static void free_handles(struct zs_pool *pool, struct zspage *zspage)955955+static void free_handles(struct zs_pool *pool, struct size_class *class,956956+ struct zspage *zspage)988957{989989- unsigned long handle = (unsigned long)zspage->deferred_handles;958958+ int obj_idx = 0;959959+ struct page *page = get_first_page(zspage);960960+ unsigned long handle;990961991991- while (handle) {992992- unsigned long nxt_handle = handle_to_obj(handle);962962+ while (1) {963963+ handle = find_deferred_handle_obj(class, page, &obj_idx);964964+ if (!handle) {965965+ page = get_next_page(page);966966+ if (!page)967967+ break;968968+ obj_idx = 0;969969+ continue;970970+ }993971994972 cache_free_handle(pool, handle);995995- handle = nxt_handle;973973+ obj_idx++;996974 }997975}998976#else999999-static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {}977977+static inline void free_handles(struct zs_pool *pool, struct size_class *class,978978+ struct zspage *zspage) {}1000979#endif10019801002981static void __free_zspage(struct zs_pool *pool, struct size_class *class,···1028979 VM_BUG_ON(fg != ZS_EMPTY);10299801030981 /* Free all deferred handles from zs_free */10311031- free_handles(pool, zspage);982982+ free_handles(pool, class, zspage);10329831033984 next = page = get_first_page(zspage);1034985 do {···11161067#ifdef CONFIG_ZPOOL11171068 INIT_LIST_HEAD(&zspage->lru);11181069 zspage->under_reclaim = false;11191119- zspage->deferred_handles = NULL;11201070#endif1121107111221072 set_freeobj(zspage, 0);···16161568}16171569EXPORT_SYMBOL_GPL(zs_malloc);1618157016191619-static void obj_free(int class_size, unsigned long obj)15711571+static void obj_free(int class_size, unsigned long obj, unsigned long *handle)16201572{16211573 struct link_free *link;16221574 struct zspage *zspage;···16301582 zspage = get_zspage(f_page);1631158316321584 vaddr = kmap_atomic(f_page);16331633-16341634- /* Insert this object in containing zspage's freelist */16351585 link = (struct link_free *)(vaddr + f_offset);16361636- if (likely(!ZsHugePage(zspage)))16371637- link->next = get_freeobj(zspage) << OBJ_TAG_BITS;16381638- else16391639- f_page->index = 0;15861586+15871587+ if (handle) {15881588+#ifdef CONFIG_ZPOOL15891589+ /* Stores the (deferred) handle in the object's header */15901590+ *handle |= OBJ_DEFERRED_HANDLE_TAG;15911591+ *handle &= ~OBJ_ALLOCATED_TAG;15921592+15931593+ if (likely(!ZsHugePage(zspage)))15941594+ link->deferred_handle = *handle;15951595+ else15961596+ f_page->index = *handle;15971597+#endif15981598+ } else {15991599+ /* Insert this object in containing zspage's freelist */16001600+ if (likely(!ZsHugePage(zspage)))16011601+ link->next = get_freeobj(zspage) << OBJ_TAG_BITS;16021602+ else16031603+ f_page->index = 0;16041604+ set_freeobj(zspage, f_objidx);16051605+ }16061606+16401607 kunmap_atomic(vaddr);16411641- set_freeobj(zspage, f_objidx);16421608 mod_zspage_inuse(zspage, -1);16431609}16441610···16771615 zspage = get_zspage(f_page);16781616 class = zspage_class(pool, zspage);1679161716801680- obj_free(class->size, obj);16811618 class_stat_dec(class, OBJ_USED, 1);1682161916831620#ifdef CONFIG_ZPOOL···16851624 * Reclaim needs the handles during writeback. It'll free16861625 * them along with the zspage when it's done with them.16871626 *16881688- * Record current deferred handle at the memory location16891689- * whose address is given by handle.16271627+ * Record current deferred handle in the object's header.16901628 */16911691- record_obj(handle, (unsigned long)zspage->deferred_handles);16921692- zspage->deferred_handles = (unsigned long *)handle;16291629+ obj_free(class->size, obj, &handle);16931630 spin_unlock(&pool->lock);16941631 return;16951632 }16961633#endif16341634+ obj_free(class->size, obj, NULL);16351635+16971636 fullness = fix_fullness_group(class, zspage);16981637 if (fullness == ZS_EMPTY)16991638 free_zspage(pool, class, zspage);···17741713}1775171417761715/*17771777- * Find alloced object in zspage from index object and17161716+ * Find object with a certain tag in zspage from index object and17781717 * return handle.17791718 */17801780-static unsigned long find_alloced_obj(struct size_class *class,17811781- struct page *page, int *obj_idx)17191719+static unsigned long find_tagged_obj(struct size_class *class,17201720+ struct page *page, int *obj_idx, int tag)17821721{17831722 unsigned int offset;17841723 int index = *obj_idx;···17891728 offset += class->size * index;1790172917911730 while (offset < PAGE_SIZE) {17921792- if (obj_allocated(page, addr + offset, &handle))17311731+ if (obj_tagged(page, addr + offset, &handle, tag))17931732 break;1794173317951734 offset += class->size;···1802174118031742 return handle;18041743}17441744+17451745+/*17461746+ * Find alloced object in zspage from index object and17471747+ * return handle.17481748+ */17491749+static unsigned long find_alloced_obj(struct size_class *class,17501750+ struct page *page, int *obj_idx)17511751+{17521752+ return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG);17531753+}17541754+17551755+#ifdef CONFIG_ZPOOL17561756+/*17571757+ * Find object storing a deferred handle in header in zspage from index object17581758+ * and return handle.17591759+ */17601760+static unsigned long find_deferred_handle_obj(struct size_class *class,17611761+ struct page *page, int *obj_idx)17621762+{17631763+ return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG);17641764+}17651765+#endif1805176618061767struct zs_compact_control {18071768 /* Source spage for migration which could be a subpage of zspage */···18671784 zs_object_copy(class, free_obj, used_obj);18681785 obj_idx++;18691786 record_obj(handle, free_obj);18701870- obj_free(class->size, used_obj);17871787+ obj_free(class->size, used_obj, NULL);18711788 }1872178918731790 /* Remember last position in this iteration */···25582475EXPORT_SYMBOL_GPL(zs_destroy_pool);2559247625602477#ifdef CONFIG_ZPOOL24782478+static void restore_freelist(struct zs_pool *pool, struct size_class *class,24792479+ struct zspage *zspage)24802480+{24812481+ unsigned int obj_idx = 0;24822482+ unsigned long handle, off = 0; /* off is within-page offset */24832483+ struct page *page = get_first_page(zspage);24842484+ struct link_free *prev_free = NULL;24852485+ void *prev_page_vaddr = NULL;24862486+24872487+ /* in case no free object found */24882488+ set_freeobj(zspage, (unsigned int)(-1UL));24892489+24902490+ while (page) {24912491+ void *vaddr = kmap_atomic(page);24922492+ struct page *next_page;24932493+24942494+ while (off < PAGE_SIZE) {24952495+ void *obj_addr = vaddr + off;24962496+24972497+ /* skip allocated object */24982498+ if (obj_allocated(page, obj_addr, &handle)) {24992499+ obj_idx++;25002500+ off += class->size;25012501+ continue;25022502+ }25032503+25042504+ /* free deferred handle from reclaim attempt */25052505+ if (obj_stores_deferred_handle(page, obj_addr, &handle))25062506+ cache_free_handle(pool, handle);25072507+25082508+ if (prev_free)25092509+ prev_free->next = obj_idx << OBJ_TAG_BITS;25102510+ else /* first free object found */25112511+ set_freeobj(zspage, obj_idx);25122512+25132513+ prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free);25142514+ /* if last free object in a previous page, need to unmap */25152515+ if (prev_page_vaddr) {25162516+ kunmap_atomic(prev_page_vaddr);25172517+ prev_page_vaddr = NULL;25182518+ }25192519+25202520+ obj_idx++;25212521+ off += class->size;25222522+ }25232523+25242524+ /*25252525+ * Handle the last (full or partial) object on this page.25262526+ */25272527+ next_page = get_next_page(page);25282528+ if (next_page) {25292529+ if (!prev_free || prev_page_vaddr) {25302530+ /*25312531+ * There is no free object in this page, so we can safely25322532+ * unmap it.25332533+ */25342534+ kunmap_atomic(vaddr);25352535+ } else {25362536+ /* update prev_page_vaddr since prev_free is on this page */25372537+ prev_page_vaddr = vaddr;25382538+ }25392539+ } else { /* this is the last page */25402540+ if (prev_free) {25412541+ /*25422542+ * Reset OBJ_TAG_BITS bit to last link to tell25432543+ * whether it's allocated object or not.25442544+ */25452545+ prev_free->next = -1UL << OBJ_TAG_BITS;25462546+ }25472547+25482548+ /* unmap previous page (if not done yet) */25492549+ if (prev_page_vaddr) {25502550+ kunmap_atomic(prev_page_vaddr);25512551+ prev_page_vaddr = NULL;25522552+ }25532553+25542554+ kunmap_atomic(vaddr);25552555+ }25562556+25572557+ page = next_page;25582558+ off %= PAGE_SIZE;25592559+ }25602560+}25612561+25612562static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)25622563{25632564 int i, obj_idx, ret = 0;···27252558 return 0;27262559 }2727256025612561+ /*25622562+ * Eviction fails on one of the handles, so we need to restore zspage.25632563+ * We need to rebuild its freelist (and free stored deferred handles),25642564+ * put it back to the correct size class, and add it to the LRU list.25652565+ */25662566+ restore_freelist(pool, class, zspage);27282567 putback_zspage(class, zspage);27292568 list_add(&zspage->lru, &pool->lru);27302569 unlock_zspage(zspage);