fs/gfs2/bmap.c at master · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / gfs2 / bmap.c
at master 2508 lines 67 kB view raw
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
   4 * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
   5 */
   6
   7#include <linux/spinlock.h>
   8#include <linux/completion.h>
   9#include <linux/buffer_head.h>
  10#include <linux/blkdev.h>
  11#include <linux/gfs2_ondisk.h>
  12#include <linux/crc32.h>
  13#include <linux/iomap.h>
  14#include <linux/ktime.h>
  15
  16#include "gfs2.h"
  17#include "incore.h"
  18#include "bmap.h"
  19#include "glock.h"
  20#include "inode.h"
  21#include "meta_io.h"
  22#include "quota.h"
  23#include "rgrp.h"
  24#include "log.h"
  25#include "super.h"
  26#include "trans.h"
  27#include "dir.h"
  28#include "util.h"
  29#include "aops.h"
  30#include "trace_gfs2.h"
  31
  32/* This doesn't need to be that large as max 64 bit pointers in a 4k
  33 * block is 512, so __u16 is fine for that. It saves stack space to
  34 * keep it small.
  35 */
  36struct metapath {
  37	struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
  38	__u16 mp_list[GFS2_MAX_META_HEIGHT];
  39	int mp_fheight; /* find_metapath height */
  40	int mp_aheight; /* actual height (lookup height) */
  41};
  42
/* Forward declaration: called from the error path of gfs2_iomap_begin_write(). */
  43static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);
  44
  45/**
  46 * gfs2_unstuffer_folio - unstuff a stuffed inode into a block cached by a folio
  47 * @ip: the inode
  48 * @dibh: the dinode buffer
  49 * @block: the block number that was allocated
  50 * @folio: The folio.
  51 *
  52 * Returns: errno
  53 */
  54static int gfs2_unstuffer_folio(struct gfs2_inode *ip, struct buffer_head *dibh,
  55			       u64 block, struct folio *folio)
  56{
  57	struct inode *inode = &ip->i_inode;
  58
  59	if (!folio_test_uptodate(folio)) {
  60		void *kaddr = kmap_local_folio(folio, 0);
  61		u64 dsize = i_size_read(inode);
  62 
  63		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
  64		memset(kaddr + dsize, 0, folio_size(folio) - dsize);
  65		kunmap_local(kaddr);
  66
  67		folio_mark_uptodate(folio);
  68	}
  69
  70	if (gfs2_is_jdata(ip)) {
  71		struct buffer_head *bh = folio_buffers(folio);
  72
  73		if (!bh)
  74			bh = create_empty_buffers(folio,
  75				BIT(inode->i_blkbits), BIT(BH_Uptodate));
  76
  77		if (!buffer_mapped(bh))
  78			map_bh(bh, inode->i_sb, block);
  79
  80		set_buffer_uptodate(bh);
  81		gfs2_trans_add_data(ip->i_gl, bh);
  82	} else {
  83		folio_mark_dirty(folio);
  84		gfs2_ordered_add_inode(ip);
  85	}
  86
  87	return 0;
  88}
  89
  90static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct folio *folio)
  91{
  92	struct buffer_head *bh, *dibh;
  93	struct gfs2_dinode *di;
  94	u64 block = 0;
  95	int isdir = gfs2_is_dir(ip);
  96	int error;
  97
  98	error = gfs2_meta_inode_buffer(ip, &dibh);
  99	if (error)
 100		return error;
 101
 102	if (i_size_read(&ip->i_inode)) {
 103		/* Get a free block, fill it with the stuffed data,
 104		   and write it out to disk */
 105
 106		unsigned int n = 1;
 107		error = gfs2_alloc_blocks(ip, &block, &n, 0);
 108		if (error)
 109			goto out_brelse;
 110		if (isdir) {
 111			gfs2_trans_remove_revoke(GFS2_SB(&ip->i_inode), block, 1);
 112			error = gfs2_dir_get_new_buffer(ip, block, &bh);
 113			if (error)
 114				goto out_brelse;
 115			gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
 116					      dibh, sizeof(struct gfs2_dinode));
 117			brelse(bh);
 118		} else {
 119			error = gfs2_unstuffer_folio(ip, dibh, block, folio);
 120			if (error)
 121				goto out_brelse;
 122		}
 123	}
 124
 125	/*  Set up the pointer to the new block  */
 126
 127	gfs2_trans_add_meta(ip->i_gl, dibh);
 128	di = (struct gfs2_dinode *)dibh->b_data;
 129	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 130
 131	if (i_size_read(&ip->i_inode)) {
 132		*(__be64 *)(di + 1) = cpu_to_be64(block);
 133		gfs2_add_inode_blocks(&ip->i_inode, 1);
 134		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 135	}
 136
 137	ip->i_height = 1;
 138	di->di_height = cpu_to_be16(1);
 139
 140out_brelse:
 141	brelse(dibh);
 142	return error;
 143}
 144
 145/**
 146 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
 147 * @ip: The GFS2 inode to unstuff
 148 *
 149 * This routine unstuffs a dinode and returns it to a "normal" state such
 150 * that the height can be grown in the traditional way.
 151 *
 152 * Returns: errno
 153 */
 154
 155int gfs2_unstuff_dinode(struct gfs2_inode *ip)
 156{
 157	struct inode *inode = &ip->i_inode;
 158	struct folio *folio;
 159	int error;
 160
 161	down_write(&ip->i_rw_mutex);
 162	folio = filemap_grab_folio(inode->i_mapping, 0);
 163	error = PTR_ERR(folio);
 164	if (IS_ERR(folio))
 165		goto out;
 166	error = __gfs2_unstuff_inode(ip, folio);
 167	folio_unlock(folio);
 168	folio_put(folio);
 169out:
 170	up_write(&ip->i_rw_mutex);
 171	return error;
 172}
 173
 174/**
 175 * find_metapath - Find path through the metadata tree
 176 * @sdp: The superblock
 177 * @block: The disk block to look up
 178 * @mp: The metapath to return the result in
 179 * @height: The pre-calculated height of the metadata tree
 180 *
 181 *   This routine returns a struct metapath structure that defines a path
 182 *   through the metadata of inode "ip" to get to block "block".
 183 *
 184 *   Example:
 185 *   Given:  "ip" is a height 3 file, "offset" is 101342453, and this is a
 186 *   filesystem with a blocksize of 4096.
 187 *
 188 *   find_metapath() would return a struct metapath structure set to:
 189 *   mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
 190 *
 191 *   That means that in order to get to the block containing the byte at
 192 *   offset 101342453, we would load the indirect block pointed to by pointer
 193 *   0 in the dinode.  We would then load the indirect block pointed to by
 194 *   pointer 48 in that indirect block.  We would then load the data block
 195 *   pointed to by pointer 165 in that indirect block.
 196 *
 197 *             ----------------------------------------
 198 *             | Dinode |                             |
 199 *             |        |                            4|
 200 *             |        |0 1 2 3 4 5                 9|
 201 *             |        |                            6|
 202 *             ----------------------------------------
 203 *                       |
 204 *                       |
 205 *                       V
 206 *             ----------------------------------------
 207 *             | Indirect Block                       |
 208 *             |                                     5|
 209 *             |            4 4 4 4 4 5 5            1|
 210 *             |0           5 6 7 8 9 0 1            2|
 211 *             ----------------------------------------
 212 *                                |
 213 *                                |
 214 *                                V
 215 *             ----------------------------------------
 216 *             | Indirect Block                       |
 217 *             |                         1 1 1 1 1   5|
 218 *             |                         6 6 6 6 6   1|
 219 *             |0                        3 4 5 6 7   2|
 220 *             ----------------------------------------
 221 *                                           |
 222 *                                           |
 223 *                                           V
 224 *             ----------------------------------------
 225 *             | Data block containing offset         |
 226 *             |            101342453                 |
 227 *             |                                      |
 228 *             |                                      |
 229 *             ----------------------------------------
 230 *
 231 */
 232
 233static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
 234			  struct metapath *mp, unsigned int height)
 235{
 236	unsigned int i;
 237
 238	mp->mp_fheight = height;
 239	for (i = height; i--;)
 240		mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
 241}
 242
 243static inline unsigned int metapath_branch_start(const struct metapath *mp)
 244{
 245	if (mp->mp_list[0] == 0)
 246		return 2;
 247	return 1;
 248}
 249
 250/**
 251 * metaptr1 - Return the first possible metadata pointer in a metapath buffer
 252 * @height: The metadata height (0 = dinode)
 253 * @mp: The metapath
 254 */
 255static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
 256{
 257	struct buffer_head *bh = mp->mp_bh[height];
 258	if (height == 0)
 259		return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
 260	return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
 261}
 262
 263/**
 264 * metapointer - Return pointer to start of metadata in a buffer
 265 * @height: The metadata height (0 = dinode)
 266 * @mp: The metapath
 267 *
 268 * Return a pointer to the block number of the next height of the metadata
 269 * tree given a buffer containing the pointer to the current height of the
 270 * metadata tree.
 271 */
 272
 273static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
 274{
 275	__be64 *p = metaptr1(height, mp);
 276	return p + mp->mp_list[height];
 277}
 278
 279static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
 280{
 281	const struct buffer_head *bh = mp->mp_bh[height];
 282	return (const __be64 *)(bh->b_data + bh->b_size);
 283}
 284
 285static void clone_metapath(struct metapath *clone, struct metapath *mp)
 286{
 287	unsigned int hgt;
 288
 289	*clone = *mp;
 290	for (hgt = 0; hgt < mp->mp_aheight; hgt++)
 291		get_bh(clone->mp_bh[hgt]);
 292}
 293
 294static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
 295{
 296	const __be64 *t;
 297
 298	for (t = start; t < end; t++) {
 299		struct buffer_head *rabh;
 300
 301		if (!*t)
 302			continue;
 303
 304		rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
 305		if (trylock_buffer(rabh)) {
 306			if (!buffer_uptodate(rabh)) {
 307				rabh->b_end_io = end_buffer_read_sync;
 308				submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META |
 309					  REQ_PRIO, rabh);
 310				continue;
 311			}
 312			unlock_buffer(rabh);
 313		}
 314		brelse(rabh);
 315	}
 316}
 317
 318static inline struct buffer_head *
 319metapath_dibh(struct metapath *mp)
 320{
 321	return mp->mp_bh[0];
 322}
 323
 324static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
 325			     unsigned int x, unsigned int h)
 326{
 327	for (; x < h; x++) {
 328		__be64 *ptr = metapointer(x, mp);
 329		u64 dblock = be64_to_cpu(*ptr);
 330		int ret;
 331
 332		if (!dblock)
 333			break;
 334		ret = gfs2_meta_buffer(ip, GFS2_METATYPE_IN, dblock, &mp->mp_bh[x + 1]);
 335		if (ret)
 336			return ret;
 337	}
 338	mp->mp_aheight = x + 1;
 339	return 0;
 340}
 341
 342/**
 343 * lookup_metapath - Walk the metadata tree to a specific point
 344 * @ip: The inode
 345 * @mp: The metapath
 346 *
 347 * Assumes that the inode's buffer has already been looked up and
 348 * hooked onto mp->mp_bh[0] and that the metapath has been initialised
 349 * by find_metapath().
 350 *
 351 * If this function encounters part of the tree which has not been
 352 * allocated, it returns the current height of the tree at the point
 353 * at which it found the unallocated block. Blocks which are found are
 354 * added to the mp->mp_bh[] list.
 355 *
 356 * Returns: error
 357 */
 358
 359static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
 360{
 361	return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
 362}
 363
 364/**
 365 * fillup_metapath - fill up buffers for the metadata path to a specific height
 366 * @ip: The inode
 367 * @mp: The metapath
 368 * @h: The height to which it should be mapped
 369 *
 370 * Similar to lookup_metapath, but does lookups for a range of heights
 371 *
 372 * Returns: error or the number of buffers filled
 373 */
 374
 375static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
 376{
 377	unsigned int x = 0;
 378	int ret;
 379
 380	if (h) {
 381		/* find the first buffer we need to look up. */
 382		for (x = h - 1; x > 0; x--) {
 383			if (mp->mp_bh[x])
 384				break;
 385		}
 386	}
 387	ret = __fillup_metapath(ip, mp, x, h);
 388	if (ret)
 389		return ret;
 390	return mp->mp_aheight - x - 1;
 391}
 392
 393static sector_t metapath_to_block(struct gfs2_sbd *sdp, struct metapath *mp)
 394{
 395	sector_t factor = 1, block = 0;
 396	int hgt;
 397
 398	for (hgt = mp->mp_fheight - 1; hgt >= 0; hgt--) {
 399		if (hgt < mp->mp_aheight)
 400			block += mp->mp_list[hgt] * factor;
 401		factor *= sdp->sd_inptrs;
 402	}
 403	return block;
 404}
 405
 406static void release_metapath(struct metapath *mp)
 407{
 408	int i;
 409
 410	for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
 411		if (mp->mp_bh[i] == NULL)
 412			break;
 413		brelse(mp->mp_bh[i]);
 414		mp->mp_bh[i] = NULL;
 415	}
 416}
 417
 418/**
 419 * gfs2_extent_length - Returns length of an extent of blocks
 420 * @bh: The metadata block
 421 * @ptr: Current position in @bh
 422 * @eob: Set to 1 if we hit "end of block"
 423 *
 424 * Returns: The length of the extent (minimum of one block)
 425 */
 426
 427static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, int *eob)
 428{
 429	const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
 430	const __be64 *first = ptr;
 431	u64 d = be64_to_cpu(*ptr);
 432
 433	*eob = 0;
 434	do {
 435		ptr++;
 436		if (ptr >= end)
 437			break;
 438		d++;
 439	} while(be64_to_cpu(*ptr) == d);
 440	if (ptr >= end)
 441		*eob = 1;
 442	return ptr - first;
 443}
 444
 445enum walker_status { WALK_STOP, WALK_FOLLOW, WALK_CONTINUE };
 446
 447/*
 448 * gfs2_metadata_walker - walk an indirect block
 449 * @mp: Metapath to indirect block
 450 * @ptrs: Number of pointers to look at
 451 *
 452 * When returning WALK_FOLLOW, the walker must update @mp to point at the right
 453 * indirect block to follow.
 454 */
 455typedef enum walker_status (*gfs2_metadata_walker)(struct metapath *mp,
 456						   unsigned int ptrs);
 457
 458/*
 459 * gfs2_walk_metadata - walk a tree of indirect blocks
 460 * @inode: The inode
 461 * @mp: Starting point of walk
 462 * @max_len: Maximum number of blocks to walk
 463 * @walker: Called during the walk
 464 *
 465 * Returns 1 if the walk was stopped by @walker, 0 if we went past @max_len or
 466 * past the end of metadata, and a negative error code otherwise.
 467 */
 468
  469static int gfs2_walk_metadata(struct inode *inode, struct metapath *mp,
  470		u64 max_len, gfs2_metadata_walker walker)
  471{
  472	struct gfs2_inode *ip = GFS2_I(inode);
  473	struct gfs2_sbd *sdp = GFS2_SB(inode);
	/* number of blocks covered by one pointer at level hgt */
  474	u64 factor = 1;
  475	unsigned int hgt;
  476	int ret;
  477
  478	/*
  479	 * The walk starts in the lowest allocated indirect block, which may be
  480	 * before the position indicated by @mp.  Adjust @max_len accordingly
  481	 * to avoid a short walk.
  482	 */
  483	for (hgt = mp->mp_fheight - 1; hgt >= mp->mp_aheight; hgt--) {
  484		max_len += mp->mp_list[hgt] * factor;
  485		mp->mp_list[hgt] = 0;
  486		factor *= sdp->sd_inptrs;
  487	}
  488
	/*
	 * Here hgt == mp->mp_aheight - 1 (the deepest level that has a
	 * buffer) and factor has been scaled up once for every level
	 * between that one and mp_fheight - 1.
	 */
  489	for (;;) {
  490		u16 start = mp->mp_list[hgt];
  491		enum walker_status status;
  492		unsigned int ptrs;
  493		u64 len;
  494
  495		/* Walk indirect block. */
		/* The dinode (level 0) holds sd_diptrs pointers, other levels sd_inptrs. */
  496		ptrs = (hgt >= 1 ? sdp->sd_inptrs : sdp->sd_diptrs) - start;
  497		len = ptrs * factor;
		/* Do not let the walker look beyond the remaining range. */
  498		if (len > max_len)
  499			ptrs = DIV_ROUND_UP_ULL(max_len, factor);
  500		status = walker(mp, ptrs);
  501		switch (status) {
  502		case WALK_STOP:
  503			return 1;
  504		case WALK_FOLLOW:
  505			BUG_ON(mp->mp_aheight == mp->mp_fheight);
			/*
			 * The walker moved mp_list[hgt] forward; only the
			 * pointers it skipped count as consumed.
			 */
  506			ptrs = mp->mp_list[hgt] - start;
  507			len = ptrs * factor;
  508			break;
  509		case WALK_CONTINUE:
  510			break;
  511		}
		/* Remaining range used up: leave the loop and return 0. */
  512		if (len >= max_len)
  513			break;
  514		max_len -= len;
  515		if (status == WALK_FOLLOW)
  516			goto fill_up_metapath;
  517
  518lower_metapath:
  519		/* Decrease height of metapath. */
  520		brelse(mp->mp_bh[hgt]);
  521		mp->mp_bh[hgt] = NULL;
  522		mp->mp_list[hgt] = 0;
		/* Ran off the end of the dinode's pointers: nothing left to walk. */
  523		if (!hgt)
  524			break;
  525		hgt--;
  526		factor *= sdp->sd_inptrs;
  527
  528		/* Advance in metadata tree. */
  529		(mp->mp_list[hgt])++;
		/* If this level is exhausted too, keep climbing (or stop at the dinode). */
  530		if (hgt) {
  531			if (mp->mp_list[hgt] >= sdp->sd_inptrs)
  532				goto lower_metapath;
  533		} else {
  534			if (mp->mp_list[hgt] >= sdp->sd_diptrs)
  535				break;
  536		}
  537
  538fill_up_metapath:
  539		/* Increase height of metapath. */
  540		ret = fillup_metapath(ip, mp, ip->i_height - 1);
  541		if (ret < 0)
  542			return ret;
		/*
		 * ret is the number of levels fillup_metapath() loaded:
		 * descend that far and scale factor back down to match.
		 */
  543		hgt += ret;
  544		for (; ret; ret--)
  545			do_div(factor, sdp->sd_inptrs);
  546		mp->mp_aheight = hgt + 1;
  547	}
  548	return 0;
  549}
 550
 551static enum walker_status gfs2_hole_walker(struct metapath *mp,
 552					   unsigned int ptrs)
 553{
 554	const __be64 *start, *ptr, *end;
 555	unsigned int hgt;
 556
 557	hgt = mp->mp_aheight - 1;
 558	start = metapointer(hgt, mp);
 559	end = start + ptrs;
 560
 561	for (ptr = start; ptr < end; ptr++) {
 562		if (*ptr) {
 563			mp->mp_list[hgt] += ptr - start;
 564			if (mp->mp_aheight == mp->mp_fheight)
 565				return WALK_STOP;
 566			return WALK_FOLLOW;
 567		}
 568	}
 569	return WALK_CONTINUE;
 570}
 571
 572/**
 573 * gfs2_hole_size - figure out the size of a hole
 574 * @inode: The inode
 575 * @lblock: The logical starting block number
 576 * @len: How far to look (in blocks)
 577 * @mp: The metapath at lblock
 578 * @iomap: The iomap to store the hole size in
 579 *
 580 * This function modifies @mp.
 581 *
 582 * Returns: errno on error
 583 */
 584static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
 585			  struct metapath *mp, struct iomap *iomap)
 586{
 587	struct metapath clone;
 588	u64 hole_size;
 589	int ret;
 590
 591	clone_metapath(&clone, mp);
 592	ret = gfs2_walk_metadata(inode, &clone, len, gfs2_hole_walker);
 593	if (ret < 0)
 594		goto out;
 595
 596	if (ret == 1)
 597		hole_size = metapath_to_block(GFS2_SB(inode), &clone) - lblock;
 598	else
 599		hole_size = len;
 600	iomap->length = hole_size << inode->i_blkbits;
 601	ret = 0;
 602
 603out:
 604	release_metapath(&clone);
 605	return ret;
 606}
 607
 608static inline void gfs2_indirect_init(struct metapath *mp,
 609				      struct gfs2_glock *gl, unsigned int i,
 610				      unsigned offset, u64 bn)
 611{
 612	__be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
 613		       ((i > 1) ? sizeof(struct gfs2_meta_header) :
 614				 sizeof(struct gfs2_dinode)));
 615	BUG_ON(i < 1);
 616	BUG_ON(mp->mp_bh[i] != NULL);
 617	mp->mp_bh[i] = gfs2_meta_new(gl, bn);
 618	gfs2_trans_add_meta(gl, mp->mp_bh[i]);
 619	gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
 620	gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
 621	ptr += offset;
 622	*ptr = cpu_to_be64(bn);
 623}
 624
 625enum alloc_state {
 626	ALLOC_DATA = 0,
 627	ALLOC_GROW_DEPTH = 1,
 628	ALLOC_GROW_HEIGHT = 2,
 629	/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
 630};
 631
 632/**
 633 * __gfs2_iomap_alloc - Build a metadata tree of the requested height
 634 * @inode: The GFS2 inode
 635 * @iomap: The iomap structure
 636 * @mp: The metapath, with proper height information calculated
 637 *
 638 * In this routine we may have to alloc:
 639 *   i) Indirect blocks to grow the metadata tree height
 640 *  ii) Indirect blocks to fill in lower part of the metadata tree
 641 * iii) Data blocks
 642 *
 643 * This function is called after __gfs2_iomap_get, which works out the
 644 * total number of blocks which we need via gfs2_alloc_size.
 645 *
 646 * We then do the actual allocation asking for an extent at a time (if
 647 * enough contiguous free blocks are available, there will only be one
 648 * allocation request per call) and uses the state machine to initialise
 649 * the blocks in order.
 650 *
 651 * Right now, this function will allocate at most one indirect block
 652 * worth of data -- with a default block size of 4K, that's slightly
 653 * less than 2M.  If this limitation is ever removed to allow huge
 654 * allocations, we would probably still want to limit the iomap size we
 655 * return to avoid stalling other tasks during huge writes; the next
 656 * iomap iteration would then find the blocks already allocated.
 657 *
 658 * Returns: errno on error
 659 */
 660
  661static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
  662			      struct metapath *mp)
  663{
  664	struct gfs2_inode *ip = GFS2_I(inode);
  665	struct gfs2_sbd *sdp = GFS2_SB(inode);
  666	struct buffer_head *dibh = metapath_dibh(mp);
  667	u64 bn;
  668	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
	/* data blocks requested, derived from the iomap length */
  669	size_t dblks = iomap->length >> inode->i_blkbits;
	/* level of the bottom indirect block, which holds the data pointers */
  670	const unsigned end_of_metadata = mp->mp_fheight - 1;
  671	int ret;
  672	enum alloc_state state;
  673	__be64 *ptr;
  674	__be64 zero_bn = 0;
  675
  676	BUG_ON(mp->mp_aheight < 1);
  677	BUG_ON(dibh == NULL);
  678	BUG_ON(dblks < 1);
  679
  680	gfs2_trans_add_meta(ip->i_gl, dibh);
  681
  682	down_write(&ip->i_rw_mutex);
  683
	/* Pick the starting state and count the indirect blocks needed (iblks). */
  684	if (mp->mp_fheight == mp->mp_aheight) {
  685		/* Bottom indirect block exists */
  686		state = ALLOC_DATA;
  687	} else {
  688		/* Need to allocate indirect blocks */
  689		if (mp->mp_fheight == ip->i_height) {
  690			/* Writing into existing tree, extend tree down */
  691			iblks = mp->mp_fheight - mp->mp_aheight;
  692			state = ALLOC_GROW_DEPTH;
  693		} else {
  694			/* Building up tree height */
  695			state = ALLOC_GROW_HEIGHT;
  696			iblks = mp->mp_fheight - ip->i_height;
  697			branch_start = metapath_branch_start(mp);
  698			iblks += (mp->mp_fheight - branch_start);
  699		}
  700	}
  701
  702	/* start of the second part of the function (state machine) */
  703
	/* blks: total blocks wanted; i: next metadata level to initialise */
  704	blks = dblks + iblks;
  705	i = mp->mp_aheight;
  706	do {
		/*
		 * Ask for everything still missing; n comes back as the
		 * number of blocks actually obtained, starting at bn.
		 */
  707		n = blks - alloced;
  708		ret = gfs2_alloc_blocks(ip, &bn, &n, 0);
  709		if (ret)
  710			goto out;
  711		alloced += n;
  712		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
  713			gfs2_trans_remove_revoke(sdp, bn, n);
  714		switch (state) {
  715		/* Growing height of tree */
  716		case ALLOC_GROW_HEIGHT:
			/* Remember the dinode's first pointer; it is put back below. */
  717			if (i == 1) {
  718				ptr = (__be64 *)(dibh->b_data +
  719						 sizeof(struct gfs2_dinode));
  720				zero_bn = *ptr;
  721			}
  722			for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
  723			     i++, n--)
  724				gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
			/*
			 * All new top levels exist: move the dinode's old
			 * pointer area into the lowest new indirect block and
			 * clear the dinode tail except for its first pointer.
			 */
  725			if (i - 1 == mp->mp_fheight - ip->i_height) {
  726				i--;
  727				gfs2_buffer_copy_tail(mp->mp_bh[i],
  728						sizeof(struct gfs2_meta_header),
  729						dibh, sizeof(struct gfs2_dinode));
  730				gfs2_buffer_clear_tail(dibh,
  731						sizeof(struct gfs2_dinode) +
  732						sizeof(__be64));
  733				ptr = (__be64 *)(mp->mp_bh[i]->b_data +
  734					sizeof(struct gfs2_meta_header));
  735				*ptr = zero_bn;
  736				state = ALLOC_GROW_DEPTH;
				/* Drop buffers from branch_start down; they are re-created below. */
  737				for(i = branch_start; i < mp->mp_fheight; i++) {
  738					if (mp->mp_bh[i] == NULL)
  739						break;
  740					brelse(mp->mp_bh[i]);
  741					mp->mp_bh[i] = NULL;
  742				}
  743				i = branch_start;
  744			}
			/* This extent is used up; allocate again on the next pass. */
  745			if (n == 0)
  746				break;
  747			fallthrough;	/* To branching from existing tree */
  748		case ALLOC_GROW_DEPTH:
  749			if (i > 1 && i < mp->mp_fheight)
  750				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
  751			for (; i < mp->mp_fheight && n > 0; i++, n--)
  752				gfs2_indirect_init(mp, ip->i_gl, i,
  753						   mp->mp_list[i-1], bn++);
  754			if (i == mp->mp_fheight)
  755				state = ALLOC_DATA;
  756			if (n == 0)
  757				break;
  758			fallthrough;	/* To tree complete, adding data blocks */
  759		case ALLOC_DATA:
  760			BUG_ON(n > dblks);
  761			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
  762			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
			/* Only the n blocks left in this extent become data blocks. */
  763			dblks = n;
  764			ptr = metapointer(end_of_metadata, mp);
			/* Setting iomap->addr is what ends the do/while loop. */
  765			iomap->addr = bn << inode->i_blkbits;
  766			iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
  767			while (n-- > 0)
  768				*ptr++ = cpu_to_be64(bn++);
  769			break;
  770		}
  771	} while (iomap->addr == IOMAP_NULL_ADDR);
  772
	/* Publish the result: mapping, new tree height and block count. */
  773	iomap->type = IOMAP_MAPPED;
  774	iomap->length = (u64)dblks << inode->i_blkbits;
  775	ip->i_height = mp->mp_fheight;
  776	gfs2_add_inode_blocks(&ip->i_inode, alloced);
  777	gfs2_dinode_out(ip, dibh->b_data);
  778out:
  779	up_write(&ip->i_rw_mutex);
  780	return ret;
  781}
 782
/*
 * gfs2's use of the iomap private flag: set by __gfs2_iomap_get() when the
 * mapped extent runs up to the end of its metadata block.
 */
  783#define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE
 784
 785/**
 786 * gfs2_alloc_size - Compute the maximum allocation size
 787 * @inode: The inode
 788 * @mp: The metapath
 789 * @size: Requested size in blocks
 790 *
 791 * Compute the maximum size of the next allocation at @mp.
 792 *
 793 * Returns: size in blocks
 794 */
 795static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size)
 796{
 797	struct gfs2_inode *ip = GFS2_I(inode);
 798	struct gfs2_sbd *sdp = GFS2_SB(inode);
 799	const __be64 *first, *ptr, *end;
 800
 801	/*
 802	 * For writes to stuffed files, this function is called twice via
 803	 * __gfs2_iomap_get, before and after unstuffing. The size we return the
 804	 * first time needs to be large enough to get the reservation and
 805	 * allocation sizes right.  The size we return the second time must
 806	 * be exact or else __gfs2_iomap_alloc won't do the right thing.
 807	 */
 808
 809	if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) {
 810		unsigned int maxsize = mp->mp_fheight > 1 ?
 811			sdp->sd_inptrs : sdp->sd_diptrs;
 812		maxsize -= mp->mp_list[mp->mp_fheight - 1];
 813		if (size > maxsize)
 814			size = maxsize;
 815		return size;
 816	}
 817
 818	first = metapointer(ip->i_height - 1, mp);
 819	end = metaend(ip->i_height - 1, mp);
 820	if (end - first > size)
 821		end = first + size;
 822	for (ptr = first; ptr < end; ptr++) {
 823		if (*ptr)
 824			break;
 825	}
 826	return ptr - first;
 827}
 828
 829/**
 830 * __gfs2_iomap_get - Map blocks from an inode to disk blocks
 831 * @inode: The inode
 832 * @pos: Starting position in bytes
 833 * @length: Length to map, in bytes
 834 * @flags: iomap flags
 835 * @iomap: The iomap structure
 836 * @mp: The metapath
 837 *
 838 * Returns: errno
 839 */
  840static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
  841			    unsigned flags, struct iomap *iomap,
  842			    struct metapath *mp)
  843{
  844	struct gfs2_inode *ip = GFS2_I(inode);
  845	struct gfs2_sbd *sdp = GFS2_SB(inode);
  846	loff_t size = i_size_read(inode);
  847	__be64 *ptr;
  848	sector_t lblock;
  849	sector_t lblock_stop;
  850	int ret;
  851	int eob;
  852	u64 len;
  853	struct buffer_head *dibh = NULL, *bh;
  854	u8 height;
  855
  856	if (!length)
  857		return -EINVAL;
  858
  859	down_read(&ip->i_rw_mutex);
  860
	/* The dinode buffer becomes level 0 of the metapath. */
  861	ret = gfs2_meta_inode_buffer(ip, &dibh);
  862	if (ret)
  863		goto unlock;
  864	mp->mp_bh[0] = dibh;
  865
	/*
	 * Stuffed inode: the data sits in the dinode block right after
	 * struct gfs2_dinode and is reported as an inline mapping.
	 */
  866	if (gfs2_is_stuffed(ip)) {
  867		if (flags & IOMAP_WRITE) {
  868			loff_t max_size = gfs2_max_stuffed_size(ip);
  869
			/* Write does not fit inline: treat it as a block allocation. */
  870			if (pos + length > max_size)
  871				goto unstuff;
  872			iomap->length = max_size;
  873		} else {
			/* Non-write at or past EOF: -ENOENT for REPORT, else a hole. */
  874			if (pos >= size) {
  875				if (flags & IOMAP_REPORT) {
  876					ret = -ENOENT;
  877					goto unlock;
  878				} else {
  879					iomap->offset = pos;
  880					iomap->length = length;
  881					goto hole_found;
  882				}
  883			}
  884			iomap->length = size;
  885		}
  886		iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
  887			      sizeof(struct gfs2_dinode);
  888		iomap->type = IOMAP_INLINE;
  889		iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode);
  890		goto out;
  891	}
  892
  893unstuff:
	/* Round the request out to whole blocks: [lblock, lblock_stop]. */
  894	lblock = pos >> inode->i_blkbits;
  895	iomap->offset = lblock << inode->i_blkbits;
  896	lblock_stop = (pos + length - 1) >> inode->i_blkbits;
  897	len = lblock_stop - lblock + 1;
  898	iomap->length = len << inode->i_blkbits;
  899
	/*
	 * Starting from the current height, find the smallest height whose
	 * capacity (sd_heightsize[]) covers lblock.
	 */
  900	height = ip->i_height;
  901	while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
  902		height++;
  903	find_metapath(sdp, lblock, mp, height);
	/* Taller tree needed, or still stuffed: no existing block to look up. */
  904	if (height > ip->i_height || gfs2_is_stuffed(ip))
  905		goto do_alloc;
  906
  907	ret = lookup_metapath(ip, mp);
  908	if (ret)
  909		goto unlock;
  910
	/* The lookup stopped short of the bottom level: part of the tree is missing. */
  911	if (mp->mp_aheight != ip->i_height)
  912		goto do_alloc;
  913
  914	ptr = metapointer(ip->i_height - 1, mp);
  915	if (*ptr == 0)
  916		goto do_alloc;
  917
	/* Mapped: report the run of consecutive blocks starting at *ptr. */
  918	bh = mp->mp_bh[ip->i_height - 1];
  919	len = gfs2_extent_length(bh, ptr, &eob);
  920
  921	iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
  922	iomap->length = len << inode->i_blkbits;
  923	iomap->type = IOMAP_MAPPED;
  924	iomap->flags |= IOMAP_F_MERGED;
  925	if (eob)
  926		iomap->flags |= IOMAP_F_GFS2_BOUNDARY;
  927
  928out:
  929	iomap->bdev = inode->i_sb->s_bdev;
  930unlock:
  931	up_read(&ip->i_rw_mutex);
  932	return ret;
  933
	/*
	 * Reached only by goto: nothing is mapped at lblock.  This code only
	 * sizes the resulting hole; it does not allocate anything itself.
	 */
  934do_alloc:
  935	if (flags & IOMAP_REPORT) {
  936		if (pos >= size)
  937			ret = -ENOENT;
  938		else if (height == ip->i_height)
  939			ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
  940		else
  941			iomap->length = size - iomap->offset;
  942	} else if (flags & IOMAP_WRITE) {
  943		u64 alloc_size;
  944
  945		if (flags & IOMAP_DIRECT)
  946			goto out;  /* (see gfs2_file_direct_write) */
  947
		/* Trim the hole to what the next allocation can cover. */
  948		len = gfs2_alloc_size(inode, mp, len);
  949		alloc_size = len << inode->i_blkbits;
  950		if (alloc_size < iomap->length)
  951			iomap->length = alloc_size;
  952	} else {
  953		if (pos < size && height == ip->i_height)
  954			ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
  955	}
  956hole_found:
  957	iomap->addr = IOMAP_NULL_ADDR;
  958	iomap->type = IOMAP_HOLE;
  959	goto out;
  960}
 961
 962static struct folio *
 963gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
 964{
 965	struct inode *inode = iter->inode;
 966	struct gfs2_inode *ip = GFS2_I(inode);
 967	unsigned int blockmask = i_blocksize(inode) - 1;
 968	struct gfs2_sbd *sdp = GFS2_SB(inode);
 969	unsigned int blocks;
 970	struct folio *folio;
 971	int status;
 972
 973	if (!gfs2_is_jdata(ip) && !gfs2_is_stuffed(ip))
 974		return iomap_get_folio(iter, pos, len);
 975
 976	blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits;
 977	status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
 978	if (status)
 979		return ERR_PTR(status);
 980
 981	folio = iomap_get_folio(iter, pos, len);
 982	if (IS_ERR(folio))
 983		gfs2_trans_end(sdp);
 984	return folio;
 985}
 986
 987static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos,
 988				 unsigned copied, struct folio *folio)
 989{
 990	struct gfs2_trans *tr = current->journal_info;
 991	struct gfs2_inode *ip = GFS2_I(inode);
 992	struct gfs2_sbd *sdp = GFS2_SB(inode);
 993
 994	if (gfs2_is_jdata(ip) && !gfs2_is_stuffed(ip))
 995		gfs2_trans_add_databufs(ip->i_gl, folio,
 996					offset_in_folio(folio, pos),
 997					copied);
 998
 999	folio_unlock(folio);
1000	folio_put(folio);
1001
1002	if (gfs2_is_jdata(ip) || gfs2_is_stuffed(ip)) {
1003		if (tr->tr_num_buf_new)
1004			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1005		gfs2_trans_end(sdp);
1006	}
1007}
1008
1009const struct iomap_write_ops gfs2_iomap_write_ops = {
1010	.get_folio = gfs2_iomap_get_folio,
1011	.put_folio = gfs2_iomap_put_folio,
1012};
1013
1014static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
1015				  loff_t length, unsigned flags,
1016				  struct iomap *iomap,
1017				  struct metapath *mp)
1018{
1019	struct gfs2_inode *ip = GFS2_I(inode);
1020	struct gfs2_sbd *sdp = GFS2_SB(inode);
1021	bool unstuff;
1022	int ret;
1023
1024	unstuff = gfs2_is_stuffed(ip) &&
1025		  pos + length > gfs2_max_stuffed_size(ip);
1026
1027	if (unstuff || iomap->type == IOMAP_HOLE) {
1028		unsigned int data_blocks, ind_blocks;
1029		struct gfs2_alloc_parms ap = {};
1030		unsigned int rblocks;
1031		struct gfs2_trans *tr;
1032
1033		gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
1034				       &ind_blocks);
1035		ap.target = data_blocks + ind_blocks;
1036		ret = gfs2_quota_lock_check(ip, &ap);
1037		if (ret)
1038			return ret;
1039
1040		ret = gfs2_inplace_reserve(ip, &ap);
1041		if (ret)
1042			goto out_qunlock;
1043
1044		rblocks = RES_DINODE + ind_blocks;
1045		if (gfs2_is_jdata(ip))
1046			rblocks += data_blocks;
1047		if (ind_blocks || data_blocks)
1048			rblocks += RES_STATFS + RES_QUOTA;
1049		if (inode == sdp->sd_rindex)
1050			rblocks += 2 * RES_STATFS;
1051		rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
1052
1053		ret = gfs2_trans_begin(sdp, rblocks,
1054				       iomap->length >> inode->i_blkbits);
1055		if (ret)
1056			goto out_trans_fail;
1057
1058		if (unstuff) {
1059			ret = gfs2_unstuff_dinode(ip);
1060			if (ret)
1061				goto out_trans_end;
1062			release_metapath(mp);
1063			ret = __gfs2_iomap_get(inode, iomap->offset,
1064					       iomap->length, flags, iomap, mp);
1065			if (ret)
1066				goto out_trans_end;
1067		}
1068
1069		if (iomap->type == IOMAP_HOLE) {
1070			ret = __gfs2_iomap_alloc(inode, iomap, mp);
1071			if (ret) {
1072				gfs2_trans_end(sdp);
1073				gfs2_inplace_release(ip);
1074				punch_hole(ip, iomap->offset, iomap->length);
1075				goto out_qunlock;
1076			}
1077		}
1078
1079		tr = current->journal_info;
1080		if (tr->tr_num_buf_new)
1081			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1082
1083		gfs2_trans_end(sdp);
1084	}
1085
1086	return 0;
1087
1088out_trans_end:
1089	gfs2_trans_end(sdp);
1090out_trans_fail:
1091	gfs2_inplace_release(ip);
1092out_qunlock:
1093	gfs2_quota_unlock(ip);
1094	return ret;
1095}
1096
1097static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
1098			    unsigned flags, struct iomap *iomap,
1099			    struct iomap *srcmap)
1100{
1101	struct gfs2_inode *ip = GFS2_I(inode);
1102	struct metapath mp = { .mp_aheight = 1, };
1103	int ret;
1104
1105	if (gfs2_is_jdata(ip))
1106		iomap->flags |= IOMAP_F_BUFFER_HEAD;
1107
1108	trace_gfs2_iomap_start(ip, pos, length, flags);
1109	ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
1110	if (ret)
1111		goto out_unlock;
1112
1113	switch(flags & (IOMAP_WRITE | IOMAP_ZERO)) {
1114	case IOMAP_WRITE:
1115		if (flags & IOMAP_DIRECT) {
1116			/*
1117			 * Silently fall back to buffered I/O for stuffed files
1118			 * or if we've got a hole (see gfs2_file_direct_write).
1119			 */
1120			if (iomap->type != IOMAP_MAPPED)
1121				ret = -ENOTBLK;
1122			goto out_unlock;
1123		}
1124		break;
1125	case IOMAP_ZERO:
1126		if (iomap->type == IOMAP_HOLE)
1127			goto out_unlock;
1128		break;
1129	default:
1130		goto out;
1131	}
1132
1133	ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
1134	if (ret)
1135		goto out_unlock;
1136
1137out:
1138	if (iomap->type == IOMAP_INLINE) {
1139		iomap->private = metapath_dibh(&mp);
1140		get_bh(iomap->private);
1141	}
1142
1143out_unlock:
1144	release_metapath(&mp);
1145	trace_gfs2_iomap_end(ip, iomap, ret);
1146	return ret;
1147}
1148
1149static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
1150			  ssize_t written, unsigned flags, struct iomap *iomap)
1151{
1152	struct gfs2_inode *ip = GFS2_I(inode);
1153	struct gfs2_sbd *sdp = GFS2_SB(inode);
1154
1155	if (iomap->private)
1156		brelse(iomap->private);
1157
1158	switch (flags & (IOMAP_WRITE | IOMAP_ZERO)) {
1159	case IOMAP_WRITE:
1160		if (flags & IOMAP_DIRECT)
1161			return 0;
1162		break;
1163	case IOMAP_ZERO:
1164		 if (iomap->type == IOMAP_HOLE)
1165			 return 0;
1166		 break;
1167	default:
1168		 return 0;
1169	}
1170
1171	if (!gfs2_is_stuffed(ip))
1172		gfs2_ordered_add_inode(ip);
1173
1174	if (inode == sdp->sd_rindex)
1175		adjust_fs_space(inode);
1176
1177	gfs2_inplace_release(ip);
1178
1179	if (ip->i_qadata && ip->i_qadata->qa_qd_num)
1180		gfs2_quota_unlock(ip);
1181
1182	if (length != written && (iomap->flags & IOMAP_F_NEW)) {
1183		/* Deallocate blocks that were just allocated. */
1184		loff_t hstart = round_up(pos + written, i_blocksize(inode));
1185		loff_t hend = iomap->offset + iomap->length;
1186
1187		if (hstart < hend) {
1188			truncate_pagecache_range(inode, hstart, hend - 1);
1189			punch_hole(ip, hstart, hend - hstart);
1190		}
1191	}
1192
1193	if (unlikely(!written))
1194		return 0;
1195
1196	if (iomap->flags & IOMAP_F_SIZE_CHANGED)
1197		mark_inode_dirty(inode);
1198	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
1199	return 0;
1200}
1201
1202const struct iomap_ops gfs2_iomap_ops = {
1203	.iomap_begin = gfs2_iomap_begin,
1204	.iomap_end = gfs2_iomap_end,
1205};
1206
1207/**
1208 * gfs2_block_map - Map one or more blocks of an inode to a disk block
1209 * @inode: The inode
1210 * @lblock: The logical block number
1211 * @bh_map: The bh to be mapped
1212 * @create: True if its ok to alloc blocks to satify the request
1213 *
1214 * The size of the requested mapping is defined in bh_map->b_size.
1215 *
1216 * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged
1217 * when @lblock is not mapped.  Sets buffer_mapped(bh_map) and
1218 * bh_map->b_size to indicate the size of the mapping when @lblock and
1219 * successive blocks are mapped, up to the requested size.
1220 *
1221 * Sets buffer_boundary() if a read of metadata will be required
1222 * before the next block can be mapped. Sets buffer_new() if new
1223 * blocks were allocated.
1224 *
1225 * Returns: errno
1226 */
1227
1228int gfs2_block_map(struct inode *inode, sector_t lblock,
1229		   struct buffer_head *bh_map, int create)
1230{
1231	struct gfs2_inode *ip = GFS2_I(inode);
1232	loff_t pos = (loff_t)lblock << inode->i_blkbits;
1233	loff_t length = bh_map->b_size;
1234	struct iomap iomap = { };
1235	int ret;
1236
1237	clear_buffer_mapped(bh_map);
1238	clear_buffer_new(bh_map);
1239	clear_buffer_boundary(bh_map);
1240	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
1241
1242	if (!create)
1243		ret = gfs2_iomap_get(inode, pos, length, &iomap);
1244	else
1245		ret = gfs2_iomap_alloc(inode, pos, length, &iomap);
1246	if (ret)
1247		goto out;
1248
1249	if (iomap.length > bh_map->b_size) {
1250		iomap.length = bh_map->b_size;
1251		iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
1252	}
1253	if (iomap.addr != IOMAP_NULL_ADDR)
1254		map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
1255	bh_map->b_size = iomap.length;
1256	if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
1257		set_buffer_boundary(bh_map);
1258	if (iomap.flags & IOMAP_F_NEW)
1259		set_buffer_new(bh_map);
1260
1261out:
1262	trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
1263	return ret;
1264}
1265
1266int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock,
1267		    unsigned int *extlen)
1268{
1269	unsigned int blkbits = inode->i_blkbits;
1270	struct iomap iomap = { };
1271	unsigned int len;
1272	int ret;
1273
1274	ret = gfs2_iomap_get(inode, lblock << blkbits, *extlen << blkbits,
1275			     &iomap);
1276	if (ret)
1277		return ret;
1278	if (iomap.type != IOMAP_MAPPED)
1279		return -EIO;
1280	*dblock = iomap.addr >> blkbits;
1281	len = iomap.length >> blkbits;
1282	if (len < *extlen)
1283		*extlen = len;
1284	return 0;
1285}
1286
1287int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock,
1288		      unsigned int *extlen, bool *new)
1289{
1290	unsigned int blkbits = inode->i_blkbits;
1291	struct iomap iomap = { };
1292	unsigned int len;
1293	int ret;
1294
1295	ret = gfs2_iomap_alloc(inode, lblock << blkbits, *extlen << blkbits,
1296			       &iomap);
1297	if (ret)
1298		return ret;
1299	if (iomap.type != IOMAP_MAPPED)
1300		return -EIO;
1301	*dblock = iomap.addr >> blkbits;
1302	len = iomap.length >> blkbits;
1303	if (len < *extlen)
1304		*extlen = len;
1305	*new = iomap.flags & IOMAP_F_NEW;
1306	return 0;
1307}
1308
1309/*
1310 * NOTE: Never call gfs2_block_zero_range with an open transaction because it
1311 * uses iomap write to perform its actions, which begin their own transactions
1312 * (iomap_begin, get_folio, etc.)
1313 */
1314static int gfs2_block_zero_range(struct inode *inode, loff_t from, loff_t length)
1315{
1316	BUG_ON(current->journal_info);
1317	if (from >= inode->i_size)
1318		return 0;
1319	length = min(length, inode->i_size - from);
1320	return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops,
1321			&gfs2_iomap_write_ops, NULL);
1322}
1323
1324#define GFS2_JTRUNC_REVOKES 8192
1325
1326/**
1327 * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
1328 * @inode: The inode being truncated
1329 * @oldsize: The original (larger) size
1330 * @newsize: The new smaller size
1331 *
1332 * With jdata files, we have to journal a revoke for each block which is
1333 * truncated. As a result, we need to split this into separate transactions
1334 * if the number of pages being truncated gets too large.
1335 */
1336
1337static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
1338{
1339	struct gfs2_sbd *sdp = GFS2_SB(inode);
1340	u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
1341	u64 chunk;
1342	int error;
1343
1344	while (oldsize != newsize) {
1345		struct gfs2_trans *tr;
1346		unsigned int offs;
1347
1348		chunk = oldsize - newsize;
1349		if (chunk > max_chunk)
1350			chunk = max_chunk;
1351
1352		offs = oldsize & ~PAGE_MASK;
1353		if (offs && chunk > PAGE_SIZE)
1354			chunk = offs + ((chunk - offs) & PAGE_MASK);
1355
1356		truncate_pagecache(inode, oldsize - chunk);
1357		oldsize -= chunk;
1358
1359		tr = current->journal_info;
1360		if (!test_bit(TR_TOUCHED, &tr->tr_flags))
1361			continue;
1362
1363		gfs2_trans_end(sdp);
1364		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
1365		if (error)
1366			return error;
1367	}
1368
1369	return 0;
1370}
1371
1372static int trunc_start(struct inode *inode, u64 newsize)
1373{
1374	struct gfs2_inode *ip = GFS2_I(inode);
1375	struct gfs2_sbd *sdp = GFS2_SB(inode);
1376	struct buffer_head *dibh = NULL;
1377	int journaled = gfs2_is_jdata(ip);
1378	u64 oldsize = inode->i_size;
1379	int error;
1380
1381	if (!gfs2_is_stuffed(ip)) {
1382		unsigned int blocksize = i_blocksize(inode);
1383		unsigned int offs = newsize & (blocksize - 1);
1384		if (offs) {
1385			error = gfs2_block_zero_range(inode, newsize,
1386						      blocksize - offs);
1387			if (error)
1388				return error;
1389		}
1390	}
1391	if (journaled)
1392		error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1393	else
1394		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1395	if (error)
1396		return error;
1397
1398	error = gfs2_meta_inode_buffer(ip, &dibh);
1399	if (error)
1400		goto out;
1401
1402	gfs2_trans_add_meta(ip->i_gl, dibh);
1403
1404	if (gfs2_is_stuffed(ip))
1405		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1406	else
1407		ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1408
1409	i_size_write(inode, newsize);
1410	inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
1411	gfs2_dinode_out(ip, dibh->b_data);
1412
1413	if (journaled)
1414		error = gfs2_journaled_truncate(inode, oldsize, newsize);
1415	else
1416		truncate_pagecache(inode, newsize);
1417
1418out:
1419	brelse(dibh);
1420	if (current->journal_info)
1421		gfs2_trans_end(sdp);
1422	return error;
1423}
1424
1425int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
1426		   struct iomap *iomap)
1427{
1428	struct metapath mp = { .mp_aheight = 1, };
1429	int ret;
1430
1431	ret = __gfs2_iomap_get(inode, pos, length, 0, iomap, &mp);
1432	release_metapath(&mp);
1433	return ret;
1434}
1435
1436int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length,
1437		     struct iomap *iomap)
1438{
1439	struct metapath mp = { .mp_aheight = 1, };
1440	int ret;
1441
1442	ret = __gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
1443	if (!ret && iomap->type == IOMAP_HOLE)
1444		ret = __gfs2_iomap_alloc(inode, iomap, &mp);
1445	release_metapath(&mp);
1446	return ret;
1447}
1448
1449/**
1450 * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
1451 * @ip: inode
1452 * @rd_gh: holder of resource group glock
1453 * @bh: buffer head to sweep
1454 * @start: starting point in bh
1455 * @end: end point in bh
1456 * @meta: true if bh points to metadata (rather than data)
1457 * @btotal: place to keep count of total blocks freed
1458 *
1459 * We sweep a metadata buffer (provided by the metapath) for blocks we need to
1460 * free, and free them all. However, we do it one rgrp at a time. If this
1461 * block has references to multiple rgrps, we break it into individual
1462 * transactions. This allows other processes to use the rgrps while we're
1463 * focused on a single one, for better concurrency / performance.
1464 * At every transaction boundary, we rewrite the inode into the journal.
1465 * That way the bitmaps are kept consistent with the inode and we can recover
1466 * if we're interrupted by power-outages.
1467 *
1468 * Returns: 0, or return code if an error occurred.
1469 *          *btotal has the total number of blocks freed
1470 */
1471static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
1472			      struct buffer_head *bh, __be64 *start, __be64 *end,
1473			      bool meta, u32 *btotal)
1474{
1475	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1476	struct gfs2_rgrpd *rgd;
1477	struct gfs2_trans *tr;
1478	__be64 *p;
1479	int blks_outside_rgrp;
1480	u64 bn, bstart, isize_blks;
1481	s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
1482	int ret = 0;
1483	bool buf_in_tr = false; /* buffer was added to transaction */
1484
1485more_rgrps:
1486	rgd = NULL;
1487	if (gfs2_holder_initialized(rd_gh)) {
1488		rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
1489		gfs2_assert_withdraw(sdp,
1490			     gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
1491	}
1492	blks_outside_rgrp = 0;
1493	bstart = 0;
1494	blen = 0;
1495
1496	for (p = start; p < end; p++) {
1497		if (!*p)
1498			continue;
1499		bn = be64_to_cpu(*p);
1500
1501		if (rgd) {
1502			if (!rgrp_contains_block(rgd, bn)) {
1503				blks_outside_rgrp++;
1504				continue;
1505			}
1506		} else {
1507			rgd = gfs2_blk2rgrpd(sdp, bn, true);
1508			if (unlikely(!rgd)) {
1509				ret = -EIO;
1510				goto out;
1511			}
1512			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
1513						 LM_FLAG_NODE_SCOPE, rd_gh);
1514			if (ret)
1515				goto out;
1516
1517			/* Must be done with the rgrp glock held: */
1518			if (gfs2_rs_active(&ip->i_res) &&
1519			    rgd == ip->i_res.rs_rgd)
1520				gfs2_rs_deltree(&ip->i_res);
1521		}
1522
1523		/* The size of our transactions will be unknown until we
1524		   actually process all the metadata blocks that relate to
1525		   the rgrp. So we estimate. We know it can't be more than
1526		   the dinode's i_blocks and we don't want to exceed the
1527		   journal flush threshold, sd_log_thresh2. */
1528		if (current->journal_info == NULL) {
1529			unsigned int jblocks_rqsted, revokes;
1530
1531			jblocks_rqsted = rgd->rd_length + RES_DINODE +
1532				RES_INDIRECT;
1533			isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
1534			if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
1535				jblocks_rqsted +=
1536					atomic_read(&sdp->sd_log_thresh2);
1537			else
1538				jblocks_rqsted += isize_blks;
1539			revokes = jblocks_rqsted;
1540			if (meta)
1541				revokes += end - start;
1542			else if (ip->i_depth)
1543				revokes += sdp->sd_inptrs;
1544			ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
1545			if (ret)
1546				goto out_unlock;
1547			down_write(&ip->i_rw_mutex);
1548		}
1549		/* check if we will exceed the transaction blocks requested */
1550		tr = current->journal_info;
1551		if (tr->tr_num_buf_new + RES_STATFS +
1552		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
1553			/* We set blks_outside_rgrp to ensure the loop will
1554			   be repeated for the same rgrp, but with a new
1555			   transaction. */
1556			blks_outside_rgrp++;
1557			/* This next part is tricky. If the buffer was added
1558			   to the transaction, we've already set some block
1559			   pointers to 0, so we better follow through and free
1560			   them, or we will introduce corruption (so break).
1561			   This may be impossible, or at least rare, but I
1562			   decided to cover the case regardless.
1563
1564			   If the buffer was not added to the transaction
1565			   (this call), doing so would exceed our transaction
1566			   size, so we need to end the transaction and start a
1567			   new one (so goto). */
1568
1569			if (buf_in_tr)
1570				break;
1571			goto out_unlock;
1572		}
1573
1574		gfs2_trans_add_meta(ip->i_gl, bh);
1575		buf_in_tr = true;
1576		*p = 0;
1577		if (bstart + blen == bn) {
1578			blen++;
1579			continue;
1580		}
1581		if (bstart) {
1582			__gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
1583			(*btotal) += blen;
1584			gfs2_add_inode_blocks(&ip->i_inode, -blen);
1585		}
1586		bstart = bn;
1587		blen = 1;
1588	}
1589	if (bstart) {
1590		__gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
1591		(*btotal) += blen;
1592		gfs2_add_inode_blocks(&ip->i_inode, -blen);
1593	}
1594out_unlock:
1595	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
1596					    outside the rgrp we just processed,
1597					    do it all over again. */
1598		if (current->journal_info) {
1599			struct buffer_head *dibh;
1600
1601			ret = gfs2_meta_inode_buffer(ip, &dibh);
1602			if (ret)
1603				goto out;
1604
1605			/* Every transaction boundary, we rewrite the dinode
1606			   to keep its di_blocks current in case of failure. */
1607			inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
1608			gfs2_trans_add_meta(ip->i_gl, dibh);
1609			gfs2_dinode_out(ip, dibh->b_data);
1610			brelse(dibh);
1611			up_write(&ip->i_rw_mutex);
1612			gfs2_trans_end(sdp);
1613			buf_in_tr = false;
1614		}
1615		gfs2_glock_dq_uninit(rd_gh);
1616		cond_resched();
1617		goto more_rgrps;
1618	}
1619out:
1620	return ret;
1621}
1622
1623static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
1624{
1625	if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
1626		return false;
1627	return true;
1628}
1629
1630/**
1631 * find_nonnull_ptr - find a non-null pointer given a metapath and height
1632 * @sdp: The superblock
1633 * @mp: starting metapath
1634 * @h: desired height to search
1635 * @end_list: See punch_hole().
1636 * @end_aligned: See punch_hole().
1637 *
1638 * Assumes the metapath is valid (with buffers) out to height h.
1639 * Returns: true if a non-null pointer was found in the metapath buffer
1640 *          false if all remaining pointers are NULL in the buffer
1641 */
1642static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1643			     unsigned int h,
1644			     __u16 *end_list, unsigned int end_aligned)
1645{
1646	struct buffer_head *bh = mp->mp_bh[h];
1647	__be64 *first, *ptr, *end;
1648
1649	first = metaptr1(h, mp);
1650	ptr = first + mp->mp_list[h];
1651	end = (__be64 *)(bh->b_data + bh->b_size);
1652	if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
1653		bool keep_end = h < end_aligned;
1654		end = first + end_list[h] + keep_end;
1655	}
1656
1657	while (ptr < end) {
1658		if (*ptr) { /* if we have a non-null pointer */
1659			mp->mp_list[h] = ptr - first;
1660			h++;
1661			if (h < GFS2_MAX_META_HEIGHT)
1662				mp->mp_list[h] = 0;
1663			return true;
1664		}
1665		ptr++;
1666	}
1667	return false;
1668}
1669
/* States of the deallocation loop in punch_hole(); numbered from 0. */
enum dealloc_states {
	DEALLOC_MP_FULL,	/* Strip a metapath with all buffers read in */
	DEALLOC_MP_LOWER,	/* lower the metapath strip height */
	DEALLOC_FILL_MP,	/* Fill in the metapath to the given height. */
	DEALLOC_DONE,		/* process complete */
};
1676
1677static inline void
1678metapointer_range(struct metapath *mp, int height,
1679		  __u16 *start_list, unsigned int start_aligned,
1680		  __u16 *end_list, unsigned int end_aligned,
1681		  __be64 **start, __be64 **end)
1682{
1683	struct buffer_head *bh = mp->mp_bh[height];
1684	__be64 *first;
1685
1686	first = metaptr1(height, mp);
1687	*start = first;
1688	if (mp_eq_to_hgt(mp, start_list, height)) {
1689		bool keep_start = height < start_aligned;
1690		*start = first + start_list[height] + keep_start;
1691	}
1692	*end = (__be64 *)(bh->b_data + bh->b_size);
1693	if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
1694		bool keep_end = height < end_aligned;
1695		*end = first + end_list[height] + keep_end;
1696	}
1697}
1698
1699static inline bool walk_done(struct gfs2_sbd *sdp,
1700			     struct metapath *mp, int height,
1701			     __u16 *end_list, unsigned int end_aligned)
1702{
1703	__u16 end;
1704
1705	if (end_list) {
1706		bool keep_end = height < end_aligned;
1707		if (!mp_eq_to_hgt(mp, end_list, height))
1708			return false;
1709		end = end_list[height] + keep_end;
1710	} else
1711		end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
1712	return mp->mp_list[height] >= end;
1713}
1714
1715/**
1716 * punch_hole - deallocate blocks in a file
1717 * @ip: inode to truncate
1718 * @offset: the start of the hole
1719 * @length: the size of the hole (or 0 for truncate)
1720 *
1721 * Punch a hole into a file or truncate a file at a given position.  This
1722 * function operates in whole blocks (@offset and @length are rounded
1723 * accordingly); partially filled blocks must be cleared otherwise.
1724 *
1725 * This function works from the bottom up, and from the right to the left. In
1726 * other words, it strips off the highest layer (data) before stripping any of
1727 * the metadata. Doing it this way is best in case the operation is interrupted
1728 * by power failure, etc.  The dinode is rewritten in every transaction to
1729 * guarantee integrity.
1730 */
1731static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
1732{
1733	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1734	u64 maxsize = sdp->sd_heightsize[ip->i_height];
1735	struct metapath mp = {};
1736	struct buffer_head *dibh, *bh;
1737	struct gfs2_holder rd_gh;
1738	unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
1739	unsigned int bsize = 1 << bsize_shift;
1740	u64 lblock = (offset + bsize - 1) >> bsize_shift;
1741	__u16 start_list[GFS2_MAX_META_HEIGHT];
1742	__u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
1743	unsigned int start_aligned, end_aligned;
1744	unsigned int strip_h = ip->i_height - 1;
1745	u32 btotal = 0;
1746	int ret, state;
1747	int mp_h; /* metapath buffers are read in to this height */
1748	u64 prev_bnr = 0;
1749	__be64 *start, *end;
1750
1751	if (offset + bsize - 1 >= maxsize) {
1752		/*
1753		 * The starting point lies beyond the allocated metadata;
1754		 * there are no blocks to deallocate.
1755		 */
1756		return 0;
1757	}
1758
1759	/*
1760	 * The start position of the hole is defined by lblock, start_list, and
1761	 * start_aligned.  The end position of the hole is defined by lend,
1762	 * end_list, and end_aligned.
1763	 *
1764	 * start_aligned and end_aligned define down to which height the start
1765	 * and end positions are aligned to the metadata tree (i.e., the
1766	 * position is a multiple of the metadata granularity at the height
1767	 * above).  This determines at which heights additional meta pointers
1768	 * needs to be preserved for the remaining data.
1769	 */
1770
1771	if (length) {
1772		u64 end_offset = offset + length;
1773		u64 lend;
1774
1775		/*
1776		 * Clip the end at the maximum file size for the given height:
1777		 * that's how far the metadata goes; files bigger than that
1778		 * will have additional layers of indirection.
1779		 */
1780		if (end_offset > maxsize)
1781			end_offset = maxsize;
1782		lend = end_offset >> bsize_shift;
1783
1784		if (lblock >= lend)
1785			return 0;
1786
1787		find_metapath(sdp, lend, &mp, ip->i_height);
1788		end_list = __end_list;
1789		memcpy(end_list, mp.mp_list, sizeof(mp.mp_list));
1790
1791		for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1792			if (end_list[mp_h])
1793				break;
1794		}
1795		end_aligned = mp_h;
1796	}
1797
1798	find_metapath(sdp, lblock, &mp, ip->i_height);
1799	memcpy(start_list, mp.mp_list, sizeof(start_list));
1800
1801	for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1802		if (start_list[mp_h])
1803			break;
1804	}
1805	start_aligned = mp_h;
1806
1807	ret = gfs2_meta_inode_buffer(ip, &dibh);
1808	if (ret)
1809		return ret;
1810
1811	mp.mp_bh[0] = dibh;
1812	ret = lookup_metapath(ip, &mp);
1813	if (ret)
1814		goto out_metapath;
1815
1816	/* issue read-ahead on metadata */
1817	for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) {
1818		metapointer_range(&mp, mp_h, start_list, start_aligned,
1819				  end_list, end_aligned, &start, &end);
1820		gfs2_metapath_ra(ip->i_gl, start, end);
1821	}
1822
1823	if (mp.mp_aheight == ip->i_height)
1824		state = DEALLOC_MP_FULL; /* We have a complete metapath */
1825	else
1826		state = DEALLOC_FILL_MP; /* deal with partial metapath */
1827
1828	ret = gfs2_rindex_update(sdp);
1829	if (ret)
1830		goto out_metapath;
1831
1832	ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1833	if (ret)
1834		goto out_metapath;
1835	gfs2_holder_mark_uninitialized(&rd_gh);
1836
1837	mp_h = strip_h;
1838
1839	while (state != DEALLOC_DONE) {
1840		switch (state) {
1841		/* Truncate a full metapath at the given strip height.
1842		 * Note that strip_h == mp_h in order to be in this state. */
1843		case DEALLOC_MP_FULL:
1844			bh = mp.mp_bh[mp_h];
1845			gfs2_assert_withdraw(sdp, bh);
1846			if (gfs2_assert_withdraw(sdp,
1847						 prev_bnr != bh->b_blocknr)) {
1848				fs_emerg(sdp, "inode %llu, block:%llu, i_h:%u, "
1849					 "s_h:%u, mp_h:%u\n",
1850				       (unsigned long long)ip->i_no_addr,
1851				       prev_bnr, ip->i_height, strip_h, mp_h);
1852			}
1853			prev_bnr = bh->b_blocknr;
1854
1855			if (gfs2_metatype_check(sdp, bh,
1856						(mp_h ? GFS2_METATYPE_IN :
1857							GFS2_METATYPE_DI))) {
1858				ret = -EIO;
1859				goto out;
1860			}
1861
1862			/*
1863			 * Below, passing end_aligned as 0 gives us the
1864			 * metapointer range excluding the end point: the end
1865			 * point is the first metapath we must not deallocate!
1866			 */
1867
1868			metapointer_range(&mp, mp_h, start_list, start_aligned,
1869					  end_list, 0 /* end_aligned */,
1870					  &start, &end);
1871			ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h],
1872						 start, end,
1873						 mp_h != ip->i_height - 1,
1874						 &btotal);
1875
1876			/* If we hit an error or just swept dinode buffer,
1877			   just exit. */
1878			if (ret || !mp_h) {
1879				state = DEALLOC_DONE;
1880				break;
1881			}
1882			state = DEALLOC_MP_LOWER;
1883			break;
1884
1885		/* lower the metapath strip height */
1886		case DEALLOC_MP_LOWER:
1887			/* We're done with the current buffer, so release it,
1888			   unless it's the dinode buffer. Then back up to the
1889			   previous pointer. */
1890			if (mp_h) {
1891				brelse(mp.mp_bh[mp_h]);
1892				mp.mp_bh[mp_h] = NULL;
1893			}
1894			/* If we can't get any lower in height, we've stripped
1895			   off all we can. Next step is to back up and start
1896			   stripping the previous level of metadata. */
1897			if (mp_h == 0) {
1898				strip_h--;
1899				memcpy(mp.mp_list, start_list, sizeof(start_list));
1900				mp_h = strip_h;
1901				state = DEALLOC_FILL_MP;
1902				break;
1903			}
1904			mp.mp_list[mp_h] = 0;
1905			mp_h--; /* search one metadata height down */
1906			mp.mp_list[mp_h]++;
1907			if (walk_done(sdp, &mp, mp_h, end_list, end_aligned))
1908				break;
1909			/* Here we've found a part of the metapath that is not
1910			 * allocated. We need to search at that height for the
1911			 * next non-null pointer. */
1912			if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) {
1913				state = DEALLOC_FILL_MP;
1914				mp_h++;
1915			}
1916			/* No more non-null pointers at this height. Back up
1917			   to the previous height and try again. */
1918			break; /* loop around in the same state */
1919
1920		/* Fill the metapath with buffers to the given height. */
1921		case DEALLOC_FILL_MP:
1922			/* Fill the buffers out to the current height. */
1923			ret = fillup_metapath(ip, &mp, mp_h);
1924			if (ret < 0)
1925				goto out;
1926
1927			/* On the first pass, issue read-ahead on metadata. */
1928			if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) {
1929				unsigned int height = mp.mp_aheight - 1;
1930
1931				/* No read-ahead for data blocks. */
1932				if (mp.mp_aheight - 1 == strip_h)
1933					height--;
1934
1935				for (; height >= mp.mp_aheight - ret; height--) {
1936					metapointer_range(&mp, height,
1937							  start_list, start_aligned,
1938							  end_list, end_aligned,
1939							  &start, &end);
1940					gfs2_metapath_ra(ip->i_gl, start, end);
1941				}
1942			}
1943
1944			/* If buffers found for the entire strip height */
1945			if (mp.mp_aheight - 1 == strip_h) {
1946				state = DEALLOC_MP_FULL;
1947				break;
1948			}
1949			if (mp.mp_aheight < ip->i_height) /* We have a partial height */
1950				mp_h = mp.mp_aheight - 1;
1951
1952			/* If we find a non-null block pointer, crawl a bit
1953			   higher up in the metapath and try again, otherwise
1954			   we need to look lower for a new starting point. */
1955			if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned))
1956				mp_h++;
1957			else
1958				state = DEALLOC_MP_LOWER;
1959			break;
1960		}
1961	}
1962
1963	if (btotal) {
1964		if (current->journal_info == NULL) {
1965			ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1966					       RES_QUOTA, 0);
1967			if (ret)
1968				goto out;
1969			down_write(&ip->i_rw_mutex);
1970		}
1971		gfs2_statfs_change(sdp, 0, +btotal, 0);
1972		gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
1973				  ip->i_inode.i_gid);
1974		inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
1975		gfs2_trans_add_meta(ip->i_gl, dibh);
1976		gfs2_dinode_out(ip, dibh->b_data);
1977		up_write(&ip->i_rw_mutex);
1978		gfs2_trans_end(sdp);
1979	}
1980
1981out:
1982	if (gfs2_holder_initialized(&rd_gh))
1983		gfs2_glock_dq_uninit(&rd_gh);
1984	if (current->journal_info) {
1985		up_write(&ip->i_rw_mutex);
1986		gfs2_trans_end(sdp);
1987		cond_resched();
1988	}
1989	gfs2_quota_unhold(ip);
1990out_metapath:
1991	release_metapath(&mp);
1992	return ret;
1993}
1994
1995static int trunc_end(struct gfs2_inode *ip)
1996{
1997	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1998	struct buffer_head *dibh;
1999	int error;
2000
2001	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2002	if (error)
2003		return error;
2004
2005	down_write(&ip->i_rw_mutex);
2006
2007	error = gfs2_meta_inode_buffer(ip, &dibh);
2008	if (error)
2009		goto out;
2010
2011	if (!i_size_read(&ip->i_inode)) {
2012		ip->i_height = 0;
2013		ip->i_goal = ip->i_no_addr;
2014		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
2015		gfs2_ordered_del_inode(ip);
2016	}
2017	inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
2018	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
2019
2020	gfs2_trans_add_meta(ip->i_gl, dibh);
2021	gfs2_dinode_out(ip, dibh->b_data);
2022	brelse(dibh);
2023
2024out:
2025	up_write(&ip->i_rw_mutex);
2026	gfs2_trans_end(sdp);
2027	return error;
2028}
2029
2030/**
2031 * do_shrink - make a file smaller
2032 * @inode: the inode
2033 * @newsize: the size to make the file
2034 *
2035 * Called with an exclusive lock on @inode. The @size must
2036 * be equal to or smaller than the current inode size.
2037 *
2038 * Returns: errno
2039 */
2040
2041static int do_shrink(struct inode *inode, u64 newsize)
2042{
2043	struct gfs2_inode *ip = GFS2_I(inode);
2044	int error;
2045
2046	error = trunc_start(inode, newsize);
2047	if (error < 0)
2048		return error;
2049	if (gfs2_is_stuffed(ip))
2050		return 0;
2051
2052	error = punch_hole(ip, newsize, 0);
2053	if (error == 0)
2054		error = trunc_end(ip);
2055
2056	return error;
2057}
2058
2059/**
2060 * do_grow - Touch and update inode size
2061 * @inode: The inode
2062 * @size: The new size
2063 *
2064 * This function updates the timestamps on the inode and
2065 * may also increase the size of the inode. This function
2066 * must not be called with @size any smaller than the current
2067 * inode size.
2068 *
2069 * Although it is not strictly required to unstuff files here,
2070 * earlier versions of GFS2 have a bug in the stuffed file reading
2071 * code which will result in a buffer overrun if the size is larger
2072 * than the max stuffed file size. In order to prevent this from
2073 * occurring, such files are unstuffed, but in other cases we can
2074 * just update the inode size directly.
2075 *
2076 * Returns: 0 on success, or -ve on error
2077 */
2078
2079static int do_grow(struct inode *inode, u64 size)
2080{
2081	struct gfs2_inode *ip = GFS2_I(inode);
2082	struct gfs2_sbd *sdp = GFS2_SB(inode);
2083	struct gfs2_alloc_parms ap = { .target = 1, };
2084	struct buffer_head *dibh;
2085	int error;
2086	int unstuff = 0;
2087
2088	if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) {
2089		error = gfs2_quota_lock_check(ip, &ap);
2090		if (error)
2091			return error;
2092
2093		error = gfs2_inplace_reserve(ip, &ap);
2094		if (error)
2095			goto do_grow_qunlock;
2096		unstuff = 1;
2097	}
2098
2099	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
2100				 (unstuff &&
2101				  gfs2_is_jdata(ip) ? RES_JDATA : 0) +
2102				 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
2103				  0 : RES_QUOTA), 0);
2104	if (error)
2105		goto do_grow_release;
2106
2107	if (unstuff) {
2108		error = gfs2_unstuff_dinode(ip);
2109		if (error)
2110			goto do_end_trans;
2111	}
2112
2113	error = gfs2_meta_inode_buffer(ip, &dibh);
2114	if (error)
2115		goto do_end_trans;
2116
2117	truncate_setsize(inode, size);
2118	inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
2119	gfs2_trans_add_meta(ip->i_gl, dibh);
2120	gfs2_dinode_out(ip, dibh->b_data);
2121	brelse(dibh);
2122
2123do_end_trans:
2124	gfs2_trans_end(sdp);
2125do_grow_release:
2126	if (unstuff) {
2127		gfs2_inplace_release(ip);
2128do_grow_qunlock:
2129		gfs2_quota_unlock(ip);
2130	}
2131	return error;
2132}
2133
2134/**
2135 * gfs2_setattr_size - make a file a given size
2136 * @inode: the inode
2137 * @newsize: the size to make the file
2138 *
2139 * The file size can grow, shrink, or stay the same size. This
2140 * is called holding i_rwsem and an exclusive glock on the inode
2141 * in question.
2142 *
2143 * Returns: errno
2144 */
2145
2146int gfs2_setattr_size(struct inode *inode, u64 newsize)
2147{
2148	struct gfs2_inode *ip = GFS2_I(inode);
2149	int ret;
2150
2151	BUG_ON(!S_ISREG(inode->i_mode));
2152
2153	ret = inode_newsize_ok(inode, newsize);
2154	if (ret)
2155		return ret;
2156
2157	inode_dio_wait(inode);
2158
2159	ret = gfs2_qa_get(ip);
2160	if (ret)
2161		goto out;
2162
2163	if (newsize >= inode->i_size) {
2164		ret = do_grow(inode, newsize);
2165		goto out;
2166	}
2167
2168	ret = do_shrink(inode, newsize);
2169out:
2170	gfs2_rs_delete(ip);
2171	gfs2_qa_put(ip);
2172	return ret;
2173}
2174
2175int gfs2_truncatei_resume(struct gfs2_inode *ip)
2176{
2177	int error;
2178	error = punch_hole(ip, i_size_read(&ip->i_inode), 0);
2179	if (!error)
2180		error = trunc_end(ip);
2181	return error;
2182}
2183
2184int gfs2_file_dealloc(struct gfs2_inode *ip)
2185{
2186	return punch_hole(ip, 0, 0);
2187}
2188
2189/**
2190 * gfs2_free_journal_extents - Free cached journal bmap info
2191 * @jd: The journal
2192 *
2193 */
2194
2195void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
2196{
2197	struct gfs2_journal_extent *jext;
2198
2199	while(!list_empty(&jd->extent_list)) {
2200		jext = list_first_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2201		list_del(&jext->list);
2202		kfree(jext);
2203	}
2204}
2205
2206/**
2207 * gfs2_add_jextent - Add or merge a new extent to extent cache
2208 * @jd: The journal descriptor
2209 * @lblock: The logical block at start of new extent
2210 * @dblock: The physical block at start of new extent
2211 * @blocks: Size of extent in fs blocks
2212 *
2213 * Returns: 0 on success or -ENOMEM
2214 */
2215
2216static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
2217{
2218	struct gfs2_journal_extent *jext;
2219
2220	if (!list_empty(&jd->extent_list)) {
2221		jext = list_last_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2222		if ((jext->dblock + jext->blocks) == dblock) {
2223			jext->blocks += blocks;
2224			return 0;
2225		}
2226	}
2227
2228	jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
2229	if (jext == NULL)
2230		return -ENOMEM;
2231	jext->dblock = dblock;
2232	jext->lblock = lblock;
2233	jext->blocks = blocks;
2234	list_add_tail(&jext->list, &jd->extent_list);
2235	jd->nr_extents++;
2236	return 0;
2237}
2238
2239/**
2240 * gfs2_map_journal_extents - Cache journal bmap info
2241 * @sdp: The super block
2242 * @jd: The journal to map
2243 *
2244 * Create a reusable "extent" mapping from all logical
2245 * blocks to all physical blocks for the given journal.  This will save
2246 * us time when writing journal blocks.  Most journals will have only one
2247 * extent that maps all their logical blocks.  That's because gfs2.mkfs
2248 * arranges the journal blocks sequentially to maximize performance.
2249 * So the extent would map the first block for the entire file length.
2250 * However, gfs2_jadd can happen while file activity is happening, so
2251 * those journals may not be sequential.  Less likely is the case where
2252 * the users created their own journals by mounting the metafs and
2253 * laying it out.  But it's still possible.  These journals might have
2254 * several extents.
2255 *
2256 * Returns: 0 on success, or error on failure
2257 */
2258
2259int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
2260{
2261	u64 lblock = 0;
2262	u64 lblock_stop;
2263	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
2264	struct buffer_head bh;
2265	unsigned int shift = sdp->sd_sb.sb_bsize_shift;
2266	u64 size;
2267	int rc;
2268	ktime_t start, end;
2269
2270	start = ktime_get();
2271	lblock_stop = i_size_read(jd->jd_inode) >> shift;
2272	size = (lblock_stop - lblock) << shift;
2273	jd->nr_extents = 0;
2274	WARN_ON(!list_empty(&jd->extent_list));
2275
2276	do {
2277		bh.b_state = 0;
2278		bh.b_blocknr = 0;
2279		bh.b_size = size;
2280		rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
2281		if (rc || !buffer_mapped(&bh))
2282			goto fail;
2283		rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
2284		if (rc)
2285			goto fail;
2286		size -= bh.b_size;
2287		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2288	} while(size > 0);
2289
2290	end = ktime_get();
2291	fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid,
2292		jd->nr_extents, ktime_ms_delta(end, start));
2293	return 0;
2294
2295fail:
2296	fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
2297		rc, jd->jd_jid,
2298		(unsigned long long)(i_size_read(jd->jd_inode) - size),
2299		jd->nr_extents);
2300	fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
2301		rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
2302		bh.b_state, (unsigned long long)bh.b_size);
2303	gfs2_free_journal_extents(jd);
2304	return rc;
2305}
2306
2307/**
2308 * gfs2_write_alloc_required - figure out if a write will require an allocation
2309 * @ip: the file being written to
2310 * @offset: the offset to write to
2311 * @len: the number of bytes being written
2312 *
2313 * Returns: 1 if an alloc is required, 0 otherwise
2314 */
2315
2316int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
2317			      unsigned int len)
2318{
2319	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2320	struct buffer_head bh;
2321	unsigned int shift;
2322	u64 lblock, lblock_stop, size;
2323	u64 end_of_file;
2324
2325	if (!len)
2326		return 0;
2327
2328	if (gfs2_is_stuffed(ip)) {
2329		if (offset + len > gfs2_max_stuffed_size(ip))
2330			return 1;
2331		return 0;
2332	}
2333
2334	shift = sdp->sd_sb.sb_bsize_shift;
2335	BUG_ON(gfs2_is_dir(ip));
2336	end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
2337	lblock = offset >> shift;
2338	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
2339	if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex))
2340		return 1;
2341
2342	size = (lblock_stop - lblock) << shift;
2343	do {
2344		bh.b_state = 0;
2345		bh.b_size = size;
2346		gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
2347		if (!buffer_mapped(&bh))
2348			return 1;
2349		size -= bh.b_size;
2350		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2351	} while(size > 0);
2352
2353	return 0;
2354}
2355
2356static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
2357{
2358	struct gfs2_inode *ip = GFS2_I(inode);
2359	struct buffer_head *dibh;
2360	int error;
2361
2362	if (offset >= inode->i_size)
2363		return 0;
2364	if (offset + length > inode->i_size)
2365		length = inode->i_size - offset;
2366
2367	error = gfs2_meta_inode_buffer(ip, &dibh);
2368	if (error)
2369		return error;
2370	gfs2_trans_add_meta(ip->i_gl, dibh);
2371	memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0,
2372	       length);
2373	brelse(dibh);
2374	return 0;
2375}
2376
2377static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset,
2378					 loff_t length)
2379{
2380	struct gfs2_sbd *sdp = GFS2_SB(inode);
2381	loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
2382	int error;
2383
2384	while (length) {
2385		struct gfs2_trans *tr;
2386		loff_t chunk;
2387		unsigned int offs;
2388
2389		chunk = length;
2390		if (chunk > max_chunk)
2391			chunk = max_chunk;
2392
2393		offs = offset & ~PAGE_MASK;
2394		if (offs && chunk > PAGE_SIZE)
2395			chunk = offs + ((chunk - offs) & PAGE_MASK);
2396
2397		truncate_pagecache_range(inode, offset, chunk);
2398		offset += chunk;
2399		length -= chunk;
2400
2401		tr = current->journal_info;
2402		if (!test_bit(TR_TOUCHED, &tr->tr_flags))
2403			continue;
2404
2405		gfs2_trans_end(sdp);
2406		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
2407		if (error)
2408			return error;
2409	}
2410	return 0;
2411}
2412
2413int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
2414{
2415	struct inode *inode = file_inode(file);
2416	struct gfs2_inode *ip = GFS2_I(inode);
2417	struct gfs2_sbd *sdp = GFS2_SB(inode);
2418	unsigned int blocksize = i_blocksize(inode);
2419	loff_t start, end;
2420	int error;
2421
2422	if (!gfs2_is_stuffed(ip)) {
2423		unsigned int start_off, end_len;
2424
2425		start_off = offset & (blocksize - 1);
2426		end_len = (offset + length) & (blocksize - 1);
2427		if (start_off) {
2428			unsigned int len = length;
2429			if (length > blocksize - start_off)
2430				len = blocksize - start_off;
2431			error = gfs2_block_zero_range(inode, offset, len);
2432			if (error)
2433				goto out;
2434			if (start_off + length < blocksize)
2435				end_len = 0;
2436		}
2437		if (end_len) {
2438			error = gfs2_block_zero_range(inode,
2439				offset + length - end_len, end_len);
2440			if (error)
2441				goto out;
2442		}
2443	}
2444
2445	start = round_down(offset, blocksize);
2446	end = round_up(offset + length, blocksize) - 1;
2447	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
2448	if (error)
2449		return error;
2450
2451	if (gfs2_is_jdata(ip))
2452		error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
2453					 GFS2_JTRUNC_REVOKES);
2454	else
2455		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2456	if (error)
2457		return error;
2458
2459	if (gfs2_is_stuffed(ip)) {
2460		error = stuffed_zero_range(inode, offset, length);
2461		if (error)
2462			goto out;
2463	}
2464
2465	if (gfs2_is_jdata(ip)) {
2466		BUG_ON(!current->journal_info);
2467		gfs2_journaled_truncate_range(inode, offset, length);
2468	} else
2469		truncate_pagecache_range(inode, offset, offset + length - 1);
2470
2471	file_update_time(file);
2472	mark_inode_dirty(inode);
2473
2474	if (current->journal_info)
2475		gfs2_trans_end(sdp);
2476
2477	if (!gfs2_is_stuffed(ip))
2478		error = punch_hole(ip, offset, length);
2479
2480out:
2481	if (current->journal_info)
2482		gfs2_trans_end(sdp);
2483	return error;
2484}
2485
2486static ssize_t gfs2_writeback_range(struct iomap_writepage_ctx *wpc,
2487		struct folio *folio, u64 offset, unsigned int len, u64 end_pos)
2488{
2489	if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(wpc->inode))))
2490		return -EIO;
2491
2492	if (offset < wpc->iomap.offset ||
2493	    offset >= wpc->iomap.offset + wpc->iomap.length) {
2494		int ret;
2495
2496		memset(&wpc->iomap, 0, sizeof(wpc->iomap));
2497		ret = gfs2_iomap_get(wpc->inode, offset, INT_MAX, &wpc->iomap);
2498		if (ret)
2499			return ret;
2500	}
2501
2502	return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
2503}
2504
2505const struct iomap_writeback_ops gfs2_writeback_ops = {
2506	.writeback_range	= gfs2_writeback_range,
2507	.writeback_submit	= iomap_ioend_writeback_submit,
2508};