fs/btrfs/compression.c at v5.18 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / btrfs / compression.c
at v5.18 1909 lines 52 kB view raw
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2008 Oracle.  All rights reserved.
   4 */
   5
   6#include <linux/kernel.h>
   7#include <linux/bio.h>
   8#include <linux/file.h>
   9#include <linux/fs.h>
  10#include <linux/pagemap.h>
  11#include <linux/highmem.h>
  12#include <linux/kthread.h>
  13#include <linux/time.h>
  14#include <linux/init.h>
  15#include <linux/string.h>
  16#include <linux/backing-dev.h>
  17#include <linux/writeback.h>
  18#include <linux/slab.h>
  19#include <linux/sched/mm.h>
  20#include <linux/log2.h>
  21#include <crypto/hash.h>
  22#include "misc.h"
  23#include "ctree.h"
  24#include "disk-io.h"
  25#include "transaction.h"
  26#include "btrfs_inode.h"
  27#include "volumes.h"
  28#include "ordered-data.h"
  29#include "compression.h"
  30#include "extent_io.h"
  31#include "extent_map.h"
  32#include "subpage.h"
  33#include "zoned.h"
  34
  35static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
  36
  37const char* btrfs_compress_type2str(enum btrfs_compression_type type)
  38{
  39	switch (type) {
  40	case BTRFS_COMPRESS_ZLIB:
  41	case BTRFS_COMPRESS_LZO:
  42	case BTRFS_COMPRESS_ZSTD:
  43	case BTRFS_COMPRESS_NONE:
  44		return btrfs_compress_types[type];
  45	default:
  46		break;
  47	}
  48
  49	return NULL;
  50}
  51
  52bool btrfs_compress_is_valid_type(const char *str, size_t len)
  53{
  54	int i;
  55
  56	for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
  57		size_t comp_len = strlen(btrfs_compress_types[i]);
  58
  59		if (len < comp_len)
  60			continue;
  61
  62		if (!strncmp(btrfs_compress_types[i], str, comp_len))
  63			return true;
  64	}
  65	return false;
  66}
  67
  68static int compression_compress_pages(int type, struct list_head *ws,
  69               struct address_space *mapping, u64 start, struct page **pages,
  70               unsigned long *out_pages, unsigned long *total_in,
  71               unsigned long *total_out)
  72{
  73	switch (type) {
  74	case BTRFS_COMPRESS_ZLIB:
  75		return zlib_compress_pages(ws, mapping, start, pages,
  76				out_pages, total_in, total_out);
  77	case BTRFS_COMPRESS_LZO:
  78		return lzo_compress_pages(ws, mapping, start, pages,
  79				out_pages, total_in, total_out);
  80	case BTRFS_COMPRESS_ZSTD:
  81		return zstd_compress_pages(ws, mapping, start, pages,
  82				out_pages, total_in, total_out);
  83	case BTRFS_COMPRESS_NONE:
  84	default:
  85		/*
  86		 * This can happen when compression races with remount setting
  87		 * it to 'no compress', while caller doesn't call
  88		 * inode_need_compress() to check if we really need to
  89		 * compress.
  90		 *
  91		 * Not a big deal, just need to inform caller that we
  92		 * haven't allocated any pages yet.
  93		 */
  94		*out_pages = 0;
  95		return -E2BIG;
  96	}
  97}
  98
  99static int compression_decompress_bio(struct list_head *ws,
 100				      struct compressed_bio *cb)
 101{
 102	switch (cb->compress_type) {
 103	case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
 104	case BTRFS_COMPRESS_LZO:  return lzo_decompress_bio(ws, cb);
 105	case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
 106	case BTRFS_COMPRESS_NONE:
 107	default:
 108		/*
 109		 * This can't happen, the type is validated several times
 110		 * before we get here.
 111		 */
 112		BUG();
 113	}
 114}
 115
 116static int compression_decompress(int type, struct list_head *ws,
 117               unsigned char *data_in, struct page *dest_page,
 118               unsigned long start_byte, size_t srclen, size_t destlen)
 119{
 120	switch (type) {
 121	case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
 122						start_byte, srclen, destlen);
 123	case BTRFS_COMPRESS_LZO:  return lzo_decompress(ws, data_in, dest_page,
 124						start_byte, srclen, destlen);
 125	case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
 126						start_byte, srclen, destlen);
 127	case BTRFS_COMPRESS_NONE:
 128	default:
 129		/*
 130		 * This can't happen, the type is validated several times
 131		 * before we get here.
 132		 */
 133		BUG();
 134	}
 135}
 136
 137static int btrfs_decompress_bio(struct compressed_bio *cb);
 138
 139static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
 140				      unsigned long disk_size)
 141{
 142	return sizeof(struct compressed_bio) +
 143		(DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size;
 144}
 145
 146static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
 147				 u64 disk_start)
 148{
 149	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 150	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 151	const u32 csum_size = fs_info->csum_size;
 152	const u32 sectorsize = fs_info->sectorsize;
 153	struct page *page;
 154	unsigned int i;
 155	char *kaddr;
 156	u8 csum[BTRFS_CSUM_SIZE];
 157	struct compressed_bio *cb = bio->bi_private;
 158	u8 *cb_sum = cb->sums;
 159
 160	if ((inode->flags & BTRFS_INODE_NODATASUM) ||
 161	    test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
 162		return 0;
 163
 164	shash->tfm = fs_info->csum_shash;
 165
 166	for (i = 0; i < cb->nr_pages; i++) {
 167		u32 pg_offset;
 168		u32 bytes_left = PAGE_SIZE;
 169		page = cb->compressed_pages[i];
 170
 171		/* Determine the remaining bytes inside the page first */
 172		if (i == cb->nr_pages - 1)
 173			bytes_left = cb->compressed_len - i * PAGE_SIZE;
 174
 175		/* Hash through the page sector by sector */
 176		for (pg_offset = 0; pg_offset < bytes_left;
 177		     pg_offset += sectorsize) {
 178			kaddr = kmap_atomic(page);
 179			crypto_shash_digest(shash, kaddr + pg_offset,
 180					    sectorsize, csum);
 181			kunmap_atomic(kaddr);
 182
 183			if (memcmp(&csum, cb_sum, csum_size) != 0) {
 184				btrfs_print_data_csum_error(inode, disk_start,
 185						csum, cb_sum, cb->mirror_num);
 186				if (btrfs_bio(bio)->device)
 187					btrfs_dev_stat_inc_and_print(
 188						btrfs_bio(bio)->device,
 189						BTRFS_DEV_STAT_CORRUPTION_ERRS);
 190				return -EIO;
 191			}
 192			cb_sum += csum_size;
 193			disk_start += sectorsize;
 194		}
 195	}
 196	return 0;
 197}
 198
 199/*
 200 * Reduce bio and io accounting for a compressed_bio with its corresponding bio.
 201 *
 202 * Return true if there is no pending bio nor io.
 203 * Return false otherwise.
 204 */
 205static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio)
 206{
 207	struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
 208	unsigned int bi_size = 0;
 209	bool last_io = false;
 210	struct bio_vec *bvec;
 211	struct bvec_iter_all iter_all;
 212
 213	/*
 214	 * At endio time, bi_iter.bi_size doesn't represent the real bio size.
 215	 * Thus here we have to iterate through all segments to grab correct
 216	 * bio size.
 217	 */
 218	bio_for_each_segment_all(bvec, bio, iter_all)
 219		bi_size += bvec->bv_len;
 220
 221	if (bio->bi_status)
 222		cb->status = bio->bi_status;
 223
 224	ASSERT(bi_size && bi_size <= cb->compressed_len);
 225	last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
 226					&cb->pending_sectors);
 227	/*
 228	 * Here we must wake up the possible error handler after all other
 229	 * operations on @cb finished, or we can race with
 230	 * finish_compressed_bio_*() which may free @cb.
 231	 */
 232	wake_up_var(cb);
 233
 234	return last_io;
 235}
 236
 237static void finish_compressed_bio_read(struct compressed_bio *cb)
 238{
 239	unsigned int index;
 240	struct page *page;
 241
 242	/* Release the compressed pages */
 243	for (index = 0; index < cb->nr_pages; index++) {
 244		page = cb->compressed_pages[index];
 245		page->mapping = NULL;
 246		put_page(page);
 247	}
 248
 249	/* Do io completion on the original bio */
 250	if (cb->status != BLK_STS_OK) {
 251		cb->orig_bio->bi_status = cb->status;
 252		bio_endio(cb->orig_bio);
 253	} else {
 254		struct bio_vec *bvec;
 255		struct bvec_iter_all iter_all;
 256
 257		/*
 258		 * We have verified the checksum already, set page checked so
 259		 * the end_io handlers know about it
 260		 */
 261		ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED));
 262		bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
 263			u64 bvec_start = page_offset(bvec->bv_page) +
 264					 bvec->bv_offset;
 265
 266			btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb),
 267					bvec->bv_page, bvec_start,
 268					bvec->bv_len);
 269		}
 270
 271		bio_endio(cb->orig_bio);
 272	}
 273
 274	/* Finally free the cb struct */
 275	kfree(cb->compressed_pages);
 276	kfree(cb);
 277}
 278
 279/* when we finish reading compressed pages from the disk, we
 280 * decompress them and then run the bio end_io routines on the
 281 * decompressed pages (in the inode address space).
 282 *
 283 * This allows the checksumming and other IO error handling routines
 284 * to work normally
 285 *
 286 * The compressed pages are freed here, and it must be run
 287 * in process context
 288 */
 289static void end_compressed_bio_read(struct bio *bio)
 290{
 291	struct compressed_bio *cb = bio->bi_private;
 292	struct inode *inode;
 293	unsigned int mirror = btrfs_bio(bio)->mirror_num;
 294	int ret = 0;
 295
 296	if (!dec_and_test_compressed_bio(cb, bio))
 297		goto out;
 298
 299	/*
 300	 * Record the correct mirror_num in cb->orig_bio so that
 301	 * read-repair can work properly.
 302	 */
 303	btrfs_bio(cb->orig_bio)->mirror_num = mirror;
 304	cb->mirror_num = mirror;
 305
 306	/*
 307	 * Some IO in this cb have failed, just skip checksum as there
 308	 * is no way it could be correct.
 309	 */
 310	if (cb->status != BLK_STS_OK)
 311		goto csum_failed;
 312
 313	inode = cb->inode;
 314	ret = check_compressed_csum(BTRFS_I(inode), bio,
 315				    bio->bi_iter.bi_sector << 9);
 316	if (ret)
 317		goto csum_failed;
 318
 319	/* ok, we're the last bio for this extent, lets start
 320	 * the decompression.
 321	 */
 322	ret = btrfs_decompress_bio(cb);
 323
 324csum_failed:
 325	if (ret)
 326		cb->status = errno_to_blk_status(ret);
 327	finish_compressed_bio_read(cb);
 328out:
 329	bio_put(bio);
 330}
 331
 332/*
 333 * Clear the writeback bits on all of the file
 334 * pages for a compressed write
 335 */
 336static noinline void end_compressed_writeback(struct inode *inode,
 337					      const struct compressed_bio *cb)
 338{
 339	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 340	unsigned long index = cb->start >> PAGE_SHIFT;
 341	unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
 342	struct page *pages[16];
 343	unsigned long nr_pages = end_index - index + 1;
 344	const int errno = blk_status_to_errno(cb->status);
 345	int i;
 346	int ret;
 347
 348	if (errno)
 349		mapping_set_error(inode->i_mapping, errno);
 350
 351	while (nr_pages > 0) {
 352		ret = find_get_pages_contig(inode->i_mapping, index,
 353				     min_t(unsigned long,
 354				     nr_pages, ARRAY_SIZE(pages)), pages);
 355		if (ret == 0) {
 356			nr_pages -= 1;
 357			index += 1;
 358			continue;
 359		}
 360		for (i = 0; i < ret; i++) {
 361			if (errno)
 362				SetPageError(pages[i]);
 363			btrfs_page_clamp_clear_writeback(fs_info, pages[i],
 364							 cb->start, cb->len);
 365			put_page(pages[i]);
 366		}
 367		nr_pages -= ret;
 368		index += ret;
 369	}
 370	/* the inode may be gone now */
 371}
 372
 373static void finish_compressed_bio_write(struct compressed_bio *cb)
 374{
 375	struct inode *inode = cb->inode;
 376	unsigned int index;
 377
 378	/*
 379	 * Ok, we're the last bio for this extent, step one is to call back
 380	 * into the FS and do all the end_io operations.
 381	 */
 382	btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
 383			cb->start, cb->start + cb->len - 1,
 384			cb->status == BLK_STS_OK);
 385
 386	if (cb->writeback)
 387		end_compressed_writeback(inode, cb);
 388	/* Note, our inode could be gone now */
 389
 390	/*
 391	 * Release the compressed pages, these came from alloc_page and
 392	 * are not attached to the inode at all
 393	 */
 394	for (index = 0; index < cb->nr_pages; index++) {
 395		struct page *page = cb->compressed_pages[index];
 396
 397		page->mapping = NULL;
 398		put_page(page);
 399	}
 400
 401	/* Finally free the cb struct */
 402	kfree(cb->compressed_pages);
 403	kfree(cb);
 404}
 405
 406/*
 407 * Do the cleanup once all the compressed pages hit the disk.  This will clear
 408 * writeback on the file pages and free the compressed pages.
 409 *
 410 * This also calls the writeback end hooks for the file pages so that metadata
 411 * and checksums can be updated in the file.
 412 */
 413static void end_compressed_bio_write(struct bio *bio)
 414{
 415	struct compressed_bio *cb = bio->bi_private;
 416
 417	if (!dec_and_test_compressed_bio(cb, bio))
 418		goto out;
 419
 420	btrfs_record_physical_zoned(cb->inode, cb->start, bio);
 421
 422	finish_compressed_bio_write(cb);
 423out:
 424	bio_put(bio);
 425}
 426
 427static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
 428					  struct compressed_bio *cb,
 429					  struct bio *bio, int mirror_num)
 430{
 431	blk_status_t ret;
 432
 433	ASSERT(bio->bi_iter.bi_size);
 434	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
 435	if (ret)
 436		return ret;
 437	ret = btrfs_map_bio(fs_info, bio, mirror_num);
 438	return ret;
 439}
 440
 441/*
 442 * Allocate a compressed_bio, which will be used to read/write on-disk
 443 * (aka, compressed) * data.
 444 *
 445 * @cb:                 The compressed_bio structure, which records all the needed
 446 *                      information to bind the compressed data to the uncompressed
 447 *                      page cache.
 448 * @disk_byten:         The logical bytenr where the compressed data will be read
 449 *                      from or written to.
 450 * @endio_func:         The endio function to call after the IO for compressed data
 451 *                      is finished.
 452 * @next_stripe_start:  Return value of logical bytenr of where next stripe starts.
 453 *                      Let the caller know to only fill the bio up to the stripe
 454 *                      boundary.
 455 */
 456
 457
 458static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
 459					unsigned int opf, bio_end_io_t endio_func,
 460					u64 *next_stripe_start)
 461{
 462	struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
 463	struct btrfs_io_geometry geom;
 464	struct extent_map *em;
 465	struct bio *bio;
 466	int ret;
 467
 468	bio = btrfs_bio_alloc(BIO_MAX_VECS);
 469
 470	bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
 471	bio->bi_opf = opf;
 472	bio->bi_private = cb;
 473	bio->bi_end_io = endio_func;
 474
 475	em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
 476	if (IS_ERR(em)) {
 477		bio_put(bio);
 478		return ERR_CAST(em);
 479	}
 480
 481	if (bio_op(bio) == REQ_OP_ZONE_APPEND)
 482		bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev);
 483
 484	ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom);
 485	free_extent_map(em);
 486	if (ret < 0) {
 487		bio_put(bio);
 488		return ERR_PTR(ret);
 489	}
 490	*next_stripe_start = disk_bytenr + geom.len;
 491
 492	return bio;
 493}
 494
 495/*
 496 * worker function to build and submit bios for previously compressed pages.
 497 * The corresponding pages in the inode should be marked for writeback
 498 * and the compressed pages should have a reference on them for dropping
 499 * when the IO is complete.
 500 *
 501 * This also checksums the file bytes and gets things ready for
 502 * the end io hooks.
 503 */
 504blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 505				 unsigned int len, u64 disk_start,
 506				 unsigned int compressed_len,
 507				 struct page **compressed_pages,
 508				 unsigned int nr_pages,
 509				 unsigned int write_flags,
 510				 struct cgroup_subsys_state *blkcg_css,
 511				 bool writeback)
 512{
 513	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 514	struct bio *bio = NULL;
 515	struct compressed_bio *cb;
 516	u64 cur_disk_bytenr = disk_start;
 517	u64 next_stripe_start;
 518	blk_status_t ret;
 519	int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
 520	const bool use_append = btrfs_use_zone_append(inode, disk_start);
 521	const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
 522
 523	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
 524	       IS_ALIGNED(len, fs_info->sectorsize));
 525	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
 526	if (!cb)
 527		return BLK_STS_RESOURCE;
 528	refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
 529	cb->status = BLK_STS_OK;
 530	cb->inode = &inode->vfs_inode;
 531	cb->start = start;
 532	cb->len = len;
 533	cb->mirror_num = 0;
 534	cb->compressed_pages = compressed_pages;
 535	cb->compressed_len = compressed_len;
 536	cb->writeback = writeback;
 537	cb->orig_bio = NULL;
 538	cb->nr_pages = nr_pages;
 539
 540	if (blkcg_css)
 541		kthread_associate_blkcg(blkcg_css);
 542
 543	while (cur_disk_bytenr < disk_start + compressed_len) {
 544		u64 offset = cur_disk_bytenr - disk_start;
 545		unsigned int index = offset >> PAGE_SHIFT;
 546		unsigned int real_size;
 547		unsigned int added;
 548		struct page *page = compressed_pages[index];
 549		bool submit = false;
 550
 551		/* Allocate new bio if submitted or not yet allocated */
 552		if (!bio) {
 553			bio = alloc_compressed_bio(cb, cur_disk_bytenr,
 554				bio_op | write_flags, end_compressed_bio_write,
 555				&next_stripe_start);
 556			if (IS_ERR(bio)) {
 557				ret = errno_to_blk_status(PTR_ERR(bio));
 558				bio = NULL;
 559				goto finish_cb;
 560			}
 561			if (blkcg_css)
 562				bio->bi_opf |= REQ_CGROUP_PUNT;
 563		}
 564		/*
 565		 * We should never reach next_stripe_start start as we will
 566		 * submit comp_bio when reach the boundary immediately.
 567		 */
 568		ASSERT(cur_disk_bytenr != next_stripe_start);
 569
 570		/*
 571		 * We have various limits on the real read size:
 572		 * - stripe boundary
 573		 * - page boundary
 574		 * - compressed length boundary
 575		 */
 576		real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr);
 577		real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
 578		real_size = min_t(u64, real_size, compressed_len - offset);
 579		ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
 580
 581		if (use_append)
 582			added = bio_add_zone_append_page(bio, page, real_size,
 583					offset_in_page(offset));
 584		else
 585			added = bio_add_page(bio, page, real_size,
 586					offset_in_page(offset));
 587		/* Reached zoned boundary */
 588		if (added == 0)
 589			submit = true;
 590
 591		cur_disk_bytenr += added;
 592		/* Reached stripe boundary */
 593		if (cur_disk_bytenr == next_stripe_start)
 594			submit = true;
 595
 596		/* Finished the range */
 597		if (cur_disk_bytenr == disk_start + compressed_len)
 598			submit = true;
 599
 600		if (submit) {
 601			if (!skip_sum) {
 602				ret = btrfs_csum_one_bio(inode, bio, start, true);
 603				if (ret)
 604					goto finish_cb;
 605			}
 606
 607			ret = submit_compressed_bio(fs_info, cb, bio, 0);
 608			if (ret)
 609				goto finish_cb;
 610			bio = NULL;
 611		}
 612		cond_resched();
 613	}
 614	if (blkcg_css)
 615		kthread_associate_blkcg(NULL);
 616
 617	return 0;
 618
 619finish_cb:
 620	if (blkcg_css)
 621		kthread_associate_blkcg(NULL);
 622
 623	if (bio) {
 624		bio->bi_status = ret;
 625		bio_endio(bio);
 626	}
 627	/* Last byte of @cb is submitted, endio will free @cb */
 628	if (cur_disk_bytenr == disk_start + compressed_len)
 629		return ret;
 630
 631	wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
 632			   (disk_start + compressed_len - cur_disk_bytenr) >>
 633			   fs_info->sectorsize_bits);
 634	/*
 635	 * Even with previous bio ended, we should still have io not yet
 636	 * submitted, thus need to finish manually.
 637	 */
 638	ASSERT(refcount_read(&cb->pending_sectors));
 639	/* Now we are the only one referring @cb, can finish it safely. */
 640	finish_compressed_bio_write(cb);
 641	return ret;
 642}
 643
 644static u64 bio_end_offset(struct bio *bio)
 645{
 646	struct bio_vec *last = bio_last_bvec_all(bio);
 647
 648	return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
 649}
 650
 651/*
 652 * Add extra pages in the same compressed file extent so that we don't need to
 653 * re-read the same extent again and again.
 654 *
 655 * NOTE: this won't work well for subpage, as for subpage read, we lock the
 656 * full page then submit bio for each compressed/regular extents.
 657 *
 658 * This means, if we have several sectors in the same page points to the same
 659 * on-disk compressed data, we will re-read the same extent many times and
 660 * this function can only help for the next page.
 661 */
 662static noinline int add_ra_bio_pages(struct inode *inode,
 663				     u64 compressed_end,
 664				     struct compressed_bio *cb)
 665{
 666	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 667	unsigned long end_index;
 668	u64 cur = bio_end_offset(cb->orig_bio);
 669	u64 isize = i_size_read(inode);
 670	int ret;
 671	struct page *page;
 672	struct extent_map *em;
 673	struct address_space *mapping = inode->i_mapping;
 674	struct extent_map_tree *em_tree;
 675	struct extent_io_tree *tree;
 676	int sectors_missed = 0;
 677
 678	em_tree = &BTRFS_I(inode)->extent_tree;
 679	tree = &BTRFS_I(inode)->io_tree;
 680
 681	if (isize == 0)
 682		return 0;
 683
 684	/*
 685	 * For current subpage support, we only support 64K page size,
 686	 * which means maximum compressed extent size (128K) is just 2x page
 687	 * size.
 688	 * This makes readahead less effective, so here disable readahead for
 689	 * subpage for now, until full compressed write is supported.
 690	 */
 691	if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE)
 692		return 0;
 693
 694	end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 695
 696	while (cur < compressed_end) {
 697		u64 page_end;
 698		u64 pg_index = cur >> PAGE_SHIFT;
 699		u32 add_size;
 700
 701		if (pg_index > end_index)
 702			break;
 703
 704		page = xa_load(&mapping->i_pages, pg_index);
 705		if (page && !xa_is_value(page)) {
 706			sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >>
 707					  fs_info->sectorsize_bits;
 708
 709			/* Beyond threshold, no need to continue */
 710			if (sectors_missed > 4)
 711				break;
 712
 713			/*
 714			 * Jump to next page start as we already have page for
 715			 * current offset.
 716			 */
 717			cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
 718			continue;
 719		}
 720
 721		page = __page_cache_alloc(mapping_gfp_constraint(mapping,
 722								 ~__GFP_FS));
 723		if (!page)
 724			break;
 725
 726		if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
 727			put_page(page);
 728			/* There is already a page, skip to page end */
 729			cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
 730			continue;
 731		}
 732
 733		ret = set_page_extent_mapped(page);
 734		if (ret < 0) {
 735			unlock_page(page);
 736			put_page(page);
 737			break;
 738		}
 739
 740		page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
 741		lock_extent(tree, cur, page_end);
 742		read_lock(&em_tree->lock);
 743		em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
 744		read_unlock(&em_tree->lock);
 745
 746		/*
 747		 * At this point, we have a locked page in the page cache for
 748		 * these bytes in the file.  But, we have to make sure they map
 749		 * to this compressed extent on disk.
 750		 */
 751		if (!em || cur < em->start ||
 752		    (cur + fs_info->sectorsize > extent_map_end(em)) ||
 753		    (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
 754			free_extent_map(em);
 755			unlock_extent(tree, cur, page_end);
 756			unlock_page(page);
 757			put_page(page);
 758			break;
 759		}
 760		free_extent_map(em);
 761
 762		if (page->index == end_index) {
 763			size_t zero_offset = offset_in_page(isize);
 764
 765			if (zero_offset) {
 766				int zeros;
 767				zeros = PAGE_SIZE - zero_offset;
 768				memzero_page(page, zero_offset, zeros);
 769				flush_dcache_page(page);
 770			}
 771		}
 772
 773		add_size = min(em->start + em->len, page_end + 1) - cur;
 774		ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
 775		if (ret != add_size) {
 776			unlock_extent(tree, cur, page_end);
 777			unlock_page(page);
 778			put_page(page);
 779			break;
 780		}
 781		/*
 782		 * If it's subpage, we also need to increase its
 783		 * subpage::readers number, as at endio we will decrease
 784		 * subpage::readers and to unlock the page.
 785		 */
 786		if (fs_info->sectorsize < PAGE_SIZE)
 787			btrfs_subpage_start_reader(fs_info, page, cur, add_size);
 788		put_page(page);
 789		cur += add_size;
 790	}
 791	return 0;
 792}
 793
 794/*
 795 * for a compressed read, the bio we get passed has all the inode pages
 796 * in it.  We don't actually do IO on those pages but allocate new ones
 797 * to hold the compressed pages on disk.
 798 *
 799 * bio->bi_iter.bi_sector points to the compressed extent on disk
 800 * bio->bi_io_vec points to all of the inode pages
 801 *
 802 * After the compressed pages are read, we copy the bytes into the
 803 * bio we were passed and then call the bio end_io calls
 804 */
 805blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 806				 int mirror_num, unsigned long bio_flags)
 807{
 808	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 809	struct extent_map_tree *em_tree;
 810	struct compressed_bio *cb;
 811	unsigned int compressed_len;
 812	unsigned int nr_pages;
 813	unsigned int pg_index;
 814	struct bio *comp_bio = NULL;
 815	const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
 816	u64 cur_disk_byte = disk_bytenr;
 817	u64 next_stripe_start;
 818	u64 file_offset;
 819	u64 em_len;
 820	u64 em_start;
 821	struct extent_map *em;
 822	blk_status_t ret;
 823	int faili = 0;
 824	u8 *sums;
 825
 826	em_tree = &BTRFS_I(inode)->extent_tree;
 827
 828	file_offset = bio_first_bvec_all(bio)->bv_offset +
 829		      page_offset(bio_first_page_all(bio));
 830
 831	/* we need the actual starting offset of this extent in the file */
 832	read_lock(&em_tree->lock);
 833	em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
 834	read_unlock(&em_tree->lock);
 835	if (!em) {
 836		ret = BLK_STS_IOERR;
 837		goto out;
 838	}
 839
 840	ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
 841	compressed_len = em->block_len;
 842	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
 843	if (!cb) {
 844		ret = BLK_STS_RESOURCE;
 845		goto out;
 846	}
 847
 848	refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
 849	cb->status = BLK_STS_OK;
 850	cb->inode = inode;
 851	cb->mirror_num = mirror_num;
 852	sums = cb->sums;
 853
 854	cb->start = em->orig_start;
 855	em_len = em->len;
 856	em_start = em->start;
 857
 858	free_extent_map(em);
 859	em = NULL;
 860
 861	cb->len = bio->bi_iter.bi_size;
 862	cb->compressed_len = compressed_len;
 863	cb->compress_type = extent_compress_type(bio_flags);
 864	cb->orig_bio = bio;
 865
 866	nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
 867	cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
 868				       GFP_NOFS);
 869	if (!cb->compressed_pages) {
 870		ret = BLK_STS_RESOURCE;
 871		goto fail1;
 872	}
 873
 874	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
 875		cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
 876		if (!cb->compressed_pages[pg_index]) {
 877			faili = pg_index - 1;
 878			ret = BLK_STS_RESOURCE;
 879			goto fail2;
 880		}
 881	}
 882	faili = nr_pages - 1;
 883	cb->nr_pages = nr_pages;
 884
 885	add_ra_bio_pages(inode, em_start + em_len, cb);
 886
 887	/* include any pages we added in add_ra-bio_pages */
 888	cb->len = bio->bi_iter.bi_size;
 889
 890	while (cur_disk_byte < disk_bytenr + compressed_len) {
 891		u64 offset = cur_disk_byte - disk_bytenr;
 892		unsigned int index = offset >> PAGE_SHIFT;
 893		unsigned int real_size;
 894		unsigned int added;
 895		struct page *page = cb->compressed_pages[index];
 896		bool submit = false;
 897
 898		/* Allocate new bio if submitted or not yet allocated */
 899		if (!comp_bio) {
 900			comp_bio = alloc_compressed_bio(cb, cur_disk_byte,
 901					REQ_OP_READ, end_compressed_bio_read,
 902					&next_stripe_start);
 903			if (IS_ERR(comp_bio)) {
 904				ret = errno_to_blk_status(PTR_ERR(comp_bio));
 905				comp_bio = NULL;
 906				goto finish_cb;
 907			}
 908		}
 909		/*
 910		 * We should never reach next_stripe_start start as we will
 911		 * submit comp_bio when reach the boundary immediately.
 912		 */
 913		ASSERT(cur_disk_byte != next_stripe_start);
 914		/*
 915		 * We have various limit on the real read size:
 916		 * - stripe boundary
 917		 * - page boundary
 918		 * - compressed length boundary
 919		 */
 920		real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte);
 921		real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
 922		real_size = min_t(u64, real_size, compressed_len - offset);
 923		ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
 924
 925		added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset));
 926		/*
 927		 * Maximum compressed extent is smaller than bio size limit,
 928		 * thus bio_add_page() should always success.
 929		 */
 930		ASSERT(added == real_size);
 931		cur_disk_byte += added;
 932
 933		/* Reached stripe boundary, need to submit */
 934		if (cur_disk_byte == next_stripe_start)
 935			submit = true;
 936
 937		/* Has finished the range, need to submit */
 938		if (cur_disk_byte == disk_bytenr + compressed_len)
 939			submit = true;
 940
 941		if (submit) {
 942			unsigned int nr_sectors;
 943
 944			ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
 945			if (ret)
 946				goto finish_cb;
 947
 948			nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
 949						  fs_info->sectorsize);
 950			sums += fs_info->csum_size * nr_sectors;
 951
 952			ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num);
 953			if (ret)
 954				goto finish_cb;
 955			comp_bio = NULL;
 956		}
 957	}
 958	return BLK_STS_OK;
 959
 960fail2:
 961	while (faili >= 0) {
 962		__free_page(cb->compressed_pages[faili]);
 963		faili--;
 964	}
 965
 966	kfree(cb->compressed_pages);
 967fail1:
 968	kfree(cb);
 969out:
 970	free_extent_map(em);
 971	bio->bi_status = ret;
 972	bio_endio(bio);
 973	return ret;
 974finish_cb:
 975	if (comp_bio) {
 976		comp_bio->bi_status = ret;
 977		bio_endio(comp_bio);
 978	}
 979	/* All bytes of @cb is submitted, endio will free @cb */
 980	if (cur_disk_byte == disk_bytenr + compressed_len)
 981		return ret;
 982
 983	wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
 984			   (disk_bytenr + compressed_len - cur_disk_byte) >>
 985			   fs_info->sectorsize_bits);
 986	/*
 987	 * Even with previous bio ended, we should still have io not yet
 988	 * submitted, thus need to finish @cb manually.
 989	 */
 990	ASSERT(refcount_read(&cb->pending_sectors));
 991	/* Now we are the only one referring @cb, can finish it safely. */
 992	finish_compressed_bio_read(cb);
 993	return ret;
 994}
 995
 996/*
 997 * Heuristic uses systematic sampling to collect data from the input data
 998 * range, the logic can be tuned by the following constants:
 999 *
1000 * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample
1001 * @SAMPLING_INTERVAL  - range from which the sampled data can be collected
1002 */
1003#define SAMPLING_READ_SIZE	(16)
1004#define SAMPLING_INTERVAL	(256)
1005
1006/*
1007 * For statistical analysis of the input data we consider bytes that form a
1008 * Galois Field of 256 objects. Each object has an attribute count, ie. how
1009 * many times the object appeared in the sample.
1010 */
1011#define BUCKET_SIZE		(256)
1012
1013/*
1014 * The size of the sample is based on a statistical sampling rule of thumb.
1015 * The common way is to perform sampling tests as long as the number of
1016 * elements in each cell is at least 5.
1017 *
1018 * Instead of 5, we choose 32 to obtain more accurate results.
1019 * If the data contain the maximum number of symbols, which is 256, we obtain a
1020 * sample size bound by 8192.
1021 *
1022 * For a sample of at most 8KB of data per data range: 16 consecutive bytes
1023 * from up to 512 locations.
1024 */
1025#define MAX_SAMPLE_SIZE		(BTRFS_MAX_UNCOMPRESSED *		\
1026				 SAMPLING_READ_SIZE / SAMPLING_INTERVAL)
1027
/* One histogram cell used by the compression heuristic. */
struct bucket_item {
	/* Number of times the corresponding byte value was seen in the sample */
	u32 count;
};
1031
/*
 * Per-use scratch state of the compression heuristic.  Instances are handed
 * out and taken back through their embedded @list member.
 */
struct heuristic_ws {
	/* Partial copy of input data */
	u8 *sample;
	/* Number of valid bytes currently stored in @sample */
	u32 sample_size;
	/* Buckets store counters for each byte value */
	struct bucket_item *bucket;
	/* Sorting buffer */
	struct bucket_item *bucket_b;
	/* Links the workspace into a workspace manager's idle list */
	struct list_head list;
};
1042
/* Workspace manager that pools the heuristic workspaces */
static struct workspace_manager heuristic_wsm;
1044
1045static void free_heuristic_ws(struct list_head *ws)
1046{
1047	struct heuristic_ws *workspace;
1048
1049	workspace = list_entry(ws, struct heuristic_ws, list);
1050
1051	kvfree(workspace->sample);
1052	kfree(workspace->bucket);
1053	kfree(workspace->bucket_b);
1054	kfree(workspace);
1055}
1056
1057static struct list_head *alloc_heuristic_ws(unsigned int level)
1058{
1059	struct heuristic_ws *ws;
1060
1061	ws = kzalloc(sizeof(*ws), GFP_KERNEL);
1062	if (!ws)
1063		return ERR_PTR(-ENOMEM);
1064
1065	ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL);
1066	if (!ws->sample)
1067		goto fail;
1068
1069	ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL);
1070	if (!ws->bucket)
1071		goto fail;
1072
1073	ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL);
1074	if (!ws->bucket_b)
1075		goto fail;
1076
1077	INIT_LIST_HEAD(&ws->list);
1078	return &ws->list;
1079fail:
1080	free_heuristic_ws(&ws->list);
1081	return ERR_PTR(-ENOMEM);
1082}
1083
1084const struct btrfs_compress_op btrfs_heuristic_compress = {
1085	.workspace_manager = &heuristic_wsm,
1086};
1087
1088static const struct btrfs_compress_op * const btrfs_compress_op[] = {
1089	/* The heuristic is represented as compression type 0 */
1090	&btrfs_heuristic_compress,
1091	&btrfs_zlib_compress,
1092	&btrfs_lzo_compress,
1093	&btrfs_zstd_compress,
1094};
1095
1096static struct list_head *alloc_workspace(int type, unsigned int level)
1097{
1098	switch (type) {
1099	case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level);
1100	case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
1101	case BTRFS_COMPRESS_LZO:  return lzo_alloc_workspace(level);
1102	case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
1103	default:
1104		/*
1105		 * This can't happen, the type is validated several times
1106		 * before we get here.
1107		 */
1108		BUG();
1109	}
1110}
1111
1112static void free_workspace(int type, struct list_head *ws)
1113{
1114	switch (type) {
1115	case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws);
1116	case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws);
1117	case BTRFS_COMPRESS_LZO:  return lzo_free_workspace(ws);
1118	case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws);
1119	default:
1120		/*
1121		 * This can't happen, the type is validated several times
1122		 * before we get here.
1123		 */
1124		BUG();
1125	}
1126}
1127
1128static void btrfs_init_workspace_manager(int type)
1129{
1130	struct workspace_manager *wsm;
1131	struct list_head *workspace;
1132
1133	wsm = btrfs_compress_op[type]->workspace_manager;
1134	INIT_LIST_HEAD(&wsm->idle_ws);
1135	spin_lock_init(&wsm->ws_lock);
1136	atomic_set(&wsm->total_ws, 0);
1137	init_waitqueue_head(&wsm->ws_wait);
1138
1139	/*
1140	 * Preallocate one workspace for each compression type so we can
1141	 * guarantee forward progress in the worst case
1142	 */
1143	workspace = alloc_workspace(type, 0);
1144	if (IS_ERR(workspace)) {
1145		pr_warn(
1146	"BTRFS: cannot preallocate compression workspace, will try later\n");
1147	} else {
1148		atomic_set(&wsm->total_ws, 1);
1149		wsm->free_ws = 1;
1150		list_add(workspace, &wsm->idle_ws);
1151	}
1152}
1153
1154static void btrfs_cleanup_workspace_manager(int type)
1155{
1156	struct workspace_manager *wsman;
1157	struct list_head *ws;
1158
1159	wsman = btrfs_compress_op[type]->workspace_manager;
1160	while (!list_empty(&wsman->idle_ws)) {
1161		ws = wsman->idle_ws.next;
1162		list_del(ws);
1163		free_workspace(type, ws);
1164		atomic_dec(&wsman->total_ws);
1165	}
1166}
1167
1168/*
1169 * This finds an available workspace or allocates a new one.
1170 * If it's not possible to allocate a new one, waits until there's one.
1171 * Preallocation makes a forward progress guarantees and we do not return
1172 * errors.
1173 */
1174struct list_head *btrfs_get_workspace(int type, unsigned int level)
1175{
1176	struct workspace_manager *wsm;
1177	struct list_head *workspace;
1178	int cpus = num_online_cpus();
1179	unsigned nofs_flag;
1180	struct list_head *idle_ws;
1181	spinlock_t *ws_lock;
1182	atomic_t *total_ws;
1183	wait_queue_head_t *ws_wait;
1184	int *free_ws;
1185
1186	wsm = btrfs_compress_op[type]->workspace_manager;
1187	idle_ws	 = &wsm->idle_ws;
1188	ws_lock	 = &wsm->ws_lock;
1189	total_ws = &wsm->total_ws;
1190	ws_wait	 = &wsm->ws_wait;
1191	free_ws	 = &wsm->free_ws;
1192
1193again:
1194	spin_lock(ws_lock);
1195	if (!list_empty(idle_ws)) {
1196		workspace = idle_ws->next;
1197		list_del(workspace);
1198		(*free_ws)--;
1199		spin_unlock(ws_lock);
1200		return workspace;
1201
1202	}
1203	if (atomic_read(total_ws) > cpus) {
1204		DEFINE_WAIT(wait);
1205
1206		spin_unlock(ws_lock);
1207		prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
1208		if (atomic_read(total_ws) > cpus && !*free_ws)
1209			schedule();
1210		finish_wait(ws_wait, &wait);
1211		goto again;
1212	}
1213	atomic_inc(total_ws);
1214	spin_unlock(ws_lock);
1215
1216	/*
1217	 * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have
1218	 * to turn it off here because we might get called from the restricted
1219	 * context of btrfs_compress_bio/btrfs_compress_pages
1220	 */
1221	nofs_flag = memalloc_nofs_save();
1222	workspace = alloc_workspace(type, level);
1223	memalloc_nofs_restore(nofs_flag);
1224
1225	if (IS_ERR(workspace)) {
1226		atomic_dec(total_ws);
1227		wake_up(ws_wait);
1228
1229		/*
1230		 * Do not return the error but go back to waiting. There's a
1231		 * workspace preallocated for each type and the compression
1232		 * time is bounded so we get to a workspace eventually. This
1233		 * makes our caller's life easier.
1234		 *
1235		 * To prevent silent and low-probability deadlocks (when the
1236		 * initial preallocation fails), check if there are any
1237		 * workspaces at all.
1238		 */
1239		if (atomic_read(total_ws) == 0) {
1240			static DEFINE_RATELIMIT_STATE(_rs,
1241					/* once per minute */ 60 * HZ,
1242					/* no burst */ 1);
1243
1244			if (__ratelimit(&_rs)) {
1245				pr_warn("BTRFS: no compression workspaces, low memory, retrying\n");
1246			}
1247		}
1248		goto again;
1249	}
1250	return workspace;
1251}
1252
1253static struct list_head *get_workspace(int type, int level)
1254{
1255	switch (type) {
1256	case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level);
1257	case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level);
1258	case BTRFS_COMPRESS_LZO:  return btrfs_get_workspace(type, level);
1259	case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
1260	default:
1261		/*
1262		 * This can't happen, the type is validated several times
1263		 * before we get here.
1264		 */
1265		BUG();
1266	}
1267}
1268
1269/*
1270 * put a workspace struct back on the list or free it if we have enough
1271 * idle ones sitting around
1272 */
1273void btrfs_put_workspace(int type, struct list_head *ws)
1274{
1275	struct workspace_manager *wsm;
1276	struct list_head *idle_ws;
1277	spinlock_t *ws_lock;
1278	atomic_t *total_ws;
1279	wait_queue_head_t *ws_wait;
1280	int *free_ws;
1281
1282	wsm = btrfs_compress_op[type]->workspace_manager;
1283	idle_ws	 = &wsm->idle_ws;
1284	ws_lock	 = &wsm->ws_lock;
1285	total_ws = &wsm->total_ws;
1286	ws_wait	 = &wsm->ws_wait;
1287	free_ws	 = &wsm->free_ws;
1288
1289	spin_lock(ws_lock);
1290	if (*free_ws <= num_online_cpus()) {
1291		list_add(ws, idle_ws);
1292		(*free_ws)++;
1293		spin_unlock(ws_lock);
1294		goto wake;
1295	}
1296	spin_unlock(ws_lock);
1297
1298	free_workspace(type, ws);
1299	atomic_dec(total_ws);
1300wake:
1301	cond_wake_up(ws_wait);
1302}
1303
1304static void put_workspace(int type, struct list_head *ws)
1305{
1306	switch (type) {
1307	case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws);
1308	case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws);
1309	case BTRFS_COMPRESS_LZO:  return btrfs_put_workspace(type, ws);
1310	case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
1311	default:
1312		/*
1313		 * This can't happen, the type is validated several times
1314		 * before we get here.
1315		 */
1316		BUG();
1317	}
1318}
1319
1320/*
1321 * Adjust @level according to the limits of the compression algorithm or
1322 * fallback to default
1323 */
1324static unsigned int btrfs_compress_set_level(int type, unsigned level)
1325{
1326	const struct btrfs_compress_op *ops = btrfs_compress_op[type];
1327
1328	if (level == 0)
1329		level = ops->default_level;
1330	else
1331		level = min(level, ops->max_level);
1332
1333	return level;
1334}
1335
1336/*
1337 * Given an address space and start and length, compress the bytes into @pages
1338 * that are allocated on demand.
1339 *
1340 * @type_level is encoded algorithm and level, where level 0 means whatever
1341 * default the algorithm chooses and is opaque here;
1342 * - compression algo are 0-3
1343 * - the level are bits 4-7
1344 *
1345 * @out_pages is an in/out parameter, holds maximum number of pages to allocate
1346 * and returns number of actually allocated pages
1347 *
1348 * @total_in is used to return the number of bytes actually read.  It
1349 * may be smaller than the input length if we had to exit early because we
1350 * ran out of room in the pages array or because we cross the
1351 * max_out threshold.
1352 *
1353 * @total_out is an in/out parameter, must be set to the input length and will
1354 * be also used to return the total number of compressed bytes
1355 */
1356int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
1357			 u64 start, struct page **pages,
1358			 unsigned long *out_pages,
1359			 unsigned long *total_in,
1360			 unsigned long *total_out)
1361{
1362	int type = btrfs_compress_type(type_level);
1363	int level = btrfs_compress_level(type_level);
1364	struct list_head *workspace;
1365	int ret;
1366
1367	level = btrfs_compress_set_level(type, level);
1368	workspace = get_workspace(type, level);
1369	ret = compression_compress_pages(type, workspace, mapping, start, pages,
1370					 out_pages, total_in, total_out);
1371	put_workspace(type, workspace);
1372	return ret;
1373}
1374
1375static int btrfs_decompress_bio(struct compressed_bio *cb)
1376{
1377	struct list_head *workspace;
1378	int ret;
1379	int type = cb->compress_type;
1380
1381	workspace = get_workspace(type, 0);
1382	ret = compression_decompress_bio(workspace, cb);
1383	put_workspace(type, workspace);
1384
1385	return ret;
1386}
1387
1388/*
1389 * a less complex decompression routine.  Our compressed data fits in a
1390 * single page, and we want to read a single page out of it.
1391 * start_byte tells us the offset into the compressed data we're interested in
1392 */
1393int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
1394		     unsigned long start_byte, size_t srclen, size_t destlen)
1395{
1396	struct list_head *workspace;
1397	int ret;
1398
1399	workspace = get_workspace(type, 0);
1400	ret = compression_decompress(type, workspace, data_in, dest_page,
1401				     start_byte, srclen, destlen);
1402	put_workspace(type, workspace);
1403
1404	return ret;
1405}
1406
1407void __init btrfs_init_compress(void)
1408{
1409	btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
1410	btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
1411	btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
1412	zstd_init_workspace_manager();
1413}
1414
1415void __cold btrfs_exit_compress(void)
1416{
1417	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
1418	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
1419	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
1420	zstd_cleanup_workspace_manager();
1421}
1422
1423/*
1424 * Copy decompressed data from working buffer to pages.
1425 *
1426 * @buf:		The decompressed data buffer
1427 * @buf_len:		The decompressed data length
1428 * @decompressed:	Number of bytes that are already decompressed inside the
1429 * 			compressed extent
1430 * @cb:			The compressed extent descriptor
1431 * @orig_bio:		The original bio that the caller wants to read for
1432 *
1433 * An easier to understand graph is like below:
1434 *
1435 * 		|<- orig_bio ->|     |<- orig_bio->|
1436 * 	|<-------      full decompressed extent      ----->|
1437 * 	|<-----------    @cb range   ---->|
1438 * 	|			|<-- @buf_len -->|
1439 * 	|<--- @decompressed --->|
1440 *
1441 * Note that, @cb can be a subpage of the full decompressed extent, but
1442 * @cb->start always has the same as the orig_file_offset value of the full
1443 * decompressed extent.
1444 *
1445 * When reading compressed extent, we have to read the full compressed extent,
1446 * while @orig_bio may only want part of the range.
1447 * Thus this function will ensure only data covered by @orig_bio will be copied
1448 * to.
1449 *
1450 * Return 0 if we have copied all needed contents for @orig_bio.
1451 * Return >0 if we need continue decompress.
1452 */
1453int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
1454			      struct compressed_bio *cb, u32 decompressed)
1455{
1456	struct bio *orig_bio = cb->orig_bio;
1457	/* Offset inside the full decompressed extent */
1458	u32 cur_offset;
1459
1460	cur_offset = decompressed;
1461	/* The main loop to do the copy */
1462	while (cur_offset < decompressed + buf_len) {
1463		struct bio_vec bvec;
1464		size_t copy_len;
1465		u32 copy_start;
1466		/* Offset inside the full decompressed extent */
1467		u32 bvec_offset;
1468
1469		bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
1470		/*
1471		 * cb->start may underflow, but subtracting that value can still
1472		 * give us correct offset inside the full decompressed extent.
1473		 */
1474		bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;
1475
1476		/* Haven't reached the bvec range, exit */
1477		if (decompressed + buf_len <= bvec_offset)
1478			return 1;
1479
1480		copy_start = max(cur_offset, bvec_offset);
1481		copy_len = min(bvec_offset + bvec.bv_len,
1482			       decompressed + buf_len) - copy_start;
1483		ASSERT(copy_len);
1484
1485		/*
1486		 * Extra range check to ensure we didn't go beyond
1487		 * @buf + @buf_len.
1488		 */
1489		ASSERT(copy_start - decompressed < buf_len);
1490		memcpy_to_page(bvec.bv_page, bvec.bv_offset,
1491			       buf + copy_start - decompressed, copy_len);
1492		flush_dcache_page(bvec.bv_page);
1493		cur_offset += copy_len;
1494
1495		bio_advance(orig_bio, copy_len);
1496		/* Finished the bio */
1497		if (!orig_bio->bi_iter.bi_size)
1498			return 0;
1499	}
1500	return 1;
1501}
1502
1503/*
1504 * Shannon Entropy calculation
1505 *
1506 * Pure byte distribution analysis fails to determine compressibility of data.
1507 * Try calculating entropy to estimate the average minimum number of bits
1508 * needed to encode the sampled data.
1509 *
1510 * For convenience, return the percentage of needed bits, instead of amount of
1511 * bits directly.
1512 *
1513 * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy
1514 *			    and can be compressible with high probability
1515 *
1516 * @ENTROPY_LVL_HIGH - data are not compressible with high probability
1517 *
1518 * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate.
1519 */
1520#define ENTROPY_LVL_ACEPTABLE		(65)
1521#define ENTROPY_LVL_HIGH		(80)
1522
1523/*
1524 * For increasead precision in shannon_entropy calculation,
1525 * let's do pow(n, M) to save more digits after comma:
1526 *
1527 * - maximum int bit length is 64
1528 * - ilog2(MAX_SAMPLE_SIZE)	-> 13
1529 * - 13 * 4 = 52 < 64		-> M = 4
1530 *
1531 * So use pow(n, 4).
1532 */
1533static inline u32 ilog2_w(u64 n)
1534{
1535	return ilog2(n * n * n * n);
1536}
1537
1538static u32 shannon_entropy(struct heuristic_ws *ws)
1539{
1540	const u32 entropy_max = 8 * ilog2_w(2);
1541	u32 entropy_sum = 0;
1542	u32 p, p_base, sz_base;
1543	u32 i;
1544
1545	sz_base = ilog2_w(ws->sample_size);
1546	for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) {
1547		p = ws->bucket[i].count;
1548		p_base = ilog2_w(p);
1549		entropy_sum += p * (sz_base - p_base);
1550	}
1551
1552	entropy_sum /= ws->sample_size;
1553	return entropy_sum * 100 / entropy_max;
1554}
1555
1556#define RADIX_BASE		4U
1557#define COUNTERS_SIZE		(1U << RADIX_BASE)
1558
1559static u8 get4bits(u64 num, int shift) {
1560	u8 low4bits;
1561
1562	num >>= shift;
1563	/* Reverse order */
1564	low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE);
1565	return low4bits;
1566}
1567
1568/*
1569 * Use 4 bits as radix base
1570 * Use 16 u32 counters for calculating new position in buf array
1571 *
1572 * @array     - array that will be sorted
1573 * @array_buf - buffer array to store sorting results
1574 *              must be equal in size to @array
1575 * @num       - array size
1576 */
1577static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
1578		       int num)
1579{
1580	u64 max_num;
1581	u64 buf_num;
1582	u32 counters[COUNTERS_SIZE];
1583	u32 new_addr;
1584	u32 addr;
1585	int bitlen;
1586	int shift;
1587	int i;
1588
1589	/*
1590	 * Try avoid useless loop iterations for small numbers stored in big
1591	 * counters.  Example: 48 33 4 ... in 64bit array
1592	 */
1593	max_num = array[0].count;
1594	for (i = 1; i < num; i++) {
1595		buf_num = array[i].count;
1596		if (buf_num > max_num)
1597			max_num = buf_num;
1598	}
1599
1600	buf_num = ilog2(max_num);
1601	bitlen = ALIGN(buf_num, RADIX_BASE * 2);
1602
1603	shift = 0;
1604	while (shift < bitlen) {
1605		memset(counters, 0, sizeof(counters));
1606
1607		for (i = 0; i < num; i++) {
1608			buf_num = array[i].count;
1609			addr = get4bits(buf_num, shift);
1610			counters[addr]++;
1611		}
1612
1613		for (i = 1; i < COUNTERS_SIZE; i++)
1614			counters[i] += counters[i - 1];
1615
1616		for (i = num - 1; i >= 0; i--) {
1617			buf_num = array[i].count;
1618			addr = get4bits(buf_num, shift);
1619			counters[addr]--;
1620			new_addr = counters[addr];
1621			array_buf[new_addr] = array[i];
1622		}
1623
1624		shift += RADIX_BASE;
1625
1626		/*
1627		 * Normal radix expects to move data from a temporary array, to
1628		 * the main one.  But that requires some CPU time. Avoid that
1629		 * by doing another sort iteration to original array instead of
1630		 * memcpy()
1631		 */
1632		memset(counters, 0, sizeof(counters));
1633
1634		for (i = 0; i < num; i ++) {
1635			buf_num = array_buf[i].count;
1636			addr = get4bits(buf_num, shift);
1637			counters[addr]++;
1638		}
1639
1640		for (i = 1; i < COUNTERS_SIZE; i++)
1641			counters[i] += counters[i - 1];
1642
1643		for (i = num - 1; i >= 0; i--) {
1644			buf_num = array_buf[i].count;
1645			addr = get4bits(buf_num, shift);
1646			counters[addr]--;
1647			new_addr = counters[addr];
1648			array[new_addr] = array_buf[i];
1649		}
1650
1651		shift += RADIX_BASE;
1652	}
1653}
1654
1655/*
1656 * Size of the core byte set - how many bytes cover 90% of the sample
1657 *
1658 * There are several types of structured binary data that use nearly all byte
1659 * values. The distribution can be uniform and counts in all buckets will be
1660 * nearly the same (eg. encrypted data). Unlikely to be compressible.
1661 *
1662 * Other possibility is normal (Gaussian) distribution, where the data could
1663 * be potentially compressible, but we have to take a few more steps to decide
1664 * how much.
1665 *
1666 * @BYTE_CORE_SET_LOW  - main part of byte values repeated frequently,
1667 *                       compression algo can easy fix that
1668 * @BYTE_CORE_SET_HIGH - data have uniform distribution and with high
1669 *                       probability is not compressible
1670 */
1671#define BYTE_CORE_SET_LOW		(64)
1672#define BYTE_CORE_SET_HIGH		(200)
1673
1674static int byte_core_set_size(struct heuristic_ws *ws)
1675{
1676	u32 i;
1677	u32 coreset_sum = 0;
1678	const u32 core_set_threshold = ws->sample_size * 90 / 100;
1679	struct bucket_item *bucket = ws->bucket;
1680
1681	/* Sort in reverse order */
1682	radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE);
1683
1684	for (i = 0; i < BYTE_CORE_SET_LOW; i++)
1685		coreset_sum += bucket[i].count;
1686
1687	if (coreset_sum > core_set_threshold)
1688		return i;
1689
1690	for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) {
1691		coreset_sum += bucket[i].count;
1692		if (coreset_sum > core_set_threshold)
1693			break;
1694	}
1695
1696	return i;
1697}
1698
1699/*
1700 * Count byte values in buckets.
1701 * This heuristic can detect textual data (configs, xml, json, html, etc).
1702 * Because in most text-like data byte set is restricted to limited number of
1703 * possible characters, and that restriction in most cases makes data easy to
1704 * compress.
1705 *
1706 * @BYTE_SET_THRESHOLD - consider all data within this byte set size:
1707 *	less - compressible
1708 *	more - need additional analysis
1709 */
1710#define BYTE_SET_THRESHOLD		(64)
1711
1712static u32 byte_set_size(const struct heuristic_ws *ws)
1713{
1714	u32 i;
1715	u32 byte_set_size = 0;
1716
1717	for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
1718		if (ws->bucket[i].count > 0)
1719			byte_set_size++;
1720	}
1721
1722	/*
1723	 * Continue collecting count of byte values in buckets.  If the byte
1724	 * set size is bigger then the threshold, it's pointless to continue,
1725	 * the detection technique would fail for this type of data.
1726	 */
1727	for (; i < BUCKET_SIZE; i++) {
1728		if (ws->bucket[i].count > 0) {
1729			byte_set_size++;
1730			if (byte_set_size > BYTE_SET_THRESHOLD)
1731				return byte_set_size;
1732		}
1733	}
1734
1735	return byte_set_size;
1736}
1737
1738static bool sample_repeated_patterns(struct heuristic_ws *ws)
1739{
1740	const u32 half_of_sample = ws->sample_size / 2;
1741	const u8 *data = ws->sample;
1742
1743	return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0;
1744}
1745
1746static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
1747				     struct heuristic_ws *ws)
1748{
1749	struct page *page;
1750	u64 index, index_end;
1751	u32 i, curr_sample_pos;
1752	u8 *in_data;
1753
1754	/*
1755	 * Compression handles the input data by chunks of 128KiB
1756	 * (defined by BTRFS_MAX_UNCOMPRESSED)
1757	 *
1758	 * We do the same for the heuristic and loop over the whole range.
1759	 *
1760	 * MAX_SAMPLE_SIZE - calculated under assumption that heuristic will
1761	 * process no more than BTRFS_MAX_UNCOMPRESSED at a time.
1762	 */
1763	if (end - start > BTRFS_MAX_UNCOMPRESSED)
1764		end = start + BTRFS_MAX_UNCOMPRESSED;
1765
1766	index = start >> PAGE_SHIFT;
1767	index_end = end >> PAGE_SHIFT;
1768
1769	/* Don't miss unaligned end */
1770	if (!IS_ALIGNED(end, PAGE_SIZE))
1771		index_end++;
1772
1773	curr_sample_pos = 0;
1774	while (index < index_end) {
1775		page = find_get_page(inode->i_mapping, index);
1776		in_data = kmap_local_page(page);
1777		/* Handle case where the start is not aligned to PAGE_SIZE */
1778		i = start % PAGE_SIZE;
1779		while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
1780			/* Don't sample any garbage from the last page */
1781			if (start > end - SAMPLING_READ_SIZE)
1782				break;
1783			memcpy(&ws->sample[curr_sample_pos], &in_data[i],
1784					SAMPLING_READ_SIZE);
1785			i += SAMPLING_INTERVAL;
1786			start += SAMPLING_INTERVAL;
1787			curr_sample_pos += SAMPLING_READ_SIZE;
1788		}
1789		kunmap_local(in_data);
1790		put_page(page);
1791
1792		index++;
1793	}
1794
1795	ws->sample_size = curr_sample_pos;
1796}
1797
1798/*
1799 * Compression heuristic.
1800 *
1801 * For now is's a naive and optimistic 'return true', we'll extend the logic to
1802 * quickly (compared to direct compression) detect data characteristics
1803 * (compressible/uncompressible) to avoid wasting CPU time on uncompressible
1804 * data.
1805 *
1806 * The following types of analysis can be performed:
1807 * - detect mostly zero data
1808 * - detect data with low "byte set" size (text, etc)
1809 * - detect data with low/high "core byte" set
1810 *
1811 * Return non-zero if the compression should be done, 0 otherwise.
1812 */
1813int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
1814{
1815	struct list_head *ws_list = get_workspace(0, 0);
1816	struct heuristic_ws *ws;
1817	u32 i;
1818	u8 byte;
1819	int ret = 0;
1820
1821	ws = list_entry(ws_list, struct heuristic_ws, list);
1822
1823	heuristic_collect_sample(inode, start, end, ws);
1824
1825	if (sample_repeated_patterns(ws)) {
1826		ret = 1;
1827		goto out;
1828	}
1829
1830	memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE);
1831
1832	for (i = 0; i < ws->sample_size; i++) {
1833		byte = ws->sample[i];
1834		ws->bucket[byte].count++;
1835	}
1836
1837	i = byte_set_size(ws);
1838	if (i < BYTE_SET_THRESHOLD) {
1839		ret = 2;
1840		goto out;
1841	}
1842
1843	i = byte_core_set_size(ws);
1844	if (i <= BYTE_CORE_SET_LOW) {
1845		ret = 3;
1846		goto out;
1847	}
1848
1849	if (i >= BYTE_CORE_SET_HIGH) {
1850		ret = 0;
1851		goto out;
1852	}
1853
1854	i = shannon_entropy(ws);
1855	if (i <= ENTROPY_LVL_ACEPTABLE) {
1856		ret = 4;
1857		goto out;
1858	}
1859
1860	/*
1861	 * For the levels below ENTROPY_LVL_HIGH, additional analysis would be
1862	 * needed to give green light to compression.
1863	 *
1864	 * For now just assume that compression at that level is not worth the
1865	 * resources because:
1866	 *
1867	 * 1. it is possible to defrag the data later
1868	 *
1869	 * 2. the data would turn out to be hardly compressible, eg. 150 byte
1870	 * values, every bucket has counter at level ~54. The heuristic would
1871	 * be confused. This can happen when data have some internal repeated
1872	 * patterns like "abbacbbc...". This can be detected by analyzing
1873	 * pairs of bytes, which is too costly.
1874	 */
1875	if (i < ENTROPY_LVL_HIGH) {
1876		ret = 5;
1877		goto out;
1878	} else {
1879		ret = 0;
1880		goto out;
1881	}
1882
1883out:
1884	put_workspace(0, ws_list);
1885	return ret;
1886}
1887
/*
 * Convert the compression suffix (eg. after "zlib" starting with ":") to
 * level.  An unrecognized string selects the default level; type 0 always
 * yields level 0.
 */
unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
{
	unsigned int parsed = 0;

	if (type == 0)
		return 0;

	/* A suffix that does not parse as a decimal number counts as "default" */
	if (str[0] == ':' && kstrtouint(str + 1, 10, &parsed) != 0)
		parsed = 0;

	return btrfs_compress_set_level(type, parsed);
}