// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/error-injection.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
#include "tree-checker.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "relocation.h"
#include "file-item.h"

static struct kmem_cache *btrfs_path_cachep;

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, bool extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, bool empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
/*
 * The leaf data grows from end to front in the node.  This returns the address
 * of the start of the last item, which is the end of the leaf data stack.
 */
static unsigned int leaf_data_end(const struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);

	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
	return btrfs_item_offset(leaf, nr - 1);
}
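
/*
 * Editor's illustration (not part of the original file): the item array
 * grows from the front of the leaf while item data grows from the back, so
 * the free space in a leaf is the gap between the two.  With hypothetical
 * values:
 *
 *	nritems = 3;
 *	btrfs_item_offset(leaf, 2) == 15000;	// so leaf_data_end() == 15000
 *
 * the item headers occupy 3 * sizeof(struct btrfs_item) bytes at the front
 * of the item area (offsets are relative to the start of that area), so:
 *
 *	free space = leaf_data_end(leaf) - 3 * sizeof(struct btrfs_item);
 *
 * which should be essentially the quantity btrfs_leaf_free_space() reports.
 */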

/*
 * Move data in a @leaf (using memmove, safe for overlapping ranges).
 *
 * @leaf: leaf that we're doing a memmove on
 * @dst_offset: item data offset we're moving to
 * @src_offset: item data offset we're moving from
 * @len: length of the data we're moving
 *
 * Wrapper around memmove_extent_buffer() that takes into account the header on
 * the leaf.  The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf.  This
 * handles that math to simplify the callers.
 */
static inline void memmove_leaf_data(const struct extent_buffer *leaf,
				     unsigned long dst_offset,
				     unsigned long src_offset,
				     unsigned long len)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
			      btrfs_item_nr_offset(leaf, 0) + src_offset, len);
}

/*
 * Copy item data from @src into @dst at the given @offset.
 *
 * @dst: destination leaf that we're copying into
 * @src: source leaf that we're copying from
 * @dst_offset: item data offset we're copying to
 * @src_offset: item data offset we're copying from
 * @len: length of the data we're copying
 *
 * Wrapper around copy_extent_buffer() that takes into account the header on
 * the leaf.  The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf.  This
 * handles that math to simplify the callers.
 */
static inline void copy_leaf_data(const struct extent_buffer *dst,
				  const struct extent_buffer *src,
				  unsigned long dst_offset,
				  unsigned long src_offset, unsigned long len)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
			   btrfs_item_nr_offset(src, 0) + src_offset, len);
}

/*
 * Move items in a @leaf (using memmove).
 *
 * @leaf: leaf containing the items
 * @dst_item: the item nr we're copying into
 * @src_item: the item nr we're copying from
 * @nr_items: the number of items to copy
 *
 * Wrapper around memmove_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void memmove_leaf_items(const struct extent_buffer *leaf,
				      int dst_item, int src_item, int nr_items)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
			      btrfs_item_nr_offset(leaf, src_item),
			      nr_items * sizeof(struct btrfs_item));
}

/*
 * Copy items from @src into @dst at the given item position.
 *
 * @dst: destination leaf for the items
 * @src: source leaf for the items
 * @dst_item: the item nr we're copying into
 * @src_item: the item nr we're copying from
 * @nr_items: the number of items to copy
 *
 * Wrapper around copy_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void copy_leaf_items(const struct extent_buffer *dst,
				   const struct extent_buffer *src,
				   int dst_item, int src_item, int nr_items)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
			   btrfs_item_nr_offset(src, src_item),
			   nr_items * sizeof(struct btrfs_item));
}
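
/*
 * Editor's note (illustrative, not part of the original file): the four
 * helpers above come in pairs.  The *_leaf_data() variants take byte
 * offsets within the leaf's item data area, while the *_leaf_items()
 * variants take item indexes and convert them to byte offsets with
 * btrfs_item_nr_offset().  For example, shifting the item headers for
 * slots 5..nritems-1 down by one slot would be:
 *
 *	memmove_leaf_items(leaf, 4, 5, nritems - 5);
 */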

struct btrfs_path *btrfs_alloc_path(void)
{
	might_sleep();

	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* This also releases the path. */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}
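
/*
 * Editor's sketch of typical path usage (simplified from common callers,
 * error handling trimmed; the key values are hypothetical):
 *
 *	struct btrfs_path *path;
 *	struct btrfs_key key = { .objectid = ino,
 *				 .type = BTRFS_INODE_ITEM_KEY,
 *				 .offset = 0 };
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret == 0) {
 *		// Item found at path->nodes[0], slot path->slots[0].
 *	}
 *	btrfs_free_path(path);	// Drops locks and buffer references too.
 */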

/*
 * Releasing a path drops the references on the extent buffers in the path
 * and drops any locks held by this path.
 *
 * It is safe to call this on paths that hold no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * Safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node() for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here: we could free up the root node
		 * because it was COWed, but we may not get the new root node
		 * yet.  So do the inc_not_zero dance, and if it doesn't work
		 * then synchronize_rcu() and try again.
		 */
		if (refcount_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
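
/*
 * Editor's note: the loop above is the common RCU + refcount lookup
 * pattern.  In outline (a sketch, not btrfs-specific API):
 *
 *	rcu_read_lock();
 *	obj = rcu_dereference(shared_ptr);
 *	if (refcount_inc_not_zero(&obj->refs)) {
 *		rcu_read_unlock();
 *		// Safe to use obj beyond the RCU read section.
 *	} else {
 *		rcu_read_unlock();
 *		synchronize_rcu();	// Wait for the updater, then retry.
 *	}
 *
 * refcount_inc_not_zero() can only fail if the object's count already hit
 * zero, which here means a new root node must have been installed, so the
 * retry will eventually observe it.
 */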

/*
 * COW-only roots (non-shareable trees, everything that is not a subvolume or
 * reloc root) just get put onto a simple dirty list.  The transaction walks
 * this list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list. */
		if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * Used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;
	u64 reloc_src_root = 0;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != btrfs_get_root_last_trans(root));

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		reloc_src_root = btrfs_header_owner(buf);
	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     reloc_src_root, BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	if (unlikely(btrfs_header_generation(buf) > trans->transid)) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_inc_ref(trans, root, cow, 1);
		if (unlikely(ret))
			btrfs_abort_transaction(trans, ret);
	} else {
		ret = btrfs_inc_ref(trans, root, cow, 0);
		if (unlikely(ret))
			btrfs_abort_transaction(trans, ret);
	}
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		return ret;
	}

	btrfs_mark_buffer_dirty(trans, cow);
	*cow_ret = cow;
	return 0;
}

/*
 * Check if the tree block can be shared by multiple trees.
 */
bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans,
			       const struct btrfs_root *root,
			       const struct extent_buffer *buf)
{
	const u64 buf_gen = btrfs_header_generation(buf);

	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return false;

	if (buf == root->node)
		return false;

	if (buf_gen > btrfs_root_last_snapshot(&root->root_item) &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
		return false;

	if (buf != root->commit_root)
		return true;

	/*
	 * An extent buffer that used to be the commit root may still be shared
	 * because the tree height may have increased and it became a child of a
	 * higher level root.  This can happen when snapshotting a subvolume
	 * created in the current transaction.
	 */
	if (buf_gen == trans->transid)
		return true;

	return false;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in the tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in the tree block.
	 * The reason for this is that some operations (such as drop tree)
	 * are only allowed on blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(trans, root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags, NULL);
		if (ret)
			return ret;
		if (unlikely(refs == 0)) {
			btrfs_crit(fs_info,
	"found 0 references for tree block at bytenr %llu level %d root %llu",
				   buf->start, btrfs_header_level(buf),
				   btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	if (unlikely(owner == BTRFS_TREE_RELOC_OBJECTID &&
		     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))) {
		btrfs_crit(fs_info,
"found tree block at bytenr %llu level %d root %llu refs %llu flags %llx without full backref flag set",
			   buf->start, btrfs_header_level(buf),
			   btrfs_root_id(root), refs, flags);
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (refs > 1) {
		if ((owner == btrfs_root_id(root) ||
		     btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			ret = btrfs_set_disk_extent_flags(trans, buf,
						BTRFS_BLOCK_FLAG_FULL_BACKREF);
			if (ret)
				return ret;
		} else {
			if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clear_buffer_dirty(trans, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * Does the dirty work of COWing a single block.  The parent block (if
 * supplied) is updated to point to the new COW copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more COW.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct extent_buffer *buf,
			  struct extent_buffer *parent, int parent_slot,
			  struct extent_buffer **cow_ret,
			  u64 search_start, u64 empty_size,
			  enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;
	u64 reloc_src_root = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_write_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != btrfs_get_root_last_trans(root));

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		reloc_src_root = btrfs_header_owner(buf);
	}
	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     btrfs_root_id(root), &disk_key, level,
				     search_start, empty_size, reloc_src_root, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, btrfs_root_id(root));

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (unlikely(ret)) {
		btrfs_abort_transaction(trans, ret);
		goto error_unlock_cow;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
		refcount_inc(&cow->refs);
		rcu_assign_pointer(root->node, cow);

		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
					    parent_start, last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
		btrfs_set_node_blockptr(parent, parent_slot, cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(trans, parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (unlikely(ret)) {
				btrfs_abort_transaction(trans, ret);
				goto error_unlock_cow;
			}
		}
		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
					    parent_start, last_ref);
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
	}

	trace_btrfs_cow_block(root, buf, cow);
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(trans, cow);
	*cow_ret = cow;
	return 0;

error_unlock_cow:
	btrfs_tree_unlock(cow);
	free_extent_buffer(cow);
	return ret;
}

static inline bool should_cow_block(const struct btrfs_trans_handle *trans,
				    const struct btrfs_root *root,
				    const struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return false;

	/*
	 * We do not need to COW a block if:
	 *
	 * 1) the block was not created or changed in this transaction;
	 * 2) the block does not belong to the TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create a snapshot during committing the transaction,
	 *    after we've finished copying the source root, we must COW the
	 *    shared block to ensure the metadata consistency.
	 */

	if (btrfs_header_generation(buf) != trans->transid)
		return true;

	if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN))
		return true;

	/* Ensure we can see the FORCE_COW bit. */
	smp_mb__before_atomic();
	if (test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return true;

	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		return false;

	if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
		return true;

	return false;
}

/*
 * COWs a single block, see btrfs_force_cow_block() for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet.
 */
int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret,
		    enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;

	if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
		   "attempt to COW block %llu on root %llu that is being deleted",
			   buf->start, btrfs_root_id(root));
		return -EUCLEAN;
	}

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED).  If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu",
			   buf->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = round_down(buf->start, SZ_1G);

	/*
	 * Before COWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	return btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
				     cow_ret, search_start, 0, nest);
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
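
/*
 * Editor's sketch (illustrative, not part of the original file): the usual
 * call pattern for COWing a node on a search path before modifying it,
 * assuming @path holds write locks on @level and @level + 1:
 *
 *	ret = btrfs_cow_block(trans, root, path->nodes[level],
 *			      path->nodes[level + 1], path->slots[level + 1],
 *			      &path->nodes[level], BTRFS_NESTING_COW);
 *	if (ret)
 *		return ret;
 *	// path->nodes[level] now points to a copy owned by this transaction.
 */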

/*
 * Same as btrfs_comp_keys(), only with two struct btrfs_key's.
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
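
/*
 * Editor's example: keys sort by (objectid, type, offset), so, e.g.:
 *
 *	struct btrfs_key a = { .objectid = 256, .type = BTRFS_INODE_ITEM_KEY,
 *			       .offset = 0 };
 *	struct btrfs_key b = { .objectid = 256, .type = BTRFS_INODE_REF_KEY,
 *			       .offset = 256 };
 *
 * btrfs_comp_cpu_keys(&a, &b) returns -1: the objectids tie, and
 * BTRFS_INODE_ITEM_KEY (1) sorts before BTRFS_INODE_REF_KEY (12).
 */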

/*
 * Search for a key in the given extent_buffer.
 *
 * The lower boundary for the search is specified by the slot number @first_slot.
 * Use a value of 0 to search over the whole extent buffer.  Works for both
 * leaves and nodes.
 *
 * The slot in the extent buffer is returned via @slot.  If the key exists in the
 * extent buffer, then @slot will point to the slot where the key is, otherwise
 * it points to the slot where you would insert the key.
 *
 * Slot may point to the total number of items (i.e. one position beyond the last
 * key) if the key is bigger than the last key in the extent buffer.
 */
int btrfs_bin_search(const struct extent_buffer *eb, int first_slot,
		     const struct btrfs_key *key, int *slot)
{
	unsigned long p;
	int item_size;
	/*
	 * Use unsigned types for the low and high slots, so that we get a more
	 * efficient division in the search loop below.
	 */
	u32 low = first_slot;
	u32 high = btrfs_header_nritems(eb);
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (unlikely(low > high)) {
		btrfs_err(eb->fs_info,
		 "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	if (btrfs_header_level(eb) == 0) {
		p = offsetof(struct btrfs_leaf, items);
		item_size = sizeof(struct btrfs_item);
	} else {
		p = offsetof(struct btrfs_node, ptrs);
		item_size = sizeof(struct btrfs_key_ptr);
	}

	while (low < high) {
		const int unit_size = eb->folio_size;
		unsigned long oil;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oil = get_eb_offset_in_folio(eb, offset);

		if (oil + key_size <= unit_size) {
			const unsigned long idx = get_eb_folio_index(eb, offset);
			char *kaddr = folio_address(eb->folios[idx]);

			oil = get_eb_offset_in_folio(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oil);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = btrfs_comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}
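
/*
 * Editor's sketch of typical usage (not part of the original file): find
 * where @key lives, or would be inserted, in an extent buffer @eb that the
 * caller already holds:
 *
 *	int slot;
 *	int ret = btrfs_bin_search(eb, 0, &key, &slot);
 *
 *	// ret == 0: exact match, the key is at @slot.
 *	// ret == 1: no match, @slot is where the key would be inserted
 *	//           (possibly == nritems, one past the last key).
 *	// ret  < 0: error.
 */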

static void root_add_used_bytes(struct btrfs_root *root)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
		btrfs_root_used(&root->root_item) + root->fs_info->nodesize);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used_bytes(struct btrfs_root *root)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
		btrfs_root_used(&root->root_item) - root->fs_info->nodesize);
	spin_unlock(&root->accounting_lock);
}

/*
 * Given a node and a slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct btrfs_tree_parent_check check = { 0 };
	struct extent_buffer *eb;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	ASSERT(level);

	check.level = level - 1;
	check.transid = btrfs_node_ptr_generation(parent, slot);
	check.owner_root = btrfs_header_owner(parent);
	check.has_first_key = true;
	btrfs_node_key_to_cpu(parent, &check.first_key, slot);

	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     &check);
	if (IS_ERR(eb))
		return eb;
	if (unlikely(!extent_buffer_uptodate(eb))) {
		free_extent_buffer(eb);
		return ERR_PTR(-EIO);
	}

	return eb;
}

/*
 * Promote a child node to become the new tree root.
 *
 * @trans: Transaction handle
 * @root: Tree root structure to update
 * @path: Path holding nodes and locks
 * @level: Level of the parent (old root)
 * @parent: The parent (old root) with exactly one item
 *
 * This helper is called during rebalancing when the root node contains only
 * a single item (nritems == 1).  We can reduce the tree height by promoting
 * that child to become the new root and freeing the old root node.  The path
 * locks and references are updated accordingly.
 *
 * Return: 0 on success, negative errno on failure.  The transaction is aborted
 * on critical errors.
 */
static int promote_child_to_root(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, struct btrfs_path *path,
				 int level, struct extent_buffer *parent)
{
	struct extent_buffer *child;
	int ret;

	ASSERT(btrfs_header_nritems(parent) == 1);

	child = btrfs_read_node_slot(parent, 0);
	if (IS_ERR(child))
		return PTR_ERR(child);

	btrfs_tree_lock(child);
	ret = btrfs_cow_block(trans, root, child, parent, 0, &child, BTRFS_NESTING_COW);
	if (ret) {
		btrfs_tree_unlock(child);
		free_extent_buffer(child);
		return ret;
	}

	ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
	if (unlikely(ret < 0)) {
		btrfs_tree_unlock(child);
		free_extent_buffer(child);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	rcu_assign_pointer(root->node, child);

	add_root_to_dirty_list(root);
	btrfs_tree_unlock(child);

	path->locks[level] = 0;
	path->nodes[level] = NULL;
	btrfs_clear_buffer_dirty(trans, parent);
	btrfs_tree_unlock(parent);
	/* Once for the path. */
	free_extent_buffer(parent);

	root_sub_used_bytes(root);
	ret = btrfs_free_tree_block(trans, btrfs_root_id(root), parent, 0, 1);
	/* Once for the root ptr. */
	free_extent_buffer_stale(parent);
	if (unlikely(ret < 0)) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	return 0;
}

/*
 * Node-level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * Deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root.
	 */
	if (!parent) {
		if (btrfs_header_nritems(mid) != 1)
			return 0;

		return promote_child_to_root(trans, root, path, level, mid);
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	if (pslot) {
		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left)) {
			ret = PTR_ERR(left);
			left = NULL;
			goto out;
		}

		btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	if (pslot + 1 < btrfs_header_nritems(parent)) {
		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right)) {
			ret = PTR_ERR(right);
			right = NULL;
			goto out;
		}

		btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	/* First, try to make some room in the middle buffer. */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/* Then try to empty the right most buffer into the middle. */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			btrfs_clear_buffer_dirty(trans, right);
			btrfs_tree_unlock(right);
			ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1);
			if (ret < 0) {
				free_extent_buffer_stale(right);
				right = NULL;
				goto out;
			}
			root_sub_used_bytes(root);
			ret = btrfs_free_tree_block(trans, btrfs_root_id(root),
						    right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
			if (unlikely(ret < 0)) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		} else {
			struct btrfs_disk_key right_key;

			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (unlikely(ret < 0)) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(trans, parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * We're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right.
		 */
		if (unlikely(!left)) {
			btrfs_crit(fs_info,
"missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
				   parent->start, btrfs_header_level(parent),
				   mid->start, btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto out;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		ret = btrfs_del_ptr(trans, root, path, level + 1, pslot);
		if (ret < 0) {
			free_extent_buffer_stale(mid);
			mid = NULL;
			goto out;
		}
		root_sub_used_bytes(root);
		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	} else {
		/* Update the parent key to reflect our changes. */
		struct btrfs_disk_key mid_key;

		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(trans, parent);
	}

	/* Update the path. */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			/* left was locked after COW */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			/* Left is now owned by the path. */
			left = NULL;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* Double check we haven't messed things up. */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
out:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}

/*
 * Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	/* First, try to make some room in the middle buffer. */
	if (pslot) {
		u32 left_nr;

		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left))
			return PTR_ERR(left);

		btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else
				wret = push_node_left(trans, left, mid, 0);
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (unlikely(ret < 0)) {
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(trans, parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -= btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}

	/* Then try to empty the right most buffer into the middle. */
	if (pslot + 1 < btrfs_header_nritems(parent)) {
		u32 right_nr;

		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right))
			return PTR_ERR(right);

		btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else
				wret = balance_node_right(trans, right, mid);
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (unlikely(ret < 0)) {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(trans, parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * Readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering readahead on them.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     const struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	if (path->reada != READA_FORWARD_ALWAYS) {
		struct extent_buffer *eb;

		eb = find_extent_buffer(fs_info, search);
		if (eb) {
			free_extent_buffer(eb);
			return;
		}
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		if (nread > nread_max || nscan > 32)
			break;
	}
}

static noinline void reada_for_balance(const struct btrfs_path *path, int level)
{
	struct extent_buffer *parent;
	int slot;
	int nritems;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0)
		btrfs_readahead_node_child(parent, slot - 1);
	if (slot + 1 < nritems)
		btrfs_readahead_node_child(parent, slot + 1);
}

/*
 * When we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * Callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock, so
 * if lowest_unlock is 1, level 0 won't be unlocked.
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	bool check_skip = true;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;

		if (check_skip) {
			if (path->slots[i] == 0) {
				skip_level = i + 1;
				continue;
			}

			if (path->keep_locks) {
				u32 nritems;

				nritems = btrfs_header_nritems(path->nodes[i]);
				if (nritems < 1 || path->slots[i] >= nritems - 1) {
					skip_level = i + 1;
					continue;
				}
			}
		}

		if (i >= lowest_unlock && i > skip_level) {
			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
			check_skip = false;
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}

/*
 * Helper function for btrfs_search_slot() and other functions that do a search
 * on a btree.  The goal is to find a tree block in the cache (the radix tree at
 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
 * its pages from disk.
 *
 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
 * whole btree search, starting again from the current root node.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_tree_parent_check check = { 0 };
	u64 blocknr;
	struct extent_buffer *tmp = NULL;
	int ret = 0;
	int ret2;
	int parent_level;
	bool read_tmp = false;
	bool tmp_locked = false;
	bool path_released = false;

	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
	check.has_first_key = true;
	check.level = parent_level - 1;
	check.transid = btrfs_node_ptr_generation(*eb_ret, slot);
	check.owner_root = btrfs_root_id(root);

	/*
	 * If we need to read an extent buffer from disk and we are holding locks
	 * on upper level nodes, we unlock all the upper nodes before reading the
	 * extent buffer, and then return -EAGAIN to the caller as it needs to
	 * restart the search.  We don't release the lock on the current level
	 * because we need to walk this node to figure out which blocks to read.
	 */
	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, parent_level, slot, key->objectid);

		/* First we do an atomic uptodate check. */
		if (btrfs_buffer_uptodate(tmp, check.transid, true) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (unlikely(btrfs_verify_level_key(tmp, &check))) {
				ret = -EUCLEAN;
				goto out;
			}
			*eb_ret = tmp;
			tmp = NULL;
			ret = 0;
			goto out;
		}

		if (p->nowait) {
			ret = -EAGAIN;
			goto out;
		}

		if (!p->skip_locking) {
			btrfs_unlock_up_safe(p, parent_level + 1);
			btrfs_maybe_reset_lockdep_class(root, tmp);
			tmp_locked = true;
			btrfs_tree_read_lock(tmp);
			btrfs_release_path(p);
			ret = -EAGAIN;
			path_released = true;
		}

		/* Now we're allowed to do a blocking uptodate check. */
		ret2 = btrfs_read_extent_buffer(tmp, &check);
		if (ret2) {
			ret = ret2;
			goto out;
		}

		if (ret == 0) {
			ASSERT(!tmp_locked);
			*eb_ret = tmp;
			tmp = NULL;
		}
		goto out;
	} else if (p->nowait) {
		ret = -EAGAIN;
		goto out;
	}

	if (!p->skip_locking) {
		btrfs_unlock_up_safe(p, parent_level + 1);
		ret = -EAGAIN;
	}

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, parent_level, slot, key->objectid);

	tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
	if (IS_ERR(tmp)) {
		ret = PTR_ERR(tmp);
		tmp = NULL;
		goto out;
	}
	read_tmp = true;

	if (!p->skip_locking) {
		ASSERT(ret == -EAGAIN);
		btrfs_maybe_reset_lockdep_class(root, tmp);
		tmp_locked = true;
		btrfs_tree_read_lock(tmp);
		btrfs_release_path(p);
		path_released = true;
	}

	/* Now we're allowed to do a blocking uptodate check. */
	ret2 = btrfs_read_extent_buffer(tmp, &check);
	if (ret2) {
		ret = ret2;
		goto out;
	}

	/*
	 * If the read above didn't mark this buffer up to date,
	 * it will never end up being up to date.  Set ret to EIO now
	 * and give up so that our caller doesn't loop forever
	 * on our EAGAINs.
	 */
	if (unlikely(!extent_buffer_uptodate(tmp))) {
		ret = -EIO;
		goto out;
	}

	if (ret == 0) {
		ASSERT(!tmp_locked);
		*eb_ret = tmp;
		tmp = NULL;
	}
out:
	if (tmp) {
		if (tmp_locked)
			btrfs_tree_read_unlock(tmp);
		if (read_tmp && ret && ret != -EAGAIN)
			free_extent_buffer_stale(tmp);
		else
			free_extent_buffer(tmp);
	}
	if (ret && !path_released)
		btrfs_release_path(p);

	return ret;
}

/*
 * Helper function for btrfs_search_slot().  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot() must
 * start over.
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);

		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}

int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		    u64 iobjectid, u64 ioff, u8 key_type,
		    struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	if (found_key->type != key.type ||
	    found_key->objectid != key.objectid)
		return 1;

	return 0;
}
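
/*
 * Editor's example (illustrative, not part of the original file): finding
 * the first item with a given objectid and type when the exact offset is
 * unknown; the offset passed in only seeds the search position:
 *
 *	struct btrfs_key found;
 *	int ret;
 *
 *	ret = btrfs_find_item(fs_root, path, objectid, 0,
 *			      BTRFS_ROOT_REF_KEY, &found);
 *	if (ret == 0) {
 *		// @found is the key at path->nodes[0] / path->slots[0],
 *		// with matching objectid and type.
 *	} else if (ret == 1) {
 *		// No such item exists at or after the given offset.
 *	}
 */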

static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		b = root->commit_root;
		refcount_inc(&b->refs);
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root is true.
		 */
		ASSERT(p->skip_locking);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root. */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked.
		 */
		if (p->nowait) {
			b = btrfs_try_read_lock_root_node(root);
			if (IS_ERR(b))
				return b;
		} else {
			b = btrfs_read_lock_root_node(root);
		}
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock. */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again. */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (unlikely(!extent_buffer_uptodate(b))) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}

/*
 * Replace the extent buffer at the lowest level of the path with a cloned
 * version.  The purpose is to be able to use it safely, after releasing the
 * commit root semaphore, even if relocation is happening in parallel, the
 * transaction used for relocation is committed and the extent buffer is
 * reallocated in the next transaction.
 *
 * This is used in a context where the caller does not prevent transaction
 * commits from happening, either by holding a transaction handle or holding
 * some lock, while it's doing searches through a commit root.
 * At the moment it's only used for send operations.
 */
static int finish_need_commit_sem_search(struct btrfs_path *path)
{
	const int i = path->lowest_level;
	const int slot = path->slots[i];
	struct extent_buffer *lowest = path->nodes[i];
	struct extent_buffer *clone;

	ASSERT(path->need_commit_sem);

	if (!lowest)
		return 0;

	lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);

	clone = btrfs_clone_extent_buffer(lowest);
	if (!clone)
		return -ENOMEM;

	btrfs_release_path(path);
	path->nodes[i] = clone;
	path->slots[i] = slot;

	return 0;
}

static inline int search_for_key_slot(const struct extent_buffer *eb,
				      int search_low_slot,
				      const struct btrfs_key *key,
				      int prev_cmp,
				      int *slot)
{
	/*
	 * If a previous call to btrfs_bin_search() on a parent node returned an
	 * exact match (prev_cmp == 0), we can safely assume the target key will
	 * always be at slot 0 on lower levels, since each key pointer
	 * (struct btrfs_key_ptr) refers to the lowest key accessible from the
	 * subtree it points to.  Thus we can skip searching lower levels.
	 */
	if (prev_cmp == 0) {
		*slot = 0;
		return 0;
	}

	return btrfs_bin_search(eb, search_low_slot, key, slot);
}
1861
1862static int search_leaf(struct btrfs_trans_handle *trans,
1863 struct btrfs_root *root,
1864 const struct btrfs_key *key,
1865 struct btrfs_path *path,
1866 int ins_len,
1867 int prev_cmp)
1868{
1869 struct extent_buffer *leaf = path->nodes[0];
1870 int leaf_free_space = -1;
1871 int search_low_slot = 0;
1872 int ret;
1873 bool do_bin_search = true;
1874
1875 /*
1876 * If we are doing an insertion, the leaf has enough free space and the
1877 * destination slot for the key is not slot 0, then we can unlock our
1878 * write lock on the parent, and any other upper nodes, before doing the
1879 * binary search on the leaf (with search_for_key_slot()), allowing other
1880 * tasks to lock the parent and any other upper nodes.
1881 */
1882 if (ins_len > 0) {
1883 /*
1884 * Cache the leaf free space, since we will need it later and it
1885 * will not change until then.
1886 */
1887 leaf_free_space = btrfs_leaf_free_space(leaf);
1888
1889 /*
1890 * !path->locks[1] means we have a single node tree, the leaf is
1891 * the root of the tree.
1892 */
1893 if (path->locks[1] && leaf_free_space >= ins_len) {
1894 struct btrfs_disk_key first_key;
1895
1896 ASSERT(btrfs_header_nritems(leaf) > 0);
1897 btrfs_item_key(leaf, &first_key, 0);
1898
1899 /*
1900 * Doing the extra comparison with the first key is cheap,
1901 * taking into account that the first key is very likely
1902 * already in a cache line because it immediately follows
1903 * the extent buffer's header and we have recently accessed
1904 * the header's level field.
1905 */
1906 ret = btrfs_comp_keys(&first_key, key);
1907 if (ret < 0) {
1908 /*
1909 * The first key is smaller than the key we want
1910 * to insert, so we are safe to unlock all upper
1911 * nodes and we have to do the binary search.
1912 *
1913 * We do use btrfs_unlock_up_safe() and not
1914				 * unlock_up() because the latter does not unlock
1915 * nodes with a slot of 0 - we can safely unlock
1916 * any node even if its slot is 0 since in this
1917 * case the key does not end up at slot 0 of the
1918 * leaf and there's no need to split the leaf.
1919 */
1920 btrfs_unlock_up_safe(path, 1);
1921 search_low_slot = 1;
1922 } else {
1923 /*
1924				 * The first key is >= the key we want to
1925 * insert, so we can skip the binary search as
1926 * the target key will be at slot 0.
1927 *
1928				 * We cannot unlock upper nodes when the key is
1929 * less than the first key, because we will need
1930 * to update the key at slot 0 of the parent node
1931 * and possibly of other upper nodes too.
1932 * If the key matches the first key, then we can
1933 * unlock all the upper nodes, using
1934 * btrfs_unlock_up_safe() instead of unlock_up()
1935 * as stated above.
1936 */
1937 if (ret == 0)
1938 btrfs_unlock_up_safe(path, 1);
1939 /*
1940 * ret is already 0 or 1, matching the result of
1941 * a btrfs_bin_search() call, so there is no need
1942 * to adjust it.
1943 */
1944 do_bin_search = false;
1945 path->slots[0] = 0;
1946 }
1947 }
1948 }
1949
1950 if (do_bin_search) {
1951 ret = search_for_key_slot(leaf, search_low_slot, key,
1952 prev_cmp, &path->slots[0]);
1953 if (ret < 0)
1954 return ret;
1955 }
1956
1957 if (ins_len > 0) {
1958 /*
1959 * Item key already exists. In this case, if we are allowed to
1960		 * insert the item (for example, in the dir_item case, item key
1961		 * collisions are allowed), it will be merged with the original
1962		 * item. Only the item size grows, no new btrfs item will be
1963		 * added. If search_for_extension is not set, ins_len already
1964		 * accounts for the size of struct btrfs_item, so deduct it here
1965		 * to keep the leaf space check correct.
1966 */
1967 if (ret == 0 && !path->search_for_extension) {
1968 ASSERT(ins_len >= sizeof(struct btrfs_item));
1969 ins_len -= sizeof(struct btrfs_item);
1970 }
1971
1972 ASSERT(leaf_free_space >= 0);
1973
1974 if (leaf_free_space < ins_len) {
1975 int ret2;
1976
1977 ret2 = split_leaf(trans, root, key, path, ins_len, (ret == 0));
1978 ASSERT(ret2 <= 0);
1979 if (WARN_ON(ret2 > 0))
1980 ret2 = -EUCLEAN;
1981 if (ret2)
1982 ret = ret2;
1983 }
1984 }
1985
1986 return ret;
1987}
1988
1989/*
1990 * Look for a key in a tree and perform necessary modifications to preserve
1991 * tree invariants.
1992 *
1993 * @trans: Handle of transaction, used when modifying the tree
1994 * @p: Holds all btree nodes along the search path
1995 * @root: The root node of the tree
1996 * @key: The key we are looking for
1997 * @ins_len: Indicates purpose of search:
1998 * >0 for inserts it's size of item inserted (*)
1999 * <0 for deletions
2000 * 0 for plain searches, not modifying the tree
2001 *
2002 * (*) If size of item inserted doesn't include
2003 * sizeof(struct btrfs_item), then p->search_for_extension must
2004 * be set.
2005 * @cow: boolean should CoW operations be performed. Must always be 1
2006 * when modifying the tree.
2007 *
2008 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2009 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2010 *
2011 * If @key is found, 0 is returned and you can find the item in the leaf level
2012 * of the path (level 0)
2013 *
2014 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2015 * points to the slot where it should be inserted
2016 *
2017 * If an error is encountered while searching the tree a negative error number
2018 * is returned
2019 */
2020int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2021 const struct btrfs_key *key, struct btrfs_path *p,
2022 int ins_len, int cow)
2023{
2024 struct btrfs_fs_info *fs_info;
2025 struct extent_buffer *b;
2026 int slot;
2027 int ret;
2028 int level;
2029 int lowest_unlock = 1;
2030 /* everything at write_lock_level or lower must be write locked */
2031 int write_lock_level = 0;
2032 u8 lowest_level = 0;
2033 int min_write_lock_level;
2034 int prev_cmp;
2035
2036 if (!root)
2037 return -EINVAL;
2038
2039 fs_info = root->fs_info;
2040 might_sleep();
2041
2042 lowest_level = p->lowest_level;
2043 WARN_ON(lowest_level && ins_len > 0);
2044 WARN_ON(p->nodes[0] != NULL);
2045 BUG_ON(!cow && ins_len);
2046
2047 /*
2048	 * For now only allow nowait for read-only operations. There's no
2049	 * strict reason why we couldn't also support writes, we just only need
2050	 * it for reads, so it's only implemented for reads.
2051 */
2052 ASSERT(!p->nowait || !cow);
2053
2054 if (ins_len < 0) {
2055 lowest_unlock = 2;
2056
2057		/* when we are removing items, we might have to go up to level
2058		 * two as we update tree pointers. Make sure we keep write
2059		 * locks on those levels as well
2060		 */
2061 write_lock_level = 2;
2062 } else if (ins_len > 0) {
2063 /*
2064 * for inserting items, make sure we have a write lock on
2065 * level 1 so we can update keys
2066 */
2067 write_lock_level = 1;
2068 }
2069
2070 if (!cow)
2071 write_lock_level = -1;
2072
2073 if (cow && (p->keep_locks || p->lowest_level))
2074 write_lock_level = BTRFS_MAX_LEVEL;
2075
2076 min_write_lock_level = write_lock_level;
2077
2078 if (p->need_commit_sem) {
2079 ASSERT(p->search_commit_root);
2080 if (p->nowait) {
2081 if (!down_read_trylock(&fs_info->commit_root_sem))
2082 return -EAGAIN;
2083 } else {
2084 down_read(&fs_info->commit_root_sem);
2085 }
2086 }
2087
2088again:
2089 prev_cmp = -1;
2090 b = btrfs_search_slot_get_root(root, p, write_lock_level);
2091 if (IS_ERR(b)) {
2092 ret = PTR_ERR(b);
2093 goto done;
2094 }
2095
2096 while (b) {
2097 int dec = 0;
2098 int ret2;
2099
2100 level = btrfs_header_level(b);
2101
2102 if (cow) {
2103 bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2104
2105 /*
2106 * if we don't really need to cow this block
2107 * then we don't want to set the path blocking,
2108 * so we test it here
2109 */
2110 if (!should_cow_block(trans, root, b))
2111 goto cow_done;
2112
2113 /*
2114 * must have write locks on this node and the
2115 * parent
2116 */
2117 if (level > write_lock_level ||
2118 (level + 1 > write_lock_level &&
2119 level + 1 < BTRFS_MAX_LEVEL &&
2120 p->nodes[level + 1])) {
2121 write_lock_level = level + 1;
2122 btrfs_release_path(p);
2123 goto again;
2124 }
2125
2126 if (last_level)
2127 ret2 = btrfs_cow_block(trans, root, b, NULL, 0,
2128 &b, BTRFS_NESTING_COW);
2129 else
2130 ret2 = btrfs_cow_block(trans, root, b,
2131 p->nodes[level + 1],
2132 p->slots[level + 1], &b,
2133 BTRFS_NESTING_COW);
2134 if (ret2) {
2135 ret = ret2;
2136 goto done;
2137 }
2138 }
2139cow_done:
2140 p->nodes[level] = b;
2141
2142 /*
2143 * we have a lock on b and as long as we aren't changing
2144		 * the tree, there is no way for the items in b to change.
2145 * It is safe to drop the lock on our parent before we
2146 * go through the expensive btree search on b.
2147 *
2148 * If we're inserting or deleting (ins_len != 0), then we might
2149 * be changing slot zero, which may require changing the parent.
2150 * So, we can't drop the lock until after we know which slot
2151 * we're operating on.
2152 */
2153 if (!ins_len && !p->keep_locks) {
2154 int u = level + 1;
2155
2156 if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2157 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2158 p->locks[u] = 0;
2159 }
2160 }
2161
2162 if (level == 0) {
2163 if (ins_len > 0)
2164 ASSERT(write_lock_level >= 1);
2165
2166 ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
2167 if (!p->search_for_split)
2168 unlock_up(p, level, lowest_unlock,
2169 min_write_lock_level, NULL);
2170 goto done;
2171 }
2172
2173 ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
2174 if (ret < 0)
2175 goto done;
2176 prev_cmp = ret;
2177
2178 if (ret && slot > 0) {
2179 dec = 1;
2180 slot--;
2181 }
2182 p->slots[level] = slot;
2183 ret2 = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2184 &write_lock_level);
2185 if (ret2 == -EAGAIN)
2186 goto again;
2187 if (ret2) {
2188 ret = ret2;
2189 goto done;
2190 }
2191 b = p->nodes[level];
2192 slot = p->slots[level];
2193
2194 /*
2195 * Slot 0 is special, if we change the key we have to update
2196 * the parent pointer which means we must have a write lock on
2197 * the parent
2198 */
2199 if (slot == 0 && ins_len && write_lock_level < level + 1) {
2200 write_lock_level = level + 1;
2201 btrfs_release_path(p);
2202 goto again;
2203 }
2204
2205 unlock_up(p, level, lowest_unlock, min_write_lock_level,
2206 &write_lock_level);
2207
2208 if (level == lowest_level) {
2209 if (dec)
2210 p->slots[level]++;
2211 goto done;
2212 }
2213
2214 ret2 = read_block_for_search(root, p, &b, slot, key);
2215 if (ret2 == -EAGAIN && !p->nowait)
2216 goto again;
2217 if (ret2) {
2218 ret = ret2;
2219 goto done;
2220 }
2221
2222 if (!p->skip_locking) {
2223 level = btrfs_header_level(b);
2224
2225 btrfs_maybe_reset_lockdep_class(root, b);
2226
2227 if (level <= write_lock_level) {
2228 btrfs_tree_lock(b);
2229 p->locks[level] = BTRFS_WRITE_LOCK;
2230 } else {
2231 if (p->nowait) {
2232 if (!btrfs_try_tree_read_lock(b)) {
2233 free_extent_buffer(b);
2234 ret = -EAGAIN;
2235 goto done;
2236 }
2237 } else {
2238 btrfs_tree_read_lock(b);
2239 }
2240 p->locks[level] = BTRFS_READ_LOCK;
2241 }
2242 p->nodes[level] = b;
2243 }
2244 }
2245 ret = 1;
2246done:
2247 if (ret < 0 && !p->skip_release_on_error)
2248 btrfs_release_path(p);
2249
2250 if (p->need_commit_sem) {
2251 int ret2;
2252
2253 ret2 = finish_need_commit_sem_search(p);
2254 up_read(&fs_info->commit_root_sem);
2255 if (ret2)
2256 ret = ret2;
2257 }
2258
2259 return ret;
2260}
2261ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
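
/*
 * A minimal read-only usage sketch for btrfs_search_slot() (not part of
 * this file; "root", "ino" and the key values are illustrative):
 *
 *	struct btrfs_path *path;
 *	struct btrfs_key key;
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	key.objectid = ino;
 *	key.type = BTRFS_INODE_ITEM_KEY;
 *	key.offset = 0;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret == 0) {
 *		... the item is at path->nodes[0], slot path->slots[0] ...
 *	} else if (ret > 0) {
 *		... not found, path->slots[0] is where it would be inserted ...
 *	}
 *	btrfs_free_path(path);
 *
 * For modifications, pass a transaction handle, cow == 1 and, for an
 * insertion, a positive ins_len covering the new item.
 */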
2262
2263/*
2264 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2265 * current state of the tree together with the operations recorded in the tree
2266 * modification log to search for the key in a previous version of this tree, as
2267 * denoted by the time_seq parameter.
2268 *
2269 * Naturally, there is no support for insert, delete or cow operations.
2270 *
2271 * The resulting path and return value will be set up as if we called
2272 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2273 */
2274int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2275 struct btrfs_path *p, u64 time_seq)
2276{
2277 struct btrfs_fs_info *fs_info = root->fs_info;
2278 struct extent_buffer *b;
2279 int slot;
2280 int ret;
2281 int level;
2282 int lowest_unlock = 1;
2283 u8 lowest_level = 0;
2284
2285 lowest_level = p->lowest_level;
2286 WARN_ON(p->nodes[0] != NULL);
2287 ASSERT(!p->nowait);
2288
2289 if (p->search_commit_root) {
2290 BUG_ON(time_seq);
2291 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2292 }
2293
2294again:
2295 b = btrfs_get_old_root(root, time_seq);
2296 if (unlikely(!b)) {
2297 ret = -EIO;
2298 goto done;
2299 }
2300 level = btrfs_header_level(b);
2301 p->locks[level] = BTRFS_READ_LOCK;
2302
2303 while (b) {
2304 int dec = 0;
2305 int ret2;
2306
2307 level = btrfs_header_level(b);
2308 p->nodes[level] = b;
2309
2310 /*
2311 * we have a lock on b and as long as we aren't changing
2312		 * the tree, there is no way for the items in b to change.
2313 * It is safe to drop the lock on our parent before we
2314 * go through the expensive btree search on b.
2315 */
2316 btrfs_unlock_up_safe(p, level + 1);
2317
2318 ret = btrfs_bin_search(b, 0, key, &slot);
2319 if (ret < 0)
2320 goto done;
2321
2322 if (level == 0) {
2323 p->slots[level] = slot;
2324 unlock_up(p, level, lowest_unlock, 0, NULL);
2325 goto done;
2326 }
2327
2328 if (ret && slot > 0) {
2329 dec = 1;
2330 slot--;
2331 }
2332 p->slots[level] = slot;
2333 unlock_up(p, level, lowest_unlock, 0, NULL);
2334
2335 if (level == lowest_level) {
2336 if (dec)
2337 p->slots[level]++;
2338 goto done;
2339 }
2340
2341 ret2 = read_block_for_search(root, p, &b, slot, key);
2342 if (ret2 == -EAGAIN && !p->nowait)
2343 goto again;
2344 if (ret2) {
2345 ret = ret2;
2346 goto done;
2347 }
2348
2349 level = btrfs_header_level(b);
2350 btrfs_tree_read_lock(b);
2351 b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq);
2352 if (!b) {
2353 ret = -ENOMEM;
2354 goto done;
2355 }
2356 p->locks[level] = BTRFS_READ_LOCK;
2357 p->nodes[level] = b;
2358 }
2359 ret = 1;
2360done:
2361 if (ret < 0)
2362 btrfs_release_path(p);
2363
2364 return ret;
2365}
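
/*
 * A usage sketch for btrfs_search_old_slot() (illustrative; it assumes the
 * caller pinned a point in time with the tree mod log, the way the backref
 * walking code does):
 *
 *	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
 *	u64 time_seq;
 *
 *	time_seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	ret = btrfs_search_old_slot(root, &key, path, time_seq);
 *	... use the path as with a plain read-only search ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */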
2366
2367/*
2368 * Search the tree again to find a leaf with smaller keys.
2369 * Returns 0 if it found something.
2370 * Returns 1 if there are no smaller keys.
2371 * Returns < 0 on error.
2372 *
2373 * This may release the path, and so you may lose any locks held at the
2374 * time you call it.
2375 */
2376static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
2377{
2378 struct btrfs_key key;
2379 struct btrfs_key orig_key;
2380 struct btrfs_disk_key found_key;
2381 int ret;
2382
2383 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
2384 orig_key = key;
2385
2386 if (key.offset > 0) {
2387 key.offset--;
2388 } else if (key.type > 0) {
2389 key.type--;
2390 key.offset = (u64)-1;
2391 } else if (key.objectid > 0) {
2392 key.objectid--;
2393 key.type = (u8)-1;
2394 key.offset = (u64)-1;
2395 } else {
2396 return 1;
2397 }
2398
2399 btrfs_release_path(path);
2400 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2401 if (ret <= 0)
2402 return ret;
2403
2404 /*
2405 * Previous key not found. Even if we were at slot 0 of the leaf we had
2406 * before releasing the path and calling btrfs_search_slot(), we now may
2407 * be in a slot pointing to the same original key - this can happen if
2408	 * after we released the path, one or more items were moved from a
2409 * sibling leaf into the front of the leaf we had due to an insertion
2410 * (see push_leaf_right()).
2411	 * If we hit this case and our slot is > 0, just decrement the slot
2412 * so that the caller does not process the same key again, which may or
2413 * may not break the caller, depending on its logic.
2414 */
2415 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
2416 btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
2417 ret = btrfs_comp_keys(&found_key, &orig_key);
2418 if (ret == 0) {
2419 if (path->slots[0] > 0) {
2420 path->slots[0]--;
2421 return 0;
2422 }
2423 /*
2424 * At slot 0, same key as before, it means orig_key is
2425			 * the lowest, leftmost key in the tree. We're done.
2426 */
2427 return 1;
2428 }
2429 }
2430
2431 btrfs_item_key(path->nodes[0], &found_key, 0);
2432 ret = btrfs_comp_keys(&found_key, &key);
2433 /*
2434 * We might have had an item with the previous key in the tree right
2435 * before we released our path. And after we released our path, that
2436 * item might have been pushed to the first slot (0) of the leaf we
2437 * were holding due to a tree balance. Alternatively, an item with the
2438 * previous key can exist as the only element of a leaf (big fat item).
2439 * Therefore account for these 2 cases, so that our callers (like
2440 * btrfs_previous_item) don't miss an existing item with a key matching
2441 * the previous key we computed above.
2442 */
2443 if (ret <= 0)
2444 return 0;
2445 return 1;
2446}
2447
2448/*
2449 * Helper to use instead of btrfs_search_slot() if no exact match is needed
2450 * and the next or previous item should be returned instead.
2451 * When find_higher is true, the next higher item is returned, the next lower
2452 * otherwise.
2453 * When return_any and find_higher are both true, and no higher item is found,
2454 * return the next lower instead.
2455 * When return_any is true and find_higher is false, and no lower item is found,
2456 * return the next higher instead.
2457 * It returns 0 if any item is found, 1 if none is found (tree empty), and
2458 * < 0 on error
2459 */
2460int btrfs_search_slot_for_read(struct btrfs_root *root,
2461 const struct btrfs_key *key,
2462 struct btrfs_path *p, int find_higher,
2463 int return_any)
2464{
2465 int ret;
2466 struct extent_buffer *leaf;
2467
2468again:
2469 ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2470 if (ret <= 0)
2471 return ret;
2472 /*
2473 * a return value of 1 means the path is at the position where the
2474 * item should be inserted. Normally this is the next bigger item,
2475 * but in case the previous item is the last in a leaf, path points
2476 * to the first free slot in the previous leaf, i.e. at an invalid
2477 * item.
2478 */
2479 leaf = p->nodes[0];
2480
2481 if (find_higher) {
2482 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2483 ret = btrfs_next_leaf(root, p);
2484 if (ret <= 0)
2485 return ret;
2486 if (!return_any)
2487 return 1;
2488 /*
2489 * no higher item found, return the next
2490 * lower instead
2491 */
2492 return_any = 0;
2493 find_higher = 0;
2494 btrfs_release_path(p);
2495 goto again;
2496 }
2497 } else {
2498 if (p->slots[0] == 0) {
2499 ret = btrfs_prev_leaf(root, p);
2500 if (ret < 0)
2501 return ret;
2502 if (!ret) {
2503 leaf = p->nodes[0];
2504 if (p->slots[0] == btrfs_header_nritems(leaf))
2505 p->slots[0]--;
2506 return 0;
2507 }
2508 if (!return_any)
2509 return 1;
2510 /*
2511 * no lower item found, return the next
2512 * higher instead
2513 */
2514 return_any = 0;
2515 find_higher = 1;
2516 btrfs_release_path(p);
2517 goto again;
2518 } else {
2519 --p->slots[0];
2520 }
2521 }
2522 return 0;
2523}
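
/*
 * Sketch of a typical call (illustrative): find the first item at or after
 * (objectid, type, 0), falling back to the closest lower item when nothing
 * higher exists:
 *
 *	key.objectid = objectid;
 *	key.type = type;
 *	key.offset = 0;
 *	ret = btrfs_search_slot_for_read(root, &key, path, 1, 1);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 *				      path->slots[0]);
 */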
2524
2525/*
2526 * Execute search and call btrfs_previous_item to traverse backwards if the item
2527 * was not found.
2528 *
2529 * Return 0 if found, 1 if not found and < 0 if error.
2530 */
2531int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2532 struct btrfs_path *path)
2533{
2534 int ret;
2535
2536 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2537 if (ret > 0)
2538 ret = btrfs_previous_item(root, path, key->objectid, key->type);
2539
2540 if (ret == 0)
2541 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2542
2543 return ret;
2544}
2545
2546/*
2547 * Search for a valid slot for the given path.
2548 *
2549 * @root: The root node of the tree.
2550 * @key: Will contain a valid item if found.
2551 * @path: The starting point to validate the slot.
2552 *
2553 * Return: 0 if the item is valid
2554 * 1 if not found
2555 * <0 if error.
2556 */
2557int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
2558 struct btrfs_path *path)
2559{
2560 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2561 int ret;
2562
2563 ret = btrfs_next_leaf(root, path);
2564 if (ret)
2565 return ret;
2566 }
2567
2568 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2569 return 0;
2570}
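
/*
 * Sketch of the forward-iteration pattern this helper enables (current
 * trees wrap it in the btrfs_for_each_slot() macro; names here are
 * illustrative):
 *
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	while (ret >= 0) {
 *		ret = btrfs_get_next_valid_item(root, &found_key, path);
 *		if (ret)
 *			break;
 *		... process the item at path->slots[0] ...
 *		path->slots[0]++;
 *	}
 */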
2571
2572/*
2573 * adjust the pointers going up the tree, starting at level,
2574 * making sure the right key of each node points to 'key'.
2575 * This is used after shifting pointers to the left, so it stops
2576 * fixing up pointers when a given leaf/node is not in slot 0 of the
2577 * higher levels
2578 *
2579 */
2580static void fixup_low_keys(struct btrfs_trans_handle *trans,
2581 const struct btrfs_path *path,
2582 const struct btrfs_disk_key *key, int level)
2583{
2584 int i;
2585 struct extent_buffer *t;
2586 int ret;
2587
2588 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2589 int tslot = path->slots[i];
2590
2591 if (!path->nodes[i])
2592 break;
2593 t = path->nodes[i];
2594 ret = btrfs_tree_mod_log_insert_key(t, tslot,
2595 BTRFS_MOD_LOG_KEY_REPLACE);
2596 BUG_ON(ret < 0);
2597 btrfs_set_node_key(t, key, tslot);
2598 btrfs_mark_buffer_dirty(trans, path->nodes[i]);
2599 if (tslot != 0)
2600 break;
2601 }
2602}
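
/*
 * Illustration (keys are made up): if the key at slot 0 of a leaf changes
 * from 5 to 3, every ancestor that reaches this leaf through slot 0 holds
 * the stale key 5 and must be rewritten:
 *
 *	level 2:  [ 5 | 40 ]       ->  [ 3 | 40 ]
 *	level 1:  [ 5 | 20 ]       ->  [ 3 | 20 ]
 *	level 0:  [ 3 | 9 | 12 ]   (already updated by the caller)
 *
 * The loop above stops at the first level where the path slot is not 0,
 * because keys further up do not depend on the changed key.
 */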
2603
2604/*
2605 * update item key.
2606 *
2607 * This function isn't completely safe. It's the caller's responsibility
2608 * to ensure that the new key won't break the key order
2609 */
2610void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
2611 const struct btrfs_path *path,
2612 const struct btrfs_key *new_key)
2613{
2614 struct btrfs_fs_info *fs_info = trans->fs_info;
2615 struct btrfs_disk_key disk_key;
2616 struct extent_buffer *eb;
2617 int slot;
2618
2619 eb = path->nodes[0];
2620 slot = path->slots[0];
2621 if (slot > 0) {
2622 btrfs_item_key(eb, &disk_key, slot - 1);
2623 if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) {
2624 btrfs_print_leaf(eb);
2625 btrfs_crit(fs_info,
2626 "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT,
2627 slot, btrfs_disk_key_objectid(&disk_key),
2628 btrfs_disk_key_type(&disk_key),
2629 btrfs_disk_key_offset(&disk_key),
2630 BTRFS_KEY_FMT_VALUE(new_key));
2631 BUG();
2632 }
2633 }
2634 if (slot < btrfs_header_nritems(eb) - 1) {
2635 btrfs_item_key(eb, &disk_key, slot + 1);
2636 if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) {
2637 btrfs_print_leaf(eb);
2638 btrfs_crit(fs_info,
2639 "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT,
2640 slot, btrfs_disk_key_objectid(&disk_key),
2641 btrfs_disk_key_type(&disk_key),
2642 btrfs_disk_key_offset(&disk_key),
2643 BTRFS_KEY_FMT_VALUE(new_key));
2644 BUG();
2645 }
2646 }
2647
2648 btrfs_cpu_key_to_disk(&disk_key, new_key);
2649 btrfs_set_item_key(eb, &disk_key, slot);
2650 btrfs_mark_buffer_dirty(trans, eb);
2651 if (slot == 0)
2652 fixup_low_keys(trans, path, &disk_key, 1);
2653}
2654
2655/*
2656 * Check key order of two sibling extent buffers.
2657 *
2658 * Return true if something is wrong.
2659 * Return false if everything is fine.
2660 *
2661 * Tree-checker only works inside one tree block, thus the following
2662 * corruption can not be detected by tree-checker:
2663 *
2664 * Leaf @left | Leaf @right
2665 * --------------------------------------------------------------
2666 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
2667 *
2668 * Key f6 in leaf @left itself is valid, but not valid when the next
2669 * key in leaf @right is 7.
2670 * This can only be checked at tree block merge time.
2671 * And since tree checker has ensured all key order in each tree block
2672 * is correct, we only need to check the last key of @left and the first
2673 * key of @right.
2674 */
2675static bool check_sibling_keys(const struct extent_buffer *left,
2676 const struct extent_buffer *right)
2677{
2678 struct btrfs_key left_last;
2679 struct btrfs_key right_first;
2680 int level = btrfs_header_level(left);
2681 int nr_left = btrfs_header_nritems(left);
2682 int nr_right = btrfs_header_nritems(right);
2683
2684 /* No key to check in one of the tree blocks */
2685 if (!nr_left || !nr_right)
2686 return false;
2687
2688 if (level) {
2689 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2690 btrfs_node_key_to_cpu(right, &right_first, 0);
2691 } else {
2692 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2693 btrfs_item_key_to_cpu(right, &right_first, 0);
2694 }
2695
2696 if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) {
2697 btrfs_crit(left->fs_info, "left extent buffer:");
2698 btrfs_print_tree(left, false);
2699 btrfs_crit(left->fs_info, "right extent buffer:");
2700 btrfs_print_tree(right, false);
2701 btrfs_crit(left->fs_info,
2702"bad key order, sibling blocks, left last " BTRFS_KEY_FMT " right first " BTRFS_KEY_FMT,
2703 BTRFS_KEY_FMT_VALUE(&left_last),
2704 BTRFS_KEY_FMT_VALUE(&right_first));
2705 return true;
2706 }
2707 return false;
2708}
2709
2710/*
2711 * try to push data from one node into the next node left in the
2712 * tree.
2713 *
2714 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2715 * error, and > 0 if there was no room in the left hand block.
2716 */
2717static int push_node_left(struct btrfs_trans_handle *trans,
2718 struct extent_buffer *dst,
2719 struct extent_buffer *src, bool empty)
2720{
2721 struct btrfs_fs_info *fs_info = trans->fs_info;
2722 int push_items = 0;
2723 int src_nritems;
2724 int dst_nritems;
2725 int ret = 0;
2726
2727 src_nritems = btrfs_header_nritems(src);
2728 dst_nritems = btrfs_header_nritems(dst);
2729 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2730 WARN_ON(btrfs_header_generation(src) != trans->transid);
2731 WARN_ON(btrfs_header_generation(dst) != trans->transid);
2732
2733 if (!empty && src_nritems <= 8)
2734 return 1;
2735
2736 if (push_items <= 0)
2737 return 1;
2738
2739 if (empty) {
2740 push_items = min(src_nritems, push_items);
2741 if (push_items < src_nritems) {
2742 /* leave at least 8 pointers in the node if
2743 * we aren't going to empty it
2744 */
2745 if (src_nritems - push_items < 8) {
2746 if (push_items <= 8)
2747 return 1;
2748 push_items -= 8;
2749 }
2750 }
2751 } else
2752 push_items = min(src_nritems - 8, push_items);
2753
2754 /* dst is the left eb, src is the middle eb */
2755 if (unlikely(check_sibling_keys(dst, src))) {
2756 ret = -EUCLEAN;
2757 btrfs_abort_transaction(trans, ret);
2758 return ret;
2759 }
2760 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
2761 if (unlikely(ret)) {
2762 btrfs_abort_transaction(trans, ret);
2763 return ret;
2764 }
2765 copy_extent_buffer(dst, src,
2766 btrfs_node_key_ptr_offset(dst, dst_nritems),
2767 btrfs_node_key_ptr_offset(src, 0),
2768 push_items * sizeof(struct btrfs_key_ptr));
2769
2770 if (push_items < src_nritems) {
2771 /*
2772 * btrfs_tree_mod_log_eb_copy handles logging the move, so we
2773 * don't need to do an explicit tree mod log operation for it.
2774 */
2775 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0),
2776 btrfs_node_key_ptr_offset(src, push_items),
2777 (src_nritems - push_items) *
2778 sizeof(struct btrfs_key_ptr));
2779 }
2780 btrfs_set_header_nritems(src, src_nritems - push_items);
2781 btrfs_set_header_nritems(dst, dst_nritems + push_items);
2782 btrfs_mark_buffer_dirty(trans, src);
2783 btrfs_mark_buffer_dirty(trans, dst);
2784
2785 return ret;
2786}
2787
2788/*
2789 * try to push data from one node into the next node right in the
2790 * tree.
2791 *
2792 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2793 * error, and > 0 if there was no room in the right hand block.
2794 *
2795 * this will only push up to 1/2 the contents of the left node over
2796 */
2797static int balance_node_right(struct btrfs_trans_handle *trans,
2798 struct extent_buffer *dst,
2799 struct extent_buffer *src)
2800{
2801 struct btrfs_fs_info *fs_info = trans->fs_info;
2802 int push_items = 0;
2803 int max_push;
2804 int src_nritems;
2805 int dst_nritems;
2806 int ret = 0;
2807
2808 WARN_ON(btrfs_header_generation(src) != trans->transid);
2809 WARN_ON(btrfs_header_generation(dst) != trans->transid);
2810
2811 src_nritems = btrfs_header_nritems(src);
2812 dst_nritems = btrfs_header_nritems(dst);
2813 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2814 if (push_items <= 0)
2815 return 1;
2816
2817 if (src_nritems < 4)
2818 return 1;
2819
2820 max_push = src_nritems / 2 + 1;
2821 /* don't try to empty the node */
2822 if (max_push >= src_nritems)
2823 return 1;
2824
2825 if (max_push < push_items)
2826 push_items = max_push;
2827
2828 /* dst is the right eb, src is the middle eb */
2829 if (unlikely(check_sibling_keys(src, dst))) {
2830 ret = -EUCLEAN;
2831 btrfs_abort_transaction(trans, ret);
2832 return ret;
2833 }
2834
2835 /*
2836 * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't
2837 * need to do an explicit tree mod log operation for it.
2838 */
2839 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items),
2840 btrfs_node_key_ptr_offset(dst, 0),
2841 (dst_nritems) *
2842 sizeof(struct btrfs_key_ptr));
2843
2844 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2845 push_items);
2846 if (unlikely(ret)) {
2847 btrfs_abort_transaction(trans, ret);
2848 return ret;
2849 }
2850 copy_extent_buffer(dst, src,
2851 btrfs_node_key_ptr_offset(dst, 0),
2852 btrfs_node_key_ptr_offset(src, src_nritems - push_items),
2853 push_items * sizeof(struct btrfs_key_ptr));
2854
2855 btrfs_set_header_nritems(src, src_nritems - push_items);
2856 btrfs_set_header_nritems(dst, dst_nritems + push_items);
2857
2858 btrfs_mark_buffer_dirty(trans, src);
2859 btrfs_mark_buffer_dirty(trans, dst);
2860
2861 return ret;
2862}
2863
2864/*
2865 * helper function to insert a new root level in the tree.
2866 * A new node is allocated, and a single item is inserted to
2867 * point to the existing root
2868 *
2869 * returns zero on success or < 0 on failure.
2870 */
2871static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2872 struct btrfs_root *root,
2873 struct btrfs_path *path, int level)
2874{
2875 u64 lower_gen;
2876 struct extent_buffer *lower;
2877 struct extent_buffer *c;
2878 struct extent_buffer *old;
2879 struct btrfs_disk_key lower_key;
2880 int ret;
2881
2882 BUG_ON(path->nodes[level]);
2883 BUG_ON(path->nodes[level-1] != root->node);
2884
2885 lower = path->nodes[level-1];
2886 if (level == 1)
2887 btrfs_item_key(lower, &lower_key, 0);
2888 else
2889 btrfs_node_key(lower, &lower_key, 0);
2890
2891 c = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
2892 &lower_key, level, root->node->start, 0,
2893 0, BTRFS_NESTING_NEW_ROOT);
2894 if (IS_ERR(c))
2895 return PTR_ERR(c);
2896
2897 root_add_used_bytes(root);
2898
2899 btrfs_set_header_nritems(c, 1);
2900 btrfs_set_node_key(c, &lower_key, 0);
2901 btrfs_set_node_blockptr(c, 0, lower->start);
2902 lower_gen = btrfs_header_generation(lower);
2903 WARN_ON(lower_gen != trans->transid);
2904
2905 btrfs_set_node_ptr_generation(c, 0, lower_gen);
2906
2907 btrfs_mark_buffer_dirty(trans, c);
2908
2909 old = root->node;
2910 ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
2911 if (ret < 0) {
2912 int ret2;
2913
2914 btrfs_clear_buffer_dirty(trans, c);
2915 ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
2916 if (unlikely(ret2 < 0))
2917 btrfs_abort_transaction(trans, ret2);
2918 btrfs_tree_unlock(c);
2919 free_extent_buffer(c);
2920 return ret;
2921 }
2922 rcu_assign_pointer(root->node, c);
2923
2924 /* the super has an extra ref to root->node */
2925 free_extent_buffer(old);
2926
2927 add_root_to_dirty_list(root);
2928 refcount_inc(&c->refs);
2929 path->nodes[level] = c;
2930 path->locks[level] = BTRFS_WRITE_LOCK;
2931 path->slots[level] = 0;
2932 return 0;
2933}
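
/*
 * Illustration of the tree growing by one level (made-up blocks): before,
 * the root is 'lower' at level N - 1; after, a new node 'c' at level N
 * becomes the root and holds a single pointer:
 *
 *	before:   root -> lower [ k0 | k1 | ... ]
 *	after:    root -> c [ k0 ] -> lower [ k0 | k1 | ... ]
 *
 * No items move; only one key pointer (the first key of 'lower') is
 * written into the new root.
 */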
2934
2935/*
2936 * worker function to insert a single pointer in a node.
2937 * the node should have enough room for the pointer already
2938 *
2939 * slot and level indicate where you want the key to go, and
2940 * bytenr is the block the key points to.
2941 */
2942static int insert_ptr(struct btrfs_trans_handle *trans,
2943 const struct btrfs_path *path,
2944 const struct btrfs_disk_key *key, u64 bytenr,
2945 int slot, int level)
2946{
2947 struct extent_buffer *lower;
2948 int nritems;
2949 int ret;
2950
2951 BUG_ON(!path->nodes[level]);
2952 btrfs_assert_tree_write_locked(path->nodes[level]);
2953 lower = path->nodes[level];
2954 nritems = btrfs_header_nritems(lower);
2955 BUG_ON(slot > nritems);
2956 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
2957 if (slot != nritems) {
2958 if (level) {
2959 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
2960 slot, nritems - slot);
2961 if (unlikely(ret < 0)) {
2962 btrfs_abort_transaction(trans, ret);
2963 return ret;
2964 }
2965 }
2966 memmove_extent_buffer(lower,
2967 btrfs_node_key_ptr_offset(lower, slot + 1),
2968 btrfs_node_key_ptr_offset(lower, slot),
2969 (nritems - slot) * sizeof(struct btrfs_key_ptr));
2970 }
2971 if (level) {
2972 ret = btrfs_tree_mod_log_insert_key(lower, slot,
2973 BTRFS_MOD_LOG_KEY_ADD);
2974 if (unlikely(ret < 0)) {
2975 btrfs_abort_transaction(trans, ret);
2976 return ret;
2977 }
2978 }
2979 btrfs_set_node_key(lower, key, slot);
2980 btrfs_set_node_blockptr(lower, slot, bytenr);
2981 WARN_ON(trans->transid == 0);
2982 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2983 btrfs_set_header_nritems(lower, nritems + 1);
2984 btrfs_mark_buffer_dirty(trans, lower);
2985
2986 return 0;
2987}
2988
2989/*
2990 * split the node at the specified level in path in two.
2991 * The path is corrected to point to the appropriate node after the split
2992 *
2993 * Before splitting this tries to make some room in the node by pushing
2994 * left and right, if either one works, it returns right away.
2995 *
2996 * returns 0 on success and < 0 on failure
2997 */
2998static noinline int split_node(struct btrfs_trans_handle *trans,
2999 struct btrfs_root *root,
3000 struct btrfs_path *path, int level)
3001{
3002 struct btrfs_fs_info *fs_info = root->fs_info;
3003 struct extent_buffer *c;
3004 struct extent_buffer *split;
3005 struct btrfs_disk_key disk_key;
3006 int mid;
3007 int ret;
3008 u32 c_nritems;
3009
3010 c = path->nodes[level];
3011 WARN_ON(btrfs_header_generation(c) != trans->transid);
3012 if (c == root->node) {
3013 /*
3014		 * trying to split the root, let's make a new one
3015		 *
3016		 * tree mod log: We don't log the removal of the old root in
3017 * insert_new_root, because that root buffer will be kept as a
3018 * normal node. We are going to log removal of half of the
3019 * elements below with btrfs_tree_mod_log_eb_copy(). We're
3020 * holding a tree lock on the buffer, which is why we cannot
3021 * race with other tree_mod_log users.
3022 */
3023 ret = insert_new_root(trans, root, path, level + 1);
3024 if (ret)
3025 return ret;
3026 } else {
3027 ret = push_nodes_for_insert(trans, root, path, level);
3028 c = path->nodes[level];
3029 if (!ret && btrfs_header_nritems(c) <
3030 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3031 return 0;
3032 if (ret < 0)
3033 return ret;
3034 }
3035
3036 c_nritems = btrfs_header_nritems(c);
3037 mid = (c_nritems + 1) / 2;
3038 btrfs_node_key(c, &disk_key, mid);
3039
3040 split = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
3041 &disk_key, level, c->start, 0,
3042 0, BTRFS_NESTING_SPLIT);
3043 if (IS_ERR(split))
3044 return PTR_ERR(split);
3045
3046 root_add_used_bytes(root);
3047 ASSERT(btrfs_header_level(c) == level);
3048
3049 ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
3050 if (unlikely(ret)) {
3051 btrfs_tree_unlock(split);
3052 free_extent_buffer(split);
3053 btrfs_abort_transaction(trans, ret);
3054 return ret;
3055 }
3056 copy_extent_buffer(split, c,
3057 btrfs_node_key_ptr_offset(split, 0),
3058 btrfs_node_key_ptr_offset(c, mid),
3059 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3060 btrfs_set_header_nritems(split, c_nritems - mid);
3061 btrfs_set_header_nritems(c, mid);
3062
3063 btrfs_mark_buffer_dirty(trans, c);
3064 btrfs_mark_buffer_dirty(trans, split);
3065
3066 ret = insert_ptr(trans, path, &disk_key, split->start,
3067 path->slots[level + 1] + 1, level + 1);
3068 if (ret < 0) {
3069 btrfs_tree_unlock(split);
3070 free_extent_buffer(split);
3071 return ret;
3072 }
3073
3074 if (path->slots[level] >= mid) {
3075 path->slots[level] -= mid;
3076 btrfs_tree_unlock(c);
3077 free_extent_buffer(c);
3078 path->nodes[level] = split;
3079 path->slots[level + 1] += 1;
3080 } else {
3081 btrfs_tree_unlock(split);
3082 free_extent_buffer(split);
3083 }
3084 return 0;
3085}
3086
3087/*
3088 * how many bytes are required to store the items in a leaf. start
3089 * and nr indicate which items in the leaf to check. This totals up the
3090 * space used both by the item structs and the item data
3091 */
3092static int leaf_space_used(const struct extent_buffer *l, int start, int nr)
3093{
3094 int data_len;
3095 int nritems = btrfs_header_nritems(l);
3096 int end = min(nritems, start + nr) - 1;
3097
3098 if (!nr)
3099 return 0;
3100 data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
3101 data_len = data_len - btrfs_item_offset(l, end);
3102 data_len += sizeof(struct btrfs_item) * nr;
3103 WARN_ON(data_len < 0);
3104 return data_len;
3105}
3106
3107/*
3108 * The space between the end of the leaf items and
3109 * the start of the leaf data. IOW, how much room
3110 * the leaf has left for both items and data
3111 */
3112int btrfs_leaf_free_space(const struct extent_buffer *leaf)
3113{
3114 struct btrfs_fs_info *fs_info = leaf->fs_info;
3115 int nritems = btrfs_header_nritems(leaf);
3116 int ret;
3117
3118 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3119 if (unlikely(ret < 0)) {
3120 btrfs_crit(fs_info,
3121 "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3122 ret,
3123 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3124 leaf_space_used(leaf, 0, nritems), nritems);
3125 }
3126 return ret;
3127}
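
/*
 * Worked example with illustrative numbers, for a 16K node size:
 * BTRFS_LEAF_DATA_SIZE is the node size minus sizeof(struct btrfs_header),
 * i.e. 16384 - 101 = 16283 bytes. A leaf holding 3 items with 50 bytes of
 * data each uses 3 * sizeof(struct btrfs_item) + 3 * 50 = 3 * 25 + 150 =
 * 225 bytes, so btrfs_leaf_free_space() returns 16283 - 225 = 16058.
 */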
3128
3129/*
3130 * min slot controls the lowest index we're willing to push to the
3131 * right. We'll push up to and including min_slot, but no lower
3132 */
3133static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3134 struct btrfs_path *path,
3135 int data_size, bool empty,
3136 struct extent_buffer *right,
3137 int free_space, u32 left_nritems,
3138 u32 min_slot)
3139{
3140 struct btrfs_fs_info *fs_info = right->fs_info;
3141 struct extent_buffer *left = path->nodes[0];
3142 struct extent_buffer *upper = path->nodes[1];
3143 struct btrfs_disk_key disk_key;
3144 int slot;
3145 u32 i;
3146 int push_space = 0;
3147 int push_items = 0;
3148 u32 nr;
3149 u32 right_nritems;
3150 u32 data_end;
3151 u32 this_item_size;
3152
3153 if (empty)
3154 nr = 0;
3155 else
3156 nr = max_t(u32, 1, min_slot);
3157
3158 if (path->slots[0] >= left_nritems)
3159 push_space += data_size;
3160
3161 slot = path->slots[1];
3162 i = left_nritems - 1;
3163 while (i >= nr) {
3164 if (!empty && push_items > 0) {
3165 if (path->slots[0] > i)
3166 break;
3167 if (path->slots[0] == i) {
3168 int space = btrfs_leaf_free_space(left);
3169
3170 if (space + push_space * 2 > free_space)
3171 break;
3172 }
3173 }
3174
3175 if (path->slots[0] == i)
3176 push_space += data_size;
3177
3178 this_item_size = btrfs_item_size(left, i);
3179 if (this_item_size + sizeof(struct btrfs_item) +
3180 push_space > free_space)
3181 break;
3182
3183 push_items++;
3184 push_space += this_item_size + sizeof(struct btrfs_item);
3185 if (i == 0)
3186 break;
3187 i--;
3188 }
3189
3190 if (push_items == 0)
3191 goto out_unlock;
3192
3193 WARN_ON(!empty && push_items == left_nritems);
3194
3195 /* push left to right */
3196 right_nritems = btrfs_header_nritems(right);
3197
3198 push_space = btrfs_item_data_end(left, left_nritems - push_items);
3199 push_space -= leaf_data_end(left);
3200
3201 /* make room in the right data area */
3202 data_end = leaf_data_end(right);
3203 memmove_leaf_data(right, data_end - push_space, data_end,
3204 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3205
3206 /* copy from the left data area */
3207 copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3208 leaf_data_end(left), push_space);
3209
3210 memmove_leaf_items(right, push_items, 0, right_nritems);
3211
3212 /* copy the items from left to right */
3213 copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);
3214
3215 /* update the item pointers */
3216 right_nritems += push_items;
3217 btrfs_set_header_nritems(right, right_nritems);
3218 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3219 for (i = 0; i < right_nritems; i++) {
3220 push_space -= btrfs_item_size(right, i);
3221 btrfs_set_item_offset(right, i, push_space);
3222 }
3223
3224 left_nritems -= push_items;
3225 btrfs_set_header_nritems(left, left_nritems);
3226
3227 if (left_nritems)
3228 btrfs_mark_buffer_dirty(trans, left);
3229 else
3230 btrfs_clear_buffer_dirty(trans, left);
3231
3232 btrfs_mark_buffer_dirty(trans, right);
3233
3234 btrfs_item_key(right, &disk_key, 0);
3235 btrfs_set_node_key(upper, &disk_key, slot + 1);
3236 btrfs_mark_buffer_dirty(trans, upper);
3237
3238 /* then fixup the leaf pointer in the path */
3239 if (path->slots[0] >= left_nritems) {
3240 path->slots[0] -= left_nritems;
3241 btrfs_tree_unlock(left);
3242 free_extent_buffer(left);
3243 path->nodes[0] = right;
3244 path->slots[1] += 1;
3245 } else {
3246 btrfs_tree_unlock(right);
3247 free_extent_buffer(right);
3248 }
3249 return 0;
3250
3251out_unlock:
3252 btrfs_tree_unlock(right);
3253 free_extent_buffer(right);
3254 return 1;
3255}
3256
3257/*
3258 * push some data in the path leaf to the right, trying to free up at
3259 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3260 *
3261 * returns 1 if the push failed because the other node didn't have enough
3262 * room, 0 if everything worked out and < 0 if there were major errors.
3263 *
3264 * this will push starting from min_slot to the end of the leaf. It won't
3265 * push any slot lower than min_slot
3266 */
3267static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3268 *root, struct btrfs_path *path,
3269 int min_data_size, int data_size,
3270 bool empty, u32 min_slot)
3271{
3272 struct extent_buffer *left = path->nodes[0];
3273 struct extent_buffer *right;
3274 struct extent_buffer *upper;
3275 int slot;
3276 int free_space;
3277 u32 left_nritems;
3278 int ret;
3279
3280 if (!path->nodes[1])
3281 return 1;
3282
3283 slot = path->slots[1];
3284 upper = path->nodes[1];
3285 if (slot >= btrfs_header_nritems(upper) - 1)
3286 return 1;
3287
3288 btrfs_assert_tree_write_locked(path->nodes[1]);
3289
3290 right = btrfs_read_node_slot(upper, slot + 1);
3291 if (IS_ERR(right))
3292 return PTR_ERR(right);
3293
3294 btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
3295
3296 free_space = btrfs_leaf_free_space(right);
3297 if (free_space < data_size)
3298 goto out_unlock;
3299
3300 ret = btrfs_cow_block(trans, root, right, upper,
3301 slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
3302 if (ret)
3303 goto out_unlock;
3304
3305 left_nritems = btrfs_header_nritems(left);
3306 if (left_nritems == 0)
3307 goto out_unlock;
3308
3309 if (unlikely(check_sibling_keys(left, right))) {
3310 ret = -EUCLEAN;
3311 btrfs_abort_transaction(trans, ret);
3312 btrfs_tree_unlock(right);
3313 free_extent_buffer(right);
3314 return ret;
3315 }
3316 if (path->slots[0] == left_nritems && !empty) {
3317 /* Key greater than all keys in the leaf, right neighbor has
3318 * enough room for it and we're not emptying our leaf to delete
3319 * it, therefore use right neighbor to insert the new item and
3320 * no need to touch/dirty our left leaf. */
3321 btrfs_tree_unlock(left);
3322 free_extent_buffer(left);
3323 path->nodes[0] = right;
3324 path->slots[0] = 0;
3325 path->slots[1]++;
3326 return 0;
3327 }
3328
3329 return __push_leaf_right(trans, path, min_data_size, empty, right,
3330 free_space, left_nritems, min_slot);
3331out_unlock:
3332 btrfs_tree_unlock(right);
3333 free_extent_buffer(right);
3334 return 1;
3335}
3336
3337/*
3338 * push some data in the path leaf to the left, trying to free up at
3339 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3340 *
3341 * max_slot can put a limit on how far into the leaf we'll push items. The
3342 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
3343 * items
3344 */
3345static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3346 struct btrfs_path *path, int data_size,
3347 bool empty, struct extent_buffer *left,
3348 int free_space, u32 right_nritems,
3349 u32 max_slot)
3350{
3351 struct btrfs_fs_info *fs_info = left->fs_info;
3352 struct btrfs_disk_key disk_key;
3353 struct extent_buffer *right = path->nodes[0];
3354 int i;
3355 int push_space = 0;
3356 int push_items = 0;
3357 u32 old_left_nritems;
3358 u32 nr;
3359 int ret = 0;
3360 u32 this_item_size;
3361 u32 old_left_item_size;
3362
3363 if (empty)
3364 nr = min(right_nritems, max_slot);
3365 else
3366 nr = min(right_nritems - 1, max_slot);
3367
3368 for (i = 0; i < nr; i++) {
3369 if (!empty && push_items > 0) {
3370 if (path->slots[0] < i)
3371 break;
3372 if (path->slots[0] == i) {
3373 int space = btrfs_leaf_free_space(right);
3374
3375 if (space + push_space * 2 > free_space)
3376 break;
3377 }
3378 }
3379
3380 if (path->slots[0] == i)
3381 push_space += data_size;
3382
3383 this_item_size = btrfs_item_size(right, i);
3384 if (this_item_size + sizeof(struct btrfs_item) + push_space >
3385 free_space)
3386 break;
3387
3388 push_items++;
3389 push_space += this_item_size + sizeof(struct btrfs_item);
3390 }
3391
3392 if (push_items == 0) {
3393 ret = 1;
3394 goto out;
3395 }
3396 WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3397
3398 /* push data from right to left */
3399 copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items);
3400
3401 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3402 btrfs_item_offset(right, push_items - 1);
3403
3404 copy_leaf_data(left, right, leaf_data_end(left) - push_space,
3405 btrfs_item_offset(right, push_items - 1), push_space);
3406 old_left_nritems = btrfs_header_nritems(left);
3407 BUG_ON(old_left_nritems <= 0);
3408
3409 old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
3410 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3411 u32 ioff;
3412
3413 ioff = btrfs_item_offset(left, i);
3414 btrfs_set_item_offset(left, i,
3415 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3416 }
3417 btrfs_set_header_nritems(left, old_left_nritems + push_items);
3418
3419 /* fixup right node */
3420 if (unlikely(push_items > right_nritems)) {
3421 ret = -EUCLEAN;
3422 btrfs_abort_transaction(trans, ret);
3423 btrfs_crit(fs_info, "push items (%d) > right leaf items (%u)",
3424 push_items, right_nritems);
3425 goto out;
3426 }
3427
3428 if (push_items < right_nritems) {
3429 push_space = btrfs_item_offset(right, push_items - 1) -
3430 leaf_data_end(right);
3431 memmove_leaf_data(right,
3432 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3433 leaf_data_end(right), push_space);
3434
3435 memmove_leaf_items(right, 0, push_items,
3436 btrfs_header_nritems(right) - push_items);
3437 }
3438
3439 right_nritems -= push_items;
3440 btrfs_set_header_nritems(right, right_nritems);
3441 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3442 for (i = 0; i < right_nritems; i++) {
3443 push_space = push_space - btrfs_item_size(right, i);
3444 btrfs_set_item_offset(right, i, push_space);
3445 }
3446
3447 btrfs_mark_buffer_dirty(trans, left);
3448 if (right_nritems)
3449 btrfs_mark_buffer_dirty(trans, right);
3450 else
3451 btrfs_clear_buffer_dirty(trans, right);
3452
3453 btrfs_item_key(right, &disk_key, 0);
3454 fixup_low_keys(trans, path, &disk_key, 1);
3455
3456 /* then fixup the leaf pointer in the path */
3457 if (path->slots[0] < push_items) {
3458 path->slots[0] += old_left_nritems;
3459 btrfs_tree_unlock(right);
3460 free_extent_buffer(right);
3461 path->nodes[0] = left;
3462 path->slots[1] -= 1;
3463 } else {
3464 btrfs_tree_unlock(left);
3465 free_extent_buffer(left);
3466 path->slots[0] -= push_items;
3467 }
3468 BUG_ON(path->slots[0] < 0);
3469 return ret;
3470out:
3471 btrfs_tree_unlock(left);
3472 free_extent_buffer(left);
3473 return ret;
3474}
3475
3476/*
3477 * push some data in the path leaf to the left, trying to free up at
3478 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3479 *
3480 * max_slot can put a limit on how far into the leaf we'll push items. The
3481 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
3482 * items
3483 */
3484static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3485 *root, struct btrfs_path *path, int min_data_size,
3486			  int data_size, bool empty, u32 max_slot)
3487{
3488 struct extent_buffer *right = path->nodes[0];
3489 struct extent_buffer *left;
3490 int slot;
3491 int free_space;
3492 u32 right_nritems;
3493 int ret = 0;
3494
3495 slot = path->slots[1];
3496 if (slot == 0)
3497 return 1;
3498 if (!path->nodes[1])
3499 return 1;
3500
3501 right_nritems = btrfs_header_nritems(right);
3502 if (right_nritems == 0)
3503 return 1;
3504
3505 btrfs_assert_tree_write_locked(path->nodes[1]);
3506
3507 left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3508 if (IS_ERR(left))
3509 return PTR_ERR(left);
3510
3511 btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
3512
3513 free_space = btrfs_leaf_free_space(left);
3514 if (free_space < data_size) {
3515 ret = 1;
3516 goto out;
3517 }
3518
3519 ret = btrfs_cow_block(trans, root, left,
3520 path->nodes[1], slot - 1, &left,
3521 BTRFS_NESTING_LEFT_COW);
3522 if (ret) {
3523 /* we hit -ENOSPC, but it isn't fatal here */
3524 if (ret == -ENOSPC)
3525 ret = 1;
3526 goto out;
3527 }
3528
3529 if (unlikely(check_sibling_keys(left, right))) {
3530 ret = -EUCLEAN;
3531 btrfs_abort_transaction(trans, ret);
3532 goto out;
3533 }
3534 return __push_leaf_left(trans, path, min_data_size, empty, left,
3535 free_space, right_nritems, max_slot);
3536out:
3537 btrfs_tree_unlock(left);
3538 free_extent_buffer(left);
3539 return ret;
3540}
3541
3542/*
3543 * split the path's leaf in two, making sure there is at least data_size
3544 * available for the resulting leaf level of the path.
3545 */
3546static noinline int copy_for_split(struct btrfs_trans_handle *trans,
3547 struct btrfs_path *path,
3548 struct extent_buffer *l,
3549 struct extent_buffer *right,
3550 int slot, int mid, int nritems)
3551{
3552 struct btrfs_fs_info *fs_info = trans->fs_info;
3553 int data_copy_size;
3554 int rt_data_off;
3555 int i;
3556 int ret;
3557 struct btrfs_disk_key disk_key;
3558
3559 nritems = nritems - mid;
3560 btrfs_set_header_nritems(right, nritems);
3561 data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
3562
3563 copy_leaf_items(right, l, 0, mid, nritems);
3564
3565 copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size,
3566 leaf_data_end(l), data_copy_size);
3567
3568 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
3569
3570 for (i = 0; i < nritems; i++) {
3571 u32 ioff;
3572
3573 ioff = btrfs_item_offset(right, i);
3574 btrfs_set_item_offset(right, i, ioff + rt_data_off);
3575 }
3576
3577 btrfs_set_header_nritems(l, mid);
3578 btrfs_item_key(right, &disk_key, 0);
3579 ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
3580 if (ret < 0)
3581 return ret;
3582
3583 btrfs_mark_buffer_dirty(trans, right);
3584 btrfs_mark_buffer_dirty(trans, l);
3585 BUG_ON(path->slots[0] != slot);
3586
3587 if (mid <= slot) {
3588 btrfs_tree_unlock(path->nodes[0]);
3589 free_extent_buffer(path->nodes[0]);
3590 path->nodes[0] = right;
3591 path->slots[0] -= mid;
3592 path->slots[1] += 1;
3593 } else {
3594 btrfs_tree_unlock(right);
3595 free_extent_buffer(right);
3596 }
3597
3598 BUG_ON(path->slots[0] < 0);
3599
3600 return 0;
3601}
3602
3603/*
3604 * double splits happen when we need to insert a big item in the middle
3605 * of a leaf. A double split can leave us with 3 mostly empty leaves:
3606 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3607 * A B C
3608 *
3609 * We avoid this by trying to push the items on either side of our target
3610 * into the adjacent leaves. If all goes well we can avoid the double split
3611 * completely.
3612 */
3613static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3614 struct btrfs_root *root,
3615 struct btrfs_path *path,
3616 int data_size)
3617{
3618 int ret;
3619 int progress = 0;
3620 int slot;
3621 u32 nritems;
3622 int space_needed = data_size;
3623
3624 slot = path->slots[0];
3625 if (slot < btrfs_header_nritems(path->nodes[0]))
3626 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3627
3628 /*
3629 * try to push all the items after our slot into the
3630 * right leaf
3631 */
3632 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3633 if (ret < 0)
3634 return ret;
3635
3636 if (ret == 0)
3637 progress++;
3638
3639 nritems = btrfs_header_nritems(path->nodes[0]);
3640 /*
3641 * our goal is to get our slot at the start or end of a leaf. If
3642 * we've done so we're done
3643 */
3644 if (path->slots[0] == 0 || path->slots[0] == nritems)
3645 return 0;
3646
3647 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3648 return 0;
3649
3650	/* try to push all the items before our slot into the left leaf */
3651 slot = path->slots[0];
3652 space_needed = data_size;
3653 if (slot > 0)
3654 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3655 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3656 if (ret < 0)
3657 return ret;
3658
3659 if (ret == 0)
3660 progress++;
3661
3662 if (progress)
3663 return 0;
3664 return 1;
3665}
3666
3667/*
3668 * split the path's leaf in two, making sure there is at least data_size
3669 * available for the resulting leaf level of the path.
3670 *
3671 * returns 0 if all went well and < 0 on failure.
3672 */
3673static noinline int split_leaf(struct btrfs_trans_handle *trans,
3674 struct btrfs_root *root,
3675 const struct btrfs_key *ins_key,
3676 struct btrfs_path *path, int data_size,
3677 bool extend)
3678{
3679 struct btrfs_disk_key disk_key;
3680 struct extent_buffer *l;
3681 u32 nritems;
3682 int mid;
3683 int slot;
3684 struct extent_buffer *right;
3685 struct btrfs_fs_info *fs_info = root->fs_info;
3686 int ret = 0;
3687 int wret;
3688 int split;
3689 int num_doubles = 0;
3690 int tried_avoid_double = 0;
3691
3692 l = path->nodes[0];
3693 slot = path->slots[0];
3694 if (extend && data_size + btrfs_item_size(l, slot) +
3695 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
3696 return -EOVERFLOW;
3697
3698 /* first try to make some room by pushing left and right */
3699 if (data_size && path->nodes[1]) {
3700 int space_needed = data_size;
3701
3702 if (slot < btrfs_header_nritems(l))
3703 space_needed -= btrfs_leaf_free_space(l);
3704
3705 wret = push_leaf_right(trans, root, path, space_needed,
3706 space_needed, 0, 0);
3707 if (wret < 0)
3708 return wret;
3709 if (wret) {
3710 space_needed = data_size;
3711 if (slot > 0)
3712 space_needed -= btrfs_leaf_free_space(l);
3713 wret = push_leaf_left(trans, root, path, space_needed,
3714 space_needed, 0, (u32)-1);
3715 if (wret < 0)
3716 return wret;
3717 }
3718 l = path->nodes[0];
3719
3720 /* did the pushes work? */
3721 if (btrfs_leaf_free_space(l) >= data_size)
3722 return 0;
3723 }
3724
3725 if (!path->nodes[1]) {
3726 ret = insert_new_root(trans, root, path, 1);
3727 if (ret)
3728 return ret;
3729 }
3730again:
3731 split = 1;
3732 l = path->nodes[0];
3733 slot = path->slots[0];
3734 nritems = btrfs_header_nritems(l);
3735 mid = (nritems + 1) / 2;
3736
3737 if (mid <= slot) {
3738 if (nritems == 1 ||
3739 leaf_space_used(l, mid, nritems - mid) + data_size >
3740 BTRFS_LEAF_DATA_SIZE(fs_info)) {
3741 if (slot >= nritems) {
3742 split = 0;
3743 } else {
3744 mid = slot;
3745 if (mid != nritems &&
3746 leaf_space_used(l, mid, nritems - mid) +
3747 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3748 if (data_size && !tried_avoid_double)
3749 goto push_for_double;
3750 split = 2;
3751 }
3752 }
3753 }
3754 } else {
3755 if (leaf_space_used(l, 0, mid) + data_size >
3756 BTRFS_LEAF_DATA_SIZE(fs_info)) {
3757 if (!extend && data_size && slot == 0) {
3758 split = 0;
3759 } else if ((extend || !data_size) && slot == 0) {
3760 mid = 1;
3761 } else {
3762 mid = slot;
3763 if (mid != nritems &&
3764 leaf_space_used(l, mid, nritems - mid) +
3765 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3766 if (data_size && !tried_avoid_double)
3767 goto push_for_double;
3768 split = 2;
3769 }
3770 }
3771 }
3772 }
3773
3774 if (split == 0)
3775 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3776 else
3777 btrfs_item_key(l, &disk_key, mid);
3778
3779 /*
3780	 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
3781 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3782 * subclasses, which is 8 at the time of this patch, and we've maxed it
3783 * out. In the future we could add a
3784 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3785 * use BTRFS_NESTING_NEW_ROOT.
3786 */
3787 right = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
3788 &disk_key, 0, l->start, 0, 0,
3789 num_doubles ? BTRFS_NESTING_NEW_ROOT :
3790 BTRFS_NESTING_SPLIT);
3791 if (IS_ERR(right))
3792 return PTR_ERR(right);
3793
3794 root_add_used_bytes(root);
3795
3796 if (split == 0) {
3797 if (mid <= slot) {
3798 btrfs_set_header_nritems(right, 0);
3799 ret = insert_ptr(trans, path, &disk_key,
3800 right->start, path->slots[1] + 1, 1);
3801 if (ret < 0) {
3802 btrfs_tree_unlock(right);
3803 free_extent_buffer(right);
3804 return ret;
3805 }
3806 btrfs_tree_unlock(path->nodes[0]);
3807 free_extent_buffer(path->nodes[0]);
3808 path->nodes[0] = right;
3809 path->slots[0] = 0;
3810 path->slots[1] += 1;
3811 } else {
3812 btrfs_set_header_nritems(right, 0);
3813 ret = insert_ptr(trans, path, &disk_key,
3814 right->start, path->slots[1], 1);
3815 if (ret < 0) {
3816 btrfs_tree_unlock(right);
3817 free_extent_buffer(right);
3818 return ret;
3819 }
3820 btrfs_tree_unlock(path->nodes[0]);
3821 free_extent_buffer(path->nodes[0]);
3822 path->nodes[0] = right;
3823 path->slots[0] = 0;
3824 if (path->slots[1] == 0)
3825 fixup_low_keys(trans, path, &disk_key, 1);
3826 }
3827		/*
3828		 * We created a new leaf 'right' with room for the required
3829		 * ins_len; the caller will copy the new item's content into it
3830		 * and then do btrfs_mark_buffer_dirty() on this leaf.
3831		 */
3832 return ret;
3833 }
3834
3835 ret = copy_for_split(trans, path, l, right, slot, mid, nritems);
3836 if (ret < 0) {
3837 btrfs_tree_unlock(right);
3838 free_extent_buffer(right);
3839 return ret;
3840 }
3841
3842 if (split == 2) {
3843 BUG_ON(num_doubles != 0);
3844 num_doubles++;
3845 goto again;
3846 }
3847
3848 return 0;
3849
3850push_for_double:
3851 push_for_double_split(trans, root, path, data_size);
3852 tried_avoid_double = 1;
3853 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3854 return 0;
3855 goto again;
3856}
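/*
 * Editor's note (illustrative, not part of the original source): a worked
 * example of the split point heuristic above.  With nritems = 7 the first
 * guess is mid = (7 + 1) / 2 = 4, i.e. items [4, 7) would move to the new
 * right leaf.  If the insertion slot is >= 4 but that right half plus
 * data_size would still overflow BTRFS_LEAF_DATA_SIZE(), mid is pulled up
 * to the slot itself; if even that overflows we first try
 * push_for_double_split() and otherwise fall back to a double split
 * (split == 2), or, when inserting past the last item, to a brand new
 * empty right leaf (split == 0).
 */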
3857
3858static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3859 struct btrfs_root *root,
3860 struct btrfs_path *path, int ins_len)
3861{
3862 struct btrfs_key key;
3863 struct extent_buffer *leaf;
3864 struct btrfs_file_extent_item *fi;
3865 u64 extent_len = 0;
3866 u32 item_size;
3867 int ret;
3868
3869 leaf = path->nodes[0];
3870 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3871
3872 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3873 key.type != BTRFS_RAID_STRIPE_KEY &&
3874 key.type != BTRFS_EXTENT_CSUM_KEY);
3875
3876 if (btrfs_leaf_free_space(leaf) >= ins_len)
3877 return 0;
3878
3879 item_size = btrfs_item_size(leaf, path->slots[0]);
3880 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3881 fi = btrfs_item_ptr(leaf, path->slots[0],
3882 struct btrfs_file_extent_item);
3883 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3884 }
3885 btrfs_release_path(path);
3886
3887 path->keep_locks = true;
3888 path->search_for_split = true;
3889 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3890 path->search_for_split = false;
3891 if (ret > 0)
3892 ret = -EAGAIN;
3893 if (ret < 0)
3894 goto err;
3895
3896 ret = -EAGAIN;
3897 leaf = path->nodes[0];
3898 /* if our item isn't there, return now */
3899 if (item_size != btrfs_item_size(leaf, path->slots[0]))
3900 goto err;
3901
3902 /* the leaf has changed, it now has room. return now */
3903 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
3904 goto err;
3905
3906 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3907 fi = btrfs_item_ptr(leaf, path->slots[0],
3908 struct btrfs_file_extent_item);
3909 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3910 goto err;
3911 }
3912
3913 ret = split_leaf(trans, root, &key, path, ins_len, 1);
3914 if (ret)
3915 goto err;
3916
3917 path->keep_locks = false;
3918 btrfs_unlock_up_safe(path, 1);
3919 return 0;
3920err:
3921 path->keep_locks = false;
3922 return ret;
3923}
3924
3925static noinline int split_item(struct btrfs_trans_handle *trans,
3926 struct btrfs_path *path,
3927 const struct btrfs_key *new_key,
3928 unsigned long split_offset)
3929{
3930 struct extent_buffer *leaf;
3931 int orig_slot, slot;
3932 char *buf;
3933 u32 nritems;
3934 u32 item_size;
3935 u32 orig_offset;
3936 struct btrfs_disk_key disk_key;
3937
3938 leaf = path->nodes[0];
3939 /*
3940 * Shouldn't happen because the caller must have previously called
3941 * setup_leaf_for_split() to make room for the new item in the leaf.
3942 */
3943 if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)))
3944 return -ENOSPC;
3945
3946 orig_slot = path->slots[0];
3947 orig_offset = btrfs_item_offset(leaf, path->slots[0]);
3948 item_size = btrfs_item_size(leaf, path->slots[0]);
3949
3950 buf = kmalloc(item_size, GFP_NOFS);
3951 if (!buf)
3952 return -ENOMEM;
3953
3954 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3955 path->slots[0]), item_size);
3956
3957 slot = path->slots[0] + 1;
3958 nritems = btrfs_header_nritems(leaf);
3959 if (slot != nritems) {
3960 /* shift the items */
3961 memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
3962 }
3963
3964 btrfs_cpu_key_to_disk(&disk_key, new_key);
3965 btrfs_set_item_key(leaf, &disk_key, slot);
3966
3967 btrfs_set_item_offset(leaf, slot, orig_offset);
3968 btrfs_set_item_size(leaf, slot, item_size - split_offset);
3969
3970 btrfs_set_item_offset(leaf, orig_slot,
3971 orig_offset + item_size - split_offset);
3972 btrfs_set_item_size(leaf, orig_slot, split_offset);
3973
3974 btrfs_set_header_nritems(leaf, nritems + 1);
3975
3976 /* write the data for the start of the original item */
3977 write_extent_buffer(leaf, buf,
3978 btrfs_item_ptr_offset(leaf, path->slots[0]),
3979 split_offset);
3980
3981 /* write the data for the new item */
3982 write_extent_buffer(leaf, buf + split_offset,
3983 btrfs_item_ptr_offset(leaf, slot),
3984 item_size - split_offset);
3985 btrfs_mark_buffer_dirty(trans, leaf);
3986
3987 BUG_ON(btrfs_leaf_free_space(leaf) < 0);
3988 kfree(buf);
3989 return 0;
3990}
3991
3992/*
3993 * This function splits a single item into two items,
3994 * giving 'new_key' to the new item and splitting the
3995 * old one at split_offset (from the start of the item).
3996 *
3997 * The path may be released by this operation. After
3998 * the split, the path is pointing to the old item. The
3999 * new item is going to be in the same node as the old one.
4000 *
4001 * Note: the item being split must be small enough to live alone in
4002 * a tree block, with room for one extra struct btrfs_item.
4003 *
4004 * This allows us to split the item in place, keeping a lock on the
4005 * leaf the entire time.
4006 */
4007int btrfs_split_item(struct btrfs_trans_handle *trans,
4008 struct btrfs_root *root,
4009 struct btrfs_path *path,
4010 const struct btrfs_key *new_key,
4011 unsigned long split_offset)
4012{
4013 int ret;
4014 ret = setup_leaf_for_split(trans, root, path,
4015 sizeof(struct btrfs_item));
4016 if (ret)
4017 return ret;
4018
4019 ret = split_item(trans, path, new_key, split_offset);
4020 return ret;
4021}
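/*
 * Editor's sketch (assumed usage, not part of the original source): a
 * caller splitting the item its path points at into a front part of
 * 'split_offset' bytes plus a tail item.  The key adjustment below is
 * hypothetical; real callers derive the new key from the item type:
 *
 *	struct btrfs_key new_key;
 *	int ret;
 *
 *	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
 *	new_key.offset += split_offset;
 *	ret = btrfs_split_item(trans, root, path, &new_key, split_offset);
 *	if (ret)
 *		return ret;
 *	(path->slots[0] still points at the shortened old item; the new
 *	 item sits right after it in the same leaf)
 */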
4022
4023/*
4024 * make the item pointed to by the path smaller. new_size indicates
4025 * how small to make it, and from_end tells us if we just chop bytes
4026 * off the end of the item or if we shift the item to chop bytes off
4027 * the front.
4028 */
4029void btrfs_truncate_item(struct btrfs_trans_handle *trans,
4030 const struct btrfs_path *path, u32 new_size, int from_end)
4031{
4032 int slot;
4033 struct extent_buffer *leaf;
4034 u32 nritems;
4035 unsigned int data_end;
4036 unsigned int old_data_start;
4037 unsigned int old_size;
4038 unsigned int size_diff;
4039 int i;
4040
4041 leaf = path->nodes[0];
4042 slot = path->slots[0];
4043
4044 old_size = btrfs_item_size(leaf, slot);
4045 if (old_size == new_size)
4046 return;
4047
4048 nritems = btrfs_header_nritems(leaf);
4049 data_end = leaf_data_end(leaf);
4050
4051 old_data_start = btrfs_item_offset(leaf, slot);
4052
4053 size_diff = old_size - new_size;
4054
4055 BUG_ON(slot < 0);
4056 BUG_ON(slot >= nritems);
4057
4058 /*
4059 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4060 */
4061 /* first correct the data pointers */
4062 for (i = slot; i < nritems; i++) {
4063 u32 ioff;
4064
4065 ioff = btrfs_item_offset(leaf, i);
4066 btrfs_set_item_offset(leaf, i, ioff + size_diff);
4067 }
4068
4069 /* shift the data */
4070 if (from_end) {
4071 memmove_leaf_data(leaf, data_end + size_diff, data_end,
4072 old_data_start + new_size - data_end);
4073 } else {
4074 struct btrfs_disk_key disk_key;
4075 u64 offset;
4076
4077 btrfs_item_key(leaf, &disk_key, slot);
4078
4079 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4080 unsigned long ptr;
4081 struct btrfs_file_extent_item *fi;
4082
4083 fi = btrfs_item_ptr(leaf, slot,
4084 struct btrfs_file_extent_item);
4085 fi = (struct btrfs_file_extent_item *)(
4086 (unsigned long)fi - size_diff);
4087
4088 if (btrfs_file_extent_type(leaf, fi) ==
4089 BTRFS_FILE_EXTENT_INLINE) {
4090 ptr = btrfs_item_ptr_offset(leaf, slot);
4091 memmove_extent_buffer(leaf, ptr,
4092 (unsigned long)fi,
4093 BTRFS_FILE_EXTENT_INLINE_DATA_START);
4094 }
4095 }
4096
4097 memmove_leaf_data(leaf, data_end + size_diff, data_end,
4098 old_data_start - data_end);
4099
4100 offset = btrfs_disk_key_offset(&disk_key);
4101 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4102 btrfs_set_item_key(leaf, &disk_key, slot);
4103 if (slot == 0)
4104 fixup_low_keys(trans, path, &disk_key, 1);
4105 }
4106
4107 btrfs_set_item_size(leaf, slot, new_size);
4108 btrfs_mark_buffer_dirty(trans, leaf);
4109
4110 if (unlikely(btrfs_leaf_free_space(leaf) < 0)) {
4111 btrfs_print_leaf(leaf);
4112 BUG();
4113 }
4114}
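/*
 * Editor's sketch (assumed usage, not part of the original source):
 * chopping a hypothetical 'drop_bytes' off the end of the item the path
 * points at:
 *
 *	struct extent_buffer *leaf = path->nodes[0];
 *	u32 new_size = btrfs_item_size(leaf, path->slots[0]) - drop_bytes;
 *
 *	btrfs_truncate_item(trans, path, new_size, 1);
 *
 * Passing from_end == 0 instead keeps the tail of the item, shifts its
 * data and bumps the key offset by the number of dropped bytes, as the
 * code above does for inline file extents.
 */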
4115
4116/*
4117 * make the item pointed to by the path bigger, data_size is the added size.
4118 */
4119void btrfs_extend_item(struct btrfs_trans_handle *trans,
4120 const struct btrfs_path *path, u32 data_size)
4121{
4122 int slot;
4123 struct extent_buffer *leaf;
4124 u32 nritems;
4125 unsigned int data_end;
4126 unsigned int old_data;
4127 unsigned int old_size;
4128 int i;
4129
4130 leaf = path->nodes[0];
4131
4132 nritems = btrfs_header_nritems(leaf);
4133 data_end = leaf_data_end(leaf);
4134
4135 if (unlikely(btrfs_leaf_free_space(leaf) < data_size)) {
4136 btrfs_print_leaf(leaf);
4137 BUG();
4138 }
4139 slot = path->slots[0];
4140 old_data = btrfs_item_data_end(leaf, slot);
4141
4142 BUG_ON(slot < 0);
4143 if (unlikely(slot >= nritems)) {
4144 btrfs_print_leaf(leaf);
4145 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
4146 slot, nritems);
4147 BUG();
4148 }
4149
4150 /*
4151 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4152 */
4153 /* first correct the data pointers */
4154 for (i = slot; i < nritems; i++) {
4155 u32 ioff;
4156
4157 ioff = btrfs_item_offset(leaf, i);
4158 btrfs_set_item_offset(leaf, i, ioff - data_size);
4159 }
4160
4161 /* shift the data */
4162 memmove_leaf_data(leaf, data_end - data_size, data_end,
4163 old_data - data_end);
4164
4165 old_size = btrfs_item_size(leaf, slot);
4166 btrfs_set_item_size(leaf, slot, old_size + data_size);
4167 btrfs_mark_buffer_dirty(trans, leaf);
4168
4169 if (unlikely(btrfs_leaf_free_space(leaf) < 0)) {
4170 btrfs_print_leaf(leaf);
4171 BUG();
4172 }
4173}
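/*
 * Editor's sketch (assumed usage, not part of the original source):
 * growing an existing item by a hypothetical 'extra' bytes.  The space
 * must have been reserved beforehand, e.g. by searching with ins_len set,
 * since this function BUGs if the leaf lacks the free space:
 *
 *	ret = btrfs_search_slot(trans, root, &key, path, extra, 1);
 *	if (ret < 0)
 *		return ret;
 *	(ret == 0 expected here: the key must already exist)
 *	btrfs_extend_item(trans, path, extra);
 */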
4174
4175/*
4176 * Make space in the node before inserting one or more items.
4177 *
4178 * @trans: transaction handle
4179 * @root: root we are inserting items to
4180 * @path: points to the leaf/slot where we are going to insert new items
4181 * @batch: information about the batch of items to insert
4182 *
4183 * Main purpose is to save stack depth by doing the bulk of the work in a
4184 * function that doesn't call btrfs_search_slot
4185 */
4186static void setup_items_for_insert(struct btrfs_trans_handle *trans,
4187 struct btrfs_root *root, struct btrfs_path *path,
4188 const struct btrfs_item_batch *batch)
4189{
4190 struct btrfs_fs_info *fs_info = root->fs_info;
4191 int i;
4192 u32 nritems;
4193 unsigned int data_end;
4194 struct btrfs_disk_key disk_key;
4195 struct extent_buffer *leaf;
4196 int slot;
4197 u32 total_size;
4198
4199 /*
4200 * Before anything else, update keys in the parent and other ancestors
4201 * if needed, then release the write locks on them, so that other tasks
4202 * can use them while we modify the leaf.
4203 */
4204 if (path->slots[0] == 0) {
4205 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
4206 fixup_low_keys(trans, path, &disk_key, 1);
4207 }
4208 btrfs_unlock_up_safe(path, 1);
4209
4210 leaf = path->nodes[0];
4211 slot = path->slots[0];
4212
4213 nritems = btrfs_header_nritems(leaf);
4214 data_end = leaf_data_end(leaf);
4215 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4216
4217 if (unlikely(btrfs_leaf_free_space(leaf) < total_size)) {
4218 btrfs_print_leaf(leaf);
4219 btrfs_crit(fs_info, "not enough freespace need %u have %d",
4220 total_size, btrfs_leaf_free_space(leaf));
4221 BUG();
4222 }
4223
4224 if (slot != nritems) {
4225 unsigned int old_data = btrfs_item_data_end(leaf, slot);
4226
4227 if (unlikely(old_data < data_end)) {
4228 btrfs_print_leaf(leaf);
4229 btrfs_crit(fs_info,
4230 "item at slot %d with data offset %u beyond data end of leaf %u",
4231 slot, old_data, data_end);
4232 BUG();
4233 }
4234 /*
4235 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4236 */
4237 /* first correct the data pointers */
4238 for (i = slot; i < nritems; i++) {
4239 u32 ioff;
4240
4241 ioff = btrfs_item_offset(leaf, i);
4242 btrfs_set_item_offset(leaf, i,
4243 ioff - batch->total_data_size);
4244 }
4245 /* shift the items */
4246 memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);
4247
4248 /* shift the data */
4249 memmove_leaf_data(leaf, data_end - batch->total_data_size,
4250 data_end, old_data - data_end);
4251 data_end = old_data;
4252 }
4253
4254 /* setup the item for the new data */
4255 for (i = 0; i < batch->nr; i++) {
4256 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
4257 btrfs_set_item_key(leaf, &disk_key, slot + i);
4258 data_end -= batch->data_sizes[i];
4259 btrfs_set_item_offset(leaf, slot + i, data_end);
4260 btrfs_set_item_size(leaf, slot + i, batch->data_sizes[i]);
4261 }
4262
4263 btrfs_set_header_nritems(leaf, nritems + batch->nr);
4264 btrfs_mark_buffer_dirty(trans, leaf);
4265
4266 if (unlikely(btrfs_leaf_free_space(leaf) < 0)) {
4267 btrfs_print_leaf(leaf);
4268 BUG();
4269 }
4270}
4271
4272/*
4273 * Insert a new item into a leaf.
4274 *
4275 * @trans: Transaction handle.
4276 * @root: The root of the btree.
4277 * @path: A path pointing to the target leaf and slot.
4278 * @key: The key of the new item.
4279 * @data_size: The size of the data associated with the new key.
4280 */
4281void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
4282 struct btrfs_root *root,
4283 struct btrfs_path *path,
4284 const struct btrfs_key *key,
4285 u32 data_size)
4286{
4287 struct btrfs_item_batch batch;
4288
4289 batch.keys = key;
4290 batch.data_sizes = &data_size;
4291 batch.total_data_size = data_size;
4292 batch.nr = 1;
4293
4294 setup_items_for_insert(trans, root, path, &batch);
4295}
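/*
 * Editor's sketch (assumed usage, not part of the original source): once a
 * search has positioned the path at a leaf/slot with enough free space,
 * insert one item and fill in its data ('data' and 'size' hypothetical):
 *
 *	struct extent_buffer *leaf;
 *
 *	btrfs_setup_item_for_insert(trans, root, path, &key, size);
 *	leaf = path->nodes[0];
 *	write_extent_buffer(leaf, data,
 *			    btrfs_item_ptr_offset(leaf, path->slots[0]), size);
 *	btrfs_mark_buffer_dirty(trans, leaf);
 */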
4296
4297/*
4298 * Given a key and some data, insert items into the tree.
4299 * This does all the path init required, making room in the tree if needed.
4300 *
4301 * Returns: 0 on success
4302 * -EEXIST if the first key already exists
4303 * < 0 on other errors
4304 */
4305int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4306 struct btrfs_root *root,
4307 struct btrfs_path *path,
4308 const struct btrfs_item_batch *batch)
4309{
4310 int ret = 0;
4311 int slot;
4312 u32 total_size;
4313
4314 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4315 ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
4316 if (ret == 0)
4317 return -EEXIST;
4318 if (ret < 0)
4319 return ret;
4320
4321 slot = path->slots[0];
4322 BUG_ON(slot < 0);
4323
4324 setup_items_for_insert(trans, root, path, batch);
4325 return 0;
4326}
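/*
 * Editor's sketch (assumed usage, not part of the original source):
 * inserting two empty items in one pass; all names are hypothetical and
 * the keys must be given in ascending order:
 *
 *	struct btrfs_key keys[2];
 *	u32 sizes[2] = { first_size, second_size };
 *	struct btrfs_item_batch batch = {
 *		.keys = keys,
 *		.data_sizes = sizes,
 *		.total_data_size = first_size + second_size,
 *		.nr = 2,
 *	};
 *	int ret;
 *
 *	(fill in keys[0] and keys[1] here)
 *	ret = btrfs_insert_empty_items(trans, root, path, &batch);
 *	if (ret)
 *		return ret;
 *	(path now points at the first of the two new, still empty items)
 */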
4327
4328/*
4329 * Given a key and some data, insert an item into the tree.
4330 * This does all the path init required, making room in the tree if needed.
4331 */
4332int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4333 const struct btrfs_key *cpu_key, void *data,
4334 u32 data_size)
4335{
4336 int ret = 0;
4337 BTRFS_PATH_AUTO_FREE(path);
4338 struct extent_buffer *leaf;
4339 unsigned long ptr;
4340
4341 path = btrfs_alloc_path();
4342 if (!path)
4343 return -ENOMEM;
4344 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4345 if (!ret) {
4346 leaf = path->nodes[0];
4347 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4348 write_extent_buffer(leaf, data, ptr, data_size);
4349 btrfs_mark_buffer_dirty(trans, leaf);
4350 }
4351 return ret;
4352}
4353
4354/*
4355 * This function duplicates an item, giving 'new_key' to the new item.
4356 * It guarantees both items live in the same tree leaf and the new item is
4357 * contiguous with the original item.
4358 *
4359 * This allows us to split a file extent in place, keeping a lock on the leaf
4360 * the entire time.
4361 */
4362int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4363 struct btrfs_root *root,
4364 struct btrfs_path *path,
4365 const struct btrfs_key *new_key)
4366{
4367 struct extent_buffer *leaf;
4368 int ret;
4369 u32 item_size;
4370
4371 leaf = path->nodes[0];
4372 item_size = btrfs_item_size(leaf, path->slots[0]);
4373 ret = setup_leaf_for_split(trans, root, path,
4374 item_size + sizeof(struct btrfs_item));
4375 if (ret)
4376 return ret;
4377
4378 path->slots[0]++;
4379 btrfs_setup_item_for_insert(trans, root, path, new_key, item_size);
4380 leaf = path->nodes[0];
4381 memcpy_extent_buffer(leaf,
4382 btrfs_item_ptr_offset(leaf, path->slots[0]),
4383 btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4384 item_size);
4385 return 0;
4386}
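/*
 * Editor's sketch (assumed usage, not part of the original source): the
 * typical file extent split pattern, with a hypothetical 'split_pos':
 *
 *	struct btrfs_key new_key;
 *	int ret;
 *
 *	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
 *	new_key.offset = split_pos;
 *	ret = btrfs_duplicate_item(trans, root, path, &new_key);
 *	if (ret)
 *		return ret;
 *	(path now points at the copy; the caller then adjusts both extent
 *	 items so each describes its half of the original range)
 */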
4387
4388/*
4389 * delete the pointer from a given node.
4390 *
4391 * the tree should have been previously balanced so the deletion does not
4392 * empty a node.
4393 *
4394 * This is exported for use inside btrfs-progs, don't un-export it.
4395 */
4396int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4397 struct btrfs_path *path, int level, int slot)
4398{
4399 struct extent_buffer *parent = path->nodes[level];
4400 u32 nritems;
4401 int ret;
4402
4403 nritems = btrfs_header_nritems(parent);
4404 if (slot != nritems - 1) {
4405 if (level) {
4406 ret = btrfs_tree_mod_log_insert_move(parent, slot,
4407 slot + 1, nritems - slot - 1);
4408 if (unlikely(ret < 0)) {
4409 btrfs_abort_transaction(trans, ret);
4410 return ret;
4411 }
4412 }
4413 memmove_extent_buffer(parent,
4414 btrfs_node_key_ptr_offset(parent, slot),
4415 btrfs_node_key_ptr_offset(parent, slot + 1),
4416 sizeof(struct btrfs_key_ptr) *
4417 (nritems - slot - 1));
4418 } else if (level) {
4419 ret = btrfs_tree_mod_log_insert_key(parent, slot,
4420 BTRFS_MOD_LOG_KEY_REMOVE);
4421 if (unlikely(ret < 0)) {
4422 btrfs_abort_transaction(trans, ret);
4423 return ret;
4424 }
4425 }
4426
4427 nritems--;
4428 btrfs_set_header_nritems(parent, nritems);
4429 if (nritems == 0 && parent == root->node) {
4430 BUG_ON(btrfs_header_level(root->node) != 1);
4431 /* just turn the root into a leaf and break */
4432 btrfs_set_header_level(root->node, 0);
4433 } else if (slot == 0) {
4434 struct btrfs_disk_key disk_key;
4435
4436 btrfs_node_key(parent, &disk_key, 0);
4437 fixup_low_keys(trans, path, &disk_key, level + 1);
4438 }
4439 btrfs_mark_buffer_dirty(trans, parent);
4440 return 0;
4441}
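/*
 * Editor's sketch (assumed usage, not part of the original source): after
 * the child block below 'level' has been emptied or freed, drop its
 * pointer from the parent node:
 *
 *	ret = btrfs_del_ptr(trans, root, path, level, path->slots[level]);
 *	if (ret < 0)
 *		return ret;
 */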
4442
4443/*
4444 * a helper function to delete the leaf pointed to by path->slots[1] and
4445 * path->nodes[1].
4446 *
4447 * This deletes the pointer in path->nodes[1] and frees the leaf
4448 * block extent. zero is returned if it all worked out, < 0 otherwise.
4449 *
4450 * The path must have already been setup for deleting the leaf, including
4451 * all the proper balancing. path->nodes[1] must be locked.
4452 */
4453static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
4454 struct btrfs_root *root,
4455 struct btrfs_path *path,
4456 struct extent_buffer *leaf)
4457{
4458 int ret;
4459
4460 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4461 ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]);
4462 if (ret < 0)
4463 return ret;
4464
4465 /*
4466 * btrfs_free_extent is expensive, we want to make sure we
4467 * aren't holding any locks when we call it
4468 */
4469 btrfs_unlock_up_safe(path, 0);
4470
4471 root_sub_used_bytes(root);
4472
4473 refcount_inc(&leaf->refs);
4474 ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
4475 free_extent_buffer_stale(leaf);
4476 if (ret < 0)
4477 btrfs_abort_transaction(trans, ret);
4478
4479 return ret;
4480}
4481/*
4482 * delete the item at the leaf level in path. If that empties
4483 * the leaf, remove it from the tree
4484 */
4485int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4486 struct btrfs_path *path, int slot, int nr)
4487{
4488 struct btrfs_fs_info *fs_info = root->fs_info;
4489 struct extent_buffer *leaf;
4490 int ret = 0;
4491 int wret;
4492 u32 nritems;
4493
4494 leaf = path->nodes[0];
4495 nritems = btrfs_header_nritems(leaf);
4496
4497 if (slot + nr != nritems) {
4498 const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
4499 const int data_end = leaf_data_end(leaf);
4500 u32 dsize = 0;
4501 int i;
4502
4503 for (i = 0; i < nr; i++)
4504 dsize += btrfs_item_size(leaf, slot + i);
4505
4506 memmove_leaf_data(leaf, data_end + dsize, data_end,
4507 last_off - data_end);
4508
4509 for (i = slot + nr; i < nritems; i++) {
4510 u32 ioff;
4511
4512 ioff = btrfs_item_offset(leaf, i);
4513 btrfs_set_item_offset(leaf, i, ioff + dsize);
4514 }
4515
4516 memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
4517 }
4518 btrfs_set_header_nritems(leaf, nritems - nr);
4519 nritems -= nr;
4520
4521 /* delete the leaf if we've emptied it */
4522 if (nritems == 0) {
4523 if (leaf != root->node) {
4524 btrfs_clear_buffer_dirty(trans, leaf);
4525 ret = btrfs_del_leaf(trans, root, path, leaf);
4526 if (ret < 0)
4527 return ret;
4528 }
4529 } else {
4530 int used = leaf_space_used(leaf, 0, nritems);
4531 if (slot == 0) {
4532 struct btrfs_disk_key disk_key;
4533
4534 btrfs_item_key(leaf, &disk_key, 0);
4535 fixup_low_keys(trans, path, &disk_key, 1);
4536 }
4537
4538 /*
4539 * Try to delete the leaf if it is mostly empty. We do this by
4540 * trying to move all its items into its left and right neighbours.
4541 * If we can't move all the items, then we don't delete it - it's
4542 * not ideal, but future insertions might fill the leaf with more
4543 * items, or items from other leaves might be moved later into our
4544 * leaf due to deletions on those leaves.
4545 */
4546 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4547 u32 min_push_space;
4548
4549 /* push_leaf_left fixes the path.
4550 * make sure the path still points to our leaf
4551 * for possible call to btrfs_del_ptr below
4552 */
4553 slot = path->slots[1];
4554 refcount_inc(&leaf->refs);
4555 /*
4556 * We want to be able to at least push one item to the
4557 * left neighbour leaf, and that's the first item.
4558 */
4559 min_push_space = sizeof(struct btrfs_item) +
4560 btrfs_item_size(leaf, 0);
4561 wret = push_leaf_left(trans, root, path, 0,
4562 min_push_space, 1, (u32)-1);
4563 if (wret < 0 && wret != -ENOSPC)
4564 ret = wret;
4565
4566 if (path->nodes[0] == leaf &&
4567 btrfs_header_nritems(leaf)) {
4568 /*
4569 * If we were not able to push all items from our
4570 * leaf to its left neighbour, then attempt to
4571 * either push all the remaining items to the
4572 * right neighbour or none. There's no advantage
4573 * in pushing only some items, instead of all, as
4574 * it's pointless to end up with a leaf having
4575 * too few items while the neighbours can be full
4576 * or nearly full.
4577 */
4578 nritems = btrfs_header_nritems(leaf);
4579 min_push_space = leaf_space_used(leaf, 0, nritems);
4580 wret = push_leaf_right(trans, root, path, 0,
4581 min_push_space, 1, 0);
4582 if (wret < 0 && wret != -ENOSPC)
4583 ret = wret;
4584 }
4585
4586 if (btrfs_header_nritems(leaf) == 0) {
4587 path->slots[1] = slot;
4588 ret = btrfs_del_leaf(trans, root, path, leaf);
4589 free_extent_buffer(leaf);
4590 if (ret < 0)
4591 return ret;
4592 } else {
4593 /* if we're still in the path, make sure
4594 * we're dirty. Otherwise, one of the
4595 * push_leaf functions must have already
4596 * dirtied this buffer
4597 */
4598 if (path->nodes[0] == leaf)
4599 btrfs_mark_buffer_dirty(trans, leaf);
4600 free_extent_buffer(leaf);
4601 }
4602 } else {
4603 btrfs_mark_buffer_dirty(trans, leaf);
4604 }
4605 }
4606 return ret;
4607}
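/*
 * Editor's sketch (assumed usage, not part of the original source):
 * deleting the single item a search just found.  The -1 ins_len tells
 * btrfs_search_slot() that a deletion follows, so nodes are balanced on
 * the way down:
 *
 *	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 *	if (ret < 0)
 *		return ret;
 *	if (ret == 0)
 *		ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
 */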
4608
4609/*
4610 * A helper function to walk down the tree starting at min_key, and looking
4611 * for leaves that have a minimum transaction id.
4612 * This is used by the btree defrag code and by tree logging.
4613 *
4614 * This does not cow, but it does stuff the starting key it finds back
4615 * into min_key, so you can call btrfs_search_slot with cow=1 on the
4616 * key and get a writable path.
4617 *
4618 * min_trans indicates the oldest transaction that you are interested
4619 * in walking through. Any nodes or leaves older than min_trans are
4620 * skipped over (without reading them).
4621 *
4622 * returns zero if something useful was found, < 0 on error and 1 if there
4623 * was nothing in the tree that matched the search criteria.
4624 */
4625int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4626 struct btrfs_path *path,
4627 u64 min_trans)
4628{
4629 struct extent_buffer *cur;
4630 int slot;
4631 int sret;
4632 u32 nritems;
4633 int level;
4634 int ret = 1;
4635 const bool keep_locks = path->keep_locks;
4636
4637 ASSERT(!path->nowait);
4638 ASSERT(path->lowest_level == 0);
4639 path->keep_locks = true;
4640again:
4641 cur = btrfs_read_lock_root_node(root);
4642 level = btrfs_header_level(cur);
4643 WARN_ON(path->nodes[level]);
4644 path->nodes[level] = cur;
4645 path->locks[level] = BTRFS_READ_LOCK;
4646
4647 if (btrfs_header_generation(cur) < min_trans) {
4648 ret = 1;
4649 goto out;
4650 }
4651 while (1) {
4652 nritems = btrfs_header_nritems(cur);
4653 level = btrfs_header_level(cur);
4654 sret = btrfs_bin_search(cur, 0, min_key, &slot);
4655 if (sret < 0) {
4656 ret = sret;
4657 goto out;
4658 }
4659
4660 /* At level 0 we're done, setup the path and exit. */
4661 if (level == 0) {
4662 if (slot >= nritems)
4663 goto find_next_key;
4664 ret = 0;
4665 path->slots[level] = slot;
4666 /* Save our key for returning back. */
4667 btrfs_item_key_to_cpu(cur, min_key, slot);
4668 goto out;
4669 }
4670 if (sret && slot > 0)
4671 slot--;
4672 /*
4673		 * check this node pointer against the min_trans parameter.
4674 * If it is too old, skip to the next one.
4675 */
4676 while (slot < nritems) {
4677 u64 gen;
4678
4679 gen = btrfs_node_ptr_generation(cur, slot);
4680 if (gen < min_trans) {
4681 slot++;
4682 continue;
4683 }
4684 break;
4685 }
4686find_next_key:
4687 /*
4688 * we didn't find a candidate key in this node, walk forward
4689 * and find another one
4690 */
4691 path->slots[level] = slot;
4692 if (slot >= nritems) {
4693 sret = btrfs_find_next_key(root, path, min_key, level,
4694 min_trans);
4695 if (sret == 0) {
4696 btrfs_release_path(path);
4697 goto again;
4698 } else {
4699 goto out;
4700 }
4701 }
4702 cur = btrfs_read_node_slot(cur, slot);
4703 if (IS_ERR(cur)) {
4704 ret = PTR_ERR(cur);
4705 goto out;
4706 }
4707
4708 btrfs_tree_read_lock(cur);
4709
4710 path->locks[level - 1] = BTRFS_READ_LOCK;
4711 path->nodes[level - 1] = cur;
4712 unlock_up(path, level, 1, 0, NULL);
4713 }
4714out:
4715 path->keep_locks = keep_locks;
4716 if (ret == 0)
4717 btrfs_unlock_up_safe(path, 1);
4718 return ret;
4719}
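/*
 * Editor's sketch (assumed usage, not part of the original source):
 * scanning every item in leaves touched since 'min_trans'.  The advance
 * step is simplified; real callers also guard against u64 overflow of
 * the key:
 *
 *	struct btrfs_key min_key = { 0 };
 *
 *	while (btrfs_search_forward(root, &min_key, path, min_trans) == 0) {
 *		(process the item at path->nodes[0] / path->slots[0];
 *		 min_key now holds the key that was found)
 *		btrfs_release_path(path);
 *		min_key.offset++;
 *	}
 *	btrfs_release_path(path);
 */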
4720
4721/*
4722 * this is similar to btrfs_next_leaf, but does not try to preserve
4723 * and fix up the path. It looks for and returns the next key in the
4724 * tree based on the current path and the min_trans parameters.
4725 *
4726 * 0 is returned if another key is found, < 0 if there are any errors
4727 * and 1 is returned if there are no higher keys in the tree
4728 *
4729 * path->keep_locks should be set to true on the search made before
4730 * calling this function.
4731 */
4732int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4733 struct btrfs_key *key, int level, u64 min_trans)
4734{
4735 int slot;
4736 struct extent_buffer *c;
4737
4738 WARN_ON(!path->keep_locks && !path->skip_locking);
4739 while (level < BTRFS_MAX_LEVEL) {
4740 if (!path->nodes[level])
4741 return 1;
4742
4743 slot = path->slots[level] + 1;
4744 c = path->nodes[level];
4745next:
4746 if (slot >= btrfs_header_nritems(c)) {
4747 int ret;
4748 int orig_lowest;
4749 struct btrfs_key cur_key;
4750 if (level + 1 >= BTRFS_MAX_LEVEL ||
4751 !path->nodes[level + 1])
4752 return 1;
4753
4754 if (path->locks[level + 1] || path->skip_locking) {
4755 level++;
4756 continue;
4757 }
4758
4759 slot = btrfs_header_nritems(c) - 1;
4760 if (level == 0)
4761 btrfs_item_key_to_cpu(c, &cur_key, slot);
4762 else
4763 btrfs_node_key_to_cpu(c, &cur_key, slot);
4764
4765 orig_lowest = path->lowest_level;
4766 btrfs_release_path(path);
4767 path->lowest_level = level;
4768 ret = btrfs_search_slot(NULL, root, &cur_key, path,
4769 0, 0);
4770 path->lowest_level = orig_lowest;
4771 if (ret < 0)
4772 return ret;
4773
4774 c = path->nodes[level];
4775 slot = path->slots[level];
4776 if (ret == 0)
4777 slot++;
4778 goto next;
4779 }
4780
4781 if (level == 0)
4782 btrfs_item_key_to_cpu(c, key, slot);
4783 else {
4784 u64 gen = btrfs_node_ptr_generation(c, slot);
4785
4786 if (gen < min_trans) {
4787 slot++;
4788 goto next;
4789 }
4790 btrfs_node_key_to_cpu(c, key, slot);
4791 }
4792 return 0;
4793 }
4794 return 1;
4795}
4796
4797int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4798 u64 time_seq)
4799{
4800 int slot;
4801 int level;
4802 struct extent_buffer *c;
4803 struct extent_buffer *next;
4804 struct btrfs_fs_info *fs_info = root->fs_info;
4805 struct btrfs_key key;
4806 bool need_commit_sem = false;
4807 u32 nritems;
4808 int ret;
4809 int i;
4810
4811 /*
4812 * The nowait semantics are used only for write paths, where we don't
4813 * use the tree mod log and sequence numbers.
4814 */
4815 if (time_seq)
4816 ASSERT(!path->nowait);
4817
4818 nritems = btrfs_header_nritems(path->nodes[0]);
4819 if (nritems == 0)
4820 return 1;
4821
4822 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4823again:
4824 level = 1;
4825 next = NULL;
4826 btrfs_release_path(path);
4827
4828 path->keep_locks = true;
4829
4830 if (time_seq) {
4831 ret = btrfs_search_old_slot(root, &key, path, time_seq);
4832 } else {
4833 if (path->need_commit_sem) {
4834 path->need_commit_sem = false;
4835 need_commit_sem = true;
4836 if (path->nowait) {
4837 if (!down_read_trylock(&fs_info->commit_root_sem)) {
4838 ret = -EAGAIN;
4839 goto done;
4840 }
4841 } else {
4842 down_read(&fs_info->commit_root_sem);
4843 }
4844 }
4845 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4846 }
4847 path->keep_locks = false;
4848
4849 if (ret < 0)
4850 goto done;
4851
4852 nritems = btrfs_header_nritems(path->nodes[0]);
4853 /*
4854 * By releasing the path above we dropped all our locks. A balance
4855 * could have happened and
4856 *
4857 * 1. added more items after the previous last item
4858 * 2. deleted the previous last item
4859 *
4860 * So, check again here and advance the path if there are now more
4861 * items available.
4862 */
4863 if (nritems > 0 && path->slots[0] <= nritems - 1) {
4864 if (ret == 0 && path->slots[0] != nritems - 1) {
4865 path->slots[0]++;
4866 goto done;
4867 } else if (ret > 0) {
4868 ret = 0;
4869 goto done;
4870 }
4871 }
4872
4873 while (level < BTRFS_MAX_LEVEL) {
4874 if (!path->nodes[level]) {
4875 ret = 1;
4876 goto done;
4877 }
4878
4879 slot = path->slots[level] + 1;
4880 c = path->nodes[level];
4881 if (slot >= btrfs_header_nritems(c)) {
4882 level++;
4883 if (level == BTRFS_MAX_LEVEL) {
4884 ret = 1;
4885 goto done;
4886 }
4887 continue;
4888 }
4889
4890
4891 /*
4892 * Our current level is where we're going to start from, and to
4893 * make sure lockdep doesn't complain we need to drop our locks
4894 * and nodes from 0 to our current level.
4895 */
4896 for (i = 0; i < level; i++) {
4897			if (path->locks[i]) {
4898 btrfs_tree_read_unlock(path->nodes[i]);
4899 path->locks[i] = 0;
4900 }
4901 free_extent_buffer(path->nodes[i]);
4902 path->nodes[i] = NULL;
4903 }
4904
4905 next = c;
4906 ret = read_block_for_search(root, path, &next, slot, &key);
4907 if (ret == -EAGAIN && !path->nowait)
4908 goto again;
4909
4910 if (ret < 0) {
4911 btrfs_release_path(path);
4912 goto done;
4913 }
4914
4915 if (!path->skip_locking) {
4916 ret = btrfs_try_tree_read_lock(next);
4917 if (!ret && path->nowait) {
4918 ret = -EAGAIN;
4919 goto done;
4920 }
4921 if (!ret && time_seq) {
4922 /*
4923 * If we don't get the lock, we may be racing
4924 * with push_leaf_left, holding that lock while
4925 * itself waiting for the leaf we've currently
4926 * locked. To solve this situation, we give up
4927 * on our lock and cycle.
4928 */
4929 free_extent_buffer(next);
4930 btrfs_release_path(path);
4931 cond_resched();
4932 goto again;
4933 }
4934 if (!ret)
4935 btrfs_tree_read_lock(next);
4936 }
4937 break;
4938 }
4939 path->slots[level] = slot;
4940 while (1) {
4941 level--;
4942 path->nodes[level] = next;
4943 path->slots[level] = 0;
4944 if (!path->skip_locking)
4945 path->locks[level] = BTRFS_READ_LOCK;
4946 if (!level)
4947 break;
4948
4949 ret = read_block_for_search(root, path, &next, 0, &key);
4950 if (ret == -EAGAIN && !path->nowait)
4951 goto again;
4952
4953 if (ret < 0) {
4954 btrfs_release_path(path);
4955 goto done;
4956 }
4957
4958 if (!path->skip_locking) {
4959 if (path->nowait) {
4960 if (!btrfs_try_tree_read_lock(next)) {
4961 ret = -EAGAIN;
4962 goto done;
4963 }
4964 } else {
4965 btrfs_tree_read_lock(next);
4966 }
4967 }
4968 }
4969 ret = 0;
4970done:
4971 unlock_up(path, 0, 1, 0, NULL);
4972 if (need_commit_sem) {
4973 int ret2;
4974
4975 path->need_commit_sem = true;
4976 ret2 = finish_need_commit_sem_search(path);
4977 up_read(&fs_info->commit_root_sem);
4978 if (ret2)
4979 ret = ret2;
4980 }
4981
4982 return ret;
4983}
4984
4985int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
4986{
4987 path->slots[0]++;
4988 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
4989 return btrfs_next_old_leaf(root, path, time_seq);
4990 return 0;
4991}
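/*
 * Editor's sketch (assumed usage, not part of the original source):
 * iterating forward over items from the current path position, crossing
 * leaf boundaries.  A time_seq of 0 means the current tree; a non-zero
 * value replays an old view via the tree mod log:
 *
 *	while (1) {
 *		(handle the item at path->nodes[0] / path->slots[0])
 *		ret = btrfs_next_old_item(root, path, 0);
 *		if (ret > 0)
 *			break;		(no more items)
 *		if (ret < 0)
 *			return ret;
 *	}
 */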
4992
4993/*
4994 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4995 * searching until it gets past min_objectid or finds an item of 'type'
4996 *
4997 * returns 0 if something is found, 1 if nothing was found and < 0 on error
4998 */
4999int btrfs_previous_item(struct btrfs_root *root,
5000 struct btrfs_path *path, u64 min_objectid,
5001 int type)
5002{
5003 struct btrfs_key found_key;
5004 struct extent_buffer *leaf;
5005 u32 nritems;
5006 int ret;
5007
5008 while (1) {
5009 if (path->slots[0] == 0) {
5010 ret = btrfs_prev_leaf(root, path);
5011 if (ret != 0)
5012 return ret;
5013 } else {
5014 path->slots[0]--;
5015 }
5016 leaf = path->nodes[0];
5017 nritems = btrfs_header_nritems(leaf);
5018 if (nritems == 0)
5019 return 1;
5020 if (path->slots[0] == nritems)
5021 path->slots[0]--;
5022
5023 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5024 if (found_key.objectid < min_objectid)
5025 break;
5026 if (found_key.type == type)
5027 return 0;
5028 if (found_key.objectid == min_objectid &&
5029 found_key.type < type)
5030 break;
5031 }
5032 return 1;
5033}
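/*
 * Editor's sketch (assumed usage, not part of the original source): a
 * common pattern for finding the item of a given type at or before a
 * search position, here with a hypothetical 'bytenr':
 *
 *	key.objectid = bytenr;
 *	key.type = BTRFS_EXTENT_ITEM_KEY;
 *	key.offset = (u64)-1;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret > 0)
 *		ret = btrfs_previous_item(root, path, bytenr,
 *					  BTRFS_EXTENT_ITEM_KEY);
 *	(ret == 0 means the path now points at the wanted item)
 */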
5034
5035/*
5036 * search in extent tree to find a previous Metadata/Data extent item with
5037 * min objectid.
5038 *
5039 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5040 */
5041int btrfs_previous_extent_item(struct btrfs_root *root,
5042 struct btrfs_path *path, u64 min_objectid)
5043{
5044 struct btrfs_key found_key;
5045 struct extent_buffer *leaf;
5046 u32 nritems;
5047 int ret;
5048
5049 while (1) {
5050 if (path->slots[0] == 0) {
5051 ret = btrfs_prev_leaf(root, path);
5052 if (ret != 0)
5053 return ret;
5054 } else {
5055 path->slots[0]--;
5056 }
5057 leaf = path->nodes[0];
5058 nritems = btrfs_header_nritems(leaf);
5059 if (nritems == 0)
5060 return 1;
5061 if (path->slots[0] == nritems)
5062 path->slots[0]--;
5063
5064 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5065 if (found_key.objectid < min_objectid)
5066 break;
5067 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5068 found_key.type == BTRFS_METADATA_ITEM_KEY)
5069 return 0;
5070 if (found_key.objectid == min_objectid &&
5071 found_key.type < BTRFS_EXTENT_ITEM_KEY)
5072 break;
5073 }
5074 return 1;
5075}
5076
5077int __init btrfs_ctree_init(void)
5078{
5079 btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0);
5080 if (!btrfs_path_cachep)
5081 return -ENOMEM;
5082 return 0;
5083}
5084
5085void __cold btrfs_ctree_exit(void)
5086{
5087 kmem_cache_destroy(btrfs_path_cachep);
5088}