// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/fsverity.h>
#include "misc.h"
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"
#include "subpage.h"
#include "zoned.h"
#include "block-group.h"
#include "compression.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set btrfs_bioset;

static inline bool extent_state_in_tree(const struct extent_state *state)
{
        return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(states);
static DEFINE_SPINLOCK(leak_lock);

static inline void btrfs_leak_debug_add(spinlock_t *lock,
                                        struct list_head *new,
                                        struct list_head *head)
{
        unsigned long flags;

        spin_lock_irqsave(lock, flags);
        list_add(new, head);
        spin_unlock_irqrestore(lock, flags);
}

static inline void btrfs_leak_debug_del(spinlock_t *lock,
                                        struct list_head *entry)
{
        unsigned long flags;

        spin_lock_irqsave(lock, flags);
        list_del(entry);
        spin_unlock_irqrestore(lock, flags);
}

void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
{
        struct extent_buffer *eb;
        unsigned long flags;

        /*
         * If we didn't get into open_ctree our allocated_ebs will not be
         * initialized, so just skip this.
         */
        if (!fs_info->allocated_ebs.next)
                return;

        WARN_ON(!list_empty(&fs_info->allocated_ebs));
        spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
        while (!list_empty(&fs_info->allocated_ebs)) {
                eb = list_first_entry(&fs_info->allocated_ebs,
                                      struct extent_buffer, leak_list);
                pr_err(
        "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
                       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
                       btrfs_header_owner(eb));
                list_del(&eb->leak_list);
                kmem_cache_free(extent_buffer_cache, eb);
        }
        spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}

static inline void btrfs_extent_state_leak_debug_check(void)
{
        struct extent_state *state;

        while (!list_empty(&states)) {
                state = list_entry(states.next, struct extent_state, leak_list);
                pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
                       state->start, state->end, state->state,
                       extent_state_in_tree(state),
                       refcount_read(&state->refs));
                list_del(&state->leak_list);
                kmem_cache_free(extent_state_cache, state);
        }
}

#define btrfs_debug_check_extent_io_range(tree, start, end)    \
        __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
                struct extent_io_tree *tree, u64 start, u64 end)
{
        struct inode *inode = tree->private_data;
        u64 isize;

        if (!inode || !is_data_inode(inode))
                return;

        isize = i_size_read(inode);
        if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
                btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
                    "%s: ino %llu isize %llu odd range [%llu,%llu]",
                        caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
        }
}
#else
#define btrfs_leak_debug_add(lock, new, head)   do {} while (0)
#define btrfs_leak_debug_del(lock, entry)       do {} while (0)
#define btrfs_extent_state_leak_debug_check()   do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)      do {} while (0)
#endif

struct tree_entry {
        u64 start;
        u64 end;
        struct rb_node rb_node;
};

/*
 * Structure to record info about the bio being assembled, and other info like
 * how many bytes are there before stripe/ordered extent boundary.
 */
struct btrfs_bio_ctrl {
        struct bio *bio;
        int mirror_num;
        enum btrfs_compression_type compress_type;
        u32 len_to_stripe_boundary;
        u32 len_to_oe_boundary;
};

struct extent_page_data {
        struct btrfs_bio_ctrl bio_ctrl;
        /* tells writepage not to lock the state bits for this range
         * it still does the unlocking
         */
        unsigned int extent_locked:1;

        /* tells the submit_bio code to use REQ_SYNC */
        unsigned int sync_io:1;
};

static int add_extent_changeset(struct extent_state *state, u32 bits,
                                struct extent_changeset *changeset,
                                int set)
{
        int ret;

        if (!changeset)
                return 0;
        if (set && (state->state & bits) == bits)
                return 0;
        if (!set && (state->state & bits) == 0)
                return 0;
        changeset->bytes_changed += state->end - state->start + 1;
        ret = ulist_add(&changeset->range_changed, state->start, state->end,
                        GFP_ATOMIC);
        return ret;
}

static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
        struct bio *bio;
        struct bio_vec *bv;
        struct inode *inode;
        int mirror_num;

        if (!bio_ctrl->bio)
                return;

        bio = bio_ctrl->bio;
        bv = bio_first_bvec_all(bio);
        inode = bv->bv_page->mapping->host;
        mirror_num = bio_ctrl->mirror_num;

        /* Caller should ensure the bio has at least some range added */
        ASSERT(bio->bi_iter.bi_size);

        btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset;

        if (!is_data_inode(inode))
                btrfs_submit_metadata_bio(inode, bio, mirror_num);
        else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
                btrfs_submit_data_write_bio(inode, bio, mirror_num);
        else
                btrfs_submit_data_read_bio(inode, bio, mirror_num,
                                           bio_ctrl->compress_type);

        /* The bio is owned by the bi_end_io handler now */
        bio_ctrl->bio = NULL;
}

/*
 * Submit or fail the current bio in an extent_page_data structure.
 */
static void submit_write_bio(struct extent_page_data *epd, int ret)
{
        struct bio *bio = epd->bio_ctrl.bio;

        if (!bio)
                return;

        if (ret) {
                ASSERT(ret < 0);
                bio->bi_status = errno_to_blk_status(ret);
                bio_endio(bio);
                /* The bio is owned by the bi_end_io handler now */
                epd->bio_ctrl.bio = NULL;
        } else {
                submit_one_bio(&epd->bio_ctrl);
        }
}
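
/*
 * Illustrative note (not part of the original file): the writepage paths end
 * their work by calling submit_write_bio(epd, ret), which either submits the
 * bio built up in epd->bio_ctrl or, when ret is a negative error, completes
 * it with that error instead of sending it to the device. The exact call
 * sites are an assumption here; only the submit-or-fail behaviour is taken
 * from the function above.
 */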

int __init extent_state_cache_init(void)
{
        extent_state_cache = kmem_cache_create("btrfs_extent_state",
                        sizeof(struct extent_state), 0,
                        SLAB_MEM_SPREAD, NULL);
        if (!extent_state_cache)
                return -ENOMEM;
        return 0;
}

int __init extent_io_init(void)
{
        extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
                        sizeof(struct extent_buffer), 0,
                        SLAB_MEM_SPREAD, NULL);
        if (!extent_buffer_cache)
                return -ENOMEM;

        if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
                        offsetof(struct btrfs_bio, bio),
                        BIOSET_NEED_BVECS))
                goto free_buffer_cache;

        if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
                goto free_bioset;

        return 0;

free_bioset:
        bioset_exit(&btrfs_bioset);

free_buffer_cache:
        kmem_cache_destroy(extent_buffer_cache);
        extent_buffer_cache = NULL;
        return -ENOMEM;
}

void __cold extent_state_cache_exit(void)
{
        btrfs_extent_state_leak_debug_check();
        kmem_cache_destroy(extent_state_cache);
}

void __cold extent_io_exit(void)
{
        /*
         * Make sure all delayed rcu free are flushed before we
         * destroy caches.
         */
        rcu_barrier();
        kmem_cache_destroy(extent_buffer_cache);
        bioset_exit(&btrfs_bioset);
}

/*
 * For the file_extent_tree, we want to hold the inode lock when we look up and
 * update the disk_i_size, but lockdep will complain because for the io_tree we
 * hold the tree lock and then take the inode lock when setting delalloc. These
 * two things are unrelated, so make a class for the file_extent_tree so we
 * don't get the two locking patterns mixed up.
 */
static struct lock_class_key file_extent_tree_class;

void extent_io_tree_init(struct btrfs_fs_info *fs_info,
                         struct extent_io_tree *tree, unsigned int owner,
                         void *private_data)
{
        tree->fs_info = fs_info;
        tree->state = RB_ROOT;
        tree->dirty_bytes = 0;
        spin_lock_init(&tree->lock);
        tree->private_data = private_data;
        tree->owner = owner;
        if (owner == IO_TREE_INODE_FILE_EXTENT)
                lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
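
/*
 * Illustrative sketch (not part of the original file): how a caller is
 * expected to wire up these trees for an inode. The field names ei->io_tree
 * and ei->file_extent_tree are assumptions used only for illustration.
 *
 *	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
 *	extent_io_tree_init(fs_info, &ei->file_extent_tree,
 *			    IO_TREE_INODE_FILE_EXTENT, inode);
 *
 * Only the IO_TREE_INODE_FILE_EXTENT tree is moved into the separate lockdep
 * class above, which keeps its inode-lock-then-tree-lock pattern from being
 * reported against the io_tree's lock ordering.
 */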

void extent_io_tree_release(struct extent_io_tree *tree)
{
        spin_lock(&tree->lock);
        /*
         * Do a single barrier for the waitqueue_active check here, the state
         * of the waitqueue should not change once extent_io_tree_release is
         * called.
         */
        smp_mb();
        while (!RB_EMPTY_ROOT(&tree->state)) {
                struct rb_node *node;
                struct extent_state *state;

                node = rb_first(&tree->state);
                state = rb_entry(node, struct extent_state, rb_node);
                rb_erase(&state->rb_node, &tree->state);
                RB_CLEAR_NODE(&state->rb_node);
                /*
                 * btree io trees aren't supposed to have tasks waiting for
                 * changes in the flags of extent states ever.
                 */
                ASSERT(!waitqueue_active(&state->wq));
                free_extent_state(state);

                cond_resched_lock(&tree->lock);
        }
        spin_unlock(&tree->lock);
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
        struct extent_state *state;

        /*
         * The given mask might not be appropriate for the slab allocator,
         * drop the unsupported bits
         */
        mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
        state = kmem_cache_alloc(extent_state_cache, mask);
        if (!state)
                return state;
        state->state = 0;
        state->failrec = NULL;
        RB_CLEAR_NODE(&state->rb_node);
        btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
        refcount_set(&state->refs, 1);
        init_waitqueue_head(&state->wq);
        trace_alloc_extent_state(state, mask, _RET_IP_);
        return state;
}

void free_extent_state(struct extent_state *state)
{
        if (!state)
                return;
        if (refcount_dec_and_test(&state->refs)) {
                WARN_ON(extent_state_in_tree(state));
                btrfs_leak_debug_del(&leak_lock, &state->leak_list);
                trace_free_extent_state(state, _RET_IP_);
                kmem_cache_free(extent_state_cache, state);
        }
}

/**
 * Search @tree for an entry that contains @offset. Such entry would have
 * entry->start <= offset && entry->end >= offset.
 *
 * @tree:       the tree to search
 * @offset:     offset that should fall within an entry in @tree
 * @node_ret:   pointer where new node should be anchored (used when inserting an
 *              entry in the tree)
 * @parent_ret: points to entry which would have been the parent of the entry,
 *              containing @offset
 *
 * Return a pointer to the entry that contains the @offset byte address and do
 * not change @node_ret and @parent_ret.
 *
 * If no such entry exists, return a pointer to the first entry that ends after
 * @offset (or NULL if there is none) and fill @node_ret and @parent_ret so a
 * new entry can be linked into the tree.
 */
static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree,
                                                     u64 offset,
                                                     struct rb_node ***node_ret,
                                                     struct rb_node **parent_ret)
{
        struct rb_root *root = &tree->state;
        struct rb_node **node = &root->rb_node;
        struct rb_node *prev = NULL;
        struct tree_entry *entry;

        while (*node) {
                prev = *node;
                entry = rb_entry(prev, struct tree_entry, rb_node);

                if (offset < entry->start)
                        node = &(*node)->rb_left;
                else if (offset > entry->end)
                        node = &(*node)->rb_right;
                else
                        return *node;
        }

        if (node_ret)
                *node_ret = node;
        if (parent_ret)
                *parent_ret = prev;

        /* Search neighbors until we find the first one past the end */
        while (prev && offset > entry->end) {
                prev = rb_next(prev);
                entry = rb_entry(prev, struct tree_entry, rb_node);
        }

        return prev;
}
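
/*
 * Illustrative sketch (not part of the original file): how the return value
 * and out-parameters of tree_search_for_insert() are consumed; the variable
 * names are assumptions and the pattern mirrors set_extent_bit() below.
 *
 *	struct rb_node **p;
 *	struct rb_node *parent;
 *	struct rb_node *node;
 *
 *	node = tree_search_for_insert(tree, start, &p, &parent);
 *	if (!node) {
 *		// nothing at or after 'start': link a new state at *p/parent
 *		insert_state_fast(tree, prealloc, p, parent, bits, changeset);
 *	} else {
 *		// entry containing 'start', or the first one ending after it
 *		state = rb_entry(node, struct extent_state, rb_node);
 *	}
 */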

/*
 * Inexact rb-tree search, return the next entry if @offset is not found
 */
static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset)
{
        return tree_search_for_insert(tree, offset, NULL, NULL);
}

/**
 * Search offset in the tree or fill neighbor rbtree node pointers.
 *
 * @tree:      the tree to search
 * @offset:    offset that should fall within an entry in @tree
 * @next_ret:  pointer to the first entry whose range ends after @offset
 * @prev_ret:  pointer to the first entry whose range begins before @offset
 *
 * Return a pointer to the entry that contains @offset byte address. If no
 * such entry exists, then return NULL and fill @prev_ret and @next_ret.
 * Otherwise return the found entry and other pointers are left untouched.
 */
static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree,
                                             u64 offset,
                                             struct rb_node **prev_ret,
                                             struct rb_node **next_ret)
{
        struct rb_root *root = &tree->state;
        struct rb_node **node = &root->rb_node;
        struct rb_node *prev = NULL;
        struct rb_node *orig_prev = NULL;
        struct tree_entry *entry;

        ASSERT(prev_ret);
        ASSERT(next_ret);

        while (*node) {
                prev = *node;
                entry = rb_entry(prev, struct tree_entry, rb_node);

                if (offset < entry->start)
                        node = &(*node)->rb_left;
                else if (offset > entry->end)
                        node = &(*node)->rb_right;
                else
                        return *node;
        }

        orig_prev = prev;
        while (prev && offset > entry->end) {
                prev = rb_next(prev);
                entry = rb_entry(prev, struct tree_entry, rb_node);
        }
        *next_ret = prev;
        prev = orig_prev;

        entry = rb_entry(prev, struct tree_entry, rb_node);
        while (prev && offset < entry->start) {
                prev = rb_prev(prev);
                entry = rb_entry(prev, struct tree_entry, rb_node);
        }
        *prev_ret = prev;

        return NULL;
}
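
/*
 * Illustrative note (not part of the original file): for a tree holding the
 * ranges [0,4] and [10,14], tree_search_prev_next(tree, 7, &prev, &next)
 * returns NULL with prev pointing at [0,4] (closest entry starting before 7)
 * and next pointing at [10,14] (first entry ending after 7), while an offset
 * of 12 returns the [10,14] node directly and leaves both pointers untouched.
 */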

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree. Extents with EXTENT_LOCKED or EXTENT_BOUNDARY in
 * their state field are not merged because the end_io handlers need to be
 * able to do operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
                        struct extent_state *state)
{
        struct extent_state *other;
        struct rb_node *other_node;

        if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
                return;

        other_node = rb_prev(&state->rb_node);
        if (other_node) {
                other = rb_entry(other_node, struct extent_state, rb_node);
                if (other->end == state->start - 1 &&
                    other->state == state->state) {
                        if (tree->private_data &&
                            is_data_inode(tree->private_data))
                                btrfs_merge_delalloc_extent(tree->private_data,
                                                            state, other);
                        state->start = other->start;
                        rb_erase(&other->rb_node, &tree->state);
                        RB_CLEAR_NODE(&other->rb_node);
                        free_extent_state(other);
                }
        }
        other_node = rb_next(&state->rb_node);
        if (other_node) {
                other = rb_entry(other_node, struct extent_state, rb_node);
                if (other->start == state->end + 1 &&
                    other->state == state->state) {
                        if (tree->private_data &&
                            is_data_inode(tree->private_data))
                                btrfs_merge_delalloc_extent(tree->private_data,
                                                            state, other);
                        state->end = other->end;
                        rb_erase(&other->rb_node, &tree->state);
                        RB_CLEAR_NODE(&other->rb_node);
                        free_extent_state(other);
                }
        }
}
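
/*
 * Illustrative note (not part of the original file): if the tree holds
 * [0,4], [5,9] and [10,14] and all three carry exactly the same state bits,
 * merge_state() on the middle entry first absorbs [0,4] (extending the start)
 * and then [10,14] (extending the end), leaving a single [0,14] entry.
 * Neighbours whose bits differ are left alone.
 */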

static void set_state_bits(struct extent_io_tree *tree,
                           struct extent_state *state, u32 bits,
                           struct extent_changeset *changeset);

/*
 * insert an extent_state struct into the tree. 'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally. This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
                        struct extent_state *state,
                        u32 bits, struct extent_changeset *changeset)
{
        struct rb_node **node;
        struct rb_node *parent;
        const u64 end = state->end;

        set_state_bits(tree, state, bits, changeset);

        node = &tree->state.rb_node;
        while (*node) {
                struct tree_entry *entry;

                parent = *node;
                entry = rb_entry(parent, struct tree_entry, rb_node);

                if (end < entry->start) {
                        node = &(*node)->rb_left;
                } else if (end > entry->end) {
                        node = &(*node)->rb_right;
                } else {
                        btrfs_err(tree->fs_info,
                               "found node %llu %llu on insert of %llu %llu",
                               entry->start, entry->end, state->start, end);
                        return -EEXIST;
                }
        }

        rb_link_node(&state->rb_node, parent, node);
        rb_insert_color(&state->rb_node, &tree->state);

        merge_state(tree, state);
        return 0;
}

/*
 * Insert state into @tree at the location given by @node and @parent.
 */
static void insert_state_fast(struct extent_io_tree *tree,
                              struct extent_state *state, struct rb_node **node,
                              struct rb_node *parent, unsigned bits,
                              struct extent_changeset *changeset)
{
        set_state_bits(tree, state, bits, changeset);
        rb_link_node(&state->rb_node, parent, node);
        rb_insert_color(&state->rb_node, &tree->state);
        merge_state(tree, state);
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created first half. 'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
                       struct extent_state *prealloc, u64 split)
{
        struct rb_node *parent = NULL;
        struct rb_node **node;

        if (tree->private_data && is_data_inode(tree->private_data))
                btrfs_split_delalloc_extent(tree->private_data, orig, split);

        prealloc->start = orig->start;
        prealloc->end = split - 1;
        prealloc->state = orig->state;
        orig->start = split;

        parent = &orig->rb_node;
        node = &parent;
        while (*node) {
                struct tree_entry *entry;

                parent = *node;
                entry = rb_entry(parent, struct tree_entry, rb_node);

                if (prealloc->end < entry->start) {
                        node = &(*node)->rb_left;
                } else if (prealloc->end > entry->end) {
                        node = &(*node)->rb_right;
                } else {
                        free_extent_state(prealloc);
                        return -EEXIST;
                }
        }

        rb_link_node(&prealloc->rb_node, parent, node);
        rb_insert_color(&prealloc->rb_node, &tree->state);

        return 0;
}
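
/*
 * Illustrative note (not part of the original file): splitting a state that
 * covers [0,15] at split == 8 leaves 'orig' as [8,15] in place and links the
 * preallocated state in as [0,7], both halves keeping the original bits;
 * callers then operate on whichever half overlaps the requested range.
 */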

static struct extent_state *next_state(struct extent_state *state)
{
        struct rb_node *next = rb_next(&state->rb_node);
        if (next)
                return rb_entry(next, struct extent_state, rb_node);
        else
                return NULL;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up anyone waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
                                            struct extent_state *state,
                                            u32 bits, int wake,
                                            struct extent_changeset *changeset)
{
        struct extent_state *next;
        u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
        int ret;

        if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
                u64 range = state->end - state->start + 1;
                WARN_ON(range > tree->dirty_bytes);
                tree->dirty_bytes -= range;
        }

        if (tree->private_data && is_data_inode(tree->private_data))
                btrfs_clear_delalloc_extent(tree->private_data, state, bits);

        ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
        BUG_ON(ret < 0);
        state->state &= ~bits_to_clear;
        if (wake)
                wake_up(&state->wq);
        if (state->state == 0) {
                next = next_state(state);
                if (extent_state_in_tree(state)) {
                        rb_erase(&state->rb_node, &tree->state);
                        RB_CLEAR_NODE(&state->rb_node);
                        free_extent_state(state);
                } else {
                        WARN_ON(1);
                }
        } else {
                merge_state(tree, state);
                next = next_state(state);
        }
        return next;
}
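
/*
 * Illustrative note (not part of the original file): clearing EXTENT_DIRTY
 * from a state that had only EXTENT_DIRTY set leaves state->state == 0, so
 * the node is erased from the tree, freed, and the following state returned;
 * if other bits remain the state stays in the tree and is merged with equal
 * neighbours instead.
 */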

static struct extent_state *
alloc_extent_state_atomic(struct extent_state *prealloc)
{
        if (!prealloc)
                prealloc = alloc_extent_state(GFP_ATOMIC);

        return prealloc;
}

static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
        btrfs_panic(tree->fs_info, err,
        "locking error: extent tree was modified by another thread while locked");
}

/*
 * clear some bits on a range in the tree. This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                       u32 bits, int wake, int delete,
                       struct extent_state **cached_state,
                       gfp_t mask, struct extent_changeset *changeset)
{
        struct extent_state *state;
        struct extent_state *cached;
        struct extent_state *prealloc = NULL;
        struct rb_node *node;
        u64 last_end;
        int err;
        int clear = 0;

        btrfs_debug_check_extent_io_range(tree, start, end);
        trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);

        if (bits & EXTENT_DELALLOC)
                bits |= EXTENT_NORESERVE;

        if (delete)
                bits |= ~EXTENT_CTLBITS;

        if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
                clear = 1;
again:
        if (!prealloc && gfpflags_allow_blocking(mask)) {
                /*
                 * Don't care for allocation failure here because we might end
                 * up not needing the pre-allocated extent state at all, which
                 * is the case if we only have in the tree extent states that
                 * cover our input range and don't cover any other range.
                 * If we end up needing a new extent state we allocate it later.
                 */
                prealloc = alloc_extent_state(mask);
        }

        spin_lock(&tree->lock);
        if (cached_state) {
                cached = *cached_state;

                if (clear) {
                        *cached_state = NULL;
                        cached_state = NULL;
                }

                if (cached && extent_state_in_tree(cached) &&
                    cached->start <= start && cached->end > start) {
                        if (clear)
                                refcount_dec(&cached->refs);
                        state = cached;
                        goto hit_next;
                }
                if (clear)
                        free_extent_state(cached);
        }
        /*
         * this search will find the extents that end after
         * our range starts
         */
        node = tree_search(tree, start);
        if (!node)
                goto out;
        state = rb_entry(node, struct extent_state, rb_node);
hit_next:
        if (state->start > end)
                goto out;
        WARN_ON(state->end < start);
        last_end = state->end;

        /* the state doesn't have the wanted bits, go ahead */
        if (!(state->state & bits)) {
                state = next_state(state);
                goto next;
        }

        /*
         *     | ---- desired range ---- |
         *  | state | or
         *  | ------------- state -------------- |
         *
         * We need to split the extent we found, and may flip
         * bits on second half.
         *
         * If the extent we found extends past our range, we
         * just split and search again. It'll get split again
         * the next time though.
         *
         * If the extent we found is inside our range, we clear
         * the desired bit on it.
         */

        if (state->start < start) {
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, start);
                if (err)
                        extent_io_tree_panic(tree, err);

                prealloc = NULL;
                if (err)
                        goto out;
                if (state->end <= end) {
                        state = clear_state_bit(tree, state, bits, wake, changeset);
                        goto next;
                }
                goto search_again;
        }
        /*
         * | ---- desired range ---- |
         *                        | state |
         * We need to split the extent, and clear the bit
         * on the first half
         */
        if (state->start <= end && state->end > end) {
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, end + 1);
                if (err)
                        extent_io_tree_panic(tree, err);

                if (wake)
                        wake_up(&state->wq);

                clear_state_bit(tree, prealloc, bits, wake, changeset);

                prealloc = NULL;
                goto out;
        }

        state = clear_state_bit(tree, state, bits, wake, changeset);
next:
        if (last_end == (u64)-1)
                goto out;
        start = last_end + 1;
        if (start <= end && state && !need_resched())
                goto hit_next;

search_again:
        if (start > end)
                goto out;
        spin_unlock(&tree->lock);
        if (gfpflags_allow_blocking(mask))
                cond_resched();
        goto again;

out:
        spin_unlock(&tree->lock);
        if (prealloc)
                free_extent_state(prealloc);

        return 0;

}
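
/*
 * Illustrative sketch (not part of the original file): __clear_extent_bit()
 * is normally reached through thin wrappers (e.g. the helpers declared in
 * extent-io-tree.h); the exact call below is an assumption for illustration
 * only.
 *
 *	__clear_extent_bit(&inode->io_tree, start, end,
 *			   EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0,
 *			   &cached_state, GFP_NOFS, NULL);
 *
 * Passing wake == 1 additionally wakes anyone blocked in wait_extent_bit(),
 * and delete == 1 widens 'bits' so that every non-control bit is cleared and
 * the states covering the range go away entirely.
 */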

static void wait_on_state(struct extent_io_tree *tree,
                          struct extent_state *state)
                __releases(tree->lock)
                __acquires(tree->lock)
{
        DEFINE_WAIT(wait);
        prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
        spin_unlock(&tree->lock);
        schedule();
        spin_lock(&tree->lock);
        finish_wait(&state->wq, &wait);
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                            u32 bits)
{
        struct extent_state *state;
        struct rb_node *node;

        btrfs_debug_check_extent_io_range(tree, start, end);

        spin_lock(&tree->lock);
again:
        while (1) {
                /*
                 * this search will find all the extents that end after
                 * our range starts
                 */
                node = tree_search(tree, start);
process_node:
                if (!node)
                        break;

                state = rb_entry(node, struct extent_state, rb_node);

                if (state->start > end)
                        goto out;

                if (state->state & bits) {
                        start = state->start;
                        refcount_inc(&state->refs);
                        wait_on_state(tree, state);
                        free_extent_state(state);
                        goto again;
                }
                start = state->end + 1;

                if (start > end)
                        break;

                if (!cond_resched_lock(&tree->lock)) {
                        node = rb_next(node);
                        goto process_node;
                }
        }
out:
        spin_unlock(&tree->lock);
}

static void set_state_bits(struct extent_io_tree *tree,
                           struct extent_state *state,
                           u32 bits, struct extent_changeset *changeset)
{
        u32 bits_to_set = bits & ~EXTENT_CTLBITS;
        int ret;

        if (tree->private_data && is_data_inode(tree->private_data))
                btrfs_set_delalloc_extent(tree->private_data, state, bits);

        if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
                u64 range = state->end - state->start + 1;
                tree->dirty_bytes += range;
        }
        ret = add_extent_changeset(state, bits_to_set, changeset, 1);
        BUG_ON(ret < 0);
        state->state |= bits_to_set;
}

static void cache_state_if_flags(struct extent_state *state,
                                 struct extent_state **cached_ptr,
                                 unsigned flags)
{
        if (cached_ptr && !(*cached_ptr)) {
                if (!flags || (state->state & flags)) {
                        *cached_ptr = state;
                        refcount_inc(&state->refs);
                }
        }
}

static void cache_state(struct extent_state *state,
                        struct extent_state **cached_ptr)
{
        return cache_state_if_flags(state, cached_ptr,
                                    EXTENT_LOCKED | EXTENT_BOUNDARY);
}

/*
 * set some bits on a range in the tree. This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set. The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive. This takes the tree lock.
 */
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
                   u32 exclusive_bits, u64 *failed_start,
                   struct extent_state **cached_state, gfp_t mask,
                   struct extent_changeset *changeset)
{
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
        struct rb_node *node;
        struct rb_node **p;
        struct rb_node *parent;
        int err = 0;
        u64 last_start;
        u64 last_end;

        btrfs_debug_check_extent_io_range(tree, start, end);
        trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);

        if (exclusive_bits)
                ASSERT(failed_start);
        else
                ASSERT(failed_start == NULL);
again:
        if (!prealloc && gfpflags_allow_blocking(mask)) {
                /*
                 * Don't care for allocation failure here because we might end
                 * up not needing the pre-allocated extent state at all, which
                 * is the case if we only have in the tree extent states that
                 * cover our input range and don't cover any other range.
                 * If we end up needing a new extent state we allocate it later.
                 */
                prealloc = alloc_extent_state(mask);
        }

        spin_lock(&tree->lock);
        if (cached_state && *cached_state) {
                state = *cached_state;
                if (state->start <= start && state->end > start &&
                    extent_state_in_tree(state)) {
                        node = &state->rb_node;
                        goto hit_next;
                }
        }
        /*
         * this search will find all the extents that end after
         * our range starts.
         */
        node = tree_search_for_insert(tree, start, &p, &parent);
        if (!node) {
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                prealloc->start = start;
                prealloc->end = end;
                insert_state_fast(tree, prealloc, p, parent, bits, changeset);
                cache_state(prealloc, cached_state);
                prealloc = NULL;
                goto out;
        }
        state = rb_entry(node, struct extent_state, rb_node);
hit_next:
        last_start = state->start;
        last_end = state->end;

        /*
         * | ---- desired range ---- |
         * | state |
         *
         * Just lock what we found and keep going
         */
        if (state->start == start && state->end <= end) {
                if (state->state & exclusive_bits) {
                        *failed_start = state->start;
                        err = -EEXIST;
                        goto out;
                }

                set_state_bits(tree, state, bits, changeset);
                cache_state(state, cached_state);
                merge_state(tree, state);
                if (last_end == (u64)-1)
                        goto out;
                start = last_end + 1;
                state = next_state(state);
                if (start < end && state && state->start == start &&
                    !need_resched())
                        goto hit_next;
                goto search_again;
        }

        /*
         *     | ---- desired range ---- |
         * | state |
         *   or
         * | ------------- state -------------- |
         *
         * We need to split the extent we found, and may flip bits on
         * second half.
         *
         * If the extent we found extends past our
         * range, we just split and search again. It'll get split
         * again the next time though.
         *
         * If the extent we found is inside our range, we set the
         * desired bit on it.
         */
        if (state->start < start) {
                if (state->state & exclusive_bits) {
                        *failed_start = start;
                        err = -EEXIST;
                        goto out;
                }

                /*
                 * If this extent already has all the bits we want set, then
                 * skip it, not necessary to split it or do anything with it.
                 */
                if ((state->state & bits) == bits) {
                        start = state->end + 1;
                        cache_state(state, cached_state);
                        goto search_again;
                }

                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, start);
                if (err)
                        extent_io_tree_panic(tree, err);

                prealloc = NULL;
                if (err)
                        goto out;
                if (state->end <= end) {
                        set_state_bits(tree, state, bits, changeset);
                        cache_state(state, cached_state);
                        merge_state(tree, state);
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
                        state = next_state(state);
                        if (start < end && state && state->start == start &&
                            !need_resched())
                                goto hit_next;
                }
                goto search_again;
        }
        /*
         * | ---- desired range ---- |
         *     | state | or               | state |
         *
         * There's a hole, we need to insert something in it and
         * ignore the extent we found.
         */
        if (state->start > start) {
                u64 this_end;
                if (end < last_start)
                        this_end = end;
                else
                        this_end = last_start - 1;

                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);

                /*
                 * Avoid to free 'prealloc' if it can be merged with
                 * the later extent.
                 */
                prealloc->start = start;
                prealloc->end = this_end;
                err = insert_state(tree, prealloc, bits, changeset);
                if (err)
                        extent_io_tree_panic(tree, err);

                cache_state(prealloc, cached_state);
                prealloc = NULL;
                start = this_end + 1;
                goto search_again;
        }
        /*
         * | ---- desired range ---- |
         *                        | state |
         * We need to split the extent, and set the bit
         * on the first half
         */
        if (state->start <= end && state->end > end) {
                if (state->state & exclusive_bits) {
                        *failed_start = start;
                        err = -EEXIST;
                        goto out;
                }

                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, end + 1);
                if (err)
                        extent_io_tree_panic(tree, err);

                set_state_bits(tree, prealloc, bits, changeset);
                cache_state(prealloc, cached_state);
                merge_state(tree, prealloc);
                prealloc = NULL;
                goto out;
        }

search_again:
        if (start > end)
                goto out;
        spin_unlock(&tree->lock);
        if (gfpflags_allow_blocking(mask))
                cond_resched();
        goto again;

out:
        spin_unlock(&tree->lock);
        if (prealloc)
                free_extent_state(prealloc);

        return err;

}
Chris Masond1310b22008-01-24 16:13:08 -05001224
Josef Bacik462d6fa2011-09-26 13:56:12 -04001225/**
Liu Bo10983f22012-07-11 15:26:19 +08001226 * convert_extent_bit - convert all bits in a given range from one bit to
1227 * another
Josef Bacik462d6fa2011-09-26 13:56:12 -04001228 * @tree: the io tree to search
1229 * @start: the start offset in bytes
1230 * @end: the end offset in bytes (inclusive)
1231 * @bits: the bits to set in this range
1232 * @clear_bits: the bits to clear in this range
Josef Bacike6138872012-09-27 17:07:30 -04001233 * @cached_state: state that we're going to cache
Josef Bacik462d6fa2011-09-26 13:56:12 -04001234 *
1235 * This will go through and set bits for the given range. If any states exist
1236 * already in this range they are set with the given bit and cleared of the
1237 * clear_bits. This is only meant to be used by things that are mergeable, ie
1238 * converting from say DELALLOC to DIRTY. This is not meant to be used with
1239 * boundary bits like LOCK.
David Sterba210aa272016-04-26 23:54:39 +02001240 *
1241 * All allocations are done with GFP_NOFS.
Josef Bacik462d6fa2011-09-26 13:56:12 -04001242 */
1243int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
Qu Wenruof97e27e2020-11-13 20:51:40 +08001244 u32 bits, u32 clear_bits,
David Sterba210aa272016-04-26 23:54:39 +02001245 struct extent_state **cached_state)
Josef Bacik462d6fa2011-09-26 13:56:12 -04001246{
1247 struct extent_state *state;
1248 struct extent_state *prealloc = NULL;
1249 struct rb_node *node;
Filipe David Borba Manana12cfbad2013-11-26 15:41:47 +00001250 struct rb_node **p;
1251 struct rb_node *parent;
Josef Bacik462d6fa2011-09-26 13:56:12 -04001252 int err = 0;
1253 u64 last_start;
1254 u64 last_end;
Filipe Mananac8fd3de2014-10-13 12:28:39 +01001255 bool first_iteration = true;
Josef Bacik462d6fa2011-09-26 13:56:12 -04001256
Josef Bacika5dee372013-12-13 10:02:44 -05001257 btrfs_debug_check_extent_io_range(tree, start, end);
Qu Wenruoa1d19842019-03-01 10:48:00 +08001258 trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
1259 clear_bits);
David Sterba8d599ae2013-04-30 15:22:23 +00001260
Josef Bacik462d6fa2011-09-26 13:56:12 -04001261again:
David Sterba210aa272016-04-26 23:54:39 +02001262 if (!prealloc) {
Filipe Mananac8fd3de2014-10-13 12:28:39 +01001263 /*
1264 * Best effort, don't worry if extent state allocation fails
1265 * here for the first iteration. We might have a cached state
1266 * that matches exactly the target range, in which case no
1267 * extent state allocations are needed. We'll only know this
1268 * after locking the tree.
1269 */
David Sterba210aa272016-04-26 23:54:39 +02001270 prealloc = alloc_extent_state(GFP_NOFS);
Filipe Mananac8fd3de2014-10-13 12:28:39 +01001271 if (!prealloc && !first_iteration)
Josef Bacik462d6fa2011-09-26 13:56:12 -04001272 return -ENOMEM;
1273 }
1274
1275 spin_lock(&tree->lock);
Josef Bacike6138872012-09-27 17:07:30 -04001276 if (cached_state && *cached_state) {
1277 state = *cached_state;
1278 if (state->start <= start && state->end > start &&
Filipe Manana27a35072014-07-06 20:09:59 +01001279 extent_state_in_tree(state)) {
Josef Bacike6138872012-09-27 17:07:30 -04001280 node = &state->rb_node;
1281 goto hit_next;
1282 }
1283 }
1284
Josef Bacik462d6fa2011-09-26 13:56:12 -04001285 /*
1286 * this search will find all the extents that end after
1287 * our range starts.
1288 */
Filipe David Borba Manana12cfbad2013-11-26 15:41:47 +00001289 node = tree_search_for_insert(tree, start, &p, &parent);
Josef Bacik462d6fa2011-09-26 13:56:12 -04001290 if (!node) {
1291 prealloc = alloc_extent_state_atomic(prealloc);
Liu Bo1cf4ffd2011-12-07 20:08:40 -05001292 if (!prealloc) {
1293 err = -ENOMEM;
1294 goto out;
1295 }
David Sterbacee51262020-06-25 17:18:24 +02001296 prealloc->start = start;
1297 prealloc->end = end;
David Sterbafb8f07d2020-06-25 18:11:31 +02001298 insert_state_fast(tree, prealloc, p, parent, bits, NULL);
Filipe David Borba Mananac42ac0b2013-11-26 15:01:34 +00001299 cache_state(prealloc, cached_state);
1300 prealloc = NULL;
Josef Bacik462d6fa2011-09-26 13:56:12 -04001301 goto out;
1302 }
1303 state = rb_entry(node, struct extent_state, rb_node);
1304hit_next:
1305 last_start = state->start;
1306 last_end = state->end;
1307
1308 /*
1309 * | ---- desired range ---- |
1310 * | state |
1311 *
1312 * Just lock what we found and keep going
1313 */
1314 if (state->start == start && state->end <= end) {
David Sterba6d92b302020-06-25 17:54:54 +02001315 set_state_bits(tree, state, bits, NULL);
Josef Bacike6138872012-09-27 17:07:30 -04001316 cache_state(state, cached_state);
David Sterba6d92b302020-06-25 17:54:54 +02001317 state = clear_state_bit(tree, state, clear_bits, 0, NULL);
Josef Bacik462d6fa2011-09-26 13:56:12 -04001318 if (last_end == (u64)-1)
1319 goto out;
Josef Bacik462d6fa2011-09-26 13:56:12 -04001320 start = last_end + 1;
Liu Bod1ac6e42012-05-10 18:10:39 +08001321 if (start < end && state && state->start == start &&
1322 !need_resched())
1323 goto hit_next;
Josef Bacik462d6fa2011-09-26 13:56:12 -04001324 goto search_again;
1325 }
1326
1327 /*
1328 * | ---- desired range ---- |
1329 * | state |
1330 * or
1331 * | ------------- state -------------- |
1332 *
1333 * We need to split the extent we found, and may flip bits on
1334 * second half.
1335 *
1336 * If the extent we found extends past our
1337 * range, we just split and search again. It'll get split
1338 * again the next time though.
1339 *
1340 * If the extent we found is inside our range, we set the
1341 * desired bit on it.
1342 */
1343 if (state->start < start) {
1344 prealloc = alloc_extent_state_atomic(prealloc);
Liu Bo1cf4ffd2011-12-07 20:08:40 -05001345 if (!prealloc) {
1346 err = -ENOMEM;
1347 goto out;
1348 }
Josef Bacik462d6fa2011-09-26 13:56:12 -04001349 err = split_state(tree, state, prealloc, start);
Jeff Mahoneyc2d904e2011-10-03 23:22:32 -04001350 if (err)
1351 extent_io_tree_panic(tree, err);
Josef Bacik462d6fa2011-09-26 13:56:12 -04001352 prealloc = NULL;
1353 if (err)
1354 goto out;
1355 if (state->end <= end) {
David Sterba6d92b302020-06-25 17:54:54 +02001356 set_state_bits(tree, state, bits, NULL);
Josef Bacike6138872012-09-27 17:07:30 -04001357 cache_state(state, cached_state);
David Sterba6d92b302020-06-25 17:54:54 +02001358 state = clear_state_bit(tree, state, clear_bits, 0, NULL);
Josef Bacik462d6fa2011-09-26 13:56:12 -04001359 if (last_end == (u64)-1)
1360 goto out;
1361 start = last_end + 1;
Liu Bod1ac6e42012-05-10 18:10:39 +08001362 if (start < end && state && state->start == start &&
1363 !need_resched())
1364 goto hit_next;
Josef Bacik462d6fa2011-09-26 13:56:12 -04001365 }
1366 goto search_again;
1367 }
1368 /*
1369 * | ---- desired range ---- |
1370 * | state | or | state |
1371 *
1372 * There's a hole, we need to insert something in it and
1373 * ignore the extent we found.
1374 */
1375 if (state->start > start) {
1376 u64 this_end;
1377 if (end < last_start)
1378 this_end = end;
1379 else
1380 this_end = last_start - 1;
1381
1382 prealloc = alloc_extent_state_atomic(prealloc);
Liu Bo1cf4ffd2011-12-07 20:08:40 -05001383 if (!prealloc) {
1384 err = -ENOMEM;
1385 goto out;
1386 }
Josef Bacik462d6fa2011-09-26 13:56:12 -04001387
1388 /*
1389	 * Avoid freeing 'prealloc' if it can be merged with
1390	 * the later extent.
1391 */
David Sterbacee51262020-06-25 17:18:24 +02001392 prealloc->start = start;
1393 prealloc->end = this_end;
David Sterbac3676022020-06-25 18:15:31 +02001394 err = insert_state(tree, prealloc, bits, NULL);
Jeff Mahoneyc2d904e2011-10-03 23:22:32 -04001395 if (err)
1396 extent_io_tree_panic(tree, err);
Josef Bacike6138872012-09-27 17:07:30 -04001397 cache_state(prealloc, cached_state);
Josef Bacik462d6fa2011-09-26 13:56:12 -04001398 prealloc = NULL;
1399 start = this_end + 1;
1400 goto search_again;
1401 }
1402 /*
1403 * | ---- desired range ---- |
1404 * | state |
1405 * We need to split the extent, and set the bit
1406 * on the first half
1407 */
1408 if (state->start <= end && state->end > end) {
1409 prealloc = alloc_extent_state_atomic(prealloc);
Liu Bo1cf4ffd2011-12-07 20:08:40 -05001410 if (!prealloc) {
1411 err = -ENOMEM;
1412 goto out;
1413 }
Josef Bacik462d6fa2011-09-26 13:56:12 -04001414
1415 err = split_state(tree, state, prealloc, end + 1);
Jeff Mahoneyc2d904e2011-10-03 23:22:32 -04001416 if (err)
1417 extent_io_tree_panic(tree, err);
Josef Bacik462d6fa2011-09-26 13:56:12 -04001418
David Sterba6d92b302020-06-25 17:54:54 +02001419 set_state_bits(tree, prealloc, bits, NULL);
Josef Bacike6138872012-09-27 17:07:30 -04001420 cache_state(prealloc, cached_state);
David Sterba6d92b302020-06-25 17:54:54 +02001421 clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
Josef Bacik462d6fa2011-09-26 13:56:12 -04001422 prealloc = NULL;
1423 goto out;
1424 }
1425
Josef Bacik462d6fa2011-09-26 13:56:12 -04001426search_again:
1427 if (start > end)
1428 goto out;
1429 spin_unlock(&tree->lock);
David Sterba210aa272016-04-26 23:54:39 +02001430 cond_resched();
Filipe Mananac8fd3de2014-10-13 12:28:39 +01001431 first_iteration = false;
Josef Bacik462d6fa2011-09-26 13:56:12 -04001432 goto again;
Josef Bacik462d6fa2011-09-26 13:56:12 -04001433
1434out:
1435 spin_unlock(&tree->lock);
1436 if (prealloc)
1437 free_extent_state(prealloc);
1438
1439 return err;
1440}
1441
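/*
 * Illustrative sketch, not part of the original file: a hypothetical caller
 * that atomically converts a delalloc range to dirty, the case the comment
 * above names ("converting from say DELALLOC to DIRTY"). Only the helper
 * name and the idea of calling it here are made up.
 */
static int __maybe_unused demo_convert_delalloc_to_dirty(struct extent_io_tree *tree,
							  u64 start, u64 end)
{
	struct extent_state *cached = NULL;
	int ret;

	/* Set EXTENT_DIRTY and clear EXTENT_DELALLOC on every state in [start, end]. */
	ret = convert_extent_bit(tree, start, end, EXTENT_DIRTY,
				 EXTENT_DELALLOC, &cached);
	/* Drop the reference that cache_state() may have taken. */
	free_extent_state(cached);
	return ret;
}
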
Chris Masond1310b22008-01-24 16:13:08 -05001442/* wrappers around set/clear extent bit */
Qu Wenruod38ed272015-10-12 14:53:37 +08001443int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
Qu Wenruof97e27e2020-11-13 20:51:40 +08001444 u32 bits, struct extent_changeset *changeset)
Qu Wenruod38ed272015-10-12 14:53:37 +08001445{
1446 /*
1447 * We don't support EXTENT_LOCKED yet, as current changeset will
1448 * record any bits changed, so for EXTENT_LOCKED case, it will
1449 * either fail with -EEXIST or changeset will record the whole
1450 * range.
1451 */
1452 BUG_ON(bits & EXTENT_LOCKED);
1453
Nikolay Borisov1cab5e72020-11-05 11:08:00 +02001454 return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
1455 changeset);
Qu Wenruod38ed272015-10-12 14:53:37 +08001456}
1457
Nikolay Borisov4ca73652019-03-27 14:24:10 +02001458int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
Qu Wenruof97e27e2020-11-13 20:51:40 +08001459 u32 bits)
Nikolay Borisov4ca73652019-03-27 14:24:10 +02001460{
Nikolay Borisov1cab5e72020-11-05 11:08:00 +02001461 return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
1462 GFP_NOWAIT, NULL);
Nikolay Borisov4ca73652019-03-27 14:24:10 +02001463}
1464
Qu Wenruofefdc552015-10-12 15:35:38 +08001465int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
Qu Wenruof97e27e2020-11-13 20:51:40 +08001466 u32 bits, int wake, int delete,
David Sterbaae0f1622017-10-31 16:37:52 +01001467 struct extent_state **cached)
Qu Wenruofefdc552015-10-12 15:35:38 +08001468{
1469 return __clear_extent_bit(tree, start, end, bits, wake, delete,
David Sterbaae0f1622017-10-31 16:37:52 +01001470 cached, GFP_NOFS, NULL);
Qu Wenruofefdc552015-10-12 15:35:38 +08001471}
1472
Qu Wenruofefdc552015-10-12 15:35:38 +08001473int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
Qu Wenruof97e27e2020-11-13 20:51:40 +08001474 u32 bits, struct extent_changeset *changeset)
Qu Wenruofefdc552015-10-12 15:35:38 +08001475{
1476 /*
1477 * Don't support EXTENT_LOCKED case, same reason as
1478 * set_record_extent_bits().
1479 */
1480 BUG_ON(bits & EXTENT_LOCKED);
1481
David Sterbaf734c442016-04-26 23:54:39 +02001482 return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
Qu Wenruofefdc552015-10-12 15:35:38 +08001483 changeset);
1484}
1485
Chris Masond352ac62008-09-29 15:18:18 -04001486/*
1487 * Either insert or lock the state struct between start and end. Use mask to
1488 * tell us if waiting is desired.
1489 */
Chris Mason1edbb732009-09-02 13:24:36 -04001490int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
David Sterbaff13db42015-12-03 14:30:40 +01001491 struct extent_state **cached_state)
Chris Masond1310b22008-01-24 16:13:08 -05001492{
1493 int err;
1494 u64 failed_start;
David Sterba9ee49a042015-01-14 19:52:13 +01001495
Chris Masond1310b22008-01-24 16:13:08 -05001496 while (1) {
Nikolay Borisov1cab5e72020-11-05 11:08:00 +02001497 err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
1498 EXTENT_LOCKED, &failed_start,
1499 cached_state, GFP_NOFS, NULL);
Jeff Mahoneyd0082372012-03-01 14:57:19 +01001500 if (err == -EEXIST) {
Chris Masond1310b22008-01-24 16:13:08 -05001501 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1502 start = failed_start;
Jeff Mahoneyd0082372012-03-01 14:57:19 +01001503 } else
Chris Masond1310b22008-01-24 16:13:08 -05001504 break;
Chris Masond1310b22008-01-24 16:13:08 -05001505 WARN_ON(start > end);
1506 }
1507 return err;
1508}
Chris Masond1310b22008-01-24 16:13:08 -05001509
Jeff Mahoneyd0082372012-03-01 14:57:19 +01001510int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
Josef Bacik251792012008-10-29 14:49:05 -04001511{
1512 int err;
1513 u64 failed_start;
1514
Nikolay Borisov1cab5e72020-11-05 11:08:00 +02001515 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1516 &failed_start, NULL, GFP_NOFS, NULL);
Yan Zheng66435582008-10-30 14:19:50 -04001517 if (err == -EEXIST) {
1518 if (failed_start > start)
1519 clear_extent_bit(tree, start, failed_start - 1,
David Sterbaae0f1622017-10-31 16:37:52 +01001520 EXTENT_LOCKED, 1, 0, NULL);
Josef Bacik251792012008-10-29 14:49:05 -04001521 return 0;
Yan Zheng66435582008-10-30 14:19:50 -04001522 }
Josef Bacik251792012008-10-29 14:49:05 -04001523 return 1;
1524}
Josef Bacik251792012008-10-29 14:49:05 -04001525
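/*
 * Illustrative sketch, not one of the original call sites: the usual
 * lock/unlock pairing around an extent range. unlock_extent_cached() is the
 * real counterpart used later in this file; the helper name and the work
 * done while locked are hypothetical.
 */
static void __maybe_unused demo_locked_range_work(struct extent_io_tree *tree,
						  u64 start, u64 end)
{
	struct extent_state *cached = NULL;

	/* Waits on EXTENT_LOCKED until the whole range [start, end] is ours. */
	lock_extent_bits(tree, start, end, &cached);

	/* ... operate on [start, end] while it cannot change under us ... */

	/* Clears EXTENT_LOCKED and drops the cached state reference. */
	unlock_extent_cached(tree, start, end, &cached);
}
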
David Sterbabd1fa4f2015-12-03 13:08:59 +01001526void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
Chris Mason4adaa612013-03-26 13:07:00 -04001527{
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03001528 unsigned long index = start >> PAGE_SHIFT;
1529 unsigned long end_index = end >> PAGE_SHIFT;
Chris Mason4adaa612013-03-26 13:07:00 -04001530 struct page *page;
1531
1532 while (index <= end_index) {
1533 page = find_get_page(inode->i_mapping, index);
1534 BUG_ON(!page); /* Pages should be in the extent_io_tree */
1535 clear_page_dirty_for_io(page);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03001536 put_page(page);
Chris Mason4adaa612013-03-26 13:07:00 -04001537 index++;
1538 }
Chris Mason4adaa612013-03-26 13:07:00 -04001539}
1540
David Sterbaf6311572015-12-03 13:08:59 +01001541void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
Chris Mason4adaa612013-03-26 13:07:00 -04001542{
Matthew Wilcox (Oracle)ebf55c82022-02-09 20:22:04 +00001543 struct address_space *mapping = inode->i_mapping;
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03001544 unsigned long index = start >> PAGE_SHIFT;
1545 unsigned long end_index = end >> PAGE_SHIFT;
Matthew Wilcox (Oracle)ebf55c82022-02-09 20:22:04 +00001546 struct folio *folio;
Chris Mason4adaa612013-03-26 13:07:00 -04001547
1548 while (index <= end_index) {
Matthew Wilcox (Oracle)ebf55c82022-02-09 20:22:04 +00001549 folio = filemap_get_folio(mapping, index);
1550 filemap_dirty_folio(mapping, folio);
1551 folio_account_redirty(folio);
1552 index += folio_nr_pages(folio);
1553 folio_put(folio);
Chris Mason4adaa612013-03-26 13:07:00 -04001554 }
Chris Mason4adaa612013-03-26 13:07:00 -04001555}
1556
Chris Masond352ac62008-09-29 15:18:18 -04001557/* find the first state struct with 'bits' set after 'start', and
1558 * return it. tree->lock must be held. NULL will be returned if
1559 * nothing was found after 'start'.
1560 */
Eric Sandeen48a3b632013-04-25 20:41:01 +00001561static struct extent_state *
Qu Wenruof97e27e2020-11-13 20:51:40 +08001562find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
Chris Masond7fc6402008-02-18 12:12:38 -05001563{
1564 struct rb_node *node;
1565 struct extent_state *state;
1566
1567 /*
1568 * this search will find all the extents that end after
1569 * our range starts.
1570 */
1571 node = tree_search(tree, start);
Chris Masond3977122009-01-05 21:25:51 -05001572 if (!node)
Chris Masond7fc6402008-02-18 12:12:38 -05001573 goto out;
Chris Masond7fc6402008-02-18 12:12:38 -05001574
Chris Masond3977122009-01-05 21:25:51 -05001575 while (1) {
Chris Masond7fc6402008-02-18 12:12:38 -05001576 state = rb_entry(node, struct extent_state, rb_node);
Chris Masond3977122009-01-05 21:25:51 -05001577 if (state->end >= start && (state->state & bits))
Chris Masond7fc6402008-02-18 12:12:38 -05001578 return state;
Chris Masond3977122009-01-05 21:25:51 -05001579
Chris Masond7fc6402008-02-18 12:12:38 -05001580 node = rb_next(node);
1581 if (!node)
1582 break;
1583 }
1584out:
1585 return NULL;
1586}
Chris Masond7fc6402008-02-18 12:12:38 -05001587
Chris Masond352ac62008-09-29 15:18:18 -04001588/*
Qu Wenruo03509b72020-10-21 14:24:50 +08001589 * Find the first offset in the io tree with one or more @bits set.
Xiao Guangrong69261c42011-07-14 03:19:45 +00001590 *
Qu Wenruo03509b72020-10-21 14:24:50 +08001591 * Note: If there are multiple bits set in @bits, any of them will match.
1592 *
1593 * Return 0 if we find something, and update @start_ret and @end_ret.
1594 * Return 1 if we found nothing.
Xiao Guangrong69261c42011-07-14 03:19:45 +00001595 */
1596int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
Qu Wenruof97e27e2020-11-13 20:51:40 +08001597 u64 *start_ret, u64 *end_ret, u32 bits,
Josef Bacike6138872012-09-27 17:07:30 -04001598 struct extent_state **cached_state)
Xiao Guangrong69261c42011-07-14 03:19:45 +00001599{
1600 struct extent_state *state;
1601 int ret = 1;
1602
1603 spin_lock(&tree->lock);
Josef Bacike6138872012-09-27 17:07:30 -04001604 if (cached_state && *cached_state) {
1605 state = *cached_state;
Filipe Manana27a35072014-07-06 20:09:59 +01001606 if (state->end == start - 1 && extent_state_in_tree(state)) {
Liu Bo9688e9a2018-08-23 03:14:53 +08001607 while ((state = next_state(state)) != NULL) {
Josef Bacike6138872012-09-27 17:07:30 -04001608 if (state->state & bits)
1609 goto got_it;
Josef Bacike6138872012-09-27 17:07:30 -04001610 }
1611 free_extent_state(*cached_state);
1612 *cached_state = NULL;
1613 goto out;
1614 }
1615 free_extent_state(*cached_state);
1616 *cached_state = NULL;
1617 }
1618
Xiao Guangrong69261c42011-07-14 03:19:45 +00001619 state = find_first_extent_bit_state(tree, start, bits);
Josef Bacike6138872012-09-27 17:07:30 -04001620got_it:
Xiao Guangrong69261c42011-07-14 03:19:45 +00001621 if (state) {
Filipe Mananae38e2ed2014-10-13 12:28:38 +01001622 cache_state_if_flags(state, cached_state, 0);
Xiao Guangrong69261c42011-07-14 03:19:45 +00001623 *start_ret = state->start;
1624 *end_ret = state->end;
1625 ret = 0;
1626 }
Josef Bacike6138872012-09-27 17:07:30 -04001627out:
Xiao Guangrong69261c42011-07-14 03:19:45 +00001628 spin_unlock(&tree->lock);
1629 return ret;
1630}
1631
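/*
 * Illustrative sketch, not from the original file: iterating every range
 * that has a bit set, the usual loop built on find_first_extent_bit().
 * The chosen bit (EXTENT_DIRTY) and the helper name are hypothetical.
 */
static u64 __maybe_unused demo_count_dirty_ranges(struct extent_io_tree *tree)
{
	u64 start = 0;
	u64 end;
	u64 nr = 0;

	/* Returns 0 while something is found, 1 once nothing is left. */
	while (!find_first_extent_bit(tree, start, &start, &end,
				      EXTENT_DIRTY, NULL)) {
		nr++;
		if (end == (u64)-1)
			break;
		start = end + 1;
	}
	return nr;
}
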
Nikolay Borisov45bfcfc2019-03-27 14:24:17 +02001632/**
Nikolay Borisov3bed2da2021-01-22 11:58:03 +02001633 * Find a contiguous area of bits
1634 *
1635 * @tree: io tree to check
1636 * @start: offset to start the search from
1637 * @start_ret: the first offset we found with the bits set
1638 * @end_ret: the final contiguous range of the bits that were set
1639 * @bits: bits to look for
Josef Bacik41a2ee72020-01-17 09:02:21 -05001640 *
1641 * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
1642 * to set bits appropriately, and then merge them again. During this time it
1643 * will drop the tree->lock, so use this helper if you want to find the actual
1644 * contiguous area for given bits. We will search to the first bit we find, and
1645 * then walk down the tree until we find a non-contiguous area. The area
1646 * returned will be the full contiguous area with the bits set.
1647 */
1648int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
Qu Wenruof97e27e2020-11-13 20:51:40 +08001649 u64 *start_ret, u64 *end_ret, u32 bits)
Josef Bacik41a2ee72020-01-17 09:02:21 -05001650{
1651 struct extent_state *state;
1652 int ret = 1;
1653
1654 spin_lock(&tree->lock);
1655 state = find_first_extent_bit_state(tree, start, bits);
1656 if (state) {
1657 *start_ret = state->start;
1658 *end_ret = state->end;
1659 while ((state = next_state(state)) != NULL) {
1660 if (state->start > (*end_ret + 1))
1661 break;
1662 *end_ret = state->end;
1663 }
1664 ret = 0;
1665 }
1666 spin_unlock(&tree->lock);
1667 return ret;
1668}
1669
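/*
 * Illustrative sketch with hypothetical values: if the tree holds two
 * adjacent states [0, 4095] and [4096, 8191], both with EXTENT_DELALLOC,
 * a search from 0 reports the merged picture even though the tree keeps
 * two separate records.
 */
static void __maybe_unused demo_contiguous_bit(struct extent_io_tree *tree)
{
	u64 start_ret, end_ret;

	if (!find_contiguous_extent_bit(tree, 0, &start_ret, &end_ret,
					EXTENT_DELALLOC)) {
		/* start_ret == 0 and end_ret == 8191 in the layout above. */
	}
}
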
1670/**
Nikolay Borisov3bed2da2021-01-22 11:58:03 +02001671 * Find the first range that has @bits not set. This range could start before
1672 * @start.
Nikolay Borisov45bfcfc2019-03-27 14:24:17 +02001673 *
Nikolay Borisov3bed2da2021-01-22 11:58:03 +02001674 * @tree: the tree to search
1675 * @start: offset at/after which the found extent should start
1676 * @start_ret: records the beginning of the range
1677 * @end_ret: records the end of the range (inclusive)
1678 * @bits: the set of bits which must be unset
Nikolay Borisov45bfcfc2019-03-27 14:24:17 +02001679 *
1680 * Since unallocated range is also considered one which doesn't have the bits
1681 * set it's possible that @end_ret contains -1, this happens in case the range
1682 * spans (last_range_end, end of device]. In this case it's up to the caller to
1683 * trim @end_ret to the appropriate size.
1684 */
1685void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
Qu Wenruof97e27e2020-11-13 20:51:40 +08001686 u64 *start_ret, u64 *end_ret, u32 bits)
Nikolay Borisov45bfcfc2019-03-27 14:24:17 +02001687{
1688 struct extent_state *state;
1689 struct rb_node *node, *prev = NULL, *next;
1690
1691 spin_lock(&tree->lock);
1692
1693 /* Find first extent with bits cleared */
1694 while (1) {
David Sterba9db33892020-06-25 19:03:41 +02001695 node = tree_search_prev_next(tree, start, &prev, &next);
Nikolay Borisov5750c372020-01-27 11:59:26 +02001696 if (!node && !next && !prev) {
1697 /*
1698 * Tree is completely empty, send full range and let
1699 * caller deal with it
1700 */
1701 *start_ret = 0;
1702 *end_ret = -1;
1703 goto out;
1704 } else if (!node && !next) {
1705 /*
1706 * We are past the last allocated chunk, set start at
1707 * the end of the last extent.
1708 */
1709 state = rb_entry(prev, struct extent_state, rb_node);
1710 *start_ret = state->end + 1;
1711 *end_ret = -1;
1712 goto out;
1713 } else if (!node) {
Nikolay Borisov45bfcfc2019-03-27 14:24:17 +02001714 node = next;
Nikolay Borisov45bfcfc2019-03-27 14:24:17 +02001715 }
Nikolay Borisov1eaebb32019-06-03 13:06:02 +03001716 /*
1717 * At this point 'node' either contains 'start' or start is
1718 * before 'node'
1719 */
Nikolay Borisov45bfcfc2019-03-27 14:24:17 +02001720 state = rb_entry(node, struct extent_state, rb_node);
Nikolay Borisov1eaebb32019-06-03 13:06:02 +03001721
1722 if (in_range(start, state->start, state->end - state->start + 1)) {
1723 if (state->state & bits) {
1724 /*
1725 * |--range with bits sets--|
1726 * |
1727 * start
1728 */
1729 start = state->end + 1;
1730 } else {
1731 /*
1732 * 'start' falls within a range that doesn't
1733 * have the bits set, so take its start as
1734 * the beginning of the desired range
1735 *
1736 * |--range with bits cleared----|
1737 * |
1738 * start
1739 */
1740 *start_ret = state->start;
1741 break;
1742 }
Nikolay Borisov45bfcfc2019-03-27 14:24:17 +02001743 } else {
Nikolay Borisov1eaebb32019-06-03 13:06:02 +03001744 /*
1745 * |---prev range---|---hole/unset---|---node range---|
1746 * |
1747 * start
1748 *
1749 * or
1750 *
1751 * |---hole/unset--||--first node--|
1752 * 0 |
1753 * start
1754 */
1755 if (prev) {
1756 state = rb_entry(prev, struct extent_state,
1757 rb_node);
1758 *start_ret = state->end + 1;
1759 } else {
1760 *start_ret = 0;
1761 }
Nikolay Borisov45bfcfc2019-03-27 14:24:17 +02001762 break;
1763 }
1764 }
1765
1766 /*
1767 * Find the longest stretch from start until an entry which has the
1768 * bits set
1769 */
1770 while (1) {
1771 state = rb_entry(node, struct extent_state, rb_node);
1772 if (state->end >= start && !(state->state & bits)) {
1773 *end_ret = state->end;
1774 } else {
1775 *end_ret = state->start - 1;
1776 break;
1777 }
1778
1779 node = rb_next(node);
1780 if (!node)
1781 break;
1782 }
1783out:
1784 spin_unlock(&tree->lock);
1785}
1786
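/*
 * Illustrative sketch, not one of the original callers: find the first hole
 * (a range without @bits) at or after an offset, then clamp the possibly
 * open-ended result as the comment above requires. CHUNK_ALLOCATED is
 * assumed to be the device allocation bit; the helper and device_size are
 * hypothetical.
 */
static void __maybe_unused demo_find_hole(struct extent_io_tree *tree,
					  u64 search_start, u64 device_size)
{
	u64 hole_start;
	u64 hole_end;

	find_first_clear_extent_bit(tree, search_start, &hole_start, &hole_end,
				    CHUNK_ALLOCATED);
	/* @hole_end may be -1 (open ended), so trim it to the device. */
	hole_end = min(hole_end, device_size - 1);
}
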
Xiao Guangrong69261c42011-07-14 03:19:45 +00001787/*
Chris Masond352ac62008-09-29 15:18:18 -04001788 * find a contiguous range of bytes in the file marked as delalloc, not
1789 * more than 'max_bytes'. start and end are used to return the range.
1790 *
Lu Fengqi3522e902018-11-29 11:33:38 +08001791 * true is returned if we find something, false if nothing was in the tree
Chris Masond352ac62008-09-29 15:18:18 -04001792 */
Josef Bacik083e75e2019-09-23 10:05:20 -04001793bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
1794 u64 *end, u64 max_bytes,
1795 struct extent_state **cached_state)
Chris Masond1310b22008-01-24 16:13:08 -05001796{
1797 struct rb_node *node;
1798 struct extent_state *state;
1799 u64 cur_start = *start;
Lu Fengqi3522e902018-11-29 11:33:38 +08001800 bool found = false;
Chris Masond1310b22008-01-24 16:13:08 -05001801 u64 total_bytes = 0;
1802
Chris Masoncad321a2008-12-17 14:51:42 -05001803 spin_lock(&tree->lock);
Chris Masonc8b97812008-10-29 14:49:59 -04001804
Chris Masond1310b22008-01-24 16:13:08 -05001805 /*
1806 * this search will find all the extents that end after
1807 * our range starts.
1808 */
Chris Mason80ea96b2008-02-01 14:51:59 -05001809 node = tree_search(tree, cur_start);
Peter2b114d12008-04-01 11:21:40 -04001810 if (!node) {
Lu Fengqi3522e902018-11-29 11:33:38 +08001811 *end = (u64)-1;
Chris Masond1310b22008-01-24 16:13:08 -05001812 goto out;
1813 }
1814
Chris Masond3977122009-01-05 21:25:51 -05001815 while (1) {
Chris Masond1310b22008-01-24 16:13:08 -05001816 state = rb_entry(node, struct extent_state, rb_node);
Zheng Yan5b21f2e2008-09-26 10:05:38 -04001817 if (found && (state->start != cur_start ||
1818 (state->state & EXTENT_BOUNDARY))) {
Chris Masond1310b22008-01-24 16:13:08 -05001819 goto out;
1820 }
1821 if (!(state->state & EXTENT_DELALLOC)) {
1822 if (!found)
1823 *end = state->end;
1824 goto out;
1825 }
Josef Bacikc2a128d2010-02-02 21:19:11 +00001826 if (!found) {
Chris Masond1310b22008-01-24 16:13:08 -05001827 *start = state->start;
Josef Bacikc2a128d2010-02-02 21:19:11 +00001828 *cached_state = state;
Elena Reshetovab7ac31b2017-03-03 10:55:19 +02001829 refcount_inc(&state->refs);
Josef Bacikc2a128d2010-02-02 21:19:11 +00001830 }
Lu Fengqi3522e902018-11-29 11:33:38 +08001831 found = true;
Chris Masond1310b22008-01-24 16:13:08 -05001832 *end = state->end;
1833 cur_start = state->end + 1;
1834 node = rb_next(node);
Chris Masond1310b22008-01-24 16:13:08 -05001835 total_bytes += state->end - state->start + 1;
Josef Bacik7bf811a52013-10-07 22:11:09 -04001836 if (total_bytes >= max_bytes)
Josef Bacik573aeca2013-08-30 14:38:49 -04001837 break;
Josef Bacik573aeca2013-08-30 14:38:49 -04001838 if (!node)
Chris Masond1310b22008-01-24 16:13:08 -05001839 break;
1840 }
1841out:
Chris Masoncad321a2008-12-17 14:51:42 -05001842 spin_unlock(&tree->lock);
Chris Masond1310b22008-01-24 16:13:08 -05001843 return found;
1844}
1845
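/*
 * Illustrative sketch (hypothetical caller): the in/out convention of
 * btrfs_find_delalloc_range(). The search offset and the SZ_128M byte
 * limit are made up.
 */
static void __maybe_unused demo_find_delalloc(struct extent_io_tree *tree)
{
	struct extent_state *cached = NULL;
	u64 start = 0;	/* in: where to start searching, out: range start */
	u64 end = 0;	/* out: range end, inclusive */

	if (btrfs_find_delalloc_range(tree, &start, &end, SZ_128M, &cached)) {
		/* [start, end] is delalloc; cached holds a referenced state. */
		free_extent_state(cached);
	}
}
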
Qu Wenruoed8f13b2021-05-31 16:50:38 +08001846/*
1847 * Process one page for __process_pages_contig().
1848 *
1849 * Return >0 if we hit @page == @locked_page.
1850 * Return 0 if we updated the page status.
1851 * Return -EAGAIN if we need to try again.
1852 * (For the PAGE_LOCK case: the page is dirty or no longer belongs to @mapping)
1853 */
Qu Wenruoe38992b2021-05-31 16:50:42 +08001854static int process_one_page(struct btrfs_fs_info *fs_info,
1855 struct address_space *mapping,
Qu Wenruoed8f13b2021-05-31 16:50:38 +08001856 struct page *page, struct page *locked_page,
Qu Wenruoe38992b2021-05-31 16:50:42 +08001857 unsigned long page_ops, u64 start, u64 end)
Qu Wenruoed8f13b2021-05-31 16:50:38 +08001858{
Qu Wenruoe38992b2021-05-31 16:50:42 +08001859 u32 len;
1860
1861 ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
1862 len = end + 1 - start;
1863
Qu Wenruoed8f13b2021-05-31 16:50:38 +08001864 if (page_ops & PAGE_SET_ORDERED)
Qu Wenruob945a462021-05-31 16:50:46 +08001865 btrfs_page_clamp_set_ordered(fs_info, page, start, len);
Qu Wenruoed8f13b2021-05-31 16:50:38 +08001866 if (page_ops & PAGE_SET_ERROR)
Qu Wenruoe38992b2021-05-31 16:50:42 +08001867 btrfs_page_clamp_set_error(fs_info, page, start, len);
Qu Wenruoed8f13b2021-05-31 16:50:38 +08001868 if (page_ops & PAGE_START_WRITEBACK) {
Qu Wenruoe38992b2021-05-31 16:50:42 +08001869 btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
1870 btrfs_page_clamp_set_writeback(fs_info, page, start, len);
Qu Wenruoed8f13b2021-05-31 16:50:38 +08001871 }
1872 if (page_ops & PAGE_END_WRITEBACK)
Qu Wenruoe38992b2021-05-31 16:50:42 +08001873 btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
Qu Wenruoa33a8e92021-05-31 16:50:47 +08001874
1875 if (page == locked_page)
1876 return 1;
1877
Qu Wenruoed8f13b2021-05-31 16:50:38 +08001878 if (page_ops & PAGE_LOCK) {
Qu Wenruo1e1de382021-05-31 16:50:44 +08001879 int ret;
1880
1881 ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
1882 if (ret)
1883 return ret;
Qu Wenruoed8f13b2021-05-31 16:50:38 +08001884 if (!PageDirty(page) || page->mapping != mapping) {
Qu Wenruo1e1de382021-05-31 16:50:44 +08001885 btrfs_page_end_writer_lock(fs_info, page, start, len);
Qu Wenruoed8f13b2021-05-31 16:50:38 +08001886 return -EAGAIN;
1887 }
1888 }
1889 if (page_ops & PAGE_UNLOCK)
Qu Wenruo1e1de382021-05-31 16:50:44 +08001890 btrfs_page_end_writer_lock(fs_info, page, start, len);
Qu Wenruoed8f13b2021-05-31 16:50:38 +08001891 return 0;
1892}
1893
Liu Boda2c7002017-02-10 16:41:05 +01001894static int __process_pages_contig(struct address_space *mapping,
1895 struct page *locked_page,
Qu Wenruo98af9ab2021-05-31 16:50:37 +08001896 u64 start, u64 end, unsigned long page_ops,
Qu Wenruoed8f13b2021-05-31 16:50:38 +08001897 u64 *processed_end)
1898{
Qu Wenruoe38992b2021-05-31 16:50:42 +08001899 struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
Qu Wenruoed8f13b2021-05-31 16:50:38 +08001900 pgoff_t start_index = start >> PAGE_SHIFT;
1901 pgoff_t end_index = end >> PAGE_SHIFT;
1902 pgoff_t index = start_index;
1903 unsigned long nr_pages = end_index - start_index + 1;
1904 unsigned long pages_processed = 0;
1905 struct page *pages[16];
1906 int err = 0;
1907 int i;
1908
1909 if (page_ops & PAGE_LOCK) {
1910 ASSERT(page_ops == PAGE_LOCK);
1911 ASSERT(processed_end && *processed_end == start);
1912 }
1913
1914 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
1915 mapping_set_error(mapping, -EIO);
1916
1917 while (nr_pages > 0) {
1918 int found_pages;
1919
1920 found_pages = find_get_pages_contig(mapping, index,
1921 min_t(unsigned long,
1922 nr_pages, ARRAY_SIZE(pages)), pages);
1923 if (found_pages == 0) {
1924 /*
1925			 * Finding nothing at @index is only possible when
1926			 * we are locking these pages (PAGE_LOCK).
1927 */
1928 ASSERT(page_ops & PAGE_LOCK);
1929 err = -EAGAIN;
1930 goto out;
1931 }
1932
1933 for (i = 0; i < found_pages; i++) {
1934 int process_ret;
1935
Qu Wenruoe38992b2021-05-31 16:50:42 +08001936 process_ret = process_one_page(fs_info, mapping,
1937 pages[i], locked_page, page_ops,
1938 start, end);
Qu Wenruoed8f13b2021-05-31 16:50:38 +08001939 if (process_ret < 0) {
1940 for (; i < found_pages; i++)
1941 put_page(pages[i]);
1942 err = -EAGAIN;
1943 goto out;
1944 }
1945 put_page(pages[i]);
1946 pages_processed++;
1947 }
1948 nr_pages -= found_pages;
1949 index += found_pages;
1950 cond_resched();
1951 }
1952out:
1953 if (err && processed_end) {
1954 /*
1955 * Update @processed_end. I know this is awful since it has
1956 * two different return value patterns (inclusive vs exclusive).
1957 *
1958		 * But the exclusive pattern is necessary if @start is 0, otherwise
1959		 * we would underflow and the check against @processed_end would
1960		 * not work as expected.
1961 */
1962 if (pages_processed)
1963 *processed_end = min(end,
1964 ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1);
1965 else
1966 *processed_end = start;
1967 }
1968 return err;
1969}
Liu Boda2c7002017-02-10 16:41:05 +01001970
Jeff Mahoney143bede2012-03-01 14:56:26 +01001971static noinline void __unlock_for_delalloc(struct inode *inode,
1972 struct page *locked_page,
1973 u64 start, u64 end)
Chris Masonc8b97812008-10-29 14:49:59 -04001974{
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03001975 unsigned long index = start >> PAGE_SHIFT;
1976 unsigned long end_index = end >> PAGE_SHIFT;
Chris Masonc8b97812008-10-29 14:49:59 -04001977
Liu Bo76c00212017-02-10 16:42:14 +01001978 ASSERT(locked_page);
Chris Masonc8b97812008-10-29 14:49:59 -04001979 if (index == locked_page->index && end_index == index)
Jeff Mahoney143bede2012-03-01 14:56:26 +01001980 return;
Chris Masonc8b97812008-10-29 14:49:59 -04001981
Qu Wenruo98af9ab2021-05-31 16:50:37 +08001982 __process_pages_contig(inode->i_mapping, locked_page, start, end,
Liu Bo76c00212017-02-10 16:42:14 +01001983 PAGE_UNLOCK, NULL);
Chris Masonc8b97812008-10-29 14:49:59 -04001984}
1985
1986static noinline int lock_delalloc_pages(struct inode *inode,
1987 struct page *locked_page,
1988 u64 delalloc_start,
1989 u64 delalloc_end)
1990{
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03001991 unsigned long index = delalloc_start >> PAGE_SHIFT;
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03001992 unsigned long end_index = delalloc_end >> PAGE_SHIFT;
Qu Wenruo98af9ab2021-05-31 16:50:37 +08001993 u64 processed_end = delalloc_start;
Chris Masonc8b97812008-10-29 14:49:59 -04001994 int ret;
Chris Masonc8b97812008-10-29 14:49:59 -04001995
Liu Bo76c00212017-02-10 16:42:14 +01001996 ASSERT(locked_page);
Chris Masonc8b97812008-10-29 14:49:59 -04001997 if (index == locked_page->index && index == end_index)
1998 return 0;
1999
Qu Wenruo98af9ab2021-05-31 16:50:37 +08002000 ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start,
2001 delalloc_end, PAGE_LOCK, &processed_end);
2002 if (ret == -EAGAIN && processed_end > delalloc_start)
Liu Bo76c00212017-02-10 16:42:14 +01002003 __unlock_for_delalloc(inode, locked_page, delalloc_start,
Qu Wenruo98af9ab2021-05-31 16:50:37 +08002004 processed_end);
Chris Masonc8b97812008-10-29 14:49:59 -04002005 return ret;
2006}
2007
2008/*
Lu Fengqi3522e902018-11-29 11:33:38 +08002009 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
Qu Wenruo2749f7e2021-09-27 15:22:07 +08002010 * more than @max_bytes.
Chris Masonc8b97812008-10-29 14:49:59 -04002011 *
Qu Wenruo2749f7e2021-09-27 15:22:07 +08002012 * @start: The original start bytenr to search.
2013 * Will store the extent range start bytenr.
2014 * @end: The original end bytenr of the search range
2015 * Will store the extent range end bytenr.
2016 *
2017 * Return true if we find a delalloc range which starts inside the original
2018 * range, and @start/@end will store the delalloc range start/end.
2019 *
2020 * Return false if we can't find any delalloc range which starts inside the
2021 * original range, and @start/@end will be the non-delalloc range start/end.
Chris Masonc8b97812008-10-29 14:49:59 -04002022 */
Johannes Thumshirnce9f9672018-11-19 10:38:17 +01002023EXPORT_FOR_TESTS
Lu Fengqi3522e902018-11-29 11:33:38 +08002024noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
Josef Bacik294e30f2013-10-09 12:00:56 -04002025 struct page *locked_page, u64 *start,
Nikolay Borisov917aace2018-10-26 14:43:20 +03002026 u64 *end)
Chris Masonc8b97812008-10-29 14:49:59 -04002027{
Naohiro Aotaf7b12a62022-07-09 08:18:40 +09002028 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
Goldwyn Rodrigues99780592019-06-21 10:02:54 -05002029 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
Qu Wenruo2749f7e2021-09-27 15:22:07 +08002030 const u64 orig_start = *start;
2031 const u64 orig_end = *end;
Naohiro Aotaf7b12a62022-07-09 08:18:40 +09002032 /* The sanity tests may not set a valid fs_info. */
2033 u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE;
Chris Masonc8b97812008-10-29 14:49:59 -04002034 u64 delalloc_start;
2035 u64 delalloc_end;
Lu Fengqi3522e902018-11-29 11:33:38 +08002036 bool found;
Chris Mason9655d292009-09-02 15:22:30 -04002037 struct extent_state *cached_state = NULL;
Chris Masonc8b97812008-10-29 14:49:59 -04002038 int ret;
2039 int loops = 0;
2040
Qu Wenruo2749f7e2021-09-27 15:22:07 +08002041 /* Caller should pass a valid @end to indicate the search range end */
2042 ASSERT(orig_end > orig_start);
2043
2044 /* The range should at least cover part of the page */
2045 ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
2046 orig_end <= page_offset(locked_page)));
Chris Masonc8b97812008-10-29 14:49:59 -04002047again:
2048 /* step one, find a bunch of delalloc bytes starting at start */
2049 delalloc_start = *start;
2050 delalloc_end = 0;
Josef Bacik083e75e2019-09-23 10:05:20 -04002051 found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
2052 max_bytes, &cached_state);
Qu Wenruo2749f7e2021-09-27 15:22:07 +08002053 if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
Chris Masonc8b97812008-10-29 14:49:59 -04002054 *start = delalloc_start;
Qu Wenruo2749f7e2021-09-27 15:22:07 +08002055
2056 /* @delalloc_end can be -1, never go beyond @orig_end */
2057 *end = min(delalloc_end, orig_end);
Josef Bacikc2a128d2010-02-02 21:19:11 +00002058 free_extent_state(cached_state);
Lu Fengqi3522e902018-11-29 11:33:38 +08002059 return false;
Chris Masonc8b97812008-10-29 14:49:59 -04002060 }
2061
2062 /*
Chris Mason70b99e62008-10-31 12:46:39 -04002063 * start comes from the offset of locked_page. We have to lock
2064 * pages in order, so we can't process delalloc bytes before
2065 * locked_page
2066 */
Chris Masond3977122009-01-05 21:25:51 -05002067 if (delalloc_start < *start)
Chris Mason70b99e62008-10-31 12:46:39 -04002068 delalloc_start = *start;
Chris Mason70b99e62008-10-31 12:46:39 -04002069
2070 /*
Chris Masonc8b97812008-10-29 14:49:59 -04002071 * make sure to limit the number of pages we try to lock down
Chris Masonc8b97812008-10-29 14:49:59 -04002072 */
Josef Bacik7bf811a52013-10-07 22:11:09 -04002073 if (delalloc_end + 1 - delalloc_start > max_bytes)
2074 delalloc_end = delalloc_start + max_bytes - 1;
Chris Masond3977122009-01-05 21:25:51 -05002075
Chris Masonc8b97812008-10-29 14:49:59 -04002076 /* step two, lock all the pages after the page that has start */
2077 ret = lock_delalloc_pages(inode, locked_page,
2078 delalloc_start, delalloc_end);
Nikolay Borisov9bfd61d2018-10-26 14:43:21 +03002079 ASSERT(!ret || ret == -EAGAIN);
Chris Masonc8b97812008-10-29 14:49:59 -04002080 if (ret == -EAGAIN) {
2081		/* some of the pages are gone, let's avoid looping by
2082 * shortening the size of the delalloc range we're searching
2083 */
Chris Mason9655d292009-09-02 15:22:30 -04002084 free_extent_state(cached_state);
Chris Mason7d788742014-05-21 05:49:54 -07002085 cached_state = NULL;
Chris Masonc8b97812008-10-29 14:49:59 -04002086 if (!loops) {
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03002087 max_bytes = PAGE_SIZE;
Chris Masonc8b97812008-10-29 14:49:59 -04002088 loops = 1;
2089 goto again;
2090 } else {
Lu Fengqi3522e902018-11-29 11:33:38 +08002091 found = false;
Chris Masonc8b97812008-10-29 14:49:59 -04002092 goto out_failed;
2093 }
2094 }
Chris Masonc8b97812008-10-29 14:49:59 -04002095
2096 /* step three, lock the state bits for the whole range */
David Sterbaff13db42015-12-03 14:30:40 +01002097 lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
Chris Masonc8b97812008-10-29 14:49:59 -04002098
2099 /* then test to make sure it is all still delalloc */
2100 ret = test_range_bit(tree, delalloc_start, delalloc_end,
Chris Mason9655d292009-09-02 15:22:30 -04002101 EXTENT_DELALLOC, 1, cached_state);
Chris Masonc8b97812008-10-29 14:49:59 -04002102 if (!ret) {
Chris Mason9655d292009-09-02 15:22:30 -04002103 unlock_extent_cached(tree, delalloc_start, delalloc_end,
David Sterbae43bbe52017-12-12 21:43:52 +01002104 &cached_state);
Chris Masonc8b97812008-10-29 14:49:59 -04002105 __unlock_for_delalloc(inode, locked_page,
2106 delalloc_start, delalloc_end);
2107 cond_resched();
2108 goto again;
2109 }
Chris Mason9655d292009-09-02 15:22:30 -04002110 free_extent_state(cached_state);
Chris Masonc8b97812008-10-29 14:49:59 -04002111 *start = delalloc_start;
2112 *end = delalloc_end;
2113out_failed:
2114 return found;
2115}
2116
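/*
 * Illustrative sketch (hypothetical writepage-style caller): the in/out
 * contract of find_lock_delalloc_range(). The page_offset()/PAGE_SIZE math
 * is real; the surrounding helper is made up.
 */
static void __maybe_unused demo_lock_delalloc(struct inode *inode,
					      struct page *locked_page)
{
	u64 start = page_offset(locked_page);
	u64 end = start + PAGE_SIZE - 1;

	if (find_lock_delalloc_range(inode, locked_page, &start, &end)) {
		/*
		 * [start, end] is now delalloc, with its pages locked and
		 * EXTENT_LOCKED set; after processing, release it with
		 * extent_clear_unlock_delalloc().
		 */
	}
}
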
Nikolay Borisovad7ff172020-06-03 08:55:06 +03002117void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
Nikolay Borisov74e91942019-07-17 16:18:16 +03002118 struct page *locked_page,
Qu Wenruof97e27e2020-11-13 20:51:40 +08002119 u32 clear_bits, unsigned long page_ops)
Liu Bo873695b2017-02-02 17:49:22 -08002120{
Nikolay Borisovad7ff172020-06-03 08:55:06 +03002121 clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
Liu Bo873695b2017-02-02 17:49:22 -08002122
Nikolay Borisovad7ff172020-06-03 08:55:06 +03002123 __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
Qu Wenruo98af9ab2021-05-31 16:50:37 +08002124 start, end, page_ops, NULL);
Liu Bo873695b2017-02-02 17:49:22 -08002125}
2126
Chris Masond352ac62008-09-29 15:18:18 -04002127/*
2128 * count the number of bytes in the tree that have a given bit(s)
2129 * set. This can be fairly slow, except for EXTENT_DIRTY which is
2130 * cached. The total number found is returned.
2131 */
Chris Masond1310b22008-01-24 16:13:08 -05002132u64 count_range_bits(struct extent_io_tree *tree,
2133 u64 *start, u64 search_end, u64 max_bytes,
Qu Wenruof97e27e2020-11-13 20:51:40 +08002134 u32 bits, int contig)
Chris Masond1310b22008-01-24 16:13:08 -05002135{
2136 struct rb_node *node;
2137 struct extent_state *state;
2138 u64 cur_start = *start;
2139 u64 total_bytes = 0;
Chris Masonec29ed52011-02-23 16:23:20 -05002140 u64 last = 0;
Chris Masond1310b22008-01-24 16:13:08 -05002141 int found = 0;
2142
Dulshani Gunawardhanafae7f212013-10-31 10:30:08 +05302143 if (WARN_ON(search_end <= cur_start))
Chris Masond1310b22008-01-24 16:13:08 -05002144 return 0;
Chris Masond1310b22008-01-24 16:13:08 -05002145
Chris Masoncad321a2008-12-17 14:51:42 -05002146 spin_lock(&tree->lock);
Chris Masond1310b22008-01-24 16:13:08 -05002147 if (cur_start == 0 && bits == EXTENT_DIRTY) {
2148 total_bytes = tree->dirty_bytes;
2149 goto out;
2150 }
2151 /*
2152 * this search will find all the extents that end after
2153 * our range starts.
2154 */
Chris Mason80ea96b2008-02-01 14:51:59 -05002155 node = tree_search(tree, cur_start);
Chris Masond3977122009-01-05 21:25:51 -05002156 if (!node)
Chris Masond1310b22008-01-24 16:13:08 -05002157 goto out;
Chris Masond1310b22008-01-24 16:13:08 -05002158
Chris Masond3977122009-01-05 21:25:51 -05002159 while (1) {
Chris Masond1310b22008-01-24 16:13:08 -05002160 state = rb_entry(node, struct extent_state, rb_node);
2161 if (state->start > search_end)
2162 break;
Chris Masonec29ed52011-02-23 16:23:20 -05002163 if (contig && found && state->start > last + 1)
2164 break;
2165 if (state->end >= cur_start && (state->state & bits) == bits) {
Chris Masond1310b22008-01-24 16:13:08 -05002166 total_bytes += min(search_end, state->end) + 1 -
2167 max(cur_start, state->start);
2168 if (total_bytes >= max_bytes)
2169 break;
2170 if (!found) {
Josef Bacikaf60bed2011-05-04 11:11:17 -04002171 *start = max(cur_start, state->start);
Chris Masond1310b22008-01-24 16:13:08 -05002172 found = 1;
2173 }
Chris Masonec29ed52011-02-23 16:23:20 -05002174 last = state->end;
2175 } else if (contig && found) {
2176 break;
Chris Masond1310b22008-01-24 16:13:08 -05002177 }
2178 node = rb_next(node);
2179 if (!node)
2180 break;
2181 }
2182out:
Chris Masoncad321a2008-12-17 14:51:42 -05002183 spin_unlock(&tree->lock);
Chris Masond1310b22008-01-24 16:13:08 -05002184 return total_bytes;
2185}
Christoph Hellwigb2950862008-12-02 09:54:17 -05002186
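/*
 * Illustrative sketch (hypothetical values): count up to 64K of contiguous
 * EXTENT_DELALLOC bytes starting from offset 0. The SZ_64K cap is made up.
 */
static u64 __maybe_unused demo_count_delalloc(struct extent_io_tree *tree)
{
	u64 found_start = 0;

	/* Returns the byte count; found_start is moved to the first hit. */
	return count_range_bits(tree, &found_start, (u64)-1, SZ_64K,
				EXTENT_DELALLOC, 1);
}
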
Chris Masond352ac62008-09-29 15:18:18 -04002187/*
2188 * Set the failrec field for a given byte offset in the tree. If there isn't
2189 * an extent_state starting at that offset, nothing is changed and -ENOENT is returned.
2190 */
Josef Bacikb3f167a2019-09-23 10:05:21 -04002191int set_state_failrec(struct extent_io_tree *tree, u64 start,
2192 struct io_failure_record *failrec)
Chris Masond1310b22008-01-24 16:13:08 -05002193{
2194 struct rb_node *node;
2195 struct extent_state *state;
2196 int ret = 0;
2197
Chris Masoncad321a2008-12-17 14:51:42 -05002198 spin_lock(&tree->lock);
Chris Masond1310b22008-01-24 16:13:08 -05002199 /*
2200 * this search will find all the extents that end after
2201 * our range starts.
2202 */
Chris Mason80ea96b2008-02-01 14:51:59 -05002203 node = tree_search(tree, start);
Peter2b114d12008-04-01 11:21:40 -04002204 if (!node) {
Chris Masond1310b22008-01-24 16:13:08 -05002205 ret = -ENOENT;
2206 goto out;
2207 }
2208 state = rb_entry(node, struct extent_state, rb_node);
2209 if (state->start != start) {
2210 ret = -ENOENT;
2211 goto out;
2212 }
David Sterba47dc1962016-02-11 13:24:13 +01002213 state->failrec = failrec;
Chris Masond1310b22008-01-24 16:13:08 -05002214out:
Chris Masoncad321a2008-12-17 14:51:42 -05002215 spin_unlock(&tree->lock);
Chris Masond1310b22008-01-24 16:13:08 -05002216 return ret;
2217}
2218
Nikolay Borisov2279a272020-07-02 15:23:28 +03002219struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
Chris Masond1310b22008-01-24 16:13:08 -05002220{
2221 struct rb_node *node;
2222 struct extent_state *state;
Nikolay Borisov2279a272020-07-02 15:23:28 +03002223 struct io_failure_record *failrec;
Chris Masond1310b22008-01-24 16:13:08 -05002224
Chris Masoncad321a2008-12-17 14:51:42 -05002225 spin_lock(&tree->lock);
Chris Masond1310b22008-01-24 16:13:08 -05002226 /*
2227 * this search will find all the extents that end after
2228 * our range starts.
2229 */
Chris Mason80ea96b2008-02-01 14:51:59 -05002230 node = tree_search(tree, start);
Peter2b114d12008-04-01 11:21:40 -04002231 if (!node) {
Nikolay Borisov2279a272020-07-02 15:23:28 +03002232 failrec = ERR_PTR(-ENOENT);
Chris Masond1310b22008-01-24 16:13:08 -05002233 goto out;
2234 }
2235 state = rb_entry(node, struct extent_state, rb_node);
2236 if (state->start != start) {
Nikolay Borisov2279a272020-07-02 15:23:28 +03002237 failrec = ERR_PTR(-ENOENT);
Chris Masond1310b22008-01-24 16:13:08 -05002238 goto out;
2239 }
Nikolay Borisov2279a272020-07-02 15:23:28 +03002240
2241 failrec = state->failrec;
Chris Masond1310b22008-01-24 16:13:08 -05002242out:
Chris Masoncad321a2008-12-17 14:51:42 -05002243 spin_unlock(&tree->lock);
Nikolay Borisov2279a272020-07-02 15:23:28 +03002244 return failrec;
Chris Masond1310b22008-01-24 16:13:08 -05002245}
2246
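/*
 * Illustrative sketch (hypothetical): the stash/retrieve pairing for an
 * io_failure_record. The record is attached to the extent_state that starts
 * exactly at @start and fetched back the same way; what is done with it
 * here is made up.
 */
static void __maybe_unused demo_failrec_roundtrip(struct extent_io_tree *failure_tree,
						  u64 start)
{
	struct io_failure_record *failrec;

	failrec = get_state_failrec(failure_tree, start);
	if (IS_ERR_OR_NULL(failrec))
		return;	/* No state at @start, or nothing recorded there. */
	/* ... inspect or update failrec ... */
	set_state_failrec(failure_tree, start, failrec);
}
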
2247/*
2248 * searches a range in the state tree for a given mask.
Chris Mason70dec802008-01-29 09:59:12 -05002249 * If 'filled' == 1, this returns 1 only if every extent in the range
Chris Masond1310b22008-01-24 16:13:08 -05002250 * has the bits set. Otherwise, 1 is returned if any bit in the
2251 * range is found set.
2252 */
2253int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
Qu Wenruof97e27e2020-11-13 20:51:40 +08002254 u32 bits, int filled, struct extent_state *cached)
Chris Masond1310b22008-01-24 16:13:08 -05002255{
2256 struct extent_state *state = NULL;
2257 struct rb_node *node;
2258 int bitset = 0;
Chris Masond1310b22008-01-24 16:13:08 -05002259
Chris Masoncad321a2008-12-17 14:51:42 -05002260 spin_lock(&tree->lock);
Filipe Manana27a35072014-07-06 20:09:59 +01002261 if (cached && extent_state_in_tree(cached) && cached->start <= start &&
Josef Bacikdf98b6e2011-06-20 14:53:48 -04002262 cached->end > start)
Chris Mason9655d292009-09-02 15:22:30 -04002263 node = &cached->rb_node;
2264 else
2265 node = tree_search(tree, start);
Chris Masond1310b22008-01-24 16:13:08 -05002266 while (node && start <= end) {
2267 state = rb_entry(node, struct extent_state, rb_node);
2268
2269 if (filled && state->start > start) {
2270 bitset = 0;
2271 break;
2272 }
2273
2274 if (state->start > end)
2275 break;
2276
2277 if (state->state & bits) {
2278 bitset = 1;
2279 if (!filled)
2280 break;
2281 } else if (filled) {
2282 bitset = 0;
2283 break;
2284 }
Chris Mason46562ce2009-09-23 20:23:16 -04002285
2286 if (state->end == (u64)-1)
2287 break;
2288
Chris Masond1310b22008-01-24 16:13:08 -05002289 start = state->end + 1;
2290 if (start > end)
2291 break;
2292 node = rb_next(node);
2293 if (!node) {
2294 if (filled)
2295 bitset = 0;
2296 break;
2297 }
2298 }
Chris Masoncad321a2008-12-17 14:51:42 -05002299 spin_unlock(&tree->lock);
Chris Masond1310b22008-01-24 16:13:08 -05002300 return bitset;
2301}
Chris Masond1310b22008-01-24 16:13:08 -05002302
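/*
 * Illustrative sketch: the 'filled' mode of test_range_bit(). The choice of
 * EXTENT_DELALLOC and the helper name are hypothetical; the semantics
 * follow the comment above the function.
 */
static bool __maybe_unused demo_range_fully_delalloc(struct extent_io_tree *tree,
						     u64 start, u64 end)
{
	/* filled == 1: true only if every byte of [start, end] has the bit. */
	return test_range_bit(tree, start, end, EXTENT_DELALLOC, 1, NULL);
}
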
Josef Bacik7870d082017-05-05 11:57:15 -04002303int free_io_failure(struct extent_io_tree *failure_tree,
2304 struct extent_io_tree *io_tree,
2305 struct io_failure_record *rec)
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002306{
2307 int ret;
2308 int err = 0;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002309
David Sterba47dc1962016-02-11 13:24:13 +01002310 set_state_failrec(failure_tree, rec->start, NULL);
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002311 ret = clear_extent_bits(failure_tree, rec->start,
2312 rec->start + rec->len - 1,
David Sterba91166212016-04-26 23:54:39 +02002313 EXTENT_LOCKED | EXTENT_DIRTY);
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002314 if (ret)
2315 err = ret;
2316
Josef Bacik7870d082017-05-05 11:57:15 -04002317 ret = clear_extent_bits(io_tree, rec->start,
David Woodhouse53b381b2013-01-29 18:40:14 -05002318 rec->start + rec->len - 1,
David Sterba91166212016-04-26 23:54:39 +02002319 EXTENT_DAMAGED);
David Woodhouse53b381b2013-01-29 18:40:14 -05002320 if (ret && !err)
2321 err = ret;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002322
2323 kfree(rec);
2324 return err;
2325}
2326
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002327/*
2328 * this bypasses the standard btrfs submit functions deliberately, as
2329 * the standard behavior is to write all copies in a raid setup. here we only
2330 * want to write the one bad copy. so we do the mapping for ourselves and issue
2331 * submit_bio directly.
Stefan Behrens3ec706c2012-11-05 15:46:42 +01002332 * to avoid any synchronization issues, wait for the data after writing, which
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002333 * actually prevents the read that triggered the error from finishing.
2334 * currently, there can be no more than two copies of every data bit. thus,
2335 * exactly one rewrite is required.
2336 */
Qu Wenruo38d5e542021-09-03 20:45:14 +08002337static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
2338 u64 length, u64 logical, struct page *page,
2339 unsigned int pg_offset, int mirror_num)
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002340{
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002341 struct btrfs_device *dev;
Christoph Hellwige9458bf2022-04-04 06:45:20 +02002342 struct bio_vec bvec;
2343 struct bio bio;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002344 u64 map_length = 0;
2345 u64 sector;
Qu Wenruo4c664612021-09-15 15:17:16 +08002346 struct btrfs_io_context *bioc = NULL;
Christoph Hellwige9458bf2022-04-04 06:45:20 +02002347 int ret = 0;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002348
Linus Torvalds1751e8a2017-11-27 13:05:09 -08002349 ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002350 BUG_ON(!mirror_num);
2351
Johannes Thumshirn554aed72021-12-07 06:28:36 -08002352 if (btrfs_repair_one_zone(fs_info, logical))
2353 return 0;
Naohiro Aotaf7ef5282021-02-04 19:22:16 +09002354
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002355 map_length = length;
2356
Filipe Mananab5de8d02016-05-27 22:21:27 +01002357 /*
Qu Wenruo4c664612021-09-15 15:17:16 +08002358 * Avoid races with device replace and make sure our bioc has devices
Filipe Mananab5de8d02016-05-27 22:21:27 +01002359 * associated to its stripes that don't go away while we are doing the
2360 * read repair operation.
2361 */
2362 btrfs_bio_counter_inc_blocked(fs_info);
Nikolay Borisove4ff5fb2017-07-19 10:48:42 +03002363 if (btrfs_is_parity_mirror(fs_info, logical, length)) {
Liu Boc7253282017-03-29 10:53:58 -07002364 /*
2365 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
2366 * to update all raid stripes, but here we just want to correct
2367 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
2368 * stripe's dev and sector.
2369 */
2370 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
Qu Wenruo4c664612021-09-15 15:17:16 +08002371 &map_length, &bioc, 0);
Christoph Hellwige9458bf2022-04-04 06:45:20 +02002372 if (ret)
2373 goto out_counter_dec;
Qu Wenruo4c664612021-09-15 15:17:16 +08002374 ASSERT(bioc->mirror_num == 1);
Liu Boc7253282017-03-29 10:53:58 -07002375 } else {
2376 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
Qu Wenruo4c664612021-09-15 15:17:16 +08002377 &map_length, &bioc, mirror_num);
Christoph Hellwige9458bf2022-04-04 06:45:20 +02002378 if (ret)
2379 goto out_counter_dec;
Qu Wenruo4c664612021-09-15 15:17:16 +08002380 BUG_ON(mirror_num != bioc->mirror_num);
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002381 }
Liu Boc7253282017-03-29 10:53:58 -07002382
Qu Wenruo4c664612021-09-15 15:17:16 +08002383 sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
Qu Wenruo4c664612021-09-15 15:17:16 +08002384 dev = bioc->stripes[bioc->mirror_num - 1].dev;
2385 btrfs_put_bioc(bioc);
Christoph Hellwige9458bf2022-04-04 06:45:20 +02002386
Anand Jainebbede42017-12-04 12:54:52 +08002387 if (!dev || !dev->bdev ||
2388 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
Christoph Hellwige9458bf2022-04-04 06:45:20 +02002389 ret = -EIO;
2390 goto out_counter_dec;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002391 }
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002392
Christoph Hellwige9458bf2022-04-04 06:45:20 +02002393 bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
2394 bio.bi_iter.bi_sector = sector;
2395 __bio_add_page(&bio, page, length, pg_offset);
2396
2397 btrfsic_check_bio(&bio);
2398 ret = submit_bio_wait(&bio);
2399 if (ret) {
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002400 /* try to remap that extent elsewhere? */
Stefan Behrens442a4f62012-05-25 16:06:08 +02002401 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
Christoph Hellwige9458bf2022-04-04 06:45:20 +02002402 goto out_bio_uninit;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002403 }
2404
David Sterbab14af3b2015-10-08 10:43:10 +02002405 btrfs_info_rl_in_rcu(fs_info,
2406 "read error corrected: ino %llu off %llu (dev %s sector %llu)",
Josef Bacik6ec656b2017-05-05 11:57:14 -04002407 ino, start,
Miao Xie1203b682014-09-12 18:44:01 +08002408 rcu_str_deref(dev->name), sector);
Christoph Hellwige9458bf2022-04-04 06:45:20 +02002409 ret = 0;
2410
2411out_bio_uninit:
2412 bio_uninit(&bio);
2413out_counter_dec:
Filipe Mananab5de8d02016-05-27 22:21:27 +01002414 btrfs_bio_counter_dec(fs_info);
Christoph Hellwige9458bf2022-04-04 06:45:20 +02002415 return ret;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002416}
2417
David Sterba2b489662020-04-29 03:04:10 +02002418int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
Josef Bacikea466792012-03-26 21:57:36 -04002419{
David Sterba20a1fbf92019-03-20 11:23:44 +01002420 struct btrfs_fs_info *fs_info = eb->fs_info;
Josef Bacikea466792012-03-26 21:57:36 -04002421 u64 start = eb->start;
David Sterbacc5e31a2018-03-01 18:20:27 +01002422 int i, num_pages = num_extent_pages(eb);
Chris Masond95603b2012-04-12 15:55:15 -04002423 int ret = 0;
Josef Bacikea466792012-03-26 21:57:36 -04002424
David Howellsbc98a422017-07-17 08:45:34 +01002425 if (sb_rdonly(fs_info->sb))
Ilya Dryomov908960c2013-11-03 19:06:39 +02002426 return -EROFS;
2427
Josef Bacikea466792012-03-26 21:57:36 -04002428 for (i = 0; i < num_pages; i++) {
David Sterbafb85fc92014-07-31 01:03:53 +02002429 struct page *p = eb->pages[i];
Miao Xie1203b682014-09-12 18:44:01 +08002430
Josef Bacik6ec656b2017-05-05 11:57:14 -04002431 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
Miao Xie1203b682014-09-12 18:44:01 +08002432 start - page_offset(p), mirror_num);
Josef Bacikea466792012-03-26 21:57:36 -04002433 if (ret)
2434 break;
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03002435 start += PAGE_SIZE;
Josef Bacikea466792012-03-26 21:57:36 -04002436 }
2437
2438 return ret;
2439}
2440
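/*
 * Mirror numbers are 1-based, so these helpers wrap around instead of running
 * past num_copies.  For example, with num_copies == 3 and a read that failed
 * on mirror 2, next_mirror() yields the retry order 3, 1 and then 2 again, at
 * which point every copy has been tried.
 */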
Christoph Hellwigc144c632022-07-07 07:33:26 +02002441static int next_mirror(const struct io_failure_record *failrec, int cur_mirror)
2442{
2443 if (cur_mirror == failrec->num_copies)
2444 return cur_mirror + 1 - failrec->num_copies;
2445 return cur_mirror + 1;
2446}
2447
2448static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror)
2449{
2450 if (cur_mirror == 1)
2451 return failrec->num_copies;
2452 return cur_mirror - 1;
2453}
2454
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002455/*
2456 * each time an IO finishes, we do a fast check in the IO failure tree
2457 * to see if we need to process or clean up an io_failure_record
2458 */
Josef Bacik7870d082017-05-05 11:57:15 -04002459int clean_io_failure(struct btrfs_fs_info *fs_info,
2460 struct extent_io_tree *failure_tree,
2461 struct extent_io_tree *io_tree, u64 start,
2462 struct page *page, u64 ino, unsigned int pg_offset)
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002463{
2464 u64 private;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002465 struct io_failure_record *failrec;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002466 struct extent_state *state;
Christoph Hellwigc144c632022-07-07 07:33:26 +02002467 int mirror;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002468 int ret;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002469
2470 private = 0;
Josef Bacik7870d082017-05-05 11:57:15 -04002471 ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2472 EXTENT_DIRTY, 0);
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002473 if (!ret)
2474 return 0;
2475
Nikolay Borisov2279a272020-07-02 15:23:28 +03002476 failrec = get_state_failrec(failure_tree, start);
2477 if (IS_ERR(failrec))
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002478 return 0;
2479
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002480 BUG_ON(!failrec->this_mirror);
2481
David Howellsbc98a422017-07-17 08:45:34 +01002482 if (sb_rdonly(fs_info->sb))
Ilya Dryomov908960c2013-11-03 19:06:39 +02002483 goto out;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002484
Josef Bacik7870d082017-05-05 11:57:15 -04002485 spin_lock(&io_tree->lock);
2486 state = find_first_extent_bit_state(io_tree,
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002487 failrec->start,
2488 EXTENT_LOCKED);
Josef Bacik7870d082017-05-05 11:57:15 -04002489 spin_unlock(&io_tree->lock);
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002490
Christoph Hellwigc144c632022-07-07 07:33:26 +02002491 if (!state || state->start > failrec->start ||
2492 state->end < failrec->start + failrec->len - 1)
2493 goto out;
2494
2495 mirror = failrec->this_mirror;
2496 do {
2497 mirror = prev_mirror(failrec, mirror);
2498 repair_io_failure(fs_info, ino, start, failrec->len,
2499 failrec->logical, page, pg_offset, mirror);
2500 } while (mirror != failrec->failed_mirror);
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002501
2502out:
Josef Bacik7870d082017-05-05 11:57:15 -04002503 free_io_failure(failure_tree, io_tree, failrec);
Miao Xie454ff3d2014-09-12 18:43:58 +08002504 return 0;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002505}
2506
Miao Xief6124962014-09-12 18:44:04 +08002507/*
2508 * Can be called when
2509 * - holding the extent lock
2510 * - under an ordered extent
2511 * - the inode is being freed
2512 */
Nikolay Borisov7ab79562017-02-20 13:50:57 +02002513void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
Miao Xief6124962014-09-12 18:44:04 +08002514{
Nikolay Borisov7ab79562017-02-20 13:50:57 +02002515 struct extent_io_tree *failure_tree = &inode->io_failure_tree;
Miao Xief6124962014-09-12 18:44:04 +08002516 struct io_failure_record *failrec;
2517 struct extent_state *state, *next;
2518
2519 if (RB_EMPTY_ROOT(&failure_tree->state))
2520 return;
2521
2522 spin_lock(&failure_tree->lock);
2523 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2524 while (state) {
2525 if (state->start > end)
2526 break;
2527
2528 ASSERT(state->end <= end);
2529
2530 next = next_state(state);
2531
David Sterba47dc1962016-02-11 13:24:13 +01002532 failrec = state->failrec;
Miao Xief6124962014-09-12 18:44:04 +08002533 free_extent_state(state);
2534 kfree(failrec);
2535
2536 state = next;
2537 }
2538 spin_unlock(&failure_tree->lock);
2539}
2540
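/*
 * Return the io_failure_record for the sector at @bio_offset inside @bbio,
 * creating and inserting one into the inode's failure tree if this is the
 * first failure of that sector.  Returns an ERR_PTR() on allocation failure
 * or when the block has only a single copy and thus cannot be repaired.
 */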
Nikolay Borisov35263022020-07-02 15:23:29 +03002541static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
Christoph Hellwig7aa51232022-07-07 07:33:28 +02002542 struct btrfs_bio *bbio,
2543 unsigned int bio_offset)
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002544{
Jeff Mahoneyab8d0fc2016-09-20 10:05:02 -04002545 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
Christoph Hellwig7aa51232022-07-07 07:33:28 +02002546 u64 start = bbio->file_offset + bio_offset;
Miao Xie2fe63032014-09-12 18:43:59 +08002547 struct io_failure_record *failrec;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002548 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2549 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
Qu Wenruo150e4b02021-05-03 10:08:55 +08002550 const u32 sectorsize = fs_info->sectorsize;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002551 int ret;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002552
Nikolay Borisov2279a272020-07-02 15:23:28 +03002553 failrec = get_state_failrec(failure_tree, start);
Nikolay Borisov35263022020-07-02 15:23:29 +03002554 if (!IS_ERR(failrec)) {
Jeff Mahoneyab8d0fc2016-09-20 10:05:02 -04002555 btrfs_debug(fs_info,
Qu Wenruo12458352021-05-03 10:08:56 +08002556 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
2557 failrec->logical, failrec->start, failrec->len);
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002558 /*
2559		 * When the data can be on disk in more than two copies, add to
2560		 * the failrec here (e.g. with a list for failed_mirror) to make
2561		 * clean_io_failure() clean all those errors at once.
2562 */
Christoph Hellwig7aa51232022-07-07 07:33:28 +02002563 ASSERT(failrec->this_mirror == bbio->mirror_num);
Christoph Hellwigc144c632022-07-07 07:33:26 +02002564 ASSERT(failrec->len == fs_info->sectorsize);
Nikolay Borisov35263022020-07-02 15:23:29 +03002565 return failrec;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002566 }
Miao Xie2fe63032014-09-12 18:43:59 +08002567
Nikolay Borisov35263022020-07-02 15:23:29 +03002568 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2569 if (!failrec)
2570 return ERR_PTR(-ENOMEM);
Miao Xie2fe63032014-09-12 18:43:59 +08002571
Nikolay Borisov35263022020-07-02 15:23:29 +03002572 failrec->start = start;
Qu Wenruo150e4b02021-05-03 10:08:55 +08002573 failrec->len = sectorsize;
Christoph Hellwig7aa51232022-07-07 07:33:28 +02002574 failrec->failed_mirror = bbio->mirror_num;
2575 failrec->this_mirror = bbio->mirror_num;
Christoph Hellwig81bd9322022-07-07 07:33:30 +02002576 failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset;
Nikolay Borisov35263022020-07-02 15:23:29 +03002577
2578 btrfs_debug(fs_info,
Christoph Hellwig81bd9322022-07-07 07:33:30 +02002579 "new io failure record logical %llu start %llu",
2580 failrec->logical, start);
Nikolay Borisov35263022020-07-02 15:23:29 +03002581
Christoph Hellwig81bd9322022-07-07 07:33:30 +02002582 failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize);
Christoph Hellwigc144c632022-07-07 07:33:26 +02002583 if (failrec->num_copies == 1) {
2584 /*
2585 * We only have a single copy of the data, so don't bother with
2586 * all the retry and error correction code that follows. No
2587 * matter what the error is, it is very likely to persist.
2588 */
2589 btrfs_debug(fs_info,
2590 "cannot repair logical %llu num_copies %d",
2591 failrec->logical, failrec->num_copies);
2592 kfree(failrec);
2593 return ERR_PTR(-EIO);
2594 }
Nikolay Borisov35263022020-07-02 15:23:29 +03002595
2596 /* Set the bits in the private failure tree */
Qu Wenruo150e4b02021-05-03 10:08:55 +08002597 ret = set_extent_bits(failure_tree, start, start + sectorsize - 1,
Nikolay Borisov35263022020-07-02 15:23:29 +03002598 EXTENT_LOCKED | EXTENT_DIRTY);
2599 if (ret >= 0) {
2600 ret = set_state_failrec(failure_tree, start, failrec);
2601 /* Set the bits in the inode's tree */
Qu Wenruo150e4b02021-05-03 10:08:55 +08002602 ret = set_extent_bits(tree, start, start + sectorsize - 1,
2603 EXTENT_DAMAGED);
Nikolay Borisov35263022020-07-02 15:23:29 +03002604 } else if (ret < 0) {
2605 kfree(failrec);
2606 return ERR_PTR(ret);
2607 }
2608
2609 return failrec;
Miao Xie2fe63032014-09-12 18:43:59 +08002610}
2611
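/*
 * Submit a new read of a single failed sector from the next mirror.
 *
 * The repair bio reuses the end_io handler and private data of the failed
 * bio, so its completion runs through the normal read endio path.  Returns 0
 * once the repair read is submitted, or a negative errno when all mirrors
 * have been tried or no failure record could be set up.
 */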
Christoph Hellwig7aa51232022-07-07 07:33:28 +02002612int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
2613 u32 bio_offset, struct page *page, unsigned int pgoff,
Qu Wenruo150e4b02021-05-03 10:08:55 +08002614 submit_bio_hook_t *submit_bio_hook)
Liu Boc3cfb652017-07-13 15:00:50 -07002615{
Christoph Hellwig7aa51232022-07-07 07:33:28 +02002616 u64 start = failed_bbio->file_offset + bio_offset;
Miao Xie2fe63032014-09-12 18:43:59 +08002617 struct io_failure_record *failrec;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002618 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
Miao Xie2fe63032014-09-12 18:43:59 +08002619 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
Josef Bacik7870d082017-05-05 11:57:15 -04002620 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
Christoph Hellwig7aa51232022-07-07 07:33:28 +02002621 struct bio *failed_bio = &failed_bbio->bio;
Qu Wenruo7ffd27e2020-12-02 14:47:58 +08002622 const int icsum = bio_offset >> fs_info->sectorsize_bits;
Omar Sandoval77d5d682020-04-16 14:46:25 -07002623 struct bio *repair_bio;
Qu Wenruoc3a3b192021-09-15 15:17:18 +08002624 struct btrfs_bio *repair_bbio;
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002625
Omar Sandoval77d5d682020-04-16 14:46:25 -07002626 btrfs_debug(fs_info,
2627 "repair read error: read error at %llu", start);
Tsutomu Itohe627ee72012-04-12 16:03:56 -04002628
Mike Christie1f7ad752016-06-05 14:31:51 -05002629 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
Miao Xie2fe63032014-09-12 18:43:59 +08002630
Christoph Hellwig7aa51232022-07-07 07:33:28 +02002631 failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset);
Nikolay Borisov35263022020-07-02 15:23:29 +03002632 if (IS_ERR(failrec))
Qu Wenruo150e4b02021-05-03 10:08:55 +08002633 return PTR_ERR(failrec);
Miao Xie454ff3d2014-09-12 18:43:58 +08002634
Tsutomu Itohe627ee72012-04-12 16:03:56 -04002635 /*
2636 * There are two premises:
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002637 * a) deliver good data to the caller
Kent Overstreet4f024f32013-10-11 15:44:27 -07002638 * b) correct the bad sectors on disk
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002639 *
Kent Overstreet4f024f32013-10-11 15:44:27 -07002640 * Since we're only doing repair for one sector, we only need to get
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002641 * a good copy of the failed sector and if we succeed, we have setup
Miao Xiefacc8a222013-07-25 19:22:34 +08002642 * everything for repair_io_failure to do the rest for us.
2643 */
Christoph Hellwigc144c632022-07-07 07:33:26 +02002644 failrec->this_mirror = next_mirror(failrec, failrec->this_mirror);
2645 if (failrec->this_mirror == failrec->failed_mirror) {
Miao Xiefacc8a222013-07-25 19:22:34 +08002646 btrfs_debug(fs_info,
Christoph Hellwigc144c632022-07-07 07:33:26 +02002647 "failed to repair num_copies %d this_mirror %d failed_mirror %d",
2648 failrec->num_copies, failrec->this_mirror, failrec->failed_mirror);
Josef Bacik7870d082017-05-05 11:57:15 -04002649 free_io_failure(failure_tree, tree, failrec);
Qu Wenruo150e4b02021-05-03 10:08:55 +08002650 return -EIO;
Miao Xie2fe63032014-09-12 18:43:59 +08002651 }
2652
Qu Wenruoc3a3b192021-09-15 15:17:18 +08002653 repair_bio = btrfs_bio_alloc(1);
2654 repair_bbio = btrfs_bio(repair_bio);
Christoph Hellwig00d82522022-03-24 17:06:27 +01002655 repair_bbio->file_offset = start;
Omar Sandoval77d5d682020-04-16 14:46:25 -07002656 repair_bio->bi_opf = REQ_OP_READ;
Omar Sandoval77d5d682020-04-16 14:46:25 -07002657 repair_bio->bi_end_io = failed_bio->bi_end_io;
2658 repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
2659 repair_bio->bi_private = failed_bio->bi_private;
Miao Xie2fe63032014-09-12 18:43:59 +08002660
Qu Wenruoc3a3b192021-09-15 15:17:18 +08002661 if (failed_bbio->csum) {
David Sterba223486c2020-07-02 11:27:30 +02002662 const u32 csum_size = fs_info->csum_size;
Omar Sandoval77d5d682020-04-16 14:46:25 -07002663
Qu Wenruoc3a3b192021-09-15 15:17:18 +08002664 repair_bbio->csum = repair_bbio->csum_inline;
2665 memcpy(repair_bbio->csum,
2666 failed_bbio->csum + csum_size * icsum, csum_size);
Omar Sandoval77d5d682020-04-16 14:46:25 -07002667 }
2668
2669 bio_add_page(repair_bio, page, failrec->len, pgoff);
Qu Wenruoc3a3b192021-09-15 15:17:18 +08002670 repair_bbio->iter = repair_bio->bi_iter;
Miao Xie2fe63032014-09-12 18:43:59 +08002671
Jeff Mahoneyab8d0fc2016-09-20 10:05:02 -04002672 btrfs_debug(btrfs_sb(inode->i_sb),
Qu Wenruo12458352021-05-03 10:08:56 +08002673 "repair read error: submitting new read to mirror %d",
2674 failrec->this_mirror);
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002675
Josef Bacik8cbc3002022-02-18 10:03:29 -05002676 /*
2677 * At this point we have a bio, so any errors from submit_bio_hook()
2678 * will be handled by the endio on the repair_bio, so we can't return an
2679 * error here.
2680 */
Christoph Hellwig81bd9322022-07-07 07:33:30 +02002681 submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0);
Josef Bacik8cbc3002022-02-18 10:03:29 -05002682 return BLK_STS_OK;
Qu Wenruo150e4b02021-05-03 10:08:55 +08002683}
2684
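/*
 * Finish the read of the [start, start + len) range in @page: update the
 * uptodate/error state (verifying fsverity data where applicable) and unlock
 * the page, or end the subpage reader for subpage filesystems.
 */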
2685static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
2686{
2687 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
2688
2689 ASSERT(page_offset(page) <= start &&
2690 start + len <= page_offset(page) + PAGE_SIZE);
2691
Qu Wenruo150e4b02021-05-03 10:08:55 +08002692 if (uptodate) {
Boris Burkov14605402021-06-30 13:01:49 -07002693 if (fsverity_active(page->mapping->host) &&
2694 !PageError(page) &&
2695 !PageUptodate(page) &&
2696 start < i_size_read(page->mapping->host) &&
2697 !fsverity_verify_page(page)) {
2698 btrfs_page_set_error(fs_info, page, start, len);
2699 } else {
2700 btrfs_page_set_uptodate(fs_info, page, start, len);
2701 }
Qu Wenruo150e4b02021-05-03 10:08:55 +08002702 } else {
2703 btrfs_page_clear_uptodate(fs_info, page, start, len);
2704 btrfs_page_set_error(fs_info, page, start, len);
2705 }
2706
Qu Wenruofbca46e2022-01-13 13:22:09 +08002707 if (!btrfs_is_subpage(fs_info, page))
Qu Wenruo150e4b02021-05-03 10:08:55 +08002708 unlock_page(page);
Qu Wenruo3d078ef2021-06-07 17:02:58 +08002709 else
Qu Wenruo150e4b02021-05-03 10:08:55 +08002710 btrfs_subpage_end_reader(fs_info, page, start, len);
2711}
2712
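/*
 * Finish a single sector of a buffered read: end the page read for that
 * sector and unlock (and, if @uptodate, mark uptodate) the matching range in
 * the inode's io_tree.
 */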
Christoph Hellwiga5aa7ab2022-05-22 13:47:50 +02002713static void end_sector_io(struct page *page, u64 offset, bool uptodate)
Qu Wenruo150e4b02021-05-03 10:08:55 +08002714{
Christoph Hellwiga5aa7ab2022-05-22 13:47:50 +02002715 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
2716 const u32 sectorsize = inode->root->fs_info->sectorsize;
2717 struct extent_state *cached = NULL;
2718
2719 end_page_read(page, uptodate, offset, sectorsize);
2720 if (uptodate)
2721 set_extent_uptodate(&inode->io_tree, offset,
2722 offset + sectorsize - 1, &cached, GFP_ATOMIC);
2723 unlock_extent_cached_atomic(&inode->io_tree, offset,
2724 offset + sectorsize - 1, &cached);
2725}
2726
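/*
 * Walk all sectors of @bvec and submit a repair read from another mirror for
 * every sector marked in @error_bitmap.  Sectors without errors, and sectors
 * whose repair submission failed, are finished and unlocked here;
 * successfully submitted repair reads are finished by their own endio.
 */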
Christoph Hellwig7aa51232022-07-07 07:33:28 +02002727static void submit_data_read_repair(struct inode *inode,
2728 struct btrfs_bio *failed_bbio,
Qu Wenruofd5a6f62022-05-22 13:47:49 +02002729 u32 bio_offset, const struct bio_vec *bvec,
Christoph Hellwig7aa51232022-07-07 07:33:28 +02002730 unsigned int error_bitmap)
Qu Wenruo150e4b02021-05-03 10:08:55 +08002731{
Qu Wenruofd5a6f62022-05-22 13:47:49 +02002732 const unsigned int pgoff = bvec->bv_offset;
Qu Wenruo150e4b02021-05-03 10:08:55 +08002733 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
Qu Wenruofd5a6f62022-05-22 13:47:49 +02002734 struct page *page = bvec->bv_page;
2735 const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset;
2736 const u64 end = start + bvec->bv_len - 1;
Qu Wenruo150e4b02021-05-03 10:08:55 +08002737 const u32 sectorsize = fs_info->sectorsize;
2738 const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
Qu Wenruo150e4b02021-05-03 10:08:55 +08002739 int i;
2740
Christoph Hellwig7aa51232022-07-07 07:33:28 +02002741 BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE);
Qu Wenruo150e4b02021-05-03 10:08:55 +08002742
Qu Wenruoc0111c42022-03-21 13:48:42 +08002743 /* This repair is only for data */
2744 ASSERT(is_data_inode(inode));
2745
Qu Wenruo150e4b02021-05-03 10:08:55 +08002746 /* We're here because we had some read errors or csum mismatch */
2747 ASSERT(error_bitmap);
2748
2749 /*
2750 * We only get called on buffered IO, thus page must be mapped and bio
2751 * must not be cloned.
2752 */
Christoph Hellwig7aa51232022-07-07 07:33:28 +02002753 ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED));
Qu Wenruo150e4b02021-05-03 10:08:55 +08002754
2755 /* Iterate through all the sectors in the range */
2756 for (i = 0; i < nr_bits; i++) {
2757 const unsigned int offset = i * sectorsize;
Qu Wenruo150e4b02021-05-03 10:08:55 +08002758 bool uptodate = false;
2759 int ret;
2760
2761 if (!(error_bitmap & (1U << i))) {
2762 /*
2763 * This sector has no error, just end the page read
2764 * and unlock the range.
2765 */
2766 uptodate = true;
2767 goto next;
2768 }
2769
Christoph Hellwig7aa51232022-07-07 07:33:28 +02002770 ret = btrfs_repair_one_sector(inode, failed_bbio,
2771 bio_offset + offset, page, pgoff + offset,
2772 btrfs_submit_data_read_bio);
Qu Wenruo150e4b02021-05-03 10:08:55 +08002773 if (!ret) {
2774 /*
2775 * We have submitted the read repair, the page release
2776 * will be handled by the endio function of the
2777 * submitted repair bio.
2778			 * Thus we don't need to do anything here.
2779 */
2780 continue;
2781 }
2782 /*
Qu Wenruofd5a6f62022-05-22 13:47:49 +02002783 * Continue on failed repair, otherwise the remaining sectors
2784 * will not be properly unlocked.
Qu Wenruo150e4b02021-05-03 10:08:55 +08002785 */
Qu Wenruo150e4b02021-05-03 10:08:55 +08002786next:
Christoph Hellwiga5aa7ab2022-05-22 13:47:50 +02002787 end_sector_io(page, start + offset, uptodate);
Qu Wenruo150e4b02021-05-03 10:08:55 +08002788 }
Jan Schmidt4a54c8c2011-07-22 15:41:52 +02002789}
2790
Chris Masond1310b22008-01-24 16:13:08 -05002791/* lots and lots of room for performance fixes in the end_bio funcs */
2792
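/*
 * Finish the writeback of the [start, end] range in @page: notify the ordered
 * extent code and, on error, clear the uptodate bit, set the error bit and
 * record the error on the mapping.
 */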
David Sterbab5227c02015-12-03 13:08:59 +01002793void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
Jeff Mahoney87826df2012-02-15 16:23:57 +01002794{
Qu Wenruo38a39ac72021-04-08 20:32:27 +08002795 struct btrfs_inode *inode;
David Sterba25c12522021-07-26 14:15:08 +02002796 const bool uptodate = (err == 0);
Eric Sandeen3e2426b2014-06-12 00:39:58 -05002797 int ret = 0;
Jeff Mahoney87826df2012-02-15 16:23:57 +01002798
Qu Wenruo38a39ac72021-04-08 20:32:27 +08002799 ASSERT(page && page->mapping);
2800 inode = BTRFS_I(page->mapping->host);
2801 btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
Jeff Mahoney87826df2012-02-15 16:23:57 +01002802
Jeff Mahoney87826df2012-02-15 16:23:57 +01002803 if (!uptodate) {
Qu Wenruo963e4db2021-07-26 14:35:07 +08002804 const struct btrfs_fs_info *fs_info = inode->root->fs_info;
2805 u32 len;
2806
2807 ASSERT(end + 1 - start <= U32_MAX);
2808 len = end + 1 - start;
2809
2810 btrfs_page_clear_uptodate(fs_info, page, start, len);
2811 btrfs_page_set_error(fs_info, page, start, len);
Colin Ian Kingbff5baf2017-05-09 18:14:01 +01002812 ret = err < 0 ? err : -EIO;
Liu Bo5dca6ee2014-05-12 12:47:36 +08002813 mapping_set_error(page->mapping, ret);
Jeff Mahoney87826df2012-02-15 16:23:57 +01002814 }
Jeff Mahoney87826df2012-02-15 16:23:57 +01002815}
2816
Chris Masond1310b22008-01-24 16:13:08 -05002817/*
2818 * after a writepage IO is done, we need to:
2819 * clear the uptodate bits on error
2820 * clear the writeback bits in the extent tree for this IO
2821 * end_page_writeback if the page has no more pending IO
2822 *
2823 * Scheduling is not allowed, so the extent state tree is expected
2824 * to have one and only one object corresponding to this IO.
2825 */
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02002826static void end_bio_extent_writepage(struct bio *bio)
Chris Masond1310b22008-01-24 16:13:08 -05002827{
Christoph Hellwig4e4cbee2017-06-03 09:38:06 +02002828 int error = blk_status_to_errno(bio->bi_status);
Kent Overstreet2c30c712013-11-07 12:20:26 -08002829 struct bio_vec *bvec;
Chris Masond1310b22008-01-24 16:13:08 -05002830 u64 start;
2831 u64 end;
Ming Lei6dc4f102019-02-15 19:13:19 +08002832 struct bvec_iter_all iter_all;
Naohiro Aotad8e3fb12021-02-04 19:22:05 +09002833 bool first_bvec = true;
Chris Masond1310b22008-01-24 16:13:08 -05002834
David Sterbac09abff2017-07-13 18:10:07 +02002835 ASSERT(!bio_flagged(bio, BIO_CLONED));
Christoph Hellwig2b070cf2019-04-25 09:03:00 +02002836 bio_for_each_segment_all(bvec, bio, iter_all) {
Chris Masond1310b22008-01-24 16:13:08 -05002837 struct page *page = bvec->bv_page;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002838 struct inode *inode = page->mapping->host;
2839 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
Qu Wenruo321a02d2021-05-31 16:50:40 +08002840 const u32 sectorsize = fs_info->sectorsize;
David Woodhouse902b22f2008-08-20 08:51:49 -04002841
Qu Wenruo321a02d2021-05-31 16:50:40 +08002842 /* Our read/write should always be sector aligned. */
2843 if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
2844 btrfs_err(fs_info,
2845 "partial page write in btrfs with offset %u and length %u",
2846 bvec->bv_offset, bvec->bv_len);
2847 else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
2848 btrfs_info(fs_info,
2849 "incomplete page write with offset %u and length %u",
2850 bvec->bv_offset, bvec->bv_len);
Chris Masond1310b22008-01-24 16:13:08 -05002851
Qu Wenruo321a02d2021-05-31 16:50:40 +08002852 start = page_offset(page) + bvec->bv_offset;
2853 end = start + bvec->bv_len - 1;
Chris Masond1310b22008-01-24 16:13:08 -05002854
Naohiro Aotad8e3fb12021-02-04 19:22:05 +09002855 if (first_bvec) {
2856 btrfs_record_physical_zoned(inode, start, bio);
2857 first_bvec = false;
2858 }
2859
Christoph Hellwig4e4cbee2017-06-03 09:38:06 +02002860 end_extent_writepage(page, error, start, end);
Qu Wenruo9047e312021-05-31 16:50:43 +08002861
2862 btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len);
Kent Overstreet2c30c712013-11-07 12:20:26 -08002863 }
Chris Mason2b1f55b2008-09-24 11:48:04 -04002864
Chris Masond1310b22008-01-24 16:13:08 -05002865 bio_put(bio);
Chris Masond1310b22008-01-24 16:13:08 -05002866}
2867
Qu Wenruo94e8c952020-11-13 20:51:28 +08002868/*
2869 * Record previously processed extent range
2870 *
2871 * For endio_readpage_release_extent() to handle a full extent range, reducing
2872 * the extent io operations.
2873 */
2874struct processed_extent {
2875 struct btrfs_inode *inode;
2876 /* Start of the range in @inode */
2877 u64 start;
Nigel Christian2e626e52021-01-24 20:41:41 -05002878 /* End of the range in @inode */
Qu Wenruo94e8c952020-11-13 20:51:28 +08002879 u64 end;
2880 bool uptodate;
2881};
2882
2883/*
2884 * Try to release processed extent range
2885 *
2886 * May not release the extent range right now if the current range is
2887 * contiguous to processed extent.
2888 *
2889 * Will release processed extent when any of @inode, @uptodate, the range is
2890 * no longer contiguous to the processed range.
2891 *
2892 * Passing @inode == NULL will force processed extent to be released.
2893 */
2894static void endio_readpage_release_extent(struct processed_extent *processed,
2895 struct btrfs_inode *inode, u64 start, u64 end,
2896 bool uptodate)
Miao Xie883d0de2013-07-25 19:22:35 +08002897{
2898 struct extent_state *cached = NULL;
Qu Wenruo94e8c952020-11-13 20:51:28 +08002899 struct extent_io_tree *tree;
Miao Xie883d0de2013-07-25 19:22:35 +08002900
Qu Wenruo94e8c952020-11-13 20:51:28 +08002901 /* The first extent, initialize @processed */
2902 if (!processed->inode)
2903 goto update;
2904
2905 /*
2906 * Contiguous to processed extent, just uptodate the end.
2907 *
2908 * Several things to notice:
2909 *
2910 * - bio can be merged as long as on-disk bytenr is contiguous
2911 * This means we can have page belonging to other inodes, thus need to
2912 * check if the inode still matches.
2913 * - bvec can contain range beyond current page for multi-page bvec
2914 * Thus we need to do processed->end + 1 >= start check
2915 */
2916 if (processed->inode == inode && processed->uptodate == uptodate &&
2917 processed->end + 1 >= start && end >= processed->end) {
2918 processed->end = end;
2919 return;
2920 }
2921
2922 tree = &processed->inode->io_tree;
2923 /*
2924 * Now we don't have range contiguous to the processed range, release
2925 * the processed range now.
2926 */
2927 if (processed->uptodate && tree->track_uptodate)
2928 set_extent_uptodate(tree, processed->start, processed->end,
2929 &cached, GFP_ATOMIC);
2930 unlock_extent_cached_atomic(tree, processed->start, processed->end,
2931 &cached);
2932
2933update:
2934 /* Update processed to current range */
2935 processed->inode = inode;
2936 processed->start = start;
2937 processed->end = end;
2938 processed->uptodate = uptodate;
Miao Xie883d0de2013-07-25 19:22:35 +08002939}
2940
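/*
 * Mark the start of a read on @page.  This does nothing extra for regular
 * sectorsize; for subpage filesystems it starts the subpage reader for the
 * whole page so end_page_read() can tell when the last sector finished.
 */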
Qu Wenruo92082d42021-02-02 10:28:36 +08002941static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
2942{
2943 ASSERT(PageLocked(page));
Qu Wenruofbca46e2022-01-13 13:22:09 +08002944 if (!btrfs_is_subpage(fs_info, page))
Qu Wenruo92082d42021-02-02 10:28:36 +08002945 return;
2946
2947 ASSERT(PagePrivate(page));
2948 btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
2949}
2950
Chris Masond1310b22008-01-24 16:13:08 -05002951/*
David Sterba01cd3902022-07-15 13:59:31 +02002952 * Find extent buffer for a given bytenr.
Qu Wenruod9bb77d2021-03-15 13:39:14 +08002953 *
2954 * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
2955 * in endio context.
2956 */
2957static struct extent_buffer *find_extent_buffer_readpage(
2958 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
2959{
2960 struct extent_buffer *eb;
2961
2962 /*
2963 * For regular sectorsize, we can use page->private to grab extent
2964 * buffer
2965 */
Qu Wenruofbca46e2022-01-13 13:22:09 +08002966 if (fs_info->nodesize >= PAGE_SIZE) {
Qu Wenruod9bb77d2021-03-15 13:39:14 +08002967 ASSERT(PagePrivate(page) && page->private);
2968 return (struct extent_buffer *)page->private;
2969 }
2970
David Sterba01cd3902022-07-15 13:59:31 +02002971 /* For subpage case, we need to lookup buffer radix tree */
2972 rcu_read_lock();
2973 eb = radix_tree_lookup(&fs_info->buffer_radix,
2974 bytenr >> fs_info->sectorsize_bits);
2975 rcu_read_unlock();
Qu Wenruod9bb77d2021-03-15 13:39:14 +08002976 ASSERT(eb);
2977 return eb;
2978}
2979
2980/*
Chris Masond1310b22008-01-24 16:13:08 -05002981 * after a readpage IO is done, we need to:
2982 * clear the uptodate bits on error
2983 * set the uptodate bits if things worked
2984 * set the page up to date if all extents in the tree are uptodate
2985 * clear the lock bit in the extent tree
2986 * unlock the page if there are no other extents locked for it
2987 *
2988 * Scheduling is not allowed, so the extent state tree is expected
2989 * to have one and only one object corresponding to this IO.
2990 */
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02002991static void end_bio_extent_readpage(struct bio *bio)
Chris Masond1310b22008-01-24 16:13:08 -05002992{
Kent Overstreet2c30c712013-11-07 12:20:26 -08002993 struct bio_vec *bvec;
Qu Wenruoc3a3b192021-09-15 15:17:18 +08002994 struct btrfs_bio *bbio = btrfs_bio(bio);
Josef Bacik7870d082017-05-05 11:57:15 -04002995 struct extent_io_tree *tree, *failure_tree;
Qu Wenruo94e8c952020-11-13 20:51:28 +08002996 struct processed_extent processed = { 0 };
Qu Wenruo7ffd27e2020-12-02 14:47:58 +08002997 /*
2998 * The offset to the beginning of a bio, since one bio can never be
2999 * larger than UINT_MAX, u32 here is enough.
3000 */
3001 u32 bio_offset = 0;
Josef Bacik5cf1ab52012-04-16 09:42:26 -04003002 int mirror;
Ming Lei6dc4f102019-02-15 19:13:19 +08003003 struct bvec_iter_all iter_all;
Chris Masond1310b22008-01-24 16:13:08 -05003004
David Sterbac09abff2017-07-13 18:10:07 +02003005 ASSERT(!bio_flagged(bio, BIO_CLONED));
Christoph Hellwig2b070cf2019-04-25 09:03:00 +02003006 bio_for_each_segment_all(bvec, bio, iter_all) {
Qu Wenruo150e4b02021-05-03 10:08:55 +08003007 bool uptodate = !bio->bi_status;
Chris Masond1310b22008-01-24 16:13:08 -05003008 struct page *page = bvec->bv_page;
Josef Bacika71754f2013-06-17 17:14:39 -04003009 struct inode *inode = page->mapping->host;
Jeff Mahoneyab8d0fc2016-09-20 10:05:02 -04003010 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
Qu Wenruo7ffd27e2020-12-02 14:47:58 +08003011 const u32 sectorsize = fs_info->sectorsize;
Qu Wenruo150e4b02021-05-03 10:08:55 +08003012 unsigned int error_bitmap = (unsigned int)-1;
Christoph Hellwig97861cd2022-05-22 13:47:51 +02003013 bool repair = false;
Qu Wenruo7ffd27e2020-12-02 14:47:58 +08003014 u64 start;
3015 u64 end;
3016 u32 len;
Arne Jansen507903b2011-04-06 10:02:20 +00003017
Jeff Mahoneyab8d0fc2016-09-20 10:05:02 -04003018 btrfs_debug(fs_info,
3019 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
David Sterba1201b582020-11-26 15:41:27 +01003020 bio->bi_iter.bi_sector, bio->bi_status,
Qu Wenruoc3a3b192021-09-15 15:17:18 +08003021 bbio->mirror_num);
Josef Bacika71754f2013-06-17 17:14:39 -04003022 tree = &BTRFS_I(inode)->io_tree;
Josef Bacik7870d082017-05-05 11:57:15 -04003023 failure_tree = &BTRFS_I(inode)->io_failure_tree;
David Woodhouse902b22f2008-08-20 08:51:49 -04003024
Qu Wenruo8b8bbd42020-10-21 14:24:58 +08003025 /*
3026 * We always issue full-sector reads, but if some block in a
3027 * page fails to read, blk_update_request() will advance
3028 * bv_offset and adjust bv_len to compensate. Print a warning
3029 * for unaligned offsets, and an error if they don't add up to
3030 * a full sector.
3031 */
3032 if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
3033 btrfs_err(fs_info,
3034 "partial page read in btrfs with offset %u and length %u",
3035 bvec->bv_offset, bvec->bv_len);
3036 else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
3037 sectorsize))
3038 btrfs_info(fs_info,
3039 "incomplete page read with offset %u and length %u",
3040 bvec->bv_offset, bvec->bv_len);
Chris Masond1310b22008-01-24 16:13:08 -05003041
Qu Wenruo8b8bbd42020-10-21 14:24:58 +08003042 start = page_offset(page) + bvec->bv_offset;
3043 end = start + bvec->bv_len - 1;
Miao Xiefacc8a222013-07-25 19:22:34 +08003044 len = bvec->bv_len;
Chris Masond1310b22008-01-24 16:13:08 -05003045
Qu Wenruoc3a3b192021-09-15 15:17:18 +08003046 mirror = bbio->mirror_num;
Nikolay Borisov78e62c02018-11-22 10:17:49 +02003047 if (likely(uptodate)) {
Qu Wenruo150e4b02021-05-03 10:08:55 +08003048 if (is_data_inode(inode)) {
Qu Wenruoc3a3b192021-09-15 15:17:18 +08003049 error_bitmap = btrfs_verify_data_csum(bbio,
Goldwyn Rodrigues5e295762021-03-03 06:55:37 -06003050 bio_offset, page, start, end);
Christoph Hellwig97861cd2022-05-22 13:47:51 +02003051 if (error_bitmap)
3052 uptodate = false;
Qu Wenruo150e4b02021-05-03 10:08:55 +08003053 } else {
Christoph Hellwig97861cd2022-05-22 13:47:51 +02003054 if (btrfs_validate_metadata_buffer(bbio,
3055 page, start, end, mirror))
3056 uptodate = false;
Qu Wenruo150e4b02021-05-03 10:08:55 +08003057 }
Chris Masond1310b22008-01-24 16:13:08 -05003058 }
Josef Bacikea466792012-03-26 21:57:36 -04003059
Miao Xie883d0de2013-07-25 19:22:35 +08003060 if (likely(uptodate)) {
Josef Bacika71754f2013-06-17 17:14:39 -04003061 loff_t i_size = i_size_read(inode);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03003062 pgoff_t end_index = i_size >> PAGE_SHIFT;
Josef Bacika71754f2013-06-17 17:14:39 -04003063
Christoph Hellwig97861cd2022-05-22 13:47:51 +02003064 clean_io_failure(BTRFS_I(inode)->root->fs_info,
3065 failure_tree, tree, start, page,
3066 btrfs_ino(BTRFS_I(inode)), 0);
3067
Qu Wenruoc28ea612021-03-01 16:44:22 +08003068 /*
3069 * Zero out the remaining part if this range straddles
3070 * i_size.
3071 *
3072 * Here we should only zero the range inside the bvec,
3073 * not touch anything else.
3074 *
3075 * NOTE: i_size is exclusive while end is inclusive.
3076 */
3077 if (page->index == end_index && i_size <= end) {
3078 u32 zero_start = max(offset_in_page(i_size),
Qu Wenruod2dcc8e2021-03-08 17:20:17 +08003079 offset_in_page(start));
Qu Wenruoc28ea612021-03-01 16:44:22 +08003080
3081 zero_user_segment(page, zero_start,
3082 offset_in_page(end) + 1);
3083 }
Christoph Hellwig97861cd2022-05-22 13:47:51 +02003084 } else if (is_data_inode(inode)) {
3085 /*
3086 * Only try to repair bios that actually made it to a
3087 * device. If the bio failed to be submitted mirror
3088 * is 0 and we need to fail it without retrying.
Christoph Hellwig81bd9322022-07-07 07:33:30 +02003089 *
3090 * This also includes the high level bios for compressed
3091 * extents - these never make it to a device and repair
3092 * is already handled on the lower compressed bio.
Christoph Hellwig97861cd2022-05-22 13:47:51 +02003093 */
3094 if (mirror > 0)
3095 repair = true;
3096 } else {
3097 struct extent_buffer *eb;
3098
3099 eb = find_extent_buffer_readpage(fs_info, page, start);
3100 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
3101 eb->read_mirror = mirror;
3102 atomic_dec(&eb->io_pages);
Chris Mason70dec802008-01-29 09:59:12 -05003103 }
Christoph Hellwig97861cd2022-05-22 13:47:51 +02003104
3105 if (repair) {
3106 /*
3107 * submit_data_read_repair() will handle all the good
3108 * and bad sectors, we just continue to the next bvec.
3109 */
Christoph Hellwig7aa51232022-07-07 07:33:28 +02003110 submit_data_read_repair(inode, bbio, bio_offset, bvec,
3111 error_bitmap);
Christoph Hellwig97861cd2022-05-22 13:47:51 +02003112 } else {
3113 /* Update page status and unlock */
3114 end_page_read(page, uptodate, start, len);
3115 endio_readpage_release_extent(&processed, BTRFS_I(inode),
3116 start, end, PageUptodate(page));
3117 }
3118
Qu Wenruo7ffd27e2020-12-02 14:47:58 +08003119 ASSERT(bio_offset + len > bio_offset);
3120 bio_offset += len;
Miao Xie883d0de2013-07-25 19:22:35 +08003121
Kent Overstreet2c30c712013-11-07 12:20:26 -08003122 }
Qu Wenruo94e8c952020-11-13 20:51:28 +08003123 /* Release the last extent */
3124 endio_readpage_release_extent(&processed, NULL, 0, 0, false);
Qu Wenruoc3a3b192021-09-15 15:17:18 +08003125 btrfs_bio_free_csum(bbio);
Chris Masond1310b22008-01-24 16:13:08 -05003126 bio_put(bio);
Chris Masond1310b22008-01-24 16:13:08 -05003127}
3128
Sweet Tea Dorminydd137dd2022-03-30 16:11:22 -04003129/**
3130 * Populate every free slot in a provided array with pages.
3131 *
3132 * @nr_pages: number of pages to allocate
3133 * @page_array: the array to fill with pages; any existing non-null entries in
3134 * the array will be skipped
3135 *
3136 * Return: 0 if all pages were able to be allocated;
3137 * -ENOMEM otherwise, and the caller is responsible for freeing all
3138 * non-null page pointers in the array.
3139 */
3140int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
3141{
Sweet Tea Dorminy91d6ac12022-03-30 16:11:23 -04003142 unsigned int allocated;
Sweet Tea Dorminydd137dd2022-03-30 16:11:22 -04003143
Sweet Tea Dorminy91d6ac12022-03-30 16:11:23 -04003144 for (allocated = 0; allocated < nr_pages;) {
3145 unsigned int last = allocated;
Sweet Tea Dorminydd137dd2022-03-30 16:11:22 -04003146
Sweet Tea Dorminy91d6ac12022-03-30 16:11:23 -04003147 allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
3148
Sweet Tea Dorminy395cb572022-04-06 14:24:18 -04003149 if (allocated == nr_pages)
3150 return 0;
3151
Sweet Tea Dorminy91d6ac12022-03-30 16:11:23 -04003152 /*
3153 * During this iteration, no page could be allocated, even
3154 * though alloc_pages_bulk_array() falls back to alloc_page()
3155 * if it could not bulk-allocate. So we must be out of memory.
3156 */
3157 if (allocated == last)
Sweet Tea Dorminydd137dd2022-03-30 16:11:22 -04003158 return -ENOMEM;
Sweet Tea Dorminy395cb572022-04-06 14:24:18 -04003159
3160 memalloc_retry_wait(GFP_NOFS);
Sweet Tea Dorminydd137dd2022-03-30 16:11:22 -04003161 }
3162 return 0;
3163}
3164
Chris Mason9be33952013-05-17 18:30:14 -04003165/*
David Sterba184f9992017-06-12 17:29:39 +02003166 * Initialize the members up to but not including 'bio'. Use after allocating a
3167 * new bio with bio_alloc_bioset, which does not initialize the bytes outside
3168 * of 'bio' because use of __GFP_ZERO is not supported.
Chris Mason9be33952013-05-17 18:30:14 -04003169 */
Qu Wenruoc3a3b192021-09-15 15:17:18 +08003170static inline void btrfs_bio_init(struct btrfs_bio *bbio)
Chris Masond1310b22008-01-24 16:13:08 -05003171{
Qu Wenruoc3a3b192021-09-15 15:17:18 +08003172 memset(bbio, 0, offsetof(struct btrfs_bio, bio));
David Sterba184f9992017-06-12 17:29:39 +02003173}
3174
3175/*
Qu Wenruocd8e0cc2021-09-15 15:17:17 +08003176 * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs.
3177 *
3178 * The bio allocation is backed by bioset and does not fail.
Chris Masond1310b22008-01-24 16:13:08 -05003179 */
Qu Wenruoc3a3b192021-09-15 15:17:18 +08003180struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
Chris Masond1310b22008-01-24 16:13:08 -05003181{
3182 struct bio *bio;
3183
Qu Wenruocd8e0cc2021-09-15 15:17:17 +08003184 ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
Christoph Hellwig609be102022-01-24 10:11:03 +01003185 bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset);
Qu Wenruoc3a3b192021-09-15 15:17:18 +08003186 btrfs_bio_init(btrfs_bio(bio));
Chris Masond1310b22008-01-24 16:13:08 -05003187 return bio;
3188}
3189
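/*
 * Clone @orig and trim the clone down to @size bytes starting at @offset
 * (both in bytes), initializing the btrfs_bio portion of the new bio.
 * Backed by btrfs_bioset, so the allocation itself does not fail.
 */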
Chaitanya Kulkarni21dda652021-07-21 21:43:33 +09003190struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
Liu Bo2f8e9142017-05-15 17:43:31 -07003191{
3192 struct bio *bio;
Qu Wenruoc3a3b192021-09-15 15:17:18 +08003193 struct btrfs_bio *bbio;
Liu Bo2f8e9142017-05-15 17:43:31 -07003194
Chaitanya Kulkarni21dda652021-07-21 21:43:33 +09003195 ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
3196
Liu Bo2f8e9142017-05-15 17:43:31 -07003197 /* this will never fail when it's backed by a bioset */
Christoph Hellwigabfc4262022-02-02 17:01:09 +01003198 bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
Liu Bo2f8e9142017-05-15 17:43:31 -07003199 ASSERT(bio);
3200
Qu Wenruoc3a3b192021-09-15 15:17:18 +08003201 bbio = btrfs_bio(bio);
3202 btrfs_bio_init(bbio);
Liu Bo2f8e9142017-05-15 17:43:31 -07003203
3204 bio_trim(bio, offset >> 9, size >> 9);
Qu Wenruoc3a3b192021-09-15 15:17:18 +08003205 bbio->iter = bio->bi_iter;
Liu Bo2f8e9142017-05-15 17:43:31 -07003206 return bio;
3207}
Chris Mason9be33952013-05-17 18:30:14 -04003208
Naohiro Aota953651e2021-02-04 19:21:57 +09003209/**
 3210 * Attempt to add a page to a bio
 3211 *
Yang Libe8d1a22021-12-20 15:23:06 +08003212 * @bio_ctrl:	record both the bio, and its compress_type
Naohiro Aota953651e2021-02-04 19:21:57 +09003213 * @page:	page to add to the bio
 3214 * @disk_bytenr:  disk bytenr of the range, also used to check whether we are
 3215 *                adding a page contiguous to the previous one
Naohiro Aota953651e2021-02-04 19:21:57 +09003216 * @size: portion of page that we want to write
Yang Libe8d1a22021-12-20 15:23:06 +08003217 * @pg_offset: starting offset in the page
David Sterbacb3a12d2021-07-27 14:59:41 +02003218 * @compress_type: compression type of the current bio to see if we can merge them
Naohiro Aota953651e2021-02-04 19:21:57 +09003219 *
3220 * Attempt to add a page to bio considering stripe alignment etc.
3221 *
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003222 * Return >= 0 for the number of bytes added to the bio.
3223 * Can return 0 if the current bio is already at stripe/zone boundary.
3224 * Return <0 for error.
Naohiro Aota953651e2021-02-04 19:21:57 +09003225 */
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003226static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
3227 struct page *page,
3228 u64 disk_bytenr, unsigned int size,
3229 unsigned int pg_offset,
David Sterbacb3a12d2021-07-27 14:59:41 +02003230 enum btrfs_compression_type compress_type)
Naohiro Aota953651e2021-02-04 19:21:57 +09003231{
Qu Wenruo390ed29b82021-04-14 16:42:15 +08003232 struct bio *bio = bio_ctrl->bio;
3233 u32 bio_size = bio->bi_iter.bi_size;
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003234 u32 real_size;
Naohiro Aota953651e2021-02-04 19:21:57 +09003235 const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
3236 bool contig;
Naohiro Aotae1326f02021-02-04 19:21:58 +09003237 int ret;
Naohiro Aota953651e2021-02-04 19:21:57 +09003238
Qu Wenruo390ed29b82021-04-14 16:42:15 +08003239 ASSERT(bio);
3240 /* The limit should be calculated when bio_ctrl->bio is allocated */
3241 ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
David Sterba0f070032021-07-27 15:11:53 +02003242 if (bio_ctrl->compress_type != compress_type)
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003243 return 0;
Naohiro Aota953651e2021-02-04 19:21:57 +09003244
David Sterba0f070032021-07-27 15:11:53 +02003245 if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
Naohiro Aota953651e2021-02-04 19:21:57 +09003246 contig = bio->bi_iter.bi_sector == sector;
3247 else
3248 contig = bio_end_sector(bio) == sector;
3249 if (!contig)
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003250 return 0;
Naohiro Aota953651e2021-02-04 19:21:57 +09003251
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003252 real_size = min(bio_ctrl->len_to_oe_boundary,
3253 bio_ctrl->len_to_stripe_boundary) - bio_size;
3254 real_size = min(real_size, size);
3255
3256 /*
3257 * If real_size is 0, never call bio_add_*_page(), as even size is 0,
3258 * bio will still execute its endio function on the page!
3259 */
3260 if (real_size == 0)
3261 return 0;
Naohiro Aota953651e2021-02-04 19:21:57 +09003262
Qu Wenruo390ed29b82021-04-14 16:42:15 +08003263 if (bio_op(bio) == REQ_OP_ZONE_APPEND)
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003264 ret = bio_add_zone_append_page(bio, page, real_size, pg_offset);
Qu Wenruo390ed29b82021-04-14 16:42:15 +08003265 else
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003266 ret = bio_add_page(bio, page, real_size, pg_offset);
Naohiro Aotae1326f02021-02-04 19:21:58 +09003267
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003268 return ret;
Naohiro Aota953651e2021-02-04 19:21:57 +09003269}
3270
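/*
 * Compute how many more bytes may be added to bio_ctrl->bio before it crosses
 * a stripe boundary (len_to_stripe_boundary) or, for zone append writes, the
 * end of the ordered extent (len_to_oe_boundary).  Compressed bios get no
 * limit here as they are split later by the compression code.
 */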
Qu Wenruo390ed29b82021-04-14 16:42:15 +08003271static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
Naohiro Aota939c7feb2021-08-11 15:37:08 +09003272 struct btrfs_inode *inode, u64 file_offset)
Qu Wenruo390ed29b82021-04-14 16:42:15 +08003273{
3274 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3275 struct btrfs_io_geometry geom;
3276 struct btrfs_ordered_extent *ordered;
3277 struct extent_map *em;
3278 u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT);
3279 int ret;
3280
3281 /*
3282	 * Pages for a compressed extent are never submitted to disk directly, thus
3283	 * there is no real boundary; just set both limits to U32_MAX.
3284	 *
3285	 * The split happens for the real compressed bio, done in
3286	 * btrfs_submit_compressed_read/write().
3287 */
David Sterba0f070032021-07-27 15:11:53 +02003288 if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
Qu Wenruo390ed29b82021-04-14 16:42:15 +08003289 bio_ctrl->len_to_oe_boundary = U32_MAX;
3290 bio_ctrl->len_to_stripe_boundary = U32_MAX;
3291 return 0;
3292 }
3293 em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
3294 if (IS_ERR(em))
3295 return PTR_ERR(em);
3296 ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio),
3297 logical, &geom);
3298 free_extent_map(em);
3299	if (ret < 0)
3300		return ret;
3301
3302 if (geom.len > U32_MAX)
3303 bio_ctrl->len_to_stripe_boundary = U32_MAX;
3304 else
3305 bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
3306
Johannes Thumshirn73672712021-12-07 06:28:37 -08003307 if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
Qu Wenruo390ed29b82021-04-14 16:42:15 +08003308 bio_ctrl->len_to_oe_boundary = U32_MAX;
3309 return 0;
3310 }
3311
Qu Wenruo390ed29b82021-04-14 16:42:15 +08003312 /* Ordered extent not yet created, so we're good */
Naohiro Aota939c7feb2021-08-11 15:37:08 +09003313 ordered = btrfs_lookup_ordered_extent(inode, file_offset);
Qu Wenruo390ed29b82021-04-14 16:42:15 +08003314 if (!ordered) {
3315 bio_ctrl->len_to_oe_boundary = U32_MAX;
3316 return 0;
3317 }
3318
3319 bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
3320 ordered->disk_bytenr + ordered->disk_num_bytes - logical);
3321 btrfs_put_ordered_extent(ordered);
3322 return 0;
3323}
3324
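/*
 * Allocate a new bio for bio_ctrl, set its starting sector, end_io handler
 * and length boundaries, and pick the block device needed for zone append or
 * cgroup writeback accounting.  On failure the bio is ended right away and
 * bio_ctrl->bio is left NULL.
 */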
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003325static int alloc_new_bio(struct btrfs_inode *inode,
3326 struct btrfs_bio_ctrl *bio_ctrl,
3327 struct writeback_control *wbc,
Bart Van Asschebf9486d2022-07-14 11:07:16 -07003328 blk_opf_t opf,
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003329 bio_end_io_t end_io_func,
Naohiro Aota939c7feb2021-08-11 15:37:08 +09003330 u64 disk_bytenr, u32 offset, u64 file_offset,
David Sterbacb3a12d2021-07-27 14:59:41 +02003331 enum btrfs_compression_type compress_type)
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003332{
3333 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3334 struct bio *bio;
3335 int ret;
3336
Qu Wenruoc3a3b192021-09-15 15:17:18 +08003337 bio = btrfs_bio_alloc(BIO_MAX_VECS);
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003338 /*
3339	 * For a compressed page range, the disk_bytenr is always the @disk_bytenr
3340	 * passed in, no matter whether we have added any range to a previous bio.
3341 */
David Sterbacb3a12d2021-07-27 14:59:41 +02003342 if (compress_type != BTRFS_COMPRESS_NONE)
Qu Wenruocd8e0cc2021-09-15 15:17:17 +08003343 bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003344 else
Qu Wenruocd8e0cc2021-09-15 15:17:17 +08003345 bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003346 bio_ctrl->bio = bio;
David Sterba0f070032021-07-27 15:11:53 +02003347 bio_ctrl->compress_type = compress_type;
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003348 bio->bi_end_io = end_io_func;
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003349 bio->bi_opf = opf;
Naohiro Aota939c7feb2021-08-11 15:37:08 +09003350 ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
3351 if (ret < 0)
3352 goto error;
Christoph Hellwig50f1cff2022-03-24 17:52:10 +01003353
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003354 if (wbc) {
Christoph Hellwig50f1cff2022-03-24 17:52:10 +01003355 /*
3356 * For Zone append we need the correct block_device that we are
3357 * going to write to set in the bio to be able to respect the
3358 * hardware limitation. Look it up here:
3359 */
3360 if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
3361 struct btrfs_device *dev;
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003362
Christoph Hellwig50f1cff2022-03-24 17:52:10 +01003363 dev = btrfs_zoned_get_device(fs_info, disk_bytenr,
3364 fs_info->sectorsize);
3365 if (IS_ERR(dev)) {
3366 ret = PTR_ERR(dev);
3367 goto error;
3368 }
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003369
Christoph Hellwig50f1cff2022-03-24 17:52:10 +01003370 bio_set_dev(bio, dev->bdev);
3371 } else {
3372 /*
3373 * Otherwise pick the last added device to support
3374 * cgroup writeback. For multi-device file systems this
3375 * means blk-cgroup policies have to always be set on the
3376 * last added/replaced device. This is a bit odd but has
3377 * been like that for a long time.
3378 */
3379 bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003380 }
Christoph Hellwig50f1cff2022-03-24 17:52:10 +01003381 wbc_init_bio(wbc, bio);
3382 } else {
3383 ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND);
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003384 }
3385 return 0;
3386error:
3387 bio_ctrl->bio = NULL;
3388 bio->bi_status = errno_to_blk_status(ret);
3389 bio_endio(bio);
3390 return ret;
3391}
3392
David Sterba4b81ba42017-06-06 19:14:26 +02003393/*
3394 * @opf: bio REQ_OP_* and REQ_* flags as one value
David Sterbab8b3d622017-06-12 19:50:41 +02003395 * @wbc: optional writeback control for io accounting
3396 * @page: page to add to the bio
Qu Wenruo0c64c332021-01-06 09:01:40 +08003397 * @disk_bytenr: logical bytenr where the write will be
3398 * @size: portion of page that we want to write to
David Sterbab8b3d622017-06-12 19:50:41 +02003399 * @pg_offset:	offset within @page where the range starts
 3400 * @bio_ctrl:	must be a valid pointer, newly allocated bios are stored in
David Sterba5c2b1fd2017-06-06 19:22:55 +02003401 *		bio_ctrl->bio along with their boundary and compress type
David Sterbab8b3d622017-06-12 19:50:41 +02003402 * @end_io_func:	end_io callback for new bio
 3403 * @compress_type:	compress type for the current bio
 3404 * @force_bio_submit: submit the bio in @bio_ctrl before adding this range,
David Sterbacb3a12d2021-07-27 14:59:41 +02003405 *		so that the range starts in a fresh bio
David Sterba4b81ba42017-06-06 19:14:26 +02003406 */
Bart Van Asschebf9486d2022-07-14 11:07:16 -07003407static int submit_extent_page(blk_opf_t opf,
Chris Masonda2f0f72015-07-02 13:57:22 -07003408 struct writeback_control *wbc,
Qu Wenruo390ed29b82021-04-14 16:42:15 +08003409 struct btrfs_bio_ctrl *bio_ctrl,
Qu Wenruo0c64c332021-01-06 09:01:40 +08003410 struct page *page, u64 disk_bytenr,
David Sterba6c5a4e22017-10-04 17:10:34 +02003411 size_t size, unsigned long pg_offset,
Chris Masonf1885912008-04-09 16:28:12 -04003412 bio_end_io_t end_io_func,
David Sterbacb3a12d2021-07-27 14:59:41 +02003413 enum btrfs_compression_type compress_type,
Filipe Manana005efed2015-09-14 09:09:31 +01003414 bool force_bio_submit)
Chris Masond1310b22008-01-24 16:13:08 -05003415{
3416 int ret = 0;
Naohiro Aotae1326f02021-02-04 19:21:58 +09003417 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003418 unsigned int cur = pg_offset;
Chris Masond1310b22008-01-24 16:13:08 -05003419
Qu Wenruo390ed29b82021-04-14 16:42:15 +08003420 ASSERT(bio_ctrl);
David Sterba5c2b1fd2017-06-06 19:22:55 +02003421
Qu Wenruo390ed29b82021-04-14 16:42:15 +08003422 ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
3423 pg_offset + size <= PAGE_SIZE);
Christoph Hellwig722c82a2022-06-03 09:11:03 +02003424 if (force_bio_submit)
3425 submit_one_bio(bio_ctrl);
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003426
3427 while (cur < pg_offset + size) {
3428 u32 offset = cur - pg_offset;
3429 int added;
3430
3431 /* Allocate new bio if needed */
3432 if (!bio_ctrl->bio) {
3433 ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
3434 end_io_func, disk_bytenr, offset,
Naohiro Aota939c7feb2021-08-11 15:37:08 +09003435 page_offset(page) + cur,
David Sterbacb3a12d2021-07-27 14:59:41 +02003436 compress_type);
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003437 if (ret < 0)
3438 return ret;
3439 }
3440 /*
3441 * We must go through btrfs_bio_add_page() to ensure each
3442 * page range won't cross various boundaries.
3443 */
David Sterbacb3a12d2021-07-27 14:59:41 +02003444 if (compress_type != BTRFS_COMPRESS_NONE)
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003445 added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
3446 size - offset, pg_offset + offset,
David Sterbacb3a12d2021-07-27 14:59:41 +02003447 compress_type);
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003448 else
3449 added = btrfs_bio_add_page(bio_ctrl, page,
3450 disk_bytenr + offset, size - offset,
David Sterbacb3a12d2021-07-27 14:59:41 +02003451 pg_offset + offset, compress_type);
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003452
3453 /* Metadata page range should never be split */
3454 if (!is_data_inode(&inode->vfs_inode))
3455 ASSERT(added == 0 || added == size - offset);
3456
3457 /* At least we added some page, update the account */
3458 if (wbc && added)
3459 wbc_account_cgroup_owner(wbc, page, added);
3460
3461 /* We have reached boundary, submit right now */
3462 if (added < size - offset) {
3463 /* The bio should contain some page(s) */
3464 ASSERT(bio_ctrl->bio->bi_iter.bi_size);
Christoph Hellwig722c82a2022-06-03 09:11:03 +02003465 submit_one_bio(bio_ctrl);
Chris Masond1310b22008-01-24 16:13:08 -05003466 }
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003467 cur += added;
Chris Masond1310b22008-01-24 16:13:08 -05003468 }
Qu Wenruoe0eefe02021-07-26 14:35:00 +08003469 return 0;
Chris Masond1310b22008-01-24 16:13:08 -05003470}
3471
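/*
 * Attach the metadata private data to @page for @eb.  For regular sectorsize
 * page->private points directly at the extent buffer; for subpage filesystems
 * a btrfs_subpage structure is attached instead, using @prealloc when
 * provided so no allocation happens under the mapping's private_lock.
 */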
Qu Wenruo760f9912021-01-26 16:33:48 +08003472static int attach_extent_buffer_page(struct extent_buffer *eb,
3473 struct page *page,
3474 struct btrfs_subpage *prealloc)
Josef Bacik4f2de97a2012-03-07 16:20:05 -05003475{
Qu Wenruo760f9912021-01-26 16:33:48 +08003476 struct btrfs_fs_info *fs_info = eb->fs_info;
3477 int ret = 0;
3478
Qu Wenruo0d01e242020-10-21 14:25:02 +08003479 /*
3480 * If the page is mapped to btree inode, we should hold the private
3481 * lock to prevent race.
3482 * For cloned or dummy extent buffers, their pages are not mapped and
3483 * will not race with any other ebs.
3484 */
3485 if (page->mapping)
3486 lockdep_assert_held(&page->mapping->private_lock);
3487
Qu Wenruofbca46e2022-01-13 13:22:09 +08003488 if (fs_info->nodesize >= PAGE_SIZE) {
Qu Wenruo760f9912021-01-26 16:33:48 +08003489 if (!PagePrivate(page))
3490 attach_page_private(page, eb);
3491 else
3492 WARN_ON(page->private != (unsigned long)eb);
3493 return 0;
3494 }
3495
3496 /* Already mapped, just free prealloc */
3497 if (PagePrivate(page)) {
3498 btrfs_free_subpage(prealloc);
3499 return 0;
3500 }
3501
3502 if (prealloc)
3503 /* Has preallocated memory for subpage */
3504 attach_page_private(page, prealloc);
Guoqing Jiangd1b89bc2020-06-01 21:47:45 -07003505 else
Qu Wenruo760f9912021-01-26 16:33:48 +08003506 /* Do new allocation to attach subpage */
3507 ret = btrfs_attach_subpage(fs_info, page,
3508 BTRFS_SUBPAGE_METADATA);
3509 return ret;
Josef Bacik4f2de97a2012-03-07 16:20:05 -05003510}
3511
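/*
 * Attach the btrfs-specific private data to a data page: a plain marker value
 * for regular sectorsize, or a btrfs_subpage structure for subpage
 * filesystems.  Calling it on an already set up page is a no-op;
 * clear_page_extent_mapped() below undoes it.
 */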
Qu Wenruo32443de2021-01-26 16:34:00 +08003512int set_page_extent_mapped(struct page *page)
Chris Masond1310b22008-01-24 16:13:08 -05003513{
Qu Wenruo32443de2021-01-26 16:34:00 +08003514 struct btrfs_fs_info *fs_info;
3515
3516 ASSERT(page->mapping);
3517
3518 if (PagePrivate(page))
3519 return 0;
3520
3521 fs_info = btrfs_sb(page->mapping->host->i_sb);
3522
Qu Wenruofbca46e2022-01-13 13:22:09 +08003523 if (btrfs_is_subpage(fs_info, page))
Qu Wenruo32443de2021-01-26 16:34:00 +08003524 return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
3525
3526 attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
3527 return 0;
3528}
3529
3530void clear_page_extent_mapped(struct page *page)
3531{
3532 struct btrfs_fs_info *fs_info;
3533
3534 ASSERT(page->mapping);
3535
Guoqing Jiangd1b89bc2020-06-01 21:47:45 -07003536 if (!PagePrivate(page))
Qu Wenruo32443de2021-01-26 16:34:00 +08003537 return;
3538
3539 fs_info = btrfs_sb(page->mapping->host->i_sb);
Qu Wenruofbca46e2022-01-13 13:22:09 +08003540 if (btrfs_is_subpage(fs_info, page))
Qu Wenruo32443de2021-01-26 16:34:00 +08003541 return btrfs_detach_subpage(fs_info, page);
3542
3543 detach_page_private(page);
Chris Masond1310b22008-01-24 16:13:08 -05003544}
3545
Miao Xie125bac012013-07-25 19:22:37 +08003546static struct extent_map *
3547__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
Nikolay Borisov1a5ee1e2020-09-14 12:37:06 +03003548 u64 start, u64 len, struct extent_map **em_cached)
Miao Xie125bac012013-07-25 19:22:37 +08003549{
3550 struct extent_map *em;
3551
3552 if (em_cached && *em_cached) {
3553 em = *em_cached;
Filipe Mananacbc0e922014-02-25 14:15:12 +00003554 if (extent_map_in_tree(em) && start >= em->start &&
Miao Xie125bac012013-07-25 19:22:37 +08003555 start < extent_map_end(em)) {
Elena Reshetova490b54d2017-03-03 10:55:12 +02003556 refcount_inc(&em->refs);
Miao Xie125bac012013-07-25 19:22:37 +08003557 return em;
3558 }
3559
3560 free_extent_map(em);
3561 *em_cached = NULL;
3562 }
3563
Nikolay Borisov1a5ee1e2020-09-14 12:37:06 +03003564 em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
Filipe Mananac0347552022-02-03 15:36:42 +00003565 if (em_cached && !IS_ERR(em)) {
Miao Xie125bac012013-07-25 19:22:37 +08003566 BUG_ON(*em_cached);
Elena Reshetova490b54d2017-03-03 10:55:12 +02003567 refcount_inc(&em->refs);
Miao Xie125bac012013-07-25 19:22:37 +08003568 *em_cached = em;
3569 }
3570 return em;
3571}
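
/*
 * Illustrative sketch, not part of extent_io.c: the single-slot extent map
 * cache that __get_extent_map() above implements, reduced to standalone
 * userspace C.  The demo_* names are invented; only the shape of the logic
 * mirrors the function above: reuse the cached entry if it covers @start,
 * otherwise drop it and do a fresh lookup, keeping one reference for the
 * cache slot and one for the caller.
 */
#include <stdint.h>
#include <stdlib.h>

struct demo_em {
	uint64_t start;
	uint64_t len;
	int refs;
};

static void demo_em_put(struct demo_em *em)
{
	if (em && --em->refs == 0)
		free(em);
}

/* Stand-in for btrfs_get_extent(): pretend every mapping is a 4K block. */
static struct demo_em *demo_em_lookup(uint64_t start)
{
	struct demo_em *em = calloc(1, sizeof(*em));

	if (em) {
		em->start = start & ~4095ULL;
		em->len = 4096;
		em->refs = 1;		/* reference owned by the caller */
	}
	return em;
}

static struct demo_em *demo_get_cached(uint64_t start, struct demo_em **cached)
{
	struct demo_em *em = *cached;

	if (em) {
		if (start >= em->start && start < em->start + em->len) {
			em->refs++;	/* like refcount_inc(&em->refs) */
			return em;
		}
		demo_em_put(em);	/* cache miss: drop the old entry */
		*cached = NULL;
	}

	em = demo_em_lookup(start);
	if (em) {
		em->refs++;		/* extra reference for the cache slot */
		*cached = em;
	}
	return em;
}
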
Chris Masond1310b22008-01-24 16:13:08 -05003572/*
3573 * basic readpage implementation. Locked extent state structs are inserted
3574 * into the tree and removed when the IO is done (by the end_io
3575 * handlers).
Jeff Mahoney79787ea2012-03-12 16:03:00 +01003576 * XXX JDM: This needs looking at to ensure proper page locking
Liu Bobaf863b2016-07-11 10:39:07 -07003577 * return 0 on success, otherwise return error
Chris Masond1310b22008-01-24 16:13:08 -05003578 */
Christoph Hellwig7aab8b32022-04-15 16:33:24 +02003579static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
Qu Wenruo390ed29b82021-04-14 16:42:15 +08003580 struct btrfs_bio_ctrl *bio_ctrl,
Bart Van Asschebf9486d2022-07-14 11:07:16 -07003581 blk_opf_t read_flags, u64 *prev_em_start)
Chris Masond1310b22008-01-24 16:13:08 -05003582{
3583 struct inode *inode = page->mapping->host;
Qu Wenruo92082d42021-02-02 10:28:36 +08003584 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
Miao Xie4eee4fa2012-12-21 09:17:45 +00003585 u64 start = page_offset(page);
David Sterba8eec8292017-06-06 19:50:13 +02003586 const u64 end = start + PAGE_SIZE - 1;
Chris Masond1310b22008-01-24 16:13:08 -05003587 u64 cur = start;
3588 u64 extent_offset;
3589 u64 last_byte = i_size_read(inode);
3590 u64 block_start;
3591 u64 cur_end;
Chris Masond1310b22008-01-24 16:13:08 -05003592 struct extent_map *em;
Liu Bobaf863b2016-07-11 10:39:07 -07003593 int ret = 0;
David Sterba306e16c2011-04-19 14:29:38 +02003594 size_t pg_offset = 0;
Chris Masond1310b22008-01-24 16:13:08 -05003595 size_t iosize;
3596 size_t blocksize = inode->i_sb->s_blocksize;
David Sterbaf657a312020-02-05 19:09:42 +01003597 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
David Sterbaae6957e2020-02-05 19:09:30 +01003598
Qu Wenruo32443de2021-01-26 16:34:00 +08003599 ret = set_page_extent_mapped(page);
3600 if (ret < 0) {
3601 unlock_extent(tree, start, end);
Qu Wenruo92082d42021-02-02 10:28:36 +08003602 btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
3603 unlock_page(page);
Qu Wenruo32443de2021-01-26 16:34:00 +08003604 goto out;
3605 }
Chris Masond1310b22008-01-24 16:13:08 -05003606
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03003607 if (page->index == last_byte >> PAGE_SHIFT) {
Johannes Thumshirn70730172018-12-05 15:23:03 +01003608 size_t zero_offset = offset_in_page(last_byte);
Chris Masonc8b97812008-10-29 14:49:59 -04003609
3610 if (zero_offset) {
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03003611 iosize = PAGE_SIZE - zero_offset;
Ira Weinyd048b9c2021-05-04 18:40:07 -07003612 memzero_page(page, zero_offset, iosize);
Chris Masonc8b97812008-10-29 14:49:59 -04003613 }
3614 }
Qu Wenruo92082d42021-02-02 10:28:36 +08003615 begin_page_read(fs_info, page);
Chris Masond1310b22008-01-24 16:13:08 -05003616 while (cur <= end) {
Qu Wenruo4c37a792021-07-26 14:34:50 +08003617 unsigned long this_bio_flag = 0;
Filipe Manana005efed2015-09-14 09:09:31 +01003618 bool force_bio_submit = false;
Qu Wenruo0c64c332021-01-06 09:01:40 +08003619 u64 disk_bytenr;
Josef Bacikc8f2f242013-02-11 11:33:00 -05003620
Qu Wenruo6a404912021-09-27 15:21:47 +08003621 ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
Chris Masond1310b22008-01-24 16:13:08 -05003622 if (cur >= last_byte) {
Arne Jansen507903b2011-04-06 10:02:20 +00003623 struct extent_state *cached = NULL;
3624
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03003625 iosize = PAGE_SIZE - pg_offset;
Ira Weinyd048b9c2021-05-04 18:40:07 -07003626 memzero_page(page, pg_offset, iosize);
Chris Masond1310b22008-01-24 16:13:08 -05003627 set_extent_uptodate(tree, cur, cur + iosize - 1,
Arne Jansen507903b2011-04-06 10:02:20 +00003628 &cached, GFP_NOFS);
Filipe Manana7f042a82016-01-27 19:17:20 +00003629 unlock_extent_cached(tree, cur,
David Sterbae43bbe52017-12-12 21:43:52 +01003630 cur + iosize - 1, &cached);
Qu Wenruo92082d42021-02-02 10:28:36 +08003631 end_page_read(page, true, cur, iosize);
Chris Masond1310b22008-01-24 16:13:08 -05003632 break;
3633 }
Miao Xie125bac012013-07-25 19:22:37 +08003634 em = __get_extent_map(inode, page, pg_offset, cur,
Nikolay Borisov1a5ee1e2020-09-14 12:37:06 +03003635 end - cur + 1, em_cached);
Filipe Mananac0347552022-02-03 15:36:42 +00003636 if (IS_ERR(em)) {
Filipe Manana7f042a82016-01-27 19:17:20 +00003637 unlock_extent(tree, cur, end);
Qu Wenruo92082d42021-02-02 10:28:36 +08003638 end_page_read(page, false, cur, end + 1 - cur);
Filipe Mananabbf0ea72022-02-03 15:36:43 +00003639 ret = PTR_ERR(em);
Chris Masond1310b22008-01-24 16:13:08 -05003640 break;
3641 }
Chris Masond1310b22008-01-24 16:13:08 -05003642 extent_offset = cur - em->start;
3643 BUG_ON(extent_map_end(em) <= cur);
3644 BUG_ON(end < cur);
3645
David Sterba7f6ca7f2021-07-27 14:49:32 +02003646 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3647 this_bio_flag = em->compress_type;
Chris Masonc8b97812008-10-29 14:49:59 -04003648
Chris Masond1310b22008-01-24 16:13:08 -05003649 iosize = min(extent_map_end(em) - cur, end - cur + 1);
3650 cur_end = min(extent_map_end(em) - 1, end);
Qu Wenruofda28322013-02-26 08:10:22 +00003651 iosize = ALIGN(iosize, blocksize);
David Sterba2a5232a2021-07-27 14:47:09 +02003652 if (this_bio_flag != BTRFS_COMPRESS_NONE)
Qu Wenruo0c64c332021-01-06 09:01:40 +08003653 disk_bytenr = em->block_start;
Goldwyn Rodrigues949b3272020-09-15 10:41:40 -05003654 else
Qu Wenruo0c64c332021-01-06 09:01:40 +08003655 disk_bytenr = em->block_start + extent_offset;
Chris Masond1310b22008-01-24 16:13:08 -05003656 block_start = em->block_start;
Yan Zhengd899e052008-10-30 14:25:28 -04003657 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3658 block_start = EXTENT_MAP_HOLE;
Filipe Manana005efed2015-09-14 09:09:31 +01003659
3660 /*
3661 * If we have a file range that points to a compressed extent
Randy Dunlap260db432020-08-04 19:48:34 -07003662 * and it's followed by a consecutive file range that points
Filipe Manana005efed2015-09-14 09:09:31 +01003663 * to the same compressed extent (possibly with a different
3664 * offset and/or length, so it either points to the whole extent
3665 * or only part of it), we must make sure we do not submit a
3666 * single bio to populate the pages for the 2 ranges because
3667 * this makes the compressed extent read zero out the pages
3668 * belonging to the 2nd range. Imagine the following scenario:
3669 *
3670 * File layout
3671 * [0 - 8K] [8K - 24K]
3672 * | |
3673 * | |
3674 * points to extent X, points to extent X,
3675 * offset 4K, length of 8K offset 0, length 16K
3676 *
3677 * [extent X, compressed length = 4K uncompressed length = 16K]
3678 *
3679 * If the bio to read the compressed extent covers both ranges,
3680 * it will decompress extent X into the pages belonging to the
3681 * first range and then it will stop, zeroing out the remaining
3682 * pages that belong to the other range that points to extent X.
3683 * So here we make sure we submit 2 bios, one for the first
3684	 * range and another one for the second range. Both will target
3685 * the same physical extent from disk, but we can't currently
3686 * make the compressed bio endio callback populate the pages
3687 * for both ranges because each compressed bio is tightly
3688 * coupled with a single extent map, and each range can have
3689 * an extent map with a different offset value relative to the
3690 * uncompressed data of our extent and different lengths. This
3691 * is a corner case so we prioritize correctness over
3692 * non-optimal behavior (submitting 2 bios for the same extent).
3693 */
3694 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3695 prev_em_start && *prev_em_start != (u64)-1 &&
Filipe Manana8e928212019-02-14 15:17:20 +00003696 *prev_em_start != em->start)
Filipe Manana005efed2015-09-14 09:09:31 +01003697 force_bio_submit = true;
3698
3699 if (prev_em_start)
Filipe Manana8e928212019-02-14 15:17:20 +00003700 *prev_em_start = em->start;
Filipe Manana005efed2015-09-14 09:09:31 +01003701
Chris Masond1310b22008-01-24 16:13:08 -05003702 free_extent_map(em);
3703 em = NULL;
3704
3705 /* we've found a hole, just zero and go on */
3706 if (block_start == EXTENT_MAP_HOLE) {
Arne Jansen507903b2011-04-06 10:02:20 +00003707 struct extent_state *cached = NULL;
3708
Ira Weinyd048b9c2021-05-04 18:40:07 -07003709 memzero_page(page, pg_offset, iosize);
Chris Masond1310b22008-01-24 16:13:08 -05003710
3711 set_extent_uptodate(tree, cur, cur + iosize - 1,
Arne Jansen507903b2011-04-06 10:02:20 +00003712 &cached, GFP_NOFS);
Filipe Manana7f042a82016-01-27 19:17:20 +00003713 unlock_extent_cached(tree, cur,
David Sterbae43bbe52017-12-12 21:43:52 +01003714 cur + iosize - 1, &cached);
Qu Wenruo92082d42021-02-02 10:28:36 +08003715 end_page_read(page, true, cur, iosize);
Chris Masond1310b22008-01-24 16:13:08 -05003716 cur = cur + iosize;
David Sterba306e16c2011-04-19 14:29:38 +02003717 pg_offset += iosize;
Chris Masond1310b22008-01-24 16:13:08 -05003718 continue;
3719 }
3720 /* the get_extent function already copied into the page */
Chris Mason9655d292009-09-02 15:22:30 -04003721 if (test_range_bit(tree, cur, cur_end,
3722 EXTENT_UPTODATE, 1, NULL)) {
Filipe Manana7f042a82016-01-27 19:17:20 +00003723 unlock_extent(tree, cur, cur + iosize - 1);
Qu Wenruo92082d42021-02-02 10:28:36 +08003724 end_page_read(page, true, cur, iosize);
Chris Masond1310b22008-01-24 16:13:08 -05003725 cur = cur + iosize;
David Sterba306e16c2011-04-19 14:29:38 +02003726 pg_offset += iosize;
Chris Masond1310b22008-01-24 16:13:08 -05003727 continue;
3728 }
Chris Mason70dec802008-01-29 09:59:12 -05003729 /* we have an inline extent but it didn't get marked up
3730 * to date. Error out
3731 */
3732 if (block_start == EXTENT_MAP_INLINE) {
Filipe Manana7f042a82016-01-27 19:17:20 +00003733 unlock_extent(tree, cur, cur + iosize - 1);
Qu Wenruo92082d42021-02-02 10:28:36 +08003734 end_page_read(page, false, cur, iosize);
Chris Mason70dec802008-01-29 09:59:12 -05003735 cur = cur + iosize;
David Sterba306e16c2011-04-19 14:29:38 +02003736 pg_offset += iosize;
Chris Mason70dec802008-01-29 09:59:12 -05003737 continue;
3738 }
Chris Masond1310b22008-01-24 16:13:08 -05003739
David Sterba0ceb34b2020-02-05 19:09:28 +01003740 ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
Qu Wenruo390ed29b82021-04-14 16:42:15 +08003741 bio_ctrl, page, disk_bytenr, iosize,
Christoph Hellwig722c82a2022-06-03 09:11:03 +02003742 pg_offset, end_bio_extent_readpage,
3743 this_bio_flag, force_bio_submit);
Filipe Mananaad3fc7942022-02-03 15:36:44 +00003744 if (ret) {
Qu Wenruo10f7f6f2022-04-12 20:30:14 +08003745 /*
3746 * We have to unlock the remaining range, or the page
3747 * will never be unlocked.
3748 */
3749 unlock_extent(tree, cur, end);
3750 end_page_read(page, false, cur, end + 1 - cur);
Liu Bobaf863b2016-07-11 10:39:07 -07003751 goto out;
Josef Bacikedd33c92012-10-05 16:40:32 -04003752 }
Chris Masond1310b22008-01-24 16:13:08 -05003753 cur = cur + iosize;
David Sterba306e16c2011-04-19 14:29:38 +02003754 pg_offset += iosize;
Chris Masond1310b22008-01-24 16:13:08 -05003755 }
Dan Magenheimer90a887c2011-05-26 10:01:56 -06003756out:
Liu Bobaf863b2016-07-11 10:39:07 -07003757 return ret;
Chris Masond1310b22008-01-24 16:13:08 -05003758}
3759
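/*
 * Illustrative sketch, not part of extent_io.c: the "force a new bio"
 * decision explained in the long comment inside btrfs_do_readpage() above,
 * written as a standalone predicate.  The demo_* names and the sentinel are
 * invented for the example; the rule itself is the one the comment spells
 * out: a compressed range whose extent map starts at a different offset
 * than the previously tracked one must not share a bio with it.
 */
#include <stdbool.h>
#include <stdint.h>

#define DEMO_EM_START_UNSET ((uint64_t)-1)

static bool demo_force_bio_submit(bool compressed, uint64_t prev_em_start,
				  uint64_t cur_em_start)
{
	return compressed &&
	       prev_em_start != DEMO_EM_START_UNSET &&
	       prev_em_start != cur_em_start;
}
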
Linus Torvaldsfdaf9a52022-05-24 19:55:07 -07003760int btrfs_read_folio(struct file *file, struct folio *folio)
Christoph Hellwig7aab8b32022-04-15 16:33:24 +02003761{
Linus Torvaldsfdaf9a52022-05-24 19:55:07 -07003762 struct page *page = &folio->page;
Christoph Hellwig7aab8b32022-04-15 16:33:24 +02003763 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
3764 u64 start = page_offset(page);
3765 u64 end = start + PAGE_SIZE - 1;
3766 struct btrfs_bio_ctrl bio_ctrl = { 0 };
3767 int ret;
3768
3769 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
3770
3771 ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
3772 /*
3773 * If btrfs_do_readpage() failed we will want to submit the assembled
3774 * bio to do the cleanup.
3775 */
Christoph Hellwig722c82a2022-06-03 09:11:03 +02003776 submit_one_bio(&bio_ctrl);
Christoph Hellwig7aab8b32022-04-15 16:33:24 +02003777 return ret;
3778}
3779
David Sterbab6660e82020-02-05 19:09:40 +01003780static inline void contiguous_readpages(struct page *pages[], int nr_pages,
Qu Wenruo390ed29b82021-04-14 16:42:15 +08003781 u64 start, u64 end,
3782 struct extent_map **em_cached,
3783 struct btrfs_bio_ctrl *bio_ctrl,
3784 u64 *prev_em_start)
Miao Xie99740902013-07-25 19:22:36 +08003785{
Nikolay Borisov23d31bd2019-05-07 10:19:23 +03003786 struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
Miao Xie99740902013-07-25 19:22:36 +08003787 int index;
3788
David Sterbab272ae22020-02-05 19:09:33 +01003789 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
Miao Xie99740902013-07-25 19:22:36 +08003790
3791 for (index = 0; index < nr_pages; index++) {
Qu Wenruo390ed29b82021-04-14 16:42:15 +08003792 btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
Nikolay Borisov0f208812020-09-14 14:39:16 +03003793 REQ_RAHEAD, prev_em_start);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03003794 put_page(pages[index]);
Miao Xie99740902013-07-25 19:22:36 +08003795 }
3796}
3797
Chris Masond1310b22008-01-24 16:13:08 -05003798/*
Chris Mason40f76582014-05-21 13:35:51 -07003799 * helper for __extent_writepage, doing all of the delayed allocation setup.
3800 *
Nikolay Borisov5eaad972018-11-01 14:09:46 +02003801 * This returns 1 if the btrfs_run_delalloc_range() function did all the work required
Chris Mason40f76582014-05-21 13:35:51 -07003802 * to write the page (copy into inline extent). In this case the IO has
3803 * been started and the page is already unlocked.
3804 *
3805 * This returns 0 if all went well (page still locked)
3806 * This returns < 0 if there were errors (page still locked)
Chris Masond1310b22008-01-24 16:13:08 -05003807 */
Nikolay Borisovcd4c0bf942020-06-05 10:42:10 +03003808static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
Qu Wenruo83f1b682021-11-12 13:33:14 +08003809 struct page *page, struct writeback_control *wbc)
Chris Masond1310b22008-01-24 16:13:08 -05003810{
Qu Wenruo2749f7e2021-09-27 15:22:07 +08003811 const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
Qu Wenruocf3075f2021-09-27 15:21:44 +08003812 u64 delalloc_start = page_offset(page);
Chris Mason40f76582014-05-21 13:35:51 -07003813 u64 delalloc_to_write = 0;
Qu Wenruo83f1b682021-11-12 13:33:14 +08003814 /* How many pages are started by btrfs_run_delalloc_range() */
3815 unsigned long nr_written = 0;
Chris Mason40f76582014-05-21 13:35:51 -07003816 int ret;
3817 int page_started = 0;
3818
Qu Wenruo2749f7e2021-09-27 15:22:07 +08003819 while (delalloc_start < page_end) {
3820 u64 delalloc_end = page_end;
3821 bool found;
Chris Mason40f76582014-05-21 13:35:51 -07003822
Nikolay Borisovcd4c0bf942020-06-05 10:42:10 +03003823 found = find_lock_delalloc_range(&inode->vfs_inode, page,
Chris Mason40f76582014-05-21 13:35:51 -07003824 &delalloc_start,
Nikolay Borisov917aace2018-10-26 14:43:20 +03003825 &delalloc_end);
Lu Fengqi3522e902018-11-29 11:33:38 +08003826 if (!found) {
Chris Mason40f76582014-05-21 13:35:51 -07003827 delalloc_start = delalloc_end + 1;
3828 continue;
3829 }
Nikolay Borisovcd4c0bf942020-06-05 10:42:10 +03003830 ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
Qu Wenruo83f1b682021-11-12 13:33:14 +08003831 delalloc_end, &page_started, &nr_written, wbc);
Chris Mason40f76582014-05-21 13:35:51 -07003832 if (ret) {
Qu Wenruo963e4db2021-07-26 14:35:07 +08003833 btrfs_page_set_error(inode->root->fs_info, page,
3834 page_offset(page), PAGE_SIZE);
Qu Wenruo7361b4ae2021-07-28 14:05:05 +08003835 return ret;
Chris Mason40f76582014-05-21 13:35:51 -07003836 }
3837 /*
Kirill A. Shutemovea1754a2016-04-01 15:29:48 +03003838 * delalloc_end is already one less than the total length, so
3839 * we don't subtract one from PAGE_SIZE
Chris Mason40f76582014-05-21 13:35:51 -07003840 */
3841 delalloc_to_write += (delalloc_end - delalloc_start +
Kirill A. Shutemovea1754a2016-04-01 15:29:48 +03003842 PAGE_SIZE) >> PAGE_SHIFT;
Chris Mason40f76582014-05-21 13:35:51 -07003843 delalloc_start = delalloc_end + 1;
3844 }
3845 if (wbc->nr_to_write < delalloc_to_write) {
3846 int thresh = 8192;
3847
3848 if (delalloc_to_write < thresh * 2)
3849 thresh = delalloc_to_write;
3850 wbc->nr_to_write = min_t(u64, delalloc_to_write,
3851 thresh);
3852 }
3853
Qu Wenruo83f1b682021-11-12 13:33:14 +08003854	/* Did btrfs_run_delalloc_range() already unlock and start the IO? */
Chris Mason40f76582014-05-21 13:35:51 -07003855 if (page_started) {
3856 /*
Qu Wenruo83f1b682021-11-12 13:33:14 +08003857 * We've unlocked the page, so we can't update the mapping's
3858 * writeback index, just update nr_to_write.
Chris Mason40f76582014-05-21 13:35:51 -07003859 */
Qu Wenruo83f1b682021-11-12 13:33:14 +08003860 wbc->nr_to_write -= nr_written;
Chris Mason40f76582014-05-21 13:35:51 -07003861 return 1;
3862 }
3863
Nikolay Borisovb69d1ee2020-07-16 18:17:19 +03003864 return 0;
Chris Mason40f76582014-05-21 13:35:51 -07003865}
3866
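/*
 * Illustrative sketch, not part of extent_io.c: the page accounting done by
 * writepage_delalloc() above.  Because delalloc_end is an inclusive offset
 * (one less than start + length), adding PAGE_SIZE before the shift is the
 * usual round-up division of the byte length.  The 4K page constants below
 * are assumed purely for the worked example.
 */
#include <assert.h>
#include <stdint.h>

#define DEMO_PAGE_SIZE	4096ULL
#define DEMO_PAGE_SHIFT	12

static uint64_t demo_delalloc_pages(uint64_t delalloc_start, uint64_t delalloc_end)
{
	/* Same as DIV_ROUND_UP(delalloc_end - delalloc_start + 1, DEMO_PAGE_SIZE). */
	return (delalloc_end - delalloc_start + DEMO_PAGE_SIZE) >> DEMO_PAGE_SHIFT;
}

int main(void)
{
	/* A 12K range [0, 12287] accounts for exactly three pages. */
	assert(demo_delalloc_pages(0, 12287) == 3);
	/* A single page [4096, 8191] accounts for one page. */
	assert(demo_delalloc_pages(4096, 8191) == 1);
	return 0;
}
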
3867/*
Qu Wenruoc5ef5c62021-05-31 16:50:50 +08003868 * Find the first byte we need to write.
3869 *
3870 * For subpage, one page can contain several sectors, and
3871 * __extent_writepage_io() will just grab all extent maps in the page
3872 * range and try to submit all non-inline/non-compressed extents.
3873 *
3874	 * This is a big problem for subpage; we shouldn't re-submit already written
3875 * data at all.
3876 * This function will lookup subpage dirty bit to find which range we really
3877 * need to submit.
3878 *
3879 * Return the next dirty range in [@start, @end).
3880 * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
3881 */
3882static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
3883 struct page *page, u64 *start, u64 *end)
3884{
3885 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
Qu Wenruo72a69cd2021-08-17 17:38:52 +08003886 struct btrfs_subpage_info *spi = fs_info->subpage_info;
Qu Wenruoc5ef5c62021-05-31 16:50:50 +08003887 u64 orig_start = *start;
3888 /* Declare as unsigned long so we can use bitmap ops */
Qu Wenruoc5ef5c62021-05-31 16:50:50 +08003889 unsigned long flags;
Qu Wenruo72a69cd2021-08-17 17:38:52 +08003890 int range_start_bit;
Qu Wenruoc5ef5c62021-05-31 16:50:50 +08003891 int range_end_bit;
3892
3893 /*
3894 * For regular sector size == page size case, since one page only
3895 * contains one sector, we return the page offset directly.
3896 */
Qu Wenruofbca46e2022-01-13 13:22:09 +08003897 if (!btrfs_is_subpage(fs_info, page)) {
Qu Wenruoc5ef5c62021-05-31 16:50:50 +08003898 *start = page_offset(page);
3899 *end = page_offset(page) + PAGE_SIZE;
3900 return;
3901 }
3902
Qu Wenruo72a69cd2021-08-17 17:38:52 +08003903 range_start_bit = spi->dirty_offset +
3904 (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
3905
Qu Wenruoc5ef5c62021-05-31 16:50:50 +08003906 /* We should have the page locked, but just in case */
3907 spin_lock_irqsave(&subpage->lock, flags);
Qu Wenruo72a69cd2021-08-17 17:38:52 +08003908 bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
3909 spi->dirty_offset + spi->bitmap_nr_bits);
Qu Wenruoc5ef5c62021-05-31 16:50:50 +08003910 spin_unlock_irqrestore(&subpage->lock, flags);
3911
Qu Wenruo72a69cd2021-08-17 17:38:52 +08003912 range_start_bit -= spi->dirty_offset;
3913 range_end_bit -= spi->dirty_offset;
3914
Qu Wenruoc5ef5c62021-05-31 16:50:50 +08003915 *start = page_offset(page) + range_start_bit * fs_info->sectorsize;
3916 *end = page_offset(page) + range_end_bit * fs_info->sectorsize;
3917}
3918
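/*
 * Illustrative sketch, not part of extent_io.c: the bit-to-byte conversion
 * performed by find_next_dirty_byte() above, using a plain 16-bit mask as
 * the per-page dirty bitmap (e.g. 16 sectors of 4K in a 64K page).  The
 * demo_* names and constants are invented for the example.
 */
#include <assert.h>
#include <stdint.h>

#define DEMO_SECTORSIZE	4096ULL

/* Find the next dirty [start, end) byte range at or after *start. */
static void demo_next_dirty_range(uint64_t page_offset, uint16_t dirty_bitmap,
				  uint64_t *start, uint64_t *end)
{
	unsigned int bit = (unsigned int)((*start - page_offset) / DEMO_SECTORSIZE);
	unsigned int first, last;

	for (first = bit; first < 16; first++)
		if (dirty_bitmap & (1U << first))
			break;
	for (last = first; last < 16; last++)
		if (!(dirty_bitmap & (1U << last)))
			break;

	*start = page_offset + first * DEMO_SECTORSIZE;
	*end = page_offset + last * DEMO_SECTORSIZE;
}

int main(void)
{
	uint64_t start = 65536, end = 0;

	/* Sectors 2 and 3 dirty: bytes [8K, 16K) within the page. */
	demo_next_dirty_range(65536, 0x000c, &start, &end);
	assert(start == 65536 + 8192 && end == 65536 + 16384);
	return 0;
}
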
3919/*
Chris Mason40f76582014-05-21 13:35:51 -07003920 * helper for __extent_writepage. This calls the writepage start hooks,
3921 * and does the loop to map the page into extents and bios.
3922 *
3923 * We return 1 if the IO is started and the page is unlocked,
3924 * 0 if all went well (page still locked)
3925 * < 0 if there were errors (page still locked)
3926 */
Nikolay Borisovd4580fe2020-06-03 08:55:33 +03003927static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
Chris Mason40f76582014-05-21 13:35:51 -07003928 struct page *page,
3929 struct writeback_control *wbc,
3930 struct extent_page_data *epd,
3931 loff_t i_size,
David Sterba57e5ffe2019-10-29 18:28:55 +01003932 int *nr_ret)
Chris Mason40f76582014-05-21 13:35:51 -07003933{
Qu Wenruo6bc56362021-01-06 09:01:41 +08003934 struct btrfs_fs_info *fs_info = inode->root->fs_info;
Qu Wenruoa129ffb2021-07-27 13:41:32 +08003935 u64 cur = page_offset(page);
3936 u64 end = cur + PAGE_SIZE - 1;
Chris Masond1310b22008-01-24 16:13:08 -05003937 u64 extent_offset;
Chris Masond1310b22008-01-24 16:13:08 -05003938 u64 block_start;
Chris Masond1310b22008-01-24 16:13:08 -05003939 struct extent_map *em;
Qu Wenruo44e58012022-04-12 20:30:15 +08003940 int saved_ret = 0;
Chris Mason40f76582014-05-21 13:35:51 -07003941 int ret = 0;
3942 int nr = 0;
Bart Van Asschebf9486d2022-07-14 11:07:16 -07003943 enum req_op op = REQ_OP_WRITE;
3944 const blk_opf_t write_flags = wbc_to_write_flags(wbc);
Qu Wenruo44e58012022-04-12 20:30:15 +08003945 bool has_error = false;
Chris Mason40f76582014-05-21 13:35:51 -07003946 bool compressed;
Chris Masond1310b22008-01-24 16:13:08 -05003947
Qu Wenruoa129ffb2021-07-27 13:41:32 +08003948 ret = btrfs_writepage_cow_fixup(page);
Nikolay Borisovd75855b2018-11-01 14:09:47 +02003949 if (ret) {
3950 /* Fixup worker will requeue */
Josef Bacik5ab58052020-01-21 11:51:43 -05003951 redirty_page_for_writepage(wbc, page);
Nikolay Borisovd75855b2018-11-01 14:09:47 +02003952 unlock_page(page);
3953 return 1;
Chris Mason247e7432008-07-17 12:53:51 -04003954 }
3955
Chris Mason11c83492009-04-20 15:50:09 -04003956 /*
3957 * we don't want to touch the inode after unlocking the page,
3958 * so we update the mapping writeback index now
3959 */
David Sterba572f3da2021-07-27 12:45:11 +02003960 wbc->nr_to_write--;
Chris Mason771ed682008-11-06 22:02:51 -05003961
Chris Masond1310b22008-01-24 16:13:08 -05003962 while (cur <= end) {
Qu Wenruo0c64c332021-01-06 09:01:40 +08003963 u64 disk_bytenr;
Chris Mason40f76582014-05-21 13:35:51 -07003964 u64 em_end;
Qu Wenruoc5ef5c62021-05-31 16:50:50 +08003965 u64 dirty_range_start = cur;
3966 u64 dirty_range_end;
Qu Wenruo6bc56362021-01-06 09:01:41 +08003967 u32 iosize;
David Sterba58409ed2016-05-04 11:46:10 +02003968
Chris Mason40f76582014-05-21 13:35:51 -07003969 if (cur >= i_size) {
Qu Wenruo38a39ac72021-04-08 20:32:27 +08003970 btrfs_writepage_endio_finish_ordered(inode, page, cur,
David Sterba25c12522021-07-26 14:15:08 +02003971 end, true);
Qu Wenruocc1d0d92021-07-26 14:34:58 +08003972 /*
3973 * This range is beyond i_size, thus we don't need to
3974 * bother writing back.
3975 * But we still need to clear the dirty subpage bit, or
3976 * the next time the page gets dirtied, we will try to
3977		 * write back the sectors with subpage dirty bits,
3978		 * causing writeback without an ordered extent.
3979 */
3980 btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
Chris Masond1310b22008-01-24 16:13:08 -05003981 break;
3982 }
Qu Wenruoc5ef5c62021-05-31 16:50:50 +08003983
3984 find_next_dirty_byte(fs_info, page, &dirty_range_start,
3985 &dirty_range_end);
3986 if (cur < dirty_range_start) {
3987 cur = dirty_range_start;
3988 continue;
3989 }
3990
Nikolay Borisovd4580fe2020-06-03 08:55:33 +03003991 em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
Filipe Mananac0347552022-02-03 15:36:42 +00003992 if (IS_ERR(em)) {
Qu Wenruoc5ef5c62021-05-31 16:50:50 +08003993 btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
Filipe Manana61391d52014-05-09 17:17:40 +01003994 ret = PTR_ERR_OR_ZERO(em);
Qu Wenruo44e58012022-04-12 20:30:15 +08003995 has_error = true;
3996 if (!saved_ret)
3997 saved_ret = ret;
Chris Masond1310b22008-01-24 16:13:08 -05003998 break;
3999 }
4000
4001 extent_offset = cur - em->start;
Chris Mason40f76582014-05-21 13:35:51 -07004002 em_end = extent_map_end(em);
Qu Wenruo6bc56362021-01-06 09:01:41 +08004003 ASSERT(cur <= em_end);
4004 ASSERT(cur < end);
4005 ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
4006 ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
Chris Masond1310b22008-01-24 16:13:08 -05004007 block_start = em->block_start;
Chris Masonc8b97812008-10-29 14:49:59 -04004008 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
Qu Wenruo6bc56362021-01-06 09:01:41 +08004009 disk_bytenr = em->block_start + extent_offset;
4010
Qu Wenruoc5ef5c62021-05-31 16:50:50 +08004011 /*
4012 * Note that em_end from extent_map_end() and dirty_range_end from
4013		 * find_next_dirty_byte() are both exclusive
4014 */
4015 iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
Naohiro Aotad8e3fb12021-02-04 19:22:05 +09004016
Johannes Thumshirne380adf2021-05-19 00:40:27 +09004017 if (btrfs_use_zone_append(inode, em->block_start))
Bart Van Asschebf9486d2022-07-14 11:07:16 -07004018 op = REQ_OP_ZONE_APPEND;
Naohiro Aotad8e3fb12021-02-04 19:22:05 +09004019
Chris Masond1310b22008-01-24 16:13:08 -05004020 free_extent_map(em);
4021 em = NULL;
4022
Chris Masonc8b97812008-10-29 14:49:59 -04004023 /*
4024 * compressed and inline extents are written through other
4025 * paths in the FS
4026 */
4027 if (compressed || block_start == EXTENT_MAP_HOLE ||
Chris Masond1310b22008-01-24 16:13:08 -05004028 block_start == EXTENT_MAP_INLINE) {
Omar Sandovalc8b04032019-12-02 17:34:24 -08004029 if (compressed)
Chris Masonc8b97812008-10-29 14:49:59 -04004030 nr++;
Omar Sandovalc8b04032019-12-02 17:34:24 -08004031 else
Qu Wenruo38a39ac72021-04-08 20:32:27 +08004032 btrfs_writepage_endio_finish_ordered(inode,
David Sterba25c12522021-07-26 14:15:08 +02004033 page, cur, cur + iosize - 1, true);
Qu Wenruocc1d0d92021-07-26 14:34:58 +08004034 btrfs_page_clear_dirty(fs_info, page, cur, iosize);
Chris Masonc8b97812008-10-29 14:49:59 -04004035 cur += iosize;
Chris Masond1310b22008-01-24 16:13:08 -05004036 continue;
4037 }
Chris Masonc8b97812008-10-29 14:49:59 -04004038
Qu Wenruod2a91062021-05-31 16:50:49 +08004039 btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
David Sterba58409ed2016-05-04 11:46:10 +02004040 if (!PageWriteback(page)) {
Nikolay Borisovd4580fe2020-06-03 08:55:33 +03004041 btrfs_err(inode->root->fs_info,
David Sterba58409ed2016-05-04 11:46:10 +02004042 "page %lu not writeback, cur %llu end %llu",
4043 page->index, cur, end);
Chris Masond1310b22008-01-24 16:13:08 -05004044 }
David Sterba58409ed2016-05-04 11:46:10 +02004045
Qu Wenruoc5ef5c62021-05-31 16:50:50 +08004046 /*
4047 * Although the PageDirty bit is cleared before entering this
4048		 * function, the subpage dirty bit is not cleared.
4049		 * So clear the subpage dirty bit here so that next time we won't
4050		 * submit the page for a range already written to disk.
4051 */
4052 btrfs_page_clear_dirty(fs_info, page, cur, iosize);
4053
Bart Van Asschebf9486d2022-07-14 11:07:16 -07004054 ret = submit_extent_page(op | write_flags, wbc,
Qu Wenruo390ed29b82021-04-14 16:42:15 +08004055 &epd->bio_ctrl, page,
Naohiro Aotad8e3fb12021-02-04 19:22:05 +09004056 disk_bytenr, iosize,
Qu Wenruo390ed29b82021-04-14 16:42:15 +08004057 cur - page_offset(page),
David Sterba58409ed2016-05-04 11:46:10 +02004058 end_bio_extent_writepage,
Christoph Hellwig722c82a2022-06-03 09:11:03 +02004059 0, false);
Takafumi Kubotafe01aa62017-02-09 17:24:33 +09004060 if (ret) {
Qu Wenruo44e58012022-04-12 20:30:15 +08004061 has_error = true;
4062 if (!saved_ret)
4063 saved_ret = ret;
4064
Qu Wenruoc5ef5c62021-05-31 16:50:50 +08004065 btrfs_page_set_error(fs_info, page, cur, iosize);
Takafumi Kubotafe01aa62017-02-09 17:24:33 +09004066 if (PageWriteback(page))
Qu Wenruoc5ef5c62021-05-31 16:50:50 +08004067 btrfs_page_clear_writeback(fs_info, page, cur,
4068 iosize);
Takafumi Kubotafe01aa62017-02-09 17:24:33 +09004069 }
Chris Mason7f3c74f2008-07-18 12:01:11 -04004070
Qu Wenruo6bc56362021-01-06 09:01:41 +08004071 cur += iosize;
Chris Masond1310b22008-01-24 16:13:08 -05004072 nr++;
4073 }
Qu Wenruocc1d0d92021-07-26 14:34:58 +08004074 /*
4075 * If we finish without problem, we should not only clear page dirty,
4076 * but also empty subpage dirty bits
4077 */
Qu Wenruo44e58012022-04-12 20:30:15 +08004078 if (!has_error)
Qu Wenruocc1d0d92021-07-26 14:34:58 +08004079 btrfs_page_assert_not_dirty(fs_info, page);
Qu Wenruo44e58012022-04-12 20:30:15 +08004080 else
4081 ret = saved_ret;
Chris Mason40f76582014-05-21 13:35:51 -07004082 *nr_ret = nr;
Chris Mason40f76582014-05-21 13:35:51 -07004083 return ret;
4084}
4085
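/*
 * Illustrative sketch, not part of extent_io.c: how __extent_writepage_io()
 * above sizes one submission.  The extent map end, the page end + 1 and the
 * dirty range end are all exclusive bounds, so the chunk written at @cur is
 * capped by whichever of the three ends first.  The numbers below are
 * invented for the worked example.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t demo_min3(uint64_t a, uint64_t b, uint64_t c)
{
	uint64_t m = a < b ? a : b;

	return m < c ? m : c;
}

int main(void)
{
	uint64_t cur = 65536;			/* current file offset */
	uint64_t em_end = 131072;		/* extent map ends at 128K */
	uint64_t page_end = 65536 + 65536;	/* 64K page, exclusive end */
	uint64_t dirty_range_end = 65536 + 16384;	/* only 16K is dirty */
	uint64_t iosize = demo_min3(em_end, page_end, dirty_range_end) - cur;

	assert(iosize == 16384);	/* submit just the dirty 16K */
	return 0;
}
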
4086/*
4087 * the writepage semantics are similar to regular writepage. extent
4088 * records are inserted to lock ranges in the tree, and as dirty areas
4089 * are found, they are marked writeback. Then the lock bits are removed
4090 * and the end_io handler clears the writeback ranges
Qu Wenruo30659762019-03-20 14:27:42 +08004091 *
4092 * Return 0 if everything goes well.
4093 * Return <0 for error.
Chris Mason40f76582014-05-21 13:35:51 -07004094 */
4095static int __extent_writepage(struct page *page, struct writeback_control *wbc,
David Sterbaaab6e9e2017-11-30 18:00:02 +01004096 struct extent_page_data *epd)
Chris Mason40f76582014-05-21 13:35:51 -07004097{
Matthew Wilcox (Oracle)8e1dec82022-02-09 20:21:29 +00004098 struct folio *folio = page_folio(page);
Chris Mason40f76582014-05-21 13:35:51 -07004099 struct inode *inode = page->mapping->host;
Qu Wenruoe55a0de2021-09-27 15:22:05 +08004100 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
Qu Wenruocf3075f2021-09-27 15:21:44 +08004101 const u64 page_start = page_offset(page);
4102 const u64 page_end = page_start + PAGE_SIZE - 1;
Chris Mason40f76582014-05-21 13:35:51 -07004103 int ret;
4104 int nr = 0;
Omar Sandovaleb70d222019-12-02 17:34:20 -08004105 size_t pg_offset;
Chris Mason40f76582014-05-21 13:35:51 -07004106 loff_t i_size = i_size_read(inode);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03004107 unsigned long end_index = i_size >> PAGE_SHIFT;
Chris Mason40f76582014-05-21 13:35:51 -07004108
Chris Mason40f76582014-05-21 13:35:51 -07004109 trace___extent_writepage(page, inode, wbc);
4110
4111 WARN_ON(!PageLocked(page));
4112
Qu Wenruo963e4db2021-07-26 14:35:07 +08004113 btrfs_page_clear_error(btrfs_sb(inode->i_sb), page,
4114 page_offset(page), PAGE_SIZE);
Chris Mason40f76582014-05-21 13:35:51 -07004115
Johannes Thumshirn70730172018-12-05 15:23:03 +01004116 pg_offset = offset_in_page(i_size);
Chris Mason40f76582014-05-21 13:35:51 -07004117 if (page->index > end_index ||
4118 (page->index == end_index && !pg_offset)) {
Matthew Wilcox (Oracle)8e1dec82022-02-09 20:21:29 +00004119 folio_invalidate(folio, 0, folio_size(folio));
4120 folio_unlock(folio);
Chris Mason40f76582014-05-21 13:35:51 -07004121 return 0;
4122 }
4123
David Sterba21a89352022-06-01 13:47:54 +02004124 if (page->index == end_index)
Ira Weinyd048b9c2021-05-04 18:40:07 -07004125 memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
Chris Mason40f76582014-05-21 13:35:51 -07004126
Qu Wenruo32443de2021-01-26 16:34:00 +08004127 ret = set_page_extent_mapped(page);
4128 if (ret < 0) {
4129 SetPageError(page);
4130 goto done;
4131 }
Chris Mason40f76582014-05-21 13:35:51 -07004132
Nikolay Borisov7789a552018-11-08 10:18:06 +02004133 if (!epd->extent_locked) {
Qu Wenruo83f1b682021-11-12 13:33:14 +08004134 ret = writepage_delalloc(BTRFS_I(inode), page, wbc);
Nikolay Borisov7789a552018-11-08 10:18:06 +02004135 if (ret == 1)
Omar Sandoval169d2c82019-12-02 17:34:21 -08004136 return 0;
Nikolay Borisov7789a552018-11-08 10:18:06 +02004137 if (ret)
4138 goto done;
4139 }
Chris Mason40f76582014-05-21 13:35:51 -07004140
Nikolay Borisovd4580fe2020-06-03 08:55:33 +03004141 ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
Qu Wenruo83f1b682021-11-12 13:33:14 +08004142 &nr);
Chris Mason40f76582014-05-21 13:35:51 -07004143 if (ret == 1)
Omar Sandoval169d2c82019-12-02 17:34:21 -08004144 return 0;
Chris Mason40f76582014-05-21 13:35:51 -07004145
4146done:
Chris Masond1310b22008-01-24 16:13:08 -05004147 if (nr == 0) {
4148 /* make sure the mapping tag for page dirty gets cleared */
4149 set_page_writeback(page);
4150 end_page_writeback(page);
4151 }
Qu Wenruo963e4db2021-07-26 14:35:07 +08004152 /*
4153 * Here we used to have a check for PageError() and then set @ret and
4154 * call end_extent_writepage().
4155 *
4156 * But in fact setting @ret here will cause different error paths
4157 * between subpage and regular sectorsize.
4158 *
4159 * For regular page size, we never submit current page, but only add
4160 * current page to current bio.
4161 * The bio submission can only happen in next page.
4162 * Thus if we hit the PageError() branch, @ret is already set to
4163 * non-zero value and will not get updated for regular sectorsize.
4164 *
4165 * But for subpage case, it's possible we submit part of current page,
4166 * thus can get PageError() set by submitted bio of the same page,
4167 * while our @ret is still 0.
4168 *
4169 * So here we unify the behavior and don't set @ret.
4170	 * Errors can still be properly passed to the higher layers, as the page
4171	 * will be flagged with an error; here we just don't handle the IO failure.
4172 *
4173 * NOTE: This is just a hotfix for subpage.
4174 * The root fix will be properly ending ordered extent when we hit
4175 * an error during writeback.
4176 *
4177 * But that needs a bigger refactoring, as we not only need to grab the
4178 * submitted OE, but also need to know exactly at which bytenr we hit
4179 * the error.
4180 * Currently the full page based __extent_writepage_io() is not
4181 * capable of that.
4182 */
4183 if (PageError(page))
Qu Wenruocf3075f2021-09-27 15:21:44 +08004184 end_extent_writepage(page, ret, page_start, page_end);
Qu Wenruoe55a0de2021-09-27 15:22:05 +08004185 if (epd->extent_locked) {
4186 /*
4187 * If epd->extent_locked, it's from extent_write_locked_range(),
4188 * the page can either be locked by lock_page() or
4189 * process_one_page().
4190 * Let btrfs_page_unlock_writer() handle both cases.
4191 */
4192 ASSERT(wbc);
4193 btrfs_page_unlock_writer(fs_info, page, wbc->range_start,
4194 wbc->range_end + 1 - wbc->range_start);
4195 } else {
4196 unlock_page(page);
4197 }
Qu Wenruo30659762019-03-20 14:27:42 +08004198 ASSERT(ret <= 0);
Chris Mason40f76582014-05-21 13:35:51 -07004199 return ret;
Chris Masond1310b22008-01-24 16:13:08 -05004200}
4201
Josef Bacikfd8b2b62013-04-24 16:41:19 -04004202void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004203{
NeilBrown74316202014-07-07 15:16:04 +10004204 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
4205 TASK_UNINTERRUPTIBLE);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004206}
4207
Filipe Manana18dfa712019-09-11 17:42:00 +01004208static void end_extent_buffer_writeback(struct extent_buffer *eb)
4209{
4210 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
4211 smp_mb__after_atomic();
4212 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
4213}
4214
Qu Wenruo2e3c2512019-03-20 14:27:46 +08004215/*
Qu Wenruoa3efb2f2020-10-21 14:24:49 +08004216 * Lock extent buffer status and pages for writeback.
Qu Wenruo2e3c2512019-03-20 14:27:46 +08004217 *
Qu Wenruoa3efb2f2020-10-21 14:24:49 +08004218 * May try to flush write bio if we can't get the lock.
4219 *
4220 * Return 0 if the extent buffer doesn't need to be submitted.
4221 * (E.g. the extent buffer is not dirty)
4222	 * Return >0 if the extent buffer is submitted to a bio.
4223 * Return <0 if something went wrong, no page is locked.
Qu Wenruo2e3c2512019-03-20 14:27:46 +08004224 */
David Sterba9df76fb2019-03-20 11:21:41 +01004225static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
Chris Mason0e378df2014-05-19 20:55:27 -07004226 struct extent_page_data *epd)
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004227{
David Sterba9df76fb2019-03-20 11:21:41 +01004228 struct btrfs_fs_info *fs_info = eb->fs_info;
Qu Wenruoc9583ad2022-04-12 20:30:13 +08004229 int i, num_pages;
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004230 int flush = 0;
4231 int ret = 0;
4232
4233 if (!btrfs_try_tree_write_lock(eb)) {
Christoph Hellwig9845e5d2022-06-03 09:11:02 +02004234 submit_write_bio(epd, 0);
Qu Wenruo2e3c2512019-03-20 14:27:46 +08004235 flush = 1;
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004236 btrfs_tree_lock(eb);
4237 }
4238
4239 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
4240 btrfs_tree_unlock(eb);
4241 if (!epd->sync_io)
4242 return 0;
4243 if (!flush) {
Christoph Hellwig9845e5d2022-06-03 09:11:02 +02004244 submit_write_bio(epd, 0);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004245 flush = 1;
4246 }
Chris Masona098d8e82012-03-21 12:09:56 -04004247 while (1) {
4248 wait_on_extent_buffer_writeback(eb);
4249 btrfs_tree_lock(eb);
4250 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
4251 break;
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004252 btrfs_tree_unlock(eb);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004253 }
4254 }
4255
Josef Bacik51561ff2012-07-20 16:25:24 -04004256 /*
4257	 * We need to do this to prevent races with anyone who checks if the eb is
4258 * under IO since we can end up having no IO bits set for a short period
4259 * of time.
4260 */
4261 spin_lock(&eb->refs_lock);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004262 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
4263 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
Josef Bacik51561ff2012-07-20 16:25:24 -04004264 spin_unlock(&eb->refs_lock);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004265 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
Nikolay Borisov104b4e52017-06-20 21:01:20 +03004266 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4267 -eb->len,
4268 fs_info->dirty_metadata_batch);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004269 ret = 1;
Josef Bacik51561ff2012-07-20 16:25:24 -04004270 } else {
4271 spin_unlock(&eb->refs_lock);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004272 }
4273
4274 btrfs_tree_unlock(eb);
4275
Qu Wenruof3156df2021-04-06 08:36:02 +08004276 /*
4277 * Either we don't need to submit any tree block, or we're submitting
4278 * subpage eb.
4279 * Subpage metadata doesn't use page locking at all, so we can skip
4280 * the page locking.
4281 */
Qu Wenruofbca46e2022-01-13 13:22:09 +08004282 if (!ret || fs_info->nodesize < PAGE_SIZE)
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004283 return ret;
4284
David Sterba65ad0102018-06-29 10:56:49 +02004285 num_pages = num_extent_pages(eb);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004286 for (i = 0; i < num_pages; i++) {
David Sterbafb85fc92014-07-31 01:03:53 +02004287 struct page *p = eb->pages[i];
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004288
4289 if (!trylock_page(p)) {
4290 if (!flush) {
Christoph Hellwig9845e5d2022-06-03 09:11:02 +02004291 submit_write_bio(epd, 0);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004292 flush = 1;
4293 }
4294 lock_page(p);
4295 }
4296 }
4297
4298 return ret;
4299}
4300
Qu Wenruo5a2c6072021-03-25 15:14:44 +08004301static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
Filipe Manana656f30d2014-09-26 12:25:56 +01004302{
Qu Wenruo5a2c6072021-03-25 15:14:44 +08004303 struct btrfs_fs_info *fs_info = eb->fs_info;
Filipe Manana656f30d2014-09-26 12:25:56 +01004304
Qu Wenruo5a2c6072021-03-25 15:14:44 +08004305 btrfs_page_set_error(fs_info, page, eb->start, eb->len);
Filipe Manana656f30d2014-09-26 12:25:56 +01004306 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
4307 return;
4308
4309 /*
Josef Bacikc2e39302021-11-24 14:14:23 -05004310	 * A read may stumble upon this buffer later, so make sure that it gets an
4311 * error and knows there was an error.
4312 */
4313 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4314
4315 /*
Josef Bacik68b85582021-11-24 14:14:25 -05004316 * We need to set the mapping with the io error as well because a write
4317 * error will flip the file system readonly, and then syncfs() will
4318 * return a 0 because we are readonly if we don't modify the err seq for
4319 * the superblock.
4320 */
4321 mapping_set_error(page->mapping, -EIO);
4322
4323 /*
Dennis Zhoueb5b64f2019-09-13 14:54:07 +01004324 * If we error out, we should add back the dirty_metadata_bytes
4325 * to make it consistent.
4326 */
Dennis Zhoueb5b64f2019-09-13 14:54:07 +01004327 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4328 eb->len, fs_info->dirty_metadata_batch);
4329
4330 /*
Filipe Manana656f30d2014-09-26 12:25:56 +01004331 * If writeback for a btree extent that doesn't belong to a log tree
4332 * failed, increment the counter transaction->eb_write_errors.
4333 * We do this because while the transaction is running and before it's
4334 * committing (when we call filemap_fdata[write|wait]_range against
4335 * the btree inode), we might have
4336 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
4337 * returns an error or an error happens during writeback, when we're
4338 * committing the transaction we wouldn't know about it, since the pages
4339 * can be no longer dirty nor marked anymore for writeback (if a
4340 * subsequent modification to the extent buffer didn't happen before the
4341 * transaction commit), which makes filemap_fdata[write|wait]_range not
4342 * able to find the pages tagged with SetPageError at transaction
4343 * commit time. So if this happens we must abort the transaction,
4344 * otherwise we commit a super block with btree roots that point to
4345 * btree nodes/leafs whose content on disk is invalid - either garbage
4346 * or the content of some node/leaf from a past generation that got
4347 * cowed or deleted and is no longer valid.
4348 *
4349 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
4350 * not be enough - we need to distinguish between log tree extents vs
4351 * non-log tree extents, and the next filemap_fdatawait_range() call
4352 * will catch and clear such errors in the mapping - and that call might
4353 * be from a log sync and not from a transaction commit. Also, checking
4354 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
4355 * not done and would not be reliable - the eb might have been released
4356 * from memory and reading it back again means that flag would not be
4357 * set (since it's a runtime flag, not persisted on disk).
4358 *
4359 * Using the flags below in the btree inode also makes us achieve the
4360 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
4361 * writeback for all dirty pages and before filemap_fdatawait_range()
4362 * is called, the writeback for all dirty pages had already finished
4363 * with errors - because we were not using AS_EIO/AS_ENOSPC,
4364 * filemap_fdatawait_range() would return success, as it could not know
4365 * that writeback errors happened (the pages were no longer tagged for
4366 * writeback).
4367 */
4368 switch (eb->log_index) {
4369 case -1:
Qu Wenruo5a2c6072021-03-25 15:14:44 +08004370 set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
Filipe Manana656f30d2014-09-26 12:25:56 +01004371 break;
4372 case 0:
Qu Wenruo5a2c6072021-03-25 15:14:44 +08004373 set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
Filipe Manana656f30d2014-09-26 12:25:56 +01004374 break;
4375 case 1:
Qu Wenruo5a2c6072021-03-25 15:14:44 +08004376 set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
Filipe Manana656f30d2014-09-26 12:25:56 +01004377 break;
4378 default:
4379 BUG(); /* unexpected, logic error */
4380 }
4381}
4382
Qu Wenruo2f3186d2021-04-06 08:36:00 +08004383/*
4384 * The endio specific version which won't touch any unsafe spinlock in endio
4385 * context.
4386 */
4387static struct extent_buffer *find_extent_buffer_nolock(
4388 struct btrfs_fs_info *fs_info, u64 start)
4389{
4390 struct extent_buffer *eb;
4391
4392 rcu_read_lock();
David Sterba01cd3902022-07-15 13:59:31 +02004393 eb = radix_tree_lookup(&fs_info->buffer_radix,
4394 start >> fs_info->sectorsize_bits);
Qu Wenruo2f3186d2021-04-06 08:36:00 +08004395 if (eb && atomic_inc_not_zero(&eb->refs)) {
4396 rcu_read_unlock();
4397 return eb;
4398 }
4399 rcu_read_unlock();
4400 return NULL;
4401}
4402
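/*
 * Illustrative sketch, not part of extent_io.c: the "take a reference only
 * if the object is still live" step that find_extent_buffer_nolock() above
 * relies on (atomic_inc_not_zero()), written with C11 atomics.  A looked-up
 * object whose refcount already dropped to zero is being torn down and must
 * be treated as not found.
 */
#include <stdatomic.h>
#include <stdbool.h>

static bool demo_inc_not_zero(atomic_uint *refs)
{
	unsigned int old = atomic_load(refs);

	while (old != 0) {
		/* Try to move old -> old + 1; retry if someone raced with us. */
		if (atomic_compare_exchange_weak(refs, &old, old + 1))
			return true;
	}
	return false;	/* refcount hit zero, the object is going away */
}
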
4403/*
4404 * The endio function for subpage extent buffer write.
4405 *
4406 * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
4407 * after all extent buffers in the page has finished their writeback.
4408	 * after all extent buffers in the page have finished their writeback.
Qu Wenruofa04c162021-04-27 12:53:35 +08004409static void end_bio_subpage_eb_writepage(struct bio *bio)
Qu Wenruo2f3186d2021-04-06 08:36:00 +08004410{
Qu Wenruofa04c162021-04-27 12:53:35 +08004411 struct btrfs_fs_info *fs_info;
Qu Wenruo2f3186d2021-04-06 08:36:00 +08004412 struct bio_vec *bvec;
4413 struct bvec_iter_all iter_all;
4414
Qu Wenruofa04c162021-04-27 12:53:35 +08004415 fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
Qu Wenruofbca46e2022-01-13 13:22:09 +08004416 ASSERT(fs_info->nodesize < PAGE_SIZE);
Qu Wenruofa04c162021-04-27 12:53:35 +08004417
Qu Wenruo2f3186d2021-04-06 08:36:00 +08004418 ASSERT(!bio_flagged(bio, BIO_CLONED));
4419 bio_for_each_segment_all(bvec, bio, iter_all) {
4420 struct page *page = bvec->bv_page;
4421 u64 bvec_start = page_offset(page) + bvec->bv_offset;
4422 u64 bvec_end = bvec_start + bvec->bv_len - 1;
4423 u64 cur_bytenr = bvec_start;
4424
4425 ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
4426
4427 /* Iterate through all extent buffers in the range */
4428 while (cur_bytenr <= bvec_end) {
4429 struct extent_buffer *eb;
4430 int done;
4431
4432 /*
4433 * Here we can't use find_extent_buffer(), as it may
4434 * try to lock eb->refs_lock, which is not safe in endio
4435 * context.
4436 */
4437 eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
4438 ASSERT(eb);
4439
4440 cur_bytenr = eb->start + eb->len;
4441
4442 ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
4443 done = atomic_dec_and_test(&eb->io_pages);
4444 ASSERT(done);
4445
4446 if (bio->bi_status ||
4447 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
4448 ClearPageUptodate(page);
4449 set_btree_ioerr(page, eb);
4450 }
4451
4452 btrfs_subpage_clear_writeback(fs_info, page, eb->start,
4453 eb->len);
4454 end_extent_buffer_writeback(eb);
4455 /*
4456 * free_extent_buffer() will grab spinlock which is not
4457 * safe in endio context. Thus here we manually dec
4458 * the ref.
4459 */
4460 atomic_dec(&eb->refs);
4461 }
4462 }
4463 bio_put(bio);
4464}
4465
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02004466static void end_bio_extent_buffer_writepage(struct bio *bio)
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004467{
Kent Overstreet2c30c712013-11-07 12:20:26 -08004468 struct bio_vec *bvec;
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004469 struct extent_buffer *eb;
Christoph Hellwig2b070cf2019-04-25 09:03:00 +02004470 int done;
Ming Lei6dc4f102019-02-15 19:13:19 +08004471 struct bvec_iter_all iter_all;
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004472
David Sterbac09abff2017-07-13 18:10:07 +02004473 ASSERT(!bio_flagged(bio, BIO_CLONED));
Christoph Hellwig2b070cf2019-04-25 09:03:00 +02004474 bio_for_each_segment_all(bvec, bio, iter_all) {
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004475 struct page *page = bvec->bv_page;
4476
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004477 eb = (struct extent_buffer *)page->private;
4478 BUG_ON(!eb);
4479 done = atomic_dec_and_test(&eb->io_pages);
4480
Christoph Hellwig4e4cbee2017-06-03 09:38:06 +02004481 if (bio->bi_status ||
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02004482 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004483 ClearPageUptodate(page);
Qu Wenruo5a2c6072021-03-25 15:14:44 +08004484 set_btree_ioerr(page, eb);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004485 }
4486
4487 end_page_writeback(page);
4488
4489 if (!done)
4490 continue;
4491
4492 end_extent_buffer_writeback(eb);
Kent Overstreet2c30c712013-11-07 12:20:26 -08004493 }
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004494
4495 bio_put(bio);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004496}
4497
Qu Wenruofa04c162021-04-27 12:53:35 +08004498static void prepare_eb_write(struct extent_buffer *eb)
4499{
4500 u32 nritems;
4501 unsigned long start;
4502 unsigned long end;
4503
4504 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
4505 atomic_set(&eb->io_pages, num_extent_pages(eb));
4506
4507 /* Set btree blocks beyond nritems with 0 to avoid stale content */
4508 nritems = btrfs_header_nritems(eb);
4509 if (btrfs_header_level(eb) > 0) {
4510 end = btrfs_node_key_ptr_offset(nritems);
4511 memzero_extent_buffer(eb, end, eb->len - end);
4512 } else {
4513 /*
4514 * Leaf:
4515 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
4516 */
4517 start = btrfs_item_nr_offset(nritems);
4518 end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
4519 memzero_extent_buffer(eb, start, end - start);
4520 }
4521}
4522
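/*
 * Illustrative sketch, not part of extent_io.c: the "zero the unused gap"
 * idea behind prepare_eb_write() above, shown for the leaf case only.  Item
 * headers grow from the front of the block and item data grows from the
 * back, so the stale bytes to wipe before writeback are exactly the gap
 * between the two.  The function and parameter names are invented; the real
 * offsets come from btrfs_item_nr_offset() and leaf_data_end().
 */
#include <assert.h>
#include <string.h>

static void demo_zero_leaf_gap(unsigned char *leaf, size_t items_end,
			       size_t data_start)
{
	assert(items_end <= data_start);
	memset(leaf + items_end, 0, data_start - items_end);
}
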
Qu Wenruo35b6ddf2021-04-06 08:36:01 +08004523/*
4524 * Unlike the work in write_one_eb(), we rely completely on extent locking.
4525	 * Page locking is used only minimally to keep the VMM code happy.
Qu Wenruo35b6ddf2021-04-06 08:36:01 +08004526 */
4527static int write_one_subpage_eb(struct extent_buffer *eb,
4528 struct writeback_control *wbc,
4529 struct extent_page_data *epd)
4530{
4531 struct btrfs_fs_info *fs_info = eb->fs_info;
4532 struct page *page = eb->pages[0];
Linus Torvalds353767e2022-08-03 14:54:52 -07004533 blk_opf_t write_flags = wbc_to_write_flags(wbc);
Qu Wenruo35b6ddf2021-04-06 08:36:01 +08004534 bool no_dirty_ebs = false;
4535 int ret;
4536
Qu Wenruofa04c162021-04-27 12:53:35 +08004537 prepare_eb_write(eb);
4538
Qu Wenruo35b6ddf2021-04-06 08:36:01 +08004539 /* clear_page_dirty_for_io() in subpage helper needs page locked */
4540 lock_page(page);
4541 btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
4542
4543 /* Check if this is the last dirty bit to update nr_written */
4544 no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
4545 eb->start, eb->len);
4546 if (no_dirty_ebs)
4547 clear_page_dirty_for_io(page);
4548
Qu Wenruo390ed29b82021-04-14 16:42:15 +08004549 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
4550 &epd->bio_ctrl, page, eb->start, eb->len,
4551 eb->start - page_offset(page),
Christoph Hellwig722c82a2022-06-03 09:11:03 +02004552 end_bio_subpage_eb_writepage, 0, false);
Qu Wenruo35b6ddf2021-04-06 08:36:01 +08004553 if (ret) {
4554 btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
4555 set_btree_ioerr(page, eb);
4556 unlock_page(page);
4557
4558 if (atomic_dec_and_test(&eb->io_pages))
4559 end_extent_buffer_writeback(eb);
4560 return -EIO;
4561 }
4562 unlock_page(page);
4563 /*
4564	 * Submission finished without problem; if no range of the page is
4565 * dirty anymore, we have submitted a page. Update nr_written in wbc.
4566 */
4567 if (no_dirty_ebs)
David Sterba572f3da2021-07-27 12:45:11 +02004568 wbc->nr_to_write--;
Qu Wenruo35b6ddf2021-04-06 08:36:01 +08004569 return ret;
4570}
4571
Chris Mason0e378df2014-05-19 20:55:27 -07004572static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004573 struct writeback_control *wbc,
4574 struct extent_page_data *epd)
4575{
Qu Wenruo0c64c332021-01-06 09:01:40 +08004576 u64 disk_bytenr = eb->start;
David Sterbacc5e31a2018-03-01 18:20:27 +01004577 int i, num_pages;
Linus Torvalds353767e2022-08-03 14:54:52 -07004578 blk_opf_t write_flags = wbc_to_write_flags(wbc);
Josef Bacikd7dbe9e2012-04-23 14:00:51 -04004579 int ret = 0;
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004580
Qu Wenruofa04c162021-04-27 12:53:35 +08004581 prepare_eb_write(eb);
4582
David Sterba65ad0102018-06-29 10:56:49 +02004583 num_pages = num_extent_pages(eb);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004584 for (i = 0; i < num_pages; i++) {
David Sterbafb85fc92014-07-31 01:03:53 +02004585 struct page *p = eb->pages[i];
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004586
4587 clear_page_dirty_for_io(p);
4588 set_page_writeback(p);
David Sterba0ceb34b2020-02-05 19:09:28 +01004589 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
Qu Wenruo390ed29b82021-04-14 16:42:15 +08004590 &epd->bio_ctrl, p, disk_bytenr,
4591 PAGE_SIZE, 0,
Mike Christie1f7ad752016-06-05 14:31:51 -05004592 end_bio_extent_buffer_writepage,
Christoph Hellwig722c82a2022-06-03 09:11:03 +02004593 0, false);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004594 if (ret) {
Qu Wenruo5a2c6072021-03-25 15:14:44 +08004595 set_btree_ioerr(p, eb);
Takafumi Kubotafe01aa62017-02-09 17:24:33 +09004596 if (PageWriteback(p))
4597 end_page_writeback(p);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004598 if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
4599 end_extent_buffer_writeback(eb);
4600 ret = -EIO;
4601 break;
4602 }
Qu Wenruo0c64c332021-01-06 09:01:40 +08004603 disk_bytenr += PAGE_SIZE;
David Sterba572f3da2021-07-27 12:45:11 +02004604 wbc->nr_to_write--;
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004605 unlock_page(p);
4606 }
4607
4608 if (unlikely(ret)) {
4609 for (; i < num_pages; i++) {
Chris Masonbbf65cf2014-10-04 09:56:45 -07004610 struct page *p = eb->pages[i];
Liu Bo81465022014-09-23 22:22:33 +08004611 clear_page_dirty_for_io(p);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004612 unlock_page(p);
4613 }
4614 }
4615
4616 return ret;
4617}
4618
Qu Wenruof91e0d02020-12-02 14:48:00 +08004619/*
Qu Wenruoc4aec292021-04-06 08:36:03 +08004620 * Submit one subpage btree page.
4621 *
4622 * The main difference to submit_eb_page() is:
4623 * - Page locking
4624 * For subpage, we don't rely on page locking at all.
4625 *
4626 * - Flush write bio
4627 * We only flush bio if we may be unable to fit current extent buffers into
4628 * current bio.
4629 *
4630 * Return >=0 for the number of submitted extent buffers.
4631 * Return <0 for fatal error.
4632 */
4633static int submit_eb_subpage(struct page *page,
4634 struct writeback_control *wbc,
4635 struct extent_page_data *epd)
4636{
4637 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
4638 int submitted = 0;
4639 u64 page_start = page_offset(page);
4640 int bit_start = 0;
Qu Wenruoc4aec292021-04-06 08:36:03 +08004641 int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
4642 int ret;
4643
4644	/* Lock and write each dirty extent buffer in the range */
Qu Wenruo72a69cd2021-08-17 17:38:52 +08004645 while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
Qu Wenruoc4aec292021-04-06 08:36:03 +08004646 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
4647 struct extent_buffer *eb;
4648 unsigned long flags;
4649 u64 start;
4650
4651 /*
4652 * Take private lock to ensure the subpage won't be detached
4653 * in the meantime.
4654 */
4655 spin_lock(&page->mapping->private_lock);
4656 if (!PagePrivate(page)) {
4657 spin_unlock(&page->mapping->private_lock);
4658 break;
4659 }
4660 spin_lock_irqsave(&subpage->lock, flags);
Qu Wenruo72a69cd2021-08-17 17:38:52 +08004661 if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
4662 subpage->bitmaps)) {
Qu Wenruoc4aec292021-04-06 08:36:03 +08004663 spin_unlock_irqrestore(&subpage->lock, flags);
4664 spin_unlock(&page->mapping->private_lock);
4665 bit_start++;
4666 continue;
4667 }
4668
4669 start = page_start + bit_start * fs_info->sectorsize;
4670 bit_start += sectors_per_node;
4671
4672 /*
4673 * Here we just want to grab the eb without touching extra
4674 * spin locks, so call find_extent_buffer_nolock().
4675 */
4676 eb = find_extent_buffer_nolock(fs_info, start);
4677 spin_unlock_irqrestore(&subpage->lock, flags);
4678 spin_unlock(&page->mapping->private_lock);
4679
4680 /*
4681 * The eb has already reached 0 refs thus find_extent_buffer()
4682 * doesn't return it. We don't need to write back such eb
4683 * anyway.
4684 */
4685 if (!eb)
4686 continue;
4687
4688 ret = lock_extent_buffer_for_io(eb, epd);
4689 if (ret == 0) {
4690 free_extent_buffer(eb);
4691 continue;
4692 }
4693 if (ret < 0) {
4694 free_extent_buffer(eb);
4695 goto cleanup;
4696 }
Qu Wenruofa04c162021-04-27 12:53:35 +08004697 ret = write_one_subpage_eb(eb, wbc, epd);
Qu Wenruoc4aec292021-04-06 08:36:03 +08004698 free_extent_buffer(eb);
4699 if (ret < 0)
4700 goto cleanup;
4701 submitted++;
4702 }
4703 return submitted;
4704
4705cleanup:
4706 /* We hit error, end bio for the submitted extent buffers */
Christoph Hellwig9845e5d2022-06-03 09:11:02 +02004707 submit_write_bio(epd, ret);
Qu Wenruoc4aec292021-04-06 08:36:03 +08004708 return ret;
4709}
4710
4711/*
Qu Wenruof91e0d02020-12-02 14:48:00 +08004712 * Submit all page(s) of one extent buffer.
4713 *
4714 * @page: the page of one extent buffer
4715	 * @eb_context: to determine if we need to submit this page; if the current
4716	 * page belongs to this eb, we don't need to submit it
4717 *
4718 * The caller should pass each page in their bytenr order, and here we use
4719 * @eb_context to determine if we have submitted pages of one extent buffer.
4720 *
4721 * If we have, we just skip until we hit a new page that doesn't belong to
4722 * current @eb_context.
4723 *
4724 * If not, we submit all the page(s) of the extent buffer.
4725 *
4726 * Return >0 if we have submitted the extent buffer successfully.
4727 * Return 0 if we don't need to submit the page, as it's already submitted by
4728 * previous call.
4729 * Return <0 for fatal error.
4730 */
4731static int submit_eb_page(struct page *page, struct writeback_control *wbc,
4732 struct extent_page_data *epd,
4733 struct extent_buffer **eb_context)
4734{
4735 struct address_space *mapping = page->mapping;
Naohiro Aota0bc09ca2021-02-04 19:22:08 +09004736 struct btrfs_block_group *cache = NULL;
Qu Wenruof91e0d02020-12-02 14:48:00 +08004737 struct extent_buffer *eb;
4738 int ret;
4739
4740 if (!PagePrivate(page))
4741 return 0;
4742
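	/*
	 * Subpage metadata (nodesize smaller than page size) tracks dirty
	 * extent buffers with a per-page bitmap, so it has its own
	 * submission path.
	 */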
Qu Wenruofbca46e2022-01-13 13:22:09 +08004743 if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
Qu Wenruoc4aec292021-04-06 08:36:03 +08004744 return submit_eb_subpage(page, wbc, epd);
4745
Qu Wenruof91e0d02020-12-02 14:48:00 +08004746 spin_lock(&mapping->private_lock);
4747 if (!PagePrivate(page)) {
4748 spin_unlock(&mapping->private_lock);
4749 return 0;
4750 }
4751
4752 eb = (struct extent_buffer *)page->private;
4753
4754 /*
4755 * Shouldn't happen and normally this would be a BUG_ON but no point
4756 * crashing the machine for something we can survive anyway.
4757 */
4758 if (WARN_ON(!eb)) {
4759 spin_unlock(&mapping->private_lock);
4760 return 0;
4761 }
4762
4763 if (eb == *eb_context) {
4764 spin_unlock(&mapping->private_lock);
4765 return 0;
4766 }
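	/*
	 * Grab a reference only if the eb is still alive; if its refcount
	 * already dropped to zero it is being freed and can be skipped.
	 */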
4767 ret = atomic_inc_not_zero(&eb->refs);
4768 spin_unlock(&mapping->private_lock);
4769 if (!ret)
4770 return 0;
4771
Naohiro Aota0bc09ca2021-02-04 19:22:08 +09004772 if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
4773 /*
4774 * If for_sync, this hole will be filled by a
4775 * transaction commit.
4776 */
4777 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
4778 ret = -EAGAIN;
4779 else
4780 ret = 0;
4781 free_extent_buffer(eb);
4782 return ret;
4783 }
4784
Qu Wenruof91e0d02020-12-02 14:48:00 +08004785 *eb_context = eb;
4786
4787 ret = lock_extent_buffer_for_io(eb, epd);
4788 if (ret <= 0) {
Naohiro Aota0bc09ca2021-02-04 19:22:08 +09004789 btrfs_revert_meta_write_pointer(cache, eb);
4790 if (cache)
4791 btrfs_put_block_group(cache);
Qu Wenruof91e0d02020-12-02 14:48:00 +08004792 free_extent_buffer(eb);
4793 return ret;
4794 }
Naohiro Aotabe1a1d72021-08-19 21:19:23 +09004795 if (cache) {
Nikolay Borisovd3e29962022-03-07 15:30:02 +02004796 /*
4797 * Implies write in zoned mode. Mark the last eb in a block group.
4798 */
Naohiro Aota56fbb0a2022-05-03 17:48:53 -07004799 btrfs_schedule_zone_finish_bg(cache, eb);
Nikolay Borisovd3e29962022-03-07 15:30:02 +02004800 btrfs_put_block_group(cache);
Naohiro Aotabe1a1d72021-08-19 21:19:23 +09004801 }
Qu Wenruof91e0d02020-12-02 14:48:00 +08004802 ret = write_one_eb(eb, wbc, epd);
4803 free_extent_buffer(eb);
4804 if (ret < 0)
4805 return ret;
4806 return 1;
4807}
4808
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004809int btree_write_cache_pages(struct address_space *mapping,
4810 struct writeback_control *wbc)
4811{
Qu Wenruof91e0d02020-12-02 14:48:00 +08004812 struct extent_buffer *eb_context = NULL;
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004813 struct extent_page_data epd = {
Qu Wenruo390ed29b82021-04-14 16:42:15 +08004814 .bio_ctrl = { 0 },
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004815 .extent_locked = 0,
4816 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4817 };
Qu Wenruob3ff8f12020-02-12 14:12:44 +08004818 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004819 int ret = 0;
4820 int done = 0;
4821 int nr_to_write_done = 0;
4822 struct pagevec pvec;
4823 int nr_pages;
4824 pgoff_t index;
4825 pgoff_t end; /* Inclusive */
4826 int scanned = 0;
Matthew Wilcox10bbd232017-12-05 17:30:38 -05004827 xa_mark_t tag;
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004828
Mel Gorman86679822017-11-15 17:37:52 -08004829 pagevec_init(&pvec);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004830 if (wbc->range_cyclic) {
4831 index = mapping->writeback_index; /* Start from prev offset */
4832 end = -1;
Josef Bacik556755a2020-01-03 10:38:44 -05004833 /*
4834 * Start from the beginning does not need to cycle over the
4835 * range, mark it as scanned.
4836 */
4837 scanned = (index == 0);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004838 } else {
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03004839 index = wbc->range_start >> PAGE_SHIFT;
4840 end = wbc->range_end >> PAGE_SHIFT;
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004841 scanned = 1;
4842 }
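	/*
	 * For data integrity sync we work from the TOWRITE tag, set by
	 * tag_pages_for_writeback() below, so that pages dirtied while we
	 * walk the range cannot keep us looping forever.
	 */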
4843 if (wbc->sync_mode == WB_SYNC_ALL)
4844 tag = PAGECACHE_TAG_TOWRITE;
4845 else
4846 tag = PAGECACHE_TAG_DIRTY;
Naohiro Aota0bc09ca2021-02-04 19:22:08 +09004847 btrfs_zoned_meta_io_lock(fs_info);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004848retry:
4849 if (wbc->sync_mode == WB_SYNC_ALL)
4850 tag_pages_for_writeback(mapping, index, end);
4851 while (!done && !nr_to_write_done && (index <= end) &&
Jan Kara4006f432017-11-15 17:34:37 -08004852 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
Jan Kara67fd7072017-11-15 17:35:19 -08004853 tag))) {
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004854 unsigned i;
4855
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004856 for (i = 0; i < nr_pages; i++) {
4857 struct page *page = pvec.pages[i];
4858
Qu Wenruof91e0d02020-12-02 14:48:00 +08004859 ret = submit_eb_page(page, wbc, &epd, &eb_context);
4860 if (ret == 0)
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004861 continue;
Qu Wenruof91e0d02020-12-02 14:48:00 +08004862 if (ret < 0) {
Filipe Manana0607eb1d2019-09-11 17:42:28 +01004863 done = 1;
Filipe Manana0607eb1d2019-09-11 17:42:28 +01004864 break;
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004865 }
4866
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004867 /*
4868 * the filesystem may choose to bump up nr_to_write.
4869 * We have to make sure to honor the new nr_to_write
4870 * at any time
4871 */
4872 nr_to_write_done = wbc->nr_to_write <= 0;
4873 }
4874 pagevec_release(&pvec);
4875 cond_resched();
4876 }
4877 if (!scanned && !done) {
4878 /*
4879 * We hit the last page and there is more work to be done: wrap
4880 * back to the start of the file
4881 */
4882 scanned = 1;
4883 index = 0;
4884 goto retry;
4885 }
Qu Wenruob3ff8f12020-02-12 14:12:44 +08004886 /*
4887 * If something went wrong, don't allow any metadata write bio to be
4888 * submitted.
4889 *
4890 * This prevents use-after-free if we have dirty pages that were not
4891 * cleaned up, which can still happen with fuzzed images.
4892 *
4893 * - Bad extent tree
4894 * Allowing existing tree block to be allocated for other trees.
4895 *
4896 * - Log tree operations
4897 * Existing tree blocks get allocated to the log tree, which bumps
4898 * their generation, then they get cleaned in tree re-balance.
4899 * Such tree block will not be written back, since it's clean,
4900 * thus no WRITTEN flag set.
4901 * And after log writes back, this tree block is not traced by
4902 * any dirty extent_io_tree.
4903 *
4904 * - Offending tree block gets re-dirtied from its original owner
4905 * Since it has bumped generation, no WRITTEN flag, it can be
4906 * reused without COWing. This tree block will not be traced
4907 * by btrfs_transaction::dirty_pages.
4908 *
4909 * Now such dirty tree block will not be cleaned by any dirty
4910 * extent io tree. Thus we don't want to submit such wild eb
4911 * if the fs already has error.
Christoph Hellwig9845e5d2022-06-03 09:11:02 +02004912 *
Qu Wenruoc9583ad2022-04-12 20:30:13 +08004913 * We can get ret > 0 from submit_eb_page() indicating that an extent
4914 * buffer was submitted. Reset it to 0 to avoid false alerts for the caller.
4915 */
4916 if (ret > 0)
4917 ret = 0;
Christoph Hellwig9845e5d2022-06-03 09:11:02 +02004918 if (!ret && BTRFS_FS_ERROR(fs_info))
4919 ret = -EROFS;
4920 submit_write_bio(&epd, ret);
4921
4922 btrfs_zoned_meta_io_unlock(fs_info);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04004923 return ret;
4924}
4925
Chris Masond1310b22008-01-24 16:13:08 -05004926/**
Nikolay Borisov3bed2da2021-01-22 11:58:03 +02004927 * Walk the list of dirty pages of the given address space and write all of them.
4928 *
Chris Masond1310b22008-01-24 16:13:08 -05004929 * @mapping: address space structure to write
Nikolay Borisov3bed2da2021-01-22 11:58:03 +02004930 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
4931 * @epd: holds context for the write, namely the bio
Chris Masond1310b22008-01-24 16:13:08 -05004932 *
4933 * If a page is already under I/O, write_cache_pages() skips it, even
4934 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
4935 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
4936 * and msync() need to guarantee that all the data which was dirty at the time
4937 * the call was made get new I/O started against them. If wbc->sync_mode is
4938 * WB_SYNC_ALL then we were called for data integrity and we must wait for
4939 * existing IO to complete.
4940 */
David Sterba4242b642017-02-10 19:38:24 +01004941static int extent_write_cache_pages(struct address_space *mapping,
Chris Mason4bef0842008-09-08 11:18:08 -04004942 struct writeback_control *wbc,
David Sterbaaab6e9e2017-11-30 18:00:02 +01004943 struct extent_page_data *epd)
Chris Masond1310b22008-01-24 16:13:08 -05004944{
Josef Bacik7fd1a3f2012-06-27 17:18:41 -04004945 struct inode *inode = mapping->host;
Chris Masond1310b22008-01-24 16:13:08 -05004946 int ret = 0;
4947 int done = 0;
Chris Masonf85d7d6c2009-09-18 16:03:16 -04004948 int nr_to_write_done = 0;
Chris Masond1310b22008-01-24 16:13:08 -05004949 struct pagevec pvec;
4950 int nr_pages;
4951 pgoff_t index;
4952 pgoff_t end; /* Inclusive */
Liu Boa91326672016-03-07 16:56:21 -08004953 pgoff_t done_index;
4954 int range_whole = 0;
Chris Masond1310b22008-01-24 16:13:08 -05004955 int scanned = 0;
Matthew Wilcox10bbd232017-12-05 17:30:38 -05004956 xa_mark_t tag;
Chris Masond1310b22008-01-24 16:13:08 -05004957
Josef Bacik7fd1a3f2012-06-27 17:18:41 -04004958 /*
4959 * We have to hold onto the inode so that ordered extents can do their
4960 * work when the IO finishes. The alternative to this is failing to add
4961 * an ordered extent if the igrab() fails there and that is a huge pain
4962 * to deal with, so instead just hold onto the inode throughout the
4963 * writepages operation. If it fails here we are freeing up the inode
4964 * anyway and we'd rather not waste our time writing out stuff that is
4965 * going to be truncated anyway.
4966 */
4967 if (!igrab(inode))
4968 return 0;
4969
Mel Gorman86679822017-11-15 17:37:52 -08004970 pagevec_init(&pvec);
Chris Masond1310b22008-01-24 16:13:08 -05004971 if (wbc->range_cyclic) {
4972 index = mapping->writeback_index; /* Start from prev offset */
4973 end = -1;
Josef Bacik556755a2020-01-03 10:38:44 -05004974 /*
4975 * Start from the beginning does not need to cycle over the
4976 * range, mark it as scanned.
4977 */
4978 scanned = (index == 0);
Chris Masond1310b22008-01-24 16:13:08 -05004979 } else {
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03004980 index = wbc->range_start >> PAGE_SHIFT;
4981 end = wbc->range_end >> PAGE_SHIFT;
Liu Boa91326672016-03-07 16:56:21 -08004982 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
4983 range_whole = 1;
Chris Masond1310b22008-01-24 16:13:08 -05004984 scanned = 1;
4985 }
Ethan Lien3cd24c62018-11-01 14:49:03 +08004986
4987 /*
4988 * We do the tagged writepage as long as the snapshot flush bit is set
4989 * and we are the first one who do the filemap_flush() on this inode.
4990 *
4991 * The nr_to_write == LONG_MAX is needed to make sure other flushers do
4992 * not race in and drop the bit.
4993 */
4994 if (range_whole && wbc->nr_to_write == LONG_MAX &&
4995 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
4996 &BTRFS_I(inode)->runtime_flags))
4997 wbc->tagged_writepages = 1;
4998
4999 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
Josef Bacikf7aaa062011-07-15 21:26:38 +00005000 tag = PAGECACHE_TAG_TOWRITE;
5001 else
5002 tag = PAGECACHE_TAG_DIRTY;
Chris Masond1310b22008-01-24 16:13:08 -05005003retry:
Ethan Lien3cd24c62018-11-01 14:49:03 +08005004 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
Josef Bacikf7aaa062011-07-15 21:26:38 +00005005 tag_pages_for_writeback(mapping, index, end);
Liu Boa91326672016-03-07 16:56:21 -08005006 done_index = index;
Chris Masonf85d7d6c2009-09-18 16:03:16 -04005007 while (!done && !nr_to_write_done && (index <= end) &&
Jan Kara67fd7072017-11-15 17:35:19 -08005008 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
5009 &index, end, tag))) {
Chris Masond1310b22008-01-24 16:13:08 -05005010 unsigned i;
5011
Chris Masond1310b22008-01-24 16:13:08 -05005012 for (i = 0; i < nr_pages; i++) {
5013 struct page *page = pvec.pages[i];
5014
Tejun Heof7bddf12019-10-03 07:27:13 -07005015 done_index = page->index + 1;
Chris Masond1310b22008-01-24 16:13:08 -05005016 /*
Matthew Wilcoxb93b0162018-04-10 16:36:56 -07005017 * At this point we hold neither the i_pages lock nor
5018 * the page lock: the page may be truncated or
5019 * invalidated (changing page->mapping to NULL),
5020 * or even swizzled back from swapper_space to
5021 * tmpfs file mapping
Chris Masond1310b22008-01-24 16:13:08 -05005022 */
Josef Bacikc8f2f242013-02-11 11:33:00 -05005023 if (!trylock_page(page)) {
Christoph Hellwig9845e5d2022-06-03 09:11:02 +02005024 submit_write_bio(epd, 0);
Josef Bacikc8f2f242013-02-11 11:33:00 -05005025 lock_page(page);
Chris Mason01d658f2011-11-01 10:08:06 -04005026 }
Chris Masond1310b22008-01-24 16:13:08 -05005027
5028 if (unlikely(page->mapping != mapping)) {
5029 unlock_page(page);
5030 continue;
5031 }
5032
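			/*
			 * For sync writeback, flush our own pending bio before
			 * waiting: the page under writeback may be part of a
			 * bio we have built but not yet submitted, and waiting
			 * on it would then never finish.
			 */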
Chris Masond2c3f4f2008-11-19 12:44:22 -05005033 if (wbc->sync_mode != WB_SYNC_NONE) {
Qu Wenruoc9583ad2022-04-12 20:30:13 +08005034 if (PageWriteback(page))
Christoph Hellwig9845e5d2022-06-03 09:11:02 +02005035 submit_write_bio(epd, 0);
Chris Masond1310b22008-01-24 16:13:08 -05005036 wait_on_page_writeback(page);
Chris Masond2c3f4f2008-11-19 12:44:22 -05005037 }
Chris Masond1310b22008-01-24 16:13:08 -05005038
5039 if (PageWriteback(page) ||
5040 !clear_page_dirty_for_io(page)) {
5041 unlock_page(page);
5042 continue;
5043 }
5044
David Sterbaaab6e9e2017-11-30 18:00:02 +01005045 ret = __extent_writepage(page, wbc, epd);
Liu Boa91326672016-03-07 16:56:21 -08005046 if (ret < 0) {
Liu Boa91326672016-03-07 16:56:21 -08005047 done = 1;
5048 break;
5049 }
Chris Masonf85d7d6c2009-09-18 16:03:16 -04005050
5051 /*
5052 * the filesystem may choose to bump up nr_to_write.
5053 * We have to make sure to honor the new nr_to_write
5054 * at any time
5055 */
5056 nr_to_write_done = wbc->nr_to_write <= 0;
Chris Masond1310b22008-01-24 16:13:08 -05005057 }
5058 pagevec_release(&pvec);
5059 cond_resched();
5060 }
Liu Bo894b36e2016-03-07 16:56:22 -08005061 if (!scanned && !done) {
Chris Masond1310b22008-01-24 16:13:08 -05005062 /*
5063 * We hit the last page and there is more work to be done: wrap
5064 * back to the start of the file
5065 */
5066 scanned = 1;
5067 index = 0;
Josef Bacik42ffb0b2020-01-23 15:33:02 -05005068
5069 /*
5070 * If we're looping we could run into a page that is locked by a
5071 * writer and that writer could be waiting on writeback for a
5072 * page in our current bio, and thus deadlock, so flush the
5073 * write bio here.
5074 */
Christoph Hellwig9845e5d2022-06-03 09:11:02 +02005075 submit_write_bio(epd, 0);
Qu Wenruoc9583ad2022-04-12 20:30:13 +08005076 goto retry;
Chris Masond1310b22008-01-24 16:13:08 -05005077 }
Liu Boa91326672016-03-07 16:56:21 -08005078
5079 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
5080 mapping->writeback_index = done_index;
5081
Josef Bacik7fd1a3f2012-06-27 17:18:41 -04005082 btrfs_add_delayed_iput(inode);
Liu Bo894b36e2016-03-07 16:56:22 -08005083 return ret;
Chris Masond1310b22008-01-24 16:13:08 -05005084}
Chris Masond1310b22008-01-24 16:13:08 -05005085
Qu Wenruo2bd0fc92021-09-27 15:21:58 +08005086/*
5087 * Submit the pages in the range to bio for call sites which delalloc range has
5088 * already been ran (aka, ordered extent inserted) and all pages are still
5089 * locked.
5090 */
5091int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
Chris Mason771ed682008-11-06 22:02:51 -05005092{
Qu Wenruo2bd0fc92021-09-27 15:21:58 +08005093 bool found_error = false;
5094 int first_error = 0;
Chris Mason771ed682008-11-06 22:02:51 -05005095 int ret = 0;
5096 struct address_space *mapping = inode->i_mapping;
5097 struct page *page;
Qu Wenruo2bd0fc92021-09-27 15:21:58 +08005098 u64 cur = start;
Qu Wenruo66448b92021-09-27 15:22:02 +08005099 unsigned long nr_pages;
5100 const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
Chris Mason771ed682008-11-06 22:02:51 -05005101 struct extent_page_data epd = {
Qu Wenruo390ed29b82021-04-14 16:42:15 +08005102 .bio_ctrl = { 0 },
Chris Mason771ed682008-11-06 22:02:51 -05005103 .extent_locked = 1,
Qu Wenruo2bd0fc92021-09-27 15:21:58 +08005104 .sync_io = 1,
Chris Mason771ed682008-11-06 22:02:51 -05005105 };
5106 struct writeback_control wbc_writepages = {
Qu Wenruo2bd0fc92021-09-27 15:21:58 +08005107 .sync_mode = WB_SYNC_ALL,
Chris Mason771ed682008-11-06 22:02:51 -05005108 .range_start = start,
5109 .range_end = end + 1,
Chris Masonec39f762019-07-10 12:28:17 -07005110 /* We're called from an async helper function */
5111 .punt_to_cgroup = 1,
5112 .no_cgroup_owner = 1,
Chris Mason771ed682008-11-06 22:02:51 -05005113 };
5114
Qu Wenruo66448b92021-09-27 15:22:02 +08005115 ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
5116 nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
5117 PAGE_SHIFT;
5118 wbc_writepages.nr_to_write = nr_pages * 2;
5119
Chris Masondbb70be2019-07-10 12:28:18 -07005120 wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
Qu Wenruo2bd0fc92021-09-27 15:21:58 +08005121 while (cur <= end) {
Qu Wenruo66448b92021-09-27 15:22:02 +08005122 u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
5123
Qu Wenruo2bd0fc92021-09-27 15:21:58 +08005124 page = find_get_page(mapping, cur >> PAGE_SHIFT);
5125 /*
5126 * All pages in the range are locked since
5127 * btrfs_run_delalloc_range(), thus nothing else can have cleared
5128 * the page dirty flag.
5129 */
Qu Wenruo66448b92021-09-27 15:22:02 +08005130 ASSERT(PageLocked(page));
Qu Wenruo2bd0fc92021-09-27 15:21:58 +08005131 ASSERT(PageDirty(page));
5132 clear_page_dirty_for_io(page);
5133 ret = __extent_writepage(page, &wbc_writepages, &epd);
5134 ASSERT(ret <= 0);
5135 if (ret < 0) {
5136 found_error = true;
5137 first_error = ret;
Chris Mason771ed682008-11-06 22:02:51 -05005138 }
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03005139 put_page(page);
Qu Wenruo66448b92021-09-27 15:22:02 +08005140 cur = cur_end + 1;
Chris Mason771ed682008-11-06 22:02:51 -05005141 }
5142
Christoph Hellwig9845e5d2022-06-03 09:11:02 +02005143 submit_write_bio(&epd, found_error ? ret : 0);
Chris Masondbb70be2019-07-10 12:28:18 -07005144
5145 wbc_detach_inode(&wbc_writepages);
Qu Wenruo2bd0fc92021-09-27 15:21:58 +08005146 if (found_error)
5147 return first_error;
Chris Mason771ed682008-11-06 22:02:51 -05005148 return ret;
5149}
Chris Masond1310b22008-01-24 16:13:08 -05005150
Nikolay Borisov8ae225a2018-04-19 10:46:38 +03005151int extent_writepages(struct address_space *mapping,
Chris Masond1310b22008-01-24 16:13:08 -05005152 struct writeback_control *wbc)
5153{
Johannes Thumshirn35156d82021-09-09 01:19:27 +09005154 struct inode *inode = mapping->host;
Chris Masond1310b22008-01-24 16:13:08 -05005155 int ret = 0;
5156 struct extent_page_data epd = {
Qu Wenruo390ed29b82021-04-14 16:42:15 +08005157 .bio_ctrl = { 0 },
Chris Mason771ed682008-11-06 22:02:51 -05005158 .extent_locked = 0,
Chris Masonffbd5172009-04-20 15:50:09 -04005159 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
Chris Masond1310b22008-01-24 16:13:08 -05005160 };
5161
Johannes Thumshirn35156d82021-09-09 01:19:27 +09005162 /*
5163 * Allow only a single thread to do the reloc work in zoned mode to
5164 * protect the write pointer updates.
5165 */
Johannes Thumshirn869f4cd2021-12-07 06:28:34 -08005166 btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
David Sterba935db852017-06-23 04:30:28 +02005167 ret = extent_write_cache_pages(mapping, wbc, &epd);
Christoph Hellwig9845e5d2022-06-03 09:11:02 +02005168 submit_write_bio(&epd, ret);
Naohiro Aota19ab78c2022-06-07 16:08:30 +09005169 btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
Chris Masond1310b22008-01-24 16:13:08 -05005170 return ret;
5171}
Chris Masond1310b22008-01-24 16:13:08 -05005172
Matthew Wilcox (Oracle)ba206a02020-06-01 21:47:05 -07005173void extent_readahead(struct readahead_control *rac)
Chris Masond1310b22008-01-24 16:13:08 -05005174{
Qu Wenruo390ed29b82021-04-14 16:42:15 +08005175 struct btrfs_bio_ctrl bio_ctrl = { 0 };
Liu Bo67c96842012-07-20 21:43:09 -06005176 struct page *pagepool[16];
Miao Xie125bac012013-07-25 19:22:37 +08005177 struct extent_map *em_cached = NULL;
Filipe Manana808f80b2015-09-28 09:56:26 +01005178 u64 prev_em_start = (u64)-1;
Matthew Wilcox (Oracle)ba206a02020-06-01 21:47:05 -07005179 int nr;
Chris Masond1310b22008-01-24 16:13:08 -05005180
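	/*
	 * Process the readahead pages in batches, reusing the cached extent
	 * map for pages that fall into the same extent, and submit any
	 * remaining bio at the end.
	 */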
Matthew Wilcox (Oracle)ba206a02020-06-01 21:47:05 -07005181 while ((nr = readahead_page_batch(rac, pagepool))) {
Matthew Wilcox (Oracle)32c0a6b2021-03-21 21:03:11 +00005182 u64 contig_start = readahead_pos(rac);
5183 u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
Chris Masond1310b22008-01-24 16:13:08 -05005184
Matthew Wilcox (Oracle)ba206a02020-06-01 21:47:05 -07005185 contiguous_readpages(pagepool, nr, contig_start, contig_end,
Qu Wenruo390ed29b82021-04-14 16:42:15 +08005186 &em_cached, &bio_ctrl, &prev_em_start);
Chris Masond1310b22008-01-24 16:13:08 -05005187 }
Liu Bo67c96842012-07-20 21:43:09 -06005188
Miao Xie125bac012013-07-25 19:22:37 +08005189 if (em_cached)
5190 free_extent_map(em_cached);
Christoph Hellwig722c82a2022-06-03 09:11:03 +02005191 submit_one_bio(&bio_ctrl);
Chris Masond1310b22008-01-24 16:13:08 -05005192}
Chris Masond1310b22008-01-24 16:13:08 -05005193
5194/*
Matthew Wilcox (Oracle)895586e2022-02-09 20:21:39 +00005195 * Basic invalidate_folio code: wait for any locked or writeback ranges
5196 * corresponding to the folio, then delete any extent state records from
Chris Masond1310b22008-01-24 16:13:08 -05005197 * the tree.
5198 */
Matthew Wilcox (Oracle)895586e2022-02-09 20:21:39 +00005199int extent_invalidate_folio(struct extent_io_tree *tree,
5200 struct folio *folio, size_t offset)
Chris Masond1310b22008-01-24 16:13:08 -05005201{
Josef Bacik2ac55d42010-02-03 19:33:23 +00005202 struct extent_state *cached_state = NULL;
Matthew Wilcox (Oracle)895586e2022-02-09 20:21:39 +00005203 u64 start = folio_pos(folio);
5204 u64 end = start + folio_size(folio) - 1;
5205 size_t blocksize = folio->mapping->host->i_sb->s_blocksize;
Chris Masond1310b22008-01-24 16:13:08 -05005206
Qu Wenruo829ddec2020-11-13 20:51:39 +08005207 /* This function is only called for the btree inode */
5208 ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
5209
Qu Wenruofda28322013-02-26 08:10:22 +00005210 start += ALIGN(offset, blocksize);
Chris Masond1310b22008-01-24 16:13:08 -05005211 if (start > end)
5212 return 0;
5213
David Sterbaff13db42015-12-03 14:30:40 +01005214 lock_extent_bits(tree, start, end, &cached_state);
Matthew Wilcox (Oracle)895586e2022-02-09 20:21:39 +00005215 folio_wait_writeback(folio);
Qu Wenruo829ddec2020-11-13 20:51:39 +08005216
5217 /*
5218 * Currently for btree io tree, only EXTENT_LOCKED is utilized,
5219 * so here we only need to unlock the extent range to free any
5220 * existing extent state.
5221 */
5222 unlock_extent_cached(tree, start, end, &cached_state);
Chris Masond1310b22008-01-24 16:13:08 -05005223 return 0;
5224}
Chris Masond1310b22008-01-24 16:13:08 -05005225
5226/*
Matthew Wilcox (Oracle)f913cff2022-04-30 23:15:16 -04005227 * A helper for release_folio: test for areas of the page that are locked
Chris Mason7b13b7b2008-04-18 10:29:50 -04005228 * or under IO and drop the related state bits if it is safe to release
5229 * the page.
5230 */
Nikolay Borisov29c68b2d2018-04-19 10:46:35 +03005231static int try_release_extent_state(struct extent_io_tree *tree,
Eric Sandeen48a3b632013-04-25 20:41:01 +00005232 struct page *page, gfp_t mask)
Chris Mason7b13b7b2008-04-18 10:29:50 -04005233{
Miao Xie4eee4fa2012-12-21 09:17:45 +00005234 u64 start = page_offset(page);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03005235 u64 end = start + PAGE_SIZE - 1;
Chris Mason7b13b7b2008-04-18 10:29:50 -04005236 int ret = 1;
5237
Nikolay Borisov88826792019-03-14 15:28:31 +02005238 if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
Chris Mason7b13b7b2008-04-18 10:29:50 -04005239 ret = 0;
Nikolay Borisov88826792019-03-14 15:28:31 +02005240 } else {
Chris Mason11ef1602009-09-23 20:28:46 -04005241 /*
Filipe Manana2766ff62020-11-04 11:07:34 +00005242 * At this point we can safely clear everything except the
5243 * locked bit, the nodatasum bit and the delalloc new bit.
5244 * The delalloc new bit will be cleared by ordered extent
5245 * completion.
Chris Mason11ef1602009-09-23 20:28:46 -04005246 */
David Sterba66b0c882017-10-31 16:30:47 +01005247 ret = __clear_extent_bit(tree, start, end,
Filipe Manana2766ff62020-11-04 11:07:34 +00005248 ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
5249 0, 0, NULL, mask, NULL);
Chris Masone3f24cc2011-02-14 12:52:08 -05005250
5251 /* if clear_extent_bit failed for enomem reasons,
5252 * we can't allow the release to continue.
5253 */
5254 if (ret < 0)
5255 ret = 0;
5256 else
5257 ret = 1;
Chris Mason7b13b7b2008-04-18 10:29:50 -04005258 }
5259 return ret;
5260}
Chris Mason7b13b7b2008-04-18 10:29:50 -04005261
5262/*
Matthew Wilcox (Oracle)f913cff2022-04-30 23:15:16 -04005263 * a helper for release_folio. As long as there are no locked extents
Chris Masond1310b22008-01-24 16:13:08 -05005264 * in the range corresponding to the page, both state records and extent
5265 * map records are removed
5266 */
Nikolay Borisov477a30b2018-04-19 10:46:34 +03005267int try_release_extent_mapping(struct page *page, gfp_t mask)
Chris Masond1310b22008-01-24 16:13:08 -05005268{
5269 struct extent_map *em;
Miao Xie4eee4fa2012-12-21 09:17:45 +00005270 u64 start = page_offset(page);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03005271 u64 end = start + PAGE_SIZE - 1;
Filipe Mananabd3599a2018-07-12 01:36:43 +01005272 struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
5273 struct extent_io_tree *tree = &btrfs_inode->io_tree;
5274 struct extent_map_tree *map = &btrfs_inode->extent_tree;
Chris Mason7b13b7b2008-04-18 10:29:50 -04005275
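	/*
	 * Only bother scanning and dropping extent maps when the caller can
	 * block and the file is larger than 16M; otherwise just try to
	 * release the extent state below.
	 */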
Mel Gormand0164ad2015-11-06 16:28:21 -08005276 if (gfpflags_allow_blocking(mask) &&
Byongho Leeee221842015-12-15 01:42:10 +09005277 page->mapping->host->i_size > SZ_16M) {
Yan39b56372008-02-15 10:40:50 -05005278 u64 len;
Chris Mason70dec802008-01-29 09:59:12 -05005279 while (start <= end) {
Filipe Mananafbc2bd72020-07-22 12:28:52 +01005280 struct btrfs_fs_info *fs_info;
5281 u64 cur_gen;
5282
Yan39b56372008-02-15 10:40:50 -05005283 len = end - start + 1;
Chris Mason890871b2009-09-02 16:24:52 -04005284 write_lock(&map->lock);
Yan39b56372008-02-15 10:40:50 -05005285 em = lookup_extent_mapping(map, start, len);
Tsutomu Itoh285190d2012-02-16 16:23:58 +09005286 if (!em) {
Chris Mason890871b2009-09-02 16:24:52 -04005287 write_unlock(&map->lock);
Chris Mason70dec802008-01-29 09:59:12 -05005288 break;
5289 }
Chris Mason7f3c74f2008-07-18 12:01:11 -04005290 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
5291 em->start != start) {
Chris Mason890871b2009-09-02 16:24:52 -04005292 write_unlock(&map->lock);
Chris Mason70dec802008-01-29 09:59:12 -05005293 free_extent_map(em);
5294 break;
5295 }
Filipe Manana3d6448e2020-07-22 12:28:37 +01005296 if (test_range_bit(tree, em->start,
5297 extent_map_end(em) - 1,
5298 EXTENT_LOCKED, 0, NULL))
5299 goto next;
5300 /*
5301 * If it's not in the list of modified extents, used
5302 * by a fast fsync, we can remove it. If it's being
5303 * logged we can safely remove it since fsync took an
5304 * extra reference on the em.
5305 */
5306 if (list_empty(&em->list) ||
Filipe Mananafbc2bd72020-07-22 12:28:52 +01005307 test_bit(EXTENT_FLAG_LOGGING, &em->flags))
5308 goto remove_em;
5309 /*
5310 * If it's in the list of modified extents, remove it
5311 * only if its generation is older then the current one,
5312 * in which case we don't need it for a fast fsync.
5313 * Otherwise don't remove it, we could be racing with an
5314 * ongoing fast fsync that could miss the new extent.
5315 */
5316 fs_info = btrfs_inode->root->fs_info;
5317 spin_lock(&fs_info->trans_lock);
5318 cur_gen = fs_info->generation;
5319 spin_unlock(&fs_info->trans_lock);
5320 if (em->generation >= cur_gen)
5321 goto next;
5322remove_em:
Filipe Manana5e548b32020-07-22 12:29:01 +01005323 /*
5324 * We only remove extent maps that are not in the list of
5325 * modified extents or that are in the list but with a
5326 * generation lower then the current generation, so there
5327 * is no need to set the full fsync flag on the inode (it
5328 * hurts the fsync performance for workloads with a data
5329 * size that exceeds or is close to the system's memory).
5330 */
Filipe Mananafbc2bd72020-07-22 12:28:52 +01005331 remove_extent_mapping(map, em);
5332 /* once for the rb tree */
5333 free_extent_map(em);
Filipe Manana3d6448e2020-07-22 12:28:37 +01005334next:
Chris Mason70dec802008-01-29 09:59:12 -05005335 start = extent_map_end(em);
Chris Mason890871b2009-09-02 16:24:52 -04005336 write_unlock(&map->lock);
Chris Mason70dec802008-01-29 09:59:12 -05005337
5338 /* once for us */
Chris Masond1310b22008-01-24 16:13:08 -05005339 free_extent_map(em);
Paul E. McKenney9f47eb52020-05-08 14:15:37 -07005340
5341 cond_resched(); /* Allow large-extent preemption. */
Chris Masond1310b22008-01-24 16:13:08 -05005342 }
Chris Masond1310b22008-01-24 16:13:08 -05005343 }
Nikolay Borisov29c68b2d2018-04-19 10:46:35 +03005344 return try_release_extent_state(tree, page, mask);
Chris Masond1310b22008-01-24 16:13:08 -05005345}
Chris Masond1310b22008-01-24 16:13:08 -05005346
Chris Masonec29ed52011-02-23 16:23:20 -05005347/*
5348 * helper function for fiemap, which doesn't want to see any holes.
5349 * This maps until we find something past 'last'
5350 */
Nikolay Borisovf1bbde82020-08-31 14:42:45 +03005351static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
David Sterbae3350e12017-06-23 04:09:57 +02005352 u64 offset, u64 last)
Chris Masonec29ed52011-02-23 16:23:20 -05005353{
Nikolay Borisovf1bbde82020-08-31 14:42:45 +03005354 u64 sectorsize = btrfs_inode_sectorsize(inode);
Chris Masonec29ed52011-02-23 16:23:20 -05005355 struct extent_map *em;
5356 u64 len;
5357
5358 if (offset >= last)
5359 return NULL;
5360
Dulshani Gunawardhana67871252013-10-31 10:33:04 +05305361 while (1) {
Chris Masonec29ed52011-02-23 16:23:20 -05005362 len = last - offset;
5363 if (len == 0)
5364 break;
Qu Wenruofda28322013-02-26 08:10:22 +00005365 len = ALIGN(len, sectorsize);
Nikolay Borisovf1bbde82020-08-31 14:42:45 +03005366 em = btrfs_get_extent_fiemap(inode, offset, len);
Johannes Thumshirn6b5b7a42022-02-04 04:06:27 -08005367 if (IS_ERR(em))
Chris Masonec29ed52011-02-23 16:23:20 -05005368 return em;
5369
5370 /* if this isn't a hole return it */
Nikolay Borisov4a2d25c2017-11-23 10:51:43 +02005371 if (em->block_start != EXTENT_MAP_HOLE)
Chris Masonec29ed52011-02-23 16:23:20 -05005372 return em;
Chris Masonec29ed52011-02-23 16:23:20 -05005373
5374 /* this is a hole, advance to the next extent */
5375 offset = extent_map_end(em);
5376 free_extent_map(em);
5377 if (offset >= last)
5378 break;
5379 }
5380 return NULL;
5381}
5382
Qu Wenruo47518322017-04-07 10:43:15 +08005383/*
5384 * To cache previous fiemap extent
5385 *
5386 * Will be used for merging fiemap extent
5387 */
5388struct fiemap_cache {
5389 u64 offset;
5390 u64 phys;
5391 u64 len;
5392 u32 flags;
5393 bool cached;
5394};
5395
5396/*
5397 * Helper to submit fiemap extent.
5398 *
5399 * Will try to merge the current fiemap extent specified by @offset, @phys,
5400 * @len and @flags with the cached one.
5401 * Only when merging fails is the cached one submitted as a fiemap
5402 * extent.
5403 *
5404 * Return value is the same as fiemap_fill_next_extent().
5405 */
5406static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
5407 struct fiemap_cache *cache,
5408 u64 offset, u64 phys, u64 len, u32 flags)
5409{
5410 int ret = 0;
5411
5412 if (!cache->cached)
5413 goto assign;
5414
5415 /*
5416 * Sanity check: extent_fiemap() should have ensured that the new
Andrea Gelmini52042d82018-11-28 12:05:13 +01005417 * fiemap extent won't overlap with the cached one.
Qu Wenruo47518322017-04-07 10:43:15 +08005418 * Not recoverable.
5419 *
5420 * NOTE: Physical address can overlap, due to compression
5421 */
5422 if (cache->offset + cache->len > offset) {
5423 WARN_ON(1);
5424 return -EINVAL;
5425 }
5426
5427 /*
5428 * Only merges fiemap extents if
5429 * 1) Their logical addresses are contiguous
5430 *
5431 * 2) Their physical addresses are contiguous
5432 * So truly compressed (physical size smaller than logical size)
5433 * extents won't get merged with each other
5434 *
5435 * 3) Share same flags except FIEMAP_EXTENT_LAST
5436 * So regular extent won't get merged with prealloc extent
5437 */
5438 if (cache->offset + cache->len == offset &&
5439 cache->phys + cache->len == phys &&
5440 (cache->flags & ~FIEMAP_EXTENT_LAST) ==
5441 (flags & ~FIEMAP_EXTENT_LAST)) {
5442 cache->len += len;
5443 cache->flags |= flags;
5444 goto try_submit_last;
5445 }
5446
5447 /* Not mergeable, need to submit cached one */
5448 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
5449 cache->len, cache->flags);
5450 cache->cached = false;
5451 if (ret)
5452 return ret;
5453assign:
5454 cache->cached = true;
5455 cache->offset = offset;
5456 cache->phys = phys;
5457 cache->len = len;
5458 cache->flags = flags;
5459try_submit_last:
5460 if (cache->flags & FIEMAP_EXTENT_LAST) {
5461 ret = fiemap_fill_next_extent(fieinfo, cache->offset,
5462 cache->phys, cache->len, cache->flags);
5463 cache->cached = false;
5464 }
5465 return ret;
5466}
5467
5468/*
Qu Wenruo848c23b2017-06-22 10:01:21 +08005469 * Emit last fiemap cache
Qu Wenruo47518322017-04-07 10:43:15 +08005470 *
Qu Wenruo848c23b2017-06-22 10:01:21 +08005471 * The last fiemap extent may still be cached in the following case:
5472 * 0 4k 8k
5473 * |<- Fiemap range ->|
5474 * |<------------ First extent ----------->|
5475 *
5476 * In this case, the first extent range will be cached but not emitted.
5477 * So we must emit it before ending extent_fiemap().
Qu Wenruo47518322017-04-07 10:43:15 +08005478 */
David Sterba5c5aff92019-03-20 11:29:46 +01005479static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
Qu Wenruo848c23b2017-06-22 10:01:21 +08005480 struct fiemap_cache *cache)
Qu Wenruo47518322017-04-07 10:43:15 +08005481{
5482 int ret;
5483
5484 if (!cache->cached)
5485 return 0;
5486
Qu Wenruo47518322017-04-07 10:43:15 +08005487 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
5488 cache->len, cache->flags);
5489 cache->cached = false;
5490 if (ret > 0)
5491 ret = 0;
5492 return ret;
5493}
5494
Nikolay Borisovfacee0a2020-08-31 14:42:49 +03005495int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
David Sterbabab16e22020-06-23 20:56:12 +02005496 u64 start, u64 len)
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005497{
Josef Bacik975f84f2010-11-23 19:36:57 +00005498 int ret = 0;
Boris Burkov15c77452021-04-06 15:31:18 -07005499 u64 off;
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005500 u64 max = start + len;
5501 u32 flags = 0;
Josef Bacik975f84f2010-11-23 19:36:57 +00005502 u32 found_type;
5503 u64 last;
Chris Masonec29ed52011-02-23 16:23:20 -05005504 u64 last_for_get_extent = 0;
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005505 u64 disko = 0;
Nikolay Borisovfacee0a2020-08-31 14:42:49 +03005506 u64 isize = i_size_read(&inode->vfs_inode);
Josef Bacik975f84f2010-11-23 19:36:57 +00005507 struct btrfs_key found_key;
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005508 struct extent_map *em = NULL;
Josef Bacik2ac55d42010-02-03 19:33:23 +00005509 struct extent_state *cached_state = NULL;
Josef Bacik975f84f2010-11-23 19:36:57 +00005510 struct btrfs_path *path;
Nikolay Borisovfacee0a2020-08-31 14:42:49 +03005511 struct btrfs_root *root = inode->root;
Qu Wenruo47518322017-04-07 10:43:15 +08005512 struct fiemap_cache cache = { 0 };
David Sterba5911c8f2019-05-15 15:31:04 +02005513 struct ulist *roots;
5514 struct ulist *tmp_ulist;
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005515 int end = 0;
Chris Masonec29ed52011-02-23 16:23:20 -05005516 u64 em_start = 0;
5517 u64 em_len = 0;
5518 u64 em_end = 0;
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005519
5520 if (len == 0)
5521 return -EINVAL;
5522
Josef Bacik975f84f2010-11-23 19:36:57 +00005523 path = btrfs_alloc_path();
5524 if (!path)
5525 return -ENOMEM;
Josef Bacik975f84f2010-11-23 19:36:57 +00005526
David Sterba5911c8f2019-05-15 15:31:04 +02005527 roots = ulist_alloc(GFP_KERNEL);
5528 tmp_ulist = ulist_alloc(GFP_KERNEL);
5529 if (!roots || !tmp_ulist) {
5530 ret = -ENOMEM;
5531 goto out_free_ulist;
5532 }
5533
Boris Burkov15c77452021-04-06 15:31:18 -07005534 /*
5535 * We can't initialize 'off' to 'start' as this could miss extents due
5536 * to extent item merging.
5537 */
5538 off = 0;
Nikolay Borisovfacee0a2020-08-31 14:42:49 +03005539 start = round_down(start, btrfs_inode_sectorsize(inode));
5540 len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
Josef Bacik4d479cf2011-11-17 11:34:31 -05005541
Chris Masonec29ed52011-02-23 16:23:20 -05005542 /*
5543 * lookup the last file extent. We're not using i_size here
5544 * because there might be preallocation past i_size
5545 */
Nikolay Borisovfacee0a2020-08-31 14:42:49 +03005546 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
5547 0);
Josef Bacik975f84f2010-11-23 19:36:57 +00005548 if (ret < 0) {
David Sterba5911c8f2019-05-15 15:31:04 +02005549 goto out_free_ulist;
Liu Bo2d324f52016-05-17 17:21:48 -07005550 } else {
5551 WARN_ON(!ret);
5552 if (ret == 1)
5553 ret = 0;
Josef Bacik975f84f2010-11-23 19:36:57 +00005554 }
Liu Bo2d324f52016-05-17 17:21:48 -07005555
Josef Bacik975f84f2010-11-23 19:36:57 +00005556 path->slots[0]--;
Josef Bacik975f84f2010-11-23 19:36:57 +00005557 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
David Sterba962a2982014-06-04 18:41:45 +02005558 found_type = found_key.type;
Josef Bacik975f84f2010-11-23 19:36:57 +00005559
Chris Masonec29ed52011-02-23 16:23:20 -05005560 /* No extents, but there might be delalloc bits */
Nikolay Borisovfacee0a2020-08-31 14:42:49 +03005561 if (found_key.objectid != btrfs_ino(inode) ||
Josef Bacik975f84f2010-11-23 19:36:57 +00005562 found_type != BTRFS_EXTENT_DATA_KEY) {
Chris Masonec29ed52011-02-23 16:23:20 -05005563 /* have to trust i_size as the end */
5564 last = (u64)-1;
5565 last_for_get_extent = isize;
5566 } else {
5567 /*
5568 * remember the start of the last extent. There are a
5569 * bunch of different factors that go into the length of the
5570 * extent, so it's much less complex to remember where it started
5571 */
5572 last = found_key.offset;
5573 last_for_get_extent = last + 1;
Josef Bacik975f84f2010-11-23 19:36:57 +00005574 }
Liu Bofe09e162013-09-22 12:54:23 +08005575 btrfs_release_path(path);
Josef Bacik975f84f2010-11-23 19:36:57 +00005576
Chris Masonec29ed52011-02-23 16:23:20 -05005577 /*
5578 * We might have some extents allocated but more delalloc past those
5579 * extents, so we trust isize unless the start of the last extent is
5580 * beyond isize.
5581 */
5582 if (last < isize) {
5583 last = (u64)-1;
5584 last_for_get_extent = isize;
5585 }
5586
Nikolay Borisovfacee0a2020-08-31 14:42:49 +03005587 lock_extent_bits(&inode->io_tree, start, start + len - 1,
Jeff Mahoneyd0082372012-03-01 14:57:19 +01005588 &cached_state);
Chris Masonec29ed52011-02-23 16:23:20 -05005589
Nikolay Borisovfacee0a2020-08-31 14:42:49 +03005590 em = get_extent_skip_holes(inode, start, last_for_get_extent);
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005591 if (!em)
5592 goto out;
5593 if (IS_ERR(em)) {
5594 ret = PTR_ERR(em);
5595 goto out;
5596 }
Josef Bacik975f84f2010-11-23 19:36:57 +00005597
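	/*
	 * Walk the extent maps (skipping holes), translating each one into a
	 * fiemap extent and letting the cache merge adjacent entries, until
	 * we move past the requested range or hit the last extent.
	 */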
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005598 while (!end) {
Josef Bacikb76bb702013-07-05 13:52:51 -04005599 u64 offset_in_extent = 0;
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005600
Chris Masonea8efc72011-03-08 11:54:40 -05005601 /* break if the extent we found is outside the range */
5602 if (em->start >= max || extent_map_end(em) < off)
5603 break;
5604
5605 /*
5606 * get_extent may return an extent that starts before our
5607 * requested range. We have to make sure the ranges
5608 * we return to fiemap always move forward and don't
5609 * overlap, so adjust the offsets here
5610 */
5611 em_start = max(em->start, off);
5612
5613 /*
5614 * record the offset from the start of the extent
Josef Bacikb76bb702013-07-05 13:52:51 -04005615 * for adjusting the disk offset below. Only do this if the
5616 * extent isn't compressed since our in ram offset may be past
5617 * what we have actually allocated on disk.
Chris Masonea8efc72011-03-08 11:54:40 -05005618 */
Josef Bacikb76bb702013-07-05 13:52:51 -04005619 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
5620 offset_in_extent = em_start - em->start;
Chris Masonec29ed52011-02-23 16:23:20 -05005621 em_end = extent_map_end(em);
Chris Masonea8efc72011-03-08 11:54:40 -05005622 em_len = em_end - em_start;
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005623 flags = 0;
Filipe Mananaf0986312018-06-20 10:02:30 +01005624 if (em->block_start < EXTENT_MAP_LAST_BYTE)
5625 disko = em->block_start + offset_in_extent;
5626 else
5627 disko = 0;
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005628
Chris Masonea8efc72011-03-08 11:54:40 -05005629 /*
5630 * bump off for our next call to get_extent
5631 */
5632 off = extent_map_end(em);
5633 if (off >= max)
5634 end = 1;
5635
Heiko Carstens93dbfad2009-04-03 10:33:45 -04005636 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005637 end = 1;
5638 flags |= FIEMAP_EXTENT_LAST;
Heiko Carstens93dbfad2009-04-03 10:33:45 -04005639 } else if (em->block_start == EXTENT_MAP_INLINE) {
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005640 flags |= (FIEMAP_EXTENT_DATA_INLINE |
5641 FIEMAP_EXTENT_NOT_ALIGNED);
Heiko Carstens93dbfad2009-04-03 10:33:45 -04005642 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005643 flags |= (FIEMAP_EXTENT_DELALLOC |
5644 FIEMAP_EXTENT_UNKNOWN);
Josef Bacikdc046b12014-09-10 16:20:45 -04005645 } else if (fieinfo->fi_extents_max) {
5646 u64 bytenr = em->block_start -
5647 (em->start - em->orig_start);
Liu Bofe09e162013-09-22 12:54:23 +08005648
Liu Bofe09e162013-09-22 12:54:23 +08005649 /*
5650 * As btrfs supports shared space, this information
5651 * can be exported to userspace tools via
Josef Bacikdc046b12014-09-10 16:20:45 -04005652 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
5653 * then we're just getting a count and we can skip the
5654 * lookup stuff.
Liu Bofe09e162013-09-22 12:54:23 +08005655 */
Nikolay Borisovfacee0a2020-08-31 14:42:49 +03005656 ret = btrfs_check_shared(root, btrfs_ino(inode),
David Sterba5911c8f2019-05-15 15:31:04 +02005657 bytenr, roots, tmp_ulist);
Josef Bacikdc046b12014-09-10 16:20:45 -04005658 if (ret < 0)
Liu Bofe09e162013-09-22 12:54:23 +08005659 goto out_free;
Josef Bacikdc046b12014-09-10 16:20:45 -04005660 if (ret)
Liu Bofe09e162013-09-22 12:54:23 +08005661 flags |= FIEMAP_EXTENT_SHARED;
Josef Bacikdc046b12014-09-10 16:20:45 -04005662 ret = 0;
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005663 }
5664 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
5665 flags |= FIEMAP_EXTENT_ENCODED;
Josef Bacik0d2b2372015-05-19 10:44:04 -04005666 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5667 flags |= FIEMAP_EXTENT_UNWRITTEN;
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005668
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005669 free_extent_map(em);
5670 em = NULL;
Chris Masonec29ed52011-02-23 16:23:20 -05005671 if ((em_start >= last) || em_len == (u64)-1 ||
5672 (last == (u64)-1 && isize <= em_end)) {
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005673 flags |= FIEMAP_EXTENT_LAST;
5674 end = 1;
5675 }
5676
Chris Masonec29ed52011-02-23 16:23:20 -05005677 /* now scan forward to see if this is really the last extent. */
Nikolay Borisovfacee0a2020-08-31 14:42:49 +03005678 em = get_extent_skip_holes(inode, off, last_for_get_extent);
Chris Masonec29ed52011-02-23 16:23:20 -05005679 if (IS_ERR(em)) {
5680 ret = PTR_ERR(em);
5681 goto out;
5682 }
5683 if (!em) {
Josef Bacik975f84f2010-11-23 19:36:57 +00005684 flags |= FIEMAP_EXTENT_LAST;
5685 end = 1;
5686 }
Qu Wenruo47518322017-04-07 10:43:15 +08005687 ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
5688 em_len, flags);
Chengyu Song26e726a2015-03-24 18:12:56 -04005689 if (ret) {
5690 if (ret == 1)
5691 ret = 0;
Chris Masonec29ed52011-02-23 16:23:20 -05005692 goto out_free;
Chengyu Song26e726a2015-03-24 18:12:56 -04005693 }
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005694 }
5695out_free:
Qu Wenruo47518322017-04-07 10:43:15 +08005696 if (!ret)
David Sterba5c5aff92019-03-20 11:29:46 +01005697 ret = emit_last_fiemap_cache(fieinfo, &cache);
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005698 free_extent_map(em);
5699out:
Nikolay Borisovfacee0a2020-08-31 14:42:49 +03005700 unlock_extent_cached(&inode->io_tree, start, start + len - 1,
David Sterbae43bbe52017-12-12 21:43:52 +01005701 &cached_state);
David Sterba5911c8f2019-05-15 15:31:04 +02005702
5703out_free_ulist:
Colin Ian Kinge02d48e2019-07-05 08:26:24 +01005704 btrfs_free_path(path);
David Sterba5911c8f2019-05-15 15:31:04 +02005705 ulist_free(roots);
5706 ulist_free(tmp_ulist);
Yehuda Sadeh1506fcc2009-01-21 14:39:14 -05005707 return ret;
5708}
5709
Chris Mason727011e2010-08-06 13:21:20 -04005710static void __free_extent_buffer(struct extent_buffer *eb)
5711{
Chris Mason727011e2010-08-06 13:21:20 -04005712 kmem_cache_free(extent_buffer_cache, eb);
5713}
5714
David Sterba2b489662020-04-29 03:04:10 +02005715int extent_buffer_under_io(const struct extent_buffer *eb)
Chris Masond1310b22008-01-24 16:13:08 -05005716{
Josef Bacik0b32f4b2012-03-13 09:38:00 -04005717 return (atomic_read(&eb->io_pages) ||
5718 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
5719 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
Chris Masond1310b22008-01-24 16:13:08 -05005720}
5721
Qu Wenruo8ff84662021-01-26 16:33:50 +08005722static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
Miao Xie897ca6e92010-10-26 20:57:29 -04005723{
Qu Wenruo8ff84662021-01-26 16:33:50 +08005724 struct btrfs_subpage *subpage;
Miao Xie897ca6e92010-10-26 20:57:29 -04005725
Qu Wenruo8ff84662021-01-26 16:33:50 +08005726 lockdep_assert_held(&page->mapping->private_lock);
Miao Xie897ca6e92010-10-26 20:57:29 -04005727
Qu Wenruo8ff84662021-01-26 16:33:50 +08005728 if (PagePrivate(page)) {
5729 subpage = (struct btrfs_subpage *)page->private;
5730 if (atomic_read(&subpage->eb_refs))
5731 return true;
Qu Wenruo3d078ef2021-06-07 17:02:58 +08005732 /*
5733 * Even if there are no eb refs here, we may still have an
5734 * end_page_read() call relying on page::private.
5735 */
5736 if (atomic_read(&subpage->readers))
5737 return true;
Qu Wenruo8ff84662021-01-26 16:33:50 +08005738 }
5739 return false;
5740}
Miao Xie897ca6e92010-10-26 20:57:29 -04005741
Qu Wenruo8ff84662021-01-26 16:33:50 +08005742static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
5743{
5744 struct btrfs_fs_info *fs_info = eb->fs_info;
5745 const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
5746
5747 /*
5748 * For mapped eb, we're going to change the page private, which should
5749 * be done under the private_lock.
5750 */
5751 if (mapped)
5752 spin_lock(&page->mapping->private_lock);
5753
5754 if (!PagePrivate(page)) {
Forrest Liu5d2361d2015-02-09 17:31:45 +08005755 if (mapped)
Qu Wenruo8ff84662021-01-26 16:33:50 +08005756 spin_unlock(&page->mapping->private_lock);
5757 return;
5758 }
5759
Qu Wenruofbca46e2022-01-13 13:22:09 +08005760 if (fs_info->nodesize >= PAGE_SIZE) {
Forrest Liu5d2361d2015-02-09 17:31:45 +08005761 /*
5762 * We do this since we'll remove the pages after we've
5763 * removed the eb from the radix tree, so we could race
5764 * and have this page now attached to the new eb. So
5765 * only clear page_private if it's still connected to
5766 * this eb.
5767 */
5768 if (PagePrivate(page) &&
5769 page->private == (unsigned long)eb) {
5770 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5771 BUG_ON(PageDirty(page));
5772 BUG_ON(PageWriteback(page));
Josef Bacik4f2de97a2012-03-07 16:20:05 -05005773 /*
Forrest Liu5d2361d2015-02-09 17:31:45 +08005774 * We need to make sure we haven't been attached
5775 * to a new eb.
Josef Bacik4f2de97a2012-03-07 16:20:05 -05005776 */
Guoqing Jiangd1b89bc2020-06-01 21:47:45 -07005777 detach_page_private(page);
Josef Bacik4f2de97a2012-03-07 16:20:05 -05005778 }
Forrest Liu5d2361d2015-02-09 17:31:45 +08005779 if (mapped)
5780 spin_unlock(&page->mapping->private_lock);
Qu Wenruo8ff84662021-01-26 16:33:50 +08005781 return;
5782 }
5783
5784 /*
5784 * For subpage, we can have a dummy eb with page private. In this case,
5785 * we can directly detach the private as such a page is only attached to
5786 * one dummy eb, no sharing.
5788 */
5789 if (!mapped) {
5790 btrfs_detach_subpage(fs_info, page);
5791 return;
5792 }
5793
5794 btrfs_page_dec_eb_refs(fs_info, page);
5795
5796 /*
5797 * We can only detach the page private if there are no other ebs in the
Qu Wenruo3d078ef2021-06-07 17:02:58 +08005798 * page range and no unfinished IO.
Qu Wenruo8ff84662021-01-26 16:33:50 +08005799 */
5800 if (!page_range_has_eb(fs_info, page))
5801 btrfs_detach_subpage(fs_info, page);
5802
5803 spin_unlock(&page->mapping->private_lock);
5804}
5805
5806/* Release all pages attached to the extent buffer */
5807static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
5808{
5809 int i;
5810 int num_pages;
5811
5812 ASSERT(!extent_buffer_under_io(eb));
5813
5814 num_pages = num_extent_pages(eb);
5815 for (i = 0; i < num_pages; i++) {
5816 struct page *page = eb->pages[i];
5817
5818 if (!page)
5819 continue;
5820
5821 detach_extent_buffer_page(eb, page);
Forrest Liu5d2361d2015-02-09 17:31:45 +08005822
Nicholas D Steeves01327612016-05-19 21:18:45 -04005823 /* One for when we allocated the page */
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03005824 put_page(page);
Nikolay Borisovd64766f2018-06-27 16:38:22 +03005825 }
Miao Xie897ca6e92010-10-26 20:57:29 -04005826}
5827
5828/*
5829 * Helper for releasing the extent buffer.
5830 */
5831static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
5832{
David Sterba55ac0132018-07-19 17:24:32 +02005833 btrfs_release_extent_buffer_pages(eb);
Josef Bacik8c389382020-02-14 16:11:42 -05005834 btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
Miao Xie897ca6e92010-10-26 20:57:29 -04005835 __free_extent_buffer(eb);
5836}
5837
Josef Bacikf28491e2013-12-16 13:24:27 -05005838static struct extent_buffer *
5839__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
David Sterba23d79d82014-06-15 02:55:29 +02005840 unsigned long len)
Josef Bacikdb7f3432013-08-07 14:54:37 -04005841{
5842 struct extent_buffer *eb = NULL;
5843
Michal Hockod1b5c5672015-08-19 14:17:40 +02005844 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
Josef Bacikdb7f3432013-08-07 14:54:37 -04005845 eb->start = start;
5846 eb->len = len;
Josef Bacikf28491e2013-12-16 13:24:27 -05005847 eb->fs_info = fs_info;
Josef Bacikdb7f3432013-08-07 14:54:37 -04005848 eb->bflags = 0;
Josef Bacik196d59a2020-08-20 11:46:09 -04005849 init_rwsem(&eb->lock);
Josef Bacikdb7f3432013-08-07 14:54:37 -04005850
Josef Bacik3fd63722020-02-14 16:11:40 -05005851 btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
5852 &fs_info->allocated_ebs);
Naohiro Aotad35751562021-02-04 19:21:54 +09005853 INIT_LIST_HEAD(&eb->release_list);
Josef Bacikdb7f3432013-08-07 14:54:37 -04005854
5855 spin_lock_init(&eb->refs_lock);
5856 atomic_set(&eb->refs, 1);
5857 atomic_set(&eb->io_pages, 0);
5858
Qu Wenruodeb67892020-12-02 14:48:01 +08005859 ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
Josef Bacikdb7f3432013-08-07 14:54:37 -04005860
5861 return eb;
5862}
5863
David Sterba2b489662020-04-29 03:04:10 +02005864struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
Josef Bacikdb7f3432013-08-07 14:54:37 -04005865{
David Sterbacc5e31a2018-03-01 18:20:27 +01005866 int i;
Josef Bacikdb7f3432013-08-07 14:54:37 -04005867 struct extent_buffer *new;
David Sterbacc5e31a2018-03-01 18:20:27 +01005868 int num_pages = num_extent_pages(src);
Sweet Tea Dorminydd137dd2022-03-30 16:11:22 -04005869 int ret;
Josef Bacikdb7f3432013-08-07 14:54:37 -04005870
David Sterba3f556f72014-06-15 03:20:26 +02005871 new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
Josef Bacikdb7f3432013-08-07 14:54:37 -04005872 if (new == NULL)
5873 return NULL;
5874
Qu Wenruo62c053f2021-01-26 16:33:46 +08005875 /*
5876 * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
5877 * btrfs_release_extent_buffer() have different behavior for
5878 * UNMAPPED subpage extent buffer.
5879 */
5880 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
5881
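	/*
	 * Allocate a private page array for the clone, then attach each page
	 * and copy the source buffer's contents into it; on failure release
	 * the partially built clone.
	 */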
Sweet Tea Dorminydd137dd2022-03-30 16:11:22 -04005882 memset(new->pages, 0, sizeof(*new->pages) * num_pages);
5883 ret = btrfs_alloc_page_array(num_pages, new->pages);
5884 if (ret) {
5885 btrfs_release_extent_buffer(new);
5886 return NULL;
5887 }
5888
Josef Bacikdb7f3432013-08-07 14:54:37 -04005889 for (i = 0; i < num_pages; i++) {
Qu Wenruo760f9912021-01-26 16:33:48 +08005890 int ret;
Sweet Tea Dorminydd137dd2022-03-30 16:11:22 -04005891 struct page *p = new->pages[i];
Qu Wenruo760f9912021-01-26 16:33:48 +08005892
Qu Wenruo760f9912021-01-26 16:33:48 +08005893 ret = attach_extent_buffer_page(new, p, NULL);
5894 if (ret < 0) {
Qu Wenruo760f9912021-01-26 16:33:48 +08005895 btrfs_release_extent_buffer(new);
5896 return NULL;
5897 }
Josef Bacikdb7f3432013-08-07 14:54:37 -04005898 WARN_ON(PageDirty(p));
David Sterbafba1acf2016-11-08 17:56:24 +01005899 copy_page(page_address(p), page_address(src->pages[i]));
Josef Bacikdb7f3432013-08-07 14:54:37 -04005900 }
Qu Wenruo92d83e92021-01-26 16:33:55 +08005901 set_extent_buffer_uptodate(new);
Josef Bacikdb7f3432013-08-07 14:54:37 -04005902
5903 return new;
5904}
5905
Omar Sandoval0f331222015-09-29 20:50:31 -07005906struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5907 u64 start, unsigned long len)
Josef Bacikdb7f3432013-08-07 14:54:37 -04005908{
5909 struct extent_buffer *eb;
David Sterbacc5e31a2018-03-01 18:20:27 +01005910 int num_pages;
5911 int i;
Sweet Tea Dorminydd137dd2022-03-30 16:11:22 -04005912 int ret;
Josef Bacikdb7f3432013-08-07 14:54:37 -04005913
David Sterba3f556f72014-06-15 03:20:26 +02005914 eb = __alloc_extent_buffer(fs_info, start, len);
Josef Bacikdb7f3432013-08-07 14:54:37 -04005915 if (!eb)
5916 return NULL;
5917
David Sterba65ad0102018-06-29 10:56:49 +02005918 num_pages = num_extent_pages(eb);
Sweet Tea Dorminydd137dd2022-03-30 16:11:22 -04005919 ret = btrfs_alloc_page_array(num_pages, eb->pages);
5920 if (ret)
5921 goto err;
Qu Wenruo09bc1f02021-01-26 16:33:51 +08005922
Sweet Tea Dorminydd137dd2022-03-30 16:11:22 -04005923 for (i = 0; i < num_pages; i++) {
5924 struct page *p = eb->pages[i];
5925
5926 ret = attach_extent_buffer_page(eb, p, NULL);
Qu Wenruo09bc1f02021-01-26 16:33:51 +08005927 if (ret < 0)
5928 goto err;
Josef Bacikdb7f3432013-08-07 14:54:37 -04005929 }
Sweet Tea Dorminydd137dd2022-03-30 16:11:22 -04005930
Josef Bacikdb7f3432013-08-07 14:54:37 -04005931 set_extent_buffer_uptodate(eb);
5932 btrfs_set_header_nritems(eb, 0);
Nikolay Borisovb0132a3b2018-06-27 16:38:24 +03005933 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
Josef Bacikdb7f3432013-08-07 14:54:37 -04005934
5935 return eb;
5936err:
Sweet Tea Dorminydd137dd2022-03-30 16:11:22 -04005937 for (i = 0; i < num_pages; i++) {
5938 if (eb->pages[i]) {
5939 detach_extent_buffer_page(eb, eb->pages[i]);
5940 __free_page(eb->pages[i]);
5941 }
Qu Wenruo09bc1f02021-01-26 16:33:51 +08005942 }
Josef Bacikdb7f3432013-08-07 14:54:37 -04005943 __free_extent_buffer(eb);
5944 return NULL;
5945}
5946
Omar Sandoval0f331222015-09-29 20:50:31 -07005947struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
Jeff Mahoneyda170662016-06-15 09:22:56 -04005948 u64 start)
Omar Sandoval0f331222015-09-29 20:50:31 -07005949{
Jeff Mahoneyda170662016-06-15 09:22:56 -04005950 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
Omar Sandoval0f331222015-09-29 20:50:31 -07005951}
5952
Josef Bacik0b32f4b2012-03-13 09:38:00 -04005953static void check_buffer_tree_ref(struct extent_buffer *eb)
5954{
Chris Mason242e18c2013-01-29 17:49:37 -05005955 int refs;
Boris Burkov6bf9cd22020-06-17 11:35:19 -07005956 /*
5957 * The TREE_REF bit is first set when the extent_buffer is added
5958 * to the radix tree. It is also set again, if it has been cleared, when
5959 * a new reference is taken by find_extent_buffer.
Josef Bacik0b32f4b2012-03-13 09:38:00 -04005960 *
Boris Burkov6bf9cd22020-06-17 11:35:19 -07005961 * It is only cleared in two cases: freeing the last non-tree
5962 * reference to the extent_buffer when its STALE bit is set or
Matthew Wilcox (Oracle)f913cff2022-04-30 23:15:16 -04005963 * calling release_folio when the tree reference is the only reference.
Josef Bacik0b32f4b2012-03-13 09:38:00 -04005964 *
Boris Burkov6bf9cd22020-06-17 11:35:19 -07005965 * In both cases, care is taken to ensure that the extent_buffer's
Matthew Wilcox (Oracle)f913cff2022-04-30 23:15:16 -04005966 * pages are not under io. However, release_folio can be called
Boris Burkov6bf9cd22020-06-17 11:35:19 -07005967 * concurrently with the creation of new references, which leaves room
5968 * for races between the calls to check_buffer_tree_ref in those
5969 * codepaths and clearing TREE_REF in try_release_extent_buffer.
Josef Bacik0b32f4b2012-03-13 09:38:00 -04005970 *
Boris Burkov6bf9cd22020-06-17 11:35:19 -07005971 * The actual lifetime of the extent_buffer in the radix tree is
5972 * adequately protected by the refcount, but the TREE_REF bit and
5973 * its corresponding reference are not. To protect against this
5974 * class of races, we call check_buffer_tree_ref from the codepaths
5975 * which trigger io after they set eb->io_pages. Note that once io is
5976 * initiated, TREE_REF can no longer be cleared, so that is the
5977 * moment at which any such race is best fixed.
Josef Bacik0b32f4b2012-03-13 09:38:00 -04005978 */
Chris Mason242e18c2013-01-29 17:49:37 -05005979 refs = atomic_read(&eb->refs);
5980 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5981 return;
5982
Josef Bacik594831c2012-07-20 16:11:08 -04005983 spin_lock(&eb->refs_lock);
5984 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
Josef Bacik0b32f4b2012-03-13 09:38:00 -04005985 atomic_inc(&eb->refs);
Josef Bacik594831c2012-07-20 16:11:08 -04005986 spin_unlock(&eb->refs_lock);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04005987}
5988
Mel Gorman2457aec2014-06-04 16:10:31 -07005989static void mark_extent_buffer_accessed(struct extent_buffer *eb,
5990 struct page *accessed)
Josef Bacik5df42352012-03-15 18:24:42 -04005991{
David Sterbacc5e31a2018-03-01 18:20:27 +01005992 int num_pages, i;
Josef Bacik5df42352012-03-15 18:24:42 -04005993
Josef Bacik0b32f4b2012-03-13 09:38:00 -04005994 check_buffer_tree_ref(eb);
5995
David Sterba65ad0102018-06-29 10:56:49 +02005996 num_pages = num_extent_pages(eb);
Josef Bacik5df42352012-03-15 18:24:42 -04005997 for (i = 0; i < num_pages; i++) {
David Sterbafb85fc92014-07-31 01:03:53 +02005998 struct page *p = eb->pages[i];
5999
Mel Gorman2457aec2014-06-04 16:10:31 -07006000 if (p != accessed)
6001 mark_page_accessed(p);
Josef Bacik5df42352012-03-15 18:24:42 -04006002 }
6003}
6004
Josef Bacikf28491e2013-12-16 13:24:27 -05006005struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
6006 u64 start)
Chandra Seetharaman452c75c2013-10-07 10:45:25 -05006007{
6008 struct extent_buffer *eb;
6009
Qu Wenruo2f3186d2021-04-06 08:36:00 +08006010 eb = find_extent_buffer_nolock(fs_info, start);
6011 if (!eb)
6012 return NULL;
6013 /*
6014 * Lock our eb's refs_lock to avoid races with free_extent_buffer().
6015 * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
6016 * another task running free_extent_buffer() might have seen that flag
6017 * set, eb->refs == 2, that the buffer isn't under IO (dirty and
6018 * writeback flags not set) and it's still in the tree (flag
6019 * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
6020 * decrementing the extent buffer's reference count twice. So here we
6021 * could race and increment the eb's reference count, clear its stale
6022 * flag, mark it as dirty and drop our reference before the other task
6023 * finishes executing free_extent_buffer, which would later result in
6024 * an attempt to free an extent buffer that is dirty.
6025 */
6026 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
6027 spin_lock(&eb->refs_lock);
6028 spin_unlock(&eb->refs_lock);
Chandra Seetharaman452c75c2013-10-07 10:45:25 -05006029 }
Qu Wenruo2f3186d2021-04-06 08:36:00 +08006030 mark_extent_buffer_accessed(eb, NULL);
6031 return eb;
Chandra Seetharaman452c75c2013-10-07 10:45:25 -05006032}
6033
Josef Bacikfaa2dbf2014-05-07 17:06:09 -04006034#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
6035struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
Jeff Mahoneyda170662016-06-15 09:22:56 -04006036 u64 start)
Josef Bacikfaa2dbf2014-05-07 17:06:09 -04006037{
6038 struct extent_buffer *eb, *exists = NULL;
6039 int ret;
6040
6041 eb = find_extent_buffer(fs_info, start);
6042 if (eb)
6043 return eb;
Jeff Mahoneyda170662016-06-15 09:22:56 -04006044 eb = alloc_dummy_extent_buffer(fs_info, start);
Josef Bacikfaa2dbf2014-05-07 17:06:09 -04006045 if (!eb)
Dan Carpenterb6293c82019-12-03 14:24:58 +03006046 return ERR_PTR(-ENOMEM);
Josef Bacikfaa2dbf2014-05-07 17:06:09 -04006047 eb->fs_info = fs_info;
David Sterba01cd3902022-07-15 13:59:31 +02006048again:
6049 ret = radix_tree_preload(GFP_NOFS);
6050 if (ret) {
6051 exists = ERR_PTR(ret);
6052 goto free_eb;
6053 }
6054 spin_lock(&fs_info->buffer_lock);
6055 ret = radix_tree_insert(&fs_info->buffer_radix,
6056 start >> fs_info->sectorsize_bits, eb);
6057 spin_unlock(&fs_info->buffer_lock);
6058 radix_tree_preload_end();
6059 if (ret == -EEXIST) {
6060 exists = find_extent_buffer(fs_info, start);
6061 if (exists)
Josef Bacikfaa2dbf2014-05-07 17:06:09 -04006062 goto free_eb;
David Sterba01cd3902022-07-15 13:59:31 +02006063 else
6064 goto again;
6065 }
Josef Bacikfaa2dbf2014-05-07 17:06:09 -04006066 check_buffer_tree_ref(eb);
6067 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
6068
Josef Bacikfaa2dbf2014-05-07 17:06:09 -04006069 return eb;
6070free_eb:
6071 btrfs_release_extent_buffer(eb);
6072 return exists;
6073}
6074#endif
6075
Qu Wenruo819822102021-01-26 16:33:49 +08006076static struct extent_buffer *grab_extent_buffer(
6077 struct btrfs_fs_info *fs_info, struct page *page)
Qu Wenruoc0f0a9e2021-01-06 09:01:45 +08006078{
6079 struct extent_buffer *exists;
6080
Qu Wenruo819822102021-01-26 16:33:49 +08006081 /*
6082 * For the subpage case, we rely entirely on the radix tree to ensure we
6083 * don't try to insert two ebs for the same bytenr. So here we always
6084 * return NULL and just continue.
6085 */
Qu Wenruofbca46e2022-01-13 13:22:09 +08006086 if (fs_info->nodesize < PAGE_SIZE)
Qu Wenruo819822102021-01-26 16:33:49 +08006087 return NULL;
6088
Qu Wenruoc0f0a9e2021-01-06 09:01:45 +08006089 /* Page not yet attached to an extent buffer */
6090 if (!PagePrivate(page))
6091 return NULL;
6092
6093 /*
6094 * We could have already allocated an eb for this page and attached one,
6095 * so let's see if we can get a ref on the existing eb. If we can, we
6096 * know it's good and we can just return that one; otherwise we can
6097 * safely overwrite page->private.
6098 */
6099 exists = (struct extent_buffer *)page->private;
6100 if (atomic_inc_not_zero(&exists->refs))
6101 return exists;
6102
6103 WARN_ON(PageDirty(page));
6104 detach_page_private(page);
6105 return NULL;
6106}
6107
Qu Wenruofbca46e2022-01-13 13:22:09 +08006108static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
6109{
6110 if (!IS_ALIGNED(start, fs_info->sectorsize)) {
6111 btrfs_err(fs_info, "bad tree block start %llu", start);
6112 return -EINVAL;
6113 }
6114
6115 if (fs_info->nodesize < PAGE_SIZE &&
6116 offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) {
6117 btrfs_err(fs_info,
6118 "tree block crosses page boundary, start %llu nodesize %u",
6119 start, fs_info->nodesize);
6120 return -EINVAL;
6121 }
6122 if (fs_info->nodesize >= PAGE_SIZE &&
Fanjun Kong1280d2d2022-05-26 22:35:40 +08006123 !PAGE_ALIGNED(start)) {
Qu Wenruofbca46e2022-01-13 13:22:09 +08006124 btrfs_err(fs_info,
6125 "tree block is not page aligned, start %llu nodesize %u",
6126 start, fs_info->nodesize);
6127 return -EINVAL;
6128 }
6129 return 0;
6130}
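
/*
 * Worked example of the checks above (hypothetical geometries):
 * - nodesize 16K, 64K pages (subpage case): a tree block starting 52K
 *   into a page would end past the page boundary (52K + 16K > 64K) and
 *   is rejected.
 * - nodesize 64K, 64K pages, 4K sectorsize: start = 0x11000 is sector
 *   aligned but not page aligned, so it is rejected by the
 *   PAGE_ALIGNED() check.
 */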
6131
Josef Bacikf28491e2013-12-16 13:24:27 -05006132struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
Josef Bacik3fbaf252020-11-05 10:45:20 -05006133 u64 start, u64 owner_root, int level)
Chris Masond1310b22008-01-24 16:13:08 -05006134{
Jeff Mahoneyda170662016-06-15 09:22:56 -04006135 unsigned long len = fs_info->nodesize;
David Sterbacc5e31a2018-03-01 18:20:27 +01006136 int num_pages;
6137 int i;
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03006138 unsigned long index = start >> PAGE_SHIFT;
Chris Masond1310b22008-01-24 16:13:08 -05006139 struct extent_buffer *eb;
Chris Mason6af118ce2008-07-22 11:18:07 -04006140 struct extent_buffer *exists = NULL;
Chris Masond1310b22008-01-24 16:13:08 -05006141 struct page *p;
Josef Bacikf28491e2013-12-16 13:24:27 -05006142 struct address_space *mapping = fs_info->btree_inode->i_mapping;
Josef Bacikb40130b2022-07-26 16:24:04 -04006143 u64 lockdep_owner = owner_root;
Chris Masond1310b22008-01-24 16:13:08 -05006144 int uptodate = 1;
Miao Xie19fe0a82010-10-26 20:57:29 -04006145 int ret;
Chris Masond1310b22008-01-24 16:13:08 -05006146
Qu Wenruofbca46e2022-01-13 13:22:09 +08006147 if (check_eb_alignment(fs_info, start))
Liu Boc871b0f2016-06-06 12:01:23 -07006148 return ERR_PTR(-EINVAL);
Liu Boc871b0f2016-06-06 12:01:23 -07006149
Qu Wenruoe9306ad2021-02-25 09:18:14 +08006150#if BITS_PER_LONG == 32
6151 if (start >= MAX_LFS_FILESIZE) {
6152 btrfs_err_rl(fs_info,
6153 "extent buffer %llu is beyond 32bit page cache limit", start);
6154 btrfs_err_32bit_limit(fs_info);
6155 return ERR_PTR(-EOVERFLOW);
6156 }
6157 if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
6158 btrfs_warn_32bit_limit(fs_info);
6159#endif
6160
Josef Bacikf28491e2013-12-16 13:24:27 -05006161 eb = find_extent_buffer(fs_info, start);
Chandra Seetharaman452c75c2013-10-07 10:45:25 -05006162 if (eb)
Chris Mason6af118ce2008-07-22 11:18:07 -04006163 return eb;
Chris Mason6af118ce2008-07-22 11:18:07 -04006164
David Sterba23d79d82014-06-15 02:55:29 +02006165 eb = __alloc_extent_buffer(fs_info, start, len);
Peter2b114d12008-04-01 11:21:40 -04006166 if (!eb)
Liu Boc871b0f2016-06-06 12:01:23 -07006167 return ERR_PTR(-ENOMEM);
Josef Bacikb40130b2022-07-26 16:24:04 -04006168
6169 /*
6170 * The reloc trees are just snapshots, so we need them to appear to be
6171 * just like any other fs tree WRT lockdep.
6172 */
6173 if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID)
6174 lockdep_owner = BTRFS_FS_TREE_OBJECTID;
6175
6176 btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
Chris Masond1310b22008-01-24 16:13:08 -05006177
David Sterba65ad0102018-06-29 10:56:49 +02006178 num_pages = num_extent_pages(eb);
Chris Mason727011e2010-08-06 13:21:20 -04006179 for (i = 0; i < num_pages; i++, index++) {
Qu Wenruo760f9912021-01-26 16:33:48 +08006180 struct btrfs_subpage *prealloc = NULL;
6181
Michal Hockod1b5c5672015-08-19 14:17:40 +02006182 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
Liu Boc871b0f2016-06-06 12:01:23 -07006183 if (!p) {
6184 exists = ERR_PTR(-ENOMEM);
Chris Mason6af118ce2008-07-22 11:18:07 -04006185 goto free_eb;
Liu Boc871b0f2016-06-06 12:01:23 -07006186 }
Josef Bacik4f2de97a2012-03-07 16:20:05 -05006187
Qu Wenruo760f9912021-01-26 16:33:48 +08006188 /*
6189 * Preallocate page->private for the subpage case, so that we won't
6190 * allocate memory with private_lock held. The memory will be
6191 * freed by attach_extent_buffer_page() or freed manually if
6192 * we exit earlier.
6193 *
6194 * Although we have ensured one subpage eb can only have one
6195 * page, this may change in the future for 16K page size
6196 * support, so we still preallocate the memory in the loop.
6197 */
Qu Wenruofbca46e2022-01-13 13:22:09 +08006198 if (fs_info->nodesize < PAGE_SIZE) {
Qu Wenruo651fb412021-08-17 17:38:50 +08006199 prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
6200 if (IS_ERR(prealloc)) {
6201 ret = PTR_ERR(prealloc);
Qu Wenruofdf250d2021-08-17 17:38:49 +08006202 unlock_page(p);
6203 put_page(p);
6204 exists = ERR_PTR(ret);
6205 goto free_eb;
6206 }
Qu Wenruo760f9912021-01-26 16:33:48 +08006207 }
6208
Josef Bacik4f2de97a2012-03-07 16:20:05 -05006209 spin_lock(&mapping->private_lock);
Qu Wenruo819822102021-01-26 16:33:49 +08006210 exists = grab_extent_buffer(fs_info, p);
Qu Wenruoc0f0a9e2021-01-06 09:01:45 +08006211 if (exists) {
6212 spin_unlock(&mapping->private_lock);
6213 unlock_page(p);
6214 put_page(p);
6215 mark_extent_buffer_accessed(exists, p);
Qu Wenruo760f9912021-01-26 16:33:48 +08006216 btrfs_free_subpage(prealloc);
Qu Wenruoc0f0a9e2021-01-06 09:01:45 +08006217 goto free_eb;
Chris Masond1310b22008-01-24 16:13:08 -05006218 }
Qu Wenruo760f9912021-01-26 16:33:48 +08006219 /* Should not fail, as we have preallocated the memory */
6220 ret = attach_extent_buffer_page(eb, p, prealloc);
6221 ASSERT(!ret);
Qu Wenruo8ff84662021-01-26 16:33:50 +08006222 /*
6223 * Record that we have an extra eb under allocation, so that
6224 * detach_extent_buffer_page() won't release the page private
6225 * while the eb hasn't yet been inserted into the radix tree.
6226 *
6227 * The ref will be decreased when the eb releases the page, in
6228 * detach_extent_buffer_page(), so the error path needs no special
6229 * handling.
6230 */
6231 btrfs_page_inc_eb_refs(fs_info, p);
Josef Bacik4f2de97a2012-03-07 16:20:05 -05006232 spin_unlock(&mapping->private_lock);
Qu Wenruo760f9912021-01-26 16:33:48 +08006233
Qu Wenruo1e5eb3d2021-03-25 15:14:41 +08006234 WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
Chris Mason727011e2010-08-06 13:21:20 -04006235 eb->pages[i] = p;
Chris Masond1310b22008-01-24 16:13:08 -05006236 if (!PageUptodate(p))
6237 uptodate = 0;
Chris Masoneb14ab82011-02-10 12:35:00 -05006238
6239 /*
Nikolay Borisovb16d0112018-07-04 10:24:52 +03006240 * We can't unlock the pages just yet since the extent buffer
6241 * hasn't been properly inserted in the radix tree yet; that
Matthew Wilcox (Oracle)f913cff2022-04-30 23:15:16 -04006242 * would open a race with btree_release_folio, which can free a page
Nikolay Borisovb16d0112018-07-04 10:24:52 +03006243 * while we are still filling in all pages for the buffer, and
6244 * we could crash.
Chris Masoneb14ab82011-02-10 12:35:00 -05006245 */
Chris Masond1310b22008-01-24 16:13:08 -05006246 }
6247 if (uptodate)
Chris Masonb4ce94d2009-02-04 09:25:08 -05006248 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
David Sterba01cd3902022-07-15 13:59:31 +02006249again:
6250 ret = radix_tree_preload(GFP_NOFS);
6251 if (ret) {
6252 exists = ERR_PTR(ret);
6253 goto free_eb;
6254 }
Miao Xie19fe0a82010-10-26 20:57:29 -04006255
David Sterba01cd3902022-07-15 13:59:31 +02006256 spin_lock(&fs_info->buffer_lock);
6257 ret = radix_tree_insert(&fs_info->buffer_radix,
6258 start >> fs_info->sectorsize_bits, eb);
6259 spin_unlock(&fs_info->buffer_lock);
6260 radix_tree_preload_end();
6261 if (ret == -EEXIST) {
6262 exists = find_extent_buffer(fs_info, start);
6263 if (exists)
Chandra Seetharaman452c75c2013-10-07 10:45:25 -05006264 goto free_eb;
David Sterba01cd3902022-07-15 13:59:31 +02006265 else
6266 goto again;
6267 }
Chris Mason6af118ce2008-07-22 11:18:07 -04006268 /* add one reference for the tree */
Josef Bacik0b32f4b2012-03-13 09:38:00 -04006269 check_buffer_tree_ref(eb);
Josef Bacik34b41ac2013-12-13 10:41:51 -05006270 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
Chris Masoneb14ab82011-02-10 12:35:00 -05006271
6272 /*
Nikolay Borisovb16d0112018-07-04 10:24:52 +03006273 * Now it's safe to unlock the pages because any calls to
Matthew Wilcox (Oracle)f913cff2022-04-30 23:15:16 -04006274 * btree_release_folio will correctly detect that a page belongs to a
Nikolay Borisovb16d0112018-07-04 10:24:52 +03006275 * live buffer and won't free them prematurely.
Chris Masoneb14ab82011-02-10 12:35:00 -05006276 */
Nikolay Borisov28187ae2018-07-04 10:24:51 +03006277 for (i = 0; i < num_pages; i++)
6278 unlock_page(eb->pages[i]);
Chris Masond1310b22008-01-24 16:13:08 -05006279 return eb;
6280
Chris Mason6af118ce2008-07-22 11:18:07 -04006281free_eb:
Omar Sandoval5ca64f42015-02-24 02:47:05 -08006282 WARN_ON(!atomic_dec_and_test(&eb->refs));
Chris Mason727011e2010-08-06 13:21:20 -04006283 for (i = 0; i < num_pages; i++) {
6284 if (eb->pages[i])
6285 unlock_page(eb->pages[i]);
6286 }
Chris Masoneb14ab82011-02-10 12:35:00 -05006287
Miao Xie897ca6e92010-10-26 20:57:29 -04006288 btrfs_release_extent_buffer(eb);
Chris Mason6af118ce2008-07-22 11:18:07 -04006289 return exists;
Chris Masond1310b22008-01-24 16:13:08 -05006290}
Chris Masond1310b22008-01-24 16:13:08 -05006291
Josef Bacik3083ee22012-03-09 16:01:49 -05006292static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
6293{
6294 struct extent_buffer *eb =
6295 container_of(head, struct extent_buffer, rcu_head);
6296
6297 __free_extent_buffer(eb);
6298}
6299
David Sterbaf7a52a42013-04-26 14:56:29 +00006300static int release_extent_buffer(struct extent_buffer *eb)
Jules Irenge5ce48d02020-02-23 23:16:42 +00006301 __releases(&eb->refs_lock)
Josef Bacik3083ee22012-03-09 16:01:49 -05006302{
Nikolay Borisov07e21c42018-06-27 16:38:23 +03006303 lockdep_assert_held(&eb->refs_lock);
6304
Josef Bacik3083ee22012-03-09 16:01:49 -05006305 WARN_ON(atomic_read(&eb->refs) == 0);
6306 if (atomic_dec_and_test(&eb->refs)) {
Josef Bacik34b41ac2013-12-13 10:41:51 -05006307 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
Josef Bacikf28491e2013-12-16 13:24:27 -05006308 struct btrfs_fs_info *fs_info = eb->fs_info;
Josef Bacik3083ee22012-03-09 16:01:49 -05006309
Jan Schmidt815a51c2012-05-16 17:00:02 +02006310 spin_unlock(&eb->refs_lock);
Josef Bacik3083ee22012-03-09 16:01:49 -05006311
David Sterba01cd3902022-07-15 13:59:31 +02006312 spin_lock(&fs_info->buffer_lock);
6313 radix_tree_delete(&fs_info->buffer_radix,
6314 eb->start >> fs_info->sectorsize_bits);
6315 spin_unlock(&fs_info->buffer_lock);
Josef Bacik34b41ac2013-12-13 10:41:51 -05006316 } else {
6317 spin_unlock(&eb->refs_lock);
Jan Schmidt815a51c2012-05-16 17:00:02 +02006318 }
Josef Bacik3083ee22012-03-09 16:01:49 -05006319
Josef Bacik8c389382020-02-14 16:11:42 -05006320 btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
Josef Bacik3083ee22012-03-09 16:01:49 -05006321 /* Should be safe to release our pages at this point */
David Sterba55ac0132018-07-19 17:24:32 +02006322 btrfs_release_extent_buffer_pages(eb);
Josef Bacikbcb7e442015-03-16 17:38:02 -04006323#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
Nikolay Borisovb0132a3b2018-06-27 16:38:24 +03006324 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
Josef Bacikbcb7e442015-03-16 17:38:02 -04006325 __free_extent_buffer(eb);
6326 return 1;
6327 }
6328#endif
Josef Bacik3083ee22012-03-09 16:01:49 -05006329 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
Josef Bacike64860a2012-07-20 16:05:36 -04006330 return 1;
Josef Bacik3083ee22012-03-09 16:01:49 -05006331 }
6332 spin_unlock(&eb->refs_lock);
Josef Bacike64860a2012-07-20 16:05:36 -04006333
6334 return 0;
Josef Bacik3083ee22012-03-09 16:01:49 -05006335}
6336
Chris Masond1310b22008-01-24 16:13:08 -05006337void free_extent_buffer(struct extent_buffer *eb)
6338{
Chris Mason242e18c2013-01-29 17:49:37 -05006339 int refs;
6340 int old;
Chris Masond1310b22008-01-24 16:13:08 -05006341 if (!eb)
6342 return;
6343
Chris Mason242e18c2013-01-29 17:49:37 -05006344 while (1) {
6345 refs = atomic_read(&eb->refs);
Nikolay Borisov46cc7752018-10-15 17:04:01 +03006346 if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
6347 || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
6348 refs == 1))
Chris Mason242e18c2013-01-29 17:49:37 -05006349 break;
6350 old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
6351 if (old == refs)
6352 return;
6353 }
6354
Josef Bacik3083ee22012-03-09 16:01:49 -05006355 spin_lock(&eb->refs_lock);
6356 if (atomic_read(&eb->refs) == 2 &&
6357 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
Josef Bacik0b32f4b2012-03-13 09:38:00 -04006358 !extent_buffer_under_io(eb) &&
Josef Bacik3083ee22012-03-09 16:01:49 -05006359 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6360 atomic_dec(&eb->refs);
Chris Masond1310b22008-01-24 16:13:08 -05006361
Josef Bacik3083ee22012-03-09 16:01:49 -05006362 /*
6363 * I know this is terrible, but it's temporary until we stop tracking
6364 * the uptodate bits and such for the extent buffers.
6365 */
David Sterbaf7a52a42013-04-26 14:56:29 +00006366 release_extent_buffer(eb);
Chris Masond1310b22008-01-24 16:13:08 -05006367}
Chris Masond1310b22008-01-24 16:13:08 -05006368
Josef Bacik3083ee22012-03-09 16:01:49 -05006369void free_extent_buffer_stale(struct extent_buffer *eb)
6370{
6371 if (!eb)
Chris Masond1310b22008-01-24 16:13:08 -05006372 return;
6373
Josef Bacik3083ee22012-03-09 16:01:49 -05006374 spin_lock(&eb->refs_lock);
6375 set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
6376
Josef Bacik0b32f4b2012-03-13 09:38:00 -04006377 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
Josef Bacik3083ee22012-03-09 16:01:49 -05006378 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6379 atomic_dec(&eb->refs);
David Sterbaf7a52a42013-04-26 14:56:29 +00006380 release_extent_buffer(eb);
Chris Masond1310b22008-01-24 16:13:08 -05006381}
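
/*
 * A minimal usage sketch (hypothetical caller, not part of btrfs itself):
 * look up a cached buffer, use it, and drop the reference with
 * free_extent_buffer(). free_extent_buffer_stale() is reserved for callers
 * that know the block is no longer reachable from the tree.
 */
static inline void example_peek_eb(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct extent_buffer *eb;

	eb = find_extent_buffer(fs_info, bytenr);
	if (!eb)
		return;
	/* ... read fields with read_extent_buffer() here ... */
	free_extent_buffer(eb);
}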
6382
Qu Wenruo0d277972021-03-25 15:14:43 +08006383static void btree_clear_page_dirty(struct page *page)
6384{
6385 ASSERT(PageDirty(page));
6386 ASSERT(PageLocked(page));
6387 clear_page_dirty_for_io(page);
6388 xa_lock_irq(&page->mapping->i_pages);
6389 if (!PageDirty(page))
6390 __xa_clear_mark(&page->mapping->i_pages,
6391 page_index(page), PAGECACHE_TAG_DIRTY);
6392 xa_unlock_irq(&page->mapping->i_pages);
6393}
6394
6395static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
6396{
6397 struct btrfs_fs_info *fs_info = eb->fs_info;
6398 struct page *page = eb->pages[0];
6399 bool last;
6400
6401 /* btree_clear_page_dirty() needs page locked */
6402 lock_page(page);
6403 last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
6404 eb->len);
6405 if (last)
6406 btree_clear_page_dirty(page);
6407 unlock_page(page);
6408 WARN_ON(atomic_read(&eb->refs) == 0);
6409}
6410
David Sterba2b489662020-04-29 03:04:10 +02006411void clear_extent_buffer_dirty(const struct extent_buffer *eb)
Chris Masond1310b22008-01-24 16:13:08 -05006412{
David Sterbacc5e31a2018-03-01 18:20:27 +01006413 int i;
6414 int num_pages;
Chris Masond1310b22008-01-24 16:13:08 -05006415 struct page *page;
6416
Qu Wenruofbca46e2022-01-13 13:22:09 +08006417 if (eb->fs_info->nodesize < PAGE_SIZE)
Qu Wenruo0d277972021-03-25 15:14:43 +08006418 return clear_subpage_extent_buffer_dirty(eb);
6419
David Sterba65ad0102018-06-29 10:56:49 +02006420 num_pages = num_extent_pages(eb);
Chris Masond1310b22008-01-24 16:13:08 -05006421
6422 for (i = 0; i < num_pages; i++) {
David Sterbafb85fc92014-07-31 01:03:53 +02006423 page = eb->pages[i];
Chris Masonb9473432009-03-13 11:00:37 -04006424 if (!PageDirty(page))
Chris Masond2c3f4f2008-11-19 12:44:22 -05006425 continue;
Chris Masona61e6f22008-07-22 11:18:08 -04006426 lock_page(page);
Qu Wenruo0d277972021-03-25 15:14:43 +08006427 btree_clear_page_dirty(page);
Chris Masonbf0da8c2011-11-04 12:29:37 -04006428 ClearPageError(page);
Chris Masona61e6f22008-07-22 11:18:08 -04006429 unlock_page(page);
Chris Masond1310b22008-01-24 16:13:08 -05006430 }
Josef Bacik0b32f4b2012-03-13 09:38:00 -04006431 WARN_ON(atomic_read(&eb->refs) == 0);
Chris Masond1310b22008-01-24 16:13:08 -05006432}
Chris Masond1310b22008-01-24 16:13:08 -05006433
Liu Boabb57ef2018-09-14 01:44:42 +08006434bool set_extent_buffer_dirty(struct extent_buffer *eb)
Chris Masond1310b22008-01-24 16:13:08 -05006435{
David Sterbacc5e31a2018-03-01 18:20:27 +01006436 int i;
6437 int num_pages;
Liu Boabb57ef2018-09-14 01:44:42 +08006438 bool was_dirty;
Chris Masond1310b22008-01-24 16:13:08 -05006439
Josef Bacik0b32f4b2012-03-13 09:38:00 -04006440 check_buffer_tree_ref(eb);
6441
Chris Masonb9473432009-03-13 11:00:37 -04006442 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04006443
David Sterba65ad0102018-06-29 10:56:49 +02006444 num_pages = num_extent_pages(eb);
Josef Bacik3083ee22012-03-09 16:01:49 -05006445 WARN_ON(atomic_read(&eb->refs) == 0);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04006446 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
6447
Qu Wenruo0d277972021-03-25 15:14:43 +08006448 if (!was_dirty) {
Qu Wenruofbca46e2022-01-13 13:22:09 +08006449 bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
Liu Bo51995c32018-09-14 01:46:08 +08006450
Qu Wenruo0d277972021-03-25 15:14:43 +08006451 /*
6452 * For the subpage case, we can have other extent buffers in the
6453 * same page, and in clear_subpage_extent_buffer_dirty() we have
6454 * to clear the page dirty bit without the subpage lock held.
6455 * This can cause a race where our page's dirty bit gets cleared
6456 * right after we set it.
6457 *
6458 * Thankfully, clear_subpage_extent_buffer_dirty() already locks
6459 * its page for other reasons, so we can use the page lock to
6460 * prevent the above race.
6461 */
6462 if (subpage)
6463 lock_page(eb->pages[0]);
6464 for (i = 0; i < num_pages; i++)
6465 btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
6466 eb->start, eb->len);
6467 if (subpage)
6468 unlock_page(eb->pages[0]);
6469 }
Liu Bo51995c32018-09-14 01:46:08 +08006470#ifdef CONFIG_BTRFS_DEBUG
6471 for (i = 0; i < num_pages; i++)
6472 ASSERT(PageDirty(eb->pages[i]));
6473#endif
6474
Chris Masonb9473432009-03-13 11:00:37 -04006475 return was_dirty;
Chris Masond1310b22008-01-24 16:13:08 -05006476}
Chris Masond1310b22008-01-24 16:13:08 -05006477
David Sterba69ba3922015-12-03 13:08:59 +01006478void clear_extent_buffer_uptodate(struct extent_buffer *eb)
Chris Mason1259ab72008-05-12 13:39:03 -04006479{
Qu Wenruo251f2ac2021-01-26 16:33:54 +08006480 struct btrfs_fs_info *fs_info = eb->fs_info;
Chris Mason1259ab72008-05-12 13:39:03 -04006481 struct page *page;
David Sterbacc5e31a2018-03-01 18:20:27 +01006482 int num_pages;
Qu Wenruo251f2ac2021-01-26 16:33:54 +08006483 int i;
Chris Mason1259ab72008-05-12 13:39:03 -04006484
Chris Masonb4ce94d2009-02-04 09:25:08 -05006485 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
David Sterba65ad0102018-06-29 10:56:49 +02006486 num_pages = num_extent_pages(eb);
Chris Mason1259ab72008-05-12 13:39:03 -04006487 for (i = 0; i < num_pages; i++) {
David Sterbafb85fc92014-07-31 01:03:53 +02006488 page = eb->pages[i];
Qu Wenruofbca46e2022-01-13 13:22:09 +08006489 if (!page)
6490 continue;
6491
6492 /*
6493 * This is special handling for metadata subpage, as regular
6494 * btrfs_is_subpage() cannot handle cloned/dummy metadata.
6495 */
6496 if (fs_info->nodesize >= PAGE_SIZE)
6497 ClearPageUptodate(page);
6498 else
6499 btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
6500 eb->len);
Chris Mason1259ab72008-05-12 13:39:03 -04006501 }
Chris Mason1259ab72008-05-12 13:39:03 -04006502}
6503
David Sterba09c25a82015-12-03 13:08:59 +01006504void set_extent_buffer_uptodate(struct extent_buffer *eb)
Chris Masond1310b22008-01-24 16:13:08 -05006505{
Qu Wenruo251f2ac2021-01-26 16:33:54 +08006506 struct btrfs_fs_info *fs_info = eb->fs_info;
Chris Masond1310b22008-01-24 16:13:08 -05006507 struct page *page;
David Sterbacc5e31a2018-03-01 18:20:27 +01006508 int num_pages;
Qu Wenruo251f2ac2021-01-26 16:33:54 +08006509 int i;
Chris Masond1310b22008-01-24 16:13:08 -05006510
Josef Bacik0b32f4b2012-03-13 09:38:00 -04006511 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
David Sterba65ad0102018-06-29 10:56:49 +02006512 num_pages = num_extent_pages(eb);
Chris Masond1310b22008-01-24 16:13:08 -05006513 for (i = 0; i < num_pages; i++) {
David Sterbafb85fc92014-07-31 01:03:53 +02006514 page = eb->pages[i];
Qu Wenruofbca46e2022-01-13 13:22:09 +08006515
6516 /*
6517 * This is special handling for metadata subpage, as regular
6518 * btrfs_is_subpage() cannot handle cloned/dummy metadata.
6519 */
6520 if (fs_info->nodesize >= PAGE_SIZE)
6521 SetPageUptodate(page);
6522 else
6523 btrfs_subpage_set_uptodate(fs_info, page, eb->start,
6524 eb->len);
Chris Masond1310b22008-01-24 16:13:08 -05006525 }
Chris Masond1310b22008-01-24 16:13:08 -05006526}
Chris Masond1310b22008-01-24 16:13:08 -05006527
Qu Wenruo4012daf2021-01-26 16:33:57 +08006528static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
6529 int mirror_num)
6530{
6531 struct btrfs_fs_info *fs_info = eb->fs_info;
6532 struct extent_io_tree *io_tree;
6533 struct page *page = eb->pages[0];
Christoph Hellwig722c82a2022-06-03 09:11:03 +02006534 struct btrfs_bio_ctrl bio_ctrl = {
6535 .mirror_num = mirror_num,
6536 };
Qu Wenruo4012daf2021-01-26 16:33:57 +08006537 int ret = 0;
6538
6539 ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
6540 ASSERT(PagePrivate(page));
6541 io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
6542
6543 if (wait == WAIT_NONE) {
Goldwyn Rodriguesdc562192021-04-08 07:40:25 -05006544 if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
6545 return -EAGAIN;
Qu Wenruo4012daf2021-01-26 16:33:57 +08006546 } else {
6547 ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
6548 if (ret < 0)
6549 return ret;
6550 }
6551
6552 ret = 0;
6553 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
6554 PageUptodate(page) ||
6555 btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
6556 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6557 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
6558 return ret;
6559 }
6560
6561 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
6562 eb->read_mirror = 0;
6563 atomic_set(&eb->io_pages, 1);
6564 check_buffer_tree_ref(eb);
6565 btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
6566
Qu Wenruo3d078ef2021-06-07 17:02:58 +08006567 btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
Christoph Hellwig08a6f462022-05-26 09:36:39 +02006568 ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
Qu Wenruo390ed29b82021-04-14 16:42:15 +08006569 page, eb->start, eb->len,
6570 eb->start - page_offset(page),
Christoph Hellwig722c82a2022-06-03 09:11:03 +02006571 end_bio_extent_readpage, 0, true);
Qu Wenruo4012daf2021-01-26 16:33:57 +08006572 if (ret) {
6573 /*
6574 * In the endio function, if we hit something wrong we will
6575 * increase the io_pages, so here we need to decrease it for
6576 * the error path.
6577 */
6578 atomic_dec(&eb->io_pages);
6579 }
Christoph Hellwig722c82a2022-06-03 09:11:03 +02006580 submit_one_bio(&bio_ctrl);
Qu Wenruo4012daf2021-01-26 16:33:57 +08006581 if (ret || wait != WAIT_COMPLETE)
6582 return ret;
6583
6584 wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
6585 if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
6586 ret = -EIO;
6587 return ret;
6588}
6589
Nikolay Borisovc2ccfbc2019-04-10 17:24:40 +03006590int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
Chris Masond1310b22008-01-24 16:13:08 -05006591{
David Sterbacc5e31a2018-03-01 18:20:27 +01006592 int i;
Chris Masond1310b22008-01-24 16:13:08 -05006593 struct page *page;
6594 int err;
6595 int ret = 0;
Chris Masonce9adaa2008-04-09 16:28:12 -04006596 int locked_pages = 0;
6597 int all_uptodate = 1;
David Sterbacc5e31a2018-03-01 18:20:27 +01006598 int num_pages;
Chris Mason727011e2010-08-06 13:21:20 -04006599 unsigned long num_reads = 0;
Christoph Hellwig722c82a2022-06-03 09:11:03 +02006600 struct btrfs_bio_ctrl bio_ctrl = {
6601 .mirror_num = mirror_num,
6602 };
Chris Masona86c12c2008-02-07 10:50:54 -05006603
Chris Masonb4ce94d2009-02-04 09:25:08 -05006604 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
Chris Masond1310b22008-01-24 16:13:08 -05006605 return 0;
6606
Josef Bacik651740a52021-12-13 14:22:33 -05006607 /*
6608 * We could have had EXTENT_BUFFER_UPTODATE cleared by the write
6609 * operation, which could potentially still be in flight. In this case
6610 * we simply want to return an error.
6611 */
6612 if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
6613 return -EIO;
6614
Qu Wenruofbca46e2022-01-13 13:22:09 +08006615 if (eb->fs_info->nodesize < PAGE_SIZE)
Qu Wenruo4012daf2021-01-26 16:33:57 +08006616 return read_extent_buffer_subpage(eb, wait, mirror_num);
6617
David Sterba65ad0102018-06-29 10:56:49 +02006618 num_pages = num_extent_pages(eb);
Josef Bacik8436ea912016-09-02 15:40:03 -04006619 for (i = 0; i < num_pages; i++) {
David Sterbafb85fc92014-07-31 01:03:53 +02006620 page = eb->pages[i];
Arne Jansenbb82ab82011-06-10 14:06:53 +02006621 if (wait == WAIT_NONE) {
Qu Wenruo2c4d8cb2021-01-28 19:25:08 +08006622 /*
6623 * WAIT_NONE is only utilized by readahead. If we can't
6624 * acquire the lock atomically it means either the eb
6625 * is being read out or under modification.
6626 * Either way the eb will be or has been cached,
6627 * so readahead can exit safely.
6628 */
David Woodhouse2db04962008-08-07 11:19:43 -04006629 if (!trylock_page(page))
Chris Masonce9adaa2008-04-09 16:28:12 -04006630 goto unlock_exit;
Chris Masond1310b22008-01-24 16:13:08 -05006631 } else {
6632 lock_page(page);
6633 }
Chris Masonce9adaa2008-04-09 16:28:12 -04006634 locked_pages++;
Liu Bo2571e732016-08-03 12:33:01 -07006635 }
6636 /*
6637 * We need to lock all pages first to make sure that
6638 * the uptodate bit of our pages won't be affected by
6639 * clear_extent_buffer_uptodate().
6640 */
Josef Bacik8436ea912016-09-02 15:40:03 -04006641 for (i = 0; i < num_pages; i++) {
Liu Bo2571e732016-08-03 12:33:01 -07006642 page = eb->pages[i];
Chris Mason727011e2010-08-06 13:21:20 -04006643 if (!PageUptodate(page)) {
6644 num_reads++;
Chris Masonce9adaa2008-04-09 16:28:12 -04006645 all_uptodate = 0;
Chris Mason727011e2010-08-06 13:21:20 -04006646 }
Chris Masonce9adaa2008-04-09 16:28:12 -04006647 }
Liu Bo2571e732016-08-03 12:33:01 -07006648
Chris Masonce9adaa2008-04-09 16:28:12 -04006649 if (all_uptodate) {
Josef Bacik8436ea912016-09-02 15:40:03 -04006650 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
Chris Masonce9adaa2008-04-09 16:28:12 -04006651 goto unlock_exit;
6652 }
6653
Filipe Manana656f30d2014-09-26 12:25:56 +01006654 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
Josef Bacik5cf1ab52012-04-16 09:42:26 -04006655 eb->read_mirror = 0;
Josef Bacik0b32f4b2012-03-13 09:38:00 -04006656 atomic_set(&eb->io_pages, num_reads);
Boris Burkov6bf9cd22020-06-17 11:35:19 -07006657 /*
Matthew Wilcox (Oracle)f913cff2022-04-30 23:15:16 -04006658 * It is possible for release_folio to clear the TREE_REF bit before we
Boris Burkov6bf9cd22020-06-17 11:35:19 -07006659 * set io_pages. See check_buffer_tree_ref for a more detailed comment.
6660 */
6661 check_buffer_tree_ref(eb);
Josef Bacik8436ea912016-09-02 15:40:03 -04006662 for (i = 0; i < num_pages; i++) {
David Sterbafb85fc92014-07-31 01:03:53 +02006663 page = eb->pages[i];
Liu Bobaf863b2016-07-11 10:39:07 -07006664
Chris Masonce9adaa2008-04-09 16:28:12 -04006665 if (!PageUptodate(page)) {
Liu Bobaf863b2016-07-11 10:39:07 -07006666 if (ret) {
6667 atomic_dec(&eb->io_pages);
6668 unlock_page(page);
6669 continue;
6670 }
6671
Chris Masonf1885912008-04-09 16:28:12 -04006672 ClearPageError(page);
Christoph Hellwig08a6f462022-05-26 09:36:39 +02006673 err = submit_extent_page(REQ_OP_READ, NULL,
Qu Wenruo390ed29b82021-04-14 16:42:15 +08006674 &bio_ctrl, page, page_offset(page),
6675 PAGE_SIZE, 0, end_bio_extent_readpage,
Christoph Hellwig722c82a2022-06-03 09:11:03 +02006676 0, false);
Liu Bobaf863b2016-07-11 10:39:07 -07006677 if (err) {
Liu Bobaf863b2016-07-11 10:39:07 -07006678 /*
Nikolay Borisov04201772020-09-14 12:37:04 +03006679 * We failed to submit the bio so it's the
6680 * caller's responsibility to perform cleanup
6681 * i.e. unlock the page and set the error bit.
Liu Bobaf863b2016-07-11 10:39:07 -07006682 */
Nikolay Borisov04201772020-09-14 12:37:04 +03006683 ret = err;
6684 SetPageError(page);
6685 unlock_page(page);
Liu Bobaf863b2016-07-11 10:39:07 -07006686 atomic_dec(&eb->io_pages);
6687 }
Chris Masond1310b22008-01-24 16:13:08 -05006688 } else {
6689 unlock_page(page);
6690 }
6691 }
6692
Christoph Hellwig722c82a2022-06-03 09:11:03 +02006693 submit_one_bio(&bio_ctrl);
Chris Masona86c12c2008-02-07 10:50:54 -05006694
Arne Jansenbb82ab82011-06-10 14:06:53 +02006695 if (ret || wait != WAIT_COMPLETE)
Chris Masond1310b22008-01-24 16:13:08 -05006696 return ret;
Chris Masond3977122009-01-05 21:25:51 -05006697
Josef Bacik8436ea912016-09-02 15:40:03 -04006698 for (i = 0; i < num_pages; i++) {
David Sterbafb85fc92014-07-31 01:03:53 +02006699 page = eb->pages[i];
Chris Masond1310b22008-01-24 16:13:08 -05006700 wait_on_page_locked(page);
Chris Masond3977122009-01-05 21:25:51 -05006701 if (!PageUptodate(page))
Chris Masond1310b22008-01-24 16:13:08 -05006702 ret = -EIO;
Chris Masond1310b22008-01-24 16:13:08 -05006703 }
Chris Masond3977122009-01-05 21:25:51 -05006704
Chris Masond1310b22008-01-24 16:13:08 -05006705 return ret;
Chris Masonce9adaa2008-04-09 16:28:12 -04006706
6707unlock_exit:
Chris Masond3977122009-01-05 21:25:51 -05006708 while (locked_pages > 0) {
Chris Masonce9adaa2008-04-09 16:28:12 -04006709 locked_pages--;
Josef Bacik8436ea912016-09-02 15:40:03 -04006710 page = eb->pages[locked_pages];
6711 unlock_page(page);
Chris Masonce9adaa2008-04-09 16:28:12 -04006712 }
6713 return ret;
Chris Masond1310b22008-01-24 16:13:08 -05006714}
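
/*
 * A minimal sketch (hypothetical helper): synchronously read the pages of
 * an eb from the default mirror and report whether it became uptodate.
 * In btrfs proper, callers layer mirror retries and further validation on
 * top of this.
 */
static inline int example_read_eb(struct extent_buffer *eb)
{
	int ret;

	ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, 0);
	if (ret < 0)
		return ret;
	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ? 0 : -EIO;
}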
Chris Masond1310b22008-01-24 16:13:08 -05006715
Qu Wenruof98b6212020-08-19 14:35:47 +08006716static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
6717 unsigned long len)
6718{
6719 btrfs_warn(eb->fs_info,
6720 "access to eb bytenr %llu len %lu out of range start %lu len %lu",
6721 eb->start, eb->len, start, len);
6722 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6723
6724 return true;
6725}
6726
6727/*
6728 * Check if the [start, start + len) range is valid before reading/writing
6729 * the eb.
6730 * NOTE: @start and @len are offsets inside the eb, not logical addresses.
6731 *
6732 * The caller should not touch the dst/src memory if this function returns an error.
6733 */
6734static inline int check_eb_range(const struct extent_buffer *eb,
6735 unsigned long start, unsigned long len)
6736{
6737 unsigned long offset;
6738
6739 /* start, start + len should not go beyond eb->len nor overflow */
6740 if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
6741 return report_eb_range(eb, start, len);
6742
6743 return false;
6744}
6745
Jeff Mahoney1cbb1f42017-06-28 21:56:53 -06006746void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
6747 unsigned long start, unsigned long len)
Chris Masond1310b22008-01-24 16:13:08 -05006748{
6749 size_t cur;
6750 size_t offset;
6751 struct page *page;
6752 char *kaddr;
6753 char *dst = (char *)dstv;
Qu Wenruo884b07d2020-12-02 14:48:04 +08006754 unsigned long i = get_eb_page_index(start);
Chris Masond1310b22008-01-24 16:13:08 -05006755
Qu Wenruof98b6212020-08-19 14:35:47 +08006756 if (check_eb_range(eb, start, len))
Liu Bof716abd2017-08-09 11:10:16 -06006757 return;
Chris Masond1310b22008-01-24 16:13:08 -05006758
Qu Wenruo884b07d2020-12-02 14:48:04 +08006759 offset = get_eb_offset_in_page(eb, start);
Chris Masond1310b22008-01-24 16:13:08 -05006760
Chris Masond3977122009-01-05 21:25:51 -05006761 while (len > 0) {
David Sterbafb85fc92014-07-31 01:03:53 +02006762 page = eb->pages[i];
Chris Masond1310b22008-01-24 16:13:08 -05006763
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03006764 cur = min(len, (PAGE_SIZE - offset));
Chris Masona6591712011-07-19 12:04:14 -04006765 kaddr = page_address(page);
Chris Masond1310b22008-01-24 16:13:08 -05006766 memcpy(dst, kaddr + offset, cur);
Chris Masond1310b22008-01-24 16:13:08 -05006767
6768 dst += cur;
6769 len -= cur;
6770 offset = 0;
6771 i++;
6772 }
6773}
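
/*
 * A minimal sketch (hypothetical helper) of the typical calling pattern:
 * copy a fixed-size header field out of the eb into a local variable.
 * Offsets are relative to the start of the eb, as checked by
 * check_eb_range() above.
 */
static inline u64 example_header_generation(const struct extent_buffer *eb)
{
	__le64 gen;

	read_extent_buffer(eb, &gen,
			   offsetof(struct btrfs_header, generation),
			   sizeof(gen));
	return le64_to_cpu(gen);
}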
Chris Masond1310b22008-01-24 16:13:08 -05006774
Josef Bacika48b73e2020-08-10 11:42:27 -04006775int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
6776 void __user *dstv,
6777 unsigned long start, unsigned long len)
Gerhard Heift550ac1d2014-01-30 16:24:01 +01006778{
6779 size_t cur;
6780 size_t offset;
6781 struct page *page;
6782 char *kaddr;
6783 char __user *dst = (char __user *)dstv;
Qu Wenruo884b07d2020-12-02 14:48:04 +08006784 unsigned long i = get_eb_page_index(start);
Gerhard Heift550ac1d2014-01-30 16:24:01 +01006785 int ret = 0;
6786
6787 WARN_ON(start > eb->len);
6788 WARN_ON(start + len > eb->start + eb->len);
6789
Qu Wenruo884b07d2020-12-02 14:48:04 +08006790 offset = get_eb_offset_in_page(eb, start);
Gerhard Heift550ac1d2014-01-30 16:24:01 +01006791
6792 while (len > 0) {
David Sterbafb85fc92014-07-31 01:03:53 +02006793 page = eb->pages[i];
Gerhard Heift550ac1d2014-01-30 16:24:01 +01006794
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03006795 cur = min(len, (PAGE_SIZE - offset));
Gerhard Heift550ac1d2014-01-30 16:24:01 +01006796 kaddr = page_address(page);
Josef Bacika48b73e2020-08-10 11:42:27 -04006797 if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
Gerhard Heift550ac1d2014-01-30 16:24:01 +01006798 ret = -EFAULT;
6799 break;
6800 }
6801
6802 dst += cur;
6803 len -= cur;
6804 offset = 0;
6805 i++;
6806 }
6807
6808 return ret;
6809}
6810
Jeff Mahoney1cbb1f42017-06-28 21:56:53 -06006811int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
6812 unsigned long start, unsigned long len)
Chris Masond1310b22008-01-24 16:13:08 -05006813{
6814 size_t cur;
6815 size_t offset;
6816 struct page *page;
6817 char *kaddr;
6818 char *ptr = (char *)ptrv;
Qu Wenruo884b07d2020-12-02 14:48:04 +08006819 unsigned long i = get_eb_page_index(start);
Chris Masond1310b22008-01-24 16:13:08 -05006820 int ret = 0;
6821
Qu Wenruof98b6212020-08-19 14:35:47 +08006822 if (check_eb_range(eb, start, len))
6823 return -EINVAL;
Chris Masond1310b22008-01-24 16:13:08 -05006824
Qu Wenruo884b07d2020-12-02 14:48:04 +08006825 offset = get_eb_offset_in_page(eb, start);
Chris Masond1310b22008-01-24 16:13:08 -05006826
Chris Masond3977122009-01-05 21:25:51 -05006827 while (len > 0) {
David Sterbafb85fc92014-07-31 01:03:53 +02006828 page = eb->pages[i];
Chris Masond1310b22008-01-24 16:13:08 -05006829
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03006830 cur = min(len, (PAGE_SIZE - offset));
Chris Masond1310b22008-01-24 16:13:08 -05006831
Chris Masona6591712011-07-19 12:04:14 -04006832 kaddr = page_address(page);
Chris Masond1310b22008-01-24 16:13:08 -05006833 ret = memcmp(ptr, kaddr + offset, cur);
Chris Masond1310b22008-01-24 16:13:08 -05006834 if (ret)
6835 break;
6836
6837 ptr += cur;
6838 len -= cur;
6839 offset = 0;
6840 i++;
6841 }
6842 return ret;
6843}
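
/*
 * A minimal sketch (hypothetical helper): compare the fsid embedded in the
 * header against an expected value without copying the whole header out.
 */
static inline bool example_fsid_matches(const struct extent_buffer *eb,
					const u8 *fsid)
{
	return memcmp_extent_buffer(eb, fsid,
				    offsetof(struct btrfs_header, fsid),
				    BTRFS_FSID_SIZE) == 0;
}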
Chris Masond1310b22008-01-24 16:13:08 -05006844
Qu Wenruob8f95772021-03-25 15:14:42 +08006845/*
6846 * Check that the extent buffer is uptodate.
6847 *
6848 * For the regular sector size == PAGE_SIZE case, check if @page is uptodate.
6849 * For the subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
6850 */
6851static void assert_eb_page_uptodate(const struct extent_buffer *eb,
6852 struct page *page)
6853{
6854 struct btrfs_fs_info *fs_info = eb->fs_info;
6855
Josef Bacika50e1fc2022-02-18 10:17:39 -05006856 /*
6857 * If we are using the commit root we could potentially clear a page
6858 * Uptodate while we're using the extent buffer that we've previously
6859 * looked up. We don't want to complain in this case, as the page was
6860 * valid before; we just didn't write it out. Instead we want to catch
6861 * the case where we didn't actually read the block properly, which
6862 * would have !PageUptodate && !PageError, as we clear PageError before
6863 * reading.
6864 */
Qu Wenruofbca46e2022-01-13 13:22:09 +08006865 if (fs_info->nodesize < PAGE_SIZE) {
Josef Bacika50e1fc2022-02-18 10:17:39 -05006866 bool uptodate, error;
Qu Wenruob8f95772021-03-25 15:14:42 +08006867
6868 uptodate = btrfs_subpage_test_uptodate(fs_info, page,
6869 eb->start, eb->len);
Josef Bacika50e1fc2022-02-18 10:17:39 -05006870 error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
6871 WARN_ON(!uptodate && !error);
Qu Wenruob8f95772021-03-25 15:14:42 +08006872 } else {
Josef Bacika50e1fc2022-02-18 10:17:39 -05006873 WARN_ON(!PageUptodate(page) && !PageError(page));
Qu Wenruob8f95772021-03-25 15:14:42 +08006874 }
6875}
6876
David Sterba2b489662020-04-29 03:04:10 +02006877void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
David Sterbaf157bf72016-11-09 17:43:38 +01006878 const void *srcv)
6879{
6880 char *kaddr;
6881
Qu Wenruob8f95772021-03-25 15:14:42 +08006882 assert_eb_page_uptodate(eb, eb->pages[0]);
David Sterba24880be52020-09-21 22:07:14 +02006883 kaddr = page_address(eb->pages[0]) +
6884 get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
6885 chunk_tree_uuid));
6886 memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
David Sterbaf157bf72016-11-09 17:43:38 +01006887}
6888
David Sterba2b489662020-04-29 03:04:10 +02006889void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
David Sterbaf157bf72016-11-09 17:43:38 +01006890{
6891 char *kaddr;
6892
Qu Wenruob8f95772021-03-25 15:14:42 +08006893 assert_eb_page_uptodate(eb, eb->pages[0]);
David Sterba24880be52020-09-21 22:07:14 +02006894 kaddr = page_address(eb->pages[0]) +
6895 get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
6896 memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
David Sterbaf157bf72016-11-09 17:43:38 +01006897}
6898
David Sterba2b489662020-04-29 03:04:10 +02006899void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
Chris Masond1310b22008-01-24 16:13:08 -05006900 unsigned long start, unsigned long len)
6901{
6902 size_t cur;
6903 size_t offset;
6904 struct page *page;
6905 char *kaddr;
6906 char *src = (char *)srcv;
Qu Wenruo884b07d2020-12-02 14:48:04 +08006907 unsigned long i = get_eb_page_index(start);
Chris Masond1310b22008-01-24 16:13:08 -05006908
Naohiro Aotad35751562021-02-04 19:21:54 +09006909 WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
6910
Qu Wenruof98b6212020-08-19 14:35:47 +08006911 if (check_eb_range(eb, start, len))
6912 return;
Chris Masond1310b22008-01-24 16:13:08 -05006913
Qu Wenruo884b07d2020-12-02 14:48:04 +08006914 offset = get_eb_offset_in_page(eb, start);
Chris Masond1310b22008-01-24 16:13:08 -05006915
Chris Masond3977122009-01-05 21:25:51 -05006916 while (len > 0) {
David Sterbafb85fc92014-07-31 01:03:53 +02006917 page = eb->pages[i];
Qu Wenruob8f95772021-03-25 15:14:42 +08006918 assert_eb_page_uptodate(eb, page);
Chris Masond1310b22008-01-24 16:13:08 -05006919
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03006920 cur = min(len, PAGE_SIZE - offset);
Chris Masona6591712011-07-19 12:04:14 -04006921 kaddr = page_address(page);
Chris Masond1310b22008-01-24 16:13:08 -05006922 memcpy(kaddr + offset, src, cur);
Chris Masond1310b22008-01-24 16:13:08 -05006923
6924 src += cur;
6925 len -= cur;
6926 offset = 0;
6927 i++;
6928 }
6929}
Chris Masond1310b22008-01-24 16:13:08 -05006930
David Sterba2b489662020-04-29 03:04:10 +02006931void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
David Sterbab159fa22016-11-08 18:09:03 +01006932 unsigned long len)
Chris Masond1310b22008-01-24 16:13:08 -05006933{
6934 size_t cur;
6935 size_t offset;
6936 struct page *page;
6937 char *kaddr;
Qu Wenruo884b07d2020-12-02 14:48:04 +08006938 unsigned long i = get_eb_page_index(start);
Chris Masond1310b22008-01-24 16:13:08 -05006939
Qu Wenruof98b6212020-08-19 14:35:47 +08006940 if (check_eb_range(eb, start, len))
6941 return;
Chris Masond1310b22008-01-24 16:13:08 -05006942
Qu Wenruo884b07d2020-12-02 14:48:04 +08006943 offset = get_eb_offset_in_page(eb, start);
Chris Masond1310b22008-01-24 16:13:08 -05006944
Chris Masond3977122009-01-05 21:25:51 -05006945 while (len > 0) {
David Sterbafb85fc92014-07-31 01:03:53 +02006946 page = eb->pages[i];
Qu Wenruob8f95772021-03-25 15:14:42 +08006947 assert_eb_page_uptodate(eb, page);
Chris Masond1310b22008-01-24 16:13:08 -05006948
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03006949 cur = min(len, PAGE_SIZE - offset);
Chris Masona6591712011-07-19 12:04:14 -04006950 kaddr = page_address(page);
David Sterbab159fa22016-11-08 18:09:03 +01006951 memset(kaddr + offset, 0, cur);
Chris Masond1310b22008-01-24 16:13:08 -05006952
6953 len -= cur;
6954 offset = 0;
6955 i++;
6956 }
6957}
Chris Masond1310b22008-01-24 16:13:08 -05006958
David Sterba2b489662020-04-29 03:04:10 +02006959void copy_extent_buffer_full(const struct extent_buffer *dst,
6960 const struct extent_buffer *src)
David Sterba58e8012c2016-11-08 18:30:31 +01006961{
6962 int i;
David Sterbacc5e31a2018-03-01 18:20:27 +01006963 int num_pages;
David Sterba58e8012c2016-11-08 18:30:31 +01006964
6965 ASSERT(dst->len == src->len);
6966
Qu Wenruofbca46e2022-01-13 13:22:09 +08006967 if (dst->fs_info->nodesize >= PAGE_SIZE) {
Qu Wenruo884b07d2020-12-02 14:48:04 +08006968 num_pages = num_extent_pages(dst);
6969 for (i = 0; i < num_pages; i++)
6970 copy_page(page_address(dst->pages[i]),
6971 page_address(src->pages[i]));
6972 } else {
6973 size_t src_offset = get_eb_offset_in_page(src, 0);
6974 size_t dst_offset = get_eb_offset_in_page(dst, 0);
6975
Qu Wenruofbca46e2022-01-13 13:22:09 +08006976 ASSERT(src->fs_info->nodesize < PAGE_SIZE);
Qu Wenruo884b07d2020-12-02 14:48:04 +08006977 memcpy(page_address(dst->pages[0]) + dst_offset,
6978 page_address(src->pages[0]) + src_offset,
6979 src->len);
6980 }
David Sterba58e8012c2016-11-08 18:30:31 +01006981}
6982
David Sterba2b489662020-04-29 03:04:10 +02006983void copy_extent_buffer(const struct extent_buffer *dst,
6984 const struct extent_buffer *src,
Chris Masond1310b22008-01-24 16:13:08 -05006985 unsigned long dst_offset, unsigned long src_offset,
6986 unsigned long len)
6987{
6988 u64 dst_len = dst->len;
6989 size_t cur;
6990 size_t offset;
6991 struct page *page;
6992 char *kaddr;
Qu Wenruo884b07d2020-12-02 14:48:04 +08006993 unsigned long i = get_eb_page_index(dst_offset);
Chris Masond1310b22008-01-24 16:13:08 -05006994
Qu Wenruof98b6212020-08-19 14:35:47 +08006995 if (check_eb_range(dst, dst_offset, len) ||
6996 check_eb_range(src, src_offset, len))
6997 return;
6998
Chris Masond1310b22008-01-24 16:13:08 -05006999 WARN_ON(src->len != dst_len);
7000
Qu Wenruo884b07d2020-12-02 14:48:04 +08007001 offset = get_eb_offset_in_page(dst, dst_offset);
Chris Masond1310b22008-01-24 16:13:08 -05007002
Chris Masond3977122009-01-05 21:25:51 -05007003 while (len > 0) {
David Sterbafb85fc92014-07-31 01:03:53 +02007004 page = dst->pages[i];
Qu Wenruob8f95772021-03-25 15:14:42 +08007005 assert_eb_page_uptodate(dst, page);
Chris Masond1310b22008-01-24 16:13:08 -05007006
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03007007 cur = min(len, (unsigned long)(PAGE_SIZE - offset));
Chris Masond1310b22008-01-24 16:13:08 -05007008
Chris Masona6591712011-07-19 12:04:14 -04007009 kaddr = page_address(page);
Chris Masond1310b22008-01-24 16:13:08 -05007010 read_extent_buffer(src, kaddr + offset, src_offset, cur);
Chris Masond1310b22008-01-24 16:13:08 -05007011
7012 src_offset += cur;
7013 len -= cur;
7014 offset = 0;
7015 i++;
7016 }
7017}
Chris Masond1310b22008-01-24 16:13:08 -05007018
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007019/*
7020 * eb_bitmap_offset() - calculate the page and offset of the byte containing the
7021 * given bit number
7022 * @eb: the extent buffer
7023 * @start: offset of the bitmap item in the extent buffer
7024 * @nr: bit number
7025 * @page_index: return index of the page in the extent buffer that contains the
7026 * given bit number
7027 * @page_offset: return offset into the page given by page_index
7028 *
7029 * This helper hides the ugliness of finding the byte in an extent buffer which
7030 * contains a given bit.
7031 */
David Sterba2b489662020-04-29 03:04:10 +02007032static inline void eb_bitmap_offset(const struct extent_buffer *eb,
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007033 unsigned long start, unsigned long nr,
7034 unsigned long *page_index,
7035 size_t *page_offset)
7036{
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007037 size_t byte_offset = BIT_BYTE(nr);
7038 size_t offset;
7039
7040 /*
7041 * The byte we want is the offset of the extent buffer + the offset of
7042 * the bitmap item in the extent buffer + the offset of the byte in the
7043 * bitmap item.
7044 */
Qu Wenruo884b07d2020-12-02 14:48:04 +08007045 offset = start + offset_in_page(eb->start) + byte_offset;
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007046
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03007047 *page_index = offset >> PAGE_SHIFT;
Johannes Thumshirn70730172018-12-05 15:23:03 +01007048 *page_offset = offset_in_page(offset);
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007049}
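
/*
 * Worked example (hypothetical values, eb->start page aligned, 4K pages):
 * for start = 64 and nr = 13, byte_offset = 13 / 8 = 1, so
 * offset = 64 + 0 + 1 = 65, giving *page_index = 0 and *page_offset = 65.
 * The bit itself is then bit 13 % 8 = 5 of that byte.
 */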
7050
7051/**
7052 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
7053 * @eb: the extent buffer
7054 * @start: offset of the bitmap item in the extent buffer
7055 * @nr: bit number to test
7056 */
David Sterba2b489662020-04-29 03:04:10 +02007057int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007058 unsigned long nr)
7059{
Omar Sandoval2fe1d552016-09-22 17:24:20 -07007060 u8 *kaddr;
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007061 struct page *page;
7062 unsigned long i;
7063 size_t offset;
7064
7065 eb_bitmap_offset(eb, start, nr, &i, &offset);
7066 page = eb->pages[i];
Qu Wenruob8f95772021-03-25 15:14:42 +08007067 assert_eb_page_uptodate(eb, page);
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007068 kaddr = page_address(page);
7069 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
7070}
7071
7072/**
7073 * extent_buffer_bitmap_set - set an area of a bitmap
7074 * @eb: the extent buffer
7075 * @start: offset of the bitmap item in the extent buffer
7076 * @pos: bit number of the first bit
7077 * @len: number of bits to set
7078 */
David Sterba2b489662020-04-29 03:04:10 +02007079void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007080 unsigned long pos, unsigned long len)
7081{
Omar Sandoval2fe1d552016-09-22 17:24:20 -07007082 u8 *kaddr;
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007083 struct page *page;
7084 unsigned long i;
7085 size_t offset;
7086 const unsigned int size = pos + len;
7087 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
Omar Sandoval2fe1d552016-09-22 17:24:20 -07007088 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007089
7090 eb_bitmap_offset(eb, start, pos, &i, &offset);
7091 page = eb->pages[i];
Qu Wenruob8f95772021-03-25 15:14:42 +08007092 assert_eb_page_uptodate(eb, page);
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007093 kaddr = page_address(page);
7094
7095 while (len >= bits_to_set) {
7096 kaddr[offset] |= mask_to_set;
7097 len -= bits_to_set;
7098 bits_to_set = BITS_PER_BYTE;
Dan Carpenter9c894692016-10-12 11:33:21 +03007099 mask_to_set = ~0;
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03007100 if (++offset >= PAGE_SIZE && len > 0) {
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007101 offset = 0;
7102 page = eb->pages[++i];
Qu Wenruob8f95772021-03-25 15:14:42 +08007103 assert_eb_page_uptodate(eb, page);
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007104 kaddr = page_address(page);
7105 }
7106 }
7107 if (len) {
7108 mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
7109 kaddr[offset] |= mask_to_set;
7110 }
7111}
7112
7113
7114/**
7115 * extent_buffer_bitmap_clear - clear an area of a bitmap
7116 * @eb: the extent buffer
7117 * @start: offset of the bitmap item in the extent buffer
7118 * @pos: bit number of the first bit
7119 * @len: number of bits to clear
7120 */
David Sterba2b489662020-04-29 03:04:10 +02007121void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
7122 unsigned long start, unsigned long pos,
7123 unsigned long len)
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007124{
Omar Sandoval2fe1d552016-09-22 17:24:20 -07007125 u8 *kaddr;
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007126 struct page *page;
7127 unsigned long i;
7128 size_t offset;
7129 const unsigned int size = pos + len;
7130 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
Omar Sandoval2fe1d552016-09-22 17:24:20 -07007131 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007132
7133 eb_bitmap_offset(eb, start, pos, &i, &offset);
7134 page = eb->pages[i];
Qu Wenruob8f95772021-03-25 15:14:42 +08007135 assert_eb_page_uptodate(eb, page);
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007136 kaddr = page_address(page);
7137
7138 while (len >= bits_to_clear) {
7139 kaddr[offset] &= ~mask_to_clear;
7140 len -= bits_to_clear;
7141 bits_to_clear = BITS_PER_BYTE;
Dan Carpenter9c894692016-10-12 11:33:21 +03007142 mask_to_clear = ~0;
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03007143 if (++offset >= PAGE_SIZE && len > 0) {
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007144 offset = 0;
7145 page = eb->pages[++i];
Qu Wenruob8f95772021-03-25 15:14:42 +08007146 assert_eb_page_uptodate(eb, page);
Omar Sandoval3e1e8bb2015-09-29 20:50:30 -07007147 kaddr = page_address(page);
7148 }
7149 }
7150 if (len) {
7151 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
7152 kaddr[offset] &= ~mask_to_clear;
7153 }
7154}
7155
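/*
 * Return true if the byte ranges [src, src + len) and [dst, dst + len)
 * overlap.
 */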
Sergei Trofimovich33872062011-04-11 21:52:52 +00007156static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
7157{
7158 unsigned long distance = (src > dst) ? src - dst : dst - src;
7159 return distance < len;
7160}
7161
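/*
 * Copy @len bytes from @src_off in @src_page to @dst_off in @dst_page,
 * using memmove() when source and destination are the same page and the
 * ranges overlap.
 */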
Chris Masond1310b22008-01-24 16:13:08 -05007162static void copy_pages(struct page *dst_page, struct page *src_page,
7163 unsigned long dst_off, unsigned long src_off,
7164 unsigned long len)
7165{
Chris Masona6591712011-07-19 12:04:14 -04007166 char *dst_kaddr = page_address(dst_page);
Chris Masond1310b22008-01-24 16:13:08 -05007167 char *src_kaddr;
Chris Mason727011e2010-08-06 13:21:20 -04007168 int must_memmove = 0;
Chris Masond1310b22008-01-24 16:13:08 -05007169
Sergei Trofimovich33872062011-04-11 21:52:52 +00007170 if (dst_page != src_page) {
Chris Masona6591712011-07-19 12:04:14 -04007171 src_kaddr = page_address(src_page);
Sergei Trofimovich33872062011-04-11 21:52:52 +00007172 } else {
Chris Masond1310b22008-01-24 16:13:08 -05007173 src_kaddr = dst_kaddr;
Chris Mason727011e2010-08-06 13:21:20 -04007174 if (areas_overlap(src_off, dst_off, len))
7175 must_memmove = 1;
Sergei Trofimovich33872062011-04-11 21:52:52 +00007176 }
Chris Masond1310b22008-01-24 16:13:08 -05007177
Chris Mason727011e2010-08-06 13:21:20 -04007178 if (must_memmove)
7179 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
7180 else
7181 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
Chris Masond1310b22008-01-24 16:13:08 -05007182}
7183
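/*
 * Copy @len bytes within the extent buffer @dst, from @src_offset to
 * @dst_offset, one page-sized chunk at a time.  Ranges that may overlap
 * should go through memmove_extent_buffer() below instead.
 */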
David Sterba2b489662020-04-29 03:04:10 +02007184void memcpy_extent_buffer(const struct extent_buffer *dst,
7185 unsigned long dst_offset, unsigned long src_offset,
7186 unsigned long len)
Chris Masond1310b22008-01-24 16:13:08 -05007187{
7188 size_t cur;
7189 size_t dst_off_in_page;
7190 size_t src_off_in_page;
Chris Masond1310b22008-01-24 16:13:08 -05007191 unsigned long dst_i;
7192 unsigned long src_i;
7193
Qu Wenruof98b6212020-08-19 14:35:47 +08007194 if (check_eb_range(dst, dst_offset, len) ||
7195 check_eb_range(dst, src_offset, len))
7196 return;
Chris Masond1310b22008-01-24 16:13:08 -05007197
Chris Masond3977122009-01-05 21:25:51 -05007198 while (len > 0) {
Qu Wenruo884b07d2020-12-02 14:48:04 +08007199 dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
7200 src_off_in_page = get_eb_offset_in_page(dst, src_offset);
Chris Masond1310b22008-01-24 16:13:08 -05007201
Qu Wenruo884b07d2020-12-02 14:48:04 +08007202 dst_i = get_eb_page_index(dst_offset);
7203 src_i = get_eb_page_index(src_offset);
Chris Masond1310b22008-01-24 16:13:08 -05007204
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03007205 cur = min(len, (unsigned long)(PAGE_SIZE -
Chris Masond1310b22008-01-24 16:13:08 -05007206 src_off_in_page));
7207 cur = min_t(unsigned long, cur,
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03007208 (unsigned long)(PAGE_SIZE - dst_off_in_page));
Chris Masond1310b22008-01-24 16:13:08 -05007209
David Sterbafb85fc92014-07-31 01:03:53 +02007210 copy_pages(dst->pages[dst_i], dst->pages[src_i],
Chris Masond1310b22008-01-24 16:13:08 -05007211 dst_off_in_page, src_off_in_page, cur);
7212
7213 src_offset += cur;
7214 dst_offset += cur;
7215 len -= cur;
7216 }
7217}
Chris Masond1310b22008-01-24 16:13:08 -05007218
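/*
 * Move @len bytes within the extent buffer @dst, from @src_offset to
 * @dst_offset, with overlapping ranges allowed.  When the destination is
 * above the source, the copy runs back to front so source bytes are read
 * before they are overwritten.
 */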
David Sterba2b489662020-04-29 03:04:10 +02007219void memmove_extent_buffer(const struct extent_buffer *dst,
7220 unsigned long dst_offset, unsigned long src_offset,
7221 unsigned long len)
Chris Masond1310b22008-01-24 16:13:08 -05007222{
7223 size_t cur;
7224 size_t dst_off_in_page;
7225 size_t src_off_in_page;
7226 unsigned long dst_end = dst_offset + len - 1;
7227 unsigned long src_end = src_offset + len - 1;
Chris Masond1310b22008-01-24 16:13:08 -05007228 unsigned long dst_i;
7229 unsigned long src_i;
7230
Qu Wenruof98b6212020-08-19 14:35:47 +08007231 if (check_eb_range(dst, dst_offset, len) ||
7232 check_eb_range(dst, src_offset, len))
7233 return;
Chris Mason727011e2010-08-06 13:21:20 -04007234 if (dst_offset < src_offset) {
Chris Masond1310b22008-01-24 16:13:08 -05007235 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
7236 return;
7237 }
Chris Masond3977122009-01-05 21:25:51 -05007238 while (len > 0) {
Qu Wenruo884b07d2020-12-02 14:48:04 +08007239 dst_i = get_eb_page_index(dst_end);
7240 src_i = get_eb_page_index(src_end);
Chris Masond1310b22008-01-24 16:13:08 -05007241
Qu Wenruo884b07d2020-12-02 14:48:04 +08007242 dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
7243 src_off_in_page = get_eb_offset_in_page(dst, src_end);
Chris Masond1310b22008-01-24 16:13:08 -05007244
7245 cur = min_t(unsigned long, len, src_off_in_page + 1);
7246 cur = min(cur, dst_off_in_page + 1);
David Sterbafb85fc92014-07-31 01:03:53 +02007247 copy_pages(dst->pages[dst_i], dst->pages[src_i],
Chris Masond1310b22008-01-24 16:13:08 -05007248 dst_off_in_page - cur + 1,
7249 src_off_in_page - cur + 1, cur);
7250
7251 dst_end -= cur;
7252 src_end -= cur;
7253 len -= cur;
7254 }
7255}
Chris Mason6af118ce2008-07-22 11:18:07 -04007256
David Sterba01cd3902022-07-15 13:59:31 +02007257#define GANG_LOOKUP_SIZE 16
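/*
 * Return the first extent buffer on @page whose start is >= @bytenr, or
 * NULL if there is none.  Buffers are looked up from the buffer radix
 * tree in batches of GANG_LOOKUP_SIZE; the caller must hold
 * fs_info->buffer_lock.
 */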
Qu Wenruod1e86e32021-01-26 16:33:56 +08007258static struct extent_buffer *get_next_extent_buffer(
7259 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
7260{
David Sterba01cd3902022-07-15 13:59:31 +02007261 struct extent_buffer *gang[GANG_LOOKUP_SIZE];
7262 struct extent_buffer *found = NULL;
Qu Wenruod1e86e32021-01-26 16:33:56 +08007263 u64 page_start = page_offset(page);
David Sterba01cd3902022-07-15 13:59:31 +02007264 u64 cur = page_start;
Qu Wenruod1e86e32021-01-26 16:33:56 +08007265
7266 ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
Qu Wenruod1e86e32021-01-26 16:33:56 +08007267 lockdep_assert_held(&fs_info->buffer_lock);
7268
David Sterba01cd3902022-07-15 13:59:31 +02007269 while (cur < page_start + PAGE_SIZE) {
7270 int ret;
7271 int i;
7272
7273 ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
7274 (void **)gang, cur >> fs_info->sectorsize_bits,
7275 min_t(unsigned int, GANG_LOOKUP_SIZE,
7276 PAGE_SIZE / fs_info->nodesize));
7277 if (ret == 0)
7278 goto out;
7279 for (i = 0; i < ret; i++) {
7280 /* Already beyond page end */
7281 if (gang[i]->start >= page_start + PAGE_SIZE)
7282 goto out;
7283 /* Found one */
7284 if (gang[i]->start >= bytenr) {
7285 found = gang[i];
7286 goto out;
7287 }
7288 }
7289 cur = gang[ret - 1]->start + gang[ret - 1]->len;
Qu Wenruod1e86e32021-01-26 16:33:56 +08007290 }
David Sterba01cd3902022-07-15 13:59:31 +02007291out:
7292 return found;
Qu Wenruod1e86e32021-01-26 16:33:56 +08007293}
7294
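/*
 * Subpage (nodesize < PAGE_SIZE) variant of try_release_extent_buffer():
 * walk all extent buffers attached to @page and release those that are no
 * longer referenced.  Returns 1 if the page private was cleared (all ebs
 * released) and the page can be freed, 0 otherwise.
 */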
7295static int try_release_subpage_extent_buffer(struct page *page)
7296{
7297 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
7298 u64 cur = page_offset(page);
7299 const u64 end = page_offset(page) + PAGE_SIZE;
7300 int ret;
7301
7302 while (cur < end) {
7303 struct extent_buffer *eb = NULL;
7304
7305 /*
7306		 * Unlike try_release_extent_buffer(), which uses page->private
7307		 * to grab the buffer, in the subpage case we rely on the radix
7308		 * tree, so we need to ensure radix tree consistency.
7309		 *
7310		 * We also want an atomic snapshot of the radix tree, so take
7311		 * the spinlock rather than relying on RCU.
7312 */
7313 spin_lock(&fs_info->buffer_lock);
7314 eb = get_next_extent_buffer(fs_info, page, cur);
7315 if (!eb) {
7316			/* No more ebs at or after cur in the page range */
7317 spin_unlock(&fs_info->buffer_lock);
7318 break;
7319 }
7320 cur = eb->start + eb->len;
7321
7322 /*
7323 * The same as try_release_extent_buffer(), to ensure the eb
7324 * won't disappear out from under us.
7325 */
7326 spin_lock(&eb->refs_lock);
7327 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
7328 spin_unlock(&eb->refs_lock);
7329 spin_unlock(&fs_info->buffer_lock);
7330 break;
7331 }
7332 spin_unlock(&fs_info->buffer_lock);
7333
7334 /*
7335		 * If the tree ref isn't set then we know the ref on this eb is a
7336		 * real ref, so just return; this eb will likely be freed soon
7337		 * anyway.
7338 */
7339 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
7340 spin_unlock(&eb->refs_lock);
7341 break;
7342 }
7343
7344 /*
7345		 * We don't care about the return value here; we always check
7346		 * the page private at the end, and release_extent_buffer()
7347		 * will release the refs_lock.
7348 */
7349 release_extent_buffer(eb);
7350 }
7351 /*
7352	 * Finally, check whether the page private has been cleared: if we have
7353	 * released all ebs in the page, it should be cleared by now.
7354 */
7355 spin_lock(&page->mapping->private_lock);
7356 if (!PagePrivate(page))
7357 ret = 1;
7358 else
7359 ret = 0;
7360 spin_unlock(&page->mapping->private_lock);
7361 return ret;
7362
7363}
7364
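/*
 * Try to release the extent buffer attached to @page so the page itself
 * can be freed.  Returns 1 if the buffer was released (or no buffer was
 * attached), 0 if it is still in use.
 */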
David Sterbaf7a52a42013-04-26 14:56:29 +00007365int try_release_extent_buffer(struct page *page)
Miao Xie19fe0a82010-10-26 20:57:29 -04007366{
Chris Mason6af118ce2008-07-22 11:18:07 -04007367 struct extent_buffer *eb;
Miao Xie897ca6e92010-10-26 20:57:29 -04007368
Qu Wenruofbca46e2022-01-13 13:22:09 +08007369 if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
Qu Wenruod1e86e32021-01-26 16:33:56 +08007370 return try_release_subpage_extent_buffer(page);
7371
Miao Xie19fe0a82010-10-26 20:57:29 -04007372 /*
Qu Wenruod1e86e32021-01-26 16:33:56 +08007373 * We need to make sure nobody is changing page->private, as we rely on
7374	 * page->private as the pointer to the extent buffer.
Miao Xie19fe0a82010-10-26 20:57:29 -04007375 */
Josef Bacik3083ee22012-03-09 16:01:49 -05007376 spin_lock(&page->mapping->private_lock);
7377 if (!PagePrivate(page)) {
7378 spin_unlock(&page->mapping->private_lock);
7379 return 1;
Miao Xie19fe0a82010-10-26 20:57:29 -04007380 }
7381
Josef Bacik3083ee22012-03-09 16:01:49 -05007382 eb = (struct extent_buffer *)page->private;
7383 BUG_ON(!eb);
Miao Xie19fe0a82010-10-26 20:57:29 -04007384
Josef Bacik0b32f4b2012-03-13 09:38:00 -04007385 /*
Josef Bacik3083ee22012-03-09 16:01:49 -05007386	 * This is a little awful, but should be OK: we need to make sure that
7387	 * the eb doesn't disappear out from under us while we're looking at
7388	 * this page.
7389 */
7390 spin_lock(&eb->refs_lock);
Josef Bacik0b32f4b2012-03-13 09:38:00 -04007391 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
Josef Bacik3083ee22012-03-09 16:01:49 -05007392 spin_unlock(&eb->refs_lock);
7393 spin_unlock(&page->mapping->private_lock);
7394 return 0;
7395 }
7396 spin_unlock(&page->mapping->private_lock);
7397
Josef Bacik3083ee22012-03-09 16:01:49 -05007398 /*
7399	 * If the tree ref isn't set then we know the ref on this eb is a real ref,
7400	 * so just return; this page will likely be freed soon anyway.
7401 */
7402 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
7403 spin_unlock(&eb->refs_lock);
7404 return 0;
7405 }
Josef Bacik3083ee22012-03-09 16:01:49 -05007406
David Sterbaf7a52a42013-04-26 14:56:29 +00007407 return release_extent_buffer(eb);
Chris Mason6af118ce2008-07-22 11:18:07 -04007408}
Josef Bacikbfb484d2020-11-05 10:45:09 -05007409
7410/*
7411 * btrfs_readahead_tree_block - attempt to readahead a child block
7412 * @fs_info: the fs_info
7413 * @bytenr: bytenr to read
Josef Bacik3fbaf252020-11-05 10:45:20 -05007414 * @owner_root: objectid of the root that owns this eb
Josef Bacikbfb484d2020-11-05 10:45:09 -05007415 * @gen: generation for the uptodate check, can be 0
Josef Bacik3fbaf252020-11-05 10:45:20 -05007416 * @level: level for the eb
Josef Bacikbfb484d2020-11-05 10:45:09 -05007417 *
7418 * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a
7419 * normal uptodate check of the eb, without checking the generation. If we have
7420 * to read the block we will not block on anything.
7421 */
7422void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
Josef Bacik3fbaf252020-11-05 10:45:20 -05007423 u64 bytenr, u64 owner_root, u64 gen, int level)
Josef Bacikbfb484d2020-11-05 10:45:09 -05007424{
7425 struct extent_buffer *eb;
7426 int ret;
7427
Josef Bacik3fbaf252020-11-05 10:45:20 -05007428 eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
Josef Bacikbfb484d2020-11-05 10:45:09 -05007429 if (IS_ERR(eb))
7430 return;
7431
7432 if (btrfs_buffer_uptodate(eb, gen, 1)) {
7433 free_extent_buffer(eb);
7434 return;
7435 }
7436
7437 ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
7438 if (ret < 0)
7439 free_extent_buffer_stale(eb);
7440 else
7441 free_extent_buffer(eb);
7442}
7443
7444/*
7445 * btrfs_readahead_node_child - readahead a node's child block
7446 * @node: parent node we're reading from
7447 * @slot: slot in the parent node for the child we want to read
7448 *
7449 * A helper for btrfs_readahead_tree_block(); we simply read the bytenr
7450 * pointed to by the slot in the node provided.
7451 */
7452void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
7453{
7454 btrfs_readahead_tree_block(node->fs_info,
7455 btrfs_node_blockptr(node, slot),
Josef Bacik3fbaf252020-11-05 10:45:20 -05007456 btrfs_header_owner(node),
7457 btrfs_node_ptr_generation(node, slot),
7458 btrfs_header_level(node) - 1);
Josef Bacikbfb484d2020-11-05 10:45:09 -05007459}