Blame - fs/buffer.c - linux

blob: 8c19e705b9c33f5603dc8799e385bfdc235a59cb [file] [log] [blame]

Thomas Gleixner	457c899	2019-05-19 13:08:55 +0100	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2	/*
				3	* linux/fs/buffer.c
				4	*
				5	* Copyright (C) 1991, 1992, 2002 Linus Torvalds
				6	*/
				7
				8	/*
				9	* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
				10	*
				11	* Removed a lot of unnecessary code and simplified things now that
				12	* the buffer cache isn't our primary cache - Andrew Tridgell 12/96
				13	*
				14	* Speed up hash, lru, and free list operations. Use gfp() for allocating
				15	* hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
				16	*
				17	* Added 32k buffer block sizes - these are required older ARM systems. - RMK
				18	*
				19	* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
				20	*/
				21
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	22	#include <linux/kernel.h>
Ingo Molnar	f361bf4	2017-02-03 23:47:37 +0100	[diff] [blame]	23	#include <linux/sched/signal.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	24	#include <linux/syscalls.h>
				25	#include <linux/fs.h>
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	26	#include <linux/iomap.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	27	#include <linux/mm.h>
				28	#include <linux/percpu.h>
				29	#include <linux/slab.h>
Randy Dunlap	16f7e0f	2006-01-11 12:17:46 -0800	[diff] [blame]	30	#include <linux/capability.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	31	#include <linux/blkdev.h>
				32	#include <linux/file.h>
				33	#include <linux/quotaops.h>
				34	#include <linux/highmem.h>
Paul Gortmaker	630d9c4	2011-11-16 23:57:37 -0500	[diff] [blame]	35	#include <linux/export.h>
Tejun Heo	bafc0db	2015-06-02 08:37:23 -0600	[diff] [blame]	36	#include <linux/backing-dev.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	37	#include <linux/writeback.h>
				38	#include <linux/hash.h>
				39	#include <linux/suspend.h>
				40	#include <linux/buffer_head.h>
Andrew Morton	55e829a	2006-12-10 02:19:27 -0800	[diff] [blame]	41	#include <linux/task_io_accounting_ops.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	42	#include <linux/bio.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	43	#include <linux/cpu.h>
				44	#include <linux/bitops.h>
				45	#include <linux/mpage.h>
Ingo Molnar	fb1c8f9	2005-09-10 00:25:56 -0700	[diff] [blame]	46	#include <linux/bit_spinlock.h>
Jan Kara	29f3ad7	2016-11-04 18:08:11 +0100	[diff] [blame]	47	#include <linux/pagevec.h>
Shakeel Butt	f745c6f	2018-08-17 15:46:44 -0700	[diff] [blame]	48	#include <linux/sched/mm.h>
Tejun Heo	5305cb8	2013-01-11 13:06:36 -0800	[diff] [blame]	49	#include <trace/events/block.h>
Eric Biggers	31fb992	2019-10-22 20:33:11 -0700	[diff] [blame]	50	#include <linux/fscrypt.h>
Eric Biggers	4fa512c	2022-12-23 12:36:37 -0800	[diff] [blame]	51	#include <linux/fsverity.h>
Marcelo Tosatti	8a237ad	2023-06-27 17:08:15 -0300	[diff] [blame]	52	#include <linux/sched/isolation.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	53
Ben Dooks	2b211dc	2019-11-30 17:49:18 -0800	[diff] [blame]	54	#include "internal.h"
				55
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	56	static int fsync_buffers_list(spinlock_t lock, struct list_head list);
Ritesh Harjani (IBM)	5bdf402	2022-08-18 10:34:40 +0530	[diff] [blame]	57	static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
Bart Van Assche	4498135	2024-02-02 12:39:25 -0800	[diff] [blame]	58	enum rw_hint hint, struct writeback_control *wbc);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	59
				60	#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
				61
Tejun Heo	f0059af	2013-01-11 13:06:35 -0800	[diff] [blame]	62	inline void touch_buffer(struct buffer_head *bh)
				63	{
Tejun Heo	5305cb8	2013-01-11 13:06:36 -0800	[diff] [blame]	64	trace_block_touch_buffer(bh);
Matthew Wilcox (Oracle)	03c5f33	2022-12-15 21:43:53 +0000	[diff] [blame]	65	folio_mark_accessed(bh->b_folio);
Tejun Heo	f0059af	2013-01-11 13:06:35 -0800	[diff] [blame]	66	}
				67	EXPORT_SYMBOL(touch_buffer);
				68
Harvey Harrison	fc9b52c	2008-02-08 04:19:52 -0800	[diff] [blame]	69	void __lock_buffer(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	70	{
NeilBrown	7431620	2014-07-07 15:16:04 +1000	[diff] [blame]	71	wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	72	}
				73	EXPORT_SYMBOL(__lock_buffer);
				74
Harvey Harrison	fc9b52c	2008-02-08 04:19:52 -0800	[diff] [blame]	75	void unlock_buffer(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	76	{
Nick Piggin	51b07fc	2008-10-18 20:27:00 -0700	[diff] [blame]	77	clear_bit_unlock(BH_Lock, &bh->b_state);
Peter Zijlstra	4e857c5	2014-03-17 18:06:10 +0100	[diff] [blame]	78	smp_mb__after_atomic();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	79	wake_up_bit(&bh->b_state, BH_Lock);
				80	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	81	EXPORT_SYMBOL(unlock_buffer);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	82
				83	/*
Matthew Wilcox (Oracle)	520f301	2022-01-17 14:35:22 -0500	[diff] [blame]	84	* Returns if the folio has dirty or writeback buffers. If all the buffers
				85	* are unlocked and clean then the folio_test_dirty information is stale. If
				86	* any of the buffers are locked, it is assumed they are locked for IO.
Mel Gorman	b459722	2013-07-03 15:02:05 -0700	[diff] [blame]	87	*/
Matthew Wilcox (Oracle)	520f301	2022-01-17 14:35:22 -0500	[diff] [blame]	88	void buffer_check_dirty_writeback(struct folio *folio,
Mel Gorman	b459722	2013-07-03 15:02:05 -0700	[diff] [blame]	89	bool dirty, bool writeback)
				90	{
				91	struct buffer_head head, bh;
				92	*dirty = false;
				93	*writeback = false;
				94
Matthew Wilcox (Oracle)	520f301	2022-01-17 14:35:22 -0500	[diff] [blame]	95	BUG_ON(!folio_test_locked(folio));
Mel Gorman	b459722	2013-07-03 15:02:05 -0700	[diff] [blame]	96
Matthew Wilcox (Oracle)	520f301	2022-01-17 14:35:22 -0500	[diff] [blame]	97	head = folio_buffers(folio);
				98	if (!head)
Mel Gorman	b459722	2013-07-03 15:02:05 -0700	[diff] [blame]	99	return;
				100
Matthew Wilcox (Oracle)	520f301	2022-01-17 14:35:22 -0500	[diff] [blame]	101	if (folio_test_writeback(folio))
Mel Gorman	b459722	2013-07-03 15:02:05 -0700	[diff] [blame]	102	*writeback = true;
				103
Mel Gorman	b459722	2013-07-03 15:02:05 -0700	[diff] [blame]	104	bh = head;
				105	do {
				106	if (buffer_locked(bh))
				107	*writeback = true;
				108
				109	if (buffer_dirty(bh))
				110	*dirty = true;
				111
				112	bh = bh->b_this_page;
				113	} while (bh != head);
				114	}
Mel Gorman	b459722	2013-07-03 15:02:05 -0700	[diff] [blame]	115
				116	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	117	* Block until a buffer comes unlocked. This doesn't stop it
				118	* from becoming locked again - you have to lock it yourself
				119	* if you want to preserve its state.
				120	*/
				121	void __wait_on_buffer(struct buffer_head * bh)
				122	{
NeilBrown	7431620	2014-07-07 15:16:04 +1000	[diff] [blame]	123	wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	124	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	125	EXPORT_SYMBOL(__wait_on_buffer);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	126
Robert Elliott	b744c2a	2014-10-21 13:55:09 -0600	[diff] [blame]	127	static void buffer_io_error(struct buffer_head bh, char msg)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	128	{
Robert Elliott	432f16e	2014-10-21 13:55:11 -0600	[diff] [blame]	129	if (!test_bit(BH_Quiet, &bh->b_state))
				130	printk_ratelimited(KERN_ERR
Dmitry Monakhov	a1c6f057	2015-04-13 16:31:37 +0400	[diff] [blame]	131	"Buffer I/O error on dev %pg, logical block %llu%s\n",
				132	bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	133	}
				134
				135	/*
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	136	* End-of-IO handler helper function which does not touch the bh after
				137	* unlocking it.
				138	* Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
				139	* a race there is benign: unlock_buffer() only use the bh's address for
				140	* hashing after unlocking the buffer, so it doesn't actually touch the bh
				141	* itself.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	142	*/
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	143	static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	144	{
				145	if (uptodate) {
				146	set_buffer_uptodate(bh);
				147	} else {
Christoph Hellwig	7024628	2016-07-19 11:28:41 +0200	[diff] [blame]	148	/* This happens, due to failed read-ahead attempts. */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	149	clear_buffer_uptodate(bh);
				150	}
				151	unlock_buffer(bh);
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	152	}
				153
				154	/*
				155	* Default synchronous end-of-IO handler.. Just mark it up-to-date and
Zhang Yi	79f5978	2022-09-01 21:35:03 +0800	[diff] [blame]	156	* unlock the buffer.
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	157	*/
				158	void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
				159	{
				160	__end_buffer_read_notouch(bh, uptodate);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	161	put_bh(bh);
				162	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	163	EXPORT_SYMBOL(end_buffer_read_sync);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	164
				165	void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
				166	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	167	if (uptodate) {
				168	set_buffer_uptodate(bh);
				169	} else {
Robert Elliott	432f16e	2014-10-21 13:55:11 -0600	[diff] [blame]	170	buffer_io_error(bh, ", lost sync page write");
Jeff Layton	87354e5	2017-07-06 07:02:21 -0400	[diff] [blame]	171	mark_buffer_write_io_error(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	172	clear_buffer_uptodate(bh);
				173	}
				174	unlock_buffer(bh);
				175	put_bh(bh);
				176	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	177	EXPORT_SYMBOL(end_buffer_write_sync);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	178
				179	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	180	* Various filesystems appear to want __find_get_block to be non-blocking.
				181	* But it's the page lock which protects the buffers. To get around this,
				182	* we get exclusion from try_to_free_buffers with the blockdev mapping's
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	183	* i_private_lock.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	184	*
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	185	* Hack idea: for the blockdev mapping, i_private_lock contention
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	186	* may be quite high. This code could TryLock the page, and if that
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	187	* succeeds, there is no need to take i_private_lock.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	188	*/
				189	static struct buffer_head *
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	190	__find_get_block_slow(struct block_device *bdev, sector_t block)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	191	{
Al Viro	53cd4cd	2024-04-28 19:41:13 -0400	[diff] [blame]	192	struct address_space *bd_mapping = bdev->bd_mapping;
				193	const int blkbits = bd_mapping->host->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	194	struct buffer_head *ret = NULL;
				195	pgoff_t index;
				196	struct buffer_head *bh;
				197	struct buffer_head *head;
Matthew Wilcox (Oracle)	eee25182	2023-06-12 22:01:40 +0100	[diff] [blame]	198	struct folio *folio;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	199	int all_mapped = 1;
Tetsuo Handa	43636c8	2019-01-21 22:49:37 +0900	[diff] [blame]	200	static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	201
Al Viro	53cd4cd	2024-04-28 19:41:13 -0400	[diff] [blame]	202	index = ((loff_t)block << blkbits) / PAGE_SIZE;
Matthew Wilcox (Oracle)	eee25182	2023-06-12 22:01:40 +0100	[diff] [blame]	203	folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0);
				204	if (IS_ERR(folio))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	205	goto out;
				206
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	207	spin_lock(&bd_mapping->i_private_lock);
Matthew Wilcox (Oracle)	eee25182	2023-06-12 22:01:40 +0100	[diff] [blame]	208	head = folio_buffers(folio);
				209	if (!head)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	210	goto out_unlock;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	211	bh = head;
				212	do {
Nikanth Karthikesan	97f76d3	2009-04-02 16:56:46 -0700	[diff] [blame]	213	if (!buffer_mapped(bh))
				214	all_mapped = 0;
				215	else if (bh->b_blocknr == block) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	216	ret = bh;
				217	get_bh(bh);
				218	goto out_unlock;
				219	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	220	bh = bh->b_this_page;
				221	} while (bh != head);
				222
				223	/* we might be here because some of the buffers on this page are
				224	* not mapped. This is due to various races between
				225	* file io on the block device and getblk. It gets dealt with
				226	* elsewhere, don't buffer_error if we had some unmapped buffers
				227	*/
Tetsuo Handa	43636c8	2019-01-21 22:49:37 +0900	[diff] [blame]	228	ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
				229	if (all_mapped && __ratelimit(&last_warned)) {
				230	printk("__find_get_block_slow() failed. block=%llu, "
				231	"b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
				232	"device %pg blocksize: %d\n",
				233	(unsigned long long)block,
				234	(unsigned long long)bh->b_blocknr,
				235	bh->b_state, bh->b_size, bdev,
Al Viro	53cd4cd	2024-04-28 19:41:13 -0400	[diff] [blame]	236	1 << blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	237	}
				238	out_unlock:
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	239	spin_unlock(&bd_mapping->i_private_lock);
Matthew Wilcox (Oracle)	eee25182	2023-06-12 22:01:40 +0100	[diff] [blame]	240	folio_put(folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	241	out:
				242	return ret;
				243	}
				244
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	245	static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
				246	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	247	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	248	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	249	struct buffer_head *tmp;
Matthew Wilcox (Oracle)	2e2dba1	2022-12-15 21:43:54 +0000	[diff] [blame]	250	struct folio *folio;
				251	int folio_uptodate = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	252
				253	BUG_ON(!buffer_async_read(bh));
				254
Matthew Wilcox (Oracle)	2e2dba1	2022-12-15 21:43:54 +0000	[diff] [blame]	255	folio = bh->b_folio;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	256	if (uptodate) {
				257	set_buffer_uptodate(bh);
				258	} else {
				259	clear_buffer_uptodate(bh);
Robert Elliott	432f16e	2014-10-21 13:55:11 -0600	[diff] [blame]	260	buffer_io_error(bh, ", async page read");
Matthew Wilcox (Oracle)	2e2dba1	2022-12-15 21:43:54 +0000	[diff] [blame]	261	folio_set_error(folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	262	}
				263
				264	/*
				265	* Be _very_ careful from here on. Bad things can happen if
				266	* two buffer heads end IO at almost the same time and both
				267	* decide that the page is now completely done.
				268	*/
Matthew Wilcox (Oracle)	2e2dba1	2022-12-15 21:43:54 +0000	[diff] [blame]	269	first = folio_buffers(folio);
Thomas Gleixner	f1e67e3	2019-11-18 14:28:24 +0100	[diff] [blame]	270	spin_lock_irqsave(&first->b_uptodate_lock, flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	271	clear_buffer_async_read(bh);
				272	unlock_buffer(bh);
				273	tmp = bh;
				274	do {
				275	if (!buffer_uptodate(tmp))
Matthew Wilcox (Oracle)	2e2dba1	2022-12-15 21:43:54 +0000	[diff] [blame]	276	folio_uptodate = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	277	if (buffer_async_read(tmp)) {
				278	BUG_ON(!buffer_locked(tmp));
				279	goto still_busy;
				280	}
				281	tmp = tmp->b_this_page;
				282	} while (tmp != bh);
Thomas Gleixner	f1e67e3	2019-11-18 14:28:24 +0100	[diff] [blame]	283	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	284
Matthew Wilcox (Oracle)	6ba924d	2023-10-04 17:53:05 +0100	[diff] [blame]	285	folio_end_read(folio, folio_uptodate);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	286	return;
				287
				288	still_busy:
Thomas Gleixner	f1e67e3	2019-11-18 14:28:24 +0100	[diff] [blame]	289	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	290	return;
				291	}
				292
Eric Biggers	4fa512c	2022-12-23 12:36:37 -0800	[diff] [blame]	293	struct postprocess_bh_ctx {
Eric Biggers	31fb992	2019-10-22 20:33:11 -0700	[diff] [blame]	294	struct work_struct work;
				295	struct buffer_head *bh;
				296	};
				297
Eric Biggers	4fa512c	2022-12-23 12:36:37 -0800	[diff] [blame]	298	static void verify_bh(struct work_struct *work)
				299	{
				300	struct postprocess_bh_ctx *ctx =
				301	container_of(work, struct postprocess_bh_ctx, work);
				302	struct buffer_head *bh = ctx->bh;
				303	bool valid;
				304
Eric Biggers	8b7d3fe	2023-02-24 15:25:30 -0800	[diff] [blame]	305	valid = fsverity_verify_blocks(bh->b_folio, bh->b_size, bh_offset(bh));
Eric Biggers	4fa512c	2022-12-23 12:36:37 -0800	[diff] [blame]	306	end_buffer_async_read(bh, valid);
				307	kfree(ctx);
				308	}
				309
				310	static bool need_fsverity(struct buffer_head *bh)
				311	{
Eric Biggers	8b7d3fe	2023-02-24 15:25:30 -0800	[diff] [blame]	312	struct folio *folio = bh->b_folio;
				313	struct inode *inode = folio->mapping->host;
Eric Biggers	4fa512c	2022-12-23 12:36:37 -0800	[diff] [blame]	314
				315	return fsverity_active(inode) &&
				316	/* needed by ext4 */
Eric Biggers	8b7d3fe	2023-02-24 15:25:30 -0800	[diff] [blame]	317	folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
Eric Biggers	4fa512c	2022-12-23 12:36:37 -0800	[diff] [blame]	318	}
				319
Eric Biggers	31fb992	2019-10-22 20:33:11 -0700	[diff] [blame]	320	static void decrypt_bh(struct work_struct *work)
				321	{
Eric Biggers	4fa512c	2022-12-23 12:36:37 -0800	[diff] [blame]	322	struct postprocess_bh_ctx *ctx =
				323	container_of(work, struct postprocess_bh_ctx, work);
Eric Biggers	31fb992	2019-10-22 20:33:11 -0700	[diff] [blame]	324	struct buffer_head *bh = ctx->bh;
				325	int err;
				326
Eric Biggers	9c7fb7f	2023-02-24 15:25:03 -0800	[diff] [blame]	327	err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
				328	bh_offset(bh));
Eric Biggers	4fa512c	2022-12-23 12:36:37 -0800	[diff] [blame]	329	if (err == 0 && need_fsverity(bh)) {
				330	/*
				331	* We use different work queues for decryption and for verity
				332	* because verity may require reading metadata pages that need
				333	* decryption, and we shouldn't recurse to the same workqueue.
				334	*/
				335	INIT_WORK(&ctx->work, verify_bh);
				336	fsverity_enqueue_verify_work(&ctx->work);
				337	return;
				338	}
Eric Biggers	31fb992	2019-10-22 20:33:11 -0700	[diff] [blame]	339	end_buffer_async_read(bh, err == 0);
				340	kfree(ctx);
				341	}
				342
				343	/*
Matthew Wilcox (Oracle)	2c69e20	2022-04-29 10:40:40 -0400	[diff] [blame]	344	* I/O completion handler for block_read_full_folio() - pages
Eric Biggers	31fb992	2019-10-22 20:33:11 -0700	[diff] [blame]	345	* which come unlocked at the end of I/O.
				346	*/
				347	static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
				348	{
Linus Torvalds	3822a7c	2023-02-23 17:09:35 -0800	[diff] [blame]	349	struct inode *inode = bh->b_folio->mapping->host;
Eric Biggers	4fa512c	2022-12-23 12:36:37 -0800	[diff] [blame]	350	bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
				351	bool verify = need_fsverity(bh);
				352
				353	/* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
				354	if (uptodate && (decrypt \|\| verify)) {
				355	struct postprocess_bh_ctx *ctx =
				356	kmalloc(sizeof(*ctx), GFP_ATOMIC);
Eric Biggers	31fb992	2019-10-22 20:33:11 -0700	[diff] [blame]	357
				358	if (ctx) {
Eric Biggers	31fb992	2019-10-22 20:33:11 -0700	[diff] [blame]	359	ctx->bh = bh;
Eric Biggers	4fa512c	2022-12-23 12:36:37 -0800	[diff] [blame]	360	if (decrypt) {
				361	INIT_WORK(&ctx->work, decrypt_bh);
				362	fscrypt_enqueue_decrypt_work(&ctx->work);
				363	} else {
				364	INIT_WORK(&ctx->work, verify_bh);
				365	fsverity_enqueue_verify_work(&ctx->work);
				366	}
Eric Biggers	31fb992	2019-10-22 20:33:11 -0700	[diff] [blame]	367	return;
				368	}
				369	uptodate = 0;
				370	}
				371	end_buffer_async_read(bh, uptodate);
				372	}
				373
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	374	/*
Matthew Wilcox (Oracle)	14059f6	2023-12-15 20:02:45 +0000	[diff] [blame]	375	* Completion handler for block_write_full_folio() - folios which are unlocked
				376	* during I/O, and which have the writeback flag cleared upon I/O completion.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	377	*/
Matthew Wilcox (Oracle)	14059f6	2023-12-15 20:02:45 +0000	[diff] [blame]	378	static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	379	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	380	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	381	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	382	struct buffer_head *tmp;
Matthew Wilcox (Oracle)	743ed81	2022-12-15 21:43:55 +0000	[diff] [blame]	383	struct folio *folio;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	384
				385	BUG_ON(!buffer_async_write(bh));
				386
Matthew Wilcox (Oracle)	743ed81	2022-12-15 21:43:55 +0000	[diff] [blame]	387	folio = bh->b_folio;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	388	if (uptodate) {
				389	set_buffer_uptodate(bh);
				390	} else {
Robert Elliott	432f16e	2014-10-21 13:55:11 -0600	[diff] [blame]	391	buffer_io_error(bh, ", lost async page write");
Jeff Layton	87354e5	2017-07-06 07:02:21 -0400	[diff] [blame]	392	mark_buffer_write_io_error(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	393	clear_buffer_uptodate(bh);
Matthew Wilcox (Oracle)	743ed81	2022-12-15 21:43:55 +0000	[diff] [blame]	394	folio_set_error(folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	395	}
				396
Matthew Wilcox (Oracle)	743ed81	2022-12-15 21:43:55 +0000	[diff] [blame]	397	first = folio_buffers(folio);
Thomas Gleixner	f1e67e3	2019-11-18 14:28:24 +0100	[diff] [blame]	398	spin_lock_irqsave(&first->b_uptodate_lock, flags);
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	399
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	400	clear_buffer_async_write(bh);
				401	unlock_buffer(bh);
				402	tmp = bh->b_this_page;
				403	while (tmp != bh) {
				404	if (buffer_async_write(tmp)) {
				405	BUG_ON(!buffer_locked(tmp));
				406	goto still_busy;
				407	}
				408	tmp = tmp->b_this_page;
				409	}
Thomas Gleixner	f1e67e3	2019-11-18 14:28:24 +0100	[diff] [blame]	410	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
Matthew Wilcox (Oracle)	743ed81	2022-12-15 21:43:55 +0000	[diff] [blame]	411	folio_end_writeback(folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	412	return;
				413
				414	still_busy:
Thomas Gleixner	f1e67e3	2019-11-18 14:28:24 +0100	[diff] [blame]	415	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	416	return;
				417	}
				418
				419	/*
				420	* If a page's buffers are under async readin (end_buffer_async_read
				421	* completion) then there is a possibility that another thread of
				422	* control could lock one of the buffers after it has completed
				423	* but while some of the other buffers have not completed. This
				424	* locked buffer would confuse end_buffer_async_read() into not unlocking
				425	* the page. So the absence of BH_Async_Read tells end_buffer_async_read()
				426	* that this buffer is not under async I/O.
				427	*
				428	* The page comes unlocked when it has no locked buffer_async buffers
				429	* left.
				430	*
				431	* PageLocked prevents anyone starting new async I/O reads any of
				432	* the buffers.
				433	*
				434	* PageWriteback is used to prevent simultaneous writeout of the same
				435	* page.
				436	*
				437	* PageLocked prevents anyone from starting writeback of a page which is
				438	* under read I/O (PageWriteback is only ever set against a locked page).
				439	*/
				440	static void mark_buffer_async_read(struct buffer_head *bh)
				441	{
Eric Biggers	31fb992	2019-10-22 20:33:11 -0700	[diff] [blame]	442	bh->b_end_io = end_buffer_async_read_io;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	443	set_buffer_async_read(bh);
				444	}
				445
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	446	static void mark_buffer_async_write_endio(struct buffer_head *bh,
				447	bh_end_io_t *handler)
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	448	{
				449	bh->b_end_io = handler;
				450	set_buffer_async_write(bh);
				451	}
				452
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	453	void mark_buffer_async_write(struct buffer_head *bh)
				454	{
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	455	mark_buffer_async_write_endio(bh, end_buffer_async_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	456	}
				457	EXPORT_SYMBOL(mark_buffer_async_write);
				458
				459
				460	/*
				461	* fs/buffer.c contains helper functions for buffer-backed address space's
				462	* fsync functions. A common requirement for buffer-based filesystems is
				463	* that certain data from the backing blockdev needs to be written out for
				464	* a successful fsync(). For example, ext2 indirect blocks need to be
				465	* written back and waited upon before fsync() returns.
				466	*
Andreas Gruenbacher	73f65b8	2024-01-08 18:20:40 +0100	[diff] [blame]	467	* The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	468	* inode_has_buffers() and invalidate_inode_buffers() are provided for the
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	469	* management of a list of dependent buffers at ->i_mapping->i_private_list.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	470	*
				471	* Locking is a little subtle: try_to_free_buffers() will remove buffers
				472	* from their controlling inode's queue when they are being freed. But
				473	* try_to_free_buffers() will be operating against the blockdev mapping
				474	* at the time, not against the S_ISREG file which depends on those buffers.
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	475	* So the locking for i_private_list is via the i_private_lock in the address_space
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	476	* which backs the buffers. Which is different from the address_space
				477	* against which the buffers are listed. So for a particular address_space,
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	478	* mapping->i_private_lock does not protect mapping->i_private_list! In fact,
				479	* mapping->i_private_list will always be protected by the backing blockdev's
				480	* ->i_private_lock.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	481	*
				482	* Which introduces a requirement: all buffers on an address_space's
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	483	* ->i_private_list must be from the same address_space: the blockdev's.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	484	*
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	485	* address_spaces which do not place buffers at ->i_private_list via these
				486	* utility functions are free to use i_private_lock and i_private_list for
				487	* whatever they want. The only requirement is that list_empty(i_private_list)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	488	* be true at clear_inode() time.
				489	*
				490	* FIXME: clear_inode should not call invalidate_inode_buffers(). The
				491	* filesystems should do that. invalidate_inode_buffers() should just go
				492	* BUG_ON(!list_empty).
				493	*
				494	* FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
				495	* take an address_space, not an inode. And it should be called
				496	* mark_buffer_dirty_fsync() to clearly define why those buffers are being
				497	* queued up.
				498	*
				499	* FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
				500	* list if it is already on a list. Because if the buffer is on a list,
				501	* it must already be on the right one. If not, the filesystem is being
				502	* silly. This will save a ton of locking. But first we have to ensure
				503	* that buffers are taken off the old inode's list when they are freed
				504	* (presumably in truncate). That requires careful auditing of all
				505	* filesystems (do it inside bforget()). It could also be done by bringing
				506	* b_inode back.
				507	*/
				508
				509	/*
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	510	* The buffer's backing address_space's i_private_lock must be held
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	511	*/
Thomas Petazzoni	dbacefc	2008-07-29 22:33:47 -0700	[diff] [blame]	512	static void __remove_assoc_queue(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	513	{
				514	list_del_init(&bh->b_assoc_buffers);
Jan Kara	58ff407b	2006-10-17 00:10:19 -0700	[diff] [blame]	515	WARN_ON(!bh->b_assoc_map);
Jan Kara	58ff407b	2006-10-17 00:10:19 -0700	[diff] [blame]	516	bh->b_assoc_map = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	517	}
				518
				519	int inode_has_buffers(struct inode *inode)
				520	{
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	521	return !list_empty(&inode->i_data.i_private_list);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	522	}
				523
				524	/*
				525	* osync is designed to support O_SYNC io. It waits synchronously for
				526	* all already-submitted IO to complete, but does not queue any new
				527	* writes to the disk.
				528	*
Zhang Yi	79f5978	2022-09-01 21:35:03 +0800	[diff] [blame]	529	* To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
				530	* as you dirty the buffers, and then use osync_inode_buffers to wait for
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	531	* completion. Any other dirty buffers which are not yet queued for
				532	* write will not be flushed to disk by the osync.
				533	*/
				534	static int osync_buffers_list(spinlock_t lock, struct list_head list)
				535	{
				536	struct buffer_head *bh;
				537	struct list_head *p;
				538	int err = 0;
				539
				540	spin_lock(lock);
				541	repeat:
				542	list_for_each_prev(p, list) {
				543	bh = BH_ENTRY(p);
				544	if (buffer_locked(bh)) {
				545	get_bh(bh);
				546	spin_unlock(lock);
				547	wait_on_buffer(bh);
				548	if (!buffer_uptodate(bh))
				549	err = -EIO;
				550	brelse(bh);
				551	spin_lock(lock);
				552	goto repeat;
				553	}
				554	}
				555	spin_unlock(lock);
				556	return err;
				557	}
				558
				559	/**
Randy Dunlap	78a4a50	2008-02-29 22:02:31 -0800	[diff] [blame]	560	* sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	561	* @mapping: the mapping which wants those buffers written
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	562	*
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	563	* Starts I/O against the buffers at mapping->i_private_list, and waits upon
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	564	* that I/O.
				565	*
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	566	* Basically, this is a convenience function for fsync().
				567	* @mapping is a file or directory which needs those buffers to be written for
				568	* a successful fsync().
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	569	*/
				570	int sync_mapping_buffers(struct address_space *mapping)
				571	{
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	572	struct address_space *buffer_mapping = mapping->i_private_data;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	573
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	574	if (buffer_mapping == NULL \|\| list_empty(&mapping->i_private_list))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	575	return 0;
				576
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	577	return fsync_buffers_list(&buffer_mapping->i_private_lock,
				578	&mapping->i_private_list);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	579	}
				580	EXPORT_SYMBOL(sync_mapping_buffers);
				581
Ritesh Harjani (IBM)	31b2ebc	2023-04-21 15:16:12 +0530	[diff] [blame]	582	/**
				583	* generic_buffers_fsync_noflush - generic buffer fsync implementation
				584	* for simple filesystems with no inode lock
				585	*
				586	* @file: file to synchronize
				587	* @start: start offset in bytes
				588	* @end: end offset in bytes (inclusive)
				589	* @datasync: only synchronize essential metadata if true
				590	*
				591	* This is a generic implementation of the fsync method for simple
				592	* filesystems which track all non-inode metadata in the buffers list
				593	* hanging off the address_space structure.
				594	*/
				595	int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
				596	bool datasync)
				597	{
				598	struct inode *inode = file->f_mapping->host;
				599	int err;
				600	int ret;
				601
				602	err = file_write_and_wait_range(file, start, end);
				603	if (err)
				604	return err;
				605
				606	ret = sync_mapping_buffers(inode->i_mapping);
				607	if (!(inode->i_state & I_DIRTY_ALL))
				608	goto out;
				609	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
				610	goto out;
				611
				612	err = sync_inode_metadata(inode, 1);
				613	if (ret == 0)
				614	ret = err;
				615
				616	out:
				617	/* check and advance again to catch errors after syncing out buffers */
				618	err = file_check_and_advance_wb_err(file);
				619	if (ret == 0)
				620	ret = err;
				621	return ret;
				622	}
				623	EXPORT_SYMBOL(generic_buffers_fsync_noflush);
				624
				625	/**
				626	* generic_buffers_fsync - generic buffer fsync implementation
				627	* for simple filesystems with no inode lock
				628	*
				629	* @file: file to synchronize
				630	* @start: start offset in bytes
				631	* @end: end offset in bytes (inclusive)
				632	* @datasync: only synchronize essential metadata if true
				633	*
				634	* This is a generic implementation of the fsync method for simple
				635	* filesystems which track all non-inode metadata in the buffers list
				636	* hanging off the address_space structure. This also makes sure that
				637	* a device cache flush operation is called at the end.
				638	*/
				639	int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
				640	bool datasync)
				641	{
				642	struct inode *inode = file->f_mapping->host;
				643	int ret;
				644
				645	ret = generic_buffers_fsync_noflush(file, start, end, datasync);
				646	if (!ret)
				647	ret = blkdev_issue_flush(inode->i_sb->s_bdev);
				648	return ret;
				649	}
				650	EXPORT_SYMBOL(generic_buffers_fsync);
				651
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	652	/*
				653	* Called when we've recently written block `bblock', and it is known that
				654	* `bblock' was for a buffer_boundary() buffer. This means that the block at
				655	* `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
				656	* dirty, schedule it for IO. So that indirects merge nicely with their data.
				657	*/
				658	void write_boundary_block(struct block_device *bdev,
				659	sector_t bblock, unsigned blocksize)
				660	{
				661	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
				662	if (bh) {
				663	if (buffer_dirty(bh))
Zhang Yi	e7ea112	2022-09-01 21:34:54 +0800	[diff] [blame]	664	write_dirty_buffer(bh, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	665	put_bh(bh);
				666	}
				667	}
				668
				669	void mark_buffer_dirty_inode(struct buffer_head bh, struct inode inode)
				670	{
				671	struct address_space *mapping = inode->i_mapping;
Matthew Wilcox (Oracle)	abc8a8a	2022-12-15 21:43:52 +0000	[diff] [blame]	672	struct address_space *buffer_mapping = bh->b_folio->mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	673
				674	mark_buffer_dirty(bh);
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	675	if (!mapping->i_private_data) {
				676	mapping->i_private_data = buffer_mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	677	} else {
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	678	BUG_ON(mapping->i_private_data != buffer_mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	679	}
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	680	if (!bh->b_assoc_map) {
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	681	spin_lock(&buffer_mapping->i_private_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	682	list_move_tail(&bh->b_assoc_buffers,
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	683	&mapping->i_private_list);
Jan Kara	58ff407b	2006-10-17 00:10:19 -0700	[diff] [blame]	684	bh->b_assoc_map = mapping;
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	685	spin_unlock(&buffer_mapping->i_private_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	686	}
				687	}
				688	EXPORT_SYMBOL(mark_buffer_dirty_inode);
				689
Matthew Wilcox (Oracle)	3814ec8	2024-04-16 04:17:46 +0100	[diff] [blame]	690	/**
				691	* block_dirty_folio - Mark a folio as dirty.
				692	* @mapping: The address space containing this folio.
				693	* @folio: The folio to mark dirty.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	694	*
Matthew Wilcox (Oracle)	3814ec8	2024-04-16 04:17:46 +0100	[diff] [blame]	695	* Filesystems which use buffer_heads can use this function as their
				696	* ->dirty_folio implementation. Some filesystems need to do a little
				697	* work before calling this function. Filesystems which do not use
				698	* buffer_heads should call filemap_dirty_folio() instead.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	699	*
Matthew Wilcox (Oracle)	3814ec8	2024-04-16 04:17:46 +0100	[diff] [blame]	700	* If the folio has buffers, the uptodate buffers are set dirty, to
				701	* preserve dirty-state coherency between the folio and the buffers.
				702	* Buffers added to a dirty folio are created dirty.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	703	*
Matthew Wilcox (Oracle)	3814ec8	2024-04-16 04:17:46 +0100	[diff] [blame]	704	* The buffers are dirtied before the folio is dirtied. There's a small
				705	* race window in which writeback may see the folio cleanness but not the
				706	* buffer dirtiness. That's fine. If this code were to set the folio
				707	* dirty before the buffers, writeback could clear the folio dirty flag,
				708	* see a bunch of clean buffers and we'd end up with dirty buffers/clean
				709	* folio on the dirty folio list.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	710	*
Matthew Wilcox (Oracle)	3814ec8	2024-04-16 04:17:46 +0100	[diff] [blame]	711	* We use i_private_lock to lock against try_to_free_buffers() while
				712	* using the folio's buffer list. This also prevents clean buffers
				713	* being added to the folio after it was set dirty.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	714	*
Matthew Wilcox (Oracle)	3814ec8	2024-04-16 04:17:46 +0100	[diff] [blame]	715	* Context: May only be called from process context. Does not sleep.
				716	* Caller must ensure that @folio cannot be truncated during this call,
				717	* typically by holding the folio lock or having a page in the folio
				718	* mapped and holding the page table lock.
				719	*
				720	* Return: True if the folio was dirtied; false if it was already dirtied.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	721	*/
Matthew Wilcox (Oracle)	e621900	2022-02-09 20:22:12 +0000	[diff] [blame]	722	bool block_dirty_folio(struct address_space mapping, struct folio folio)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	723	{
Matthew Wilcox (Oracle)	e621900	2022-02-09 20:22:12 +0000	[diff] [blame]	724	struct buffer_head *head;
				725	bool newly_dirty;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	726
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	727	spin_lock(&mapping->i_private_lock);
Matthew Wilcox (Oracle)	e621900	2022-02-09 20:22:12 +0000	[diff] [blame]	728	head = folio_buffers(folio);
				729	if (head) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	730	struct buffer_head *bh = head;
				731
				732	do {
				733	set_buffer_dirty(bh);
				734	bh = bh->b_this_page;
				735	} while (bh != head);
				736	}
Greg Thelen	c4843a7	2015-05-22 17:13:16 -0400	[diff] [blame]	737	/*
Roman Gushchin	bcfe06b	2020-12-01 13:58:27 -0800	[diff] [blame]	738	* Lock out page's memcg migration to keep PageDirty
Johannes Weiner	81f8c3a	2016-03-15 14:57:04 -0700	[diff] [blame]	739	* synchronized with per-memcg dirty page counters.
Greg Thelen	c4843a7	2015-05-22 17:13:16 -0400	[diff] [blame]	740	*/
Matthew Wilcox (Oracle)	e621900	2022-02-09 20:22:12 +0000	[diff] [blame]	741	folio_memcg_lock(folio);
				742	newly_dirty = !folio_test_set_dirty(folio);
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	743	spin_unlock(&mapping->i_private_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	744
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	745	if (newly_dirty)
Matthew Wilcox (Oracle)	e621900	2022-02-09 20:22:12 +0000	[diff] [blame]	746	__folio_mark_dirty(folio, mapping, 1);
Greg Thelen	c4843a7	2015-05-22 17:13:16 -0400	[diff] [blame]	747
Matthew Wilcox (Oracle)	e621900	2022-02-09 20:22:12 +0000	[diff] [blame]	748	folio_memcg_unlock(folio);
Greg Thelen	c4843a7	2015-05-22 17:13:16 -0400	[diff] [blame]	749
				750	if (newly_dirty)
				751	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
				752
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	753	return newly_dirty;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	754	}
Matthew Wilcox (Oracle)	e621900	2022-02-09 20:22:12 +0000	[diff] [blame]	755	EXPORT_SYMBOL(block_dirty_folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	756
				757	/*
				758	* Write out and wait upon a list of buffers.
				759	*
				760	* We have conflicting pressures: we want to make sure that all
				761	* initially dirty buffers get waited on, but that any subsequently
				762	* dirtied buffers don't. After all, we don't want fsync to last
				763	* forever if somebody is actively writing to the file.
				764	*
				765	* Do this in two main stages: first we copy dirty buffers to a
				766	* temporary inode list, queueing the writes as we go. Then we clean
				767	* up, waiting for those writes to complete.
				768	*
				769	* During this second stage, any subsequent updates to the file may end
				770	* up refiling the buffer on the original inode's dirty list again, so
				771	* there is a chance we will end up with a buffer queued for write but
				772	* not yet completed on that list. So, as a final cleanup we go through
				773	* the osync code to catch these locked, dirty buffers without requeuing
				774	* any newly dirty buffers for write.
				775	*/
				776	static int fsync_buffers_list(spinlock_t lock, struct list_head list)
				777	{
				778	struct buffer_head *bh;
				779	struct list_head tmp;
Jens Axboe	7eaceac	2011-03-10 08:52:07 +0100	[diff] [blame]	780	struct address_space *mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	781	int err = 0, err2;
Jens Axboe	4ee2491	2011-03-17 10:51:40 +0100	[diff] [blame]	782	struct blk_plug plug;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	783
				784	INIT_LIST_HEAD(&tmp);
Jens Axboe	4ee2491	2011-03-17 10:51:40 +0100	[diff] [blame]	785	blk_start_plug(&plug);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	786
				787	spin_lock(lock);
				788	while (!list_empty(list)) {
				789	bh = BH_ENTRY(list->next);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	790	mapping = bh->b_assoc_map;
Jan Kara	58ff407b	2006-10-17 00:10:19 -0700	[diff] [blame]	791	__remove_assoc_queue(bh);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	792	/* Avoid race with mark_buffer_dirty_inode() which does
				793	* a lockless check and we rely on seeing the dirty bit */
				794	smp_mb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	795	if (buffer_dirty(bh) \|\| buffer_locked(bh)) {
				796	list_add(&bh->b_assoc_buffers, &tmp);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	797	bh->b_assoc_map = mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	798	if (buffer_dirty(bh)) {
				799	get_bh(bh);
				800	spin_unlock(lock);
				801	/*
				802	* Ensure any pending I/O completes so that
Christoph Hellwig	9cb569d	2010-08-11 17:06:24 +0200	[diff] [blame]	803	* write_dirty_buffer() actually writes the
				804	* current contents - it is a noop if I/O is
				805	* still in flight on potentially older
				806	* contents.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	807	*/
Christoph Hellwig	70fd761	2016-11-01 07:40:10 -0600	[diff] [blame]	808	write_dirty_buffer(bh, REQ_SYNC);
Jens Axboe	9cf6b72	2009-04-06 14:48:03 +0200	[diff] [blame]	809
				810	/*
				811	* Kick off IO for the previous mapping. Note
				812	* that we will not run the very last mapping,
				813	* wait_on_buffer() will do that for us
				814	* through sync_buffer().
				815	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	816	brelse(bh);
				817	spin_lock(lock);
				818	}
				819	}
				820	}
				821
Jens Axboe	4ee2491	2011-03-17 10:51:40 +0100	[diff] [blame]	822	spin_unlock(lock);
				823	blk_finish_plug(&plug);
				824	spin_lock(lock);
				825
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	826	while (!list_empty(&tmp)) {
				827	bh = BH_ENTRY(tmp.prev);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	828	get_bh(bh);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	829	mapping = bh->b_assoc_map;
				830	__remove_assoc_queue(bh);
				831	/* Avoid race with mark_buffer_dirty_inode() which does
				832	* a lockless check and we rely on seeing the dirty bit */
				833	smp_mb();
				834	if (buffer_dirty(bh)) {
				835	list_add(&bh->b_assoc_buffers,
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	836	&mapping->i_private_list);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	837	bh->b_assoc_map = mapping;
				838	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	839	spin_unlock(lock);
				840	wait_on_buffer(bh);
				841	if (!buffer_uptodate(bh))
				842	err = -EIO;
				843	brelse(bh);
				844	spin_lock(lock);
				845	}
				846
				847	spin_unlock(lock);
				848	err2 = osync_buffers_list(lock, list);
				849	if (err)
				850	return err;
				851	else
				852	return err2;
				853	}
				854
				855	/*
				856	* Invalidate any and all dirty buffers on a given inode. We are
				857	* probably unmounting the fs, but that doesn't mean we have already
				858	* done a sync(). Just drop the buffers from the inode list.
				859	*
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	860	* NOTE: we take the inode's blockdev's mapping's i_private_lock. Which
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	861	* assumes that all the buffers are against the blockdev. Not true
				862	* for reiserfs.
				863	*/
				864	void invalidate_inode_buffers(struct inode *inode)
				865	{
				866	if (inode_has_buffers(inode)) {
				867	struct address_space *mapping = &inode->i_data;
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	868	struct list_head *list = &mapping->i_private_list;
				869	struct address_space *buffer_mapping = mapping->i_private_data;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	870
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	871	spin_lock(&buffer_mapping->i_private_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	872	while (!list_empty(list))
				873	__remove_assoc_queue(BH_ENTRY(list->next));
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	874	spin_unlock(&buffer_mapping->i_private_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	875	}
				876	}
Jan Kara	52b19ac	2008-09-23 18:24:08 +0200	[diff] [blame]	877	EXPORT_SYMBOL(invalidate_inode_buffers);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	878
				879	/*
				880	* Remove any clean buffers from the inode's buffer list. This is called
				881	* when we're trying to free the inode itself. Those buffers can pin it.
				882	*
				883	* Returns true if all buffers were removed.
				884	*/
				885	int remove_inode_buffers(struct inode *inode)
				886	{
				887	int ret = 1;
				888
				889	if (inode_has_buffers(inode)) {
				890	struct address_space *mapping = &inode->i_data;
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	891	struct list_head *list = &mapping->i_private_list;
				892	struct address_space *buffer_mapping = mapping->i_private_data;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	893
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	894	spin_lock(&buffer_mapping->i_private_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	895	while (!list_empty(list)) {
				896	struct buffer_head *bh = BH_ENTRY(list->next);
				897	if (buffer_dirty(bh)) {
				898	ret = 0;
				899	break;
				900	}
				901	__remove_assoc_queue(bh);
				902	}
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	903	spin_unlock(&buffer_mapping->i_private_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	904	}
				905	return ret;
				906	}
				907
				908	/*
Pankaj Raghav	c71124a	2023-04-17 14:36:16 +0200	[diff] [blame]	909	* Create the appropriate buffers when given a folio for data area and
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	910	* the size of each buffer.. Use the bh->b_this_page linked list to
				911	* follow the buffers created. Return NULL if unable to create more
				912	* buffers.
				913	*
				914	* The retry flag is used to differentiate async IO (paging, swapping)
				915	* which may not fail from ordinary buffer allocations.
				916	*/
Pankaj Raghav	c71124a	2023-04-17 14:36:16 +0200	[diff] [blame]	917	struct buffer_head folio_alloc_buffers(struct folio folio, unsigned long size,
Matthew Wilcox (Oracle)	2a41815	2023-09-14 16:00:04 +0100	[diff] [blame]	918	gfp_t gfp)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	919	{
				920	struct buffer_head bh, head;
				921	long offset;
Roman Gushchin	b87d8ce	2020-10-17 16:13:40 -0700	[diff] [blame]	922	struct mem_cgroup memcg, old_memcg;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	923
Pankaj Raghav	c71124a	2023-04-17 14:36:16 +0200	[diff] [blame]	924	/* The folio lock pins the memcg */
				925	memcg = folio_memcg(folio);
Roman Gushchin	b87d8ce	2020-10-17 16:13:40 -0700	[diff] [blame]	926	old_memcg = set_active_memcg(memcg);
Shakeel Butt	f745c6f	2018-08-17 15:46:44 -0700	[diff] [blame]	927
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	928	head = NULL;
Pankaj Raghav	c71124a	2023-04-17 14:36:16 +0200	[diff] [blame]	929	offset = folio_size(folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	930	while ((offset -= size) >= 0) {
Jens Axboe	640ab98	2017-09-27 05:40:16 -0600	[diff] [blame]	931	bh = alloc_buffer_head(gfp);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	932	if (!bh)
				933	goto no_grow;
				934
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	935	bh->b_this_page = head;
				936	bh->b_blocknr = -1;
				937	head = bh;
				938
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	939	bh->b_size = size;
				940
Pankaj Raghav	c71124a	2023-04-17 14:36:16 +0200	[diff] [blame]	941	/* Link the buffer to its folio */
				942	folio_set_bh(bh, folio, offset);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	943	}
Shakeel Butt	f745c6f	2018-08-17 15:46:44 -0700	[diff] [blame]	944	out:
Roman Gushchin	b87d8ce	2020-10-17 16:13:40 -0700	[diff] [blame]	945	set_active_memcg(old_memcg);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	946	return head;
				947	/*
				948	* In case anything failed, we just free everything we got.
				949	*/
				950	no_grow:
				951	if (head) {
				952	do {
				953	bh = head;
				954	head = head->b_this_page;
				955	free_buffer_head(bh);
				956	} while (head);
				957	}
				958
Shakeel Butt	f745c6f	2018-08-17 15:46:44 -0700	[diff] [blame]	959	goto out;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	960	}
Pankaj Raghav	c71124a	2023-04-17 14:36:16 +0200	[diff] [blame]	961	EXPORT_SYMBOL_GPL(folio_alloc_buffers);
				962
				963	struct buffer_head alloc_page_buffers(struct page page, unsigned long size,
				964	bool retry)
				965	{
Matthew Wilcox (Oracle)	2a41815	2023-09-14 16:00:04 +0100	[diff] [blame]	966	gfp_t gfp = GFP_NOFS \| __GFP_ACCOUNT;
				967	if (retry)
				968	gfp \|= __GFP_NOFAIL;
				969
				970	return folio_alloc_buffers(page_folio(page), size, gfp);
Pankaj Raghav	c71124a	2023-04-17 14:36:16 +0200	[diff] [blame]	971	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	972	EXPORT_SYMBOL_GPL(alloc_page_buffers);
				973
Matthew Wilcox (Oracle)	08d84ad	2023-06-12 22:01:39 +0100	[diff] [blame]	974	static inline void link_dev_buffers(struct folio *folio,
				975	struct buffer_head *head)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	976	{
				977	struct buffer_head bh, tail;
				978
				979	bh = head;
				980	do {
				981	tail = bh;
				982	bh = bh->b_this_page;
				983	} while (bh);
				984	tail->b_this_page = head;
Matthew Wilcox (Oracle)	08d84ad	2023-06-12 22:01:39 +0100	[diff] [blame]	985	folio_attach_private(folio, head);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	986	}
				987
Linus Torvalds	bbec0270	2012-11-29 12:31:52 -0800	[diff] [blame]	988	static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
				989	{
				990	sector_t retval = ~((sector_t)0);
Christoph Hellwig	b86058f	2021-10-18 12:11:09 +0200	[diff] [blame]	991	loff_t sz = bdev_nr_bytes(bdev);
Linus Torvalds	bbec0270	2012-11-29 12:31:52 -0800	[diff] [blame]	992
				993	if (sz) {
				994	unsigned int sizebits = blksize_bits(size);
				995	retval = (sz >> sizebits);
				996	}
				997	return retval;
				998	}
				999
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1000	/*
Matthew Wilcox (Oracle)	6f24ce6	2023-06-12 22:01:38 +0100	[diff] [blame]	1001	* Initialise the state of a blockdev folio's buffers.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1002	*/
Matthew Wilcox (Oracle)	6f24ce6	2023-06-12 22:01:38 +0100	[diff] [blame]	1003	static sector_t folio_init_buffers(struct folio *folio,
Matthew Wilcox (Oracle)	382497a	2023-11-09 21:06:03 +0000	[diff] [blame]	1004	struct block_device *bdev, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1005	{
Matthew Wilcox (Oracle)	6f24ce6	2023-06-12 22:01:38 +0100	[diff] [blame]	1006	struct buffer_head *head = folio_buffers(folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1007	struct buffer_head *bh = head;
Matthew Wilcox (Oracle)	6f24ce6	2023-06-12 22:01:38 +0100	[diff] [blame]	1008	bool uptodate = folio_test_uptodate(folio);
Matthew Wilcox (Oracle)	382497a	2023-11-09 21:06:03 +0000	[diff] [blame]	1009	sector_t block = div_u64(folio_pos(folio), size);
Christoph Hellwig	bcd1d06	2021-10-18 12:11:10 +0200	[diff] [blame]	1010	sector_t end_block = blkdev_max_block(bdev, size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1011
				1012	do {
				1013	if (!buffer_mapped(bh)) {
Eric Biggers	01950a3	2018-01-16 22:25:12 -0800	[diff] [blame]	1014	bh->b_end_io = NULL;
				1015	bh->b_private = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1016	bh->b_bdev = bdev;
				1017	bh->b_blocknr = block;
				1018	if (uptodate)
				1019	set_buffer_uptodate(bh);
Jeff Moyer	080399a	2012-05-11 16:34:10 +0200	[diff] [blame]	1020	if (block < end_block)
				1021	set_buffer_mapped(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1022	}
				1023	block++;
				1024	bh = bh->b_this_page;
				1025	} while (bh != head);
Hugh Dickins	676ce6d	2012-08-23 12:17:36 +0200	[diff] [blame]	1026
				1027	/*
				1028	* Caller needs to validate requested block against end of device.
				1029	*/
				1030	return end_block;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1031	}
				1032
				1033	/*
Matthew Wilcox (Oracle)	6d840a1	2023-11-09 21:06:02 +0000	[diff] [blame]	1034	* Create the page-cache folio that contains the requested block.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1035	*
Hugh Dickins	676ce6d	2012-08-23 12:17:36 +0200	[diff] [blame]	1036	* This is used purely for blockdev mappings.
Matthew Wilcox (Oracle)	6d840a1	2023-11-09 21:06:02 +0000	[diff] [blame]	1037	*
Matthew Wilcox (Oracle)	bcd30d4	2024-01-01 09:38:48 +0000	[diff] [blame]	1038	* Returns false if we have a failure which cannot be cured by retrying
				1039	* without sleeping. Returns true if we succeeded, or the caller should retry.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1040	*/
Matthew Wilcox (Oracle)	6d840a1	2023-11-09 21:06:02 +0000	[diff] [blame]	1041	static bool grow_dev_folio(struct block_device *bdev, sector_t block,
Matthew Wilcox (Oracle)	382497a	2023-11-09 21:06:03 +0000	[diff] [blame]	1042	pgoff_t index, unsigned size, gfp_t gfp)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1043	{
Al Viro	22f89a4	2024-04-11 15:53:38 +0100	[diff] [blame]	1044	struct address_space *mapping = bdev->bd_mapping;
Matthew Wilcox (Oracle)	3c98a41	2023-06-12 22:01:37 +0100	[diff] [blame]	1045	struct folio *folio;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1046	struct buffer_head *bh;
Matthew Wilcox (Oracle)	6d840a1	2023-11-09 21:06:02 +0000	[diff] [blame]	1047	sector_t end_block = 0;
Johannes Weiner	84235de	2013-10-16 13:47:00 -0700	[diff] [blame]	1048
Al Viro	22f89a4	2024-04-11 15:53:38 +0100	[diff] [blame]	1049	folio = __filemap_get_folio(mapping, index,
Matthew Wilcox (Oracle)	3ed65f0	2023-09-14 16:00:05 +0100	[diff] [blame]	1050	FGP_LOCK \| FGP_ACCESSED \| FGP_CREAT, gfp);
				1051	if (IS_ERR(folio))
Matthew Wilcox (Oracle)	6d840a1	2023-11-09 21:06:02 +0000	[diff] [blame]	1052	return false;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1053
Matthew Wilcox (Oracle)	3c98a41	2023-06-12 22:01:37 +0100	[diff] [blame]	1054	bh = folio_buffers(folio);
				1055	if (bh) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1056	if (bh->b_size == size) {
Matthew Wilcox (Oracle)	382497a	2023-11-09 21:06:03 +0000	[diff] [blame]	1057	end_block = folio_init_buffers(folio, bdev, size);
Matthew Wilcox (Oracle)	6d840a1	2023-11-09 21:06:02 +0000	[diff] [blame]	1058	goto unlock;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1059	}
Matthew Wilcox (Oracle)	6d840a1	2023-11-09 21:06:02 +0000	[diff] [blame]	1060
Matthew Wilcox (Oracle)	bcd30d4	2024-01-01 09:38:48 +0000	[diff] [blame]	1061	/*
				1062	* Retrying may succeed; for example the folio may finish
				1063	* writeback, or buffers may be cleaned. This should not
				1064	* happen very often; maybe we have old buffers attached to
				1065	* this blockdev's page cache and we're trying to change
				1066	* the block size?
				1067	*/
				1068	if (!try_to_free_buffers(folio)) {
				1069	end_block = ~0ULL;
Matthew Wilcox (Oracle)	6d840a1	2023-11-09 21:06:02 +0000	[diff] [blame]	1070	goto unlock;
Matthew Wilcox (Oracle)	bcd30d4	2024-01-01 09:38:48 +0000	[diff] [blame]	1071	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1072	}
				1073
Matthew Wilcox (Oracle)	3ed65f0	2023-09-14 16:00:05 +0100	[diff] [blame]	1074	bh = folio_alloc_buffers(folio, size, gfp \| __GFP_ACCOUNT);
				1075	if (!bh)
Matthew Wilcox (Oracle)	6d840a1	2023-11-09 21:06:02 +0000	[diff] [blame]	1076	goto unlock;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1077
				1078	/*
Matthew Wilcox (Oracle)	3c98a41	2023-06-12 22:01:37 +0100	[diff] [blame]	1079	* Link the folio to the buffers and initialise them. Take the
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1080	* lock to be atomic wrt __find_get_block(), which does not
Matthew Wilcox (Oracle)	3c98a41	2023-06-12 22:01:37 +0100	[diff] [blame]	1081	* run under the folio lock.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1082	*/
Al Viro	22f89a4	2024-04-11 15:53:38 +0100	[diff] [blame]	1083	spin_lock(&mapping->i_private_lock);
Matthew Wilcox (Oracle)	08d84ad	2023-06-12 22:01:39 +0100	[diff] [blame]	1084	link_dev_buffers(folio, bh);
Matthew Wilcox (Oracle)	382497a	2023-11-09 21:06:03 +0000	[diff] [blame]	1085	end_block = folio_init_buffers(folio, bdev, size);
Al Viro	22f89a4	2024-04-11 15:53:38 +0100	[diff] [blame]	1086	spin_unlock(&mapping->i_private_lock);
Matthew Wilcox (Oracle)	6d840a1	2023-11-09 21:06:02 +0000	[diff] [blame]	1087	unlock:
Matthew Wilcox (Oracle)	3c98a41	2023-06-12 22:01:37 +0100	[diff] [blame]	1088	folio_unlock(folio);
				1089	folio_put(folio);
Matthew Wilcox (Oracle)	6d840a1	2023-11-09 21:06:02 +0000	[diff] [blame]	1090	return block < end_block;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1091	}
				1092
				1093	/*
Matthew Wilcox (Oracle)	6d840a1	2023-11-09 21:06:02 +0000	[diff] [blame]	1094	* Create buffers for the specified block device block's folio. If
				1095	* that folio was dirty, the buffers are set dirty also. Returns false
				1096	* if we've hit a permanent error.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1097	*/
Matthew Wilcox (Oracle)	6d840a1	2023-11-09 21:06:02 +0000	[diff] [blame]	1098	static bool grow_buffers(struct block_device *bdev, sector_t block,
				1099	unsigned size, gfp_t gfp)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1100	{
Matthew Wilcox (Oracle)	5f3bd90	2023-11-09 21:06:04 +0000	[diff] [blame]	1101	loff_t pos;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1102
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1103	/*
Matthew Wilcox (Oracle)	5f3bd90	2023-11-09 21:06:04 +0000	[diff] [blame]	1104	* Check for a block which lies outside our maximum possible
				1105	* pagecache index.
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1106	*/
Matthew Wilcox (Oracle)	5f3bd90	2023-11-09 21:06:04 +0000	[diff] [blame]	1107	if (check_mul_overflow(block, (sector_t)size, &pos) \|\| pos > MAX_LFS_FILESIZE) {
				1108	printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n",
Harvey Harrison	8e24eea	2008-04-30 00:55:09 -0700	[diff] [blame]	1109	__func__, (unsigned long long)block,
Dmitry Monakhov	a1c6f057	2015-04-13 16:31:37 +0400	[diff] [blame]	1110	bdev);
Matthew Wilcox (Oracle)	6d840a1	2023-11-09 21:06:02 +0000	[diff] [blame]	1111	return false;
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1112	}
Hugh Dickins	676ce6d	2012-08-23 12:17:36 +0200	[diff] [blame]	1113
Matthew Wilcox (Oracle)	6d840a1	2023-11-09 21:06:02 +0000	[diff] [blame]	1114	/* Create a folio with the proper size buffers */
Matthew Wilcox (Oracle)	5f3bd90	2023-11-09 21:06:04 +0000	[diff] [blame]	1115	return grow_dev_folio(bdev, block, pos / PAGE_SIZE, size, gfp);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1116	}
				1117
Eric Biggers	0026ba4	2016-09-12 13:30:41 -0700	[diff] [blame]	1118	static struct buffer_head *
Gioh Kim	3b5e645	2014-09-04 22:04:42 -0400	[diff] [blame]	1119	__getblk_slow(struct block_device *bdev, sector_t block,
				1120	unsigned size, gfp_t gfp)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1121	{
				1122	/* Size must be multiple of hard sectorsize */
Martin K. Petersen	e1defc4	2009-05-22 17:17:49 -0400	[diff] [blame]	1123	if (unlikely(size & (bdev_logical_block_size(bdev)-1) \|\|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1124	(size < 512 \|\| size > PAGE_SIZE))) {
				1125	printk(KERN_ERR "getblk(): invalid block size %d requested\n",
				1126	size);
Martin K. Petersen	e1defc4	2009-05-22 17:17:49 -0400	[diff] [blame]	1127	printk(KERN_ERR "logical block size: %d\n",
				1128	bdev_logical_block_size(bdev));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1129
				1130	dump_stack();
				1131	return NULL;
				1132	}
				1133
Hugh Dickins	676ce6d	2012-08-23 12:17:36 +0200	[diff] [blame]	1134	for (;;) {
				1135	struct buffer_head *bh;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1136
				1137	bh = __find_get_block(bdev, block, size);
				1138	if (bh)
				1139	return bh;
Hugh Dickins	676ce6d	2012-08-23 12:17:36 +0200	[diff] [blame]	1140
Matthew Wilcox (Oracle)	6d840a1	2023-11-09 21:06:02 +0000	[diff] [blame]	1141	if (!grow_buffers(bdev, block, size, gfp))
Hugh Dickins	676ce6d	2012-08-23 12:17:36 +0200	[diff] [blame]	1142	return NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1143	}
				1144	}
				1145
				1146	/*
				1147	* The relationship between dirty buffers and dirty pages:
				1148	*
				1149	* Whenever a page has any dirty buffers, the page's dirty bit is set, and
Matthew Wilcox	ec82e1c	2017-12-04 10:40:41 -0500	[diff] [blame]	1150	* the page is tagged dirty in the page cache.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1151	*
				1152	* At all times, the dirtiness of the buffers represents the dirtiness of
				1153	* subsections of the page. If the page has buffers, the page dirty bit is
				1154	* merely a hint about the true dirty state.
				1155	*
				1156	* When a page is set dirty in its entirety, all its buffers are marked dirty
				1157	* (if the page has buffers).
				1158	*
				1159	* When a buffer is marked dirty, its page is dirtied, but the page's other
				1160	* buffers are not.
				1161	*
				1162	* Also. When blockdev buffers are explicitly read with bread(), they
				1163	* individually become uptodate. But their backing page remains not
				1164	* uptodate - even if all of its buffers are uptodate. A subsequent
Matthew Wilcox (Oracle)	2c69e20	2022-04-29 10:40:40 -0400	[diff] [blame]	1165	* block_read_full_folio() against that folio will discover all the uptodate
				1166	* buffers, will set the folio uptodate and will perform no I/O.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1167	*/
				1168
				1169	/**
				1170	* mark_buffer_dirty - mark a buffer_head as needing writeout
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1171	* @bh: the buffer_head to mark dirty
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1172	*
Matthew Wilcox	ec82e1c	2017-12-04 10:40:41 -0500	[diff] [blame]	1173	* mark_buffer_dirty() will set the dirty bit against the buffer, then set
				1174	* its backing page dirty, then tag the page as dirty in the page cache
				1175	* and then attach the address_space's inode to its superblock's dirty
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1176	* inode list.
				1177	*
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	1178	* mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->i_private_lock,
Matthew Wilcox	b93b016	2018-04-10 16:36:56 -0700	[diff] [blame]	1179	* i_pages lock and mapping->host->i_lock.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1180	*/
Harvey Harrison	fc9b52c	2008-02-08 04:19:52 -0800	[diff] [blame]	1181	void mark_buffer_dirty(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1182	{
Nick Piggin	787d2214	2007-07-17 04:03:34 -0700	[diff] [blame]	1183	WARN_ON_ONCE(!buffer_uptodate(bh));
Linus Torvalds	1be62dc	2008-04-04 14:38:17 -0700	[diff] [blame]	1184
Tejun Heo	5305cb8	2013-01-11 13:06:36 -0800	[diff] [blame]	1185	trace_block_dirty_buffer(bh);
				1186
Linus Torvalds	1be62dc	2008-04-04 14:38:17 -0700	[diff] [blame]	1187	/*
				1188	* Very carefully optimize the it-is-already-dirty case.
				1189	*
				1190	* Don't let the final "is it dirty" escape to before we
				1191	* perhaps modified the buffer.
				1192	*/
				1193	if (buffer_dirty(bh)) {
				1194	smp_mb();
				1195	if (buffer_dirty(bh))
				1196	return;
				1197	}
				1198
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	1199	if (!test_set_buffer_dirty(bh)) {
Matthew Wilcox (Oracle)	cf1d341	2022-12-15 21:43:57 +0000	[diff] [blame]	1200	struct folio *folio = bh->b_folio;
Greg Thelen	c4843a7	2015-05-22 17:13:16 -0400	[diff] [blame]	1201	struct address_space *mapping = NULL;
Greg Thelen	c4843a7	2015-05-22 17:13:16 -0400	[diff] [blame]	1202
Matthew Wilcox (Oracle)	cf1d341	2022-12-15 21:43:57 +0000	[diff] [blame]	1203	folio_memcg_lock(folio);
				1204	if (!folio_test_set_dirty(folio)) {
				1205	mapping = folio->mapping;
Linus Torvalds	8e9d78e	2009-08-21 17:40:08 -0700	[diff] [blame]	1206	if (mapping)
Matthew Wilcox (Oracle)	cf1d341	2022-12-15 21:43:57 +0000	[diff] [blame]	1207	__folio_mark_dirty(folio, mapping, 0);
Linus Torvalds	8e9d78e	2009-08-21 17:40:08 -0700	[diff] [blame]	1208	}
Matthew Wilcox (Oracle)	cf1d341	2022-12-15 21:43:57 +0000	[diff] [blame]	1209	folio_memcg_unlock(folio);
Greg Thelen	c4843a7	2015-05-22 17:13:16 -0400	[diff] [blame]	1210	if (mapping)
				1211	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	1212	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1213	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	1214	EXPORT_SYMBOL(mark_buffer_dirty);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1215
Jeff Layton	87354e5	2017-07-06 07:02:21 -0400	[diff] [blame]	1216	void mark_buffer_write_io_error(struct buffer_head *bh)
				1217	{
				1218	set_buffer_write_io_error(bh);
				1219	/* FIXME: do we need to set this in both places? */
Matthew Wilcox (Oracle)	abc8a8a	2022-12-15 21:43:52 +0000	[diff] [blame]	1220	if (bh->b_folio && bh->b_folio->mapping)
				1221	mapping_set_error(bh->b_folio->mapping, -EIO);
Christoph Hellwig	4b2201d	2023-08-07 12:26:22 +0100	[diff] [blame]	1222	if (bh->b_assoc_map) {
Jeff Layton	87354e5	2017-07-06 07:02:21 -0400	[diff] [blame]	1223	mapping_set_error(bh->b_assoc_map, -EIO);
Christoph Hellwig	4b2201d	2023-08-07 12:26:22 +0100	[diff] [blame]	1224	errseq_set(&bh->b_assoc_map->host->i_sb->s_wb_err, -EIO);
				1225	}
Jeff Layton	87354e5	2017-07-06 07:02:21 -0400	[diff] [blame]	1226	}
				1227	EXPORT_SYMBOL(mark_buffer_write_io_error);
				1228
Matthew Wilcox (Oracle)	66924fd	2024-04-16 04:17:49 +0100	[diff] [blame]	1229	/**
				1230	* __brelse - Release a buffer.
				1231	* @bh: The buffer to release.
				1232	*
				1233	* This variant of brelse() can be called if @bh is guaranteed to not be NULL.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1234	*/
Matthew Wilcox (Oracle)	66924fd	2024-04-16 04:17:49 +0100	[diff] [blame]	1235	void __brelse(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1236	{
Matthew Wilcox (Oracle)	66924fd	2024-04-16 04:17:49 +0100	[diff] [blame]	1237	if (atomic_read(&bh->b_count)) {
				1238	put_bh(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1239	return;
				1240	}
Arjan van de Ven	5c752ad	2008-07-25 19:45:40 -0700	[diff] [blame]	1241	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1242	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	1243	EXPORT_SYMBOL(__brelse);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1244
Matthew Wilcox (Oracle)	b73a936	2024-04-16 04:17:50 +0100	[diff] [blame]	1245	/**
				1246	* __bforget - Discard any dirty data in a buffer.
				1247	* @bh: The buffer to forget.
				1248	*
				1249	* This variant of bforget() can be called if @bh is guaranteed to not
				1250	* be NULL.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1251	*/
				1252	void __bforget(struct buffer_head *bh)
				1253	{
				1254	clear_buffer_dirty(bh);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	1255	if (bh->b_assoc_map) {
Matthew Wilcox (Oracle)	abc8a8a	2022-12-15 21:43:52 +0000	[diff] [blame]	1256	struct address_space *buffer_mapping = bh->b_folio->mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1257
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	1258	spin_lock(&buffer_mapping->i_private_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1259	list_del_init(&bh->b_assoc_buffers);
Jan Kara	58ff407b	2006-10-17 00:10:19 -0700	[diff] [blame]	1260	bh->b_assoc_map = NULL;
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	1261	spin_unlock(&buffer_mapping->i_private_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1262	}
				1263	__brelse(bh);
				1264	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	1265	EXPORT_SYMBOL(__bforget);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1266
				1267	static struct buffer_head __bread_slow(struct buffer_head bh)
				1268	{
				1269	lock_buffer(bh);
				1270	if (buffer_uptodate(bh)) {
				1271	unlock_buffer(bh);
				1272	return bh;
				1273	} else {
				1274	get_bh(bh);
				1275	bh->b_end_io = end_buffer_read_sync;
Bart Van Assche	1420c4a5	2022-07-14 11:07:13 -0700	[diff] [blame]	1276	submit_bh(REQ_OP_READ, bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1277	wait_on_buffer(bh);
				1278	if (buffer_uptodate(bh))
				1279	return bh;
				1280	}
				1281	brelse(bh);
				1282	return NULL;
				1283	}
				1284
				1285	/*
				1286	* Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
				1287	* The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
				1288	* refcount elevated by one when they're in an LRU. A buffer can only appear
				1289	* once in a particular CPU's LRU. A single buffer can be present in multiple
				1290	* CPU's LRUs at the same time.
				1291	*
				1292	* This is a transparent caching front-end to sb_bread(), sb_getblk() and
				1293	* sb_find_get_block().
				1294	*
				1295	* The LRUs themselves only need locking against invalidate_bh_lrus. We use
				1296	* a local interrupt disable for that.
				1297	*/
				1298
Sebastien Buisson	86cf78d	2014-10-09 15:29:38 -0700	[diff] [blame]	1299	#define BH_LRU_SIZE 16
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1300
				1301	struct bh_lru {
				1302	struct buffer_head *bhs[BH_LRU_SIZE];
				1303	};
				1304
				1305	static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
				1306
				1307	#ifdef CONFIG_SMP
				1308	#define bh_lru_lock() local_irq_disable()
				1309	#define bh_lru_unlock() local_irq_enable()
				1310	#else
				1311	#define bh_lru_lock() preempt_disable()
				1312	#define bh_lru_unlock() preempt_enable()
				1313	#endif
				1314
				1315	static inline void check_irqs_on(void)
				1316	{
				1317	#ifdef irqs_disabled
				1318	BUG_ON(irqs_disabled());
				1319	#endif
				1320	}
				1321
				1322	/*
Eric Biggers	241f01f	2017-07-10 15:47:29 -0700	[diff] [blame]	1323	* Install a buffer_head into this cpu's LRU. If not already in the LRU, it is
				1324	* inserted at the front, and the buffer_head at the back if any is evicted.
				1325	* Or, if already in the LRU it is moved to the front.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1326	*/
				1327	static void bh_lru_install(struct buffer_head *bh)
				1328	{
Eric Biggers	241f01f	2017-07-10 15:47:29 -0700	[diff] [blame]	1329	struct buffer_head *evictee = bh;
				1330	struct bh_lru *b;
				1331	int i;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1332
				1333	check_irqs_on();
Minchan Kim	c0226eb	2022-03-22 14:39:34 -0700	[diff] [blame]	1334	bh_lru_lock();
				1335
Minchan Kim	8cc621d	2021-05-04 18:37:00 -0700	[diff] [blame]	1336	/*
				1337	* the refcount of buffer_head in bh_lru prevents dropping the
				1338	* attached page(i.e., try_to_free_buffers) so it could cause
				1339	* failing page migration.
				1340	* Skip putting upcoming bh into bh_lru until migration is done.
				1341	*/
Marcelo Tosatti	8a237ad	2023-06-27 17:08:15 -0300	[diff] [blame]	1342	if (lru_cache_disabled() \|\| cpu_is_isolated(smp_processor_id())) {
Minchan Kim	c0226eb	2022-03-22 14:39:34 -0700	[diff] [blame]	1343	bh_lru_unlock();
Minchan Kim	8cc621d	2021-05-04 18:37:00 -0700	[diff] [blame]	1344	return;
Minchan Kim	c0226eb	2022-03-22 14:39:34 -0700	[diff] [blame]	1345	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1346
Eric Biggers	241f01f	2017-07-10 15:47:29 -0700	[diff] [blame]	1347	b = this_cpu_ptr(&bh_lrus);
				1348	for (i = 0; i < BH_LRU_SIZE; i++) {
				1349	swap(evictee, b->bhs[i]);
				1350	if (evictee == bh) {
				1351	bh_lru_unlock();
				1352	return;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1353	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1354	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1355
Eric Biggers	241f01f	2017-07-10 15:47:29 -0700	[diff] [blame]	1356	get_bh(bh);
				1357	bh_lru_unlock();
				1358	brelse(evictee);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1359	}
				1360
				1361	/*
				1362	* Look up the bh in this cpu's LRU. If it's there, move it to the head.
				1363	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1364	static struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1365	lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1366	{
				1367	struct buffer_head *ret = NULL;
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1368	unsigned int i;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1369
				1370	check_irqs_on();
				1371	bh_lru_lock();
Marcelo Tosatti	8a237ad	2023-06-27 17:08:15 -0300	[diff] [blame]	1372	if (cpu_is_isolated(smp_processor_id())) {
				1373	bh_lru_unlock();
				1374	return NULL;
				1375	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1376	for (i = 0; i < BH_LRU_SIZE; i++) {
Christoph Lameter	c7b9251	2010-12-06 11:16:28 -0600	[diff] [blame]	1377	struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1378
Zach Brown	9470dd5	2014-10-13 15:55:05 -0700	[diff] [blame]	1379	if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
				1380	bh->b_size == size) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1381	if (i) {
				1382	while (i) {
Christoph Lameter	c7b9251	2010-12-06 11:16:28 -0600	[diff] [blame]	1383	__this_cpu_write(bh_lrus.bhs[i],
				1384	__this_cpu_read(bh_lrus.bhs[i - 1]));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1385	i--;
				1386	}
Christoph Lameter	c7b9251	2010-12-06 11:16:28 -0600	[diff] [blame]	1387	__this_cpu_write(bh_lrus.bhs[0], bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1388	}
				1389	get_bh(bh);
				1390	ret = bh;
				1391	break;
				1392	}
				1393	}
				1394	bh_lru_unlock();
				1395	return ret;
				1396	}
				1397
				1398	/*
				1399	* Perform a pagecache lookup for the matching buffer. If it's there, refresh
				1400	* it in the LRU and mark it as accessed. If it is not present then return
				1401	* NULL
				1402	*/
				1403	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1404	__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1405	{
				1406	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
				1407
				1408	if (bh == NULL) {
Mel Gorman	2457aec	2014-06-04 16:10:31 -0700	[diff] [blame]	1409	/* __find_get_block_slow will mark the page accessed */
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1410	bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1411	if (bh)
				1412	bh_lru_install(bh);
Mel Gorman	2457aec	2014-06-04 16:10:31 -0700	[diff] [blame]	1413	} else
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1414	touch_buffer(bh);
Mel Gorman	2457aec	2014-06-04 16:10:31 -0700	[diff] [blame]	1415
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1416	return bh;
				1417	}
				1418	EXPORT_SYMBOL(__find_get_block);
				1419
Matthew Wilcox (Oracle)	3ed65f0	2023-09-14 16:00:05 +0100	[diff] [blame]	1420	/**
				1421	* bdev_getblk - Get a buffer_head in a block device's buffer cache.
				1422	* @bdev: The block device.
				1423	* @block: The block number.
				1424	* @size: The size of buffer_heads for this @bdev.
				1425	* @gfp: The memory allocation flags to use.
				1426	*
Matthew Wilcox (Oracle)	0b116ff	2024-04-16 04:17:51 +0100	[diff] [blame]	1427	* The returned buffer head has its reference count incremented, but is
				1428	* not locked. The caller should call brelse() when it has finished
				1429	* with the buffer. The buffer may not be uptodate. If needed, the
				1430	* caller can bring it uptodate either by reading it or overwriting it.
				1431	*
Matthew Wilcox (Oracle)	3ed65f0	2023-09-14 16:00:05 +0100	[diff] [blame]	1432	* Return: The buffer head, or NULL if memory could not be allocated.
				1433	*/
				1434	struct buffer_head bdev_getblk(struct block_device bdev, sector_t block,
				1435	unsigned size, gfp_t gfp)
				1436	{
				1437	struct buffer_head *bh = __find_get_block(bdev, block, size);
				1438
				1439	might_alloc(gfp);
				1440	if (bh)
				1441	return bh;
				1442
				1443	return __getblk_slow(bdev, block, size, gfp);
				1444	}
				1445	EXPORT_SYMBOL(bdev_getblk);
				1446
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1447	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1448	* Do async read-ahead on a buffer..
				1449	*/
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1450	void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1451	{
Matthew Wilcox (Oracle)	775d9b1	2023-09-14 16:00:07 +0100	[diff] [blame]	1452	struct buffer_head *bh = bdev_getblk(bdev, block, size,
				1453	GFP_NOWAIT \| __GFP_MOVABLE);
				1454
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1455	if (likely(bh)) {
Zhang Yi	e7ea112	2022-09-01 21:34:54 +0800	[diff] [blame]	1456	bh_readahead(bh, REQ_RAHEAD);
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1457	brelse(bh);
				1458	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1459	}
				1460	EXPORT_SYMBOL(__breadahead);
				1461
				1462	/**
Matthew Wilcox (Oracle)	324ecae	2024-04-16 04:17:48 +0100	[diff] [blame]	1463	* __bread_gfp() - Read a block.
				1464	* @bdev: The block device to read from.
				1465	* @block: Block number in units of block size.
				1466	* @size: The block size of this device in bytes.
				1467	* @gfp: Not page allocation flags; see below.
Gioh Kim	3b5e645	2014-09-04 22:04:42 -0400	[diff] [blame]	1468	*
Matthew Wilcox (Oracle)	324ecae	2024-04-16 04:17:48 +0100	[diff] [blame]	1469	* You are not expected to call this function. You should use one of
				1470	* sb_bread(), sb_bread_unmovable() or __bread().
				1471	*
				1472	* Read a specified block, and return the buffer head that refers to it.
				1473	* If @gfp is 0, the memory will be allocated using the block device's
				1474	* default GFP flags. If @gfp is __GFP_MOVABLE, the memory may be
				1475	* allocated from a movable area. Do not pass in a complete set of
				1476	* GFP flags.
				1477	*
				1478	* The returned buffer head has its refcount increased. The caller should
				1479	* call brelse() when it has finished with the buffer.
				1480	*
				1481	* Context: May sleep waiting for I/O.
				1482	* Return: NULL if the block was unreadable.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1483	*/
Matthew Wilcox (Oracle)	324ecae	2024-04-16 04:17:48 +0100	[diff] [blame]	1484	struct buffer_head __bread_gfp(struct block_device bdev, sector_t block,
				1485	unsigned size, gfp_t gfp)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1486	{
Matthew Wilcox (Oracle)	93b13ec	2023-09-14 16:00:11 +0100	[diff] [blame]	1487	struct buffer_head *bh;
				1488
Al Viro	224941e	2024-04-11 15:53:37 +0100	[diff] [blame]	1489	gfp \|= mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS);
Matthew Wilcox (Oracle)	93b13ec	2023-09-14 16:00:11 +0100	[diff] [blame]	1490
				1491	/*
				1492	* Prefer looping in the allocator rather than here, at least that
				1493	* code knows what it's doing.
				1494	*/
				1495	gfp \|= __GFP_NOFAIL;
				1496
				1497	bh = bdev_getblk(bdev, block, size, gfp);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1498
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1499	if (likely(bh) && !buffer_uptodate(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1500	bh = __bread_slow(bh);
				1501	return bh;
				1502	}
Gioh Kim	3b5e645	2014-09-04 22:04:42 -0400	[diff] [blame]	1503	EXPORT_SYMBOL(__bread_gfp);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1504
Minchan Kim	8cc621d	2021-05-04 18:37:00 -0700	[diff] [blame]	1505	static void __invalidate_bh_lrus(struct bh_lru *b)
				1506	{
				1507	int i;
				1508
				1509	for (i = 0; i < BH_LRU_SIZE; i++) {
				1510	brelse(b->bhs[i]);
				1511	b->bhs[i] = NULL;
				1512	}
				1513	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1514	/*
				1515	* invalidate_bh_lrus() is called rarely - but not only at unmount.
				1516	* This doesn't race because it runs in each cpu either in irq
				1517	* or with preempt disabled.
				1518	*/
				1519	static void invalidate_bh_lru(void *arg)
				1520	{
				1521	struct bh_lru *b = &get_cpu_var(bh_lrus);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1522
Minchan Kim	8cc621d	2021-05-04 18:37:00 -0700	[diff] [blame]	1523	__invalidate_bh_lrus(b);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1524	put_cpu_var(bh_lrus);
				1525	}
Gilad Ben-Yossef	42be35d	2012-03-28 14:42:45 -0700	[diff] [blame]	1526
Minchan Kim	8cc621d	2021-05-04 18:37:00 -0700	[diff] [blame]	1527	bool has_bh_in_lru(int cpu, void *dummy)
Gilad Ben-Yossef	42be35d	2012-03-28 14:42:45 -0700	[diff] [blame]	1528	{
				1529	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
				1530	int i;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1531
Gilad Ben-Yossef	42be35d	2012-03-28 14:42:45 -0700	[diff] [blame]	1532	for (i = 0; i < BH_LRU_SIZE; i++) {
				1533	if (b->bhs[i])
Saurav Girepunje	1d70667	2019-11-30 17:49:15 -0800	[diff] [blame]	1534	return true;
Gilad Ben-Yossef	42be35d	2012-03-28 14:42:45 -0700	[diff] [blame]	1535	}
				1536
Saurav Girepunje	1d70667	2019-11-30 17:49:15 -0800	[diff] [blame]	1537	return false;
Gilad Ben-Yossef	42be35d	2012-03-28 14:42:45 -0700	[diff] [blame]	1538	}
				1539
Peter Zijlstra	f9a1439	2007-05-06 14:49:55 -0700	[diff] [blame]	1540	void invalidate_bh_lrus(void)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1541	{
Sebastian Andrzej Siewior	cb92315	2020-01-17 10:01:37 +0100	[diff] [blame]	1542	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1543	}
Nick Piggin	9db5579	2008-02-08 04:19:49 -0800	[diff] [blame]	1544	EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1545
Minchan Kim	243418e	2021-09-24 15:43:47 -0700	[diff] [blame]	1546	/*
				1547	* It's called from workqueue context so we need a bh_lru_lock to close
				1548	* the race with preemption/irq.
				1549	*/
				1550	void invalidate_bh_lrus_cpu(void)
Minchan Kim	8cc621d	2021-05-04 18:37:00 -0700	[diff] [blame]	1551	{
				1552	struct bh_lru *b;
				1553
				1554	bh_lru_lock();
Minchan Kim	243418e	2021-09-24 15:43:47 -0700	[diff] [blame]	1555	b = this_cpu_ptr(&bh_lrus);
Minchan Kim	8cc621d	2021-05-04 18:37:00 -0700	[diff] [blame]	1556	__invalidate_bh_lrus(b);
				1557	bh_lru_unlock();
				1558	}
				1559
Pankaj Raghav	465e5e6	2023-04-17 14:36:15 +0200	[diff] [blame]	1560	void folio_set_bh(struct buffer_head bh, struct folio folio,
				1561	unsigned long offset)
				1562	{
				1563	bh->b_folio = folio;
				1564	BUG_ON(offset >= folio_size(folio));
				1565	if (folio_test_highmem(folio))
				1566	/*
				1567	* This catches illegal uses and preserves the offset:
				1568	*/
				1569	bh->b_data = (char *)(0 + offset);
				1570	else
				1571	bh->b_data = folio_address(folio) + offset;
				1572	}
				1573	EXPORT_SYMBOL(folio_set_bh);
				1574
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1575	/*
				1576	* Called when truncating a buffer on a page completely.
				1577	*/
Mel Gorman	e7470ee	2014-06-04 16:10:29 -0700	[diff] [blame]	1578
				1579	/* Bits that are cleared during an invalidate */
				1580	#define BUFFER_FLAGS_DISCARD \
				1581	(1 << BH_Mapped \| 1 << BH_New \| 1 << BH_Req \| \
				1582	1 << BH_Delay \| 1 << BH_Unwritten)
				1583
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1584	static void discard_buffer(struct buffer_head * bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1585	{
Uros Bizjak	b019229	2022-07-14 19:16:53 +0200	[diff] [blame]	1586	unsigned long b_state;
Mel Gorman	e7470ee	2014-06-04 16:10:29 -0700	[diff] [blame]	1587
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1588	lock_buffer(bh);
				1589	clear_buffer_dirty(bh);
				1590	bh->b_bdev = NULL;
Uros Bizjak	b019229	2022-07-14 19:16:53 +0200	[diff] [blame]	1591	b_state = READ_ONCE(bh->b_state);
				1592	do {
				1593	} while (!try_cmpxchg(&bh->b_state, &b_state,
				1594	b_state & ~BUFFER_FLAGS_DISCARD));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1595	unlock_buffer(bh);
				1596	}
				1597
				1598	/**
Matthew Wilcox (Oracle)	7ba13ab	2022-02-09 20:21:34 +0000	[diff] [blame]	1599	* block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
				1600	* @folio: The folio which is affected.
Lukas Czerner	d47992f	2013-05-21 23:17:23 -0400	[diff] [blame]	1601	* @offset: start of the range to invalidate
				1602	* @length: length of the range to invalidate
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1603	*
Matthew Wilcox (Oracle)	7ba13ab	2022-02-09 20:21:34 +0000	[diff] [blame]	1604	* block_invalidate_folio() is called when all or part of the folio has been
Wang Sheng-Hui	814e1d2	2011-09-01 08:22:57 +0800	[diff] [blame]	1605	* invalidated by a truncate operation.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1606	*
Matthew Wilcox (Oracle)	7ba13ab	2022-02-09 20:21:34 +0000	[diff] [blame]	1607	* block_invalidate_folio() does not have to release all buffers, but it must
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1608	* ensure that no dirty buffer is left outside @offset and that no I/O
				1609	* is underway against any of the blocks which are outside the truncation
				1610	* point. Because the caller is about to free (and possibly reuse) those
				1611	* blocks on-disk.
				1612	*/
Matthew Wilcox (Oracle)	7ba13ab	2022-02-09 20:21:34 +0000	[diff] [blame]	1613	void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1614	{
				1615	struct buffer_head head, bh, *next;
Matthew Wilcox (Oracle)	7ba13ab	2022-02-09 20:21:34 +0000	[diff] [blame]	1616	size_t curr_off = 0;
				1617	size_t stop = length + offset;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1618
Matthew Wilcox (Oracle)	7ba13ab	2022-02-09 20:21:34 +0000	[diff] [blame]	1619	BUG_ON(!folio_test_locked(folio));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1620
Lukas Czerner	d47992f	2013-05-21 23:17:23 -0400	[diff] [blame]	1621	/*
				1622	* Check for overflow
				1623	*/
Matthew Wilcox (Oracle)	7ba13ab	2022-02-09 20:21:34 +0000	[diff] [blame]	1624	BUG_ON(stop > folio_size(folio) \|\| stop < length);
Lukas Czerner	d47992f	2013-05-21 23:17:23 -0400	[diff] [blame]	1625
Matthew Wilcox (Oracle)	7ba13ab	2022-02-09 20:21:34 +0000	[diff] [blame]	1626	head = folio_buffers(folio);
				1627	if (!head)
				1628	return;
				1629
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1630	bh = head;
				1631	do {
Matthew Wilcox (Oracle)	7ba13ab	2022-02-09 20:21:34 +0000	[diff] [blame]	1632	size_t next_off = curr_off + bh->b_size;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1633	next = bh->b_this_page;
				1634
				1635	/*
Lukas Czerner	d47992f	2013-05-21 23:17:23 -0400	[diff] [blame]	1636	* Are we still fully in range ?
				1637	*/
				1638	if (next_off > stop)
				1639	goto out;
				1640
				1641	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1642	* is this block fully invalidated?
				1643	*/
				1644	if (offset <= curr_off)
				1645	discard_buffer(bh);
				1646	curr_off = next_off;
				1647	bh = next;
				1648	} while (bh != head);
				1649
				1650	/*
Matthew Wilcox (Oracle)	7ba13ab	2022-02-09 20:21:34 +0000	[diff] [blame]	1651	* We release buffers only if the entire folio is being invalidated.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1652	* The get_block cached value has been unconditionally invalidated,
				1653	* so real IO is not possible anymore.
				1654	*/
Matthew Wilcox (Oracle)	7ba13ab	2022-02-09 20:21:34 +0000	[diff] [blame]	1655	if (length == folio_size(folio))
				1656	filemap_release_folio(folio, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1657	out:
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1658	return;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1659	}
Matthew Wilcox (Oracle)	7ba13ab	2022-02-09 20:21:34 +0000	[diff] [blame]	1660	EXPORT_SYMBOL(block_invalidate_folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1661
				1662	/*
				1663	* We attach and possibly dirty the buffers atomically wrt
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	1664	* block_dirty_folio() via i_private_lock. try_to_free_buffers
Pankaj Raghav	8e2e175	2023-04-17 14:36:17 +0200	[diff] [blame]	1665	* is already excluded via the folio lock.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1666	*/
Matthew Wilcox (Oracle)	0a88810	2023-10-16 21:11:14 +0100	[diff] [blame]	1667	struct buffer_head create_empty_buffers(struct folio folio,
Matthew Wilcox (Oracle)	3decb85	2023-10-16 21:10:49 +0100	[diff] [blame]	1668	unsigned long blocksize, unsigned long b_state)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1669	{
				1670	struct buffer_head bh, head, *tail;
Matthew Wilcox (Oracle)	2a41815	2023-09-14 16:00:04 +0100	[diff] [blame]	1671	gfp_t gfp = GFP_NOFS \| __GFP_ACCOUNT \| __GFP_NOFAIL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1672
Matthew Wilcox (Oracle)	2a41815	2023-09-14 16:00:04 +0100	[diff] [blame]	1673	head = folio_alloc_buffers(folio, blocksize, gfp);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1674	bh = head;
				1675	do {
				1676	bh->b_state \|= b_state;
				1677	tail = bh;
				1678	bh = bh->b_this_page;
				1679	} while (bh);
				1680	tail->b_this_page = head;
				1681
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	1682	spin_lock(&folio->mapping->i_private_lock);
Pankaj Raghav	8e2e175	2023-04-17 14:36:17 +0200	[diff] [blame]	1683	if (folio_test_uptodate(folio) \|\| folio_test_dirty(folio)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1684	bh = head;
				1685	do {
Pankaj Raghav	8e2e175	2023-04-17 14:36:17 +0200	[diff] [blame]	1686	if (folio_test_dirty(folio))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1687	set_buffer_dirty(bh);
Pankaj Raghav	8e2e175	2023-04-17 14:36:17 +0200	[diff] [blame]	1688	if (folio_test_uptodate(folio))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1689	set_buffer_uptodate(bh);
				1690	bh = bh->b_this_page;
				1691	} while (bh != head);
				1692	}
Pankaj Raghav	8e2e175	2023-04-17 14:36:17 +0200	[diff] [blame]	1693	folio_attach_private(folio, head);
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	1694	spin_unlock(&folio->mapping->i_private_lock);
Matthew Wilcox (Oracle)	3decb85	2023-10-16 21:10:49 +0100	[diff] [blame]	1695
				1696	return head;
Pankaj Raghav	8e2e175	2023-04-17 14:36:17 +0200	[diff] [blame]	1697	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1698	EXPORT_SYMBOL(create_empty_buffers);
				1699
Jan Kara	29f3ad7	2016-11-04 18:08:11 +0100	[diff] [blame]	1700	/**
				1701	* clean_bdev_aliases: clean a range of buffers in block device
				1702	* @bdev: Block device to clean buffers in
				1703	* @block: Start of a range of blocks to clean
				1704	* @len: Number of blocks to clean
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1705	*
Jan Kara	29f3ad7	2016-11-04 18:08:11 +0100	[diff] [blame]	1706	* We are taking a range of blocks for data and we don't want writeback of any
				1707	* buffer-cache aliases starting from return from this function and until the
				1708	* moment when something will explicitly mark the buffer dirty (hopefully that
				1709	* will not happen until we will free that block ;-) We don't even need to mark
				1710	* it not-uptodate - nobody can expect anything from a newly allocated buffer
				1711	* anyway. We used to use unmap_buffer() for such invalidation, but that was
				1712	* wrong. We definitely don't want to mark the alias unmapped, for example - it
				1713	* would confuse anyone who might pick it with bread() afterwards...
				1714	*
				1715	* Also.. Note that bforget() doesn't lock the buffer. So there can be
				1716	* writeout I/O going on against recently-freed buffers. We don't wait on that
				1717	* I/O in bforget() - it's more efficient to wait on the I/O only if we really
				1718	* need to. That happens here.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1719	*/
Jan Kara	29f3ad7	2016-11-04 18:08:11 +0100	[diff] [blame]	1720	void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1721	{
Al Viro	53cd4cd	2024-04-28 19:41:13 -0400	[diff] [blame]	1722	struct address_space *bd_mapping = bdev->bd_mapping;
				1723	const int blkbits = bd_mapping->host->i_blkbits;
Matthew Wilcox (Oracle)	9e0b6f3	2022-06-04 16:24:22 -0400	[diff] [blame]	1724	struct folio_batch fbatch;
Al Viro	53cd4cd	2024-04-28 19:41:13 -0400	[diff] [blame]	1725	pgoff_t index = ((loff_t)block << blkbits) / PAGE_SIZE;
Jan Kara	29f3ad7	2016-11-04 18:08:11 +0100	[diff] [blame]	1726	pgoff_t end;
Jan Kara	c10f778	2017-09-06 16:21:24 -0700	[diff] [blame]	1727	int i, count;
Jan Kara	29f3ad7	2016-11-04 18:08:11 +0100	[diff] [blame]	1728	struct buffer_head *bh;
				1729	struct buffer_head *head;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1730
Al Viro	53cd4cd	2024-04-28 19:41:13 -0400	[diff] [blame]	1731	end = ((loff_t)(block + len - 1) << blkbits) / PAGE_SIZE;
Matthew Wilcox (Oracle)	9e0b6f3	2022-06-04 16:24:22 -0400	[diff] [blame]	1732	folio_batch_init(&fbatch);
				1733	while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
				1734	count = folio_batch_count(&fbatch);
Jan Kara	c10f778	2017-09-06 16:21:24 -0700	[diff] [blame]	1735	for (i = 0; i < count; i++) {
Matthew Wilcox (Oracle)	9e0b6f3	2022-06-04 16:24:22 -0400	[diff] [blame]	1736	struct folio *folio = fbatch.folios[i];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1737
Matthew Wilcox (Oracle)	9e0b6f3	2022-06-04 16:24:22 -0400	[diff] [blame]	1738	if (!folio_buffers(folio))
Jan Kara	29f3ad7	2016-11-04 18:08:11 +0100	[diff] [blame]	1739	continue;
				1740	/*
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	1741	* We use folio lock instead of bd_mapping->i_private_lock
Jan Kara	29f3ad7	2016-11-04 18:08:11 +0100	[diff] [blame]	1742	* to pin buffers here since we can afford to sleep and
				1743	* it scales better than a global spinlock lock.
				1744	*/
Matthew Wilcox (Oracle)	9e0b6f3	2022-06-04 16:24:22 -0400	[diff] [blame]	1745	folio_lock(folio);
				1746	/* Recheck when the folio is locked which pins bhs */
				1747	head = folio_buffers(folio);
				1748	if (!head)
Jan Kara	29f3ad7	2016-11-04 18:08:11 +0100	[diff] [blame]	1749	goto unlock_page;
Jan Kara	29f3ad7	2016-11-04 18:08:11 +0100	[diff] [blame]	1750	bh = head;
				1751	do {
Chandan Rajendra	6c006a9	2016-12-25 19:01:03 +0530	[diff] [blame]	1752	if (!buffer_mapped(bh) \|\| (bh->b_blocknr < block))
Jan Kara	29f3ad7	2016-11-04 18:08:11 +0100	[diff] [blame]	1753	goto next;
				1754	if (bh->b_blocknr >= block + len)
				1755	break;
				1756	clear_buffer_dirty(bh);
				1757	wait_on_buffer(bh);
				1758	clear_buffer_req(bh);
				1759	next:
				1760	bh = bh->b_this_page;
				1761	} while (bh != head);
				1762	unlock_page:
Matthew Wilcox (Oracle)	9e0b6f3	2022-06-04 16:24:22 -0400	[diff] [blame]	1763	folio_unlock(folio);
Jan Kara	29f3ad7	2016-11-04 18:08:11 +0100	[diff] [blame]	1764	}
Matthew Wilcox (Oracle)	9e0b6f3	2022-06-04 16:24:22 -0400	[diff] [blame]	1765	folio_batch_release(&fbatch);
Jan Kara	29f3ad7	2016-11-04 18:08:11 +0100	[diff] [blame]	1766	cond_resched();
Jan Kara	c10f778	2017-09-06 16:21:24 -0700	[diff] [blame]	1767	/* End of range already reached? */
				1768	if (index > end \|\| !index)
				1769	break;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1770	}
				1771	}
Jan Kara	29f3ad7	2016-11-04 18:08:11 +0100	[diff] [blame]	1772	EXPORT_SYMBOL(clean_bdev_aliases);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1773
Pankaj Raghav	c6c8c3e	2023-04-17 14:36:18 +0200	[diff] [blame]	1774	static struct buffer_head folio_create_buffers(struct folio folio,
				1775	struct inode *inode,
				1776	unsigned int b_state)
Linus Torvalds	45bce8f	2012-11-29 10:21:43 -0800	[diff] [blame]	1777	{
Matthew Wilcox (Oracle)	3decb85	2023-10-16 21:10:49 +0100	[diff] [blame]	1778	struct buffer_head *bh;
				1779
Pankaj Raghav	c6c8c3e	2023-04-17 14:36:18 +0200	[diff] [blame]	1780	BUG_ON(!folio_test_locked(folio));
Linus Torvalds	45bce8f	2012-11-29 10:21:43 -0800	[diff] [blame]	1781
Matthew Wilcox (Oracle)	3decb85	2023-10-16 21:10:49 +0100	[diff] [blame]	1782	bh = folio_buffers(folio);
				1783	if (!bh)
Matthew Wilcox (Oracle)	0a88810	2023-10-16 21:11:14 +0100	[diff] [blame]	1784	bh = create_empty_buffers(folio,
Matthew Wilcox (Oracle)	3decb85	2023-10-16 21:10:49 +0100	[diff] [blame]	1785	1 << READ_ONCE(inode->i_blkbits), b_state);
				1786	return bh;
Linus Torvalds	45bce8f	2012-11-29 10:21:43 -0800	[diff] [blame]	1787	}
				1788
/*
 * NOTE! All mapped/uptodate combinations are valid:
 *
 *	Mapped	Uptodate	Meaning
 *
 *	No	No		"unknown" - must do get_block()
 *	No	Yes		"hole" - zero-filled
 *	Yes	No		"allocated" - allocated on disk, not read in
 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
 *
 * "Dirty" is valid only with the last case (mapped+uptodate).
 */
				1801
				1802	/*
Matthew Wilcox (Oracle)	17bf23a	2023-12-15 20:02:44 +0000	[diff] [blame]	1803	* While block_write_full_folio is writing back the dirty buffers under
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1804	* the page lock, whoever dirtied the buffers may decide to clean them
				1805	* again at any time. We handle that by only looking at the buffer
				1806	* state inside lock_buffer().
				1807	*
Matthew Wilcox (Oracle)	17bf23a	2023-12-15 20:02:44 +0000	[diff] [blame]	1808	* If block_write_full_folio() is called for regular writeback
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1809	* (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
				1810	* locked buffer. This only can happen if someone has written the buffer
				1811	* directly, with submit_bh(). At the address_space level PageWriteback
				1812	* prevents this contention from occurring.
Theodore Ts'o	6e34eedd	2009-04-07 18:12:43 -0400	[diff] [blame]	1813	*
Matthew Wilcox (Oracle)	17bf23a	2023-12-15 20:02:44 +0000	[diff] [blame]	1814	* If block_write_full_folio() is called with wbc->sync_mode ==
Christoph Hellwig	70fd761	2016-11-01 07:40:10 -0600	[diff] [blame]	1815	* WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
Jens Axboe	721a960	2011-03-09 11:56:30 +0100	[diff] [blame]	1816	* causes the writes to be flagged as synchronous writes.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1817	*/
Matthew Wilcox (Oracle)	53418a1	2023-06-12 22:01:31 +0100	[diff] [blame]	1818	int __block_write_full_folio(struct inode inode, struct folio folio,
Matthew Wilcox (Oracle)	14059f6	2023-12-15 20:02:45 +0000	[diff] [blame]	1819	get_block_t get_block, struct writeback_control wbc)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1820	{
				1821	int err;
				1822	sector_t block;
				1823	sector_t last_block;
Andrew Morton	f0fbd5f	2005-05-05 16:15:48 -0700	[diff] [blame]	1824	struct buffer_head bh, head;
Matthew Wilcox (Oracle)	fa399c3	2023-11-09 21:06:08 +0000	[diff] [blame]	1825	size_t blocksize;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1826	int nr_underway = 0;
Bart Van Assche	3ae7286	2022-07-14 11:07:12 -0700	[diff] [blame]	1827	blk_opf_t write_flags = wbc_to_write_flags(wbc);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1828
Matthew Wilcox (Oracle)	53418a1	2023-06-12 22:01:31 +0100	[diff] [blame]	1829	head = folio_create_buffers(folio, inode,
Pankaj Raghav	c6c8c3e	2023-04-17 14:36:18 +0200	[diff] [blame]	1830	(1 << BH_Dirty) \| (1 << BH_Uptodate));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1831
				1832	/*
Matthew Wilcox (Oracle)	e621900	2022-02-09 20:22:12 +0000	[diff] [blame]	1833	* Be very careful. We have no exclusion from block_dirty_folio
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1834	* here, and the (potentially unmapped) buffers may become dirty at
				1835	* any time. If a buffer becomes dirty here after we've inspected it
Matthew Wilcox (Oracle)	53418a1	2023-06-12 22:01:31 +0100	[diff] [blame]	1836	* then we just miss that fact, and the folio stays dirty.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1837	*
Matthew Wilcox (Oracle)	e621900	2022-02-09 20:22:12 +0000	[diff] [blame]	1838	* Buffers outside i_size may be dirtied by block_dirty_folio;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1839	* handle that here by just cleaning them.
				1840	*/
				1841
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1842	bh = head;
Linus Torvalds	45bce8f	2012-11-29 10:21:43 -0800	[diff] [blame]	1843	blocksize = bh->b_size;
Linus Torvalds	45bce8f	2012-11-29 10:21:43 -0800	[diff] [blame]	1844
Matthew Wilcox (Oracle)	fa399c3	2023-11-09 21:06:08 +0000	[diff] [blame]	1845	block = div_u64(folio_pos(folio), blocksize);
				1846	last_block = div_u64(i_size_read(inode) - 1, blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1847
				1848	/*
				1849	* Get all the dirty buffers mapped to disk addresses and
				1850	* handle any aliases from the underlying blockdev's mapping.
				1851	*/
				1852	do {
				1853	if (block > last_block) {
				1854	/*
				1855	* mapped buffers outside i_size will occur, because
Matthew Wilcox (Oracle)	53418a1	2023-06-12 22:01:31 +0100	[diff] [blame]	1856	* this folio can be outside i_size when there is a
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1857	* truncate in progress.
				1858	*/
				1859	/*
Matthew Wilcox (Oracle)	17bf23a	2023-12-15 20:02:44 +0000	[diff] [blame]	1860	* The buffer was zeroed by block_write_full_folio()
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1861	*/
				1862	clear_buffer_dirty(bh);
				1863	set_buffer_uptodate(bh);
Alex Tomas	29a814d	2008-07-11 19:27:31 -0400	[diff] [blame]	1864	} else if ((!buffer_mapped(bh) \|\| buffer_delay(bh)) &&
				1865	buffer_dirty(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1866	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1867	err = get_block(inode, block, bh, 1);
				1868	if (err)
				1869	goto recover;
Alex Tomas	29a814d	2008-07-11 19:27:31 -0400	[diff] [blame]	1870	clear_buffer_delay(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1871	if (buffer_new(bh)) {
				1872	/* blockdev mappings never come here */
				1873	clear_buffer_new(bh);
Jan Kara	e64855c	2016-11-04 18:08:15 +0100	[diff] [blame]	1874	clean_bdev_bh_alias(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1875	}
				1876	}
				1877	bh = bh->b_this_page;
				1878	block++;
				1879	} while (bh != head);
				1880
				1881	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1882	if (!buffer_mapped(bh))
				1883	continue;
				1884	/*
				1885	* If it's a fully non-blocking write attempt and we cannot
Matthew Wilcox (Oracle)	53418a1	2023-06-12 22:01:31 +0100	[diff] [blame]	1886	* lock the buffer then redirty the folio. Note that this can
Jens Axboe	5b0830c	2009-09-23 19:37:09 +0200	[diff] [blame]	1887	* potentially cause a busy-wait loop from writeback threads
				1888	* and kswapd activity, but those code paths have their own
				1889	* higher-level throttling.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1890	*/
Wu Fengguang	1b430be	2010-10-26 14:21:26 -0700	[diff] [blame]	1891	if (wbc->sync_mode != WB_SYNC_NONE) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1892	lock_buffer(bh);
Nick Piggin	ca5de40	2008-08-02 12:02:13 +0200	[diff] [blame]	1893	} else if (!trylock_buffer(bh)) {
Matthew Wilcox (Oracle)	53418a1	2023-06-12 22:01:31 +0100	[diff] [blame]	1894	folio_redirty_for_writepage(wbc, folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1895	continue;
				1896	}
				1897	if (test_clear_buffer_dirty(bh)) {
Matthew Wilcox (Oracle)	14059f6	2023-12-15 20:02:45 +0000	[diff] [blame]	1898	mark_buffer_async_write_endio(bh,
				1899	end_buffer_async_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1900	} else {
				1901	unlock_buffer(bh);
				1902	}
				1903	} while ((bh = bh->b_this_page) != head);
				1904
				1905	/*
Matthew Wilcox (Oracle)	53418a1	2023-06-12 22:01:31 +0100	[diff] [blame]	1906	* The folio and its buffers are protected by the writeback flag,
				1907	* so we can drop the bh refcounts early.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1908	*/
Matthew Wilcox (Oracle)	53418a1	2023-06-12 22:01:31 +0100	[diff] [blame]	1909	BUG_ON(folio_test_writeback(folio));
				1910	folio_start_writeback(folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1911
				1912	do {
				1913	struct buffer_head *next = bh->b_this_page;
				1914	if (buffer_async_write(bh)) {
Bart Van Assche	4498135	2024-02-02 12:39:25 -0800	[diff] [blame]	1915	submit_bh_wbc(REQ_OP_WRITE \| write_flags, bh,
				1916	inode->i_write_hint, wbc);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1917	nr_underway++;
				1918	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1919	bh = next;
				1920	} while (bh != head);
Matthew Wilcox (Oracle)	53418a1	2023-06-12 22:01:31 +0100	[diff] [blame]	1921	folio_unlock(folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1922
				1923	err = 0;
				1924	done:
				1925	if (nr_underway == 0) {
				1926	/*
Matthew Wilcox (Oracle)	53418a1	2023-06-12 22:01:31 +0100	[diff] [blame]	1927	* The folio was marked dirty, but the buffers were
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1928	* clean. Someone wrote them back by hand with
Zhang Yi	79f5978	2022-09-01 21:35:03 +0800	[diff] [blame]	1929	* write_dirty_buffer/submit_bh. A rare case.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1930	*/
Matthew Wilcox (Oracle)	53418a1	2023-06-12 22:01:31 +0100	[diff] [blame]	1931	folio_end_writeback(folio);
Nick Piggin	3d67f2d	2007-05-06 14:49:05 -0700	[diff] [blame]	1932
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1933	/*
Matthew Wilcox (Oracle)	53418a1	2023-06-12 22:01:31 +0100	[diff] [blame]	1934	* The folio and buffer_heads can be released at any time from
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1935	* here on.
				1936	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1937	}
				1938	return err;
				1939
				1940	recover:
				1941	/*
				1942	* ENOSPC, or some other error. We may already have added some
				1943	* blocks to the file, so we need to write these out to avoid
				1944	* exposing stale data.
Matthew Wilcox (Oracle)	53418a1	2023-06-12 22:01:31 +0100	[diff] [blame]	1945	* The folio is currently locked and not marked for writeback
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1946	*/
				1947	bh = head;
				1948	/* Recovery: lock and submit the mapped buffers */
				1949	do {
Alex Tomas	29a814d	2008-07-11 19:27:31 -0400	[diff] [blame]	1950	if (buffer_mapped(bh) && buffer_dirty(bh) &&
				1951	!buffer_delay(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1952	lock_buffer(bh);
Matthew Wilcox (Oracle)	14059f6	2023-12-15 20:02:45 +0000	[diff] [blame]	1953	mark_buffer_async_write_endio(bh,
				1954	end_buffer_async_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1955	} else {
				1956	/*
				1957	* The buffer may have been set dirty during
Matthew Wilcox (Oracle)	53418a1	2023-06-12 22:01:31 +0100	[diff] [blame]	1958	* attachment to a dirty folio.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1959	*/
				1960	clear_buffer_dirty(bh);
				1961	}
				1962	} while ((bh = bh->b_this_page) != head);
Matthew Wilcox (Oracle)	53418a1	2023-06-12 22:01:31 +0100	[diff] [blame]	1963	folio_set_error(folio);
				1964	BUG_ON(folio_test_writeback(folio));
				1965	mapping_set_error(folio->mapping, err);
				1966	folio_start_writeback(folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1967	do {
				1968	struct buffer_head *next = bh->b_this_page;
				1969	if (buffer_async_write(bh)) {
				1970	clear_buffer_dirty(bh);
Bart Van Assche	4498135	2024-02-02 12:39:25 -0800	[diff] [blame]	1971	submit_bh_wbc(REQ_OP_WRITE \| write_flags, bh,
				1972	inode->i_write_hint, wbc);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1973	nr_underway++;
				1974	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1975	bh = next;
				1976	} while (bh != head);
Matthew Wilcox (Oracle)	53418a1	2023-06-12 22:01:31 +0100	[diff] [blame]	1977	folio_unlock(folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1978	goto done;
				1979	}
Matthew Wilcox (Oracle)	53418a1	2023-06-12 22:01:31 +0100	[diff] [blame]	1980	EXPORT_SYMBOL(__block_write_full_folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1981
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1982	/*
Matthew Wilcox (Oracle)	4a9622f	2023-06-12 22:01:36 +0100	[diff] [blame]	1983	* If a folio has any new buffers, zero them out here, and mark them uptodate
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1984	* and dirty so they'll be written out (in order to prevent uninitialised
				1985	* block data from leaking). And clear the new bit.
				1986	*/
Matthew Wilcox (Oracle)	4a9622f	2023-06-12 22:01:36 +0100	[diff] [blame]	1987	void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to)
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1988	{
Matthew Wilcox (Oracle)	4a9622f	2023-06-12 22:01:36 +0100	[diff] [blame]	1989	size_t block_start, block_end;
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1990	struct buffer_head head, bh;
				1991
Matthew Wilcox (Oracle)	4a9622f	2023-06-12 22:01:36 +0100	[diff] [blame]	1992	BUG_ON(!folio_test_locked(folio));
				1993	head = folio_buffers(folio);
				1994	if (!head)
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1995	return;
				1996
Matthew Wilcox (Oracle)	4a9622f	2023-06-12 22:01:36 +0100	[diff] [blame]	1997	bh = head;
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1998	block_start = 0;
				1999	do {
				2000	block_end = block_start + bh->b_size;
				2001
				2002	if (buffer_new(bh)) {
				2003	if (block_end > from && block_start < to) {
Matthew Wilcox (Oracle)	4a9622f	2023-06-12 22:01:36 +0100	[diff] [blame]	2004	if (!folio_test_uptodate(folio)) {
				2005	size_t start, xend;
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2006
				2007	start = max(from, block_start);
Matthew Wilcox (Oracle)	4a9622f	2023-06-12 22:01:36 +0100	[diff] [blame]	2008	xend = min(to, block_end);
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2009
Matthew Wilcox (Oracle)	4a9622f	2023-06-12 22:01:36 +0100	[diff] [blame]	2010	folio_zero_segment(folio, start, xend);
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2011	set_buffer_uptodate(bh);
				2012	}
				2013
				2014	clear_buffer_new(bh);
				2015	mark_buffer_dirty(bh);
				2016	}
				2017	}
				2018
				2019	block_start = block_end;
				2020	bh = bh->b_this_page;
				2021	} while (bh != head);
				2022	}
Matthew Wilcox (Oracle)	4a9622f	2023-06-12 22:01:36 +0100	[diff] [blame]	2023	EXPORT_SYMBOL(folio_zero_new_buffers);
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2024
Christoph Hellwig	4aa8cdd	2023-09-12 10:05:48 -0700	[diff] [blame]	2025	static int
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	2026	iomap_to_bh(struct inode inode, sector_t block, struct buffer_head bh,
Christoph Hellwig	6d49cc8	2021-08-10 18:33:05 -0700	[diff] [blame]	2027	const struct iomap *iomap)
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	2028	{
Matthew Wilcox (Oracle)	8084419	2023-11-09 21:06:05 +0000	[diff] [blame]	2029	loff_t offset = (loff_t)block << inode->i_blkbits;
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	2030
				2031	bh->b_bdev = iomap->bdev;
				2032
				2033	/*
				2034	* Block points to offset in file we need to map, iomap contains
				2035	* the offset at which the map starts. If the map ends before the
				2036	* current block, then do not map the buffer and let the caller
				2037	* handle it.
				2038	*/
Christoph Hellwig	4aa8cdd	2023-09-12 10:05:48 -0700	[diff] [blame]	2039	if (offset >= iomap->offset + iomap->length)
				2040	return -EIO;
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	2041
				2042	switch (iomap->type) {
				2043	case IOMAP_HOLE:
				2044	/*
				2045	* If the buffer is not up to date or beyond the current EOF,
				2046	* we need to mark it as new to ensure sub-block zeroing is
				2047	* executed if necessary.
				2048	*/
				2049	if (!buffer_uptodate(bh) \|\|
				2050	(offset >= i_size_read(inode)))
				2051	set_buffer_new(bh);
Christoph Hellwig	4aa8cdd	2023-09-12 10:05:48 -0700	[diff] [blame]	2052	return 0;
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	2053	case IOMAP_DELALLOC:
				2054	if (!buffer_uptodate(bh) \|\|
				2055	(offset >= i_size_read(inode)))
				2056	set_buffer_new(bh);
				2057	set_buffer_uptodate(bh);
				2058	set_buffer_mapped(bh);
				2059	set_buffer_delay(bh);
Christoph Hellwig	4aa8cdd	2023-09-12 10:05:48 -0700	[diff] [blame]	2060	return 0;
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	2061	case IOMAP_UNWRITTEN:
				2062	/*
Andreas Gruenbacher	3d7b6b21	2018-06-19 15:10:55 -0700	[diff] [blame]	2063	* For unwritten regions, we always need to ensure that regions
				2064	* in the block we are not writing to are zeroed. Mark the
				2065	* buffer as new to ensure this.
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	2066	*/
				2067	set_buffer_new(bh);
				2068	set_buffer_unwritten(bh);
Gustavo A. R. Silva	df561f66	2020-08-23 17:36:59 -0500	[diff] [blame]	2069	fallthrough;
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	2070	case IOMAP_MAPPED:
Andreas Gruenbacher	3d7b6b21	2018-06-19 15:10:55 -0700	[diff] [blame]	2071	if ((iomap->flags & IOMAP_F_NEW) \|\|
Christoph Hellwig	381c043	2023-09-25 08:54:45 -0700	[diff] [blame]	2072	offset >= i_size_read(inode)) {
				2073	/*
				2074	* This can happen if truncating the block device races
				2075	* with the check in the caller as i_size updates on
				2076	* block devices aren't synchronized by i_rwsem for
				2077	* block devices.
				2078	*/
				2079	if (S_ISBLK(inode->i_mode))
				2080	return -EIO;
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	2081	set_buffer_new(bh);
Christoph Hellwig	381c043	2023-09-25 08:54:45 -0700	[diff] [blame]	2082	}
Andreas Gruenbacher	19fe5f6	2017-10-01 17:55:54 -0400	[diff] [blame]	2083	bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
				2084	inode->i_blkbits;
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	2085	set_buffer_mapped(bh);
Christoph Hellwig	4aa8cdd	2023-09-12 10:05:48 -0700	[diff] [blame]	2086	return 0;
				2087	default:
				2088	WARN_ON_ONCE(1);
				2089	return -EIO;
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	2090	}
				2091	}
				2092
Matthew Wilcox (Oracle)	d1bd0b4	2021-11-03 14:05:47 -0400	[diff] [blame]	2093	int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
Christoph Hellwig	6d49cc8	2021-08-10 18:33:05 -0700	[diff] [blame]	2094	get_block_t get_block, const struct iomap iomap)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2095	{
Matthew Wilcox (Oracle)	b061940	2023-11-09 21:06:07 +0000	[diff] [blame]	2096	size_t from = offset_in_folio(folio, pos);
				2097	size_t to = from + len;
Matthew Wilcox (Oracle)	d1bd0b4	2021-11-03 14:05:47 -0400	[diff] [blame]	2098	struct inode *inode = folio->mapping->host;
Matthew Wilcox (Oracle)	b061940	2023-11-09 21:06:07 +0000	[diff] [blame]	2099	size_t block_start, block_end;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2100	sector_t block;
				2101	int err = 0;
Matthew Wilcox (Oracle)	b061940	2023-11-09 21:06:07 +0000	[diff] [blame]	2102	size_t blocksize;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2103	struct buffer_head bh, head, wait[2], *wait_bh=wait;
				2104
Matthew Wilcox (Oracle)	d1bd0b4	2021-11-03 14:05:47 -0400	[diff] [blame]	2105	BUG_ON(!folio_test_locked(folio));
Matthew Wilcox (Oracle)	b061940	2023-11-09 21:06:07 +0000	[diff] [blame]	2106	BUG_ON(to > folio_size(folio));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2107	BUG_ON(from > to);
				2108
Pankaj Raghav	c6c8c3e	2023-04-17 14:36:18 +0200	[diff] [blame]	2109	head = folio_create_buffers(folio, inode, 0);
Linus Torvalds	45bce8f	2012-11-29 10:21:43 -0800	[diff] [blame]	2110	blocksize = head->b_size;
Matthew Wilcox (Oracle)	b061940	2023-11-09 21:06:07 +0000	[diff] [blame]	2111	block = div_u64(folio_pos(folio), blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2112
Matthew Wilcox (Oracle)	b061940	2023-11-09 21:06:07 +0000	[diff] [blame]	2113	for (bh = head, block_start = 0; bh != head \|\| !block_start;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2114	block++, block_start=block_end, bh = bh->b_this_page) {
				2115	block_end = block_start + blocksize;
				2116	if (block_end <= from \|\| block_start >= to) {
Matthew Wilcox (Oracle)	d1bd0b4	2021-11-03 14:05:47 -0400	[diff] [blame]	2117	if (folio_test_uptodate(folio)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2118	if (!buffer_uptodate(bh))
				2119	set_buffer_uptodate(bh);
				2120	}
				2121	continue;
				2122	}
				2123	if (buffer_new(bh))
				2124	clear_buffer_new(bh);
				2125	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2126	WARN_ON(bh->b_size != blocksize);
Christoph Hellwig	4aa8cdd	2023-09-12 10:05:48 -0700	[diff] [blame]	2127	if (get_block)
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	2128	err = get_block(inode, block, bh, 1);
Christoph Hellwig	4aa8cdd	2023-09-12 10:05:48 -0700	[diff] [blame]	2129	else
				2130	err = iomap_to_bh(inode, block, bh, iomap);
				2131	if (err)
				2132	break;
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	2133
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2134	if (buffer_new(bh)) {
Jan Kara	e64855c	2016-11-04 18:08:15 +0100	[diff] [blame]	2135	clean_bdev_bh_alias(bh);
Matthew Wilcox (Oracle)	d1bd0b4	2021-11-03 14:05:47 -0400	[diff] [blame]	2136	if (folio_test_uptodate(folio)) {
Nick Piggin	637aff4	2007-10-16 01:25:00 -0700	[diff] [blame]	2137	clear_buffer_new(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2138	set_buffer_uptodate(bh);
Nick Piggin	637aff4	2007-10-16 01:25:00 -0700	[diff] [blame]	2139	mark_buffer_dirty(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2140	continue;
				2141	}
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2142	if (block_end > to \|\| block_start < from)
Matthew Wilcox (Oracle)	d1bd0b4	2021-11-03 14:05:47 -0400	[diff] [blame]	2143	folio_zero_segments(folio,
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2144	to, block_end,
				2145	block_start, from);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2146	continue;
				2147	}
				2148	}
Matthew Wilcox (Oracle)	d1bd0b4	2021-11-03 14:05:47 -0400	[diff] [blame]	2149	if (folio_test_uptodate(folio)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2150	if (!buffer_uptodate(bh))
				2151	set_buffer_uptodate(bh);
				2152	continue;
				2153	}
				2154	if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	2155	!buffer_unwritten(bh) &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2156	(block_start < from \|\| block_end > to)) {
Zhang Yi	e7ea112	2022-09-01 21:34:54 +0800	[diff] [blame]	2157	bh_read_nowait(bh, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2158	*wait_bh++=bh;
				2159	}
				2160	}
				2161	/*
				2162	* If we issued read requests - let them complete.
				2163	*/
				2164	while(wait_bh > wait) {
				2165	wait_on_buffer(*--wait_bh);
				2166	if (!buffer_uptodate(*wait_bh))
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	2167	err = -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2168	}
Jan Kara	f9f07b6	2011-06-14 00:58:27 +0200	[diff] [blame]	2169	if (unlikely(err))
Matthew Wilcox (Oracle)	4a9622f	2023-06-12 22:01:36 +0100	[diff] [blame]	2170	folio_zero_new_buffers(folio, from, to);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2171	return err;
				2172	}
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	2173
				2174	int __block_write_begin(struct page *page, loff_t pos, unsigned len,
				2175	get_block_t *get_block)
				2176	{
Matthew Wilcox (Oracle)	d1bd0b4	2021-11-03 14:05:47 -0400	[diff] [blame]	2177	return __block_write_begin_int(page_folio(page), pos, len, get_block,
				2178	NULL);
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	2179	}
Christoph Hellwig	ebdec24	2010-10-06 10:47:23 +0200	[diff] [blame]	2180	EXPORT_SYMBOL(__block_write_begin);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2181
Bean Huo	a524fcf	2023-06-26 07:55:18 +0200	[diff] [blame]	2182	static void __block_commit_write(struct folio *folio, size_t from, size_t to)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2183	{
Matthew Wilcox (Oracle)	8c6cb3e	2023-06-12 22:01:35 +0100	[diff] [blame]	2184	size_t block_start, block_end;
				2185	bool partial = false;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2186	unsigned blocksize;
				2187	struct buffer_head bh, head;
				2188
Matthew Wilcox (Oracle)	8c6cb3e	2023-06-12 22:01:35 +0100	[diff] [blame]	2189	bh = head = folio_buffers(folio);
Linus Torvalds	45bce8f	2012-11-29 10:21:43 -0800	[diff] [blame]	2190	blocksize = bh->b_size;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2191
Linus Torvalds	45bce8f	2012-11-29 10:21:43 -0800	[diff] [blame]	2192	block_start = 0;
				2193	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2194	block_end = block_start + blocksize;
				2195	if (block_end <= from \|\| block_start >= to) {
				2196	if (!buffer_uptodate(bh))
Matthew Wilcox (Oracle)	8c6cb3e	2023-06-12 22:01:35 +0100	[diff] [blame]	2197	partial = true;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2198	} else {
				2199	set_buffer_uptodate(bh);
				2200	mark_buffer_dirty(bh);
				2201	}
Yang Guo	4ebd3ae	2021-02-24 12:02:48 -0800	[diff] [blame]	2202	if (buffer_new(bh))
				2203	clear_buffer_new(bh);
Linus Torvalds	45bce8f	2012-11-29 10:21:43 -0800	[diff] [blame]	2204
				2205	block_start = block_end;
				2206	bh = bh->b_this_page;
				2207	} while (bh != head);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2208
				2209	/*
				2210	* If this is a partial write which happened to make all buffers
Matthew Wilcox (Oracle)	2c69e20	2022-04-29 10:40:40 -0400	[diff] [blame]	2211	* uptodate then we can optimize away a bogus read_folio() for
Matthew Wilcox (Oracle)	8c6cb3e	2023-06-12 22:01:35 +0100	[diff] [blame]	2212	* the next read(). Here we 'discover' whether the folio went
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2213	* uptodate as a result of this (potentially partial) write.
				2214	*/
				2215	if (!partial)
Matthew Wilcox (Oracle)	8c6cb3e	2023-06-12 22:01:35 +0100	[diff] [blame]	2216	folio_mark_uptodate(folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2217	}
				2218
				2219	/*
Christoph Hellwig	155130a	2010-06-04 11:29:58 +0200	[diff] [blame]	2220	* block_write_begin takes care of the basic task of block allocation and
				2221	* bringing partial write blocks uptodate first.
				2222	*
npiggin@suse.de	7bb46a6	2010-05-27 01:05:33 +1000	[diff] [blame]	2223	* The filesystem needs to handle block truncation upon failure.
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2224	*/
Christoph Hellwig	155130a	2010-06-04 11:29:58 +0200	[diff] [blame]	2225	int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
Matthew Wilcox (Oracle)	b3992d1	2022-02-22 11:25:12 -0500	[diff] [blame]	2226	struct page *pagep, get_block_t get_block)
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2227	{
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2228	pgoff_t index = pos >> PAGE_SHIFT;
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2229	struct page *page;
Christoph Hellwig	6e1db88	2010-06-04 11:29:57 +0200	[diff] [blame]	2230	int status;
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2231
Matthew Wilcox (Oracle)	b7446e7	2022-02-22 11:25:12 -0500	[diff] [blame]	2232	page = grab_cache_page_write_begin(mapping, index);
Christoph Hellwig	6e1db88	2010-06-04 11:29:57 +0200	[diff] [blame]	2233	if (!page)
				2234	return -ENOMEM;
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2235
Christoph Hellwig	6e1db88	2010-06-04 11:29:57 +0200	[diff] [blame]	2236	status = __block_write_begin(page, pos, len, get_block);
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2237	if (unlikely(status)) {
Christoph Hellwig	6e1db88	2010-06-04 11:29:57 +0200	[diff] [blame]	2238	unlock_page(page);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2239	put_page(page);
Christoph Hellwig	6e1db88	2010-06-04 11:29:57 +0200	[diff] [blame]	2240	page = NULL;
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2241	}
				2242
Christoph Hellwig	6e1db88	2010-06-04 11:29:57 +0200	[diff] [blame]	2243	*pagep = page;
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2244	return status;
				2245	}
				2246	EXPORT_SYMBOL(block_write_begin);
				2247
				2248	int block_write_end(struct file file, struct address_space mapping,
				2249	loff_t pos, unsigned len, unsigned copied,
				2250	struct page page, void fsdata)
				2251	{
Matthew Wilcox (Oracle)	8c6cb3e	2023-06-12 22:01:35 +0100	[diff] [blame]	2252	struct folio *folio = page_folio(page);
Matthew Wilcox (Oracle)	8c6cb3e	2023-06-12 22:01:35 +0100	[diff] [blame]	2253	size_t start = pos - folio_pos(folio);
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2254
				2255	if (unlikely(copied < len)) {
				2256	/*
Matthew Wilcox (Oracle)	2c69e20	2022-04-29 10:40:40 -0400	[diff] [blame]	2257	* The buffers that were written will now be uptodate, so
				2258	* we don't have to worry about a read_folio reading them
				2259	* and overwriting a partial write. However if we have
				2260	* encountered a short write and only partially written
				2261	* into a buffer, it will not be marked uptodate, so a
				2262	* read_folio might come in and destroy our partial write.
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2263	*
				2264	* Do the simplest thing, and just treat any short write to a
Matthew Wilcox (Oracle)	8c6cb3e	2023-06-12 22:01:35 +0100	[diff] [blame]	2265	* non uptodate folio as a zero-length write, and force the
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2266	* caller to redo the whole thing.
				2267	*/
Matthew Wilcox (Oracle)	8c6cb3e	2023-06-12 22:01:35 +0100	[diff] [blame]	2268	if (!folio_test_uptodate(folio))
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2269	copied = 0;
				2270
Matthew Wilcox (Oracle)	4a9622f	2023-06-12 22:01:36 +0100	[diff] [blame]	2271	folio_zero_new_buffers(folio, start+copied, start+len);
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2272	}
Matthew Wilcox (Oracle)	8c6cb3e	2023-06-12 22:01:35 +0100	[diff] [blame]	2273	flush_dcache_folio(folio);
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2274
				2275	/* This could be a short (even 0-length) commit */
Bean Huo	489b7e7	2023-06-26 07:55:17 +0200	[diff] [blame]	2276	__block_commit_write(folio, start, start + copied);
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2277
				2278	return copied;
				2279	}
				2280	EXPORT_SYMBOL(block_write_end);
				2281
				2282	int generic_write_end(struct file file, struct address_space mapping,
				2283	loff_t pos, unsigned len, unsigned copied,
				2284	struct page page, void fsdata)
				2285	{
Christoph Hellwig	8af54f2	2019-06-27 17:28:40 -0700	[diff] [blame]	2286	struct inode *inode = mapping->host;
				2287	loff_t old_size = inode->i_size;
				2288	bool i_size_changed = false;
				2289
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2290	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
Christoph Hellwig	8af54f2	2019-06-27 17:28:40 -0700	[diff] [blame]	2291
				2292	/*
				2293	* No need to use i_size_read() here, the i_size cannot change under us
				2294	* because we hold i_rwsem.
				2295	*
				2296	* But it's important to update i_size while still holding page lock:
				2297	* page writeout could otherwise come in and zero beyond i_size.
				2298	*/
				2299	if (pos + copied > inode->i_size) {
				2300	i_size_write(inode, pos + copied);
				2301	i_size_changed = true;
				2302	}
				2303
				2304	unlock_page(page);
Andreas Gruenbacher	7a77dad	2019-04-30 08:45:34 -0700	[diff] [blame]	2305	put_page(page);
Christoph Hellwig	8af54f2	2019-06-27 17:28:40 -0700	[diff] [blame]	2306
				2307	if (old_size < pos)
				2308	pagecache_isize_extended(inode, old_size, pos);
				2309	/*
				2310	* Don't mark the inode dirty under page lock. First, it unnecessarily
				2311	* makes the holding time of page lock longer. Second, it forces lock
				2312	* ordering of page lock and transaction start for journaling
				2313	* filesystems.
				2314	*/
				2315	if (i_size_changed)
				2316	mark_inode_dirty(inode);
Andreas Gruenbacher	26ddb1f	2019-04-30 08:45:33 -0700	[diff] [blame]	2317	return copied;
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2318	}
				2319	EXPORT_SYMBOL(generic_write_end);
				2320
				2321	/*
Matthew Wilcox (Oracle)	2e7e80f	2022-02-09 20:21:27 +0000	[diff] [blame]	2322	* block_is_partially_uptodate checks whether buffers within a folio are
Hisashi Hifumi	8ab22b9	2008-07-28 15:46:36 -0700	[diff] [blame]	2323	* uptodate or not.
				2324	*
Matthew Wilcox (Oracle)	2e7e80f	2022-02-09 20:21:27 +0000	[diff] [blame]	2325	* Returns true if all buffers which correspond to the specified part
				2326	* of the folio are uptodate.
Hisashi Hifumi	8ab22b9	2008-07-28 15:46:36 -0700	[diff] [blame]	2327	*/
Matthew Wilcox (Oracle)	2e7e80f	2022-02-09 20:21:27 +0000	[diff] [blame]	2328	bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
Hisashi Hifumi	8ab22b9	2008-07-28 15:46:36 -0700	[diff] [blame]	2329	{
Hisashi Hifumi	8ab22b9	2008-07-28 15:46:36 -0700	[diff] [blame]	2330	unsigned block_start, block_end, blocksize;
				2331	unsigned to;
				2332	struct buffer_head bh, head;
Matthew Wilcox (Oracle)	2e7e80f	2022-02-09 20:21:27 +0000	[diff] [blame]	2333	bool ret = true;
Hisashi Hifumi	8ab22b9	2008-07-28 15:46:36 -0700	[diff] [blame]	2334
Matthew Wilcox (Oracle)	2e7e80f	2022-02-09 20:21:27 +0000	[diff] [blame]	2335	head = folio_buffers(folio);
				2336	if (!head)
				2337	return false;
Linus Torvalds	45bce8f	2012-11-29 10:21:43 -0800	[diff] [blame]	2338	blocksize = head->b_size;
Matthew Wilcox (Oracle)	2e7e80f	2022-02-09 20:21:27 +0000	[diff] [blame]	2339	to = min_t(unsigned, folio_size(folio) - from, count);
Hisashi Hifumi	8ab22b9	2008-07-28 15:46:36 -0700	[diff] [blame]	2340	to = from + to;
Matthew Wilcox (Oracle)	2e7e80f	2022-02-09 20:21:27 +0000	[diff] [blame]	2341	if (from < blocksize && to > folio_size(folio) - blocksize)
				2342	return false;
Hisashi Hifumi	8ab22b9	2008-07-28 15:46:36 -0700	[diff] [blame]	2343
Hisashi Hifumi	8ab22b9	2008-07-28 15:46:36 -0700	[diff] [blame]	2344	bh = head;
				2345	block_start = 0;
				2346	do {
				2347	block_end = block_start + blocksize;
				2348	if (block_end > from && block_start < to) {
				2349	if (!buffer_uptodate(bh)) {
Matthew Wilcox (Oracle)	2e7e80f	2022-02-09 20:21:27 +0000	[diff] [blame]	2350	ret = false;
Hisashi Hifumi	8ab22b9	2008-07-28 15:46:36 -0700	[diff] [blame]	2351	break;
				2352	}
				2353	if (block_end >= to)
				2354	break;
				2355	}
				2356	block_start = block_end;
				2357	bh = bh->b_this_page;
				2358	} while (bh != head);
				2359
				2360	return ret;
				2361	}
				2362	EXPORT_SYMBOL(block_is_partially_uptodate);
				2363
				2364	/*
Matthew Wilcox (Oracle)	2c69e20	2022-04-29 10:40:40 -0400	[diff] [blame]	2365	* Generic "read_folio" function for block devices that have the normal
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2366	* get_block functionality. This is most of the block device filesystems.
Matthew Wilcox (Oracle)	2c69e20	2022-04-29 10:40:40 -0400	[diff] [blame]	2367	* Reads the folio asynchronously --- the unlock_buffer() and
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2368	* set/clear_buffer_uptodate() functions propagate buffer state into the
Matthew Wilcox (Oracle)	2c69e20	2022-04-29 10:40:40 -0400	[diff] [blame]	2369	* folio once IO has completed.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2370	*/
Matthew Wilcox (Oracle)	2c69e20	2022-04-29 10:40:40 -0400	[diff] [blame]	2371	int block_read_full_folio(struct folio folio, get_block_t get_block)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2372	{
Matthew Wilcox (Oracle)	2c69e20	2022-04-29 10:40:40 -0400	[diff] [blame]	2373	struct inode *inode = folio->mapping->host;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2374	sector_t iblock, lblock;
				2375	struct buffer_head bh, head, *arr[MAX_BUF_PER_PAGE];
Matthew Wilcox (Oracle)	fa399c3	2023-11-09 21:06:08 +0000	[diff] [blame]	2376	size_t blocksize;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2377	int nr, i;
				2378	int fully_mapped = 1;
Matthew Wilcox (Oracle)	b7a6eb2	2022-05-26 23:19:49 -0400	[diff] [blame]	2379	bool page_error = false;
Eric Biggers	4fa512c	2022-12-23 12:36:37 -0800	[diff] [blame]	2380	loff_t limit = i_size_read(inode);
				2381
				2382	/* This is needed for ext4. */
				2383	if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
				2384	limit = inode->i_sb->s_maxbytes;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2385
Matthew Wilcox (Oracle)	2c69e20	2022-04-29 10:40:40 -0400	[diff] [blame]	2386	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
				2387
Pankaj Raghav	c6c8c3e	2023-04-17 14:36:18 +0200	[diff] [blame]	2388	head = folio_create_buffers(folio, inode, 0);
Linus Torvalds	45bce8f	2012-11-29 10:21:43 -0800	[diff] [blame]	2389	blocksize = head->b_size;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2390
Matthew Wilcox (Oracle)	fa399c3	2023-11-09 21:06:08 +0000	[diff] [blame]	2391	iblock = div_u64(folio_pos(folio), blocksize);
				2392	lblock = div_u64(limit + blocksize - 1, blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2393	bh = head;
				2394	nr = 0;
				2395	i = 0;
				2396
				2397	do {
				2398	if (buffer_uptodate(bh))
				2399	continue;
				2400
				2401	if (!buffer_mapped(bh)) {
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2402	int err = 0;
				2403
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2404	fully_mapped = 0;
				2405	if (iblock < lblock) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2406	WARN_ON(bh->b_size != blocksize);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2407	err = get_block(inode, iblock, bh, 0);
Matthew Wilcox (Oracle)	b7a6eb2	2022-05-26 23:19:49 -0400	[diff] [blame]	2408	if (err) {
Matthew Wilcox (Oracle)	2c69e20	2022-04-29 10:40:40 -0400	[diff] [blame]	2409	folio_set_error(folio);
Matthew Wilcox (Oracle)	b7a6eb2	2022-05-26 23:19:49 -0400	[diff] [blame]	2410	page_error = true;
				2411	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2412	}
				2413	if (!buffer_mapped(bh)) {
Matthew Wilcox (Oracle)	2c69e20	2022-04-29 10:40:40 -0400	[diff] [blame]	2414	folio_zero_range(folio, i * blocksize,
				2415	blocksize);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2416	if (!err)
				2417	set_buffer_uptodate(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2418	continue;
				2419	}
				2420	/*
				2421	* get_block() might have updated the buffer
				2422	* synchronously
				2423	*/
				2424	if (buffer_uptodate(bh))
				2425	continue;
				2426	}
				2427	arr[nr++] = bh;
				2428	} while (i++, iblock++, (bh = bh->b_this_page) != head);
				2429
				2430	if (fully_mapped)
Matthew Wilcox (Oracle)	2c69e20	2022-04-29 10:40:40 -0400	[diff] [blame]	2431	folio_set_mappedtodisk(folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2432
				2433	if (!nr) {
				2434	/*
Matthew Wilcox (Oracle)	6ba924d	2023-10-04 17:53:05 +0100	[diff] [blame]	2435	* All buffers are uptodate or get_block() returned an
				2436	* error when trying to map them - we can finish the read.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2437	*/
Matthew Wilcox (Oracle)	6ba924d	2023-10-04 17:53:05 +0100	[diff] [blame]	2438	folio_end_read(folio, !page_error);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2439	return 0;
				2440	}
				2441
				2442	/* Stage two: lock the buffers */
				2443	for (i = 0; i < nr; i++) {
				2444	bh = arr[i];
				2445	lock_buffer(bh);
				2446	mark_buffer_async_read(bh);
				2447	}
				2448
				2449	/*
				2450	* Stage 3: start the IO. Check for uptodateness
				2451	* inside the buffer lock in case another process reading
				2452	* the underlying blockdev brought it uptodate (the sct fix).
				2453	*/
				2454	for (i = 0; i < nr; i++) {
				2455	bh = arr[i];
				2456	if (buffer_uptodate(bh))
				2457	end_buffer_async_read(bh, 1);
				2458	else
Bart Van Assche	1420c4a5	2022-07-14 11:07:13 -0700	[diff] [blame]	2459	submit_bh(REQ_OP_READ, bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2460	}
				2461	return 0;
				2462	}
Matthew Wilcox (Oracle)	2c69e20	2022-04-29 10:40:40 -0400	[diff] [blame]	2463	EXPORT_SYMBOL(block_read_full_folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2464
				2465	/* utility function for filesystems that need to do work on expanding
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2466	* truncates. Uses filesystem pagecache writes to allow the filesystem to
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2467	* deal with the hole.
				2468	*/
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2469	int generic_cont_expand_simple(struct inode *inode, loff_t size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2470	{
				2471	struct address_space *mapping = inode->i_mapping;
Matthew Wilcox (Oracle)	53b524b	2022-03-03 13:35:20 -0500	[diff] [blame]	2472	const struct address_space_operations *aops = mapping->a_ops;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2473	struct page *page;
Alexander Potapenko	1468c6f	2022-09-15 17:04:16 +0200	[diff] [blame]	2474	void *fsdata = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2475	int err;
				2476
npiggin@suse.de	c08d3b0	2009-08-21 02:35:06 +1000	[diff] [blame]	2477	err = inode_newsize_ok(inode, size);
				2478	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2479	goto out;
				2480
Matthew Wilcox (Oracle)	53b524b	2022-03-03 13:35:20 -0500	[diff] [blame]	2481	err = aops->write_begin(NULL, mapping, size, 0, &page, &fsdata);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2482	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2483	goto out;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2484
Matthew Wilcox (Oracle)	53b524b	2022-03-03 13:35:20 -0500	[diff] [blame]	2485	err = aops->write_end(NULL, mapping, size, 0, 0, page, fsdata);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2486	BUG_ON(err > 0);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2487
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2488	out:
				2489	return err;
				2490	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2491	EXPORT_SYMBOL(generic_cont_expand_simple);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2492
Adrian Bunk	f1e3af7	2008-04-29 00:59:01 -0700	[diff] [blame]	2493	static int cont_expand_zero(struct file file, struct address_space mapping,
				2494	loff_t pos, loff_t *bytes)
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2495	{
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2496	struct inode *inode = mapping->host;
Matthew Wilcox (Oracle)	53b524b	2022-03-03 13:35:20 -0500	[diff] [blame]	2497	const struct address_space_operations *aops = mapping->a_ops;
Fabian Frederick	9340747	2017-02-27 14:28:32 -0800	[diff] [blame]	2498	unsigned int blocksize = i_blocksize(inode);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2499	struct page *page;
Alexander Potapenko	1468c6f	2022-09-15 17:04:16 +0200	[diff] [blame]	2500	void *fsdata = NULL;
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2501	pgoff_t index, curidx;
				2502	loff_t curpos;
				2503	unsigned zerofrom, offset, len;
				2504	int err = 0;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2505
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2506	index = pos >> PAGE_SHIFT;
				2507	offset = pos & ~PAGE_MASK;
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2508
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2509	while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
				2510	zerofrom = curpos & ~PAGE_MASK;
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2511	if (zerofrom & (blocksize-1)) {
				2512	*bytes \|= (blocksize-1);
				2513	(*bytes)++;
				2514	}
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2515	len = PAGE_SIZE - zerofrom;
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2516
Matthew Wilcox (Oracle)	53b524b	2022-03-03 13:35:20 -0500	[diff] [blame]	2517	err = aops->write_begin(file, mapping, curpos, len,
Tetsuo Handa	c718a97	2017-05-08 15:58:59 -0700	[diff] [blame]	2518	&page, &fsdata);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2519	if (err)
				2520	goto out;
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2521	zero_user(page, zerofrom, len);
Matthew Wilcox (Oracle)	53b524b	2022-03-03 13:35:20 -0500	[diff] [blame]	2522	err = aops->write_end(file, mapping, curpos, len, len,
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2523	page, fsdata);
				2524	if (err < 0)
				2525	goto out;
				2526	BUG_ON(err != len);
				2527	err = 0;
OGAWA Hirofumi	061e974	2008-04-28 02:16:28 -0700	[diff] [blame]	2528
				2529	balance_dirty_pages_ratelimited(mapping);
Mikulas Patocka	c2ca0fc	2014-07-27 13:00:41 -0400	[diff] [blame]	2530
Davidlohr Bueso	08d405c	2019-01-03 15:28:58 -0800	[diff] [blame]	2531	if (fatal_signal_pending(current)) {
Mikulas Patocka	c2ca0fc	2014-07-27 13:00:41 -0400	[diff] [blame]	2532	err = -EINTR;
				2533	goto out;
				2534	}
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2535	}
				2536
				2537	/* page covers the boundary, find the boundary offset */
				2538	if (index == curidx) {
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2539	zerofrom = curpos & ~PAGE_MASK;
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2540	/* if we will expand the thing last block will be filled */
				2541	if (offset <= zerofrom) {
				2542	goto out;
				2543	}
				2544	if (zerofrom & (blocksize-1)) {
				2545	*bytes \|= (blocksize-1);
				2546	(*bytes)++;
				2547	}
				2548	len = offset - zerofrom;
				2549
Matthew Wilcox (Oracle)	53b524b	2022-03-03 13:35:20 -0500	[diff] [blame]	2550	err = aops->write_begin(file, mapping, curpos, len,
Tetsuo Handa	c718a97	2017-05-08 15:58:59 -0700	[diff] [blame]	2551	&page, &fsdata);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2552	if (err)
				2553	goto out;
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2554	zero_user(page, zerofrom, len);
Matthew Wilcox (Oracle)	53b524b	2022-03-03 13:35:20 -0500	[diff] [blame]	2555	err = aops->write_end(file, mapping, curpos, len, len,
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2556	page, fsdata);
				2557	if (err < 0)
				2558	goto out;
				2559	BUG_ON(err != len);
				2560	err = 0;
				2561	}
				2562	out:
				2563	return err;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2564	}
				2565
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2566	/*
				2567	* For moronic filesystems that do not allow holes in file.
				2568	* We may have to extend the file.
				2569	*/
Christoph Hellwig	282dc17	2010-06-04 11:29:55 +0200	[diff] [blame]	2570	int cont_write_begin(struct file file, struct address_space mapping,
Matthew Wilcox (Oracle)	be3bbbc	2022-02-22 11:25:12 -0500	[diff] [blame]	2571	loff_t pos, unsigned len,
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2572	struct page pagep, void fsdata,
				2573	get_block_t get_block, loff_t bytes)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2574	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2575	struct inode *inode = mapping->host;
Fabian Frederick	9340747	2017-02-27 14:28:32 -0800	[diff] [blame]	2576	unsigned int blocksize = i_blocksize(inode);
				2577	unsigned int zerofrom;
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2578	int err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2579
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2580	err = cont_expand_zero(file, mapping, pos, bytes);
				2581	if (err)
Christoph Hellwig	155130a	2010-06-04 11:29:58 +0200	[diff] [blame]	2582	return err;
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2583
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2584	zerofrom = *bytes & ~PAGE_MASK;
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2585	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
				2586	*bytes \|= (blocksize-1);
				2587	(*bytes)++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2588	}
				2589
Matthew Wilcox (Oracle)	b3992d1	2022-02-22 11:25:12 -0500	[diff] [blame]	2590	return block_write_begin(mapping, pos, len, pagep, get_block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2591	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2592	EXPORT_SYMBOL(cont_write_begin);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2593
/*
 * Page-based wrapper around __block_commit_write(): commits the byte range
 * [@from, @to) on the folio that contains @page.
 */
void block_commit_write(struct page *page, unsigned from, unsigned to)
{
	__block_commit_write(page_folio(page), from, to);
}
EXPORT_SYMBOL(block_commit_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2600
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2601	/*
				2602	* block_page_mkwrite() is not allowed to change the file size as it gets
				2603	* called from a page fault handler when a page is first dirtied. Hence we must
				2604	* be careful to check for EOF conditions here. We set the page up correctly
				2605	* for a written page which means we get ENOSPC checking when writing into
				2606	* holes and correct delalloc and unwritten extent mapping on filesystems that
				2607	* support these features.
				2608	*
				2609	* We are not allowed to take the i_mutex here so we have to play games to
				2610	* protect against truncate races as the page could now be beyond EOF. Because
npiggin@suse.de	7bb46a6	2010-05-27 01:05:33 +1000	[diff] [blame]	2611	* truncate writes the inode size before removing pages, once we have the
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2612	* page lock we can determine safely if the page is beyond EOF. If it is not
				2613	* beyond EOF, then the page is guaranteed safe against truncation until we
				2614	* unlock the page.
Jan Kara	ea13a86	2011-05-24 00:23:35 +0200	[diff] [blame]	2615	*
Jan Kara	14da920	2012-06-12 16:20:37 +0200	[diff] [blame]	2616	* Direct callers of this function should protect against filesystem freezing
Ross Zwisler	5c50002	2015-10-13 16:51:02 -0600	[diff] [blame]	2617	* using sb_start_pagefault() - sb_end_pagefault() functions.
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2618	*/
Ross Zwisler	5c50002	2015-10-13 16:51:02 -0600	[diff] [blame]	2619	int block_page_mkwrite(struct vm_area_struct vma, struct vm_fault vmf,
Jan Kara	24da4fa	2011-05-24 00:23:34 +0200	[diff] [blame]	2620	get_block_t get_block)
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2621	{
Matthew Wilcox (Oracle)	fe18137	2023-06-12 22:01:34 +0100	[diff] [blame]	2622	struct folio *folio = page_folio(vmf->page);
Al Viro	496ad9a	2013-01-23 17:07:38 -0500	[diff] [blame]	2623	struct inode *inode = file_inode(vma->vm_file);
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2624	unsigned long end;
				2625	loff_t size;
Jan Kara	24da4fa	2011-05-24 00:23:34 +0200	[diff] [blame]	2626	int ret;
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2627
Matthew Wilcox (Oracle)	fe18137	2023-06-12 22:01:34 +0100	[diff] [blame]	2628	folio_lock(folio);
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2629	size = i_size_read(inode);
Matthew Wilcox (Oracle)	fe18137	2023-06-12 22:01:34 +0100	[diff] [blame]	2630	if ((folio->mapping != inode->i_mapping) \|\|
				2631	(folio_pos(folio) >= size)) {
Jan Kara	24da4fa	2011-05-24 00:23:34 +0200	[diff] [blame]	2632	/* We overload EFAULT to mean page got truncated */
				2633	ret = -EFAULT;
				2634	goto out_unlock;
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2635	}
				2636
Matthew Wilcox (Oracle)	fe18137	2023-06-12 22:01:34 +0100	[diff] [blame]	2637	end = folio_size(folio);
				2638	/* folio is wholly or partially inside EOF */
				2639	if (folio_pos(folio) + end > size)
				2640	end = size - folio_pos(folio);
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2641
Matthew Wilcox (Oracle)	fe18137	2023-06-12 22:01:34 +0100	[diff] [blame]	2642	ret = __block_write_begin_int(folio, 0, end, get_block, NULL);
Bean Huo	a524fcf	2023-06-26 07:55:18 +0200	[diff] [blame]	2643	if (unlikely(ret))
Jan Kara	24da4fa	2011-05-24 00:23:34 +0200	[diff] [blame]	2644	goto out_unlock;
Bean Huo	a524fcf	2023-06-26 07:55:18 +0200	[diff] [blame]	2645
				2646	__block_commit_write(folio, 0, end);
				2647
Matthew Wilcox (Oracle)	fe18137	2023-06-12 22:01:34 +0100	[diff] [blame]	2648	folio_mark_dirty(folio);
				2649	folio_wait_stable(folio);
Jan Kara	24da4fa	2011-05-24 00:23:34 +0200	[diff] [blame]	2650	return 0;
				2651	out_unlock:
Matthew Wilcox (Oracle)	fe18137	2023-06-12 22:01:34 +0100	[diff] [blame]	2652	folio_unlock(folio);
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2653	return ret;
				2654	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2655	EXPORT_SYMBOL(block_page_mkwrite);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2656
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2657	int block_truncate_page(struct address_space *mapping,
				2658	loff_t from, get_block_t *get_block)
				2659	{
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2660	pgoff_t index = from >> PAGE_SHIFT;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2661	unsigned blocksize;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2662	sector_t iblock;
Matthew Wilcox (Oracle)	6d68f64	2023-06-12 22:01:41 +0100	[diff] [blame]	2663	size_t offset, length, pos;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2664	struct inode *inode = mapping->host;
Matthew Wilcox (Oracle)	6d68f64	2023-06-12 22:01:41 +0100	[diff] [blame]	2665	struct folio *folio;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2666	struct buffer_head *bh;
Jiapeng Chong	dc7cb2d	2023-03-23 10:32:59 +0800	[diff] [blame]	2667	int err = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2668
Fabian Frederick	9340747	2017-02-27 14:28:32 -0800	[diff] [blame]	2669	blocksize = i_blocksize(inode);
Matthew Wilcox (Oracle)	6d68f64	2023-06-12 22:01:41 +0100	[diff] [blame]	2670	length = from & (blocksize - 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2671
				2672	/* Block boundary? Nothing to do */
				2673	if (!length)
				2674	return 0;
				2675
				2676	length = blocksize - length;
Matthew Wilcox (Oracle)	4b04646	2023-11-09 21:06:06 +0000	[diff] [blame]	2677	iblock = ((loff_t)index * PAGE_SIZE) >> inode->i_blkbits;
				2678
Matthew Wilcox (Oracle)	6d68f64	2023-06-12 22:01:41 +0100	[diff] [blame]	2679	folio = filemap_grab_folio(mapping, index);
				2680	if (IS_ERR(folio))
				2681	return PTR_ERR(folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2682
Matthew Wilcox (Oracle)	6d68f64	2023-06-12 22:01:41 +0100	[diff] [blame]	2683	bh = folio_buffers(folio);
Matthew Wilcox (Oracle)	3decb85	2023-10-16 21:10:49 +0100	[diff] [blame]	2684	if (!bh)
Matthew Wilcox (Oracle)	0a88810	2023-10-16 21:11:14 +0100	[diff] [blame]	2685	bh = create_empty_buffers(folio, blocksize, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2686
				2687	/* Find the buffer that contains "offset" */
Matthew Wilcox (Oracle)	6d68f64	2023-06-12 22:01:41 +0100	[diff] [blame]	2688	offset = offset_in_folio(folio, from);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2689	pos = blocksize;
				2690	while (offset >= pos) {
				2691	bh = bh->b_this_page;
				2692	iblock++;
				2693	pos += blocksize;
				2694	}
				2695
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2696	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2697	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2698	err = get_block(inode, iblock, bh, 0);
				2699	if (err)
				2700	goto unlock;
				2701	/* unmapped? It's a hole - nothing to do */
				2702	if (!buffer_mapped(bh))
				2703	goto unlock;
				2704	}
				2705
				2706	/* Ok, it's mapped. Make sure it's up-to-date */
Matthew Wilcox (Oracle)	6d68f64	2023-06-12 22:01:41 +0100	[diff] [blame]	2707	if (folio_test_uptodate(folio))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2708	set_buffer_uptodate(bh);
				2709
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	2710	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
Zhang Yi	e7ea112	2022-09-01 21:34:54 +0800	[diff] [blame]	2711	err = bh_read(bh, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2712	/* Uhhuh. Read error. Complain and punt. */
Zhang Yi	e7ea112	2022-09-01 21:34:54 +0800	[diff] [blame]	2713	if (err < 0)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2714	goto unlock;
				2715	}
				2716
Matthew Wilcox (Oracle)	6d68f64	2023-06-12 22:01:41 +0100	[diff] [blame]	2717	folio_zero_range(folio, offset, length);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2718	mark_buffer_dirty(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2719
				2720	unlock:
Matthew Wilcox (Oracle)	6d68f64	2023-06-12 22:01:41 +0100	[diff] [blame]	2721	folio_unlock(folio);
				2722	folio_put(folio);
Jiapeng Chong	dc7cb2d	2023-03-23 10:32:59 +0800	[diff] [blame]	2723
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2724	return err;
				2725	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2726	EXPORT_SYMBOL(block_truncate_page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2727
				2728	/*
				2729	* The generic ->writepage function for buffer-backed address_spaces
				2730	*/
Matthew Wilcox (Oracle)	17bf23a	2023-12-15 20:02:44 +0000	[diff] [blame]	2731	int block_write_full_folio(struct folio folio, struct writeback_control wbc,
				2732	void *get_block)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2733	{
Matthew Wilcox (Oracle)	bb0ea59	2023-06-12 22:01:33 +0100	[diff] [blame]	2734	struct inode * const inode = folio->mapping->host;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2735	loff_t i_size = i_size_read(inode);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2736
Matthew Wilcox (Oracle)	bb0ea59	2023-06-12 22:01:33 +0100	[diff] [blame]	2737	/* Is the folio fully inside i_size? */
				2738	if (folio_pos(folio) + folio_size(folio) <= i_size)
Matthew Wilcox (Oracle)	14059f6	2023-12-15 20:02:45 +0000	[diff] [blame]	2739	return __block_write_full_folio(inode, folio, get_block, wbc);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2740
Matthew Wilcox (Oracle)	bb0ea59	2023-06-12 22:01:33 +0100	[diff] [blame]	2741	/* Is the folio fully outside i_size? (truncate in progress) */
				2742	if (folio_pos(folio) >= i_size) {
Matthew Wilcox (Oracle)	53418a1	2023-06-12 22:01:31 +0100	[diff] [blame]	2743	folio_unlock(folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2744	return 0; /* don't care */
				2745	}
				2746
				2747	/*
Matthew Wilcox (Oracle)	bb0ea59	2023-06-12 22:01:33 +0100	[diff] [blame]	2748	* The folio straddles i_size. It must be zeroed out on each and every
Adam Buchbinder	2a61aa4	2009-12-11 16:35:40 -0500	[diff] [blame]	2749	* writepage invocation because it may be mmapped. "A file is mapped
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2750	* in multiples of the page size. For a file that is not a multiple of
Matthew Wilcox (Oracle)	bb0ea59	2023-06-12 22:01:33 +0100	[diff] [blame]	2751	* the page size, the remaining memory is zeroed when mapped, and
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2752	* writes to that region are not written out to the file."
				2753	*/
Matthew Wilcox (Oracle)	bb0ea59	2023-06-12 22:01:33 +0100	[diff] [blame]	2754	folio_zero_segment(folio, offset_in_folio(folio, i_size),
				2755	folio_size(folio));
Matthew Wilcox (Oracle)	14059f6	2023-12-15 20:02:45 +0000	[diff] [blame]	2756	return __block_write_full_folio(inode, folio, get_block, wbc);
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2757	}
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2758
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2759	sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
				2760	get_block_t *get_block)
				2761	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2762	struct inode *inode = mapping->host;
Alexander Potapenko	2a527d6	2017-07-05 00:56:21 -0400	[diff] [blame]	2763	struct buffer_head tmp = {
				2764	.b_size = i_blocksize(inode),
				2765	};
				2766
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2767	get_block(inode, block, &tmp, 0);
				2768	return tmp.b_blocknr;
				2769	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2770	EXPORT_SYMBOL(generic_block_bmap);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2771
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2772	static void end_bio_bh_io_sync(struct bio *bio)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2773	{
				2774	struct buffer_head *bh = bio->bi_private;
				2775
Jens Axboe	b7c44ed	2015-07-24 12:37:59 -0600	[diff] [blame]	2776	if (unlikely(bio_flagged(bio, BIO_QUIET)))
Keith Mannthey	08bafc0	2008-11-25 10:24:35 +0100	[diff] [blame]	2777	set_bit(BH_Quiet, &bh->b_state);
				2778
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	2779	bh->b_end_io(bh, !bio->bi_status);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2780	bio_put(bio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2781	}
				2782
Ritesh Harjani (IBM)	5bdf402	2022-08-18 10:34:40 +0530	[diff] [blame]	2783	static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
Bart Van Assche	4498135	2024-02-02 12:39:25 -0800	[diff] [blame]	2784	enum rw_hint write_hint,
Ritesh Harjani (IBM)	5bdf402	2022-08-18 10:34:40 +0530	[diff] [blame]	2785	struct writeback_control *wbc)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2786	{
Bart Van Assche	1420c4a5	2022-07-14 11:07:13 -0700	[diff] [blame]	2787	const enum req_op op = opf & REQ_OP_MASK;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2788	struct bio *bio;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2789
				2790	BUG_ON(!buffer_locked(bh));
				2791	BUG_ON(!buffer_mapped(bh));
				2792	BUG_ON(!bh->b_end_io);
Aneesh Kumar K.V	8fb0e34	2009-05-12 16:22:37 -0400	[diff] [blame]	2793	BUG_ON(buffer_delay(bh));
				2794	BUG_ON(buffer_unwritten(bh));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2795
Jens Axboe	48fd4f9	2008-08-22 10:00:36 +0200	[diff] [blame]	2796	/*
Jens Axboe	48fd4f9	2008-08-22 10:00:36 +0200	[diff] [blame]	2797	* Only clear out a write error when rewriting
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2798	*/
Mike Christie	2a222ca	2016-06-05 14:31:43 -0500	[diff] [blame]	2799	if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2800	clear_buffer_write_io_error(bh);
				2801
Christoph Hellwig	07888c66	2022-01-24 10:11:05 +0100	[diff] [blame]	2802	if (buffer_meta(bh))
Bart Van Assche	1420c4a5	2022-07-14 11:07:13 -0700	[diff] [blame]	2803	opf \|= REQ_META;
Christoph Hellwig	07888c66	2022-01-24 10:11:05 +0100	[diff] [blame]	2804	if (buffer_prio(bh))
Bart Van Assche	1420c4a5	2022-07-14 11:07:13 -0700	[diff] [blame]	2805	opf \|= REQ_PRIO;
Christoph Hellwig	07888c66	2022-01-24 10:11:05 +0100	[diff] [blame]	2806
Bart Van Assche	1420c4a5	2022-07-14 11:07:13 -0700	[diff] [blame]	2807	bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2808
Eric Biggers	4f74d15	2020-07-02 01:56:07 +0000	[diff] [blame]	2809	fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
				2810
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	2811	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
Bart Van Assche	4498135	2024-02-02 12:39:25 -0800	[diff] [blame]	2812	bio->bi_write_hint = write_hint;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2813
Johannes Thumshirn	741af75	2023-05-31 04:50:27 -0700	[diff] [blame]	2814	__bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2815
				2816	bio->bi_end_io = end_bio_bh_io_sync;
				2817	bio->bi_private = bh;
				2818
Ming Lei	83c9c54	2020-01-05 09:41:14 +0800	[diff] [blame]	2819	/* Take care of bh's that straddle the end of the device */
				2820	guard_bio_eod(bio);
				2821
Dennis Zhou	fd42df3	2018-12-05 12:10:34 -0500	[diff] [blame]	2822	if (wbc) {
				2823	wbc_init_bio(wbc, bio);
Tejun Heo	34e51a5	2019-06-27 13:39:49 -0700	[diff] [blame]	2824	wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
Dennis Zhou	fd42df3	2018-12-05 12:10:34 -0500	[diff] [blame]	2825	}
				2826
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2827	submit_bio(bio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2828	}
Tejun Heo	bafc0db	2015-06-02 08:37:23 -0600	[diff] [blame]	2829
Ritesh Harjani (IBM)	5bdf402	2022-08-18 10:34:40 +0530	[diff] [blame]	2830	void submit_bh(blk_opf_t opf, struct buffer_head *bh)
Tejun Heo	bafc0db	2015-06-02 08:37:23 -0600	[diff] [blame]	2831	{
Bart Van Assche	4498135	2024-02-02 12:39:25 -0800	[diff] [blame]	2832	submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL);
Darrick J. Wong	713685111	2013-04-29 15:07:25 -0700	[diff] [blame]	2833	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2834	EXPORT_SYMBOL(submit_bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2835
Bart Van Assche	3ae7286	2022-07-14 11:07:12 -0700	[diff] [blame]	2836	void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
Christoph Hellwig	9cb569d	2010-08-11 17:06:24 +0200	[diff] [blame]	2837	{
				2838	lock_buffer(bh);
				2839	if (!test_clear_buffer_dirty(bh)) {
				2840	unlock_buffer(bh);
				2841	return;
				2842	}
				2843	bh->b_end_io = end_buffer_write_sync;
				2844	get_bh(bh);
Bart Van Assche	1420c4a5	2022-07-14 11:07:13 -0700	[diff] [blame]	2845	submit_bh(REQ_OP_WRITE \| op_flags, bh);
Christoph Hellwig	9cb569d	2010-08-11 17:06:24 +0200	[diff] [blame]	2846	}
				2847	EXPORT_SYMBOL(write_dirty_buffer);
				2848
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2849	/*
				2850	* For a data-integrity writeout, we need to wait upon any in-progress I/O
				2851	* and then start new I/O and then wait upon it. The caller must have a ref on
				2852	* the buffer_head.
				2853	*/
Bart Van Assche	3ae7286	2022-07-14 11:07:12 -0700	[diff] [blame]	2854	int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2855	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2856	WARN_ON(atomic_read(&bh->b_count) < 1);
				2857	lock_buffer(bh);
				2858	if (test_clear_buffer_dirty(bh)) {
Xianting Tian	377254b	2020-07-31 12:10:25 -0400	[diff] [blame]	2859	/*
				2860	* The bh should be mapped, but it might not be if the
				2861	* device was hot-removed. Not much we can do but fail the I/O.
				2862	*/
				2863	if (!buffer_mapped(bh)) {
				2864	unlock_buffer(bh);
				2865	return -EIO;
				2866	}
				2867
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2868	get_bh(bh);
				2869	bh->b_end_io = end_buffer_write_sync;
Ritesh Harjani (IBM)	ab62062	2022-08-18 10:34:39 +0530	[diff] [blame]	2870	submit_bh(REQ_OP_WRITE \| op_flags, bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2871	wait_on_buffer(bh);
Ritesh Harjani (IBM)	ab62062	2022-08-18 10:34:39 +0530	[diff] [blame]	2872	if (!buffer_uptodate(bh))
				2873	return -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2874	} else {
				2875	unlock_buffer(bh);
				2876	}
Ritesh Harjani (IBM)	ab62062	2022-08-18 10:34:39 +0530	[diff] [blame]	2877	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2878	}
Christoph Hellwig	87e9951	2010-08-11 17:05:45 +0200	[diff] [blame]	2879	EXPORT_SYMBOL(__sync_dirty_buffer);
				2880
				2881	int sync_dirty_buffer(struct buffer_head *bh)
				2882	{
Christoph Hellwig	70fd761	2016-11-01 07:40:10 -0600	[diff] [blame]	2883	return __sync_dirty_buffer(bh, REQ_SYNC);
Christoph Hellwig	87e9951	2010-08-11 17:05:45 +0200	[diff] [blame]	2884	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2885	EXPORT_SYMBOL(sync_dirty_buffer);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2886
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2887	static inline int buffer_busy(struct buffer_head *bh)
				2888	{
				2889	return atomic_read(&bh->b_count) \|
				2890	(bh->b_state & ((1 << BH_Dirty) \| (1 << BH_Lock)));
				2891	}
				2892
Matthew Wilcox (Oracle)	6439476	2022-05-01 06:39:50 -0400	[diff] [blame]	2893	static bool
				2894	drop_buffers(struct folio folio, struct buffer_head *buffers_to_free)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2895	{
Matthew Wilcox (Oracle)	6439476	2022-05-01 06:39:50 -0400	[diff] [blame]	2896	struct buffer_head *head = folio_buffers(folio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2897	struct buffer_head *bh;
				2898
				2899	bh = head;
				2900	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2901	if (buffer_busy(bh))
				2902	goto failed;
				2903	bh = bh->b_this_page;
				2904	} while (bh != head);
				2905
				2906	do {
				2907	struct buffer_head *next = bh->b_this_page;
				2908
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	2909	if (bh->b_assoc_map)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2910	__remove_assoc_queue(bh);
				2911	bh = next;
				2912	} while (bh != head);
				2913	*buffers_to_free = head;
Matthew Wilcox (Oracle)	6439476	2022-05-01 06:39:50 -0400	[diff] [blame]	2914	folio_detach_private(folio);
				2915	return true;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2916	failed:
Matthew Wilcox (Oracle)	6439476	2022-05-01 06:39:50 -0400	[diff] [blame]	2917	return false;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2918	}
				2919
Matthew Wilcox (Oracle)	b1888d1	2024-04-16 04:17:47 +0100	[diff] [blame]	2920	/**
				2921	* try_to_free_buffers - Release buffers attached to this folio.
				2922	* @folio: The folio.
				2923	*
				2924	* If any buffers are in use (dirty, under writeback, elevated refcount),
				2925	* no buffers will be freed.
				2926	*
				2927	* If the folio is dirty but all the buffers are clean then we need to
				2928	* be sure to mark the folio clean as well. This is because the folio
				2929	* may be against a block device, and a later reattachment of buffers
				2930	* to a dirty folio will set all buffers dirty. Which would corrupt
				2931	* filesystem data on the same device.
				2932	*
				2933	* The same applies to regular filesystem folios: if all the buffers are
				2934	* clean then we set the folio clean and proceed. To do that, we require
				2935	* total exclusion from block_dirty_folio(). That is obtained with
				2936	* i_private_lock.
				2937	*
				2938	* Exclusion against try_to_free_buffers may be obtained by either
				2939	* locking the folio or by holding its mapping's i_private_lock.
				2940	*
				2941	* Context: Process context. @folio must be locked. Will not sleep.
				2942	* Return: true if all buffers attached to this folio were freed.
				2943	*/
Matthew Wilcox (Oracle)	68189fe	2022-05-01 01:08:08 -0400	[diff] [blame]	2944	bool try_to_free_buffers(struct folio *folio)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2945	{
Matthew Wilcox (Oracle)	68189fe	2022-05-01 01:08:08 -0400	[diff] [blame]	2946	struct address_space * const mapping = folio->mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2947	struct buffer_head *buffers_to_free = NULL;
Matthew Wilcox (Oracle)	68189fe	2022-05-01 01:08:08 -0400	[diff] [blame]	2948	bool ret = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2949
Matthew Wilcox (Oracle)	68189fe	2022-05-01 01:08:08 -0400	[diff] [blame]	2950	BUG_ON(!folio_test_locked(folio));
				2951	if (folio_test_writeback(folio))
				2952	return false;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2953
				2954	if (mapping == NULL) { /* can this still happen? */
Matthew Wilcox (Oracle)	6439476	2022-05-01 06:39:50 -0400	[diff] [blame]	2955	ret = drop_buffers(folio, &buffers_to_free);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2956	goto out;
				2957	}
				2958
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	2959	spin_lock(&mapping->i_private_lock);
Matthew Wilcox (Oracle)	6439476	2022-05-01 06:39:50 -0400	[diff] [blame]	2960	ret = drop_buffers(folio, &buffers_to_free);
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	2961
				2962	/*
				2963	* If the filesystem writes its buffers by hand (eg ext3)
Matthew Wilcox (Oracle)	68189fe	2022-05-01 01:08:08 -0400	[diff] [blame]	2964	* then we can have clean buffers against a dirty folio. We
				2965	* clean the folio here; otherwise the VM will never notice
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	2966	* that the filesystem did any IO at all.
				2967	*
				2968	* Also, during truncate, discard_buffer will have marked all
Matthew Wilcox (Oracle)	68189fe	2022-05-01 01:08:08 -0400	[diff] [blame]	2969	* the folio's buffers clean. We discover that here and clean
				2970	* the folio also.
Nick Piggin	87df724	2007-01-30 14:36:27 +1100	[diff] [blame]	2971	*
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	2972	* i_private_lock must be held over this entire operation in order
Matthew Wilcox (Oracle)	e621900	2022-02-09 20:22:12 +0000	[diff] [blame]	2973	* to synchronise against block_dirty_folio and prevent the
Nick Piggin	87df724	2007-01-30 14:36:27 +1100	[diff] [blame]	2974	* dirty bit from being lost.
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	2975	*/
Tejun Heo	11f81be	2015-05-22 17:13:15 -0400	[diff] [blame]	2976	if (ret)
Matthew Wilcox (Oracle)	68189fe	2022-05-01 01:08:08 -0400	[diff] [blame]	2977	folio_cancel_dirty(folio);
Matthew Wilcox (Oracle)	600f111	2023-11-17 21:58:23 +0000	[diff] [blame]	2978	spin_unlock(&mapping->i_private_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2979	out:
				2980	if (buffers_to_free) {
				2981	struct buffer_head *bh = buffers_to_free;
				2982
				2983	do {
				2984	struct buffer_head *next = bh->b_this_page;
				2985	free_buffer_head(bh);
				2986	bh = next;
				2987	} while (bh != buffers_to_free);
				2988	}
				2989	return ret;
				2990	}
				2991	EXPORT_SYMBOL(try_to_free_buffers);
				2992
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2993	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2994	* Buffer-head allocation
				2995	*/
Alexey Dobriyan	68279f9	2023-10-11 19:55:00 +0300	[diff] [blame]	2996	static struct kmem_cache *bh_cachep __ro_after_init;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2997
				2998	/*
				2999	* Once the number of bh's in the machine exceeds this level, we start
				3000	* stripping them in writeback.
				3001	*/
Alexey Dobriyan	68279f9	2023-10-11 19:55:00 +0300	[diff] [blame]	3002	static unsigned long max_buffer_heads __ro_after_init;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3003
				3004	int buffer_heads_over_limit;
				3005
				3006	struct bh_accounting {
				3007	int nr; /* Number of live bh's */
				3008	int ratelimit; /* Limit cacheline bouncing */
				3009	};
				3010
				3011	static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
				3012
				3013	static void recalc_bh_state(void)
				3014	{
				3015	int i;
				3016	int tot = 0;
				3017
Christoph Lameter	ee1be86	2010-12-06 11:40:05 -0600	[diff] [blame]	3018	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3019	return;
Christoph Lameter	c7b9251	2010-12-06 11:16:28 -0600	[diff] [blame]	3020	__this_cpu_write(bh_accounting.ratelimit, 0);
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	3021	for_each_online_cpu(i)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3022	tot += per_cpu(bh_accounting, i).nr;
				3023	buffer_heads_over_limit = (tot > max_buffer_heads);
				3024	}
Christoph Lameter	c7b9251	2010-12-06 11:16:28 -0600	[diff] [blame]	3025
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	3026	struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3027	{
Richard Kennedy	019b4d1	2010-03-10 15:20:33 -0800	[diff] [blame]	3028	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3029	if (ret) {
Christoph Lameter	a35afb8	2007-05-16 22:10:57 -0700	[diff] [blame]	3030	INIT_LIST_HEAD(&ret->b_assoc_buffers);
Thomas Gleixner	f1e67e3	2019-11-18 14:28:24 +0100	[diff] [blame]	3031	spin_lock_init(&ret->b_uptodate_lock);
Christoph Lameter	c7b9251	2010-12-06 11:16:28 -0600	[diff] [blame]	3032	preempt_disable();
				3033	__this_cpu_inc(bh_accounting.nr);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3034	recalc_bh_state();
Christoph Lameter	c7b9251	2010-12-06 11:16:28 -0600	[diff] [blame]	3035	preempt_enable();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3036	}
				3037	return ret;
				3038	}
				3039	EXPORT_SYMBOL(alloc_buffer_head);
				3040
				3041	void free_buffer_head(struct buffer_head *bh)
				3042	{
				3043	BUG_ON(!list_empty(&bh->b_assoc_buffers));
				3044	kmem_cache_free(bh_cachep, bh);
Christoph Lameter	c7b9251	2010-12-06 11:16:28 -0600	[diff] [blame]	3045	preempt_disable();
				3046	__this_cpu_dec(bh_accounting.nr);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3047	recalc_bh_state();
Christoph Lameter	c7b9251	2010-12-06 11:16:28 -0600	[diff] [blame]	3048	preempt_enable();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3049	}
				3050	EXPORT_SYMBOL(free_buffer_head);
				3051
Sebastian Andrzej Siewior	fc4d24c	2016-11-03 15:49:57 +0100	[diff] [blame]	3052	static int buffer_exit_cpu_dead(unsigned int cpu)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3053	{
				3054	int i;
				3055	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
				3056
				3057	for (i = 0; i < BH_LRU_SIZE; i++) {
				3058	brelse(b->bhs[i]);
				3059	b->bhs[i] = NULL;
				3060	}
Christoph Lameter	c7b9251	2010-12-06 11:16:28 -0600	[diff] [blame]	3061	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	3062	per_cpu(bh_accounting, cpu).nr = 0;
Sebastian Andrzej Siewior	fc4d24c	2016-11-03 15:49:57 +0100	[diff] [blame]	3063	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3064	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3065
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date and false,
 * with the buffer locked, if not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	/* Fast path: already uptodate, no locking needed. */
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);

	/* Re-check now that the lock is held. */
	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
				3084
				3085	/**
Zhang Yi	fdee117	2022-09-01 21:34:53 +0800	[diff] [blame]	3086	* __bh_read - Submit read for a locked buffer
Aneesh Kumar K.V	389d1b0	2008-01-28 23:58:26 -0500	[diff] [blame]	3087	* @bh: struct buffer_head
Zhang Yi	fdee117	2022-09-01 21:34:53 +0800	[diff] [blame]	3088	* @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
				3089	* @wait: wait until reading finish
Aneesh Kumar K.V	389d1b0	2008-01-28 23:58:26 -0500	[diff] [blame]	3090	*
Zhang Yi	fdee117	2022-09-01 21:34:53 +0800	[diff] [blame]	3091	* Returns zero on success or don't wait, and -EIO on error.
Aneesh Kumar K.V	389d1b0	2008-01-28 23:58:26 -0500	[diff] [blame]	3092	*/
Zhang Yi	fdee117	2022-09-01 21:34:53 +0800	[diff] [blame]	3093	int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
Aneesh Kumar K.V	389d1b0	2008-01-28 23:58:26 -0500	[diff] [blame]	3094	{
Zhang Yi	fdee117	2022-09-01 21:34:53 +0800	[diff] [blame]	3095	int ret = 0;
Aneesh Kumar K.V	389d1b0	2008-01-28 23:58:26 -0500	[diff] [blame]	3096
Zhang Yi	fdee117	2022-09-01 21:34:53 +0800	[diff] [blame]	3097	BUG_ON(!buffer_locked(bh));
Aneesh Kumar K.V	389d1b0	2008-01-28 23:58:26 -0500	[diff] [blame]	3098
				3099	get_bh(bh);
				3100	bh->b_end_io = end_buffer_read_sync;
Zhang Yi	fdee117	2022-09-01 21:34:53 +0800	[diff] [blame]	3101	submit_bh(REQ_OP_READ \| op_flags, bh);
				3102	if (wait) {
				3103	wait_on_buffer(bh);
				3104	if (!buffer_uptodate(bh))
				3105	ret = -EIO;
				3106	}
				3107	return ret;
Aneesh Kumar K.V	389d1b0	2008-01-28 23:58:26 -0500	[diff] [blame]	3108	}
Zhang Yi	fdee117	2022-09-01 21:34:53 +0800	[diff] [blame]	3109	EXPORT_SYMBOL(__bh_read);
				3110
				3111	/**
				3112	* __bh_read_batch - Submit read for a batch of unlocked buffers
				3113	* @nr: entry number of the buffer batch
				3114	* @bhs: a batch of struct buffer_head
				3115	* @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
				3116	* @force_lock: force to get a lock on the buffer if set, otherwise drops any
				3117	* buffer that cannot lock.
				3118	*
				3119	* Returns zero on success or don't wait, and -EIO on error.
				3120	*/
				3121	void __bh_read_batch(int nr, struct buffer_head *bhs[],
				3122	blk_opf_t op_flags, bool force_lock)
				3123	{
				3124	int i;
				3125
				3126	for (i = 0; i < nr; i++) {
				3127	struct buffer_head *bh = bhs[i];
				3128
				3129	if (buffer_uptodate(bh))
				3130	continue;
				3131
				3132	if (force_lock)
				3133	lock_buffer(bh);
				3134	else
				3135	if (!trylock_buffer(bh))
				3136	continue;
				3137
				3138	if (buffer_uptodate(bh)) {
				3139	unlock_buffer(bh);
				3140	continue;
				3141	}
				3142
				3143	bh->b_end_io = end_buffer_read_sync;
				3144	get_bh(bh);
				3145	submit_bh(REQ_OP_READ \| op_flags, bh);
				3146	}
				3147	}
				3148	EXPORT_SYMBOL(__bh_read_batch);
Aneesh Kumar K.V	389d1b0	2008-01-28 23:58:26 -0500	[diff] [blame]	3149
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3150	void __init buffer_init(void)
				3151	{
Zhang Yanfei	43be594	2013-02-22 16:35:46 -0800	[diff] [blame]	3152	unsigned long nrpages;
Sebastian Andrzej Siewior	fc4d24c	2016-11-03 15:49:57 +0100	[diff] [blame]	3153	int ret;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3154
Kunwu Chan	de8a320	2024-01-16 17:11:37 +0800	[diff] [blame]	3155	bh_cachep = KMEM_CACHE(buffer_head,
Chengming Zhou	c997d68	2024-02-24 13:53:15 +0000	[diff] [blame]	3156	SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3157	/*
				3158	* Limit the bh occupancy to 10% of ZONE_NORMAL
				3159	*/
				3160	nrpages = (nr_free_buffer_pages() * 10) / 100;
				3161	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
Sebastian Andrzej Siewior	fc4d24c	2016-11-03 15:49:57 +0100	[diff] [blame]	3162	ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
				3163	NULL, buffer_exit_cpu_dead);
				3164	WARN_ON(ret < 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3165	}