// SPDX-License-Identifier: GPL-2.0-or-later
/* vnode and volume validity verification.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include "internal.h"

/*
 * Data validation is managed through a number of mechanisms from the server:
 *
 * (1) On first contact with a server (such as if it has just been rebooted),
 *     the server sends us a CB.InitCallBackState* request.
 *
 * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
 *     calls, the server maintains a time-limited per-vnode promise that it
 *     will send us a CB.CallBack request if a third party alters the vnodes
 *     accessed.
 *
 *     Note that vnode-level callbacks may also be sent for other reasons,
 *     such as filelock release.
 *
 * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
 *     calls, each server maintains a time-limited per-volume promise that it
 *     will send us a CB.CallBack request if the RO volume is updated to a
 *     snapshot of the RW volume ("vos release"). This is an atomic event
 *     that cuts over all instances of the RO volume across multiple servers
 *     simultaneously.
 *
 *     Note that volume-level callbacks may also be sent for other reasons,
 *     such as the volumeserver taking over control of the volume from the
 *     fileserver.
 *
 *     Note also that each server maintains an independent time limit on an
 *     independent callback.
 *
 * (4) Certain RPC calls include a volume information record "VolSync" in
 *     their reply. This contains a creation date for the volume that should
 *     remain unchanged for a RW volume (but will be changed if the volume is
 *     restored from backup) or will be bumped to the time of snapshotting
 *     when a RO volume is released.
 *
 * In order to track these events, the following are provided:
 *
 *	->cb_v_break. A counter of events that might mean that the contents of
 *	a volume have been altered since we last checked a vnode.
 *
 *	->cb_v_check. A counter of the number of events that we've sent a
 *	query to the server for. Everything's up to date if this equals
 *	cb_v_break.
 *
 *	->cb_scrub. A counter of the number of regression events for which we
 *	have to completely wipe the cache.
 *
 *	->cb_ro_snapshot. A counter of the number of times that we've
 *	recognised that a RO volume has been updated.
 *
 *	->cb_break. A counter of events that might mean that the contents of a
 *	vnode have been altered.
 *
 *	->cb_expires_at. The time at which the callback promise expires or
 *	AFS_NO_CB_PROMISE if we have no promise.
 *
 * The way we manage things is:
 *
 * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
 *     the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
 *     volume and volume's server record.
 *
 * (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
 *     callback break on all the volumes that have been using that server
 *     (ie. increment ->cb_v_break and reset ->cb_expires_at).
 *
 * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
 *     vnode and reset its ->cb_expires_at. If the vnode is mmapped, we also
 *     dispatch a work item to unmap all PTEs to the vnode's pagecache to
 *     force reentry to the filesystem for revalidation.
 *
 * (4) When entering the filesystem, we call afs_validate() to check the
 *     validity of a vnode. This first checks to see if ->cb_v_check and
 *     ->cb_v_break match, and if they don't, we lock volume->cb_check_lock
 *     exclusively and perform an FS.FetchStatus on the vnode.
 *
 *     After checking the volume, we check the vnode. If there's a mismatch
 *     between the volume counters and the vnode's mirrors of those counters,
 *     we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
 *
 * (5) When the reply from FS.FetchStatus arrives, the VolSync record is
 *     parsed:
 *
 *     (A) If the Creation timestamp has changed on a RW volume or regressed
 *         on a RO volume, we try to increment ->cb_scrub; if it advances on a
 *         RO volume, we assume "vos release" happened and try to increment
 *         ->cb_ro_snapshot.
 *
 *     (B) If the Update timestamp has regressed, we try to increment
 *         ->cb_scrub.
 *
 *     Note that in both of these cases, we only do the increment if we can
 *     cmpxchg the value of the timestamp from the value we noted before the
 *     op. This tries to prevent parallel ops from fighting one another (see
 *     the illustrative sketch following this comment).
 *
 *     volume->cb_v_check is then set to ->cb_v_break.
 *
 * (6) The AFSCallBack record included in the FS.FetchStatus reply is also
 *     parsed and used to set the promise in ->cb_expires_at for the vnode,
 *     the volume and the volume's server record.
 *
 * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
 *     the vnode.
 */
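
/*
 * Illustrative sketch only, not called from this file: the sampled-value
 * advance idiom that the notes above refer to. A counter is only advanced
 * from the value sampled before the operation started, so when completions
 * race, at most one cmpxchg succeeds and the counter can neither regress nor
 * double-count.
 */
static inline void afs_advance_counter_sketch(atomic_t *counter,
					      unsigned int sampled,
					      unsigned int latest)
{
	/* A stale 'sampled' value simply makes this a no-op. */
	if (sampled < latest)
		atomic_cmpxchg(counter, sampled, latest);
}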

/*
 * Check the validity of a vnode/inode and its parent volume.
 */
bool afs_check_validity(const struct afs_vnode *vnode)
{
	const struct afs_volume *volume = vnode->volume;
	time64_t deadline = ktime_get_real_seconds() + 10;

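	/* The vnode is stale if we've yet to query the server about a volume
	 * break, either callback promise is within 10s of expiring, the vnode
	 * hasn't caught up with a RO-snapshot or scrub event on the volume or
	 * an explicit zap has been requested.
	 */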
	if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
	    atomic64_read(&vnode->cb_expires_at) <= deadline ||
	    volume->cb_expires_at <= deadline ||
	    vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) ||
	    vnode->cb_scrub != atomic_read(&volume->cb_scrub) ||
	    test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
		_debug("inval");
		return false;
	}

	return true;
}

/*
 * See if the server we've just talked to is currently excluded.
 */
static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	const struct afs_server_entry *se;
	const struct afs_server_list *slist;
	bool is_excluded = true;
	int i;

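	/* The volume's server list may be replaced whilst we're looking at it,
	 * so pin the current generation with the RCU read lock as we walk it.
	 */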
	rcu_read_lock();

	slist = rcu_dereference(volume->servers);
	for (i = 0; i < slist->nr_servers; i++) {
		se = &slist->servers[i];
		if (op->server == se->server) {
			is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
			break;
		}
	}

	rcu_read_unlock();
	return is_excluded;
}

/*
 * Update the volume's server list when the creation time changes and see if
 * the server we've just talked to is currently excluded.
 */
static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	int ret;

	if (__afs_is_server_excluded(op, volume))
		return 1;

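	/* The server isn't currently marked excluded, but our list may be
	 * stale: force a refresh from the VL server and then look again.
	 */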
	set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
	ret = afs_check_volume_status(op->volume, op);
	if (ret < 0)
		return ret;

	return __afs_is_server_excluded(op, volume);
}

/*
 * Handle a change to the volume creation time in the VolSync record.
 */
static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
{
	unsigned int snap;
	time64_t cur = volume->creation_time;
	time64_t old = op->pre_volsync.creation;
	time64_t new = op->volsync.creation;
	int ret;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

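	/* TIME64_MIN means we've not recorded a creation time yet; take the
	 * server's value on trust.
	 */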
	if (cur == TIME64_MIN) {
		volume->creation_time = new;
		return 0;
	}

	if (new == cur)
		return 0;

	/* Try to advance the creation timestamp from what we had before the
	 * operation to what we got back from the server. This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur != old)
		return 0;

	/* If the creation time changes in an unexpected way, we need to scrub
	 * our caches. For a RW vol, this will only change if the volume is
	 * restored from a backup; for a RO/Backup vol, this will advance when
	 * the volume is updated to a new snapshot (eg. "vos release").
	 */
	if (volume->type == AFSVL_RWVOL)
		goto regressed;
	if (volume->type == AFSVL_BACKVOL) {
		if (new < old)
			goto regressed;
		goto advance;
	}

	/* We have an RO volume, we need to query the VL server and look at the
	 * server flags to see if RW->RO replication is in progress.
	 */
	ret = afs_is_server_excluded(op, volume);
	if (ret < 0)
		return ret;
	if (ret > 0) {
		snap = atomic_read(&volume->cb_ro_snapshot);
		trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
		return ret;
	}

advance:
	snap = atomic_inc_return(&volume->cb_ro_snapshot);
	trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
	volume->creation_time = new;
	return 0;

regressed:
	atomic_inc(&volume->cb_scrub);
	trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
	volume->creation_time = new;
	return 0;
}

/*
 * Handle a change to the volume update time in the VolSync record.
 */
static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
{
	enum afs_cb_break_reason reason = afs_cb_break_no_break;
	time64_t cur = volume->update_time;
	time64_t old = op->pre_volsync.update;
	time64_t new = op->volsync.update;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

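	/* As with the creation time, TIME64_MIN means nothing recorded yet. */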
	if (cur == TIME64_MIN) {
		volume->update_time = new;
		return;
	}

	if (new == cur)
		return;

	/* If the volume update time changes in an unexpected way, we need to
	 * scrub our caches. For a RW vol, this will advance on every
	 * modification op; for a RO/Backup vol, this will advance when the
	 * volume is updated to a new snapshot (eg. "vos release").
	 */
	if (new < old)
		reason = afs_cb_break_for_update_regress;

	/* Try to advance the update timestamp from what we had before the
	 * operation to what we got back from the server. This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur == old) {
		if (reason == afs_cb_break_for_update_regress) {
			atomic_inc(&volume->cb_scrub);
			trace_afs_cb_v_break(volume->vid, 0, reason);
		}
		volume->update_time = new;
	}
}

static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
{
	int ret = 0;

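	/* Unlocked fast path: bail if neither timestamp has changed. */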
	if (likely(op->volsync.creation == volume->creation_time &&
		   op->volsync.update == volume->update_time))
		return 0;

	mutex_lock(&volume->volsync_lock);
	if (op->volsync.creation != volume->creation_time) {
		ret = afs_update_volume_creation_time(op, volume);
		if (ret < 0)
			goto out;
	}
	if (op->volsync.update != volume->update_time)
		afs_update_volume_update_time(op, volume);
out:
	mutex_unlock(&volume->volsync_lock);
	return ret;
}

/*
 * Update the state of a volume, including recording the expiration time of
 * the callback promise. Returns 1 to redo the operation from the start.
 */
int afs_update_volume_state(struct afs_operation *op)
{
	struct afs_server_list *slist = op->server_list;
	struct afs_server_entry *se = &slist->servers[op->server_index];
	struct afs_callback *cb = &op->file[0].scb.callback;
	struct afs_volume *volume = op->volume;
	unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
	unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
	int ret;

	_enter("%llx", op->volume->vid);

	if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
		ret = afs_update_volume_times(op, volume);
		if (ret != 0) {
			_leave(" = %d", ret);
			return ret;
		}
	}

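	/* Don't record a new callback promise if a volume break arrived whilst
	 * the op was in flight as the promise we got back may then be stale.
	 */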
	if (op->cb_v_break == cb_v_break &&
	    (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
		time64_t expires_at = cb->expires_at;

		if (!op->file[0].scb.have_cb)
			expires_at = op->file[1].scb.callback.expires_at;

		se->cb_expires_at = expires_at;
		volume->cb_expires_at = expires_at;
	}
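	/* Note that we've now asked the server about all breaks up to the one
	 * sampled when the op was set up. Advance-only, via cmpxchg from the
	 * sampled value, so racing completions can't move cb_v_check backwards.
	 */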
	if (cb_v_check < op->cb_v_break)
		atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
	return 0;
}

/*
 * mark the data attached to an inode as obsolete due to a write on the server
 * - might also want to ditch all the outstanding writes and dirty pages
 */
static void afs_zap_data(struct afs_vnode *vnode)
{
	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);

	afs_invalidate_cache(vnode, 0);

	/* nuke all the non-dirty pages that aren't locked, mapped or being
	 * written back in a regular file and completely discard the pages in a
	 * directory or symlink */
	if (S_ISREG(vnode->netfs.inode.i_mode))
		invalidate_remote_inode(&vnode->netfs.inode);
	else
		invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
}

/*
 * validate a vnode/inode
 * - there are several things we need to check
 *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
 *     symlink)
 *   - parent dir metadata changed (security changes)
 *   - dentry data changed (write, truncate)
 *   - dentry metadata changed (security changes)
 */
int afs_validate(struct afs_vnode *vnode, struct key *key)
{
	struct afs_volume *volume = vnode->volume;
	unsigned int cb_ro_snapshot, cb_scrub;
	time64_t deadline = ktime_get_real_seconds() + 10;
	bool zap = false, locked_vol = false;
	int ret;

	_enter("{v={%llx:%llu} fl=%lx},%x",
	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
	       key_serial(key));

	if (afs_check_validity(vnode))
		return 0;

	ret = down_write_killable(&vnode->validate_lock);
	if (ret < 0)
		goto error;

	/* Validate a volume after the v_break has changed or the volume
	 * callback expired. We only want to do this once per volume per
	 * v_break change. The actual work will be done when parsing the
	 * status fetch reply.
	 */
	if (volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
		ret = mutex_lock_interruptible(&volume->cb_check_lock);
		if (ret < 0)
			goto error_unlock;
		locked_vol = true;
	}

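	/* If the volume has moved to a new RO snapshot or needs scrubbing,
	 * knock out all PTEs mapping the pagecache so that mmap faults have to
	 * re-enter the filesystem and revalidate.
	 */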
	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub != cb_scrub)
		unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);

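	/* Fetch the status if anything looks stale: the vnode's view of the
	 * volume, an unqueried volume break or either callback promise.
	 */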
	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub != cb_scrub ||
	    volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
	    atomic64_read(&vnode->cb_expires_at) <= deadline
	    ) {
		ret = afs_fetch_status(vnode, key, false, NULL);
		if (ret < 0) {
			if (ret == -ENOENT) {
				set_bit(AFS_VNODE_DELETED, &vnode->flags);
				ret = -ESTALE;
			}
			goto error_unlock;
		}

		_debug("new promise [fl=%lx]", vnode->flags);
	}

	/* We can drop the volume lock now as the volume-level work is done. */
	if (locked_vol) {
		mutex_unlock(&volume->cb_check_lock);
		locked_vol = false;
	}

	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	_debug("vnode inval %x==%x %x==%x",
	       vnode->cb_ro_snapshot, cb_ro_snapshot,
	       vnode->cb_scrub, cb_scrub);
	if (vnode->cb_scrub != cb_scrub)
		zap = true;
	vnode->cb_ro_snapshot = cb_ro_snapshot;
	vnode->cb_scrub = cb_scrub;

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
		_debug("file already deleted");
		ret = -ESTALE;
		goto error_unlock;
	}

	/* if the vnode's data version number changed then its contents are
	 * different */
	zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
	if (zap)
		afs_zap_data(vnode);
	up_write(&vnode->validate_lock);
	_leave(" = 0");
	return 0;

error_unlock:
	if (locked_vol)
		mutex_unlock(&volume->cb_check_lock);
	up_write(&vnode->validate_lock);
error:
	_leave(" = %d", ret);
	return ret;
}