fs/afs/validation.c - linux - Git at Google

 // SPDX-License-Identifier: GPL-2.0-or-later
 /* vnode and volume validity verification.
  *
  * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  */

 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include "internal.h"

 /*
  * Data validation is managed through a number of mechanisms from the server:
  *
  *  (1) On first contact with a server (such as if it has just been rebooted),
  *      the server sends us a CB.InitCallBackState* request.
  *
  *  (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
  *      calls, the server maintains a time-limited per-vnode promise that it
  *      will send us a CB.CallBack request if a third party alters the vnodes
  *      accessed.
  *
  *      Note that vnode-level callbacks may also be sent for other reasons,
  *      such as filelock release.
  *
  *  (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
  *      calls, each server maintains a time-limited per-volume promise that it
  *      will send us a CB.CallBack request if the RO volume is updated to a
  *      snapshot of the RW volume ("vos release").  This is an atomic event
  *      that cuts over all instances of the RO volume across multiple servers
  *      simultaneously.
  *
  *	Note that volume-level callbacks may also be sent for other reasons,
  *	such as the volumeserver taking over control of the volume from the
  *	fileserver.
  *
  *	Note also that each server maintains an independent time limit on an
  *	independent callback.
  *
  *  (4) Certain RPC calls include a volume information record "VolSync" in
  *      their reply.  This contains a creation date for the volume that should
  *      remain unchanged for a RW volume (but will be changed if the volume is
  *      restored from backup) or will be bumped to the time of snapshotting
  *      when a RO volume is released.
  *
  * In order to track these events, the following are provided:
  *
  *	->cb_v_break.  A counter of events that might mean that the contents of
  *	a volume have been altered since we last checked a vnode.
  *
  *	->cb_v_check.  A counter of the number of events that we've sent a
  *	query to the server for.  Everything's up to date if this equals
  *	cb_v_break.
  *
  *	->cb_scrub.  A counter of the number of regression events for which we
  *	have to completely wipe the cache.
  *
  *	->cb_ro_snapshot.  A counter of the number of times that we've
  *      recognised that a RO volume has been updated.
  *
  *	->cb_break.  A counter of events that might mean that the contents of a
  *      vnode have been altered.
  *
  *	->cb_expires_at.  The time at which the callback promise expires or
  *      AFS_NO_CB_PROMISE if we have no promise.
  *
  * The way we manage things is:
  *
  *  (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
  *      the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
  *      volume and volume's server record.
  *
  *  (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
  *	callback break on all the volumes that have been using that server
  *	(ie. increment ->cb_v_break and reset ->cb_expires_at).
  *
  *  (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
  *	vnode and reset its ->cb_expires_at.  If the vnode is mmapped, we also
  *	dispatch a work item to unmap all PTEs to the vnode's pagecache to
  *	force reentry to the filesystem for revalidation.
  *
  *  (4) When entering the filesystem, we call afs_validate() to check the
  *	validity of a vnode.  This first checks to see if ->cb_v_check and
  *	->cb_v_break match, and if they don't, we lock volume->cb_check_lock
  *	exclusively and perform an FS.FetchStatus on the vnode.
  *
  *	After checking the volume, we check the vnode.  If there's a mismatch
  *	between the volume counters and the vnode's mirrors of those counters,
  *	we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
  *
  *  (5) When the reply from FS.FetchStatus arrives, the VolSync record is
  *      parsed:
  *
  *	(A) If the Creation timestamp has changed on a RW volume or regressed
  *	    on a RO volume, we try to increment ->cb_scrub; if it advances on a
  *	    RO volume, we assume "vos release" happened and try to increment
  *	    ->cb_ro_snapshot.
  *
  *      (B) If the Update timestamp has regressed, we try to increment
  *	    ->cb_scrub.
  *
  *      Note that in both of these cases, we only do the increment if we can
  *      cmpxchg the value of the timestamp from the value we noted before the
  *      op.  This tries to prevent parallel ops from fighting one another.
  *
  *	volume->cb_v_check is then set to ->cb_v_break.
  *
  *  (6) The AFSCallBack record included in the FS.FetchStatus reply is also
  *	parsed and used to set the promise in ->cb_expires_at for the vnode,
  *	the volume and the volume's server record.
  *
  *  (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
  *      the vnode.
  */

 /*
  * Lockless check of whether a vnode and its parent volume still look valid.
  *
  * Returns false (after logging "inval") if any of the following hold, tested
  * in this order: the volume's ->cb_v_check differs from ->cb_v_break, the
  * vnode's or the volume's callback promise expires no later than ten seconds
  * from now, the vnode's snapshot or scrub counter differs from the volume's,
  * or AFS_VNODE_ZAP_DATA is set on the vnode.  Otherwise returns true.
  */
 bool afs_check_validity(const struct afs_vnode *vnode)
 {
 	const struct afs_volume *vol = vnode->volume;
 	time64_t limit = ktime_get_real_seconds() + 10;

 	if (atomic_read(&vol->cb_v_check) != atomic_read(&vol->cb_v_break))
 		goto invalid;
 	if (atomic64_read(&vnode->cb_expires_at) <= limit)
 		goto invalid;
 	if (vol->cb_expires_at <= limit)
 		goto invalid;
 	if (vnode->cb_ro_snapshot != atomic_read(&vol->cb_ro_snapshot))
 		goto invalid;
 	if (vnode->cb_scrub != atomic_read(&vol->cb_scrub))
 		goto invalid;
 	if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
 		goto invalid;

 	return true;

 invalid:
 	_debug("inval");
 	return false;
 }

 /*
  * Look up the server used by the operation in the volume's current server
  * list (under the RCU read lock) and report whether its entry carries the
  * AFS_SE_EXCLUDED flag.  If the server is not in the list at all, it is
  * reported as excluded.
  */
 static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
 {
 	const struct afs_server_list *list;
 	bool excluded = true;
 	int ix;

 	rcu_read_lock();

 	list = rcu_dereference(volume->servers);
 	for (ix = 0; ix < list->nr_servers; ix++) {
 		const struct afs_server_entry *entry = &list->servers[ix];

 		if (op->server != entry->server)
 			continue;

 		/* Found our server; its flag decides the answer. */
 		excluded = test_bit(AFS_SE_EXCLUDED, &entry->flags);
 		break;
 	}

 	rcu_read_unlock();
 	return excluded;
 }

 /*
  * Decide whether the server we've just talked to is excluded, refreshing the
  * volume's status first if it doesn't currently appear to be.
  *
  * Returns 1 if the server is excluded, 0 if it is not, or a negative error
  * code if afs_check_volume_status() failed.
  */
 static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
 {
 	int err;

 	if (!__afs_is_server_excluded(op, volume)) {
 		/* Not excluded according to the list we hold: flag the volume
 		 * as needing an update, recheck its status and look again.
 		 */
 		set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
 		err = afs_check_volume_status(op->volume, op);
 		if (err < 0)
 			return err;

 		return __afs_is_server_excluded(op, volume) ? 1 : 0;
 	}

 	return 1;
 }

 /*
  * Handle a change to the volume creation time in the VolSync record.
  *
  * Compares the creation time returned by the server (op->volsync.creation)
  * with the value cached on the volume and with the value noted before the
  * operation (op->pre_volsync.creation), then bumps ->cb_ro_snapshot or
  * ->cb_scrub as appropriate.  The caller in this file,
  * afs_update_volume_times(), invokes this with volume->volsync_lock held.
  *
  * Returns 0 if there is nothing more to do, a negative error code if
  * afs_is_server_excluded() failed, or a positive value if the server for a
  * RO volume is currently excluded (the cached creation time is then left
  * untouched).
  */
 static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
 {
 	unsigned int snap;
 	time64_t cur = volume->creation_time;
 	time64_t old = op->pre_volsync.creation;
 	time64_t new = op->volsync.creation;
 	int ret;

 	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

 	/* The cached value is still TIME64_MIN (presumably "not yet recorded"
 	 * - confirm against the volume initialisation code); just adopt the
 	 * server's value without counting it as an event.
 	 */
 	if (cur == TIME64_MIN) {
 		volume->creation_time = new;
 		return 0;
 	}

 	/* The cached value already matches the reply. */
 	if (new == cur)
 		return 0;

 	/* Try to advance the creation timestamp from what we had before the
 	 * operation to what we got back from the server.  This should
 	 * hopefully ensure that in a race between multiple operations only one
 	 * of them will do this.
 	 */
 	if (cur != old)
 		return 0;

 	/* If the creation time changes in an unexpected way, we need to scrub
 	 * our caches.  For a RW vol, this will only change if the volume is
 	 * restored from a backup; for a RO/Backup vol, this will advance when
 	 * the volume is updated to a new snapshot (eg. "vos release").
 	 */
 	if (volume->type == AFSVL_RWVOL)
 		goto regressed;	/* Any change on a RW volume means a scrub */
 	if (volume->type == AFSVL_BACKVOL) {
 		/* Backup volume: going backwards means a scrub; anything else
 		 * is counted as a new snapshot.
 		 */
 		if (new < old)
 			goto regressed;
 		goto advance;
 	}

 	/* We have an RO volume, we need to query the VL server and look at the
 	 * server flags to see if RW->RO replication is in progress.
 	 */
 	ret = afs_is_server_excluded(op, volume);
 	if (ret < 0)
 		return ret;
 	if (ret > 0) {
 		/* The server is excluded: trace the current snapshot counter
 		 * and pass the positive value up without changing any state.
 		 */
 		snap = atomic_read(&volume->cb_ro_snapshot);
 		trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
 		return ret;
 	}

 advance:
 	/* Count a new snapshot and record the new creation time. */
 	snap = atomic_inc_return(&volume->cb_ro_snapshot);
 	trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
 	volume->creation_time = new;
 	return 0;

 regressed:
 	/* Count a scrub event and record the new creation time. */
 	atomic_inc(&volume->cb_scrub);
 	trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
 	volume->creation_time = new;
 	return 0;
 }

 /*
  * Handle a change to the volume update time in the VolSync record.
  *
  * The cached ->update_time is only replaced if it still holds the value that
  * was noted before the operation (or has never been set); if the new time is
  * also earlier than that noted value, ->cb_scrub is bumped first.
  */
 static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
 {
 	time64_t cur = volume->update_time;
 	time64_t old = op->pre_volsync.update;
 	time64_t new = op->volsync.update;

 	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

 	/* First value seen for this volume: just store it. */
 	if (cur == TIME64_MIN) {
 		volume->update_time = new;
 		return;
 	}

 	/* Nothing changed. */
 	if (new == cur)
 		return;

 	/* Only the operation that saw the value now cached gets to move it
 	 * on; this should hopefully ensure that in a race between multiple
 	 * operations only one of them will do this.
 	 */
 	if (cur != old)
 		return;

 	/* For a RW vol, the update time advances on every modification op;
 	 * for a RO/Backup vol, it advances when the volume is updated to a new
 	 * snapshot (eg. "vos release").  Going backwards is unexpected and
 	 * means we need to scrub our caches.
 	 */
 	if (new < old) {
 		atomic_inc(&volume->cb_scrub);
 		trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_update_regress);
 	}

 	volume->update_time = new;
 }

 /*
  * Reconcile the creation and update times in the operation's VolSync record
  * with those cached on the volume, taking volume->volsync_lock only if at
  * least one of them differs.
  *
  * Returns 0 if nothing needed doing, otherwise whatever
  * afs_update_volume_creation_time() returned (0 if it was not called).
  */
 static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
 {
 	int ret = 0;

 	if (unlikely(op->volsync.creation != volume->creation_time ||
 		     op->volsync.update != volume->update_time)) {
 		mutex_lock(&volume->volsync_lock);

 		/* Recheck each timestamp now that we hold the lock. */
 		if (op->volsync.creation != volume->creation_time)
 			ret = afs_update_volume_creation_time(op, volume);

 		/* A failure above skips the update-time handling. */
 		if (ret >= 0 && op->volsync.update != volume->update_time)
 			afs_update_volume_update_time(op, volume);

 		mutex_unlock(&volume->volsync_lock);
 	}

 	return ret;
 }

 /*
  * Bring a volume's state up to date after an operation: process the VolSync
  * timestamps, record the callback promise expiry on the volume and on the
  * server entry used, and advance ->cb_v_check.
  *
  * Returns 0 on success or the non-zero result of afs_update_volume_times();
  * a return of 1 means the operation should be redone from the start.
  */
 int afs_update_volume_state(struct afs_operation *op)
 {
 	struct afs_volume *volume = op->volume;
 	struct afs_server_list *slist = op->server_list;
 	struct afs_server_entry *se = &slist->servers[op->server_index];
 	unsigned int v_break_now = atomic_read(&volume->cb_v_break);
 	unsigned int v_check_now = atomic_read(&volume->cb_v_check);
 	int ret;

 	_enter("%llx", op->volume->vid);

 	/* Only look at the timestamps if the reply supplied at least one. */
 	if (!(op->volsync.creation == TIME64_MIN && op->volsync.update == TIME64_MIN)) {
 		ret = afs_update_volume_times(op, volume);
 		if (ret != 0) {
 			_leave(" = %d", ret);
 			return ret;
 		}
 	}

 	/* Record the promise only if no volume break has been counted since
 	 * the operation sampled ->cb_v_break and a callback was returned for
 	 * at least one of the two files.  The first file's callback is
 	 * preferred.
 	 */
 	if (op->cb_v_break == v_break_now &&
 	    (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
 		time64_t expiry;

 		if (op->file[0].scb.have_cb)
 			expiry = op->file[0].scb.callback.expires_at;
 		else
 			expiry = op->file[1].scb.callback.expires_at;

 		se->cb_expires_at = expiry;
 		volume->cb_expires_at = expiry;
 	}

 	/* Move ->cb_v_check forward, unless someone else changed it first. */
 	if (op->cb_v_break > v_check_now)
 		atomic_cmpxchg(&volume->cb_v_check, v_check_now, op->cb_v_break);
 	return 0;
 }

 /*
  * mark the data attached to an inode as obsolete due to a write on the server
  * - might also want to ditch all the outstanding writes and dirty pages
  */
 static void afs_zap_data(struct afs_vnode *vnode)
 {
 	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);

 	afs_invalidate_cache(vnode, 0);

 	/* nuke all the non-dirty pages that aren't locked, mapped or being
 	 * written back in a regular file and completely discard the pages in a
 	 * directory or symlink */
 	if (S_ISREG(vnode->netfs.inode.i_mode))
 		invalidate_remote_inode(&vnode->netfs.inode);
 	else
 		invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
 }

 /*
  * validate a vnode/inode
  * - there are several things we need to check
  *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
  *     symlink)
  *   - parent dir metadata changed (security changes)
  *   - dentry data changed (write, truncate)
  *   - dentry metadata changed (security changes)
  *
  * Returns 0 if the vnode is (now) valid, -ESTALE if the vnode is marked
  * deleted or the status fetch reported -ENOENT, or another negative error
  * from acquiring a lock or from afs_fetch_status().
  *
  * Takes vnode->validate_lock for writing and, if the volume itself needs
  * checking, volume->cb_check_lock as well; both are released before
  * returning.
  */
 int afs_validate(struct afs_vnode *vnode, struct key *key)
 {
 	struct afs_volume *volume = vnode->volume;
 	unsigned int cb_ro_snapshot, cb_scrub;
 	time64_t deadline = ktime_get_real_seconds() + 10;
 	bool zap = false, locked_vol = false;
 	int ret;

 	_enter("{v={%llx:%llu} fl=%lx},%x",
 	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
 	       key_serial(key));

 	/* Fast path: nothing to do if everything still looks valid. */
 	if (afs_check_validity(vnode))
 		return 0;

 	ret = down_write_killable(&vnode->validate_lock);
 	if (ret < 0)
 		goto error;

 	/* Validate a volume after the v_break has changed or the volume
 	 * callback expired.  We only want to do this once per volume per
 	 * v_break change.  The actual work will be done when parsing the
 	 * status fetch reply.
 	 */
 	if (volume->cb_expires_at <= deadline ||
 	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
 		ret = mutex_lock_interruptible(&volume->cb_check_lock);
 		if (ret < 0)
 			goto error_unlock;
 		locked_vol = true;
 	}

 	/* If the vnode's mirrors of the volume counters are out of date, drop
 	 * the page mappings before refetching the status.
 	 * NOTE(review): start=0, nr=0 presumably means "to the end of the
 	 * file" - confirm against unmap_mapping_pages().
 	 */
 	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
 	cb_scrub = atomic_read(&volume->cb_scrub);
 	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
 	    vnode->cb_scrub	  != cb_scrub)
 		unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);

 	/* Refetch the status if any of the counters or promises is stale. */
 	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
 	    vnode->cb_scrub	  != cb_scrub ||
 	    volume->cb_expires_at <= deadline ||
 	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
 	    atomic64_read(&vnode->cb_expires_at) <= deadline
 	    ) {
 		ret = afs_fetch_status(vnode, key, false, NULL);
 		if (ret < 0) {
 			/* -ENOENT from the fetch: mark the vnode deleted and
 			 * report it as stale instead.
 			 */
 			if (ret == -ENOENT) {
 				set_bit(AFS_VNODE_DELETED, &vnode->flags);
 				ret = -ESTALE;
 			}
 			goto error_unlock;
 		}

 		_debug("new promise [fl=%lx]", vnode->flags);
 	}

 	/* We can drop the volume lock now that any status fetch has
 	 * completed.
 	 */
 	if (locked_vol) {
 		mutex_unlock(&volume->cb_check_lock);
 		locked_vol = false;
 	}

 	/* Resample the volume counters (the fetch may have changed them) and
 	 * copy them into the vnode; a change in cb_scrub means the cached
 	 * data must be zapped.
 	 */
 	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
 	cb_scrub = atomic_read(&volume->cb_scrub);
 	_debug("vnode inval %x==%x %x==%x",
 	       vnode->cb_ro_snapshot, cb_ro_snapshot,
 	       vnode->cb_scrub, cb_scrub);
 	if (vnode->cb_scrub != cb_scrub)
 		zap = true;
 	vnode->cb_ro_snapshot = cb_ro_snapshot;
 	vnode->cb_scrub = cb_scrub;

 	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
 		_debug("file already deleted");
 		ret = -ESTALE;
 		goto error_unlock;
 	}

 	/* if the vnode's data version number changed then its contents are
 	 * different */
 	zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
 	if (zap)
 		afs_zap_data(vnode);
 	up_write(&vnode->validate_lock);
 	_leave(" = 0");
 	return 0;

 error_unlock:
 	/* Release whichever locks are still held, volume lock first. */
 	if (locked_vol)
 		mutex_unlock(&volume->cb_check_lock);
 	up_write(&vnode->validate_lock);
 error:
 	_leave(" = %d", ret);
 	return ret;
 }
	// SPDX-License-Identifier: GPL-2.0-or-later
	/* vnode and volume validity verification.
	*
	* Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
	* Written by David Howells (dhowells@redhat.com)
	*/

	#include <linux/kernel.h>
	#include <linux/module.h>
	#include <linux/sched.h>
	#include "internal.h"

	/*
	* Data validation is managed through a number of mechanisms from the server:
	*
	* (1) On first contact with a server (such as if it has just been rebooted),
	* the server sends us a CB.InitCallBackState* request.
	*
	* (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
	* calls, the server maintains a time-limited per-vnode promise that it
	* will send us a CB.CallBack request if a third party alters the vnodes
	* accessed.
	*
	* Note that vnode-level callbacks may also be sent for other reasons,
	* such as filelock release.
	*
	* (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
	* calls, each server maintains a time-limited per-volume promise that it
	* will send us a CB.CallBack request if the RO volume is updated to a
	* snapshot of the RW volume ("vos release"). This is an atomic event
	* that cuts over all instances of the RO volume across multiple servers
	* simultaneously.
	*
	* Note that volume-level callbacks may also be sent for other reasons,
	* such as the volumeserver taking over control of the volume from the
	* fileserver.
	*
	* Note also that each server maintains an independent time limit on an
	* independent callback.
	*
	* (4) Certain RPC calls include a volume information record "VolSync" in
	* their reply. This contains a creation date for the volume that should
	* remain unchanged for a RW volume (but will be changed if the volume is
	* restored from backup) or will be bumped to the time of snapshotting
	* when a RO volume is released.
	*
	* In order to track these events, the following are provided:
	*
	* ->cb_v_break. A counter of events that might mean that the contents of
	* a volume have been altered since we last checked a vnode.
	*
	* ->cb_v_check. A counter of the number of events that we've sent a
	* query to the server for. Everything's up to date if this equals
	* cb_v_break.
	*
	* ->cb_scrub. A counter of the number of regression events for which we
	* have to completely wipe the cache.
	*
	* ->cb_ro_snapshot. A counter of the number of times that we've
	* recognised that a RO volume has been updated.
	*
	* ->cb_break. A counter of events that might mean that the contents of a
	* vnode have been altered.
	*
	* ->cb_expires_at. The time at which the callback promise expires or
	* AFS_NO_CB_PROMISE if we have no promise.
	*
	* The way we manage things is:
	*
	* (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
	* the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
	* volume and volume's server record.
	*
	* (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
	* callback break on all the volumes that have been using that server
	* (ie. increment ->cb_v_break and reset ->cb_expires_at).
	*
	* (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
	* vnode and reset its ->cb_expires_at. If the vnode is mmapped, we also
	* dispatch a work item to unmap all PTEs to the vnode's pagecache to
	* force reentry to the filesystem for revalidation.
	*
	* (4) When entering the filesystem, we call afs_validate() to check the
	* validity of a vnode. This first checks to see if ->cb_v_check and
	* ->cb_v_break match, and if they don't, we lock volume->cb_check_lock
	* exclusively and perform an FS.FetchStatus on the vnode.
	*
	* After checking the volume, we check the vnode. If there's a mismatch
	* between the volume counters and the vnode's mirrors of those counters,
	* we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
	*
	* (5) When the reply from FS.FetchStatus arrives, the VolSync record is
	* parsed:
	*
	* (A) If the Creation timestamp has changed on a RW volume or regressed
	* on a RO volume, we try to increment ->cb_scrub; if it advances on a
	* RO volume, we assume "vos release" happened and try to increment
	* ->cb_ro_snapshot.
	*
	* (B) If the Update timestamp has regressed, we try to increment
	* ->cb_scrub.
	*
	* Note that in both of these cases, we only do the increment if we can
	* cmpxchg the value of the timestamp from the value we noted before the
	* op. This tries to prevent parallel ops from fighting one another.
	*
	* volume->cb_v_check is then set to ->cb_v_break.
	*
	* (6) The AFSCallBack record included in the FS.FetchStatus reply is also
	* parsed and used to set the promise in ->cb_expires_at for the vnode,
	* the volume and the volume's server record.
	*
	* (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
	* the vnode.
	*/

	/*
	* Check the validity of a vnode/inode and its parent volume.
	*/
	bool afs_check_validity(const struct afs_vnode *vnode)
	{
	const struct afs_volume *volume = vnode->volume;
	time64_t deadline = ktime_get_real_seconds() + 10;

	if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) \|\|
	atomic64_read(&vnode->cb_expires_at) <= deadline \|\|
	volume->cb_expires_at <= deadline \|\|
	vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) \|\|
	vnode->cb_scrub != atomic_read(&volume->cb_scrub) \|\|
	test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
	_debug("inval");
	return false;
	}

	return true;
	}

	/*
	* See if the server we've just talked to is currently excluded.
	*/
	static bool __afs_is_server_excluded(struct afs_operation op, struct afs_volume volume)
	{
	const struct afs_server_entry *se;
	const struct afs_server_list *slist;
	bool is_excluded = true;
	int i;

	rcu_read_lock();

	slist = rcu_dereference(volume->servers);
	for (i = 0; i < slist->nr_servers; i++) {
	se = &slist->servers[i];
	if (op->server == se->server) {
	is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
	break;
	}
	}

	rcu_read_unlock();
	return is_excluded;
	}

	/*
	* Update the volume's server list when the creation time changes and see if
	* the server we've just talked to is currently excluded.
	*/
	static int afs_is_server_excluded(struct afs_operation op, struct afs_volume volume)
	{
	int ret;

	if (__afs_is_server_excluded(op, volume))
	return 1;

	set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
	ret = afs_check_volume_status(op->volume, op);
	if (ret < 0)
	return ret;

	return __afs_is_server_excluded(op, volume);
	}

	/*
	* Handle a change to the volume creation time in the VolSync record.
	*/
	static int afs_update_volume_creation_time(struct afs_operation op, struct afs_volume volume)
	{
	unsigned int snap;
	time64_t cur = volume->creation_time;
	time64_t old = op->pre_volsync.creation;
	time64_t new = op->volsync.creation;
	int ret;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	if (cur == TIME64_MIN) {
	volume->creation_time = new;
	return 0;
	}

	if (new == cur)
	return 0;

	/* Try to advance the creation timestamp from what we had before the
	* operation to what we got back from the server. This should
	* hopefully ensure that in a race between multiple operations only one
	* of them will do this.
	*/
	if (cur != old)
	return 0;

	/* If the creation time changes in an unexpected way, we need to scrub
	* our caches. For a RW vol, this will only change if the volume is
	* restored from a backup; for a RO/Backup vol, this will advance when
	* the volume is updated to a new snapshot (eg. "vos release").
	*/
	if (volume->type == AFSVL_RWVOL)
	goto regressed;
	if (volume->type == AFSVL_BACKVOL) {
	if (new < old)
	goto regressed;
	goto advance;
	}

	/* We have an RO volume, we need to query the VL server and look at the
	* server flags to see if RW->RO replication is in progress.
	*/
	ret = afs_is_server_excluded(op, volume);
	if (ret < 0)
	return ret;
	if (ret > 0) {
	snap = atomic_read(&volume->cb_ro_snapshot);
	trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
	return ret;
	}

	advance:
	snap = atomic_inc_return(&volume->cb_ro_snapshot);
	trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
	volume->creation_time = new;
	return 0;

	regressed:
	atomic_inc(&volume->cb_scrub);
	trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
	volume->creation_time = new;
	return 0;
	}

	/*
	* Handle a change to the volume update time in the VolSync record.
	*/
	static void afs_update_volume_update_time(struct afs_operation op, struct afs_volume volume)
	{
	enum afs_cb_break_reason reason = afs_cb_break_no_break;
	time64_t cur = volume->update_time;
	time64_t old = op->pre_volsync.update;
	time64_t new = op->volsync.update;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	if (cur == TIME64_MIN) {
	volume->update_time = new;
	return;
	}

	if (new == cur)
	return;

	/* If the volume update time changes in an unexpected way, we need to
	* scrub our caches. For a RW vol, this will advance on every
	* modification op; for a RO/Backup vol, this will advance when the
	* volume is updated to a new snapshot (eg. "vos release").
	*/
	if (new < old)
	reason = afs_cb_break_for_update_regress;

	/* Try to advance the update timestamp from what we had before the
	* operation to what we got back from the server. This should
	* hopefully ensure that in a race between multiple operations only one
	* of them will do this.
	*/
	if (cur == old) {
	if (reason == afs_cb_break_for_update_regress) {
	atomic_inc(&volume->cb_scrub);
	trace_afs_cb_v_break(volume->vid, 0, reason);
	}
	volume->update_time = new;
	}
	}

	static int afs_update_volume_times(struct afs_operation op, struct afs_volume volume)
	{
	int ret = 0;

	if (likely(op->volsync.creation == volume->creation_time &&
	op->volsync.update == volume->update_time))
	return 0;

	mutex_lock(&volume->volsync_lock);
	if (op->volsync.creation != volume->creation_time) {
	ret = afs_update_volume_creation_time(op, volume);
	if (ret < 0)
	goto out;
	}
	if (op->volsync.update != volume->update_time)
	afs_update_volume_update_time(op, volume);
	out:
	mutex_unlock(&volume->volsync_lock);
	return ret;
	}

	/*
	 * Update the state of a volume, including recording the expiration time of the
	 * callback promise.  Returns 1 to redo the operation from the start, 0 on
	 * success or a negative error code from afs_update_volume_times().
	 */
	int afs_update_volume_state(struct afs_operation *op)
	{
		struct afs_server_list *slist = op->server_list;
		struct afs_server_entry *se = &slist->servers[op->server_index];
		struct afs_callback *cb = &op->file[0].scb.callback;
		struct afs_volume *volume = op->volume;
		unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
		unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
		int ret;

		_enter("%llx", op->volume->vid);

		/* Only look at the timestamps if the reply supplied at least one. */
		if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
			ret = afs_update_volume_times(op, volume);
			if (ret != 0) {
				_leave(" = %d", ret);
				return ret;
			}
		}

		/* Record the promise only if no volume break has been counted since
		 * the operation sampled ->cb_v_break and a callback was returned for
		 * at least one of the two files; the first file's is preferred.
		 */
		if (op->cb_v_break == cb_v_break &&
		    (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
			time64_t expires_at = cb->expires_at;

			if (!op->file[0].scb.have_cb)
				expires_at = op->file[1].scb.callback.expires_at;

			se->cb_expires_at = expires_at;
			volume->cb_expires_at = expires_at;
		}
		/* Move ->cb_v_check forward, unless someone else changed it first. */
		if (cb_v_check < op->cb_v_break)
			atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
		return 0;
	}

	/*
	* mark the data attached to an inode as obsolete due to a write on the server
	* - might also want to ditch all the outstanding writes and dirty pages
	*/
	static void afs_zap_data(struct afs_vnode *vnode)
	{
	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);

	afs_invalidate_cache(vnode, 0);

	/* nuke all the non-dirty pages that aren't locked, mapped or being
	* written back in a regular file and completely discard the pages in a
	* directory or symlink */
	if (S_ISREG(vnode->netfs.inode.i_mode))
	invalidate_remote_inode(&vnode->netfs.inode);
	else
	invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
	}

	/*
	* validate a vnode/inode
	* - there are several things we need to check
	* - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
	* symlink)
	* - parent dir metadata changed (security changes)
	* - dentry data changed (write, truncate)
	* - dentry metadata changed (security changes)
	*/
	int afs_validate(struct afs_vnode vnode, struct key key)
	{
	struct afs_volume *volume = vnode->volume;
	unsigned int cb_ro_snapshot, cb_scrub;
	time64_t deadline = ktime_get_real_seconds() + 10;
	bool zap = false, locked_vol = false;
	int ret;

	_enter("{v={%llx:%llu} fl=%lx},%x",
	vnode->fid.vid, vnode->fid.vnode, vnode->flags,
	key_serial(key));

	if (afs_check_validity(vnode))
	return 0;

	ret = down_write_killable(&vnode->validate_lock);
	if (ret < 0)
	goto error;

	/* Validate a volume after the v_break has changed or the volume
	* callback expired. We only want to do this once per volume per
	* v_break change. The actual work will be done when parsing the
	* status fetch reply.
	*/
	if (volume->cb_expires_at <= deadline \|\|
	atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
	ret = mutex_lock_interruptible(&volume->cb_check_lock);
	if (ret < 0)
	goto error_unlock;
	locked_vol = true;
	}

	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	if (vnode->cb_ro_snapshot != cb_ro_snapshot \|\|
	vnode->cb_scrub != cb_scrub)
	unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);

	if (vnode->cb_ro_snapshot != cb_ro_snapshot \|\|
	vnode->cb_scrub != cb_scrub \|\|
	volume->cb_expires_at <= deadline \|\|
	atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) \|\|
	atomic64_read(&vnode->cb_expires_at) <= deadline
	) {
	ret = afs_fetch_status(vnode, key, false, NULL);
	if (ret < 0) {
	if (ret == -ENOENT) {
	set_bit(AFS_VNODE_DELETED, &vnode->flags);
	ret = -ESTALE;
	}
	goto error_unlock;
	}

	_debug("new promise [fl=%lx]", vnode->flags);
	}

	/* We can drop the volume lock now as. */
	if (locked_vol) {
	mutex_unlock(&volume->cb_check_lock);
	locked_vol = false;
	}

	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	_debug("vnode inval %x==%x %x==%x",
	vnode->cb_ro_snapshot, cb_ro_snapshot,
	vnode->cb_scrub, cb_scrub);
	if (vnode->cb_scrub != cb_scrub)
	zap = true;
	vnode->cb_ro_snapshot = cb_ro_snapshot;
	vnode->cb_scrub = cb_scrub;

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
	_debug("file already deleted");
	ret = -ESTALE;
	goto error_unlock;
	}

	/* if the vnode's data version number changed then its contents are
	* different */
	zap \|= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
	if (zap)
	afs_zap_data(vnode);
	up_write(&vnode->validate_lock);
	_leave(" = 0");
	return 0;

	error_unlock:
	if (locked_vol)
	mutex_unlock(&volume->cb_check_lock);
	up_write(&vnode->validate_lock);
	error:
	_leave(" = %d", ret);
	return ret;
	}