// SPDX-License-Identifier: GPL-2.0-or-later
/* vnode and volume validity verification.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include "internal.h"

/*
 * Data validation is managed through a number of mechanisms from the server:
 *
 * (1) On first contact with a server (such as if it has just been rebooted),
 *     the server sends us a CB.InitCallBackState* request.
 *
 * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
 *     calls, the server maintains a time-limited per-vnode promise that it
 *     will send us a CB.CallBack request if a third party alters the vnodes
 *     accessed.
 *
 *     Note that vnode-level callbacks may also be sent for other reasons,
 *     such as filelock release.
 *
 * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
 *     calls, each server maintains a time-limited per-volume promise that it
 *     will send us a CB.CallBack request if the RO volume is updated to a
 *     snapshot of the RW volume ("vos release"). This is an atomic event
 *     that cuts over all instances of the RO volume across multiple servers
 *     simultaneously.
 *
 *     Note that volume-level callbacks may also be sent for other reasons,
 *     such as the volumeserver taking over control of the volume from the
 *     fileserver.
 *
 *     Note also that each server maintains an independent time limit on an
 *     independent callback.
 *
 * (4) Certain RPC calls include a volume information record "VolSync" in
 *     their reply. This contains a creation date for the volume that should
 *     remain unchanged for a RW volume (but will be changed if the volume is
 *     restored from backup) or will be bumped to the time of snapshotting
 *     when a RO volume is released.
 *
 * In order to track these events, the following are provided:
 *
 *	->cb_v_break. A counter of events that might mean that the contents of
 *	a volume have been altered since we last checked a vnode.
 *
 *	->cb_v_check. A counter of the number of events that we've sent a
 *	query to the server for. Everything's up to date if this equals
 *	cb_v_break.
 *
 *	->cb_scrub. A counter of the number of regression events for which we
 *	have to completely wipe the cache.
 *
 *	->cb_ro_snapshot. A counter of the number of times that we've
 *	recognised that a RO volume has been updated.
 *
 *	->cb_break. A counter of events that might mean that the contents of a
 *	vnode have been altered.
 *
 *	->cb_expires_at. The time at which the callback promise expires or
 *	AFS_NO_CB_PROMISE if we have no promise.
 *
 * The way we manage things is:
 *
 * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
 *     the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
 *     volume and volume's server record.
 *
 * (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
 *     callback break on all the volumes that have been using that server
 *     (ie. increment ->cb_v_break and reset ->cb_expires_at).
 *
 * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
 *     vnode and reset its ->cb_expires_at. If the vnode is mmapped, we also
 *     dispatch a work item to unmap all PTEs to the vnode's pagecache to
 *     force reentry to the filesystem for revalidation.
 *
 * (4) When entering the filesystem, we call afs_validate() to check the
 *     validity of a vnode. This first checks to see if ->cb_v_check and
 *     ->cb_v_break match, and if they don't, we lock volume->cb_check_lock
 *     exclusively and perform an FS.FetchStatus on the vnode.
 *
 *     After checking the volume, we check the vnode. If there's a mismatch
 *     between the volume counters and the vnode's mirrors of those counters,
 *     we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
 *
 * (5) When the reply from FS.FetchStatus arrives, the VolSync record is
 *     parsed:
 *
 *     (A) If the Creation timestamp has changed on a RW volume or regressed
 *         on a RO volume, we try to increment ->cb_scrub; if it advances on a
 *         RO volume, we assume "vos release" happened and try to increment
 *         ->cb_ro_snapshot.
 *
 *     (B) If the Update timestamp has regressed, we try to increment
 *         ->cb_scrub.
 *
 *     Note that in both of these cases, we only do the increment if we can
 *     cmpxchg the value of the timestamp from the value we noted before the
 *     op. This tries to prevent parallel ops from fighting one another (see
 *     the illustrative sketch following this comment).
 *
 *     volume->cb_v_check is then set to ->cb_v_break.
 *
 * (6) The AFSCallBack record included in the FS.FetchStatus reply is also
 *     parsed and used to set the promise in ->cb_expires_at for the vnode,
 *     the volume and the volume's server record.
 *
 * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
 *     the vnode.
 */
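
/*
 * Illustrative sketch only, not called from this file: the sampled-value
 * advance idiom that the notes above refer to. A counter is only advanced
 * from the value sampled before the operation started, so when completions
 * race, at most one cmpxchg succeeds and the counter can neither regress nor
 * double-count.
 */
static inline void afs_advance_counter_sketch(atomic_t *counter,
					      unsigned int sampled,
					      unsigned int latest)
{
	/* A stale 'sampled' value simply makes this a no-op. */
	if (sampled < latest)
		atomic_cmpxchg(counter, sampled, latest);
}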

/*
 * Check the validity of a vnode/inode and its parent volume.
 */
bool afs_check_validity(const struct afs_vnode *vnode)
{
	const struct afs_volume *volume = vnode->volume;
	time64_t deadline = ktime_get_real_seconds() + 10;

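	/* The vnode is stale if we've yet to query the server about a volume
	 * break, either callback promise is within 10s of expiring, the vnode
	 * hasn't caught up with a RO-snapshot or scrub event on the volume or
	 * an explicit zap has been requested.
	 */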
	if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
	    atomic64_read(&vnode->cb_expires_at) <= deadline ||
	    volume->cb_expires_at <= deadline ||
	    vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) ||
	    vnode->cb_scrub != atomic_read(&volume->cb_scrub) ||
	    test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
		_debug("inval");
		return false;
	}

	return true;
}

/*
 * See if the server we've just talked to is currently excluded.
 */
static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	const struct afs_server_entry *se;
	const struct afs_server_list *slist;
	bool is_excluded = true;
	int i;

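	/* The volume's server list may be replaced whilst we're looking at it,
	 * so pin the current generation with the RCU read lock as we walk it.
	 */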
	rcu_read_lock();

	slist = rcu_dereference(volume->servers);
	for (i = 0; i < slist->nr_servers; i++) {
		se = &slist->servers[i];
		if (op->server == se->server) {
			is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
			break;
		}
	}

	rcu_read_unlock();
	return is_excluded;
}

/*
 * Update the volume's server list when the creation time changes and see if
 * the server we've just talked to is currently excluded.
 */
static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	int ret;

	if (__afs_is_server_excluded(op, volume))
		return 1;

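	/* The server isn't currently marked excluded, but our list may be
	 * stale: force a refresh from the VL server and then look again.
	 */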
	set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
	ret = afs_check_volume_status(op->volume, op);
	if (ret < 0)
		return ret;

	return __afs_is_server_excluded(op, volume);
}

/*
 * Handle a change to the volume creation time in the VolSync record.
 */
static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
{
	unsigned int snap;
	time64_t cur = volume->creation_time;
	time64_t old = op->pre_volsync.creation;
	time64_t new = op->volsync.creation;
	int ret;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

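	/* TIME64_MIN means we've not recorded a creation time yet; take the
	 * server's value on trust.
	 */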
	if (cur == TIME64_MIN) {
		volume->creation_time = new;
		return 0;
	}

	if (new == cur)
		return 0;

	/* Try to advance the creation timestamp from what we had before the
	 * operation to what we got back from the server. This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur != old)
		return 0;

	/* If the creation time changes in an unexpected way, we need to scrub
	 * our caches. For a RW vol, this will only change if the volume is
	 * restored from a backup; for a RO/Backup vol, this will advance when
	 * the volume is updated to a new snapshot (eg. "vos release").
	 */
	if (volume->type == AFSVL_RWVOL)
		goto regressed;
	if (volume->type == AFSVL_BACKVOL) {
		if (new < old)
			goto regressed;
		goto advance;
	}

	/* We have an RO volume, we need to query the VL server and look at the
	 * server flags to see if RW->RO replication is in progress.
	 */
	ret = afs_is_server_excluded(op, volume);
	if (ret < 0)
		return ret;
	if (ret > 0) {
		snap = atomic_read(&volume->cb_ro_snapshot);
		trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
		return ret;
	}

advance:
	snap = atomic_inc_return(&volume->cb_ro_snapshot);
	trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
	volume->creation_time = new;
	return 0;

regressed:
	atomic_inc(&volume->cb_scrub);
	trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
	volume->creation_time = new;
	return 0;
}

/*
 * Handle a change to the volume update time in the VolSync record.
 */
static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
{
	enum afs_cb_break_reason reason = afs_cb_break_no_break;
	time64_t cur = volume->update_time;
	time64_t old = op->pre_volsync.update;
	time64_t new = op->volsync.update;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

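	/* As with the creation time, TIME64_MIN means nothing recorded yet. */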
	if (cur == TIME64_MIN) {
		volume->update_time = new;
		return;
	}

	if (new == cur)
		return;

	/* If the volume update time changes in an unexpected way, we need to
	 * scrub our caches. For a RW vol, this will advance on every
	 * modification op; for a RO/Backup vol, this will advance when the
	 * volume is updated to a new snapshot (eg. "vos release").
	 */
	if (new < old)
		reason = afs_cb_break_for_update_regress;

	/* Try to advance the update timestamp from what we had before the
	 * operation to what we got back from the server. This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur == old) {
		if (reason == afs_cb_break_for_update_regress) {
			atomic_inc(&volume->cb_scrub);
			trace_afs_cb_v_break(volume->vid, 0, reason);
		}
		volume->update_time = new;
	}
}

static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
{
	int ret = 0;

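	/* Unlocked fast path: bail if neither timestamp has changed. */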
	if (likely(op->volsync.creation == volume->creation_time &&
		   op->volsync.update == volume->update_time))
		return 0;

	mutex_lock(&volume->volsync_lock);
	if (op->volsync.creation != volume->creation_time) {
		ret = afs_update_volume_creation_time(op, volume);
		if (ret < 0)
			goto out;
	}
	if (op->volsync.update != volume->update_time)
		afs_update_volume_update_time(op, volume);
out:
	mutex_unlock(&volume->volsync_lock);
	return ret;
}

/*
 * Update the state of a volume, including recording the expiration time of
 * the callback promise. Returns 1 to redo the operation from the start.
 */
int afs_update_volume_state(struct afs_operation *op)
{
	struct afs_server_list *slist = op->server_list;
	struct afs_server_entry *se = &slist->servers[op->server_index];
	struct afs_callback *cb = &op->file[0].scb.callback;
	struct afs_volume *volume = op->volume;
	unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
	unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
	int ret;

	_enter("%llx", op->volume->vid);

	if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
		ret = afs_update_volume_times(op, volume);
		if (ret != 0) {
			_leave(" = %d", ret);
			return ret;
		}
	}

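	/* Don't record a new callback promise if a volume break arrived whilst
	 * the op was in flight as the promise we got back may then be stale.
	 */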
	if (op->cb_v_break == cb_v_break &&
	    (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
		time64_t expires_at = cb->expires_at;

		if (!op->file[0].scb.have_cb)
			expires_at = op->file[1].scb.callback.expires_at;

		se->cb_expires_at = expires_at;
		volume->cb_expires_at = expires_at;
	}
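	/* Note that we've now asked the server about all breaks up to the one
	 * sampled when the op was set up. Advance-only, via cmpxchg from the
	 * sampled value, so racing completions can't move cb_v_check backwards.
	 */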
	if (cb_v_check < op->cb_v_break)
		atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
	return 0;
}

/*
 * mark the data attached to an inode as obsolete due to a write on the server
 * - might also want to ditch all the outstanding writes and dirty pages
 */
static void afs_zap_data(struct afs_vnode *vnode)
{
	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);

	afs_invalidate_cache(vnode, 0);

	/* nuke all the non-dirty pages that aren't locked, mapped or being
	 * written back in a regular file and completely discard the pages in a
	 * directory or symlink */
	if (S_ISREG(vnode->netfs.inode.i_mode))
		invalidate_remote_inode(&vnode->netfs.inode);
	else
		invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
}

/*
 * validate a vnode/inode
 * - there are several things we need to check
 *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
 *     symlink)
 *   - parent dir metadata changed (security changes)
 *   - dentry data changed (write, truncate)
 *   - dentry metadata changed (security changes)
 */
int afs_validate(struct afs_vnode *vnode, struct key *key)
{
	struct afs_volume *volume = vnode->volume;
	unsigned int cb_ro_snapshot, cb_scrub;
	time64_t deadline = ktime_get_real_seconds() + 10;
	bool zap = false, locked_vol = false;
	int ret;

	_enter("{v={%llx:%llu} fl=%lx},%x",
	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
	       key_serial(key));

	if (afs_check_validity(vnode))
		return 0;

	ret = down_write_killable(&vnode->validate_lock);
	if (ret < 0)
		goto error;

	/* Validate a volume after the v_break has changed or the volume
	 * callback expired. We only want to do this once per volume per
	 * v_break change. The actual work will be done when parsing the
	 * status fetch reply.
	 */
	if (volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
		ret = mutex_lock_interruptible(&volume->cb_check_lock);
		if (ret < 0)
			goto error_unlock;
		locked_vol = true;
	}

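	/* If the volume has moved to a new RO snapshot or needs scrubbing,
	 * knock out all PTEs mapping the pagecache so that mmap faults have to
	 * re-enter the filesystem and revalidate.
	 */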
	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub != cb_scrub)
		unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);

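	/* Fetch the status if anything looks stale: the vnode's view of the
	 * volume, an unqueried volume break or either callback promise.
	 */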
	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub != cb_scrub ||
	    volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
	    atomic64_read(&vnode->cb_expires_at) <= deadline
	    ) {
		ret = afs_fetch_status(vnode, key, false, NULL);
		if (ret < 0) {
			if (ret == -ENOENT) {
				set_bit(AFS_VNODE_DELETED, &vnode->flags);
				ret = -ESTALE;
			}
			goto error_unlock;
		}

		_debug("new promise [fl=%lx]", vnode->flags);
	}

	/* We can drop the volume lock now as the volume-level work is done. */
	if (locked_vol) {
		mutex_unlock(&volume->cb_check_lock);
		locked_vol = false;
	}

	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	_debug("vnode inval %x==%x %x==%x",
	       vnode->cb_ro_snapshot, cb_ro_snapshot,
	       vnode->cb_scrub, cb_scrub);
	if (vnode->cb_scrub != cb_scrub)
		zap = true;
	vnode->cb_ro_snapshot = cb_ro_snapshot;
	vnode->cb_scrub = cb_scrub;

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
		_debug("file already deleted");
		ret = -ESTALE;
		goto error_unlock;
	}

	/* if the vnode's data version number changed then its contents are
	 * different */
	zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
	if (zap)
		afs_zap_data(vnode);
	up_write(&vnode->validate_lock);
	_leave(" = 0");
	return 0;

error_unlock:
	if (locked_vol)
		mutex_unlock(&volume->cb_check_lock);
	up_write(&vnode->validate_lock);
error:
	_leave(" = %d", ret);
	return ret;
}