fs/afs/validation.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /* vnode and volume validity verification.
   3  *
   4  * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
   5  * Written by David Howells (dhowells@redhat.com)
   6  */
   7
   8 #include <linux/kernel.h>
   9 #include <linux/module.h>
  10 #include <linux/sched.h>
  11 #include "internal.h"
  12
  13 /*
  14  * Data validation is managed through a number of mechanisms from the server:
  15  *
  16  *  (1) On first contact with a server (such as if it has just been rebooted),
  17  *      the server sends us a CB.InitCallBackState* request.
  18  *
  19  *  (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
  20  *      calls, the server maintains a time-limited per-vnode promise that it
  21  *      will send us a CB.CallBack request if a third party alters the vnodes
  22  *      accessed.
  23  *
   24  *      Note that vnode-level callbacks may also be sent for other reasons,
  25  *      such as filelock release.
  26  *
  27  *  (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
  28  *      calls, each server maintains a time-limited per-volume promise that it
  29  *      will send us a CB.CallBack request if the RO volume is updated to a
  30  *      snapshot of the RW volume ("vos release").  This is an atomic event
  31  *      that cuts over all instances of the RO volume across multiple servers
  32  *      simultaneously.
  33  *
   34  *      Note that volume-level callbacks may also be sent for other reasons,
  35  *      such as the volumeserver taking over control of the volume from the
  36  *      fileserver.
  37  *
  38  *      Note also that each server maintains an independent time limit on an
  39  *      independent callback.
  40  *
  41  *  (4) Certain RPC calls include a volume information record "VolSync" in
  42  *      their reply.  This contains a creation date for the volume that should
  43  *      remain unchanged for a RW volume (but will be changed if the volume is
  44  *      restored from backup) or will be bumped to the time of snapshotting
  45  *      when a RO volume is released.
  46  *
   47  * In order to track these events, the following are provided:
  48  *
  49  *      ->cb_v_break.  A counter of events that might mean that the contents of
  50  *      a volume have been altered since we last checked a vnode.
  51  *
  52  *      ->cb_v_check.  A counter of the number of events that we've sent a
  53  *      query to the server for.  Everything's up to date if this equals
  54  *      cb_v_break.
  55  *
  56  *      ->cb_scrub.  A counter of the number of regression events for which we
  57  *      have to completely wipe the cache.
  58  *
  59  *      ->cb_ro_snapshot.  A counter of the number of times that we've
  60  *      recognised that a RO volume has been updated.
  61  *
  62  *      ->cb_break.  A counter of events that might mean that the contents of a
  63  *      vnode have been altered.
  64  *
  65  *      ->cb_expires_at.  The time at which the callback promise expires or
  66  *      AFS_NO_CB_PROMISE if we have no promise.
  67  *
  68  * The way we manage things is:
  69  *
  70  *  (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
  71  *      the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
  72  *      volume and volume's server record.
  73  *
  74  *  (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
   75  *      callback break on all the volumes that have been using that server
  76  *      (ie. increment ->cb_v_break and reset ->cb_expires_at).
  77  *
  78  *  (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
  79  *      vnode and reset its ->cb_expires_at.  If the vnode is mmapped, we also
  80  *      dispatch a work item to unmap all PTEs to the vnode's pagecache to
  81  *      force reentry to the filesystem for revalidation.
  82  *
  83  *  (4) When entering the filesystem, we call afs_validate() to check the
  84  *      validity of a vnode.  This first checks to see if ->cb_v_check and
  85  *      ->cb_v_break match, and if they don't, we lock volume->cb_check_lock
  86  *      exclusively and perform an FS.FetchStatus on the vnode.
  87  *
  88  *      After checking the volume, we check the vnode.  If there's a mismatch
  89  *      between the volume counters and the vnode's mirrors of those counters,
  90  *      we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
  91  *
  92  *  (5) When the reply from FS.FetchStatus arrives, the VolSync record is
  93  *      parsed:
  94  *
  95  *      (A) If the Creation timestamp has changed on a RW volume or regressed
  96  *          on a RO volume, we try to increment ->cb_scrub; if it advances on a
  97  *          RO volume, we assume "vos release" happened and try to increment
  98  *          ->cb_ro_snapshot.
  99  *
 100  *      (B) If the Update timestamp has regressed, we try to increment
 101  *          ->cb_scrub.
 102  *
 103  *      Note that in both of these cases, we only do the increment if we can
 104  *      cmpxchg the value of the timestamp from the value we noted before the
 105  *      op.  This tries to prevent parallel ops from fighting one another.
 106  *
 107  *      volume->cb_v_check is then set to ->cb_v_break.
 108  *
 109  *  (6) The AFSCallBack record included in the FS.FetchStatus reply is also
 110  *      parsed and used to set the promise in ->cb_expires_at for the vnode,
 111  *      the volume and the volume's server record.
 112  *
 113  *  (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
 114  *      the vnode.
 115  */
 116
 117 /*
 118  * Check the validity of a vnode/inode and its parent volume.
 119  */
 120 bool afs_check_validity(const struct afs_vnode *vnode)
 121 {
 122         const struct afs_volume *volume = vnode->volume;
 123         time64_t deadline = ktime_get_real_seconds() + 10;
 124
 125         if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
 126             atomic64_read(&vnode->cb_expires_at)  <= deadline ||
 127             volume->cb_expires_at <= deadline ||
 128             vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) ||
 129             vnode->cb_scrub       != atomic_read(&volume->cb_scrub) ||
 130             test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
 131                 _debug("inval");
 132                 return false;
 133         }
 134
 135         return true;
 136 }
 137
 138 /*
 139  * See if the server we've just talked to is currently excluded.
 140  */
 141 static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
 142 {
 143         const struct afs_server_entry *se;
 144         const struct afs_server_list *slist;
 145         bool is_excluded = true;
 146         int i;
 147
 148         rcu_read_lock();
 149
 150         slist = rcu_dereference(volume->servers);
 151         for (i = 0; i < slist->nr_servers; i++) {
 152                 se = &slist->servers[i];
 153                 if (op->server == se->server) {
 154                         is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
 155                         break;
 156                 }
 157         }
 158
 159         rcu_read_unlock();
 160         return is_excluded;
 161 }
 162
 163 /*
 164  * Update the volume's server list when the creation time changes and see if
 165  * the server we've just talked to is currently excluded.
 166  */
 167 static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
 168 {
 169         int ret;
 170
 171         if (__afs_is_server_excluded(op, volume))
 172                 return 1;
 173
 174         set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
 175         ret = afs_check_volume_status(op->volume, op);
 176         if (ret < 0)
 177                 return ret;
 178
 179         return __afs_is_server_excluded(op, volume);
 180 }
 181
 182 /*
 183  * Handle a change to the volume creation time in the VolSync record.
 184  */
 185 static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
 186 {
 187         unsigned int snap;
 188         time64_t cur = volume->creation_time;
 189         time64_t old = op->pre_volsync.creation;
 190         time64_t new = op->volsync.creation;
 191         int ret;
 192
 193         _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
 194
 195         if (cur == TIME64_MIN) {
 196                 volume->creation_time = new;
 197                 return 0;
 198         }
 199
 200         if (new == cur)
 201                 return 0;
 202
 203         /* Try to advance the creation timestamp from what we had before the
 204          * operation to what we got back from the server.  This should
 205          * hopefully ensure that in a race between multiple operations only one
 206          * of them will do this.
 207          */
 208         if (cur != old)
 209                 return 0;
 210
 211         /* If the creation time changes in an unexpected way, we need to scrub
 212          * our caches.  For a RW vol, this will only change if the volume is
 213          * restored from a backup; for a RO/Backup vol, this will advance when
 214          * the volume is updated to a new snapshot (eg. "vos release").
 215          */
 216         if (volume->type == AFSVL_RWVOL)
 217                 goto regressed;
 218         if (volume->type == AFSVL_BACKVOL) {
 219                 if (new < old)
 220                         goto regressed;
 221                 goto advance;
 222         }
 223
 224         /* We have an RO volume, we need to query the VL server and look at the
 225          * server flags to see if RW->RO replication is in progress.
 226          */
 227         ret = afs_is_server_excluded(op, volume);
 228         if (ret < 0)
 229                 return ret;
 230         if (ret > 0) {
 231                 snap = atomic_read(&volume->cb_ro_snapshot);
 232                 trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
 233                 return ret;
 234         }
 235
 236 advance:
 237         snap = atomic_inc_return(&volume->cb_ro_snapshot);
 238         trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
 239         volume->creation_time = new;
 240         return 0;
 241
 242 regressed:
 243         atomic_inc(&volume->cb_scrub);
 244         trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
 245         volume->creation_time = new;
 246         return 0;
 247 }
 248
 249 /*
 250  * Handle a change to the volume update time in the VolSync record.
 251  */
 252 static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
 253 {
 254         enum afs_cb_break_reason reason = afs_cb_break_no_break;
 255         time64_t cur = volume->update_time;
 256         time64_t old = op->pre_volsync.update;
 257         time64_t new = op->volsync.update;
 258
 259         _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
 260
 261         if (cur == TIME64_MIN) {
 262                 volume->update_time = new;
 263                 return;
 264         }
 265
 266         if (new == cur)
 267                 return;
 268
 269         /* If the volume update time changes in an unexpected way, we need to
 270          * scrub our caches.  For a RW vol, this will advance on every
 271          * modification op; for a RO/Backup vol, this will advance when the
 272          * volume is updated to a new snapshot (eg. "vos release").
 273          */
 274         if (new < old)
 275                 reason = afs_cb_break_for_update_regress;
 276
 277         /* Try to advance the update timestamp from what we had before the
 278          * operation to what we got back from the server.  This should
 279          * hopefully ensure that in a race between multiple operations only one
 280          * of them will do this.
 281          */
 282         if (cur == old) {
 283                 if (reason == afs_cb_break_for_update_regress) {
 284                         atomic_inc(&volume->cb_scrub);
 285                         trace_afs_cb_v_break(volume->vid, 0, reason);
 286                 }
 287                 volume->update_time = new;
 288         }
 289 }
 290
 291 static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
 292 {
 293         int ret = 0;
 294
 295         if (likely(op->volsync.creation == volume->creation_time &&
 296                    op->volsync.update == volume->update_time))
 297                 return 0;
 298
 299         mutex_lock(&volume->volsync_lock);
 300         if (op->volsync.creation != volume->creation_time) {
 301                 ret = afs_update_volume_creation_time(op, volume);
 302                 if (ret < 0)
 303                         goto out;
 304         }
 305         if (op->volsync.update != volume->update_time)
 306                 afs_update_volume_update_time(op, volume);
 307 out:
 308         mutex_unlock(&volume->volsync_lock);
 309         return ret;
 310 }
 311
 312 /*
 313  * Update the state of a volume, including recording the expiration time of the
 314  * callback promise.  Returns 1 to redo the operation from the start.
 315  */
 316 int afs_update_volume_state(struct afs_operation *op)
 317 {
 318         struct afs_server_list *slist = op->server_list;
 319         struct afs_server_entry *se = &slist->servers[op->server_index];
 320         struct afs_callback *cb = &op->file[0].scb.callback;
 321         struct afs_volume *volume = op->volume;
 322         unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
 323         unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
 324         int ret;
 325
 326         _enter("%llx", op->volume->vid);
 327
 328         if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
 329                 ret = afs_update_volume_times(op, volume);
 330                 if (ret != 0) {
 331                         _leave(" = %d", ret);
 332                         return ret;
 333                 }
 334         }
 335
 336         if (op->cb_v_break == cb_v_break &&
 337             (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
 338                 time64_t expires_at = cb->expires_at;
 339
 340                 if (!op->file[0].scb.have_cb)
 341                         expires_at = op->file[1].scb.callback.expires_at;
 342
 343                 se->cb_expires_at = expires_at;
 344                 volume->cb_expires_at = expires_at;
 345         }
 346         if (cb_v_check < op->cb_v_break)
 347                 atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
 348         return 0;
 349 }
 350
 351 /*
 352  * mark the data attached to an inode as obsolete due to a write on the server
 353  * - might also want to ditch all the outstanding writes and dirty pages
 354  */
 355 static void afs_zap_data(struct afs_vnode *vnode)
 356 {
 357         _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
 358
 359         afs_invalidate_cache(vnode, 0);
 360
 361         /* nuke all the non-dirty pages that aren't locked, mapped or being
 362          * written back in a regular file and completely discard the pages in a
 363          * directory or symlink */
 364         if (S_ISREG(vnode->netfs.inode.i_mode))
 365                 invalidate_remote_inode(&vnode->netfs.inode);
 366         else
 367                 invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
 368 }
 369
 370 /*
 371  * validate a vnode/inode
 372  * - there are several things we need to check
 373  *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
 374  *     symlink)
 375  *   - parent dir metadata changed (security changes)
 376  *   - dentry data changed (write, truncate)
 377  *   - dentry metadata changed (security changes)
 378  */
 379 int afs_validate(struct afs_vnode *vnode, struct key *key)
 380 {
 381         struct afs_volume *volume = vnode->volume;
 382         unsigned int cb_ro_snapshot, cb_scrub;
 383         time64_t deadline = ktime_get_real_seconds() + 10;
 384         bool zap = false, locked_vol = false;
 385         int ret;
 386
 387         _enter("{v={%llx:%llu} fl=%lx},%x",
 388                vnode->fid.vid, vnode->fid.vnode, vnode->flags,
 389                key_serial(key));
 390
 391         if (afs_check_validity(vnode))
 392                 return 0;
 393
 394         ret = down_write_killable(&vnode->validate_lock);
 395         if (ret < 0)
 396                 goto error;
 397
 398         /* Validate a volume after the v_break has changed or the volume
 399          * callback expired.  We only want to do this once per volume per
 400          * v_break change.  The actual work will be done when parsing the
 401          * status fetch reply.
 402          */
 403         if (volume->cb_expires_at <= deadline ||
 404             atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
 405                 ret = mutex_lock_interruptible(&volume->cb_check_lock);
 406                 if (ret < 0)
 407                         goto error_unlock;
 408                 locked_vol = true;
 409         }
 410
 411         cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
 412         cb_scrub = atomic_read(&volume->cb_scrub);
 413         if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
 414             vnode->cb_scrub       != cb_scrub)
 415                 unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);
 416
 417         if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
 418             vnode->cb_scrub       != cb_scrub ||
 419             volume->cb_expires_at <= deadline ||
 420             atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
 421             atomic64_read(&vnode->cb_expires_at) <= deadline
 422             ) {
 423                 ret = afs_fetch_status(vnode, key, false, NULL);
 424                 if (ret < 0) {
 425                         if (ret == -ENOENT) {
 426                                 set_bit(AFS_VNODE_DELETED, &vnode->flags);
 427                                 ret = -ESTALE;
 428                         }
 429                         goto error_unlock;
 430                 }
 431
 432                 _debug("new promise [fl=%lx]", vnode->flags);
 433         }
 434
 435         /* We can drop the volume lock now as. */
 436         if (locked_vol) {
 437                 mutex_unlock(&volume->cb_check_lock);
 438                 locked_vol = false;
 439         }
 440
 441         cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
 442         cb_scrub = atomic_read(&volume->cb_scrub);
 443         _debug("vnode inval %x==%x %x==%x",
 444                vnode->cb_ro_snapshot, cb_ro_snapshot,
 445                vnode->cb_scrub, cb_scrub);
 446         if (vnode->cb_scrub != cb_scrub)
 447                 zap = true;
 448         vnode->cb_ro_snapshot = cb_ro_snapshot;
 449         vnode->cb_scrub = cb_scrub;
 450
 451         if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
 452                 _debug("file already deleted");
 453                 ret = -ESTALE;
 454                 goto error_unlock;
 455         }
 456
 457         /* if the vnode's data version number changed then its contents are
 458          * different */
 459         zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
 460         if (zap)
 461                 afs_zap_data(vnode);
 462         up_write(&vnode->validate_lock);
 463         _leave(" = 0");
 464         return 0;
 465
 466 error_unlock:
 467         if (locked_vol)
 468                 mutex_unlock(&volume->cb_check_lock);
 469         up_write(&vnode->validate_lock);
 470 error:
 471         _leave(" = %d", ret);
 472         return ret;
 473 }