fs/bcachefs/snapshot.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_key_cache.h"
6 #include "btree_update.h"
7 #include "buckets.h"
8 #include "errcode.h"
9 #include "error.h"
10 #include "fs.h"
11 #include "recovery_passes.h"
12 #include "snapshot.h"
13
14 #include <linux/random.h>
15
16 /*
17  * Snapshot trees:
18  *
19  * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
20  * exist to provide a stable identifier for the whole lifetime of a snapshot
21  * tree.
22  */
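/*
 * A note on the fields (a summary of what the code below does, not authoritative
 * documentation): each snapshot_tree key stores the tree's root snapshot ID and
 * its master subvolume, both printed by bch2_snapshot_tree_to_text() below; fsck
 * repairs the master_subvol field to point at a non-snapshot subvolume within
 * the tree.
 */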
23
24 void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
25                                 struct bkey_s_c k)
26 {
27         struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
28
29         prt_printf(out, "subvol %u root snapshot %u",
30                    le32_to_cpu(t.v->master_subvol),
31                    le32_to_cpu(t.v->root_snapshot));
32 }
33
34 int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k,
35                                enum bkey_invalid_flags flags,
36                                struct printbuf *err)
37 {
38         int ret = 0;
39
40         bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
41                          bkey_lt(k.k->p, POS(0, 1)), c, err,
42                          snapshot_tree_pos_bad,
43                          "bad pos");
44 fsck_err:
45         return ret;
46 }
47
48 int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
49                               struct bch_snapshot_tree *s)
50 {
51         int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
52                                           BTREE_ITER_WITH_UPDATES, snapshot_tree, s);
53
54         if (bch2_err_matches(ret, ENOENT))
55                 ret = -BCH_ERR_ENOENT_snapshot_tree;
56         return ret;
57 }
58
59 struct bkey_i_snapshot_tree *
60 __bch2_snapshot_tree_create(struct btree_trans *trans)
61 {
62         struct btree_iter iter;
63         int ret = bch2_bkey_get_empty_slot(trans, &iter,
64                         BTREE_ID_snapshot_trees, POS(0, U32_MAX));
65         struct bkey_i_snapshot_tree *s_t;
66
67         if (ret == -BCH_ERR_ENOSPC_btree_slot)
68                 ret = -BCH_ERR_ENOSPC_snapshot_tree;
69         if (ret)
70                 return ERR_PTR(ret);
71
72         s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
73         ret = PTR_ERR_OR_ZERO(s_t);
74         bch2_trans_iter_exit(trans, &iter);
75         return ret ? ERR_PTR(ret) : s_t;
76 }
77
78 static int bch2_snapshot_tree_create(struct btree_trans *trans,
79                                 u32 root_id, u32 subvol_id, u32 *tree_id)
80 {
81         struct bkey_i_snapshot_tree *n_tree =
82                 __bch2_snapshot_tree_create(trans);
83
84         if (IS_ERR(n_tree))
85                 return PTR_ERR(n_tree);
86
87         n_tree->v.master_subvol = cpu_to_le32(subvol_id);
88         n_tree->v.root_snapshot = cpu_to_le32(root_id);
89         *tree_id = n_tree->k.p.offset;
90         return 0;
91 }
92
93 /* Snapshot nodes: */
94
95 static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
96 {
97         while (id && id < ancestor) {
98                 const struct snapshot_t *s = __snapshot_t(t, id);
99                 id = s ? s->parent : 0;
100         }
101         return id == ancestor;
102 }
103
104 static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
105 {
106         rcu_read_lock();
107         bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
108         rcu_read_unlock();
109
110         return ret;
111 }
112
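/*
 * Each snapshot node caches up to three ancestor IDs in skip[], kept in
 * increasing order (enforced by bch2_snapshot_invalid()).  get_ancestor_below()
 * returns the furthest cached ancestor that doesn't overshoot @ancestor,
 * falling back to the immediate parent, so the ancestor walk in
 * __bch2_snapshot_is_ancestor() can take large steps instead of moving one
 * parent at a time.
 */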
113 static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
114 {
115         const struct snapshot_t *s = __snapshot_t(t, id);
116         if (!s)
117                 return 0;
118
119         if (s->skip[2] <= ancestor)
120                 return s->skip[2];
121         if (s->skip[1] <= ancestor)
122                 return s->skip[1];
123         if (s->skip[0] <= ancestor)
124                 return s->skip[0];
125         return s->parent;
126 }
127
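/*
 * Two-phase ancestry check: walk upwards via the skip pointers until @id is
 * within IS_ANCESTOR_BITMAP of @ancestor, then answer with a single test_bit()
 * in that node's is_ancestor bitmap (bit n set means snapshot ID id + n + 1 is
 * an ancestor of this node).  Before check_snapshots has run, the skip/bitmap
 * information hasn't been validated yet, so fall back to the plain parent walk.
 */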
128 bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
129 {
130         bool ret;
131
132         rcu_read_lock();
133         struct snapshot_table *t = rcu_dereference(c->snapshots);
134
135         if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
136                 ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
137                 goto out;
138         }
139
140         while (id && id < ancestor - IS_ANCESTOR_BITMAP)
141                 id = get_ancestor_below(t, id, ancestor);
142
143         if (id && id < ancestor) {
144                 ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor);
145
146                 EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
147         } else {
148                 ret = id == ancestor;
149         }
150 out:
151         rcu_read_unlock();
152
153         return ret;
154 }
155
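/*
 * The in-memory snapshot table is indexed by (U32_MAX - id): a node's parent
 * always has a higher ID (see bch2_snapshot_invalid()), so indexing from the
 * top of the ID space keeps the table dense starting at slot 0.  Growing the
 * table is done RCU-style: allocate a larger array, copy the old entries,
 * publish it with rcu_assign_pointer() and free the old table after a grace
 * period.  Callers must hold snapshot_table_lock.
 */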
156 static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
157 {
158         size_t idx = U32_MAX - id;
159         struct snapshot_table *new, *old;
160
161         size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
162         size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
163
164         new = kvzalloc(new_bytes, GFP_KERNEL);
165         if (!new)
166                 return NULL;
167
168         new->nr = new_size;
169
170         old = rcu_dereference_protected(c->snapshots, true);
171         if (old)
172                 memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);
173
174         rcu_assign_pointer(c->snapshots, new);
175         kvfree_rcu(old, rcu);
176
177         return &rcu_dereference_protected(c->snapshots,
178                                 lockdep_is_held(&c->snapshot_table_lock))->s[idx];
179 }
180
181 static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
182 {
183         size_t idx = U32_MAX - id;
184         struct snapshot_table *table =
185                 rcu_dereference_protected(c->snapshots,
186                                 lockdep_is_held(&c->snapshot_table_lock));
187
188         lockdep_assert_held(&c->snapshot_table_lock);
189
190         if (likely(table && idx < table->nr))
191                 return &table->s[idx];
192
193         return __snapshot_t_mut(c, id);
194 }
195
196 void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
197                            struct bkey_s_c k)
198 {
199         struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
200
201         prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
202                BCH_SNAPSHOT_SUBVOL(s.v),
203                BCH_SNAPSHOT_DELETED(s.v),
204                le32_to_cpu(s.v->parent),
205                le32_to_cpu(s.v->children[0]),
206                le32_to_cpu(s.v->children[1]),
207                le32_to_cpu(s.v->subvol),
208                le32_to_cpu(s.v->tree));
209
210         if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
211                 prt_printf(out, " depth %u skiplist %u %u %u",
212                            le32_to_cpu(s.v->depth),
213                            le32_to_cpu(s.v->skip[0]),
214                            le32_to_cpu(s.v->skip[1]),
215                            le32_to_cpu(s.v->skip[2]));
216 }
217
218 int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k,
219                           enum bkey_invalid_flags flags,
220                           struct printbuf *err)
221 {
222         struct bkey_s_c_snapshot s;
223         u32 i, id;
224         int ret = 0;
225
226         bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
227                          bkey_lt(k.k->p, POS(0, 1)), c, err,
228                          snapshot_pos_bad,
229                          "bad pos");
230
231         s = bkey_s_c_to_snapshot(k);
232
233         id = le32_to_cpu(s.v->parent);
234         bkey_fsck_err_on(id && id <= k.k->p.offset, c, err,
235                          snapshot_parent_bad,
236                          "bad parent node (%u <= %llu)",
237                          id, k.k->p.offset);
238
239         bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err,
240                          snapshot_children_not_normalized,
241                          "children not normalized");
242
243         bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err,
244                          snapshot_child_duplicate,
245                          "duplicate child nodes");
246
247         for (i = 0; i < 2; i++) {
248                 id = le32_to_cpu(s.v->children[i]);
249
250                 bkey_fsck_err_on(id >= k.k->p.offset, c, err,
251                                  snapshot_child_bad,
252                                  "bad child node (%u >= %llu)",
253                                  id, k.k->p.offset);
254         }
255
256         if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
257                 bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
258                                  le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err,
259                                  snapshot_skiplist_not_normalized,
260                                  "skiplist not normalized");
261
262                 for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
263                         id = le32_to_cpu(s.v->skip[i]);
264
265                         bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err,
266                                          snapshot_skiplist_bad,
267                                          "bad skiplist node %u", id);
268                 }
269         }
270 fsck_err:
271         return ret;
272 }
273
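/*
 * For every ancestor whose ID is within IS_ANCESTOR_BITMAP of @id, set bit
 * (ancestor - id - 1) in this node's is_ancestor bitmap; this is what makes
 * the fast path in __bch2_snapshot_is_ancestor() a single bit test.
 */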
274 static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
275 {
276         struct snapshot_t *t = snapshot_t_mut(c, id);
277         u32 parent = id;
278
279         while ((parent = bch2_snapshot_parent_early(c, parent)) &&
280                parent - id - 1 < IS_ANCESTOR_BITMAP)
281                 __set_bit(parent - id - 1, t->is_ancestor);
282 }
283
284 static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
285 {
286         mutex_lock(&c->snapshot_table_lock);
287         __set_is_ancestor_bitmap(c, id);
288         mutex_unlock(&c->snapshot_table_lock);
289 }
290
291 static int __bch2_mark_snapshot(struct btree_trans *trans,
292                        enum btree_id btree, unsigned level,
293                        struct bkey_s_c old, struct bkey_s_c new,
294                        unsigned flags)
295 {
296         struct bch_fs *c = trans->c;
297         struct snapshot_t *t;
298         u32 id = new.k->p.offset;
299         int ret = 0;
300
301         mutex_lock(&c->snapshot_table_lock);
302
303         t = snapshot_t_mut(c, id);
304         if (!t) {
305                 ret = -BCH_ERR_ENOMEM_mark_snapshot;
306                 goto err;
307         }
308
309         if (new.k->type == KEY_TYPE_snapshot) {
310                 struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
311
312                 t->parent       = le32_to_cpu(s.v->parent);
313                 t->children[0]  = le32_to_cpu(s.v->children[0]);
314                 t->children[1]  = le32_to_cpu(s.v->children[1]);
315                 t->subvol       = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
316                 t->tree         = le32_to_cpu(s.v->tree);
317
318                 if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
319                         t->depth        = le32_to_cpu(s.v->depth);
320                         t->skip[0]      = le32_to_cpu(s.v->skip[0]);
321                         t->skip[1]      = le32_to_cpu(s.v->skip[1]);
322                         t->skip[2]      = le32_to_cpu(s.v->skip[2]);
323                 } else {
324                         t->depth        = 0;
325                         t->skip[0]      = 0;
326                         t->skip[1]      = 0;
327                         t->skip[2]      = 0;
328                 }
329
330                 __set_is_ancestor_bitmap(c, id);
331
332                 if (BCH_SNAPSHOT_DELETED(s.v)) {
333                         set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
334                         if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
335                                 bch2_delete_dead_snapshots_async(c);
336                 }
337         } else {
338                 memset(t, 0, sizeof(*t));
339         }
340 err:
341         mutex_unlock(&c->snapshot_table_lock);
342         return ret;
343 }
344
345 int bch2_mark_snapshot(struct btree_trans *trans,
346                        enum btree_id btree, unsigned level,
347                        struct bkey_s_c old, struct bkey_s new,
348                        unsigned flags)
349 {
350         return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
351 }
352
353 int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
354                          struct bch_snapshot *s)
355 {
356         return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
357                                        BTREE_ITER_WITH_UPDATES, snapshot, s);
358 }
359
360 static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
361 {
362         struct bch_snapshot v;
363         int ret;
364
365         if (!id)
366                 return 0;
367
368         ret = bch2_snapshot_lookup(trans, id, &v);
369         if (bch2_err_matches(ret, ENOENT))
370                 bch_err(trans->c, "snapshot node %u not found", id);
371         if (ret)
372                 return ret;
373
374         return !BCH_SNAPSHOT_DELETED(&v);
375 }
376
377 /*
378  * If @k is a snapshot with just one live child, it's part of a linear chain,
379  * which we consider to be an equivalence class: and then after snapshot
380  * deletion cleanup, there should only be a single key at a given position in
381  * this equivalence class.
382  *
383  * This sets the equivalence class of @k to be the child's equivalence class, if
384  * it's part of such a linear chain: this correctly sets equivalence classes on
385  * startup if we run leaf to root (i.e. in natural key order).
386  */
387 static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
388 {
389         struct bch_fs *c = trans->c;
390         unsigned i, nr_live = 0, live_idx = 0;
391         struct bkey_s_c_snapshot snap;
392         u32 id = k.k->p.offset, child[2];
393
394         if (k.k->type != KEY_TYPE_snapshot)
395                 return 0;
396
397         snap = bkey_s_c_to_snapshot(k);
398
399         child[0] = le32_to_cpu(snap.v->children[0]);
400         child[1] = le32_to_cpu(snap.v->children[1]);
401
402         for (i = 0; i < 2; i++) {
403                 int ret = bch2_snapshot_live(trans, child[i]);
404
405                 if (ret < 0)
406                         return ret;
407
408                 if (ret)
409                         live_idx = i;
410                 nr_live += ret;
411         }
412
413         mutex_lock(&c->snapshot_table_lock);
414
415         snapshot_t_mut(c, id)->equiv = nr_live == 1
416                 ? snapshot_t_mut(c, child[live_idx])->equiv
417                 : id;
418
419         mutex_unlock(&c->snapshot_table_lock);
420
421         return 0;
422 }
423
424 /* fsck: */
425
426 static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
427 {
428         return snapshot_t(c, id)->children[child];
429 }
430
431 static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
432 {
433         return bch2_snapshot_child(c, id, 0);
434 }
435
436 static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
437 {
438         return bch2_snapshot_child(c, id, 1);
439 }
440
441 static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
442 {
443         u32 n, parent;
444
445         n = bch2_snapshot_left_child(c, id);
446         if (n)
447                 return n;
448
449         while ((parent = bch2_snapshot_parent(c, id))) {
450                 n = bch2_snapshot_right_child(c, parent);
451                 if (n && n != id)
452                         return n;
453                 id = parent;
454         }
455
456         return 0;
457 }
458
459 static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
460 {
461         u32 id = snapshot_root;
462         u32 subvol = 0, s;
463
464         while (id) {
465                 s = snapshot_t(c, id)->subvol;
466
467                 if (s && (!subvol || s < subvol))
468                         subvol = s;
469
470                 id = bch2_snapshot_tree_next(c, id);
471         }
472
473         return subvol;
474 }
475
476 static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
477                                             u32 snapshot_root, u32 *subvol_id)
478 {
479         struct bch_fs *c = trans->c;
480         struct btree_iter iter;
481         struct bkey_s_c k;
482         bool found = false;
483         int ret;
484
485         for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
486                                      0, k, ret) {
487                 if (k.k->type != KEY_TYPE_subvolume)
488                         continue;
489
490                 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
491                 if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
492                         continue;
493                 if (!BCH_SUBVOLUME_SNAP(s.v)) {
494                         *subvol_id = s.k->p.offset;
495                         found = true;
496                         break;
497                 }
498         }
499
500         bch2_trans_iter_exit(trans, &iter);
501
502         if (!ret && !found) {
503                 struct bkey_i_subvolume *u;
504
505                 *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
506
507                 u = bch2_bkey_get_mut_typed(trans, &iter,
508                                             BTREE_ID_subvolumes, POS(0, *subvol_id),
509                                             0, subvolume);
510                 ret = PTR_ERR_OR_ZERO(u);
511                 if (ret)
512                         return ret;
513
514                 SET_BCH_SUBVOLUME_SNAP(&u->v, false);
515         }
516
517         return ret;
518 }
519
520 static int check_snapshot_tree(struct btree_trans *trans,
521                                struct btree_iter *iter,
522                                struct bkey_s_c k)
523 {
524         struct bch_fs *c = trans->c;
525         struct bkey_s_c_snapshot_tree st;
526         struct bch_snapshot s;
527         struct bch_subvolume subvol;
528         struct printbuf buf = PRINTBUF;
529         u32 root_id;
530         int ret;
531
532         if (k.k->type != KEY_TYPE_snapshot_tree)
533                 return 0;
534
535         st = bkey_s_c_to_snapshot_tree(k);
536         root_id = le32_to_cpu(st.v->root_snapshot);
537
538         ret = bch2_snapshot_lookup(trans, root_id, &s);
539         if (ret && !bch2_err_matches(ret, ENOENT))
540                 goto err;
541
542         if (fsck_err_on(ret ||
543                         root_id != bch2_snapshot_root(c, root_id) ||
544                         st.k->p.offset != le32_to_cpu(s.tree),
545                         c, snapshot_tree_to_missing_snapshot,
546                         "snapshot tree points to missing/incorrect snapshot:\n  %s",
547                         (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
548                 ret = bch2_btree_delete_at(trans, iter, 0);
549                 goto err;
550         }
551
552         ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
553                                  false, 0, &subvol);
554         if (ret && !bch2_err_matches(ret, ENOENT))
555                 goto err;
556
557         if (fsck_err_on(ret,
558                         c, snapshot_tree_to_missing_subvol,
559                         "snapshot tree points to missing subvolume:\n  %s",
560                         (printbuf_reset(&buf),
561                          bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
562             fsck_err_on(!bch2_snapshot_is_ancestor(c,
563                                                 le32_to_cpu(subvol.snapshot),
564                                                 root_id),
565                         c, snapshot_tree_to_wrong_subvol,
566                         "snapshot tree points to subvolume that does not point to snapshot in this tree:\n  %s",
567                         (printbuf_reset(&buf),
568                          bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
569             fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
570                         c, snapshot_tree_to_snapshot_subvol,
571                         "snapshot tree points to snapshot subvolume:\n  %s",
572                         (printbuf_reset(&buf),
573                          bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
574                 struct bkey_i_snapshot_tree *u;
575                 u32 subvol_id;
576
577                 ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
578                 bch_err_fn(c, ret);
579
580                 if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
581                         ret = 0;
582                         goto err;
583                 }
584
585                 if (ret)
586                         goto err;
587
588                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
589                 ret = PTR_ERR_OR_ZERO(u);
590                 if (ret)
591                         goto err;
592
593                 u->v.master_subvol = cpu_to_le32(subvol_id);
594                 st = snapshot_tree_i_to_s_c(u);
595         }
596 err:
597 fsck_err:
598         printbuf_exit(&buf);
599         return ret;
600 }
601
602 /*
603  * For each snapshot_tree, make sure it points to the root of a snapshot tree
604  * and that snapshot entry points back to it, or delete it.
605  *
606  * And, make sure it points to a subvolume within that snapshot tree, or correct
607  * it to point to the oldest subvolume within that snapshot tree.
608  */
609 int bch2_check_snapshot_trees(struct bch_fs *c)
610 {
611         int ret = bch2_trans_run(c,
612                 for_each_btree_key_commit(trans, iter,
613                         BTREE_ID_snapshot_trees, POS_MIN,
614                         BTREE_ITER_PREFETCH, k,
615                         NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
616                 check_snapshot_tree(trans, &iter, k)));
617         bch_err_fn(c, ret);
618         return ret;
619 }
620
621 /*
622  * Look up snapshot tree for @tree_id and find root,
623  * make sure @snap_id is a descendant:
624  */
625 static int snapshot_tree_ptr_good(struct btree_trans *trans,
626                                   u32 snap_id, u32 tree_id)
627 {
628         struct bch_snapshot_tree s_t;
629         int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
630
631         if (bch2_err_matches(ret, ENOENT))
632                 return 0;
633         if (ret)
634                 return ret;
635
636         return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
637 }
638
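/*
 * Pick a skiplist entry for a new child of @id: the nth parent of @id for a
 * uniformly random n in [0, depth), i.e. a random node on the path from @id
 * towards the root (possibly @id itself).  The randomization appears to be
 * what gives the skip[] pointers their skip-list-like behaviour in
 * get_ancestor_below().
 */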
639 u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
640 {
641         const struct snapshot_t *s;
642
643         if (!id)
644                 return 0;
645
646         rcu_read_lock();
647         s = snapshot_t(c, id);
648         if (s->parent)
649                 id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
650         rcu_read_unlock();
651
652         return id;
653 }
654
655 static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
656 {
657         unsigned i;
658
659         for (i = 0; i < 3; i++)
660                 if (!s.parent) {
661                         if (s.skip[i])
662                                 return false;
663                 } else {
664                         if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
665                                 return false;
666                 }
667
668         return true;
669 }
670
671 /*
672  * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
673  * its snapshot_tree pointer is correct (allocate new one if necessary), then
674  * update this node's pointer to root node's pointer:
675  */
676 static int snapshot_tree_ptr_repair(struct btree_trans *trans,
677                                     struct btree_iter *iter,
678                                     struct bkey_s_c k,
679                                     struct bch_snapshot *s)
680 {
681         struct bch_fs *c = trans->c;
682         struct btree_iter root_iter;
683         struct bch_snapshot_tree s_t;
684         struct bkey_s_c_snapshot root;
685         struct bkey_i_snapshot *u;
686         u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
687         int ret;
688
689         root = bch2_bkey_get_iter_typed(trans, &root_iter,
690                                BTREE_ID_snapshots, POS(0, root_id),
691                                BTREE_ITER_WITH_UPDATES, snapshot);
692         ret = bkey_err(root);
693         if (ret)
694                 goto err;
695
696         tree_id = le32_to_cpu(root.v->tree);
697
698         ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
699         if (ret && !bch2_err_matches(ret, ENOENT))
700                 return ret;
701
702         if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
703                 u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
704                 ret =   PTR_ERR_OR_ZERO(u) ?:
705                         bch2_snapshot_tree_create(trans, root_id,
706                                 bch2_snapshot_tree_oldest_subvol(c, root_id),
707                                 &tree_id);
708                 if (ret)
709                         goto err;
710
711                 u->v.tree = cpu_to_le32(tree_id);
712                 if (k.k->p.offset == root_id)
713                         *s = u->v;
714         }
715
716         if (k.k->p.offset != root_id) {
717                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
718                 ret = PTR_ERR_OR_ZERO(u);
719                 if (ret)
720                         goto err;
721
722                 u->v.tree = cpu_to_le32(tree_id);
723                 *s = u->v;
724         }
725 err:
726         bch2_trans_iter_exit(trans, &root_iter);
727         return ret;
728 }
729
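/*
 * Check a single snapshot node: parent and child pointers must agree with the
 * nodes they reference, the subvol pointer must be consistent with
 * BCH_SNAPSHOT_SUBVOL, and the tree, depth and skip[] fields are repaired in
 * place when they're wrong.
 */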
730 static int check_snapshot(struct btree_trans *trans,
731                           struct btree_iter *iter,
732                           struct bkey_s_c k)
733 {
734         struct bch_fs *c = trans->c;
735         struct bch_snapshot s;
736         struct bch_subvolume subvol;
737         struct bch_snapshot v;
738         struct bkey_i_snapshot *u;
739         u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
740         u32 real_depth;
741         struct printbuf buf = PRINTBUF;
742         u32 i, id;
743         int ret = 0;
744
745         if (k.k->type != KEY_TYPE_snapshot)
746                 return 0;
747
748         memset(&s, 0, sizeof(s));
749         memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));
750
751         id = le32_to_cpu(s.parent);
752         if (id) {
753                 ret = bch2_snapshot_lookup(trans, id, &v);
754                 if (bch2_err_matches(ret, ENOENT))
755                         bch_err(c, "snapshot with nonexistent parent:\n  %s",
756                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
757                 if (ret)
758                         goto err;
759
760                 if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
761                     le32_to_cpu(v.children[1]) != k.k->p.offset) {
762                         bch_err(c, "snapshot parent %u missing pointer to child %llu",
763                                 id, k.k->p.offset);
764                         ret = -EINVAL;
765                         goto err;
766                 }
767         }
768
769         for (i = 0; i < 2 && s.children[i]; i++) {
770                 id = le32_to_cpu(s.children[i]);
771
772                 ret = bch2_snapshot_lookup(trans, id, &v);
773                 if (bch2_err_matches(ret, ENOENT))
774                         bch_err(c, "snapshot node %llu has nonexistent child %u",
775                                 k.k->p.offset, id);
776                 if (ret)
777                         goto err;
778
779                 if (le32_to_cpu(v.parent) != k.k->p.offset) {
780                         bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
781                                 id, le32_to_cpu(v.parent), k.k->p.offset);
782                         ret = -EINVAL;
783                         goto err;
784                 }
785         }
786
787         bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
788                 !BCH_SNAPSHOT_DELETED(&s);
789
790         if (should_have_subvol) {
791                 id = le32_to_cpu(s.subvol);
792                 ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
793                 if (bch2_err_matches(ret, ENOENT))
794                         bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
795                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
796                 if (ret)
797                         goto err;
798
799                 if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
800                         bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
801                                 k.k->p.offset);
802                         ret = -EINVAL;
803                         goto err;
804                 }
805         } else {
806                 if (fsck_err_on(s.subvol,
807                                 c, snapshot_should_not_have_subvol,
808                                 "snapshot should not point to subvol:\n  %s",
809                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
810                         u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
811                         ret = PTR_ERR_OR_ZERO(u);
812                         if (ret)
813                                 goto err;
814
815                         u->v.subvol = 0;
816                         s = u->v;
817                 }
818         }
819
820         ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
821         if (ret < 0)
822                 goto err;
823
824         if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree,
825                         "snapshot points to missing/incorrect tree:\n  %s",
826                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
827                 ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
828                 if (ret)
829                         goto err;
830         }
831         ret = 0;
832
833         real_depth = bch2_snapshot_depth(c, parent_id);
834
835         if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
836                         c, snapshot_bad_depth,
837                         "snapshot with incorrect depth field, should be %u:\n  %s",
838                         real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
839                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
840                 ret = PTR_ERR_OR_ZERO(u);
841                 if (ret)
842                         goto err;
843
844                 u->v.depth = cpu_to_le32(real_depth);
845                 s = u->v;
846         }
847
848         ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
849         if (ret < 0)
850                 goto err;
851
852         if (fsck_err_on(!ret, c, snapshot_bad_skiplist,
853                         "snapshot with bad skiplist field:\n  %s",
854                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
855                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
856                 ret = PTR_ERR_OR_ZERO(u);
857                 if (ret)
858                         goto err;
859
860                 for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
861                         u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));
862
863                 bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
864                 s = u->v;
865         }
866         ret = 0;
867 err:
868 fsck_err:
869         printbuf_exit(&buf);
870         return ret;
871 }
872
873 int bch2_check_snapshots(struct bch_fs *c)
874 {
875         /*
876          * We iterate backwards as checking/fixing the depth field requires that
877          * the parent's depth already be correct:
878          */
879         int ret = bch2_trans_run(c,
880                 for_each_btree_key_reverse_commit(trans, iter,
881                                 BTREE_ID_snapshots, POS_MAX,
882                                 BTREE_ITER_PREFETCH, k,
883                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
884                         check_snapshot(trans, &iter, k)));
885         bch_err_fn(c, ret);
886         return ret;
887 }
888
889 static int check_snapshot_exists(struct btree_trans *trans, u32 id)
890 {
891         struct bch_fs *c = trans->c;
892
893         if (bch2_snapshot_equiv(c, id))
894                 return 0;
895
896         u32 tree_id;
897         int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
898         if (ret)
899                 return ret;
900
901         struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
902         ret = PTR_ERR_OR_ZERO(snapshot);
903         if (ret)
904                 return ret;
905
906         bkey_snapshot_init(&snapshot->k_i);
907         snapshot->k.p           = POS(0, id);
908         snapshot->v.tree        = cpu_to_le32(tree_id);
909         snapshot->v.btime.lo    = cpu_to_le64(bch2_current_time(c));
910
911         return  bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
912                 bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
913                                    bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?:
914                 bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i));
915 }
916
917 /* Figure out which snapshot nodes belong in the same tree: */
918 struct snapshot_tree_reconstruct {
919         enum btree_id                   btree;
920         struct bpos                     cur_pos;
921         snapshot_id_list                cur_ids;
922         DARRAY(snapshot_id_list)        trees;
923 };
924
925 static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
926 {
927         darray_for_each(r->trees, i)
928                 darray_exit(i);
929         darray_exit(&r->trees);
930         darray_exit(&r->cur_ids);
931 }
932
933 static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
934 {
935         return r->btree == BTREE_ID_inodes
936                 ? r->cur_pos.offset == pos.offset
937                 : r->cur_pos.inode == pos.inode;
938 }
939
940 static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
941 {
942         darray_for_each(*l, i)
943                 if (snapshot_list_has_id(r, *i))
944                         return true;
945         return false;
946 }
947
948 static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
949 {
950         bool first = true;
951         darray_for_each(*s, i) {
952                 if (!first)
953                         prt_char(out, ' ');
954                 first = false;
955                 prt_printf(out, "%u", *i);
956         }
957 }
958
959 static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
960 {
961         if (r->cur_ids.nr) {
962                 darray_for_each(r->trees, i)
963                         if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
964                                 int ret = snapshot_list_merge(c, i, &r->cur_ids);
965                                 if (ret)
966                                         return ret;
967                                 goto out;
968                         }
969                 darray_push(&r->trees, r->cur_ids);
970                 darray_init(&r->cur_ids);
971         }
972 out:
973         r->cur_ids.nr = 0;
974         return 0;
975 }
976
977 static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
978 {
979         if (!same_snapshot(r, pos))
980                 snapshot_tree_reconstruct_next(c, r);
981         r->cur_pos = pos;
982         return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
983 }
984
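/*
 * Rebuild missing snapshot nodes from the keys that reference them: scan every
 * btree that uses snapshots, group the snapshot IDs seen at each key position
 * into candidate trees (merging lists that share an ID), then recreate any ID
 * that has no entry in the snapshots btree.  Only single-node trees can
 * currently be reconstructed.
 */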
985 int bch2_reconstruct_snapshots(struct bch_fs *c)
986 {
987         struct btree_trans *trans = bch2_trans_get(c);
988         struct printbuf buf = PRINTBUF;
989         struct snapshot_tree_reconstruct r = {};
990         int ret = 0;
991
992         for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
993                 if (btree_type_has_snapshots(btree)) {
994                         r.btree = btree;
995
996                         ret = for_each_btree_key(trans, iter, btree, POS_MIN,
997                                         BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_PREFETCH, k, ({
998                                 get_snapshot_trees(c, &r, k.k->p);
999                         }));
1000                         if (ret)
1001                                 goto err;
1002
1003                         snapshot_tree_reconstruct_next(c, &r);
1004                 }
1005         }
1006
1007         darray_for_each(r.trees, t) {
1008                 printbuf_reset(&buf);
1009                 snapshot_id_list_to_text(&buf, t);
1010
1011                 darray_for_each(*t, id) {
1012                         if (fsck_err_on(!bch2_snapshot_equiv(c, *id),
1013                                         c, snapshot_node_missing,
1014                                         "snapshot node %u from tree %s missing", *id, buf.buf)) {
1015                                 if (t->nr > 1) {
1016                                         bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
1017                                         ret = -BCH_ERR_fsck_repair_unimplemented;
1018                                         goto err;
1019                                 }
1020
1021                                 ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1022                                                 check_snapshot_exists(trans, *id));
1023                                 if (ret)
1024                                         goto err;
1025                         }
1026                 }
1027         }
1028 fsck_err:
1029 err:
1030         bch2_trans_put(trans);
1031         snapshot_tree_reconstruct_exit(&r);
1032         printbuf_exit(&buf);
1033         bch_err_fn(c, ret);
1034         return ret;
1035 }
1036
1037 /*
1038  * Mark a snapshot as deleted, for future cleanup:
1039  */
1040 int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
1041 {
1042         struct btree_iter iter;
1043         struct bkey_i_snapshot *s;
1044         int ret = 0;
1045
1046         s = bch2_bkey_get_mut_typed(trans, &iter,
1047                                     BTREE_ID_snapshots, POS(0, id),
1048                                     0, snapshot);
1049         ret = PTR_ERR_OR_ZERO(s);
1050         if (unlikely(ret)) {
1051                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
1052                                         trans->c, "missing snapshot %u", id);
1053                 return ret;
1054         }
1055
1056         /* already deleted? */
1057         if (BCH_SNAPSHOT_DELETED(&s->v))
1058                 goto err;
1059
1060         SET_BCH_SNAPSHOT_DELETED(&s->v, true);
1061         SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
1062         s->v.subvol = 0;
1063 err:
1064         bch2_trans_iter_exit(trans, &iter);
1065         return ret;
1066 }
1067
1068 static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
1069 {
1070         if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
1071                 swap(s->children[0], s->children[1]);
1072 }
1073
1074 static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
1075 {
1076         struct bch_fs *c = trans->c;
1077         struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
1078         struct btree_iter c_iter = (struct btree_iter) { NULL };
1079         struct btree_iter tree_iter = (struct btree_iter) { NULL };
1080         struct bkey_s_c_snapshot s;
1081         u32 parent_id, child_id;
1082         unsigned i;
1083         int ret = 0;
1084
1085         s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
1086                                      BTREE_ITER_INTENT, snapshot);
1087         ret = bkey_err(s);
1088         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
1089                                 "missing snapshot %u", id);
1090
1091         if (ret)
1092                 goto err;
1093
1094         BUG_ON(s.v->children[1]);
1095
1096         parent_id = le32_to_cpu(s.v->parent);
1097         child_id = le32_to_cpu(s.v->children[0]);
1098
1099         if (parent_id) {
1100                 struct bkey_i_snapshot *parent;
1101
1102                 parent = bch2_bkey_get_mut_typed(trans, &p_iter,
1103                                      BTREE_ID_snapshots, POS(0, parent_id),
1104                                      0, snapshot);
1105                 ret = PTR_ERR_OR_ZERO(parent);
1106                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
1107                                         "missing snapshot %u", parent_id);
1108                 if (unlikely(ret))
1109                         goto err;
1110
1111                 /* find entry in parent->children for node being deleted */
1112                 for (i = 0; i < 2; i++)
1113                         if (le32_to_cpu(parent->v.children[i]) == id)
1114                                 break;
1115
1116                 if (bch2_fs_inconsistent_on(i == 2, c,
1117                                         "snapshot %u missing child pointer to %u",
1118                                         parent_id, id))
1119                         goto err;
1120
1121                 parent->v.children[i] = cpu_to_le32(child_id);
1122
1123                 normalize_snapshot_child_pointers(&parent->v);
1124         }
1125
1126         if (child_id) {
1127                 struct bkey_i_snapshot *child;
1128
1129                 child = bch2_bkey_get_mut_typed(trans, &c_iter,
1130                                      BTREE_ID_snapshots, POS(0, child_id),
1131                                      0, snapshot);
1132                 ret = PTR_ERR_OR_ZERO(child);
1133                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
1134                                         "missing snapshot %u", child_id);
1135                 if (unlikely(ret))
1136                         goto err;
1137
1138                 child->v.parent = cpu_to_le32(parent_id);
1139
1140                 if (!child->v.parent) {
1141                         child->v.skip[0] = 0;
1142                         child->v.skip[1] = 0;
1143                         child->v.skip[2] = 0;
1144                 }
1145         }
1146
1147         if (!parent_id) {
1148                 /*
1149                  * We're deleting the root of a snapshot tree: update the
1150                  * snapshot_tree entry to point to the new root, or delete it if
1151                  * this is the last snapshot ID in this tree:
1152                  */
1153                 struct bkey_i_snapshot_tree *s_t;
1154
1155                 BUG_ON(s.v->children[1]);
1156
1157                 s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
1158                                 BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
1159                                 0, snapshot_tree);
1160                 ret = PTR_ERR_OR_ZERO(s_t);
1161                 if (ret)
1162                         goto err;
1163
1164                 if (s.v->children[0]) {
1165                         s_t->v.root_snapshot = s.v->children[0];
1166                 } else {
1167                         s_t->k.type = KEY_TYPE_deleted;
1168                         set_bkey_val_u64s(&s_t->k, 0);
1169                 }
1170         }
1171
1172         ret = bch2_btree_delete_at(trans, &iter, 0);
1173 err:
1174         bch2_trans_iter_exit(trans, &tree_iter);
1175         bch2_trans_iter_exit(trans, &p_iter);
1176         bch2_trans_iter_exit(trans, &c_iter);
1177         bch2_trans_iter_exit(trans, &iter);
1178         return ret;
1179 }
1180
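/*
 * Allocate @nr_snapids new snapshot nodes (children of @parent, or roots of a
 * new tree when @parent is 0).  IDs are taken from the empty slots just below
 * the lowest existing snapshot key, preserving the invariant checked by
 * bch2_snapshot_invalid(): a node's ID is always lower than its parent's.
 */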
1181 static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
1182                           u32 *new_snapids,
1183                           u32 *snapshot_subvols,
1184                           unsigned nr_snapids)
1185 {
1186         struct bch_fs *c = trans->c;
1187         struct btree_iter iter;
1188         struct bkey_i_snapshot *n;
1189         struct bkey_s_c k;
1190         unsigned i, j;
1191         u32 depth = bch2_snapshot_depth(c, parent);
1192         int ret;
1193
1194         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
1195                              POS_MIN, BTREE_ITER_INTENT);
1196         k = bch2_btree_iter_peek(&iter);
1197         ret = bkey_err(k);
1198         if (ret)
1199                 goto err;
1200
1201         for (i = 0; i < nr_snapids; i++) {
1202                 k = bch2_btree_iter_prev_slot(&iter);
1203                 ret = bkey_err(k);
1204                 if (ret)
1205                         goto err;
1206
1207                 if (!k.k || !k.k->p.offset) {
1208                         ret = -BCH_ERR_ENOSPC_snapshot_create;
1209                         goto err;
1210                 }
1211
1212                 n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
1213                 ret = PTR_ERR_OR_ZERO(n);
1214                 if (ret)
1215                         goto err;
1216
1217                 n->v.flags      = 0;
1218                 n->v.parent     = cpu_to_le32(parent);
1219                 n->v.subvol     = cpu_to_le32(snapshot_subvols[i]);
1220                 n->v.tree       = cpu_to_le32(tree);
1221                 n->v.depth      = cpu_to_le32(depth);
1222                 n->v.btime.lo   = cpu_to_le64(bch2_current_time(c));
1223                 n->v.btime.hi   = 0;
1224
1225                 for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
1226                         n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
1227
1228                 bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
1229                 SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
1230
1231                 ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
1232                                          bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
1233                 if (ret)
1234                         goto err;
1235
1236                 new_snapids[i]  = iter.pos.offset;
1237
1238                 mutex_lock(&c->snapshot_table_lock);
1239                 snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
1240                 mutex_unlock(&c->snapshot_table_lock);
1241         }
1242 err:
1243         bch2_trans_iter_exit(trans, &iter);
1244         return ret;
1245 }
1246
1247 /*
1248  * Create new snapshot IDs as children of an existing snapshot ID:
1249  */
1250 static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
1251                               u32 *new_snapids,
1252                               u32 *snapshot_subvols,
1253                               unsigned nr_snapids)
1254 {
1255         struct btree_iter iter;
1256         struct bkey_i_snapshot *n_parent;
1257         int ret = 0;
1258
1259         n_parent = bch2_bkey_get_mut_typed(trans, &iter,
1260                         BTREE_ID_snapshots, POS(0, parent),
1261                         0, snapshot);
1262         ret = PTR_ERR_OR_ZERO(n_parent);
1263         if (unlikely(ret)) {
1264                 if (bch2_err_matches(ret, ENOENT))
1265                         bch_err(trans->c, "snapshot %u not found", parent);
1266                 return ret;
1267         }
1268
1269         if (n_parent->v.children[0] || n_parent->v.children[1]) {
1270                 bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
1271                 ret = -EINVAL;
1272                 goto err;
1273         }
1274
1275         ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
1276                              new_snapids, snapshot_subvols, nr_snapids);
1277         if (ret)
1278                 goto err;
1279
1280         n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
1281         n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
1282         n_parent->v.subvol = 0;
1283         SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
1284 err:
1285         bch2_trans_iter_exit(trans, &iter);
1286         return ret;
1287 }
1288
1289 /*
1290  * Create a snapshot node that is the root of a new tree:
1291  */
1292 static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
1293                               u32 *new_snapids,
1294                               u32 *snapshot_subvols,
1295                               unsigned nr_snapids)
1296 {
1297         struct bkey_i_snapshot_tree *n_tree;
1298         int ret;
1299
1300         n_tree = __bch2_snapshot_tree_create(trans);
1301         ret =   PTR_ERR_OR_ZERO(n_tree) ?:
1302                 create_snapids(trans, 0, n_tree->k.p.offset,
1303                              new_snapids, snapshot_subvols, nr_snapids);
1304         if (ret)
1305                 return ret;
1306
1307         n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
1308         n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
1309         return 0;
1310 }
1311
1312 int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
1313                               u32 *new_snapids,
1314                               u32 *snapshot_subvols,
1315                               unsigned nr_snapids)
1316 {
1317         BUG_ON((parent == 0) != (nr_snapids == 1));
1318         BUG_ON((parent != 0) != (nr_snapids == 2));
1319
1320         return parent
1321                 ? bch2_snapshot_node_create_children(trans, parent,
1322                                 new_snapids, snapshot_subvols, nr_snapids)
1323                 : bch2_snapshot_node_create_tree(trans,
1324                                 new_snapids, snapshot_subvols, nr_snapids);
1325
1326 }
1327
1328 /*
1329  * If we have an unlinked inode in an internal snapshot node, and the inode
1330  * really has been deleted in all child snapshots, how does this get cleaned up?
1331  *
1332  * first there is the problem of how keys that have been overwritten in all
1333  * child snapshots get deleted (unimplemented?), but inodes may perhaps be
1334  * special?
1335  *
1336  * also: unlinked inode in internal snapshot appears to not be getting deleted
1337  * correctly if inode doesn't exist in leaf snapshots
1338  *
1339  * solution:
1340  *
1341  * for a key in an interior snapshot node that needs work to be done that
1342  * requires it to be mutated: iterate over all descendant leaf nodes and copy
1343  * that key to snapshot leaf nodes, where we can mutate it
1344  */
1345
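/*
 * Delete a key that only exists in snapshots being removed: either its snapshot
 * ID is on the @deleted list, or another key at this position from the same
 * equivalence class has already been kept (@equiv_seen), making this one
 * redundant.
 */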
1346 static int snapshot_delete_key(struct btree_trans *trans,
1347                                struct btree_iter *iter,
1348                                struct bkey_s_c k,
1349                                snapshot_id_list *deleted,
1350                                snapshot_id_list *equiv_seen,
1351                                struct bpos *last_pos)
1352 {
1353         struct bch_fs *c = trans->c;
1354         u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
1355
1356         if (!bkey_eq(k.k->p, *last_pos))
1357                 equiv_seen->nr = 0;
1358         *last_pos = k.k->p;
1359
1360         if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
1361             snapshot_list_has_id(equiv_seen, equiv)) {
1362                 return bch2_btree_delete_at(trans, iter,
1363                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1364         } else {
1365                 return snapshot_list_add(c, equiv_seen, equiv);
1366         }
1367 }
1368
1369 static int move_key_to_correct_snapshot(struct btree_trans *trans,
1370                                struct btree_iter *iter,
1371                                struct bkey_s_c k)
1372 {
1373         struct bch_fs *c = trans->c;
1374         u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
1375
1376         /*
1377          * When we have a linear chain of snapshot nodes, we consider
1378          * those to form an equivalence class: we're going to collapse
1379          * them all down to a single node, and keep the leaf-most node -
1380          * which has the same id as the equivalence class id.
1381          *
1382          * If there are multiple keys in different snapshots at the same
1383          * position, we're only going to keep the one in the newest
1384          * snapshot - the rest have been overwritten and are redundant,
1385          * and for the key we're going to keep we need to move it to the
1386          * equivalence class ID if it's not there already.
1387          */
1388         if (equiv != k.k->p.snapshot) {
1389                 struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
1390                 struct btree_iter new_iter;
1391                 int ret;
1392
1393                 ret = PTR_ERR_OR_ZERO(new);
1394                 if (ret)
1395                         return ret;
1396
1397                 new->k.p.snapshot = equiv;
1398
1399                 bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
1400                                      BTREE_ITER_ALL_SNAPSHOTS|
1401                                      BTREE_ITER_CACHED|
1402                                      BTREE_ITER_INTENT);
1403
1404                 ret =   bch2_btree_iter_traverse(&new_iter) ?:
1405                         bch2_trans_update(trans, &new_iter, new,
1406                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
1407                         bch2_btree_delete_at(trans, iter,
1408                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1409                 bch2_trans_iter_exit(trans, &new_iter);
1410                 if (ret)
1411                         return ret;
1412         }
1413
1414         return 0;
1415 }
1416
1417 static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k)
1418 {
1419         struct bkey_s_c_snapshot snap;
1420         u32 children[2];
1421         int ret;
1422
1423         if (k.k->type != KEY_TYPE_snapshot)
1424                 return 0;
1425
1426         snap = bkey_s_c_to_snapshot(k);
1427         if (BCH_SNAPSHOT_DELETED(snap.v) ||
1428             BCH_SNAPSHOT_SUBVOL(snap.v))
1429                 return 0;
1430
1431         children[0] = le32_to_cpu(snap.v->children[0]);
1432         children[1] = le32_to_cpu(snap.v->children[1]);
1433
1434         ret   = bch2_snapshot_live(trans, children[0]) ?:
1435                 bch2_snapshot_live(trans, children[1]);
1436         if (ret < 0)
1437                 return ret;
1438         return !ret;
1439 }
1440
1441 /*
1442  * For a given snapshot, if it doesn't have a subvolume that points to it, and
1443  * it doesn't have child snapshot nodes - it's now redundant and we can mark it
1444  * as deleted.
1445  */
1446 static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k)
1447 {
1448         int ret = bch2_snapshot_needs_delete(trans, k);
1449
1450         return ret <= 0
1451                 ? ret
1452                 : bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
1453 }
1454
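/*
 * Walk @n parents up from @id, not counting nodes in @skip; used to repoint
 * skiplist entries away from snapshot nodes that are about to be deleted.
 */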
1455 static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
1456                                                 snapshot_id_list *skip)
1457 {
1458         rcu_read_lock();
1459         while (snapshot_list_has_id(skip, id))
1460                 id = __bch2_snapshot_parent(c, id);
1461
1462         while (n--) {
1463                 do {
1464                         id = __bch2_snapshot_parent(c, id);
1465                 } while (snapshot_list_has_id(skip, id));
1466         }
1467         rcu_read_unlock();
1468
1469         return id;
1470 }
1471
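/*
 * For a snapshot node that is not itself being deleted: subtract the number
 * of deleted ancestors from its depth, and repoint any skiplist entries that
 * reference soon-to-be-deleted nodes at surviving ancestors.
 */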
1472 static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
1473                                               struct btree_iter *iter, struct bkey_s_c k,
1474                                               snapshot_id_list *deleted)
1475 {
1476         struct bch_fs *c = trans->c;
1477         u32 nr_deleted_ancestors = 0;
1478         struct bkey_i_snapshot *s;
1479         int ret;
1480
1481         if (k.k->type != KEY_TYPE_snapshot)
1482                 return 0;
1483
1484         if (snapshot_list_has_id(deleted, k.k->p.offset))
1485                 return 0;
1486
1487         s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
1488         ret = PTR_ERR_OR_ZERO(s);
1489         if (ret)
1490                 return ret;
1491
1492         darray_for_each(*deleted, i)
1493                 nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i);
1494
1495         if (!nr_deleted_ancestors)
1496                 return 0;
1497
1498         le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);
1499
1500         if (!s->v.depth) {
1501                 s->v.skip[0] = 0;
1502                 s->v.skip[1] = 0;
1503                 s->v.skip[2] = 0;
1504         } else {
1505                 u32 depth = le32_to_cpu(s->v.depth);
1506                 u32 parent = bch2_snapshot_parent(c, s->k.p.offset);
1507
1508                 for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
1509                         u32 id = le32_to_cpu(s->v.skip[j]);
1510
1511                         if (snapshot_list_has_id(deleted, id)) {
1512                                 id = bch2_snapshot_nth_parent_skip(c,
1513                                                         parent,
1514                                                         depth > 1
1515                                                         ? get_random_u32_below(depth - 1)
1516                                                         : 0,
1517                                                         deleted);
1518                                 s->v.skip[j] = cpu_to_le32(id);
1519                         }
1520                 }
1521
1522                 bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
1523         }
1524
1525         return bch2_trans_update(trans, iter, &s->k_i, 0);
1526 }
1527
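/*
 * Delete dead snapshots: mark redundant nodes as deleted, compute equivalence
 * classes, delete or move keys belonging to dying snapshots, fix up the depth
 * and skiplists of surviving children, and finally delete the snapshot nodes
 * themselves.
 */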
1528 int bch2_delete_dead_snapshots(struct bch_fs *c)
1529 {
1530         struct btree_trans *trans;
1531         snapshot_id_list deleted = { 0 };
1532         snapshot_id_list deleted_interior = { 0 };
1533         u32 id;
1534         int ret = 0;
1535
1536         if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
1537                 return 0;
1538
1539         if (!test_bit(BCH_FS_started, &c->flags)) {
1540                 ret = bch2_fs_read_write_early(c);
1541                 bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
1542                 if (ret)
1543                         return ret;
1544         }
1545
1546         trans = bch2_trans_get(c);
1547
1548         /*
1549          * For every snapshot node: if it has no live children and it's not
1550          * pointed to by a subvolume, delete it:
1551          */
1552         ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
1553                         POS_MIN, 0, k,
1554                         NULL, NULL, 0,
1555                 bch2_delete_redundant_snapshot(trans, k));
1556         bch_err_msg(c, ret, "deleting redundant snapshots");
1557         if (ret)
1558                 goto err;
1559
1560         ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1561                                  POS_MIN, 0, k,
1562                 bch2_snapshot_set_equiv(trans, k));
1563         bch_err_msg(c, ret, "in bch2_snapshot_set_equiv");
1564         if (ret)
1565                 goto err;
1566
1567         ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1568                                  POS_MIN, 0, k, ({
1569                 if (k.k->type != KEY_TYPE_snapshot)
1570                         continue;
1571
1572                 BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)
1573                         ? snapshot_list_add(c, &deleted, k.k->p.offset)
1574                         : 0;
1575         }));
1576         bch_err_msg(c, ret, "walking snapshots");
1577         if (ret)
1578                 goto err;
1579
1580         for (id = 0; id < BTREE_ID_NR; id++) {
1581                 struct bpos last_pos = POS_MIN;
1582                 snapshot_id_list equiv_seen = { 0 };
1583                 struct disk_reservation res = { 0 };
1584
1585                 if (!btree_type_has_snapshots(id))
1586                         continue;
1587
1588                 /*
1589                  * deleted inodes btree is maintained by a trigger on the inodes
1590                  * btree - no work for us to do here, and it's not safe to scan
1591                  * it because we'll see out of date keys due to the btree write
1592                  * buffer:
1593                  */
1594                 if (id == BTREE_ID_deleted_inodes)
1595                         continue;
1596
1597                 ret = for_each_btree_key_commit(trans, iter,
1598                                 id, POS_MIN,
1599                                 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1600                                 &res, NULL, BCH_TRANS_COMMIT_no_enospc,
1601                         snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
1602                       for_each_btree_key_commit(trans, iter,
1603                                 id, POS_MIN,
1604                                 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1605                                 &res, NULL, BCH_TRANS_COMMIT_no_enospc,
1606                         move_key_to_correct_snapshot(trans, &iter, k));
1607
1608                 bch2_disk_reservation_put(c, &res);
1609                 darray_exit(&equiv_seen);
1610
1611                 bch_err_msg(c, ret, "deleting keys from dying snapshots");
1612                 if (ret)
1613                         goto err;
1614         }
1615
1616         bch2_trans_unlock(trans);
1617         down_write(&c->snapshot_create_lock);
1618
1619         ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1620                                  POS_MIN, 0, k, ({
1621                 u32 snapshot = k.k->p.offset;
1622                 u32 equiv = bch2_snapshot_equiv(c, snapshot);
1623
1624                 equiv != snapshot
1625                         ? snapshot_list_add(c, &deleted_interior, snapshot)
1626                         : 0;
1627         }));
1628
1629         bch_err_msg(c, ret, "walking snapshots");
1630         if (ret)
1631                 goto err_create_lock;
1632
1633         /*
1634          * Fixing children of deleted snapshots can't be done completely
1635          * atomically; if we crash between here and when we delete the interior
1636          * nodes, some depth fields will be off:
1637          */
1638         ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
1639                                   BTREE_ITER_INTENT, k,
1640                                   NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1641                 bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
1642         if (ret)
1643                 goto err_create_lock;
1644
1645         darray_for_each(deleted, i) {
1646                 ret = commit_do(trans, NULL, NULL, 0,
1647                         bch2_snapshot_node_delete(trans, *i));
1648                 bch_err_msg(c, ret, "deleting snapshot %u", *i);
1649                 if (ret)
1650                         goto err_create_lock;
1651         }
1652
1653         darray_for_each(deleted_interior, i) {
1654                 ret = commit_do(trans, NULL, NULL, 0,
1655                         bch2_snapshot_node_delete(trans, *i));
1656                 bch_err_msg(c, ret, "deleting snapshot %u", *i);
1657                 if (ret)
1658                         goto err_create_lock;
1659         }
1660 err_create_lock:
1661         up_write(&c->snapshot_create_lock);
1662 err:
1663         darray_exit(&deleted_interior);
1664         darray_exit(&deleted);
1665         bch2_trans_put(trans);
1666         bch_err_fn(c, ret);
1667         return ret;
1668 }
1669
1670 void bch2_delete_dead_snapshots_work(struct work_struct *work)
1671 {
1672         struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
1673
1674         bch2_delete_dead_snapshots(c);
1675         bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1676 }
1677
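/*
 * Kick off dead snapshot deletion on the write ref workqueue; if the work is
 * already queued, drop the reference we just took.
 */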
1678 void bch2_delete_dead_snapshots_async(struct bch_fs *c)
1679 {
1680         if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
1681             !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
1682                 bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1683 }
1684
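/*
 * Returns 1 if a key exists at the same position in a snapshot that is a
 * descendant of @pos.snapshot (i.e. the key at @pos has been overwritten in a
 * newer snapshot), 0 if not, or a negative error code.
 */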
1685 int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
1686                                        enum btree_id id,
1687                                        struct bpos pos)
1688 {
1689         struct bch_fs *c = trans->c;
1690         struct btree_iter iter;
1691         struct bkey_s_c k;
1692         int ret;
1693
1694         bch2_trans_iter_init(trans, &iter, id, pos,
1695                              BTREE_ITER_NOT_EXTENTS|
1696                              BTREE_ITER_ALL_SNAPSHOTS);
1697         while (1) {
1698                 k = bch2_btree_iter_prev(&iter);
1699                 ret = bkey_err(k);
1700                 if (ret)
1701                         break;
1702
1703                 if (!k.k)
1704                         break;
1705
1706                 if (!bkey_eq(pos, k.k->p))
1707                         break;
1708
1709                 if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
1710                         ret = 1;
1711                         break;
1712                 }
1713         }
1714         bch2_trans_iter_exit(trans, &iter);
1715
1716         return ret;
1717 }
1718
1719 static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id)
1720 {
1721         const struct snapshot_t *s = snapshot_t(c, id);
1722
1723         return s->children[1] ?: s->children[0];
1724 }
1725
1726 static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id)
1727 {
1728         u32 child;
1729
1730         while ((child = bch2_snapshot_smallest_child(c, id)))
1731                 id = child;
1732         return id;
1733 }
1734
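/*
 * Write a copy of @interior_k at leaf snapshot @leaf_id, unless the key
 * visible at that position in @leaf_id has already been overwritten in a more
 * specific snapshot.
 */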
1735 static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
1736                                                enum btree_id btree,
1737                                                struct bkey_s_c interior_k,
1738                                                u32 leaf_id, struct bpos *new_min_pos)
1739 {
1740         struct btree_iter iter;
1741         struct bpos pos = interior_k.k->p;
1742         struct bkey_s_c k;
1743         struct bkey_i *new;
1744         int ret;
1745
1746         pos.snapshot = leaf_id;
1747
1748         bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
1749         k = bch2_btree_iter_peek_slot(&iter);
1750         ret = bkey_err(k);
1751         if (ret)
1752                 goto out;
1753
1754         /* key already overwritten in this snapshot? */
1755         if (k.k->p.snapshot != interior_k.k->p.snapshot)
1756                 goto out;
1757
1758         if (bpos_eq(*new_min_pos, POS_MIN)) {
1759                 *new_min_pos = k.k->p;
1760                 new_min_pos->snapshot = leaf_id;
1761         }
1762
1763         new = bch2_bkey_make_mut_noupdate(trans, interior_k);
1764         ret = PTR_ERR_OR_ZERO(new);
1765         if (ret)
1766                 goto out;
1767
1768         new->k.p.snapshot = leaf_id;
1769         ret = bch2_trans_update(trans, &iter, new, 0);
1770 out:
1771         bch2_trans_iter_exit(trans, &iter);
1772         return ret;
1773 }
1774
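/*
 * Copy @k into every leaf snapshot that is a descendant of k's (interior)
 * snapshot, committing as we go; *new_min_pos is set to the position of the
 * first copy written, or left at POS_MIN if nothing needed copying.
 */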
1775 int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
1776                                           enum btree_id btree,
1777                                           struct bkey_s_c k,
1778                                           struct bpos *new_min_pos)
1779 {
1780         struct bch_fs *c = trans->c;
1781         struct bkey_buf sk;
1782         u32 restart_count = trans->restart_count;
1783         int ret = 0;
1784
1785         bch2_bkey_buf_init(&sk);
1786         bch2_bkey_buf_reassemble(&sk, c, k);
1787         k = bkey_i_to_s_c(sk.k);
1788
1789         *new_min_pos = POS_MIN;
1790
1791         for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot);
1792              id < k.k->p.snapshot;
1793              id++) {
1794                 if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) ||
1795                     !bch2_snapshot_is_leaf(c, id))
1796                         continue;
1797 again:
1798                 ret =   btree_trans_too_many_iters(trans) ?:
1799                         bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?:
1800                         bch2_trans_commit(trans, NULL, NULL, 0);
1801                 if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
1802                         bch2_trans_begin(trans);
1803                         goto again;
1804                 }
1805
1806                 if (ret)
1807                         break;
1808         }
1809
1810         bch2_bkey_buf_exit(&sk, c);
1811
1812         return ret ?: trans_was_restarted(trans, restart_count);
1813 }
1814
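/*
 * If this snapshot node is already flagged as deleted, isn't its own
 * equivalence class, or has no remaining live children, flag the filesystem
 * as needing dead snapshot deletion.
 */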
1815 static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
1816 {
1817         struct bch_fs *c = trans->c;
1818         struct bkey_s_c_snapshot snap;
1819         int ret = 0;
1820
1821         if (k.k->type != KEY_TYPE_snapshot)
1822                 return 0;
1823
1824         snap = bkey_s_c_to_snapshot(k);
1825         if (BCH_SNAPSHOT_DELETED(snap.v) ||
1826             bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset ||
1827             (ret = bch2_snapshot_needs_delete(trans, k)) > 0) {
1828                 set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
1829                 return 0;
1830         }
1831
1832         return ret;
1833 }
1834
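/*
 * Load the snapshots btree into the in-memory snapshot table, compute
 * equivalence classes, check whether dead snapshot deletion is needed, and
 * fill in the is_ancestor bitmaps.
 */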
1835 int bch2_snapshots_read(struct bch_fs *c)
1836 {
1837         int ret = bch2_trans_run(c,
1838                 for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1839                                    POS_MIN, 0, k,
1840                         __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
1841                         bch2_snapshot_set_equiv(trans, k) ?:
1842                         bch2_check_snapshot_needs_deletion(trans, k)) ?:
1843                 for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1844                                    POS_MIN, 0, k,
1845                            (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
1846         bch_err_fn(c, ret);
1847
1848         /*
1849          * It's important that we check if we need to reconstruct snapshots
1850          * before going RW, so we mark that pass as required in the superblock -
1851          * otherwise, we could end up deleting keys with missing snapshot nodes
1852          * instead of reconstructing them.
1853          */
1854         BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
1855                test_bit(BCH_FS_may_go_rw, &c->flags));
1856
1857         if (bch2_err_matches(ret, EIO) ||
1858             (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
1859                 ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
1860
1861         return ret;
1862 }
1863
1864 void bch2_fs_snapshots_exit(struct bch_fs *c)
1865 {
1866         kvfree(rcu_dereference_protected(c->snapshots, true));
1867 }