sched/doc: Update documentation for base_slice_ns and CONFIG_HZ relation
[sfrench/cifs-2.6.git] / fs / bcachefs / journal_io.c
1 // SPDX-License-Identifier: GPL-2.0
2 #include "bcachefs.h"
3 #include "alloc_background.h"
4 #include "alloc_foreground.h"
5 #include "btree_io.h"
6 #include "btree_update_interior.h"
7 #include "btree_write_buffer.h"
8 #include "buckets.h"
9 #include "checksum.h"
10 #include "disk_groups.h"
11 #include "error.h"
12 #include "journal.h"
13 #include "journal_io.h"
14 #include "journal_reclaim.h"
15 #include "journal_seq_blacklist.h"
16 #include "replicas.h"
17 #include "sb-clean.h"
18 #include "trace.h"
19
20 void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
21                                struct journal_replay *j)
22 {
23         darray_for_each(j->ptrs, i) {
24                 struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
25                 u64 offset;
26
27                 div64_u64_rem(i->sector, ca->mi.bucket_size, &offset);
28
29                 if (i != j->ptrs.data)
30                         prt_printf(out, " ");
31                 prt_printf(out, "%u:%u:%u (sector %llu)",
32                            i->dev, i->bucket, i->bucket_offset, i->sector);
33         }
34 }
35
36 static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
37                                         struct journal_replay *j)
38 {
39         prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
40
41         bch2_journal_ptrs_to_text(out, c, j);
42
43         for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
44                 struct jset_entry_datetime *datetime =
45                         container_of(entry, struct jset_entry_datetime, entry);
46                 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
47                 break;
48         }
49 }
50
51 static struct nonce journal_nonce(const struct jset *jset)
52 {
53         return (struct nonce) {{
54                 [0] = 0,
55                 [1] = ((__le32 *) &jset->seq)[0],
56                 [2] = ((__le32 *) &jset->seq)[1],
57                 [3] = BCH_NONCE_JOURNAL,
58         }};
59 }
60
61 static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum)
62 {
63         if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) {
64                 *csum = (struct bch_csum) {};
65                 return false;
66         }
67
68         *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
69         return !bch2_crc_cmp(j->csum, *csum);
70 }
71
72 static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
73 {
74         return (seq - c->journal_entries_base_seq) & (~0U >> 1);
75 }
76
77 static void __journal_replay_free(struct bch_fs *c,
78                                   struct journal_replay *i)
79 {
80         struct journal_replay **p =
81                 genradix_ptr(&c->journal_entries,
82                              journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));
83
84         BUG_ON(*p != i);
85         *p = NULL;
86         kvfree(i);
87 }
88
/*
 * Mark a replay entry as not-to-be-replayed (recording why), and free it
 * unless the read_entire_journal option says to keep everything around.
 */
static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
{
	if (blacklisted)
		i->ignore_blacklisted = true;
	else
		i->ignore_not_dirty = true;

	if (!c->opts.read_entire_journal)
		__journal_replay_free(c, i);
}
99
/* State shared by the journal read path while entries are being collected. */
struct journal_list {
	struct closure		cl;		/* NOTE(review): presumably completion for in-flight reads — confirm */
	u64			last_seq;	/* entries with seq below this are no longer needed */
	struct mutex		lock;		/* NOTE(review): presumably serializes journal_entry_add() — confirm */
	int			ret;
};

/* Return codes for journal_entry_add(): */
#define JOURNAL_ENTRY_ADD_OK		0
#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE	5
109
110 /*
111  * Given a journal entry we just read, add it to the list of journal entries to
112  * be replayed:
113  */
114 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
115                              struct journal_ptr entry_ptr,
116                              struct journal_list *jlist, struct jset *j)
117 {
118         struct genradix_iter iter;
119         struct journal_replay **_i, *i, *dup;
120         size_t bytes = vstruct_bytes(j);
121         u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
122         struct printbuf buf = PRINTBUF;
123         int ret = JOURNAL_ENTRY_ADD_OK;
124
125         /* Is this entry older than the range we need? */
126         if (!c->opts.read_entire_journal &&
127             le64_to_cpu(j->seq) < jlist->last_seq)
128                 return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
129
130         /*
131          * genradixes are indexed by a ulong, not a u64, so we can't index them
132          * by sequence number directly: Assume instead that they will all fall
133          * within the range of +-2billion of the filrst one we find.
134          */
135         if (!c->journal_entries_base_seq)
136                 c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
137
138         /* Drop entries we don't need anymore */
139         if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
140                 genradix_for_each_from(&c->journal_entries, iter, _i,
141                                        journal_entry_radix_idx(c, jlist->last_seq)) {
142                         i = *_i;
143
144                         if (journal_replay_ignore(i))
145                                 continue;
146
147                         if (le64_to_cpu(i->j.seq) >= last_seq)
148                                 break;
149
150                         journal_replay_free(c, i, false);
151                 }
152         }
153
154         jlist->last_seq = max(jlist->last_seq, last_seq);
155
156         _i = genradix_ptr_alloc(&c->journal_entries,
157                                 journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
158                                 GFP_KERNEL);
159         if (!_i)
160                 return -BCH_ERR_ENOMEM_journal_entry_add;
161
162         /*
163          * Duplicate journal entries? If so we want the one that didn't have a
164          * checksum error:
165          */
166         dup = *_i;
167         if (dup) {
168                 bool identical = bytes == vstruct_bytes(&dup->j) &&
169                         !memcmp(j, &dup->j, bytes);
170                 bool not_identical = !identical &&
171                         entry_ptr.csum_good &&
172                         dup->csum_good;
173
174                 bool same_device = false;
175                 darray_for_each(dup->ptrs, ptr)
176                         if (ptr->dev == ca->dev_idx)
177                                 same_device = true;
178
179                 ret = darray_push(&dup->ptrs, entry_ptr);
180                 if (ret)
181                         goto out;
182
183                 bch2_journal_replay_to_text(&buf, c, dup);
184
185                 fsck_err_on(same_device,
186                             c, journal_entry_dup_same_device,
187                             "duplicate journal entry on same device\n  %s",
188                             buf.buf);
189
190                 fsck_err_on(not_identical,
191                             c, journal_entry_replicas_data_mismatch,
192                             "found duplicate but non identical journal entries\n  %s",
193                             buf.buf);
194
195                 if (entry_ptr.csum_good && !identical)
196                         goto replace;
197
198                 goto out;
199         }
200 replace:
201         i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
202         if (!i)
203                 return -BCH_ERR_ENOMEM_journal_entry_add;
204
205         darray_init(&i->ptrs);
206         i->csum_good            = entry_ptr.csum_good;
207         i->ignore_blacklisted   = false;
208         i->ignore_not_dirty     = false;
209         unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
210
211         if (dup) {
212                 /* The first ptr should represent the jset we kept: */
213                 darray_for_each(dup->ptrs, ptr)
214                         darray_push(&i->ptrs, *ptr);
215                 __journal_replay_free(c, dup);
216         } else {
217                 darray_push(&i->ptrs, entry_ptr);
218         }
219
220         *_i = i;
221 out:
222 fsck_err:
223         printbuf_exit(&buf);
224         return ret;
225 }
226
227 /* this fills in a range with empty jset_entries: */
228 static void journal_entry_null_range(void *start, void *end)
229 {
230         struct jset_entry *entry;
231
232         for (entry = start; entry != end; entry = vstruct_next(entry))
233                 memset(entry, 0, sizeof(*entry));
234 }
235
/* Additional return codes used by the journal entry read/validate paths: */
#define JOURNAL_ENTRY_REREAD	5
#define JOURNAL_ENTRY_NONE	6
#define JOURNAL_ENTRY_BAD	7
239
240 static void journal_entry_err_msg(struct printbuf *out,
241                                   u32 version,
242                                   struct jset *jset,
243                                   struct jset_entry *entry)
244 {
245         prt_str(out, "invalid journal entry, version=");
246         bch2_version_to_text(out, version);
247
248         if (entry) {
249                 prt_str(out, " type=");
250                 prt_str(out, bch2_jset_entry_types[entry->type]);
251         }
252
253         if (!jset) {
254                 prt_printf(out, " in superblock");
255         } else {
256
257                 prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));
258
259                 if (entry)
260                         prt_printf(out, " offset=%zi/%u",
261                                    (u64 *) entry - jset->_data,
262                                    le32_to_cpu(jset->u64s));
263         }
264
265         prt_str(out, ": ");
266 }
267
/*
 * Report a journal entry validation error.  On the read side this is a
 * fixable fsck error; on the write side we're about to write corrupt
 * metadata, so the error is counted and, if the fs is marked inconsistent,
 * we bail out hard.
 *
 * Relies on @flags, @ret, and an fsck_err: label being in scope in the
 * calling function (mustfix_fsck_err() jumps there on fatal errors).
 * Evaluates to true so it composes with journal_entry_err_on() below.
 *
 * NOTE(review): when mustfix_fsck_err() takes the goto, the local _buf's
 * printbuf_exit() is skipped, leaking its allocation on that (fatal)
 * path — confirm whether that's acceptable.
 */
#define journal_entry_err(c, version, jset, entry, _err, msg, ...)	\
({									\
	struct printbuf _buf = PRINTBUF;				\
									\
	journal_entry_err_msg(&_buf, version, jset, entry);		\
	prt_printf(&_buf, msg, ##__VA_ARGS__);				\
									\
	switch (flags & BKEY_INVALID_WRITE) {				\
	case READ:							\
		mustfix_fsck_err(c, _err, "%s", _buf.buf);		\
		break;							\
	case WRITE:							\
		bch2_sb_error_count(c, BCH_FSCK_ERR_##_err);		\
		bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
		if (bch2_fs_inconsistent(c)) {				\
			ret = -BCH_ERR_fsck_errors_not_fixed;		\
			goto fsck_err;					\
		}							\
		break;							\
	}								\
									\
	printbuf_exit(&_buf);						\
	true;								\
})

#define journal_entry_err_on(cond, ...)					\
	((cond) ? journal_entry_err(__VA_ARGS__) : false)

/* Returned by journal_validate_key() when it dropped the key being checked */
#define FSCK_DELETED_KEY	5
297
/*
 * Validate one bkey inside a journal entry, repairing the entry in place
 * when the key is bad.  When a key is dropped, the entry is shrunk
 * (entry->u64s adjusted), remaining keys are shifted down over it, the
 * freed tail is nulled, and FSCK_DELETED_KEY is returned so the caller
 * knows @k now points at the following key (or the end).
 */
static int journal_validate_key(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned level, enum btree_id btree_id,
				struct bkey_i *k,
				unsigned version, int big_endian,
				enum bkey_invalid_flags flags)
{
	int write = flags & BKEY_INVALID_WRITE;
	void *next = vstruct_next(entry);
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	/* A zero-length key can't be stepped over — truncate the entry at @k: */
	if (journal_entry_err_on(!k->k.u64s,
				 c, version, jset, entry,
				 journal_entry_bkey_u64s_0,
				 "k->u64s 0")) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	/* Key claims to extend past the end of the entry — truncate at @k: */
	if (journal_entry_err_on((void *) bkey_next(k) >
				 (void *) vstruct_next(entry),
				 c, version, jset, entry,
				 journal_entry_bkey_past_end,
				 "extends past end of journal entry")) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	/* Unknown key format — drop just this key and close the gap: */
	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
				 c, version, jset, entry,
				 journal_entry_bkey_bad_format,
				 "bad format %u", k->k.format)) {
		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	/* Format-compat transform runs before validation on read, after on write: */
	if (!write)
		bch2_bkey_compat(level, btree_id, version, big_endian,
				 write, NULL, bkey_to_packed(k));

	if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
			      __btree_node_type(level, btree_id), write, &buf)) {
		printbuf_reset(&buf);
		journal_entry_err_msg(&buf, version, jset, entry);
		prt_newline(&buf);
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
		prt_newline(&buf);
		/* Second call appends the reason the key is invalid: */
		bch2_bkey_invalid(c, bkey_i_to_s_c(k),
				  __btree_node_type(level, btree_id), write, &buf);

		mustfix_fsck_err(c, journal_entry_bkey_invalid,
				 "%s", buf.buf);

		/* Drop the invalid key, as in the bad-format case above: */
		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);

		printbuf_exit(&buf);
		return FSCK_DELETED_KEY;
	}

	if (write)
		bch2_bkey_compat(level, btree_id, version, big_endian,
				 write, NULL, bkey_to_packed(k));
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
374
/* Validate every key in a btree-keys entry, dropping any that are bad. */
static int journal_entry_btree_keys_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bkey_invalid_flags flags)
{
	struct bkey_i *k = entry->start;

	while (k != vstruct_last(entry)) {
		int ret = journal_validate_key(c, jset, entry,
					       entry->level,
					       entry->btree_id,
					       k, version, big_endian,
					       flags|BKEY_INVALID_JOURNAL);
		/*
		 * FSCK_DELETED_KEY means the key was excised and the
		 * following keys shifted down to @k — don't advance.
		 *
		 * NOTE(review): other nonzero return codes (e.g. fatal fsck
		 * errors on the write path) are swallowed here and 0 is
		 * returned regardless — confirm that's intentional.
		 */
		if (ret == FSCK_DELETED_KEY)
			continue;

		k = bkey_next(k);
	}

	return 0;
}
397
398 static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
399                                              struct jset_entry *entry)
400 {
401         bool first = true;
402
403         jset_entry_for_each_key(entry, k) {
404                 if (!first) {
405                         prt_newline(out);
406                         prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
407                 }
408                 prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
409                 bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
410                 first = false;
411         }
412 }
413
/* Validate a btree root entry: exactly one key, which must itself be valid. */
static int journal_entry_btree_root_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bkey_invalid_flags flags)
{
	struct bkey_i *k = entry->start;
	int ret = 0;

	if (journal_entry_err_on(!entry->u64s ||
				 le16_to_cpu(entry->u64s) != k->k.u64s,
				 c, version, jset, entry,
				 journal_entry_btree_root_bad_size,
				 "invalid btree root journal entry: wrong number of keys")) {
		void *next = vstruct_next(entry);
		/*
		 * we don't want to null out this jset_entry,
		 * just the contents, so that later we can tell
		 * we were _supposed_ to have a btree root
		 */
		entry->u64s = 0;
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	/* Roots are validated at level 1; a deleted key is not an error here: */
	ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
				   version, big_endian, flags);
	if (ret == FSCK_DELETED_KEY)
		ret = 0;
fsck_err:
	return ret;
}
446
/* Btree roots print the same way as ordinary btree key entries. */
static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}
452
static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bkey_invalid_flags flags)
{
	/* obsolete, don't care: */
	return 0;
}
462
/* Obsolete entry type — nothing to print. */
static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
}
467
/* A v1 blacklist entry is a single u64 seq — size check only. */
static int journal_entry_blacklist_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bkey_invalid_flags flags)
{
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
				 c, version, jset, entry,
				 journal_entry_blacklist_bad_size,
		"invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
fsck_err:
	return ret;
}
485
486 static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
487                                             struct jset_entry *entry)
488 {
489         struct jset_entry_blacklist *bl =
490                 container_of(entry, struct jset_entry_blacklist, entry);
491
492         prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
493 }
494
/*
 * A v2 blacklist entry holds a [start, end] seq range: check the size and
 * that the range isn't inverted.
 */
static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bkey_invalid_flags flags)
{
	struct jset_entry_blacklist_v2 *bl_entry;
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
				 c, version, jset, entry,
				 journal_entry_blacklist_v2_bad_size,
		"invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);

	if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
				 le64_to_cpu(bl_entry->end),
				 c, version, jset, entry,
				 journal_entry_blacklist_v2_start_past_end,
		"invalid journal seq blacklist entry: start > end")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
out:
fsck_err:
	return ret;
}
525
526 static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
527                                                struct jset_entry *entry)
528 {
529         struct jset_entry_blacklist_v2 *bl =
530                 container_of(entry, struct jset_entry_blacklist_v2, entry);
531
532         prt_printf(out, "start=%llu end=%llu",
533                le64_to_cpu(bl->start),
534                le64_to_cpu(bl->end));
535 }
536
/* An fs-usage entry must be at least as large as struct jset_entry_usage. */
static int journal_entry_usage_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bkey_invalid_flags flags)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);
	/* Total entry size in bytes, header included: */
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u),
				 c, version, jset, entry,
				 journal_entry_usage_bad_size,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}
559
560 static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
561                                         struct jset_entry *entry)
562 {
563         struct jset_entry_usage *u =
564                 container_of(entry, struct jset_entry_usage, entry);
565
566         prt_printf(out, "type=%s v=%llu",
567                bch2_fs_usage_types[u->entry.btree_id],
568                le64_to_cpu(u->v));
569 }
570
/*
 * Validate a data-usage entry: size check (header plus variable-length
 * replicas device list), then validate the replicas entry itself.
 */
static int journal_entry_data_usage_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bkey_invalid_flags flags)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	struct printbuf err = PRINTBUF;
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u) ||
				 bytes < sizeof(*u) + u->r.nr_devs,
				 c, version, jset, entry,
				 journal_entry_data_usage_bad_size,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	/*
	 * NOTE(review): reuses the ..._bad_size error id for replicas
	 * validation failures too — confirm that's the intended id.
	 */
	if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err),
				 c, version, jset, entry,
				 journal_entry_data_usage_bad_size,
				 "invalid journal entry usage: %s", err.buf)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}
out:
fsck_err:
	printbuf_exit(&err);
	return ret;
}
604
605 static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
606                                              struct jset_entry *entry)
607 {
608         struct jset_entry_data_usage *u =
609                 container_of(entry, struct jset_entry_data_usage, entry);
610
611         bch2_replicas_entry_to_text(out, &u->r);
612         prt_printf(out, "=%llu", le64_to_cpu(u->v));
613 }
614
/* Clock entries are fixed size and rw must be 0 (read) or 1 (write). */
static int journal_entry_clock_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bkey_invalid_flags flags)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes != sizeof(*clock),
				 c, version, jset, entry,
				 journal_entry_clock_bad_size,
				 "bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(clock->rw > 1,
				 c, version, jset, entry,
				 journal_entry_clock_bad_rw,
				 "bad rw")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}
645
646 static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
647                                         struct jset_entry *entry)
648 {
649         struct jset_entry_clock *clock =
650                 container_of(entry, struct jset_entry_clock, entry);
651
652         prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
653 }
654
/*
 * Validate a device-usage entry: minimum size, the referenced device must
 * exist, and the pad field must be zero.
 */
static int journal_entry_dev_usage_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bkey_invalid_flags flags)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	unsigned expected = sizeof(*u);
	unsigned dev;
	int ret = 0;

	if (journal_entry_err_on(bytes < expected,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_size,
				 "bad size (%u < %u)",
				 bytes, expected)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	dev = le32_to_cpu(u->dev);

	if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_dev,
				 "bad dev")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(u->pad,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_pad,
				 "bad pad")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}
698
699 static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
700                                             struct jset_entry *entry)
701 {
702         struct jset_entry_dev_usage *u =
703                 container_of(entry, struct jset_entry_dev_usage, entry);
704         unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
705
706         prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
707
708         for (i = 0; i < nr_types; i++) {
709                 bch2_prt_data_type(out, i);
710                 prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
711                        le64_to_cpu(u->d[i].buckets),
712                        le64_to_cpu(u->d[i].sectors),
713                        le64_to_cpu(u->d[i].fragmented));
714         }
715 }
716
/* Log entries are free-form text; there's nothing to validate. */
static int journal_entry_log_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bkey_invalid_flags flags)
{
	return 0;
}
725
726 static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
727                                       struct jset_entry *entry)
728 {
729         struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
730         unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);
731
732         prt_printf(out, "%.*s", bytes, l->d);
733 }
734
/*
 * Overwrite entries are validated like ordinary btree key entries.
 * NOTE(review): passes READ rather than forwarding @flags — presumably
 * deliberate (these entries are only consumed on read), but confirm.
 */
static int journal_entry_overwrite_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bkey_invalid_flags flags)
{
	return journal_entry_btree_keys_validate(c, jset, entry,
				version, big_endian, READ);
}
744
/* Overwrite entries print the same way as ordinary btree key entries. */
static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}
750
/*
 * Write-buffer key entries are validated like ordinary btree key entries
 * (READ is passed in place of @flags, as for overwrite entries above).
 */
static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bkey_invalid_flags flags)
{
	return journal_entry_btree_keys_validate(c, jset, entry,
				version, big_endian, READ);
}
760
/* Write-buffer entries print exactly like btree-keys entries. */
static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}
766
/*
 * Validate a datetime entry: a pure size check, since the payload is a
 * single le64 timestamp.  Undersized entries are zeroed out
 * (journal_entry_null_range) rather than failing recovery.
 */
static int journal_entry_datetime_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bkey_invalid_flags flags)
{
	unsigned bytes = vstruct_bytes(entry);
	/* entry header (8 bytes) + le64 seconds (8 bytes) */
	unsigned expected = 16;
	int ret = 0;

	/*
	 * NOTE(review): the fsck error id below reuses
	 * journal_entry_dev_usage_bad_size — looks like a copy-paste from the
	 * dev_usage validator; confirm whether a datetime-specific id exists
	 * before changing it.
	 */
	if (journal_entry_err_on(vstruct_bytes(entry) < expected,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_size,
				 "bad size (%u < %u)",
				 bytes, expected)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}
fsck_err:
	return ret;
}
788
789 static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
790                                             struct jset_entry *entry)
791 {
792         struct jset_entry_datetime *datetime =
793                 container_of(entry, struct jset_entry_datetime, entry);
794
795         bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
796 }
797
/*
 * Per-entry-type vtable: how to validate an entry on read, and how to
 * print it for debugging.
 */
struct jset_entry_ops {
	/* validate(c, jset, entry, version, big_endian, flags) */
	int (*validate)(struct bch_fs *, struct jset *,
			struct jset_entry *, unsigned, int,
			enum bkey_invalid_flags);
	void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};
804
/*
 * Ops table indexed by BCH_JSET_ENTRY_* type; each x() expansion wires up
 * the journal_entry_<type>_{validate,to_text} pair defined above.
 */
static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr)						\
	[BCH_JSET_ENTRY_##f]	= (struct jset_entry_ops) {	\
		.validate	= journal_entry_##f##_validate,	\
		.to_text	= journal_entry_##f##_to_text,	\
	},
	BCH_JSET_ENTRY_TYPES()
#undef x
};
814
815 int bch2_journal_entry_validate(struct bch_fs *c,
816                                 struct jset *jset,
817                                 struct jset_entry *entry,
818                                 unsigned version, int big_endian,
819                                 enum bkey_invalid_flags flags)
820 {
821         return entry->type < BCH_JSET_ENTRY_NR
822                 ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
823                                 version, big_endian, flags)
824                 : 0;
825 }
826
827 void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
828                                 struct jset_entry *entry)
829 {
830         if (entry->type < BCH_JSET_ENTRY_NR) {
831                 prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
832                 bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
833         } else {
834                 prt_printf(out, "(unknown type %u)", entry->type);
835         }
836 }
837
/*
 * Walk every entry in @jset and validate it.  If an entry claims to
 * extend past the end of the jset, the jset is truncated just before
 * that entry and iteration stops; any other validation error aborts
 * the walk and is returned.
 *
 * NOTE: journal_entry_err_on() can jump to the fsck_err label.
 */
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
				 enum bkey_invalid_flags flags)
{
	unsigned version = le32_to_cpu(jset->version);
	int ret = 0;

	vstruct_for_each(jset, entry) {
		if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
				c, version, jset, entry,
				journal_entry_past_jset_end,
				"journal entry extends past end of jset")) {
			/* clamp u64s so the jset ends right before this entry */
			jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
			break;
		}

		ret = bch2_journal_entry_validate(c, jset, entry,
					version, JSET_BIG_ENDIAN(jset), flags);
		if (ret)
			break;
	}
fsck_err:
	return ret;
}
861
/*
 * Full validation of a journal entry (jset), run after the entry has been
 * read and its checksum verified.
 *
 * Returns:
 *   JOURNAL_ENTRY_NONE - magic doesn't match; not a journal entry at all
 *   -EINVAL            - incompatible version; recovery cannot continue
 *   JOURNAL_ENTRY_BAD  - entry is damaged but recovery can continue
 *   0 / error          - result of validating the individual entries
 *
 * @ca may be NULL (entry came from multiple devices); device vs
 * filesystem name is chosen accordingly in error messages.
 */
static int jset_validate(struct bch_fs *c,
			 struct bch_dev *ca,
			 struct jset *jset, u64 sector,
			 enum bkey_invalid_flags flags)
{
	unsigned version;
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	version = le32_to_cpu(jset->version);
	if (journal_entry_err_on(!bch2_version_compatible(version),
			c, version, jset, NULL,
			jset_unsupported_version,
			"%s sector %llu seq %llu: incompatible journal entry version %u.%u",
			ca ? ca->name : c->name,
			sector, le64_to_cpu(jset->seq),
			BCH_VERSION_MAJOR(version),
			BCH_VERSION_MINOR(version))) {
		/* don't try to continue: */
		return -EINVAL;
	}

	if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
			c, version, jset, NULL,
			jset_unknown_csum,
			"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
			ca ? ca->name : c->name,
			sector, le64_to_cpu(jset->seq),
			JSET_CSUM_TYPE(jset)))
		ret = JOURNAL_ENTRY_BAD;

	/* last_seq is ignored when JSET_NO_FLUSH is true */
	if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
				 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
				 c, version, jset, NULL,
				 jset_last_seq_newer_than_seq,
				 "invalid journal entry: last_seq > seq (%llu > %llu)",
				 le64_to_cpu(jset->last_seq),
				 le64_to_cpu(jset->seq))) {
		/* repair in place, but still report the entry as bad */
		jset->last_seq = jset->seq;
		return JOURNAL_ENTRY_BAD;
	}

	ret = jset_validate_entries(c, jset, flags);
fsck_err:
	return ret;
}
911
/*
 * Minimal validation performed while scanning journal buckets, before the
 * checksum can be checked: verify magic and version, and decide whether
 * the whole entry has been read yet.
 *
 * Returns:
 *   JOURNAL_ENTRY_NONE   - magic doesn't match
 *   -EINVAL              - incompatible version
 *   JOURNAL_ENTRY_REREAD - entry extends past @sectors_read but still fits
 *                          in the bucket; caller re-reads with a bigger buf
 *   0                    - ok (possibly after clamping u64s to the bucket)
 */
static int jset_validate_early(struct bch_fs *c,
			 struct bch_dev *ca,
			 struct jset *jset, u64 sector,
			 unsigned bucket_sectors_left,
			 unsigned sectors_read)
{
	size_t bytes = vstruct_bytes(jset);
	unsigned version;
	enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	version = le32_to_cpu(jset->version);
	if (journal_entry_err_on(!bch2_version_compatible(version),
			c, version, jset, NULL,
			jset_unsupported_version,
			"%s sector %llu seq %llu: unknown journal entry version %u.%u",
			ca ? ca->name : c->name,
			sector, le64_to_cpu(jset->seq),
			BCH_VERSION_MAJOR(version),
			BCH_VERSION_MINOR(version))) {
		/* don't try to continue: */
		return -EINVAL;
	}

	/* entry runs past what we've read so far, but fits in the bucket */
	if (bytes > (sectors_read << 9) &&
	    sectors_read < bucket_sectors_left)
		return JOURNAL_ENTRY_REREAD;

	/* entry claims to be bigger than the bucket: clamp u64s to fit */
	if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
			c, version, jset, NULL,
			jset_past_bucket_end,
			"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
			ca ? ca->name : c->name,
			sector, le64_to_cpu(jset->seq), bytes))
		le32_add_cpu(&jset->u64s,
			     -((bytes - (bucket_sectors_left << 9)) / 8));
fsck_err:
	return ret;
}
954
/* Reusable buffer for reading journal buckets; grown on demand. */
struct journal_read_buf {
	void		*data;	/* kvmalloc'd, owned by this struct */
	size_t		size;	/* allocated size in bytes, power of two */
};
959
960 static int journal_read_buf_realloc(struct journal_read_buf *b,
961                                     size_t new_size)
962 {
963         void *n;
964
965         /* the bios are sized for this many pages, max: */
966         if (new_size > JOURNAL_ENTRY_SIZE_MAX)
967                 return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
968
969         new_size = roundup_pow_of_two(new_size);
970         n = kvmalloc(new_size, GFP_KERNEL);
971         if (!n)
972                 return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
973
974         kvfree(b->data);
975         b->data = n;
976         b->size = new_size;
977         return 0;
978 }
979
/*
 * Read and parse all journal entries in one journal bucket on @ca,
 * adding each valid-looking entry to @jlist via journal_entry_add().
 *
 * The bucket is read in chunks sized by @buf; when an entry extends past
 * the current chunk, the buffer is grown and the read restarted (the
 * "reread" loop).  I/O errors are deliberately not fatal here: the same
 * entry may exist on another device, and gaps are handled later in
 * recovery.
 *
 * Returns 0 (including on per-bucket I/O errors) or a hard error from
 * buffer allocation / entry bookkeeping.
 */
static int journal_read_bucket(struct bch_dev *ca,
			       struct journal_read_buf *buf,
			       struct journal_list *jlist,
			       unsigned bucket)
{
	struct bch_fs *c = ca->fs;
	struct journal_device *ja = &ca->journal;
	struct jset *j = NULL;
	unsigned sectors, sectors_read = 0;
	u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
	    end = offset + ca->mi.bucket_size;
	bool saw_bad = false, csum_good;
	struct printbuf err = PRINTBUF;
	int ret = 0;

	pr_debug("reading %u", bucket);

	while (offset < end) {
		if (!sectors_read) {
			struct bio *bio;
			unsigned nr_bvecs;
reread:
			/* read as much of the rest of the bucket as fits in buf */
			sectors_read = min_t(unsigned,
				end - offset, buf->size >> 9);
			nr_bvecs = buf_pages(buf->data, sectors_read << 9);

			bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
			bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);

			bio->bi_iter.bi_sector = offset;
			bch2_bio_map(bio, buf->data, sectors_read << 9);

			ret = submit_bio_wait(bio);
			kfree(bio);

			if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
					       "journal read error: sector %llu",
					       offset) ||
			    bch2_meta_read_fault("journal")) {
				/*
				 * We don't error out of the recovery process
				 * here, since the relevant journal entry may be
				 * found on a different device, and missing or
				 * no journal entries will be handled later
				 */
				goto out;
			}

			j = buf->data;
		}

		ret = jset_validate_early(c, ca, j, offset,
				    end - offset, sectors_read);
		switch (ret) {
		case 0:
			/* plausible entry; its size tells us where the next one is */
			sectors = vstruct_sectors(j, c->block_bits);
			break;
		case JOURNAL_ENTRY_REREAD:
			/* entry bigger than current buffer: grow and retry */
			if (vstruct_bytes(j) > buf->size) {
				ret = journal_read_buf_realloc(buf,
							vstruct_bytes(j));
				if (ret)
					goto err;
			}
			goto reread;
		case JOURNAL_ENTRY_NONE:
			if (!saw_bad)
				goto out;
			/*
			 * On checksum error we don't really trust the size
			 * field of the journal entry we read, so try reading
			 * again at next block boundary:
			 */
			sectors = block_sectors(c);
			goto next_block;
		default:
			goto err;
		}

		/*
		 * This happens sometimes if we don't have discards on -
		 * when we've partially overwritten a bucket with new
		 * journal entries. We don't need the rest of the
		 * bucket:
		 */
		if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
			goto out;

		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);

		enum bch_csum_type csum_type = JSET_CSUM_TYPE(j);
		struct bch_csum csum;
		csum_good = jset_csum_good(c, j, &csum);

		/* bad checksum isn't fatal: record it and keep scanning */
		if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
				       "%s",
				       (printbuf_reset(&err),
					prt_str(&err, "journal "),
					bch2_csum_err_msg(&err, csum_type, j->csum, csum),
					err.buf)))
			saw_bad = true;

		/* decrypt in place (no-op when encryption is off) */
		ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
			     j->encrypted_start,
			     vstruct_end(j) - (void *) j->encrypted_start);
		bch2_fs_fatal_err_on(ret, c,
				"error decrypting journal entry: %s",
				bch2_err_str(ret));

		mutex_lock(&jlist->lock);
		ret = journal_entry_add(c, ca, (struct journal_ptr) {
					.csum_good	= csum_good,
					.dev		= ca->dev_idx,
					.bucket		= bucket,
					.bucket_offset	= offset -
						bucket_to_sector(ca, ja->buckets[bucket]),
					.sector		= offset,
					}, jlist, j);
		mutex_unlock(&jlist->lock);

		switch (ret) {
		case JOURNAL_ENTRY_ADD_OK:
			break;
		case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
			/* older than anything we care about; keep scanning */
			break;
		default:
			goto err;
		}
next_block:
		pr_debug("next");
		offset		+= sectors;
		sectors_read	-= sectors;
		j = ((void *) j) + (sectors << 9);
	}

out:
	ret = 0;
err:
	printbuf_exit(&err);
	return ret;
}
1121
/*
 * Closure callback: read the entire journal of one device, then work out
 * where writing should resume on that device (cur_idx / sectors_free).
 * Errors are reported back through jlist->ret under jlist->lock.
 */
static CLOSURE_CALLBACK(bch2_journal_read_device)
{
	closure_type(ja, struct journal_device, read);
	struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
	struct bch_fs *c = ca->fs;
	struct journal_list *jlist =
		container_of(cl->parent, struct journal_list, cl);
	struct journal_replay *r, **_r;
	struct genradix_iter iter;
	struct journal_read_buf buf = { NULL, 0 };
	unsigned i;
	int ret = 0;

	if (!ja->nr)
		goto out;

	ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
	if (ret)
		goto err;

	pr_debug("%u journal buckets", ja->nr);

	for (i = 0; i < ja->nr; i++) {
		ret = journal_read_bucket(ca, &buf, jlist, i);
		if (ret)
			goto err;
	}

	ja->sectors_free = ca->mi.bucket_size;

	/*
	 * Find the most recent entry that lives on this device, and resume
	 * writing right after it.
	 * NOTE(review): darray_for_each declares its own iterator `i`,
	 * shadowing the unsigned `i` above — confirm intentional.
	 */
	mutex_lock(&jlist->lock);
	genradix_for_each_reverse(&c->journal_entries, iter, _r) {
		r = *_r;

		if (!r)
			continue;

		darray_for_each(r->ptrs, i)
			if (i->dev == ca->dev_idx) {
				unsigned wrote = bucket_remainder(ca, i->sector) +
					vstruct_sectors(&r->j, c->block_bits);

				ja->cur_idx = i->bucket;
				ja->sectors_free = ca->mi.bucket_size - wrote;
				goto found;
			}
	}
found:
	mutex_unlock(&jlist->lock);

	if (ja->bucket_seq[ja->cur_idx] &&
	    ja->sectors_free == ca->mi.bucket_size) {
#if 0
		/*
		 * Debug code for ZNS support, where we (probably) want to be
		 * correlated where we stopped in the journal to the zone write
		 * points:
		 */
		bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
		bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
		for (i = 0; i < 3; i++) {
			unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;

			bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
		}
#endif
		ja->sectors_free = 0;
	}

	/*
	 * Set dirty_idx to indicate the entire journal is full and needs to be
	 * reclaimed - journal reclaim will immediately reclaim whatever isn't
	 * pinned when it first runs:
	 */
	ja->discard_idx = ja->dirty_idx_ondisk =
		ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
	bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
	kvfree(buf.data);
	percpu_ref_put(&ca->io_ref);
	closure_return(cl);
	return;
err:
	mutex_lock(&jlist->lock);
	jlist->ret = ret;
	mutex_unlock(&jlist->lock);
	goto out;
}
1210
/*
 * Read the journal from all devices and decide the range of entries to
 * replay.
 *
 * Outputs:
 *   @last_seq      - oldest sequence number we must replay from
 *   @blacklist_seq - first sequence number to blacklist (one past the
 *                    newest flush entry)
 *   @start_seq     - one past the newest entry found, flush or not
 *
 * Steps: kick off per-device reads in parallel; pick the most recent
 * flush entry (ignoring torn writes and no-flush entries); drop entries
 * that are blacklisted or older than last_seq; flag gaps in the sequence;
 * finally fully validate each surviving entry and mark its replicas.
 */
int bch2_journal_read(struct bch_fs *c,
		      u64 *last_seq,
		      u64 *blacklist_seq,
		      u64 *start_seq)
{
	struct journal_list jlist;
	struct journal_replay *i, **_i, *prev = NULL;
	struct genradix_iter radix_iter;
	struct printbuf buf = PRINTBUF;
	bool degraded = false, last_write_torn = false;
	u64 seq;
	int ret = 0;

	closure_init_stack(&jlist.cl);
	mutex_init(&jlist.lock);
	jlist.last_seq = 0;
	jlist.ret = 0;

	/* read each device's journal concurrently */
	for_each_member_device(c, ca) {
		if (!c->opts.fsck &&
		    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
			continue;

		if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
		     ca->mi.state == BCH_MEMBER_STATE_ro) &&
		    percpu_ref_tryget(&ca->io_ref))
			closure_call(&ca->journal.read,
				     bch2_journal_read_device,
				     system_unbound_wq,
				     &jlist.cl);
		else
			degraded = true;
	}

	closure_sync(&jlist.cl);

	if (jlist.ret)
		return jlist.ret;

	*last_seq	= 0;
	*start_seq	= 0;
	*blacklist_seq	= 0;

	/*
	 * Find most recent flush entry, and ignore newer non flush entries -
	 * those entries will be blacklisted:
	 */
	genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
		enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;

		i = *_i;

		if (journal_replay_ignore(i))
			continue;

		if (!*start_seq)
			*blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;

		if (JSET_NO_FLUSH(&i->j)) {
			i->ignore_blacklisted = true;
			continue;
		}

		/* a single torn (bad csum) newest flush write is tolerated */
		if (!last_write_torn && !i->csum_good) {
			last_write_torn = true;
			i->ignore_blacklisted = true;
			continue;
		}

		if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
					 c, le32_to_cpu(i->j.version), &i->j, NULL,
					 jset_last_seq_newer_than_seq,
					 "invalid journal entry: last_seq > seq (%llu > %llu)",
					 le64_to_cpu(i->j.last_seq),
					 le64_to_cpu(i->j.seq)))
			i->j.last_seq = i->j.seq;

		*last_seq	= le64_to_cpu(i->j.last_seq);
		*blacklist_seq	= le64_to_cpu(i->j.seq) + 1;
		break;
	}

	if (!*start_seq) {
		bch_info(c, "journal read done, but no entries found");
		return 0;
	}

	if (!*last_seq) {
		fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
			 "journal read done, but no entries found after dropping non-flushes");
		return 0;
	}

	bch_info(c, "journal read done, replaying entries %llu-%llu",
		 *last_seq, *blacklist_seq - 1);

	if (*start_seq != *blacklist_seq)
		bch_info(c, "dropped unflushed entries %llu-%llu",
			 *blacklist_seq, *start_seq - 1);

	/* Drop blacklisted entries and entries older than last_seq: */
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (journal_replay_ignore(i))
			continue;

		seq = le64_to_cpu(i->j.seq);
		if (seq < *last_seq) {
			journal_replay_free(c, i, false);
			continue;
		}

		if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
			fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
				    jset_seq_blacklisted,
				    "found blacklisted journal entry %llu", seq);
			i->ignore_blacklisted = true;
		}
	}

	/* Check for missing entries: */
	seq = *last_seq;
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (journal_replay_ignore(i))
			continue;

		BUG_ON(seq > le64_to_cpu(i->j.seq));

		while (seq < le64_to_cpu(i->j.seq)) {
			u64 missing_start, missing_end;
			struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

			/* skip over blacklisted seqs - those aren't missing */
			while (seq < le64_to_cpu(i->j.seq) &&
			       bch2_journal_seq_is_blacklisted(c, seq, false))
				seq++;

			if (seq == le64_to_cpu(i->j.seq))
				break;

			missing_start = seq;

			while (seq < le64_to_cpu(i->j.seq) &&
			       !bch2_journal_seq_is_blacklisted(c, seq, false))
				seq++;

			if (prev) {
				bch2_journal_ptrs_to_text(&buf1, c, prev);
				prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
			} else
				prt_printf(&buf1, "(none)");
			bch2_journal_ptrs_to_text(&buf2, c, i);

			/*
			 * NOTE(review): if fsck_err() below jumps to the
			 * fsck_err label, buf1/buf2 are not freed — confirm
			 * whether that path can leak.
			 */
			missing_end = seq - 1;
			fsck_err(c, journal_entries_missing,
				 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
				 "  prev at %s\n"
				 "  next at %s",
				 missing_start, missing_end,
				 *last_seq, *blacklist_seq - 1,
				 buf1.buf, buf2.buf);

			printbuf_exit(&buf1);
			printbuf_exit(&buf2);
		}

		prev = i;
		seq++;
	}

	/* Final validation pass, and mark journal replicas in the superblock: */
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		struct bch_replicas_padded replicas = {
			.e.data_type = BCH_DATA_journal,
			.e.nr_required = 1,
		};

		i = *_i;
		if (journal_replay_ignore(i))
			continue;

		darray_for_each(i->ptrs, ptr) {
			struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);

			if (!ptr->csum_good)
				bch_err_dev_offset(ca, ptr->sector,
						   "invalid journal checksum, seq %llu%s",
						   le64_to_cpu(i->j.seq),
						   i->csum_good ? " (had good copy on another device)" : "");
		}

		ret = jset_validate(c,
				    bch_dev_bkey_exists(c, i->ptrs.data[0].dev),
				    &i->j,
				    i->ptrs.data[0].sector,
				    READ);
		if (ret)
			goto err;

		darray_for_each(i->ptrs, ptr)
			replicas.e.devs[replicas.e.nr_devs++] = ptr->dev;

		bch2_replicas_entry_sort(&replicas.e);

		printbuf_reset(&buf);
		bch2_replicas_entry_to_text(&buf, &replicas.e);

		if (!degraded &&
		    !bch2_replicas_marked(c, &replicas.e) &&
		    (le64_to_cpu(i->j.seq) == *last_seq ||
		     fsck_err(c, journal_entry_replicas_not_marked,
			      "superblock not marked as containing replicas for journal entry %llu\n  %s",
			      le64_to_cpu(i->j.seq), buf.buf))) {
			ret = bch2_mark_replicas(c, &replicas.e);
			if (ret)
				goto err;
		}
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
1435
1436 /* journal write: */
1437
/*
 * Try to add journal write replicas from @devs_sorted until
 * @replicas_want durability is reached.  For each usable device (rw,
 * durable, has journal buckets, enough free sectors, not already in
 * w->key) a pointer into its current journal bucket is appended to
 * w->key and its accounting updated.
 *
 * Caller holds the RCU read lock (devices are looked up with
 * rcu_dereference).  *replicas is updated in place.
 */
static void __journal_write_alloc(struct journal *j,
				  struct journal_buf *w,
				  struct dev_alloc_list *devs_sorted,
				  unsigned sectors,
				  unsigned *replicas,
				  unsigned replicas_want)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_device *ja;
	struct bch_dev *ca;
	unsigned i;

	if (*replicas >= replicas_want)
		return;

	for (i = 0; i < devs_sorted->nr; i++) {
		ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
		if (!ca)
			continue;

		ja = &ca->journal;

		/*
		 * Check that we can use this device, and aren't already using
		 * it:
		 */
		if (!ca->mi.durability ||
		    ca->mi.state != BCH_MEMBER_STATE_rw ||
		    !ja->nr ||
		    bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
		    sectors > ja->sectors_free)
			continue;

		bch2_dev_stripe_increment(ca, &j->wp.stripe);

		/* point at the next unwritten sectors of the current bucket */
		bch2_bkey_append_ptr(&w->key,
			(struct bch_extent_ptr) {
				  .offset = bucket_to_sector(ca,
					ja->buckets[ja->cur_idx]) +
					ca->mi.bucket_size -
					ja->sectors_free,
				  .dev = ca->dev_idx,
		});

		ja->sectors_free -= sectors;
		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);

		*replicas += ca->mi.durability;

		if (*replicas >= replicas_want)
			break;
	}
}
1491
1492 /**
1493  * journal_write_alloc - decide where to write next journal entry
1494  *
1495  * @j:          journal object
1496  * @w:          journal buf (entry to be written)
1497  *
1498  * Returns: 0 on success, or -EROFS on failure
1499  */
1500 static int journal_write_alloc(struct journal *j, struct journal_buf *w)
1501 {
1502         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1503         struct bch_devs_mask devs;
1504         struct journal_device *ja;
1505         struct bch_dev *ca;
1506         struct dev_alloc_list devs_sorted;
1507         unsigned sectors = vstruct_sectors(w->data, c->block_bits);
1508         unsigned target = c->opts.metadata_target ?:
1509                 c->opts.foreground_target;
1510         unsigned i, replicas = 0, replicas_want =
1511                 READ_ONCE(c->opts.metadata_replicas);
1512         unsigned replicas_need = min_t(unsigned, replicas_want,
1513                                        READ_ONCE(c->opts.metadata_replicas_required));
1514
1515         rcu_read_lock();
1516 retry:
1517         devs = target_rw_devs(c, BCH_DATA_journal, target);
1518
1519         devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
1520
1521         __journal_write_alloc(j, w, &devs_sorted,
1522                               sectors, &replicas, replicas_want);
1523
1524         if (replicas >= replicas_want)
1525                 goto done;
1526
1527         for (i = 0; i < devs_sorted.nr; i++) {
1528                 ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
1529                 if (!ca)
1530                         continue;
1531
1532                 ja = &ca->journal;
1533
1534                 if (sectors > ja->sectors_free &&
1535                     sectors <= ca->mi.bucket_size &&
1536                     bch2_journal_dev_buckets_available(j, ja,
1537                                         journal_space_discarded)) {
1538                         ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
1539                         ja->sectors_free = ca->mi.bucket_size;
1540
1541                         /*
1542                          * ja->bucket_seq[ja->cur_idx] must always have
1543                          * something sensible:
1544                          */
1545                         ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
1546                 }
1547         }
1548
1549         __journal_write_alloc(j, w, &devs_sorted,
1550                               sectors, &replicas, replicas_want);
1551
1552         if (replicas < replicas_want && target) {
1553                 /* Retry from all devices: */
1554                 target = 0;
1555                 goto retry;
1556         }
1557 done:
1558         rcu_read_unlock();
1559
1560         BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
1561
1562         return replicas >= replicas_need ? 0 : -EROFS;
1563 }
1564
1565 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
1566 {
1567         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1568
1569         /* we aren't holding j->lock: */
1570         unsigned new_size = READ_ONCE(j->buf_size_want);
1571         void *new_buf;
1572
1573         if (buf->buf_size >= new_size)
1574                 return;
1575
1576         size_t btree_write_buffer_size = new_size / 64;
1577
1578         if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
1579                 return;
1580
1581         new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
1582         if (!new_buf)
1583                 return;
1584
1585         memcpy(new_buf, buf->data, buf->buf_size);
1586
1587         spin_lock(&j->lock);
1588         swap(buf->data,         new_buf);
1589         swap(buf->buf_size,     new_size);
1590         spin_unlock(&j->lock);
1591
1592         kvfree(new_buf);
1593 }
1594
1595 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
1596 {
1597         return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
1598 }
1599
/*
 * Completion path for a journal write: record which devices the entry landed
 * on, mark its replicas, and advance on-disk journal state over every
 * contiguous completed buffer.
 */
static CLOSURE_CALLBACK(journal_write_done)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_replicas_padded replicas;
	union journal_res_state old, new;
	u64 v, seq = le64_to_cpu(w->data->seq);
	int err = 0;

	bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
			       ? j->flush_write_time
			       : j->noflush_write_time, j->write_start_time);

	/*
	 * The entry must have reached at least one device, and its replicas
	 * entry must be marked, for the write to count:
	 */
	if (!w->devs_written.nr) {
		bch_err(c, "unable to write journal to sufficient devices");
		err = -EIO;
	} else {
		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
					 w->devs_written);
		if (bch2_mark_replicas(c, &replicas.e))
			err = -EIO;
	}

	if (err)
		bch2_fatal_error(c);

	closure_debug_destroy(cl);

	spin_lock(&j->lock);
	if (seq >= j->pin.front)
		journal_seq_pin(j, seq)->devs = w->devs_written;
	/* Remember the earliest sequence number that hit an error: */
	if (err && (!j->err_seq || seq < j->err_seq))
		j->err_seq	= seq;
	w->write_done = true;

	bool completed = false;

	/*
	 * Writes may complete out of order; only advance over the contiguous
	 * prefix of buffers whose writes have all finished:
	 */
	for (seq = journal_last_unwritten_seq(j);
	     seq <= journal_cur_seq(j);
	     seq++) {
		w = j->buf + (seq & JOURNAL_BUF_MASK);
		if (!w->write_done)
			break;

		if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
			j->flushed_seq_ondisk = seq;
			j->last_seq_ondisk = w->last_seq;

			bch2_do_discards(c);
			closure_wake_up(&c->freelist_wait);
			bch2_reset_alloc_cursors(c);
		}

		j->seq_ondisk = seq;

		/*
		 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
		 * more buckets:
		 *
		 * Must come before signaling write completion, for
		 * bch2_fs_journal_stop():
		 */
		if (j->watermark != BCH_WATERMARK_stripe)
			journal_reclaim_kick(&c->journal);

		/* Atomically move unwritten_idx past this buffer: */
		v = atomic64_read(&j->reservations.counter);
		do {
			old.v = new.v = v;
			BUG_ON(journal_state_count(new, new.unwritten_idx));
			BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));

			new.unwritten_idx++;
		} while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v);

		closure_wake_up(&w->wait);
		completed = true;
	}

	if (completed) {
		bch2_journal_reclaim_fast(j);
		bch2_journal_space_available(j);

		track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);

		journal_wake(j);
	}

	if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
		   new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
		struct journal_buf *buf = journal_cur_buf(j);
		long delta = buf->expires - jiffies;

		/*
		 * We don't close a journal entry to write it while there's
		 * previous entries still in flight - the current journal entry
		 * might want to be written now:
		 */
		mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
	}

	spin_unlock(&j->lock);
}
1703
1704 static void journal_write_endio(struct bio *bio)
1705 {
1706         struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
1707         struct bch_dev *ca = jbio->ca;
1708         struct journal *j = &ca->fs->journal;
1709         struct journal_buf *w = j->buf + jbio->buf_idx;
1710
1711         if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
1712                                "error writing journal entry %llu: %s",
1713                                le64_to_cpu(w->data->seq),
1714                                bch2_blk_status_to_str(bio->bi_status)) ||
1715             bch2_meta_write_fault("journal")) {
1716                 unsigned long flags;
1717
1718                 spin_lock_irqsave(&j->err_lock, flags);
1719                 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
1720                 spin_unlock_irqrestore(&j->err_lock, flags);
1721         }
1722
1723         closure_put(&w->io);
1724         percpu_ref_put(&ca->io_ref);
1725 }
1726
1727 static CLOSURE_CALLBACK(do_journal_write)
1728 {
1729         closure_type(w, struct journal_buf, io);
1730         struct journal *j = container_of(w, struct journal, buf[w->idx]);
1731         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1732         unsigned sectors = vstruct_sectors(w->data, c->block_bits);
1733
1734         extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
1735                 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
1736                 struct journal_device *ja = &ca->journal;
1737
1738                 if (!percpu_ref_tryget(&ca->io_ref)) {
1739                         /* XXX: fix this */
1740                         bch_err(c, "missing device for journal write\n");
1741                         continue;
1742                 }
1743
1744                 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
1745                              sectors);
1746
1747                 struct bio *bio = &ja->bio[w->idx]->bio;
1748                 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
1749                 bio->bi_iter.bi_sector  = ptr->offset;
1750                 bio->bi_end_io          = journal_write_endio;
1751                 bio->bi_private         = ca;
1752
1753                 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
1754                 ca->prev_journal_sector = bio->bi_iter.bi_sector;
1755
1756                 if (!JSET_NO_FLUSH(w->data))
1757                         bio->bi_opf    |= REQ_FUA;
1758                 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
1759                         bio->bi_opf    |= REQ_PREFLUSH;
1760
1761                 bch2_bio_map(bio, w->data, sectors << 9);
1762
1763                 trace_and_count(c, journal_write, bio);
1764                 closure_bio_submit(bio, cl);
1765
1766                 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
1767         }
1768
1769         continue_at(cl, journal_write_done, j->wq);
1770 }
1771
1772 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
1773 {
1774         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1775         struct jset_entry *start, *end;
1776         struct jset *jset = w->data;
1777         struct journal_keys_to_wb wb = { NULL };
1778         unsigned sectors, bytes, u64s;
1779         unsigned long btree_roots_have = 0;
1780         bool validate_before_checksum = false;
1781         u64 seq = le64_to_cpu(jset->seq);
1782         int ret;
1783
1784         /*
1785          * Simple compaction, dropping empty jset_entries (from journal
1786          * reservations that weren't fully used) and merging jset_entries that
1787          * can be.
1788          *
1789          * If we wanted to be really fancy here, we could sort all the keys in
1790          * the jset and drop keys that were overwritten - probably not worth it:
1791          */
1792         vstruct_for_each(jset, i) {
1793                 unsigned u64s = le16_to_cpu(i->u64s);
1794
1795                 /* Empty entry: */
1796                 if (!u64s)
1797                         continue;
1798
1799                 /*
1800                  * New btree roots are set by journalling them; when the journal
1801                  * entry gets written we have to propagate them to
1802                  * c->btree_roots
1803                  *
1804                  * But, every journal entry we write has to contain all the
1805                  * btree roots (at least for now); so after we copy btree roots
1806                  * to c->btree_roots we have to get any missing btree roots and
1807                  * add them to this journal entry:
1808                  */
1809                 switch (i->type) {
1810                 case BCH_JSET_ENTRY_btree_root:
1811                         bch2_journal_entry_to_btree_root(c, i);
1812                         __set_bit(i->btree_id, &btree_roots_have);
1813                         break;
1814                 case BCH_JSET_ENTRY_write_buffer_keys:
1815                         EBUG_ON(!w->need_flush_to_write_buffer);
1816
1817                         if (!wb.wb)
1818                                 bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
1819
1820                         jset_entry_for_each_key(i, k) {
1821                                 ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
1822                                 if (ret) {
1823                                         bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer");
1824                                         bch2_journal_keys_to_write_buffer_end(c, &wb);
1825                                         return ret;
1826                                 }
1827                         }
1828                         i->type = BCH_JSET_ENTRY_btree_keys;
1829                         break;
1830                 }
1831         }
1832
1833         if (wb.wb)
1834                 bch2_journal_keys_to_write_buffer_end(c, &wb);
1835
1836         spin_lock(&c->journal.lock);
1837         w->need_flush_to_write_buffer = false;
1838         spin_unlock(&c->journal.lock);
1839
1840         start = end = vstruct_last(jset);
1841
1842         end     = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
1843
1844         struct jset_entry_datetime *d =
1845                 container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
1846         d->entry.type   = BCH_JSET_ENTRY_datetime;
1847         d->seconds      = cpu_to_le64(ktime_get_real_seconds());
1848
1849         bch2_journal_super_entries_add_common(c, &end, seq);
1850         u64s    = (u64 *) end - (u64 *) start;
1851         BUG_ON(u64s > j->entry_u64s_reserved);
1852
1853         le32_add_cpu(&jset->u64s, u64s);
1854
1855         sectors = vstruct_sectors(jset, c->block_bits);
1856         bytes   = vstruct_bytes(jset);
1857
1858         if (sectors > w->sectors) {
1859                 bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
1860                                     vstruct_bytes(jset), w->sectors << 9,
1861                                     u64s, w->u64s_reserved, j->entry_u64s_reserved);
1862                 return -EINVAL;
1863         }
1864
1865         jset->magic             = cpu_to_le64(jset_magic(c));
1866         jset->version           = cpu_to_le32(c->sb.version);
1867
1868         SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
1869         SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
1870
1871         if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
1872                 j->last_empty_seq = seq;
1873
1874         if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
1875                 validate_before_checksum = true;
1876
1877         if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
1878                 validate_before_checksum = true;
1879
1880         if (validate_before_checksum &&
1881             (ret = jset_validate(c, NULL, jset, 0, WRITE)))
1882                 return ret;
1883
1884         ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
1885                     jset->encrypted_start,
1886                     vstruct_end(jset) - (void *) jset->encrypted_start);
1887         if (bch2_fs_fatal_err_on(ret, c,
1888                         "error decrypting journal entry: %i", ret))
1889                 return ret;
1890
1891         jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
1892                                   journal_nonce(jset), jset);
1893
1894         if (!validate_before_checksum &&
1895             (ret = jset_validate(c, NULL, jset, 0, WRITE)))
1896                 return ret;
1897
1898         memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
1899         return 0;
1900 }
1901
1902 static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
1903 {
1904         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1905         int error = bch2_journal_error(j);
1906
1907         /*
1908          * If the journal is in an error state - we did an emergency shutdown -
1909          * we prefer to continue doing journal writes. We just mark them as
1910          * noflush so they'll never be used, but they'll still be visible by the
1911          * list_journal tool - this helps in debugging.
1912          *
1913          * There's a caveat: the first journal write after marking the
1914          * superblock dirty must always be a flush write, because on startup
1915          * from a clean shutdown we didn't necessarily read the journal and the
1916          * new journal write might overwrite whatever was in the journal
1917          * previously - we can't leave the journal without any flush writes in
1918          * it.
1919          *
1920          * So if we're in an error state, and we're still starting up, we don't
1921          * write anything at all.
1922          */
1923         if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags))
1924                 return -EIO;
1925
1926         if (error ||
1927             w->noflush ||
1928             (!w->must_flush &&
1929              (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
1930              test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
1931                 w->noflush = true;
1932                 SET_JSET_NO_FLUSH(w->data, true);
1933                 w->data->last_seq       = 0;
1934                 w->last_seq             = 0;
1935
1936                 j->nr_noflush_writes++;
1937         } else {
1938                 w->must_flush = true;
1939                 j->last_flush_write = jiffies;
1940                 j->nr_flush_writes++;
1941                 clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
1942         }
1943
1944         return 0;
1945 }
1946
/*
 * Top-level journal write path: pick flush mode, prepare the entry, allocate
 * devices for it, mark replicas, then hand off to do_journal_write() (or
 * straight to journal_write_done() in nochanges mode / on error).
 */
CLOSURE_CALLBACK(bch2_journal_write)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_replicas_padded replicas;
	struct printbuf journal_debug_buf = PRINTBUF;
	unsigned nr_rw_members = 0;
	int ret;

	for_each_rw_member(c, ca)
		nr_rw_members++;

	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
	BUG_ON(!w->write_started);
	BUG_ON(w->write_allocated);
	BUG_ON(w->write_done);

	j->write_start_time = local_clock();

	spin_lock(&j->lock);
	/* With multiple rw devices, flushes are issued as separate bios: */
	if (nr_rw_members > 1)
		w->separate_flush = true;

	ret = bch2_journal_write_pick_flush(j, w);
	spin_unlock(&j->lock);
	if (ret)
		goto err;

	mutex_lock(&j->buf_lock);
	journal_buf_realloc(j, w);

	ret = bch2_journal_write_prep(j, w);
	mutex_unlock(&j->buf_lock);
	if (ret)
		goto err;

	j->entry_bytes_written += vstruct_bytes(w->data);

	/* Retry allocation after discarding, while discards make progress: */
	while (1) {
		spin_lock(&j->lock);
		ret = journal_write_alloc(j, w);
		if (!ret || !j->can_discard)
			break;

		spin_unlock(&j->lock);
		bch2_journal_do_discards(j);
	}

	/* NB: j->lock is still held here (loop broke out holding it): */
	if (ret) {
		__bch2_journal_debug_to_text(&journal_debug_buf, j);
		spin_unlock(&j->lock);
		bch_err(c, "Unable to allocate journal write:\n%s",
			journal_debug_buf.buf);
		printbuf_exit(&journal_debug_buf);
		goto err;
	}

	/*
	 * write is allocated, no longer need to account for it in
	 * bch2_journal_space_available():
	 */
	w->sectors = 0;
	w->write_allocated = true;

	/*
	 * journal entry has been compacted and allocated, recalculate space
	 * available:
	 */
	bch2_journal_space_available(j);
	bch2_journal_do_writes(j);
	spin_unlock(&j->lock);

	w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));

	if (c->opts.nochanges)
		goto no_io;

	/*
	 * Mark journal replicas before we submit the write to guarantee
	 * recovery will find the journal entries after a crash.
	 */
	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
				 w->devs_written);
	ret = bch2_mark_replicas(c, &replicas.e);
	if (ret)
		goto err;

	/* Flush writes wait until all prior entries are on disk: */
	if (!JSET_NO_FLUSH(w->data))
		closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq));

	if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
		for_each_rw_member(c, ca) {
			percpu_ref_get(&ca->io_ref);

			struct journal_device *ja = &ca->journal;
			struct bio *bio = &ja->bio[w->idx]->bio;
			bio_reset(bio, ca->disk_sb.bdev,
				  REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
			bio->bi_end_io		= journal_write_endio;
			bio->bi_private		= ca;
			closure_bio_submit(bio, cl);
		}
	}

	continue_at(cl, do_journal_write, j->wq);
	return;
no_io:
	continue_at(cl, journal_write_done, j->wq);
	return;
err:
	bch2_fatal_error(c);
	continue_at(cl, journal_write_done, j->wq);
}