1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4 * fs/ext4/fast_commit.c
5 *
6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7 *
8 * Ext4 fast commits routines.
9 */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14
15 /*
16 * Ext4 Fast Commits
17 * -----------------
18 *
19 * Ext4 fast commits implement fine grained journalling for Ext4.
20 *
21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23 * TLV during the recovery phase. For the scenarios for which we currently
24 * don't have replay code, fast commit falls back to full commits.
25 * Fast commits record delta in one of the following three categories.
26 *
27 * (A) Directory entry updates:
28 *
29 * - EXT4_FC_TAG_UNLINK - records directory entry unlink
30 * - EXT4_FC_TAG_LINK - records directory entry link
31 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation
32 *
33 * (B) File specific data range updates:
34 *
35 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
36 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
37 *
38 * (C) Inode metadata (mtime / ctime etc):
39 *
40 * - EXT4_FC_TAG_INODE - record the inode that should be replayed
41 * during recovery. Note that iblocks field is
42 * not replayed and instead derived during
 *				  replay.
 *
 * Commit Operation
45 * ----------------
46 * With fast commits, we maintain all the directory entry operations in the
47 * order in which they are issued in an in-memory queue. This queue is flushed
48 * to disk during the commit operation. We also maintain a list of inodes
49 * that need to be committed during a fast commit in another in memory queue of
50 * inodes. During the commit operation, we commit in the following order:
51 *
52 * [1] Lock inodes for any further data updates by setting COMMITTING state
53 * [2] Submit data buffers of all the inodes
54 * [3] Wait for [2] to complete
55 * [4] Commit all the directory entry updates in the fast commit space
56 * [5] Commit all the changed inode structures
57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
58 * section for more details).
59 * [7] Wait for [4], [5] and [6] to complete.
60 *
61 * All the inode updates must call ext4_fc_start_update() before starting an
62 * update. If such an ongoing update is present, fast commit waits for it to
63 * complete. The completion of such an update is marked by
64 * ext4_fc_stop_update().
65 *
66 * Fast Commit Ineligibility
67 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
69 * attributes). Fast commit ineligibility is marked by calling one of the
70 * two following functions:
71 *
 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation fall
73 * back to full commit. This is useful in case of transient errors.
74 *
75 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76 * the fast commits happening between ext4_fc_start_ineligible() and
77 * ext4_fc_stop_ineligible() and one fast commit after the call to
78 * ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79 * make one more fast commit to fall back to full commit after stop call so
 *   that it is guaranteed that the fast commit ineligible operation contained
81 * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82 * followed by at least 1 full commit.
83 *
84 * Atomicity of commits
85 * --------------------
86 * In order to guarantee atomicity during the commit operation, fast commit
87 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88 * tag contains CRC of the contents and TID of the transaction after which
89 * this fast commit should be applied. Recovery code replays fast commit
90 * logs only if there's at least 1 valid tail present. For every fast commit
91 * operation, there is 1 tail. This means, we may end up with multiple tails
92 * in the fast commit space. Here's an example:
93 *
94 * - Create a new file A and remove existing file B
95 * - fsync()
96 * - Append contents to file A
97 * - Truncate file A
98 * - fsync()
99 *
100 * The fast commit space at the end of above operations would look like this:
101 * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102 * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
103 *
104 * Replay code should thus check for all the valid tails in the FC area.
105 *
106 * Fast Commit Replay Idempotence
107 * ------------------------------
108 *
109 * Fast commits tags are idempotent in nature provided the recovery code follows
110 * certain rules. The guiding principle that the commit path follows while
111 * committing is that it stores the result of a particular operation instead of
112 * storing the procedure.
113 *
114 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
115 * was associated with inode 10. During fast commit, instead of storing this
116 * operation as a procedure "rename a to b", we store the resulting file system
117 * state as a "series" of outcomes:
118 *
119 * - Link dirent b to inode 10
120 * - Unlink dirent a
121 * - Inode <10> with valid refcount
122 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
124 * system. This is what guarantees idempotence of fast commit replay.
125 *
126 * Let's take an example of a procedure that is not idempotent and see how fast
127 * commits make it idempotent. Consider following sequence of operations:
128 *
129 * rm A; mv B A; read A
130 * (x) (y) (z)
131 *
132 * (x), (y) and (z) are the points at which we can crash. If we store this
133 * sequence of operations as is then the replay is not idempotent. Let's say
134 * while in replay, we crash at (z). During the second replay, file A (which was
135 * actually created as a result of "mv B A" operation) would get deleted. Thus,
136 * file named A would be absent when we try to read A. So, this sequence of
137 * operations is not idempotent. However, as mentioned above, instead of storing
138 * the procedure fast commits store the outcome of each procedure. Thus the fast
139 * commit log for above procedure would be as follows:
140 *
141 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
142 * inode 11 before the replay)
143 *
144 * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11]
145 * (w) (x) (y) (z)
146 *
147 * If we crash at (z), we will have file A linked to inode 11. During the second
148 * replay, we will remove file A (inode 11). But we will create it back and make
149 * it point to inode 11. We won't find B, so we'll just skip that step. At this
150 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
151 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
152 * similarly. Thus, by converting a non-idempotent procedure into a series of
153 * idempotent outcomes, fast commits ensured idempotence during the replay.
154 *
155 * TODOs
156 * -----
157 *
158 * 0) Fast commit replay path hardening: Fast commit replay code should use
159 * journal handles to make sure all the updates it does during the replay
160 * path are atomic. With that if we crash during fast commit replay, after
161 * trying to do recovery again, we will find a file system where fast commit
162 * area is invalid (because new full commit would be found). In order to deal
163 * with that, fast commit replay code should ensure that the "FC_REPLAY"
164 * superblock state is persisted before starting the replay, so that after
165 * the crash, fast commit recovery code can look at that flag and perform
166 * fast commit recovery even if that area is invalidated by later full
167 * commits.
168 *
169 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
170 * eligible update must be protected within ext4_fc_start_update() and
171 * ext4_fc_stop_update(). These routines are called at much higher
172 * routines. This can be made more fine grained by combining with
173 * ext4_journal_start().
174 *
 * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
176 *
177 * 3) Handle more ineligible cases.
178 */
179
180 #include <trace/events/ext4.h>
/* Slab cache from which struct ext4_fc_dentry_update nodes are allocated. */
static struct kmem_cache *ext4_fc_dentry_cachep;
182
ext4_end_buffer_io_sync(struct buffer_head * bh,int uptodate)183 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
184 {
185 BUFFER_TRACE(bh, "");
186 if (uptodate) {
187 ext4_debug("%s: Block %lld up-to-date",
188 __func__, bh->b_blocknr);
189 set_buffer_uptodate(bh);
190 } else {
191 ext4_debug("%s: Block %lld not up-to-date",
192 __func__, bh->b_blocknr);
193 clear_buffer_uptodate(bh);
194 }
195
196 unlock_buffer(bh);
197 }
198
ext4_fc_reset_inode(struct inode * inode)199 static inline void ext4_fc_reset_inode(struct inode *inode)
200 {
201 struct ext4_inode_info *ei = EXT4_I(inode);
202
203 ei->i_fc_lblk_start = 0;
204 ei->i_fc_lblk_len = 0;
205 }
206
ext4_fc_init_inode(struct inode * inode)207 void ext4_fc_init_inode(struct inode *inode)
208 {
209 struct ext4_inode_info *ei = EXT4_I(inode);
210
211 ext4_fc_reset_inode(inode);
212 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
213 INIT_LIST_HEAD(&ei->i_fc_list);
214 init_waitqueue_head(&ei->i_fc_wait);
215 atomic_set(&ei->i_fc_updates, 0);
216 }
217
/*
 * Sleep once on the bit wait queue associated with the
 * EXT4_STATE_FC_COMMITTING bit of @inode.
 *
 * This function must be called with sbi->s_fc_lock held. The lock is dropped
 * before sleeping and is NOT re-acquired before returning, so callers have to
 * take it again and re-check the inode state themselves.
 */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

	/*
	 * The wait key is built on i_state_flags when long is narrower than
	 * 64 bits and on i_flags otherwise.
	 * NOTE(review): presumably this mirrors where ext4 keeps the inode
	 * state bits on each word size - confirm against ext4.h.
	 */
#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Queue ourselves before dropping the lock, then sleep. */
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}
242
243 /*
244 * Inform Ext4's fast about start of an inode update
245 *
246 * This function is called by the high level call VFS callbacks before
247 * performing any inode update. This function blocks if there's an ongoing
248 * fast commit on the inode in question.
249 */
ext4_fc_start_update(struct inode * inode)250 void ext4_fc_start_update(struct inode *inode)
251 {
252 struct ext4_inode_info *ei = EXT4_I(inode);
253
254 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
255 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
256 return;
257
258 restart:
259 spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
260 if (list_empty(&ei->i_fc_list))
261 goto out;
262
263 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
264 ext4_fc_wait_committing_inode(inode);
265 goto restart;
266 }
267 out:
268 atomic_inc(&ei->i_fc_updates);
269 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
270 }
271
272 /*
273 * Stop inode update and wake up waiting fast commits if any.
274 */
ext4_fc_stop_update(struct inode * inode)275 void ext4_fc_stop_update(struct inode *inode)
276 {
277 struct ext4_inode_info *ei = EXT4_I(inode);
278
279 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
280 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
281 return;
282
283 if (atomic_dec_and_test(&ei->i_fc_updates))
284 wake_up_all(&ei->i_fc_wait);
285 }
286
287 /*
288 * Remove inode from fast commit list. If the inode is being committed
289 * we wait until inode commit is done.
290 */
ext4_fc_del(struct inode * inode)291 void ext4_fc_del(struct inode *inode)
292 {
293 struct ext4_inode_info *ei = EXT4_I(inode);
294
295 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
296 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
297 return;
298
299 restart:
300 spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
301 if (list_empty(&ei->i_fc_list)) {
302 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
303 return;
304 }
305
306 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
307 ext4_fc_wait_committing_inode(inode);
308 goto restart;
309 }
310 list_del_init(&ei->i_fc_list);
311 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
312 }
313
314 /*
315 * Mark file system as fast commit ineligible. This means that next commit
316 * operation would result in a full jbd2 commit.
317 */
ext4_fc_mark_ineligible(struct super_block * sb,int reason)318 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
319 {
320 struct ext4_sb_info *sbi = EXT4_SB(sb);
321
322 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
323 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
324 return;
325
326 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
327 WARN_ON(reason >= EXT4_FC_REASON_MAX);
328 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
329 }
330
331 /*
332 * Start a fast commit ineligible update. Any commits that happen while
333 * such an operation is in progress fall back to full commits.
334 */
ext4_fc_start_ineligible(struct super_block * sb,int reason)335 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
336 {
337 struct ext4_sb_info *sbi = EXT4_SB(sb);
338
339 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
340 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
341 return;
342
343 WARN_ON(reason >= EXT4_FC_REASON_MAX);
344 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
345 atomic_inc(&sbi->s_fc_ineligible_updates);
346 }
347
348 /*
349 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
350 * to ensure that after stopping the ineligible update, at least one full
351 * commit takes place.
352 */
ext4_fc_stop_ineligible(struct super_block * sb)353 void ext4_fc_stop_ineligible(struct super_block *sb)
354 {
355 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
356 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
357 return;
358
359 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
360 atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
361 }
362
ext4_fc_is_ineligible(struct super_block * sb)363 static inline int ext4_fc_is_ineligible(struct super_block *sb)
364 {
365 return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
366 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
367 }
368
369 /*
370 * Generic fast commit tracking function. If this is the first time this we are
371 * called after a full commit, we initialize fast commit fields and then call
372 * __fc_track_fn() with update = 0. If we have already been called after a full
373 * commit, we pass update = 1. Based on that, the track function can determine
374 * if it needs to track a field for the first time or if it needs to just
375 * update the previously tracked value.
376 *
377 * If enqueue is set, this function enqueues the inode in fast commit list.
378 */
ext4_fc_track_template(handle_t * handle,struct inode * inode,int (* __fc_track_fn)(struct inode *,void *,bool),void * args,int enqueue)379 static int ext4_fc_track_template(
380 handle_t *handle, struct inode *inode,
381 int (*__fc_track_fn)(struct inode *, void *, bool),
382 void *args, int enqueue)
383 {
384 bool update = false;
385 struct ext4_inode_info *ei = EXT4_I(inode);
386 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
387 tid_t tid = 0;
388 int ret;
389
390 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
391 (sbi->s_mount_state & EXT4_FC_REPLAY))
392 return -EOPNOTSUPP;
393
394 if (ext4_fc_is_ineligible(inode->i_sb))
395 return -EINVAL;
396
397 tid = handle->h_transaction->t_tid;
398 mutex_lock(&ei->i_fc_lock);
399 if (tid == ei->i_sync_tid) {
400 update = true;
401 } else {
402 ext4_fc_reset_inode(inode);
403 ei->i_sync_tid = tid;
404 }
405 ret = __fc_track_fn(inode, args, update);
406 mutex_unlock(&ei->i_fc_lock);
407
408 if (!enqueue)
409 return ret;
410
411 spin_lock(&sbi->s_fc_lock);
412 if (list_empty(&EXT4_I(inode)->i_fc_list))
413 list_add_tail(&EXT4_I(inode)->i_fc_list,
414 (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
415 &sbi->s_fc_q[FC_Q_STAGING] :
416 &sbi->s_fc_q[FC_Q_MAIN]);
417 spin_unlock(&sbi->s_fc_lock);
418
419 return ret;
420 }
421
/* Arguments handed to __track_dentry_update() through its void *arg slot. */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* directory entry the operation applies to */
	int op;			/* EXT4_FC_TAG_* value stored in the queued update */
};
426
427 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
__track_dentry_update(struct inode * inode,void * arg,bool update)428 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
429 {
430 struct ext4_fc_dentry_update *node;
431 struct ext4_inode_info *ei = EXT4_I(inode);
432 struct __track_dentry_update_args *dentry_update =
433 (struct __track_dentry_update_args *)arg;
434 struct dentry *dentry = dentry_update->dentry;
435 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
436
437 mutex_unlock(&ei->i_fc_lock);
438 node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
439 if (!node) {
440 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
441 mutex_lock(&ei->i_fc_lock);
442 return -ENOMEM;
443 }
444
445 node->fcd_op = dentry_update->op;
446 node->fcd_parent = dentry->d_parent->d_inode->i_ino;
447 node->fcd_ino = inode->i_ino;
448 if (dentry->d_name.len > DNAME_INLINE_LEN) {
449 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
450 if (!node->fcd_name.name) {
451 kmem_cache_free(ext4_fc_dentry_cachep, node);
452 ext4_fc_mark_ineligible(inode->i_sb,
453 EXT4_FC_REASON_NOMEM);
454 mutex_lock(&ei->i_fc_lock);
455 return -ENOMEM;
456 }
457 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
458 dentry->d_name.len);
459 } else {
460 memcpy(node->fcd_iname, dentry->d_name.name,
461 dentry->d_name.len);
462 node->fcd_name.name = node->fcd_iname;
463 }
464 node->fcd_name.len = dentry->d_name.len;
465
466 spin_lock(&sbi->s_fc_lock);
467 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
468 list_add_tail(&node->fcd_list,
469 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
470 else
471 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
472 spin_unlock(&sbi->s_fc_lock);
473 mutex_lock(&ei->i_fc_lock);
474
475 return 0;
476 }
477
/*
 * Track the unlink of @dentry, which refers to @inode, as an
 * EXT4_FC_TAG_UNLINK directory entry update. The inode itself is not
 * enqueued on the fast commit inode list here. The tracking result is only
 * reported through the trace point.
 */
void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args = {
		.dentry = dentry,
		.op = EXT4_FC_TAG_UNLINK,
	};
	int ret;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
				     &args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}
491
/* Track the unlink of @dentry, using the inode the dentry points to. */
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	__ext4_fc_track_unlink(handle, inode, dentry);
}
496
/*
 * Track the linking of @dentry to @inode as an EXT4_FC_TAG_LINK directory
 * entry update. The inode itself is not enqueued on the fast commit inode
 * list here. The tracking result is only reported through the trace point.
 */
void __ext4_fc_track_link(handle_t *handle,
	struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args = {
		.dentry = dentry,
		.op = EXT4_FC_TAG_LINK,
	};
	int ret;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
				     &args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}
510
/* Track the linking of @dentry, using the inode the dentry points to. */
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	__ext4_fc_track_link(handle, inode, dentry);
}
515
/*
 * Track the creation of @dentry for @inode as an EXT4_FC_TAG_CREAT directory
 * entry update. The inode itself is not enqueued on the fast commit inode
 * list here. The tracking result is only reported through the trace point.
 */
void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			  struct dentry *dentry)
{
	struct __track_dentry_update_args args = {
		.dentry = dentry,
		.op = EXT4_FC_TAG_CREAT,
	};
	int ret;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
				     &args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}
529
/* Track the creation of @dentry, using the inode the dentry points to. */
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	__ext4_fc_track_create(handle, inode, dentry);
}
534
535 /* __track_fn for inode tracking */
__track_inode(struct inode * inode,void * arg,bool update)536 static int __track_inode(struct inode *inode, void *arg, bool update)
537 {
538 if (update)
539 return -EEXIST;
540
541 EXT4_I(inode)->i_fc_lblk_len = 0;
542
543 return 0;
544 }
545
/*
 * Track @inode for the next fast commit and enqueue it on a fast commit
 * inode list.
 *
 * Directories are not tracked. An inode that journals its data cannot be
 * fast committed; in that case the file system is marked fast commit
 * ineligible instead. The tracking result is only reported through the trace
 * point.
 */
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (!ext4_should_journal_data(inode)) {
		ret = ext4_fc_track_template(handle, inode, __track_inode,
					     NULL, 1);
		trace_ext4_fc_track_inode(inode, ret);
		return;
	}

	ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_INODE_JOURNAL_DATA);
}
562
/*
 * Arguments handed to __track_range() through its void *arg slot: the first
 * and last logical block of the modified range (both inclusive).
 */
struct __track_range_args {
	ext4_lblk_t start, end;
};
566
567 /* __track_fn for tracking data updates */
__track_range(struct inode * inode,void * arg,bool update)568 static int __track_range(struct inode *inode, void *arg, bool update)
569 {
570 struct ext4_inode_info *ei = EXT4_I(inode);
571 ext4_lblk_t oldstart;
572 struct __track_range_args *__arg =
573 (struct __track_range_args *)arg;
574
575 if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
576 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
577 return -ECANCELED;
578 }
579
580 oldstart = ei->i_fc_lblk_start;
581
582 if (update && ei->i_fc_lblk_len > 0) {
583 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
584 ei->i_fc_lblk_len =
585 max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
586 ei->i_fc_lblk_start + 1;
587 } else {
588 ei->i_fc_lblk_start = __arg->start;
589 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
590 }
591
592 return 0;
593 }
594
/*
 * Track a modification of logical blocks @start to @end (inclusive) of
 * @inode and enqueue the inode on a fast commit inode list. Directories are
 * not tracked. The tracking result is only reported through the trace point.
 */
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args = {
		.start = start,
		.end = end,
	};
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
	trace_ext4_fc_track_range(inode, start, end, ret);
}
611
/*
 * Submit the current fast commit buffer for writing and detach it from the
 * super block info, so that the next reservation has to obtain a new buffer.
 *
 * The write is always synchronous (REQ_SYNC). REQ_FUA | REQ_PREFLUSH are
 * added only for the tail block, and only if the BARRIER mount option is
 * set. Completion is handled by ext4_end_buffer_io_sync().
 */
static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh = sbi->s_fc_bh;
	int write_flags = REQ_SYNC;

	if (is_tail && test_opt(sb, BARRIER))
		write_flags |= REQ_FUA | REQ_PREFLUSH;

	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);

	sbi->s_fc_bh = NULL;
}
627
628 /* Ext4 commit path routines */
629
630 /* memzero and update CRC */
ext4_fc_memzero(struct super_block * sb,void * dst,int len,u32 * crc)631 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
632 u32 *crc)
633 {
634 void *ret;
635
636 ret = memset(dst, 0, len);
637 if (crc)
638 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
639 return ret;
640 }
641
642 /*
643 * Allocate len bytes on a fast commit buffer.
644 *
645 * During the commit time this function is used to manage fast commit
646 * block space. We don't split a fast commit log onto different
647 * blocks. So this function makes sure that if there's not enough space
648 * on the current block, the remaining space in the current block is
649 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
650 * new block is from jbd2 and CRC is updated to reflect the padding
651 * we added.
652 */
ext4_fc_reserve_space(struct super_block * sb,int len,u32 * crc)653 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
654 {
655 struct ext4_fc_tl *tl;
656 struct ext4_sb_info *sbi = EXT4_SB(sb);
657 struct buffer_head *bh;
658 int bsize = sbi->s_journal->j_blocksize;
659 int ret, off = sbi->s_fc_bytes % bsize;
660 int pad_len;
661
662 /*
663 * After allocating len, we should have space at least for a 0 byte
664 * padding.
665 */
666 if (len + sizeof(struct ext4_fc_tl) > bsize)
667 return NULL;
668
669 if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
670 /*
671 * Only allocate from current buffer if we have enough space for
672 * this request AND we have space to add a zero byte padding.
673 */
674 if (!sbi->s_fc_bh) {
675 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
676 if (ret)
677 return NULL;
678 sbi->s_fc_bh = bh;
679 }
680 sbi->s_fc_bytes += len;
681 return sbi->s_fc_bh->b_data + off;
682 }
683 /* Need to add PAD tag */
684 tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
685 tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
686 pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
687 tl->fc_len = cpu_to_le16(pad_len);
688 if (crc)
689 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
690 if (pad_len > 0)
691 ext4_fc_memzero(sb, tl + 1, pad_len, crc);
692 ext4_fc_submit_bh(sb, false);
693
694 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
695 if (ret)
696 return NULL;
697 sbi->s_fc_bh = bh;
698 sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
699 return sbi->s_fc_bh->b_data;
700 }
701
702 /* memcpy to fc reserved space and update CRC */
ext4_fc_memcpy(struct super_block * sb,void * dst,const void * src,int len,u32 * crc)703 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
704 int len, u32 *crc)
705 {
706 if (crc)
707 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
708 return memcpy(dst, src, len);
709 }
710
711 /*
712 * Complete a fast commit by writing tail tag.
713 *
714 * Writing tail tag marks the end of a fast commit. In order to guarantee
715 * atomicity, after writing tail tag, even if there's space remaining
716 * in the block, next commit shouldn't use it. That's why tail tag
717 * has the length as that of the remaining space on the block.
718 */
ext4_fc_write_tail(struct super_block * sb,u32 crc)719 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
720 {
721 struct ext4_sb_info *sbi = EXT4_SB(sb);
722 struct ext4_fc_tl tl;
723 struct ext4_fc_tail tail;
724 int off, bsize = sbi->s_journal->j_blocksize;
725 u8 *dst;
726
727 /*
728 * ext4_fc_reserve_space takes care of allocating an extra block if
729 * there's no enough space on this block for accommodating this tail.
730 */
731 dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
732 if (!dst)
733 return -ENOSPC;
734
735 off = sbi->s_fc_bytes % bsize;
736
737 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
738 tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
739 sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
740
741 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
742 dst += sizeof(tl);
743 tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
744 ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
745 dst += sizeof(tail.fc_tid);
746 tail.fc_crc = cpu_to_le32(crc);
747 ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
748
749 ext4_fc_submit_bh(sb, true);
750
751 return 0;
752 }
753
754 /*
755 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
756 * Returns false if there's not enough space.
757 */
ext4_fc_add_tlv(struct super_block * sb,u16 tag,u16 len,u8 * val,u32 * crc)758 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
759 u32 *crc)
760 {
761 struct ext4_fc_tl tl;
762 u8 *dst;
763
764 dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
765 if (!dst)
766 return false;
767
768 tl.fc_tag = cpu_to_le16(tag);
769 tl.fc_len = cpu_to_le16(len);
770
771 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
772 ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
773
774 return true;
775 }
776
777 /* Same as above, but adds dentry tlv. */
ext4_fc_add_dentry_tlv(struct super_block * sb,u32 * crc,struct ext4_fc_dentry_update * fc_dentry)778 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
779 struct ext4_fc_dentry_update *fc_dentry)
780 {
781 struct ext4_fc_dentry_info fcd;
782 struct ext4_fc_tl tl;
783 int dlen = fc_dentry->fcd_name.len;
784 u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
785 crc);
786
787 if (!dst)
788 return false;
789
790 fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
791 fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
792 tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
793 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
794 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
795 dst += sizeof(tl);
796 ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
797 dst += sizeof(fcd);
798 ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
799 dst += dlen;
800
801 return true;
802 }
803
804 /*
805 * Writes inode in the fast commit space under TLV with tag @tag.
806 * Returns 0 on success, error on failure.
807 */
ext4_fc_write_inode(struct inode * inode,u32 * crc)808 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
809 {
810 struct ext4_inode_info *ei = EXT4_I(inode);
811 int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
812 int ret;
813 struct ext4_iloc iloc;
814 struct ext4_fc_inode fc_inode;
815 struct ext4_fc_tl tl;
816 u8 *dst;
817
818 ret = ext4_get_inode_loc(inode, &iloc);
819 if (ret)
820 return ret;
821
822 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
823 inode_len = EXT4_INODE_SIZE(inode->i_sb);
824 else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
825 inode_len += ei->i_extra_isize;
826
827 fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
828 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
829 tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
830
831 dst = ext4_fc_reserve_space(inode->i_sb,
832 sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
833 if (!dst)
834 return -ECANCELED;
835
836 if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
837 return -ECANCELED;
838 dst += sizeof(tl);
839 if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
840 return -ECANCELED;
841 dst += sizeof(fc_inode);
842 if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
843 inode_len, crc))
844 return -ECANCELED;
845
846 return 0;
847 }
848
849 /*
850 * Writes updated data ranges for the inode in question. Updates CRC.
851 * Returns 0 on success, error otherwise.
852 */
ext4_fc_write_inode_data(struct inode * inode,u32 * crc)853 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
854 {
855 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
856 struct ext4_inode_info *ei = EXT4_I(inode);
857 struct ext4_map_blocks map;
858 struct ext4_fc_add_range fc_ext;
859 struct ext4_fc_del_range lrange;
860 struct ext4_extent *ex;
861 int ret;
862
863 mutex_lock(&ei->i_fc_lock);
864 if (ei->i_fc_lblk_len == 0) {
865 mutex_unlock(&ei->i_fc_lock);
866 return 0;
867 }
868 old_blk_size = ei->i_fc_lblk_start;
869 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
870 ei->i_fc_lblk_len = 0;
871 mutex_unlock(&ei->i_fc_lock);
872
873 cur_lblk_off = old_blk_size;
874 jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
875 __func__, cur_lblk_off, new_blk_size, inode->i_ino);
876
877 while (cur_lblk_off <= new_blk_size) {
878 map.m_lblk = cur_lblk_off;
879 map.m_len = new_blk_size - cur_lblk_off + 1;
880 ret = ext4_map_blocks(NULL, inode, &map, 0);
881 if (ret < 0)
882 return -ECANCELED;
883
884 if (map.m_len == 0) {
885 cur_lblk_off++;
886 continue;
887 }
888
889 if (ret == 0) {
890 lrange.fc_ino = cpu_to_le32(inode->i_ino);
891 lrange.fc_lblk = cpu_to_le32(map.m_lblk);
892 lrange.fc_len = cpu_to_le32(map.m_len);
893 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
894 sizeof(lrange), (u8 *)&lrange, crc))
895 return -ENOSPC;
896 } else {
897 unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
898 EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
899
900 /* Limit the number of blocks in one extent */
901 map.m_len = min(max, map.m_len);
902
903 fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
904 ex = (struct ext4_extent *)&fc_ext.fc_ex;
905 ex->ee_block = cpu_to_le32(map.m_lblk);
906 ex->ee_len = cpu_to_le16(map.m_len);
907 ext4_ext_store_pblock(ex, map.m_pblk);
908 if (map.m_flags & EXT4_MAP_UNWRITTEN)
909 ext4_ext_mark_unwritten(ex);
910 else
911 ext4_ext_mark_initialized(ex);
912 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
913 sizeof(fc_ext), (u8 *)&fc_ext, crc))
914 return -ENOSPC;
915 }
916
917 cur_lblk_off += map.m_len;
918 }
919
920 return 0;
921 }
922
923
924 /* Submit data for all the fast commit inodes */
ext4_fc_submit_inode_data_all(journal_t * journal)925 static int ext4_fc_submit_inode_data_all(journal_t *journal)
926 {
927 struct super_block *sb = (struct super_block *)(journal->j_private);
928 struct ext4_sb_info *sbi = EXT4_SB(sb);
929 struct ext4_inode_info *ei;
930 int ret = 0;
931
932 spin_lock(&sbi->s_fc_lock);
933 ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
934 list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
935 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
936 while (atomic_read(&ei->i_fc_updates)) {
937 DEFINE_WAIT(wait);
938
939 prepare_to_wait(&ei->i_fc_wait, &wait,
940 TASK_UNINTERRUPTIBLE);
941 if (atomic_read(&ei->i_fc_updates)) {
942 spin_unlock(&sbi->s_fc_lock);
943 schedule();
944 spin_lock(&sbi->s_fc_lock);
945 }
946 finish_wait(&ei->i_fc_wait, &wait);
947 }
948 spin_unlock(&sbi->s_fc_lock);
949 ret = jbd2_submit_inode_data(ei->jinode);
950 if (ret)
951 return ret;
952 spin_lock(&sbi->s_fc_lock);
953 }
954 spin_unlock(&sbi->s_fc_lock);
955
956 return ret;
957 }
958
959 /* Wait for completion of data for all the fast commit inodes */
ext4_fc_wait_inode_data_all(journal_t * journal)960 static int ext4_fc_wait_inode_data_all(journal_t *journal)
961 {
962 struct super_block *sb = (struct super_block *)(journal->j_private);
963 struct ext4_sb_info *sbi = EXT4_SB(sb);
964 struct ext4_inode_info *pos, *n;
965 int ret = 0;
966
967 spin_lock(&sbi->s_fc_lock);
968 list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
969 if (!ext4_test_inode_state(&pos->vfs_inode,
970 EXT4_STATE_FC_COMMITTING))
971 continue;
972 spin_unlock(&sbi->s_fc_lock);
973
974 ret = jbd2_wait_inode_data(journal, pos->jinode);
975 if (ret)
976 return ret;
977 spin_lock(&sbi->s_fc_lock);
978 }
979 spin_unlock(&sbi->s_fc_lock);
980
981 return 0;
982 }
983
984 /* Commit all the directory entry updates */
ext4_fc_commit_dentry_updates(journal_t * journal,u32 * crc)985 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
986 __acquires(&sbi->s_fc_lock)
987 __releases(&sbi->s_fc_lock)
988 {
989 struct super_block *sb = (struct super_block *)(journal->j_private);
990 struct ext4_sb_info *sbi = EXT4_SB(sb);
991 struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
992 struct inode *inode;
993 struct ext4_inode_info *ei, *ei_n;
994 int ret;
995
996 if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
997 return 0;
998 list_for_each_entry_safe(fc_dentry, fc_dentry_n,
999 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
1000 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1001 spin_unlock(&sbi->s_fc_lock);
1002 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1003 ret = -ENOSPC;
1004 goto lock_and_exit;
1005 }
1006 spin_lock(&sbi->s_fc_lock);
1007 continue;
1008 }
1009
1010 inode = NULL;
1011 list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
1012 i_fc_list) {
1013 if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
1014 inode = &ei->vfs_inode;
1015 break;
1016 }
1017 }
1018 /*
1019 * If we don't find inode in our list, then it was deleted,
1020 * in which case, we don't need to record it's create tag.
1021 */
1022 if (!inode)
1023 continue;
1024 spin_unlock(&sbi->s_fc_lock);
1025
1026 /*
1027 * We first write the inode and then the create dirent. This
1028 * allows the recovery code to create an unnamed inode first
1029 * and then link it to a directory entry. This allows us
1030 * to use namei.c routines almost as is and simplifies
1031 * the recovery code.
1032 */
1033 ret = ext4_fc_write_inode(inode, crc);
1034 if (ret)
1035 goto lock_and_exit;
1036
1037 ret = ext4_fc_write_inode_data(inode, crc);
1038 if (ret)
1039 goto lock_and_exit;
1040
1041 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1042 ret = -ENOSPC;
1043 goto lock_and_exit;
1044 }
1045
1046 spin_lock(&sbi->s_fc_lock);
1047 }
1048 return 0;
1049 lock_and_exit:
1050 spin_lock(&sbi->s_fc_lock);
1051 return ret;
1052 }
1053
/*
 * Write one fast commit to the fast commit area.
 *
 * Submits and waits for the data of all queued inodes, flushes the file
 * system device when it differs from the journal device, and then writes,
 * in order: a head tag (only when no fast commit bytes have been written
 * yet, i.e. s_fc_bytes is 0), the dentry updates, the data ranges and inode
 * of every committing inode, and finally the tail. Returns 0 on success or
 * a negative error.
 */
static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct ext4_fc_head head;
	struct blk_plug plug;
	struct inode *inode;
	u32 crc = 0;
	int ret;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If file system device is different from journal device, issue a cache
	 * flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);

	/*
	 * Add a head tag only if this is the first fast commit
	 * in this TID.
	 */
	if (sbi->s_fc_bytes == 0) {
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
				     (u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		inode = &ei->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		/* The writers below may sleep, so drop the spinlock. */
		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}
1125
1126 /*
1127 * The main commit entry point. Performs a fast commit for transaction
1128 * commit_tid if needed. If it's not possible to perform a fast commit
1129 * due to various reasons, we fall back to full commit. Returns 0
1130 * on success, error otherwise.
1131 */
ext4_fc_commit(journal_t * journal,tid_t commit_tid)1132 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1133 {
1134 struct super_block *sb = (struct super_block *)(journal->j_private);
1135 struct ext4_sb_info *sbi = EXT4_SB(sb);
1136 int nblks = 0, ret, bsize = journal->j_blocksize;
1137 int subtid = atomic_read(&sbi->s_fc_subtid);
1138 int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1139 ktime_t start_time, commit_time;
1140
1141 trace_ext4_fc_commit_start(sb);
1142
1143 start_time = ktime_get();
1144
1145 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1146 (ext4_fc_is_ineligible(sb))) {
1147 reason = EXT4_FC_REASON_INELIGIBLE;
1148 goto out;
1149 }
1150
1151 restart_fc:
1152 ret = jbd2_fc_begin_commit(journal, commit_tid);
1153 if (ret == -EALREADY) {
1154 /* There was an ongoing commit, check if we need to restart */
1155 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1156 commit_tid > journal->j_commit_sequence)
1157 goto restart_fc;
1158 reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1159 goto out;
1160 } else if (ret) {
1161 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1162 reason = EXT4_FC_REASON_FC_START_FAILED;
1163 goto out;
1164 }
1165
1166 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1167 ret = ext4_fc_perform_commit(journal);
1168 if (ret < 0) {
1169 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1170 reason = EXT4_FC_REASON_FC_FAILED;
1171 goto out;
1172 }
1173 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1174 ret = jbd2_fc_wait_bufs(journal, nblks);
1175 if (ret < 0) {
1176 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1177 reason = EXT4_FC_REASON_FC_FAILED;
1178 goto out;
1179 }
1180 atomic_inc(&sbi->s_fc_subtid);
1181 jbd2_fc_end_commit(journal);
1182 out:
1183 /* Has any ineligible update happened since we started? */
1184 if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1185 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1186 reason = EXT4_FC_REASON_INELIGIBLE;
1187 }
1188
1189 spin_lock(&sbi->s_fc_lock);
1190 if (reason != EXT4_FC_REASON_OK &&
1191 reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1192 sbi->s_fc_stats.fc_ineligible_commits++;
1193 } else {
1194 sbi->s_fc_stats.fc_num_commits++;
1195 sbi->s_fc_stats.fc_numblks += nblks;
1196 }
1197 spin_unlock(&sbi->s_fc_lock);
1198 nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1199 trace_ext4_fc_commit_stop(sb, nblks, reason);
1200 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1201 /*
1202 * weight the commit time higher than the average time so we don't
1203 * react too strongly to vast changes in the commit time
1204 */
1205 if (likely(sbi->s_fc_avg_commit_time))
1206 sbi->s_fc_avg_commit_time = (commit_time +
1207 sbi->s_fc_avg_commit_time * 3) / 4;
1208 else
1209 sbi->s_fc_avg_commit_time = commit_time;
1210 jbd_debug(1,
1211 "Fast commit ended with blks = %d, reason = %d, subtid - %d",
1212 nblks, reason, subtid);
1213 if (reason == EXT4_FC_REASON_FC_FAILED)
1214 return jbd2_fc_end_commit_fallback(journal);
1215 if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1216 reason == EXT4_FC_REASON_INELIGIBLE)
1217 return jbd2_complete_transaction(journal, commit_tid);
1218 return 0;
1219 }
1220
1221 /*
1222 * Fast commit cleanup routine. This is called after every fast commit and
1223 * full commit. full is true if we are called after a full commit.
1224 */
ext4_fc_cleanup(journal_t * journal,int full)1225 static void ext4_fc_cleanup(journal_t *journal, int full)
1226 {
1227 struct super_block *sb = journal->j_private;
1228 struct ext4_sb_info *sbi = EXT4_SB(sb);
1229 struct ext4_inode_info *iter, *iter_n;
1230 struct ext4_fc_dentry_update *fc_dentry;
1231
1232 if (full && sbi->s_fc_bh)
1233 sbi->s_fc_bh = NULL;
1234
1235 jbd2_fc_release_bufs(journal);
1236
1237 spin_lock(&sbi->s_fc_lock);
1238 list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1239 i_fc_list) {
1240 list_del_init(&iter->i_fc_list);
1241 ext4_clear_inode_state(&iter->vfs_inode,
1242 EXT4_STATE_FC_COMMITTING);
1243 ext4_fc_reset_inode(&iter->vfs_inode);
1244 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1245 smp_mb();
1246 #if (BITS_PER_LONG < 64)
1247 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1248 #else
1249 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1250 #endif
1251 }
1252
1253 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1254 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1255 struct ext4_fc_dentry_update,
1256 fcd_list);
1257 list_del_init(&fc_dentry->fcd_list);
1258 spin_unlock(&sbi->s_fc_lock);
1259
1260 if (fc_dentry->fcd_name.name &&
1261 fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1262 kfree(fc_dentry->fcd_name.name);
1263 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1264 spin_lock(&sbi->s_fc_lock);
1265 }
1266
1267 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1268 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1269 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1270 &sbi->s_fc_q[FC_Q_MAIN]);
1271
1272 ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1273 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1274
1275 if (full)
1276 sbi->s_fc_bytes = 0;
1277 spin_unlock(&sbi->s_fc_lock);
1278 trace_ext4_fc_stats(sb);
1279 }
1280
1281 /* Ext4 Replay Path Routines */
1282
/*
 * Helper struct for dentry replay routines: the decoded form of a dentry
 * TLV as filled in by tl_to_darg().
 */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of the name in bytes */
	int ino;		/* inode number the entry refers to */
	int inode_len;
	char *dname;		/* entry name, points into the TLV value */
};
1288
/*
 * Decode a dentry TLV (@tl header, @val payload) into @darg.
 *
 * The name is not copied: darg->dname points into @val, and its length is
 * the TLV length minus the fixed struct ext4_fc_dentry_info part.
 */
static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_dentry_info hdr;

	/* Copy the fixed part out before reading its fields. */
	memcpy(&hdr, val, sizeof(hdr));

	darg->parent_ino = le32_to_cpu(hdr.fc_parent_ino);
	darg->ino = le32_to_cpu(hdr.fc_ino);
	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
	darg->dname_len = le16_to_cpu(tl->fc_len) -
			  sizeof(struct ext4_fc_dentry_info);
}
1302
1303 /* Unlink replay function */
ext4_fc_replay_unlink(struct super_block * sb,struct ext4_fc_tl * tl,u8 * val)1304 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1305 u8 *val)
1306 {
1307 struct inode *inode, *old_parent;
1308 struct qstr entry;
1309 struct dentry_info_args darg;
1310 int ret = 0;
1311
1312 tl_to_darg(&darg, tl, val);
1313
1314 trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1315 darg.parent_ino, darg.dname_len);
1316
1317 entry.name = darg.dname;
1318 entry.len = darg.dname_len;
1319 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1320
1321 if (IS_ERR(inode)) {
1322 jbd_debug(1, "Inode %d not found", darg.ino);
1323 return 0;
1324 }
1325
1326 old_parent = ext4_iget(sb, darg.parent_ino,
1327 EXT4_IGET_NORMAL);
1328 if (IS_ERR(old_parent)) {
1329 jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
1330 iput(inode);
1331 return 0;
1332 }
1333
1334 ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1335 /* -ENOENT ok coz it might not exist anymore. */
1336 if (ret == -ENOENT)
1337 ret = 0;
1338 iput(old_parent);
1339 iput(inode);
1340 return ret;
1341 }
1342
ext4_fc_replay_link_internal(struct super_block * sb,struct dentry_info_args * darg,struct inode * inode)1343 static int ext4_fc_replay_link_internal(struct super_block *sb,
1344 struct dentry_info_args *darg,
1345 struct inode *inode)
1346 {
1347 struct inode *dir = NULL;
1348 struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1349 struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1350 int ret = 0;
1351
1352 dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1353 if (IS_ERR(dir)) {
1354 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1355 dir = NULL;
1356 goto out;
1357 }
1358
1359 dentry_dir = d_obtain_alias(dir);
1360 if (IS_ERR(dentry_dir)) {
1361 jbd_debug(1, "Failed to obtain dentry");
1362 dentry_dir = NULL;
1363 goto out;
1364 }
1365
1366 dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1367 if (!dentry_inode) {
1368 jbd_debug(1, "Inode dentry not created.");
1369 ret = -ENOMEM;
1370 goto out;
1371 }
1372
1373 ret = __ext4_link(dir, inode, dentry_inode);
1374 /*
1375 * It's possible that link already existed since data blocks
1376 * for the dir in question got persisted before we crashed OR
1377 * we replayed this tag and crashed before the entire replay
1378 * could complete.
1379 */
1380 if (ret && ret != -EEXIST) {
1381 jbd_debug(1, "Failed to link\n");
1382 goto out;
1383 }
1384
1385 ret = 0;
1386 out:
1387 if (dentry_dir) {
1388 d_drop(dentry_dir);
1389 dput(dentry_dir);
1390 } else if (dir) {
1391 iput(dir);
1392 }
1393 if (dentry_inode) {
1394 d_drop(dentry_inode);
1395 dput(dentry_inode);
1396 }
1397
1398 return ret;
1399 }
1400
1401 /* Link replay function */
ext4_fc_replay_link(struct super_block * sb,struct ext4_fc_tl * tl,u8 * val)1402 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1403 u8 *val)
1404 {
1405 struct inode *inode;
1406 struct dentry_info_args darg;
1407 int ret = 0;
1408
1409 tl_to_darg(&darg, tl, val);
1410 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1411 darg.parent_ino, darg.dname_len);
1412
1413 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1414 if (IS_ERR(inode)) {
1415 jbd_debug(1, "Inode not found.");
1416 return 0;
1417 }
1418
1419 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1420 iput(inode);
1421 return ret;
1422 }
1423
1424 /*
1425 * Record all the modified inodes during replay. We use this later to setup
1426 * block bitmaps correctly.
1427 */
ext4_fc_record_modified_inode(struct super_block * sb,int ino)1428 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1429 {
1430 struct ext4_fc_replay_state *state;
1431 int i;
1432
1433 state = &EXT4_SB(sb)->s_fc_replay_state;
1434 for (i = 0; i < state->fc_modified_inodes_used; i++)
1435 if (state->fc_modified_inodes[i] == ino)
1436 return 0;
1437 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1438 state->fc_modified_inodes_size +=
1439 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1440 state->fc_modified_inodes = krealloc(
1441 state->fc_modified_inodes, sizeof(int) *
1442 state->fc_modified_inodes_size,
1443 GFP_KERNEL);
1444 if (!state->fc_modified_inodes)
1445 return -ENOMEM;
1446 }
1447 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1448 return 0;
1449 }
1450
1451 /*
1452 * Inode replay function
1453 */
ext4_fc_replay_inode(struct super_block * sb,struct ext4_fc_tl * tl,u8 * val)1454 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1455 u8 *val)
1456 {
1457 struct ext4_fc_inode fc_inode;
1458 struct ext4_inode *raw_inode;
1459 struct ext4_inode *raw_fc_inode;
1460 struct inode *inode = NULL;
1461 struct ext4_iloc iloc;
1462 int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1463 struct ext4_extent_header *eh;
1464
1465 memcpy(&fc_inode, val, sizeof(fc_inode));
1466
1467 ino = le32_to_cpu(fc_inode.fc_ino);
1468 trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1469
1470 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1471 if (!IS_ERR(inode)) {
1472 ext4_ext_clear_bb(inode);
1473 iput(inode);
1474 }
1475 inode = NULL;
1476
1477 ext4_fc_record_modified_inode(sb, ino);
1478
1479 raw_fc_inode = (struct ext4_inode *)
1480 (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1481 ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1482 if (ret)
1483 goto out;
1484
1485 inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1486 raw_inode = ext4_raw_inode(&iloc);
1487
1488 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1489 memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1490 inode_len - offsetof(struct ext4_inode, i_generation));
1491 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1492 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1493 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1494 memset(eh, 0, sizeof(*eh));
1495 eh->eh_magic = EXT4_EXT_MAGIC;
1496 eh->eh_max = cpu_to_le16(
1497 (sizeof(raw_inode->i_block) -
1498 sizeof(struct ext4_extent_header))
1499 / sizeof(struct ext4_extent));
1500 }
1501 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1502 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1503 sizeof(raw_inode->i_block));
1504 }
1505
1506 /* Immediately update the inode on disk. */
1507 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1508 if (ret)
1509 goto out;
1510 ret = sync_dirty_buffer(iloc.bh);
1511 if (ret)
1512 goto out;
1513 ret = ext4_mark_inode_used(sb, ino);
1514 if (ret)
1515 goto out;
1516
1517 /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1518 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1519 if (IS_ERR(inode)) {
1520 jbd_debug(1, "Inode not found.");
1521 return -EFSCORRUPTED;
1522 }
1523
1524 /*
1525 * Our allocator could have made different decisions than before
1526 * crashing. This should be fixed but until then, we calculate
1527 * the number of blocks the inode.
1528 */
1529 if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1530 ext4_ext_replay_set_iblocks(inode);
1531
1532 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1533 ext4_reset_inode_seed(inode);
1534
1535 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1536 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1537 sync_dirty_buffer(iloc.bh);
1538 brelse(iloc.bh);
1539 out:
1540 iput(inode);
1541 if (!ret)
1542 blkdev_issue_flush(sb->s_bdev);
1543
1544 return 0;
1545 }
1546
1547 /*
1548 * Dentry create replay function.
1549 *
1550 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1551 * inode for which we are trying to create a dentry here, should already have
1552 * been replayed before we start here.
1553 */
ext4_fc_replay_create(struct super_block * sb,struct ext4_fc_tl * tl,u8 * val)1554 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1555 u8 *val)
1556 {
1557 int ret = 0;
1558 struct inode *inode = NULL;
1559 struct inode *dir = NULL;
1560 struct dentry_info_args darg;
1561
1562 tl_to_darg(&darg, tl, val);
1563
1564 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1565 darg.parent_ino, darg.dname_len);
1566
1567 /* This takes care of update group descriptor and other metadata */
1568 ret = ext4_mark_inode_used(sb, darg.ino);
1569 if (ret)
1570 goto out;
1571
1572 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1573 if (IS_ERR(inode)) {
1574 jbd_debug(1, "inode %d not found.", darg.ino);
1575 inode = NULL;
1576 ret = -EINVAL;
1577 goto out;
1578 }
1579
1580 if (S_ISDIR(inode->i_mode)) {
1581 /*
1582 * If we are creating a directory, we need to make sure that the
1583 * dot and dot dot dirents are setup properly.
1584 */
1585 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1586 if (IS_ERR(dir)) {
1587 jbd_debug(1, "Dir %d not found.", darg.ino);
1588 goto out;
1589 }
1590 ret = ext4_init_new_dir(NULL, dir, inode);
1591 iput(dir);
1592 if (ret) {
1593 ret = 0;
1594 goto out;
1595 }
1596 }
1597 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1598 if (ret)
1599 goto out;
1600 set_nlink(inode, 1);
1601 ext4_mark_inode_dirty(NULL, inode);
1602 out:
1603 if (inode)
1604 iput(inode);
1605 return ret;
1606 }
1607
1608 /*
1609 * Record physical disk regions which are in use as per fast commit area. Our
1610 * simple replay phase allocator excludes these regions from allocation.
1611 */
ext4_fc_record_regions(struct super_block * sb,int ino,ext4_lblk_t lblk,ext4_fsblk_t pblk,int len)1612 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1613 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1614 {
1615 struct ext4_fc_replay_state *state;
1616 struct ext4_fc_alloc_region *region;
1617
1618 state = &EXT4_SB(sb)->s_fc_replay_state;
1619 if (state->fc_regions_used == state->fc_regions_size) {
1620 state->fc_regions_size +=
1621 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1622 state->fc_regions = krealloc(
1623 state->fc_regions,
1624 state->fc_regions_size *
1625 sizeof(struct ext4_fc_alloc_region),
1626 GFP_KERNEL);
1627 if (!state->fc_regions)
1628 return -ENOMEM;
1629 }
1630 region = &state->fc_regions[state->fc_regions_used++];
1631 region->ino = ino;
1632 region->lblk = lblk;
1633 region->pblk = pblk;
1634 region->len = len;
1635
1636 return 0;
1637 }
1638
1639 /* Replay add range tag */
ext4_fc_replay_add_range(struct super_block * sb,struct ext4_fc_tl * tl,u8 * val)1640 static int ext4_fc_replay_add_range(struct super_block *sb,
1641 struct ext4_fc_tl *tl, u8 *val)
1642 {
1643 struct ext4_fc_add_range fc_add_ex;
1644 struct ext4_extent newex, *ex;
1645 struct inode *inode;
1646 ext4_lblk_t start, cur;
1647 int remaining, len;
1648 ext4_fsblk_t start_pblk;
1649 struct ext4_map_blocks map;
1650 struct ext4_ext_path *path = NULL;
1651 int ret;
1652
1653 memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1654 ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1655
1656 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1657 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1658 ext4_ext_get_actual_len(ex));
1659
1660 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1661 if (IS_ERR(inode)) {
1662 jbd_debug(1, "Inode not found.");
1663 return 0;
1664 }
1665
1666 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1667
1668 start = le32_to_cpu(ex->ee_block);
1669 start_pblk = ext4_ext_pblock(ex);
1670 len = ext4_ext_get_actual_len(ex);
1671
1672 cur = start;
1673 remaining = len;
1674 jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1675 start, start_pblk, len, ext4_ext_is_unwritten(ex),
1676 inode->i_ino);
1677
1678 while (remaining > 0) {
1679 map.m_lblk = cur;
1680 map.m_len = remaining;
1681 map.m_pblk = 0;
1682 ret = ext4_map_blocks(NULL, inode, &map, 0);
1683
1684 if (ret < 0) {
1685 iput(inode);
1686 return 0;
1687 }
1688
1689 if (ret == 0) {
1690 /* Range is not mapped */
1691 path = ext4_find_extent(inode, cur, NULL, 0);
1692 if (IS_ERR(path)) {
1693 iput(inode);
1694 return 0;
1695 }
1696 memset(&newex, 0, sizeof(newex));
1697 newex.ee_block = cpu_to_le32(cur);
1698 ext4_ext_store_pblock(
1699 &newex, start_pblk + cur - start);
1700 newex.ee_len = cpu_to_le16(map.m_len);
1701 if (ext4_ext_is_unwritten(ex))
1702 ext4_ext_mark_unwritten(&newex);
1703 down_write(&EXT4_I(inode)->i_data_sem);
1704 ret = ext4_ext_insert_extent(
1705 NULL, inode, &path, &newex, 0);
1706 up_write((&EXT4_I(inode)->i_data_sem));
1707 ext4_ext_drop_refs(path);
1708 kfree(path);
1709 if (ret) {
1710 iput(inode);
1711 return 0;
1712 }
1713 goto next;
1714 }
1715
1716 if (start_pblk + cur - start != map.m_pblk) {
1717 /*
1718 * Logical to physical mapping changed. This can happen
1719 * if this range was removed and then reallocated to
1720 * map to new physical blocks during a fast commit.
1721 */
1722 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1723 ext4_ext_is_unwritten(ex),
1724 start_pblk + cur - start);
1725 if (ret) {
1726 iput(inode);
1727 return 0;
1728 }
1729 /*
1730 * Mark the old blocks as free since they aren't used
1731 * anymore. We maintain an array of all the modified
1732 * inodes. In case these blocks are still used at either
1733 * a different logical range in the same inode or in
1734 * some different inode, we will mark them as allocated
1735 * at the end of the FC replay using our array of
1736 * modified inodes.
1737 */
1738 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1739 goto next;
1740 }
1741
1742 /* Range is mapped and needs a state change */
1743 jbd_debug(1, "Converting from %ld to %d %lld",
1744 map.m_flags & EXT4_MAP_UNWRITTEN,
1745 ext4_ext_is_unwritten(ex), map.m_pblk);
1746 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1747 ext4_ext_is_unwritten(ex), map.m_pblk);
1748 if (ret) {
1749 iput(inode);
1750 return 0;
1751 }
1752 /*
1753 * We may have split the extent tree while toggling the state.
1754 * Try to shrink the extent tree now.
1755 */
1756 ext4_ext_replay_shrink_inode(inode, start + len);
1757 next:
1758 cur += map.m_len;
1759 remaining -= map.m_len;
1760 }
1761 ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1762 sb->s_blocksize_bits);
1763 iput(inode);
1764 return 0;
1765 }
1766
1767 /* Replay DEL_RANGE tag */
1768 static int
ext4_fc_replay_del_range(struct super_block * sb,struct ext4_fc_tl * tl,u8 * val)1769 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1770 u8 *val)
1771 {
1772 struct inode *inode;
1773 struct ext4_fc_del_range lrange;
1774 struct ext4_map_blocks map;
1775 ext4_lblk_t cur, remaining;
1776 int ret;
1777
1778 memcpy(&lrange, val, sizeof(lrange));
1779 cur = le32_to_cpu(lrange.fc_lblk);
1780 remaining = le32_to_cpu(lrange.fc_len);
1781
1782 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1783 le32_to_cpu(lrange.fc_ino), cur, remaining);
1784
1785 inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1786 if (IS_ERR(inode)) {
1787 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1788 return 0;
1789 }
1790
1791 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1792
1793 jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1794 inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1795 le32_to_cpu(lrange.fc_len));
1796 while (remaining > 0) {
1797 map.m_lblk = cur;
1798 map.m_len = remaining;
1799
1800 ret = ext4_map_blocks(NULL, inode, &map, 0);
1801 if (ret < 0) {
1802 iput(inode);
1803 return 0;
1804 }
1805 if (ret > 0) {
1806 remaining -= ret;
1807 cur += ret;
1808 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1809 } else {
1810 remaining -= map.m_len;
1811 cur += map.m_len;
1812 }
1813 }
1814
1815 ret = ext4_punch_hole(inode,
1816 le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
1817 le32_to_cpu(lrange.fc_len) << sb->s_blocksize_bits);
1818 if (ret)
1819 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1820 ext4_ext_replay_shrink_inode(inode,
1821 i_size_read(inode) >> sb->s_blocksize_bits);
1822 ext4_mark_inode_dirty(NULL, inode);
1823 iput(inode);
1824
1825 return 0;
1826 }
1827
ext4_fc_set_bitmaps_and_counters(struct super_block * sb)1828 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1829 {
1830 struct ext4_fc_replay_state *state;
1831 struct inode *inode;
1832 struct ext4_ext_path *path = NULL;
1833 struct ext4_map_blocks map;
1834 int i, ret, j;
1835 ext4_lblk_t cur, end;
1836
1837 state = &EXT4_SB(sb)->s_fc_replay_state;
1838 for (i = 0; i < state->fc_modified_inodes_used; i++) {
1839 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1840 EXT4_IGET_NORMAL);
1841 if (IS_ERR(inode)) {
1842 jbd_debug(1, "Inode %d not found.",
1843 state->fc_modified_inodes[i]);
1844 continue;
1845 }
1846 cur = 0;
1847 end = EXT_MAX_BLOCKS;
1848 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1849 iput(inode);
1850 continue;
1851 }
1852 while (cur < end) {
1853 map.m_lblk = cur;
1854 map.m_len = end - cur;
1855
1856 ret = ext4_map_blocks(NULL, inode, &map, 0);
1857 if (ret < 0)
1858 break;
1859
1860 if (ret > 0) {
1861 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1862 if (!IS_ERR(path)) {
1863 for (j = 0; j < path->p_depth; j++)
1864 ext4_mb_mark_bb(inode->i_sb,
1865 path[j].p_block, 1, 1);
1866 ext4_ext_drop_refs(path);
1867 kfree(path);
1868 }
1869 cur += ret;
1870 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1871 map.m_len, 1);
1872 } else {
1873 cur = cur + (map.m_len ? map.m_len : 1);
1874 }
1875 }
1876 iput(inode);
1877 }
1878 }
1879
1880 /*
1881 * Check if block is in excluded regions for block allocation. The simple
1882 * allocator that runs during replay phase is calls this function to see
1883 * if it is okay to use a block.
1884 */
ext4_fc_replay_check_excluded(struct super_block * sb,ext4_fsblk_t blk)1885 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1886 {
1887 int i;
1888 struct ext4_fc_replay_state *state;
1889
1890 state = &EXT4_SB(sb)->s_fc_replay_state;
1891 for (i = 0; i < state->fc_regions_valid; i++) {
1892 if (state->fc_regions[i].ino == 0 ||
1893 state->fc_regions[i].len == 0)
1894 continue;
1895 if (blk >= state->fc_regions[i].pblk &&
1896 blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1897 return true;
1898 }
1899 return false;
1900 }
1901
1902 /* Cleanup function called after replay */
ext4_fc_replay_cleanup(struct super_block * sb)1903 void ext4_fc_replay_cleanup(struct super_block *sb)
1904 {
1905 struct ext4_sb_info *sbi = EXT4_SB(sb);
1906
1907 sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1908 kfree(sbi->s_fc_replay_state.fc_regions);
1909 kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1910 }
1911
1912 /*
1913 * Recovery Scan phase handler
1914 *
1915 * This function is called during the scan phase and is responsible
1916 * for doing following things:
1917 * - Make sure the fast commit area has valid tags for replay
1918 * - Count number of tags that need to be replayed by the replay handler
1919 * - Verify CRC
1920 * - Create a list of excluded blocks for allocation during replay phase
1921 *
1922 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1923 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1924 * to indicate that scan has finished and JBD2 can now start replay phase.
1925 * It returns a negative error to indicate that there was an error. At the end
1926 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1927 * to indicate the number of tags that need to replayed during the replay phase.
1928 */
ext4_fc_replay_scan(journal_t * journal,struct buffer_head * bh,int off,tid_t expected_tid)1929 static int ext4_fc_replay_scan(journal_t *journal,
1930 struct buffer_head *bh, int off,
1931 tid_t expected_tid)
1932 {
1933 struct super_block *sb = journal->j_private;
1934 struct ext4_sb_info *sbi = EXT4_SB(sb);
1935 struct ext4_fc_replay_state *state;
1936 int ret = JBD2_FC_REPLAY_CONTINUE;
1937 struct ext4_fc_add_range ext;
1938 struct ext4_fc_tl tl;
1939 struct ext4_fc_tail tail;
1940 __u8 *start, *end, *cur, *val;
1941 struct ext4_fc_head head;
1942 struct ext4_extent *ex;
1943
1944 state = &sbi->s_fc_replay_state;
1945
1946 start = (u8 *)bh->b_data;
1947 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1948
1949 if (state->fc_replay_expected_off == 0) {
1950 state->fc_cur_tag = 0;
1951 state->fc_replay_num_tags = 0;
1952 state->fc_crc = 0;
1953 state->fc_regions = NULL;
1954 state->fc_regions_valid = state->fc_regions_used =
1955 state->fc_regions_size = 0;
1956 /* Check if we can stop early */
1957 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1958 != EXT4_FC_TAG_HEAD)
1959 return 0;
1960 }
1961
1962 if (off != state->fc_replay_expected_off) {
1963 ret = -EFSCORRUPTED;
1964 goto out_err;
1965 }
1966
1967 state->fc_replay_expected_off++;
1968 for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1969 memcpy(&tl, cur, sizeof(tl));
1970 val = cur + sizeof(tl);
1971 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1972 tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1973 switch (le16_to_cpu(tl.fc_tag)) {
1974 case EXT4_FC_TAG_ADD_RANGE:
1975 memcpy(&ext, val, sizeof(ext));
1976 ex = (struct ext4_extent *)&ext.fc_ex;
1977 ret = ext4_fc_record_regions(sb,
1978 le32_to_cpu(ext.fc_ino),
1979 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1980 ext4_ext_get_actual_len(ex));
1981 if (ret < 0)
1982 break;
1983 ret = JBD2_FC_REPLAY_CONTINUE;
1984 fallthrough;
1985 case EXT4_FC_TAG_DEL_RANGE:
1986 case EXT4_FC_TAG_LINK:
1987 case EXT4_FC_TAG_UNLINK:
1988 case EXT4_FC_TAG_CREAT:
1989 case EXT4_FC_TAG_INODE:
1990 case EXT4_FC_TAG_PAD:
1991 state->fc_cur_tag++;
1992 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1993 sizeof(tl) + le16_to_cpu(tl.fc_len));
1994 break;
1995 case EXT4_FC_TAG_TAIL:
1996 state->fc_cur_tag++;
1997 memcpy(&tail, val, sizeof(tail));
1998 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1999 sizeof(tl) +
2000 offsetof(struct ext4_fc_tail,
2001 fc_crc));
2002 if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2003 le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2004 state->fc_replay_num_tags = state->fc_cur_tag;
2005 state->fc_regions_valid =
2006 state->fc_regions_used;
2007 } else {
2008 ret = state->fc_replay_num_tags ?
2009 JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2010 }
2011 state->fc_crc = 0;
2012 break;
2013 case EXT4_FC_TAG_HEAD:
2014 memcpy(&head, val, sizeof(head));
2015 if (le32_to_cpu(head.fc_features) &
2016 ~EXT4_FC_SUPPORTED_FEATURES) {
2017 ret = -EOPNOTSUPP;
2018 break;
2019 }
2020 if (le32_to_cpu(head.fc_tid) != expected_tid) {
2021 ret = JBD2_FC_REPLAY_STOP;
2022 break;
2023 }
2024 state->fc_cur_tag++;
2025 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2026 sizeof(tl) + le16_to_cpu(tl.fc_len));
2027 break;
2028 default:
2029 ret = state->fc_replay_num_tags ?
2030 JBD2_FC_REPLAY_STOP : -ECANCELED;
2031 }
2032 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2033 break;
2034 }
2035
2036 out_err:
2037 trace_ext4_fc_replay_scan(sb, ret, off);
2038 return ret;
2039 }
2040
2041 /*
2042 * Main recovery path entry point.
2043 * The meaning of return codes is similar as above.
2044 */
ext4_fc_replay(journal_t * journal,struct buffer_head * bh,enum passtype pass,int off,tid_t expected_tid)2045 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2046 enum passtype pass, int off, tid_t expected_tid)
2047 {
2048 struct super_block *sb = journal->j_private;
2049 struct ext4_sb_info *sbi = EXT4_SB(sb);
2050 struct ext4_fc_tl tl;
2051 __u8 *start, *end, *cur, *val;
2052 int ret = JBD2_FC_REPLAY_CONTINUE;
2053 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2054 struct ext4_fc_tail tail;
2055
2056 if (pass == PASS_SCAN) {
2057 state->fc_current_pass = PASS_SCAN;
2058 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2059 }
2060
2061 if (state->fc_current_pass != pass) {
2062 state->fc_current_pass = pass;
2063 sbi->s_mount_state |= EXT4_FC_REPLAY;
2064 }
2065 if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2066 jbd_debug(1, "Replay stops\n");
2067 ext4_fc_set_bitmaps_and_counters(sb);
2068 return 0;
2069 }
2070
2071 #ifdef CONFIG_EXT4_DEBUG
2072 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2073 pr_warn("Dropping fc block %d because max_replay set\n", off);
2074 return JBD2_FC_REPLAY_STOP;
2075 }
2076 #endif
2077
2078 start = (u8 *)bh->b_data;
2079 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2080
2081 for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2082 memcpy(&tl, cur, sizeof(tl));
2083 val = cur + sizeof(tl);
2084
2085 if (state->fc_replay_num_tags == 0) {
2086 ret = JBD2_FC_REPLAY_STOP;
2087 ext4_fc_set_bitmaps_and_counters(sb);
2088 break;
2089 }
2090 jbd_debug(3, "Replay phase, tag:%s\n",
2091 tag2str(le16_to_cpu(tl.fc_tag)));
2092 state->fc_replay_num_tags--;
2093 switch (le16_to_cpu(tl.fc_tag)) {
2094 case EXT4_FC_TAG_LINK:
2095 ret = ext4_fc_replay_link(sb, &tl, val);
2096 break;
2097 case EXT4_FC_TAG_UNLINK:
2098 ret = ext4_fc_replay_unlink(sb, &tl, val);
2099 break;
2100 case EXT4_FC_TAG_ADD_RANGE:
2101 ret = ext4_fc_replay_add_range(sb, &tl, val);
2102 break;
2103 case EXT4_FC_TAG_CREAT:
2104 ret = ext4_fc_replay_create(sb, &tl, val);
2105 break;
2106 case EXT4_FC_TAG_DEL_RANGE:
2107 ret = ext4_fc_replay_del_range(sb, &tl, val);
2108 break;
2109 case EXT4_FC_TAG_INODE:
2110 ret = ext4_fc_replay_inode(sb, &tl, val);
2111 break;
2112 case EXT4_FC_TAG_PAD:
2113 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2114 le16_to_cpu(tl.fc_len), 0);
2115 break;
2116 case EXT4_FC_TAG_TAIL:
2117 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2118 le16_to_cpu(tl.fc_len), 0);
2119 memcpy(&tail, val, sizeof(tail));
2120 WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2121 break;
2122 case EXT4_FC_TAG_HEAD:
2123 break;
2124 default:
2125 trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2126 le16_to_cpu(tl.fc_len), 0);
2127 ret = -ECANCELED;
2128 break;
2129 }
2130 if (ret < 0)
2131 break;
2132 ret = JBD2_FC_REPLAY_CONTINUE;
2133 }
2134 return ret;
2135 }
2136
/*
 * Install the fast commit callbacks on @journal.
 */
void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * The replay callback is set even if fast commit is disabled, because
	 * there may still be fast commit blocks that need to be replayed even
	 * though fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;

	/* The cleanup callback is only needed when fast commit is enabled. */
	if (test_opt2(sb, JOURNAL_FAST_COMMIT))
		journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}
2149
/*
 * Human-readable descriptions printed by ext4_fc_info_show(), which indexes
 * this table with values 0 .. EXT4_FC_REASON_MAX - 1.
 * NOTE(review): the order presumably mirrors the EXT4_FC_REASON_* enum -
 * keep the two in sync when adding a reason.
 *
 * Both the strings and the pointer array are const so the whole table can
 * live in read-only data.
 */
static const char * const fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2162
ext4_fc_info_show(struct seq_file * seq,void * v)2163 int ext4_fc_info_show(struct seq_file *seq, void *v)
2164 {
2165 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2166 struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2167 int i;
2168
2169 if (v != SEQ_START_TOKEN)
2170 return 0;
2171
2172 seq_printf(seq,
2173 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2174 stats->fc_num_commits, stats->fc_ineligible_commits,
2175 stats->fc_numblks,
2176 div_u64(sbi->s_fc_avg_commit_time, 1000));
2177 seq_puts(seq, "Ineligible reasons:\n");
2178 for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2179 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2180 stats->fc_ineligible_reason_count[i]);
2181
2182 return 0;
2183 }
2184
ext4_fc_init_dentry_cache(void)2185 int __init ext4_fc_init_dentry_cache(void)
2186 {
2187 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2188 SLAB_RECLAIM_ACCOUNT);
2189
2190 if (ext4_fc_dentry_cachep == NULL)
2191 return -ENOMEM;
2192
2193 return 0;
2194 }
2195