From f8e566c0f5e1fd8de33ccec6eb1ff815cd4b0dc3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 24 Mar 2020 23:03:24 -0700 Subject: xfs: validate the realtime geometry in xfs_validate_sb_common Validate the geometry of the realtime geometry when we mount the filesystem, so that we don't abruptly shut down the filesystem later on. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_sb.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 00266de58954..c526c5e5ab76 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -328,6 +328,38 @@ xfs_validate_sb_common( return -EFSCORRUPTED; } + /* Validate the realtime geometry; stolen from xfs_repair */ + if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || + sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) { + xfs_notice(mp, + "realtime extent sanity check failed"); + return -EFSCORRUPTED; + } + + if (sbp->sb_rblocks == 0) { + if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || + sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) { + xfs_notice(mp, + "realtime zeroed geometry check failed"); + return -EFSCORRUPTED; + } + } else { + uint64_t rexts; + uint64_t rbmblocks; + + rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize); + rbmblocks = howmany_64(sbp->sb_rextents, + NBBY * sbp->sb_blocksize); + + if (sbp->sb_rextents != rexts || + sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) || + sbp->sb_rbmblocks != rbmblocks) { + xfs_notice(mp, + "realtime geometry sanity check failed"); + return -EFSCORRUPTED; + } + } + if (sbp->sb_unit) { if (!xfs_sb_version_hasdalign(sbp) || sbp->sb_unit > sbp->sb_width || -- cgit v1.2.3 From 7ec949212dba350f1dbc339d2db844db68b39725 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 25 Mar 2020 18:18:20 -0700 Subject: xfs: don't try to write a start record into every iclog The xlog_write() function iterates over iclogs until it completes writing all the log vectors passed in. The ticket tracks whether a start record has been written or not, so only the first iclog gets a start record. We only ever pass single use tickets to xlog_write() so we only ever need to write a start record once per xlog_write() call. Hence we don't need to store whether we should write a start record in the ticket as the callers provide all the information we need to determine if a start record should be written. For the moment, we have to ensure that we clear the XLOG_TIC_INITED appropriately so the code in xfs_log_done() still works correctly for committing transactions. (darrick: Note the slight behavior change that we always deduct the size of the op header from the ticket, even for unmount records) Signed-off-by: Dave Chinner [hch: pass an explicit need_start_rec argument] Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_log.c | 77 +++++++++++++++++++++++++-------------------------- fs/xfs/xfs_log_cil.c | 2 +- fs/xfs/xfs_log_priv.h | 12 +++----- 3 files changed, 42 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 4a53768c5397..33f089aa2a25 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -921,7 +921,7 @@ xfs_log_write_unmount_record( /* remove inited flag, and account for space used */ tic->t_flags = 0; tic->t_curr_res -= sizeof(magic); - error = xlog_write(log, &vec, tic, &lsn, NULL, flags); + error = xlog_write(log, &vec, tic, &lsn, NULL, flags, false); /* * At this point, we're umounting anyway, so there's no point in * transitioning log state to IOERROR. Just continue... @@ -1541,7 +1541,7 @@ xlog_commit_record( ASSERT_ALWAYS(iclog); error = xlog_write(log, &vec, ticket, commitlsnp, iclog, - XLOG_COMMIT_TRANS); + XLOG_COMMIT_TRANS, false); if (error) xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); return error; @@ -2118,23 +2118,21 @@ xlog_print_trans( } /* - * Calculate the potential space needed by the log vector. Each region gets - * its own xlog_op_header_t and may need to be double word aligned. + * Calculate the potential space needed by the log vector. We may need a start + * record, and each region gets its own struct xlog_op_header and may need to be + * double word aligned. */ static int xlog_write_calc_vec_length( struct xlog_ticket *ticket, - struct xfs_log_vec *log_vector) + struct xfs_log_vec *log_vector, + bool need_start_rec) { struct xfs_log_vec *lv; - int headers = 0; + int headers = need_start_rec ? 1 : 0; int len = 0; int i; - /* acct for start rec of xact */ - if (ticket->t_flags & XLOG_TIC_INITED) - headers++; - for (lv = log_vector; lv; lv = lv->lv_next) { /* we don't write ordered log vectors */ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) @@ -2156,27 +2154,16 @@ xlog_write_calc_vec_length( return len; } -/* - * If first write for transaction, insert start record We can't be trying to - * commit if we are inited. We can't have any "partial_copy" if we are inited. - */ -static int +static void xlog_write_start_rec( struct xlog_op_header *ophdr, struct xlog_ticket *ticket) { - if (!(ticket->t_flags & XLOG_TIC_INITED)) - return 0; - ophdr->oh_tid = cpu_to_be32(ticket->t_tid); ophdr->oh_clientid = ticket->t_clientid; ophdr->oh_len = 0; ophdr->oh_flags = XLOG_START_TRANS; ophdr->oh_res2 = 0; - - ticket->t_flags &= ~XLOG_TIC_INITED; - - return sizeof(struct xlog_op_header); } static xlog_op_header_t * @@ -2365,7 +2352,8 @@ xlog_write( struct xlog_ticket *ticket, xfs_lsn_t *start_lsn, struct xlog_in_core **commit_iclog, - uint flags) + uint flags, + bool need_start_rec) { struct xlog_in_core *iclog = NULL; struct xfs_log_iovec *vecp; @@ -2381,23 +2369,22 @@ xlog_write( *start_lsn = 0; - len = xlog_write_calc_vec_length(ticket, log_vector); /* - * Region headers and bytes are already accounted for. - * We only need to take into account start records and - * split regions in this function. + * Region headers and bytes are already accounted for. We only need to + * take into account start records and split regions in this function. */ - if (ticket->t_flags & XLOG_TIC_INITED) - ticket->t_curr_res -= sizeof(xlog_op_header_t); + if (ticket->t_flags & XLOG_TIC_INITED) { + ticket->t_curr_res -= sizeof(struct xlog_op_header); + ticket->t_flags &= ~XLOG_TIC_INITED; + } /* - * Commit record headers need to be accounted for. These - * come in as separate writes so are easy to detect. 
+ * Commit record headers and unmount records need to be accounted for. + * These come in as separate writes so are easy to detect. */ - if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - + if (!need_start_rec) + ticket->t_curr_res -= sizeof(struct xlog_op_header); if (ticket->t_curr_res < 0) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. Need to up reservation"); @@ -2405,6 +2392,8 @@ xlog_write( xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); } + len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec); + index = 0; lv = log_vector; vecp = lv->lv_iovecp; @@ -2431,7 +2420,6 @@ xlog_write( while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { struct xfs_log_iovec *reg; struct xlog_op_header *ophdr; - int start_rec_copy; int copy_len; int copy_off; bool ordered = false; @@ -2447,11 +2435,15 @@ xlog_write( ASSERT(reg->i_len % sizeof(int32_t) == 0); ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); - start_rec_copy = xlog_write_start_rec(ptr, ticket); - if (start_rec_copy) { - record_cnt++; + /* + * Before we start formatting log vectors, we need to + * write a start record. Only do this for the first + * iclog we write to. + */ + if (need_start_rec) { + xlog_write_start_rec(ptr, ticket); xlog_write_adv_cnt(&ptr, &len, &log_offset, - start_rec_copy); + sizeof(struct xlog_op_header)); } ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); @@ -2483,8 +2475,13 @@ xlog_write( xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); } - copy_len += start_rec_copy + sizeof(xlog_op_header_t); + copy_len += sizeof(struct xlog_op_header); record_cnt++; + if (need_start_rec) { + copy_len += sizeof(struct xlog_op_header); + record_cnt++; + need_start_rec = false; + } data_cnt += contwr ? copy_len : 0; error = xlog_write_copy_finish(log, iclog, flags, diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 64cc0bf2ab3b..e0aeb316ce6c 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -801,7 +801,7 @@ xlog_cil_push_work( lvhdr.lv_iovecp = &lhdr; lvhdr.lv_next = ctx->lv_chain; - error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true); if (error) goto out_abort_free_ticket; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 2b0aec37e73e..b895e16460ee 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -439,14 +439,10 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); -int -xlog_write( - struct xlog *log, - struct xfs_log_vec *log_vector, - struct xlog_ticket *tic, - xfs_lsn_t *start_lsn, - struct xlog_in_core **commit_iclog, - uint flags); +int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, uint flags, + bool need_start_rec); /* * When we crack an atomic LSN, we sample it first so that the value will not -- cgit v1.2.3 From 9590e9c68449055c97e954118699bc9d470d7e30 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 25 Mar 2020 18:18:21 -0700 Subject: xfs: re-order initial space accounting checks in xlog_write Commit and unmount records records do not need start records to be written, so rearrange the logic in xlog_write() to remove the need to check for XLOG_TIC_INITED to determine if we should account for the space used by a start record. 
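With the start record handling made explicit, only the CIL checkpoint push ever
asks for one. As a simplified illustration, the surviving xlog_write() call
patterns look roughly like this (call sites abbreviated from the hunks elsewhere
in this series, not a complete listing):

	/* CIL checkpoint push: the one caller that needs a start record */
	error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true);

	/* commit record: no start record, its op header is still charged up front */
	error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS, false);

	/* unmount record: likewise, only the record's own op header is needed */
	return xlog_write(log, &vec, ticket, lsn, NULL, flags, false);
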
Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 33f089aa2a25..3a4fe4e2e736 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2356,10 +2356,10 @@ xlog_write( bool need_start_rec) { struct xlog_in_core *iclog = NULL; - struct xfs_log_iovec *vecp; - struct xfs_log_vec *lv; + struct xfs_log_vec *lv = log_vector; + struct xfs_log_iovec *vecp = lv->lv_iovecp; + int index = 0; int len; - int index; int partial_copy = 0; int partial_copy_len = 0; int contwr = 0; @@ -2367,24 +2367,16 @@ xlog_write( int data_cnt = 0; int error = 0; - *start_lsn = 0; - - /* - * Region headers and bytes are already accounted for. We only need to - * take into account start records and split regions in this function. + * If this is a commit or unmount transaction, we don't need a start + * record to be written. We do, however, have to account for the + * commit or unmount header that gets written. Hence we always have + * to account for an extra xlog_op_header here. */ - if (ticket->t_flags & XLOG_TIC_INITED) { - ticket->t_curr_res -= sizeof(struct xlog_op_header); + ticket->t_curr_res -= sizeof(struct xlog_op_header); + if (ticket->t_flags & XLOG_TIC_INITED) ticket->t_flags &= ~XLOG_TIC_INITED; - } - /* - * Commit record headers and unmount records need to be accounted for. - * These come in as separate writes so are easy to detect. - */ - if (!need_start_rec) - ticket->t_curr_res -= sizeof(struct xlog_op_header); if (ticket->t_curr_res < 0) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. Need to up reservation"); @@ -2393,10 +2385,7 @@ xlog_write( } len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec); - - index = 0; - lv = log_vector; - vecp = lv->lv_iovecp; + *start_lsn = 0; while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; -- cgit v1.2.3 From dd401770b0ff68f896002649c593bbb9560f916d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 25 Mar 2020 18:18:21 -0700 Subject: xfs: refactor and split xfs_log_done() xfs_log_done() does two separate things. Firstly, it triggers commit records to be written for permanent transactions, and secondly it releases or regrants transaction reservation space. Since delayed logging was introduced, transactions no longer write directly to the log, hence they never have the XLOG_TIC_INITED flag cleared on them. Hence transactions never write commit records to the log and only need to modify reservation space. Split up xfs_log_done into two parts, and only call the parts of the operation needed for the context xfs_log_done() is currently being called from. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_log.c | 64 +++++++++++++++++---------------------------------- fs/xfs/xfs_log.h | 4 ---- fs/xfs/xfs_log_cil.c | 13 ++++++----- fs/xfs/xfs_log_priv.h | 4 ++++ fs/xfs/xfs_trans.c | 24 +++++++++---------- 5 files changed, 44 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3a4fe4e2e736..910bcff6b50a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -487,62 +487,40 @@ out_error: */ /* - * This routine is called when a user of a log manager ticket is done with - * the reservation. If the ticket was ever used, then a commit record for - * the associated transaction is written out as a log operation header with - * no data. The flag XLOG_TIC_INITED is set when the first write occurs with - * a given ticket. If the ticket was one with a permanent reservation, then - * a few operations are done differently. Permanent reservation tickets by - * default don't release the reservation. They just commit the current - * transaction with the belief that the reservation is still needed. A flag - * must be passed in before permanent reservations are actually released. - * When these type of tickets are not released, they need to be set into - * the inited state again. By doing this, a start record will be written - * out when the next write occurs. + * Write a commit record to the log to close off a running log write. */ -xfs_lsn_t -xfs_log_done( - struct xfs_mount *mp, +int +xlog_write_done( + struct xlog *log, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - bool regrant) + xfs_lsn_t *lsn) { - struct xlog *log = mp->m_log; - xfs_lsn_t lsn = 0; - - if (XLOG_FORCED_SHUTDOWN(log) || - /* - * If nothing was ever written, don't write out commit record. - * If we get an error, just continue and give back the log ticket. - */ - (((ticket->t_flags & XLOG_TIC_INITED) == 0) && - (xlog_commit_record(log, ticket, iclog, &lsn)))) { - lsn = (xfs_lsn_t) -1; - regrant = false; - } + if (XLOG_FORCED_SHUTDOWN(log)) + return -EIO; + return xlog_commit_record(log, ticket, iclog, lsn); +} - if (!regrant) { +/* + * Release or regrant the ticket reservation now the transaction is done with + * it depending on caller context. Rolling transactions need the ticket + * regranted, otherwise we release it completely. + */ +void +xlog_ticket_done( + struct xlog *log, + struct xlog_ticket *ticket, + bool regrant) +{ + if (!regrant || XLOG_FORCED_SHUTDOWN(log)) { trace_xfs_log_done_nonperm(log, ticket); - - /* - * Release ticket if not permanent reservation or a specific - * request has been made to release a permanent reservation. - */ xlog_ungrant_log_space(log, ticket); } else { trace_xfs_log_done_perm(log, ticket); - xlog_regrant_reserve_log_space(log, ticket); - /* If this ticket was a permanent reservation and we aren't - * trying to release it, reset the inited flags; so next time - * we write, a start record will be written out. 
- */ - ticket->t_flags |= XLOG_TIC_INITED; } - xfs_log_ticket_put(ticket); - return lsn; } static bool diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index cc77cc36560a..1412d6993f1e 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -105,10 +105,6 @@ struct xfs_log_item; struct xfs_item_ops; struct xfs_trans; -xfs_lsn_t xfs_log_done(struct xfs_mount *mp, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - bool regrant); int xfs_log_force(struct xfs_mount *mp, uint flags); int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags, int *log_forced); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index e0aeb316ce6c..666041ef058f 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -839,10 +839,11 @@ restart: } spin_unlock(&cil->xc_push_lock); - /* xfs_log_done always frees the ticket on error. */ - commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false); - if (commit_lsn == -1) - goto out_abort; + error = xlog_write_done(log, tic, &commit_iclog, &commit_lsn); + if (error) + goto out_abort_free_ticket; + + xlog_ticket_done(log, tic, false); spin_lock(&commit_iclog->ic_callback_lock); if (commit_iclog->ic_state == XLOG_STATE_IOERROR) { @@ -875,7 +876,7 @@ out_skip: return; out_abort_free_ticket: - xfs_log_ticket_put(tic); + xlog_ticket_done(log, tic, false); out_abort: ASSERT(XLOG_FORCED_SHUTDOWN(log)); xlog_cil_committed(ctx); @@ -1007,7 +1008,7 @@ xfs_log_commit_cil( if (commit_lsn) *commit_lsn = xc_commit_lsn; - xfs_log_done(mp, tp->t_ticket, NULL, regrant); + xlog_ticket_done(log, tp->t_ticket, regrant); tp->t_ticket = NULL; xfs_trans_unreserve_and_mod_sb(tp); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index b895e16460ee..1f450ea6192c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -443,6 +443,10 @@ int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector, struct xlog_ticket *tic, xfs_lsn_t *start_lsn, struct xlog_in_core **commit_iclog, uint flags, bool need_start_rec); +int xlog_write_done(struct xlog *log, struct xlog_ticket *ticket, + struct xlog_in_core **iclog, xfs_lsn_t *lsn); +void xlog_ticket_done(struct xlog *log, struct xlog_ticket *ticket, + bool regrant); /* * When we crack an atomic LSN, we sample it first so that the value will not diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 1adc6bc53a56..e20c759f4884 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -9,6 +9,7 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" +#include "xfs_log_priv.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_extent_busy.h" @@ -150,8 +151,9 @@ xfs_trans_reserve( uint blocks, uint rtextents) { - int error = 0; - bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + struct xfs_mount *mp = tp->t_mountp; + int error = 0; + bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; /* Mark this thread as being in a transaction */ current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); @@ -162,7 +164,7 @@ xfs_trans_reserve( * fail if the count would go below zero. 
*/ if (blocks > 0) { - error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); + error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd); if (error != 0) { current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return -ENOSPC; @@ -191,9 +193,9 @@ xfs_trans_reserve( if (tp->t_ticket != NULL) { ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES); - error = xfs_log_regrant(tp->t_mountp, tp->t_ticket); + error = xfs_log_regrant(mp, tp->t_ticket); } else { - error = xfs_log_reserve(tp->t_mountp, + error = xfs_log_reserve(mp, resp->tr_logres, resp->tr_logcount, &tp->t_ticket, XFS_TRANSACTION, @@ -213,7 +215,7 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (rtextents > 0) { - error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents)); + error = xfs_mod_frextents(mp, -((int64_t)rtextents)); if (error) { error = -ENOSPC; goto undo_log; @@ -229,7 +231,7 @@ xfs_trans_reserve( */ undo_log: if (resp->tr_logres > 0) { - xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false); + xlog_ticket_done(mp->m_log, tp->t_ticket, false); tp->t_ticket = NULL; tp->t_log_res = 0; tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; @@ -237,7 +239,7 @@ undo_log: undo_blocks: if (blocks > 0) { - xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd); + xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd); tp->t_blk_res = 0; } @@ -1004,9 +1006,7 @@ out_unreserve: */ xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant); - if (commit_lsn == -1 && !error) - error = -EIO; + xlog_ticket_done(mp->m_log, tp->t_ticket, regrant); tp->t_ticket = NULL; } current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); @@ -1065,7 +1065,7 @@ xfs_trans_cancel( xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - xfs_log_done(mp, tp->t_ticket, NULL, false); + xlog_ticket_done(mp->m_log, tp->t_ticket, false); tp->t_ticket = NULL; } -- cgit v1.2.3 From 70e42f2d4797d4d3f09bc0f6df57e8b8c5597e27 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 25 Mar 2020 18:18:22 -0700 Subject: xfs: kill XLOG_TIC_INITED It is not longer used or checked by anything, so remove the last traces from the log ticket code. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 4 ---- fs/xfs/xfs_log_priv.h | 6 ++---- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 910bcff6b50a..3ccc3ca3c701 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2352,9 +2352,6 @@ xlog_write( * to account for an extra xlog_op_header here. */ ticket->t_curr_res -= sizeof(struct xlog_op_header); - if (ticket->t_flags & XLOG_TIC_INITED) - ticket->t_flags &= ~XLOG_TIC_INITED; - if (ticket->t_curr_res < 0) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. 
Need to up reservation"); @@ -3493,7 +3490,6 @@ xlog_ticket_alloc( tic->t_ocnt = cnt; tic->t_tid = prandom_u32(); tic->t_clientid = client; - tic->t_flags = XLOG_TIC_INITED; if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 1f450ea6192c..fd63400fff6d 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -51,13 +51,11 @@ enum xlog_iclog_state { }; /* - * Flags to log ticket + * Log ticket flags */ -#define XLOG_TIC_INITED 0x1 /* has been initialized */ -#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ +#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */ #define XLOG_TIC_FLAGS \ - { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } /* -- cgit v1.2.3 From 8b41e3f98e6ca17ed54615bb7a419c499d370a85 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 18:18:23 -0700 Subject: xfs: split xlog_ticket_done Remove xlog_ticket_done and just call the renamed low-level helpers for ungranting or regranting log space directly. To make that a little the reference put on the ticket and all tracing is moved into the actual helpers. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 84 +++++++++++++++++---------------------------------- fs/xfs/xfs_log_cil.c | 9 ++++-- fs/xfs/xfs_log_priv.h | 4 +-- fs/xfs/xfs_trace.h | 14 ++++----- fs/xfs/xfs_trans.c | 9 ++++-- 5 files changed, 47 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3ccc3ca3c701..a22962e35beb 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -66,14 +66,6 @@ xlog_grant_push_ail( struct xlog *log, int need_bytes); STATIC void -xlog_regrant_reserve_log_space( - struct xlog *log, - struct xlog_ticket *ticket); -STATIC void -xlog_ungrant_log_space( - struct xlog *log, - struct xlog_ticket *ticket); -STATIC void xlog_sync( struct xlog *log, struct xlog_in_core *iclog); @@ -502,27 +494,6 @@ xlog_write_done( return xlog_commit_record(log, ticket, iclog, lsn); } -/* - * Release or regrant the ticket reservation now the transaction is done with - * it depending on caller context. Rolling transactions need the ticket - * regranted, otherwise we release it completely. - */ -void -xlog_ticket_done( - struct xlog *log, - struct xlog_ticket *ticket, - bool regrant) -{ - if (!regrant || XLOG_FORCED_SHUTDOWN(log)) { - trace_xfs_log_done_nonperm(log, ticket); - xlog_ungrant_log_space(log, ticket); - } else { - trace_xfs_log_done_perm(log, ticket); - xlog_regrant_reserve_log_space(log, ticket); - } - xfs_log_ticket_put(ticket); -} - static bool __xlog_state_release_iclog( struct xlog *log, @@ -921,8 +892,7 @@ out_err: if (tic) { trace_xfs_log_umount_write(log, tic); - xlog_ungrant_log_space(log, tic); - xfs_log_ticket_put(tic); + xfs_log_ticket_ungrant(log, tic); } } @@ -2992,19 +2962,18 @@ restart: return 0; } /* xlog_state_get_iclog_space */ -/* The first cnt-1 times through here we don't need to - * move the grant write head because the permanent - * reservation has reserved cnt times the unit amount. - * Release part of current permanent unit reservation and - * reset current reservation to be one units worth. Also - * move grant reservation head forward. +/* + * The first cnt-1 times through here we don't need to move the grant write head + * because the permanent reservation has reserved cnt times the unit amount. 
+ * Release part of current permanent unit reservation and reset current + * reservation to be one units worth. Also move grant reservation head forward. */ -STATIC void -xlog_regrant_reserve_log_space( +void +xfs_log_ticket_regrant( struct xlog *log, struct xlog_ticket *ticket) { - trace_xfs_log_regrant_reserve_enter(log, ticket); + trace_xfs_log_ticket_regrant(log, ticket); if (ticket->t_cnt > 0) ticket->t_cnt--; @@ -3016,21 +2985,20 @@ xlog_regrant_reserve_log_space( ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); - trace_xfs_log_regrant_reserve_sub(log, ticket); + trace_xfs_log_ticket_regrant_sub(log, ticket); /* just return if we still have some of the pre-reserved space */ - if (ticket->t_cnt > 0) - return; + if (!ticket->t_cnt) { + xlog_grant_add_space(log, &log->l_reserve_head.grant, + ticket->t_unit_res); + trace_xfs_log_ticket_regrant_exit(log, ticket); - xlog_grant_add_space(log, &log->l_reserve_head.grant, - ticket->t_unit_res); - - trace_xfs_log_regrant_reserve_exit(log, ticket); - - ticket->t_curr_res = ticket->t_unit_res; - xlog_tic_reset_res(ticket); -} /* xlog_regrant_reserve_log_space */ + ticket->t_curr_res = ticket->t_unit_res; + xlog_tic_reset_res(ticket); + } + xfs_log_ticket_put(ticket); +} /* * Give back the space left from a reservation. @@ -3046,18 +3014,19 @@ xlog_regrant_reserve_log_space( * space, the count will stay at zero and the only space remaining will be * in the current reservation field. */ -STATIC void -xlog_ungrant_log_space( +void +xfs_log_ticket_ungrant( struct xlog *log, struct xlog_ticket *ticket) { - int bytes; + int bytes; + + trace_xfs_log_ticket_ungrant(log, ticket); if (ticket->t_cnt > 0) ticket->t_cnt--; - trace_xfs_log_ungrant_enter(log, ticket); - trace_xfs_log_ungrant_sub(log, ticket); + trace_xfs_log_ticket_ungrant_sub(log, ticket); /* * If this is a permanent reservation ticket, we may be able to free @@ -3072,9 +3041,10 @@ xlog_ungrant_log_space( xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes); xlog_grant_sub_space(log, &log->l_write_head.grant, bytes); - trace_xfs_log_ungrant_exit(log, ticket); + trace_xfs_log_ticket_ungrant_exit(log, ticket); xfs_log_space_wake(log->l_mp); + xfs_log_ticket_put(ticket); } /* diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 666041ef058f..0ae187fa9af2 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -843,7 +843,7 @@ restart: if (error) goto out_abort_free_ticket; - xlog_ticket_done(log, tic, false); + xfs_log_ticket_ungrant(log, tic); spin_lock(&commit_iclog->ic_callback_lock); if (commit_iclog->ic_state == XLOG_STATE_IOERROR) { @@ -876,7 +876,7 @@ out_skip: return; out_abort_free_ticket: - xlog_ticket_done(log, tic, false); + xfs_log_ticket_ungrant(log, tic); out_abort: ASSERT(XLOG_FORCED_SHUTDOWN(log)); xlog_cil_committed(ctx); @@ -1008,7 +1008,10 @@ xfs_log_commit_cil( if (commit_lsn) *commit_lsn = xc_commit_lsn; - xlog_ticket_done(log, tp->t_ticket, regrant); + if (regrant && !XLOG_FORCED_SHUTDOWN(log)) + xfs_log_ticket_regrant(log, tp->t_ticket); + else + xfs_log_ticket_ungrant(log, tp->t_ticket); tp->t_ticket = NULL; xfs_trans_unreserve_and_mod_sb(tp); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index fd63400fff6d..0941b465de9e 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -443,8 +443,8 @@ int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector, bool need_start_rec); int xlog_write_done(struct xlog *log, struct xlog_ticket *ticket, struct xlog_in_core **iclog, xfs_lsn_t *lsn); -void 
xlog_ticket_done(struct xlog *log, struct xlog_ticket *ticket, - bool regrant); +void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); +void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); /* * When we crack an atomic LSN, we sample it first so that the value will not diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index efc7751550d9..f7f12d312fd3 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1001,8 +1001,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, DEFINE_EVENT(xfs_loggrant_class, name, \ TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \ TP_ARGS(log, tic)) -DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); -DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); @@ -1011,12 +1009,12 @@ DEFINE_LOGGRANT_EVENT(xfs_log_reserve); DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); -DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); -DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit); DECLARE_EVENT_CLASS(xfs_log_item_class, TP_PROTO(struct xfs_log_item *lip), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index e20c759f4884..28b983ff8b11 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -231,7 +231,7 @@ xfs_trans_reserve( */ undo_log: if (resp->tr_logres > 0) { - xlog_ticket_done(mp->m_log, tp->t_ticket, false); + xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); tp->t_ticket = NULL; tp->t_log_res = 0; tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; @@ -1006,7 +1006,10 @@ out_unreserve: */ xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - xlog_ticket_done(mp->m_log, tp->t_ticket, regrant); + if (regrant && !XLOG_FORCED_SHUTDOWN(mp->m_log)) + xfs_log_ticket_regrant(mp->m_log, tp->t_ticket); + else + xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); tp->t_ticket = NULL; } current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); @@ -1065,7 +1068,7 @@ xfs_trans_cancel( xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - xlog_ticket_done(mp->m_log, tp->t_ticket, false); + xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); tp->t_ticket = NULL; } -- cgit v1.2.3 From f10e925def9a6d916b291e8c1e704df5a2976f8a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 25 Mar 2020 18:18:23 -0700 Subject: xfs: merge xlog_commit_record with xlog_write_done xlog_write_done() is just a thin wrapper around xlog_commit_record(), so they can be merged together easily. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_log.c | 43 ++++++++++--------------------------------- fs/xfs/xfs_log_cil.c | 2 +- fs/xfs/xfs_log_priv.h | 2 +- 3 files changed, 12 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index a22962e35beb..ccbc9d8e6e3c 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -24,13 +24,6 @@ kmem_zone_t *xfs_log_ticket_zone; /* Local miscellaneous function prototypes */ -STATIC int -xlog_commit_record( - struct xlog *log, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - xfs_lsn_t *commitlsnp); - STATIC struct xlog * xlog_alloc_log( struct xfs_mount *mp, @@ -478,22 +471,6 @@ out_error: * marked as with WANT_SYNC. */ -/* - * Write a commit record to the log to close off a running log write. - */ -int -xlog_write_done( - struct xlog *log, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - xfs_lsn_t *lsn) -{ - if (XLOG_FORCED_SHUTDOWN(log)) - return -EIO; - - return xlog_commit_record(log, ticket, iclog, lsn); -} - static bool __xlog_state_release_iclog( struct xlog *log, @@ -1463,20 +1440,17 @@ out: return ERR_PTR(error); } /* xlog_alloc_log */ - /* * Write out the commit record of a transaction associated with the given - * ticket. Return the lsn of the commit record. + * ticket to close off a running log write. Return the lsn of the commit record. */ -STATIC int +int xlog_commit_record( struct xlog *log, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - xfs_lsn_t *commitlsnp) + xfs_lsn_t *lsn) { - struct xfs_mount *mp = log->l_mp; - int error; struct xfs_log_iovec reg = { .i_addr = NULL, .i_len = 0, @@ -1486,12 +1460,15 @@ xlog_commit_record( .lv_niovecs = 1, .lv_iovecp = ®, }; + int error; - ASSERT_ALWAYS(iclog); - error = xlog_write(log, &vec, ticket, commitlsnp, iclog, - XLOG_COMMIT_TRANS, false); + if (XLOG_FORCED_SHUTDOWN(log)) + return -EIO; + + error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS, + false); if (error) - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); return error; } diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 0ae187fa9af2..e3dd405ea767 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -839,7 +839,7 @@ restart: } spin_unlock(&cil->xc_push_lock); - error = xlog_write_done(log, tic, &commit_iclog, &commit_lsn); + error = xlog_commit_record(log, tic, &commit_iclog, &commit_lsn); if (error) goto out_abort_free_ticket; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 0941b465de9e..f4a54469d7d0 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -441,7 +441,7 @@ int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector, struct xlog_ticket *tic, xfs_lsn_t *start_lsn, struct xlog_in_core **commit_iclog, uint flags, bool need_start_rec); -int xlog_write_done(struct xlog *log, struct xlog_ticket *ticket, +int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket, struct xlog_in_core **iclog, xfs_lsn_t *lsn); void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); -- cgit v1.2.3 From 3c702f95909ac6dfb2d3ea1b897c46fc9717d1a5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 25 Mar 2020 18:18:24 -0700 Subject: xfs: refactor unmount record writing Separate out the unmount record writing from the rest of the ticket and log state futzing necessary to make it work. 
This is a no-op, just makes the code cleaner and places the unmount record formatting and writing alongside the commit record formatting and writing code. We can also get rid of the ticket flag clearing before the xlog_write() call because it no longer cares about the state of XLOG_TIC_INITED. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ccbc9d8e6e3c..210048ca6edd 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -795,32 +795,44 @@ xlog_wait_on_iclog( } /* - * Final log writes as part of unmount. - * - * Mark the filesystem clean as unmount happens. Note that during relocation - * this routine needs to be executed as part of source-bag while the - * deallocation must not be done until source-end. + * Write out an unmount record using the ticket provided. We have to account for + * the data space used in the unmount ticket as this write is not done from a + * transaction context that has already done the accounting for us. */ - -/* Actually write the unmount record to disk. */ -static void -xfs_log_write_unmount_record( - struct xfs_mount *mp) +static int +xlog_write_unmount_record( + struct xlog *log, + struct xlog_ticket *ticket, + xfs_lsn_t *lsn, + uint flags) { - /* the data section must be 32 bit size aligned */ - struct xfs_unmount_log_format magic = { + struct xfs_unmount_log_format ulf = { .magic = XLOG_UNMOUNT_TYPE, }; struct xfs_log_iovec reg = { - .i_addr = &magic, - .i_len = sizeof(magic), + .i_addr = &ulf, + .i_len = sizeof(ulf), .i_type = XLOG_REG_TYPE_UNMOUNT, }; struct xfs_log_vec vec = { .lv_niovecs = 1, .lv_iovecp = ®, }; - struct xlog *log = mp->m_log; + + /* account for space used by record data */ + ticket->t_curr_res -= sizeof(ulf); + return xlog_write(log, &vec, ticket, lsn, NULL, flags, false); +} + +/* + * Mark the filesystem clean by writing an unmount record to the head of the + * log. + */ +static void +xlog_unmount_write( + struct xlog *log) +{ + struct xfs_mount *mp = log->l_mp; struct xlog_in_core *iclog; struct xlog_ticket *tic = NULL; xfs_lsn_t lsn; @@ -844,10 +856,7 @@ xfs_log_write_unmount_record( flags &= ~XLOG_UNMOUNT_TRANS; } - /* remove inited flag, and account for space used */ - tic->t_flags = 0; - tic->t_curr_res -= sizeof(magic); - error = xlog_write(log, &vec, tic, &lsn, NULL, flags, false); + error = xlog_write_unmount_record(log, tic, &lsn, flags); /* * At this point, we're umounting anyway, so there's no point in * transitioning log state to IOERROR. Just continue... @@ -913,7 +922,7 @@ xfs_log_unmount_write( if (XLOG_FORCED_SHUTDOWN(log)) return; xfs_log_unmount_verify_iclog(log); - xfs_log_write_unmount_record(mp); + xlog_unmount_write(log); } /* -- cgit v1.2.3 From b843299ba5f9a430dd26ecd02ee2fef805f19844 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 25 Mar 2020 18:18:24 -0700 Subject: xfs: remove some stale comments from the log code Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_log.c | 59 +++++++++++++------------------------------------------- 1 file changed, 13 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 210048ca6edd..7d1355a9cc43 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -463,14 +463,6 @@ out_error: return error; } - -/* - * NOTES: - * - * 1. currblock field gets updated at startup and after in-core logs - * marked as with WANT_SYNC. - */ - static bool __xlog_state_release_iclog( struct xlog *log, @@ -1915,7 +1907,7 @@ xlog_dealloc_log( log->l_mp->m_log = NULL; destroy_workqueue(log->l_ioend_workqueue); kmem_free(log); -} /* xlog_dealloc_log */ +} /* * Update counters atomically now that memcpy is done. @@ -2458,14 +2450,6 @@ next_lv: return error; } - -/***************************************************************************** - * - * State Machine functions - * - ***************************************************************************** - */ - static void xlog_state_activate_iclog( struct xlog_in_core *iclog, @@ -2826,7 +2810,7 @@ xlog_state_done_syncing( */ wake_up_all(&iclog->ic_write_wait); spin_unlock(&log->l_icloglock); - xlog_state_do_callback(log); /* also cleans log */ + xlog_state_do_callback(log); } /* @@ -2946,13 +2930,14 @@ restart: *logoffsetp = log_offset; return 0; -} /* xlog_state_get_iclog_space */ +} /* - * The first cnt-1 times through here we don't need to move the grant write head - * because the permanent reservation has reserved cnt times the unit amount. - * Release part of current permanent unit reservation and reset current - * reservation to be one units worth. Also move grant reservation head forward. + * The first cnt-1 times a ticket goes through here we don't need to move the + * grant write head because the permanent reservation has reserved cnt times the + * unit amount. Release part of current permanent unit reservation and reset + * current reservation to be one units worth. Also move grant reservation head + * forward. */ void xfs_log_ticket_regrant( @@ -3034,12 +3019,8 @@ xfs_log_ticket_ungrant( } /* - * Mark the current iclog in the ring as WANT_SYNC and move the current iclog - * pointer to the next iclog in the ring. - * - * When called from xlog_state_get_iclog_space(), the exact size of the iclog - * has not yet been determined, all we know is that we have run out of space in - * the current iclog. + * This routine will mark the current iclog in the ring as WANT_SYNC and move + * the current iclog pointer to the next iclog in the ring. */ STATIC void xlog_state_switch_iclogs( @@ -3084,7 +3065,7 @@ xlog_state_switch_iclogs( } ASSERT(iclog == log->l_iclog); log->l_iclog = iclog->ic_next; -} /* xlog_state_switch_iclogs */ +} /* * Write out all data in the in-core log as of this exact moment in time. @@ -3291,13 +3272,6 @@ xfs_log_force_lsn( return ret; } -/***************************************************************************** - * - * TICKET functions - * - ***************************************************************************** - */ - /* * Free a used ticket when its refcount falls to zero. 
*/ @@ -3454,13 +3428,6 @@ xlog_ticket_alloc( return tic; } - -/****************************************************************************** - * - * Log debug routines - * - ****************************************************************************** - */ #if defined(DEBUG) /* * Make sure that the destination ptr is within the valid data region of @@ -3546,7 +3513,7 @@ xlog_verify_tail_lsn( if (blocks < BTOBB(iclog->ic_offset) + 1) xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); } -} /* xlog_verify_tail_lsn */ +} /* * Perform a number of checks on the iclog before writing to disk. @@ -3649,7 +3616,7 @@ xlog_verify_iclog( } ptr += sizeof(xlog_op_header_t) + op_len; } -} /* xlog_verify_iclog */ +} #endif /* -- cgit v1.2.3 From 108a42358a05312b2128533c6462a3fdeb410bdf Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Mar 2020 20:10:26 -0700 Subject: xfs: Lower CIL flush limit for large logs The current CIL size aggregation limit is 1/8th the log size. This means for large logs we might be aggregating at least 250MB of dirty objects in memory before the CIL is flushed to the journal. With CIL shadow buffers sitting around, this means the CIL is often consuming >500MB of temporary memory that is all allocated under GFP_NOFS conditions. Flushing the CIL can take some time to do if there is other IO ongoing, and can introduce substantial log force latency by itself. It also pins the memory until the objects are in the AIL and can be written back and reclaimed by shrinkers. Hence this threshold also tends to determine the minimum amount of memory XFS can operate in under heavy modification without triggering the OOM killer. Modify the CIL space limit to prevent such huge amounts of pinned metadata from aggregating. We can have 2MB of log IO in flight at once, so limit aggregation to 16x this size. This threshold was chosen as it little impact on performance (on 16-way fsmark) or log traffic but pins a lot less memory on large logs especially under heavy memory pressure. An aggregation limit of 8x had 5-10% performance degradation and a 50% increase in log throughput for the same workload, so clearly that was too small for highly concurrent workloads on large logs. This was found via trace analysis of AIL behaviour. e.g. insertion from a single CIL flush: xfs_ail_insert: old lsn 0/0 new lsn 1/3033090 type XFS_LI_INODE flags IN_AIL $ grep xfs_ail_insert /mnt/scratch/s.t |grep "new lsn 1/3033090" |wc -l 1721823 $ So there were 1.7 million objects inserted into the AIL from this CIL checkpoint, the first at 2323.392108, the last at 2325.667566 which was the end of the trace (i.e. it hadn't finished). Clearly a major problem. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Allison Collins Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log_priv.h | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index f4a54469d7d0..938edd19a8a6 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -316,13 +316,30 @@ struct xfs_cil { * tries to keep 25% of the log free, so we need to keep below that limit or we * risk running out of free log space to start any new transactions. * - * In order to keep background CIL push efficient, we will set a lower - * threshold at which background pushing is attempted without blocking current - * transaction commits. 
A separate, higher bound defines when CIL pushes are - * enforced to ensure we stay within our maximum checkpoint size bounds. - * threshold, yet give us plenty of space for aggregation on large logs. + * In order to keep background CIL push efficient, we only need to ensure the + * CIL is large enough to maintain sufficient in-memory relogging to avoid + * repeated physical writes of frequently modified metadata. If we allow the CIL + * to grow to a substantial fraction of the log, then we may be pinning hundreds + * of megabytes of metadata in memory until the CIL flushes. This can cause + * issues when we are running low on memory - pinned memory cannot be reclaimed, + * and the CIL consumes a lot of memory. Hence we need to set an upper physical + * size limit for the CIL that limits the maximum amount of memory pinned by the + * CIL but does not limit performance by reducing relogging efficiency + * significantly. + * + * As such, the CIL push threshold ends up being the smaller of two thresholds: + * - a threshold large enough that it allows CIL to be pushed and progress to be + * made without excessive blocking of incoming transaction commits. This is + * defined to be 12.5% of the log space - half the 25% push threshold of the + * AIL. + * - small enough that it doesn't pin excessive amounts of memory but maintains + * close to peak relogging efficiency. This is defined to be 16x the iclog + * buffer window (32MB) as measurements have shown this to be roughly the + * point of diminishing performance increases under highly concurrent + * modification workloads. */ -#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) +#define XLOG_CIL_SPACE_LIMIT(log) \ + min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4) /* * ticket grant locks, queues and accounting have their own cachlines -- cgit v1.2.3 From 0e7ab7efe77451cba4cbecb6c9f5ef83cf32b36b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Mar 2020 20:10:27 -0700 Subject: xfs: Throttle commits on delayed background CIL push In certain situations the background CIL push can be indefinitely delayed. While we have workarounds from the obvious cases now, it doesn't solve the underlying issue. This issue is that there is no upper limit on the CIL where we will either force or wait for a background push to start, hence allowing the CIL to grow without bound until it consumes all log space. To fix this, add a new wait queue to the CIL which allows background pushes to wait for the CIL context to be switched out. This happens when the push starts, so it will allow us to block incoming transaction commit completion until the push has started. This will only affect processes that are running modifications, and only when the CIL threshold has been significantly overrun. This has no apparent impact on performance, and doesn't even trigger until over 45 million inodes had been created in a 16-way fsmark test on a 2GB log. That was limiting at 64MB of log space used, so the active CIL size is only about 3% of the total log in that case. The concurrent removal of those files did not trigger the background sleep at all. Signed-off-by: Dave Chinner Reviewed-by: Allison Collins Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_log_cil.c | 37 +++++++++++++++++++++++++++++++++---- fs/xfs/xfs_log_priv.h | 24 ++++++++++++++++++++++++ fs/xfs/xfs_trace.h | 1 + 3 files changed, 58 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index e3dd405ea767..b43f0e8f43f2 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -668,6 +668,11 @@ xlog_cil_push_work( push_seq = cil->xc_push_seq; ASSERT(push_seq <= ctx->sequence); + /* + * Wake up any background push waiters now this context is being pushed. + */ + wake_up_all(&ctx->push_wait); + /* * Check if we've anything to push. If there is nothing, then we don't * move on to a new sequence number and so we have to be able to push @@ -744,6 +749,7 @@ xlog_cil_push_work( */ INIT_LIST_HEAD(&new_ctx->committing); INIT_LIST_HEAD(&new_ctx->busy_extents); + init_waitqueue_head(&new_ctx->push_wait); new_ctx->sequence = ctx->sequence + 1; new_ctx->cil = cil; cil->xc_ctx = new_ctx; @@ -891,7 +897,7 @@ out_abort: */ static void xlog_cil_push_background( - struct xlog *log) + struct xlog *log) __releases(cil->xc_ctx_lock) { struct xfs_cil *cil = log->l_cilp; @@ -905,14 +911,36 @@ xlog_cil_push_background( * don't do a background push if we haven't used up all the * space available yet. */ - if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) { + up_read(&cil->xc_ctx_lock); return; + } spin_lock(&cil->xc_push_lock); if (cil->xc_push_seq < cil->xc_current_sequence) { cil->xc_push_seq = cil->xc_current_sequence; queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); } + + /* + * Drop the context lock now, we can't hold that if we need to sleep + * because we are over the blocking threshold. The push_lock is still + * held, so blocking threshold sleep/wakeup is still correctly + * serialised here. + */ + up_read(&cil->xc_ctx_lock); + + /* + * If we are well over the space limit, throttle the work that is being + * done until the push work on this context has begun. + */ + if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) { + trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket); + ASSERT(cil->xc_ctx->space_used < log->l_logsize); + xlog_wait(&cil->xc_ctx->push_wait, &cil->xc_push_lock); + return; + } + spin_unlock(&cil->xc_push_lock); } @@ -1032,9 +1060,9 @@ xfs_log_commit_cil( if (lip->li_ops->iop_committing) lip->li_ops->iop_committing(lip, xc_commit_lsn); } - xlog_cil_push_background(log); - up_read(&cil->xc_ctx_lock); + /* xlog_cil_push_background() releases cil->xc_ctx_lock */ + xlog_cil_push_background(log); } /* @@ -1193,6 +1221,7 @@ xlog_cil_init( INIT_LIST_HEAD(&ctx->committing); INIT_LIST_HEAD(&ctx->busy_extents); + init_waitqueue_head(&ctx->push_wait); ctx->sequence = 1; ctx->cil = cil; cil->xc_ctx = ctx; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 938edd19a8a6..ec22c7a3867f 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -240,6 +240,7 @@ struct xfs_cil_ctx { struct xfs_log_vec *lv_chain; /* logvecs being pushed */ struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ + wait_queue_head_t push_wait; /* background push throttle */ struct work_struct discard_endio_work; }; @@ -337,10 +338,33 @@ struct xfs_cil { * buffer window (32MB) as measurements have shown this to be roughly the * point of diminishing performance increases under highly concurrent * modification workloads. 
+ * + * To prevent the CIL from overflowing upper commit size bounds, we introduce a + * new threshold at which we block committing transactions until the background + * CIL commit commences and switches to a new context. While this is not a hard + * limit, it forces the process committing a transaction to the CIL to block and + * yeild the CPU, giving the CIL push work a chance to be scheduled and start + * work. This prevents a process running lots of transactions from overfilling + * the CIL because it is not yielding the CPU. We set the blocking limit at + * twice the background push space threshold so we keep in line with the AIL + * push thresholds. + * + * Note: this is not a -hard- limit as blocking is applied after the transaction + * is inserted into the CIL and the push has been triggered. It is largely a + * throttling mechanism that allows the CIL push to be scheduled and run. A hard + * limit will be difficult to implement without introducing global serialisation + * in the CIL commit fast path, and it's not at all clear that we actually need + * such hard limits given the ~7 years we've run without a hard limit before + * finding the first situation where a checkpoint size overflow actually + * occurred. Hence the simple throttle, and an ASSERT check to tell us that + * we've overrun the max size. */ #define XLOG_CIL_SPACE_LIMIT(log) \ min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4) +#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log) \ + (XLOG_CIL_SPACE_LIMIT(log) * 2) + /* * ticket grant locks, queues and accounting have their own cachlines * as these are quite hot and can be operated on concurrently. diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f7f12d312fd3..a4323a63438d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1015,6 +1015,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_sub); DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant); DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub); DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait); DECLARE_EVENT_CLASS(xfs_log_item_class, TP_PROTO(struct xfs_log_item *lip), -- cgit v1.2.3 From 2def2845cc33390e39b51440508043e4981e10ee Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Mar 2020 20:10:27 -0700 Subject: xfs: don't allow log IO to be throttled Running metadata intensive workloads, I've been seeing the AIL pushing getting stuck on pinned buffers and triggering log forces. The log force is taking a long time to run because the log IO is getting throttled by wbt_wait() - the block layer writeback throttle. It's being throttled because there is a huge amount of metadata writeback going on which is filling the request queue. IOWs, we have a priority inversion problem here. Mark the log IO bios with REQ_IDLE so they don't get throttled by the block layer writeback throttle. When we are forcing the CIL, we are likely to need to to tens of log IOs, and they are issued as fast as they can be build and IO completed. Hence REQ_IDLE is appropriate - it's an indication that more IO will follow shortly. And because we also set REQ_SYNC, the writeback throttle will now treat log IO the same way it treats direct IO writes - it will not throttle them at all. Hence we solve the priority inversion problem caused by the writeback throttle being unable to distinguish between high priority log IO and background metadata writeback. 
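For reference, a minimal sketch of the resulting flag setup in
xlog_write_iclog(), lifted from the hunk below; the REQ_SYNC | REQ_IDLE
combination is what makes the writeback throttle leave these writes alone:

	/* log IO: opt out of the block layer writeback throttle */
	iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC |
			       REQ_IDLE | REQ_FUA;
	if (need_flush)
		iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
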
Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Allison Collins Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7d1355a9cc43..46108ca20d85 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1687,7 +1687,15 @@ xlog_write_iclog( iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno; iclog->ic_bio.bi_end_io = xlog_bio_end_io; iclog->ic_bio.bi_private = iclog; - iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA; + + /* + * We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more + * IOs coming immediately after this one. This prevents the block layer + * writeback throttle from throttling log writes behind background + * metadata writeback and causing priority inversions. + */ + iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | + REQ_IDLE | REQ_FUA; if (need_flush) iclog->ic_bio.bi_opf |= REQ_PREFLUSH; -- cgit v1.2.3 From 12eba65b28b085ddebcedbda4a6aa1b9eb94ce20 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Mar 2020 20:10:28 -0700 Subject: xfs: Improve metadata buffer reclaim accountability The buffer cache shrinker frees more than just the xfs_buf slab objects - it also frees the pages attached to the buffers. Make sure the memory reclaim code accounts for this memory being freed correctly, similar to how the inode shrinker accounts for pages freed from the page cache due to mapping invalidation. We also need to make sure that the mm subsystem knows these are reclaimable objects. We provide the memory reclaim subsystem with a a shrinker to reclaim xfs_bufs, so we should really mark the slab that way. We also have a lot of xfs_bufs in a busy system, spread them around like we do inodes. Signed-off-by: Dave Chinner Reviewed-by: Allison Collins Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index f880141a2268..9ec3eaf1c618 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -327,6 +327,9 @@ xfs_buf_free( __free_page(page); } + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += + bp->b_page_count; } else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); _xfs_buf_free_pages(bp); @@ -2114,9 +2117,11 @@ xfs_buf_delwri_pushbuf( int __init xfs_buf_init(void) { - xfs_buf_zone = kmem_cache_create("xfs_buf", - sizeof(struct xfs_buf), 0, - SLAB_HWCACHE_ALIGN, NULL); + xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0, + SLAB_HWCACHE_ALIGN | + SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD, + NULL); if (!xfs_buf_zone) goto out; -- cgit v1.2.3 From d59eadaea2b9945095d4d6d44367ebabd604395c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Mar 2020 20:10:28 -0700 Subject: xfs: correctly acount for reclaimable slabs The XFS inode item slab actually reclaimed by inode shrinker callbacks from the memory reclaim subsystem. These should be marked as reclaimable so the mm subsystem has the full picture of how much memory it can actually reclaim from the XFS slab caches. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Allison Collins Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 2094386af8ac..68fea439d974 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1861,7 +1861,8 @@ xfs_init_zones(void) xfs_ili_zone = kmem_cache_create("xfs_ili", sizeof(struct xfs_inode_log_item), 0, - SLAB_MEM_SPREAD, NULL); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); if (!xfs_ili_zone) goto out_destroy_inode_zone; -- cgit v1.2.3 From 4165994ac9672d91134675caa6de3645a9ace6c8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Mar 2020 20:10:29 -0700 Subject: xfs: factor common AIL item deletion code Factor the common AIL deletion code that does all the wakeups into a helper so we only have one copy of this somewhat tricky code to interface with all the wakeups necessary when the LSN of the log tail changes. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Allison Collins Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode_item.c | 12 +----------- fs/xfs/xfs_trans_ail.c | 48 ++++++++++++++++++++++++++---------------------- fs/xfs/xfs_trans_priv.h | 4 +++- 3 files changed, 30 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 4a3d13d4a022..bd8c36809870 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -742,17 +742,7 @@ xfs_iflush_done( xfs_clear_li_failed(blip); } } - - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount)) - xlog_assign_tail_lsn_locked(ailp->ail_mount); - if (list_empty(&ailp->ail_head)) - wake_up_all(&ailp->ail_empty); - } - spin_unlock(&ailp->ail_lock); - - if (mlip_changed) - xfs_log_space_wake(ailp->ail_mount); + xfs_ail_update_finish(ailp, mlip_changed); } /* diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 2ef0dfbfb303..26d2e7928121 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -681,6 +681,27 @@ xfs_ail_push_all_sync( finish_wait(&ailp->ail_empty, &wait); } +void +xfs_ail_update_finish( + struct xfs_ail *ailp, + bool do_tail_update) __releases(ailp->ail_lock) +{ + struct xfs_mount *mp = ailp->ail_mount; + + if (!do_tail_update) { + spin_unlock(&ailp->ail_lock); + return; + } + + if (!XFS_FORCED_SHUTDOWN(mp)) + xlog_assign_tail_lsn_locked(mp); + + if (list_empty(&ailp->ail_head)) + wake_up_all(&ailp->ail_empty); + spin_unlock(&ailp->ail_lock); + xfs_log_space_wake(mp); +} + /* * xfs_trans_ail_update - bulk AIL insertion operation. 
* @@ -740,15 +761,7 @@ xfs_trans_ail_update_bulk( if (!list_empty(&tmp)) xfs_ail_splice(ailp, cur, &tmp, lsn); - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount)) - xlog_assign_tail_lsn_locked(ailp->ail_mount); - spin_unlock(&ailp->ail_lock); - - xfs_log_space_wake(ailp->ail_mount); - } else { - spin_unlock(&ailp->ail_lock); - } + xfs_ail_update_finish(ailp, mlip_changed); } bool @@ -792,10 +805,10 @@ void xfs_trans_ail_delete( struct xfs_ail *ailp, struct xfs_log_item *lip, - int shutdown_type) __releases(ailp->ail_lock) + int shutdown_type) { struct xfs_mount *mp = ailp->ail_mount; - bool mlip_changed; + bool need_update; if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); @@ -808,17 +821,8 @@ xfs_trans_ail_delete( return; } - mlip_changed = xfs_ail_delete_one(ailp, lip); - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(mp)) - xlog_assign_tail_lsn_locked(mp); - if (list_empty(&ailp->ail_head)) - wake_up_all(&ailp->ail_empty); - } - - spin_unlock(&ailp->ail_lock); - if (mlip_changed) - xfs_log_space_wake(ailp->ail_mount); + need_update = xfs_ail_delete_one(ailp, lip); + xfs_ail_update_finish(ailp, need_update); } int diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 2e073c1c4614..64ffa746730e 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -92,8 +92,10 @@ xfs_trans_ail_update( } bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +void xfs_ail_update_finish(struct xfs_ail *ailp, bool do_tail_update) + __releases(ailp->ail_lock); void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, - int shutdown_type) __releases(ailp->ail_lock); + int shutdown_type); static inline void xfs_trans_ail_remove( -- cgit v1.2.3 From 8eb807bd839938b45bf7a97f0568d2a845ba6929 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Mar 2020 20:10:29 -0700 Subject: xfs: tail updates only need to occur when LSN changes We currently wake anything waiting on the log tail to move whenever the log item at the tail of the log is removed. Historically this was fine behaviour because there were very few items at any given LSN. But with delayed logging, there may be thousands of items at any given LSN, and we can't move the tail until they are all gone. Hence if we are removing them in near tail-first order, we might be waking up processes waiting on the tail LSN to change (e.g. log space waiters) repeatedly without them being able to make progress. This also occurs with the new sync push waiters, and can result in thousands of spurious wakeups every second when under heavy direct reclaim pressure. To fix this, check that the tail LSN has actually changed on the AIL before triggering wakeups. This will reduce the number of spurious wakeups when doing bulk AIL removal and make this code much more efficient. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Allison Collins Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode_item.c | 18 ++++++++++++----- fs/xfs/xfs_trans_ail.c | 52 ++++++++++++++++++++++++++++++++++--------------- fs/xfs/xfs_trans_priv.h | 4 ++-- 3 files changed, 51 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index bd8c36809870..a627cb951dc6 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -730,19 +730,27 @@ xfs_iflush_done( * holding the lock before removing the inode from the AIL. 
*/ if (need_ail) { - bool mlip_changed = false; + xfs_lsn_t tail_lsn = 0; /* this is an opencoded batch version of xfs_trans_ail_delete */ spin_lock(&ailp->ail_lock); list_for_each_entry(blip, &tmp, li_bio_list) { if (INODE_ITEM(blip)->ili_logged && - blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) - mlip_changed |= xfs_ail_delete_one(ailp, blip); - else { + blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) { + /* + * xfs_ail_update_finish() only cares about the + * lsn of the first tail item removed, any + * others will be at the same or higher lsn so + * we just ignore them. + */ + xfs_lsn_t lsn = xfs_ail_delete_one(ailp, blip); + if (!tail_lsn && lsn) + tail_lsn = lsn; + } else { xfs_clear_li_failed(blip); } } - xfs_ail_update_finish(ailp, mlip_changed); + xfs_ail_update_finish(ailp, tail_lsn); } /* diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 26d2e7928121..564253550b75 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -109,17 +109,25 @@ xfs_ail_next( * We need the AIL lock in order to get a coherent read of the lsn of the last * item in the AIL. */ +static xfs_lsn_t +__xfs_ail_min_lsn( + struct xfs_ail *ailp) +{ + struct xfs_log_item *lip = xfs_ail_min(ailp); + + if (lip) + return lip->li_lsn; + return 0; +} + xfs_lsn_t xfs_ail_min_lsn( struct xfs_ail *ailp) { - xfs_lsn_t lsn = 0; - struct xfs_log_item *lip; + xfs_lsn_t lsn; spin_lock(&ailp->ail_lock); - lip = xfs_ail_min(ailp); - if (lip) - lsn = lip->li_lsn; + lsn = __xfs_ail_min_lsn(ailp); spin_unlock(&ailp->ail_lock); return lsn; @@ -684,11 +692,12 @@ xfs_ail_push_all_sync( void xfs_ail_update_finish( struct xfs_ail *ailp, - bool do_tail_update) __releases(ailp->ail_lock) + xfs_lsn_t old_lsn) __releases(ailp->ail_lock) { struct xfs_mount *mp = ailp->ail_mount; - if (!do_tail_update) { + /* if the tail lsn hasn't changed, don't do updates or wakeups. */ + if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) { spin_unlock(&ailp->ail_lock); return; } @@ -733,7 +742,7 @@ xfs_trans_ail_update_bulk( xfs_lsn_t lsn) __releases(ailp->ail_lock) { struct xfs_log_item *mlip; - int mlip_changed = 0; + xfs_lsn_t tail_lsn = 0; int i; LIST_HEAD(tmp); @@ -748,9 +757,10 @@ xfs_trans_ail_update_bulk( continue; trace_xfs_ail_move(lip, lip->li_lsn, lsn); + if (mlip == lip && !tail_lsn) + tail_lsn = lip->li_lsn; + xfs_ail_delete(ailp, lip); - if (mlip == lip) - mlip_changed = 1; } else { trace_xfs_ail_insert(lip, 0, lsn); } @@ -761,15 +771,23 @@ xfs_trans_ail_update_bulk( if (!list_empty(&tmp)) xfs_ail_splice(ailp, cur, &tmp, lsn); - xfs_ail_update_finish(ailp, mlip_changed); + xfs_ail_update_finish(ailp, tail_lsn); } -bool +/* + * Delete one log item from the AIL. + * + * If this item was at the tail of the AIL, return the LSN of the log item so + * that we can use it to check if the LSN of the tail of the log has moved + * when finishing up the AIL delete process in xfs_ail_update_finish(). 
+ */ +xfs_lsn_t xfs_ail_delete_one( struct xfs_ail *ailp, struct xfs_log_item *lip) { struct xfs_log_item *mlip = xfs_ail_min(ailp); + xfs_lsn_t lsn = lip->li_lsn; trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); @@ -777,7 +795,9 @@ xfs_ail_delete_one( clear_bit(XFS_LI_IN_AIL, &lip->li_flags); lip->li_lsn = 0; - return mlip == lip; + if (mlip == lip) + return lsn; + return 0; } /** @@ -808,7 +828,7 @@ xfs_trans_ail_delete( int shutdown_type) { struct xfs_mount *mp = ailp->ail_mount; - bool need_update; + xfs_lsn_t tail_lsn; if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); @@ -821,8 +841,8 @@ xfs_trans_ail_delete( return; } - need_update = xfs_ail_delete_one(ailp, lip); - xfs_ail_update_finish(ailp, need_update); + tail_lsn = xfs_ail_delete_one(ailp, lip); + xfs_ail_update_finish(ailp, tail_lsn); } int diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 64ffa746730e..35655eac01a6 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -91,8 +91,8 @@ xfs_trans_ail_update( xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } -bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); -void xfs_ail_update_finish(struct xfs_ail *ailp, bool do_tail_update) +xfs_lsn_t xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn) __releases(ailp->ail_lock); void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, int shutdown_type); -- cgit v1.2.3 From 5806165a6663544ea41bc3216f5c5effbde4799e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Mar 2020 20:10:30 -0700 Subject: xfs: factor inode lookup from xfs_ifree_cluster There's lots of indent in this code which makes it a bit hard to follow. We are also going to completely rework the inode lookup code as part of the inode reclaim rework, so factor out the inode lookup code from the inode cluster freeing code. Based on prototype code from Christoph Hellwig. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 152 +++++++++++++++++++++++++++++------------------------ 1 file changed, 84 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 14b922f2a6db..5c930863ed5b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2503,6 +2503,88 @@ out: return error; } +/* + * Look up the inode number specified and mark it stale if it is found. If it is + * dirty, return the inode so it can be attached to the cluster buffer so it can + * be processed appropriately when the cluster free transaction completes. + */ +static struct xfs_inode * +xfs_ifree_get_one_inode( + struct xfs_perag *pag, + struct xfs_inode *free_ip, + int inum) +{ + struct xfs_mount *mp = pag->pag_mount; + struct xfs_inode *ip; + +retry: + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); + + /* Inode not in memory, nothing to do */ + if (!ip) + goto out_rcu_unlock; + + /* + * because this is an RCU protected lookup, we could find a recently + * freed or even reallocated inode during the lookup. We need to check + * under the i_flags_lock for a valid inode here. Skip it if it is not + * valid, the wrong inode or stale. 
+ */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) { + spin_unlock(&ip->i_flags_lock); + goto out_rcu_unlock; + } + spin_unlock(&ip->i_flags_lock); + + /* + * Don't try to lock/unlock the current inode, but we _cannot_ skip the + * other inodes that we did not find in the list attached to the buffer + * and are not already marked stale. If we can't lock it, back off and + * retry. + */ + if (ip != free_ip) { + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + rcu_read_unlock(); + delay(1); + goto retry; + } + + /* + * Check the inode number again in case we're racing with + * freeing in xfs_reclaim_inode(). See the comments in that + * function for more information as to why the initial check is + * not sufficient. + */ + if (ip->i_ino != inum) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out_rcu_unlock; + } + } + rcu_read_unlock(); + + xfs_iflock(ip); + xfs_iflags_set(ip, XFS_ISTALE); + + /* + * We don't need to attach clean inodes or those only with unlogged + * changes (which we throw away, anyway). + */ + if (!ip->i_itemp || xfs_inode_clean(ip)) { + ASSERT(ip != free_ip); + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out_no_inode; + } + return ip; + +out_rcu_unlock: + rcu_read_unlock(); +out_no_inode: + return NULL; +} + /* * A big issue when freeing the inode cluster is that we _cannot_ skip any * inodes that are in memory - they all must be marked stale and attached to @@ -2603,77 +2685,11 @@ xfs_ifree_cluster( * even trying to lock them. */ for (i = 0; i < igeo->inodes_per_cluster; i++) { -retry: - rcu_read_lock(); - ip = radix_tree_lookup(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, (inum + i))); - - /* Inode not in memory, nothing to do */ - if (!ip) { - rcu_read_unlock(); + ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i); + if (!ip) continue; - } - - /* - * because this is an RCU protected lookup, we could - * find a recently freed or even reallocated inode - * during the lookup. We need to check under the - * i_flags_lock for a valid inode here. Skip it if it - * is not valid, the wrong inode or stale. - */ - spin_lock(&ip->i_flags_lock); - if (ip->i_ino != inum + i || - __xfs_iflags_test(ip, XFS_ISTALE)) { - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - continue; - } - spin_unlock(&ip->i_flags_lock); - - /* - * Don't try to lock/unlock the current inode, but we - * _cannot_ skip the other inodes that we did not find - * in the list attached to the buffer and are not - * already marked stale. If we can't lock it, back off - * and retry. - */ - if (ip != free_ip) { - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - rcu_read_unlock(); - delay(1); - goto retry; - } - - /* - * Check the inode number again in case we're - * racing with freeing in xfs_reclaim_inode(). - * See the comments in that function for more - * information as to why the initial check is - * not sufficient. - */ - if (ip->i_ino != inum + i) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - rcu_read_unlock(); - continue; - } - } - rcu_read_unlock(); - xfs_iflock(ip); - xfs_iflags_set(ip, XFS_ISTALE); - - /* - * we don't need to attach clean inodes or those only - * with unlogged changes (which we throw away, anyway). 
- */ iip = ip->i_itemp; - if (!iip || xfs_inode_clean(ip)) { - ASSERT(ip != free_ip); - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } - iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; -- cgit v1.2.3 From 5cc3c006eb45524860c4d1dd4dd7ad4a506bf3f5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 26 Mar 2020 10:26:44 -0700 Subject: xfs: don't write a corrupt unmount record to force summary counter recalc In commit f467cad95f5e3, I added the ability to force a recalculation of the filesystem summary counters if they seemed incorrect. This was done (not entirely correctly) by tweaking the log code to write an unmount record without the UMOUNT_TRANS flag set. At next mount, the log recovery code will fail to find the unmount record and go into recovery, which triggers the recalculation. What actually gets written to the log is what ought to be an unmount record, but without any flags set to indicate what kind of record it actually is. This worked to trigger the recalculation, but we shouldn't write bogus log records when we could simply write nothing. Fixes: f467cad95f5e3 ("xfs: force summary counter recalc at next mount") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Brian Foster --- fs/xfs/xfs_log.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 46108ca20d85..00fda2e8e738 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -835,19 +835,6 @@ xlog_unmount_write( if (error) goto out_err; - /* - * If we think the summary counters are bad, clear the unmount header - * flag in the unmount record so that the summary counters will be - * recalculated during log recovery at next mount. Refer to - * xlog_check_unmount_rec for more details. - */ - if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, - XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { - xfs_alert(mp, "%s: will fix summary counters at next mount", - __func__); - flags &= ~XLOG_UNMOUNT_TRANS; - } - error = xlog_write_unmount_record(log, tic, &lsn, flags); /* * At this point, we're umounting anyway, so there's no point in @@ -913,6 +900,20 @@ xfs_log_unmount_write( if (XLOG_FORCED_SHUTDOWN(log)) return; + + /* + * If we think the summary counters are bad, avoid writing the unmount + * record to force log recovery at next mount, after which the summary + * counters will be recalculated. Refer to xlog_check_unmount_rec for + * more details. + */ + if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, + XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { + xfs_alert(mp, "%s: will fix summary counters at next mount", + __func__); + return; + } + xfs_log_unmount_verify_iclog(log); xlog_unmount_write(log); } -- cgit v1.2.3 From 63337b63e7dab667bc0b4c3d468eb7e0dcf5f384 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Fri, 27 Mar 2020 08:28:39 -0700 Subject: xfs: remove unnecessary ternary from xfs_create Since the "no-allocation" reservations for file creations have been removed, the resblks value should be larger than zero, so remove the unnecessary ternary conditional. Signed-off-by: Kaixu Xia Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong [darrick: s/judgment/ternary/] Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5c930863ed5b..0cac0d37e3ae 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1200,8 +1200,7 @@ xfs_create( unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, - resblks ? - resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + resblks - XFS_IALLOC_SPACE_RES(mp)); if (error) { ASSERT(error != -ENOSPC); goto out_trans_cancel; -- cgit v1.2.3 From 8d3d7e2b35ea7d91d6e085c93b5efecfb0fba307 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 27 Mar 2020 08:29:45 -0700 Subject: xfs: trylock underlying buffer on dquot flush A dquot flush currently blocks on the buffer lock for the underlying dquot buffer. In turn, this causes xfsaild to block rather than continue processing other items in the meantime. Update xfs_qm_dqflush() to trylock the buffer, similar to how inode buffers are handled, and return -EAGAIN if the lock fails. Fix up any callers that don't currently handle the error properly. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_dquot.c | 6 +++--- fs/xfs/xfs_dquot_item.c | 3 ++- fs/xfs/xfs_qm.c | 14 +++++++++----- 3 files changed, 14 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 711376ca269f..af2c8e5ceea0 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1105,8 +1105,8 @@ xfs_qm_dqflush( * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0, &bp, - &xfs_dquot_buf_ops); + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK, + &bp, &xfs_dquot_buf_ops); if (error) goto out_unlock; @@ -1177,7 +1177,7 @@ xfs_qm_dqflush( out_unlock: xfs_dqfunlock(dqp); - return -EIO; + return error; } /* diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index cf65e2e43c6e..baad1748d0d1 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -189,7 +189,8 @@ xfs_qm_dquot_logitem_push( if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); - } + } else if (error == -EAGAIN) + rval = XFS_ITEM_LOCKED; spin_lock(&lip->li_ailp->ail_lock); out_unlock: diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index cabdb755adae..c225691fad15 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -121,12 +121,11 @@ xfs_qm_dqpurge( { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; + int error = -EAGAIN; xfs_dqlock(dqp); - if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { - xfs_dqunlock(dqp); - return -EAGAIN; - } + if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) + goto out_unlock; dqp->dq_flags |= XFS_DQ_FREEING; @@ -139,7 +138,6 @@ xfs_qm_dqpurge( */ if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; - int error; /* * We don't care about getting disk errors here. 
We need @@ -149,6 +147,8 @@ xfs_qm_dqpurge( if (!error) { error = xfs_bwrite(bp); xfs_buf_relse(bp); + } else if (error == -EAGAIN) { + goto out_unlock; } xfs_dqflock(dqp); } @@ -174,6 +174,10 @@ xfs_qm_dqpurge( xfs_qm_dqdestroy(dqp); return 0; + +out_unlock: + xfs_dqunlock(dqp); + return error; } /* -- cgit v1.2.3 From d4bc4c5fd177066b38e3a39ac751399e8dff80cf Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 27 Mar 2020 08:29:55 -0700 Subject: xfs: return locked status of inode buffer on xfsaild push If the inode buffer backing a particular inode is locked, xfs_iflush() returns -EAGAIN and xfs_inode_item_push() skips the inode. It still returns success to xfsaild, however, which bypasses the xfsaild backoff heuristic. Update xfs_inode_item_push() to return locked status if the inode buffer couldn't be locked. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode_item.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index a627cb951dc6..f779cca2346f 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -552,7 +552,8 @@ xfs_inode_item_push( if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); - } + } else if (error == -EAGAIN) + rval = XFS_ITEM_LOCKED; spin_lock(&lip->li_ailp->ail_lock); out_unlock: -- cgit v1.2.3 From c6425702f21e68d7c8c293b6bfaa5a389076efe5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 27 Mar 2020 08:49:44 -0700 Subject: xfs: ratelimit inode flush on buffered write ENOSPC A customer reported rcu stalls and softlockup warnings on a computer with many CPU cores and many many more IO threads trying to write to a filesystem that is totally out of space. Subsequent analysis pointed to the many many IO threads calling xfs_flush_inodes -> sync_inodes_sb, which causes a lot of wb_writeback_work to be queued. The writeback worker spends so much time trying to wake the many many threads waiting for writeback completion that it trips the softlockup detector, and (in this case) the system automatically reboots. In addition, they complain that the lengthy xfs_flush_inodes scan traps all of those threads in uninterruptible sleep, which hampers their ability to kill the program or do anything else to escape the situation. If there's thousands of threads trying to write to files on a full filesystem, each of those threads will start separate copies of the inode flush scan. This is kind of pointless since we only need one scan, so rate limit the inode flush. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_mount.h | 1 + fs/xfs/xfs_super.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 88ab09ed29e7..50c43422fa17 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -167,6 +167,7 @@ typedef struct xfs_mount { struct xfs_kobj m_error_meta_kobj; struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ + struct ratelimit_state m_flush_inodes_ratelimit; struct workqueue_struct *m_buf_workqueue; struct workqueue_struct *m_unwritten_workqueue; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 68fea439d974..abf06bf9c3f3 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -528,6 +528,9 @@ xfs_flush_inodes( { struct super_block *sb = mp->m_super; + if (!__ratelimit(&mp->m_flush_inodes_ratelimit)) + return; + if (down_read_trylock(&sb->s_umount)) { sync_inodes_sb(sb); up_read(&sb->s_umount); @@ -1366,6 +1369,17 @@ xfs_fc_fill_super( if (error) goto out_free_names; + /* + * Cap the number of invocations of xfs_flush_inodes to 16 for every + * quarter of a second. The magic numbers here were determined by + * observation neither to cause stalls in writeback when there are a + * lot of IO threads and the fs is near ENOSPC, nor cause any fstest + * regressions. YMMV. + */ + ratelimit_state_init(&mp->m_flush_inodes_ratelimit, HZ / 4, 16); + ratelimit_set_flags(&mp->m_flush_inodes_ratelimit, + RATELIMIT_MSG_ON_RELEASE); + error = xfs_init_mount_workqueues(mp); if (error) goto out_close_devices; -- cgit v1.2.3 From d8fcb6f1346c36316ccb20f887081299a61bbcc8 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sun, 29 Mar 2020 09:45:19 -0700 Subject: xfs: remove redundant variable assignment in xfs_symlink() The variables 'udqp' and 'gdqp' have been initialized, so remove redundant variable assignment in xfs_symlink(). Signed-off-by: Kaixu Xia Reviewed-by: Chaitanya Kulkarni Reviewed-by: Dave Chinner Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_symlink.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index fa0fa3c70f1a..13fb4b919648 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -176,7 +176,6 @@ xfs_symlink( return -ENAMETOOLONG; ASSERT(pathlen > 0); - udqp = gdqp = NULL; prid = xfs_get_initial_prid(dp); /* -- cgit v1.2.3 From d9fdd0adf932c8d615cfe52bbc689c373a95377f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 2 Apr 2020 08:18:57 -0700 Subject: xfs: fix inode number overflow in ifree cluster helper Qian Cai reports seemingly random buffer read verifier errors during filesystem writeback. This was isolated to a recent patch that factored out some inode cluster freeing code and happened to cast an unsigned inode number type to a signed value. If the inode number value overflows, we can skip marking in-core inodes associated with the underlying buffer stale at the time the physical inodes are freed. If such an inode happens to be dirty, xfsaild will eventually attempt to write it back over non-inode blocks. The invalidation of the underlying inode buffer causes writeback to read the buffer from disk. This fails the read verifier (preventing eventual corruption) if the buffer no longer looks like an inode cluster. Analysis by Dave Chinner. Fix up the helper to use the proper type for inode number values. 
Fixes: 5806165a6663 ("xfs: factor inode lookup from xfs_ifree_cluster") Reported-by: Qian Cai Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0cac0d37e3ae..ae86c870da92 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2511,7 +2511,7 @@ static struct xfs_inode * xfs_ifree_get_one_inode( struct xfs_perag *pag, struct xfs_inode *free_ip, - int inum) + xfs_ino_t inum) { struct xfs_mount *mp = pag->pag_mount; struct xfs_inode *ip; -- cgit v1.2.3 From 54fbdd1035e3a4e4f4082c335b095426cdefd092 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Apr 2020 11:45:37 -0700 Subject: xfs: factor out a new xfs_log_force_inode helper Create a new helper to force the log up to the last LSN touching an inode. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_export.c | 14 +------------- fs/xfs/xfs_file.c | 12 +----------- fs/xfs/xfs_inode.c | 19 +++++++++++++++++++ fs/xfs/xfs_inode.h | 1 + 4 files changed, 22 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index f1372f9046e3..5a4b0119143a 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -15,7 +15,6 @@ #include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_icache.h" -#include "xfs_log.h" #include "xfs_pnfs.h" /* @@ -221,18 +220,7 @@ STATIC int xfs_fs_nfs_commit_metadata( struct inode *inode) { - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - xfs_lsn_t lsn = 0; - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!lsn) - return 0; - return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_inode(XFS_I(inode)); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index b8a4a3f29b36..68e1cbb3cfcc 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -80,19 +80,9 @@ xfs_dir_fsync( int datasync) { struct xfs_inode *ip = XFS_I(file->f_mapping->host); - struct xfs_mount *mp = ip->i_mount; - xfs_lsn_t lsn = 0; trace_xfs_dir_fsync(ip); - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!lsn) - return 0; - return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_inode(ip); } STATIC int diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ae86c870da92..d1772786af29 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3945,3 +3945,22 @@ xfs_irele( trace_xfs_irele(ip, _RET_IP_); iput(VFS_I(ip)); } + +/* + * Ensure all committed transactions touching the inode are written to the log. 
+ */ +int +xfs_log_force_inode( + struct xfs_inode *ip) +{ + xfs_lsn_t lsn = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!lsn) + return 0; + return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL); +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 492e53992fa9..c6a63f6764a6 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -426,6 +426,7 @@ int xfs_itruncate_extents_flags(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t, int); void xfs_iext_realloc(xfs_inode_t *, int, int); +int xfs_log_force_inode(struct xfs_inode *ip); void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) -- cgit v1.2.3 From 5833112df7e9a306af9af09c60127b92ed723962 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Apr 2020 11:45:37 -0700 Subject: xfs: reflink should force the log out if mounted with wsync Reflink should force the log out to disk if the filesystem was mounted with wsync, the same as most other operations in xfs. [Note: XFS_MOUNT_WSYNC is set when the admin mounts the filesystem with either the 'wsync' or 'sync' mount options, which effectively means that we're classifying reflink/dedupe as IO operations and making them synchronous when required.] Fixes: 3fc9f5e409319 ("xfs: remove xfs_reflink_remap_range") Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster [darrick: add more to the changelog] Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 68e1cbb3cfcc..4b8bdecc3863 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1059,7 +1059,11 @@ xfs_file_remap_range( ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, remap_flags); + if (ret) + goto out_unlock; + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_log_force_inode(dest); out_unlock: xfs_reflink_remap_unlock(file_in, file_out); if (ret) -- cgit v1.2.3