From 27d7c4ed1f7dfa3fc462e870dc60a4c9c2253f3e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 5 Jul 2013 21:57:22 -0400 Subject: ext4: silence warning in ext4_writepages() The loop in mpage_map_and_submit_extent() is guaranteed to always run at least once since the caller of mpage_map_and_submit_extent() makes sure map->m_len > 0. So make that explicit using do-while instead of pure while which also silences the compiler warning about uninitialized 'err' variable. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Reviewed-by: Lukas Czerner --- fs/ext4/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0188e65e1f58..19a1643cbdfa 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2163,7 +2163,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, mpd->io_submit.io_end->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; - while (map->m_len) { + do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { struct super_block *sb = inode->i_sb; @@ -2201,7 +2201,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, err = mpage_map_and_submit_buffers(mpd); if (err < 0) return err; - } + } while (map->m_len); /* Update on-disk size after IO is submitted */ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; -- cgit v1.2.3 From 960fd856fdc3b08b3638f3f9b6b4bfceb77660c7 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 5 Jul 2013 23:11:16 -0400 Subject: ext4: fix ext4_get_group_number() The function ext4_get_group_number() was introduced as an optimization in commit bd86298e60b8. Unfortunately, this commit incorrectly calculate the group number for file systems with a 1k block size (when s_first_data_block is 1 instead of zero). This could cause the following kernel BUG: [ 568.877799] ------------[ cut here ]------------ [ 568.877833] kernel BUG at fs/ext4/mballoc.c:3728! [ 568.877840] Oops: Exception in kernel mode, sig: 5 [#1] [ 568.877845] SMP NR_CPUS=32 NUMA pSeries [ 568.877852] Modules linked in: binfmt_misc [ 568.877861] CPU: 1 PID: 3516 Comm: fs_mark Not tainted 3.10.0-03216-g7c6809f-dirty #1 [ 568.877867] task: c0000001fb0b8000 ti: c0000001fa954000 task.ti: c0000001fa954000 [ 568.877873] NIP: c0000000002f42a4 LR: c0000000002f4274 CTR: c000000000317ef8 [ 568.877879] REGS: c0000001fa956ed0 TRAP: 0700 Not tainted (3.10.0-03216-g7c6809f-dirty) [ 568.877884] MSR: 8000000000029032 CR: 24000428 XER: 00000000 [ 568.877902] SOFTE: 1 [ 568.877905] CFAR: c0000000002b5464 [ 568.877908] GPR00: 0000000000000001 c0000001fa957150 c000000000c6a408 c0000001fb588000 GPR04: 0000000000003fff c0000001fa9571c0 c0000001fa9571c4 000138098c50625f GPR08: 1301200000000000 0000000000000002 0000000000000001 0000000000000000 GPR12: 0000000024000422 c00000000f33a300 0000000000008000 c0000001fa9577f0 GPR16: c0000001fb7d0100 c000000000c29190 c0000000007f46e8 c000000000a14672 GPR20: 0000000000000001 0000000000000008 ffffffffffffffff 0000000000000000 GPR24: 0000000000000100 c0000001fa957278 c0000001fdb2bc78 c0000001fa957288 GPR28: 0000000000100100 c0000001fa957288 c0000001fb588000 c0000001fdb2bd10 [ 568.877993] NIP [c0000000002f42a4] .ext4_mb_release_group_pa+0xec/0x1c0 [ 568.877999] LR [c0000000002f4274] .ext4_mb_release_group_pa+0xbc/0x1c0 [ 568.878004] Call Trace: [ 568.878008] [c0000001fa957150] [c0000000002f4274] .ext4_mb_release_group_pa+0xbc/0x1c0 (unreliable) [ 568.878017] [c0000001fa957200] [c0000000002fb070] .ext4_mb_discard_lg_preallocations+0x394/0x444 [ 568.878025] [c0000001fa957340] [c0000000002fb45c] .ext4_mb_release_context+0x33c/0x734 [ 568.878032] [c0000001fa957440] [c0000000002fbcf8] .ext4_mb_new_blocks+0x4a4/0x5f4 [ 568.878039] [c0000001fa957510] [c0000000002ef56c] .ext4_ext_map_blocks+0xc28/0x1178 [ 568.878047] [c0000001fa957640] [c0000000002c1a94] .ext4_map_blocks+0x2c8/0x490 [ 568.878054] [c0000001fa957730] [c0000000002c536c] .ext4_writepages+0x738/0xc60 [ 568.878062] [c0000001fa957950] [c000000000168a78] .do_writepages+0x5c/0x80 [ 568.878069] [c0000001fa9579d0] [c00000000015d1c4] .__filemap_fdatawrite_range+0x88/0xb0 [ 568.878078] [c0000001fa957aa0] [c00000000015d23c] .filemap_write_and_wait_range+0x50/0xfc [ 568.878085] [c0000001fa957b30] [c0000000002b8edc] .ext4_sync_file+0x220/0x3c4 [ 568.878092] [c0000001fa957be0] [c0000000001f849c] .vfs_fsync_range+0x64/0x80 [ 568.878098] [c0000001fa957c70] [c0000000001f84f0] .vfs_fsync+0x38/0x4c [ 568.878105] [c0000001fa957d00] [c0000000001f87f4] .do_fsync+0x54/0x90 [ 568.878111] [c0000001fa957db0] [c0000000001f8894] .SyS_fsync+0x28/0x3c [ 568.878120] [c0000001fa957e30] [c000000000009c88] syscall_exit+0x0/0x7c [ 568.878125] Instruction dump: [ 568.878130] 60000000 813d0034 81610070 38000000 7f8b4800 419e001c 813f007c 7d2bfe70 [ 568.878144] 7d604a78 7c005850 54000ffe 7c0007b4 <0b000000> e8a10076 e87f0090 7fa4eb78 [ 568.878160] ---[ end trace 594d911d9654770b ]--- In addition fix the STD_GROUP optimization so that it works for bigalloc file systems as well. Signed-off-by: "Theodore Ts'o" Reported-by: Li Zhong Reviewed-by: Lukas Czerner Cc: stable@vger.kernel.org # 3.10 --- fs/ext4/balloc.c | 4 ++-- fs/ext4/super.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 58339393fa6e..ddd715e42a5c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -38,8 +38,8 @@ ext4_group_t ext4_get_group_number(struct super_block *sb, ext4_group_t group; if (test_opt2(sb, STD_GROUP_SIZE)) - group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + - block) >> + group = (block - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >> (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); else ext4_get_group_no_and_offset(sb, block, &group, NULL); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 85b3dd60169b..8862d4ddf71f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3624,10 +3624,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); - /* Do we have standard group size of blocksize * 8 blocks ? */ - if (sbi->s_blocks_per_group == blocksize << 3) - set_opt2(sb, STD_GROUP_SIZE); - for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; @@ -3697,6 +3693,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + /* Do we have standard group size of clustersize * 8 blocks ? */ + if (sbi->s_blocks_per_group == clustersize << 3) + set_opt2(sb, STD_GROUP_SIZE); + /* * Test whether we have more sectors than will fit in sector_t, * and whether the max offset is addressable by the page cache. -- cgit v1.2.3 From 822dbba33458cd6ad0e715f3f4a57ebc99d54d1b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 10 Jul 2013 21:31:04 -0400 Subject: ext4: fix warning in ext4_evict_inode() The following race can lead to ext4_evict_inode() seeing i_ioend_count > 0 and thus triggering a sanity check warning: CPU1 CPU2 ext4_end_bio() ext4_evict_inode() ext4_finish_bio() end_page_writeback(); truncate_inode_pages() evict page WARN_ON(i_ioend_count > 0); ext4_put_io_end_defer() ext4_release_io_end() dec i_ioend_count This is possible use-after-free bug since we decrement i_ioend_count in possibly released inode. Since i_ioend_count is used only for sanity checks one possible solution would be to just remove it but for now I'd like to keep those sanity checks to help debugging the new ext4 writeback code. This patch changes ext4_end_bio() to call ext4_put_io_end_defer() before ext4_finish_bio() in the shortcut case when unwritten extent conversion isn't needed. In that case we don't need the io_end so we are safe to drop it early. Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 48786cdb5e6c..d63cc5e9d3b5 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -308,6 +308,7 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) return io_end; } +/* BIO completion function for page writeback */ static void ext4_end_bio(struct bio *bio, int error) { ext4_io_end_t *io_end = bio->bi_private; @@ -318,18 +319,6 @@ static void ext4_end_bio(struct bio *bio, int error) if (test_bit(BIO_UPTODATE, &bio->bi_flags)) error = 0; - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { - /* - * Link bio into list hanging from io_end. We have to do it - * atomically as bio completions can be racing against each - * other. - */ - bio->bi_private = xchg(&io_end->bio, bio); - } else { - ext4_finish_bio(bio); - bio_put(bio); - } - if (error) { struct inode *inode = io_end->inode; @@ -341,7 +330,24 @@ static void ext4_end_bio(struct bio *bio, int error) (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); } - ext4_put_io_end_defer(io_end); + + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + /* + * Link bio into list hanging from io_end. We have to do it + * atomically as bio completions can be racing against each + * other. + */ + bio->bi_private = xchg(&io_end->bio, bio); + ext4_put_io_end_defer(io_end); + } else { + /* + * Drop io_end reference early. Inode can get freed once + * we finish the bio. + */ + ext4_put_io_end_defer(io_end); + ext4_finish_bio(bio); + bio_put(bio); + } } void ext4_io_submit(struct ext4_io_submit *io) -- cgit v1.2.3 From ad065dd01662ae22138899e6b1c8eeb3a529964f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 11 Jul 2013 18:54:37 -0400 Subject: ext4: don't show usrquota/grpquota twice in /proc/mounts We now print mount options in a generic fashion in ext4_show_options(), so we shouldn't be explicitly printing the {usr,grp}quota options in ext4_show_quota_options(). Without this patch, /proc/mounts can look like this: /dev/vdb /vdb ext4 rw,relatime,quota,usrquota,data=ordered,usrquota 0 0 ^^^^^^^^ ^^^^^^^^ Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/super.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8862d4ddf71f..bca26f34edf4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1702,12 +1702,6 @@ static inline void ext4_show_quota_options(struct seq_file *seq, if (sbi->s_qf_names[GRPQUOTA]) seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - - if (test_opt(sb, USRQUOTA)) - seq_puts(seq, ",usrquota"); - - if (test_opt(sb, GRPQUOTA)) - seq_puts(seq, ",grpquota"); #endif } -- cgit v1.2.3 From e8974c3930ae9692bb4f77380961421e9a2f76ab Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Thu, 11 Jul 2013 22:42:42 -0400 Subject: ext4: rate limit printk in buffer_io_error() If there are a lot of outstanding buffered IOs when a device is taken offline (due to hardware errors etc), ext4_end_bio prints out a message for each failed logical block. While this is desirable, we see thousands of such lines being printed out before the serial console gets overwhelmed, causing ext4_end_bio() wait for the printk to complete. This in itself isn't a disaster, except for the detail that this function is being called with the queue lock held. This causes any other function in the block layer to spin on its spin_lock_irqsave while the serial console is draining. If NMI watchdog is enabled on this machine then it eventually comes along and shoots the machine in the head. The end result is that losing any one disk causes the machine to go down. This patch rate limits the printk to bandaid around the problem. Tested: xfstests Change-Id: I8ab5690dcf4f3a67e78be147d45e489fdf4a88d8 Signed-off-by: Anatol Pomozov Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index d63cc5e9d3b5..6625d210fb45 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "ext4_jbd2.h" #include "xattr.h" @@ -55,7 +56,7 @@ void ext4_exit_pageio(void) static void buffer_io_error(struct buffer_head *bh) { char b[BDEVNAME_SIZE]; - printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", + printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } -- cgit v1.2.3 From bdafe42aaf72859166f784f0fad3e6b4a815fa6d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 13 Jul 2013 00:40:31 -0400 Subject: ext4: fix spelling errors and a comment in extent_status tree Replace "assertation" with "assertion" in lots and lots of debugging messages. Correct the comment stating when ext4_es_insert_extent() is used. It was no doubt tree at one point, but it is no longer true... Signed-off-by: "Theodore Ts'o" Cc: Zheng Liu --- fs/ext4/extents_status.c | 22 ++++++++++------------ fs/ext4/inode.c | 8 ++++---- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index ee018d5f397e..4b8df7fbb10a 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -439,7 +439,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, */ if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { if (in_range(es->es_lblk, ee_block, ee_len)) { - pr_warn("ES insert assertation failed for " + pr_warn("ES insert assertion failed for " "inode: %lu we can find an extent " "at block [%d/%d/%llu/%c], but we " "want to add an delayed/hole extent " @@ -458,7 +458,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, */ if (es->es_lblk < ee_block || ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { - pr_warn("ES insert assertation failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %lu " "ex_status [%d/%d/%llu/%c] != " "es_status [%d/%d/%llu/%c]\n", inode->i_ino, ee_block, ee_len, ee_start, @@ -468,7 +468,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, } if (ee_status ^ es_status) { - pr_warn("ES insert assertation failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %lu " "ex_status [%d/%d/%llu/%c] != " "es_status [%d/%d/%llu/%c]\n", inode->i_ino, ee_block, ee_len, ee_start, @@ -481,7 +481,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, * that we don't want to add an written/unwritten extent. */ if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { - pr_warn("ES insert assertation failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %lu " "can't find an extent at block %d but we want " "to add an written/unwritten extent " "[%d/%d/%llu/%llx]\n", inode->i_ino, @@ -519,7 +519,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, * We want to add a delayed/hole extent but this * block has been allocated. */ - pr_warn("ES insert assertation failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %lu " "We can find blocks but we want to add a " "delayed/hole extent [%d/%d/%llu/%llx]\n", inode->i_ino, es->es_lblk, es->es_len, @@ -527,13 +527,13 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, return; } else if (ext4_es_is_written(es)) { if (retval != es->es_len) { - pr_warn("ES insert assertation failed for " + pr_warn("ES insert assertion failed for " "inode: %lu retval %d != es_len %d\n", inode->i_ino, retval, es->es_len); return; } if (map.m_pblk != ext4_es_pblock(es)) { - pr_warn("ES insert assertation failed for " + pr_warn("ES insert assertion failed for " "inode: %lu m_pblk %llu != " "es_pblk %llu\n", inode->i_ino, map.m_pblk, @@ -549,7 +549,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, } } else if (retval == 0) { if (ext4_es_is_written(es)) { - pr_warn("ES insert assertation failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %lu " "We can't find the block but we want to add " "an written extent [%d/%d/%llu/%llx]\n", inode->i_ino, es->es_lblk, es->es_len, @@ -632,10 +632,8 @@ out: } /* - * ext4_es_insert_extent() adds a space to a extent status tree. - * - * ext4_es_insert_extent is called by ext4_da_write_begin and - * ext4_es_remove_extent. + * ext4_es_insert_extent() adds information to an inode's extent + * status tree. * * Return 0 on success, error code on failure. */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 19a1643cbdfa..98b9bff92a8a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -465,7 +465,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, if (es_map->m_lblk != map->m_lblk || es_map->m_flags != map->m_flags || es_map->m_pblk != map->m_pblk) { - printk("ES cache assertation failed for inode: %lu " + printk("ES cache assertion failed for inode: %lu " "es_cached ex [%d/%d/%llu/%x] != " "found ex [%d/%d/%llu/%x] retval %d flags %x\n", inode->i_ino, es_map->m_lblk, es_map->m_len, @@ -558,7 +558,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, #ifdef ES_AGGRESSIVE_TEST if (retval != map->m_len) { - printk("ES len assertation failed for inode: %lu " + printk("ES len assertion failed for inode: %lu " "retval %d != map->m_len %d " "in %s (lookup)\n", inode->i_ino, retval, map->m_len, __func__); @@ -659,7 +659,7 @@ found: #ifdef ES_AGGRESSIVE_TEST if (retval != map->m_len) { - printk("ES len assertation failed for inode: %lu " + printk("ES len assertion failed for inode: %lu " "retval %d != map->m_len %d " "in %s (allocation)\n", inode->i_ino, retval, map->m_len, __func__); @@ -1642,7 +1642,7 @@ add_delayed: #ifdef ES_AGGRESSIVE_TEST if (retval != map->m_len) { - printk("ES len assertation failed for inode: %lu " + printk("ES len assertion failed for inode: %lu " "retval %d != map->m_len %d " "in %s (lookup)\n", inode->i_ino, retval, map->m_len, __func__); -- cgit v1.2.3 From e7676a704ee0a1ef71a6b23760b5a8f6896cb1a1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 13 Jul 2013 00:40:35 -0400 Subject: ext4: don't allow ext4_free_blocks() to fail due to ENOMEM The filesystem should not be marked inconsistent if ext4_free_blocks() is not able to allocate memory. Unfortunately some callers (most notably ext4_truncate) don't have a way to reflect an error back up to the VFS. And even if we did, most userspace applications won't deal with most system calls returning ENOMEM anyway. Reported-by: Nagachandra P Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/mballoc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a9ff5e5137ca..4bbbf13bd743 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4740,11 +4740,16 @@ do_more: * blocks being freed are metadata. these blocks shouldn't * be used until this transaction is committed */ + retry: new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); if (!new_entry) { - ext4_mb_unload_buddy(&e4b); - err = -ENOMEM; - goto error_return; + /* + * We use a retry loop because + * ext4_free_blocks() is not allowed to fail. + */ + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; } new_entry->efd_start_cluster = bit; new_entry->efd_group = block_group; -- cgit v1.2.3