From 55b2598e84e97efc5d952958cb5e34236c43276b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Sep 2020 08:51:32 +0200 Subject: bdi: initialize ->ra_pages and ->io_pages in bdi_init Set up a readahead size by default, as very few users have a good reason to change it. This means coda, ecryptfs, and orangefs now set up the values while they were previously missing it, while ubifs, mtd and vboxsf manually set it to 0 to avoid readahead. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Acked-by: David Sterba [btrfs] Acked-by: Richard Weinberger [ubifs, mtd] Signed-off-by: Jens Axboe --- fs/afs/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/afs') diff --git a/fs/afs/super.c b/fs/afs/super.c index b552357b1d13..3a40ee752c1e 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -456,7 +456,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) ret = super_setup_bdi(sb); if (ret) return ret; - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; /* allocate the root inode and dentry */ if (as->dyn_root) { -- cgit v1.2.3 From ec0fa0b659144d9c68204d23f627b6a65fa53e50 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Oct 2020 14:22:12 +0100 Subject: afs: Fix deadlock between writeback and truncate The afs filesystem has a lock[*] that it uses to serialise I/O operations going to the server (vnode->io_lock), as the server will only perform one modification operation at a time on any given file or directory. This prevents the filesystem from filling up all the call slots to a server with calls that aren't going to be executed in parallel anyway, thereby allowing operations on other files to obtain slots. [*] Note that this is probably redundant for directories at least, since i_rwsem is used to serialise directory modifications and lookup/reading vs modification. The server does allow parallel non-modification ops, however. When a file truncation op completes, we truncate the in-memory copy of the file to match - but we do it whilst still holding the io_lock, the idea being to prevent races with other operations. However, if writeback starts in a worker thread simultaneously with truncation (whilst notify_change() is called with i_rwsem locked, writeback pays it no heed), it may manage to set PG_writeback bits on the pages that will get truncated before afs_setattr_success() manages to call truncate_pagecache().
Truncate will then wait for those pages - whilst still inside io_lock: # cat /proc/8837/stack [<0>] wait_on_page_bit_common+0x184/0x1e7 [<0>] truncate_inode_pages_range+0x37f/0x3eb [<0>] truncate_pagecache+0x3c/0x53 [<0>] afs_setattr_success+0x4d/0x6e [<0>] afs_wait_for_operation+0xd8/0x169 [<0>] afs_do_sync_operation+0x16/0x1f [<0>] afs_setattr+0x1fb/0x25d [<0>] notify_change+0x2cf/0x3c4 [<0>] do_truncate+0x7f/0xb2 [<0>] do_sys_ftruncate+0xd1/0x104 [<0>] do_syscall_64+0x2d/0x3a [<0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 The writeback operation, however, stalls indefinitely because it needs to get the io_lock to proceed: # cat /proc/5940/stack [<0>] afs_get_io_locks+0x58/0x1ae [<0>] afs_begin_vnode_operation+0xc7/0xd1 [<0>] afs_store_data+0x1b2/0x2a3 [<0>] afs_write_back_from_locked_page+0x418/0x57c [<0>] afs_writepages_region+0x196/0x224 [<0>] afs_writepages+0x74/0x156 [<0>] do_writepages+0x2d/0x56 [<0>] __writeback_single_inode+0x84/0x207 [<0>] writeback_sb_inodes+0x238/0x3cf [<0>] __writeback_inodes_wb+0x68/0x9f [<0>] wb_writeback+0x145/0x26c [<0>] wb_do_writeback+0x16a/0x194 [<0>] wb_workfn+0x74/0x177 [<0>] process_one_work+0x174/0x264 [<0>] worker_thread+0x117/0x1b9 [<0>] kthread+0xec/0xf1 [<0>] ret_from_fork+0x1f/0x30 and thus deadlock has occurred. Note that whilst afs_setattr() calls filemap_write_and_wait(), the fact that the caller is holding i_rwsem doesn't preclude more pages being dirtied through an mmap'd region. Fix this by: (1) Use the vnode validate_lock to mediate access between afs_setattr() and afs_writepages(): (a) Exclusively lock validate_lock in afs_setattr() around the whole RPC operation. (b) If WB_SYNC_ALL isn't set on entry to afs_writepages(), try to shared-lock validate_lock and return immediately if we can't get it. (c) If WB_SYNC_ALL is set, wait for the lock. The validate_lock is also used to validate a file and to zap its cache if the file was altered by a third party, so it's probably a good fit for this. (2) Move the truncation outside of the io_lock in setattr, using the same hook as is used for local directory editing. This requires the old i_size to be retained in the operation record, as we still commit the revised status to the inode members inside the io_lock, but we need to know whether we reduced the file size. Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation") Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/afs/inode.c | 47 ++++++++++++++++++++++++++++++++++++++--------- fs/afs/internal.h | 1 + fs/afs/write.c | 11 +++++++++++ 3 files changed, 50 insertions(+), 9 deletions(-) (limited to 'fs/afs') diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 1d13d2e882ad..0fe8844b4bee 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -810,14 +810,32 @@ void afs_evict_inode(struct inode *inode) static void afs_setattr_success(struct afs_operation *op) { - struct inode *inode = &op->file[0].vnode->vfs_inode; + struct afs_vnode_param *vp = &op->file[0]; + struct inode *inode = &vp->vnode->vfs_inode; + loff_t old_i_size = i_size_read(inode); + + op->setattr.old_i_size = old_i_size; + afs_vnode_commit_status(op, vp); + /* inode->i_size has now been changed. 
*/ + + if (op->setattr.attr->ia_valid & ATTR_SIZE) { + loff_t size = op->setattr.attr->ia_size; + if (size > old_i_size) + pagecache_isize_extended(inode, old_i_size, size); + } +} + +static void afs_setattr_edit_file(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[0]; + struct inode *inode = &vp->vnode->vfs_inode; - afs_vnode_commit_status(op, &op->file[0]); if (op->setattr.attr->ia_valid & ATTR_SIZE) { - loff_t i_size = inode->i_size, size = op->setattr.attr->ia_size; - if (size > i_size) - pagecache_isize_extended(inode, i_size, size); - truncate_pagecache(inode, size); + loff_t size = op->setattr.attr->ia_size; + loff_t i_size = op->setattr.old_i_size; + + if (size < i_size) + truncate_pagecache(inode, size); } } @@ -825,6 +843,7 @@ static const struct afs_operation_ops afs_setattr_operation = { .issue_afs_rpc = afs_fs_setattr, .issue_yfs_rpc = yfs_fs_setattr, .success = afs_setattr_success, + .edit_dir = afs_setattr_edit_file, }; /* @@ -863,11 +882,16 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) if (S_ISREG(vnode->vfs_inode.i_mode)) filemap_write_and_wait(vnode->vfs_inode.i_mapping); + /* Prevent any new writebacks from starting whilst we do this. */ + down_write(&vnode->validate_lock); + op = afs_alloc_operation(((attr->ia_valid & ATTR_FILE) ? afs_file_key(attr->ia_file) : NULL), vnode->volume); - if (IS_ERR(op)) - return PTR_ERR(op); + if (IS_ERR(op)) { + ret = PTR_ERR(op); + goto out_unlock; + } afs_op_set_vnode(op, 0, vnode); op->setattr.attr = attr; @@ -880,5 +904,10 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) op->file[0].update_ctime = 1; op->ops = &afs_setattr_operation; - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + +out_unlock: + up_write(&vnode->validate_lock); + _leave(" = %d", ret); + return ret; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 18042b7dab6a..e5f0446f27e5 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -812,6 +812,7 @@ struct afs_operation { } store; struct { struct iattr *attr; + loff_t old_i_size; } setattr; struct afs_acl *acl; struct yfs_acl *yacl; diff --git a/fs/afs/write.c b/fs/afs/write.c index 4b2265cb1891..da12abd6db21 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -738,11 +738,21 @@ static int afs_writepages_region(struct address_space *mapping, int afs_writepages(struct address_space *mapping, struct writeback_control *wbc) { + struct afs_vnode *vnode = AFS_FS_I(mapping->host); pgoff_t start, end, next; int ret; _enter(""); + /* We have to be careful as we can end up racing with setattr() + * truncating the pagecache since the caller doesn't take a lock here + * to prevent it. 
+ */ + if (wbc->sync_mode == WB_SYNC_ALL) + down_read(&vnode->validate_lock); + else if (!down_read_trylock(&vnode->validate_lock)) + return 0; + if (wbc->range_cyclic) { start = mapping->writeback_index; end = -1; @@ -762,6 +772,7 @@ int afs_writepages(struct address_space *mapping, ret = afs_writepages_region(mapping, wbc, start, end, &next); } + up_read(&vnode->validate_lock); _leave(" = %d", ret); return ret; } -- cgit v1.2.3 From 92e3cc91d8f51ce64a8b7c696377180953dd316e Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 9 Oct 2020 14:11:58 +0100 Subject: afs: Fix rapid cell addition/removal by not using RCU on cells tree There are a number of problems that are being seen by rapidly mounting and unmounting an afs dynamic root with an explicit cell and volume specified (which should probably be rejected, but that's a separate issue): What the tests are doing is to look up/create a cell record for the name given and then tear it down again without actually using it to try to talk to a server. This is repeated endlessly, very fast, and the new cell collides with the old one if it's not quick enough to reuse it. It appears (as suggested by Hillf Danton) that the search through the RB tree under a read_seqbegin_or_lock() under RCU conditions isn't safe and that it's not blocking the write_seqlock(), despite taking two passes at it. He suggested that the code should take a ref on the cell it's attempting to look at - but this shouldn't be necessary until we've compared the cell names. It's possible that I'm missing a barrier somewhere. However, using an RCU search for this is overkill, really - we only need to access the cell name in a few places, and they're places where we may end up sleeping anyway. Fix this by switching to an R/W semaphore instead. Additionally, move the down_read() call inside the function (renamed to afs_find_cell()) since all the callers were taking the RCU read lock (or should've been[*]). [*] afs_probe_cell_name() should have been, but that doesn't appear to be involved in the bug reports. The symptoms of this look like: general protection fault, probably for non-canonical address 0xf27d208691691fdb: 0000 [#1] PREEMPT SMP KASAN KASAN: maybe wild-memory-access in range [0x93e924348b48fed8-0x93e924348b48fedf] ... RIP: 0010:strncasecmp lib/string.c:52 [inline] RIP: 0010:strncasecmp+0x5f/0x240 lib/string.c:43 afs_lookup_cell_rcu+0x313/0x720 fs/afs/cell.c:88 afs_lookup_cell+0x2ee/0x1440 fs/afs/cell.c:249 afs_parse_source fs/afs/super.c:290 [inline] ... Fixes: 989782dcdc91 ("afs: Overhaul cell database management") Reported-by: syzbot+459a5dce0b4cb70fd076@syzkaller.appspotmail.com Signed-off-by: David Howells cc: Hillf Danton cc: syzkaller-bugs@googlegroups.com --- fs/afs/cell.c | 131 ++++++++++++++++++++++++------------------------ fs/afs/dynroot.c | 21 ++++----- fs/afs/internal.h | 6 +-- fs/afs/main.c | 2 +- fs/afs/super.c | 4 +- 5 files changed, 71 insertions(+), 93 deletions(-) (limited to 'fs/afs') diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 5b79cdceefa0..5da83e84952a 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -41,15 +41,15 @@ static void afs_set_cell_timer(struct afs_net *net, time64_t delay) } /* - * Look up and get an activation reference on a cell record under RCU - * conditions. The caller must hold the RCU read lock. + * Look up and get an activation reference on a cell record. The caller must + * hold net->cells_lock at least read-locked. 
*/ -struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net, - const char *name, unsigned int namesz) +static struct afs_cell *afs_find_cell_locked(struct afs_net *net, + const char *name, unsigned int namesz) { struct afs_cell *cell = NULL; struct rb_node *p; - int n, seq = 0, ret = 0; + int n; _enter("%*.*s", namesz, namesz, name); @@ -58,61 +58,48 @@ struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net, if (namesz > AFS_MAXCELLNAME) return ERR_PTR(-ENAMETOOLONG); - do { - /* Unfortunately, rbtree walking doesn't give reliable results - * under just the RCU read lock, so we have to check for - * changes. - */ - if (cell) - afs_put_cell(net, cell); - cell = NULL; - ret = -ENOENT; - - read_seqbegin_or_lock(&net->cells_lock, &seq); - - if (!name) { - cell = rcu_dereference_raw(net->ws_cell); - if (cell) { - afs_get_cell(cell); - ret = 0; - break; - } - ret = -EDESTADDRREQ; - continue; - } + if (!name) { + cell = net->ws_cell; + if (!cell) + return ERR_PTR(-EDESTADDRREQ); + afs_get_cell(cell); + return cell; + } - p = rcu_dereference_raw(net->cells.rb_node); - while (p) { - cell = rb_entry(p, struct afs_cell, net_node); - - n = strncasecmp(cell->name, name, - min_t(size_t, cell->name_len, namesz)); - if (n == 0) - n = cell->name_len - namesz; - if (n < 0) { - p = rcu_dereference_raw(p->rb_left); - } else if (n > 0) { - p = rcu_dereference_raw(p->rb_right); - } else { - if (atomic_inc_not_zero(&cell->usage)) { - ret = 0; - break; - } - /* We want to repeat the search, this time with - * the lock properly locked. - */ - } - cell = NULL; - } + p = net->cells.rb_node; + while (p) { + cell = rb_entry(p, struct afs_cell, net_node); - } while (need_seqretry(&net->cells_lock, seq)); + n = strncasecmp(cell->name, name, + min_t(size_t, cell->name_len, namesz)); + if (n == 0) + n = cell->name_len - namesz; + if (n < 0) + p = p->rb_left; + else if (n > 0) + p = p->rb_right; + else + goto found; + } + + return ERR_PTR(-ENOENT); - done_seqretry(&net->cells_lock, seq); +found: + if (!atomic_inc_not_zero(&cell->usage)) + return ERR_PTR(-ENOENT); - if (ret != 0 && cell) - afs_put_cell(net, cell); + return cell; +} - return ret == 0 ? cell : ERR_PTR(ret); +struct afs_cell *afs_find_cell(struct afs_net *net, + const char *name, unsigned int namesz) +{ + struct afs_cell *cell; + + down_read(&net->cells_lock); + cell = afs_find_cell_locked(net, name, namesz); + up_read(&net->cells_lock); + return cell; } /* @@ -245,9 +232,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, _enter("%s,%s", name, vllist); if (!excl) { - rcu_read_lock(); - cell = afs_lookup_cell_rcu(net, name, namesz); - rcu_read_unlock(); + cell = afs_find_cell(net, name, namesz); if (!IS_ERR(cell)) goto wait_for_cell; } @@ -268,7 +253,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, /* Find the insertion point and check to see if someone else added a * cell whilst we were allocating. 
*/ - write_seqlock(&net->cells_lock); + down_write(&net->cells_lock); pp = &net->cells.rb_node; parent = NULL; @@ -293,7 +278,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, rb_link_node_rcu(&cell->net_node, parent, pp); rb_insert_color(&cell->net_node, &net->cells); atomic_inc(&net->cells_outstanding); - write_sequnlock(&net->cells_lock); + up_write(&net->cells_lock); queue_work(afs_wq, &cell->manager); @@ -323,7 +308,7 @@ cell_already_exists: afs_get_cell(cursor); ret = 0; } - write_sequnlock(&net->cells_lock); + up_write(&net->cells_lock); kfree(candidate); if (ret == 0) goto wait_for_cell; @@ -377,10 +362,10 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) afs_get_cell(new_root); /* install the new cell */ - write_seqlock(&net->cells_lock); - old_root = rcu_access_pointer(net->ws_cell); - rcu_assign_pointer(net->ws_cell, new_root); - write_sequnlock(&net->cells_lock); + down_write(&net->cells_lock); + old_root = net->ws_cell; + net->ws_cell = new_root; + up_write(&net->cells_lock); afs_put_cell(net, old_root); _leave(" = 0"); @@ -674,12 +659,12 @@ again: switch (cell->state) { case AFS_CELL_INACTIVE: case AFS_CELL_FAILED: - write_seqlock(&net->cells_lock); + down_write(&net->cells_lock); usage = 1; deleted = atomic_try_cmpxchg_relaxed(&cell->usage, &usage, 0); if (deleted) rb_erase(&cell->net_node, &net->cells); - write_sequnlock(&net->cells_lock); + up_write(&net->cells_lock); if (deleted) goto final_destruction; if (cell->state == AFS_CELL_FAILED) @@ -779,7 +764,7 @@ void afs_manage_cells(struct work_struct *work) * lack of use and cells whose DNS results have expired and dispatch * their managers. */ - read_seqlock_excl(&net->cells_lock); + down_read(&net->cells_lock); for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) { struct afs_cell *cell = @@ -824,7 +809,7 @@ void afs_manage_cells(struct work_struct *work) queue_work(afs_wq, &cell->manager); } - read_sequnlock_excl(&net->cells_lock); + up_read(&net->cells_lock); /* Update the timer on the way out. 
We have to pass an increment on * cells_outstanding in the namespace that we are in to the timer or @@ -854,10 +839,10 @@ void afs_cell_purge(struct afs_net *net) _enter(""); - write_seqlock(&net->cells_lock); - ws = rcu_access_pointer(net->ws_cell); - RCU_INIT_POINTER(net->ws_cell, NULL); - write_sequnlock(&net->cells_lock); + down_write(&net->cells_lock); + ws = net->ws_cell; + net->ws_cell = NULL; + up_write(&net->cells_lock); afs_put_cell(net, ws); _debug("del timer"); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 7b784af604fd..5b8de4fee6cd 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -123,7 +123,7 @@ static int afs_probe_cell_name(struct dentry *dentry) len--; } - cell = afs_lookup_cell_rcu(net, name, len); + cell = afs_find_cell(net, name, len); if (!IS_ERR(cell)) { afs_put_cell(net, cell); return 0; @@ -179,7 +179,6 @@ static struct dentry *afs_lookup_atcell(struct dentry *dentry) struct afs_cell *cell; struct afs_net *net = afs_d2net(dentry); struct dentry *ret; - unsigned int seq = 0; char *name; int len; @@ -191,17 +190,13 @@ static struct dentry *afs_lookup_atcell(struct dentry *dentry) if (!name) goto out_p; - rcu_read_lock(); - do { - read_seqbegin_or_lock(&net->cells_lock, &seq); - cell = rcu_dereference_raw(net->ws_cell); - if (cell) { - len = cell->name_len; - memcpy(name, cell->name, len + 1); - } - } while (need_seqretry(&net->cells_lock, seq)); - done_seqretry(&net->cells_lock, seq); - rcu_read_unlock(); + down_read(&net->cells_lock); + cell = net->ws_cell; + if (cell) { + len = cell->name_len; + memcpy(name, cell->name, len + 1); + } + up_read(&net->cells_lock); ret = ERR_PTR(-ENOENT); if (!cell) diff --git a/fs/afs/internal.h b/fs/afs/internal.h index e5f0446f27e5..257c0f07742f 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -263,11 +263,11 @@ struct afs_net { /* Cell database */ struct rb_root cells; - struct afs_cell __rcu *ws_cell; + struct afs_cell *ws_cell; struct work_struct cells_manager; struct timer_list cells_timer; atomic_t cells_outstanding; - seqlock_t cells_lock; + struct rw_semaphore cells_lock; struct mutex cells_alias_lock; struct mutex proc_cells_lock; @@ -917,7 +917,7 @@ static inline bool afs_cb_is_broken(unsigned int cb_break, * cell.c */ extern int afs_cell_init(struct afs_net *, const char *); -extern struct afs_cell *afs_lookup_cell_rcu(struct afs_net *, const char *, unsigned); +extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned); extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned, const char *, bool); extern struct afs_cell *afs_get_cell(struct afs_cell *); diff --git a/fs/afs/main.c b/fs/afs/main.c index 31b472f7c734..accdd8970e7c 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -78,7 +78,7 @@ static int __net_init afs_net_init(struct net *net_ns) mutex_init(&net->socket_mutex); net->cells = RB_ROOT; - seqlock_init(&net->cells_lock); + init_rwsem(&net->cells_lock); INIT_WORK(&net->cells_manager, afs_manage_cells); timer_setup(&net->cells_timer, afs_cells_timer, 0); diff --git a/fs/afs/super.c b/fs/afs/super.c index b552357b1d13..0be99016ecfb 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -634,9 +634,7 @@ static int afs_init_fs_context(struct fs_context *fc) ctx->net = afs_net(fc->net_ns); /* Default to the workstation cell. 
*/ - rcu_read_lock(); - cell = afs_lookup_cell_rcu(ctx->net, NULL, 0); - rcu_read_unlock(); + cell = afs_find_cell(ctx->net, NULL, 0); if (IS_ERR(cell)) cell = NULL; ctx->cell = cell; -- cgit v1.2.3 From 88c853c3f5c0a07c5db61b494ee25152535cfeee Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 23 Jul 2019 11:24:59 +0100 Subject: afs: Fix cell refcounting by splitting the usage counter Management of the lifetime of the afs_cell struct has some problems due to the usage counter being used to determine whether objects of that type are in use in addition to whether anyone might be interested in the structure. This is made trickier by cell objects being cached for a period of time in case they're quickly reused as they hold the result of a setup process that may be slow (DNS lookups, AFS RPC ops). Problems include the cached root volume from alias resolution pinning its parent cell record, rmmod occasionally hanging and occasionally producing assertion failures. Fix this by splitting the count of active users from the struct reference count. Things then work as follows: (1) The cell cache keeps +1 on the cell's activity count and this has to be dropped before the cell can be removed. afs_manage_cell() tries to exchange the 1 to a 0 with the cells_lock write-locked, and if successful, the record is removed from net->cells. (2) One struct ref is 'owned' by the activity count. That is put when the active count is reduced to 0 (final_destruction label). (3) A ref can be held on a cell whilst it is queued for management on a work queue without confusing the active count. afs_queue_cell() is added to wrap this. (4) The queue's ref is dropped at the end of the management. This is split out into a separate function, afs_manage_cell_work(). (5) The root volume record is put after a cell is removed (at the final_destruction label) rather than in the RCU destruction routine. (6) Volumes hold struct refs, but aren't active users. (7) Both counts are displayed in /proc/net/afs/cells. There are some management function changes: (*) afs_put_cell() now just decrements the refcount and triggers the RCU destruction if it becomes 0. It no longer sets a timer to have the manager do this. (*) afs_use_cell() and afs_unuse_cell() are added to increase and decrease the active count. afs_unuse_cell() sets the management timer. (*) afs_queue_cell() is added to queue a cell with appropriate refs. There are also some other fixes: (*) Don't let /proc/net/afs/cells access a cell's vllist if it's NULL. (*) Make sure that candidate cells in lookups are properly destroyed rather than being simply kfree'd. This ensures that the bits they point to are destroyed also. (*) afs_dec_cells_outstanding() is now called in cell destruction rather than at "final_destruction". This ensures that cell->net is still valid to the end of the destructor. (*) As a consequence of the previous two changes, move the increment of net->cells_outstanding that was at the point of insertion into the tree to the allocation routine to correctly balance things.
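[Editor's illustration - not part of the patch: the two-counter scheme described above can be modelled in a few lines of userspace C. The helper names merely mirror the kernel ones; the RCU destruction, GC timer and workqueue are reduced to comments, so this is a sketch of the counting rules only.]

	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct cell {
		atomic_int ref;		/* struct refcount: how many pointers exist */
		atomic_int active;	/* active-user count: is the cell in use? */
	};

	static struct cell *cell_alloc(void)
	{
		struct cell *c = calloc(1, sizeof(*c));
		atomic_init(&c->ref, 1);	/* this ref is owned by the activity count */
		atomic_init(&c->active, 0);
		return c;
	}

	/* afs_get_cell()/afs_put_cell() analogues: govern the memory only. */
	static void cell_get(struct cell *c) { atomic_fetch_add(&c->ref, 1); }
	static void cell_put(struct cell *c)
	{
		if (atomic_fetch_sub(&c->ref, 1) == 1)	/* ref fell to 0 */
			free(c);			/* kernel: call_rcu() -> destroy */
	}

	/* afs_use_cell()/afs_unuse_cell() analogues: govern actual usage. */
	static void cell_use(struct cell *c) { atomic_fetch_add(&c->active, 1); }
	static void cell_unuse(struct cell *c)
	{
		if (atomic_fetch_sub(&c->active, 1) == 2)	/* active fell to 1 */
			puts("GC candidate");	/* kernel: set the management timer */
	}

	int main(void)
	{
		struct cell *c = cell_alloc();

		cell_use(c); cell_use(c);	/* as at insertion: cache pin + caller */
		cell_get(c);			/* afs_queue_cell(): ref for the work item */
		cell_put(c);			/* afs_manage_cell_work(): drop it after */
		cell_unuse(c);			/* caller finished; the cache pin remains */

		/* The manager then exchanges active 1->0 under cells_lock,
		 * unpublishes the cell and drops the activity count's ref: */
		atomic_store(&c->active, 0);
		cell_put(c);			/* ref 1->0: destroy */
		return 0;
	}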
Fixes: 989782dcdc91 ("afs: Overhaul cell database management") Signed-off-by: David Howells --- fs/afs/cell.c | 149 ++++++++++++++++++++++++++++++++++++----------------- fs/afs/dynroot.c | 2 +- fs/afs/internal.h | 8 ++- fs/afs/mntpt.c | 4 +- fs/afs/proc.c | 23 ++++----- fs/afs/super.c | 12 ++--- fs/afs/vl_alias.c | 8 +-- fs/afs/vl_rotate.c | 2 +- fs/afs/volume.c | 4 +- 9 files changed, 136 insertions(+), 76 deletions(-) (limited to 'fs/afs') diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 5da83e84952a..c906000b0ff8 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -19,7 +19,7 @@ static unsigned __read_mostly afs_cell_gc_delay = 10; static unsigned __read_mostly afs_cell_min_ttl = 10 * 60; static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60; -static void afs_manage_cell(struct work_struct *); +static void afs_manage_cell_work(struct work_struct *); static void afs_dec_cells_outstanding(struct afs_net *net) { @@ -62,8 +62,7 @@ static struct afs_cell *afs_find_cell_locked(struct afs_net *net, cell = net->ws_cell; if (!cell) return ERR_PTR(-EDESTADDRREQ); - afs_get_cell(cell); - return cell; + goto found; } p = net->cells.rb_node; @@ -85,12 +84,12 @@ static struct afs_cell *afs_find_cell_locked(struct afs_net *net, return ERR_PTR(-ENOENT); found: - if (!atomic_inc_not_zero(&cell->usage)) - return ERR_PTR(-ENOENT); - - return cell; + return afs_use_cell(cell); } +/* + * Look up and get an activation reference on a cell record. + */ struct afs_cell *afs_find_cell(struct afs_net *net, const char *name, unsigned int namesz) { @@ -153,8 +152,9 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, cell->name[i] = tolower(name[i]); cell->name[i] = 0; - atomic_set(&cell->usage, 2); - INIT_WORK(&cell->manager, afs_manage_cell); + atomic_set(&cell->ref, 1); + atomic_set(&cell->active, 0); + INIT_WORK(&cell->manager, afs_manage_cell_work); cell->volumes = RB_ROOT; INIT_HLIST_HEAD(&cell->proc_volumes); seqlock_init(&cell->volume_lock); @@ -193,6 +193,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, cell->dns_source = vllist->source; cell->dns_status = vllist->status; smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */ + atomic_inc(&net->cells_outstanding); _leave(" = %p", cell); return cell; @@ -275,12 +276,12 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, cell = candidate; candidate = NULL; + atomic_set(&cell->active, 2); rb_link_node_rcu(&cell->net_node, parent, pp); rb_insert_color(&cell->net_node, &net->cells); - atomic_inc(&net->cells_outstanding); up_write(&net->cells_lock); - queue_work(afs_wq, &cell->manager); + afs_queue_cell(cell); wait_for_cell: _debug("wait_for_cell"); @@ -305,16 +306,17 @@ cell_already_exists: if (excl) { ret = -EEXIST; } else { - afs_get_cell(cursor); + afs_use_cell(cursor); ret = 0; } up_write(&net->cells_lock); - kfree(candidate); + if (candidate) + afs_put_cell(candidate); if (ret == 0) goto wait_for_cell; goto error_noput; error: - afs_put_cell(net, cell); + afs_unuse_cell(net, cell); error_noput: _leave(" = %d [error]", ret); return ERR_PTR(ret); @@ -359,7 +361,7 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) } if (!test_and_set_bit(AFS_CELL_FL_NO_GC, &new_root->flags)) - afs_get_cell(new_root); + afs_use_cell(new_root); /* install the new cell */ down_write(&net->cells_lock); @@ -367,7 +369,7 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) net->ws_cell = new_root; up_write(&net->cells_lock); - afs_put_cell(net, old_root); + afs_unuse_cell(net, old_root); _leave(" = 0"); 
return 0; } @@ -473,18 +475,21 @@ out_wake: static void afs_cell_destroy(struct rcu_head *rcu) { struct afs_cell *cell = container_of(rcu, struct afs_cell, rcu); + struct afs_net *net = cell->net; + int u; _enter("%p{%s}", cell, cell->name); - ASSERTCMP(atomic_read(&cell->usage), ==, 0); + u = atomic_read(&cell->ref); + ASSERTCMP(u, ==, 0); - afs_put_volume(cell->net, cell->root_volume, afs_volume_trace_put_cell_root); - afs_put_vlserverlist(cell->net, rcu_access_pointer(cell->vl_servers)); - afs_put_cell(cell->net, cell->alias_of); + afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers)); + afs_unuse_cell(net, cell->alias_of); key_put(cell->anonymous_key); kfree(cell->name); kfree(cell); + afs_dec_cells_outstanding(net); _leave(" [destroyed]"); } @@ -519,16 +524,50 @@ void afs_cells_timer(struct timer_list *timer) */ struct afs_cell *afs_get_cell(struct afs_cell *cell) { - atomic_inc(&cell->usage); + if (atomic_read(&cell->ref) <= 0) + BUG(); + + atomic_inc(&cell->ref); return cell; } /* * Drop a reference on a cell record. */ -void afs_put_cell(struct afs_net *net, struct afs_cell *cell) +void afs_put_cell(struct afs_cell *cell) +{ + if (cell) { + unsigned int u, a; + + u = atomic_dec_return(&cell->ref); + if (u == 0) { + a = atomic_read(&cell->active); + WARN(a != 0, "Cell active count %u > 0\n", a); + call_rcu(&cell->rcu, afs_cell_destroy); + } + } +} + +/* + * Note a cell becoming more active. + */ +struct afs_cell *afs_use_cell(struct afs_cell *cell) +{ + if (atomic_read(&cell->ref) <= 0) + BUG(); + + atomic_inc(&cell->active); + return cell; +} + +/* + * Record a cell becoming less active. When the active counter reaches 1, it + * is scheduled for destruction, but may get reactivated. + */ +void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell) { time64_t now, expire_delay; + int a; if (!cell) return; @@ -541,11 +580,21 @@ void afs_put_cell(struct afs_net *net, struct afs_cell *cell) if (cell->vl_servers->nr_servers) expire_delay = afs_cell_gc_delay; - if (atomic_dec_return(&cell->usage) > 1) - return; + a = atomic_dec_return(&cell->active); + WARN_ON(a == 0); + if (a == 1) + /* 'cell' may now be garbage collected. */ + afs_set_cell_timer(net, expire_delay); +} - /* 'cell' may now be garbage collected. */ - afs_set_cell_timer(net, expire_delay); +/* + * Queue a cell for management, giving the workqueue a ref to hold. + */ +void afs_queue_cell(struct afs_cell *cell) +{ + afs_get_cell(cell); + if (!queue_work(afs_wq, &cell->manager)) + afs_put_cell(cell); } /* @@ -645,12 +694,11 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) * Manage a cell record, initialising and destroying it, maintaining its DNS * records. 
*/ -static void afs_manage_cell(struct work_struct *work) +static void afs_manage_cell(struct afs_cell *cell) { - struct afs_cell *cell = container_of(work, struct afs_cell, manager); struct afs_net *net = cell->net; bool deleted; - int ret, usage; + int ret, active; _enter("%s", cell->name); @@ -660,10 +708,11 @@ again: case AFS_CELL_INACTIVE: case AFS_CELL_FAILED: down_write(&net->cells_lock); - usage = 1; - deleted = atomic_try_cmpxchg_relaxed(&cell->usage, &usage, 0); - if (deleted) + active = 1; + deleted = atomic_try_cmpxchg_relaxed(&cell->active, &active, 0); + if (deleted) { rb_erase(&cell->net_node, &net->cells); + } up_write(&net->cells_lock); if (deleted) goto final_destruction; @@ -688,7 +737,7 @@ again: goto again; case AFS_CELL_ACTIVE: - if (atomic_read(&cell->usage) > 1) { + if (atomic_read(&cell->active) > 1) { if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) { ret = afs_update_cell(cell); if (ret < 0) @@ -701,7 +750,7 @@ again: goto again; case AFS_CELL_DEACTIVATING: - if (atomic_read(&cell->usage) > 1) + if (atomic_read(&cell->active) > 1) goto reverse_deactivation; afs_deactivate_cell(net, cell); smp_store_release(&cell->state, AFS_CELL_INACTIVE); @@ -733,9 +782,18 @@ done: return; final_destruction: - call_rcu(&cell->rcu, afs_cell_destroy); - afs_dec_cells_outstanding(net); - _leave(" [destruct %d]", atomic_read(&net->cells_outstanding)); + /* The root volume is pinning the cell */ + afs_put_volume(cell->net, cell->root_volume, afs_volume_trace_put_cell_root); + cell->root_volume = NULL; + afs_put_cell(cell); +} + +static void afs_manage_cell_work(struct work_struct *work) +{ + struct afs_cell *cell = container_of(work, struct afs_cell, manager); + + afs_manage_cell(cell); + afs_put_cell(cell); } /* @@ -769,21 +827,20 @@ void afs_manage_cells(struct work_struct *work) for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) { struct afs_cell *cell = rb_entry(cursor, struct afs_cell, net_node); - unsigned usage; + unsigned active; bool sched_cell = false; - usage = atomic_read(&cell->usage); - _debug("manage %s %u", cell->name, usage); + active = atomic_read(&cell->active); + _debug("manage %s %u %u", cell->name, atomic_read(&cell->ref), active); - ASSERTCMP(usage, >=, 1); + ASSERTCMP(active, >=, 1); if (purging) { if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) - usage = atomic_dec_return(&cell->usage); - ASSERTCMP(usage, ==, 1); + atomic_dec(&cell->active); } - if (usage == 1) { + if (active == 1) { struct afs_vlserver_list *vllist; time64_t expire_at = cell->last_inactive; @@ -806,7 +863,7 @@ void afs_manage_cells(struct work_struct *work) } if (sched_cell) - queue_work(afs_wq, &cell->manager); + afs_queue_cell(cell); } up_read(&net->cells_lock); @@ -843,7 +900,7 @@ void afs_cell_purge(struct afs_net *net) ws = net->ws_cell; net->ws_cell = NULL; up_write(&net->cells_lock); - afs_put_cell(net, ws); + afs_unuse_cell(net, ws); _debug("del timer"); if (del_timer_sync(&net->cells_timer)) diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 5b8de4fee6cd..da32797dd425 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -125,7 +125,7 @@ static int afs_probe_cell_name(struct dentry *dentry) cell = afs_find_cell(net, name, len); if (!IS_ERR(cell)) { - afs_put_cell(net, cell); + afs_unuse_cell(net, cell); return 0; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 257c0f07742f..0363511290c8 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -363,7 +363,8 @@ struct afs_cell { #endif time64_t dns_expiry; /* Time AFSDB/SRV 
record expires */ time64_t last_inactive; /* Time of last drop of usage count */ - atomic_t usage; + atomic_t ref; /* Struct refcount */ + atomic_t active; /* Active usage counter */ unsigned long flags; #define AFS_CELL_FL_NO_GC 0 /* The cell was added manually, don't auto-gc */ #define AFS_CELL_FL_DO_LOOKUP 1 /* DNS lookup requested */ @@ -920,8 +921,11 @@ extern int afs_cell_init(struct afs_net *, const char *); extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned); extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned, const char *, bool); +extern struct afs_cell *afs_use_cell(struct afs_cell *); +extern void afs_unuse_cell(struct afs_net *, struct afs_cell *); extern struct afs_cell *afs_get_cell(struct afs_cell *); -extern void afs_put_cell(struct afs_net *, struct afs_cell *); +extern void afs_put_cell(struct afs_cell *); +extern void afs_queue_cell(struct afs_cell *); extern void afs_manage_cells(struct work_struct *); extern void afs_cells_timer(struct timer_list *); extern void __net_exit afs_cell_purge(struct afs_net *); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 79bc5f1338ed..c69a0282960c 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -88,7 +88,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) ctx->force = true; } if (ctx->cell) { - afs_put_cell(ctx->net, ctx->cell); + afs_unuse_cell(ctx->net, ctx->cell); ctx->cell = NULL; } if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { @@ -124,7 +124,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) char *buf; if (src_as->cell) - ctx->cell = afs_get_cell(src_as->cell); + ctx->cell = afs_use_cell(src_as->cell); if (size < 2 || size > PAGE_SIZE - 1) return -EINVAL; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index e8babb62ed44..76fbe0560cfb 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -38,7 +38,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v) if (v == SEQ_START_TOKEN) { /* display header on line 1 */ - seq_puts(m, "USE TTL SV ST NAME\n"); + seq_puts(m, "USE ACT TTL SV ST NAME\n"); return 0; } @@ -46,10 +46,11 @@ static int afs_proc_cells_show(struct seq_file *m, void *v) vllist = rcu_dereference(cell->vl_servers); /* display one cell per line on subsequent lines */ - seq_printf(m, "%3u %6lld %2u %2u %s\n", - atomic_read(&cell->usage), + seq_printf(m, "%3u %3u %6lld %2u %2u %s\n", + atomic_read(&cell->ref), + atomic_read(&cell->active), cell->dns_expiry - ktime_get_real_seconds(), - vllist->nr_servers, + vllist ? 
vllist->nr_servers : 0, cell->state, cell->name); return 0; @@ -128,7 +129,7 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size) } if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags)) - afs_put_cell(net, cell); + afs_unuse_cell(net, cell); } else { goto inval; } @@ -154,13 +155,11 @@ static int afs_proc_rootcell_show(struct seq_file *m, void *v) struct afs_net *net; net = afs_seq2net_single(m); - if (rcu_access_pointer(net->ws_cell)) { - rcu_read_lock(); - cell = rcu_dereference(net->ws_cell); - if (cell) - seq_printf(m, "%s\n", cell->name); - rcu_read_unlock(); - } + down_read(&net->cells_lock); + cell = net->ws_cell; + if (cell) + seq_printf(m, "%s\n", cell->name); + up_read(&net->cells_lock); return 0; } diff --git a/fs/afs/super.c b/fs/afs/super.c index 0be99016ecfb..e72c223f831d 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -294,7 +294,7 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param) cellnamesz, cellnamesz, cellname ?: ""); return PTR_ERR(cell); } - afs_put_cell(ctx->net, ctx->cell); + afs_unuse_cell(ctx->net, ctx->cell); ctx->cell = cell; } @@ -389,8 +389,8 @@ static int afs_validate_fc(struct fs_context *fc) _debug("switch to alias"); key_put(ctx->key); ctx->key = NULL; - cell = afs_get_cell(ctx->cell->alias_of); - afs_put_cell(ctx->net, ctx->cell); + cell = afs_use_cell(ctx->cell->alias_of); + afs_unuse_cell(ctx->net, ctx->cell); ctx->cell = cell; goto reget_key; } @@ -508,7 +508,7 @@ static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc) if (ctx->dyn_root) { as->dyn_root = true; } else { - as->cell = afs_get_cell(ctx->cell); + as->cell = afs_use_cell(ctx->cell); as->volume = afs_get_volume(ctx->volume, afs_volume_trace_get_alloc_sbi); } @@ -521,7 +521,7 @@ static void afs_destroy_sbi(struct afs_super_info *as) if (as) { struct afs_net *net = afs_net(as->net_ns); afs_put_volume(net, as->volume, afs_volume_trace_put_destroy_sbi); - afs_put_cell(net, as->cell); + afs_unuse_cell(net, as->cell); put_net(as->net_ns); kfree(as); } @@ -607,7 +607,7 @@ static void afs_free_fc(struct fs_context *fc) afs_destroy_sbi(fc->s_fs_info); afs_put_volume(ctx->net, ctx->volume, afs_volume_trace_put_free_fc); - afs_put_cell(ctx->net, ctx->cell); + afs_unuse_cell(ctx->net, ctx->cell); key_put(ctx->key); kfree(ctx); } diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index 5082ef04e99c..ddb4cb67d0fd 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -177,7 +177,7 @@ static int afs_compare_cell_roots(struct afs_cell *cell) is_alias: rcu_read_unlock(); - cell->alias_of = afs_get_cell(p); + cell->alias_of = afs_use_cell(p); return 1; } @@ -247,18 +247,18 @@ static int afs_query_for_alias(struct afs_cell *cell, struct key *key) continue; if (p->root_volume) continue; /* Ignore cells that have a root.cell volume. 
*/ - afs_get_cell(p); + afs_use_cell(p); mutex_unlock(&cell->net->proc_cells_lock); if (afs_query_for_alias_one(cell, key, p) != 0) goto is_alias; if (mutex_lock_interruptible(&cell->net->proc_cells_lock) < 0) { - afs_put_cell(cell->net, p); + afs_unuse_cell(cell->net, p); return -ERESTARTSYS; } - afs_put_cell(cell->net, p); + afs_unuse_cell(cell->net, p); } mutex_unlock(&cell->net->proc_cells_lock); diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index c0458c903b31..da3b072d4d63 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -45,7 +45,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) cell->dns_expiry <= ktime_get_real_seconds()) { dns_lookup_count = smp_load_acquire(&cell->dns_lookup_count); set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags); - queue_work(afs_wq, &cell->manager); + afs_queue_cell(cell); if (cell->dns_source == DNS_RECORD_UNAVAILABLE) { if (wait_var_event_interruptible( diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 9bc0509e3634..a838030e9563 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -106,7 +106,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, return volume; error_1: - afs_put_cell(params->net, volume->cell); + afs_put_cell(volume->cell); kfree(volume); error_0: return ERR_PTR(ret); @@ -228,7 +228,7 @@ static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume) afs_remove_volume_from_cell(volume); afs_put_serverlist(net, rcu_access_pointer(volume->servers)); - afs_put_cell(net, volume->cell); + afs_put_cell(volume->cell); trace_afs_volume(volume->vid, atomic_read(&volume->usage), afs_volume_trace_free); kfree_rcu(volume, rcu); -- cgit v1.2.3 From 286377f6bdf71568a4cf07104fe44006ae0dba6d Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Oct 2020 11:05:01 +0100 Subject: afs: Fix cell purging with aliases When the afs module is removed, one of the things that has to be done is to purge the cell database. afs_cell_purge() cancels the management timer and then starts the cell manager work item to do the purging. This does a single run through and then assumes that all cells are now purged - but this is no longer the case. With the introduction of alias detection, a later cell in the database can now be holding an active count on an earlier cell (cell->alias_of). The purge scan passes by the earlier cell first, but that cell can't be got rid of until the later cell has discarded the alias. Ordinarily, afs_unuse_cell() would handle this by setting the management timer to trigger another pass - but afs_set_cell_timer() doesn't do anything if the namespace is being removed (net->live == false). rmmod then hangs in the wait on cells_outstanding in afs_cell_purge(). Fix this by making afs_set_cell_timer() directly queue the cell manager if net->live is false. This causes additional management passes. Queueing the cell manager increments cells_outstanding to make sure the wait won't complete until all cells are destroyed.
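[Editor's illustration - not from the patch: a toy userspace model of why a single purge pass is not enough once aliases pin earlier cells. In the kernel the extra passes come from requeueing the cell manager, not from a loop as below; the types and names here are invented for the sketch.]

	#include <stdbool.h>
	#include <stdio.h>

	struct cell {
		const char *name;
		int active;		/* active-user count */
		struct cell *alias_of;	/* holds an active count on another cell */
	};

	/* One manager pass: destroy any cell with no active users, dropping
	 * the active count it holds on its alias target. */
	static bool purge_pass(struct cell **cells, int n)
	{
		bool progress = false;
		for (int i = 0; i < n; i++) {
			struct cell *c = cells[i];
			if (c && c->active == 0) {
				if (c->alias_of)
					c->alias_of->active--;
				printf("destroyed %s\n", c->name);
				cells[i] = NULL;
				progress = true;
			}
		}
		return progress;
	}

	int main(void)
	{
		struct cell earlier = { "earlier", 1, NULL };	/* pinned by 'later' */
		struct cell later = { "later", 0, &earlier };
		struct cell *cells[] = { &earlier, &later };

		/* The first pass frees only 'later'; that unpins 'earlier',
		 * which a second pass can then free - hence the need to keep
		 * requeueing the manager when net->live is false and the
		 * timer can no longer be set. */
		while (purge_pass(cells, 2))
			;
		return 0;
	}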
Fixes: 8a070a964877 ("afs: Detect cell aliases 1 - Cells with root volumes") Signed-off-by: David Howells --- fs/afs/cell.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/afs') diff --git a/fs/afs/cell.c b/fs/afs/cell.c index c906000b0ff8..1944be78e9b0 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -19,6 +19,7 @@ static unsigned __read_mostly afs_cell_gc_delay = 10; static unsigned __read_mostly afs_cell_min_ttl = 10 * 60; static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60; +static void afs_queue_cell_manager(struct afs_net *); static void afs_manage_cell_work(struct work_struct *); static void afs_dec_cells_outstanding(struct afs_net *net) @@ -37,6 +38,8 @@ static void afs_set_cell_timer(struct afs_net *net, time64_t delay) atomic_inc(&net->cells_outstanding); if (timer_reduce(&net->cells_timer, jiffies + delay * HZ)) afs_dec_cells_outstanding(net); + } else { + afs_queue_cell_manager(net); } } -- cgit v1.2.3 From 1d0e850a49a5b56f8f3cb51e74a11e2fedb96be6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 16 Oct 2020 13:21:14 +0100 Subject: afs: Fix cell removal Fix cell removal by inserting a more final state than AFS_CELL_FAILED that indicates that the cell has been unpublished in case the manager is already requeued and will go through again. The new AFS_CELL_REMOVED state will just immediately leave the manager function. Going through a second time in the AFS_CELL_FAILED state will cause it to try to remove the cell again, potentially leading to the proc list being removed. Fixes: 989782dcdc91 ("afs: Overhaul cell database management") Reported-by: syzbot+b994ecf2b023f14832c1@syzkaller.appspotmail.com Reported-by: syzbot+0e0db88e1eb44a91ae8d@syzkaller.appspotmail.com Reported-by: syzbot+2d0585e5efcd43d113c2@syzkaller.appspotmail.com Reported-by: syzbot+1ecc2f9d3387f1d79d42@syzkaller.appspotmail.com Reported-by: syzbot+18d51774588492bf3f69@syzkaller.appspotmail.com Reported-by: syzbot+a5e4946b04d6ca8fa5f3@syzkaller.appspotmail.com Suggested-by: Hillf Danton Signed-off-by: David Howells cc: Hillf Danton --- fs/afs/cell.c | 16 ++++++++++------ fs/afs/internal.h | 1 + 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'fs/afs') diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 1944be78e9b0..bc7ed46aaca9 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -291,11 +291,11 @@ wait_for_cell: wait_var_event(&cell->state, ({ state = smp_load_acquire(&cell->state); /* vs error */ - state == AFS_CELL_ACTIVE || state == AFS_CELL_FAILED; + state == AFS_CELL_ACTIVE || state == AFS_CELL_REMOVED; })); /* Check the state obtained from the wait check. 
*/ - if (state == AFS_CELL_FAILED) { + if (state == AFS_CELL_REMOVED) { ret = cell->error; goto error; } @@ -700,7 +700,6 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) static void afs_manage_cell(struct afs_cell *cell) { struct afs_net *net = cell->net; - bool deleted; int ret, active; _enter("%s", cell->name); @@ -712,13 +711,15 @@ again: case AFS_CELL_FAILED: down_write(&net->cells_lock); active = 1; - deleted = atomic_try_cmpxchg_relaxed(&cell->active, &active, 0); - if (deleted) { + if (atomic_try_cmpxchg_relaxed(&cell->active, &active, 0)) { rb_erase(&cell->net_node, &net->cells); + smp_store_release(&cell->state, AFS_CELL_REMOVED); } up_write(&net->cells_lock); - if (deleted) + if (cell->state == AFS_CELL_REMOVED) { + wake_up_var(&cell->state); goto final_destruction; + } if (cell->state == AFS_CELL_FAILED) goto done; smp_store_release(&cell->state, AFS_CELL_UNSET); @@ -760,6 +761,9 @@ again: wake_up_var(&cell->state); goto again; + case AFS_CELL_REMOVED: + goto done; + default: break; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 0363511290c8..06e617ee4cd1 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -326,6 +326,7 @@ enum afs_cell_state { AFS_CELL_DEACTIVATING, AFS_CELL_INACTIVE, AFS_CELL_FAILED, + AFS_CELL_REMOVED, }; /* -- cgit v1.2.3 From dca54a7bbb8ca9148ae10d60c66c926e222a9c4b Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Oct 2020 20:51:59 +0100 Subject: afs: Add tracing for cell refcount and active user count Add a tracepoint to log the cell refcount and active user count and pass in a reason code through various functions that manipulate these counters. Additionally, a helper function, afs_see_cell(), is provided to log interesting places that deal with a cell without actually doing any accounting directly. Signed-off-by: David Howells --- fs/afs/cell.c | 99 ++++++++++++++++++++++++++++++++++++++---------------- fs/afs/dynroot.c | 4 +-- fs/afs/internal.h | 15 +++++---- fs/afs/mntpt.c | 4 +-- fs/afs/proc.c | 2 +- fs/afs/super.c | 16 +++++---- fs/afs/vl_alias.c | 8 ++--- fs/afs/vl_rotate.c | 2 +- fs/afs/volume.c | 6 ++-- 9 files changed, 101 insertions(+), 55 deletions(-) (limited to 'fs/afs') diff --git a/fs/afs/cell.c b/fs/afs/cell.c index bc7ed46aaca9..52233fa6195f 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -18,6 +18,7 @@ static unsigned __read_mostly afs_cell_gc_delay = 10; static unsigned __read_mostly afs_cell_min_ttl = 10 * 60; static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60; +static atomic_t cell_debug_id; static void afs_queue_cell_manager(struct afs_net *); static void afs_manage_cell_work(struct work_struct *); @@ -48,7 +49,8 @@ static void afs_set_cell_timer(struct afs_net *net, time64_t delay) * hold net->cells_lock at least read-locked. */ static struct afs_cell *afs_find_cell_locked(struct afs_net *net, - const char *name, unsigned int namesz) + const char *name, unsigned int namesz, + enum afs_cell_trace reason) { struct afs_cell *cell = NULL; struct rb_node *p; @@ -87,19 +89,20 @@ static struct afs_cell *afs_find_cell_locked(struct afs_net *net, return ERR_PTR(-ENOENT); found: - return afs_use_cell(cell); + return afs_use_cell(cell, reason); } /* * Look up and get an activation reference on a cell record. 
*/ struct afs_cell *afs_find_cell(struct afs_net *net, - const char *name, unsigned int namesz) + const char *name, unsigned int namesz, + enum afs_cell_trace reason) { struct afs_cell *cell; down_read(&net->cells_lock); - cell = afs_find_cell_locked(net, name, namesz); + cell = afs_find_cell_locked(net, name, namesz, reason); up_read(&net->cells_lock); return cell; } @@ -197,6 +200,8 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, cell->dns_status = vllist->status; smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */ atomic_inc(&net->cells_outstanding); + cell->debug_id = atomic_inc_return(&cell_debug_id); + trace_afs_cell(cell->debug_id, 1, 0, afs_cell_trace_alloc); _leave(" = %p", cell); return cell; @@ -236,7 +241,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, _enter("%s,%s", name, vllist); if (!excl) { - cell = afs_find_cell(net, name, namesz); + cell = afs_find_cell(net, name, namesz, afs_cell_trace_use_lookup); if (!IS_ERR(cell)) goto wait_for_cell; } @@ -280,13 +285,16 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, cell = candidate; candidate = NULL; atomic_set(&cell->active, 2); + trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), 2, afs_cell_trace_insert); rb_link_node_rcu(&cell->net_node, parent, pp); rb_insert_color(&cell->net_node, &net->cells); up_write(&net->cells_lock); - afs_queue_cell(cell); + afs_queue_cell(cell, afs_cell_trace_get_queue_new); wait_for_cell: + trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), atomic_read(&cell->active), + afs_cell_trace_wait); _debug("wait_for_cell"); wait_var_event(&cell->state, ({ @@ -309,17 +317,17 @@ cell_already_exists: if (excl) { ret = -EEXIST; } else { - afs_use_cell(cursor); + afs_use_cell(cursor, afs_cell_trace_use_lookup); ret = 0; } up_write(&net->cells_lock); if (candidate) - afs_put_cell(candidate); + afs_put_cell(candidate, afs_cell_trace_put_candidate); if (ret == 0) goto wait_for_cell; goto error_noput; error: - afs_unuse_cell(net, cell); + afs_unuse_cell(net, cell, afs_cell_trace_unuse_lookup); error_noput: _leave(" = %d [error]", ret); return ERR_PTR(ret); @@ -364,15 +372,16 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) } if (!test_and_set_bit(AFS_CELL_FL_NO_GC, &new_root->flags)) - afs_use_cell(new_root); + afs_use_cell(new_root, afs_cell_trace_use_pin); /* install the new cell */ down_write(&net->cells_lock); + afs_see_cell(new_root, afs_cell_trace_see_ws); old_root = net->ws_cell; net->ws_cell = new_root; up_write(&net->cells_lock); - afs_unuse_cell(net, old_root); + afs_unuse_cell(net, old_root, afs_cell_trace_unuse_ws); _leave(" = 0"); return 0; } @@ -485,9 +494,10 @@ static void afs_cell_destroy(struct rcu_head *rcu) u = atomic_read(&cell->ref); ASSERTCMP(u, ==, 0); + trace_afs_cell(cell->debug_id, u, atomic_read(&cell->active), afs_cell_trace_free); afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers)); - afs_unuse_cell(net, cell->alias_of); + afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias); key_put(cell->anonymous_key); kfree(cell->name); kfree(cell); @@ -525,24 +535,30 @@ void afs_cells_timer(struct timer_list *timer) /* * Get a reference on a cell record. 
*/ -struct afs_cell *afs_get_cell(struct afs_cell *cell) +struct afs_cell *afs_get_cell(struct afs_cell *cell, enum afs_cell_trace reason) { + int u; + if (atomic_read(&cell->ref) <= 0) BUG(); - atomic_inc(&cell->ref); + u = atomic_inc_return(&cell->ref); + trace_afs_cell(cell->debug_id, u, atomic_read(&cell->active), reason); return cell; } /* * Drop a reference on a cell record. */ -void afs_put_cell(struct afs_cell *cell) +void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason) { if (cell) { + unsigned int debug_id = cell->debug_id; unsigned int u, a; + a = atomic_read(&cell->active); u = atomic_dec_return(&cell->ref); + trace_afs_cell(debug_id, u, a, reason); if (u == 0) { a = atomic_read(&cell->active); WARN(a != 0, "Cell active count %u > 0\n", a); @@ -554,12 +570,16 @@ void afs_put_cell(struct afs_cell *cell) /* * Note a cell becoming more active. */ -struct afs_cell *afs_use_cell(struct afs_cell *cell) +struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason) { + int u, a; + if (atomic_read(&cell->ref) <= 0) BUG(); - atomic_inc(&cell->active); + u = atomic_read(&cell->ref); + a = atomic_inc_return(&cell->active); + trace_afs_cell(cell->debug_id, u, a, reason); return cell; } @@ -567,10 +587,11 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell) * Record a cell becoming less active. When the active counter reaches 1, it * is scheduled for destruction, but may get reactivated. */ -void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell) +void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_trace reason) { + unsigned int debug_id = cell->debug_id; time64_t now, expire_delay; - int a; + int u, a; if (!cell) return; @@ -583,21 +604,35 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell) if (cell->vl_servers->nr_servers) expire_delay = afs_cell_gc_delay; + u = atomic_read(&cell->ref); a = atomic_dec_return(&cell->active); + trace_afs_cell(debug_id, u, a, reason); WARN_ON(a == 0); if (a == 1) /* 'cell' may now be garbage collected. */ afs_set_cell_timer(net, expire_delay); } +/* + * Note that a cell has been seen. + */ +void afs_see_cell(struct afs_cell *cell, enum afs_cell_trace reason) +{ + int u, a; + + u = atomic_read(&cell->ref); + a = atomic_read(&cell->active); + trace_afs_cell(cell->debug_id, u, a, reason); +} + /* * Queue a cell for management, giving the workqueue a ref to hold. 
*/ -void afs_queue_cell(struct afs_cell *cell) +void afs_queue_cell(struct afs_cell *cell, enum afs_cell_trace reason) { - afs_get_cell(cell); + afs_get_cell(cell, reason); if (!queue_work(afs_wq, &cell->manager)) - afs_put_cell(cell); + afs_put_cell(cell, afs_cell_trace_put_queue_fail); } /* @@ -713,6 +748,8 @@ again: active = 1; if (atomic_try_cmpxchg_relaxed(&cell->active, &active, 0)) { rb_erase(&cell->net_node, &net->cells); + trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), 0, + afs_cell_trace_unuse_delete); smp_store_release(&cell->state, AFS_CELL_REMOVED); } up_write(&net->cells_lock); @@ -792,7 +829,7 @@ final_destruction: /* The root volume is pinning the cell */ afs_put_volume(cell->net, cell->root_volume, afs_volume_trace_put_cell_root); cell->root_volume = NULL; - afs_put_cell(cell); + afs_put_cell(cell, afs_cell_trace_put_destroy); } static void afs_manage_cell_work(struct work_struct *work) @@ -800,7 +837,7 @@ static void afs_manage_cell_work(struct work_struct *work) struct afs_cell *cell = container_of(work, struct afs_cell, manager); afs_manage_cell(cell); - afs_put_cell(cell); + afs_put_cell(cell, afs_cell_trace_put_queue_work); } /* @@ -838,13 +875,17 @@ void afs_manage_cells(struct work_struct *work) bool sched_cell = false; active = atomic_read(&cell->active); - _debug("manage %s %u %u", cell->name, atomic_read(&cell->ref), active); + trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), + active, afs_cell_trace_manage); ASSERTCMP(active, >=, 1); if (purging) { - if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) - atomic_dec(&cell->active); + if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) { + active = atomic_dec_return(&cell->active); + trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), + active, afs_cell_trace_unuse_pin); + } } if (active == 1) { @@ -870,7 +911,7 @@ void afs_manage_cells(struct work_struct *work) } if (sched_cell) - afs_queue_cell(cell); + afs_queue_cell(cell, afs_cell_trace_get_queue_manage); } up_read(&net->cells_lock); @@ -907,7 +948,7 @@ void afs_cell_purge(struct afs_net *net) ws = net->ws_cell; net->ws_cell = NULL; up_write(&net->cells_lock); - afs_unuse_cell(net, ws); + afs_unuse_cell(net, ws, afs_cell_trace_unuse_ws); _debug("del timer"); if (del_timer_sync(&net->cells_timer)) diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index da32797dd425..db832cc931c8 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -123,9 +123,9 @@ static int afs_probe_cell_name(struct dentry *dentry) len--; } - cell = afs_find_cell(net, name, len); + cell = afs_find_cell(net, name, len, afs_cell_trace_use_probe); if (!IS_ERR(cell)) { - afs_unuse_cell(net, cell); + afs_unuse_cell(net, cell, afs_cell_trace_unuse_probe); return 0; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 06e617ee4cd1..81b0485fd22a 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -375,6 +375,7 @@ struct afs_cell { enum dns_record_source dns_source:8; /* Latest source of data from lookup */ enum dns_lookup_status dns_status:8; /* Latest status of data from lookup */ unsigned int dns_lookup_count; /* Counter of DNS lookups */ + unsigned int debug_id; /* The volumes belonging to this cell */ struct rb_root volumes; /* Tree of volumes on this server */ @@ -919,14 +920,16 @@ static inline bool afs_cb_is_broken(unsigned int cb_break, * cell.c */ extern int afs_cell_init(struct afs_net *, const char *); -extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned); +extern struct afs_cell *afs_find_cell(struct afs_net *, const 
char *, unsigned, + enum afs_cell_trace); extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned, const char *, bool); -extern struct afs_cell *afs_use_cell(struct afs_cell *); -extern void afs_unuse_cell(struct afs_net *, struct afs_cell *); -extern struct afs_cell *afs_get_cell(struct afs_cell *); -extern void afs_put_cell(struct afs_cell *); -extern void afs_queue_cell(struct afs_cell *); +extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace); +extern void afs_unuse_cell(struct afs_net *, struct afs_cell *, enum afs_cell_trace); +extern struct afs_cell *afs_get_cell(struct afs_cell *, enum afs_cell_trace); +extern void afs_see_cell(struct afs_cell *, enum afs_cell_trace); +extern void afs_put_cell(struct afs_cell *, enum afs_cell_trace); +extern void afs_queue_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_manage_cells(struct work_struct *); extern void afs_cells_timer(struct timer_list *); extern void __net_exit afs_cell_purge(struct afs_net *); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index c69a0282960c..052dab2f5c03 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -88,7 +88,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) ctx->force = true; } if (ctx->cell) { - afs_unuse_cell(ctx->net, ctx->cell); + afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_mntpt); ctx->cell = NULL; } if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { @@ -124,7 +124,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) char *buf; if (src_as->cell) - ctx->cell = afs_use_cell(src_as->cell); + ctx->cell = afs_use_cell(src_as->cell, afs_cell_trace_use_mntpt); if (size < 2 || size > PAGE_SIZE - 1) return -EINVAL; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 76fbe0560cfb..065a28bfa3f1 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -129,7 +129,7 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size) } if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags)) - afs_unuse_cell(net, cell); + afs_unuse_cell(net, cell, afs_cell_trace_unuse_no_pin); } else { goto inval; } diff --git a/fs/afs/super.c b/fs/afs/super.c index e72c223f831d..ac4e3ed4e9bd 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -294,7 +294,8 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param) cellnamesz, cellnamesz, cellname ?: ""); return PTR_ERR(cell); } - afs_unuse_cell(ctx->net, ctx->cell); + afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_parse); + afs_see_cell(cell, afs_cell_trace_see_source); ctx->cell = cell; } @@ -389,8 +390,9 @@ static int afs_validate_fc(struct fs_context *fc) _debug("switch to alias"); key_put(ctx->key); ctx->key = NULL; - cell = afs_use_cell(ctx->cell->alias_of); - afs_unuse_cell(ctx->net, ctx->cell); + cell = afs_use_cell(ctx->cell->alias_of, + afs_cell_trace_use_fc_alias); + afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc); ctx->cell = cell; goto reget_key; } @@ -508,7 +510,7 @@ static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc) if (ctx->dyn_root) { as->dyn_root = true; } else { - as->cell = afs_use_cell(ctx->cell); + as->cell = afs_use_cell(ctx->cell, afs_cell_trace_use_sbi); as->volume = afs_get_volume(ctx->volume, afs_volume_trace_get_alloc_sbi); } @@ -521,7 +523,7 @@ static void afs_destroy_sbi(struct afs_super_info *as) if (as) { struct afs_net *net = afs_net(as->net_ns); afs_put_volume(net, as->volume, afs_volume_trace_put_destroy_sbi); - afs_unuse_cell(net, as->cell); + 
afs_unuse_cell(net, as->cell, afs_cell_trace_unuse_sbi); put_net(as->net_ns); kfree(as); } @@ -607,7 +609,7 @@ static void afs_free_fc(struct fs_context *fc) afs_destroy_sbi(fc->s_fs_info); afs_put_volume(ctx->net, ctx->volume, afs_volume_trace_put_free_fc); - afs_unuse_cell(ctx->net, ctx->cell); + afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc); key_put(ctx->key); kfree(ctx); } @@ -634,7 +636,7 @@ static int afs_init_fs_context(struct fs_context *fc) ctx->net = afs_net(fc->net_ns); /* Default to the workstation cell. */ - cell = afs_find_cell(ctx->net, NULL, 0); + cell = afs_find_cell(ctx->net, NULL, 0, afs_cell_trace_use_fc); if (IS_ERR(cell)) cell = NULL; ctx->cell = cell; diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index ddb4cb67d0fd..f04a80e4f5c3 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -177,7 +177,7 @@ static int afs_compare_cell_roots(struct afs_cell *cell) is_alias: rcu_read_unlock(); - cell->alias_of = afs_use_cell(p); + cell->alias_of = afs_use_cell(p, afs_cell_trace_use_alias); return 1; } @@ -247,18 +247,18 @@ static int afs_query_for_alias(struct afs_cell *cell, struct key *key) continue; if (p->root_volume) continue; /* Ignore cells that have a root.cell volume. */ - afs_use_cell(p); + afs_use_cell(p, afs_cell_trace_use_check_alias); mutex_unlock(&cell->net->proc_cells_lock); if (afs_query_for_alias_one(cell, key, p) != 0) goto is_alias; if (mutex_lock_interruptible(&cell->net->proc_cells_lock) < 0) { - afs_unuse_cell(cell->net, p); + afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias); return -ERESTARTSYS; } - afs_unuse_cell(cell->net, p); + afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias); } mutex_unlock(&cell->net->proc_cells_lock); diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index da3b072d4d63..488e58490b16 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -45,7 +45,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) cell->dns_expiry <= ktime_get_real_seconds()) { dns_lookup_count = smp_load_acquire(&cell->dns_lookup_count); set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags); - afs_queue_cell(cell); + afs_queue_cell(cell, afs_cell_trace_get_queue_dns); if (cell->dns_source == DNS_RECORD_UNAVAILABLE) { if (wait_var_event_interruptible( diff --git a/fs/afs/volume.c b/fs/afs/volume.c index a838030e9563..f84194b791d3 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -83,7 +83,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, volume->vid = vldb->vid[params->type]; volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; - volume->cell = afs_get_cell(params->cell); + volume->cell = afs_get_cell(params->cell, afs_cell_trace_get_vol); volume->type = params->type; volume->type_force = params->force; volume->name_len = vldb->name_len; @@ -106,7 +106,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, return volume; error_1: - afs_put_cell(volume->cell); + afs_put_cell(volume->cell, afs_cell_trace_put_vol); kfree(volume); error_0: return ERR_PTR(ret); @@ -228,7 +228,7 @@ static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume) afs_remove_volume_from_cell(volume); afs_put_serverlist(net, rcu_access_pointer(volume->servers)); - afs_put_cell(volume->cell); + afs_put_cell(volume->cell, afs_cell_trace_put_vol); trace_afs_volume(volume->vid, atomic_read(&volume->usage), afs_volume_trace_free); kfree_rcu(volume, rcu); -- cgit v1.2.3 From 7530d3eb3dcf1a30750e8e7f1f88b782b96b72b8 Mon Sep 17 00:00:00 2001 From: 
David Howells Date: Thu, 15 Oct 2020 09:02:25 +0100 Subject: afs: Don't assert on unpurgeable server records Don't give an assertion failure on unpurgeable afs_server records - which kills the thread - but rather emit a trace line when we are purging a record (which only happens during network namespace removal or rmmod) and print a notice of the problem. Signed-off-by: David Howells --- fs/afs/server.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs/afs') diff --git a/fs/afs/server.c b/fs/afs/server.c index e82e452e2612..684a2b02b9ff 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -550,7 +550,12 @@ void afs_manage_servers(struct work_struct *work) _debug("manage %pU %u", &server->uuid, active); - ASSERTIFCMP(purging, active, ==, 0); + if (purging) { + trace_afs_server(server, atomic_read(&server->ref), + active, afs_server_trace_purging); + if (active != 0) + pr_notice("Can't purge s=%08x\n", server->debug_id); + } if (active == 0) { time64_t expire_at = server->unuse_time; -- cgit v1.2.3 From 06a17bbe1d47fec6232505c355b367797f6a635c Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 27 Oct 2020 09:39:04 +0000 Subject: afs: Fix copy_file_range() The prevention of splice-write without explicit ops made the copy_file_range() syscall to an afs file (as done by the generic/112 xfstest) fail with EINVAL. Fix this by using iter_file_splice_write() for afs. Fixes: 36e2c7421f02 ("fs: don't allow splice read/write without explicit ops") Signed-off-by: David Howells Reviewed-by: Christoph Hellwig --- fs/afs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/afs') diff --git a/fs/afs/file.c b/fs/afs/file.c index 371d1488cc54..91225421ad37 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -33,6 +33,7 @@ const struct file_operations afs_file_operations = { .write_iter = afs_file_write, .mmap = afs_file_mmap, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .fsync = afs_fsync, .lock = afs_lock, .flock = afs_flock, -- cgit v1.2.3 From acc080d15dde820bd39eb55a04f9a09c7ef52e67 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 27 Oct 2020 10:42:56 +0000 Subject: afs: Fix tracing deref-before-check The patch dca54a7bbb8c: "afs: Add tracing for cell refcount and active user count" from Oct 13, 2020, leads to the following Smatch complaint: fs/afs/cell.c:596 afs_unuse_cell() warn: variable dereferenced before check 'cell' (see line 592) Fix this by moving the retrieval of the cell debug ID to after the check of the validity of the cell pointer.
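For readers unfamiliar with this class of Smatch warning, a minimal sketch of the bug shape and of the fix follows; "struct obj" and the function names are hypothetical, not the actual fs/afs code.

struct obj {
        unsigned int debug_id;
};

/* Buggy: 'o' is dereferenced in the initialiser, before the NULL check,
 * so a NULL argument oopses before the guard can help (and static
 * analysis rightly concludes the check is either dead or too late).
 */
void put_obj_buggy(struct obj *o)
{
        unsigned int id = o->debug_id;  /* dereference... */

        if (!o)                         /* ...before check */
                return;
        (void)id;                       /* ...then trace with id */
}

/* Fixed: declare first, validate the pointer, then dereference. */
void put_obj_fixed(struct obj *o)
{
        unsigned int id;

        if (!o)
                return;
        id = o->debug_id;
        (void)id;                       /* ...then trace with id */
}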
Reported-by: Dan Carpenter Fixes: dca54a7bbb8c ("afs: Add tracing for cell refcount and active user count") Signed-off-by: David Howells cc: Dan Carpenter --- fs/afs/cell.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/afs') diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 52233fa6195f..887b673f6223 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -589,7 +589,7 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason) */ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_trace reason) { - unsigned int debug_id = cell->debug_id; + unsigned int debug_id; time64_t now, expire_delay; int u, a; @@ -604,6 +604,7 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr if (cell->vl_servers->nr_servers) expire_delay = afs_cell_gc_delay; + debug_id = cell->debug_id; u = atomic_read(&cell->ref); a = atomic_dec_return(&cell->active); trace_afs_cell(debug_id, u, a, reason); -- cgit v1.2.3 From 248c944e2159de4868bef558feea40214aaf8464 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 24 Aug 2020 11:58:12 +0300 Subject: afs: Fix a use after free in afs_xattr_get_acl() The "op" pointer is freed earlier when we call afs_put_operation(). Fixes: e49c7b2f6de7 ("afs: Build an abstraction around an "operation" concept") Signed-off-by: Dan Carpenter Signed-off-by: David Howells cc: Colin Ian King --- fs/afs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/afs') diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index 84f3c4f57531..38884d6c57cd 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -85,7 +85,7 @@ static int afs_xattr_get_acl(const struct xattr_handler *handler, if (acl->size <= size) memcpy(buffer, acl->data, acl->size); else - op->error = -ERANGE; + ret = -ERANGE; } } -- cgit v1.2.3 From d383e346f97d6bb0d654bb3d63c44ab106d92d29 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Oct 2020 14:40:31 +0100 Subject: afs: Fix afs_launder_page to not clear PG_writeback Fix afs_launder_page() to not clear PG_writeback on the page it is laundering as the flag isn't set in this case. 
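A hedged sketch of the pattern the fix uses, with illustrative names rather than the real afs symbols: the shared store path records whether it was entered from ->launder_page, and the completion step only ends writeback when writeback actually set PG_writeback.

#include <linux/pagemap.h>

struct store_ctx {
        bool laundering;        /* true when entered from ->launder_page */
};

/* Completion step shared by ordinary writeback and page laundering. */
static void store_done(struct store_ctx *ctx, struct page *page)
{
        /* end_page_writeback() must pair with a set_page_writeback();
         * ->launder_page never set PG_writeback, so skip it there.
         */
        if (!ctx->laundering)
                end_page_writeback(page);
}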
Fixes: 4343d00872e1 ("afs: Get rid of the afs_writeback record") Signed-off-by: David Howells --- fs/afs/internal.h | 1 + fs/afs/write.c | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'fs/afs') diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 81b0485fd22a..289f5dffa46f 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -812,6 +812,7 @@ struct afs_operation { pgoff_t last; /* last page in mapping to deal with */ unsigned first_offset; /* offset into mapping[first] */ unsigned last_to; /* amount of mapping[last] */ + bool laundering; /* Laundering page, PG_writeback not set */ } store; struct { struct iattr *attr; diff --git a/fs/afs/write.c b/fs/afs/write.c index da12abd6db21..b937ec047ec9 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -396,7 +396,8 @@ static void afs_store_data_success(struct afs_operation *op) op->ctime = op->file[0].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[0]); if (op->error == 0) { - afs_pages_written_back(vnode, op->store.first, op->store.last); + if (!op->store.laundering) + afs_pages_written_back(vnode, op->store.first, op->store.last); afs_stat_v(vnode, n_stores); atomic_long_add((op->store.last * PAGE_SIZE + op->store.last_to) - (op->store.first * PAGE_SIZE + op->store.first_offset), @@ -415,7 +416,7 @@ static const struct afs_operation_ops afs_store_data_operation = { */ static int afs_store_data(struct address_space *mapping, pgoff_t first, pgoff_t last, - unsigned offset, unsigned to) + unsigned offset, unsigned to, bool laundering) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct afs_operation *op; @@ -448,6 +449,7 @@ static int afs_store_data(struct address_space *mapping, op->store.last = last; op->store.first_offset = offset; op->store.last_to = to; + op->store.laundering = laundering; op->mtime = vnode->vfs_inode.i_mtime; op->flags |= AFS_OPERATION_UNINTR; op->ops = &afs_store_data_operation; @@ -601,7 +603,7 @@ no_more: if (end > i_size) to = i_size & ~PAGE_MASK; - ret = afs_store_data(mapping, first, last, offset, to); + ret = afs_store_data(mapping, first, last, offset, to, false); switch (ret) { case 0: ret = count; @@ -921,7 +923,7 @@ int afs_launder_page(struct page *page) trace_afs_page_dirty(vnode, tracepoint_string("launder"), page->index, priv); - ret = afs_store_data(mapping, page->index, page->index, t, f); + ret = afs_store_data(mapping, page->index, page->index, t, f, true); } trace_afs_page_dirty(vnode, tracepoint_string("laundered"), -- cgit v1.2.3 From fa04a40b169fcee615afbae97f71a09332993f64 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 21 Oct 2020 13:22:19 +0100 Subject: afs: Fix to take ref on page when PG_private is set Fix afs to take a ref on a page when it sets PG_private on it and to drop the ref when removing the flag. Note that in afs_write_begin(), a lot of the time, PG_private is already set on a page to which we're going to add some data. In such a case, we leave the bit set and mustn't increment the page count. As suggested by Matthew Wilcox, use attach/detach_page_private() where possible. 
Fixes: 31143d5d515e ("AFS: implement basic file write support") Reported-by: Matthew Wilcox (Oracle) Signed-off-by: David Howells Reviewed-by: Matthew Wilcox (Oracle) --- fs/afs/dir.c | 12 ++++-------- fs/afs/dir_edit.c | 6 ++---- fs/afs/file.c | 8 ++------ fs/afs/write.c | 18 ++++++++++-------- 4 files changed, 18 insertions(+), 26 deletions(-) (limited to 'fs/afs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 1d2e61e0ab04..1bb5b9d7f0a2 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -281,8 +281,7 @@ retry: if (ret < 0) goto error; - set_page_private(req->pages[i], 1); - SetPagePrivate(req->pages[i]); + attach_page_private(req->pages[i], (void *)1); unlock_page(req->pages[i]); i++; } else { @@ -1975,8 +1974,7 @@ static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags) _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index); - set_page_private(page, 0); - ClearPagePrivate(page); + detach_page_private(page); /* The directory will need reloading. */ if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) @@ -2003,8 +2001,6 @@ static void afs_dir_invalidatepage(struct page *page, unsigned int offset, afs_stat_v(dvnode, n_inval); /* we clean up only if the entire page is being invalidated */ - if (offset == 0 && length == PAGE_SIZE) { - set_page_private(page, 0); - ClearPagePrivate(page); - } + if (offset == 0 && length == PAGE_SIZE) + detach_page_private(page); } diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index b108528bf010..2ffe09abae7f 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -243,10 +243,8 @@ void afs_edit_dir_add(struct afs_vnode *vnode, index, gfp); if (!page) goto error; - if (!PagePrivate(page)) { - set_page_private(page, 1); - SetPagePrivate(page); - } + if (!PagePrivate(page)) + attach_page_private(page, (void *)1); dir_page = kmap(page); } diff --git a/fs/afs/file.c b/fs/afs/file.c index 91225421ad37..322973d12614 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -627,11 +627,9 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, #endif if (PagePrivate(page)) { - priv = page_private(page); + priv = (unsigned long)detach_page_private(page); trace_afs_page_dirty(vnode, tracepoint_string("inval"), page->index, priv); - set_page_private(page, 0); - ClearPagePrivate(page); } } @@ -661,11 +659,9 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags) #endif if (PagePrivate(page)) { - priv = page_private(page); + priv = (unsigned long)detach_page_private(page); trace_afs_page_dirty(vnode, tracepoint_string("rel"), page->index, priv); - set_page_private(page, 0); - ClearPagePrivate(page); } /* indicate that the page can be released */ diff --git a/fs/afs/write.c b/fs/afs/write.c index b937ec047ec9..02facb19a0f1 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -151,8 +151,10 @@ try_again: priv |= f; trace_afs_page_dirty(vnode, tracepoint_string("begin"), page->index, priv); - SetPagePrivate(page); - set_page_private(page, priv); + if (PagePrivate(page)) + set_page_private(page, priv); + else + attach_page_private(page, (void *)priv); _leave(" = 0"); return 0; @@ -334,10 +336,9 @@ static void afs_pages_written_back(struct afs_vnode *vnode, ASSERTCMP(pv.nr, ==, count); for (loop = 0; loop < count; loop++) { - priv = page_private(pv.pages[loop]); + priv = (unsigned long)detach_page_private(pv.pages[loop]); trace_afs_page_dirty(vnode, tracepoint_string("clear"), pv.pages[loop]->index, priv); - set_page_private(pv.pages[loop], 0); end_page_writeback(pv.pages[loop]); } first += count; @@ -863,8 
+864,10 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) priv |= 0; /* From */ trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"), vmf->page->index, priv); - SetPagePrivate(vmf->page); - set_page_private(vmf->page, priv); + if (PagePrivate(vmf->page)) + set_page_private(vmf->page, priv); + else + attach_page_private(vmf->page, (void *)priv); file_update_time(file); sb_end_pagefault(inode->i_sb); @@ -926,10 +929,9 @@ int afs_launder_page(struct page *page) ret = afs_store_data(mapping, page->index, page->index, t, f, true); } + priv = (unsigned long)detach_page_private(page); trace_afs_page_dirty(vnode, tracepoint_string("laundered"), page->index, priv); - set_page_private(page, 0); - ClearPagePrivate(page); #ifdef CONFIG_AFS_FSCACHE if (PageFsCache(page)) { -- cgit v1.2.3 From 21db2cdc667f744691a407105b7712bc18d74023 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Oct 2020 14:03:03 +0100 Subject: afs: Fix page leak on afs_write_begin() failure Fix the leak of the target page in afs_write_begin() when it fails. Fixes: 15b4650e55e0 ("afs: convert to new aops") Signed-off-by: David Howells cc: Nick Piggin --- fs/afs/write.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs/afs') diff --git a/fs/afs/write.c b/fs/afs/write.c index 02facb19a0f1..7fae9f8b38eb 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -76,7 +76,7 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, */ int afs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) + struct page **_page, void **fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct page *page; @@ -110,9 +110,6 @@ int afs_write_begin(struct file *file, struct address_space *mapping, SetPageUptodate(page); } - /* page won't leak in error case: it eventually gets cleaned off LRU */ - *pagep = page; - try_again: /* See if this page is already partially written in a way that we can * merge the new write with. @@ -155,6 +152,7 @@ try_again: set_page_private(page, priv); else attach_page_private(page, (void *)priv); + *_page = page; _leave(" = 0"); return 0; @@ -164,17 +162,18 @@ try_again: flush_conflicting_write: _debug("flush conflict"); ret = write_one_page(page); - if (ret < 0) { - _leave(" = %d", ret); - return ret; - } + if (ret < 0) + goto error; ret = lock_page_killable(page); - if (ret < 0) { - _leave(" = %d", ret); - return ret; - } + if (ret < 0) + goto error; goto try_again; + +error: + put_page(page); + _leave(" = %d", ret); + return ret; } /* -- cgit v1.2.3 From f792e3ac82fe2c6c863e93187eb7ddfccab68fa7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 26 Oct 2020 14:05:33 +0000 Subject: afs: Fix where page->private is set during write In afs, page->private is set to indicate the dirty region of a page. This is done in afs_write_begin(), but that can't take account of whether the copy into the page actually worked. Fix this by moving the change of page->private into afs_write_end(). 
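The reason the update has to move is visible in the shape of the generic buffered-write path, sketched below; copy_into_page() is a hypothetical stand-in for the user-copy step, not a real kernel function. ->write_begin runs before any user data is copied, and that copy may be short, so only ->write_end sees how many bytes ('copied') actually landed in the page.

#include <linux/fs.h>
#include <linux/pagemap.h>

size_t copy_into_page(struct page *page, unsigned int offset,
                      const char __user *buf, size_t len); /* hypothetical */

/* Simplified shape of one iteration of a buffered write. */
ssize_t buffered_write_once(struct file *file, struct address_space *mapping,
                            const char __user *buf, size_t len, loff_t pos)
{
        const struct address_space_operations *a_ops = mapping->a_ops;
        unsigned int offset = pos & (PAGE_SIZE - 1);
        struct page *page;
        void *fsdata;
        size_t copied;
        int status;

        status = a_ops->write_begin(file, mapping, pos, len, 0,
                                    &page, &fsdata);     /* no data yet */
        if (status < 0)
                return status;

        copied = copy_into_page(page, offset, buf, len); /* may be < len */

        return a_ops->write_end(file, mapping, pos, len, copied,
                                page, fsdata);           /* 'copied' now known */
}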
Fixes: 4343d00872e1 ("afs: Get rid of the afs_writeback record") Signed-off-by: David Howells --- fs/afs/write.c | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) (limited to 'fs/afs') diff --git a/fs/afs/write.c b/fs/afs/write.c index 7fae9f8b38eb..f28d85c38cd8 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -135,23 +135,8 @@ try_again: if (!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags) && (to < f || from > t)) goto flush_conflicting_write; - if (from < f) - f = from; - if (to > t) - t = to; - } else { - f = from; - t = to; } - priv = (unsigned long)t << AFS_PRIV_SHIFT; - priv |= f; - trace_afs_page_dirty(vnode, tracepoint_string("begin"), - page->index, priv); - if (PagePrivate(page)) - set_page_private(page, priv); - else - attach_page_private(page, (void *)priv); *_page = page; _leave(" = 0"); return 0; @@ -185,6 +170,9 @@ int afs_write_end(struct file *file, struct address_space *mapping, { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct key *key = afs_file_key(file); + unsigned long priv; + unsigned int f, from = pos & (PAGE_SIZE - 1); + unsigned int t, to = from + copied; loff_t i_size, maybe_i_size; int ret; @@ -216,6 +204,29 @@ int afs_write_end(struct file *file, struct address_space *mapping, SetPageUptodate(page); } + if (PagePrivate(page)) { + priv = page_private(page); + f = priv & AFS_PRIV_MAX; + t = priv >> AFS_PRIV_SHIFT; + if (from < f) + f = from; + if (to > t) + t = to; + priv = (unsigned long)t << AFS_PRIV_SHIFT; + priv |= f; + set_page_private(page, priv); + trace_afs_page_dirty(vnode, tracepoint_string("dirty+"), + page->index, priv); + } else { + f = from; + t = to; + priv = (unsigned long)t << AFS_PRIV_SHIFT; + priv |= f; + attach_page_private(page, (void *)priv); + trace_afs_page_dirty(vnode, tracepoint_string("dirty"), + page->index, priv); + } + set_page_dirty(page); if (PageDirty(page)) _debug("dirtied"); -- cgit v1.2.3 From 185f0c7073bd5c78f86265f703f5daf1306ab5a7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 26 Oct 2020 13:22:47 +0000 Subject: afs: Wrap page->private manipulations in inline functions The afs filesystem uses page->private to store the dirty range within a page such that in the event of a conflicting 3rd-party write to the server, we write back just the bits that got changed locally. However, there are a couple of problems with this: (1) I need a bit to note if the page might be mapped so that partial invalidation doesn't shrink the range. (2) There aren't necessarily sufficient bits to store the entire range of data altered (say it's a 32-bit system with 64KiB pages or transparent huge pages are in use). So wrap the accesses in inline functions so that future commits can change how this works. Also move them out of the tracing header into the in-directory header. There's not really any need for them to be in the tracing header. Signed-off-by: David Howells --- fs/afs/internal.h | 28 ++++++++++++++++++++++++++++ fs/afs/write.c | 31 +++++++++++++------------------ 2 files changed, 41 insertions(+), 18 deletions(-) (limited to 'fs/afs') diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 289f5dffa46f..edaccd07e18e 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -858,6 +858,34 @@ struct afs_vnode_cache_aux { u64 data_version; } __packed; +/* + * We use page->private to hold the amount of the page that we've written to, + * splitting the field into two parts. 
However, we need to represent a range + * 0...PAGE_SIZE inclusive, so we can't support 64K pages on a 32-bit system. + */ +#if PAGE_SIZE > 32768 +#define __AFS_PAGE_PRIV_MASK 0xffffffffUL +#define __AFS_PAGE_PRIV_SHIFT 32 +#else +#define __AFS_PAGE_PRIV_MASK 0xffffUL +#define __AFS_PAGE_PRIV_SHIFT 16 +#endif + +static inline size_t afs_page_dirty_from(unsigned long priv) +{ + return priv & __AFS_PAGE_PRIV_MASK; +} + +static inline size_t afs_page_dirty_to(unsigned long priv) +{ + return (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK; +} + +static inline unsigned long afs_page_dirty(size_t from, size_t to) +{ + return ((unsigned long)to << __AFS_PAGE_PRIV_SHIFT) | from; +} + #include /*****************************************************************************/ diff --git a/fs/afs/write.c b/fs/afs/write.c index f28d85c38cd8..ea1768b3c0b5 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -117,8 +117,8 @@ try_again: t = f = 0; if (PagePrivate(page)) { priv = page_private(page); - f = priv & AFS_PRIV_MAX; - t = priv >> AFS_PRIV_SHIFT; + f = afs_page_dirty_from(priv); + t = afs_page_dirty_to(priv); ASSERTCMP(f, <=, t); } @@ -206,22 +206,18 @@ int afs_write_end(struct file *file, struct address_space *mapping, if (PagePrivate(page)) { priv = page_private(page); - f = priv & AFS_PRIV_MAX; - t = priv >> AFS_PRIV_SHIFT; + f = afs_page_dirty_from(priv); + t = afs_page_dirty_to(priv); if (from < f) f = from; if (to > t) t = to; - priv = (unsigned long)t << AFS_PRIV_SHIFT; - priv |= f; + priv = afs_page_dirty(f, t); set_page_private(page, priv); trace_afs_page_dirty(vnode, tracepoint_string("dirty+"), page->index, priv); } else { - f = from; - t = to; - priv = (unsigned long)t << AFS_PRIV_SHIFT; - priv |= f; + priv = afs_page_dirty(from, to); attach_page_private(page, (void *)priv); trace_afs_page_dirty(vnode, tracepoint_string("dirty"), page->index, priv); @@ -522,8 +518,8 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, */ start = primary_page->index; priv = page_private(primary_page); - offset = priv & AFS_PRIV_MAX; - to = priv >> AFS_PRIV_SHIFT; + offset = afs_page_dirty_from(priv); + to = afs_page_dirty_to(priv); trace_afs_page_dirty(vnode, tracepoint_string("store"), primary_page->index, priv); @@ -568,8 +564,8 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, } priv = page_private(page); - f = priv & AFS_PRIV_MAX; - t = priv >> AFS_PRIV_SHIFT; + f = afs_page_dirty_from(priv); + t = afs_page_dirty_to(priv); if (f != 0 && !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) { unlock_page(page); @@ -870,8 +866,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) */ wait_on_page_writeback(vmf->page); - priv = (unsigned long)PAGE_SIZE << AFS_PRIV_SHIFT; /* To */ - priv |= 0; /* From */ + priv = afs_page_dirty(0, PAGE_SIZE); trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"), vmf->page->index, priv); if (PagePrivate(vmf->page)) @@ -930,8 +925,8 @@ int afs_launder_page(struct page *page) f = 0; t = PAGE_SIZE; if (PagePrivate(page)) { - f = priv & AFS_PRIV_MAX; - t = priv >> AFS_PRIV_SHIFT; + f = afs_page_dirty_from(priv); + t = afs_page_dirty_to(priv); } trace_afs_page_dirty(vnode, tracepoint_string("launder"), -- cgit v1.2.3 From 65dd2d6072d393a3aa14ded8afa9a12f27d9c8ad Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 26 Oct 2020 13:57:44 +0000 Subject: afs: Alter dirty range encoding in page->private Currently, page->private on an afs page is used to store the range of dirtied data within the page, where the range includes 
the lower bound, but excludes the upper bound (e.g. 0-1 is a range covering a single byte). This, however, requires a superfluous bit for the last-byte bound so that on a 4KiB page, it can say 0-4096 to indicate the whole page, the idea being that having both numbers the same would indicate an empty range. This is unnecessary as the PG_private bit is clear if it's an empty range (as is PG_dirty). Alter the way the dirty range is encoded in page->private such that the upper bound is reduced by 1 (e.g. 0-0 then specifies the same single-byte range mentioned above). Applying this to both bounds frees up two bits, one of which can be used in a future commit. This allows the afs filesystem to be compiled on ppc32 with 64K pages; without this, the following warnings are seen: ../fs/afs/internal.h: In function 'afs_page_dirty_to': ../fs/afs/internal.h:881:15: warning: right shift count >= width of type [-Wshift-count-overflow] 881 | return (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK; | ^~ ../fs/afs/internal.h: In function 'afs_page_dirty': ../fs/afs/internal.h:886:28: warning: left shift count >= width of type [-Wshift-count-overflow] 886 | return ((unsigned long)to << __AFS_PAGE_PRIV_SHIFT) | from; | ^~ Fixes: 4343d00872e1 ("afs: Get rid of the afs_writeback record") Signed-off-by: David Howells --- fs/afs/internal.h | 6 +++--- fs/afs/write.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/afs') diff --git a/fs/afs/internal.h b/fs/afs/internal.h index edaccd07e18e..344c545f934c 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -863,7 +863,7 @@ struct afs_vnode_cache_aux { * splitting the field into two parts. However, we need to represent a range * 0...PAGE_SIZE inclusive, so we can't support 64K pages on a 32-bit system. */ -#if PAGE_SIZE > 32768 +#ifdef CONFIG_64BIT #define __AFS_PAGE_PRIV_MASK 0xffffffffUL #define __AFS_PAGE_PRIV_SHIFT 32 #else @@ -878,12 +878,12 @@ static inline size_t afs_page_dirty_from(unsigned long priv) static inline size_t afs_page_dirty_to(unsigned long priv) { - return (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK; + return ((priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK) + 1; } static inline unsigned long afs_page_dirty(size_t from, size_t to) { - return ((unsigned long)to << __AFS_PAGE_PRIV_SHIFT) | from; + return ((unsigned long)(to - 1) << __AFS_PAGE_PRIV_SHIFT) | from; } #include diff --git a/fs/afs/write.c b/fs/afs/write.c index ea1768b3c0b5..1a49f5c89342 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -93,7 +93,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, /* We want to store information about how much of a page is altered in * page->private. */ - BUILD_BUG_ON(PAGE_SIZE > 32768 && sizeof(page->private) < 8); + BUILD_BUG_ON(PAGE_SIZE - 1 > __AFS_PAGE_PRIV_MASK && sizeof(page->private) < 8); page = grab_cache_page_write_begin(mapping, index, flags); if (!page) -- cgit v1.2.3 From f86726a69dec5df6ba051baf9265584419478b64 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Oct 2020 14:08:23 +0100 Subject: afs: Fix afs_invalidatepage to adjust the dirty region Fix afs_invalidatepage() to adjust the dirty region recorded in page->private when truncating a page. If the dirty region is entirely removed, then the private data is cleared and the page dirty state is cleared.
Without this, if the page is truncated and then expanded again by truncate, zeros from the expanded, but no-longer dirty region may get written back to the server if the page gets laundered due to a conflicting 3rd-party write. It mustn't, however, shorten the dirty region of the page if that page is still mmapped and has been marked dirty by afs_page_mkwrite(), so a flag is stored in page->private to record this. Fixes: 4343d00872e1 ("afs: Get rid of the afs_writeback record") Signed-off-by: David Howells --- fs/afs/file.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++-------- fs/afs/internal.h | 16 +++++++++++-- fs/afs/write.c | 1 + 3 files changed, 76 insertions(+), 12 deletions(-) (limited to 'fs/afs') diff --git a/fs/afs/file.c b/fs/afs/file.c index 322973d12614..85f5adf21aa0 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -601,6 +601,63 @@ static int afs_readpages(struct file *file, struct address_space *mapping, return ret; } +/* + * Adjust the dirty region of the page on truncation or full invalidation, + * getting rid of the markers altogether if the region is entirely invalidated. + */ +static void afs_invalidate_dirty(struct page *page, unsigned int offset, + unsigned int length) +{ + struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + unsigned long priv; + unsigned int f, t, end = offset + length; + + priv = page_private(page); + + /* we clean up only if the entire page is being invalidated */ + if (offset == 0 && length == thp_size(page)) + goto full_invalidate; + + /* If the page was dirtied by page_mkwrite(), the PTE stays writable + * and we don't get another notification to tell us to expand it + * again. + */ + if (afs_is_page_dirty_mmapped(priv)) + return; + + /* We may need to shorten the dirty region */ + f = afs_page_dirty_from(priv); + t = afs_page_dirty_to(priv); + + if (t <= offset || f >= end) + return; /* Doesn't overlap */ + + if (f < offset && t > end) + return; /* Splits the dirty region - just absorb it */ + + if (f >= offset && t <= end) + goto undirty; + + if (f < offset) + t = offset; + else + f = end; + if (f == t) + goto undirty; + + priv = afs_page_dirty(f, t); + set_page_private(page, priv); + trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page->index, priv); + return; + +undirty: + trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page->index, priv); + clear_page_dirty_for_io(page); +full_invalidate: + priv = (unsigned long)detach_page_private(page); + trace_afs_page_dirty(vnode, tracepoint_string("inval"), page->index, priv); +} + /* * invalidate part or all of a page * - release a page and clean up its private data if offset is 0 (indicating @@ -609,29 +666,23 @@ static int afs_readpages(struct file *file, struct address_space *mapping, static void afs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { - struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); - unsigned long priv; - _enter("{%lu},%u,%u", page->index, offset, length); BUG_ON(!PageLocked(page)); +#ifdef CONFIG_AFS_FSCACHE /* we clean up only if the entire page is being invalidated */ if (offset == 0 && length == PAGE_SIZE) { -#ifdef CONFIG_AFS_FSCACHE if (PageFsCache(page)) { struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); fscache_wait_on_page_write(vnode->cache, page); fscache_uncache_page(vnode->cache, page); } + } #endif - if (PagePrivate(page)) { - priv = (unsigned long)detach_page_private(page); - trace_afs_page_dirty(vnode, tracepoint_string("inval"), - page->index, priv); - } - } + if (PagePrivate(page)) + 
afs_invalidate_dirty(page, offset, length); _leave(""); } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 344c545f934c..b0fce1f75397 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -864,11 +864,13 @@ struct afs_vnode_cache_aux { * 0...PAGE_SIZE inclusive, so we can't support 64K pages on a 32-bit system. */ #ifdef CONFIG_64BIT -#define __AFS_PAGE_PRIV_MASK 0xffffffffUL +#define __AFS_PAGE_PRIV_MASK 0x7fffffffUL #define __AFS_PAGE_PRIV_SHIFT 32 +#define __AFS_PAGE_PRIV_MMAPPED 0x80000000UL #else -#define __AFS_PAGE_PRIV_MASK 0xffffUL +#define __AFS_PAGE_PRIV_MASK 0x7fffUL #define __AFS_PAGE_PRIV_SHIFT 16 +#define __AFS_PAGE_PRIV_MMAPPED 0x8000UL #endif static inline size_t afs_page_dirty_from(unsigned long priv) @@ -886,6 +888,16 @@ static inline unsigned long afs_page_dirty(size_t from, size_t to) return ((unsigned long)(to - 1) << __AFS_PAGE_PRIV_SHIFT) | from; } +static inline unsigned long afs_page_dirty_mmapped(unsigned long priv) +{ + return priv | __AFS_PAGE_PRIV_MMAPPED; +} + +static inline bool afs_is_page_dirty_mmapped(unsigned long priv) +{ + return priv & __AFS_PAGE_PRIV_MMAPPED; +} + #include /*****************************************************************************/ diff --git a/fs/afs/write.c b/fs/afs/write.c index 1a49f5c89342..a2511e3ad2cc 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -867,6 +867,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) wait_on_page_writeback(vmf->page); priv = afs_page_dirty(0, PAGE_SIZE); + priv = afs_page_dirty_mmapped(priv); trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"), vmf->page->index, priv); if (PagePrivate(vmf->page)) -- cgit v1.2.3 From 2d9900f26ad61e63a34f239bc76c80d2f8a6ff41 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Oct 2020 12:08:39 +0000 Subject: afs: Fix dirty-region encoding on ppc32 with 64K pages The dirty region bounds stored in page->private on an afs page are 15 bits on a 32-bit box and can, at most, represent a range of up to 32K within a 32K page with a resolution of 1 byte. This is a problem for powerpc32 with 64K pages enabled. Further, transparent huge pages may get up to 2M, which will be a problem for the afs filesystem on all 32-bit arches in the future. Fix this by decreasing the resolution. For the moment, a 64K page will have a resolution determined from PAGE_SIZE. In the future, the page will need to be passed in to the helper functions so that the page size can be assessed and the resolution determined dynamically. Note that this might not be the ideal way to handle this, since it may allow some leakage of undirtied zero bytes to the server's copy in the case of a 3rd-party conflict. Fixing that would require a separately allocated record and is a more complicated fix. Fixes: 4343d00872e1 ("afs: Get rid of the afs_writeback record") Reported-by: kernel test robot Signed-off-by: David Howells Reviewed-by: Matthew Wilcox (Oracle) --- fs/afs/internal.h | 24 ++++++++++++++++++++---- fs/afs/write.c | 5 ----- 2 files changed, 20 insertions(+), 9 deletions(-) (limited to 'fs/afs') diff --git a/fs/afs/internal.h b/fs/afs/internal.h index b0fce1f75397..14d5d75f4b6e 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -861,7 +861,8 @@ struct afs_vnode_cache_aux { /* * We use page->private to hold the amount of the page that we've written to, * splitting the field into two parts. However, we need to represent a range - * 0...PAGE_SIZE inclusive, so we can't support 64K pages on a 32-bit system. 
+ * 0...PAGE_SIZE, so we reduce the resolution if the size of the page + * exceeds what we can encode. */ #ifdef CONFIG_64BIT #define __AFS_PAGE_PRIV_MASK 0x7fffffffUL @@ -873,19 +874,34 @@ struct afs_vnode_cache_aux { #define __AFS_PAGE_PRIV_MMAPPED 0x8000UL #endif +static inline unsigned int afs_page_dirty_resolution(void) +{ + int shift = PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1); + return (shift > 0) ? shift : 0; +} + static inline size_t afs_page_dirty_from(unsigned long priv) { - return priv & __AFS_PAGE_PRIV_MASK; + unsigned long x = priv & __AFS_PAGE_PRIV_MASK; + + /* The lower bound is inclusive */ + return x << afs_page_dirty_resolution(); } static inline size_t afs_page_dirty_to(unsigned long priv) { - return ((priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK) + 1; + unsigned long x = (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK; + + /* The upper bound is immediately beyond the region */ + return (x + 1) << afs_page_dirty_resolution(); } static inline unsigned long afs_page_dirty(size_t from, size_t to) { - return ((unsigned long)(to - 1) << __AFS_PAGE_PRIV_SHIFT) | from; + unsigned int res = afs_page_dirty_resolution(); + from >>= res; + to = (to - 1) >> res; + return (to << __AFS_PAGE_PRIV_SHIFT) | from; } static inline unsigned long afs_page_dirty_mmapped(unsigned long priv) diff --git a/fs/afs/write.c b/fs/afs/write.c index a2511e3ad2cc..50371207f327 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -90,11 +90,6 @@ int afs_write_begin(struct file *file, struct address_space *mapping, _enter("{%llx:%llu},{%lx},%u,%u", vnode->fid.vid, vnode->fid.vnode, index, from, to); - /* We want to store information about how much of a page is altered in - * page->private. - */ - BUILD_BUG_ON(PAGE_SIZE - 1 > __AFS_PAGE_PRIV_MASK && sizeof(page->private) < 8); - page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; -- cgit v1.2.3 From c80afa1d9c3603d5eddeb8d63368823b1982f3f0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 3 Nov 2020 16:32:58 +0000 Subject: afs: Fix warning due to unadvanced marshalling pointer When using the afs.yfs.acl xattr to change an AuriStor ACL, a warning can be generated when the request is marshalled because the buffer pointer isn't increased after adding the last element, thereby triggering the check at the end if the ACL wasn't empty. This just causes something like the following warning, but doesn't stop the call from happening successfully: kAFS: YFS.StoreOpaqueACL2: Request buffer underflow (36<108) Fix this simply by increasing the count prior to the check. Fixes: f5e4546347bc ("afs: Implement YFS ACL setting") Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/afs/yfsclient.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/afs') diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 3b1239b7e90d..bd787e71a657 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -1990,6 +1990,7 @@ void yfs_fs_store_opaque_acl2(struct afs_operation *op) memcpy(bp, acl->data, acl->size); if (acl->size != size) memset((void *)bp + acl->size, 0, size - acl->size); + bp += size / sizeof(__be32); yfs_check_req(call, bp); trace_afs_make_fs_call(call, &vp->fid); -- cgit v1.2.3 From f4c79144edd8a49ffca8fa737a31d606be742a34 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 3 Nov 2020 16:33:07 +0000 Subject: afs: Fix incorrect freeing of the ACL passed to the YFS ACL store op The cleanup for the yfs_store_opaque_acl2_operation calls the wrong function to destroy the ACL content buffer. 
It's an afs_acl struct, not a yfs_acl struct - and the free function for the latter may pass invalid pointers to kfree(). Fix this by using the afs_acl_put() function. The yfs_acl_put() function is then no longer used and can be removed. general protection fault, probably for non-canonical address 0x7ebde00000000: 0000 [#1] SMP PTI ... RIP: 0010:compound_head+0x0/0x11 ... Call Trace: virt_to_cache+0x8/0x51 kfree+0x5d/0x79 yfs_free_opaque_acl+0x16/0x29 afs_put_operation+0x60/0x114 __vfs_setxattr+0x67/0x72 __vfs_setxattr_noperm+0x66/0xe9 vfs_setxattr+0x67/0xce setxattr+0x14e/0x184 __do_sys_fsetxattr+0x66/0x8f do_syscall_64+0x2d/0x3a entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: e49c7b2f6de7 ("afs: Build an abstraction around an "operation" concept") Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/afs/xattr.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs/afs') diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index 38884d6c57cd..95c573dcda11 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -148,11 +148,6 @@ static const struct xattr_handler afs_xattr_afs_acl_handler = { .set = afs_xattr_set_acl, }; -static void yfs_acl_put(struct afs_operation *op) -{ - yfs_free_opaque_acl(op->yacl); -} - static const struct afs_operation_ops yfs_fetch_opaque_acl_operation = { .issue_yfs_rpc = yfs_fs_fetch_opaque_acl, .success = afs_acl_success, @@ -246,7 +241,7 @@ error: static const struct afs_operation_ops yfs_store_opaque_acl2_operation = { .issue_yfs_rpc = yfs_fs_store_opaque_acl2, .success = afs_acl_success, - .put = yfs_acl_put, + .put = afs_acl_put, }; /* -- cgit v1.2.3
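The dirty-range encoding from "afs: Fix dirty-region encoding on ppc32 with 64K pages" above can be modelled and tested in plain userspace C. The sketch below mirrors the 32-bit constants from that patch (15-bit halves; the mmapped flag bit is omitted) with names local to this sketch; note how decoding can only widen a range, matching the patch's caveat about possible leakage of undirtied zero bytes.

#include <assert.h>
#include <stddef.h>

#define MODEL_PAGE_SHIFT 16                     /* model a 64K page */
#define MODEL_PAGE_SIZE  (1UL << MODEL_PAGE_SHIFT)
#define PRIV_MASK        0x7fffUL               /* 15-bit halves, as on 32-bit */
#define PRIV_SHIFT       16

static unsigned int resolution(void)
{
        int shift = MODEL_PAGE_SHIFT - (PRIV_SHIFT - 1);

        return shift > 0 ? shift : 0;
}

/* Encode the byte range [from, to); the upper bound is stored biased by
 * -1 so that to == MODEL_PAGE_SIZE still fits in the field.
 */
static unsigned long encode(size_t from, size_t to)
{
        unsigned int res = resolution();

        return (((to - 1) >> res) << PRIV_SHIFT) | (from >> res);
}

static size_t decode_from(unsigned long priv)
{
        return (priv & PRIV_MASK) << resolution();
}

static size_t decode_to(unsigned long priv)
{
        return (((priv >> PRIV_SHIFT) & PRIV_MASK) + 1) << resolution();
}

int main(void)
{
        unsigned long p = encode(5, 100);

        /* Reduced resolution may widen the range, never shrink it. */
        assert(decode_from(p) <= 5 && decode_to(p) >= 100);
        assert(decode_to(encode(0, MODEL_PAGE_SIZE)) == MODEL_PAGE_SIZE);
        return 0;
}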