From 9ed900b1160ef306bc74ad0228d7ab199234c758 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 2 Nov 2017 15:27:46 +0000
Subject: afs: Push the net ns pointer to more places

Push the network namespace pointer to more places in AFS, including the
afs_server structure (which doesn't hold a ref on the netns).

In particular, afs_put_cell() now takes requires a net ns parameter so that
it can safely alter the netns after decrementing the cell usage count - the
cell will be deallocated by a background thread after being cached for a
period, which means that it's not safe to access it after reducing its
usage count.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/dir.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs/afs/dir.c')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 613a77058263..97ec6a74589e 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -771,7 +771,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	vnode->update_cnt++;
 	spin_unlock(&vnode->lock);
 	afs_vnode_finalise_status_update(vnode, server);
-	afs_put_server(server);
+	afs_put_server(afs_i2net(dir), server);
 
 	d_instantiate(dentry, inode);
 	if (d_unhashed(dentry)) {
@@ -783,7 +783,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return 0;
 
 iget_error:
-	afs_put_server(server);
+	afs_put_server(afs_i2net(dir), server);
 mkdir_error:
 	key_put(key);
 error:
@@ -948,7 +948,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	vnode->update_cnt++;
 	spin_unlock(&vnode->lock);
 	afs_vnode_finalise_status_update(vnode, server);
-	afs_put_server(server);
+	afs_put_server(afs_i2net(dir), server);
 
 	d_instantiate(dentry, inode);
 	if (d_unhashed(dentry)) {
@@ -960,7 +960,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	return 0;
 
 iget_error:
-	afs_put_server(server);
+	afs_put_server(afs_i2net(dir), server);
 create_error:
 	key_put(key);
 error:
@@ -1060,7 +1060,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
 	vnode->update_cnt++;
 	spin_unlock(&vnode->lock);
 	afs_vnode_finalise_status_update(vnode, server);
-	afs_put_server(server);
+	afs_put_server(afs_i2net(dir), server);
 
 	d_instantiate(dentry, inode);
 	if (d_unhashed(dentry)) {
@@ -1072,7 +1072,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
 	return 0;
 
 iget_error:
-	afs_put_server(server);
+	afs_put_server(afs_i2net(dir), server);
 create_error:
 	key_put(key);
 error:
-- 
cgit v1.2.3


From c435ee34551e1f5a02a253ca8e235287efd2727c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 2 Nov 2017 15:27:49 +0000
Subject: afs: Overhaul the callback handling

Overhaul the AFS callback handling by the following means:

 (1) Don't give up callback promises on vnodes that we are no longer using,
     rather let them just expire on the server or let the server break
     them.  This is actually more efficient for the server as the callback
     lookup is expensive if there are lots of extant callbacks.

 (2) Only give up the callback promises we have from a server when the
     server record is destroyed.  Then we can just give up *all* the
     callback promises on it in one go.

 (3) Servers can end up being shared between cells if cells are aliased, so
     don't add all the vnodes being backed by a particular server into a
     big FID-indexed tree on that server as there may be duplicates.

     Instead have each volume instance (~= superblock) register an interest
     in a server as it starts to make use of it and use this to allow the
     processor for callbacks from the server to find the superblock and
     thence the inode corresponding to the FID being broken by means of
     ilookup_nowait().

 (4) Rather than iterating over the entire callback list when a mass-break
     comes in from the server, maintain a counter of mass-breaks in
     afs_server (cb_seq) and make afs_validate() check it against the copy
     in afs_vnode.

     It would be nice not to have to take a read_lock whilst doing this,
     but that's tricky without using RCU.

 (5) Save a ref on the fileserver we're using for a call in the afs_call
     struct so that we can access its cb_s_break during call decoding.

 (6) Write-lock around callback and status storage in a vnode and read-lock
     around getattr so that we don't see the status mid-update.

This has the following consequences:

 (1) Data invalidation isn't seen until someone calls afs_validate() on a
     vnode.  Unfortunately, we need to use a key to query the server, but
     getting one from a background thread is tricky without caching loads
     of keys all over the place.

 (2) Mass invalidation isn't seen until someone calls afs_validate().

 (3) Callback breaking is going to hit the inode_hash_lock quite a bit.
     Could this be replaced with rcu_read_lock() since inodes are destroyed
     under RCU conditions.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/dir.c | 52 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 32 insertions(+), 20 deletions(-)

(limited to 'fs/afs/dir.c')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 97ec6a74589e..37083699a0df 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -384,7 +384,7 @@ out:
  */
 static int afs_readdir(struct file *file, struct dir_context *ctx)
 {
-	return afs_dir_iterate(file_inode(file), 
+	return afs_dir_iterate(file_inode(file),
 			      ctx, file->private_data);
 }
 
@@ -581,6 +581,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	struct afs_vnode *vnode, *dir;
 	struct afs_fid uninitialized_var(fid);
 	struct dentry *parent;
+	struct inode *inode;
 	struct key *key;
 	void *dir_version;
 	int ret;
@@ -588,30 +589,39 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	if (flags & LOOKUP_RCU)
 		return -ECHILD;
 
-	vnode = AFS_FS_I(d_inode(dentry));
-
-	if (d_really_is_positive(dentry))
+	if (d_really_is_positive(dentry)) {
+		vnode = AFS_FS_I(d_inode(dentry));
 		_enter("{v={%x:%u} n=%pd fl=%lx},",
 		       vnode->fid.vid, vnode->fid.vnode, dentry,
 		       vnode->flags);
-	else
+	} else {
 		_enter("{neg n=%pd}", dentry);
+	}
 
 	key = afs_request_key(AFS_FS_S(dentry->d_sb)->volume->cell);
 	if (IS_ERR(key))
 		key = NULL;
 
+	if (d_really_is_positive(dentry)) {
+		inode = d_inode(dentry);
+		if (inode) {
+			vnode = AFS_FS_I(inode);
+			afs_validate(vnode, key);
+			if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+				goto out_bad;
+		}
+	}
+
 	/* lock down the parent dentry so we can peer at it */
 	parent = dget_parent(dentry);
 	dir = AFS_FS_I(d_inode(parent));
 
 	/* validate the parent directory */
-	if (test_bit(AFS_VNODE_MODIFIED, &dir->flags))
-		afs_validate(dir, key);
+	afs_validate(dir, key);
 
 	if (test_bit(AFS_VNODE_DELETED, &dir->flags)) {
 		_debug("%pd: parent dir deleted", dentry);
-		goto out_bad;
+		goto out_bad_parent;
 	}
 
 	dir_version = (void *) (unsigned long) dir->status.data_version;
@@ -626,13 +636,16 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	case 0:
 		/* the filename maps to something */
 		if (d_really_is_negative(dentry))
-			goto out_bad;
-		if (is_bad_inode(d_inode(dentry))) {
+			goto out_bad_parent;
+		inode = d_inode(dentry);
+		if (is_bad_inode(inode)) {
 			printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n",
 			       dentry);
-			goto out_bad;
+			goto out_bad_parent;
 		}
 
+		vnode = AFS_FS_I(inode);
+
 		/* if the vnode ID has changed, then the dirent points to a
 		 * different file */
 		if (fid.vnode != vnode->fid.vnode) {
@@ -649,10 +662,10 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 			_debug("%pd: file deleted (uq %u -> %u I:%u)",
 			       dentry, fid.unique,
 			       vnode->fid.unique,
-			       d_inode(dentry)->i_generation);
-			spin_lock(&vnode->lock);
+			       vnode->vfs_inode.i_generation);
+			write_seqlock(&vnode->cb_lock);
 			set_bit(AFS_VNODE_DELETED, &vnode->flags);
-			spin_unlock(&vnode->lock);
+			write_sequnlock(&vnode->cb_lock);
 			goto not_found;
 		}
 		goto out_valid;
@@ -667,7 +680,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	default:
 		_debug("failed to iterate dir %pd: %d",
 		       parent, ret);
-		goto out_bad;
+		goto out_bad_parent;
 	}
 
 out_valid:
@@ -683,9 +696,10 @@ not_found:
 	dentry->d_flags |= DCACHE_NFSFS_RENAMED;
 	spin_unlock(&dentry->d_lock);
 
-out_bad:
+out_bad_parent:
 	_debug("dropping dentry %pd2", dentry);
 	dput(parent);
+out_bad:
 	key_put(key);
 
 	_leave(" = 0 [bad]");
@@ -820,7 +834,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 		vnode = AFS_FS_I(d_inode(dentry));
 		clear_nlink(&vnode->vfs_inode);
 		set_bit(AFS_VNODE_DELETED, &vnode->flags);
-		afs_discard_callback_on_delete(vnode);
+		clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
 	}
 
 	key_put(key);
@@ -884,9 +898,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 		vnode = AFS_FS_I(d_inode(dentry));
 		if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
 			_debug("AFS_VNODE_DELETED");
-		if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags))
-			_debug("AFS_VNODE_CB_BROKEN");
-		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+		clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
 		ret = afs_validate(vnode, key);
 		_debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret);
 	}
-- 
cgit v1.2.3


From d2ddc776a4581d900fc3bdc7803b403daae64d88 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 2 Nov 2017 15:27:50 +0000
Subject: afs: Overhaul volume and server record caching and fileserver
 rotation

The current code assumes that volumes and servers are per-cell and are
never shared, but this is not enforced, and, indeed, public cells do exist
that are aliases of each other.  Further, an organisation can, say, set up
a public cell and a private cell with overlapping, but not identical, sets
of servers.  The difference is purely in the database attached to the VL
servers.

The current code will malfunction if it sees a server in two cells as it
assumes global address -> server record mappings and that each server is in
just one cell.

Further, each server may have multiple addresses - and may have addresses
of different families (IPv4 and IPv6, say).

To this end, the following structural changes are made:

 (1) Server record management is overhauled:

     (a) Server records are made independent of cell.  The namespace keeps
     	 track of them, volume records have lists of them and each vnode
     	 has a server on which its callback interest currently resides.

     (b) The cell record no longer keeps a list of servers known to be in
     	 that cell.

     (c) The server records are now kept in a flat list because there's no
     	 single address to sort on.

     (d) Server records are now keyed by their UUID within the namespace.

     (e) The addresses for a server are obtained with the VL.GetAddrsU
     	 rather than with VL.GetEntryByName, using the server's UUID as a
     	 parameter.

     (f) Cached server records are garbage collected after a period of
     	 non-use and are counted out of existence before purging is allowed
     	 to complete.  This protects the work functions against rmmod.

     (g) The servers list is now in /proc/fs/afs/servers.

 (2) Volume record management is overhauled:

     (a) An RCU-replaceable server list is introduced.  This tracks both
     	 servers and their coresponding callback interests.

     (b) The superblock is now keyed on cell record and numeric volume ID.

     (c) The volume record is now tied to the superblock which mounts it,
     	 and is activated when mounted and deactivated when unmounted.
     	 This makes it easier to handle the cache cookie without causing a
     	 double-use in fscache.

     (d) The volume record is loaded from the VLDB using VL.GetEntryByNameU
     	 to get the server UUID list.

     (e) The volume name is updated if it is seen to have changed when the
     	 volume is updated (the update is keyed on the volume ID).

 (3) The vlocation record is got rid of and VLDB records are no longer
     cached.  Sufficient information is stored in the volume record, though
     an update to a volume record is now no longer shared between related
     volumes (volumes come in bundles of three: R/W, R/O and backup).

and the following procedural changes are made:

 (1) The fileserver cursor introduced previously is now fleshed out and
     used to iterate over fileservers and their addresses.

 (2) Volume status is checked during iteration, and the server list is
     replaced if a change is detected.

 (3) Server status is checked during iteration, and the address list is
     replaced if a change is detected.

 (4) The abort code is saved into the address list cursor and -ECONNABORTED
     returned in afs_make_call() if a remote abort happened rather than
     translating the abort into an error message.  This allows actions to
     be taken depending on the abort code more easily.

     (a) If a VMOVED abort is seen then this is handled by rechecking the
     	 volume and restarting the iteration.

     (b) If a VBUSY, VRESTARTING or VSALVAGING abort is seen then this is
         handled by sleeping for a short period and retrying and/or trying
         other servers that might serve that volume.  A message is also
         displayed once until the condition has cleared.

     (c) If a VOFFLINE abort is seen, then this is handled as VBUSY for the
     	 moment.

     (d) If a VNOVOL abort is seen, the volume is rechecked in the VLDB to
     	 see if it has been deleted; if not, the fileserver is probably
     	 indicating that the volume couldn't be attached and needs
     	 salvaging.

     (e) If statfs() sees one of these aborts, it does not sleep, but
     	 rather returns an error, so as not to block the umount program.

 (5) The fileserver iteration functions in vnode.c are now merged into
     their callers and more heavily macroised around the cursor.  vnode.c
     is removed.

 (6) Operations on a particular vnode are serialised on that vnode because
     the server will lock that vnode whilst it operates on it, so a second
     op sent will just have to wait.

 (7) Fileservers are probed with FS.GetCapabilities before being used.
     This is where service upgrade will be done.

 (8) A callback interest on a fileserver is set up before an FS operation
     is performed and passed through to afs_make_call() so that it can be
     set on the vnode if the operation returns a callback.  The callback
     interest is passed through to afs_iget() also so that it can be set
     there too.

In general, record updating is done on an as-needed basis when we try to
access servers, volumes or vnodes rather than offloading it to work items
and special threads.

Notes:

 (1) Pre AFS-3.4 servers are no longer supported, though this can be added
     back if necessary (AFS-3.4 was released in 1998).

 (2) VBUSY is retried forever for the moment at intervals of 1s.

 (3) /proc/fs/afs/<cell>/servers no longer exists.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/dir.c | 388 ++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 224 insertions(+), 164 deletions(-)

(limited to 'fs/afs/dir.c')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 37083699a0df..53f3917440e7 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -553,7 +553,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 	dentry->d_fsdata = (void *)(unsigned long) vnode->status.data_version;
 
 	/* instantiate the dentry */
-	inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL);
+	inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL, NULL);
 	key_put(key);
 	if (IS_ERR(inode)) {
 		_leave(" = %ld", PTR_ERR(inode));
@@ -740,21 +740,49 @@ static void afs_d_release(struct dentry *dentry)
 	_enter("%pd", dentry);
 }
 
+/*
+ * Create a new inode for create/mkdir/symlink
+ */
+static void afs_vnode_new_inode(struct afs_fs_cursor *fc,
+				struct dentry *new_dentry,
+				struct afs_fid *newfid,
+				struct afs_file_status *newstatus,
+				struct afs_callback *newcb)
+{
+	struct inode *inode;
+
+	if (fc->ac.error < 0)
+		return;
+
+	inode = afs_iget(fc->vnode->vfs_inode.i_sb, fc->key,
+			 newfid, newstatus, newcb, fc->cbi);
+	if (IS_ERR(inode)) {
+		/* ENOMEM or EINTR at a really inconvenient time - just abandon
+		 * the new directory on the server.
+		 */
+		fc->ac.error = PTR_ERR(inode);
+		return;
+	}
+
+	d_instantiate(new_dentry, inode);
+	if (d_unhashed(new_dentry))
+		d_rehash(new_dentry);
+}
+
 /*
  * create a directory on an AFS filesystem
  */
 static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
-	struct afs_file_status status;
-	struct afs_callback cb;
-	struct afs_server *server;
-	struct afs_vnode *dvnode, *vnode;
-	struct afs_fid fid;
-	struct inode *inode;
+	struct afs_file_status newstatus;
+	struct afs_fs_cursor fc;
+	struct afs_callback newcb;
+	struct afs_vnode *dvnode = AFS_FS_I(dir);
+	struct afs_fid newfid;
 	struct key *key;
 	int ret;
 
-	dvnode = AFS_FS_I(dir);
+	mode |= S_IFDIR;
 
 	_enter("{%x:%u},{%pd},%ho",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry, mode);
@@ -765,40 +793,27 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 		goto error;
 	}
 
-	mode |= S_IFDIR;
-	ret = afs_vnode_create(dvnode, key, dentry->d_name.name,
-			       mode, &fid, &status, &cb, &server);
-	if (ret < 0)
-		goto mkdir_error;
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			afs_fs_create(&fc, dentry->d_name.name, mode,
+				      &newfid, &newstatus, &newcb);
+		}
 
-	inode = afs_iget(dir->i_sb, key, &fid, &status, &cb);
-	if (IS_ERR(inode)) {
-		/* ENOMEM at a really inconvenient time - just abandon the new
-		 * directory on the server */
-		ret = PTR_ERR(inode);
-		goto iget_error;
+		afs_check_for_remote_deletion(&fc, fc.vnode);
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
+		afs_vnode_new_inode(&fc, dentry, &newfid, &newstatus, &newcb);
+		ret = afs_end_vnode_operation(&fc);
+		if (ret < 0)
+			goto error_key;
 	}
 
-	/* apply the status report we've got for the new vnode */
-	vnode = AFS_FS_I(inode);
-	spin_lock(&vnode->lock);
-	vnode->update_cnt++;
-	spin_unlock(&vnode->lock);
-	afs_vnode_finalise_status_update(vnode, server);
-	afs_put_server(afs_i2net(dir), server);
-
-	d_instantiate(dentry, inode);
-	if (d_unhashed(dentry)) {
-		_debug("not hashed");
-		d_rehash(dentry);
-	}
 	key_put(key);
 	_leave(" = 0");
 	return 0;
 
-iget_error:
-	afs_put_server(afs_i2net(dir), server);
-mkdir_error:
+error_key:
 	key_put(key);
 error:
 	d_drop(dentry);
@@ -806,17 +821,30 @@ error:
 	return ret;
 }
 
+/*
+ * Remove a subdir from a directory.
+ */
+static void afs_dir_remove_subdir(struct dentry *dentry)
+{
+	if (d_really_is_positive(dentry)) {
+		struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
+
+		clear_nlink(&vnode->vfs_inode);
+		set_bit(AFS_VNODE_DELETED, &vnode->flags);
+		clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+	}
+}
+
 /*
  * remove a directory from an AFS filesystem
  */
 static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	struct afs_vnode *dvnode, *vnode;
+	struct afs_fs_cursor fc;
+	struct afs_vnode *dvnode = AFS_FS_I(dir);
 	struct key *key;
 	int ret;
 
-	dvnode = AFS_FS_I(dir);
-
 	_enter("{%x:%u},{%pd}",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry);
 
@@ -826,45 +854,69 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 		goto error;
 	}
 
-	ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, true);
-	if (ret < 0)
-		goto rmdir_error;
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			afs_fs_remove(&fc, dentry->d_name.name, true);
+		}
 
-	if (d_really_is_positive(dentry)) {
-		vnode = AFS_FS_I(d_inode(dentry));
-		clear_nlink(&vnode->vfs_inode);
-		set_bit(AFS_VNODE_DELETED, &vnode->flags);
-		clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
+		ret = afs_end_vnode_operation(&fc);
+		if (ret == 0)
+			afs_dir_remove_subdir(dentry);
 	}
 
 	key_put(key);
-	_leave(" = 0");
-	return 0;
-
-rmdir_error:
-	key_put(key);
 error:
-	_leave(" = %d", ret);
 	return ret;
 }
 
 /*
- * remove a file from an AFS filesystem
+ * Remove a link to a file or symlink from a directory.
+ *
+ * If the file was not deleted due to excess hard links, the fileserver will
+ * break the callback promise on the file - if it had one - before it returns
+ * to us, and if it was deleted, it won't
+ *
+ * However, if we didn't have a callback promise outstanding, or it was
+ * outstanding on a different server, then it won't break it either...
+ */
+static int afs_dir_remove_link(struct dentry *dentry, struct key *key)
+{
+	int ret = 0;
+
+	if (d_really_is_positive(dentry)) {
+		struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
+
+		if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+			kdebug("AFS_VNODE_DELETED");
+		clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+
+		ret = afs_validate(vnode, key);
+		if (ret == -ESTALE)
+			ret = 0;
+		_debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret);
+	}
+
+	return ret;
+}
+
+/*
+ * Remove a file or symlink from an AFS filesystem.
  */
 static int afs_unlink(struct inode *dir, struct dentry *dentry)
 {
-	struct afs_vnode *dvnode, *vnode;
+	struct afs_fs_cursor fc;
+	struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
 	struct key *key;
 	int ret;
 
-	dvnode = AFS_FS_I(dir);
-
 	_enter("{%x:%u},{%pd}",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry);
 
-	ret = -ENAMETOOLONG;
 	if (dentry->d_name.len >= AFSNAMEMAX)
-		goto error;
+		return -ENAMETOOLONG;
 
 	key = afs_request_key(dvnode->volume->cell);
 	if (IS_ERR(key)) {
@@ -872,42 +924,28 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 		goto error;
 	}
 
+	/* Try to make sure we have a callback promise on the victim. */
 	if (d_really_is_positive(dentry)) {
 		vnode = AFS_FS_I(d_inode(dentry));
-
-		/* make sure we have a callback promise on the victim */
 		ret = afs_validate(vnode, key);
 		if (ret < 0)
-			goto error;
+			goto error_key;
 	}
 
-	ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, false);
-	if (ret < 0)
-		goto remove_error;
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			afs_fs_remove(&fc, dentry->d_name.name, false);
+		}
 
-	if (d_really_is_positive(dentry)) {
-		/* if the file wasn't deleted due to excess hard links, the
-		 * fileserver will break the callback promise on the file - if
-		 * it had one - before it returns to us, and if it was deleted,
-		 * it won't
-		 *
-		 * however, if we didn't have a callback promise outstanding,
-		 * or it was outstanding on a different server, then it won't
-		 * break it either...
-		 */
-		vnode = AFS_FS_I(d_inode(dentry));
-		if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
-			_debug("AFS_VNODE_DELETED");
-		clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
-		ret = afs_validate(vnode, key);
-		_debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret);
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
+		ret = afs_end_vnode_operation(&fc);
+		if (ret == 0)
+			ret = afs_dir_remove_link(dentry, key);
 	}
 
-	key_put(key);
-	_leave(" = 0");
-	return 0;
-
-remove_error:
+error_key:
 	key_put(key);
 error:
 	_leave(" = %d", ret);
@@ -920,60 +958,50 @@ error:
 static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		      bool excl)
 {
-	struct afs_file_status status;
-	struct afs_callback cb;
-	struct afs_server *server;
-	struct afs_vnode *dvnode, *vnode;
-	struct afs_fid fid;
-	struct inode *inode;
+	struct afs_fs_cursor fc;
+	struct afs_file_status newstatus;
+	struct afs_callback newcb;
+	struct afs_vnode *dvnode = dvnode = AFS_FS_I(dir);
+	struct afs_fid newfid;
 	struct key *key;
 	int ret;
 
-	dvnode = AFS_FS_I(dir);
+	mode |= S_IFREG;
 
 	_enter("{%x:%u},{%pd},%ho,",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry, mode);
 
+	ret = -ENAMETOOLONG;
+	if (dentry->d_name.len >= AFSNAMEMAX)
+		goto error;
+
 	key = afs_request_key(dvnode->volume->cell);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error;
 	}
 
-	mode |= S_IFREG;
-	ret = afs_vnode_create(dvnode, key, dentry->d_name.name,
-			       mode, &fid, &status, &cb, &server);
-	if (ret < 0)
-		goto create_error;
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			afs_fs_create(&fc, dentry->d_name.name, mode,
+				      &newfid, &newstatus, &newcb);
+		}
 
-	inode = afs_iget(dir->i_sb, key, &fid, &status, &cb);
-	if (IS_ERR(inode)) {
-		/* ENOMEM at a really inconvenient time - just abandon the new
-		 * directory on the server */
-		ret = PTR_ERR(inode);
-		goto iget_error;
+		afs_check_for_remote_deletion(&fc, fc.vnode);
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
+		afs_vnode_new_inode(&fc, dentry, &newfid, &newstatus, &newcb);
+		ret = afs_end_vnode_operation(&fc);
+		if (ret < 0)
+			goto error_key;
 	}
 
-	/* apply the status report we've got for the new vnode */
-	vnode = AFS_FS_I(inode);
-	spin_lock(&vnode->lock);
-	vnode->update_cnt++;
-	spin_unlock(&vnode->lock);
-	afs_vnode_finalise_status_update(vnode, server);
-	afs_put_server(afs_i2net(dir), server);
-
-	d_instantiate(dentry, inode);
-	if (d_unhashed(dentry)) {
-		_debug("not hashed");
-		d_rehash(dentry);
-	}
 	key_put(key);
 	_leave(" = 0");
 	return 0;
 
-iget_error:
-	afs_put_server(afs_i2net(dir), server);
-create_error:
+error_key:
 	key_put(key);
 error:
 	d_drop(dentry);
@@ -987,6 +1015,7 @@ error:
 static int afs_link(struct dentry *from, struct inode *dir,
 		    struct dentry *dentry)
 {
+	struct afs_fs_cursor fc;
 	struct afs_vnode *dvnode, *vnode;
 	struct key *key;
 	int ret;
@@ -999,23 +1028,45 @@ static int afs_link(struct dentry *from, struct inode *dir,
 	       dvnode->fid.vid, dvnode->fid.vnode,
 	       dentry);
 
+	ret = -ENAMETOOLONG;
+	if (dentry->d_name.len >= AFSNAMEMAX)
+		goto error;
+
 	key = afs_request_key(dvnode->volume->cell);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error;
 	}
 
-	ret = afs_vnode_link(dvnode, vnode, key, dentry->d_name.name);
-	if (ret < 0)
-		goto link_error;
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+		if (mutex_lock_interruptible_nested(&vnode->io_lock, 1) < 0) {
+			afs_end_vnode_operation(&fc);
+			return -ERESTARTSYS;
+		}
+
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			fc.cb_break_2 = vnode->cb_break + vnode->cb_s_break;
+			afs_fs_link(&fc, vnode, dentry->d_name.name);
+		}
+
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break_2);
+		ihold(&vnode->vfs_inode);
+		d_instantiate(dentry, &vnode->vfs_inode);
+
+		mutex_unlock(&vnode->io_lock);
+		ret = afs_end_vnode_operation(&fc);
+		if (ret < 0)
+			goto error_key;
+	}
 
-	ihold(&vnode->vfs_inode);
-	d_instantiate(dentry, &vnode->vfs_inode);
 	key_put(key);
 	_leave(" = 0");
 	return 0;
 
-link_error:
+error_key:
 	key_put(key);
 error:
 	d_drop(dentry);
@@ -1029,20 +1080,21 @@ error:
 static int afs_symlink(struct inode *dir, struct dentry *dentry,
 		       const char *content)
 {
-	struct afs_file_status status;
-	struct afs_server *server;
-	struct afs_vnode *dvnode, *vnode;
-	struct afs_fid fid;
-	struct inode *inode;
+	struct afs_fs_cursor fc;
+	struct afs_file_status newstatus;
+	struct afs_vnode *dvnode = AFS_FS_I(dir);
+	struct afs_fid newfid;
 	struct key *key;
 	int ret;
 
-	dvnode = AFS_FS_I(dir);
-
 	_enter("{%x:%u},{%pd},%s",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry,
 	       content);
 
+	ret = -ENAMETOOLONG;
+	if (dentry->d_name.len >= AFSNAMEMAX)
+		goto error;
+
 	ret = -EINVAL;
 	if (strlen(content) >= AFSPATHMAX)
 		goto error;
@@ -1053,39 +1105,27 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
 		goto error;
 	}
 
-	ret = afs_vnode_symlink(dvnode, key, dentry->d_name.name, content,
-				&fid, &status, &server);
-	if (ret < 0)
-		goto create_error;
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			afs_fs_symlink(&fc, dentry->d_name.name, content,
+				       &newfid, &newstatus);
+		}
 
-	inode = afs_iget(dir->i_sb, key, &fid, &status, NULL);
-	if (IS_ERR(inode)) {
-		/* ENOMEM at a really inconvenient time - just abandon the new
-		 * directory on the server */
-		ret = PTR_ERR(inode);
-		goto iget_error;
+		afs_check_for_remote_deletion(&fc, fc.vnode);
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
+		afs_vnode_new_inode(&fc, dentry, &newfid, &newstatus, NULL);
+		ret = afs_end_vnode_operation(&fc);
+		if (ret < 0)
+			goto error_key;
 	}
 
-	/* apply the status report we've got for the new vnode */
-	vnode = AFS_FS_I(inode);
-	spin_lock(&vnode->lock);
-	vnode->update_cnt++;
-	spin_unlock(&vnode->lock);
-	afs_vnode_finalise_status_update(vnode, server);
-	afs_put_server(afs_i2net(dir), server);
-
-	d_instantiate(dentry, inode);
-	if (d_unhashed(dentry)) {
-		_debug("not hashed");
-		d_rehash(dentry);
-	}
 	key_put(key);
 	_leave(" = 0");
 	return 0;
 
-iget_error:
-	afs_put_server(afs_i2net(dir), server);
-create_error:
+error_key:
 	key_put(key);
 error:
 	d_drop(dentry);
@@ -1100,6 +1140,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		      struct inode *new_dir, struct dentry *new_dentry,
 		      unsigned int flags)
 {
+	struct afs_fs_cursor fc;
 	struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
 	struct key *key;
 	int ret;
@@ -1123,16 +1164,35 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto error;
 	}
 
-	ret = afs_vnode_rename(orig_dvnode, new_dvnode, key,
-			       old_dentry->d_name.name,
-			       new_dentry->d_name.name);
-	if (ret < 0)
-		goto rename_error;
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, orig_dvnode, key)) {
+		if (orig_dvnode != new_dvnode) {
+			if (mutex_lock_interruptible_nested(&new_dvnode->io_lock, 1) < 0) {
+				afs_end_vnode_operation(&fc);
+				return -ERESTARTSYS;
+			}
+		}
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = orig_dvnode->cb_break + orig_dvnode->cb_s_break;
+			fc.cb_break_2 = new_dvnode->cb_break + new_dvnode->cb_s_break;
+			afs_fs_rename(&fc, old_dentry->d_name.name,
+				      new_dvnode, new_dentry->d_name.name);
+		}
+
+		afs_vnode_commit_status(&fc, orig_dvnode, fc.cb_break);
+		afs_vnode_commit_status(&fc, new_dvnode, fc.cb_break_2);
+		if (orig_dvnode != new_dvnode)
+			mutex_unlock(&new_dvnode->io_lock);
+		ret = afs_end_vnode_operation(&fc);
+		if (ret < 0)
+			goto error_key;
+	}
+
 	key_put(key);
 	_leave(" = 0");
 	return 0;
 
-rename_error:
+error_key:
 	key_put(key);
 error:
 	d_drop(new_dentry);
-- 
cgit v1.2.3


From dab17c1add5c51b68027a9a3861af3a99cb5485a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 2 Nov 2017 15:27:52 +0000
Subject: afs: Fix directory read/modify race

Because parsing of the directory wasn't being done under any sort of lock,
the pages holding the directory content can get invalidated whilst the
parsing is ongoing.

Further, the directory page check function gets called outside of the page
lock, so if the page gets cleared or updated, this may return reports of
bad magic numbers in the directory page.

Also, the directory may change size whilst checking and parsing are
ongoing, so more care needs to be taken here.

Fix this by:

 (1) Perform the page check from the page filling function before we set
     PageUptodate and drop the page lock.

 (2) Check for the file having shrunk and the page having been abandoned
     before checking the page contents.

 (3) Lock the page whilst parsing it for the directory iterator.

Whilst we're at it, add a tracepoint to report check failure.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/dir.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

(limited to 'fs/afs/dir.c')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 53f3917440e7..ecda0e6a9f7e 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -130,10 +130,11 @@ struct afs_lookup_cookie {
 /*
  * check that a directory page is valid
  */
-static inline bool afs_dir_check_page(struct inode *dir, struct page *page)
+bool afs_dir_check_page(struct inode *dir, struct page *page)
 {
 	struct afs_dir_page *dbuf;
-	loff_t latter;
+	struct afs_vnode *vnode = AFS_FS_I(dir);
+	loff_t latter, i_size, off;
 	int tmp, qty;
 
 #if 0
@@ -150,8 +151,15 @@ static inline bool afs_dir_check_page(struct inode *dir, struct page *page)
 	}
 #endif
 
-	/* determine how many magic numbers there should be in this page */
-	latter = dir->i_size - page_offset(page);
+	/* Determine how many magic numbers there should be in this page, but
+	 * we must take care because the directory may change size under us.
+	 */
+	off = page_offset(page);
+	i_size = i_size_read(dir);
+	if (i_size <= off)
+		goto checked;
+
+	latter = i_size - off;
 	if (latter >= PAGE_SIZE)
 		qty = PAGE_SIZE;
 	else
@@ -162,13 +170,15 @@ static inline bool afs_dir_check_page(struct inode *dir, struct page *page)
 	dbuf = page_address(page);
 	for (tmp = 0; tmp < qty; tmp++) {
 		if (dbuf->blocks[tmp].pagehdr.magic != AFS_DIR_MAGIC) {
-			printk("kAFS: %s(%lu): bad magic %d/%d is %04hx\n",
+			printk("kAFS: %s(%lx): bad magic %d/%d is %04hx\n",
 			       __func__, dir->i_ino, tmp, qty,
 			       ntohs(dbuf->blocks[tmp].pagehdr.magic));
+			trace_afs_dir_check_failed(vnode, off, i_size);
 			goto error;
 		}
 	}
 
+checked:
 	SetPageChecked(page);
 	return true;
 
@@ -183,6 +193,7 @@ error:
 static inline void afs_dir_put_page(struct page *page)
 {
 	kunmap(page);
+	unlock_page(page);
 	put_page(page);
 }
 
@@ -197,9 +208,10 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
 
 	page = read_cache_page(dir->i_mapping, index, afs_page_filler, key);
 	if (!IS_ERR(page)) {
+		lock_page(page);
 		kmap(page);
 		if (unlikely(!PageChecked(page))) {
-			if (PageError(page) || !afs_dir_check_page(dir, page))
+			if (PageError(page))
 				goto fail;
 		}
 	}
@@ -384,8 +396,7 @@ out:
  */
 static int afs_readdir(struct file *file, struct dir_context *ctx)
 {
-	return afs_dir_iterate(file_inode(file),
-			      ctx, file->private_data);
+	return afs_dir_iterate(file_inode(file), ctx, file->private_data);
 }
 
 /*
-- 
cgit v1.2.3


From 215804a99283c57fdd869aab350fdf6acc3460b6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 2 Nov 2017 15:27:52 +0000
Subject: afs: Introduce a file-private data record

Introduce a file-private data record for kAFS and put the key into it
rather than storing the key in file->private_data.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/afs/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/afs/dir.c')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index ecda0e6a9f7e..ab618d32554c 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -396,7 +396,7 @@ out:
  */
 static int afs_readdir(struct file *file, struct dir_context *ctx)
 {
-	return afs_dir_iterate(file_inode(file), ctx, file->private_data);
+	return afs_dir_iterate(file_inode(file), ctx, afs_file_key(file));
 }
 
 /*
-- 
cgit v1.2.3