From 5ff9d8a65ce80efb509ce4e8051394e9ed2cd942 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 29 Mar 2013 21:04:39 -0700 Subject: vfs: Lock in place mounts from more privileged users When creating a less privileged mount namespace or propogating mounts from a more privileged to a less privileged mount namespace lock the submounts so they may not be unmounted individually in the child mount namespace revealing what is under them. This enforces the reasonable expectation that it is not possible to see under a mount point. Most of the time mounts are on empty directories and revealing that does not matter, however I have seen an occassionaly sloppy configuration where there were interesting things concealed under a mount point that probably should not be revealed. Expirable submounts are not locked because they will eventually unmount automatically so whatever is under them already needs to be safe for unprivileged users to access. From a practical standpoint these restrictions do not appear to be significant for unprivileged users of the mount namespace. Recursive bind mounts and pivot_root continues to work, and mounts that are created in a mount namespace may be unmounted there. All of which means that the common idiom of keeping a directory of interesting files and using pivot_root to throw everything else away continues to work just fine. Acked-by: Serge Hallyn Acked-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 7b1ca9ba0b0a..7e16a730559c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -831,6 +831,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + /* Don't allow unprivileged users to reveal what is under a mount */ + if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) + mnt->mnt.mnt_flags |= MNT_LOCKED; + atomic_inc(&sb->s_active); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); @@ -1327,6 +1331,8 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) goto dput_and_out; if (!check_mnt(mnt)) goto dput_and_out; + if (mnt->mnt.mnt_flags & MNT_LOCKED) + goto dput_and_out; retval = do_umount(mnt, flags); dput_and_out: @@ -1381,6 +1387,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, if (IS_ERR(q)) return q; + q->mnt.mnt_flags &= ~MNT_LOCKED; q->mnt_mountpoint = mnt->mnt_mountpoint; p = mnt; @@ -1696,6 +1703,19 @@ static int do_change_type(struct path *path, int flag) return err; } +static bool has_locked_children(struct mount *mnt, struct dentry *dentry) +{ + struct mount *child; + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + if (!is_subdir(child->mnt_mountpoint, dentry)) + continue; + + if (child->mnt.mnt_flags & MNT_LOCKED) + return true; + } + return false; +} + /* * do loopback mount. */ @@ -1731,6 +1751,9 @@ static int do_loopback(struct path *path, const char *old_name, if (!check_mnt(parent) || !check_mnt(old)) goto out2; + if (!recurse && has_locked_children(old, old_path.dentry)) + goto out2; + if (recurse) mnt = copy_tree(old, old_path.dentry, 0); else @@ -1741,6 +1764,8 @@ static int do_loopback(struct path *path, const char *old_name, goto out2; } + mnt->mnt.mnt_flags &= ~MNT_LOCKED; + err = graft_tree(mnt, parent, mp); if (err) { br_write_lock(&vfsmount_lock); @@ -1853,6 +1878,9 @@ static int do_move_mount(struct path *path, const char *old_name) if (!check_mnt(p) || !check_mnt(old)) goto out1; + if (old->mnt.mnt_flags & MNT_LOCKED) + goto out1; + err = -EINVAL; if (old_path.dentry != old_path.mnt->mnt_root) goto out1; @@ -2630,6 +2658,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, goto out4; if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) goto out4; + if (new_mnt->mnt.mnt_flags & MNT_LOCKED) + goto out4; error = -ENOENT; if (d_unlinked(new.dentry)) goto out4; @@ -2653,6 +2683,10 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, br_write_lock(&vfsmount_lock); detach_mnt(new_mnt, &parent_path); detach_mnt(root_mnt, &root_parent); + if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { + new_mnt->mnt.mnt_flags |= MNT_LOCKED; + root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; + } /* mount old root on put_old */ attach_mnt(root_mnt, old_mnt, old_mp); /* mount new_root on / */ -- cgit v1.2.3 From aee1c13dd0f6c2fc56e0e492b349ee8ac655880f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 25 Mar 2013 19:57:10 -0700 Subject: proc: Restrict mounting the proc filesystem Don't allow mounting the proc filesystem unless the caller has CAP_SYS_ADMIN rights over the pid namespace. The principle here is if you create or have capabilities over it you can mount it, otherwise you get to live with what other people have mounted. Andy pointed out that this is needed to prevent users in a user namespace from remounting proc and specifying different hidepid and gid options on already existing proc mounts. Cc: stable@vger.kernel.org Reported-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" --- fs/proc/root.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/root.c b/fs/proc/root.c index 229e366598da..38bd5d423fcd 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -110,7 +110,8 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, ns = task_active_pid_ns(current); options = data; - if (!current_user_ns()->may_mount_proc) + if (!current_user_ns()->may_mount_proc || + !ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); } -- cgit v1.2.3 From 4ce5d2b1a8fde84c0eebe70652cf28b9beda6b4e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 30 Mar 2013 01:35:18 -0700 Subject: vfs: Don't copy mount bind mounts of /proc//ns/mnt between namespaces Don't copy bind mounts of /proc//ns/mnt between namespaces. These files hold references to a mount namespace and copying them between namespaces could result in a reference counting loop. The current mnt_ns_loop test prevents loops on the assumption that mounts don't cross between namespaces. Unfortunately unsharing a mount namespace and shared substrees can both cause mounts to propogate between mount namespaces. Add two flags CL_COPY_UNBINDABLE and CL_COPY_MNT_NS_FILE are added to control this behavior, and CL_COPY_ALL is redefined as both of them. Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 46 ++++++++++++++++++++++++++++++++++------------ fs/pnode.h | 5 ++++- 2 files changed, 38 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 7e16a730559c..64627f883bf2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1355,14 +1355,11 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) #endif -static bool mnt_ns_loop(struct path *path) +static bool is_mnt_ns_file(struct dentry *dentry) { - /* Could bind mounting the mount namespace inode cause a - * mount namespace loop? - */ - struct inode *inode = path->dentry->d_inode; + /* Is this a proxy for a mount namespace? */ + struct inode *inode = dentry->d_inode; struct proc_ns *ei; - struct mnt_namespace *mnt_ns; if (!proc_ns_inode(inode)) return false; @@ -1371,7 +1368,19 @@ static bool mnt_ns_loop(struct path *path) if (ei->ns_ops != &mntns_operations) return false; - mnt_ns = ei->ns; + return true; +} + +static bool mnt_ns_loop(struct dentry *dentry) +{ + /* Could bind mounting the mount namespace inode cause a + * mount namespace loop? + */ + struct mnt_namespace *mnt_ns; + if (!is_mnt_ns_file(dentry)) + return false; + + mnt_ns = get_proc_ns(dentry->d_inode)->ns; return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; } @@ -1380,7 +1389,10 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, { struct mount *res, *p, *q, *r, *parent; - if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) + if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) + return ERR_PTR(-EINVAL); + + if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) return ERR_PTR(-EINVAL); res = q = clone_mnt(mnt, dentry, flag); @@ -1397,7 +1409,13 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, continue; for (s = r; s; s = next_mnt(s, r)) { - if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) { + if (!(flag & CL_COPY_UNBINDABLE) && + IS_MNT_UNBINDABLE(s)) { + s = skip_mnt_tree(s); + continue; + } + if (!(flag & CL_COPY_MNT_NS_FILE) && + is_mnt_ns_file(s->mnt.mnt_root)) { s = skip_mnt_tree(s); continue; } @@ -1733,7 +1751,7 @@ static int do_loopback(struct path *path, const char *old_name, return err; err = -EINVAL; - if (mnt_ns_loop(&old_path)) + if (mnt_ns_loop(old_path.dentry)) goto out; mp = lock_mount(path); @@ -1755,7 +1773,7 @@ static int do_loopback(struct path *path, const char *old_name, goto out2; if (recurse) - mnt = copy_tree(old, old_path.dentry, 0); + mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE); else mnt = clone_mnt(old, old_path.dentry, 0); @@ -2417,7 +2435,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, namespace_lock(); /* First pass: copy the tree topology */ - copy_flags = CL_COPY_ALL | CL_EXPIRE; + copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != mnt_ns->user_ns) copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; new = copy_tree(old, old->mnt.mnt_root, copy_flags); @@ -2452,6 +2470,10 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, } p = next_mnt(p, old); q = next_mnt(q, new); + if (!q) + break; + while (p->mnt.mnt_root != q->mnt.mnt_root) + p = next_mnt(p, old); } namespace_unlock(); diff --git a/fs/pnode.h b/fs/pnode.h index b091445c1c4a..59e7eda1851e 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -19,11 +19,14 @@ #define CL_EXPIRE 0x01 #define CL_SLAVE 0x02 -#define CL_COPY_ALL 0x04 +#define CL_COPY_UNBINDABLE 0x04 #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 #define CL_SHARED_TO_SLAVE 0x20 #define CL_UNPRIVILEGED 0x40 +#define CL_COPY_MNT_NS_FILE 0x80 + +#define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE) static inline void set_mnt_shared(struct mount *mnt) { -- cgit v1.2.3 From e51db73532955dc5eaba4235e62b74b460709d5b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 30 Mar 2013 19:57:41 -0700 Subject: userns: Better restrictions on when proc and sysfs can be mounted Rely on the fact that another flavor of the filesystem is already mounted and do not rely on state in the user namespace. Verify that the mounted filesystem is not covered in any significant way. I would love to verify that the previously mounted filesystem has no mounts on top but there are at least the directories /proc/sys/fs/binfmt_misc and /sys/fs/cgroup/ that exist explicitly for other filesystems to mount on top of. Refactor the test into a function named fs_fully_visible and call that function from the mount routines of proc and sysfs. This makes this test local to the filesystems involved and the results current of when the mounts take place, removing a weird threading of the user namespace, the mount namespace and the filesystems themselves. Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 37 +++++++++++++++++++++++++------------ fs/proc/root.c | 7 +++++-- fs/sysfs/mount.c | 3 ++- 3 files changed, 32 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 64627f883bf2..877e4277f496 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2867,25 +2867,38 @@ bool current_chrooted(void) return chrooted; } -void update_mnt_policy(struct user_namespace *userns) +bool fs_fully_visible(struct file_system_type *type) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mount *mnt; + bool visible = false; - down_read(&namespace_sem); + if (unlikely(!ns)) + return false; + + namespace_lock(); list_for_each_entry(mnt, &ns->list, mnt_list) { - switch (mnt->mnt.mnt_sb->s_magic) { - case SYSFS_MAGIC: - userns->may_mount_sysfs = true; - break; - case PROC_SUPER_MAGIC: - userns->may_mount_proc = true; - break; + struct mount *child; + if (mnt->mnt.mnt_sb->s_type != type) + continue; + + /* This mount is not fully visible if there are any child mounts + * that cover anything except for empty directories. + */ + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + struct inode *inode = child->mnt_mountpoint->d_inode; + if (!S_ISDIR(inode->i_mode)) + goto next; + if (inode->i_nlink != 2) + goto next; } - if (userns->may_mount_sysfs && userns->may_mount_proc) - break; + visible = true; + goto found; + next: ; } - up_read(&namespace_sem); +found: + namespace_unlock(); + return visible; } static void *mntns_get(struct task_struct *task) diff --git a/fs/proc/root.c b/fs/proc/root.c index 38bd5d423fcd..45e5fb7da09b 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -110,8 +110,11 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, ns = task_active_pid_ns(current); options = data; - if (!current_user_ns()->may_mount_proc || - !ns_capable(ns->user_ns, CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) + return ERR_PTR(-EPERM); + + /* Does the mounter have privilege over the pid namespace? */ + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); } diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index afd83273e6ce..4a2da3a4b1b1 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -112,7 +112,8 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, struct super_block *sb; int error; - if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs) + if (!(flags & MS_KERNMOUNT) && !capable(CAP_SYS_ADMIN) && + !fs_fully_visible(fs_type)) return ERR_PTR(-EPERM); info = kzalloc(sizeof(*info), GFP_KERNEL); -- cgit v1.2.3 From 7dc5dbc879bd0779924b5132a48b731a0bc04a1e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 25 Mar 2013 20:07:01 -0700 Subject: sysfs: Restrict mounting sysfs Don't allow mounting sysfs unless the caller has CAP_SYS_ADMIN rights over the net namespace. The principle here is if you create or have capabilities over it you can mount it, otherwise you get to live with what other people have mounted. Instead of testing this with a straight forward ns_capable call, perform this check the long and torturous way with kobject helpers, this keeps direct knowledge of namespaces out of sysfs, and preserves the existing sysfs abstractions. Acked-by: Greg Kroah-Hartman Signed-off-by: "Eric W. Biederman" --- fs/sysfs/mount.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 4a2da3a4b1b1..8c69ef49c7f3 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -112,9 +112,15 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, struct super_block *sb; int error; - if (!(flags & MS_KERNMOUNT) && !capable(CAP_SYS_ADMIN) && - !fs_fully_visible(fs_type)) - return ERR_PTR(-EPERM); + if (!(flags & MS_KERNMOUNT)) { + if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) + return ERR_PTR(-EPERM); + + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { + if (!kobj_ns_current_may_mount(type)) + return ERR_PTR(-EPERM); + } + } info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) -- cgit v1.2.3 From c7b96acf1456ef127fef461fcfedb54b81fecfbb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 20 Mar 2013 12:49:49 -0700 Subject: userns: Kill nsown_capable it makes the wrong thing easy nsown_capable is a special case of ns_capable essentially for just CAP_SETUID and CAP_SETGID. For the existing users it doesn't noticably simplify things and from the suggested patches I have seen it encourages people to do the wrong thing. So remove nsown_capable. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 4 ++-- fs/open.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 877e4277f496..dc519a1437ee 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2929,8 +2929,8 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns) struct path root; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || - !nsown_capable(CAP_SYS_CHROOT) || - !nsown_capable(CAP_SYS_ADMIN)) + !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; if (fs->users != 1) diff --git a/fs/open.c b/fs/open.c index 9156cb050d08..1c9d23f7e683 100644 --- a/fs/open.c +++ b/fs/open.c @@ -443,7 +443,7 @@ retry: goto dput_and_out; error = -EPERM; - if (!nsown_capable(CAP_SYS_CHROOT)) + if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (error) -- cgit v1.2.3