diff options
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- | kernel/cgroup.c | 81 |
1 files changed, 67 insertions, 14 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d3c521137425..fc5e4a48582f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -622,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) * Call subsys's pre_destroy handler. * This is called before css refcnt check. */ -static void cgroup_call_pre_destroy(struct cgroup *cgrp) +static int cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; + int ret = 0; + for_each_subsys(cgrp->root, ss) - if (ss->pre_destroy) - ss->pre_destroy(ss, cgrp); - return; + if (ss->pre_destroy) { + ret = ss->pre_destroy(ss, cgrp); + if (ret) + break; + } + return ret; } static void free_cgroup_rcu(struct rcu_head *obj) @@ -722,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry) remove_dir(dentry); } +/* + * A queue for waiters to do rmdir() cgroup. A tasks will sleep when + * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some + * reference to css->refcnt. In general, this refcnt is expected to goes down + * to zero, soon. + * + * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; + */ +DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); + +static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) +{ + if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) + wake_up_all(&cgroup_rmdir_waitq); +} + static int rebind_subsystems(struct cgroupfs_root *root, unsigned long final_bits) { @@ -1317,6 +1338,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) set_bit(CGRP_RELEASABLE, &oldcgrp->flags); synchronize_rcu(); put_css_set(cg); + + /* + * wake up rmdir() waiter. the rmdir should fail since the cgroup + * is no longer empty. + */ + cgroup_wakeup_rmdir_waiters(cgrp); return 0; } @@ -2608,9 +2635,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cgroup *cgrp = dentry->d_fsdata; struct dentry *d; struct cgroup *parent; + DEFINE_WAIT(wait); + int ret; /* the vfs holds both inode->i_mutex already */ - +again: mutex_lock(&cgroup_mutex); if (atomic_read(&cgrp->count) != 0) { mutex_unlock(&cgroup_mutex); @@ -2626,17 +2655,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) * Call pre_destroy handlers of subsys. Notify subsystems * that rmdir() request comes. */ - cgroup_call_pre_destroy(cgrp); + ret = cgroup_call_pre_destroy(cgrp); + if (ret) + return ret; mutex_lock(&cgroup_mutex); parent = cgrp->parent; - - if (atomic_read(&cgrp->count) - || !list_empty(&cgrp->children) - || !cgroup_clear_css_refs(cgrp)) { + if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } + /* + * css_put/get is provided for subsys to grab refcnt to css. In typical + * case, subsystem has no reference after pre_destroy(). But, under + * hierarchy management, some *temporal* refcnt can be hold. + * To avoid returning -EBUSY to a user, waitqueue is used. If subsys + * is really busy, it should return -EBUSY at pre_destroy(). wake_up + * is called when css_put() is called and refcnt goes down to 0. + */ + set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); + + if (!cgroup_clear_css_refs(cgrp)) { + mutex_unlock(&cgroup_mutex); + schedule(); + finish_wait(&cgroup_rmdir_waitq, &wait); + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + if (signal_pending(current)) + return -EINTR; + goto again; + } + /* NO css_tryget() can success after here. */ + finish_wait(&cgroup_rmdir_waitq, &wait); + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); @@ -3194,10 +3245,12 @@ void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; rcu_read_lock(); - if ((atomic_dec_return(&css->refcnt) == 1) && - notify_on_release(cgrp)) { - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); + if (atomic_dec_return(&css->refcnt) == 1) { + if (notify_on_release(cgrp)) { + set_bit(CGRP_RELEASABLE, &cgrp->flags); + check_for_release(cgrp); + } + cgroup_wakeup_rmdir_waiters(cgrp); } rcu_read_unlock(); } |