diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-25 16:00:17 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-25 16:00:17 -0700 |
commit | e4bc13adfd016fc1036838170288b5680d1a98b0 (patch) | |
tree | 8d2cb749397749439732f3a827cb7f2336408337 /include | |
parent | ad90fb97515b732bc27a0109baa10af636c3c8cd (diff) | |
parent | 3e1534cf4a2a8278e811e7c84a79da1a02347b8b (diff) |
Merge branch 'for-4.2/writeback' of git://git.kernel.dk/linux-block
Pull cgroup writeback support from Jens Axboe:
"This is the big pull request for adding cgroup writeback support.
This code has been in development for a long time, and it has been
simmering in for-next for a good chunk of this cycle too. This is one
of those problems that has been talked about for at least half a
decade, finally there's a solution and code to go with it.
Also see last weeks writeup on LWN:
http://lwn.net/Articles/648292/"
* 'for-4.2/writeback' of git://git.kernel.dk/linux-block: (85 commits)
writeback, blkio: add documentation for cgroup writeback support
vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB
writeback: do foreign inode detection iff cgroup writeback is enabled
v9fs: fix error handling in v9fs_session_init()
bdi: fix wrong error return value in cgwb_create()
buffer: remove unusued 'ret' variable
writeback: disassociate inodes from dying bdi_writebacks
writeback: implement foreign cgroup inode bdi_writeback switching
writeback: add lockdep annotation to inode_to_wb()
writeback: use unlocked_inode_to_wb transaction in inode_congested()
writeback: implement unlocked_inode_to_wb transaction and use it for stat updates
writeback: implement [locked_]inode_to_wb_and_lock_list()
writeback: implement foreign cgroup inode detection
writeback: make writeback_control track the inode being written back
writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb()
mm: vmscan: disable memcg direct reclaim stalling if cgroup writeback support is in use
writeback: implement memcg writeback domain based throttling
writeback: reset wb_domain->dirty_limit[_tstmp] when memcg domain size changes
writeback: implement memcg wb_domain
writeback: update wb_over_bg_thresh() to use wb_domain aware operations
...
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/backing-dev-defs.h | 255 | ||||
-rw-r--r-- | include/linux/backing-dev.h | 557 | ||||
-rw-r--r-- | include/linux/bio.h | 3 | ||||
-rw-r--r-- | include/linux/blk-cgroup.h | 655 | ||||
-rw-r--r-- | include/linux/blkdev.h | 21 | ||||
-rw-r--r-- | include/linux/cgroup.h | 25 | ||||
-rw-r--r-- | include/linux/fs.h | 26 | ||||
-rw-r--r-- | include/linux/memcontrol.h | 29 | ||||
-rw-r--r-- | include/linux/mm.h | 8 | ||||
-rw-r--r-- | include/linux/pagemap.h | 3 | ||||
-rw-r--r-- | include/linux/writeback.h | 221 | ||||
-rw-r--r-- | include/trace/events/writeback.h | 15 |
12 files changed, 1617 insertions, 201 deletions
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h new file mode 100644 index 000000000000..a48d90e3bcbb --- /dev/null +++ b/include/linux/backing-dev-defs.h @@ -0,0 +1,255 @@ +#ifndef __LINUX_BACKING_DEV_DEFS_H +#define __LINUX_BACKING_DEV_DEFS_H + +#include <linux/list.h> +#include <linux/radix-tree.h> +#include <linux/rbtree.h> +#include <linux/spinlock.h> +#include <linux/percpu_counter.h> +#include <linux/percpu-refcount.h> +#include <linux/flex_proportions.h> +#include <linux/timer.h> +#include <linux/workqueue.h> + +struct page; +struct device; +struct dentry; + +/* + * Bits in bdi_writeback.state + */ +enum wb_state { + WB_registered, /* bdi_register() was done */ + WB_writeback_running, /* Writeback is in progress */ + WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ +}; + +enum wb_congested_state { + WB_async_congested, /* The async (write) queue is getting full */ + WB_sync_congested, /* The sync queue is getting full */ +}; + +typedef int (congested_fn)(void *, int); + +enum wb_stat_item { + WB_RECLAIMABLE, + WB_WRITEBACK, + WB_DIRTIED, + WB_WRITTEN, + NR_WB_STAT_ITEMS +}; + +#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) + +/* + * For cgroup writeback, multiple wb's may map to the same blkcg. Those + * wb's can operate mostly independently but should share the congested + * state. To facilitate such sharing, the congested state is tracked using + * the following struct which is created on demand, indexed by blkcg ID on + * its bdi, and refcounted. + */ +struct bdi_writeback_congested { + unsigned long state; /* WB_[a]sync_congested flags */ + +#ifdef CONFIG_CGROUP_WRITEBACK + struct backing_dev_info *bdi; /* the associated bdi */ + atomic_t refcnt; /* nr of attached wb's and blkg */ + int blkcg_id; /* ID of the associated blkcg */ + struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */ +#endif +}; + +/* + * Each wb (bdi_writeback) can perform writeback operations, is measured + * and throttled, independently. Without cgroup writeback, each bdi + * (bdi_writeback) is served by its embedded bdi->wb. + * + * On the default hierarchy, blkcg implicitly enables memcg. This allows + * using memcg's page ownership for attributing writeback IOs, and every + * memcg - blkcg combination can be served by its own wb by assigning a + * dedicated wb to each memcg, which enables isolation across different + * cgroups and propagation of IO back pressure down from the IO layer upto + * the tasks which are generating the dirty pages to be written back. + * + * A cgroup wb is indexed on its bdi by the ID of the associated memcg, + * refcounted with the number of inodes attached to it, and pins the memcg + * and the corresponding blkcg. As the corresponding blkcg for a memcg may + * change as blkcg is disabled and enabled higher up in the hierarchy, a wb + * is tested for blkcg after lookup and removed from index on mismatch so + * that a new wb for the combination can be created. + */ +struct bdi_writeback { + struct backing_dev_info *bdi; /* our parent bdi */ + + unsigned long state; /* Always use atomic bitops on this */ + unsigned long last_old_flush; /* last old data flush */ + + struct list_head b_dirty; /* dirty inodes */ + struct list_head b_io; /* parked for writeback */ + struct list_head b_more_io; /* parked for more writeback */ + struct list_head b_dirty_time; /* time stamps are dirty */ + spinlock_t list_lock; /* protects the b_* lists */ + + struct percpu_counter stat[NR_WB_STAT_ITEMS]; + + struct bdi_writeback_congested *congested; + + unsigned long bw_time_stamp; /* last time write bw is updated */ + unsigned long dirtied_stamp; + unsigned long written_stamp; /* pages written at bw_time_stamp */ + unsigned long write_bandwidth; /* the estimated write bandwidth */ + unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */ + + /* + * The base dirty throttle rate, re-calculated on every 200ms. + * All the bdi tasks' dirty rate will be curbed under it. + * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit + * in small steps and is much more smooth/stable than the latter. + */ + unsigned long dirty_ratelimit; + unsigned long balanced_dirty_ratelimit; + + struct fprop_local_percpu completions; + int dirty_exceeded; + + spinlock_t work_lock; /* protects work_list & dwork scheduling */ + struct list_head work_list; + struct delayed_work dwork; /* work item used for writeback */ + +#ifdef CONFIG_CGROUP_WRITEBACK + struct percpu_ref refcnt; /* used only for !root wb's */ + struct fprop_local_percpu memcg_completions; + struct cgroup_subsys_state *memcg_css; /* the associated memcg */ + struct cgroup_subsys_state *blkcg_css; /* and blkcg */ + struct list_head memcg_node; /* anchored at memcg->cgwb_list */ + struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ + + union { + struct work_struct release_work; + struct rcu_head rcu; + }; +#endif +}; + +struct backing_dev_info { + struct list_head bdi_list; + unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ + unsigned int capabilities; /* Device capabilities */ + congested_fn *congested_fn; /* Function pointer if device is md/dm */ + void *congested_data; /* Pointer to aux data for congested func */ + + char *name; + + unsigned int min_ratio; + unsigned int max_ratio, max_prop_frac; + + /* + * Sum of avg_write_bw of wbs with dirty inodes. > 0 if there are + * any dirty wbs, which is depended upon by bdi_has_dirty(). + */ + atomic_long_t tot_write_bandwidth; + + struct bdi_writeback wb; /* the root writeback info for this bdi */ + struct bdi_writeback_congested wb_congested; /* its congested state */ +#ifdef CONFIG_CGROUP_WRITEBACK + struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ + struct rb_root cgwb_congested_tree; /* their congested states */ + atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */ +#endif + wait_queue_head_t wb_waitq; + + struct device *dev; + + struct timer_list laptop_mode_wb_timer; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debug_dir; + struct dentry *debug_stats; +#endif +}; + +enum { + BLK_RW_ASYNC = 0, + BLK_RW_SYNC = 1, +}; + +void clear_wb_congested(struct bdi_writeback_congested *congested, int sync); +void set_wb_congested(struct bdi_writeback_congested *congested, int sync); + +static inline void clear_bdi_congested(struct backing_dev_info *bdi, int sync) +{ + clear_wb_congested(bdi->wb.congested, sync); +} + +static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync) +{ + set_wb_congested(bdi->wb.congested, sync); +} + +#ifdef CONFIG_CGROUP_WRITEBACK + +/** + * wb_tryget - try to increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + return percpu_ref_tryget(&wb->refcnt); + return true; +} + +/** + * wb_get - increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline void wb_get(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_get(&wb->refcnt); +} + +/** + * wb_put - decrement a wb's refcount + * @wb: bdi_writeback to put + */ +static inline void wb_put(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_put(&wb->refcnt); +} + +/** + * wb_dying - is a wb dying? + * @wb: bdi_writeback of interest + * + * Returns whether @wb is unlinked and being drained. + */ +static inline bool wb_dying(struct bdi_writeback *wb) +{ + return percpu_ref_is_dying(&wb->refcnt); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + return true; +} + +static inline void wb_get(struct bdi_writeback *wb) +{ +} + +static inline void wb_put(struct bdi_writeback *wb) +{ +} + +static inline bool wb_dying(struct bdi_writeback *wb) +{ + return false; +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + +#endif /* __LINUX_BACKING_DEV_DEFS_H */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index d87d8eced064..0e6d4828a77a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -8,106 +8,13 @@ #ifndef _LINUX_BACKING_DEV_H #define _LINUX_BACKING_DEV_H -#include <linux/percpu_counter.h> -#include <linux/log2.h> -#include <linux/flex_proportions.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/sched.h> -#include <linux/timer.h> +#include <linux/blkdev.h> #include <linux/writeback.h> -#include <linux/atomic.h> -#include <linux/sysctl.h> -#include <linux/workqueue.h> - -struct page; -struct device; -struct dentry; - -/* - * Bits in backing_dev_info.state - */ -enum bdi_state { - BDI_async_congested, /* The async (write) queue is getting full */ - BDI_sync_congested, /* The sync queue is getting full */ - BDI_registered, /* bdi_register() was done */ - BDI_writeback_running, /* Writeback is in progress */ -}; - -typedef int (congested_fn)(void *, int); - -enum bdi_stat_item { - BDI_RECLAIMABLE, - BDI_WRITEBACK, - BDI_DIRTIED, - BDI_WRITTEN, - NR_BDI_STAT_ITEMS -}; - -#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) - -struct bdi_writeback { - struct backing_dev_info *bdi; /* our parent bdi */ - - unsigned long last_old_flush; /* last old data flush */ - - struct delayed_work dwork; /* work item used for writeback */ - struct list_head b_dirty; /* dirty inodes */ - struct list_head b_io; /* parked for writeback */ - struct list_head b_more_io; /* parked for more writeback */ - struct list_head b_dirty_time; /* time stamps are dirty */ - spinlock_t list_lock; /* protects the b_* lists */ -}; - -struct backing_dev_info { - struct list_head bdi_list; - unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ - unsigned long state; /* Always use atomic bitops on this */ - unsigned int capabilities; /* Device capabilities */ - congested_fn *congested_fn; /* Function pointer if device is md/dm */ - void *congested_data; /* Pointer to aux data for congested func */ - - char *name; - - struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; - - unsigned long bw_time_stamp; /* last time write bw is updated */ - unsigned long dirtied_stamp; - unsigned long written_stamp; /* pages written at bw_time_stamp */ - unsigned long write_bandwidth; /* the estimated write bandwidth */ - unsigned long avg_write_bandwidth; /* further smoothed write bw */ - - /* - * The base dirty throttle rate, re-calculated on every 200ms. - * All the bdi tasks' dirty rate will be curbed under it. - * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit - * in small steps and is much more smooth/stable than the latter. - */ - unsigned long dirty_ratelimit; - unsigned long balanced_dirty_ratelimit; - - struct fprop_local_percpu completions; - int dirty_exceeded; - - unsigned int min_ratio; - unsigned int max_ratio, max_prop_frac; - - struct bdi_writeback wb; /* default writeback info for this bdi */ - spinlock_t wb_lock; /* protects work_list & wb.dwork scheduling */ - - struct list_head work_list; - - struct device *dev; - - struct timer_list laptop_mode_wb_timer; - -#ifdef CONFIG_DEBUG_FS - struct dentry *debug_dir; - struct dentry *debug_stats; -#endif -}; - -struct backing_dev_info *inode_to_bdi(struct inode *inode); +#include <linux/blk-cgroup.h> +#include <linux/backing-dev-defs.h> int __must_check bdi_init(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi); @@ -117,97 +24,99 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); -void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - enum wb_reason reason); -void bdi_start_background_writeback(struct backing_dev_info *bdi); -void bdi_writeback_workfn(struct work_struct *work); -int bdi_has_dirty_io(struct backing_dev_info *bdi); -void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); +void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, + bool range_cyclic, enum wb_reason reason); +void wb_start_background_writeback(struct bdi_writeback *wb); +void wb_workfn(struct work_struct *work); +void wb_wakeup_delayed(struct bdi_writeback *wb); extern spinlock_t bdi_lock; extern struct list_head bdi_list; extern struct workqueue_struct *bdi_wq; -static inline int wb_has_dirty_io(struct bdi_writeback *wb) +static inline bool wb_has_dirty_io(struct bdi_writeback *wb) { - return !list_empty(&wb->b_dirty) || - !list_empty(&wb->b_io) || - !list_empty(&wb->b_more_io); + return test_bit(WB_has_dirty_io, &wb->state); +} + +static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi) +{ + /* + * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are + * any dirty wbs. See wb_update_write_bandwidth(). + */ + return atomic_long_read(&bdi->tot_write_bandwidth); } -static inline void __add_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item, s64 amount) +static inline void __add_wb_stat(struct bdi_writeback *wb, + enum wb_stat_item item, s64 amount) { - __percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH); + __percpu_counter_add(&wb->stat[item], amount, WB_STAT_BATCH); } -static inline void __inc_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void __inc_wb_stat(struct bdi_writeback *wb, + enum wb_stat_item item) { - __add_bdi_stat(bdi, item, 1); + __add_wb_stat(wb, item, 1); } -static inline void inc_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { unsigned long flags; local_irq_save(flags); - __inc_bdi_stat(bdi, item); + __inc_wb_stat(wb, item); local_irq_restore(flags); } -static inline void __dec_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void __dec_wb_stat(struct bdi_writeback *wb, + enum wb_stat_item item) { - __add_bdi_stat(bdi, item, -1); + __add_wb_stat(wb, item, -1); } -static inline void dec_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { unsigned long flags; local_irq_save(flags); - __dec_bdi_stat(bdi, item); + __dec_wb_stat(wb, item); local_irq_restore(flags); } -static inline s64 bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { - return percpu_counter_read_positive(&bdi->bdi_stat[item]); + return percpu_counter_read_positive(&wb->stat[item]); } -static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline s64 __wb_stat_sum(struct bdi_writeback *wb, + enum wb_stat_item item) { - return percpu_counter_sum_positive(&bdi->bdi_stat[item]); + return percpu_counter_sum_positive(&wb->stat[item]); } -static inline s64 bdi_stat_sum(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item) { s64 sum; unsigned long flags; local_irq_save(flags); - sum = __bdi_stat_sum(bdi, item); + sum = __wb_stat_sum(wb, item); local_irq_restore(flags); return sum; } -extern void bdi_writeout_inc(struct backing_dev_info *bdi); +extern void wb_writeout_inc(struct bdi_writeback *wb); /* * maximal error of a stat counter. */ -static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi) +static inline unsigned long wb_stat_error(struct bdi_writeback *wb) { #ifdef CONFIG_SMP - return nr_cpu_ids * BDI_STAT_BATCH; + return nr_cpu_ids * WB_STAT_BATCH; #else return 1; #endif @@ -231,50 +140,57 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_NO_WRITEBACK: Don't write pages back * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. + * + * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback. */ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 #define BDI_CAP_NO_ACCT_WB 0x00000004 #define BDI_CAP_STABLE_WRITES 0x00000008 #define BDI_CAP_STRICTLIMIT 0x00000010 +#define BDI_CAP_CGROUP_WRITEBACK 0x00000020 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) extern struct backing_dev_info noop_backing_dev_info; -int writeback_in_progress(struct backing_dev_info *bdi); - -static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) +/** + * writeback_in_progress - determine whether there is writeback in progress + * @wb: bdi_writeback of interest + * + * Determine whether there is writeback waiting to be handled against a + * bdi_writeback. + */ +static inline bool writeback_in_progress(struct bdi_writeback *wb) { - if (bdi->congested_fn) - return bdi->congested_fn(bdi->congested_data, bdi_bits); - return (bdi->state & bdi_bits); + return test_bit(WB_writeback_running, &wb->state); } -static inline int bdi_read_congested(struct backing_dev_info *bdi) +static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) { - return bdi_congested(bdi, 1 << BDI_sync_congested); -} + struct super_block *sb; -static inline int bdi_write_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, 1 << BDI_async_congested); + if (!inode) + return &noop_backing_dev_info; + + sb = inode->i_sb; +#ifdef CONFIG_BLOCK + if (sb_is_blkdev_sb(sb)) + return blk_get_backing_dev_info(I_BDEV(inode)); +#endif + return sb->s_bdi; } -static inline int bdi_rw_congested(struct backing_dev_info *bdi) +static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) { - return bdi_congested(bdi, (1 << BDI_sync_congested) | - (1 << BDI_async_congested)); -} + struct backing_dev_info *bdi = wb->bdi; -enum { - BLK_RW_ASYNC = 0, - BLK_RW_SYNC = 1, -}; + if (bdi->congested_fn) + return bdi->congested_fn(bdi->congested_data, cong_bits); + return wb->congested->state & cong_bits; +} -void clear_bdi_congested(struct backing_dev_info *bdi, int sync); -void set_bdi_congested(struct backing_dev_info *bdi, int sync); long congestion_wait(int sync, long timeout); long wait_iff_congested(struct zone *zone, int sync, long timeout); int pdflush_proc_obsolete(struct ctl_table *table, int write, @@ -318,4 +234,333 @@ static inline int bdi_sched_wait(void *word) return 0; } -#endif /* _LINUX_BACKING_DEV_H */ +#ifdef CONFIG_CGROUP_WRITEBACK + +struct bdi_writeback_congested * +wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp); +void wb_congested_put(struct bdi_writeback_congested *congested); +struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, + gfp_t gfp); +void wb_memcg_offline(struct mem_cgroup *memcg); +void wb_blkcg_offline(struct blkcg *blkcg); +int inode_congested(struct inode *inode, int cong_bits); + +/** + * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode + * @inode: inode of interest + * + * cgroup writeback requires support from both the bdi and filesystem. + * Test whether @inode has both. + */ +static inline bool inode_cgwb_enabled(struct inode *inode) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + + return bdi_cap_account_dirty(bdi) && + (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) && + (inode->i_sb->s_iflags & SB_I_CGROUPWB); +} + +/** + * wb_find_current - find wb for %current on a bdi + * @bdi: bdi of interest + * + * Find the wb of @bdi which matches both the memcg and blkcg of %current. + * Must be called under rcu_read_lock() which protects the returend wb. + * NULL if not found. + */ +static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) +{ + struct cgroup_subsys_state *memcg_css; + struct bdi_writeback *wb; + + memcg_css = task_css(current, memory_cgrp_id); + if (!memcg_css->parent) + return &bdi->wb; + + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + + /* + * %current's blkcg equals the effective blkcg of its memcg. No + * need to use the relatively expensive cgroup_get_e_css(). + */ + if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id))) + return wb; + return NULL; +} + +/** + * wb_get_create_current - get or create wb for %current on a bdi + * @bdi: bdi of interest + * @gfp: allocation mask + * + * Equivalent to wb_get_create() on %current's memcg. This function is + * called from a relatively hot path and optimizes the common cases using + * wb_find_current(). + */ +static inline struct bdi_writeback * +wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) +{ + struct bdi_writeback *wb; + + rcu_read_lock(); + wb = wb_find_current(bdi); + if (wb && unlikely(!wb_tryget(wb))) + wb = NULL; + rcu_read_unlock(); + + if (unlikely(!wb)) { + struct cgroup_subsys_state *memcg_css; + + memcg_css = task_get_css(current, memory_cgrp_id); + wb = wb_get_create(bdi, memcg_css, gfp); + css_put(memcg_css); + } + return wb; +} + +/** + * inode_to_wb_is_valid - test whether an inode has a wb associated + * @inode: inode of interest + * + * Returns %true if @inode has a wb associated. May be called without any + * locking. + */ +static inline bool inode_to_wb_is_valid(struct inode *inode) +{ + return inode->i_wb; +} + +/** + * inode_to_wb - determine the wb of an inode + * @inode: inode of interest + * + * Returns the wb @inode is currently associated with. The caller must be + * holding either @inode->i_lock, @inode->i_mapping->tree_lock, or the + * associated wb's list_lock. + */ +static inline struct bdi_writeback *inode_to_wb(struct inode *inode) +{ +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(debug_locks && + (!lockdep_is_held(&inode->i_lock) && + !lockdep_is_held(&inode->i_mapping->tree_lock) && + !lockdep_is_held(&inode->i_wb->list_lock))); +#endif + return inode->i_wb; +} + +/** + * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction + * @inode: target inode + * @lockedp: temp bool output param, to be passed to the end function + * + * The caller wants to access the wb associated with @inode but isn't + * holding inode->i_lock, mapping->tree_lock or wb->list_lock. This + * function determines the wb associated with @inode and ensures that the + * association doesn't change until the transaction is finished with + * unlocked_inode_to_wb_end(). + * + * The caller must call unlocked_inode_to_wb_end() with *@lockdep + * afterwards and can't sleep during transaction. IRQ may or may not be + * disabled on return. + */ +static inline struct bdi_writeback * +unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +{ + rcu_read_lock(); + + /* + * Paired with store_release in inode_switch_wb_work_fn() and + * ensures that we see the new wb if we see cleared I_WB_SWITCH. + */ + *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; + + if (unlikely(*lockedp)) + spin_lock_irq(&inode->i_mapping->tree_lock); + + /* + * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock. + * inode_to_wb() will bark. Deref directly. + */ + return inode->i_wb; +} + +/** + * unlocked_inode_to_wb_end - end inode wb access transaction + * @inode: target inode + * @locked: *@lockedp from unlocked_inode_to_wb_begin() + */ +static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +{ + if (unlikely(locked)) + spin_unlock_irq(&inode->i_mapping->tree_lock); + + rcu_read_unlock(); +} + +struct wb_iter { + int start_blkcg_id; + struct radix_tree_iter tree_iter; + void **slot; +}; + +static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter, + struct backing_dev_info *bdi) +{ + struct radix_tree_iter *titer = &iter->tree_iter; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (iter->start_blkcg_id >= 0) { + iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id); + iter->start_blkcg_id = -1; + } else { + iter->slot = radix_tree_next_slot(iter->slot, titer, 0); + } + + if (!iter->slot) + iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0); + if (iter->slot) + return *iter->slot; + return NULL; +} + +static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter, + struct backing_dev_info *bdi, + int start_blkcg_id) +{ + iter->start_blkcg_id = start_blkcg_id; + + if (start_blkcg_id) + return __wb_iter_next(iter, bdi); + else + return &bdi->wb; +} + +/** + * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order + * @wb_cur: cursor struct bdi_writeback pointer + * @bdi: bdi to walk wb's of + * @iter: pointer to struct wb_iter to be used as iteration buffer + * @start_blkcg_id: blkcg ID to start iteration from + * + * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending + * blkcg ID order starting from @start_blkcg_id. @iter is struct wb_iter + * to be used as temp storage during iteration. rcu_read_lock() must be + * held throughout iteration. + */ +#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \ + for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id); \ + (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi)) + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline bool inode_cgwb_enabled(struct inode *inode) +{ + return false; +} + +static inline struct bdi_writeback_congested * +wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) +{ + return bdi->wb.congested; +} + +static inline void wb_congested_put(struct bdi_writeback_congested *congested) +{ +} + +static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) +{ + return &bdi->wb; +} + +static inline struct bdi_writeback * +wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) +{ + return &bdi->wb; +} + +static inline bool inode_to_wb_is_valid(struct inode *inode) +{ + return true; +} + +static inline struct bdi_writeback *inode_to_wb(struct inode *inode) +{ + return &inode_to_bdi(inode)->wb; +} + +static inline struct bdi_writeback * +unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +{ + return inode_to_wb(inode); +} + +static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +{ +} + +static inline void wb_memcg_offline(struct mem_cgroup *memcg) +{ +} + +static inline void wb_blkcg_offline(struct blkcg *blkcg) +{ +} + +struct wb_iter { + int next_id; +}; + +#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \ + for ((iter)->next_id = (start_blkcg_id); \ + ({ (wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); ) + +static inline int inode_congested(struct inode *inode, int cong_bits) +{ + return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + +static inline int inode_read_congested(struct inode *inode) +{ + return inode_congested(inode, 1 << WB_sync_congested); +} + +static inline int inode_write_congested(struct inode *inode) +{ + return inode_congested(inode, 1 << WB_async_congested); +} + +static inline int inode_rw_congested(struct inode *inode) +{ + return inode_congested(inode, (1 << WB_sync_congested) | + (1 << WB_async_congested)); +} + +static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits) +{ + return wb_congested(&bdi->wb, cong_bits); +} + +static inline int bdi_read_congested(struct backing_dev_info *bdi) +{ + return bdi_congested(bdi, 1 << WB_sync_congested); +} + +static inline int bdi_write_congested(struct backing_dev_info *bdi) +{ + return bdi_congested(bdi, 1 << WB_async_congested); +} + +static inline int bdi_rw_congested(struct backing_dev_info *bdi) +{ + return bdi_congested(bdi, (1 << WB_sync_congested) | + (1 << WB_async_congested)); +} + +#endif /* _LINUX_BACKING_DEV_H */ diff --git a/include/linux/bio.h b/include/linux/bio.h index f0291cf64cc5..5e963a6d7c14 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -482,9 +482,12 @@ extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); #ifdef CONFIG_BLK_CGROUP +int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_current(struct bio *bio); void bio_disassociate_task(struct bio *bio); #else /* CONFIG_BLK_CGROUP */ +static inline int bio_associate_blkcg(struct bio *bio, + struct cgroup_subsys_state *blkcg_css) { return 0; } static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } static inline void bio_disassociate_task(struct bio *bio) { } #endif /* CONFIG_BLK_CGROUP */ diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h new file mode 100644 index 000000000000..58cfab80dd70 --- /dev/null +++ b/include/linux/blk-cgroup.h @@ -0,0 +1,655 @@ +#ifndef _BLK_CGROUP_H +#define _BLK_CGROUP_H +/* + * Common Block IO controller cgroup interface + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> + * + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> + * Paolo Valente <paolo.valente@unimore.it> + * + * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> + * Nauman Rafique <nauman@google.com> + */ + +#include <linux/cgroup.h> +#include <linux/u64_stats_sync.h> +#include <linux/seq_file.h> +#include <linux/radix-tree.h> +#include <linux/blkdev.h> +#include <linux/atomic.h> + +/* Max limits for throttle policy */ +#define THROTL_IOPS_MAX UINT_MAX + +#ifdef CONFIG_BLK_CGROUP + +enum blkg_rwstat_type { + BLKG_RWSTAT_READ, + BLKG_RWSTAT_WRITE, + BLKG_RWSTAT_SYNC, + BLKG_RWSTAT_ASYNC, + + BLKG_RWSTAT_NR, + BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, +}; + +struct blkcg_gq; + +struct blkcg { + struct cgroup_subsys_state css; + spinlock_t lock; + + struct radix_tree_root blkg_tree; + struct blkcg_gq *blkg_hint; + struct hlist_head blkg_list; + + struct blkcg_policy_data *pd[BLKCG_MAX_POLS]; + +#ifdef CONFIG_CGROUP_WRITEBACK + struct list_head cgwb_list; +#endif +}; + +struct blkg_stat { + struct u64_stats_sync syncp; + uint64_t cnt; +}; + +struct blkg_rwstat { + struct u64_stats_sync syncp; + uint64_t cnt[BLKG_RWSTAT_NR]; +}; + +/* + * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a + * request_queue (q). This is used by blkcg policies which need to track + * information per blkcg - q pair. + * + * There can be multiple active blkcg policies and each has its private + * data on each blkg, the size of which is determined by + * blkcg_policy->pd_size. blkcg core allocates and frees such areas + * together with blkg and invokes pd_init/exit_fn() methods. + * + * Such private data must embed struct blkg_policy_data (pd) at the + * beginning and pd_size can't be smaller than pd. + */ +struct blkg_policy_data { + /* the blkg and policy id this per-policy data belongs to */ + struct blkcg_gq *blkg; + int plid; + + /* used during policy activation */ + struct list_head alloc_node; +}; + +/* + * Policies that need to keep per-blkcg data which is independent + * from any request_queue associated to it must specify its size + * with the cpd_size field of the blkcg_policy structure and + * embed a blkcg_policy_data in it. blkcg core allocates + * policy-specific per-blkcg structures lazily the first time + * they are actually needed, so it handles them together with + * blkgs. cpd_init() is invoked to let each policy handle + * per-blkcg data. + */ +struct blkcg_policy_data { + /* the policy id this per-policy data belongs to */ + int plid; + + /* used during policy activation */ + struct list_head alloc_node; +}; + +/* association between a blk cgroup and a request queue */ +struct blkcg_gq { + /* Pointer to the associated request_queue */ + struct request_queue *q; + struct list_head q_node; + struct hlist_node blkcg_node; + struct blkcg *blkcg; + + /* + * Each blkg gets congested separately and the congestion state is + * propagated to the matching bdi_writeback_congested. + */ + struct bdi_writeback_congested *wb_congested; + + /* all non-root blkcg_gq's are guaranteed to have access to parent */ + struct blkcg_gq *parent; + + /* request allocation list for this blkcg-q pair */ + struct request_list rl; + + /* reference count */ + atomic_t refcnt; + + /* is this blkg online? protected by both blkcg and q locks */ + bool online; + + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; + + struct rcu_head rcu_head; +}; + +typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); +typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); + +struct blkcg_policy { + int plid; + /* policy specific private data size */ + size_t pd_size; + /* policy specific per-blkcg data size */ + size_t cpd_size; + /* cgroup files for the policy */ + struct cftype *cftypes; + + /* operations */ + blkcg_pol_init_cpd_fn *cpd_init_fn; + blkcg_pol_init_pd_fn *pd_init_fn; + blkcg_pol_online_pd_fn *pd_online_fn; + blkcg_pol_offline_pd_fn *pd_offline_fn; + blkcg_pol_exit_pd_fn *pd_exit_fn; + blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; +}; + +extern struct blkcg blkcg_root; +extern struct cgroup_subsys_state * const blkcg_root_css; + +struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); +struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q); +int blkcg_init_queue(struct request_queue *q); +void blkcg_drain_queue(struct request_queue *q); +void blkcg_exit_queue(struct request_queue *q); + +/* Blkio controller policy registration */ +int blkcg_policy_register(struct blkcg_policy *pol); +void blkcg_policy_unregister(struct blkcg_policy *pol); +int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol); +void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol); + +void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, + u64 (*prfill)(struct seq_file *, + struct blkg_policy_data *, int), + const struct blkcg_policy *pol, int data, + bool show_total); +u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); +u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat *rwstat); +u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); +u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off); + +u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, + int off); + +struct blkg_conf_ctx { + struct gendisk *disk; + struct blkcg_gq *blkg; + u64 v; +}; + +int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + const char *input, struct blkg_conf_ctx *ctx); +void blkg_conf_finish(struct blkg_conf_ctx *ctx); + + +static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct blkcg, css) : NULL; +} + +static inline struct blkcg *task_blkcg(struct task_struct *tsk) +{ + return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); +} + +static inline struct blkcg *bio_blkcg(struct bio *bio) +{ + if (bio && bio->bi_css) + return css_to_blkcg(bio->bi_css); + return task_blkcg(current); +} + +static inline struct cgroup_subsys_state * +task_get_blkcg_css(struct task_struct *task) +{ + return task_get_css(task, blkio_cgrp_id); +} + +/** + * blkcg_parent - get the parent of a blkcg + * @blkcg: blkcg of interest + * + * Return the parent blkcg of @blkcg. Can be called anytime. + */ +static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) +{ + return css_to_blkcg(blkcg->css.parent); +} + +/** + * blkg_to_pdata - get policy private data + * @blkg: blkg of interest + * @pol: policy of interest + * + * Return pointer to private data associated with the @blkg-@pol pair. + */ +static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) +{ + return blkg ? blkg->pd[pol->plid] : NULL; +} + +static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, + struct blkcg_policy *pol) +{ + return blkcg ? blkcg->pd[pol->plid] : NULL; +} + +/** + * pdata_to_blkg - get blkg associated with policy private data + * @pd: policy private data of interest + * + * @pd is policy private data. Determine the blkg it's associated with. + */ +static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) +{ + return pd ? pd->blkg : NULL; +} + +/** + * blkg_path - format cgroup path of blkg + * @blkg: blkg of interest + * @buf: target buffer + * @buflen: target buffer length + * + * Format the path of the cgroup of @blkg into @buf. + */ +static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) +{ + char *p; + + p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); + if (!p) { + strncpy(buf, "<unavailable>", buflen); + return -ENAMETOOLONG; + } + + memmove(buf, p, buf + buflen - p); + return 0; +} + +/** + * blkg_get - get a blkg reference + * @blkg: blkg to get + * + * The caller should be holding an existing reference. + */ +static inline void blkg_get(struct blkcg_gq *blkg) +{ + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + atomic_inc(&blkg->refcnt); +} + +void __blkg_release_rcu(struct rcu_head *rcu); + +/** + * blkg_put - put a blkg reference + * @blkg: blkg to put + */ +static inline void blkg_put(struct blkcg_gq *blkg) +{ + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + if (atomic_dec_and_test(&blkg->refcnt)) + call_rcu(&blkg->rcu_head, __blkg_release_rcu); +} + +struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, + bool update_hint); + +/** + * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU + * read locked. If called under either blkcg or queue lock, the iteration + * is guaranteed to include all and only online blkgs. The caller may + * update @pos_css by calling css_rightmost_descendant() to skip subtree. + * @p_blkg is included in the iteration and the first node to be visited. + */ +#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + +/** + * blkg_for_each_descendant_post - post-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Similar to blkg_for_each_descendant_pre() but performs post-order + * traversal instead. Synchronization rules are the same. @p_blkg is + * included in the iteration and the last node to be visited. + */ +#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + +/** + * blk_get_rl - get request_list to use + * @q: request_queue of interest + * @bio: bio which will be attached to the allocated request (may be %NULL) + * + * The caller wants to allocate a request from @q to use for @bio. Find + * the request_list to use and obtain a reference on it. Should be called + * under queue_lock. This function is guaranteed to return non-%NULL + * request_list. + */ +static inline struct request_list *blk_get_rl(struct request_queue *q, + struct bio *bio) +{ + struct blkcg *blkcg; + struct blkcg_gq *blkg; + + rcu_read_lock(); + + blkcg = bio_blkcg(bio); + + /* bypass blkg lookup and use @q->root_rl directly for root */ + if (blkcg == &blkcg_root) + goto root_rl; + + /* + * Try to use blkg->rl. blkg lookup may fail under memory pressure + * or if either the blkcg or queue is going away. Fall back to + * root_rl in such cases. + */ + blkg = blkg_lookup_create(blkcg, q); + if (unlikely(IS_ERR(blkg))) + goto root_rl; + + blkg_get(blkg); + rcu_read_unlock(); + return &blkg->rl; +root_rl: + rcu_read_unlock(); + return &q->root_rl; +} + +/** + * blk_put_rl - put request_list + * @rl: request_list to put + * + * Put the reference acquired by blk_get_rl(). Should be called under + * queue_lock. + */ +static inline void blk_put_rl(struct request_list *rl) +{ + /* root_rl may not have blkg set */ + if (rl->blkg && rl->blkg->blkcg != &blkcg_root) + blkg_put(rl->blkg); +} + +/** + * blk_rq_set_rl - associate a request with a request_list + * @rq: request of interest + * @rl: target request_list + * + * Associate @rq with @rl so that accounting and freeing can know the + * request_list @rq came from. + */ +static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) +{ + rq->rl = rl; +} + +/** + * blk_rq_rl - return the request_list a request came from + * @rq: request of interest + * + * Return the request_list @rq is allocated from. + */ +static inline struct request_list *blk_rq_rl(struct request *rq) +{ + return rq->rl; +} + +struct request_list *__blk_queue_next_rl(struct request_list *rl, + struct request_queue *q); +/** + * blk_queue_for_each_rl - iterate through all request_lists of a request_queue + * + * Should be used under queue_lock. + */ +#define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) + +static inline void blkg_stat_init(struct blkg_stat *stat) +{ + u64_stats_init(&stat->syncp); +} + +/** + * blkg_stat_add - add a value to a blkg_stat + * @stat: target blkg_stat + * @val: value to add + * + * Add @val to @stat. The caller is responsible for synchronizing calls to + * this function. + */ +static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) +{ + u64_stats_update_begin(&stat->syncp); + stat->cnt += val; + u64_stats_update_end(&stat->syncp); +} + +/** + * blkg_stat_read - read the current value of a blkg_stat + * @stat: blkg_stat to read + * + * Read the current value of @stat. This function can be called without + * synchroniztion and takes care of u64 atomicity. + */ +static inline uint64_t blkg_stat_read(struct blkg_stat *stat) +{ + unsigned int start; + uint64_t v; + + do { + start = u64_stats_fetch_begin_irq(&stat->syncp); + v = stat->cnt; + } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); + + return v; +} + +/** + * blkg_stat_reset - reset a blkg_stat + * @stat: blkg_stat to reset + */ +static inline void blkg_stat_reset(struct blkg_stat *stat) +{ + stat->cnt = 0; +} + +/** + * blkg_stat_merge - merge a blkg_stat into another + * @to: the destination blkg_stat + * @from: the source + * + * Add @from's count to @to. + */ +static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) +{ + blkg_stat_add(to, blkg_stat_read(from)); +} + +static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) +{ + u64_stats_init(&rwstat->syncp); +} + +/** + * blkg_rwstat_add - add a value to a blkg_rwstat + * @rwstat: target blkg_rwstat + * @rw: mask of REQ_{WRITE|SYNC} + * @val: value to add + * + * Add @val to @rwstat. The counters are chosen according to @rw. The + * caller is responsible for synchronizing calls to this function. + */ +static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, + int rw, uint64_t val) +{ + u64_stats_update_begin(&rwstat->syncp); + + if (rw & REQ_WRITE) + rwstat->cnt[BLKG_RWSTAT_WRITE] += val; + else + rwstat->cnt[BLKG_RWSTAT_READ] += val; + if (rw & REQ_SYNC) + rwstat->cnt[BLKG_RWSTAT_SYNC] += val; + else + rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; + + u64_stats_update_end(&rwstat->syncp); +} + +/** + * blkg_rwstat_read - read the current values of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Read the current snapshot of @rwstat and return it as the return value. + * This function can be called without synchronization and takes care of + * u64 atomicity. + */ +static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) +{ + unsigned int start; + struct blkg_rwstat tmp; + + do { + start = u64_stats_fetch_begin_irq(&rwstat->syncp); + tmp = *rwstat; + } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); + + return tmp; +} + +/** + * blkg_rwstat_total - read the total count of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Return the total count of @rwstat regardless of the IO direction. This + * function can be called without synchronization and takes care of u64 + * atomicity. + */ +static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) +{ + struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); + + return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; +} + +/** + * blkg_rwstat_reset - reset a blkg_rwstat + * @rwstat: blkg_rwstat to reset + */ +static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) +{ + memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); +} + +/** + * blkg_rwstat_merge - merge a blkg_rwstat into another + * @to: the destination blkg_rwstat + * @from: the source + * + * Add @from's counts to @to. + */ +static inline void blkg_rwstat_merge(struct blkg_rwstat *to, + struct blkg_rwstat *from) +{ + struct blkg_rwstat v = blkg_rwstat_read(from); + int i; + + u64_stats_update_begin(&to->syncp); + for (i = 0; i < BLKG_RWSTAT_NR; i++) + to->cnt[i] += v.cnt[i]; + u64_stats_update_end(&to->syncp); +} + +#else /* CONFIG_BLK_CGROUP */ + +struct blkcg { +}; + +struct blkg_policy_data { +}; + +struct blkcg_policy_data { +}; + +struct blkcg_gq { +}; + +struct blkcg_policy { +}; + +#define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) + +static inline struct cgroup_subsys_state * +task_get_blkcg_css(struct task_struct *task) +{ + return NULL; +} + +#ifdef CONFIG_BLOCK + +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } +static inline int blkcg_init_queue(struct request_queue *q) { return 0; } +static inline void blkcg_drain_queue(struct request_queue *q) { } +static inline void blkcg_exit_queue(struct request_queue *q) { } +static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } +static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } +static inline int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { return 0; } +static inline void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { } + +static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } + +static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) { return NULL; } +static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } +static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } +static inline void blkg_get(struct blkcg_gq *blkg) { } +static inline void blkg_put(struct blkcg_gq *blkg) { } + +static inline struct request_list *blk_get_rl(struct request_queue *q, + struct bio *bio) { return &q->root_rl; } +static inline void blk_put_rl(struct request_list *rl) { } +static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } +static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } + +#define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) + +#endif /* CONFIG_BLOCK */ +#endif /* CONFIG_BLK_CGROUP */ +#endif /* _BLK_CGROUP_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5ced29cef03f..7f2f54b4587f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -12,7 +12,7 @@ #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/pagemap.h> -#include <linux/backing-dev.h> +#include <linux/backing-dev-defs.h> #include <linux/wait.h> #include <linux/mempool.h> #include <linux/bio.h> @@ -787,25 +787,6 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -/* - * A queue has just exitted congestion. Note this in the global counter of - * congested queues, and wake up anyone who was waiting for requests to be - * put back. - */ -static inline void blk_clear_queue_congested(struct request_queue *q, int sync) -{ - clear_bdi_congested(&q->backing_dev_info, sync); -} - -/* - * A queue has just entered congestion. Flag that in the queue's VM-visible - * state flags and increment the global gounter of congested queues. - */ -static inline void blk_set_queue_congested(struct request_queue *q, int sync) -{ - set_bdi_congested(&q->backing_dev_info, sync); -} - extern void blk_start_queue(struct request_queue *q); extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b9cb94c3102a..e7da0aa65b2d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -774,6 +774,31 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task, } /** + * task_get_css - find and get the css for (task, subsys) + * @task: the target task + * @subsys_id: the target subsystem ID + * + * Find the css for the (@task, @subsys_id) combination, increment a + * reference on and return it. This function is guaranteed to return a + * valid css. + */ +static inline struct cgroup_subsys_state * +task_get_css(struct task_struct *task, int subsys_id) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + while (true) { + css = task_css(task, subsys_id); + if (likely(css_tryget_online(css))) + break; + cpu_relax(); + } + rcu_read_unlock(); + return css; +} + +/** * task_css_is_root - test whether a task belongs to the root css * @task: the target task * @subsys_id: the target subsystem ID diff --git a/include/linux/fs.h b/include/linux/fs.h index 5db7b1379d17..e351da4a934f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -35,6 +35,7 @@ #include <uapi/linux/fs.h> struct backing_dev_info; +struct bdi_writeback; struct export_operations; struct hd_geometry; struct iovec; @@ -634,6 +635,14 @@ struct inode { struct hlist_node i_hash; struct list_head i_wb_list; /* backing dev IO list */ +#ifdef CONFIG_CGROUP_WRITEBACK + struct bdi_writeback *i_wb; /* the associated cgroup wb */ + + /* foreign inode detection, see wbc_detach_inode() */ + int i_wb_frn_winner; + u16 i_wb_frn_avg_time; + u16 i_wb_frn_history; +#endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; union { @@ -1232,6 +1241,8 @@ struct mm_struct; #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ +/* sb->s_iflags */ +#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ /* Possible states of 'frozen' field */ enum { @@ -1270,6 +1281,7 @@ struct super_block { const struct quotactl_ops *s_qcop; const struct export_operations *s_export_op; unsigned long s_flags; + unsigned long s_iflags; /* internal SB_I_* flags */ unsigned long s_magic; struct dentry *s_root; struct rw_semaphore s_umount; @@ -1806,6 +1818,11 @@ struct super_operations { * * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit(). * + * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to + * synchronize competing switching instances and to tell + * wb stat updates to grab mapping->tree_lock. See + * inode_switch_wb_work_fn() for details. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) @@ -1825,6 +1842,7 @@ struct super_operations { #define I_DIRTY_TIME (1 << 11) #define __I_DIRTY_TIME_EXPIRED 12 #define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED) +#define I_WB_SWITCH (1 << 13) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) @@ -2241,7 +2259,13 @@ extern struct super_block *freeze_bdev(struct block_device *); extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); -extern int sb_is_blkdev_sb(struct super_block *sb); + +extern struct super_block *blockdev_superblock; + +static inline bool sb_is_blkdev_sb(struct super_block *sb) +{ + return sb == blockdev_superblock; +} #else static inline void bd_forget(struct inode *inode) {} static inline int sync_blockdev(struct block_device *bdev) { return 0; } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6c8918114804..73b02b0a8f60 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -41,6 +41,7 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ + MEM_CGROUP_STAT_DIRTY, /* # of dirty pages in page cache */ MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, @@ -67,6 +68,8 @@ enum mem_cgroup_events_index { }; #ifdef CONFIG_MEMCG +extern struct cgroup_subsys_state *mem_cgroup_root_css; + void mem_cgroup_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx, unsigned int nr); @@ -112,6 +115,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, } extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); +extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page); struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, @@ -195,6 +199,8 @@ void mem_cgroup_split_huge_fixup(struct page *head); #else /* CONFIG_MEMCG */ struct mem_cgroup; +#define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) + static inline void mem_cgroup_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx, unsigned int nr) @@ -382,6 +388,29 @@ enum { OVER_LIMIT, }; +#ifdef CONFIG_CGROUP_WRITEBACK + +struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); +struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail, + unsigned long *pdirty, unsigned long *pwriteback); + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) +{ + return NULL; +} + +static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, + unsigned long *pavail, + unsigned long *pdirty, + unsigned long *pwriteback) +{ +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + struct sock; #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) void sock_update_memcg(struct sock *sk); diff --git a/include/linux/mm.h b/include/linux/mm.h index 24ad583596d1..99959a34f4f1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -27,6 +27,7 @@ struct anon_vma_chain; struct file_ra_state; struct user_struct; struct writeback_control; +struct bdi_writeback; #ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; @@ -1211,10 +1212,13 @@ int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); -void account_page_dirtied(struct page *page, struct address_space *mapping); -void account_page_cleaned(struct page *page, struct address_space *mapping); +void account_page_dirtied(struct page *page, struct address_space *mapping, + struct mem_cgroup *memcg); +void account_page_cleaned(struct page *page, struct address_space *mapping, + struct mem_cgroup *memcg, struct bdi_writeback *wb); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); +void cancel_dirty_page(struct page *page); int clear_page_dirty_for_io(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4b3736f7065c..fb0814ca65c7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -651,7 +651,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void delete_from_page_cache(struct page *page); -extern void __delete_from_page_cache(struct page *page, void *shadow); +extern void __delete_from_page_cache(struct page *page, void *shadow, + struct mem_cgroup *memcg); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); /* diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b2dd371ec0ca..b333c945e571 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -7,6 +7,8 @@ #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/fs.h> +#include <linux/flex_proportions.h> +#include <linux/backing-dev-defs.h> DECLARE_PER_CPU(int, dirty_throttle_leaks); @@ -84,18 +86,95 @@ struct writeback_control { unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ +#ifdef CONFIG_CGROUP_WRITEBACK + struct bdi_writeback *wb; /* wb this writeback is issued under */ + struct inode *inode; /* inode being written out */ + + /* foreign inode detection, see wbc_detach_inode() */ + int wb_id; /* current wb id */ + int wb_lcand_id; /* last foreign candidate wb id */ + int wb_tcand_id; /* this foreign candidate wb id */ + size_t wb_bytes; /* bytes written by current wb */ + size_t wb_lcand_bytes; /* bytes written by last candidate */ + size_t wb_tcand_bytes; /* bytes written by this candidate */ +#endif }; /* + * A wb_domain represents a domain that wb's (bdi_writeback's) belong to + * and are measured against each other in. There always is one global + * domain, global_wb_domain, that every wb in the system is a member of. + * This allows measuring the relative bandwidth of each wb to distribute + * dirtyable memory accordingly. + */ +struct wb_domain { + spinlock_t lock; + + /* + * Scale the writeback cache size proportional to the relative + * writeout speed. + * + * We do this by keeping a floating proportion between BDIs, based + * on page writeback completions [end_page_writeback()]. Those + * devices that write out pages fastest will get the larger share, + * while the slower will get a smaller share. + * + * We use page writeout completions because we are interested in + * getting rid of dirty pages. Having them written out is the + * primary goal. + * + * We introduce a concept of time, a period over which we measure + * these events, because demand can/will vary over time. The length + * of this period itself is measured in page writeback completions. + */ + struct fprop_global completions; + struct timer_list period_timer; /* timer for aging of completions */ + unsigned long period_time; + + /* + * The dirtyable memory and dirty threshold could be suddenly + * knocked down by a large amount (eg. on the startup of KVM in a + * swapless system). This may throw the system into deep dirty + * exceeded state and throttle heavy/light dirtiers alike. To + * retain good responsiveness, maintain global_dirty_limit for + * tracking slowly down to the knocked down dirty threshold. + * + * Both fields are protected by ->lock. + */ + unsigned long dirty_limit_tstamp; + unsigned long dirty_limit; +}; + +/** + * wb_domain_size_changed - memory available to a wb_domain has changed + * @dom: wb_domain of interest + * + * This function should be called when the amount of memory available to + * @dom has changed. It resets @dom's dirty limit parameters to prevent + * the past values which don't match the current configuration from skewing + * dirty throttling. Without this, when memory size of a wb_domain is + * greatly reduced, the dirty throttling logic may allow too many pages to + * be dirtied leading to consecutive unnecessary OOMs and may get stuck in + * that situation. + */ +static inline void wb_domain_size_changed(struct wb_domain *dom) +{ + spin_lock(&dom->lock); + dom->dirty_limit_tstamp = jiffies; + dom->dirty_limit = 0; + spin_unlock(&dom->lock); +} + +/* * fs/fs-writeback.c */ struct bdi_writeback; void writeback_inodes_sb(struct super_block *, enum wb_reason reason); void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); -int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); -int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, - enum wb_reason reason); +bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); +bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, + enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); @@ -107,6 +186,123 @@ static inline void wait_on_inode(struct inode *inode) wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE); } +#ifdef CONFIG_CGROUP_WRITEBACK + +#include <linux/cgroup.h> +#include <linux/bio.h> + +void __inode_attach_wb(struct inode *inode, struct page *page); +void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock); +void wbc_detach_inode(struct writeback_control *wbc); +void wbc_account_io(struct writeback_control *wbc, struct page *page, + size_t bytes); + +/** + * inode_attach_wb - associate an inode with its wb + * @inode: inode of interest + * @page: page being dirtied (may be NULL) + * + * If @inode doesn't have its wb, associate it with the wb matching the + * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o + * @inode->i_lock. + */ +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ + if (!inode->i_wb) + __inode_attach_wb(inode, page); +} + +/** + * inode_detach_wb - disassociate an inode from its wb + * @inode: inode of interest + * + * @inode is being freed. Detach from its wb. + */ +static inline void inode_detach_wb(struct inode *inode) +{ + if (inode->i_wb) { + wb_put(inode->i_wb); + inode->i_wb = NULL; + } +} + +/** + * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite + * @wbc: writeback_control of interest + * @inode: target inode + * + * This function is to be used by __filemap_fdatawrite_range(), which is an + * alternative entry point into writeback code, and first ensures @inode is + * associated with a bdi_writeback and attaches it to @wbc. + */ +static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + inode_attach_wb(inode, NULL); + wbc_attach_and_unlock_inode(wbc, inode); +} + +/** + * wbc_init_bio - writeback specific initializtion of bio + * @wbc: writeback_control for the writeback in progress + * @bio: bio to be initialized + * + * @bio is a part of the writeback in progress controlled by @wbc. Perform + * writeback specific initialization. This is used to apply the cgroup + * writeback context. + */ +static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) +{ + /* + * pageout() path doesn't attach @wbc to the inode being written + * out. This is intentional as we don't want the function to block + * behind a slow cgroup. Ultimately, we want pageout() to kick off + * regular writeback instead of writing things out itself. + */ + if (wbc->wb) + bio_associate_blkcg(bio, wbc->wb->blkcg_css); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ +} + +static inline void inode_detach_wb(struct inode *inode) +{ +} + +static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock) +{ + spin_unlock(&inode->i_lock); +} + +static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode) +{ +} + +static inline void wbc_detach_inode(struct writeback_control *wbc) +{ +} + +static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) +{ +} + +static inline void wbc_account_io(struct writeback_control *wbc, + struct page *page, size_t bytes) +{ +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + /* * mm/page-writeback.c */ @@ -120,8 +316,12 @@ static inline void laptop_sync_completion(void) { } #endif void throttle_vm_writeout(gfp_t gfp_mask); bool zone_dirty_ok(struct zone *zone); +int wb_domain_init(struct wb_domain *dom, gfp_t gfp); +#ifdef CONFIG_CGROUP_WRITEBACK +void wb_domain_exit(struct wb_domain *dom); +#endif -extern unsigned long global_dirty_limit; +extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ extern int dirty_background_ratio; @@ -155,19 +355,12 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); -unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, - unsigned long dirty); - -void __bdi_update_bandwidth(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long start_time); +unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); +void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time); void page_writeback_init(void); void balance_dirty_pages_ratelimited(struct address_space *mapping); +bool wb_over_bg_thresh(struct bdi_writeback *wb); typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, void *data); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index c178d13d6f4c..a7aa607a4c55 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -360,7 +360,7 @@ TRACE_EVENT(global_dirty_state, __entry->nr_written = global_page_state(NR_WRITTEN); __entry->background_thresh = background_thresh; __entry->dirty_thresh = dirty_thresh; - __entry->dirty_limit = global_dirty_limit; + __entry->dirty_limit = global_wb_domain.dirty_limit; ), TP_printk("dirty=%lu writeback=%lu unstable=%lu " @@ -399,13 +399,13 @@ TRACE_EVENT(bdi_dirty_ratelimit, TP_fast_assign( strlcpy(__entry->bdi, dev_name(bdi->dev), 32); - __entry->write_bw = KBps(bdi->write_bandwidth); - __entry->avg_write_bw = KBps(bdi->avg_write_bandwidth); + __entry->write_bw = KBps(bdi->wb.write_bandwidth); + __entry->avg_write_bw = KBps(bdi->wb.avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); - __entry->dirty_ratelimit = KBps(bdi->dirty_ratelimit); + __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->balanced_dirty_ratelimit = - KBps(bdi->balanced_dirty_ratelimit); + KBps(bdi->wb.balanced_dirty_ratelimit); ), TP_printk("bdi %s: " @@ -462,8 +462,9 @@ TRACE_EVENT(balance_dirty_pages, unsigned long freerun = (thresh + bg_thresh) / 2; strlcpy(__entry->bdi, dev_name(bdi->dev), 32); - __entry->limit = global_dirty_limit; - __entry->setpoint = (global_dirty_limit + freerun) / 2; + __entry->limit = global_wb_domain.dirty_limit; + __entry->setpoint = (global_wb_domain.dirty_limit + + freerun) / 2; __entry->dirty = dirty; __entry->bdi_setpoint = __entry->setpoint * bdi_thresh / (thresh + 1); |