diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2c947da39f6e..5826992910e9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -692,6 +692,16 @@ static inline bool over_bground_thresh(void) global_page_state(NR_UNSTABLE_NFS) > background_thresh); } +/* + * Called under wb->list_lock. If there are multiple wb per bdi, + * only the flusher working on the first wb should do it. + */ +static void wb_update_bandwidth(struct bdi_writeback *wb, + unsigned long start_time) +{ + __bdi_update_bandwidth(wb->bdi, start_time); +} + /* * Explicit flushing or periodic writeback of "old" data. * @@ -710,6 +720,7 @@ static inline bool over_bground_thresh(void) static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { + unsigned long wb_start = jiffies; long nr_pages = work->nr_pages; unsigned long oldest_jif; struct inode *inode; @@ -758,6 +769,8 @@ static long wb_writeback(struct bdi_writeback *wb, progress = __writeback_inodes_wb(wb, work); trace_writeback_written(wb->bdi, work); + wb_update_bandwidth(wb, wb_start); + /* * Did we write something? Try for more * diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 469d56443c63..a008982e7c08 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -73,6 +73,11 @@ struct backing_dev_info { struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; + unsigned long bw_time_stamp; /* last time write bw is updated */ + unsigned long written_stamp; /* pages written at bw_time_stamp */ + unsigned long write_bandwidth; /* the estimated write bandwidth */ + unsigned long avg_write_bandwidth; /* further smoothed write bw */ + struct prop_local_percpu completions; int dirty_exceeded; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index df1b7f18f100..66862f2d90c8 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -118,6 +118,9 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty); +void __bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long start_time); + void page_writeback_init(void); void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 83f18a1d9d10..a76cdd160277 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -638,6 +638,11 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); } +/* + * Initial write bandwidth: 100 MB/s + */ +#define INIT_BW (100 << (20 - PAGE_SHIFT)) + int bdi_init(struct backing_dev_info *bdi) { int i, err; @@ -660,6 +665,13 @@ int bdi_init(struct backing_dev_info *bdi) } bdi->dirty_exceeded = 0; + + bdi->bw_time_stamp = jiffies; + bdi->written_stamp = 0; + + bdi->write_bandwidth = INIT_BW; + bdi->avg_write_bandwidth = INIT_BW; + err = prop_local_init_percpu(&bdi->completions); if (err) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8cd71376c63d..446bdf7b975b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -36,6 +36,11 @@ #include #include +/* + * Estimate write bandwidth at 200ms intervals. + */ +#define BANDWIDTH_INTERVAL max(HZ/5, 1) + /* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited * will look to see if it needs to force writeback or throttling. @@ -471,6 +476,85 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) return bdi_dirty; } +static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, + unsigned long elapsed, + unsigned long written) +{ + const unsigned long period = roundup_pow_of_two(3 * HZ); + unsigned long avg = bdi->avg_write_bandwidth; + unsigned long old = bdi->write_bandwidth; + u64 bw; + + /* + * bw = written * HZ / elapsed + * + * bw * elapsed + write_bandwidth * (period - elapsed) + * write_bandwidth = --------------------------------------------------- + * period + */ + bw = written - bdi->written_stamp; + bw *= HZ; + if (unlikely(elapsed > period)) { + do_div(bw, elapsed); + avg = bw; + goto out; + } + bw += (u64)bdi->write_bandwidth * (period - elapsed); + bw >>= ilog2(period); + + /* + * one more level of smoothing, for filtering out sudden spikes + */ + if (avg > old && old >= (unsigned long)bw) + avg -= (avg - old) >> 3; + + if (avg < old && old <= (unsigned long)bw) + avg += (old - avg) >> 3; + +out: + bdi->write_bandwidth = bw; + bdi->avg_write_bandwidth = avg; +} + +void __bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long start_time) +{ + unsigned long now = jiffies; + unsigned long elapsed = now - bdi->bw_time_stamp; + unsigned long written; + + /* + * rate-limit, only update once every 200ms. + */ + if (elapsed < BANDWIDTH_INTERVAL) + return; + + written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); + + /* + * Skip quiet periods when disk bandwidth is under-utilized. + * (at least 1s idle time between two flusher runs) + */ + if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) + goto snapshot; + + bdi_update_write_bandwidth(bdi, elapsed, written); + +snapshot: + bdi->written_stamp = written; + bdi->bw_time_stamp = now; +} + +static void bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long start_time) +{ + if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) + return; + spin_lock(&bdi->wb.list_lock); + __bdi_update_bandwidth(bdi, start_time); + spin_unlock(&bdi->wb.list_lock); +} + /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force @@ -490,6 +574,7 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long pause = 1; bool dirty_exceeded = false; struct backing_dev_info *bdi = mapping->backing_dev_info; + unsigned long start_time = jiffies; for (;;) { nr_reclaimable = global_page_state(NR_FILE_DIRTY) + @@ -544,6 +629,8 @@ static void balance_dirty_pages(struct address_space *mapping, if (!bdi->dirty_exceeded) bdi->dirty_exceeded = 1; + bdi_update_bandwidth(bdi, start_time); + /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. * Unstable writes are a feature of certain networked * filesystems (i.e. NFS) in which data may have been