diff options
Diffstat (limited to 'mm/page-writeback.c')
-rw-r--r-- | mm/page-writeback.c | 254 |
1 files changed, 193 insertions, 61 deletions
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2970e35fd03f..dc32dae01e5f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void) int dirty_background_ratio = 5; /* + * dirty_background_bytes starts at 0 (disabled) so that it is a function of + * dirty_background_ratio * the amount of dirtyable memory + */ +unsigned long dirty_background_bytes; + +/* * free highmem will not be subtracted from the total free memory * for calculating free ratios if vm_highmem_is_dirtyable is true */ @@ -80,6 +86,12 @@ int vm_highmem_is_dirtyable; int vm_dirty_ratio = 10; /* + * vm_dirty_bytes starts at 0 (disabled) so that it is a function of + * vm_dirty_ratio * the amount of dirtyable memory + */ +unsigned long vm_dirty_bytes; + +/* * The interval between `kupdate'-style writebacks, in jiffies */ int dirty_writeback_interval = 5 * HZ; @@ -135,23 +147,75 @@ static int calc_period_shift(void) { unsigned long dirty_total; - dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; + if (vm_dirty_bytes) + dirty_total = vm_dirty_bytes / PAGE_SIZE; + else + dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / + 100; return 2 + ilog2(dirty_total - 1); } /* - * update the period when the dirty ratio changes. + * update the period when the dirty threshold changes. */ +static void update_completion_period(void) +{ + int shift = calc_period_shift(); + prop_change_shift(&vm_completions, shift); + prop_change_shift(&vm_dirties, shift); +} + +int dirty_background_ratio_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write) + dirty_background_bytes = 0; + return ret; +} + +int dirty_background_bytes_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write) + dirty_background_ratio = 0; + return ret; +} + int dirty_ratio_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret; + + ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { - int shift = calc_period_shift(); - prop_change_shift(&vm_completions, shift); - prop_change_shift(&vm_dirties, shift); + update_completion_period(); + vm_dirty_bytes = 0; + } + return ret; +} + + +int dirty_bytes_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old_bytes = vm_dirty_bytes; + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write && vm_dirty_bytes != old_bytes) { + update_completion_period(); + vm_dirty_ratio = 0; } return ret; } @@ -362,26 +426,32 @@ unsigned long determine_dirtyable_memory(void) } void -get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, - struct backing_dev_info *bdi) +get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, + unsigned long *pbdi_dirty, struct backing_dev_info *bdi) { - int background_ratio; /* Percentages */ - int dirty_ratio; - long background; - long dirty; + unsigned long background; + unsigned long dirty; unsigned long available_memory = determine_dirtyable_memory(); struct task_struct *tsk; - dirty_ratio = vm_dirty_ratio; - if (dirty_ratio < 5) - dirty_ratio = 5; + if (vm_dirty_bytes) + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); + else { + int dirty_ratio; + + dirty_ratio = vm_dirty_ratio; + if (dirty_ratio < 5) + dirty_ratio = 5; + dirty = (dirty_ratio * available_memory) / 100; + } - background_ratio = dirty_background_ratio; - if (background_ratio >= dirty_ratio) - background_ratio = dirty_ratio / 2; + if (dirty_background_bytes) + background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); + else + background = (dirty_background_ratio * available_memory) / 100; - background = (background_ratio * available_memory) / 100; - dirty = (dirty_ratio * available_memory) / 100; + if (background >= dirty) + background = dirty / 2; tsk = current; if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { background += background / 4; @@ -423,9 +493,9 @@ static void balance_dirty_pages(struct address_space *mapping) { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; - long background_thresh; - long dirty_thresh; - long bdi_thresh; + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long bdi_thresh; unsigned long pages_written = 0; unsigned long write_chunk = sync_writeback_pages(); @@ -580,8 +650,8 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); void throttle_vm_writeout(gfp_t gfp_mask) { - long background_thresh; - long dirty_thresh; + unsigned long background_thresh; + unsigned long dirty_thresh; for ( ; ; ) { get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); @@ -624,8 +694,8 @@ static void background_writeout(unsigned long _min_pages) }; for ( ; ; ) { - long background_thresh; - long dirty_thresh; + unsigned long background_thresh; + unsigned long dirty_thresh; get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); if (global_page_state(NR_FILE_DIRTY) + @@ -868,9 +938,11 @@ int write_cache_pages(struct address_space *mapping, int done = 0; struct pagevec pvec; int nr_pages; + pgoff_t uninitialized_var(writeback_index); pgoff_t index; pgoff_t end; /* Inclusive */ - int scanned = 0; + pgoff_t done_index; + int cycled; int range_whole = 0; long nr_to_write = wbc->nr_to_write; @@ -881,83 +953,143 @@ int write_cache_pages(struct address_space *mapping, pagevec_init(&pvec, 0); if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ + writeback_index = mapping->writeback_index; /* prev offset */ + index = writeback_index; + if (index == 0) + cycled = 1; + else + cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - scanned = 1; + cycled = 1; /* ignore range_cyclic tests */ } retry: - while (!done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { - unsigned i; + done_index = index; + while (!done && (index <= end)) { + int i; + + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; - scanned = 1; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. */ + if (page->index > end) { + /* + * can't be range_cyclic (1st pass) because + * end == -1 in that case. + */ + done = 1; + break; + } + + done_index = page->index + 1; + lock_page(page); + /* + * Page truncated or invalidated. We can freely skip it + * then, even for data integrity operations: the page + * has disappeared concurrently, so there could be no + * real expectation of this data interity operation + * even if there is now a new, dirty page at the same + * pagecache address. + */ if (unlikely(page->mapping != mapping)) { +continue_unlock: unlock_page(page); continue; } - if (!wbc->range_cyclic && page->index > end) { - done = 1; - unlock_page(page); - continue; + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; } - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - continue; + if (PageWriteback(page)) { + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + else + goto continue_unlock; } - ret = (*writepage)(page, wbc, data); + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; - if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { - unlock_page(page); - ret = 0; - } - if (ret || (--nr_to_write <= 0)) + ret = (*writepage)(page, wbc, data); + if (unlikely(ret)) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + } else { + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. it may + * not be suitable for data integrity + * writeout). + */ + done = 1; + break; + } + } + + if (nr_to_write > 0) + nr_to_write--; + else if (wbc->sync_mode == WB_SYNC_NONE) { + /* + * We stop writing back only if we are not + * doing integrity sync. In case of integrity + * sync we have to keep going because someone + * may be concurrently dirtying pages, and we + * might have synced a lot of newly appeared + * dirty pages, but have not synced all of the + * old dirty pages. + */ done = 1; + break; + } + if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; done = 1; + break; } } pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done) { + if (!cycled) { /* + * range_cyclic: * We hit the last page and there is more work to be done: wrap * back to the start of the file */ - scanned = 1; + cycled = 1; index = 0; + end = writeback_index - 1; goto retry; } if (!wbc->no_nrwrite_index_update) { if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) - mapping->writeback_index = index; + mapping->writeback_index = done_index; wbc->nr_to_write = nr_to_write; } |