diff options
Diffstat (limited to 'fs/dax.c')
-rw-r--r-- | fs/dax.c | 176 |
1 files changed, 97 insertions, 79 deletions
@@ -25,7 +25,6 @@ #include <linux/mm.h> #include <linux/mutex.h> #include <linux/pagevec.h> -#include <linux/pmem.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/uio.h> @@ -84,7 +83,7 @@ struct exceptional_entry_key { }; struct wait_exceptional_entry_queue { - wait_queue_t wait; + wait_queue_entry_t wait; struct exceptional_entry_key key; }; @@ -108,7 +107,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, return wait_table + hash; } -static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, +static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, int sync, void *keyp) { struct exceptional_entry_key *key = keyp; @@ -461,35 +460,6 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) } /* - * Invalidate exceptional DAX entry if easily possible. This handles DAX - * entries for invalidate_inode_pages() so we evict the entry only if we can - * do so without blocking. - */ -int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index) -{ - int ret = 0; - void *entry, **slot; - struct radix_tree_root *page_tree = &mapping->page_tree; - - spin_lock_irq(&mapping->tree_lock); - entry = __radix_tree_lookup(page_tree, index, NULL, &slot); - if (!entry || !radix_tree_exceptional_entry(entry) || - slot_locked(mapping, slot)) - goto out; - if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || - radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) - goto out; - radix_tree_delete(page_tree, index); - mapping->nrexceptional--; - ret = 1; -out: - spin_unlock_irq(&mapping->tree_lock); - if (ret) - dax_wake_mapping_entry_waiter(mapping, index, entry, true); - return ret; -} - -/* * Invalidate exceptional DAX entry if it is clean. */ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, @@ -509,21 +479,25 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, static int dax_load_hole(struct address_space *mapping, void **entry, struct vm_fault *vmf) { + struct inode *inode = mapping->host; struct page *page; int ret; /* Hole page already exists? Return it... */ if (!radix_tree_exceptional_entry(*entry)) { page = *entry; - goto out; + goto finish_fault; } /* This will replace locked radix tree entry with a hole page */ page = find_or_create_page(mapping, vmf->pgoff, vmf->gfp_mask | __GFP_ZERO); - if (!page) - return VM_FAULT_OOM; - out: + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + +finish_fault: vmf->page = page; ret = finish_fault(vmf); vmf->page = NULL; @@ -531,8 +505,10 @@ static int dax_load_hole(struct address_space *mapping, void **entry, if (!ret) { /* Grab reference for PTE that is now referencing the page */ get_page(page); - return VM_FAULT_NOPAGE; + ret = VM_FAULT_NOPAGE; } +out: + trace_dax_load_hole(inode, vmf, ret); return ret; } @@ -807,7 +783,7 @@ static int dax_writeback_one(struct block_device *bdev, } dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn)); - wb_cache_pmem(kaddr, size); + dax_flush(dax_dev, pgoff, kaddr, size); /* * After we have flushed the cache, we can clear the dirty tag. There * cannot be new dirty data in the pfn after the flush has completed as @@ -817,6 +793,7 @@ static int dax_writeback_one(struct block_device *bdev, spin_lock_irq(&mapping->tree_lock); radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); + trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); dax_unlock: dax_read_unlock(id); put_locked_mapping_entry(mapping, index, entry); @@ -857,6 +834,8 @@ int dax_writeback_mapping_range(struct address_space *mapping, start_index = wbc->range_start >> PAGE_SHIFT; end_index = wbc->range_end >> PAGE_SHIFT; + trace_dax_writeback_range(inode, start_index, end_index); + tag_pages_for_writeback(mapping, start_index, end_index); pagevec_init(&pvec, 0); @@ -877,13 +856,16 @@ int dax_writeback_mapping_range(struct address_space *mapping, ret = dax_writeback_one(bdev, dax_dev, mapping, indices[i], pvec.pages[i]); if (ret < 0) { - put_dax(dax_dev); - return ret; + mapping_set_error(mapping, ret); + goto out; } } + start_index = indices[pvec.nr - 1] + 1; } +out: put_dax(dax_dev); - return 0; + trace_dax_writeback_range_done(inode, start_index, end_index); + return (ret < 0 ? ret : 0); } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); @@ -916,6 +898,7 @@ static int dax_insert_mapping(struct address_space *mapping, return PTR_ERR(ret); *entryp = ret; + trace_dax_insert_mapping(mapping->host, vmf, ret); return vm_insert_mixed(vma, vaddr, pfn); } @@ -927,6 +910,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; void *entry, **slot; pgoff_t index = vmf->pgoff; @@ -936,6 +920,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) if (entry) put_unlocked_mapping_entry(mapping, index, entry); spin_unlock_irq(&mapping->tree_lock); + trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); @@ -948,6 +933,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) */ finish_mkwrite_fault(vmf); put_locked_mapping_entry(mapping, index, entry); + trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); @@ -980,18 +966,19 @@ int __dax_zero_page_range(struct block_device *bdev, void *kaddr; pfn_t pfn; - rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff); if (rc) return rc; id = dax_read_lock(); - rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, + rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); if (rc < 0) { dax_read_unlock(id); return rc; } - clear_pmem(kaddr + offset, size); + memset(kaddr + offset, 0, size); + dax_flush(dax_dev, pgoff, kaddr + offset, size); dax_read_unlock(id); } return 0; @@ -1031,7 +1018,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * into page tables. We have to tear down these mappings so that data * written by write(2) is visible in mmap. */ - if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) { + if (iomap->flags & IOMAP_F_NEW) { invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); @@ -1070,7 +1057,8 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, map_len = end - pos; if (iov_iter_rw(iter) == WRITE) - map_len = copy_from_iter_pmem(kaddr, map_len, iter); + map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, + map_len, iter); else map_len = copy_to_iter(kaddr, map_len, iter); if (map_len <= 0) { @@ -1150,34 +1138,50 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, int vmf_ret = 0; void *entry; + trace_dax_pte_fault(inode, vmf, vmf_ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. */ - if (pos >= i_size_read(inode)) - return VM_FAULT_SIGBUS; + if (pos >= i_size_read(inode)) { + vmf_ret = VM_FAULT_SIGBUS; + goto out; + } if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) flags |= IOMAP_WRITE; + entry = grab_mapping_entry(mapping, vmf->pgoff, 0); + if (IS_ERR(entry)) { + vmf_ret = dax_fault_return(PTR_ERR(entry)); + goto out; + } + + /* + * It is possible, particularly with mixed reads & writes to private + * mappings, that we have raced with a PMD fault that overlaps with + * the PTE we need to set up. If so just return and the fault will be + * retried. + */ + if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) { + vmf_ret = VM_FAULT_NOPAGE; + goto unlock_entry; + } + /* * Note that we don't bother to use iomap_apply here: DAX required * the file system block size to be equal the page size, which means * that we never have to deal with more than a single extent here. */ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); - if (error) - return dax_fault_return(error); - if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { - vmf_ret = dax_fault_return(-EIO); /* fs corruption? */ - goto finish_iomap; + if (error) { + vmf_ret = dax_fault_return(error); + goto unlock_entry; } - - entry = grab_mapping_entry(mapping, vmf->pgoff, 0); - if (IS_ERR(entry)) { - vmf_ret = dax_fault_return(PTR_ERR(entry)); - goto finish_iomap; + if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { + error = -EIO; /* fs corruption? */ + goto error_finish_iomap; } sector = dax_iomap_sector(&iomap, pos); @@ -1199,20 +1203,20 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, } if (error) - goto error_unlock_entry; + goto error_finish_iomap; __SetPageUptodate(vmf->cow_page); vmf_ret = finish_fault(vmf); if (!vmf_ret) vmf_ret = VM_FAULT_DONE_COW; - goto unlock_entry; + goto finish_iomap; } switch (iomap.type) { case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, @@ -1225,7 +1229,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, case IOMAP_HOLE: if (!(vmf->flags & FAULT_FLAG_WRITE)) { vmf_ret = dax_load_hole(mapping, &entry, vmf); - goto unlock_entry; + goto finish_iomap; } /*FALLTHRU*/ default: @@ -1234,10 +1238,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, break; } - error_unlock_entry: + error_finish_iomap: vmf_ret = dax_fault_return(error) | major; - unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff, entry); finish_iomap: if (ops->iomap_end) { int copied = PAGE_SIZE; @@ -1252,6 +1254,10 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, */ ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } + unlock_entry: + put_locked_mapping_entry(mapping, vmf->pgoff, entry); + out: + trace_dax_pte_fault_done(inode, vmf, vmf_ret); return vmf_ret; } @@ -1397,6 +1403,28 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, goto fallback; /* + * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX + * PMD or a HZP entry. If it can't (because a 4k page is already in + * the tree, for instance), it will return -EEXIST and we just fall + * back to 4k entries. + */ + entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); + if (IS_ERR(entry)) + goto fallback; + + /* + * It is possible, particularly with mixed reads & writes to private + * mappings, that we have raced with a PTE fault that overlaps with + * the PMD we need to set up. If so just return and the fault will be + * retried. + */ + if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) && + !pmd_devmap(*vmf->pmd)) { + result = 0; + goto unlock_entry; + } + + /* * Note that we don't use iomap_apply here. We aren't doing I/O, only * setting up a mapping, so really we're using iomap_begin() as a way * to look up our filesystem block. @@ -1404,21 +1432,11 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pos = (loff_t)pgoff << PAGE_SHIFT; error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); if (error) - goto fallback; + goto unlock_entry; if (iomap.offset + iomap.length < pos + PMD_SIZE) goto finish_iomap; - /* - * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX - * PMD or a HZP entry. If it can't (because a 4k page is already in - * the tree, for instance), it will return -EEXIST and we just fall - * back to 4k entries. - */ - entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); - if (IS_ERR(entry)) - goto finish_iomap; - switch (iomap.type) { case IOMAP_MAPPED: result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); @@ -1426,7 +1444,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (WARN_ON_ONCE(write)) - goto unlock_entry; + break; result = dax_pmd_load_hole(vmf, &iomap, &entry); break; default: @@ -1434,8 +1452,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, break; } - unlock_entry: - put_locked_mapping_entry(mapping, pgoff, entry); finish_iomap: if (ops->iomap_end) { int copied = PMD_SIZE; @@ -1451,6 +1467,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags, &iomap); } + unlock_entry: + put_locked_mapping_entry(mapping, pgoff, entry); fallback: if (result == VM_FAULT_FALLBACK) { split_huge_pmd(vma, vmf->pmd, vmf->address); |