summaryrefslogtreecommitdiff
path: root/fs/dax.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/dax.c')
-rw-r--r--fs/dax.c176
1 files changed, 97 insertions, 79 deletions
diff --git a/fs/dax.c b/fs/dax.c
index 43bbd6d1037d..306c2b603fb8 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -25,7 +25,6 @@
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
-#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
@@ -84,7 +83,7 @@ struct exceptional_entry_key {
};
struct wait_exceptional_entry_queue {
- wait_queue_t wait;
+ wait_queue_entry_t wait;
struct exceptional_entry_key key;
};
@@ -108,7 +107,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
return wait_table + hash;
}
-static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
+static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode,
int sync, void *keyp)
{
struct exceptional_entry_key *key = keyp;
@@ -461,35 +460,6 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
}
/*
- * Invalidate exceptional DAX entry if easily possible. This handles DAX
- * entries for invalidate_inode_pages() so we evict the entry only if we can
- * do so without blocking.
- */
-int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
- int ret = 0;
- void *entry, **slot;
- struct radix_tree_root *page_tree = &mapping->page_tree;
-
- spin_lock_irq(&mapping->tree_lock);
- entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
- if (!entry || !radix_tree_exceptional_entry(entry) ||
- slot_locked(mapping, slot))
- goto out;
- if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
- radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
- goto out;
- radix_tree_delete(page_tree, index);
- mapping->nrexceptional--;
- ret = 1;
-out:
- spin_unlock_irq(&mapping->tree_lock);
- if (ret)
- dax_wake_mapping_entry_waiter(mapping, index, entry, true);
- return ret;
-}
-
-/*
* Invalidate exceptional DAX entry if it is clean.
*/
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
@@ -509,21 +479,25 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
static int dax_load_hole(struct address_space *mapping, void **entry,
struct vm_fault *vmf)
{
+ struct inode *inode = mapping->host;
struct page *page;
int ret;
/* Hole page already exists? Return it... */
if (!radix_tree_exceptional_entry(*entry)) {
page = *entry;
- goto out;
+ goto finish_fault;
}
/* This will replace locked radix tree entry with a hole page */
page = find_or_create_page(mapping, vmf->pgoff,
vmf->gfp_mask | __GFP_ZERO);
- if (!page)
- return VM_FAULT_OOM;
- out:
+ if (!page) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+
+finish_fault:
vmf->page = page;
ret = finish_fault(vmf);
vmf->page = NULL;
@@ -531,8 +505,10 @@ static int dax_load_hole(struct address_space *mapping, void **entry,
if (!ret) {
/* Grab reference for PTE that is now referencing the page */
get_page(page);
- return VM_FAULT_NOPAGE;
+ ret = VM_FAULT_NOPAGE;
}
+out:
+ trace_dax_load_hole(inode, vmf, ret);
return ret;
}
@@ -807,7 +783,7 @@ static int dax_writeback_one(struct block_device *bdev,
}
dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
- wb_cache_pmem(kaddr, size);
+ dax_flush(dax_dev, pgoff, kaddr, size);
/*
* After we have flushed the cache, we can clear the dirty tag. There
* cannot be new dirty data in the pfn after the flush has completed as
@@ -817,6 +793,7 @@ static int dax_writeback_one(struct block_device *bdev,
spin_lock_irq(&mapping->tree_lock);
radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock);
+ trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
dax_unlock:
dax_read_unlock(id);
put_locked_mapping_entry(mapping, index, entry);
@@ -857,6 +834,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,
start_index = wbc->range_start >> PAGE_SHIFT;
end_index = wbc->range_end >> PAGE_SHIFT;
+ trace_dax_writeback_range(inode, start_index, end_index);
+
tag_pages_for_writeback(mapping, start_index, end_index);
pagevec_init(&pvec, 0);
@@ -877,13 +856,16 @@ int dax_writeback_mapping_range(struct address_space *mapping,
ret = dax_writeback_one(bdev, dax_dev, mapping,
indices[i], pvec.pages[i]);
if (ret < 0) {
- put_dax(dax_dev);
- return ret;
+ mapping_set_error(mapping, ret);
+ goto out;
}
}
+ start_index = indices[pvec.nr - 1] + 1;
}
+out:
put_dax(dax_dev);
- return 0;
+ trace_dax_writeback_range_done(inode, start_index, end_index);
+ return (ret < 0 ? ret : 0);
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
@@ -916,6 +898,7 @@ static int dax_insert_mapping(struct address_space *mapping,
return PTR_ERR(ret);
*entryp = ret;
+ trace_dax_insert_mapping(mapping->host, vmf, ret);
return vm_insert_mixed(vma, vaddr, pfn);
}
@@ -927,6 +910,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
{
struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
void *entry, **slot;
pgoff_t index = vmf->pgoff;
@@ -936,6 +920,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
if (entry)
put_unlocked_mapping_entry(mapping, index, entry);
spin_unlock_irq(&mapping->tree_lock);
+ trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
@@ -948,6 +933,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
*/
finish_mkwrite_fault(vmf);
put_locked_mapping_entry(mapping, index, entry);
+ trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -980,18 +966,19 @@ int __dax_zero_page_range(struct block_device *bdev,
void *kaddr;
pfn_t pfn;
- rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
if (rc)
return rc;
id = dax_read_lock();
- rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr,
+ rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr,
&pfn);
if (rc < 0) {
dax_read_unlock(id);
return rc;
}
- clear_pmem(kaddr + offset, size);
+ memset(kaddr + offset, 0, size);
+ dax_flush(dax_dev, pgoff, kaddr + offset, size);
dax_read_unlock(id);
}
return 0;
@@ -1031,7 +1018,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
* into page tables. We have to tear down these mappings so that data
* written by write(2) is visible in mmap.
*/
- if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
+ if (iomap->flags & IOMAP_F_NEW) {
invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
(end - 1) >> PAGE_SHIFT);
@@ -1070,7 +1057,8 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
map_len = end - pos;
if (iov_iter_rw(iter) == WRITE)
- map_len = copy_from_iter_pmem(kaddr, map_len, iter);
+ map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
+ map_len, iter);
else
map_len = copy_to_iter(kaddr, map_len, iter);
if (map_len <= 0) {
@@ -1150,34 +1138,50 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
int vmf_ret = 0;
void *entry;
+ trace_dax_pte_fault(inode, vmf, vmf_ret);
/*
* Check whether offset isn't beyond end of file now. Caller is supposed
* to hold locks serializing us with truncate / punch hole so this is
* a reliable test.
*/
- if (pos >= i_size_read(inode))
- return VM_FAULT_SIGBUS;
+ if (pos >= i_size_read(inode)) {
+ vmf_ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
flags |= IOMAP_WRITE;
+ entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+ if (IS_ERR(entry)) {
+ vmf_ret = dax_fault_return(PTR_ERR(entry));
+ goto out;
+ }
+
+ /*
+ * It is possible, particularly with mixed reads & writes to private
+ * mappings, that we have raced with a PMD fault that overlaps with
+ * the PTE we need to set up. If so just return and the fault will be
+ * retried.
+ */
+ if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
+ vmf_ret = VM_FAULT_NOPAGE;
+ goto unlock_entry;
+ }
+
/*
* Note that we don't bother to use iomap_apply here: DAX required
* the file system block size to be equal the page size, which means
* that we never have to deal with more than a single extent here.
*/
error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
- if (error)
- return dax_fault_return(error);
- if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
- vmf_ret = dax_fault_return(-EIO); /* fs corruption? */
- goto finish_iomap;
+ if (error) {
+ vmf_ret = dax_fault_return(error);
+ goto unlock_entry;
}
-
- entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
- if (IS_ERR(entry)) {
- vmf_ret = dax_fault_return(PTR_ERR(entry));
- goto finish_iomap;
+ if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
+ error = -EIO; /* fs corruption? */
+ goto error_finish_iomap;
}
sector = dax_iomap_sector(&iomap, pos);
@@ -1199,20 +1203,20 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
}
if (error)
- goto error_unlock_entry;
+ goto error_finish_iomap;
__SetPageUptodate(vmf->cow_page);
vmf_ret = finish_fault(vmf);
if (!vmf_ret)
vmf_ret = VM_FAULT_DONE_COW;
- goto unlock_entry;
+ goto finish_iomap;
}
switch (iomap.type) {
case IOMAP_MAPPED:
if (iomap.flags & IOMAP_F_NEW) {
count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
+ count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
major = VM_FAULT_MAJOR;
}
error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
@@ -1225,7 +1229,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
case IOMAP_HOLE:
if (!(vmf->flags & FAULT_FLAG_WRITE)) {
vmf_ret = dax_load_hole(mapping, &entry, vmf);
- goto unlock_entry;
+ goto finish_iomap;
}
/*FALLTHRU*/
default:
@@ -1234,10 +1238,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
break;
}
- error_unlock_entry:
+ error_finish_iomap:
vmf_ret = dax_fault_return(error) | major;
- unlock_entry:
- put_locked_mapping_entry(mapping, vmf->pgoff, entry);
finish_iomap:
if (ops->iomap_end) {
int copied = PAGE_SIZE;
@@ -1252,6 +1254,10 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
*/
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
+ unlock_entry:
+ put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ out:
+ trace_dax_pte_fault_done(inode, vmf, vmf_ret);
return vmf_ret;
}
@@ -1397,6 +1403,28 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
goto fallback;
/*
+ * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+ * PMD or a HZP entry. If it can't (because a 4k page is already in
+ * the tree, for instance), it will return -EEXIST and we just fall
+ * back to 4k entries.
+ */
+ entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+ if (IS_ERR(entry))
+ goto fallback;
+
+ /*
+ * It is possible, particularly with mixed reads & writes to private
+ * mappings, that we have raced with a PTE fault that overlaps with
+ * the PMD we need to set up. If so just return and the fault will be
+ * retried.
+ */
+ if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
+ !pmd_devmap(*vmf->pmd)) {
+ result = 0;
+ goto unlock_entry;
+ }
+
+ /*
* Note that we don't use iomap_apply here. We aren't doing I/O, only
* setting up a mapping, so really we're using iomap_begin() as a way
* to look up our filesystem block.
@@ -1404,21 +1432,11 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
pos = (loff_t)pgoff << PAGE_SHIFT;
error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
if (error)
- goto fallback;
+ goto unlock_entry;
if (iomap.offset + iomap.length < pos + PMD_SIZE)
goto finish_iomap;
- /*
- * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
- * PMD or a HZP entry. If it can't (because a 4k page is already in
- * the tree, for instance), it will return -EEXIST and we just fall
- * back to 4k entries.
- */
- entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
- if (IS_ERR(entry))
- goto finish_iomap;
-
switch (iomap.type) {
case IOMAP_MAPPED:
result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
@@ -1426,7 +1444,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (WARN_ON_ONCE(write))
- goto unlock_entry;
+ break;
result = dax_pmd_load_hole(vmf, &iomap, &entry);
break;
default:
@@ -1434,8 +1452,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
break;
}
- unlock_entry:
- put_locked_mapping_entry(mapping, pgoff, entry);
finish_iomap:
if (ops->iomap_end) {
int copied = PMD_SIZE;
@@ -1451,6 +1467,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
&iomap);
}
+ unlock_entry:
+ put_locked_mapping_entry(mapping, pgoff, entry);
fallback:
if (result == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, vmf->pmd, vmf->address);