From 3f4a2670deea53e3765e24a7f46aafe6f077cb68 Mon Sep 17 00:00:00 2001
From: Ross Zwisler
Date: Fri, 22 Jan 2016 15:10:37 -0800
Subject: pmem: add wb_cache_pmem() to the PMEM API

__arch_wb_cache_pmem() was already an internal implementation detail of the
x86 PMEM API, but this functionality needs to be exported as part of the
general PMEM API to handle the fsync/msync case for DAX mmaps.

One thing worth noting is that we really do want this to be part of the PMEM
API as opposed to a stand-alone function like clflush_cache_range() because
of ordering restrictions.  By having wb_cache_pmem() as part of the PMEM API
we can leave it unordered, call it multiple times to write back large
amounts of memory, and then order the multiple calls with a single
wmb_pmem().

Signed-off-by: Ross Zwisler
Cc: "H. Peter Anvin"
Cc: "J. Bruce Fields"
Cc: "Theodore Ts'o"
Cc: Alexander Viro
Cc: Andreas Dilger
Cc: Dave Chinner
Cc: Ingo Molnar
Cc: Jan Kara
Cc: Jeff Layton
Cc: Matthew Wilcox
Cc: Thomas Gleixner
Cc: Dan Williams
Cc: Matthew Wilcox
Cc: Dave Hansen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/pmem.h | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pmem.h b/include/linux/pmem.h
index acfea8ce4a07..7c3d11a6b4ad 100644
--- a/include/linux/pmem.h
+++ b/include/linux/pmem.h
@@ -53,12 +53,18 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
 {
 	BUG();
 }
+
+static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size)
+{
+	BUG();
+}
 #endif
 
 /*
  * Architectures that define ARCH_HAS_PMEM_API must provide
  * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(),
- * arch_copy_from_iter_pmem(), arch_clear_pmem() and arch_has_wmb_pmem().
+ * arch_copy_from_iter_pmem(), arch_clear_pmem(), arch_wb_cache_pmem()
+ * and arch_has_wmb_pmem().
  */
 static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size)
 {
@@ -178,4 +184,18 @@ static inline void clear_pmem(void __pmem *addr, size_t size)
 	else
 		default_clear_pmem(addr, size);
 }
+
+/**
+ * wb_cache_pmem - write back processor cache for PMEM memory range
+ * @addr:	virtual start address
+ * @size:	number of bytes to write back
+ *
+ * Write back the processor cache range starting at 'addr' for 'size' bytes.
+ * This function requires explicit ordering with a wmb_pmem() call.
+ */
+static inline void wb_cache_pmem(void __pmem *addr, size_t size)
+{
+	if (arch_has_pmem_api())
+		arch_wb_cache_pmem(addr, size);
+}
 #endif /* __PMEM_H__ */
--
cgit v1.2.3
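The changelog above makes the key design point: wb_cache_pmem() is left
unordered so that a caller can write back many ranges and then order all of
them with one wmb_pmem().  The sketch below illustrates that calling pattern
using the API from the diff; it is not part of the patch, and the helper
name example_flush_ranges() is made up for illustration.

    /*
     * Illustrative only: issue several unordered wb_cache_pmem() calls,
     * then order all of them to media with a single wmb_pmem(), as the
     * changelog describes.
     */
    #include <linux/pmem.h>

    static void example_flush_ranges(void __pmem **bufs, size_t *sizes, int n)
    {
            int i;

            /* Each call only initiates write-back of one range; no ordering yet. */
            for (i = 0; i < n; i++)
                    wb_cache_pmem(bufs[i], sizes[i]);

            /* One fence orders all of the preceding write-backs. */
            wmb_pmem();
    }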
From f9fe48bece3af2d60e1bad65db4825f5a025dd36 Mon Sep 17 00:00:00 2001
From: Ross Zwisler
Date: Fri, 22 Jan 2016 15:10:40 -0800
Subject: dax: support dirty DAX entries in radix tree

Add support for tracking dirty DAX entries in the struct address_space
radix tree.  This tree is already used for dirty page writeback, and it
already supports the use of exceptional (non struct page*) entries.

In order to properly track dirty DAX pages we will insert new exceptional
entries into the radix tree that represent dirty DAX PTE or PMD pages.
These exceptional entries will also contain the writeback addresses for the
PTE or PMD faults that we can use at fsync/msync time.

There are currently two types of exceptional entries (shmem and shadow)
that can be placed into the radix tree, and this adds a third.  We rely on
the fact that only one type of exceptional entry can be found in a given
radix tree based on its usage.  This happens for free with DAX vs shmem but
we explicitly prevent shadow entries from being added to radix trees for
DAX mappings.

The only shadow entries that would be generated for DAX radix trees would
be to track zero page mappings that were created for holes.  These pages
would receive minimal benefit from having shadow entries, and the choice to
have only one type of exceptional entry in a given radix tree makes the
logic simpler both in clear_exceptional_entry() and in the rest of DAX.

Signed-off-by: Ross Zwisler
Cc: "H. Peter Anvin"
Cc: "J. Bruce Fields"
Cc: "Theodore Ts'o"
Cc: Alexander Viro
Cc: Andreas Dilger
Cc: Dave Chinner
Cc: Ingo Molnar
Cc: Jan Kara
Cc: Jeff Layton
Cc: Matthew Wilcox
Cc: Thomas Gleixner
Cc: Dan Williams
Cc: Matthew Wilcox
Cc: Dave Hansen
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/dax.h        | 5 +++++
 include/linux/fs.h         | 3 ++-
 include/linux/radix-tree.h | 9 +++++++++
 3 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index b415e521528d..e9d57f680f50 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -36,4 +36,9 @@ static inline bool vma_is_dax(struct vm_area_struct *vma)
 {
 	return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
 }
+
+static inline bool dax_mapping(struct address_space *mapping)
+{
+	return mapping->host && IS_DAX(mapping->host);
+}
 #endif

diff --git a/include/linux/fs.h b/include/linux/fs.h
index eb73d74ed992..0d7570320d63 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -433,7 +433,8 @@ struct address_space {
 	struct rw_semaphore	i_mmap_rwsem;	/* protect tree, count, list */
 	/* Protected by tree_lock together with the radix tree */
 	unsigned long		nrpages;	/* number of total pages */
-	unsigned long		nrshadows;	/* number of shadow entries */
+	/* number of shadow or DAX exceptional entries */
+	unsigned long		nrexceptional;
 	pgoff_t			writeback_index;/* writeback starts here */
 	const struct address_space_operations *a_ops;	/* methods */
 	unsigned long		flags;		/* error bits/gfp mask */

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 57e7d87d2d4c..7c88ad156a29 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -51,6 +51,15 @@
 #define RADIX_TREE_EXCEPTIONAL_ENTRY	2
 #define RADIX_TREE_EXCEPTIONAL_SHIFT	2
 
+#define RADIX_DAX_MASK	0xf
+#define RADIX_DAX_SHIFT	4
+#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
+#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
+#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
+#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
+#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
+	RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
+
 static inline int radix_tree_is_indirect_ptr(void *ptr)
 {
 	return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR);
--
cgit v1.2.3
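The new RADIX_DAX_* macros above pack a sector number plus an entry size
(PTE vs PMD) into the otherwise unused bits of a radix tree exceptional
entry.  The short sketch below shows the round trip through
RADIX_DAX_ENTRY(), RADIX_DAX_TYPE() and RADIX_DAX_SECTOR(); it is
illustrative only and not part of the patch, and the function name
example_dax_entry() is made up.

    /*
     * Illustrative only: encode a sector as a DAX PTE exceptional entry
     * and decode its type and sector back out, using the macros added to
     * include/linux/radix-tree.h above.
     */
    #include <linux/bug.h>
    #include <linux/radix-tree.h>

    static void example_dax_entry(void)
    {
            unsigned long sector = 1024;
            void *entry;

            /* Encode a PTE-sized (non-PMD) DAX entry for this sector. */
            entry = RADIX_DAX_ENTRY(sector, false);

            /* The low bits mark it as a DAX PTE exceptional entry... */
            BUG_ON(RADIX_DAX_TYPE(entry) != RADIX_DAX_PTE);

            /* ...and the remaining bits recover the sector for writeback. */
            BUG_ON(RADIX_DAX_SECTOR(entry) != sector);
    }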
Peter Anvin" Cc: "J. Bruce Fields" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Dave Chinner Cc: Ingo Molnar Cc: Jeff Layton Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Dan Williams Cc: Matthew Wilcox Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4d08b6c33557..92395a0a7dc5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -361,6 +361,9 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages); +unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, + int tag, unsigned int nr_entries, + struct page **entries, pgoff_t *indices); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags); -- cgit v1.2.3 From 9973c98ecfda3a1dfcab981665b5f1e39bcde64a Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 22 Jan 2016 15:10:47 -0800 Subject: dax: add support for fsync/sync To properly handle fsync/msync in an efficient way DAX needs to track dirty pages so it is able to flush them durably to media on demand. The tracking of dirty pages is done via the radix tree in struct address_space. This radix tree is already used by the page writeback infrastructure for tracking dirty pages associated with an open file, and it already has support for exceptional (non struct page*) entries. We build upon these features to add exceptional entries to the radix tree for DAX dirty PMD or PTE pages at fault time. [dan.j.williams@intel.com: fix dax_pmd_dbg build warning] Signed-off-by: Ross Zwisler Cc: "H. Peter Anvin" Cc: "J. Bruce Fields" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Dave Chinner Cc: Ingo Molnar Cc: Jan Kara Cc: Jeff Layton Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Matthew Wilcox Cc: Dave Hansen Signed-off-by: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dax.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index e9d57f680f50..8204c3dc3800 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -41,4 +41,6 @@ static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); } +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, + loff_t end); #endif -- cgit v1.2.3