From aa9ad44a42b4cf4387f8ecddaf8e51707fdcda5a Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 23 Aug 2017 12:48:26 -0700 Subject: libnvdimm: move poison list functions to a new 'badrange' file nfit_test needs to use the poison list manipulation code as well. Make it more generic and in the process rename poison to badrange, and move all the related helpers to a new file. Signed-off-by: Dave Jiang [vishal: Add badrange.o to nfit_test's Kbuild] [vishal: add a missed include in bus.c for the new badrange functions] [vishal: rename all instances of 'be' to 'bre'] Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 3eaad2fbf284..f8109ddb5ef1 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -18,6 +18,18 @@ #include #include #include +#include + +struct badrange_entry { + u64 start; + u64 length; + struct list_head list; +}; + +struct badrange { + struct list_head list; + spinlock_t lock; +}; enum { /* when a dimm supports both PMEM and BLK access a label is required */ @@ -129,9 +141,12 @@ static inline struct nd_blk_region_desc *to_blk_region_desc( } -int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length); -void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, - phys_addr_t start, unsigned int len); +void badrange_init(struct badrange *badrange); +int badrange_add(struct badrange *badrange, u64 addr, u64 length); +void badrange_forget(struct badrange *badrange, phys_addr_t start, + unsigned int len); +int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr, + u64 length); struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); -- cgit v1.2.3 From 592e254502041f953e84d091eae2c68cba04c10b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 3 Nov 2017 12:21:21 +0100 Subject: mm: Handle 0 flags in _calc_vm_trans() macro _calc_vm_trans() does not handle the situation when some of the passed flags are 0 (which can happen if these VM flags do not make sense for the architecture). Improve the _calc_vm_trans() macro to return 0 in such situation. Since all passed flags are constant, this does not add any runtime overhead. Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- include/linux/mman.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mman.h b/include/linux/mman.h index c8367041fafd..edb6cf6a81ed 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -63,8 +63,9 @@ static inline bool arch_validate_prot(unsigned long prot) * ("bit1" and "bit2" must be single bits) */ #define _calc_vm_trans(x, bit1, bit2) \ + ((!(bit1) || !(bit2)) ? 0 : \ ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \ - : ((x) & (bit1)) / ((bit1) / (bit2))) + : ((x) & (bit1)) / ((bit1) / (bit2)))) /* * Combine the mmap "prot" argument into "vm_flags" used internally. -- cgit v1.2.3 From 1c9725974074a047f6080eecc62c50a8e840d050 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 1 Nov 2017 16:36:30 +0100 Subject: mm: introduce MAP_SHARED_VALIDATE, a mechanism to safely define new mmap flags The mmap(2) syscall suffers from the ABI anti-pattern of not validating unknown flags. However, proposals like MAP_SYNC need a mechanism to define new behavior that is known to fail on older kernels without the support. Define a new MAP_SHARED_VALIDATE flag pattern that is guaranteed to fail on all legacy mmap implementations. It is worth noting that the original proposal was for a standalone MAP_VALIDATE flag. However, when that could not be supported by all archs Linus observed: I see why you *think* you want a bitmap. You think you want a bitmap because you want to make MAP_VALIDATE be part of MAP_SYNC etc, so that people can do ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_SYNC, fd, 0); and "know" that MAP_SYNC actually takes. And I'm saying that whole wish is bogus. You're fundamentally depending on special semantics, just make it explicit. It's already not portable, so don't try to make it so. Rename that MAP_VALIDATE as MAP_SHARED_VALIDATE, make it have a value of 0x3, and make people do ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0); and then the kernel side is easier too (none of that random garbage playing games with looking at the "MAP_VALIDATE bit", but just another case statement in that map type thing. Boom. Done. Similar to ->fallocate() we also want the ability to validate the support for new flags on a per ->mmap() 'struct file_operations' instance basis. Towards that end arrange for flags to be generically validated against a mmap_supported_flags exported by 'struct file_operations'. By default all existing flags are implicitly supported, but new flags require MAP_SHARED_VALIDATE and per-instance-opt-in. Cc: Jan Kara Cc: Arnd Bergmann Cc: Andy Lutomirski Cc: Andrew Morton Suggested-by: Christoph Hellwig Suggested-by: Linus Torvalds Reviewed-by: Ross Zwisler Signed-off-by: Dan Williams Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- include/linux/fs.h | 1 + include/linux/mman.h | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 13dab191a23e..57added3201d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1701,6 +1701,7 @@ struct file_operations { long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); + unsigned long mmap_supported_flags; int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); diff --git a/include/linux/mman.h b/include/linux/mman.h index edb6cf6a81ed..74452e3f2536 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -7,6 +7,45 @@ #include #include +/* + * Arrange for legacy / undefined architecture specific flags to be + * ignored by default in LEGACY_MAP_MASK. + */ +#ifndef MAP_32BIT +#define MAP_32BIT 0 +#endif +#ifndef MAP_HUGE_2MB +#define MAP_HUGE_2MB 0 +#endif +#ifndef MAP_HUGE_1GB +#define MAP_HUGE_1GB 0 +#endif +#ifndef MAP_UNINITIALIZED +#define MAP_UNINITIALIZED 0 +#endif + +/* + * The historical set of flags that all mmap implementations implicitly + * support when a ->mmap_validate() op is not provided in file_operations. + */ +#define LEGACY_MAP_MASK (MAP_SHARED \ + | MAP_PRIVATE \ + | MAP_FIXED \ + | MAP_ANONYMOUS \ + | MAP_DENYWRITE \ + | MAP_EXECUTABLE \ + | MAP_UNINITIALIZED \ + | MAP_GROWSDOWN \ + | MAP_LOCKED \ + | MAP_NORESERVE \ + | MAP_POPULATE \ + | MAP_NONBLOCK \ + | MAP_STACK \ + | MAP_HUGETLB \ + | MAP_32BIT \ + | MAP_HUGE_2MB \ + | MAP_HUGE_1GB) + extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; -- cgit v1.2.3 From d81b8a722f691a3907eb4a6df410ab517ba5bfcc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:31 +0100 Subject: mm: Remove VM_FAULT_HWPOISON_LARGE_MASK It is unused. Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 065d99deb847..ca72b67153d5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1182,8 +1182,6 @@ static inline void clear_page_pfmemalloc(struct page *page) #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ #define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */ -#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ - #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \ VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \ VM_FAULT_FALLBACK) -- cgit v1.2.3 From 9a0dd42251439de635088b97533109a935864a84 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:39 +0100 Subject: dax: Allow dax_iomap_fault() to return pfn For synchronous page fault dax_iomap_fault() will need to return PFN which will then need to be inserted into page tables after fsync() completes. Add necessary parameter to dax_iomap_fault(). Reviewed-by: Christoph Hellwig Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- include/linux/dax.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 122197124b9d..e7fa4b8f45bc 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -95,7 +95,7 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - const struct iomap_ops *ops); + pfn_t *pfnp, const struct iomap_ops *ops); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); -- cgit v1.2.3 From b6fb293f2497a9841d94f6b57bd2bb2cd222da43 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:41 +0100 Subject: mm: Define MAP_SYNC and VM_SYNC flags Define new MAP_SYNC flag and corresponding VMA VM_SYNC flag. As the MAP_SYNC flag is not part of LEGACY_MAP_MASK, currently it will be refused by all MAP_SHARED_VALIDATE map attempts and silently ignored for everything else. Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- include/linux/mm.h | 1 + include/linux/mman.h | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ca72b67153d5..5411cb7442de 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -189,6 +189,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ +#define VM_SYNC 0x00800000 /* Synchronous page faults */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ diff --git a/include/linux/mman.h b/include/linux/mman.h index 74452e3f2536..3427bf3daef5 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -9,7 +9,7 @@ /* * Arrange for legacy / undefined architecture specific flags to be - * ignored by default in LEGACY_MAP_MASK. + * ignored by mmap handling code. */ #ifndef MAP_32BIT #define MAP_32BIT 0 @@ -23,6 +23,9 @@ #ifndef MAP_UNINITIALIZED #define MAP_UNINITIALIZED 0 #endif +#ifndef MAP_SYNC +#define MAP_SYNC 0 +#endif /* * The historical set of flags that all mmap implementations implicitly @@ -126,7 +129,8 @@ calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | - _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); + _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | + _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ); } unsigned long vm_commit_limit(void); -- cgit v1.2.3 From caa51d26f85c248f1c4f43a870ad3ef84bf9eb8f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:42 +0100 Subject: dax, iomap: Add support for synchronous faults Add a flag to iomap interface informing the caller that inode needs fdstasync(2) for returned extent to become persistent and use it in DAX fault code so that we don't map such extents into page tables immediately. Instead we propagate the information that fdatasync(2) is necessary from dax_iomap_fault() with a new VM_FAULT_NEEDDSYNC flag. Filesystem fault handler is then responsible for calling fdatasync(2) and inserting pfn into page tables. Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- include/linux/iomap.h | 5 +++++ include/linux/mm.h | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index f64dc6ce5161..73e3b7085dbe 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -22,6 +22,11 @@ struct vm_fault; * Flags for all iomap mappings: */ #define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ +/* + * IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access + * written data and requires fdatasync to commit them to persistent storage. + */ +#define IOMAP_F_DIRTY 0x02 /* * Flags that only need to be reported for IOMAP_REPORT requests: diff --git a/include/linux/mm.h b/include/linux/mm.h index 5411cb7442de..f57e55782d7d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1182,6 +1182,9 @@ static inline void clear_page_pfmemalloc(struct page *page) #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ #define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */ +#define VM_FAULT_NEEDDSYNC 0x2000 /* ->fault did not modify page tables + * and needs fsync() to complete (for + * synchronous page faults in DAX) */ #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \ VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \ @@ -1199,7 +1202,8 @@ static inline void clear_page_pfmemalloc(struct page *page) { VM_FAULT_LOCKED, "LOCKED" }, \ { VM_FAULT_RETRY, "RETRY" }, \ { VM_FAULT_FALLBACK, "FALLBACK" }, \ - { VM_FAULT_DONE_COW, "DONE_COW" } + { VM_FAULT_DONE_COW, "DONE_COW" }, \ + { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" } /* Encode hstate index for a hwpoisoned large page */ #define VM_FAULT_SET_HINDEX(x) ((x) << 12) -- cgit v1.2.3 From 71eab6dfd91eabf06fe782a590c07e8a0795f5ea Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:43 +0100 Subject: dax: Implement dax_finish_sync_fault() Implement a function that filesystems can call to finish handling of synchronous page faults. It takes care of syncing appropriare file range and insertion of page table entry. Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- include/linux/dax.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index e7fa4b8f45bc..d403f78b706c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -96,6 +96,8 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t *pfnp, const struct iomap_ops *ops); +int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, + pfn_t pfn); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); -- cgit v1.2.3 From b8a6176c214cf9aa2679131ed7e4515cddaadc33 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2017 16:36:45 +0100 Subject: ext4: Support for synchronous DAX faults We return IOMAP_F_DIRTY flag from ext4_iomap_begin() when asked to prepare blocks for writing and the inode has some uncommitted metadata changes. In the fault handler ext4_dax_fault() we then detect this case (through VM_FAULT_NEEDDSYNC return value) and call helper dax_finish_sync_fault() to flush metadata changes and insert page table entry. Note that this will also dirty corresponding radix tree entry which is what we want - fsync(2) will still provide data integrity guarantees for applications not using userspace flushing. And applications using userspace flushing can avoid calling fsync(2) and thus avoid the performance overhead. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- include/linux/jbd2.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 606b6bce3a5b..296d1e0ea87b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1367,6 +1367,7 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid); int __jbd2_log_start_commit(journal_t *journal, tid_t tid); int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); +int jbd2_transaction_committed(journal_t *journal, tid_t tid); int jbd2_complete_transaction(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); -- cgit v1.2.3