From 65f87ee71852a754f7981d0653e7136039b8798a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 25 Jan 2016 17:23:18 -0800 Subject: fs, block: force direct-I/O for dax-enabled block devices Similar to the file I/O path, re-direct all I/O to the DAX path for I/O to a block-device special file. Both regular files and device special files can use the common filp->f_mapping->host lookup to determing is DAX is enabled. Otherwise, we confuse the DAX code that does not expect to find live data in the page cache: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 7676 at mm/filemap.c:217 __delete_from_page_cache+0x9f6/0xb60() Modules linked in: CPU: 0 PID: 7676 Comm: a.out Not tainted 4.4.0+ #276 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 00000000ffffffff ffff88006d3f7738 ffffffff82999e2d 0000000000000000 ffff8800620a0000 ffffffff86473d20 ffff88006d3f7778 ffffffff81352089 ffffffff81658d36 ffffffff86473d20 00000000000000d9 ffffea0000009d60 Call Trace: [< inline >] __dump_stack lib/dump_stack.c:15 [] dump_stack+0x6f/0xa2 lib/dump_stack.c:50 [] warn_slowpath_common+0xd9/0x140 kernel/panic.c:482 [] warn_slowpath_null+0x29/0x30 kernel/panic.c:515 [] __delete_from_page_cache+0x9f6/0xb60 mm/filemap.c:217 [] delete_from_page_cache+0x112/0x200 mm/filemap.c:244 [] __dax_fault+0x859/0x1800 fs/dax.c:487 [] blkdev_dax_fault+0x26/0x30 fs/block_dev.c:1730 [< inline >] wp_pfn_shared mm/memory.c:2208 [] do_wp_page+0xc85/0x14f0 mm/memory.c:2307 [< inline >] handle_pte_fault mm/memory.c:3323 [< inline >] __handle_mm_fault mm/memory.c:3417 [] handle_mm_fault+0x2483/0x4640 mm/memory.c:3446 [] __do_page_fault+0x376/0x960 arch/x86/mm/fault.c:1238 [] trace_do_page_fault+0xe8/0x420 arch/x86/mm/fault.c:1331 [] do_async_page_fault+0x14/0xd0 arch/x86/kernel/kvm.c:264 [] async_page_fault+0x28/0x30 arch/x86/entry/entry_64.S:986 [] entry_SYSCALL_64_fastpath+0x16/0x7a arch/x86/entry/entry_64.S:185 ---[ end trace dae21e0f85f1f98c ]--- Fixes: 5a023cdba50c ("block: enable dax for raw block devices") Reported-by: Dmitry Vyukov Reported-by: Kirill A. Shutemov Suggested-by: Jan Kara Reviewed-by: Jan Kara Suggested-by: Matthew Wilcox Tested-by: Ross Zwisler Signed-off-by: Dan Williams --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1a2046275cdf..b10002d4a5f5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2907,7 +2907,7 @@ extern void replace_mount_options(struct super_block *sb, char *options); static inline bool io_is_direct(struct file *filp) { - return (filp->f_flags & O_DIRECT) || IS_DAX(file_inode(filp)); + return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host); } static inline int iocb_flags(struct file *file) -- cgit v1.2.3 From 9f4736fe7ca804aa79b5916221bb13dfc6221a0f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 28 Jan 2016 20:13:39 -0800 Subject: block: revert runtime dax control of the raw block device Dynamically enabling DAX requires that the page cache first be flushed and invalidated. This must occur atomically with the change of DAX mode otherwise we confuse the fsync/msync tracking and violate data durability guarantees. Eliminate the possibilty of DAX-disabled to DAX-enabled transitions for now and revisit this for the next cycle. Cc: Jan Kara Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Dave Chinner Cc: Matthew Wilcox Cc: Andrew Morton Cc: Ross Zwisler Signed-off-by: Dan Williams --- include/linux/fs.h | 3 --- include/uapi/linux/fs.h | 1 - 2 files changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index b10002d4a5f5..ae681002100a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -484,9 +484,6 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; -#ifdef CONFIG_FS_DAX - int bd_map_count; -#endif }; /* diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 41e0433b4a83..149bec83a907 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -222,7 +222,6 @@ struct fsxattr { #define BLKSECDISCARD _IO(0x12,125) #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) -#define BLKDAXSET _IO(0x12,128) #define BLKDAXGET _IO(0x12,129) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ -- cgit v1.2.3 From d1a5f2b4d8a125943dcb6b032fc7eaefc2c78296 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 28 Jan 2016 20:25:31 -0800 Subject: block: use DAX for partition table reads Avoid populating pagecache when the block device is in DAX mode. Otherwise these page cache entries collide with the fsync/msync implementation and break data durability guarantees. Cc: Jan Kara Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Dave Chinner Cc: Andrew Morton Reported-by: Ross Zwisler Tested-by: Ross Zwisler Reviewed-by: Matthew Wilcox Signed-off-by: Dan Williams --- include/linux/dax.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index 8204c3dc3800..818e45078929 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -14,6 +14,17 @@ int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, dax_iodone_t); int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, dax_iodone_t); + +#ifdef CONFIG_FS_DAX +struct page *read_dax_sector(struct block_device *bdev, sector_t n); +#else +static inline struct page *read_dax_sector(struct block_device *bdev, + sector_t n) +{ + return ERR_PTR(-ENXIO); +} +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, unsigned int flags, get_block_t, dax_iodone_t); -- cgit v1.2.3 From 76e9f0ee52b0be5761e29847e0ef01f23f24f1df Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 22 Jan 2016 09:43:28 -0800 Subject: phys_to_pfn_t: use phys_addr_t A dma_addr_t is potentially smaller than a phys_addr_t on some archs. Don't truncate the address when doing the pfn conversion. Cc: Ross Zwisler Reported-by: Matthew Wilcox [willy: fix pfn_t_to_phys as well] Signed-off-by: Dan Williams --- include/linux/pfn_t.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 0703b5360d31..37448ab5fb5c 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -29,7 +29,7 @@ static inline pfn_t pfn_to_pfn_t(unsigned long pfn) return __pfn_to_pfn_t(pfn, 0); } -extern pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags); +extern pfn_t phys_to_pfn_t(phys_addr_t addr, unsigned long flags); static inline bool pfn_t_has_page(pfn_t pfn) { @@ -48,7 +48,7 @@ static inline struct page *pfn_t_to_page(pfn_t pfn) return NULL; } -static inline dma_addr_t pfn_t_to_phys(pfn_t pfn) +static inline phys_addr_t pfn_t_to_phys(pfn_t pfn) { return PFN_PHYS(pfn_t_to_pfn(pfn)); } -- cgit v1.2.3