diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-04-08 21:03:40 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-04-08 21:03:40 -0700 |
commit | 9b06860d7c1f1f4cb7d70f92e47dfa4a91bd5007 (patch) | |
tree | 120882e574394ce3b11bd491533613b4488fae45 /drivers/nvdimm | |
parent | 0906d8b975ff713cfb55328e4f3bf6de5967415e (diff) | |
parent | f6d2b802f80d0ca89ee1f51c1781b3f79cdb25d5 (diff) |
Merge tag 'libnvdimm-for-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull libnvdimm and dax updates from Dan Williams:
"There were multiple touches outside of drivers/nvdimm/ this round to
add cross arch compatibility to the devm_memremap_pages() interface,
enhance numa information for persistent memory ranges, and add a
zero_page_range() dax operation.
This cycle I switched from the patchwork api to Konstantin's b4 script
for collecting tags (from x86, PowerPC, filesystem, and device-mapper
folks), and everything looks to have gone ok there. This has all
appeared in -next with no reported issues.
Summary:
- Add support for region alignment configuration and enforcement to
fix compatibility across architectures and PowerPC page size
configurations.
- Introduce 'zero_page_range' as a dax operation. This facilitates
filesystem-dax operation without a block-device.
- Introduce phys_to_target_node() to facilitate drivers that want to
know the resulting numa node if a given reserved address range was
onlined.
- Advertise a persistence-domain for of_pmem and papr_scm. The
persistence domain indicates where cpu-store cycles need to reach
in the platform-memory subsystem before the platform will consider
them power-fail protected.
- Promote numa_map_to_online_node() to a cross-kernel generic
facility.
- Save x86 numa information to allow for node-id lookups for reserved
memory ranges, deploy that capability for the e820-pmem driver.
- Pick up some miscellaneous minor fixes, that missed v5.6-final,
including some smatch reports in the ioctl path and some unit
test compilation fixups.
- Fixup some flexible-array declarations"
* tag 'libnvdimm-for-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (29 commits)
dax: Move mandatory ->zero_page_range() check in alloc_dax()
dax,iomap: Add helper dax_iomap_zero() to zero a range
dax: Use new dax zero page method for zeroing a page
dm,dax: Add dax zero_page_range operation
s390,dcssblk,dax: Add dax zero_page_range operation to dcssblk driver
dax, pmem: Add a dax operation zero_page_range
pmem: Add functions for reading/writing page to/from pmem
libnvdimm: Update persistence domain value for of_pmem and papr_scm device
tools/test/nvdimm: Fix out of tree build
libnvdimm/region: Fix build error
libnvdimm/region: Replace zero-length array with flexible-array member
libnvdimm/label: Replace zero-length array with flexible-array member
ACPI: NFIT: Replace zero-length array with flexible-array member
libnvdimm/region: Introduce an 'align' attribute
libnvdimm/region: Introduce NDD_LABELING
libnvdimm/namespace: Enforce memremap_compat_align()
libnvdimm/pfn: Prevent raw mode fallback if pfn-infoblock valid
libnvdimm: Out of bounds read in __nd_ioctl()
acpi/nfit: improve bounds checking for 'func'
mm/memremap_pages: Introduce memremap_compat_align()
...
Diffstat (limited to 'drivers/nvdimm')
-rw-r--r-- | drivers/nvdimm/bus.c | 6 | ||||
-rw-r--r-- | drivers/nvdimm/dimm.c | 2 | ||||
-rw-r--r-- | drivers/nvdimm/dimm_devs.c | 95 | ||||
-rw-r--r-- | drivers/nvdimm/e820.c | 18 | ||||
-rw-r--r-- | drivers/nvdimm/label.h | 2 | ||||
-rw-r--r-- | drivers/nvdimm/namespace_devs.c | 28 | ||||
-rw-r--r-- | drivers/nvdimm/nd.h | 7 | ||||
-rw-r--r-- | drivers/nvdimm/of_pmem.c | 4 | ||||
-rw-r--r-- | drivers/nvdimm/pfn.h | 12 | ||||
-rw-r--r-- | drivers/nvdimm/pfn_devs.c | 40 | ||||
-rw-r--r-- | drivers/nvdimm/pmem.c | 101 | ||||
-rw-r--r-- | drivers/nvdimm/region_devs.c | 130 |
12 files changed, 342 insertions, 103 deletions
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index a8b515968569..09087c38fabd 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -1042,8 +1042,10 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, return -EFAULT; } - if (!desc || (desc->out_num + desc->in_num == 0) || - !test_bit(cmd, &cmd_mask)) + if (!desc || + (desc->out_num + desc->in_num == 0) || + cmd > ND_CMD_CALL || + !test_bit(cmd, &cmd_mask)) return -ENOTTY; /* fail write commands (when read-only) */ diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c index 64776ed15bb3..7d4ddc4d9322 100644 --- a/drivers/nvdimm/dimm.c +++ b/drivers/nvdimm/dimm.c @@ -99,7 +99,7 @@ static int nvdimm_probe(struct device *dev) if (ndd->ns_current >= 0) { rc = nd_label_reserve_dpa(ndd); if (rc == 0) - nvdimm_set_aliasing(dev); + nvdimm_set_labeling(dev); } nvdimm_bus_unlock(dev); diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index 94ea6dba6b4f..b7b77e8d9027 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -32,7 +32,7 @@ int nvdimm_check_config_data(struct device *dev) if (!nvdimm->cmd_mask || !test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) { - if (test_bit(NDD_ALIASING, &nvdimm->flags)) + if (test_bit(NDD_LABELING, &nvdimm->flags)) return -ENXIO; else return -ENOTTY; @@ -173,11 +173,11 @@ int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset, return rc; } -void nvdimm_set_aliasing(struct device *dev) +void nvdimm_set_labeling(struct device *dev) { struct nvdimm *nvdimm = to_nvdimm(dev); - set_bit(NDD_ALIASING, &nvdimm->flags); + set_bit(NDD_LABELING, &nvdimm->flags); } void nvdimm_set_locked(struct device *dev) @@ -312,8 +312,9 @@ static ssize_t flags_show(struct device *dev, { struct nvdimm *nvdimm = to_nvdimm(dev); - return sprintf(buf, "%s%s\n", + return sprintf(buf, "%s%s%s\n", test_bit(NDD_ALIASING, &nvdimm->flags) ? "alias " : "", + test_bit(NDD_LABELING, &nvdimm->flags) ? 
"label " : "", test_bit(NDD_LOCKED, &nvdimm->flags) ? "lock " : ""); } static DEVICE_ATTR_RO(flags); @@ -562,6 +563,21 @@ int nvdimm_security_freeze(struct nvdimm *nvdimm) return rc; } +static unsigned long dpa_align(struct nd_region *nd_region) +{ + struct device *dev = &nd_region->dev; + + if (dev_WARN_ONCE(dev, !is_nvdimm_bus_locked(dev), + "bus lock required for capacity provision\n")) + return 0; + if (dev_WARN_ONCE(dev, !nd_region->ndr_mappings || nd_region->align + % nd_region->ndr_mappings, + "invalid region align %#lx mappings: %d\n", + nd_region->align, nd_region->ndr_mappings)) + return 0; + return nd_region->align / nd_region->ndr_mappings; +} + int alias_dpa_busy(struct device *dev, void *data) { resource_size_t map_end, blk_start, new; @@ -570,6 +586,7 @@ int alias_dpa_busy(struct device *dev, void *data) struct nd_region *nd_region; struct nvdimm_drvdata *ndd; struct resource *res; + unsigned long align; int i; if (!is_memory(dev)) @@ -607,13 +624,21 @@ int alias_dpa_busy(struct device *dev, void *data) * Find the free dpa from the end of the last pmem allocation to * the end of the interleave-set mapping. 
*/ + align = dpa_align(nd_region); + if (!align) + return 0; + for_each_dpa_resource(ndd, res) { + resource_size_t start, end; + if (strncmp(res->name, "pmem", 4) != 0) continue; - if ((res->start >= blk_start && res->start < map_end) - || (res->end >= blk_start - && res->end <= map_end)) { - new = max(blk_start, min(map_end + 1, res->end + 1)); + + start = ALIGN_DOWN(res->start, align); + end = ALIGN(res->end + 1, align) - 1; + if ((start >= blk_start && start < map_end) + || (end >= blk_start && end <= map_end)) { + new = max(blk_start, min(map_end, end) + 1); if (new != blk_start) { blk_start = new; goto retry; @@ -653,6 +678,7 @@ resource_size_t nd_blk_available_dpa(struct nd_region *nd_region) .res = NULL, }; struct resource *res; + unsigned long align; if (!ndd) return 0; @@ -660,10 +686,20 @@ resource_size_t nd_blk_available_dpa(struct nd_region *nd_region) device_for_each_child(&nvdimm_bus->dev, &info, alias_dpa_busy); /* now account for busy blk allocations in unaliased dpa */ + align = dpa_align(nd_region); + if (!align) + return 0; for_each_dpa_resource(ndd, res) { + resource_size_t start, end, size; + if (strncmp(res->name, "blk", 3) != 0) continue; - info.available -= resource_size(res); + start = ALIGN_DOWN(res->start, align); + end = ALIGN(res->end + 1, align) - 1; + size = end - start + 1; + if (size >= info.available) + return 0; + info.available -= size; } return info.available; @@ -682,19 +718,31 @@ resource_size_t nd_pmem_max_contiguous_dpa(struct nd_region *nd_region, struct nvdimm_bus *nvdimm_bus; resource_size_t max = 0; struct resource *res; + unsigned long align; /* if a dimm is disabled the available capacity is zero */ if (!ndd) return 0; + align = dpa_align(nd_region); + if (!align) + return 0; + nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); if (__reserve_free_pmem(&nd_region->dev, nd_mapping->nvdimm)) return 0; for_each_dpa_resource(ndd, res) { + resource_size_t start, end; + if (strcmp(res->name, "pmem-reserve") != 0) continue; - if 
(resource_size(res) > max) - max = resource_size(res); + /* trim free space relative to current alignment setting */ + start = ALIGN(res->start, align); + end = ALIGN_DOWN(res->end + 1, align) - 1; + if (end < start) + continue; + if (end - start + 1 > max) + max = end - start + 1; } release_free_pmem(nvdimm_bus, nd_mapping); return max; @@ -722,24 +770,33 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct resource *res; const char *reason; + unsigned long align; if (!ndd) return 0; + align = dpa_align(nd_region); + if (!align) + return 0; + map_start = nd_mapping->start; map_end = map_start + nd_mapping->size - 1; blk_start = max(map_start, map_end + 1 - *overlap); for_each_dpa_resource(ndd, res) { - if (res->start >= map_start && res->start < map_end) { + resource_size_t start, end; + + start = ALIGN_DOWN(res->start, align); + end = ALIGN(res->end + 1, align) - 1; + if (start >= map_start && start < map_end) { if (strncmp(res->name, "blk", 3) == 0) blk_start = min(blk_start, - max(map_start, res->start)); - else if (res->end > map_end) { + max(map_start, start)); + else if (end > map_end) { reason = "misaligned to iset"; goto err; } else - busy += resource_size(res); - } else if (res->end >= map_start && res->end <= map_end) { + busy += end - start + 1; + } else if (end >= map_start && end <= map_end) { if (strncmp(res->name, "blk", 3) == 0) { /* * If a BLK allocation overlaps the start of @@ -748,8 +805,8 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, */ blk_start = map_start; } else - busy += resource_size(res); - } else if (map_start > res->start && map_start < res->end) { + busy += end - start + 1; + } else if (map_start > start && map_start < end) { /* total eclipse of the mapping */ busy += nd_mapping->size; blk_start = map_start; @@ -759,7 +816,7 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, *overlap = map_end + 1 - blk_start; available 
= blk_start - map_start; if (busy < available) - return available - busy; + return ALIGN_DOWN(available - busy, align); return 0; err: diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c index e02f60ad6c99..4cd18be9d0e9 100644 --- a/drivers/nvdimm/e820.c +++ b/drivers/nvdimm/e820.c @@ -7,6 +7,7 @@ #include <linux/memory_hotplug.h> #include <linux/libnvdimm.h> #include <linux/module.h> +#include <linux/numa.h> static int e820_pmem_remove(struct platform_device *pdev) { @@ -16,27 +17,16 @@ static int e820_pmem_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_MEMORY_HOTPLUG -static int e820_range_to_nid(resource_size_t addr) -{ - return memory_add_physaddr_to_nid(addr); -} -#else -static int e820_range_to_nid(resource_size_t addr) -{ - return NUMA_NO_NODE; -} -#endif - static int e820_register_one(struct resource *res, void *data) { struct nd_region_desc ndr_desc; struct nvdimm_bus *nvdimm_bus = data; + int nid = phys_to_target_node(res->start); memset(&ndr_desc, 0, sizeof(ndr_desc)); ndr_desc.res = res; - ndr_desc.numa_node = e820_range_to_nid(res->start); - ndr_desc.target_node = ndr_desc.numa_node; + ndr_desc.numa_node = numa_map_to_online_node(nid); + ndr_desc.target_node = nid; set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) return -ENXIO; diff --git a/drivers/nvdimm/label.h b/drivers/nvdimm/label.h index 4c7b775c2811..956b6d1bd8cc 100644 --- a/drivers/nvdimm/label.h +++ b/drivers/nvdimm/label.h @@ -62,7 +62,7 @@ struct nd_namespace_index { __le16 major; __le16 minor; __le64 checksum; - u8 free[0]; + u8 free[]; }; /** diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 032dc61725ff..ae155e860fdc 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -10,6 +10,7 @@ #include <linux/nd.h> #include "nd-core.h" #include "pmem.h" +#include "pfn.h" #include "nd.h" static void namespace_io_release(struct device *dev) @@ -541,6 +542,11 
@@ static void space_valid(struct nd_region *nd_region, struct nvdimm_drvdata *ndd, { bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0; bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0; + unsigned long align; + + align = nd_region->align / nd_region->ndr_mappings; + valid->start = ALIGN(valid->start, align); + valid->end = ALIGN_DOWN(valid->end + 1, align) - 1; if (valid->start >= valid->end) goto invalid; @@ -980,10 +986,10 @@ static ssize_t __size_store(struct device *dev, unsigned long long val) return -ENXIO; } - div_u64_rem(val, PAGE_SIZE * nd_region->ndr_mappings, &remainder); + div_u64_rem(val, nd_region->align, &remainder); if (remainder) { dev_dbg(dev, "%llu is not %ldK aligned\n", val, - (PAGE_SIZE * nd_region->ndr_mappings) / SZ_1K); + nd_region->align / SZ_1K); return -EINVAL; } @@ -1739,6 +1745,22 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev) return ERR_PTR(-ENODEV); } + /* + * Note, alignment validation for fsdax and devdax mode + * namespaces happens in nd_pfn_validate() where infoblock + * padding parameters can be applied. 
+ */ + if (pmem_should_map_pages(dev)) { + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + struct resource *res = &nsio->res; + + if (!IS_ALIGNED(res->start | (res->end + 1), + memremap_compat_align())) { + dev_err(&ndns->dev, "%pr misaligned, unable to map\n", res); + return ERR_PTR(-EOPNOTSUPP); + } + } + if (is_namespace_pmem(&ndns->dev)) { struct nd_namespace_pmem *nspm; @@ -2521,7 +2543,7 @@ static int init_active_labels(struct nd_region *nd_region) if (!ndd) { if (test_bit(NDD_LOCKED, &nvdimm->flags)) /* fail, label data may be unreadable */; - else if (test_bit(NDD_ALIASING, &nvdimm->flags)) + else if (test_bit(NDD_LABELING, &nvdimm->flags)) /* fail, labels needed to disambiguate dpa */; else return 0; diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index c9f6a5b5253a..85dbb2a322b9 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -39,7 +39,7 @@ struct nd_region_data { int ns_count; int ns_active; unsigned int hints_shift; - void __iomem *flush_wpq[0]; + void __iomem *flush_wpq[]; }; static inline void __iomem *ndrd_get_flush_wpq(struct nd_region_data *ndrd, @@ -146,6 +146,7 @@ struct nd_region { struct device *btt_seed; struct device *pfn_seed; struct device *dax_seed; + unsigned long align; u16 ndr_mappings; u64 ndr_size; u64 ndr_start; @@ -156,7 +157,7 @@ struct nd_region { struct nd_interleave_set *nd_set; struct nd_percpu_lane __percpu *lane; int (*flush)(struct nd_region *nd_region, struct bio *bio); - struct nd_mapping mapping[0]; + struct nd_mapping mapping[]; }; struct nd_blk_region { @@ -252,7 +253,7 @@ int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset, void *buf, size_t len); long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, unsigned int len); -void nvdimm_set_aliasing(struct device *dev); +void nvdimm_set_labeling(struct device *dev); void nvdimm_set_locked(struct device *dev); void nvdimm_clear_locked(struct device *dev); int nvdimm_security_setup_events(struct device *dev); 
diff --git a/drivers/nvdimm/of_pmem.c b/drivers/nvdimm/of_pmem.c index 8224d1431ea9..6826a274a1f1 100644 --- a/drivers/nvdimm/of_pmem.c +++ b/drivers/nvdimm/of_pmem.c @@ -62,8 +62,10 @@ static int of_pmem_region_probe(struct platform_device *pdev) if (is_volatile) region = nvdimm_volatile_region_create(bus, &ndr_desc); - else + else { + set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags); region = nvdimm_pmem_region_create(bus, &ndr_desc); + } if (!region) dev_warn(&pdev->dev, "Unable to register region %pR from %pOF\n", diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h index acb19517f678..37cb1b8a2a39 100644 --- a/drivers/nvdimm/pfn.h +++ b/drivers/nvdimm/pfn.h @@ -24,6 +24,18 @@ struct nd_pfn_sb { __le64 npfns; __le32 mode; /* minor-version-1 additions for section alignment */ + /** + * @start_pad: Deprecated attribute to pad start-misaligned namespaces + * + * start_pad is deprecated because the original definition did + * not comprehend that dataoff is relative to the base address + * of the namespace not the start_pad adjusted base. The result + * is that the dax path is broken, but the block-I/O path is + * not. The kernel will no longer create namespaces using start + * padding, but it still supports block-I/O for legacy + * configurations mainly to allow a backup, reconfigure the + * namespace, and restore flow to repair dax operation. 
+ */ __le32 start_pad; __le32 end_trunc; /* minor-version-2 record the base alignment of the mapping */ diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index b94f7a7e94b8..34db557dbad1 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -446,6 +446,7 @@ static bool nd_supported_alignment(unsigned long align) int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) { u64 checksum, offset; + struct resource *res; enum nd_pfn_mode mode; struct nd_namespace_io *nsio; unsigned long align, start_pad; @@ -561,14 +562,14 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) dev_dbg(&nd_pfn->dev, "align: %lx:%lx mode: %d:%d\n", nd_pfn->align, align, nd_pfn->mode, mode); - return -EINVAL; + return -EOPNOTSUPP; } } if (align > nvdimm_namespace_capacity(ndns)) { dev_err(&nd_pfn->dev, "alignment: %lx exceeds capacity %llx\n", align, nvdimm_namespace_capacity(ndns)); - return -EINVAL; + return -EOPNOTSUPP; } /* @@ -578,18 +579,31 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) * established. 
*/ nsio = to_nd_namespace_io(&ndns->dev); - if (offset >= resource_size(&nsio->res)) { + res = &nsio->res; + if (offset >= resource_size(res)) { dev_err(&nd_pfn->dev, "pfn array size exceeds capacity of %s\n", dev_name(&ndns->dev)); - return -EBUSY; + return -EOPNOTSUPP; } - if ((align && !IS_ALIGNED(nsio->res.start + offset + start_pad, align)) + if ((align && !IS_ALIGNED(res->start + offset + start_pad, align)) || !IS_ALIGNED(offset, PAGE_SIZE)) { dev_err(&nd_pfn->dev, "bad offset: %#llx dax disabled align: %#lx\n", offset, align); - return -ENXIO; + return -EOPNOTSUPP; + } + + if (!IS_ALIGNED(res->start + le32_to_cpu(pfn_sb->start_pad), + memremap_compat_align())) { + dev_err(&nd_pfn->dev, "resource start misaligned\n"); + return -EOPNOTSUPP; + } + + if (!IS_ALIGNED(res->end + 1 - le32_to_cpu(pfn_sb->end_trunc), + memremap_compat_align())) { + dev_err(&nd_pfn->dev, "resource end misaligned\n"); + return -EOPNOTSUPP; } return 0; @@ -750,7 +764,19 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) start = nsio->res.start; size = resource_size(&nsio->res); npfns = PHYS_PFN(size - SZ_8K); - align = max(nd_pfn->align, (1UL << SUBSECTION_SHIFT)); + align = max(nd_pfn->align, memremap_compat_align()); + + /* + * When @start is misaligned fail namespace creation. See + * the 'struct nd_pfn_sb' commentary on why ->start_pad is not + * an option. 
+ */ + if (!IS_ALIGNED(start, memremap_compat_align())) { + dev_err(&nd_pfn->dev, "%s: start %pa misaligned to %#lx\n", + dev_name(&ndns->dev), &start, + memremap_compat_align()); + return -EINVAL; + } end_trunc = start + size - ALIGN_DOWN(start + size, align); if (nd_pfn->mode == PFN_MODE_PMEM) { /* diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 4ffc6f7ca131..2df6994acf83 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -136,9 +136,25 @@ static blk_status_t read_pmem(struct page *page, unsigned int off, return BLK_STS_OK; } -static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page, - unsigned int len, unsigned int off, unsigned int op, - sector_t sector) +static blk_status_t pmem_do_read(struct pmem_device *pmem, + struct page *page, unsigned int page_off, + sector_t sector, unsigned int len) +{ + blk_status_t rc; + phys_addr_t pmem_off = sector * 512 + pmem->data_offset; + void *pmem_addr = pmem->virt_addr + pmem_off; + + if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) + return BLK_STS_IOERR; + + rc = read_pmem(page, page_off, pmem_addr, len); + flush_dcache_page(page); + return rc; +} + +static blk_status_t pmem_do_write(struct pmem_device *pmem, + struct page *page, unsigned int page_off, + sector_t sector, unsigned int len) { blk_status_t rc = BLK_STS_OK; bool bad_pmem = false; @@ -148,34 +164,25 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page, if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) bad_pmem = true; - if (!op_is_write(op)) { - if (unlikely(bad_pmem)) - rc = BLK_STS_IOERR; - else { - rc = read_pmem(page, off, pmem_addr, len); - flush_dcache_page(page); - } - } else { - /* - * Note that we write the data both before and after - * clearing poison. The write before clear poison - * handles situations where the latest written data is - * preserved and the clear poison operation simply marks - * the address range as valid without changing the data. 
- * In this case application software can assume that an - * interrupted write will either return the new good - * data or an error. - * - * However, if pmem_clear_poison() leaves the data in an - * indeterminate state we need to perform the write - * after clear poison. - */ - flush_dcache_page(page); - write_pmem(pmem_addr, page, off, len); - if (unlikely(bad_pmem)) { - rc = pmem_clear_poison(pmem, pmem_off, len); - write_pmem(pmem_addr, page, off, len); - } + /* + * Note that we write the data both before and after + * clearing poison. The write before clear poison + * handles situations where the latest written data is + * preserved and the clear poison operation simply marks + * the address range as valid without changing the data. + * In this case application software can assume that an + * interrupted write will either return the new good + * data or an error. + * + * However, if pmem_clear_poison() leaves the data in an + * indeterminate state we need to perform the write + * after clear poison. 
+ */ + flush_dcache_page(page); + write_pmem(pmem_addr, page, page_off, len); + if (unlikely(bad_pmem)) { + rc = pmem_clear_poison(pmem, pmem_off, len); + write_pmem(pmem_addr, page, page_off, len); } return rc; @@ -197,8 +204,12 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) do_acct = nd_iostat_start(bio, &start); bio_for_each_segment(bvec, bio, iter) { - rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, - bvec.bv_offset, bio_op(bio), iter.bi_sector); + if (op_is_write(bio_op(bio))) + rc = pmem_do_write(pmem, bvec.bv_page, bvec.bv_offset, + iter.bi_sector, bvec.bv_len); + else + rc = pmem_do_read(pmem, bvec.bv_page, bvec.bv_offset, + iter.bi_sector, bvec.bv_len); if (rc) { bio->bi_status = rc; break; @@ -223,9 +234,12 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, struct pmem_device *pmem = bdev->bd_queue->queuedata; blk_status_t rc; - rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE, - 0, op, sector); - + if (op_is_write(op)) + rc = pmem_do_write(pmem, page, 0, sector, + hpage_nr_pages(page) * PAGE_SIZE); + else + rc = pmem_do_read(pmem, page, 0, sector, + hpage_nr_pages(page) * PAGE_SIZE); /* * The ->rw_page interface is subtle and tricky. 
The core * retries on any error, so we can only invoke page_endio() in @@ -268,6 +282,16 @@ static const struct block_device_operations pmem_fops = { .revalidate_disk = nvdimm_revalidate_disk, }; +static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, + size_t nr_pages) +{ + struct pmem_device *pmem = dax_get_private(dax_dev); + + return blk_status_to_errno(pmem_do_write(pmem, ZERO_PAGE(0), 0, + PFN_PHYS(pgoff) >> SECTOR_SHIFT, + PAGE_SIZE)); +} + static long pmem_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) { @@ -299,6 +323,7 @@ static const struct dax_operations pmem_dax_ops = { .dax_supported = generic_fsdax_supported, .copy_from_iter = pmem_copy_from_iter, .copy_to_iter = pmem_copy_to_iter, + .zero_page_range = pmem_dax_zero_page_range, }; static const struct attribute_group *pmem_attribute_groups[] = { @@ -461,9 +486,9 @@ static int pmem_attach_disk(struct device *dev, if (is_nvdimm_sync(nd_region)) flags = DAXDEV_F_SYNC; dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops, flags); - if (!dax_dev) { + if (IS_ERR(dax_dev)) { put_disk(disk); - return -ENOMEM; + return PTR_ERR(dax_dev); } dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); pmem->dax_dev = dax_dev; diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index a19e535830d9..ccbb5b43b8b2 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -195,16 +195,16 @@ EXPORT_SYMBOL_GPL(nd_blk_region_set_provider_data); int nd_region_to_nstype(struct nd_region *nd_region) { if (is_memory(&nd_region->dev)) { - u16 i, alias; + u16 i, label; - for (i = 0, alias = 0; i < nd_region->ndr_mappings; i++) { + for (i = 0, label = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; struct nvdimm *nvdimm = nd_mapping->nvdimm; - if (test_bit(NDD_ALIASING, &nvdimm->flags)) - alias++; + if (test_bit(NDD_LABELING, &nvdimm->flags)) + label++; } - if 
(alias) + if (label) return ND_DEVICE_NAMESPACE_PMEM; else return ND_DEVICE_NAMESPACE_IO; @@ -216,21 +216,25 @@ int nd_region_to_nstype(struct nd_region *nd_region) } EXPORT_SYMBOL(nd_region_to_nstype); -static ssize_t size_show(struct device *dev, - struct device_attribute *attr, char *buf) +static unsigned long long region_size(struct nd_region *nd_region) { - struct nd_region *nd_region = to_nd_region(dev); - unsigned long long size = 0; - - if (is_memory(dev)) { - size = nd_region->ndr_size; + if (is_memory(&nd_region->dev)) { + return nd_region->ndr_size; } else if (nd_region->ndr_mappings == 1) { struct nd_mapping *nd_mapping = &nd_region->mapping[0]; - size = nd_mapping->size; + return nd_mapping->size; } - return sprintf(buf, "%llu\n", size); + return 0; +} + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%llu\n", region_size(nd_region)); } static DEVICE_ATTR_RO(size); @@ -529,6 +533,54 @@ static ssize_t read_only_store(struct device *dev, } static DEVICE_ATTR_RW(read_only); +static ssize_t align_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%#lx\n", nd_region->align); +} + +static ssize_t align_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev); + unsigned long val, dpa; + u32 remainder; + int rc; + + rc = kstrtoul(buf, 0, &val); + if (rc) + return rc; + + if (!nd_region->ndr_mappings) + return -ENXIO; + + /* + * Ensure space-align is evenly divisible by the region + * interleave-width because the kernel typically has no facility + * to determine which DIMM(s), dimm-physical-addresses, would + * contribute to the tail capacity in system-physical-address + * space for the namespace. 
+ */ + dpa = div_u64_rem(val, nd_region->ndr_mappings, &remainder); + if (!is_power_of_2(dpa) || dpa < PAGE_SIZE + || val > region_size(nd_region) || remainder) + return -EINVAL; + + /* + * Given that space allocation consults this value multiple + * times ensure it does not change for the duration of the + * allocation. + */ + nvdimm_bus_lock(dev); + nd_region->align = val; + nvdimm_bus_unlock(dev); + + return len; +} +static DEVICE_ATTR_RW(align); + static ssize_t region_badblocks_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -571,6 +623,7 @@ static DEVICE_ATTR_RO(persistence_domain); static struct attribute *nd_region_attributes[] = { &dev_attr_size.attr, + &dev_attr_align.attr, &dev_attr_nstype.attr, &dev_attr_mappings.attr, &dev_attr_btt_seed.attr, @@ -626,6 +679,19 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) return a->mode; } + if (a == &dev_attr_align.attr) { + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + if (test_bit(NDD_LABELING, &nvdimm->flags)) + return a->mode; + } + return 0; + } + if (a != &dev_attr_set_cookie.attr && a != &dev_attr_available_size.attr) return a->mode; @@ -935,6 +1001,41 @@ void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane) } EXPORT_SYMBOL(nd_region_release_lane); +/* + * PowerPC requires this alignment for memremap_pages(). All other archs + * should be ok with SUBSECTION_SIZE (see memremap_compat_align()). 
+ */ +#define MEMREMAP_COMPAT_ALIGN_MAX SZ_16M + +static unsigned long default_align(struct nd_region *nd_region) +{ + unsigned long align; + int i, mappings; + u32 remainder; + + if (is_nd_blk(&nd_region->dev)) + align = PAGE_SIZE; + else + align = MEMREMAP_COMPAT_ALIGN_MAX; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + if (test_bit(NDD_ALIASING, &nvdimm->flags)) { + align = MEMREMAP_COMPAT_ALIGN_MAX; + break; + } + } + + mappings = max_t(u16, 1, nd_region->ndr_mappings); + div_u64_rem(align, mappings, &remainder); + if (remainder) + align *= mappings; + + return align; +} + static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc, const struct device_type *dev_type, const char *caller) @@ -1039,6 +1140,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, dev->of_node = ndr_desc->of_node; nd_region->ndr_size = resource_size(ndr_desc->res); nd_region->ndr_start = ndr_desc->res->start; + nd_region->align = default_align(nd_region); if (ndr_desc->flush) nd_region->flush = ndr_desc->flush; else |