summaryrefslogtreecommitdiff
path: root/drivers/nvdimm/pmem.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-06-29 10:34:42 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2015-06-29 10:34:42 -0700
commit88793e5c774ec69351ef6b5200bb59f532e41bca (patch)
tree54c4be61777ea53fde892b71e795322c5227d16e /drivers/nvdimm/pmem.c
parent1bc5e157ed2b4f5b206155fc772d860158acd201 (diff)
parent61031952f4c89dba1065f7a5b9419badb112554c (diff)
Merge tag 'libnvdimm-for-4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/djbw/nvdimm
Pull libnvdimm subsystem from Dan Williams: "The libnvdimm sub-system introduces, in addition to the libnvdimm-core, 4 drivers / enabling modules: NFIT: Instantiates an "nvdimm bus" with the core and registers memory devices (NVDIMMs) enumerated by the ACPI 6.0 NFIT (NVDIMM Firmware Interface table). After registering NVDIMMs the NFIT driver then registers "region" devices. A libnvdimm-region defines an access mode and the boundaries of persistent memory media. A region may span multiple NVDIMMs that are interleaved by the hardware memory controller. In turn, a libnvdimm-region can be carved into a "namespace" device and bound to the PMEM or BLK driver which will attach a Linux block device (disk) interface to the memory. PMEM: Initially merged in v4.1 this driver for contiguous spans of persistent memory address ranges is re-worked to drive PMEM-namespaces emitted by the libnvdimm-core. In this update the PMEM driver, on x86, gains the ability to assert that writes to persistent memory have been flushed all the way through the caches and buffers in the platform to persistent media. See memcpy_to_pmem() and wmb_pmem(). BLK: This new driver enables access to persistent memory media through "Block Data Windows" as defined by the NFIT. The primary difference of this driver to PMEM is that only a small window of persistent memory is mapped into system address space at any given point in time. Per-NVDIMM windows are reprogrammed at run time, per-I/O, to access different portions of the media. BLK-mode, by definition, does not support DAX. BTT: This is a library, optionally consumed by either PMEM or BLK, that converts a byte-accessible namespace into a disk with atomic sector update semantics (prevents sector tearing on crash or power loss). The sinister aspect of sector tearing is that most applications do not know they have a atomic sector dependency. At least today's disk's rarely ever tear sectors and if they do one almost certainly gets a CRC error on access. NVDIMMs will always tear and always silently. Until an application is audited to be robust in the presence of sector-tearing the usage of BTT is recommended. Thanks to: Ross Zwisler, Jeff Moyer, Vishal Verma, Christoph Hellwig, Ingo Molnar, Neil Brown, Boaz Harrosh, Robert Elliott, Matthew Wilcox, Andy Rudoff, Linda Knippers, Toshi Kani, Nicholas Moulin, Rafael Wysocki, and Bob Moore" * tag 'libnvdimm-for-4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/djbw/nvdimm: (33 commits) arch, x86: pmem api for ensuring durability of persistent memory updates libnvdimm: Add sysfs numa_node to NVDIMM devices libnvdimm: Set numa_node to NVDIMM devices acpi: Add acpi_map_pxm_to_online_node() libnvdimm, nfit: handle unarmed dimms, mark namespaces read-only pmem: flag pmem block devices as non-rotational libnvdimm: enable iostat pmem: make_request cleanups libnvdimm, pmem: fix up max_hw_sectors libnvdimm, blk: add support for blk integrity libnvdimm, btt: add support for blk integrity fs/block_dev.c: skip rw_page if bdev has integrity libnvdimm: Non-Volatile Devices tools/testing/nvdimm: libnvdimm unit test infrastructure libnvdimm, nfit, nd_blk: driver for BLK-mode access persistent memory nd_btt: atomic sector updates libnvdimm: infrastructure for btt devices libnvdimm: write blk label set libnvdimm: write pmem label set libnvdimm: blk labels and namespace instantiation ...
Diffstat (limited to 'drivers/nvdimm/pmem.c')
-rw-r--r--drivers/nvdimm/pmem.c301
1 files changed, 301 insertions, 0 deletions
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
new file mode 100644
index 000000000000..ade9eb917a4d
--- /dev/null
+++ b/drivers/nvdimm/pmem.c
@@ -0,0 +1,301 @@
+/*
+ * Persistent Memory Driver
+ *
+ * Copyright (c) 2014-2015, Intel Corporation.
+ * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
+ * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <asm/cacheflush.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/pmem.h>
+#include <linux/nd.h>
+#include "nd.h"
+
+struct pmem_device {
+ struct request_queue *pmem_queue;
+ struct gendisk *pmem_disk;
+
+ /* One contiguous memory region per device */
+ phys_addr_t phys_addr;
+ void __pmem *virt_addr;
+ size_t size;
+};
+
+static int pmem_major;
+
+static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
+ unsigned int len, unsigned int off, int rw,
+ sector_t sector)
+{
+ void *mem = kmap_atomic(page);
+ size_t pmem_off = sector << 9;
+ void __pmem *pmem_addr = pmem->virt_addr + pmem_off;
+
+ if (rw == READ) {
+ memcpy_from_pmem(mem + off, pmem_addr, len);
+ flush_dcache_page(page);
+ } else {
+ flush_dcache_page(page);
+ memcpy_to_pmem(pmem_addr, mem + off, len);
+ }
+
+ kunmap_atomic(mem);
+}
+
+static void pmem_make_request(struct request_queue *q, struct bio *bio)
+{
+ bool do_acct;
+ unsigned long start;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ struct block_device *bdev = bio->bi_bdev;
+ struct pmem_device *pmem = bdev->bd_disk->private_data;
+
+ do_acct = nd_iostat_start(bio, &start);
+ bio_for_each_segment(bvec, bio, iter)
+ pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset,
+ bio_data_dir(bio), iter.bi_sector);
+ if (do_acct)
+ nd_iostat_end(bio, start);
+
+ if (bio_data_dir(bio))
+ wmb_pmem();
+
+ bio_endio(bio, 0);
+}
+
+static int pmem_rw_page(struct block_device *bdev, sector_t sector,
+ struct page *page, int rw)
+{
+ struct pmem_device *pmem = bdev->bd_disk->private_data;
+
+ pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
+ page_endio(page, rw & WRITE, 0);
+
+ return 0;
+}
+
+static long pmem_direct_access(struct block_device *bdev, sector_t sector,
+ void **kaddr, unsigned long *pfn, long size)
+{
+ struct pmem_device *pmem = bdev->bd_disk->private_data;
+ size_t offset = sector << 9;
+
+ if (!pmem)
+ return -ENODEV;
+
+ /* FIXME convert DAX to comprehend that this mapping has a lifetime */
+ *kaddr = (void __force *) pmem->virt_addr + offset;
+ *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT;
+
+ return pmem->size - offset;
+}
+
+static const struct block_device_operations pmem_fops = {
+ .owner = THIS_MODULE,
+ .rw_page = pmem_rw_page,
+ .direct_access = pmem_direct_access,
+ .revalidate_disk = nvdimm_revalidate_disk,
+};
+
+static struct pmem_device *pmem_alloc(struct device *dev,
+ struct resource *res, int id)
+{
+ struct pmem_device *pmem;
+
+ pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
+ if (!pmem)
+ return ERR_PTR(-ENOMEM);
+
+ pmem->phys_addr = res->start;
+ pmem->size = resource_size(res);
+ if (!arch_has_pmem_api())
+ dev_warn(dev, "unable to guarantee persistence of writes\n");
+
+ if (!request_mem_region(pmem->phys_addr, pmem->size, dev_name(dev))) {
+ dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n",
+ &pmem->phys_addr, pmem->size);
+ kfree(pmem);
+ return ERR_PTR(-EBUSY);
+ }
+
+ pmem->virt_addr = memremap_pmem(pmem->phys_addr, pmem->size);
+ if (!pmem->virt_addr) {
+ release_mem_region(pmem->phys_addr, pmem->size);
+ kfree(pmem);
+ return ERR_PTR(-ENXIO);
+ }
+
+ return pmem;
+}
+
+static void pmem_detach_disk(struct pmem_device *pmem)
+{
+ del_gendisk(pmem->pmem_disk);
+ put_disk(pmem->pmem_disk);
+ blk_cleanup_queue(pmem->pmem_queue);
+}
+
+static int pmem_attach_disk(struct nd_namespace_common *ndns,
+ struct pmem_device *pmem)
+{
+ struct gendisk *disk;
+
+ pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL);
+ if (!pmem->pmem_queue)
+ return -ENOMEM;
+
+ blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
+ blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX);
+ blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->pmem_queue);
+
+ disk = alloc_disk(0);
+ if (!disk) {
+ blk_cleanup_queue(pmem->pmem_queue);
+ return -ENOMEM;
+ }
+
+ disk->major = pmem_major;
+ disk->first_minor = 0;
+ disk->fops = &pmem_fops;
+ disk->private_data = pmem;
+ disk->queue = pmem->pmem_queue;
+ disk->flags = GENHD_FL_EXT_DEVT;
+ nvdimm_namespace_disk_name(ndns, disk->disk_name);
+ disk->driverfs_dev = &ndns->dev;
+ set_capacity(disk, pmem->size >> 9);
+ pmem->pmem_disk = disk;
+
+ add_disk(disk);
+ revalidate_disk(disk);
+
+ return 0;
+}
+
+static int pmem_rw_bytes(struct nd_namespace_common *ndns,
+ resource_size_t offset, void *buf, size_t size, int rw)
+{
+ struct pmem_device *pmem = dev_get_drvdata(ndns->claim);
+
+ if (unlikely(offset + size > pmem->size)) {
+ dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
+ return -EFAULT;
+ }
+
+ if (rw == READ)
+ memcpy_from_pmem(buf, pmem->virt_addr + offset, size);
+ else {
+ memcpy_to_pmem(pmem->virt_addr + offset, buf, size);
+ wmb_pmem();
+ }
+
+ return 0;
+}
+
+static void pmem_free(struct pmem_device *pmem)
+{
+ memunmap_pmem(pmem->virt_addr);
+ release_mem_region(pmem->phys_addr, pmem->size);
+ kfree(pmem);
+}
+
+static int nd_pmem_probe(struct device *dev)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ struct nd_namespace_common *ndns;
+ struct nd_namespace_io *nsio;
+ struct pmem_device *pmem;
+ int rc;
+
+ ndns = nvdimm_namespace_common_probe(dev);
+ if (IS_ERR(ndns))
+ return PTR_ERR(ndns);
+
+ nsio = to_nd_namespace_io(&ndns->dev);
+ pmem = pmem_alloc(dev, &nsio->res, nd_region->id);
+ if (IS_ERR(pmem))
+ return PTR_ERR(pmem);
+
+ dev_set_drvdata(dev, pmem);
+ ndns->rw_bytes = pmem_rw_bytes;
+ if (is_nd_btt(dev))
+ rc = nvdimm_namespace_attach_btt(ndns);
+ else if (nd_btt_probe(ndns, pmem) == 0) {
+ /* we'll come back as btt-pmem */
+ rc = -ENXIO;
+ } else
+ rc = pmem_attach_disk(ndns, pmem);
+ if (rc)
+ pmem_free(pmem);
+ return rc;
+}
+
+static int nd_pmem_remove(struct device *dev)
+{
+ struct pmem_device *pmem = dev_get_drvdata(dev);
+
+ if (is_nd_btt(dev))
+ nvdimm_namespace_detach_btt(to_nd_btt(dev)->ndns);
+ else
+ pmem_detach_disk(pmem);
+ pmem_free(pmem);
+
+ return 0;
+}
+
+MODULE_ALIAS("pmem");
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
+static struct nd_device_driver nd_pmem_driver = {
+ .probe = nd_pmem_probe,
+ .remove = nd_pmem_remove,
+ .drv = {
+ .name = "nd_pmem",
+ },
+ .type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
+};
+
+static int __init pmem_init(void)
+{
+ int error;
+
+ pmem_major = register_blkdev(0, "pmem");
+ if (pmem_major < 0)
+ return pmem_major;
+
+ error = nd_driver_register(&nd_pmem_driver);
+ if (error) {
+ unregister_blkdev(pmem_major, "pmem");
+ return error;
+ }
+
+ return 0;
+}
+module_init(pmem_init);
+
+static void pmem_exit(void)
+{
+ driver_unregister(&nd_pmem_driver.drv);
+ unregister_blkdev(pmem_major, "pmem");
+}
+module_exit(pmem_exit);
+
+MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
+MODULE_LICENSE("GPL v2");