From e1e5e5641e6f271321aec257ed26a72715e4a8c2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 19 Feb 2015 13:39:03 -0700 Subject: NVMe: Metadata format support Adds support for NVMe metadata formats and exposes block devices for all namespaces regardless of their format. Namespace formats that are unusable will have disk capacity set to 0, but a handle to the block device is created to simplify device management. A namespace is not usable when the format requires host interleave block and metadata in single buffer, has no provisioned storage, or has better data but failed to register with blk integrity. The namespace has to be scanned in two phases to support separate metadata formats. The first establishes the sector size and capacity prior to invoking add_disk. If metadata is required, the capacity will be temporarilly set to 0 until it can be revalidated and registered with the integrity extenstions after add_disk completes. The driver relies on the integrity extensions to provide the metadata buffer. NVMe requires this be a single physically contiguous region, so only one integrity segment is allowed per command. If the metadata is used for T10 PI, the driver provides mappings to save and restore the reftag physical block translation. The driver provides no-op functions for generate and verify if metadata is not used for protection information. This way the setup is always provided by the block layer. If a request does not supply a required metadata buffer, the command is failed with bad address. This could only happen if a user manually disables verify/generate on such a disk. The only exception to where this is okay is if the controller is capable of stripping/generating the metadata, which is possible on some types of formats. The metadata scatter gather list now occupies the spot in the nvme_iod that used to be used to link retryable IOD's, but we don't do that anymore, so the field was unused. Signed-off-by: Keith Busch --- include/linux/nvme.h | 2 ++ include/uapi/linux/nvme.h | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 19a5d4b23209..cca264db2478 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -121,6 +121,7 @@ struct nvme_ns { unsigned ns_id; int lba_shift; int ms; + int pi_type; u64 mode_select_num_blocks; u32 mode_select_block_len; }; @@ -138,6 +139,7 @@ struct nvme_iod { int nents; /* Used in scatterlist */ int length; /* Of data, in bytes */ dma_addr_t first_dma; + struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */ struct scatterlist sg[0]; }; diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 26386cf3db44..406bfc95652c 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -124,10 +124,22 @@ struct nvme_id_ns { enum { NVME_NS_FEAT_THIN = 1 << 0, + NVME_NS_FLBAS_LBA_MASK = 0xf, + NVME_NS_FLBAS_META_EXT = 0x10, NVME_LBAF_RP_BEST = 0, NVME_LBAF_RP_BETTER = 1, NVME_LBAF_RP_GOOD = 2, NVME_LBAF_RP_DEGRADED = 3, + NVME_NS_DPC_PI_LAST = 1 << 4, + NVME_NS_DPC_PI_FIRST = 1 << 3, + NVME_NS_DPC_PI_TYPE3 = 1 << 2, + NVME_NS_DPC_PI_TYPE2 = 1 << 1, + NVME_NS_DPC_PI_TYPE1 = 1 << 0, + NVME_NS_DPS_PI_FIRST = 1 << 3, + NVME_NS_DPS_PI_MASK = 0x7, + NVME_NS_DPS_PI_TYPE1 = 1, + NVME_NS_DPS_PI_TYPE2 = 2, + NVME_NS_DPS_PI_TYPE3 = 3, }; struct nvme_smart_log { @@ -261,6 +273,10 @@ enum { NVME_RW_DSM_LATENCY_LOW = 3 << 4, NVME_RW_DSM_SEQ_REQ = 1 << 6, NVME_RW_DSM_COMPRESSED = 1 << 7, + NVME_RW_PRINFO_PRCHK_REF = 1 << 10, + NVME_RW_PRINFO_PRCHK_APP = 1 << 11, + NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, + NVME_RW_PRINFO_PRACT = 1 << 13, }; struct nvme_dsm_cmd { -- cgit v1.2.3 From 4f1982b4e262c45475a91b4253e9bc7f7c991c13 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 19 Feb 2015 13:42:14 -0700 Subject: NVMe: Update SCSI Inquiry VPD 83h translation The original translation created collisions on Inquiry VPD 83 for many existing devices. Newer specifications provide other ways to translate based on the device's version can be used to create unique identifiers. Version 1.1 provides an EUI64 field that uniquely identifies each namespace, and 1.2 added the longer NGUID field for the same reason. Both follow the IEEE EUI format and readily translate to the SCSI device identification EUI designator type 2h. For devices implementing either, the translation will use this type, defaulting to the EUI64 8-byte type if implemented then NGUID's 16 byte version if not. If neither are provided, the 1.0 translation is used, and is updated to use the SCSI String format to guarantee a unique identifier. Knowing when to use the new fields depends on the nvme controller's revision. The NVME_VS macro was not decoding this correctly, so that is fixed in this patch and moved to a more appropriate place. Since the Identify Namespace structure required an update for the NGUID field, this patch adds the remaining new 1.2 fields to the structure. Signed-off-by: Keith Busch --- include/linux/nvme.h | 2 -- include/uapi/linux/nvme.h | 10 +++++++++- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index cca264db2478..1f062a9e521d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -62,8 +62,6 @@ enum { NVME_CSTS_SHST_MASK = 3 << 2, }; -#define NVME_VS(major, minor) (major << 16 | minor) - extern unsigned char nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 406bfc95652c..aef9a81b2d75 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -115,7 +115,13 @@ struct nvme_id_ns { __le16 nawun; __le16 nawupf; __le16 nacwu; - __u8 rsvd40[80]; + __le16 nabsn; + __le16 nabo; + __le16 nabspf; + __u16 rsvd46; + __le64 nvmcap[2]; + __u8 rsvd64[40]; + __u8 nguid[16]; __u8 eui64[8]; struct nvme_lbaf lbaf[16]; __u8 rsvd192[192]; @@ -565,6 +571,8 @@ struct nvme_passthru_cmd { __u32 result; }; +#define NVME_VS(major, minor) (((major) << 16) | ((minor) << 8)) + #define nvme_admin_cmd nvme_passthru_cmd #define NVME_IOCTL_ID _IO('N', 0x40) -- cgit v1.2.3 From b3fffdefabab266ae5176a136d93b6670b07bb30 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 3 Feb 2015 11:21:42 -0700 Subject: NVMe: Register management handle under nvme class This creates a new class type for nvme devices to register their management character devices with. This is so we do not rely on miscdev to provide enough minors for as many nvme devices some people plan to use. The previous limit was approximately 60 NVMe controllers, depending on the platform and kernel. Now the limit is 1M, which ought to be enough for anybody. Since we have a new device class, it makes sense to attach the block devices under this as well, so part of this patch moves the management handle initialization prior to the namespaces discovery. Signed-off-by: Keith Busch --- include/linux/nvme.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1f062a9e521d..383d495c5e4c 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -17,7 +17,6 @@ #include #include -#include #include #include @@ -89,7 +88,7 @@ struct nvme_dev { struct nvme_bar __iomem *bar; struct list_head namespaces; struct kref kref; - struct miscdevice miscdev; + struct device *device; work_func_t reset_workfn; struct work_struct reset_work; char name[12]; -- cgit v1.2.3 From 2e1d8448196ba85cd78a18723413a3c92aabe0f3 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 12 Feb 2015 15:33:00 -0700 Subject: NVMe: Asynchronous controller probe This performs the longest parts of nvme device probe in scheduled work. This speeds up probe significantly when multiple devices are in use. Signed-off-by: Keith Busch --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 383d495c5e4c..e2429e8cdab4 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -91,6 +91,7 @@ struct nvme_dev { struct device *device; work_func_t reset_workfn; struct work_struct reset_work; + struct work_struct probe_work; char name[12]; char serial[20]; char model[40]; -- cgit v1.2.3 From 07836e659c81ec6b0d683dfbf7958339a22a7b69 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 19 Feb 2015 10:34:48 -0700 Subject: NVMe: Fix potential corruption during shutdown The driver has to end unreturned commands at some point even if the controller has not provided a completion. The driver tried to be safe by deleting IO queues prior to ending all unreturned commands. That should cause the controller to internally abort inflight commands, but IO queue deletion request does not have to be successful, so all bets are off. We still have to make progress, so to be extra safe, this patch doesn't clear a queue to release the dma mapping for a command until after the pci device has been disabled. This patch removes the special handling during device initialization so controller recovery can be done all the time. This is possible since initialization is not inlined with pci probe anymore. Reported-by: Nilish Choudhury Signed-off-by: Keith Busch --- include/linux/nvme.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index e2429e8cdab4..0adad4a5419b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -103,7 +103,6 @@ struct nvme_dev { u16 abort_limit; u8 event_limit; u8 vwc; - u8 initialized; }; /* -- cgit v1.2.3