From 3c462c880b52aae2cfbbb8db8b401eef118cc128 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 19 Aug 2015 07:35:54 +1000 Subject: md: Increment version for clustered bitmaps Add BITMAP_MAJOR_CLUSTERED as 5, in order to prevent older kernels to assemble a clustered device. In order to maximize compatibility, the major version is set to BITMAP_MAJOR_CLUSTERED *only* if the bitmap is clustered. Added MD_FEATURE_CLUSTERED in order to return error for older kernels which would assemble MD even if the bitmap is corrupted. Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- include/uapi/linux/raid/md_p.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 2ae6131e69a5..867ee874fa80 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -302,6 +302,7 @@ struct mdp_superblock_1 { #define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening * is guided by bitmap. */ +#define MD_FEATURE_CLUSTERED 256 /* clustered MD */ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ |MD_FEATURE_RECOVERY_OFFSET \ |MD_FEATURE_RESHAPE_ACTIVE \ @@ -310,6 +311,7 @@ struct mdp_superblock_1 { |MD_FEATURE_RESHAPE_BACKWARDS \ |MD_FEATURE_NEW_OFFSET \ |MD_FEATURE_RECOVERY_BITMAP \ + |MD_FEATURE_CLUSTERED \ ) #endif -- cgit v1.2.3 From c4d4c91b44d8309082127893221a1971a27c50ca Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 13 Aug 2015 14:31:54 -0700 Subject: MD: replace special disk roles with macros Add the following two macros for special roles: spare and faulty MD_DISK_ROLE_SPARE 0xffff MD_DISK_ROLE_FAULTY 0xfffe Add MD_DISK_ROLE_MAX 0xff00 as the maximal possible regular role, and minimal value of special role. Signed-off-by: Song Liu Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- include/uapi/linux/raid/md_p.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 867ee874fa80..d1fc8a637368 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -90,6 +90,10 @@ * dire need */ +#define MD_DISK_ROLE_SPARE 0xffff +#define MD_DISK_ROLE_FAULTY 0xfffe +#define MD_DISK_ROLE_MAX 0xff00 /* max value of regular disk role */ + typedef struct mdp_device_descriptor_s { __u32 number; /* 0 Device number in the entire set */ __u32 major; /* 1 Device major number */ -- cgit v1.2.3 From bac624f3f86a8c7db395c7f85ccad6a504b9c4b4 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 13 Aug 2015 14:31:55 -0700 Subject: MD: add a new disk role to present write journal device Next patches will use a disk as raid5/6 journaling. We need a new disk role to present the journal device and add MD_FEATURE_JOURNAL to feature_map for backward compability. Signed-off-by: Song Liu Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- include/uapi/linux/raid/md_p.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index d1fc8a637368..eaaab52077a3 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -89,9 +89,11 @@ * read requests will only be sent here in * dire need */ +#define MD_DISK_JOURNAL 18 /* disk is used as the write journal in RAID-5/6 */ #define MD_DISK_ROLE_SPARE 0xffff #define MD_DISK_ROLE_FAULTY 0xfffe +#define MD_DISK_ROLE_JOURNAL 0xfffd #define MD_DISK_ROLE_MAX 0xff00 /* max value of regular disk role */ typedef struct mdp_device_descriptor_s { @@ -307,6 +309,7 @@ struct mdp_superblock_1 { * is guided by bitmap. */ #define MD_FEATURE_CLUSTERED 256 /* clustered MD */ +#define MD_FEATURE_JOURNAL 512 /* support write cache */ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ |MD_FEATURE_RECOVERY_OFFSET \ |MD_FEATURE_RESHAPE_ACTIVE \ -- cgit v1.2.3 From 3069aa8def32b0c2b83cd27d1c37ed30b47ce879 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 13 Aug 2015 14:31:56 -0700 Subject: md: override md superblock recovery_offset for journal device Journal device stores data in a log structure. We need record the log start. Here we override md superblock recovery_offset for this purpose. This field of a journal device is meaningless otherwise. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- include/uapi/linux/raid/md_p.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index eaaab52077a3..a5f54ff26c20 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -258,7 +258,10 @@ struct mdp_superblock_1 { __le64 data_offset; /* sector start of data, often 0 */ __le64 data_size; /* sectors in this device that can be used for data */ __le64 super_offset; /* sector start of this superblock */ - __le64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */ + union { + __le64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */ + __le64 journal_tail;/* journal tail of journal device (from data_offset) */ + }; __le32 dev_number; /* permanent identifier of this device - not role in raid */ __le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ -- cgit v1.2.3 From f6bed0ef0a808164f51197de062e0450ce6c1f96 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 13 Aug 2015 14:31:59 -0700 Subject: raid5: add basic stripe log This introduces a simple log for raid5. Data/parity writing to raid array first writes to the log, then write to raid array disks. If crash happens, we can recovery data from the log. This can speed up raid resync and fix write hole issue. The log structure is pretty simple. Data/meta data is stored in block unit, which is 4k generally. It has only one type of meta data block. The meta data block can track 3 types of data, stripe data, stripe parity and flush block. MD superblock will point to the last valid meta data block. Each meta data block has checksum/seq number, so recovery can scan the log correctly. We store a checksum of stripe data/parity to the metadata block, so meta data and stripe data/parity can be written to log disk together. otherwise, meta data write must wait till stripe data/parity is finished. For stripe data, meta data block will record stripe data sector and size. Currently the size is always 4k. This meta data record can be made simpler if we just fix write hole (eg, we can record data of a stripe's different disks together), but this format can be extended to support caching in the future, which must record data address/size. For stripe parity, meta data block will record stripe sector. It's size should be 4k (for raid5) or 8k (for raid6). We always store p parity first. This format should work for caching too. flush block indicates a stripe is in raid array disks. Fixing write hole doesn't need this type of meta data, it's for caching extension. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- include/uapi/linux/raid/md_p.h | 58 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index a5f54ff26c20..96e4196f9c79 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -324,4 +324,62 @@ struct mdp_superblock_1 { |MD_FEATURE_CLUSTERED \ ) +struct r5l_payload_header { + __le16 type; + __le16 flags; +} __attribute__ ((__packed__)); + +enum r5l_payload_type { + R5LOG_PAYLOAD_DATA = 0, + R5LOG_PAYLOAD_PARITY = 1, + R5LOG_PAYLOAD_FLUSH = 2, +}; + +struct r5l_payload_data_parity { + struct r5l_payload_header header; + __le32 size; /* sector. data/parity size. each 4k + * has a checksum */ + __le64 location; /* sector. For data, it's raid sector. For + * parity, it's stripe sector */ + __le32 checksum[]; +} __attribute__ ((__packed__)); + +enum r5l_payload_data_parity_flag { + R5LOG_PAYLOAD_FLAG_DISCARD = 1, /* payload is discard */ + /* + * RESHAPED/RESHAPING is only set when there is reshape activity. Note, + * both data/parity of a stripe should have the same flag set + * + * RESHAPED: reshape is running, and this stripe finished reshape + * RESHAPING: reshape is running, and this stripe isn't reshaped + */ + R5LOG_PAYLOAD_FLAG_RESHAPED = 2, + R5LOG_PAYLOAD_FLAG_RESHAPING = 3, +}; + +struct r5l_payload_flush { + struct r5l_payload_header header; + __le32 size; /* flush_stripes size, bytes */ + __le64 flush_stripes[]; +} __attribute__ ((__packed__)); + +enum r5l_payload_flush_flag { + R5LOG_PAYLOAD_FLAG_FLUSH_STRIPE = 1, /* data represents whole stripe */ +}; + +struct r5l_meta_block { + __le32 magic; + __le32 checksum; + __u8 version; + __u8 __zero_pading_1; + __le16 __zero_pading_2; + __le32 meta_size; /* whole size of the block */ + + __le64 seq; + __le64 position; /* sector, start from rdev->data_offset, current position */ + struct r5l_payload_header payloads[]; +} __attribute__ ((__packed__)); + +#define R5LOG_VERSION 0x1 +#define R5LOG_MAGIC 0x6433c509 #endif -- cgit v1.2.3 From 5c7e81c3de9eb3db01e16190a1da0899efee645b Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 13 Aug 2015 14:32:04 -0700 Subject: raid5: enable log for raid array with cache disk Now log is safe to enable for raid array with cache disk Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- include/uapi/linux/raid/md_p.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 96e4196f9c79..c3e654c6d518 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -322,6 +322,7 @@ struct mdp_superblock_1 { |MD_FEATURE_NEW_OFFSET \ |MD_FEATURE_RECOVERY_BITMAP \ |MD_FEATURE_CLUSTERED \ + |MD_FEATURE_JOURNAL \ ) struct r5l_payload_header { -- cgit v1.2.3