From b46020aa3a8a0f9c7324fe0af4aec4227f947a10 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 21 Dec 2015 10:50:59 +1100 Subject: [PATCH 01/26] md/raid5: remove redundant check in stripe_add_to_batch_list() The stripe_add_to_batch_list() function is called only if stripe_can_batch() returned true, so there is no need for double check. Signed-off-by: Roman Gushchin Cc: Neil Brown Cc: linux-raid@vger.kernel.org Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 704ef7fcfbf8..22362505f810 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -772,8 +772,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh int hash; int dd_idx; - if (!stripe_can_batch(sh)) - return; /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ tmp_sec = sh->sector; if (!sector_div(tmp_sec, conf->chunk_sectors)) From ac277c6a8a39bc50f891a3477625330c276bd7f5 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 21 Dec 2015 10:50:59 +1100 Subject: [PATCH 02/26] md-cluster: Avoid the resync ping-pong If a RESYNCING message with (0,0) has been sent before, do not send it again. This avoids a resync ping pong between the nodes. We read the bitmap lockresource's LVB to figure out the previous value of the RESYNCING message. 
Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/md-cluster.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index d6a1126d85ce..e57bbfed1638 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -882,8 +882,16 @@ static int resync_start(struct mddev *mddev) static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; + struct resync_info ri; struct cluster_msg cmsg = {0}; + /* do not send zero again, if we have sent before */ + if (hi == 0) { + memcpy(&ri, cinfo->bitmap_lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); + if (le64_to_cpu(ri.hi) == 0) + return 0; + } + add_resync_info(cinfo->bitmap_lockres, lo, hi); /* Re-acquire the lock to refresh LVB */ dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); From 659b254fa7392e32b59a30d4b61fb12c4cd440ff Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 21 Dec 2015 10:50:59 +1100 Subject: [PATCH 03/26] md-cluster: remove a disk asynchronously from cluster environment For cluster raid, if one disk couldn't be reach in one node, then other nodes would receive the REMOVE message for the disk. In receiving node, we can't call md_kick_rdev_from_array to remove the disk from array synchronously since the disk might still be busy in this node. So let's set a ClusterRemove flag on the disk, then let the thread to do the removal job eventually. 
Signed-off-by: Guoqing Jiang Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/md-cluster.c | 7 +++++-- drivers/md/md.c | 12 ++++++++++++ drivers/md/md.h | 1 + 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index e57bbfed1638..3fd7301fd7af 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -440,8 +440,11 @@ static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot)); - if (rdev) - md_kick_rdev_from_array(rdev); + if (rdev) { + set_bit(ClusterRemove, &rdev->flags); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } else pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, le32_to_cpu(msg->raid_slot)); diff --git a/drivers/md/md.c b/drivers/md/md.c index 61aacab424cf..198e29dffb98 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8318,6 +8318,18 @@ void md_check_recovery(struct mddev *mddev) goto unlock; } + if (mddev_is_clustered(mddev)) { + struct md_rdev *rdev; + /* kick the device if another node issued a + * remove disk. 
+ */ + rdev_for_each(rdev, mddev) { + if (test_and_clear_bit(ClusterRemove, &rdev->flags) && + rdev->raid_disk < 0) + md_kick_rdev_from_array(rdev); + } + } + if (!mddev->external) { int did_change = 0; spin_lock(&mddev->lock); diff --git a/drivers/md/md.h b/drivers/md/md.h index ca0b643fe3c1..f7b17aef837d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -183,6 +183,7 @@ enum flag_bits { * Usually, this device should be faster * than other devices in the array */ + ClusterRemove, }; #define BB_LEN_MASK (0x00000000000001FFULL) From 54a88392cdd84b4a739ce3a986bfabfaff67d9d2 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 21 Dec 2015 10:51:00 +1100 Subject: [PATCH 04/26] md-cluster: Fix the remove sequence with the new MD reload code The remove disk message does not need metadata_update_start(), but can be an independent message. Signed-off-by: Goldwyn Rodrigues Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- drivers/md/md-cluster.c | 2 +- drivers/md/md.c | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 3fd7301fd7af..b58374daff32 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -997,7 +997,7 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct md_cluster_info *cinfo = mddev->cluster_info; cmsg.type = cpu_to_le32(REMOVE); cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); - return __sendmsg(cinfo, &cmsg); + return sendmsg(cinfo, &cmsg); } static int gather_bitmaps(struct md_rdev *rdev) diff --git a/drivers/md/md.c b/drivers/md/md.c index 198e29dffb98..ab3995de0418 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6134,15 +6134,11 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) { char b[BDEVNAME_SIZE]; struct md_rdev *rdev; - int ret = -1; rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; - if (mddev_is_clustered(mddev)) - ret = md_cluster_ops->metadata_update_start(mddev); - if (rdev->raid_disk < 0) 
goto kick_rdev; @@ -6153,7 +6149,7 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) goto busy; kick_rdev: - if (mddev_is_clustered(mddev) && ret == 0) + if (mddev_is_clustered(mddev)) md_cluster_ops->remove_disk(mddev, rdev); md_kick_rdev_from_array(rdev); @@ -6162,9 +6158,6 @@ kick_rdev: return 0; busy: - if (mddev_is_clustered(mddev) && ret == 0) - md_cluster_ops->metadata_update_cancel(mddev); - printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", bdevname(rdev->bdev,b), mdname(mddev)); return -EBUSY; From 09afd2a8d6ad2c40f3c1ae0b3f83784864cf4c15 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 21 Dec 2015 10:51:00 +1100 Subject: [PATCH 05/26] md-cluster: Allow spare devices to be marked as faulty If a spare device was marked faulty, it would not be reflected in receiving nodes because it would mark it as activated and continue. Continue the operation, so it may be set as faulty. Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/md.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index ab3995de0418..f2f855c203e5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9106,7 +9106,6 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) ret = remove_and_add_spares(mddev, rdev2); pr_info("Activated spare: %s\n", bdevname(rdev2->bdev,b)); - continue; } /* device faulty * We just want to do the minimum to mark the disk From f6a2dc64ee74477c966f5220b1f560ed6308d010 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 21 Dec 2015 10:51:00 +1100 Subject: [PATCH 06/26] md-cluster: append some actions when change bitmap from clustered to none For clustered raid, we need to do extra actions when change bitmap to none. 1. check if all the bitmap lock could be get or not, if yes then we can continue the change since cluster raid is only active in current node. Otherwise return fail and unlock the related bitmap locks 2. 
set nodes to 0 and then leave cluster environment. 3. release other nodes's bitmap lock. Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- drivers/md/md-cluster.c | 57 +++++++++++++++++++++++++++++++++++++++++ drivers/md/md-cluster.h | 2 ++ drivers/md/md.c | 13 ++++++++++ 3 files changed, 72 insertions(+) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index b58374daff32..db9375f501ab 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -55,6 +55,7 @@ struct md_cluster_info { int slot_number; struct completion completion; struct dlm_lock_resource *bitmap_lockres; + struct dlm_lock_resource **other_bitmap_lockres; struct dlm_lock_resource *resync_lockres; struct list_head suspend_list; spinlock_t suspend_lock; @@ -803,6 +804,7 @@ static void resync_bitmap(struct mddev *mddev) __func__, __LINE__, err); } +static void unlock_all_bitmaps(struct mddev *mddev); static int leave(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; @@ -823,6 +825,7 @@ static int leave(struct mddev *mddev) lockres_free(cinfo->ack_lockres); lockres_free(cinfo->no_new_dev_lockres); lockres_free(cinfo->bitmap_lockres); + unlock_all_bitmaps(mddev); dlm_release_lockspace(cinfo->lockspace, 2); return 0; } @@ -1000,6 +1003,58 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev) return sendmsg(cinfo, &cmsg); } +static int lock_all_bitmaps(struct mddev *mddev) +{ + int slot, my_slot, ret, held = 1, i = 0; + char str[64]; + struct md_cluster_info *cinfo = mddev->cluster_info; + + cinfo->other_bitmap_lockres = kzalloc((mddev->bitmap_info.nodes - 1) * + sizeof(struct dlm_lock_resource *), + GFP_KERNEL); + if (!cinfo->other_bitmap_lockres) { + pr_err("md: can't alloc mem for other bitmap locks\n"); + return 0; + } + + my_slot = slot_number(mddev); + for (slot = 0; slot < mddev->bitmap_info.nodes; slot++) { + if (slot == my_slot) + continue; + + memset(str, '\0', 64); + snprintf(str, 64, "bitmap%04d", slot); + 
cinfo->other_bitmap_lockres[i] = lockres_init(mddev, str, NULL, 1); + if (!cinfo->other_bitmap_lockres[i]) + return -ENOMEM; + + cinfo->other_bitmap_lockres[i]->flags |= DLM_LKF_NOQUEUE; + ret = dlm_lock_sync(cinfo->other_bitmap_lockres[i], DLM_LOCK_PW); + if (ret) + held = -1; + i++; + } + + return held; +} + +static void unlock_all_bitmaps(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + int i; + + /* release other node's bitmap lock if they are existed */ + if (cinfo->other_bitmap_lockres) { + for (i = 0; i < mddev->bitmap_info.nodes - 1; i++) { + if (cinfo->other_bitmap_lockres[i]) { + dlm_unlock_sync(cinfo->other_bitmap_lockres[i]); + lockres_free(cinfo->other_bitmap_lockres[i]); + } + } + kfree(cinfo->other_bitmap_lockres); + } +} + static int gather_bitmaps(struct md_rdev *rdev) { int sn, err; @@ -1045,6 +1100,8 @@ static struct md_cluster_operations cluster_ops = { .new_disk_ack = new_disk_ack, .remove_disk = remove_disk, .gather_bitmaps = gather_bitmaps, + .lock_all_bitmaps = lock_all_bitmaps, + .unlock_all_bitmaps = unlock_all_bitmaps, }; static int __init cluster_init(void) diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index e75ea2613184..45ce6c97d8bd 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -24,6 +24,8 @@ struct md_cluster_operations { int (*new_disk_ack)(struct mddev *mddev, bool ack); int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); int (*gather_bitmaps)(struct md_rdev *rdev); + int (*lock_all_bitmaps)(struct mddev *mddev); + void (*unlock_all_bitmaps)(struct mddev *mddev); }; #endif /* _MD_CLUSTER_H */ diff --git a/drivers/md/md.c b/drivers/md/md.c index f2f855c203e5..495d8aa0a0d2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6599,6 +6599,19 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = -EINVAL; goto err; } + if (mddev->bitmap_info.nodes) { + /* hold PW on all the bitmap lock */ + if 
(md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { + printk("md: can't change bitmap to none since the" + " array is in use by more than one node\n"); + rv = -EPERM; + md_cluster_ops->unlock_all_bitmaps(mddev); + goto err; + } + + mddev->bitmap_info.nodes = 0; + md_cluster_ops->leave(mddev); + } mddev->pers->quiesce(mddev, 1); bitmap_destroy(mddev); mddev->pers->quiesce(mddev, 0); From d323ef0f1a3e6d408eabacf0e91e2d741ffe1165 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 21 Dec 2015 10:51:00 +1100 Subject: [PATCH 07/26] md-cluster: update the documentation Update design documentation based on recent development. original version comes from Neil. Signed-off-by: Goldwyn Rodrigues Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- Documentation/md-cluster.txt | 310 +++++++++++++++++++++++++---------- 1 file changed, 226 insertions(+), 84 deletions(-) diff --git a/Documentation/md-cluster.txt b/Documentation/md-cluster.txt index 1b794369e03a..c100c7163507 100644 --- a/Documentation/md-cluster.txt +++ b/Documentation/md-cluster.txt @@ -3,7 +3,7 @@ The cluster MD is a shared-device RAID for a cluster. 1. On-disk format -Separate write-intent-bitmap are used for each cluster node. +Separate write-intent-bitmaps are used for each cluster node. The bitmaps record all writes that may have been started on that node, and may not yet have finished. The on-disk layout is: @@ -14,117 +14,161 @@ and may not yet have finished. The on-disk layout is: | bm super[2] + bits | bm bits [2, contd] | bm super[3] + bits | | bm bits [3, contd] | | | -During "normal" functioning we assume the filesystem ensures that only one -node writes to any given block at a time, so a write -request will +During "normal" functioning we assume the filesystem ensures that only +one node writes to any given block at a time, so a write request will + - set the appropriate bit (if not already set) - commit the write to all mirrors - schedule the bit to be cleared after a timeout. 
-Reads are just handled normally. It is up to the filesystem to -ensure one node doesn't read from a location where another node (or the same +Reads are just handled normally. It is up to the filesystem to ensure +one node doesn't read from a location where another node (or the same node) is writing. 2. DLM Locks for management -There are two locks for managing the device: +There are three groups of locks for managing the device: 2.1 Bitmap lock resource (bm_lockres) - The bm_lockres protects individual node bitmaps. They are named in the - form bitmap001 for node 1, bitmap002 for node and so on. When a node - joins the cluster, it acquires the lock in PW mode and it stays so - during the lifetime the node is part of the cluster. The lock resource - number is based on the slot number returned by the DLM subsystem. Since - DLM starts node count from one and bitmap slots start from zero, one is - subtracted from the DLM slot number to arrive at the bitmap slot number. + The bm_lockres protects individual node bitmaps. They are named in + the form bitmap000 for node 1, bitmap001 for node 2 and so on. When a + node joins the cluster, it acquires the lock in PW mode and it stays + so during the lifetime the node is part of the cluster. The lock + resource number is based on the slot number returned by the DLM + subsystem. Since DLM starts node count from one and bitmap slots + start from zero, one is subtracted from the DLM slot number to arrive + at the bitmap slot number. + + The LVB of the bitmap lock for a particular node records the range + of sectors that are being re-synced by that node. No other + node may write to those sectors. This is used when a new node + joins the cluster. + +2.2 Message passing locks + + Each node has to communicate with other nodes when starting or ending + resync, and for metadata superblock updates. 
This communication is + managed through three locks: "token", "message", and "ack", together + with the Lock Value Block (LVB) of the "message" lock. + +2.3 new-device management + + A single lock: "no-new-dev" is used to co-ordinate the addition of + new devices - this must be synchronized across the array. + Normally all nodes hold a concurrent-read lock on this device. 3. Communication -Each node has to communicate with other nodes when starting or ending -resync, and metadata superblock updates. + Messages can be broadcast to all nodes, and the sender waits for all + other nodes to acknowledge the message before proceeding. Only one + message can be processed at a time. 3.1 Message Types - There are 3 types, of messages which are passed + There are six types of messages which are passed: - 3.1.1 METADATA_UPDATED: informs other nodes that the metadata has been - updated, and the node must re-read the md superblock. This is performed - synchronously. + 3.1.1 METADATA_UPDATED: informs other nodes that the metadata has + been updated, and the node must re-read the md superblock. This is + performed synchronously. It is primarily used to signal device + failure. - 3.1.2 RESYNC: informs other nodes that a resync is initiated or ended - so that each node may suspend or resume the region. + 3.1.2 RESYNCING: informs other nodes that a resync is initiated or + ended so that each node may suspend or resume the region. Each + RESYNCING message identifies a range of the devices that the + sending node is about to resync. This over-rides any previous + notification from that node: only one range can be resynced at a + time per-node. + + 3.1.3 NEWDISK: informs other nodes that a device is being added to + the array. Message contains an identifier for that device. See + below for further details. + + 3.1.4 REMOVE: A failed or spare device is being removed from the + array. The slot-number of the device is included in the message. 
+ + 3.1.5 RE_ADD: A failed device is being re-activated - the assumption + is that it has been determined to be working again. + + 3.1.6 BITMAP_NEEDS_SYNC: if a node is stopped locally but the bitmap + isn't clean, then another node is informed to take the ownership of + resync. 3.2 Communication mechanism The DLM LVB is used to communicate within nodes of the cluster. There are three resources used for the purpose: - 3.2.1 Token: The resource which protects the entire communication + 3.2.1 token: The resource which protects the entire communication system. The node having the token resource is allowed to communicate. - 3.2.2 Message: The lock resource which carries the data to + 3.2.2 message: The lock resource which carries the data to communicate. - 3.2.3 Ack: The resource, acquiring which means the message has been + 3.2.3 ack: The resource, acquiring which means the message has been acknowledged by all nodes in the cluster. The BAST of the resource - is used to inform the receive node that a node wants to communicate. + is used to inform the receiving node that a node wants to + communicate. The algorithm is: - 1. receive status + 1. receive status - all nodes have concurrent-reader lock on "ack". - sender receiver receiver - ACK:CR ACK:CR ACK:CR + sender receiver receiver + "ack":CR "ack":CR "ack":CR - 2. sender get EX of TOKEN - sender get EX of MESSAGE + 2. sender get EX on "token" + sender get EX on "message" sender receiver receiver - TOKEN:EX ACK:CR ACK:CR - MESSAGE:EX - ACK:CR + "token":EX "ack":CR "ack":CR + "message":EX + "ack":CR - Sender checks that it still needs to send a message. Messages received - or other events that happened while waiting for the TOKEN may have made - this message inappropriate or redundant. + Sender checks that it still needs to send a message. Messages + received or other events that happened while waiting for the + "token" may have made this message inappropriate or redundant. - 3. sender write LVB. 
- sender down-convert MESSAGE from EX to CW - sender try to get EX of ACK - [ wait until all receiver has *processed* the MESSAGE ] + 3. sender writes LVB. + sender down-convert "message" from EX to CW + sender try to get EX of "ack" + [ wait until all receivers have *processed* the "message" ] - [ triggered by bast of ACK ] - receiver get CR of MESSAGE + [ triggered by bast of "ack" ] + receiver get CR on "message" receiver read LVB receiver processes the message [ wait finish ] - receiver release ACK + receiver releases "ack" + receiver tries to get PR on "message" - sender receiver receiver - TOKEN:EX MESSAGE:CR MESSAGE:CR - MESSAGE:CR - ACK:EX + sender receiver receiver + "token":EX "message":CR "message":CR + "message":CW + "ack":EX - 4. triggered by grant of EX on ACK (indicating all receivers have processed - message) - sender down-convert ACK from EX to CR - sender release MESSAGE - sender release TOKEN - receiver upconvert to PR of MESSAGE - receiver get CR of ACK - receiver release MESSAGE + 4. triggered by grant of EX on "ack" (indicating all receivers + have processed message) + sender down-converts "ack" from EX to CR + sender releases "message" + sender releases "token" + receiver upconvert to PR on "message" + receiver get CR of "ack" + receiver release "message" sender receiver receiver - ACK:CR ACK:CR ACK:CR + "ack":CR "ack":CR "ack":CR 4. Handling Failures 4.1 Node Failure - When a node fails, the DLM informs the cluster with the slot. The node - starts a cluster recovery thread. The cluster recovery thread: + + When a node fails, the DLM informs the cluster with the slot + number. The node starts a cluster recovery thread. 
The cluster + recovery thread: + - acquires the bitmap lock of the failed node - opens the bitmap - reads the bitmap of the failed node @@ -132,45 +176,143 @@ The algorithm is: - cleans the bitmap of the failed node - releases bitmap lock of the failed node - initiates resync of the bitmap on the current node + md_check_recovery is invoked within recover_bitmaps, + then md_check_recovery -> metadata_update_start/finish, + which locks the communication via lock_comm. + This means that while one node is resyncing it blocks all + other nodes from writing anywhere on the array. - The resync process, is the regular md resync. However, in a clustered + The resync process is the regular md resync. However, in a clustered environment when a resync is performed, it needs to tell other nodes of the areas which are suspended. Before a resync starts, the node - send out RESYNC_START with the (lo,hi) range of the area which needs - to be suspended. Each node maintains a suspend_list, which contains - the list of ranges which are currently suspended. On receiving - RESYNC_START, the node adds the range to the suspend_list. Similarly, - when the node performing resync finishes, it send RESYNC_FINISHED - to other nodes and other nodes remove the corresponding entry from - the suspend_list. + sends out RESYNCING with the (lo,hi) range of the area which needs to + be suspended. Each node maintains a suspend_list, which contains the + list of ranges which are currently suspended. On receiving RESYNCING, + the node adds the range to the suspend_list. Similarly, when the node + performing resync finishes, it sends RESYNCING with an empty range to + other nodes and other nodes remove the corresponding entry from the + suspend_list. - A helper function, should_suspend() can be used to check if a particular - I/O range should be suspended or not. + A helper function, ->area_resyncing() can be used to check if a + particular I/O range should be suspended or not. 
4.2 Device Failure + Device failures are handled and communicated with the metadata update - routine. + routine. When a node detects a device failure it does not allow + any further writes to that device until the failure has been + acknowledged by all other nodes. 5. Adding a new Device -For adding a new device, it is necessary that all nodes "see" the new device -to be added. For this, the following algorithm is used: + + For adding a new device, it is necessary that all nodes "see" the new + device to be added. For this, the following algorithm is used: 1. Node 1 issues mdadm --manage /dev/mdX --add /dev/sdYY which issues - ioctl(ADD_NEW_DISC with disc.state set to MD_DISK_CLUSTER_ADD) - 2. Node 1 sends NEWDISK with uuid and slot number + ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CLUSTER_ADD) + 2. Node 1 sends a NEWDISK message with uuid and slot number 3. Other nodes issue kobject_uevent_env with uuid and slot number (Steps 4,5 could be a udev rule) 4. In userspace, the node searches for the disk, perhaps using blkid -t SUB_UUID="" - 5. Other nodes issue either of the following depending on whether the disk - was found: + 5. Other nodes issue either of the following depending on whether + the disk was found: ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CANDIDATE and - disc.number set to slot number) + disc.number set to slot number) ioctl(CLUSTERED_DISK_NACK) - 6. Other nodes drop lock on no-new-devs (CR) if device is found - 7. Node 1 attempts EX lock on no-new-devs - 8. If node 1 gets the lock, it sends METADATA_UPDATED after unmarking the disk - as SpareLocal - 9. If not (get no-new-dev lock), it fails the operation and sends METADATA_UPDATED - 10. Other nodes get the information whether a disk is added or not - by the following METADATA_UPDATED. + 6. Other nodes drop lock on "no-new-devs" (CR) if device is found + 7. Node 1 attempts EX lock on "no-new-dev" + 8. 
If node 1 gets the lock, it sends METADATA_UPDATED after + unmarking the disk as SpareLocal + 9. If it does not get the "no-new-dev" lock, it fails the operation and sends + METADATA_UPDATED. + 10. Other nodes get the information whether a disk is added or not + by the following METADATA_UPDATED. + +6. Module interface. + + There are 17 call-backs which the md core can make to the cluster + module. Understanding these can give a good overview of the whole + process. + +6.1 join(nodes) and leave() + + These are called when an array is started with a clustered bitmap, + and when the array is stopped. join() ensures the cluster is + available and initializes the various resources. + Only the first 'nodes' nodes in the cluster can use the array. + +6.2 slot_number() + + Reports the slot number advised by the cluster infrastructure. + Range is from 0 to nodes-1. + +6.3 resync_info_update() + + This updates the resync range that is stored in the bitmap lock. + The starting point is updated as the resync progresses. The + end point is always the end of the array. + It does *not* send a RESYNCING message. + +6.4 resync_start(), resync_finish() + + These are called when resync/recovery/reshape starts or stops. + They update the resyncing range in the bitmap lock and also + send a RESYNCING message. resync_start reports the whole + array as resyncing, resync_finish reports none of it. + + resync_finish() also sends a BITMAP_NEEDS_SYNC message which + allows some other node to take over. + +6.5 metadata_update_start(), metadata_update_finish(), + metadata_update_cancel(). + + metadata_update_start is used to get exclusive access to + the metadata. If a change is still needed once that access is + gained, metadata_update_finish() will send a METADATA_UPDATED + message to all other nodes, otherwise metadata_update_cancel() + can be used to release the lock. + +6.6 area_resyncing() + + This combines two elements of functionality. 
+ + Firstly, it will check if any node is currently resyncing + anything in a given range of sectors. If any resync is found, + then the caller will avoid writing or read-balancing in that + range. + + Secondly, while node recovery is happening it reports that + all areas are resyncing for READ requests. This avoids races + between the cluster-filesystem and the cluster-RAID handling + a node failure. + +6.7 add_new_disk_start(), add_new_disk_finish(), new_disk_ack() + + These are used to manage the new-disk protocol described above. + When a new device is added, add_new_disk_start() is called before + it is bound to the array and, if that succeeds, add_new_disk_finish() + is called once the device is fully added. + + When a device is added in acknowledgement to a previous + request, or when the device is declared "unavailable", + new_disk_ack() is called. + +6.8 remove_disk() + + This is called when a spare or failed device is removed from + the array. It causes a REMOVE message to be sent to other nodes. + +6.9 gather_bitmaps() + + This sends a RE_ADD message to all other nodes and then + gathers bitmap information from all bitmaps. This combined + bitmap is then used to recover the re-added device. + +6.10 lock_all_bitmaps() and unlock_all_bitmaps() + + These are called when changing the bitmap to none. If a node plans + to clear the cluster raid's bitmap, it needs to make sure no other + nodes are using the raid, which is achieved by locking all bitmap + locks within the cluster; those locks are also unlocked + accordingly. From 15858fa5b00c1067a8a8e53ea32f4a65f8bebbb8 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 21 Dec 2015 10:51:00 +1100 Subject: [PATCH 08/26] md-cluster: Defer MD reloading to mddev->thread Reloading of superblock must be performed under reconfig_mutex. However, this cannot be done with md_reload_sb because it would deadlock with the message DLM lock. So, we defer it in md_check_recovery() which is executed by mddev->thread. 
This introduces a new flag, MD_RELOAD_SB, which if set, will reload the superblock. And good_device_nr is also added to 'struct mddev' which is used to get the num of the good device within cluster raid. Signed-off-by: Goldwyn Rodrigues Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- drivers/md/md-cluster.c | 4 +++- drivers/md/md.c | 4 ++++ drivers/md/md.h | 4 ++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index db9375f501ab..b659ef7b8daf 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -432,8 +432,10 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) { struct md_cluster_info *cinfo = mddev->cluster_info; - md_reload_sb(mddev, le32_to_cpu(msg->raid_slot)); + mddev->good_device_nr = le32_to_cpu(msg->raid_slot); + set_bit(MD_RELOAD_SB, &mddev->flags); dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); + md_wakeup_thread(mddev->thread); } static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) diff --git a/drivers/md/md.c b/drivers/md/md.c index 495d8aa0a0d2..504ce5d068ce 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8286,6 +8286,7 @@ void md_check_recovery(struct mddev *mddev) (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || + test_bit(MD_RELOAD_SB, &mddev->flags) || (mddev->external == 0 && mddev->safemode == 1) || (mddev->safemode == 2 && ! 
atomic_read(&mddev->writes_pending) && !mddev->in_sync && mddev->recovery_cp == MaxSector) @@ -8334,6 +8335,9 @@ void md_check_recovery(struct mddev *mddev) rdev->raid_disk < 0) md_kick_rdev_from_array(rdev); } + + if (test_and_clear_bit(MD_RELOAD_SB, &mddev->flags)) + md_reload_sb(mddev, mddev->good_device_nr); } if (!mddev->external) { diff --git a/drivers/md/md.h b/drivers/md/md.h index f7b17aef837d..8817e623258a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -235,6 +235,9 @@ struct mddev { */ #define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */ #define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */ +#define MD_RELOAD_SB 7 /* Reload the superblock because another node + * updated it. + */ int suspended; atomic_t active_io; @@ -465,6 +468,7 @@ struct mddev { struct work_struct event_work; /* used by dm to report failure event */ void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; + unsigned int good_device_nr; /* good device num within cluster raid */ }; static inline int __must_check mddev_lock(struct mddev *mddev) From 8b9277c81450de9d8081ff6571ac5986e6c83f49 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 21 Dec 2015 10:51:00 +1100 Subject: [PATCH 09/26] md-cluster: Protect communication with mutexes Communication can happen through multiple threads. It is possible that one thread steps over another threads sequence. So, we use mutexes to protect both the send and receive sequences. Send communication is locked through state bit, MD_CLUSTER_SEND_LOCK. Communication is locked with bit manipulation in order to allow "lock and hold" for the add operation. In case of an add operation, if the lock is held, MD_CLUSTER_SEND_LOCKED_ALREADY is set. When md_update_sb() calls metadata_update_start(), it checks (in a single statement to avoid races), if the communication is already locked. If yes, it merely returns zero, else it locks the token lockresource. 
Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/md-cluster.c | 73 +++++++++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 10 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index b659ef7b8daf..ad3ec7df1547 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -48,12 +48,26 @@ struct resync_info { #define MD_CLUSTER_SUSPEND_READ_BALANCING 2 #define MD_CLUSTER_BEGIN_JOIN_CLUSTER 3 +/* Lock the send communication. This is done through + * bit manipulation as opposed to a mutex in order to + * accomodate lock and hold. See next comment. + */ +#define MD_CLUSTER_SEND_LOCK 4 +/* If cluster operations must lock the communication channel, + * so as to perform extra operations (and no other operation + * is allowed on the MD, such as adding a disk. Token needs + * to be locked and held until the operation completes with + * a md_update_sb(), which would eventually release the lock. + */ +#define MD_CLUSTER_SEND_LOCKED_ALREADY 5 + struct md_cluster_info { /* dlm lock space and resources for clustered raid. 
*/ dlm_lockspace_t *lockspace; int slot_number; struct completion completion; + struct mutex recv_mutex; struct dlm_lock_resource *bitmap_lockres; struct dlm_lock_resource **other_bitmap_lockres; struct dlm_lock_resource *resync_lockres; @@ -68,6 +82,7 @@ struct md_cluster_info { struct dlm_lock_resource *no_new_dev_lockres; struct md_thread *recv_thread; struct completion newdisk_completion; + wait_queue_head_t wait; unsigned long state; }; @@ -508,9 +523,11 @@ static void recv_daemon(struct md_thread *thread) struct cluster_msg msg; int ret; + mutex_lock(&cinfo->recv_mutex); /*get CR on Message*/ if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) { pr_err("md/raid1:failed to get CR on MESSAGE\n"); + mutex_unlock(&cinfo->recv_mutex); return; } @@ -534,33 +551,45 @@ static void recv_daemon(struct md_thread *thread) ret = dlm_unlock_sync(message_lockres); if (unlikely(ret != 0)) pr_info("unlock msg failed return %d\n", ret); + mutex_unlock(&cinfo->recv_mutex); } -/* lock_comm() +/* lock_token() * Takes the lock on the TOKEN lock resource so no other * node can communicate while the operation is underway. - * If called again, and the TOKEN lock is alread in EX mode - * return success. However, care must be taken that unlock_comm() - * is called only once. */ -static int lock_comm(struct md_cluster_info *cinfo) +static int lock_token(struct md_cluster_info *cinfo) { int error; - if (cinfo->token_lockres->mode == DLM_LOCK_EX) - return 0; - error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); if (error) pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", __func__, __LINE__, error); + + /* Lock the receive sequence */ + mutex_lock(&cinfo->recv_mutex); return error; } +/* lock_comm() + * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. 
+ */ +static int lock_comm(struct md_cluster_info *cinfo) +{ + wait_event(cinfo->wait, + !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state)); + + return lock_token(cinfo); +} + static void unlock_comm(struct md_cluster_info *cinfo) { WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX); + mutex_unlock(&cinfo->recv_mutex); dlm_unlock_sync(cinfo->token_lockres); + clear_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state); + wake_up(&cinfo->wait); } /* __sendmsg() @@ -713,6 +742,8 @@ static int join(struct mddev *mddev, int nodes) spin_lock_init(&cinfo->suspend_lock); init_completion(&cinfo->completion); set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state); + init_waitqueue_head(&cinfo->wait); + mutex_init(&cinfo->recv_mutex); mddev->cluster_info = cinfo; @@ -843,9 +874,25 @@ static int slot_number(struct mddev *mddev) return cinfo->slot_number - 1; } +/* + * Check if the communication is already locked, else lock the communication + * channel. + * If it is already locked, token is in EX mode, and hence lock_token() + * should not be called. 
+ */ static int metadata_update_start(struct mddev *mddev) { - return lock_comm(mddev->cluster_info); + struct md_cluster_info *cinfo = mddev->cluster_info; + + wait_event(cinfo->wait, + !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) || + test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state)); + + /* If token is already locked, return 0 */ + if (cinfo->token_lockres->mode == DLM_LOCK_EX) + return 0; + + return lock_token(cinfo); } static int metadata_update_finish(struct mddev *mddev) @@ -870,6 +917,7 @@ static int metadata_update_finish(struct mddev *mddev) ret = __sendmsg(cinfo, &cmsg); } else pr_warn("md-cluster: No good device id found to send\n"); + clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); unlock_comm(cinfo); return ret; } @@ -877,6 +925,7 @@ static int metadata_update_finish(struct mddev *mddev) static void metadata_update_cancel(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; + clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); unlock_comm(cinfo); } @@ -970,14 +1019,18 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) ret = -ENOENT; if (ret) unlock_comm(cinfo); - else + else { dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); + set_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); + wake_up(&cinfo->wait); + } return ret; } static void add_new_disk_cancel(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; + clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); unlock_comm(cinfo); } From e19508fa4df896b115f5321c21ce7669559b0863 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 21 Dec 2015 10:51:01 +1100 Subject: [PATCH 10/26] md-cluster: update comments for MD_CLUSTER_SEND_LOCKED_ALREADY 1. fix unbalanced parentheses. 2. add more description about that MD_CLUSTER_SEND_LOCKED_ALREADY will be cleared after set it in add_new_disk. 
Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- drivers/md/md-cluster.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index ad3ec7df1547..0ded8e97751d 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -53,11 +53,12 @@ struct resync_info { * accomodate lock and hold. See next comment. */ #define MD_CLUSTER_SEND_LOCK 4 -/* If cluster operations must lock the communication channel, - * so as to perform extra operations (and no other operation - * is allowed on the MD, such as adding a disk. Token needs - * to be locked and held until the operation completes with - * a md_update_sb(), which would eventually release the lock. +/* If cluster operations (such as adding a disk) must lock the + * communication channel, so as to perform extra operations + * (update metadata) and no other operation is allowed on the + * MD. Token needs to be locked and held until the operation + * completes witha md_update_sb(), which would eventually release + * the lock. */ #define MD_CLUSTER_SEND_LOCKED_ALREADY 5 @@ -1021,6 +1022,18 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) unlock_comm(cinfo); else { dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); + /* Since MD_CHANGE_DEVS will be set in add_bound_rdev which + * will run soon after add_new_disk, the below path will be + * invoked: + * md_wakeup_thread(mddev->thread) + * -> conf->thread (raid1d) + * -> md_check_recovery -> md_update_sb + * -> metadata_update_start/finish + * MD_CLUSTER_SEND_LOCKED_ALREADY will be cleared eventually. + * + * For other failure cases, metadata_update_cancel and + * add_new_disk_cancel also clear below bit as well. 
+ * */ set_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); wake_up(&cinfo->wait); } From abf3508d8faa281e01a780e022a6f43d1731fe0b Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 21 Dec 2015 10:51:01 +1100 Subject: [PATCH 11/26] md: update comment for md_allow_write MD_CHANGE_CLEAN had been replaced with MD_CHANGE_PENDING after commit 070dc6 ("md: resolve confusion of MD_CHANGE_CLEAN"), so make the change accordingly. Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 504ce5d068ce..f71a81b37d08 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7714,7 +7714,7 @@ EXPORT_SYMBOL(md_write_end); * attempting a GFP_KERNEL allocation while holding the mddev lock. * Must be called with mddev_lock held. * - * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock + * In the ->external case MD_CHANGE_PENDING can not be cleared until mddev->lock * is dropped, so return -EAGAIN after notifying userspace. */ int md_allow_write(struct mddev *mddev) From 3848c0bcb09c7b78e6f4ae9f8fc8d6d9aecbd35a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Dec 2015 10:51:01 +1100 Subject: [PATCH 12/26] raid5-cache: simplify r5l_move_io_unit_list It's only used for one kind of move, so make that explicit. Also clean up the code a bit by using list_for_each_safe. 
Signed-off-by: Christoph Hellwig Reviewed-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5-cache.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index b887e04d7e5c..3699c4704ba8 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -156,21 +156,6 @@ static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io) kmem_cache_free(log->io_kc, io); } -static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to, - enum r5l_io_unit_state state) -{ - struct r5l_io_unit *io; - - while (!list_empty(from)) { - io = list_first_entry(from, struct r5l_io_unit, log_sibling); - /* don't change list order */ - if (io->state >= state) - list_move_tail(&io->log_sibling, to); - else - break; - } -} - static void __r5l_set_io_unit_state(struct r5l_io_unit *io, enum r5l_io_unit_state state) { @@ -206,6 +191,20 @@ static void r5l_log_run_stripes(struct r5l_log *log) } } +static void r5l_move_to_end_ios(struct r5l_log *log) +{ + struct r5l_io_unit *io, *next; + + assert_spin_locked(&log->io_list_lock); + + list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { + /* don't change list order */ + if (io->state < IO_UNIT_IO_END) + break; + list_move_tail(&io->log_sibling, &log->io_end_ios); + } +} + static void r5l_log_endio(struct bio *bio) { struct r5l_io_unit *io = bio->bi_private; @@ -220,8 +219,7 @@ static void r5l_log_endio(struct bio *bio) spin_lock_irqsave(&log->io_list_lock, flags); __r5l_set_io_unit_state(io, IO_UNIT_IO_END); if (log->need_cache_flush) - r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios, - IO_UNIT_IO_END); + r5l_move_to_end_ios(log); else r5l_log_run_stripes(log); spin_unlock_irqrestore(&log->io_list_lock, flags); From ad66d445ee5a5f548142b880e1642c711fbcacd1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Dec 2015 10:51:01 +1100 Subject: [PATCH 13/26] raid5-cache: 
free meta_page earlier Once the I/O completed we don't need the meta page anymore. As the iounits can live on for a long time this reduces memory pressure a bit. Signed-off-by: Christoph Hellwig Reviewed-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5-cache.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 3699c4704ba8..668e973f07e6 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -150,12 +150,6 @@ static bool r5l_has_free_space(struct r5l_log *log, sector_t size) return log->device_size > used_size + size; } -static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io) -{ - __free_page(io->meta_page); - kmem_cache_free(log->io_kc, io); -} - static void __r5l_set_io_unit_state(struct r5l_io_unit *io, enum r5l_io_unit_state state) { @@ -215,6 +209,7 @@ static void r5l_log_endio(struct bio *bio) md_error(log->rdev->mddev, log->rdev); bio_put(bio); + __free_page(io->meta_page); spin_lock_irqsave(&log->io_list_lock, flags); __r5l_set_io_unit_state(io, IO_UNIT_IO_END); @@ -552,7 +547,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) log->next_cp_seq = io->seq; list_del(&io->log_sibling); - r5l_free_io_unit(log, io); + kmem_cache_free(log->io_kc, io); found = true; } From 3312c951efaba55080958974047414576b9e5d63 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 21 Dec 2015 10:51:01 +1100 Subject: [PATCH 14/26] md: avoid warning for 32-bit sector_t When CONFIG_LBDAF is not set, sector_t is only 32-bits wide, which means we cannot have devices with more than 2TB, and the code that is trying to handle compatibility support for large devices in md version 0.90 is meaningless but also causes a compile-time warning: drivers/md/md.c: In function 'super_90_load': drivers/md/md.c:1029:19: warning: large integer implicitly truncated to unsigned type [-Woverflow] drivers/md/md.c: In function 'super_90_rdev_size_change': 
drivers/md/md.c:1323:17: warning: large integer implicitly truncated to unsigned type [-Woverflow] This adds a check for CONFIG_LBDAF to avoid even getting into this code path, and also adds an explicit cast to let the compiler know it doesn't have to warn about the truncation. Signed-off-by: Arnd Bergmann Signed-off-by: NeilBrown --- drivers/md/md.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index f71a81b37d08..3d70d0d11b95 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1026,8 +1026,9 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor * (not needed for Linear and RAID0 as metadata doesn't * record this size) */ - if (rdev->sectors >= (2ULL << 32) && sb->level >= 1) - rdev->sectors = (2ULL << 32) - 2; + if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) && + sb->level >= 1) + rdev->sectors = (sector_t)(2ULL << 32) - 2; if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) /* "this cannot possibly happen" ... */ @@ -1320,8 +1321,9 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) /* Limit to 4TB as metadata cannot record more than that. * 4TB == 2^32 KB, or 2*2^32 sectors. */ - if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) - num_sectors = (2ULL << 32) - 2; + if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) && + rdev->mddev->level >= 1) + num_sectors = (sector_t)(2ULL << 32) - 2; md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); md_super_wait(rdev->mddev); From 9ebc6ef188a0656f3620835f9be7fe22c1644c1c Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Mon, 21 Dec 2015 10:51:01 +1100 Subject: [PATCH 15/26] drivers: md: use ktime_get_real_seconds() get_seconds() API is not y2038 safe on 32 bit systems and the API is deprecated. Replace it with calls to ktime_get_real_seconds() API instead. Change mddev structure types to time64_t accordingly. 
32 bit signed timestamps will overflow in the year 2038. Change the user interface mdu_array_info_s structure timestamps: ctime and utime values used in ioctls GET_ARRAY_INFO and SET_ARRAY_INFO to unsigned int. This will extend the field to last until the year 2106. The long term plan is to get rid of ctime and utime values in this structure as this information can be read from the on-disk meta data directly. Clamp the time64_t timestamps to positive values with a max of U32_MAX when returning from GET_ARRAY_INFO ioctl to accommodate above changes in the data type of timestamps to unsigned int. v0.90 on disk meta data uses u32 for maintaining time stamps. So this will also last until year 2106. Assumption is that the usage of v0.90 will be deprecated by year 2106. Timestamp fields in the on disk meta data for v1.0 version already use 64 bit data types. Remove the truncation of the bits while writing to or reading from these from the disk. Signed-off-by: Deepa Dinamani Reviewed-by: Arnd Bergmann Signed-off-by: NeilBrown --- drivers/md/md.c | 18 +++++++++--------- drivers/md/md.h | 2 +- include/uapi/linux/raid/md_u.h | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3d70d0d11b95..d0f0621bf9b0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1200,13 +1200,13 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) memcpy(&sb->set_uuid2, mddev->uuid+8, 4); memcpy(&sb->set_uuid3, mddev->uuid+12,4); - sb->ctime = mddev->ctime; + sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); sb->level = mddev->level; sb->size = mddev->dev_sectors / 2; sb->raid_disks = mddev->raid_disks; sb->md_minor = mddev->md_minor; sb->not_persistent = 0; - sb->utime = mddev->utime; + sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); sb->state = 0; sb->events_hi = (mddev->events>>32); sb->events_lo = (u32)mddev->events; @@ -1547,8 +1547,8 @@ static int super_1_validate(struct mddev *mddev, struct
md_rdev *rdev) mddev->patch_version = 0; mddev->external = 0; mddev->chunk_sectors = le32_to_cpu(sb->chunksize); - mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); - mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); + mddev->ctime = le64_to_cpu(sb->ctime); + mddev->utime = le64_to_cpu(sb->utime); mddev->level = le32_to_cpu(sb->level); mddev->clevel[0] = 0; mddev->layout = le32_to_cpu(sb->layout); @@ -2336,7 +2336,7 @@ repeat: spin_lock(&mddev->lock); - mddev->utime = get_seconds(); + mddev->utime = ktime_get_real_seconds(); if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) force_change = 1; @@ -5843,7 +5843,7 @@ static int get_array_info(struct mddev *mddev, void __user *arg) info.major_version = mddev->major_version; info.minor_version = mddev->minor_version; info.patch_version = MD_PATCHLEVEL_VERSION; - info.ctime = mddev->ctime; + info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); info.level = mddev->level; info.size = mddev->dev_sectors / 2; if (info.size != mddev->dev_sectors / 2) /* overflow */ @@ -5853,7 +5853,7 @@ static int get_array_info(struct mddev *mddev, void __user *arg) info.md_minor = mddev->md_minor; info.not_persistent= !mddev->persistent; - info.utime = mddev->utime; + info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); info.state = 0; if (mddev->in_sync) info.state = (1<ctime = get_seconds(); + mddev->ctime = ktime_get_real_seconds(); return 0; } mddev->major_version = MD_MAJOR_VERSION; mddev->minor_version = MD_MINOR_VERSION; mddev->patch_version = MD_PATCHLEVEL_VERSION; - mddev->ctime = get_seconds(); + mddev->ctime = ktime_get_real_seconds(); mddev->level = info->level; mddev->clevel[0] = 0; diff --git a/drivers/md/md.h b/drivers/md/md.h index 8817e623258a..e16a17c37418 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -264,7 +264,7 @@ struct mddev { * managed externally */ char metadata_type[17]; /* externally set*/ int chunk_sectors; - time_t ctime, utime; + time64_t ctime, utime; int level, layout; 
char clevel[16]; int raid_disks; diff --git a/include/uapi/linux/raid/md_u.h b/include/uapi/linux/raid/md_u.h index 1cb8aa6850b5..36cd8210a5d1 100644 --- a/include/uapi/linux/raid/md_u.h +++ b/include/uapi/linux/raid/md_u.h @@ -80,7 +80,7 @@ typedef struct mdu_array_info_s { int major_version; int minor_version; int patch_version; - int ctime; + unsigned int ctime; int level; int size; int nr_disks; @@ -91,7 +91,7 @@ typedef struct mdu_array_info_s { /* * Generic state information */ - int utime; /* 0 Superblock update time */ + unsigned int utime; /* 0 Superblock update time */ int state; /* 1 State bits (clean, ...) */ int active_disks; /* 2 Number of currently active disks */ int working_disks; /* 3 Number of working disks */ From f6b6ec5cfac306c1eea66f074050864efcb11851 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 21 Dec 2015 10:51:02 +1100 Subject: [PATCH 16/26] raid5-cache: add journal hot add/remove support Add support for journal disk hot add/remove. Mostly trivial checks in md part. The raid5 part is a little tricky. For hot-remove, we can't wait for pending writes as it's called from raid5d. The wait will cause deadlock. We simply fail the hot-remove. A hot-remove retry can succeed eventually since if journal disk is faulty all pending writes will fail and finish. For hot-add, since an array supporting journal but without journal disk will be marked read-only, we are safe to hot add journal without stopping IO (should be read IO, while journal only handles write IO).
Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 42 ++++++++++++++++++++++++++++------------ drivers/md/raid5-cache.c | 16 +++++++++++---- drivers/md/raid5.c | 34 ++++++++++++++++++++++++-------- 3 files changed, 68 insertions(+), 24 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index d0f0621bf9b0..c0c3e6dec248 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2055,8 +2055,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) return -EEXIST; /* make sure rdev->sectors exceeds mddev->dev_sectors */ - if (rdev->sectors && (mddev->dev_sectors == 0 || - rdev->sectors < mddev->dev_sectors)) { + if (!test_bit(Journal, &rdev->flags) && + rdev->sectors && + (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care @@ -2087,7 +2088,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) } } rcu_read_unlock(); - if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { + if (!test_bit(Journal, &rdev->flags) && + mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { printk(KERN_WARNING "md: %s: array is limited to %d devices\n", mdname(mddev), mddev->max_disks); return -EBUSY; @@ -6044,8 +6046,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) else clear_bit(WriteMostly, &rdev->flags); - if (info->state & (1<state & (1<flags)) { + has_journal = true; + break; + } + } + if (has_journal) { + export_rdev(rdev); + return -EBUSY; + } set_bit(Journal, &rdev->flags); + } /* * check whether the device shows up in other nodes */ @@ -8181,19 +8198,20 @@ static int remove_and_add_spares(struct mddev *mddev, continue; if (test_bit(Faulty, &rdev->flags)) continue; - if (test_bit(Journal, &rdev->flags)) - continue; - if (mddev->ro && - ! 
(rdev->saved_raid_disk >= 0 && - !test_bit(Bitmap_sync, &rdev->flags))) - continue; + if (!test_bit(Journal, &rdev->flags)) { + if (mddev->ro && + ! (rdev->saved_raid_disk >= 0 && + !test_bit(Bitmap_sync, &rdev->flags))) + continue; - rdev->recovery_offset = 0; + rdev->recovery_offset = 0; + } if (mddev->pers-> hot_add_disk(mddev, rdev) == 0) { if (sysfs_link_rdev(mddev, rdev)) /* failure here is OK */; - spares++; + if (!test_bit(Journal, &rdev->flags)) + spares++; md_new_event(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 668e973f07e6..c1c4d213a2c2 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -799,10 +799,18 @@ void r5l_quiesce(struct r5l_log *log, int state) bool r5l_log_disk_error(struct r5conf *conf) { + struct r5l_log *log; + bool ret; /* don't allow write if journal disk is missing */ - if (!conf->log) - return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); - return test_bit(Faulty, &conf->log->rdev->flags); + rcu_read_lock(); + log = rcu_dereference(conf->log); + + if (!log) + ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); + else + ret = test_bit(Faulty, &log->rdev->flags); + rcu_read_unlock(); + return ret; } struct r5l_recovery_ctx { @@ -1165,7 +1173,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (r5l_load_log(log)) goto error; - conf->log = log; + rcu_assign_pointer(conf->log, log); return 0; error: md_unregister_thread(&log->reclaim_thread); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 22362505f810..a086014dcd49 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7139,14 +7139,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct disk_info *p = conf->disks + number; print_raid5_conf(conf); - if (test_bit(Journal, &rdev->flags)) { + if (test_bit(Journal, &rdev->flags) && conf->log) { + struct r5l_log *log; /* - * journal disk is not removable, but we need give a chance to - 
* update superblock of other disks. Otherwise journal disk - * will be considered as 'fresh' + * we can't wait pending write here, as this is called in + * raid5d, wait will deadlock. */ - set_bit(MD_CHANGE_DEVS, &mddev->flags); - return -EINVAL; + if (atomic_read(&mddev->writes_pending)) + return -EBUSY; + log = conf->log; + conf->log = NULL; + synchronize_rcu(); + r5l_exit_log(log); + return 0; } if (rdev == p->rdev) rdevp = &p->rdev; @@ -7210,8 +7215,21 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) int first = 0; int last = conf->raid_disks - 1; - if (test_bit(Journal, &rdev->flags)) - return -EINVAL; + if (test_bit(Journal, &rdev->flags)) { + char b[BDEVNAME_SIZE]; + if (conf->log) + return -EBUSY; + + rdev->raid_disk = 0; + /* + * The array is in readonly mode if journal is missing, so no + * write requests running. We should be safe + */ + r5l_init_log(conf, rdev); + printk(KERN_INFO"md/raid:%s: using device %s as journal\n", + mdname(mddev), bdevname(rdev->bdev, b)); + return 0; + } if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; From c38d29b33bb3b3c792f3cca8a973422bb1897ebf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Dec 2015 10:51:02 +1100 Subject: [PATCH 17/26] raid5-cache: use a bio_set This allows us to make guaranteed forward progress. Signed-off-by: Christoph Hellwig Signed-off-by: NeilBrown --- drivers/md/raid5-cache.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index c1c4d213a2c2..2a644977d90c 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -34,6 +34,12 @@ #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ #define RECLAIM_MAX_FREE_SPACE_SHIFT (2) +/* + * We only need 2 bios per I/O unit to make progress, but ensure we + * have a few more available to not get too tight. 
+ */ +#define R5L_POOL_SIZE 4 + struct r5l_log { struct md_rdev *rdev; @@ -70,6 +76,7 @@ struct r5l_log { struct bio flush_bio; struct kmem_cache *io_kc; + struct bio_set *bs; struct md_thread *reclaim_thread; unsigned long reclaim_target; /* number of space that need to be @@ -248,7 +255,7 @@ static void r5l_submit_current_io(struct r5l_log *log) static struct bio *r5l_bio_alloc(struct r5l_log *log) { - struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES); + struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs); bio->bi_rw = WRITE; bio->bi_bdev = log->rdev->bdev; @@ -1161,6 +1168,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (!log->io_kc) goto io_kc; + log->bs = bioset_create(R5L_POOL_SIZE, 0); + if (!log->bs) + goto io_bs; + log->reclaim_thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, "reclaim"); if (!log->reclaim_thread) @@ -1178,6 +1189,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) error: md_unregister_thread(&log->reclaim_thread); reclaim_thread: + bioset_free(log->bs); +io_bs: kmem_cache_destroy(log->io_kc); io_kc: kfree(log); @@ -1187,6 +1200,7 @@ io_kc: void r5l_exit_log(struct r5l_log *log) { md_unregister_thread(&log->reclaim_thread); + bioset_free(log->bs); kmem_cache_destroy(log->io_kc); kfree(log); } From e8deb6381051bf3ce9d817020e8ba972b405a070 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Dec 2015 10:51:02 +1100 Subject: [PATCH 18/26] raid5-cache: use a mempool for the metadata block We only have a limited number in flight, so use a page based mempool. 
Signed-off-by: Christoph Hellwig Signed-off-by: NeilBrown --- drivers/md/raid5-cache.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 2a644977d90c..fa2d6321f3a4 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -77,6 +77,7 @@ struct r5l_log { struct kmem_cache *io_kc; struct bio_set *bs; + mempool_t *meta_pool; struct md_thread *reclaim_thread; unsigned long reclaim_target; /* number of space that need to be @@ -216,7 +217,7 @@ static void r5l_log_endio(struct bio *bio) md_error(log->rdev->mddev, log->rdev); bio_put(bio); - __free_page(io->meta_page); + mempool_free(io->meta_page, log->meta_pool); spin_lock_irqsave(&log->io_list_lock, flags); __r5l_set_io_unit_state(io, IO_UNIT_IO_END); @@ -293,8 +294,9 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) INIT_LIST_HEAD(&io->stripe_list); io->state = IO_UNIT_RUNNING; - io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO); + io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); block = page_address(io->meta_page); + clear_page(block); block->magic = cpu_to_le32(R5LOG_MAGIC); block->version = R5LOG_VERSION; block->seq = cpu_to_le64(log->seq); @@ -1172,6 +1174,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (!log->bs) goto io_bs; + log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0); + if (!log->meta_pool) + goto out_mempool; + log->reclaim_thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, "reclaim"); if (!log->reclaim_thread) @@ -1186,9 +1192,12 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) rcu_assign_pointer(conf->log, log); return 0; + error: md_unregister_thread(&log->reclaim_thread); reclaim_thread: + mempool_destroy(log->meta_pool); +out_mempool: bioset_free(log->bs); io_bs: kmem_cache_destroy(log->io_kc); @@ -1200,6 +1209,7 @@ io_kc: void r5l_exit_log(struct r5l_log *log) { 
md_unregister_thread(&log->reclaim_thread); + mempool_destroy(log->meta_pool); bioset_free(log->bs); kmem_cache_destroy(log->io_kc); kfree(log); From 5036c3902054358ee293b8cecfea13342d8019e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Dec 2015 10:51:02 +1100 Subject: [PATCH 19/26] raid5: allow r5l_io_unit allocations to fail And propagate the error up the stack so we can add the stripe to no_stripes_list and retry our log operation later. This avoids blocking raid5d due to reclaim, an it allows to get rid of the deadlock-prone GFP_NOFAIL allocation. shli: add missing mempool_destroy() Signed-off-by: Christoph Hellwig Signed-off-by: NeilBrown --- drivers/md/raid5-cache.c | 67 ++++++++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index fa2d6321f3a4..6d2b4789a928 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -75,7 +75,10 @@ struct r5l_log { struct list_head finished_ios; /* io_units which settle down in log disk */ struct bio flush_bio; + struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */ + struct kmem_cache *io_kc; + mempool_t *io_pool; struct bio_set *bs; mempool_t *meta_pool; @@ -287,8 +290,11 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) struct r5l_io_unit *io; struct r5l_meta_block *block; - /* We can't handle memory allocate failure so far */ - io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL); + io = mempool_alloc(log->io_pool, GFP_ATOMIC); + if (!io) + return NULL; + memset(io, 0, sizeof(*io)); + io->log = log; INIT_LIST_HEAD(&io->log_sibling); INIT_LIST_HEAD(&io->stripe_list); @@ -326,8 +332,12 @@ static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) log->current_io->meta_offset + payload_size > PAGE_SIZE) r5l_submit_current_io(log); - if (!log->current_io) + if (!log->current_io) { log->current_io = r5l_new_meta(log); + if (!log->current_io) + return 
-ENOMEM; + } + return 0; } @@ -372,11 +382,12 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page) r5_reserve_log_entry(log, io); } -static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, +static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, int data_pages, int parity_pages) { int i; int meta_size; + int ret; struct r5l_io_unit *io; meta_size = @@ -385,7 +396,10 @@ static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, sizeof(struct r5l_payload_data_parity) + sizeof(__le32) * parity_pages; - r5l_get_meta(log, meta_size); + ret = r5l_get_meta(log, meta_size); + if (ret) + return ret; + io = log->current_io; for (i = 0; i < sh->disks; i++) { @@ -415,6 +429,8 @@ static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, list_add_tail(&sh->log_list, &io->stripe_list); atomic_inc(&io->pending_stripe); sh->log_io = io; + + return 0; } static void r5l_wake_reclaim(struct r5l_log *log, sector_t space); @@ -429,6 +445,7 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) int meta_size; int reserve; int i; + int ret = 0; if (!log) return -EAGAIN; @@ -477,17 +494,22 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) mutex_lock(&log->io_mutex); /* meta + data */ reserve = (1 + write_disks) << (PAGE_SHIFT - 9); - if (r5l_has_free_space(log, reserve)) - r5l_log_stripe(log, sh, data_pages, parity_pages); - else { + if (!r5l_has_free_space(log, reserve)) { spin_lock(&log->no_space_stripes_lock); list_add_tail(&sh->log_list, &log->no_space_stripes); spin_unlock(&log->no_space_stripes_lock); r5l_wake_reclaim(log, reserve); + } else { + ret = r5l_log_stripe(log, sh, data_pages, parity_pages); + if (ret) { + spin_lock_irq(&log->io_list_lock); + list_add_tail(&sh->log_list, &log->no_mem_stripes); + spin_unlock_irq(&log->io_list_lock); + } } - mutex_unlock(&log->io_mutex); + mutex_unlock(&log->io_mutex); return 0; } @@ -540,6 +562,21 @@ static sector_t 
r5l_reclaimable_space(struct r5l_log *log) log->next_checkpoint); } +static void r5l_run_no_mem_stripe(struct r5l_log *log) +{ + struct stripe_head *sh; + + assert_spin_locked(&log->io_list_lock); + + if (!list_empty(&log->no_mem_stripes)) { + sh = list_first_entry(&log->no_mem_stripes, + struct stripe_head, log_list); + list_del_init(&sh->log_list); + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); + } +} + static bool r5l_complete_finished_ios(struct r5l_log *log) { struct r5l_io_unit *io, *next; @@ -556,7 +593,8 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) log->next_cp_seq = io->seq; list_del(&io->log_sibling); - kmem_cache_free(log->io_kc, io); + mempool_free(io, log->io_pool); + r5l_run_no_mem_stripe(log); found = true; } @@ -1170,6 +1208,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (!log->io_kc) goto io_kc; + log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc); + if (!log->io_pool) + goto io_pool; + log->bs = bioset_create(R5L_POOL_SIZE, 0); if (!log->bs) goto io_bs; @@ -1184,6 +1226,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) goto reclaim_thread; init_waitqueue_head(&log->iounit_wait); + INIT_LIST_HEAD(&log->no_mem_stripes); + INIT_LIST_HEAD(&log->no_space_stripes); spin_lock_init(&log->no_space_stripes_lock); @@ -1200,6 +1244,8 @@ reclaim_thread: out_mempool: bioset_free(log->bs); io_bs: + mempool_destroy(log->io_pool); +io_pool: kmem_cache_destroy(log->io_kc); io_kc: kfree(log); @@ -1211,6 +1257,7 @@ void r5l_exit_log(struct r5l_log *log) md_unregister_thread(&log->reclaim_thread); mempool_destroy(log->meta_pool); bioset_free(log->bs); + mempool_destroy(log->io_pool); kmem_cache_destroy(log->io_kc); kfree(log); } From bb9ef71646606e51adfebdc94231fbbc862dbe28 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 28 Dec 2015 10:46:38 +0800 Subject: [PATCH 20/26] md: remove unnecessary md_new_event_inintr md_new_event has not called sysfs_notify since 'commit 
72a23c211e45 ("Make sure all changes to md/sync_action are notified.")', so we can use md_new_event and delete md_new_event_inintr. Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- drivers/md/md.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index c0c3e6dec248..43a140457e0c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -205,15 +205,6 @@ void md_new_event(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_new_event); -/* Alternate version that can be called from interrupts - * when calling sysfs_notify isn't needed. - */ -static void md_new_event_inintr(struct mddev *mddev) -{ - atomic_inc(&md_event_count); - wake_up(&md_event_waiters); -} - /* * Enables to iterate over all existing md arrays * all_mddevs_lock protects this list. @@ -7209,7 +7200,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) md_wakeup_thread(mddev->thread); if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); - md_new_event_inintr(mddev); + md_new_event(mddev); } EXPORT_SYMBOL(md_error); From 274d8cbde1bc3bdfb31c5d6a58113dff5cee4f87 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 4 Jan 2016 16:16:58 +1100 Subject: [PATCH 21/26] md: Remove 'ready' field from mddev. This field is always set in tandem with ->pers, and when it is tested ->pers is also tested. So ->ready is not needed. It was needed once, but code rearrangement and locking changes have removed that need. 
Signed-off-by: NeilBrown --- drivers/md/md.c | 5 +---- drivers/md/md.h | 2 -- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 43a140457e0c..0d1d822eeda5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -250,8 +250,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) blk_queue_split(q, &bio, q->bio_split); - if (mddev == NULL || mddev->pers == NULL - || !mddev->ready) { + if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); return BLK_QC_T_NONE; } @@ -5298,7 +5297,6 @@ int md_run(struct mddev *mddev) smp_wmb(); spin_lock(&mddev->lock); mddev->pers = pers; - mddev->ready = 1; spin_unlock(&mddev->lock); rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) @@ -5498,7 +5496,6 @@ static void __md_stop(struct mddev *mddev) /* Ensure ->event_work is done */ flush_workqueue(md_misc_wq); spin_lock(&mddev->lock); - mddev->ready = 0; mddev->pers = NULL; spin_unlock(&mddev->lock); pers->free(mddev, mddev->private); diff --git a/drivers/md/md.h b/drivers/md/md.h index e16a17c37418..fc6f7bbc9544 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -246,8 +246,6 @@ struct mddev { * are happening, so run/ * takeover/stop are not safe */ - int ready; /* See when safe to pass - * IO requests down */ struct gendisk *gendisk; struct kobject kobj; From a62ab49eb502a07814f9942770893118c6281223 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 6 Jan 2016 14:37:13 -0800 Subject: [PATCH 22/26] md: set MD_HAS_JOURNAL in correct places Set MD_HAS_JOURNAL when an array is loaded or the journal is initialized. This avoids the flag being set too early during journal disk hotadd. 
Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 9 +++++---- drivers/md/raid5-cache.c | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 0d1d822eeda5..29a4bbf62be5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1597,6 +1597,11 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_chunk_sectors = mddev->chunk_sectors; } + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) { + set_bit(MD_HAS_JOURNAL, &mddev->flags); + if (mddev->recovery_cp == MaxSector) + set_bit(MD_JOURNAL_CLEAN, &mddev->flags); + } } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling, except for * spares (which don't need an event count) */ @@ -1643,8 +1648,6 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) } set_bit(Journal, &rdev->flags); rdev->journal_tail = le64_to_cpu(sb->journal_tail); - if (mddev->recovery_cp == MaxSector) - set_bit(MD_JOURNAL_CLEAN, &mddev->flags); rdev->raid_disk = 0; break; default: @@ -1664,8 +1667,6 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) set_bit(WriteMostly, &rdev->flags); if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) set_bit(Replacement, &rdev->flags); - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) - set_bit(MD_HAS_JOURNAL, &mddev->flags); } else /* MULTIPATH are always insync */ set_bit(In_sync, &rdev->flags); diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 6d2b4789a928..7ac035a73281 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1235,6 +1235,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) goto error; rcu_assign_pointer(conf->log, log); + set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); return 0; error: From 87d4d91616e4db9b8293ba9d9e5a2f3f0d0c8aa6 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 6 Jan 2016 14:37:14 -0800 Subject: [PATCH 23/26] MD: add journal with 
array suspended Hot-adding a journal disk in recovery thread context brings a lot of trouble as IO could be running. Unlike spare disk hot add, adding a journal disk with the array suspended makes more sense, and the implementation is much easier. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 29a4bbf62be5..8753dee3983b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2455,15 +2455,20 @@ static int add_bound_rdev(struct md_rdev *rdev) { struct mddev *mddev = rdev->mddev; int err = 0; + bool add_journal = test_bit(Journal, &rdev->flags); - if (!mddev->pers->hot_remove_disk) { + if (!mddev->pers->hot_remove_disk || add_journal) { /* If there is hot_add_disk but no hot_remove_disk * then added disks for geometry changes, * and should be added immediately. */ super_types[mddev->major_version]. validate_super(mddev, rdev); + if (add_journal) + mddev_suspend(mddev); err = mddev->pers->hot_add_disk(mddev, rdev); + if (add_journal) + mddev_resume(mddev); if (err) { unbind_rdev_from_array(rdev); export_rdev(rdev); From 16a43f6a65002ba9a5b063764b4ad5d288a1c15e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 6 Jan 2016 14:37:15 -0800 Subject: [PATCH 24/26] raid5-cache: handle journal hotadd in quiesce Handle journal hotadd in quiesce to avoid creating duplicated threads. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5-cache.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 7ac035a73281..9531f5f05b93 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -827,6 +827,13 @@ void r5l_quiesce(struct r5l_log *log, int state) return; if (state == 0) { log->in_teardown = 0; + /* + * This is a special case for hotadd. In suspend, the array has + * no journal. In resume, journal is initialized as well as the + * reclaim thread. 
+ */ + if (log->reclaim_thread) + return; log->reclaim_thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, "reclaim"); } else if (state == 1) { From 7aafc405ce5b706195e5df26305a0241b01d8d06 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 8 Jan 2016 14:11:02 +1100 Subject: [PATCH 25/26] Remove myself as MD Maintainer, and add to Credits. Signed-off-by: NeilBrown --- CREDITS | 1 + MAINTAINERS | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index 8207cc62ee9d..fd934f05ee03 100644 --- a/CREDITS +++ b/CREDITS @@ -534,6 +534,7 @@ N: NeilBrown E: neil@brown.name P: 4096R/566281B9 1BC6 29EB D390 D870 7B5F 497A 39EC 9EDD 5662 81B9 D: NFSD Maintainer 2000-2007 +D: MD Maintainer 2001-2016 N: Zach Brown E: zab@zabbo.net diff --git a/MAINTAINERS b/MAINTAINERS index 233f83464814..917f27fccefa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9951,7 +9951,6 @@ S: Supported F: drivers/media/pci/solo6x10/ SOFTWARE RAID (Multiple Disks) SUPPORT -M: Neil Brown L: linux-raid@vger.kernel.org S: Supported F: drivers/md/ From 1501efadc524a0c99494b576923091589a52d2a4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 13 Jan 2016 16:00:07 -0800 Subject: [PATCH 26/26] md/raid: only permit hot-add of compatible integrity profiles It is not safe for an integrity profile to be changed while i/o is in-flight in the queue. Prevent adding new disks or otherwise onlining spares to an array if the device has an incompatible integrity profile. The original change to the blk_integrity_unregister implementation in md, commit c7bfced9a671 "md: suspend i/o during runtime blk_integrity_unregister" introduced an immediate hang regression. This policy of disallowing changes to the integrity profile once one has been established is shared with DM. 
Here is an abbreviated log from a test run that: 1/ Creates a degraded raid1 with an integrity-enabled device (pmem0s) [ 59.076127] 2/ Tries to add an integrity-disabled device (pmem1m) [ 90.489209] 3/ Retries with an integrity-enabled device (pmem1s) [ 205.671277] [ 59.076127] md/raid1:md0: active with 1 out of 2 mirrors [ 59.078302] md: data integrity enabled on md0 [..] [ 90.489209] md0: incompatible integrity profile for pmem1m [..] [ 205.671277] md: super_written gets error=-5 [ 205.677386] md/raid1:md0: Disk failure on pmem1m, disabling device. [ 205.677386] md/raid1:md0: Operation continuing on 1 devices. [ 205.683037] RAID1 conf printout: [ 205.684699] --- wd:1 rd:2 [ 205.685972] disk 0, wo:0, o:1, dev:pmem0s [ 205.687562] disk 1, wo:1, o:1, dev:pmem1s [ 205.691717] md: recovery of RAID array md0 Fixes: c7bfced9a671 ("md: suspend i/o during runtime blk_integrity_unregister") Cc: Cc: Mike Snitzer Reported-by: NeilBrown Signed-off-by: Dan Williams Signed-off-by: NeilBrown --- drivers/md/md.c | 28 ++++++++++++++++------------ drivers/md/md.h | 2 +- drivers/md/multipath.c | 6 +++--- drivers/md/raid1.c | 6 +++--- drivers/md/raid10.c | 6 +++--- 5 files changed, 26 insertions(+), 22 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 8753dee3983b..2cf0e1c00b9a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2010,28 +2010,32 @@ int md_integrity_register(struct mddev *mddev) } EXPORT_SYMBOL(md_integrity_register); -/* Disable data integrity if non-capable/non-matching disk is being added */ -void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) +/* + * Attempt to add an rdev, but only if it is consistent with the current + * integrity profile + */ +int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) { struct blk_integrity *bi_rdev; struct blk_integrity *bi_mddev; + char name[BDEVNAME_SIZE]; if (!mddev->gendisk) - return; + return 0; bi_rdev = bdev_get_integrity(rdev->bdev); bi_mddev = 
blk_get_integrity(mddev->gendisk); if (!bi_mddev) /* nothing to do */ - return; - if (rdev->raid_disk < 0) /* skip spares */ - return; - if (bi_rdev && blk_integrity_compare(mddev->gendisk, - rdev->bdev->bd_disk) >= 0) - return; - WARN_ON_ONCE(!mddev->suspended); - printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); - blk_integrity_unregister(mddev->gendisk); + return 0; + + if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { + printk(KERN_NOTICE "%s: incompatible integrity profile for %s\n", + mdname(mddev), bdevname(rdev->bdev, name)); + return -ENXIO; + } + + return 0; } EXPORT_SYMBOL(md_integrity_add_rdev); diff --git a/drivers/md/md.h b/drivers/md/md.h index fc6f7bbc9544..a491e220e738 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -660,7 +660,7 @@ extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); extern int md_check_no_bitmap(struct mddev *mddev); extern int md_integrity_register(struct mddev *mddev); -extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); +extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern void mddev_init(struct mddev *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 7331a80d89f1..0a72ab6e6c20 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -257,6 +257,9 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); + err = md_integrity_add_rdev(rdev, mddev); + if (err) + break; spin_lock_irq(&conf->device_lock); mddev->degraded--; rdev->raid_disk = path; @@ -264,9 +267,6 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) spin_unlock_irq(&conf->device_lock); rcu_assign_pointer(p->rdev, rdev); err = 0; - 
mddev_suspend(mddev); - md_integrity_add_rdev(rdev, mddev); - mddev_resume(mddev); break; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e2169ff6e0f0..c4b913409226 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1589,6 +1589,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; + if (md_integrity_add_rdev(rdev, mddev)) + return -ENXIO; + if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -1632,9 +1635,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } } - mddev_suspend(mddev); - md_integrity_add_rdev(rdev, mddev); - mddev_resume(mddev); if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 84e597e1c489..ce959b4ae4df 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1698,6 +1698,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1)) return -EINVAL; + if (md_integrity_add_rdev(rdev, mddev)) + return -ENXIO; + if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -1739,9 +1742,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) rcu_assign_pointer(p->rdev, rdev); break; } - mddev_suspend(mddev); - md_integrity_add_rdev(rdev, mddev); - mddev_resume(mddev); if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);