[PATCH] md: auto-correct correctable read errors in raid10
Largely just a cross-port from raid1.
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1fa70c3..64bb4ddc 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -209,6 +209,7 @@
spin_lock_irqsave(&conf->device_lock, flags);
list_add(&r10_bio->retry_list, &conf->retry_list);
+ conf->nr_queued ++;
spin_unlock_irqrestore(&conf->device_lock, flags);
md_wakeup_thread(mddev->thread);
@@ -254,9 +255,9 @@
/*
* this branch is our 'one mirror IO has finished' event handler:
*/
- if (!uptodate)
- md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
- else
+ update_head_pos(slot, r10_bio);
+
+ if (uptodate) {
/*
* Set R10BIO_Uptodate in our master bio, so that
* we will return a good error code to the higher
@@ -267,15 +268,8 @@
* wait for the 'master' bio.
*/
set_bit(R10BIO_Uptodate, &r10_bio->state);
-
- update_head_pos(slot, r10_bio);
-
- /*
- * we have only one bio on the read side
- */
- if (uptodate)
raid_end_bio_io(r10_bio);
- else {
+ } else {
/*
* oops, read error:
*/
@@ -714,6 +708,33 @@
wake_up(&conf->wait_barrier);
}
+static void freeze_array(conf_t *conf)
+{
+ /* stop syncio and normal IO and wait for everything to
+ * go quiet.
+ * We increment barrier and nr_waiting, and then
+ * wait until barrier+nr_pending match nr_queued+2
+ */
+ spin_lock_irq(&conf->resync_lock);
+ conf->barrier++;
+ conf->nr_waiting++;
+ wait_event_lock_irq(conf->wait_barrier,
+ conf->barrier+conf->nr_pending == conf->nr_queued+2,
+ conf->resync_lock,
+ raid10_unplug(conf->mddev->queue));
+ spin_unlock_irq(&conf->resync_lock);
+}
+
+static void unfreeze_array(conf_t *conf)
+{
+ /* reverse the effect of freeze_array(): drop the barrier and nr_waiting counts, then wake waiters on wait_barrier */
+ spin_lock_irq(&conf->resync_lock);
+ conf->barrier--;
+ conf->nr_waiting--;
+ wake_up(&conf->wait_barrier);
+ spin_unlock_irq(&conf->resync_lock);
+}
+
static int make_request(request_queue_t *q, struct bio * bio)
{
mddev_t *mddev = q->queuedata;
@@ -1338,6 +1359,7 @@
break;
r10_bio = list_entry(head->prev, r10bio_t, retry_list);
list_del(head->prev);
+ conf->nr_queued--;
spin_unlock_irqrestore(&conf->device_lock, flags);
mddev = r10_bio->mddev;
@@ -1350,6 +1372,78 @@
unplug = 1;
} else {
int mirror;
+ /* we got a read error. Maybe the drive is bad. Maybe just
+ * the block and we can fix it.
+ * We freeze all other IO, and try reading the block from
+ * other devices. When we find one, we re-write
+ * and check if that fixes the read error.
+ * This is all done synchronously while the array is
+ * frozen.
+ */
+ int sect = 0; /* Offset from r10_bio->sector */
+ int sectors = r10_bio->sectors;
+ freeze_array(conf);
+ if (mddev->ro == 0) while(sectors) {
+ int s = sectors;
+ int sl = r10_bio->read_slot;
+ int success = 0;
+
+ if (s > (PAGE_SIZE>>9))
+ s = PAGE_SIZE >> 9;
+
+ do {
+ int d = r10_bio->devs[sl].devnum;
+ rdev = conf->mirrors[d].rdev;
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags) &&
+ sync_page_io(rdev->bdev,
+ r10_bio->devs[sl].addr +
+ sect + rdev->data_offset,
+ s<<9,
+ conf->tmppage, READ))
+ success = 1;
+ else {
+ sl++;
+ if (sl == conf->copies)
+ sl = 0;
+ }
+ } while (!success && sl != r10_bio->read_slot);
+
+ if (success) {
+ /* write it back and re-read */
+ while (sl != r10_bio->read_slot) {
+ int d;
+ if (sl==0)
+ sl = conf->copies;
+ sl--;
+ d = r10_bio->devs[sl].devnum;
+ rdev = conf->mirrors[d].rdev;
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags)) {
+ if (sync_page_io(rdev->bdev,
+ r10_bio->devs[sl].addr +
+ sect + rdev->data_offset,
+ s<<9, conf->tmppage, WRITE) == 0 ||
+ sync_page_io(rdev->bdev,
+ r10_bio->devs[sl].addr +
+ sect + rdev->data_offset,
+ s<<9, conf->tmppage, READ) == 0) {
+ /* Well, this device is dead */
+ md_error(mddev, rdev);
+ }
+ }
+ }
+ } else {
+ /* Cannot read from anywhere -- bye bye array */
+ md_error(mddev, conf->mirrors[r10_bio->devs[r10_bio->read_slot].devnum].rdev);
+ break;
+ }
+ sectors -= s;
+ sect += s;
+ }
+
+ unfreeze_array(conf);
+
bio = r10_bio->devs[r10_bio->read_slot].bio;
r10_bio->devs[r10_bio->read_slot].bio = NULL;
bio_put(bio);
@@ -1793,22 +1887,24 @@
* bookkeeping area. [whatever we allocate in run(),
* should be freed in stop()]
*/
- conf = kmalloc(sizeof(conf_t), GFP_KERNEL);
+ conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
mddev->private = conf;
if (!conf) {
printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
mdname(mddev));
goto out;
}
- memset(conf, 0, sizeof(*conf));
- conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks,
+ conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
GFP_KERNEL);
if (!conf->mirrors) {
printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
mdname(mddev));
goto out_free_conf;
}
- memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);
+
+ conf->tmppage = alloc_page(GFP_KERNEL);
+ if (!conf->tmppage)
+ goto out_free_conf;
conf->near_copies = nc;
conf->far_copies = fc;
@@ -1918,6 +2014,7 @@
out_free_conf:
if (conf->r10bio_pool)
mempool_destroy(conf->r10bio_pool);
+ if (conf->tmppage) put_page(conf->tmppage); /* still NULL (kzalloc) if mirrors or tmppage allocation failed */
kfree(conf->mirrors);
kfree(conf);
mddev->private = NULL;
diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h
index b660cbf..dfa5283 100644
--- a/include/linux/raid/raid10.h
+++ b/include/linux/raid/raid10.h
@@ -42,6 +42,7 @@
spinlock_t resync_lock;
int nr_pending;
int nr_waiting;
+ int nr_queued;
int barrier;
sector_t next_resync;
int fullsync; /* set to 1 if a full sync is needed,
@@ -53,6 +54,7 @@
mempool_t *r10bio_pool;
mempool_t *r10buf_pool;
+ struct page *tmppage;
};
typedef struct r10_private_data_s conf_t;