dm raid1: handle read failures
This patch gives the ability to respond-to/record device failures
that happen during read operations. It also adds the ability to
read from mirror devices that are not the primary if they are
in-sync.
There are essentially two read paths in mirroring; the direct path
and the queued path. When a read request is mapped, if the region
is 'in-sync' the direct path is taken; otherwise the queued path
is taken.
If the direct path is taken, we must record bio information so that
if the read fails we can retry it. We then discover the status of
a direct read through mirror_end_io. If the read has failed, we will
mark the device from which the read was attempted as failed (so we
don't try to read from it again), restore the bio and try again.
If the queued path is taken, we discover the results of the read
from 'read_callback'. If the device failed, we will mark the device
as failed and attempt the read again if there is another device
where this region is known to be 'in-sync'.
Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index ec6d675..38efa70 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -6,6 +6,7 @@
#include "dm.h"
#include "dm-bio-list.h"
+#include "dm-bio-record.h"
#include "dm-io.h"
#include "dm-log.h"
#include "kcopyd.h"
@@ -141,6 +142,7 @@
struct bio_list failures;
struct dm_io_client *io_client;
+ mempool_t *read_record_pool;
/* recovery */
region_t nr_regions;
@@ -647,24 +649,30 @@
wake(rh->ms);
}
+#define MIN_READ_RECORDS 20
+struct dm_raid1_read_record {
+ struct mirror *m;
+ struct dm_bio_details details;
+};
+
/*
* Every mirror should look like this one.
*/
#define DEFAULT_MIRROR 0
/*
- * This is yucky. We squirrel the mirror_set struct away inside
- * bi_next for write buffers. This is safe since the bh
+ * This is yucky. We squirrel the mirror struct away inside
+ * bi_next for read/write buffers. This is safe since the bh
* doesn't get submitted to the lower levels of block layer.
*/
-static struct mirror_set *bio_get_ms(struct bio *bio)
+static struct mirror *bio_get_m(struct bio *bio)
{
- return (struct mirror_set *) bio->bi_next;
+ return (struct mirror *) bio->bi_next;
}
-static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
+static void bio_set_m(struct bio *bio, struct mirror *m)
{
- bio->bi_next = (struct bio *) ms;
+ bio->bi_next = (struct bio *) m;
}
static struct mirror *get_default_mirror(struct mirror_set *ms)
@@ -857,17 +865,105 @@
*---------------------------------------------------------------*/
static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
{
- /* FIXME: add read balancing */
- return get_default_mirror(ms);
+ struct mirror *m = get_default_mirror(ms);
+
+ do {
+ if (likely(!atomic_read(&m->error_count)))
+ return m;
+
+ if (m-- == ms->mirror)
+ m += ms->nr_mirrors;
+ } while (m != get_default_mirror(ms));
+
+ return NULL;
+}
+
+static int default_ok(struct mirror *m)
+{
+ struct mirror *default_mirror = get_default_mirror(m->ms);
+
+ return !atomic_read(&default_mirror->error_count);
+}
+
+static int mirror_available(struct mirror_set *ms, struct bio *bio)
+{
+ region_t region = bio_to_region(&ms->rh, bio);
+
+ if (ms->rh.log->type->in_sync(ms->rh.log, region, 0))
+ return choose_mirror(ms, bio->bi_sector) ? 1 : 0;
+
+ return 0;
}
/*
* remap a buffer to a particular mirror.
*/
-static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
+static sector_t map_sector(struct mirror *m, struct bio *bio)
+{
+ return m->offset + (bio->bi_sector - m->ms->ti->begin);
+}
+
+static void map_bio(struct mirror *m, struct bio *bio)
{
bio->bi_bdev = m->dev->bdev;
- bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
+ bio->bi_sector = map_sector(m, bio);
+}
+
+static void map_region(struct io_region *io, struct mirror *m,
+ struct bio *bio)
+{
+ io->bdev = m->dev->bdev;
+ io->sector = map_sector(m, bio);
+ io->count = bio->bi_size >> 9;
+}
+
+/*-----------------------------------------------------------------
+ * Reads
+ *---------------------------------------------------------------*/
+static void read_callback(unsigned long error, void *context)
+{
+ struct bio *bio = context;
+ struct mirror *m;
+
+ m = bio_get_m(bio);
+ bio_set_m(bio, NULL);
+
+ if (likely(!error)) {
+ bio_endio(bio, 0);
+ return;
+ }
+
+ fail_mirror(m, DM_RAID1_READ_ERROR);
+
+ if (likely(default_ok(m)) || mirror_available(m->ms, bio)) {
+ DMWARN_LIMIT("Read failure on mirror device %s. "
+ "Trying alternative device.",
+ m->dev->name);
+ queue_bio(m->ms, bio, bio_rw(bio));
+ return;
+ }
+
+ DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.",
+ m->dev->name);
+ bio_endio(bio, -EIO);
+}
+
+/* Asynchronous read. */
+static void read_async_bio(struct mirror *m, struct bio *bio)
+{
+ struct io_region io;
+ struct dm_io_request io_req = {
+ .bi_rw = READ,
+ .mem.type = DM_IO_BVEC,
+ .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
+ .notify.fn = read_callback,
+ .notify.context = bio,
+ .client = m->ms->io_client,
+ };
+
+ map_region(&io, m, bio);
+ bio_set_m(bio, m);
+ (void) dm_io(&io_req, 1, &io, NULL);
}
static void do_reads(struct mirror_set *ms, struct bio_list *reads)
@@ -878,17 +974,20 @@
while ((bio = bio_list_pop(reads))) {
region = bio_to_region(&ms->rh, bio);
+ m = get_default_mirror(ms);
/*
* We can only read balance if the region is in sync.
*/
- if (rh_in_sync(&ms->rh, region, 1))
+ if (likely(rh_in_sync(&ms->rh, region, 1)))
m = choose_mirror(ms, bio->bi_sector);
- else
- m = get_default_mirror(ms);
+ else if (m && atomic_read(&m->error_count))
+ m = NULL;
- map_bio(ms, m, bio);
- generic_make_request(bio);
+ if (likely(m))
+ read_async_bio(m, bio);
+ else
+ bio_endio(bio, -EIO);
}
}
@@ -964,8 +1063,8 @@
int should_wake = 0;
unsigned long flags;
- ms = bio_get_ms(bio);
- bio_set_ms(bio, NULL);
+ ms = bio_get_m(bio)->ms;
+ bio_set_m(bio, NULL);
/*
* NOTE: We don't decrement the pending count here,
@@ -1008,7 +1107,7 @@
static void do_write(struct mirror_set *ms, struct bio *bio)
{
unsigned int i;
- struct io_region io[KCOPYD_MAX_REGIONS+1];
+ struct io_region io[ms->nr_mirrors], *dest = io;
struct mirror *m;
struct dm_io_request io_req = {
.bi_rw = WRITE,
@@ -1019,15 +1118,14 @@
.client = ms->io_client,
};
- for (i = 0; i < ms->nr_mirrors; i++) {
- m = ms->mirror + i;
+ for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
+ map_region(dest++, m, bio);
- io[i].bdev = m->dev->bdev;
- io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
- io[i].count = bio->bi_size >> 9;
- }
-
- bio_set_ms(bio, ms);
+ /*
+ * Use default mirror because we only need it to retrieve the reference
+ * to the mirror set in write_callback().
+ */
+ bio_set_m(bio, get_default_mirror(ms));
(void) dm_io(&io_req, ms->nr_mirrors, io, NULL);
}
@@ -1092,7 +1190,7 @@
rh_delay(&ms->rh, bio);
while ((bio = bio_list_pop(&nosync))) {
- map_bio(ms, get_default_mirror(ms), bio);
+ map_bio(get_default_mirror(ms), bio);
generic_make_request(bio);
}
}
@@ -1231,9 +1329,19 @@
atomic_set(&ms->suspend, 0);
atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
+ len = sizeof(struct dm_raid1_read_record);
+ ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS,
+ len);
+ if (!ms->read_record_pool) {
+ ti->error = "Error creating mirror read_record_pool";
+ kfree(ms);
+ return NULL;
+ }
+
ms->io_client = dm_io_client_create(DM_IO_PAGES);
if (IS_ERR(ms->io_client)) {
ti->error = "Error creating dm_io client";
+ mempool_destroy(ms->read_record_pool);
kfree(ms);
return NULL;
}
@@ -1241,6 +1349,7 @@
if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
ti->error = "Error creating dirty region hash";
dm_io_client_destroy(ms->io_client);
+ mempool_destroy(ms->read_record_pool);
kfree(ms);
return NULL;
}
@@ -1256,6 +1365,7 @@
dm_io_client_destroy(ms->io_client);
rh_exit(&ms->rh);
+ mempool_destroy(ms->read_record_pool);
kfree(ms);
}
@@ -1510,10 +1620,11 @@
int r, rw = bio_rw(bio);
struct mirror *m;
struct mirror_set *ms = ti->private;
-
- map_context->ll = bio_to_region(&ms->rh, bio);
+ struct dm_raid1_read_record *read_record = NULL;
if (rw == WRITE) {
+ /* Save region for mirror_end_io() handler */
+ map_context->ll = bio_to_region(&ms->rh, bio);
queue_bio(ms, bio, rw);
return DM_MAPIO_SUBMITTED;
}
@@ -1523,28 +1634,34 @@
if (r < 0 && r != -EWOULDBLOCK)
return r;
- if (r == -EWOULDBLOCK) /* FIXME: ugly */
- r = DM_MAPIO_SUBMITTED;
-
/*
- * We don't want to fast track a recovery just for a read
- * ahead. So we just let it silently fail.
- * FIXME: get rid of this.
+ * If region is not in-sync queue the bio.
*/
- if (!r && rw == READA)
- return -EIO;
+ if (!r || (r == -EWOULDBLOCK)) {
+ if (rw == READA)
+ return -EWOULDBLOCK;
- if (!r) {
- /* Pass this io over to the daemon */
queue_bio(ms, bio, rw);
return DM_MAPIO_SUBMITTED;
}
+ /*
+ * The region is in-sync and we can perform reads directly.
+ * Store enough information so we can retry if it fails.
+ */
m = choose_mirror(ms, bio->bi_sector);
- if (!m)
+ if (unlikely(!m))
return -EIO;
- map_bio(ms, m, bio);
+ read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO);
+ if (likely(read_record)) {
+ dm_bio_record(&read_record->details, bio);
+ map_context->ptr = read_record;
+ read_record->m = m;
+ }
+
+ map_bio(m, bio);
+
return DM_MAPIO_REMAPPED;
}
@@ -1553,15 +1670,64 @@
{
int rw = bio_rw(bio);
struct mirror_set *ms = (struct mirror_set *) ti->private;
- region_t region = map_context->ll;
+ struct mirror *m = NULL;
+ struct dm_bio_details *bd = NULL;
+ struct dm_raid1_read_record *read_record = map_context->ptr;
/*
* We need to dec pending if this was a write.
*/
- if (rw == WRITE)
- rh_dec(&ms->rh, region);
+ if (rw == WRITE) {
+ rh_dec(&ms->rh, map_context->ll);
+ return error;
+ }
- return 0;
+ if (error == -EOPNOTSUPP)
+ goto out;
+
+ if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+ goto out;
+
+ if (unlikely(error)) {
+ if (!read_record) {
+ /*
+ * There wasn't enough memory to record necessary
+ * information for a retry or there was no other
+ * mirror in-sync.
+ */
+ DMERR_LIMIT("Mirror read failed from %s.",
+ m->dev->name);
+ return -EIO;
+ }
+ DMERR("Mirror read failed from %s. Trying alternative device.",
+ m->dev->name);
+
+ m = read_record->m;
+ fail_mirror(m, DM_RAID1_READ_ERROR);
+
+ /*
+ * A failed read is requeued for another attempt using an intact
+ * mirror.
+ */
+ if (default_ok(m) || mirror_available(ms, bio)) {
+ bd = &read_record->details;
+
+ dm_bio_restore(bd, bio);
+ mempool_free(read_record, ms->read_record_pool);
+ map_context->ptr = NULL;
+ queue_bio(ms, bio, rw);
+ return 1;
+ }
+ DMERR("All replicated volumes dead, failing I/O");
+ }
+
+out:
+ if (read_record) {
+ mempool_free(read_record, ms->read_record_pool);
+ map_context->ptr = NULL;
+ }
+
+ return error;
}
static void mirror_presuspend(struct dm_target *ti)